ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters are assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;
  37                 struct code_node *node;
  38                 struct text       txt;
  39                 int               line, col;
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguish via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88
  89 The different tokens are numbers, words, marks, strings, comments,
  90 newlines, EOF, and indents, each of which is examined in detail below.
  91
  92 There are various cases where no token can be found in part of the
  93 input.  All of these will be reported as a `TK_error` token.
  94
  95 It is possible to declare a number of strings which form distinct
  96 tokens (rather than being grouped as e.g. 'word').  These are given
  97 token numbers from `TK_reserved` upwards.
  98
  99 ###### public types
 100         enum token_num {
 101                 TK_error,
 102                 ## token types
 103                 TK_reserved
 104         };
 105
 106 ### Numbers
 107
 108 Numbers are the messiest tokens to parse, primarily because they can
 109 contain characters that also have meaning outside of numbers and,
 110 particularly, immediately after numbers.
 111
 112 The obvious example is the '`-`' sign.  It can come inside a number for
 113 a negative exponent, or after a number as a subtraction operator.  To
 114 be sure we have parsed as best as possible we need to only allow the
 115 '`-`' inside a number if it is after an exponent character.  This can be
 116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 117 digit, so we don't allow '`-`' after just any `e`.
 118
 119 To make matters worse, our language designer has decided to experiment
 120 with allowing commas to be used as the decimal indicator, and spaces
 121 to be used to separate groups of digits in large numbers.  Both of
 122 these can reasonably be restricted to appear between two digits, so we
 123 have to add that condition to our tests.
 124
 125 So we cannot just treat numbers as starting with a digit and being
 126 followed by some set of characters.  We need more structure than that.
 127
 128 So:
 129
 130 - Numbers must start with a digit.
 131 - If the first digit is zero, the next character must be a base
 132   signifier (one of `xob`) or a decimal marker (`.` or `,`).
 133   In the first case the first `p` or `P` may be followed by a sign.
 134 - If the number doesn't start with `0` followed by one of `xob`, the
 135   first `e` may be followed by a sign.
 136 - Any digit or hex digit may be followed by a space or underscore
 137   providing that the subsequence character is also a (hex) digit.
 138   This rule will require an extra level of 'unget' to be
 139   supported when handling characters.
 140 - Otherwise any digits or ASCII letters are allowed.  We do not at
 141   this point check that the digits given are permitted by the base.
 142   That will happen when the token is converted to a number.
 143
 144 To allow easy configuration, the various non alphanumeric characters
 145 are only permitted if they are listed in a configuration parameter.
 146
 147 ###### token config parameters
 148         char *number_chars;
 149
 150 Note that numbers may not start with a period, so `.75` is not a
 151 number.  This is not the norm, but is not unheard of.  Excluding these
 152 numbers simplifies the rule at very little cost.
 153
 154 ###### token types
 155         TK_number,
 156
 157 If TK_number is ignored, digits will result in an error unless they
 158 are declared to be a start character for words.
 159
 160 ###### includes
 161
 162         #include <string.h>
 163
 164 ###### parse number
 165
 166         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 167                 int prev_special = 0;
 168                 int expect_p = 0;
 169                 int decimal_mark = 0;
 170                 if (ch == '0') {
 171                         wchar_t ch2 = get_char(state);
 172                         if (strchr("xobXOB", ch2) != NULL)
 173                                 expect_p = 1;
 174                         unget_char(state);
 175                 }
 176                 while (1) {
 177                         int sign_ok = 0;
 178                         switch(expect_p) {
 179                         case 0:
 180                                 if (ch == 'e' || ch == 'E')
 181                                         sign_ok = 1;
 182                                 break;
 183                         case 1:
 184                                 if (ch == 'p' || ch == 'P')
 185                                         sign_ok = 1;
 186                                 break;
 187                         }
 188                         save_unget_state(state);
 189                         ch = get_char(state);
 190                         if (iswalnum(ch)) {
 191                                 prev_special = 0;
 192                                 continue;
 193                         }
 194                         if (ch == '+' || ch == '-') {
 195                                 if (!sign_ok)
 196                                         break;
 197                                 expect_p = -1;
 198                         }
 199                         if (ch == '.' || ch == ',') {
 200                                 if (decimal_mark)
 201                                         break;
 202                                 decimal_mark = 1;
 203                         }
 204                         if (prev_special) {
 205                                 /* Don't allow that special char,
 206                                  * need two 'ungets'
 207                                  */
 208                                 restore_unget_state(state);
 209                                 break;
 210                         }
 211                         if (strchr(state->conf->number_chars, ch)) {
 212                                 prev_special = 1;
 213                                 continue;
 214                         }
 215                         /* non-number char */
 216                         break;
 217                 }
 218                 /* We seem to have a "number" token */
 219                 unget_char(state);
 220                 close_token(state, &tk);
 221                 tk.num = TK_number;
 222                 return tk;
 223         }
 224
 225 ### Words
 226 Words start with a "start" character followed by the longest
 227 sequence of "continue" characters.  The Unicode ID_START and
 228 ID_CONTINUE sets are always permitted, but other ASCII characters
 229 can be added to these sets.
 230
 231 ###### token config parameters
 232         char *word_start;
 233         char *word_cont;
 234
 235 ###### internal functions
 236         static int is_word_start(wchar_t ch, struct token_config *conf)
 237         {
 238                 return iswalpha(ch) ||
 239                        strchr(conf->word_start, ch) != NULL ||
 240                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 241         }
 242
 243         static int is_word_continue(wchar_t ch, struct token_config *conf)
 244         {
 245                 return iswalnum(ch) ||
 246                        strchr(conf->word_cont, ch) != NULL ||
 247                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 248         }
 249
 250 Words can be either known or unknown.  Known words are referred to as
 251 "reserved words" and get a unique token number.  Unknown words are
 252 "identifiers" and are syntactically a single token.
 253
 254 ###### token types
 255         TK_ident,
 256
 257 A list of known words must be provided.  This list is shared with the
 258 "marks" which are described next.  The list must be lexically sorted
 259 and the length of the list must be given (`known_count`).
 260 Tokens matching these known words are reported as the index of the
 261 list added to `TK_reserved`.
 262
 263 If identifiers are ignored, then any word which is not listed as a
 264 known word results in an error.
 265
 266 ###### token config parameters
 267         const char **words_marks;
 268         int known_count;
 269
 270 ###### parse word
 271
 272         if (is_word_start(ch, state->conf)) {
 273                 int n;
 274                 /* A word: identifier or reserved */
 275                 do
 276                         ch = get_char(state);
 277                 while (is_word_continue(ch, state->conf));
 278                 unget_char(state);
 279                 close_token(state, &tk);
 280                 tk.num = TK_ident;
 281                 if (ignored & (1<<TK_ident))
 282                         tk.num = TK_error;
 283                 n = find_known(state->conf, tk.txt);
 284                 if (n >= 0)
 285                         tk.num = TK_reserved + n;
 286                 return tk;
 287         }
 288
 289 ### Marks
 290
 291 Marks are generally one or more punctuation marks joined together.  It
 292 would be nice to use the term "symbol" for these, but that causes
 293 confusion in a subsequent discussion of the grammar, which has terminal
 294 symbols and non-terminal symbols which are conceptually quite
 295 different.  So strings of punctuation characters will be marks.
 296
 297 A "mark" consists of ASCII characters that are not white space, are not
 298 "start" characters for words, and are not digits.
 299 These will collectively be called mark characters.
 300
 301 ###### internal functions
 302         static int is_mark(wchar_t ch, struct token_config *conf)
 303         {
 304                 return ch > ' ' &&
 305                        ch < 0x7f &&
 306                        !iswalnum(ch) &&
 307                        strchr(conf->word_start, ch) == NULL;
 308         }
 309
 310 As with words, there can be known and unknown marks, though the rules
 311 are slightly different.
 312
 313 Two marks do not need to be separated by a non-mark characters.  This
 314 is different from words which do need to be separated by at least one
 315 non-continue character.
 316
 317 The scanner will normally prefer longer sequences of mark characters,
 318 but will more strongly prefer known marks over unknown marks.  So if
 319 it finds a known mark where adding one more character does not result
 320 in a known mark, it will return that first known mark.
 321
 322 If no known mark is found we will test against strings and comments
 323 below before giving up and assuming an unknown mark.
 324
 325 If an unknown mark contains a quote character or a comment marker, and
 326 that token is not being ignored, then we terminate the unknown mark
 327 before that quote or comment.  This ensures that an unknown mark
 328 immediately before a string is handled correctly.
 329
 330 If the first character of a comment marker (i.e. '/') is a known mark,
 331 the above rules would suggest that the start of a comment would be
 332 parsed as that mark, which is not what is wanted.  So the introductory
 333 sequences for a comment ("//" and "/*") are treated as
 334 partially-known.  They prevent the leading "/" from being a mark by
 335 itself, but do not actually constitute a stand-alone mark.
 336
 337 If `TK_mark` is ignored, then unknown marks are returned as errors.
 338
 339 ###### token types
 340         TK_mark,
 341
 342 Known marks are included in the same list as the list of known words.
 343
 344 ###### parse mark
 345         tk.num = TK_error;
 346         while (is_mark(ch, state->conf)) {
 347                 int n;
 348                 wchar_t prev;
 349                 close_token(state, &tk);
 350                 n = find_known(state->conf, tk.txt);
 351                 if (n >= 0)
 352                         tk.num = TK_reserved + n;
 353                 else if (tk.num != TK_error) {
 354                         /* found a longest-known-mark, still need to
 355                          * check for comments
 356                          */
 357                         if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
 358                             (ch == '/' || ch == '*')) {
 359                                 /* Yes, this is a comment, not a '/' */
 360                                 restore_unget_state(state);
 361                                 tk.num = TK_error;
 362                                 break;
 363                         }
 364                         unget_char(state);
 365                         close_token(state, &tk);
 366                         return tk;
 367                 }
 368                 prev = ch;
 369                 save_unget_state(state);
 370                 ch = get_char(state);
 371                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
 372                         break;
 373                 if (prev == '#' && n < 0)
 374                         /* '#' is not a known mark, so assume it is a comment */
 375                         break;
 376                 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
 377                         close_token(state, &tk);
 378                         restore_unget_state(state);
 379                         break;
 380                 }
 381                 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
 382                         close_token(state, &tk);
 383                         restore_unget_state(state);
 384                         break;
 385                 }
 386         }
 387         unget_char(state);
 388         if (tk.num != TK_error) {
 389                 close_token(state, &tk);
 390                 return tk;
 391         }
 392
 393 If we don't find a known mark, we will check for strings and comments
 394 before assuming that we have an unknown mark
 395
 396 ###### parse mark
 397         ## parse string
 398         ## parse comment
 399         ## unknown mark
 400
 401 ###### unknown mark
 402         if (tk.txt.len) {
 403                 if (ignored & (1<<TK_mark))
 404                         tk.num = TK_error;
 405                 else
 406                         tk.num = TK_mark;
 407                 return tk;
 408         }
 409
 410 ### Strings
 411
 412 Strings start with one of single quote, double quote, or back quote
 413 and continue until a matching character on the same line.  Any of
 414 these characters can be included in the list of known marks and then
 415 they will not be used for identifying strings.
 416
 417 Immediately following the close quote, one or two ASCII letters may
 418 appear.  These are somewhat like the arbitrary letters allowed in
 419 "Numbers" above.  They can be used by the language in various ways.
 420
 421 If 3 identical quote characters appear in a row and are
 422 followed by a newline, then this forms a multi-line string which
 423 continues until an identical triple quote appears on a line preceded
 424 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 425
 426 Multi-line strings may not extend beyond the end of the `code_node` in
 427 which they start.
 428
 429 Normal strings and multi-line strings are encoded as two different
 430 token types.
 431
 432 ###### token types
 433         TK_string,
 434         TK_multi_string,
 435
 436 ###### internal functions
 437         static int is_quote(wchar_t ch)
 438         {
 439                 return ch == '\'' || ch == '"' || ch == '`';
 440         }
 441
 442 #### Multi-line strings
 443
 444 The multi-line string is checked for first.  If they are being
 445 ignored, we fall through and treat a triple quote as an empty string
 446 followed by the start of a new string.
 447
 448 ###### parse string
 449         if (tk.txt.len == 3 &&
 450             !(ignored & (1 << TK_multi_string)) &&
 451             is_quote(tk.txt.txt[0]) &&
 452             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 453             is_newline(tk.txt.txt[3])) {
 454                 // triple quote
 455                 wchar_t first = tk.txt.txt[0];
 456                 int qseen = 0;
 457                 int at_sol = 1;
 458                 while (!at_eon(state) && qseen < 3) {
 459                         ch = get_char(state);
 460                         if (is_newline(ch)) {
 461                                 at_sol = 1;
 462                                 qseen = 0;
 463                         } else if (at_sol && ch == first) {
 464                                 qseen += 1;
 465                         } else if (ch != ' ' && ch != '\t') {
 466                                 at_sol = 0;
 467                                 qseen = 0;
 468                         }
 469                 }
 470                 if (qseen != 3) {
 471                         /* Hit end of node - error.
 472                          * unget so the newline is seen,
 473                          * but return rest of string as an error.
 474                          */
 475                         if (is_newline(ch))
 476                                 unget_char(state);
 477                         close_token(state, &tk);
 478                         tk.num = TK_error;
 479                         return tk;
 480                 }
 481                 /* 2 letters are allowed */
 482                 ch = get_char(state);
 483                 if (iswalpha(ch))
 484                         ch = get_char(state);
 485                 if (iswalpha(ch))
 486                         ch = get_char(state);
 487                 /* Now we must have a newline, but we don't return it
 488                  * whatever it is.*/
 489                 unget_char(state);
 490                 close_token(state, &tk);
 491                 tk.num = TK_multi_string;
 492                 if (!is_newline(ch))
 493                         tk.num = TK_error;
 494                 return tk;
 495         }
 496
 497 #### Single-line strings
 498
 499 The sequence of marks collected may be more than a single-line
 500 string, so we reset to the start and collect characters until
 501 we find a close quote or a newline.
 502
 503 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 504
 505 ###### parse string
 506         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 507             !(ignored & (1<<TK_string))) {
 508                 wchar_t first = tk.txt.txt[0];
 509                 reset_token(state, &tk);
 510                 ch = get_char(state);
 511                 tk.num = TK_error;
 512                 while (!at_eon(state) && !is_newline(ch)) {
 513                         ch = get_char(state);
 514                         if (ch == first) {
 515                                 tk.num = TK_string;
 516                                 break;
 517                         }
 518                         if (is_newline(ch)) {
 519                                 unget_char(state);
 520                                 break;
 521                         }
 522                 }
 523                 close_token(state, &tk);
 524                 return tk;
 525         }
 526
 527 ### Comments
 528
 529 Single line comments may start with '`//`' or '`#`' providing that these
 530 are not known marks.  They continue to the end of the line.
 531
 532 Block comments start with '`/*`' if this is not a known mark.  They
 533 continue to the first occurrence of '`*/`' and may not contain any
 534 occurrence of '`/*`'.
 535
 536 Block comments can be wholly within one line or can continue over
 537 multiple lines.  The multi-line version should be followed immediately
 538 by a newline.  The Linux kernel contains over 285000 multi-line
 539 comments are only 34 are followed by characters other than white space
 540 (which should be removed) or a backslash (only needed in macros).  So
 541 it would not suffer from this rule.
 542
 543 These two comment types are reported as two separate token types, and
 544 consequently can be ignored separately.  When ignored a comment is
 545 still parsed, but is discarded.
 546
 547 ###### token types
 548         TK_line_comment,
 549         TK_block_comment,
 550
 551 ###### internal functions
 552         static int is_line_comment(struct text txt)
 553         {
 554                 return (txt.len >= 1 && txt.txt[0] == '#') ||
 555                        (txt.len >= 2 && txt.txt[0] == '/' &&
 556                                         txt.txt[1] == '/');
 557         }
 558
 559         static int is_block_comment(struct text txt)
 560         {
 561                 return txt.len >= 2 && txt.txt[0] == '/' &&
 562                        txt.txt[1] == '*';
 563         }
 564
 565 #### Single line comments
 566
 567 A single-line comment continues up to, but not including the newline
 568 or end of node.
 569
 570 ###### parse comment
 571
 572         if (is_line_comment(tk.txt)) {
 573                 while (!is_newline(ch) && !at_eon(state))
 574                         ch = get_char(state);
 575                 if (is_newline(ch))
 576                         unget_char(state);
 577                 close_token(state, &tk);
 578                 tk.num = TK_line_comment;
 579                 if (ignored & (1 << TK_line_comment))
 580                         continue;
 581                 return tk;
 582         }
 583
 584 #### Block comments
 585
 586 The token text collected so far could exceed the comment, so we need
 587 to reset it first.
 588
 589 If we find an embedded `/*` we reset to just before the '/' and report
 590 an error.  That way the next thing to be parsed will be the rest of
 591 the comment.  This requires a double unget, so we need to save/restore
 592 the unget state (explained later).
 593
 594 ###### parse comment
 595
 596         if (is_block_comment(tk.txt)) {
 597                 wchar_t prev;
 598                 int newlines = 0;
 599                 reset_token(state, &tk);
 600                 get_char(state);
 601                 get_char(state);
 602                 save_unget_state(state);
 603                 ch = get_char(state);
 604                 prev = 0;
 605                 while (!at_eon(state) &&
 606                        (prev != '/' || ch != '*') &&
 607                        (prev != '*' || ch != '/')) {
 608                         if (is_newline(ch))
 609                                 newlines = 1;
 610                         prev = ch;
 611                         save_unget_state(state);
 612                         ch = get_char(state);
 613                 }
 614                 close_token(state, &tk);
 615                 if (at_eon(state)) {
 616                         tk.num = TK_error;
 617                         return tk;
 618                 }
 619                 if (prev == '/') {
 620                         /* embedded.  Need to unget twice! */
 621                         restore_unget_state(state);
 622                         unget_char(state);
 623                         tk.num = TK_error;
 624                         return tk;
 625                 }
 626                 tk.num = TK_block_comment;
 627                 if (newlines && !(ignored & (1<<TK_newline))) {
 628                         /* next char must be newline */
 629                         ch = get_char(state);
 630                         unget_char(state);
 631                         if (!is_newline(ch))
 632                                 tk.num = TK_error;
 633                 }
 634                 if (tk.num == TK_error ||
 635                     !(ignored & (1 << TK_block_comment)))
 636                         return tk;
 637                 continue;
 638         }
 639
 640 ### Indents, Newlines, and White Space.
 641
 642 Normally white space is ignored.  However newlines can be important as
 643 can indents, which are either after a newline or at the start of a
 644 node (detected by `at_son()`);
 645
 646 ###### exported functions
 647         static inline int is_newline(wchar_t ch)
 648         {
 649                 return ch == '\n' || ch == '\f' || ch == '\v';
 650         }
 651
 652 ###### white space
 653         if (ch <= ' ' && !is_newline(ch)
 654             && ! at_son(state))
 655                 continue;
 656
 657 If a line starts with more white-space than the previous non-blank
 658 line - or if the first non-blank line in the document starts with any
 659 white-space - then an "IN" is reported at the start of the line.
 660
 661 Before the next non-blank line which starts with less white space, or
 662 at the latest at the end of the document, a matching "OUT" token
 663 is reported.  There will always be an exact match between "IN" and
 664 "OUT" tokens.
 665
 666 It is possible for "OUT" to be followed (almost) immediately by an
 667 "IN".  This happens if, for example, the indent of three consecutive
 668 lines are 0, 8, 4 spaces.  Before the second line we report an
 669 "IN".  Before the third line we must report an "OUT", as 4 is less
 670 than 8, then also an Ident as 4 is greater than 0.
 671
 672 ###### token types
 673         TK_in,
 674         TK_out,
 675
 676 For the purpose of measuring the length of white space, a tab adds at
 677 least one space, and rounds up to a multiple of 8.
 678
 679 ###### exported functions
 680         static inline int indent_tab(int indent)
 681         {
 682                 return (indent|7)+1;
 683         }
 684
 685 We need to track the current levels of indent.  This requires some
 686 sort of stack as indent levels are pushed on and popped off.  In
 687 practice this stack is unlikely to often exceed 5 so we will used a
 688 fixed stack of 20 indent levels.  More than this will be silently
 689 ignored.
 690
 691 ###### state fields
 692         int     indent_level;
 693         int     indent_sizes[20];
 694
 695 #### Newlines
 696
 697 Newlines can optionally be reported.  Newlines within a block comment
 698 or a multi-line string are not reported separately, but each of these
 699 must be followed immediately by a newline so these constructs cannot
 700 hide the fact that a newline was present.
 701
 702 When indents are being reported, the Newline which would normally be
 703 reported immediately before the "IN" is delayed until after the
 704 matching "OUT".  This makes an indented section act like a
 705 continuation of the previous line to some extent.
 706
 707 A blank line would normally be reported simply as two consecutive Newline
 708 tokens.  However if the subsequent line is indented (and indents are being
 709 reported) then the right thing to do is less obvious as Newlines should be
 710 delayed - but how many Newlines?
 711
 712 The approach we will take is to report the extra Newlines immediately after
 713 the IN token, so the blank line is treated as though it were an indented
 714 blank line.
 715
 716 ###### token types
 717         TK_newline,
 718
 719 If we find a newline or white space at the start of a block, we keep
 720 collecting spaces, tabs, and newlines until we find some real text.
 721 Then depending on the indent we generate some number of tokens.  These
 722 will be a sequence of "Newline OUT" pairs representing a decrease
 723 in indent, then either a Newline or an IN depending on whether the
 724 next line is indented, then zero or more Newlines representing all the
 725 blank lines that have been skipped.
 726
 727 When a Newline leads to the next block of code there is a question of
 728 whether the various Newline and OUT/IN tokens should appear to
 729 pbelong to the earlier or later block.  This is addressed by processing
 730 the tokens in two stages based on the relative indent levels of the
 731 two blocks (each block has a base indent to which the actual indents
 732 are added).
 733
 734 Any "Newline OUT" pairs needed to reduce the current indent to the
 735 maximum of the base indents of the old and new blocks are generated
 736 against the old block.  Then if the next block does not have an
 737 increased indent, one more "Newline" is generated.
 738
 739 If further "Newline OUT" pairs are needed to get to the indent
 740 level of the 'next' block, they are generated against that block,
 741 though the first Newline is suppressed (it having already been
 742 generated).
 743
 744 Finally the Newline or IN for the first line of the new block is
 745 generated, unless the Newline needs to be suppressed because it
 746 appeared at the end of the previous block.
 747
 748 This means that a block may start with an OUT or an IN, but
 749 will only start with a Newline if it actually starts with a blank
 750 line.
 751
 752 We will need to represent in the `token_state` where in this sequence
 753 of delayed tokens we are.  As `state.col` records the target indent we
 754 don't need to record how many OUTs or INs are needed.  We do
 755 need to record the number of blank lines, and which of Newline and
 756 OUT is needed next in the initial sequence of pairs.
 757
 758 For this we store one more than the number of blank lines as
 759 `delayed_lines` and a flag for `out_next`.
 760
 761 ###### state fields
 762         int check_indent;
 763         int delayed_lines;
 764         int out_next;
 765
 766 Generating these tokens involve two separate pieces of code.
 767
 768 Firstly we need to recognise white space and count the indents and
 769 newlines.  These are recorded in the above state fields.
 770
 771 Separately we need, on each call to `token_next`, we need to check if
 772 there are some delayed tokens and if so we need to advance the state
 773 information and return one token.
 774
 775 ###### white space
 776         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 777                 int newlines = 0;
 778                 int was_son = at_son(state);
 779                 if (ignored & (1<<TK_in)) {
 780                         if (!is_newline(ch))
 781                                 continue;
 782                         if (ignored & (1<<TK_newline))
 783                                 continue;
 784                         tk.num = TK_newline;
 785                         close_token(state, &tk);
 786                         return tk;
 787                 }
 788                 // Indents are needed, so check all white space.
 789                 while (ch <= ' ' && !at_eon(state)) {
 790                         if (is_newline(ch))
 791                                 newlines += 1;
 792                         ch = get_char(state);
 793                 }
 794                 if (at_eon(state)) {
 795                         newlines += 1;
 796                         if (state->node->next &&
 797                             state->node->next->indent > state->node->indent)
 798                                 state->col = state->node->next->indent;
 799                         else
 800                                 state->col = state->node->indent;
 801                 } else
 802                         unget_char(state);
 803                 state->delayed_lines = newlines;
 804                 state->out_next = was_son;
 805                 state->check_indent = 1;
 806                 continue;
 807         }
 808
 809
 810 ###### delayed tokens
 811
 812         if (state->check_indent || state->delayed_lines) {
 813                 if (state->col < state->indent_sizes[state->indent_level]) {
 814                         if (!state->out_next &&
 815                             !(ignored & (1<<TK_newline))) {
 816                                 state->out_next = 1;
 817                                 tk.num = TK_newline;
 818                                 return tk;
 819                         }
 820                         state->indent_level -= 1;
 821                         state->out_next = 0;
 822                         tk.num = TK_out;
 823                         return tk;
 824                 }
 825                 if (state->col > state->indent_sizes[state->indent_level] &&
 826                     state->indent_level < sizeof(state->indent_sizes)-1) {
 827                         state->indent_level += 1;
 828                         state->indent_sizes[state->indent_level] = state->col;
 829                         state->delayed_lines -= 1;
 830                         tk.num = TK_in;
 831                         return tk;
 832                 }
 833                 state->check_indent = 0;
 834                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 835                         tk.num = TK_newline;
 836                         state->delayed_lines -= 1;
 837                         return tk;
 838                 }
 839                 state->delayed_lines = 0;
 840                 continue;
 841         }
 842
 843 ### End of File
 844
 845 After the last newline in the file has been processed, a special
 846 end-of-file token will be returned.  any further attempts to get more
 847 tokens will continue to return the same end-of-file token.
 848
 849 ###### token types
 850         TK_eof,
 851
 852
 853 ###### white space
 854         if (ch == WEOF) {
 855                 if (state->col) {
 856                         state->col = 0;
 857                         state->check_indent = 1;
 858                         continue;
 859                 }
 860                 tk.num = TK_eof;
 861                 return tk;
 862         }
 863
 864 ### Unknown Marks, or errors.
 865
 866 We have now handled all the possible known mark-like tokens.
 867 If the token we have is not empty and `TK_mark` is allowed,
 868 we have an unknown mark, otherwise this must be an error.
 869
 870 ###### unknown mark
 871         /* one unknown character */
 872         close_token(state, &tk);
 873         tk.num = TK_error;
 874         return tk;
 875
 876 ## Tools For The Task
 877
 878 You may have noticed that are few gaps we left in the above -
 879 functions used without first defining them.  Doing so above would have
 880 broken the flow.
 881
 882 ### Character by character
 883
 884 As we walk through the various `code_node`s we need to process whole
 885 Unicode codepoints, and keep track of which line and column we are on.
 886 We will assume for now that any printing character uses one column,
 887 though that is not true in general.
 888
 889 As the text in a `code_node` may include an indent that identifies it as
 890 being code, we need to be careful to strip that.  The `code_node` has
 891 a flag that tells us whether or not we need to strip.
 892
 893 ###### includes
 894         #include <memory.h>
 895
 896 ###### state fields
 897         struct code_node *node;
 898         int    offset;
 899         int    line;
 900         int    col;
 901
 902 ###### internal functions
 903
 904         static int do_strip(struct token_state *state)
 905         {
 906                 int indent = 0;
 907                 if (state->node->needs_strip) {
 908                         int n = 4;
 909                         while (n && state->node->code.txt[state->offset] == ' ') {
 910                                 indent += 1;
 911                                 state->offset += 1;
 912                                 n -= 1;
 913                         }
 914                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
 915                                 indent = indent_tab(indent);
 916                                 state->offset += 1;
 917                                 n -= 4;
 918                         }
 919                 }
 920                 return indent;
 921         }
 922
 923         static wint_t get_char(struct token_state *state)
 924         {
 925                 wchar_t next;
 926                 size_t n;
 927                 mbstate_t mbstate;
 928
 929                 if (state->node == NULL)
 930                         return WEOF;
 931                 if (state->node->code.len <= state->offset) {
 932                         do
 933                                 state->node = state->node->next;
 934                         while (state->node && state->node->code.txt == NULL);
 935                         state->offset = 0;
 936                         if (state->node == NULL)
 937                                 return WEOF;
 938                         state->line = state->node->line_no;
 939                         state->col = do_strip(state);
 940                 }
 941
 942                 ## before get_char
 943
 944                 memset(&mbstate, 0, sizeof(mbstate));
 945
 946                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 947                             state->node->code.len - state->offset,
 948                             &mbstate);
 949                 if (n == -2 || n == 0) {
 950                         /* Not enough bytes - not really possible */
 951                         next = '\n';
 952                         state->offset = state->node->code.len;
 953                 } else if (n == -1) {
 954                         /* error */
 955                         state->offset += 1;
 956                         next = 0x7f; // an illegal character
 957                 } else
 958                         state->offset += n;
 959
 960                 if (next >= ' ') {
 961                         state->col += 1;
 962                 } else if (is_newline(next)) {
 963                         state->line += 1;
 964                         state->col = do_strip(state);
 965                 } else if (next == '\t') {
 966                         state->col = indent_tab(state->col);
 967                 }
 968                 return next;
 969         }
 970
 971 We will sometimes want to "unget" the last character as it needs to be
 972 considered again as part of the next token.  So we need to store a
 973 'previous' version of all metadata.
 974
 975 ###### state fields
 976         int    prev_offset;
 977         int    prev_line;
 978         int    prev_col;
 979
 980 ###### before get_char
 981         state->prev_offset = state->offset;
 982         state->prev_line   = state->line;
 983         state->prev_col    = state->col;
 984
 985 ###### internal functions
 986
 987         static void unget_char(struct token_state *state)
 988         {
 989                 if (state->node) {
 990                         state->offset = state->prev_offset;
 991                         state->line   = state->prev_line;
 992                         state->col    = state->prev_col;
 993                 }
 994         }
 995
 996 We occasionally need a double-unget, particularly for numbers and
 997 block comments.  We don't impose this cost on all scanning, but
 998 require those code sections that need it to call `save_unget_state`
 999 before each `get_char`, and then `restore_unget_state` when a
1000 double-unget is needed.
1001
1002 ###### state fields
1003         int     prev_offset2;
1004         int     prev_line2;
1005         int     prev_col2;
1006
1007 ###### internal functions
1008         static void save_unget_state(struct token_state *state)
1009         {
1010                 state->prev_offset2 = state->prev_offset;
1011                 state->prev_line2 = state->prev_line;
1012                 state->prev_col2 = state->prev_col;
1013         }
1014
1015         static void restore_unget_state(struct token_state *state)
1016         {
1017                 state->prev_offset = state->prev_offset2;
1018                 state->prev_line = state->prev_line2;
1019                 state->prev_col = state->prev_col2;
1020         }
1021
1022 At the start of a token we don't want to be at the end of a code block
1023 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
1024 single character.  This will move into the next non-empty code block
1025 and leave the current pointer at the start of it.
1026
1027 This has to happen _after_ dealing with delayed tokens as some of them
1028 must appear in the previous node.  When we do this, we need to reset
1029 the data in the token.
1030
1031 ###### delayed tokens
1032         if (at_eon(state)) {
1033                 get_char(state);
1034                 unget_char(state);
1035                 tk.node = state->node;
1036                 if (state->node)
1037                         tk.txt.txt = state->node->code.txt + state->offset;
1038                 tk.line = state->line;
1039                 tk.col = state->col;
1040                 tk.txt.len = 0;
1041         }
1042
1043 ### Managing tokens
1044
1045 The current token is initialized to line up with the first character
1046 that we 'get' for each token.  When we have, or might have, a full
1047 token we can call `close_token` to set the `len` of the token
1048 appropriately.  This can safely be called multiple times.
1049
1050 Finally we occasionally (for single-line strings and block comments)
1051 need to reset to the beginning of the current token as we might have
1052 parsed too much already.  For that there is `reset_token`.
1053
1054 ###### one token
1055         tk.node = state->node;
1056         if (state->node)
1057                 tk.txt.txt = state->node->code.txt + state->offset;
1058         tk.line = state->line;
1059         tk.col = state->col;
1060         tk.txt.len = 0;
1061
1062 ###### internal functions
1063
1064         static void close_token(struct token_state *state,
1065                                 struct token *tk)
1066         {
1067                 tk->txt.len = (state->node->code.txt + state->offset)
1068                               - tk->txt.txt;
1069         }
1070
1071         static void reset_token(struct token_state *state, struct token *tok)
1072         {
1073                 state->prev_line = tok->line;
1074                 state->prev_col = tok->col;
1075                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1076                 unget_char(state);
1077                 tok->txt.len = 0;
1078         }
1079
1080
1081 Tokens make not cross into the next `code_node`, and some tokens can
1082 include the newline at the and of a `code_node`, we must be able to
1083 easily check if we have reached the end.  Equally we need to know if
1084 we are at the start of a node, as white space is treated a little
1085 differently there.
1086
1087 ###### internal functions
1088
1089         static int at_son(struct token_state *state)
1090         {
1091                 return state->offset == 0;
1092         }
1093
1094         static int at_eon(struct token_state *state)
1095         {
1096                 // at end-of-node ??
1097                 return state->node == NULL ||
1098                        state->offset >= state->node->code.len;
1099         }
1100
1101 ### Find a known word
1102
1103 As the known-word list is sorted we can use a simple binary search.
1104 Following the pattern established in "mdcode", we will use a `struct
1105 text` with start and length to represent the code fragment we are
1106 searching for.
1107
1108 ###### internal functions
1109         static int find_known(struct token_config *conf, struct text txt)
1110         {
1111                 int lo = 0;
1112                 int hi = conf->known_count;
1113
1114                 while (lo + 1 < hi) {
1115                         int mid = (lo + hi) / 2;
1116                         int cmp = strncmp(conf->words_marks[mid],
1117                                           txt.txt, txt.len);
1118                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1119                                 cmp = 1;
1120                         if (cmp <= 0)
1121                                 lo = mid;
1122                         else
1123                                 hi = mid;
1124                 }
1125                 if (strncmp(conf->words_marks[lo],
1126                            txt.txt, txt.len) == 0
1127                     && conf->words_marks[lo][txt.len] == 0)
1128                         return lo;
1129                 else
1130                         return -1;
1131         }
1132
1133 ### Bringing it all together
1134
1135 Now we have all the bits there is just one section missing:  combining
1136 all the token parsing code into one block.
1137
1138 The handling of delayed tokens (Newlines, INs, OUTs) must come
1139 first before we try getting another character.
1140
1141 Then we parse all the test, making sure that we check for known marks
1142 before strings and comments, but unknown marks after strings and comments.
1143
1144 This block of code will either return a token, or will choose to
1145 ignore one, in which case it will `continue` around to the top of the
1146 loop.
1147
1148 ###### one token
1149         ## delayed tokens
1150
1151         ch = get_char(state);
1152
1153         ## white space
1154         ## parse number
1155         ## parse word
1156         ## parse mark
1157
1158 ### Start and stop
1159
1160 As well as getting tokens, we need to be able to create the
1161 `token_state` to start with, and discard it later.
1162
1163 ###### includes
1164         #include <malloc.h>
1165
1166 ###### main functions
1167         struct token_state *token_open(struct code_node *code, struct
1168                                        token_config *conf)
1169         {
1170                 struct token_state *state = malloc(sizeof(*state));
1171                 memset(state, 0, sizeof(*state));
1172                 state->node = code;
1173                 state->line = code->line_no;
1174                 state->col = do_strip(state);
1175                 state->conf = conf;
1176                 return state;
1177         }
1178         void token_close(struct token_state *state)
1179         {
1180                 free(state);
1181         }
1182
1183 ###### exported functions
1184         struct token_state *token_open(struct code_node *code, struct
1185                                        token_config *conf);
1186         void token_close(struct token_state *state);
1187
1188 ### Trace tokens
1189
1190 Getting tokens is the main thing but it is also useful to be able to
1191 print out token information, particularly for tracing and testing.
1192
1193 Known tokens are printed verbatim.  Other tokens are printed as
1194 `type(content)` where content is truncated to a given number of characters.
1195
1196 The function for printing a truncated string (`text_dump`) is also exported
1197 so that it can be used to tracing processed strings too.
1198
1199 ###### includes
1200         #include <stdio.h>
1201
1202 ###### exported functions
1203         void token_trace(FILE *f, struct token tok, int max);
1204         void text_dump(FILE *f, struct text t, int max);
1205
1206 ###### main functions
1207
1208         void text_dump(FILE *f, struct text txt, int max)
1209         {
1210                 int i;
1211                 if (txt.len > max)
1212                         max -= 2;
1213                 else
1214                         max = txt.len;
1215                 for (i = 0; i < max; i++) {
1216                         char c = txt.txt[i];
1217                         if (c < ' ' || c > '~')
1218                                 fprintf(f, "\\x%02x", c & 0xff);
1219                         else if (c == '\\')
1220                                 fprintf(f, "\\\\");
1221                         else
1222                                 fprintf(f, "%c", c);
1223                 }
1224                 if (i < txt.len)
1225                         fprintf(f, "..");
1226         }
1227
1228         void token_trace(FILE *f, struct token tok, int max)
1229         {
1230                 static char *types[] = {
1231                         [TK_ident] = "ident",
1232                         [TK_mark] = "mark",
1233                         [TK_number] = "number",
1234                         [TK_string] = "string",
1235                         [TK_multi_string] = "mstring",
1236                         [TK_line_comment] = "lcomment",
1237                         [TK_block_comment] = "bcomment",
1238                         [TK_in] = "in",
1239                         [TK_out] = "out",
1240                         [TK_newline] = "newline",
1241                         [TK_eof] = "eof",
1242                         [TK_error] = "ERROR",
1243                         };
1244
1245                 switch (tok.num) {
1246                 default: /* known word or mark */
1247                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1248                         break;
1249                 case TK_in:
1250                 case TK_out:
1251                 case TK_newline:
1252                 case TK_eof:
1253                         /* No token text included */
1254                         fprintf(f, "%s()", types[tok.num]);
1255                         break;
1256                 case TK_ident:
1257                 case TK_mark:
1258                 case TK_number:
1259                 case TK_string:
1260                 case TK_multi_string:
1261                 case TK_line_comment:
1262                 case TK_block_comment:
1263                 case TK_error:
1264                         fprintf(f, "%s(", types[tok.num]);
1265                         text_dump(f, tok.txt, max);
1266                         fprintf(f, ")");
1267                         break;
1268                 }
1269         }
1270
1271 ### And there we have it
1272
1273 We now have all the library functions defined for reading and printing
1274 tokens.  Now we just need C files to store them, and a mk file to make them.
1275
1276 ###### File: scanner.h
1277         ## public types
1278         ## exported functions
1279
1280 ###### File: libscanner.c
1281         ## includes
1282         #include "scanner.h"
1283         ## private types
1284         ## internal functions
1285         ## main functions
1286
1287 ###### File: scanner.mk
1288
1289         CFLAGS += -Wall -g
1290         all ::
1291         scanner.mk scanner.h libscanner.c : scanner.mdc
1292                 ./md2c scanner.mdc
1293         all :: libscanner.o
1294         libscanner.o : libscanner.c
1295                 $(CC) $(CFLAGS) -c libscanner.c
1296
1297 ## Processing numbers
1298
1299 Converting a `TK_number` token to a numerical value is a slightly
1300 higher level task than lexical analysis, and slightly lower than
1301 grammar parsing, so put it here - as an index if you like.
1302
1303 Importantly it will be used by the same testing rig that is used for
1304 testing the token scanner.
1305
1306 The numeric value that we will convert all numbers into is the `mpq_t`
1307 from the GNU high precision number library "libgmp".
1308
1309 ###### number includes
1310         #include <gmp.h>
1311         #include "mdcode.h"
1312
1313 Firstly we need to be able to parse a string of digits in a given base
1314 and possibly with a decimal marker.  We store this in an `mpz_t`
1315 integer and report the number of digits after the decimal mark.
1316
1317 On error we return zero and ensure that the 'mpz_t' has been freed, or
1318 had never been initialised.
1319
1320 ###### number functions
1321
1322         static int parse_digits(mpz_t num, struct text tok, int base,
1323                                 int *placesp)
1324         {
1325                 /* Accept digits up to 'base', ignore '_' and
1326                  * ' ' if they appear between two legal digits,
1327                  * and if `placesp` is not NULL, allow a single
1328                  * '.' or ',' and report the number of digits
1329                  * beyond there.
1330                  * Return number of characters processed (p),
1331                  * or 0 if something illegal was found.
1332                  */
1333                 int p;
1334                 int decimal = -1; // digits after marker
1335                 enum {Digit, Space, Other} prev = Other;
1336                 int digits = 0;
1337
1338                 for (p = 0; p < tok.len; p++) {
1339                         int dig;
1340                         char c = tok.txt[p];
1341
1342                         if (c == '_' || c == ' ') {
1343                                 if (prev != Digit)
1344                                         goto bad;
1345                                 prev = Space;
1346                                 continue;
1347                         }
1348                         if (c == '.' || c == ',') {
1349                                 if (prev != Digit)
1350                                         goto bad;
1351                                 if (!placesp || decimal >= 0)
1352                                         return p-1;
1353                                 decimal = 0;
1354                                 prev = Other;
1355                                 continue;
1356                         }
1357                         if (isdigit(c))
1358                                 dig = c - '0';
1359                         else if (isupper(c))
1360                                 dig = 10 + c - 'A';
1361                         else if (islower(c))
1362                                 dig = 10 + c - 'a';
1363                         else
1364                                 dig = base;
1365                         if (dig >= base) {
1366                                 if (prev == Space)
1367                                         p--;
1368                                 break;
1369                         }
1370                         prev = Digit;
1371                         if (digits)
1372                                 mpz_mul_ui(num, num, base);
1373                         else
1374                                 mpz_init(num);
1375                         digits += 1;
1376                         mpz_add_ui(num, num, dig);
1377                         if (decimal >= 0)
1378                                 decimal++;
1379                 }
1380                 if (digits == 0)
1381                         return 0;
1382                 if (placesp) {
1383                         if (decimal >= 0)
1384                                 *placesp = decimal;
1385                         else
1386                                 *placesp = 0;
1387                 }
1388                 return p;
1389         bad:
1390                 if (digits)
1391                         mpz_clear(num);
1392                 return 0;
1393         }
1394
1395 ###### number includes
1396         #include <ctype.h>
1397
1398 To parse a full number we need to consider the optional base, the
1399 mantissa, and the optional exponent.  We will treat these one at a
1400 time.
1401
1402 The base is indicated by a letter after a leading zero, which must be
1403 followed by a base letter or a period.  The base also determines the
1404 character which will mark an exponent.
1405
1406 ###### number vars
1407         int base = 10;
1408         char expc = 'e';
1409
1410 ###### parse base
1411
1412         if (tok.txt[0] == '0' && tok.len > 1) {
1413                 int skip = 0;
1414                 switch(tok.txt[1]) {
1415                 case 'x':
1416                 case 'X':
1417                         base = 16;
1418                         skip = 2;
1419                         expc = 'p';
1420                         break;
1421                 case 'o':
1422                 case 'O':
1423                         base = 8;
1424                         skip = 2;
1425                         expc = 'p';
1426                         break;
1427                 case 'b':
1428                 case 'B':
1429                         base = 2;
1430                         skip = 2;
1431                         expc = 'p';
1432                         break;
1433                 case '0':
1434                 case '1':
1435                 case '2':
1436                 case '3':
1437                 case '4':
1438                 case '5':
1439                 case '6':
1440                 case '7':
1441                 case '8':
1442                 case '9':
1443                 case '_':
1444                 case ' ':
1445                         // another digit is not permitted
1446                         // after a zero.
1447                         return 0;
1448                 default:
1449                         // must be decimal marker or trailing
1450                         // letter, which are OK;
1451                         break;
1452                 }
1453                 tok.txt += skip;
1454                 tok.len -= skip;
1455         }
1456
1457 After the base is the mantissa, which may contain a decimal mark, so
1458 we need to record the number of places.  We won't impose the number of
1459 places until we have the exponent as well.
1460
1461 ###### number vars
1462         int places =0;
1463         mpz_t mant;
1464         int d;
1465
1466 ###### parse mantissa
1467
1468         d = parse_digits(mant, tok, base, &places);
1469         if (d == 0)
1470                 return 0;
1471         tok.txt += d;
1472         tok.len -= d;
1473         mpq_init(num);
1474         mpq_set_z(num, mant);
1475         mpz_clear(mant);
1476
1477 After the mantissa number may come an exponent which may be positive
1478 or negative.  We assume at this point that we have seen the exponent
1479 character `expc`.
1480
1481 ###### number vars
1482         long lexp = 0;
1483         mpz_t exp;
1484         int esign = 1;
1485
1486 ###### parse exponent
1487         if (tok.len > 1) {
1488                 if (tok.txt[0] == '+') {
1489                         tok.txt++;
1490                         tok.len--;
1491                 } else if (tok.txt[0] == '-') {
1492                         esign = -1;
1493                         tok.txt++;
1494                         tok.len--;
1495                 }
1496         }
1497         d = parse_digits(exp, tok, 10, NULL);
1498         if (d == 0) {
1499                 mpq_clear(num);
1500                 return 0;
1501         }
1502         if (!mpz_fits_slong_p(exp)) {
1503                 mpq_clear(num);
1504                 mpz_clear(exp);
1505                 return 0;
1506         }
1507         lexp = mpz_get_si(exp) * esign;
1508         mpz_clear(exp);
1509         tok.txt += d;
1510         tok.len -= d;
1511
1512
1513 Now that we have the mantissa and the exponent we can multiply them
1514 together, also allowing for the number of digits after the decimal
1515 mark.
1516
1517 For base 10, we simply subtract the decimal places from the exponent.
1518 For the other bases, as the exponent is alway based on 2, even for
1519 octal and hex, we need a bit more detail.
1520 We then recover the sign from the exponent, as division is quite
1521 different from multiplication.
1522
1523 ###### calc exponent
1524         switch (base) {
1525         case 10:
1526         case 2:
1527                 lexp -= places;
1528                 break;
1529         case 16:
1530                 lexp -= 4*places;
1531                 break;
1532         case 8:
1533                 lexp -= 3*places;
1534                 break;
1535         }
1536         if (lexp < 0) {
1537                 lexp = -lexp;
1538                 esign = -1;
1539         } else
1540                 esign = 1;
1541
1542 Imposing the exponent on the number is also very different for base 10
1543 than for the others.  For the binary shift `gmp` provides a simple
1544 function.  For base 10 we use something like Russian Peasant
1545 Multiplication.
1546
1547 ###### calc exponent
1548         if (expc == 'e') {
1549                 mpq_t tens;
1550                 mpq_init(tens);
1551                 mpq_set_ui(tens, 10, 1);
1552                 while (1) {
1553                         if (lexp & 1) {
1554                                 if (esign > 0)
1555                                         mpq_mul(num, num, tens);
1556                                 else
1557                                         mpq_div(num, num, tens);
1558                         }
1559                         lexp >>= 1;
1560                         if (lexp == 0)
1561                                 break;
1562                         mpq_mul(tens, tens, tens);
1563                 }
1564                 mpq_clear(tens);
1565         } else {
1566                 if (esign > 0)
1567                         mpq_mul_2exp(num, num, lexp);
1568                 else
1569                         mpq_div_2exp(num, num, lexp);
1570         }
1571
1572 Now we are ready to parse a number: the base, mantissa, and exponent.
1573 If all goes well we check for the possible trailing letters and
1574 return.  Return value is 1 for success and 0 for failure.
1575
1576
1577 ###### number functions
1578         int number_parse(mpq_t num, char tail[3], struct text tok)
1579         {
1580                 ## number vars
1581                 int i;
1582
1583                 ## parse base
1584                 ## parse mantissa
1585                 if (tok.len > 1 && (tok.txt[0] == expc ||
1586                                     tok.txt[0] == toupper(expc))) {
1587                         tok.txt++;
1588                         tok.len--;
1589                         ## parse exponent
1590                 }
1591                 ## calc exponent
1592
1593                 for (i = 0; i < 2; i++) {
1594                         if (tok.len <= i)
1595                                 break;
1596                         if (!isalpha(tok.txt[i]))
1597                                 goto err;
1598                         tail[i] = tok.txt[i];
1599                 }
1600                 tail[i] = 0;
1601                 if (i == tok.len)
1602                         return 1;
1603         err:
1604                 mpq_clear(num);
1605                 return 0;
1606         }
1607
1608 Number parsing goes in `libnumber.c`
1609
1610 ###### File: libnumber.c
1611
1612         #include <unistd.h>
1613         #include <stdlib.h>
1614
1615         ## number includes
1616         ## number functions
1617
1618 ###### File: number.h
1619         int number_parse(mpq_t num, char tail[3], struct text tok);
1620
1621 ###### File: scanner.mk
1622         all :: libnumber.o
1623         libnumber.o : libnumber.c
1624                 $(CC) $(CFLAGS) -c libnumber.c
1625
1626 ## Processing strings
1627
1628 Both `TK_string` and `TK_multi_string` require post-processing which
1629 can be one of two types: literal or with escapes processed.
1630 Even literal processing is non-trivial as the file may contain indents
1631 which need to be stripped.
1632
1633 Errors can only occur when processing escapes.  Any unrecognised
1634 character following the escape character will cause an error.
1635
1636 Processing escapes and striping indents can only make the string
1637 shorter, not longer, so we allocate a buffer which is the same size as
1638 the string and process into that.
1639
1640 To request escape processing, we pass the character we want to use for
1641 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1642
1643 ###### string main
1644         int string_parse(struct token *tok, char escape,
1645                          struct text *str, char tail[3])
1646         {
1647                 ## string vars
1648                 struct text t = tok->txt;
1649
1650                 str->txt = NULL;
1651                 ## strip tail
1652                 if (tok->num == TK_string) {
1653                         ## strip single
1654                 } else {
1655                         ## strip multi
1656                 }
1657                 str->txt = malloc(t.len);
1658                 str->len = 0;
1659
1660                 ## process string
1661                 return 1;
1662         err:
1663                 free(str->txt);
1664                 str->txt = NULL;
1665                 return 0;
1666         }
1667
1668 ### strip tail
1669
1670 The tail of the string can be 0, 1, or 2 letters
1671
1672         i = t.len;
1673         if (i >= 0 && isalpha(t.txt[i-1]))
1674                 i -= 1;
1675         if (i >= 0 && isalpha(t.txt[i-1]))
1676                 i -= 1;
1677         strncpy(tail, t.txt+i, t.len-i);
1678         tail[t.len-i] = 0;
1679         t.len = i;
1680
1681 ###### string vars
1682         int i;
1683
1684 ### strip single
1685
1686 Stripping the quote of a single-line string is trivial.
1687 The only part that is at all interesting is that quote character must
1688 be remembered.
1689
1690         quote = t.txt[0];
1691         if (t.txt[t.len-1] != quote)
1692                 goto err;
1693         t.txt += 1;
1694         t.len -= 2;
1695
1696 ###### string vars
1697         char quote;
1698
1699 ### strip multi
1700
1701 For a multi-line string we have a little more work to do.  We need to
1702 remove 3 quotes, not 1, and need to count the indent of the close
1703 quote as it will need to be stripped from all lines.
1704
1705         quote = t.txt[0];
1706         if (t.len < 7 ||
1707             t.txt[1] != quote || t.txt[2] != quote ||
1708             !is_newline(t.txt[3]))
1709                 goto err;
1710         t.txt += 4;
1711         t.len -= 4;
1712         i = t.len;
1713         if (i <= 0 || t.txt[i-1] != quote)
1714                 goto err;
1715         i -= 1;
1716         if (i <= 0 || t.txt[i-1] != quote)
1717                 goto err;
1718         i -= 1;
1719         if (i <= 0 || t.txt[i-1] != quote)
1720                 goto err;
1721         i -= 1;
1722         t.len = i;
1723         while (i > 0 && !is_newline(t.txt[i-1]))
1724                 i--;
1725         indent = 0;
1726         while (i < t.len) {
1727                 if (t.txt[i] == ' ')
1728                         indent += 1;
1729                 if (t.txt[i] == '\t')
1730                         indent = indent_tab(indent);
1731                 i++;
1732         }
1733
1734 ###### string vars
1735         int indent = 0;
1736
1737 ### process string
1738
1739 Now we just take one byte at a time. trans-ASCII unicode won't look
1740 like anything we are interested in so it will just be copied byte by
1741 byte.
1742
1743         cp = str->txt;
1744         at_sol = 1;
1745         for (i = 0; i < t.len; i++) {
1746                 char c;
1747                 if (at_sol) {
1748                         at_sol = 0;
1749                         ## strip indent
1750                         if (i >= t.len)
1751                                 break;
1752                 }
1753                 c = t.txt[i];
1754                 if (c != escape) {
1755                         *cp = c;
1756                         cp += 1;
1757                         if (is_newline(c))
1758                                 at_sol = 1;
1759                 } else if (i+1 >= t.len) {
1760                         // escape and end of string
1761                         goto err;
1762                 } else {
1763                         i += 1;
1764                         c = t.txt[i];
1765                         ## parse escape
1766                 }
1767         }
1768         str->len = cp - str->txt;
1769
1770 ###### string vars
1771         char *cp;
1772         int at_sol;
1773
1774 ### strip indent
1775
1776 Every time we find a start of line, we strip spaces and tabs until the
1777 required indent is found.
1778
1779         int skipped = 0;
1780         while (i < t.len && skipped < indent) {
1781                 c = t.txt[i];
1782                 if (c == ' ')
1783                         skipped += 1;
1784                 else if (c == '\t')
1785                         skipped = indent_tab(skipped);
1786                 else
1787                         break;
1788                 i+= 1;
1789         }
1790
1791 ### parse escape
1792         switch (c) {
1793         case 'n':
1794                 *cp++ = '\n'; break;
1795         case 'r':
1796                 *cp++ = '\r'; break;
1797         case 't':
1798                 *cp++ = '\t'; break;
1799         case 'b':
1800                 *cp++ = '\b'; break;
1801         case 'q':
1802                 *cp++ = quote; break;
1803         case 'f':
1804                 *cp++ = '\f'; break;
1805         case 'v':
1806                 *cp++ = '\v'; break;
1807         case 'a':
1808                 *cp++ = '\a'; break;
1809         case '0':
1810         case '1':
1811         case '2':
1812         case '3':
1813                 // 3 digit octal number
1814                 if (i+2 >= t.len)
1815                         goto err;
1816                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1817                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1818                         goto err;
1819                 n = (t.txt[i  ]-'0') * 64 +
1820                     (t.txt[i+1]-'0') *  8 +
1821                     (t.txt[i+2]-'0') *  1;
1822                 *cp++ = n;
1823                 i += 2;
1824                 break;
1825         case 'x':
1826                 // 2 hex digits
1827                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1828                 if (n < 0)
1829                         goto err;
1830                 *cp++ = n;
1831                 i += 2;
1832                 break;
1833         case 'u':
1834         case 'U':
1835                 // 4 or 8 hex digits for unicode
1836                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1837                 if (n < 0)
1838                         goto err;
1839                 memset(&pstate, 0, sizeof(pstate));
1840                 n = wcrtomb(cp, n, &pstate);
1841                 if (n <= 0)
1842                         goto err;
1843                 cp += n;
1844                 i += c == 'u' ? 4 : 8;
1845                 break;
1846         default:
1847                 if (c == escape)
1848                         *cp++ = c;
1849                 else if (is_newline(c))
1850                         at_sol = 1;
1851                 else
1852                         goto err;
1853         }
1854
1855 ###### string vars
1856         long n;
1857         mbstate_t pstate;
1858
1859 For `\x` `\u` and `\U` we need to collect a specific number of
1860 hexadecimal digits
1861
1862 ###### string functions
1863
1864         static long take_hex(int digits, char *cp, int l)
1865         {
1866                 long n = 0;
1867                 if (l < digits)
1868                         return -1;
1869                 while (digits) {
1870                         char  c = *cp;
1871                         int d;
1872                         if (!isxdigit(c))
1873                                 return -1;
1874                         if (isdigit(c))
1875                                 d = c - '0';
1876                         else if (isupper(c))
1877                                 d = 10 + c - 'A';
1878                         else
1879                                 d = 10 + c - 'a';
1880                         n = n * 16 + d;
1881                         digits--;
1882                         cp++;
1883                 }
1884                 return n;
1885         }
1886
1887 #### File: libstring.c
1888
1889 String parsing goes in `libstring.c`
1890
1891         #include <unistd.h>
1892         #include <stdlib.h>
1893         #include <stdio.h>
1894         #include <string.h>
1895         #include <ctype.h>
1896         #include <wchar.h>
1897         #include "mdcode.h"
1898         #include "scanner.h"
1899         ## string functions
1900         ## string main
1901
1902 ###### File: string.h
1903         int string_parse(struct token *tok, char escape,
1904                          struct text *str, char tail[3]);
1905
1906 ###### File: scanner.mk
1907         all :: libstring.o
1908         libstring.o : libstring.c
1909                 $(CC) $(CFLAGS) -c libstring.c
1910
1911
1912 ## Testing
1913
1914 As "untested code is buggy code" we need a program to easily test
1915 the scanner library.  This will simply parse a given file and report
1916 the tokens one per line.
1917
1918 ###### File: scanner.c
1919
1920         #include <unistd.h>
1921         #include <stdlib.h>
1922         #include <fcntl.h>
1923         #include <errno.h>
1924         #include <sys/mman.h>
1925         #include <string.h>
1926         #include <stdio.h>
1927         #include <gmp.h>
1928         #include <locale.h>
1929         #include "mdcode.h"
1930         #include "scanner.h"
1931         #include "number.h"
1932         #include "string.h"
1933
1934         static int errs;
1935         static void pr_err(char *msg)
1936         {
1937                 errs++;
1938                 fprintf(stderr, "%s\n", msg);
1939         }
1940
1941         int main(int argc, char *argv[])
1942         {
1943                 int fd;
1944                 int len;
1945                 char *file;
1946                 struct token_state *state;
1947                 const char *known[] = {
1948                         "==",
1949                         "else",
1950                         "if",
1951                         "then",
1952                         "while",
1953                         "{",
1954                         "}",
1955                 };
1956                 struct token_config conf = {
1957                         .word_start = "_$",
1958                         .word_cont = "",
1959                         .words_marks = known,
1960                         .number_chars = "., _+-",
1961                         .known_count = sizeof(known)/sizeof(known[0]),
1962                         .ignored = (0 << TK_line_comment)
1963                                   |(0 << TK_block_comment),
1964                 };
1965                 struct section *table, *s, *prev;
1966                 setlocale(LC_ALL,"");
1967                 if (argc != 2) {
1968                         fprintf(stderr, "Usage: scanner file\n");
1969                         exit(2);
1970                 }
1971                 fd = open(argv[1], O_RDONLY);
1972                 if (fd < 0) {
1973                         fprintf(stderr, "scanner: cannot open %s: %s\n",
1974                                 argv[1], strerror(errno));
1975                         exit(1);
1976                 }
1977                 len = lseek(fd, 0, 2);
1978                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1979                 table = code_extract(file, file+len, pr_err);
1980
1981                 for (s = table; s;
1982                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
1983                         printf("Tokenizing: %.*s\n", s->section.len,
1984                                 s->section.txt);
1985                         state = token_open(s->code, &conf);
1986                         while(1) {
1987                                 struct token tk = token_next(state);
1988                                 printf("%d:%d ", tk.line, tk.col);
1989                                 token_trace(stdout, tk, 20);
1990                                 if (tk.num == TK_number) {
1991                                         mpq_t num;
1992                                         char tail[3];
1993                                         if (number_parse(num, tail,tk.txt)) {
1994                                                 printf(" %s ", tail);
1995                                                 mpq_out_str(stdout, 10, num);
1996                                                 mpq_clear(num);
1997                                         } else
1998                                                 printf(" BAD NUMBER");
1999                                 }
2000                                 if (tk.num == TK_string ||
2001                                     tk.num == TK_multi_string) {
2002                                         char esc = '\\';
2003                                         struct text str;
2004                                         char tail[3];
2005                                         if (tk.txt.txt[0] == '`')
2006                                                 esc = 0;
2007                                         if (string_parse(&tk, esc,
2008                                                          &str, tail)) {
2009                                                 printf(" %s ", tail);
2010                                                 text_dump(stdout, str, 20);
2011                                                 free(str.txt);
2012                                         } else
2013                                                 printf(" BAD STRING");
2014                                 }
2015                                 printf("\n");
2016                                 if (tk.num == TK_error)
2017                                         errs = 1;
2018                                 if (tk.num == TK_eof)
2019                                         break;
2020                         }
2021                 }
2022                 exit(!!errs);
2023         }
2024 ###### File: scanner.mk
2025         scanner.c : scanner.mdc
2026                 ./md2c scanner.mdc
2027         all :: scanner
2028         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2029                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2030                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2031         scanner.o : scanner.c
2032                 $(CC) $(CFLAGS) -c scanner.c