ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters is assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;          // token number (enum token_num, or TK_reserved + index)
  37                 struct code_node *node;         // code block containing the token
  38                 struct text       txt;          // start and length of the token text
  39                 int               line, col;    // line and character within that block
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguished via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88 The different tokens are numbers, words, marks, strings, comments,
  89 newlines, EOF, and indents, each of which is examined in detail below.
  90
  91 There are various cases where no token can be found in part of the
  92 input.  All of these will be reported as a `TK_error` token.
  93
  94 It is possible to declare a number of strings which form distinct
  95 tokens (rather than being grouped as e.g. 'word').  These are given
  96 token numbers from `TK_reserved` upwards.
  97
  98 ###### public types
  99         enum token_num {
 100                 TK_error,
 101                 ## token types
 102                 TK_reserved
 103         };
 104
 105 ### Numbers
 106
 107 Numbers are the messiest tokens to parse, primarily because they can
 108 contain characters that also have meaning outside of numbers and,
 109 particularly, immediately after numbers.
 110
 111 The obvious example is the '`-`' sign.  It can come inside a number for
 112 a negative exponent, or after a number as a subtraction operator.  To
 113 be sure we have parsed as best as possible we need to only allow the
 114 '`-`' inside a number if it is after an exponent character.  This can be
 115 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 116 digit, so we don't allow '`-`' after just any `e`.
 117
 118 To make matters worse, our language designer has decided to experiment
 119 with allowing commas to be used as the decimal indicator, and spaces
 120 to be used to separate groups of digits in large numbers.  Both of
 121 these can reasonably be restricted to appear between two digits, so we
 122 have to add that condition to our tests.  For consistency we require
 123 every non-alpha-numeric to appear between two hex digits, with the
 124 exception that a sign can appear only after a 'p' or 'e', and a space
 125 can only appear between decimal digits.  Allowing a space before a
 126 letter easily leads to confusion, such as in `a < 3 and b < 4`.
 127
 128 So we cannot just treat numbers as starting with a digit and being
 129 followed by some set of characters.  We need more structure than that.
 130
 131 So:
 132
 133 - Numbers must start with a digit.
 134 - If the first digit is zero, the next character should be a base
 135   signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
 136   enforced at this stage)
 137   In the first case only the first `p` or `P` may be followed by a sign.
 138 - If the number doesn't start with `0` followed by one of `xob`, the
 139   first `e` may be followed by a sign.
 140 - A sign must always be followed by a digit.
 141 - Any digit may be followed by a space or underscore and any hex digit
 142   may be followed by an underscore, providing that the subsequent character
 143   is also a digit (for space) or hex digit (for underscore).
 144   This rule will require an extra level of 'unget' to be
 145   supported when handling characters.
 146 - Otherwise any digits or ASCII letters are allowed.  We do not at
 147   this point check that the digits given are permitted by the base.
 148   That will happen when the token is converted to a number.
 149
 150 To allow easy configuration, the various non alphanumeric characters
 151 are only permitted if they are listed in a configuration parameter.
 152
 153 ###### token config parameters
 154         char *number_chars;
 155
 156 Note that numbers may not start with a period, so `.75` is not a
 157 number.  This is not the norm, but is not unheard of.  Excluding these
 158 numbers simplifies the rule at very little cost.
 159
 160 ###### token types
 161         TK_number,
 162
 163 If TK_number is ignored, digits will result in an error unless they
 164 are declared to be a start character for words.
 165
 166 ###### includes
 167
 168         #include <string.h>
 169
 170 ###### parse number
 171
 172         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 173                 int prev = 0;           /* the character before `ch` */
 174                 int expect_p = 0;       /* 0: exponent is 'e', 1: exponent is 'p', -1: sign already taken */
 175                 int decimal_mark = 0;   /* set once '.', ',' or an exponent has been seen */
 176                 if (ch == '0') {
 177                         wchar_t ch2 = get_char(state);
 178                         if (ch2 > 0 && ch2 < 0x80 && strchr("xobXOB", ch2) != NULL)
 179                                 expect_p = 1;   /* strchr() takes a char: only ask it about non-NUL ASCII */
 180                         unget_char(state);
 181                 }
 182                 while (1) {
 183                         int sign_ok = 0;
 184                         switch(expect_p) {
 185                         case 0:
 186                                 if (ch == 'e' || ch == 'E') {
 187                                         sign_ok = 1;
 188                                         decimal_mark = 1;
 189                                 }
 190                                 break;
 191                         case 1:
 192                                 if (ch == 'p' || ch == 'P') {
 193                                         sign_ok = 1;
 194                                         decimal_mark = 1;
 195                                 }
 196                                 break;
 197                         }
 198                         save_unget_state(state);
 199                         prev = ch;
 200                         ch = get_char(state);
 201
 202                         if (!iswalnum(prev)) {
 203                                 /* special characters, like separators and decimal marks
 204                                  * and signs, must be followed by a hexdigit, and the
 205                                  * space and signs must be followed by a decimal digit.
 206                                  */
 207                                 if (!iswxdigit(ch) ||
 208                                    ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
 209                                    (prev == ' ' && !iswdigit(ch))) {
 210                                         /* don't want the new char or the special */
 211                                         restore_unget_state(state);
 212                                         break;
 213                                 }
 214                         }
 215                         if (iswalnum(ch))
 216                                 continue;
 217
 218                         if (ch <= 0 || ch >= 0x80 || !strchr(state->conf->number_chars, ch)) {
 219                                 /* non-number char: NUL would match strchr()'s terminator and wide values would be truncated */
 220                                 break;
 221                         }
 222                         if (ch == '+' || ch == '-') {
 223                                 /* previous must be 'e' or 'p' in appropriate context */
 224                                 if (!sign_ok)
 225                                         break;
 226                                 expect_p = -1;
 227                         } else if (ch == ' ') {
 228                                 /* previous must be a digit */
 229                                 if (!iswdigit(prev))
 230                                         break;
 231                         } else {
 232                                 /* previous must be a hex digit */
 233                                 if (!iswxdigit(prev))
 234                                         break;
 235                         }
 236                         if (ch == '.' || ch == ',') {
 237                                 /* only one of these permitted */
 238                                 if (decimal_mark)
 239                                         break;
 240                                 decimal_mark = 1;
 241                         }
 242                 }
 243                 /* We seem to have a "number" token */
 244                 unget_char(state);
 245                 close_token(state, &tk);
 246                 tk.num = TK_number;
 247                 return tk;
 248         }
 249
 250 ### Words
 251 Words start with a "start" character followed by the longest
 252 sequence of "continue" characters.  The Unicode ID_START and
 253 ID_CONTINUE sets are always permitted, but other ASCII characters
 254 can be added to these sets.
 255
 256 ###### token config parameters
 257         char *word_start;
 258         char *word_cont;
 259
 260 ###### internal functions
 261         static int is_word_start(wchar_t ch, struct token_config *conf)
 262         {       /* word_start lists ASCII extras; strchr() takes a char, so NUL and wide values must not reach it */
 263                 return iswalpha(ch) ||
 264                        (ch > 0 && ch < 0x80 && strchr(conf->word_start, ch) != NULL) ||
 265                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 266         }
 267
 268         static int is_word_continue(wchar_t ch, struct token_config *conf)
 269         {       /* word_cont lists ASCII extras; strchr() takes a char, so NUL and wide values must not reach it */
 270                 return iswalnum(ch) ||
 271                        (ch > 0 && ch < 0x80 && strchr(conf->word_cont, ch) != NULL) ||
 272                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 273         }
 274
 275 Words can be either known or unknown.  Known words are referred to as
 276 "reserved words" and get a unique token number.  Unknown words are
 277 "identifiers" and are syntactically a single token.
 278
 279 ###### token types
 280         TK_ident,
 281
 282 A list of known words must be provided.  This list is shared with the
 283 "marks" which are described next.  The list must be lexically sorted
 284 and the length of the list must be given (`known_count`).
 285 Tokens matching these known words are reported as the index of the
 286 list added to `TK_reserved`.
 287
 288 If identifiers are ignored, then any word which is not listed as a
 289 known word results in an error.
 290
 291 ###### token config parameters
 292         const char **words_marks;
 293         int known_count;
 294
 295 ###### parse word
 296
 297         if (is_word_start(ch, state->conf)) {
 298                 int known;
 299                 /* Identifier or reserved word: take the longest run of continue characters */
 300                 do {
 301                         ch = get_char(state);
 302                 } while (is_word_continue(ch, state->conf));
 303                 unget_char(state);      /* the first non-continue character is not ours */
 304                 close_token(state, &tk);
 305                 /* A known word wins, even when identifiers are being ignored */
 306                 known = find_known(state->conf, tk.txt);
 307                 if (known >= 0)
 308                         tk.num = TK_reserved + known;
 309                 else
 310                         tk.num = (ignored & (1<<TK_ident)) ? TK_error : TK_ident;
 311                 return tk;
 312         }
 313
 314 ### Marks
 315
 316 Marks are generally one or more punctuation marks joined together.  It
 317 would be nice to use the term "symbol" for these, but that causes
 318 confusion in a subsequent discussion of the grammar, which has terminal
 319 symbols and non-terminal symbols which are conceptually quite
 320 different.  So strings of punctuation characters will be marks.
 321
 322 A "mark" consists of ASCII characters that are not white space, are not
 323 "start" characters for words, and are not digits.
 324 These will collectively be called mark characters.
 325
 326 ###### internal functions
 327         static int is_mark(wchar_t ch, struct token_config *conf)
 328         {
 329                 /* Only printable, non-alphanumeric ASCII can be a mark character */
 330                 if (ch <= ' ' || ch >= 0x7f || iswalnum(ch))
 331                         return 0;
 332                 return strchr(conf->word_start, ch) == NULL;
 333         }
 334
 335 As with words, there can be known and unknown marks, though the rules
 336 are slightly different.
 337
 338 Two marks do not need to be separated by a non-mark character.  This
 339 is different from words which do need to be separated by at least one
 340 non-continue character.
 341
 342 The scanner will normally prefer longer sequences of mark characters,
 343 but will more strongly prefer known marks over unknown marks.  So if
 344 it finds a known mark where adding one more character does not result
 345 in a known mark, it will return that first known mark.
 346
 347 If no known mark is found we will test against strings and comments
 348 below before giving up and assuming an unknown mark.
 349
 350 If an unknown mark contains a quote character or a comment marker, and
 351 that token is not being ignored, then we terminate the unknown mark
 352 before that quote or comment.  This ensures that an unknown mark
 353 immediately before a string is handled correctly.
 354
 355 If the first character of a comment marker (i.e. '/') is a known mark,
 356 the above rules would suggest that the start of a comment would be
 357 parsed as that mark, which is not what is wanted.  So the introductory
 358 sequences for a comment ("//" and "/*") are treated as
 359 partially-known.  They prevent the leading "/" from being a mark by
 360 itself, but do not actually constitute a stand-alone mark.
 361
 362 If `TK_mark` is ignored, then unknown marks are returned as errors.
 363
 364 ###### token types
 365         TK_mark,
 366
 367 Known marks are included in the same list as the list of known words.
 368
 369 ###### parse mark
 370         tk.num = TK_error;      /* stays TK_error until a known mark has been matched */
 371         while (is_mark(ch, state->conf)) {
 372                 int n;
 373                 wchar_t prev;
 374                 close_token(state, &tk);
 375                 n = find_known(state->conf, tk.txt);
 376                 if (n >= 0)
 377                         tk.num = TK_reserved + n;
 378                 else if (tk.num != TK_error) {
 379                         /* Found the longest known mark: adding `ch` no longer
 380                          * matches.  Still need to check for comments.
 381                          */
 382                         if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
 383                             (ch == '/' || ch == '*')) {
 384                                 /* Yes, this is a comment, not a '/' */
 385                                 restore_unget_state(state);
 386                                 tk.num = TK_error;
 387                                 break;
 388                         }
 389                         unget_char(state);
 390                         close_token(state, &tk);
 391                         return tk;
 392                 }
 393                 prev = ch;
 394                 save_unget_state(state);
 395                 ch = get_char(state);
 396                 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
 397                         /* If strings are allowed, a quote (which isn't a known mark)
 398                          * mustn't be treated as part of an unknown mark.  It can be
 399                          * part of a multi-line string though.
 400                          */
 401                         break;
 402                 if (prev == '#' && n < 0)
 403                         /* '#' is not a known mark, so assume it is a comment */
 404                         break;
 405                 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {     /* line comment opener */
 406                         close_token(state, &tk);
 407                         restore_unget_state(state);
 408                         break;
 409                 }
 410                 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {     /* block comment opener */
 411                         close_token(state, &tk);
 412                         restore_unget_state(state);
 413                         break;
 414                 }
 415         }
 416         unget_char(state);
 417         if (tk.num != TK_error) {
 418                 close_token(state, &tk);
 419                 return tk;
 420         }
 421
 422 If we don't find a known mark, we will check for strings and comments
 423 before assuming that we have an unknown mark
 424
 425 ###### parse mark
 426         ## parse string
 427         ## parse comment
 428         ## unknown mark
 429
 430 ### Strings
 431
 432 Strings start with one of single quote, double quote, or back quote
 433 and continue until a matching character on the same line.  Any of
 434 these characters can be included in the list of known marks and then
 435 they will not be used for identifying strings.
 436
 437 Immediately following the close quote, one or two ASCII letters may
 438 appear.  These are somewhat like the arbitrary letters allowed in
 439 "Numbers" above.  They can be used by the language in various ways.
 440
 441 If 3 identical quote characters appear in a row and are
 442 followed by a newline, then this forms a multi-line string which
 443 continues until an identical triple quote appears on a line preceded
 444 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 445
 446 Multi-line strings may not extend beyond the end of the `code_node` in
 447 which they start.
 448
 449 Normal strings and multi-line strings are encoded as two different
 450 token types.
 451
 452 ###### token types
 453         TK_string,
 454         TK_multi_string,
 455
 456 ###### internal functions
 457         static int is_quote(wchar_t ch)
 458         {       // true for single quote, double quote and back quote
 459                 return ch == '\'' || ch == '"' || ch == '`'; // "
 460         }
 461
 462 #### Multi-line strings
 463
 464 The multi-line string is checked for first.  If they are being
 465 ignored, we fall through and treat a triple quote as an empty string
 466 followed by the start of a new string.
 467
 468 ###### parse string
 469         if (tk.txt.len == 3 &&
 470             !(ignored & (1 << TK_multi_string)) &&
 471             is_quote(tk.txt.txt[0]) &&
 472             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 473             is_newline(tk.txt.txt[3])) {
 474                 // triple quote; txt[3] is the character just past the 3-character token
 475                 wchar_t first = tk.txt.txt[0];
 476                 int qseen = 0;          // closing quotes counted so far on the current line
 477                 int at_sol = 1;         // only spaces, tabs or quotes seen so far on this line
 478                 while (!at_eon(state) && qseen < 3) {
 479                         ch = get_char(state);
 480                         if (is_newline(ch)) {
 481                                 at_sol = 1;
 482                                 qseen = 0;
 483                         } else if (at_sol && ch == first) {
 484                                 qseen += 1;
 485                         } else if (ch != ' ' && ch != '\t') {
 486                                 at_sol = 0;
 487                                 qseen = 0;
 488                         }
 489                 }
 490                 if (qseen != 3) {
 491                         /* Hit end of node - error.
 492                          * unget so the newline is seen,
 493                          * but return rest of string as an error.
 494                          */
 495                         if (is_newline(ch))
 496                                 unget_char(state);
 497                         close_token(state, &tk);
 498                         tk.num = TK_error;
 499                         return tk;
 500                 }
 501                 /* Up to 2 letters are allowed after the closing quotes */
 502                 ch = get_char(state);
 503                 if (iswalpha(ch))
 504                         ch = get_char(state);
 505                 if (iswalpha(ch))
 506                         ch = get_char(state);
 507                 /* Now we must have a newline, but we don't return it
 508                  * whatever it is.*/
 509                 unget_char(state);
 510                 close_token(state, &tk);
 511                 tk.num = TK_multi_string;
 512                 if (!is_newline(ch))
 513                         tk.num = TK_error;
 514                 return tk;
 515         }
 516
 517 #### Single-line strings
 518
 519 The sequence of marks collected may be more than a single-line
 520 string, so we reset to the start and collect characters until
 521 we find a close quote or a newline.
 522
 523 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 524
 525 ###### parse string
 526         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 527             !(ignored & (1<<TK_string))) {
 528                 wchar_t first = tk.txt.txt[0];  /* the quote that must close the string */
 529                 reset_token(state, &tk);
 530                 ch = get_char(state);
 531                 tk.num = TK_error;              /* until a matching close quote is found */
 532                 while (!at_eon(state) && !is_newline(ch)) {
 533                         ch = get_char(state);
 534                         if (ch == first) {
 535                                 tk.num = TK_string;
 536                                 break;
 537                         }
 538                         if (is_newline(ch)) {
 539                                 unget_char(state);
 540                                 break;
 541                         }
 542                 }
 543                 while (!at_eon(state) && (ch = get_char(state)) &&
 544                                           iswalpha(ch))
 545                         ;       /* NOTE(review): takes any number of letters; the text above says one or two */
 546                 unget_char(state);
 547                 close_token(state, &tk);
 548                 return tk;
 549         }
 550
 551 ### Comments
 552
 553 Single line comments may start with '`//`' or '`#`' providing that these
 554 are not known marks.  They continue to the end of the line.
 555
 556 Block comments start with '`/*`' if this is not a known mark.  They
 557 continue to the first occurrence of '`*/`' and may not contain any
 558 occurrence of '`/*`'.
 559
 560 Block comments can be wholly within one line or can continue over
 561 multiple lines.  The multi-line version should be followed immediately
 562 by a newline.  The Linux kernel contains over 285000 multi-line
 563 comments and only 34 are followed by characters other than white space
 564 (which should be removed) or a backslash (only needed in macros).  So
 565 it would not suffer from this rule.
 566
 567 These two comment types are reported as two separate token types, and
 568 consequently can be ignored separately.  When ignored a comment is
 569 still parsed, but is discarded.
 570
 571 ###### token types
 572         TK_line_comment,
 573         TK_block_comment,
 574
 575 ###### internal functions
 576         static int is_line_comment(struct text txt)
 577         {
 578                 if (txt.len >= 1 && txt.txt[0] == '#')
 579                         return 1;       /* '#' style */
 580                 return txt.len >= 2 && txt.txt[0] == '/' && txt.txt[1] == '/';
 581         }
 582
 583         static int is_block_comment(struct text txt)
 584         {
 585                 /* needs at least two characters: a slash followed by a star */
 586                 return txt.len < 2 ? 0 : (txt.txt[0] == '/' && txt.txt[1] == '*');
 587         }
 588
 589 #### Single line comments
 590
 591 A single-line comment continues up to, but not including the newline
 592 or end of node.
 593
 594 ###### parse comment
 595
 596         if (is_line_comment(tk.txt)) {
 597                 while (!is_newline(ch) && !at_eon(state))
 598                         ch = get_char(state);
 599                 if (is_newline(ch))
 600                         unget_char(state);      /* the newline is not part of the comment */
 601                 close_token(state, &tk);
 602                 tk.num = TK_line_comment;
 603                 if (ignored & (1 << TK_line_comment))
 604                         continue;               /* parsed, but discarded */
 605                 return tk;
 606         }
 607
 608 #### Block comments
 609
 610 The token text collected so far could exceed the comment, so we need
 611 to reset it first.
 612
 613 If we find an embedded `/*` we reset to just before the '/' and report
 614 an error.  That way the next thing to be parsed will be the rest of
 615 the comment.  This requires a double unget, so we need to save/restore
 616 the unget state (explained later).
 617
 618 ###### parse comment
 619
 620         if (is_block_comment(tk.txt)) {
 621                 wchar_t prev;
 622                 int newlines = 0;       /* set when a line end is seen inside the comment */
 623                 reset_token(state, &tk);
 624                 get_char(state);        /* the opening '/' ... */
 625                 get_char(state);        /* ... and its '*' */
 626                 save_unget_state(state);
 627                 ch = get_char(state);
 628                 prev = 0;               /* so the opener's '*' cannot pair with an immediate '/' */
 629                 while (!at_eon(state) &&
 630                        (prev != '/' || ch != '*') &&
 631                        (prev != '*' || ch != '/')) {
 632                         if (is_newline(ch))
 633                                 newlines = 1;
 634                         prev = ch;
 635                         save_unget_state(state);
 636                         ch = get_char(state);
 637                 }
 638                 close_token(state, &tk);
 639                 if (at_eon(state)) {
 640                         tk.num = TK_error;
 641                         return tk;
 642                 }
 643                 if (prev == '/') {
 644                         /* Embedded comment opener.  Need to unget twice! */
 645                         restore_unget_state(state);
 646                         unget_char(state);
 647                         tk.num = TK_error;
 648                         return tk;
 649                 }
 650                 tk.num = TK_block_comment;
 651                 if (newlines && !(ignored & (1<<TK_newline))) {
 652                         /* a multi-line comment must be followed by a newline */
 653                         ch = get_char(state);
 654                         unget_char(state);
 655                         if (!is_newline(ch))
 656                                 tk.num = TK_error;
 657                 }
 658                 if (tk.num == TK_error ||
 659                     !(ignored & (1 << TK_block_comment)))
 660                         return tk;
 661                 continue;               /* ignored comment: parsed, but discarded */
 662         }
 663
 664 ### Indents, Newlines, and White Space.
 665
 666 Normally white space is ignored.  However newlines can be important as
 667 can indents, which are either after a newline or at the start of a
 668 node (detected by `at_son()`).
 669
 670 ###### exported functions
 671         static inline int is_newline(wchar_t ch)
 672         {       /* newline, form feed and vertical tab all count as a newline */
 673                 return ch == '\n' || ch == '\f' || ch == '\v';
 674         }
 675
 676 ###### white space
 677         if (ch <= ' ' && !is_newline(ch)
 678             && ! at_son(state))
 679                 continue;
 680
 681 If a line starts with more white-space than the previous non-blank
 682 line - or if the first non-blank line in the document starts with any
 683 white-space - then an "IN" is reported at the start of the line.
 684
 685 Before the next non-blank line which starts with less white space, or
 686 at the latest at the end of the document, a matching "OUT" token
 687 is reported.  There will always be an exact match between "IN" and
 688 "OUT" tokens.
 689
 690 It is possible for "OUT" to be followed (almost) immediately by an
 691 "IN".  This happens if, for example, the indent of three consecutive
 692 lines are 0, 8, 4 spaces.  Before the second line we report an
 693 "IN".  Before the third line we must report an "OUT", as 4 is less
 694 than 8, then also an "IN" as 4 is greater than 0.
 695
 696 ###### token types
 697         TK_in,
 698         TK_out,
 699
 700 For the purpose of measuring the length of white space, a tab adds at
 701 least one space, and rounds up to a multiple of 8.
 702
 703 ###### exported functions
 704         static inline int indent_tab(int indent)
 705         {       /* a tab advances to the next multiple of 8, always by at least one */
 706                 return (indent|7)+1;
 707         }
 708
 709 We need to track the current levels of indent.  This requires some
 710 sort of stack as indent levels are pushed on and popped off.  In
 711 practice this stack is unlikely to often exceed 5 so we will use a
 712 fixed stack of 20 indent levels.  More than this will be silently
 713 ignored.
 714
 715 ###### state fields
 716         int     indent_level;
 717         int     indent_sizes[20];
 718
 719 #### Newlines
 720
 721 Newlines can optionally be reported.  Newlines within a block comment
 722 or a multi-line string are not reported separately, but each of these
 723 must be followed immediately by a newline so these constructs cannot
 724 hide the fact that a newline was present.
 725
 726 When indents are being reported, the Newline which would normally be
 727 reported immediately before the "IN" is delayed until after the
 728 matching "OUT".  This makes an indented section act like a
 729 continuation of the previous line to some extent.
 730
 731 A blank line would normally be reported simply as two consecutive Newline
 732 tokens.  However if the subsequent line is indented (and indents are being
 733 reported) then the right thing to do is less obvious as Newlines should be
 734 delayed - but how many Newlines?
 735
 736 The approach we will take is to report the extra Newlines immediately after
 737 the IN token, so the blank line is treated as though it were an indented
 738 blank line.
 739
 740 ###### token types
 741         TK_newline,
 742
 743 If we find a newline or white space at the start of a block, we keep
 744 collecting spaces, tabs, and newlines until we find some real text.
 745 Then depending on the indent we generate some number of tokens.  These
 746 will be a sequence of "Newline OUT" pairs representing a decrease
 747 in indent, then either a Newline or an IN depending on whether the
 748 next line is indented, then zero or more Newlines representing all the
 749 blank lines that have been skipped.
 750
 751 When a Newline leads to the next block of code there is a question of
 752 whether the various Newline and OUT/IN tokens should appear to
 753 belong to the earlier or later block.  This is addressed by processing
 754 the tokens in two stages based on the relative indent levels of the
 755 two blocks (each block has a base indent to which the actual indents
 756 are added).
 757
 758 Any "Newline OUT" pairs needed to reduce the current indent to the
 759 maximum of the base indents of the old and new blocks are generated
 760 against the old block.  Then if the next block does not have an
 761 increased indent, one more "Newline" is generated.
 762
 763 If further "Newline OUT" pairs are needed to get to the indent
 764 level of the 'next' block, they are generated against that block,
 765 though the first Newline is suppressed (it having already been
 766 generated).
 767
 768 Finally the Newline or IN for the first line of the new block is
 769 generated, unless the Newline needs to be suppressed because it
 770 appeared at the end of the previous block.
 771
 772 This means that a block may start with an OUT or an IN, but
 773 will only start with a Newline if it actually starts with a blank
 774 line.
 775
 776 We will need to represent in the `token_state` where in this sequence
 777 of delayed tokens we are.  As `state.col` records the target indent we
 778 don't need to record how many OUTs or INs are needed.  We do
 779 need to record the number of blank lines, and which of Newline and
 780 OUT is needed next in the initial sequence of pairs.
 781
 782 For this we store one more than the number of blank lines as
 783 `delayed_lines` and a flag for `out_next`.
 784
 785 ###### state fields
 786         int check_indent;       // set when col must be compared with the indent stack
 787         int delayed_lines;      // one more than the number of blank lines still to report
 788         int out_next;           // within a "Newline OUT" pair, the OUT is due next
 789
 790 Generating these tokens involves two separate pieces of code.
 791
 792 Firstly we need to recognise white space and count the indents and
 793 newlines.  These are recorded in the above state fields.
 794
 795 Separately we need, on each call to `token_next`, to check if
 796 there are some delayed tokens and if so we need to advance the state
 797 information and return one token.
 798
 799 ###### white space
 800         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 801                 int newlines = 0;
 802                 int was_son = at_son(state);
 803                 if (ignored & (1<<TK_in)) {     // indents not wanted: at most a plain Newline
 804                         if (!is_newline(ch))
 805                                 continue;
 806                         if (ignored & (1<<TK_newline))
 807                                 continue;
 808                         tk.num = TK_newline;
 809                         close_token(state, &tk);
 810                         return tk;
 811                 }
 812                 // Indents are needed, so check all white space.
 813                 while (ch <= ' ' && !at_eon(state)) {
 814                         if (is_newline(ch))
 815                                 newlines += 1;
 816                         ch = get_char(state);
 817                 }
 818                 if (at_eon(state)) {    // node ended: aim for the larger base indent of this node and the next
 819                         newlines += 1;
 820                         if (state->node->next &&
 821                             state->node->next->indent > state->node->indent)
 822                                 state->col = state->node->next->indent;
 823                         else
 824                                 state->col = state->node->indent;
 825                 } else
 826                         unget_char(state);      // the non-white character starts the next token
 827                 state->delayed_lines = newlines;        // the "delayed tokens" code reports these
 828                 state->out_next = was_son;
 829                 state->check_indent = 1;
 830                 continue;
 831         }
 832
 833 ###### delayed tokens
 834
 835         if (state->check_indent || state->delayed_lines) {
 836                 if (state->col < state->indent_sizes[state->indent_level]) {    // indent dropped: Newline/OUT pairs
 837                         if (!state->out_next &&
 838                             !(ignored & (1<<TK_newline))) {
 839                                 state->out_next = 1;
 840                                 tk.num = TK_newline;
 841                                 return tk;
 842                         }
 843                         state->indent_level -= 1;
 844                         state->out_next = 0;
 845                         tk.num = TK_out;
 846                         return tk;
 847                 }
 848                 if (state->col > state->indent_sizes[state->indent_level] &&   // deeper, and the stack has room (entries, not bytes)
 849                     state->indent_level < (int)(sizeof(state->indent_sizes) / sizeof(state->indent_sizes[0])) - 1) {
 850                         state->indent_level += 1;
 851                         state->indent_sizes[state->indent_level] = state->col;
 852                         state->delayed_lines -= 1;      // the IN stands in for one Newline
 853                         tk.num = TK_in;
 854                         return tk;
 855                 }
 856                 state->check_indent = 0;        // indent now matches, or the stack is full and the indent is ignored
 857                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 858                         tk.num = TK_newline;
 859                         state->delayed_lines -= 1;
 860                         return tk;
 861                 }
 862                 state->delayed_lines = 0;
 863                 continue;
 864         }
 865
 866 ### End of File
 867
 868 After the last newline in the file has been processed, a special
 869 end-of-file token will be returned.  Any further attempts to get more
 870 tokens will continue to return the same end-of-file token.
 871
 872 ###### token types
 873         TK_eof,         // returned once the input is exhausted
 874
 875 ###### white space
 876         if (ch == WEOF) {
 877                 if (state->col) {
 878                         state->col = 0;                 // target indent is column 0 ...
 879                         state->check_indent = 1;        // ... so the delayed-token code runs before EOF
 880                         continue;
 881                 }
 882                 tk.num = TK_eof;
 883                 return tk;
 884         }
 885
 886 ### Unknown Marks, or errors.
 887
 888 We have now handled all the possible known mark-like tokens.
 889 If the token we have is not empty and `TK_mark` is allowed,
 890 we have an unknown mark, otherwise this must be an error.
 891
 892 ###### unknown mark
 893
 894         /* All known marks have been handled by now, so any text
 895          * collected so far is an unknown mark. */
 896         if (tk.txt.len) {
 897                 close_token(state, &tk);
 898                 /* An unknown mark is only an error when the caller
 899                  * asked for marks to be ignored. */
 900                 tk.num = (ignored & (1<<TK_mark)) ? TK_error : TK_mark;
 901                 return tk;
 902         }
 903         /* Nothing was collected, so the next character is completely
 904          * unrecognised - possibly a digit while numbers are being
 905          * ignored.  Consume it and report it as an error.
 906          */
 907         get_char(state);
 908         close_token(state, &tk);
 909         tk.num = TK_error;
 910         return tk;
 911
 912 ## Tools For The Task
 913
 914 You may have noticed that there are a few gaps we left in the above -
 915 functions used without first defining them.  Doing so above would have
 916 broken the flow.
 917
 918 ### Character by character
 919
 920 As we walk through the various `code_node`s we need to process whole
 921 Unicode codepoints, and keep track of which line and column we are on.
 922 We will assume for now that any printing character uses one column,
 923 though that is not true in general.
 924
 925 As the text in a `code_node` may include an indent that identifies it as
 926 being code, we need to be careful to strip that.  The `code_node` has
 927 a flag that tells us whether or not we need to strip.
 928
 929 ###### includes
 930         #include <memory.h>
 931
 932 ###### state fields
 933         struct code_node *node;     // node being scanned; NULL once all input is consumed
 934         int    offset;              // byte offset of the next character within node->code
 935         int    line;                // line number of the scan position
 936         int    col;                 // column of the scan position (tabs advance to multiples of 8)
 937         int    strip_offset;        // offset just past the stripped indent at the start of the node
 938
 939 ###### internal functions
 940
 941         static int do_strip(struct token_state *state)
 942         {
 943                 /* Skip the code indent (<= 4 spaces, else one tab); never read past the node. */
 944                 const char *txt = state->node->code.txt;
 945                 int len = state->node->code.len;
 946                 int indent = 0;         // column reached; this is the return value
 947                 if (!state->node->needs_strip)
 948                         return 0;
 949                 while (indent < 4 && state->offset < len && txt[state->offset] == ' ') {
 950                         indent += 1;
 951                         state->offset += 1;
 952                 }
 953                 if (indent == 0 && state->offset < len && txt[state->offset] == '\t') {
 954                         indent = indent_tab(indent);    // a tab only counts when no space was skipped
 955                         state->offset += 1;
 956                 }
 957                 return indent;
 958         }
 959
 960         static wint_t get_char(struct token_state *state)
 961         {
 962                 wchar_t next;
 963                 size_t n;
 964                 mbstate_t mbstate;
 965                 /* Return the next character and update offset/line/col; WEOF when no node is left. */
 966                 if (state->node == NULL)
 967                         return WEOF;
 968                 if (state->node->code.len <= state->offset) {
 969                         do      // node exhausted: move to the next node that has text
 970                                 state->node = state->node->next;
 971                         while (state->node && state->node->code.txt == NULL);
 972                         state->offset = 0;
 973                         if (state->node == NULL)
 974                                 return WEOF;
 975                         state->line = state->node->line_no;
 976                         state->col = do_strip(state);
 977                         state->strip_offset = state->offset;
 978                 }
 979
 980                 ## before get_char
 981
 982                 memset(&mbstate, 0, sizeof(mbstate));   // each character is decoded from the initial state
 983
 984                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 985                             state->node->code.len - state->offset,
 986                             &mbstate);
 987                 if (n == -2 || n == 0) {
 988                         /* Incomplete sequence (not really possible) or a NUL:
 989                          * report a newline and skip to the end of the node */
 989                         next = '\n';
 990                         state->offset = state->node->code.len;
 991                 } else if (n == -1) {
 992                         /* invalid byte sequence: skip a single byte */
 993                         state->offset += 1;
 994                         next = 0x7f; // an illegal character
 995                 } else
 996                         state->offset += n;
 997
 998                 if (next >= ' ') {
 999                         state->col += 1;        // every other character counts as one column
1000                 } else if (is_newline(next)) {
1001                         state->line += 1;
1002                         state->col = do_strip(state);   // the new line's code indent is skipped at once
1003                 } else if (next == '\t') {
1004                         state->col = indent_tab(state->col);
1005                 }
1006                 return next;
1007         }
1008
1009 We will sometimes want to "unget" the last character as it needs to be
1010 considered again as part of the next token.  So we need to store a
1011 'previous' version of all metadata.
1012
1013 ###### state fields
1014         int    prev_offset;         // offset before the most recent get_char
1015         int    prev_line;           // line before the most recent get_char
1016         int    prev_col;            // col before the most recent get_char
1017
1018 ###### before get_char
1019         state->prev_col    = state->col;        // snapshot that unget_char() restores
1020         state->prev_line   = state->line;
1021         state->prev_offset = state->offset;
1022
1023 ###### internal functions
1024
1025         static void unget_char(struct token_state *state)
1026         {
1027                 if (!state->node)       /* no current node, so nothing to rewind */
1028                         return;
1029                 state->col    = state->prev_col;
1030                 state->line   = state->prev_line;
1031                 state->offset = state->prev_offset;
1032         }
1033
1034 We occasionally need a double-unget, particularly for numbers and
1035 block comments.  We don't impose this cost on all scanning, but
1036 require those code sections that need it to call `save_unget_state`
1037 before each `get_char`, and then `restore_unget_state` when a
1038 double-unget is needed.
1039
1040 ###### state fields
1041         int     prev_offset2;       // copy of prev_offset kept by save_unget_state
1042         int     prev_line2;         // copy of prev_line kept by save_unget_state
1043         int     prev_col2;          // copy of prev_col kept by save_unget_state
1044
1045 ###### internal functions
1046         static void save_unget_state(struct token_state *state)
1047         {       /* keep the current unget data so a second unget remains possible */
1048                 state->prev_col2    = state->prev_col;
1049                 state->prev_line2   = state->prev_line;
1050                 state->prev_offset2 = state->prev_offset;
1051         }
1052
1053         static void restore_unget_state(struct token_state *state)
1054         {       /* bring back the unget data stored by save_unget_state() */
1055                 state->prev_col    = state->prev_col2;
1056                 state->prev_line   = state->prev_line2;
1057                 state->prev_offset = state->prev_offset2;
1058         }
1059
1060 At the start of a token we don't want to be at the end of a code block
1061 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
1062 single character.  This will move into the next non-empty code block
1063 and leave the current pointer at the start of it.
1064
1065 This has to happen _after_ dealing with delayed tokens as some of them
1066 must appear in the previous node.  When we do this, we need to reset
1067 the data in the token.
1068
1069 ###### delayed tokens
1070         if (at_eon(state)) {
1071                 get_char(state);        /* moves on to the next node that has text ... */
1072                 unget_char(state);      /* ... and steps back to its first character */
1073                 tk.txt.len = 0;
1074                 tk.line = state->line;
1075                 tk.col = state->col;
1076                 tk.node = state->node;
1077                 if (tk.node != NULL)
1078                         tk.txt.txt = tk.node->code.txt + state->offset;
1079         }
1080
1081 ### Managing tokens
1082
1083 The current token is initialized to line up with the first character
1084 that we 'get' for each token.  When we have, or might have, a full
1085 token we can call `close_token` to set the `len` of the token
1086 appropriately.  This can safely be called multiple times.
1087
1088 Finally we occasionally (for single-line strings and block comments)
1089 need to reset to the beginning of the current token as we might have
1090 parsed too much already.  For that there is `reset_token`.
1091
1092 ###### one token
1093         tk.txt.len = 0;         /* an empty token anchored at the scan position */
1094         tk.col = state->col;
1095         tk.line = state->line;
1096         tk.node = state->node;
1097         if (tk.node != NULL)
1098                 tk.txt.txt = tk.node->code.txt + state->offset;
1099
1100 ###### internal functions
1101
1102         static void close_token(struct token_state *state,
1103                                 struct token *tk)
1104         {
1105                 /* The token ends at the scan position, or at the end of
1106                  * its own node when the scanner has already moved on to
1107                  * a different node. */
1108                 int end = state->node == tk->node ? state->offset : tk->node->code.len;
1109                 tk->txt.len = end - (tk->txt.txt - tk->node->code.txt);
1110         }
1111
1112         static void reset_token(struct token_state *state, struct token *tok)
1113         {       /* make the token's start the unget position, then unget to it */
1114                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1115                 state->prev_col = tok->col;
1116                 state->prev_line = tok->line;
1117                 tok->txt.len = 0;
1118                 unget_char(state);
1119         }
1120
1121 As tokens may not cross into the next `code_node`, and some tokens can
1122 include the newline at the end of a `code_node`, we must be able to
1123 easily check if we have reached the end.  Equally we need to know if
1124 we are at the start of a node, as white space is treated a little
1125 differently there.
1126
1127 ###### internal functions
1128
1129         static int at_son(struct token_state *state)
1130         {       /* start-of-node: the last character read began at or before strip_offset */
1131                 return !(state->prev_offset > state->strip_offset);
1132         }
1133
1134         static int at_eon(struct token_state *state)
1135         {
1136                 /* End-of-node: there is no node left, or nothing is
1137                  * left unread in the current one. */
1138                 return !(state->node != NULL && state->offset < state->node->code.len);
1139         }
1140
1141 ### Find a known word
1142
1143 As the known-word list is sorted we can use a simple binary search.
1144 Following the pattern established in "mdcode", we will use a `struct
1145 text` with start and length to represent the code fragment we are
1146 searching for.
1147
1148 ###### internal functions
1149         static int find_known(struct token_config *conf, struct text txt)
1150         {
1151                 /* Binary search of the sorted list; index of txt, or -1 if absent. */
1152                 int lo = 0;
1153                 int hi = conf->known_count;
1154                 if (hi <= 0)
1155                         return -1;      // empty list: words_marks[0] must not be read
1156                 while (lo + 1 < hi) {
1157                         int mid = (lo + hi) / 2;
1158                         int cmp = strncmp(conf->words_marks[mid],
1159                                           txt.txt, txt.len);
1160                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1161                                 cmp = 1;        // same prefix but a longer word: it sorts after txt
1162                         if (cmp <= 0)
1163                                 lo = mid;
1164                         else
1165                                 hi = mid;
1166                 }
1167                 if (strncmp(conf->words_marks[lo], txt.txt, txt.len) == 0
1168                     && conf->words_marks[lo][txt.len] == 0)
1169                         return lo;
1170                 return -1;
1171         }
1172
1173 ### Bringing it all together
1174
1175 Now we have all the bits there is just one section missing:  combining
1176 all the token parsing code into one block.
1177
1178 The handling of delayed tokens (Newlines, INs, OUTs) must come
1179 first before we try getting another character.
1180
1181 Then we parse all the text, making sure that we check for known marks
1182 before strings and comments, but unknown marks after strings and comments.
1183
1184 This block of code will either return a token, or will choose to
1185 ignore one, in which case it will `continue` around to the top of the
1186 loop.
1187
1188 ###### one token
1189         ## delayed tokens
1190
1191         ch = get_char(state);   // first character of the next token; the sections below classify it
1192
1193         ## white space
1194         ## parse number
1195         ## parse word
1196         ## parse mark
1197
1198 ### Start and stop
1199
1200 As well as getting tokens, we need to be able to create the
1201 `token_state` to start with, and discard it later.
1202
1203 ###### includes
1204         #include <malloc.h>
1205
1206 ###### main functions
1207         struct token_state *token_open(struct code_node *code, struct token_config *conf)
1208         {       /* Create a scanner positioned at the start of `code`; NULL if out of memory */
1209                 struct token_state *state = calloc(1, sizeof(*state)); // all fields start as zero
1210                 if (state == NULL)
1211                         return NULL;
1212                 state->node = code;
1213                 state->line = code->line_no;
1214                 state->col = do_strip(state);
1215                 state->strip_offset = state->offset;
1216                 state->conf = conf;
1217                 return state;
1218         }
1219         void token_close(struct token_state *state)
1220         {       /* release a state obtained from token_open() */
1221                 free(state);
1222         }
1223
1224 ###### exported functions
1225         struct token_state *token_open(struct code_node *code, struct
1226                                        token_config *conf);    // start scanning at `code`
1227         void token_close(struct token_state *state);           // free the scanner state
1228
1229 ### Trace tokens
1230
1231 Getting tokens is the main thing but it is also useful to be able to
1232 print out token information, particularly for tracing and testing.
1233
1234 Known tokens are printed verbatim.  Other tokens are printed as
1235 `type(content)` where content is truncated to a given number of characters.
1236
1237 The function for printing a truncated string (`text_dump`) is also exported
1238 so that it can be used for tracing processed strings too.
1239
1240 ###### includes
1241         #include <stdio.h>
1242
1243 ###### exported functions
1244         void token_trace(FILE *f, struct token tok, int max);  // print one token to f
1245         void text_dump(FILE *f, struct text t, int max);       // print t escaped and truncated
1246
1247 ###### main functions
1248
1249         void text_dump(FILE *f, struct text txt, int max)
1250         {
1251                 /* Print txt with anything outside printable ASCII escaped.  A
1252                  * text longer than max is cut to max-2 characters plus "..". */
1253                 int shown = txt.len > max ? max - 2 : txt.len;
1254                 int i;
1255
1256                 for (i = 0; i < shown; i++) {
1257                         char c = txt.txt[i];
1258                         if (c == '\\')
1259                                 fprintf(f, "\\\\");
1260                         else if (c >= ' ' && c <= '~')
1261                                 fprintf(f, "%c", c);
1262                         else
1263                                 fprintf(f, "\\x%02x", c & 0xff);
1264                 }
1265                 if (i < txt.len)
1266                         fprintf(f, "..");
1267         }
1268
1269         void token_trace(FILE *f, struct token tok, int max)
1270         {
1271                 /* Printable names for the token types that are not known words or marks */
1272                 static char *names[] = {
1273                         [TK_ident] = "ident",
1274                         [TK_mark] = "mark",
1275                         [TK_number] = "number",
1276                         [TK_string] = "string",
1277                         [TK_multi_string] = "mstring",
1278                         [TK_line_comment] = "lcomment",
1279                         [TK_block_comment] = "bcomment",
1280                         [TK_in] = "in",
1281                         [TK_out] = "out",
1282                         [TK_newline] = "newline",
1283                         [TK_eof] = "eof",
1284                         [TK_error] = "ERROR",
1285                         };
1286                 switch (tok.num) {
1287                 case TK_ident:
1288                 case TK_mark:
1289                 case TK_number:
1290                 case TK_string:
1291                 case TK_multi_string:
1292                 case TK_line_comment:
1293                 case TK_block_comment:
1294                 case TK_error:  /* printed as type(content) */
1295                         fprintf(f, "%s(", names[tok.num]);
1296                         text_dump(f, tok.txt, max);
1297                         fprintf(f, ")");
1298                         break;
1299                 case TK_in:
1300                 case TK_out:
1301                 case TK_newline:
1302                 case TK_eof:    /* no token text included */
1303                         fprintf(f, "%s()", names[tok.num]);
1304                         break;
1305                 default:        /* known word or mark: printed verbatim */
1306                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1307                         break;
1308                 }
1309
1310         }
1311
1312 ### And there we have it
1313
1314 We now have all the library functions defined for reading and printing
1315 tokens.  Now we just need C files to store them, and a mk file to make them.
1316
1317 ###### File: scanner.h
1318         ## public types
1319         ## exported functions
1320
1321 ###### File: libscanner.c
1322         ## includes
1323         #include "scanner.h"
1324         ## private types
1325         ## internal functions
1326         ## main functions
1327
1328 ###### File: scanner.mk
1329
1330         CFLAGS += -Wall -g
1331         all ::
1332         scanner.mk scanner.h libscanner.c : scanner.mdc
1333                 ./md2c scanner.mdc
1334         all :: libscanner.o
1335         libscanner.o : libscanner.c
1336                 $(CC) $(CFLAGS) -c libscanner.c
1337
1338 ## Processing numbers
1339
1340 Converting a `TK_number` token to a numerical value is a slightly
1341 higher level task than lexical analysis, and slightly lower than
1342 grammar parsing, so put it here - as an appendix if you like.
1343
1344 Importantly it will be used by the same testing rig that is used for
1345 testing the token scanner.
1346
1347 The numeric value that we will convert all numbers into is the `mpq_t`
1348 from the GNU high precision number library "libgmp".
1349
1350 ###### number includes
1351         #include <gmp.h>
1352         #include "mdcode.h"
1353
1354 Firstly we need to be able to parse a string of digits in a given base
1355 and possibly with a decimal marker.  We store this in an `mpz_t`
1356 integer and report the number of digits after the decimal mark.
1357
1358 On error we return zero and ensure that the 'mpz_t' has been freed, or
1359 had never been initialised.
1360
1361 ###### number functions
1362
1363         static int parse_digits(mpz_t num, struct text tok, int base,
1364                                 int *placesp)
1365         {
1366                 /* Accept digits up to 'base', ignore '_' and
1367                  * (for base 10) ' ' if they appear between two
1368                  * legal digits, and if `placesp` is not NULL,
1369                  * allow a single '.' or ',' and report the number
1370                  * of digits beyond there.
1371                  * Return number of characters processed (p),
1372                  * or 0 if something illegal was found.
1373                  */
1374                 int p;
1375                 int decimal = -1; // digits after marker
1376                 enum {Digit, Space, Other} prev = Other;
1377                 int digits = 0; // digits seen so far; num is initialised once this is non-zero
1378
1379                 for (p = 0; p < tok.len; p++) {
1380                         int dig;
1381                         char c = tok.txt[p];
1382
1383                         if (c == '_' || (c == ' ' && base == 10)) {
1384                                 if (prev != Digit)
1385                                         goto bad;
1386                                 prev = Space;
1387                                 continue;
1388                         }
1389                         if (c == '.' || c == ',') {
1390                                 if (prev != Digit)
1391                                         goto bad;
1392                                 if (!placesp || decimal >= 0)
1393                                         return p-1; // NOTE(review): excludes the digit before the marker though it is already in num - confirm
1394                                 decimal = 0;
1395                                 prev = Other;
1396                                 continue;
1397                         }
1398                         if (isdigit((unsigned char)c)) // <ctype.h> calls are undefined for negative char values
1399                                 dig = c - '0';
1400                         else if (isupper((unsigned char)c))
1401                                 dig = 10 + c - 'A';
1402                         else if (islower((unsigned char)c))
1403                                 dig = 10 + c - 'a';
1404                         else
1405                                 dig = base;
1406                         if (dig >= base) {
1407                                 if (prev == Space)
1408                                         p--; // a separator before a non-digit is not part of the number
1409                                 break;
1410                         }
1411                         prev = Digit;
1412                         if (digits)
1413                                 mpz_mul_ui(num, num, base);
1414                         else
1415                                 mpz_init(num);
1416                         digits += 1;
1417                         mpz_add_ui(num, num, dig);
1418                         if (decimal >= 0)
1419                                 decimal++;
1420                 }
1421                 if (digits == 0)
1422                         return 0; // num was never initialised
1423                 if (placesp) {
1424                         if (decimal >= 0)
1425                                 *placesp = decimal;
1426                         else
1427                                 *placesp = 0;
1428                 }
1429                 return p;
1430         bad:
1431                 if (digits)
1432                         mpz_clear(num);
1433                 return 0;
1434         }
1435
1436 ###### number includes
1437         #include <ctype.h>
1438
1439 To parse a full number we need to consider the optional base, the
1440 mantissa, and the optional exponent.  We will treat these one at a
1441 time.
1442
1443 The base is indicated by a letter after a leading zero, which must be
1444 followed by a base letter or a period.  The base also determines the
1445 character which will mark an exponent.
1446
1447 ###### number vars
1448         int base = 10;          // number base; changed by a 0x/0o/0b prefix
1449         char expc = 'e';        // exponent marker; 'p' once a base prefix was seen
1450
1451 ###### parse base
1452
1453         if (tok.txt[0] == '0' && tok.len > 1) {
1454                 int skip = 0;   // length of the base prefix to step over
1455                 switch(tok.txt[1]) {
1456                 case 'x':
1457                 case 'X':
1458                         base = 16;
1459                         skip = 2;
1460                         expc = 'p';
1461                         break;
1462                 case 'o':
1463                 case 'O':
1464                         base = 8;
1465                         skip = 2;
1466                         expc = 'p';
1467                         break;
1468                 case 'b':
1469                 case 'B':
1470                         base = 2;
1471                         skip = 2;
1472                         expc = 'p';
1473                         break;
1474                 case '0':
1475                 case '1':
1476                 case '2':
1477                 case '3':
1478                 case '4':
1479                 case '5':
1480                 case '6':
1481                 case '7':
1482                 case '8':
1483                 case '9':
1484                 case '_':
1485                 case ' ':
1486                         // another digit is not permitted
1487                         // after a zero.
1488                         return 0;
1489                 default:
1490                         // must be decimal marker or trailing
1491                         // letter, which are OK;
1492                         break;
1493                 }
1494                 tok.txt += skip;        // drop the prefix (skip is 0 when there was none)
1495                 tok.len -= skip;
1496         }
1497
1498 After the base is the mantissa, which may contain a decimal mark, so
1499 we need to record the number of places.  We won't impose the number of
1500 places until we have the exponent as well.
1501
1502 ###### number vars
1503         int places = 0;         // digits after the decimal mark in the mantissa
1504         mpz_t mant;             // mantissa digits as an integer
1505         int d;                  // characters consumed by the latest parse_digits call
1506
1507 ###### parse mantissa
1508
1509         d = parse_digits(mant, tok, base, &places);
1510         if (d == 0)             // nothing usable: parse_digits left mant with nothing to free
1511                 return 0;
1512         tok.txt += d;           // step past the mantissa
1513         tok.len -= d;
1514         mpq_init(num);
1515         mpq_set_z(num, mant);   // the decimal places are applied later, with the exponent
1516         mpz_clear(mant);
1517
1518 After the mantissa number may come an exponent which may be positive
1519 or negative.  We assume at this point that we have seen the exponent
1520 character `expc`.
1521
1522 ###### number vars
1523         long lexp = 0;          // signed exponent, later adjusted for the decimal places
1524         mpz_t exp;              // exponent digits as parsed
1525         int esign = 1;          // +1 or -1: sign of the exponent
1526
1527 ###### parse exponent
1528         if (tok.len > 1) {      // an optional sign must be followed by at least one more character
1529                 if (tok.txt[0] == '+') {
1530                         tok.txt++;
1531                         tok.len--;
1532                 } else if (tok.txt[0] == '-') {
1533                         esign = -1;
1534                         tok.txt++;
1535                         tok.len--;
1536                 }
1537         }
1538         d = parse_digits(exp, tok, 10, NULL);   // exponent digits are decimal, with no decimal mark
1539         if (d == 0) {
1540                 mpq_clear(num);
1541                 return 0;
1542         }
1543         if (!mpz_fits_slong_p(exp)) {   // too large to hold in lexp
1544                 mpq_clear(num);
1545                 mpz_clear(exp);
1546                 return 0;
1547         }
1548         lexp = mpz_get_si(exp) * esign;
1549         mpz_clear(exp);
1550         tok.txt += d;
1551         tok.len -= d;
1552
1553 Now that we have the mantissa and the exponent we can multiply them
1554 together, also allowing for the number of digits after the decimal
1555 mark.
1556
1557 For base 10, we simply subtract the decimal places from the exponent.
1558 For the other bases, as the exponent is always based on 2, even for
1559 octal and hex, we need a bit more detail.
1560 We then recover the sign from the exponent, as division is quite
1561 different from multiplication.
1562
1563 ###### calc exponent
1564         switch (base) {
1565         case 10:
1566         case 2:
1567                 lexp -= places;
1568                 break;
1569         case 16:
1570                 lexp -= 4*places;
1571                 break;
1572         case 8:
1573                 lexp -= 3*places;
1574                 break;
1575         }
1576         if (lexp < 0) {
1577                 lexp = -lexp;
1578                 esign = -1;
1579         } else
1580                 esign = 1;
1581
1582 Imposing the exponent on the number is also very different for base 10
1583 than for the others.  For the binary shift `gmp` provides a simple
1584 function.  For base 10 we use something like Russian Peasant
1585 Multiplication.
1586
1587 ###### calc exponent
1588         if (expc == 'e') {
1589                 mpq_t tens;
1590                 mpq_init(tens);
1591                 mpq_set_ui(tens, 10, 1);
1592                 while (1) {
1593                         if (lexp & 1) {
1594                                 if (esign > 0)
1595                                         mpq_mul(num, num, tens);
1596                                 else
1597                                         mpq_div(num, num, tens);
1598                         }
1599                         lexp >>= 1;
1600                         if (lexp == 0)
1601                                 break;
1602                         mpq_mul(tens, tens, tens);
1603                 }
1604                 mpq_clear(tens);
1605         } else {
1606                 if (esign > 0)
1607                         mpq_mul_2exp(num, num, lexp);
1608                 else
1609                         mpq_div_2exp(num, num, lexp);
1610         }
1611
1612 Now we are ready to parse a number: the base, mantissa, and exponent.
1613 If all goes well we check for the possible trailing letters and
1614 return.  Return value is 1 for success and 0 for failure.
1615
1616 ###### number functions
1617         int number_parse(mpq_t num, char tail[3], struct text tok)
1618         {
1619                 ## number vars
1620                 int i;
1621
1622                 ## parse base
1623                 ## parse mantissa
1624                 if (tok.len > 1 && (tok.txt[0] == expc ||
1625                                     tok.txt[0] == toupper(expc))) {
1626                         tok.txt++;
1627                         tok.len--;
1628                         ## parse exponent
1629                 }
1630                 ## calc exponent
1631
1632                 for (i = 0; i < 2; i++) {
1633                         if (tok.len <= i)
1634                                 break;
1635                         if (!isalpha(tok.txt[i]))
1636                                 goto err;
1637                         tail[i] = tok.txt[i];
1638                 }
1639                 tail[i] = 0;
1640                 if (i == tok.len)
1641                         return 1;
1642         err:
1643                 mpq_clear(num);
1644                 return 0;
1645         }
1646
1647 Number parsing goes in `libnumber.c`
1648
1649 ###### File: libnumber.c
1650
1651         #include <unistd.h>
1652         #include <stdlib.h>
1653
1654         ## number includes
1655         ## number functions
1656
1657 ###### File: number.h
1658         int number_parse(mpq_t num, char tail[3], struct text tok);
1659
1660 ###### File: scanner.mk
1661         all :: libnumber.o
1662         libnumber.o : libnumber.c
1663                 $(CC) $(CFLAGS) -c libnumber.c
1664
1665 ## Processing strings
1666
1667 Both `TK_string` and `TK_multi_string` require post-processing which
1668 can be one of two types: literal or with escapes processed.
1669 Even literal processing is non-trivial as the file may contain indents
1670 which need to be stripped.
1671
1672 Errors can only occur when processing escapes.  Any unrecognised
1673 character following the escape character will cause an error.
1674
1675 Processing escapes and stripping indents can only make the string
1676 shorter, not longer, so we allocate a buffer which is the same size as
1677 the string and process into that.
1678
1679 To request escape processing, we pass the character we want to use for
1680 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1681
1682 ###### string main
1683         int string_parse(struct token *tok, char escape,
1684                          struct text *str, char tail[3])
1685         {
1686                 ## string vars
1687                 struct text t = tok->txt;
1688
1689                 str->txt = NULL;
1690                 ## strip tail
1691                 if (tok->num == TK_string) {
1692                         ## strip single
1693                 } else {
1694                         ## strip multi
1695                 }
1696                 str->txt = malloc(t.len);
1697                 str->len = 0;
1698
1699                 ## process string
1700                 return 1;
1701         err:
1702                 free(str->txt);
1703                 str->txt = NULL;
1704                 return 0;
1705         }
1706
1707 ### strip tail
1708
1709 The tail of the string can be 0, 1, or 2 letters.
1710
1711         i = t.len;
1712         if (i >= 0 && isalpha(t.txt[i-1]))
1713                 i -= 1;
1714         if (i >= 0 && isalpha(t.txt[i-1]))
1715                 i -= 1;
1716         strncpy(tail, t.txt+i, t.len-i);
1717         tail[t.len-i] = 0;
1718         t.len = i;
1719
1720 ###### string vars
1721         int i;
1722
1723 ### strip single
1724
1725 Stripping the quote of a single-line string is trivial.
1726 The only part that is at all interesting is that quote character must
1727 be remembered.
1728
1729         quote = t.txt[0];
1730         if (t.txt[t.len-1] != quote)
1731                 goto err;
1732         t.txt += 1;
1733         t.len -= 2;
1734
1735 ###### string vars
1736         char quote;
1737
1738 ### strip multi
1739
1740 For a multi-line string we have a little more work to do.  We need to
1741 remove 3 quotes, not 1, and need to count the indent of the close
1742 quote as it will need to be stripped from all lines.
1743
1744         quote = t.txt[0];
1745         if (t.len < 7 ||
1746             t.txt[1] != quote || t.txt[2] != quote ||
1747             !is_newline(t.txt[3]))
1748                 goto err;
1749         t.txt += 4;
1750         t.len -= 4;
1751         i = t.len;
1752         if (i <= 0 || t.txt[i-1] != quote)
1753                 goto err;
1754         i -= 1;
1755         if (i <= 0 || t.txt[i-1] != quote)
1756                 goto err;
1757         i -= 1;
1758         if (i <= 0 || t.txt[i-1] != quote)
1759                 goto err;
1760         i -= 1;
1761         t.len = i;
1762         while (i > 0 && !is_newline(t.txt[i-1]))
1763                 i--;
1764         indent = 0;
1765         while (i < t.len) {
1766                 if (t.txt[i] == ' ')
1767                         indent += 1;
1768                 if (t.txt[i] == '\t')
1769                         indent = indent_tab(indent);
1770                 i++;
1771         }
1772
1773 ###### string vars
1774         int indent = 0;
1775
1776 ### process string
1777
1778 Now we just take one byte at a time.  Trans-ASCII Unicode won't look
1779 like anything we are interested in so it will just be copied byte by
1780 byte.
1781
1782         cp = str->txt;
1783         at_sol = 1;
1784         for (i = 0; i < t.len; i++) {
1785                 char c;
1786                 if (at_sol) {
1787                         at_sol = 0;
1788                         ## strip indent
1789                         if (i >= t.len)
1790                                 break;
1791                 }
1792                 c = t.txt[i];
1793                 if (c != escape) {
1794                         *cp = c;
1795                         cp += 1;
1796                         if (is_newline(c))
1797                                 at_sol = 1;
1798                 } else if (i+1 >= t.len) {
1799                         // escape and end of string
1800                         goto err;
1801                 } else {
1802                         i += 1;
1803                         c = t.txt[i];
1804                         ## parse escape
1805                 }
1806         }
1807         str->len = cp - str->txt;
1808
1809 ###### string vars
1810         char *cp;
1811         int at_sol;
1812
1813 ### strip indent
1814
1815 Every time we find a start of line, we strip spaces and tabs until the
1816 required indent is found.
1817
1818         int skipped = 0;
1819         while (i < t.len && skipped < indent) {
1820                 c = t.txt[i];
1821                 if (c == ' ')
1822                         skipped += 1;
1823                 else if (c == '\t')
1824                         skipped = indent_tab(skipped);
1825                 else
1826                         break;
1827                 i+= 1;
1828         }
1829
1830 ### parse escape
1831         switch (c) {
1832         case 'n':
1833                 *cp++ = '\n'; break;
1834         case 'r':
1835                 *cp++ = '\r'; break;
1836         case 't':
1837                 *cp++ = '\t'; break;
1838         case 'b':
1839                 *cp++ = '\b'; break;
1840         case 'q':
1841                 *cp++ = quote; break;
1842         case 'f':
1843                 *cp++ = '\f'; break;
1844         case 'v':
1845                 *cp++ = '\v'; break;
1846         case 'a':
1847                 *cp++ = '\a'; break;
1848         case '0':
1849         case '1':
1850         case '2':
1851         case '3':
1852                 // 3 digit octal number
1853                 if (i+2 >= t.len)
1854                         goto err;
1855                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1856                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1857                         goto err;
1858                 n = (t.txt[i  ]-'0') * 64 +
1859                     (t.txt[i+1]-'0') *  8 +
1860                     (t.txt[i+2]-'0') *  1;
1861                 *cp++ = n;
1862                 i += 2;
1863                 break;
1864         case 'x':
1865                 // 2 hex digits
1866                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1867                 if (n < 0)
1868                         goto err;
1869                 *cp++ = n;
1870                 i += 2;
1871                 break;
1872         case 'u':
1873         case 'U':
1874                 // 4 or 8 hex digits for unicode
1875                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1876                 if (n < 0)
1877                         goto err;
1878                 memset(&pstate, 0, sizeof(pstate));
1879                 n = wcrtomb(cp, n, &pstate);
1880                 if (n <= 0)
1881                         goto err;
1882                 cp += n;
1883                 i += c == 'u' ? 4 : 8;
1884                 break;
1885         default:
1886                 if (c == escape)
1887                         *cp++ = c;
1888                 else if (is_newline(c))
1889                         at_sol = 1;
1890                 else
1891                         goto err;
1892         }
1893
1894 ###### string vars
1895         long n;
1896         mbstate_t pstate;
1897
1898 For `\x`, `\u`, and `\U` we need to collect a specific number of
1899 hexadecimal digits.
1900
1901 ###### string functions
1902
/*
 * take_hex() - read exactly `digits` hexadecimal digits.
 * @digits: the number of hex digits required
 * @cp:     start of the candidate digits; need not be NUL terminated
 * @l:      the number of bytes available at `cp`
 *
 * Returns the value of the digits, or -1 if fewer than `digits` bytes
 * are available or any of them is not a hexadecimal digit.
 */
static long take_hex(int digits, char *cp, int l)
{
        long n = 0;

        if (l < digits)
                return -1;
        while (digits) {
                // <ctype.h> functions need an unsigned char value; a
                // plain char holding a UTF-8 byte may be negative.
                unsigned char c = *cp;
                int d;

                if (!isxdigit(c))
                        return -1;
                if (isdigit(c))
                        d = c - '0';
                else if (isupper(c))
                        d = 10 + c - 'A';
                else
                        d = 10 + c - 'a';
                n = n * 16 + d;
                digits--;
                cp++;
        }
        return n;
}
1925
1926 #### File: libstring.c
1927
1928 String parsing goes in `libstring.c`
1929
1930         #include <unistd.h>
1931         #include <stdlib.h>
1932         #include <stdio.h>
1933         #include <string.h>
1934         #include <ctype.h>
1935         #include <wchar.h>
1936         #include "mdcode.h"
1937         #include "scanner.h"
1938         ## string functions
1939         ## string main
1940
1941 ###### File: string.h
1942         int string_parse(struct token *tok, char escape,
1943                          struct text *str, char tail[3]);
1944
1945 ###### File: scanner.mk
1946         all :: libstring.o
1947         libstring.o : libstring.c
1948                 $(CC) $(CFLAGS) -c libstring.c
1949
1950 ## Testing
1951
1952 As "untested code is buggy code" we need a program to easily test
1953 the scanner library.  This will simply parse a given file and report
1954 the tokens one per line.
1955
1956 ###### File: scanner.c
1957
1958         #include <unistd.h>
1959         #include <stdlib.h>
1960         #include <fcntl.h>
1961         #include <errno.h>
1962         #include <sys/mman.h>
1963         #include <string.h>
1964         #include <stdio.h>
1965         #include <gmp.h>
1966         #include <locale.h>
1967         #include <getopt.h>
1968         #include "mdcode.h"
1969         #include "scanner.h"
1970         #include "number.h"
1971         #include "string.h"
1972
/* Number of errors reported so far; it decides the exit status. */
static int errs;

/* Error callback: print the message on stderr and count it. */
static void pr_err(char *msg)
{
        fprintf(stderr, "%s\n", msg);
        errs += 1;
}
1979
/*
 * qsort() comparison callback for an array of C strings: each
 * argument points at a `char *`, and the strings are ordered by
 * strcmp().
 */
static int kcmp(const void *ap, const void *bp)
{
        const char *lhs = *(char * const *)ap;
        const char *rhs = *(char * const *)bp;

        return strcmp(lhs, rhs);
}
1986
1987         int main(int argc, char *argv[])
1988         {
1989                 int fd;
1990                 int len;
1991                 char *file;
1992                 char *filename = NULL;
1993                 struct token_state *state;
1994                 const char *known[] = {
1995                         "==",
1996                         "else",
1997                         "if",
1998                         "then",
1999                         "while",
2000                         "{",
2001                         "}",
2002                 };
2003                 struct token_config conf = {
2004                         .word_start = "_$",
2005                         .word_cont = "",
2006                         .words_marks = known,
2007                         .number_chars = "., _+-",
2008                         .known_count = sizeof(known)/sizeof(known[0]),
2009                         .ignored = 0,
2010                 };
2011                 static const struct option long_options[] = {
2012                         { "word-start",         1, NULL, 'W'},
2013                         { "word-cont",          1, NULL, 'w'},
2014                         { "number-chars",       1, NULL, 'n'},
2015                         { "ignore-numbers",     0, NULL, 'N'},
2016                         { "ignore-ident",       0, NULL, 'I'},
2017                         { "ignore-marks",       0, NULL, 'M'},
2018                         { "ignore-strings",     0, NULL, 'S'},
2019                         { "ignore-multi-strings",0, NULL, 'z'},
2020                         { "ignore-line-comment",0, NULL, 'c'},
2021                         { "ignore-newline",     0, NULL, 'l'},
2022                         { "ignore-block-comment", 0, NULL, 'C'},
2023                         { "ignore-indent",      0, NULL, 'i'},
2024                         { "file",               1, NULL, 'f'},
2025                         { NULL,                 0, NULL, 0},
2026                 };
2027                 static const char options[] = "W:w:n:NIMSzclCif:";
2028
2029                 struct section *table, *s, *prev;
2030                 int opt;
2031
2032                 setlocale(LC_ALL,"");
2033                 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2034                        != -1) {
2035                         switch(opt) {
2036                         case 'W': conf.word_start = optarg; break;
2037                         case 'w': conf.word_cont = optarg; break;
2038                         case 'n': conf.number_chars = optarg; break;
2039                         case 'N': conf.ignored |= 1 << TK_number; break;
2040                         case 'I': conf.ignored |= 1 << TK_ident; break;
2041                         case 'M': conf.ignored |= 1 << TK_mark; break;
2042                         case 'S': conf.ignored |= 1 << TK_string; break;
2043                         case 'z': conf.ignored |= 1 << TK_multi_string; break;
2044                         case 'c': conf.ignored |= 1 << TK_line_comment; break;
2045                         case 'C': conf.ignored |= 1 << TK_block_comment; break;
2046                         case 'l': conf.ignored |= 1 << TK_newline; break;
2047                         case 'i': conf.ignored |= 1 << TK_in; break;
2048                         case 'f': filename = optarg; break;
2049                         default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2050                                          opt);
2051                                 exit(1);
2052                         }
2053                 }
2054
2055                 if (optind < argc) {
2056                         const char **wm = calloc(argc - optind, sizeof(char*));
2057                         int i;
2058                         for (i = optind; i < argc; i++)
2059                                 wm[i - optind] = argv[i];
2060                         qsort(wm, argc-optind, sizeof(char*), kcmp);
2061                         conf.words_marks = wm;
2062                         conf.known_count = argc - optind;
2063                 }
2064
2065                 if (filename)
2066                         fd = open(filename, O_RDONLY);
2067                 else
2068                         fd = 0;
2069                 if (fd < 0) {
2070                         fprintf(stderr, "scanner: cannot open %s: %s\n",
2071                                 filename, strerror(errno));
2072                         exit(1);
2073                 }
2074                 len = lseek(fd, 0, 2);
2075                 if (len <= 0) {
2076                         fprintf(stderr,"scanner: %s is empty or not seekable\n",
2077                                 filename ?: "stdin");
2078                         exit(1);
2079                 }
2080                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2081                 table = code_extract(file, file+len, pr_err);
2082
2083                 for (s = table; s;
2084                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
2085                         printf("Tokenizing: %.*s\n", s->section.len,
2086                                 s->section.txt);
2087                         state = token_open(s->code, &conf);
2088                         while(1) {
2089                                 struct token tk = token_next(state);
2090                                 printf("%d:%d ", tk.line, tk.col);
2091                                 token_trace(stdout, tk, 20);
2092                                 if (tk.num == TK_number) {
2093                                         mpq_t num;
2094                                         char tail[3];
2095                                         if (number_parse(num, tail,tk.txt)) {
2096                                                 printf(" %s ", tail);
2097                                                 mpq_out_str(stdout, 10, num);
2098                                                 mpq_clear(num);
2099                                         } else
2100                                                 printf(" BAD NUMBER");
2101                                 }
2102                                 if (tk.num == TK_string ||
2103                                     tk.num == TK_multi_string) {
2104                                         char esc = '\\';
2105                                         struct text str;
2106                                         char tail[3];
2107                                         if (tk.txt.txt[0] == '`')
2108                                                 esc = 0;
2109                                         if (string_parse(&tk, esc,
2110                                                          &str, tail)) {
2111                                                 printf(" %s ", tail);
2112                                                 text_dump(stdout, str, 20);
2113                                                 free(str.txt);
2114                                         } else
2115                                                 printf(" BAD STRING");
2116                                 }
2117                                 printf("\n");
2118                                 if (tk.num == TK_error)
2119                                         errs = 1;
2120                                 if (tk.num == TK_eof)
2121                                         break;
2122                         }
2123                         token_close(state);
2124                 }
2125                 if (conf.words_marks != known)
2126                         free(conf.words_marks);
2127                 exit(!!errs);
2128         }
2129 ###### File: scanner.mk
# scanner.c is extracted from the literate source.
scanner.c : scanner.mdc
	./md2c scanner.mdc
all :: scanner
# Every object named on the link line is also a prerequisite, so that
# "make scanner" builds libnumber.o and libstring.o when needed.
scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h \
		libnumber.o libstring.o
	$(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
		libmdcode.o libnumber.o libstring.o -licuuc -lgmp
scanner.o : scanner.c
	$(CC) $(CFLAGS) -c scanner.c