5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters is assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ##
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, so they aren't parsed.
73 Comments are typically parsed but not returned, but an option is provided to
74 return comments for further processing. The exact consequence of
75 ignoring each token type varies from token to token.
79 int ignored; // bit set of ignored tokens.
81 ## token config parameters
85 struct token_config *conf;
87 ###### token_next init
88 int ignored = state->conf->ignored;
90 The different tokens are numbers, words, marks, strings, comments,
91 newlines, EOF, and indents, each of which is examined in detail below.
93 There are various cases where no token can be found in part of the
94 input. All of these will be reported as a `TK_error` token.
96 It is possible to declare a number of strings which form distinct
97 tokens (rather than being grouped as e.g. 'word'). These are given
98 token numbers from `TK_reserved` upwards.
109 Numbers are the messiest tokens to parse, primarily because they can
110 contain characters that also have meaning outside of numbers and,
111 particularly, immediately after numbers.
113 The obvious example is the '`-`' sign. It can come inside a number for
114 a negative exponent, or after a number as a subtraction operator. To
115 be sure we have parsed as best as possible we need to only allow the
116 '`-`' inside a number if it is after an exponent character. This can be
117 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
118 digit, so we don't allow '`-`' after just any `e`.
120 To make matters worse, our language designer has decided to experiment
121 with allowing commas to be used as the decimal indicator, and spaces
122 to be used to separate groups of digits in large numbers. Both of
123 these can reasonably be restricted to appear between two digits, so we
124 have to add that condition to our tests. For consistency we require
125 every non-alpha-numeric to appear between two hex digits, with the
126 exception that a sign can appear only after a 'p' or 'e', and a space
127 can only appear between decimal digits. Allowing a space before a
128 letter easily leads to confusion, such as in `a < 3 and b < 4`.
130 So we cannot just treat numbers as starting with a digit and being
131 followed by some set of characters. We need more structure than that.
135 - Numbers must start with a digit.
136 - If the first digit is zero, the next character should be a base
137 signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
138 enforced at this stage)
139 In the first case only the first `p` or `P` may be followed by a sign.
140 - If the number doesn't start with `0` followed by one of `xob`, the
141 first `e` may be followed by a sign.
142 - A sign must always be followed by a digit.
143 - Any digit may be followed by a space or underscore and any hex digit
144 may be followed by an underscore, providing that the subsequent character
145 is also a digit (for space) or hex digit (for underscore).
146 This rule will require an extra level of 'unget' to be
147 supported when handling characters.
148 - Otherwise any digits or ASCII letters are allowed. We do not at
149 this point check that the digits given are permitted by the base.
150 That will happen when the token is converted to a number.
152 To allow easy configuration, the various non alphanumeric characters
153 are only permitted if they are listed in a configuration parameter.
155 ###### token config parameters
158 Note that numbers may not start with a period, so `.75` is not a
159 number. This is not the norm, but is not unheard of. Excluding these
160 numbers simplifies the rule at very little cost.
165 If TK_number is ignored, digits will result in an error unless they
166 are declared to be a start character for words.
174 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
177 int decimal_mark = 0;
179 wchar_t ch2 = get_char(state);
180 if (strchr("xobXOB", ch2) != NULL)
188 if (ch == 'e' || ch == 'E') {
194 if (ch == 'p' || ch == 'P') {
200 save_unget_state(state);
202 ch = get_char(state);
204 if (!iswalnum(prev)) {
205 /* special characters, like separators and decimal marks
206 * and signs, must be followed by a hexdigit, and the
207 * space and signs must be followed by a decimal digit.
209 if (!iswxdigit(ch) ||
210 ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
211 (prev == ' ' && !iswdigit(ch))) {
212 /* don't want the new char or the special */
213 restore_unget_state(state);
220 if (!strchr(state->conf->number_chars, ch)) {
221 /* non-number char */
224 if (ch == '+' || ch == '-') {
225 /* previous must be 'e' or 'p' in appropriate context */
229 } else if (ch == ' ') {
230 /* previous must be a digit */
234 /* previous must be a hex digit */
235 if (!iswxdigit(prev))
238 if (ch == '.' || ch == ',') {
239 /* only one of these permitted */
245 /* We seem to have a "number" token */
247 close_token(state, &tk);
253 Words start with a "start" character followed by the longest
254 sequence of "continue" characters. The Unicode ID_START and
255 ID_CONTINUE sets are always permitted, but other ASCII characters
256 can be added to these sets.
258 ###### token config parameters
262 ###### internal functions
263 static int is_word_start(wchar_t ch, struct token_config *conf)
265 return iswalpha(ch) ||
266 strchr(conf->word_start, ch) != NULL ||
267 u_hasBinaryProperty(ch, UCHAR_ID_START);
270 static int is_word_continue(wchar_t ch, struct token_config *conf)
272 return iswalnum(ch) ||
273 strchr(conf->word_cont, ch) != NULL ||
274 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
277 Words can be either known or unknown. Known words are referred to as
278 "reserved words" and get a unique token number. Unknown words are
279 "identifiers" and are syntactically a single token.
284 A list of known words must be provided. This list is shared with the
285 "marks" which are described next. The list must be lexically sorted
286 and the length of the list must be given (`known_count`).
287 Tokens matching these known words are reported as the index of the
288 list added to `TK_reserved`.
290 If identifiers are ignored, then any word which is not listed as a
291 known word results in an error.
293 ###### token config parameters
294 const char **words_marks;
299 if (is_word_start(ch, state->conf)) {
301 /* A word: identifier or reserved */
303 ch = get_char(state);
304 while (is_word_continue(ch, state->conf));
306 close_token(state, &tk);
308 if (ignored & (1<<TK_ident))
310 n = find_known(state->conf, tk.txt);
312 tk.num = TK_reserved + n;
318 Marks are generally one or more punctuation marks joined together. It
319 would be nice to use the term "symbol" for these, but that causes
320 confusion in a subsequent discussion of the grammar, which has terminal
321 symbols and non-terminal symbols which are conceptually quite
322 different. So strings of punctuation characters will be marks.
324 A "mark" consists of ASCII characters that are not white space, are not
325 "start" characters for words, and are not digits.
326 These will collectively be called mark characters.
328 ###### internal functions
329 static int is_mark(wchar_t ch, struct token_config *conf)
334 strchr(conf->word_start, ch) == NULL;
337 As with words, there can be known and unknown marks, though the rules
338 are slightly different.
340 Two marks do not need to be separated by a non-mark character. This
341 is different from words which do need to be separated by at least one
342 non-continue character.
344 The scanner will normally prefer longer sequences of mark characters,
345 but will more strongly prefer known marks over unknown marks. So if
346 it finds a known mark where adding one more character does not result
347 in a known mark, it will return that first known mark.
349 If no known mark is found we will test against strings and comments
350 below before giving up and assuming an unknown mark.
352 If an unknown mark contains a quote character or a comment marker, and
353 that token is not being ignored, then we terminate the unknown mark
354 before that quote or comment. This ensures that an unknown mark
355 immediately before a string is handled correctly.
357 If the first character of a comment marker (i.e. '/') is a known mark,
358 the above rules would suggest that the start of a comment would be
359 parsed as that mark, which is not what is wanted. So when comments are
360 not ignored, the introductory sequences for a comment ("//" and "/*")
361 are treated as partially-known. They prevent the leading "/" from being
362 a mark by itself, but do not actually constitute a stand-alone mark.
364 If `TK_mark` is ignored, then unknown marks are returned as errors.
369 Known marks are included in the same list as the list of known words.
373 while (is_mark(ch, state->conf)) {
376 close_token(state, &tk);
377 n = find_known(state->conf, tk.txt);
379 tk.num = TK_reserved + n;
380 else if (tk.num != TK_error) {
381 /* found a longest-known-mark, still need to
384 if (is_comment(ignored, tk.txt)) {
385 /* Yes, this is a comment, not a '/' */
386 restore_unget_state(state);
391 close_token(state, &tk);
395 save_unget_state(state);
396 ch = get_char(state);
398 /* No need to worry about other token types */
400 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
401 /* If strings are allowed, a quote (which isn't a known mark)
402 * mustn't be treated as part of an unknown mark. It can be
403 * part of a multi-line string though.
407 close_token(state, &tk);
408 if (is_comment(ignored, tk.txt)) {
409 /* looks like a permitted comment, and not a known mark,
410 * so assume it is a comment.
412 restore_unget_state(state);
417 if (tk.num != TK_error) {
418 close_token(state, &tk);
422 If we don't find a known mark, we will check for strings and comments
423 before assuming that we have an unknown mark
432 Strings start with one of single quote, double quote, or back quote
433 and continue until a matching character on the same line. Any of
434 these characters can be included in the list of known marks and then
435 they will not be used for identifying strings.
437 Immediately following the close quote, one or two ASCII letters may
438 appear. These are somewhat like the arbitrary letters allowed in
439 "Numbers" above. They can be used by the language in various ways.
441 If 3 identical quote characters appear in a row and are
442 followed by a newline, then this forms a multi-line string which
443 continues until an identical triple quote appears on a line preceded
444 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
446 Multi-line strings may not extend beyond the end of the `code_node` in
449 Normal strings and multi-line strings are encoded as two different
456 ###### internal functions
457 static int is_quote(wchar_t ch)
459 return ch == '\'' || ch == '"' || ch == '`'; // "
462 #### Multi-line strings
464 The multi-line string is checked for first. If they are being
465 ignored, we fall through and treat a triple quote as an empty string
466 followed by the start of a new string.
469 if (tk.txt.len >= 3 &&
470 !(ignored & (1 << TK_multi_string)) &&
471 is_quote(tk.txt.txt[0]) &&
472 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
473 is_newline(tk.txt.txt[3])) {
475 wchar_t first = tk.txt.txt[0];
478 while (!at_eon(state) && qseen < 3) {
479 ch = get_char(state);
480 if (is_newline(ch)) {
483 } else if (at_sol && ch == first) {
485 } else if (ch != ' ' && ch != '\t') {
491 /* Hit end of node - error.
492 * unget so the newline is seen,
493 * but return rest of string as an error.
497 close_token(state, &tk);
501 /* 2 letters are allowed */
502 ch = get_char(state);
504 ch = get_char(state);
506 ch = get_char(state);
507 /* Now we must have a newline, but we don't return it
510 close_token(state, &tk);
511 tk.num = TK_multi_string;
517 #### Single-line strings
519 The sequence of marks collected may be more than a single-line
520 string, so we reset to the start and collect characters until
521 we find a close quote or a newline.
523 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
526 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
527 !(ignored & (1<<TK_string))) {
528 wchar_t first = tk.txt.txt[0];
529 reset_token(state, &tk);
530 ch = get_char(state);
532 while (!at_eon(state) && !is_newline(ch)) {
533 ch = get_char(state);
538 if (is_newline(ch)) {
543 while (!at_eon(state) && (ch = get_char(state)) &&
547 close_token(state, &tk);
553 Single line comments may start with '`//`' or '`#`' providing that these
554 are not known marks. They continue to the end of the line.
556 Block comments start with '`/*`' if this is not a known mark. They
557 continue to the first occurrence of '`*/`' and may not contain any
558 occurrence of '`/*`'.
560 Block comments can be wholly within one line or can continue over
561 multiple lines. The multi-line version should be followed immediately
562 by a newline. The Linux kernel contains over 285000 multi-line
563 comments and only 34 are followed by characters other than white space
564 (which should be removed) or a backslash (only needed in macros). So
565 it would not suffer from this rule.
567 These two comment types are reported as two separate token types, and
568 consequently can be ignored separately. When ignored a comment is
569 still parsed, but is discarded.
575 ###### internal functions
576 static int is_line_comment(int ignored, struct text txt)
578 if (ignored & (1 << TK_line_comment))
580 return (txt.len >= 1 && txt.txt[0] == '#') ||
581 (txt.len >= 2 && txt.txt[0] == '/' &&
585 static int is_block_comment(int ignored, struct text txt)
587 if (ignored & (1 << TK_block_comment))
589 return txt.len >= 2 && txt.txt[0] == '/' &&
593 static int is_comment(int ignored, struct text txt)
595 return is_line_comment(ignored, txt) ||
596 is_block_comment(ignored, txt);
599 #### Single line comments
601 A single-line comment continues up to, but not including the newline
606 if (is_line_comment(ignored, tk.txt)) {
607 while (!is_newline(ch) && !at_eon(state))
608 ch = get_char(state);
611 close_token(state, &tk);
612 tk.num = TK_line_comment;
613 if (!state->conf->return_comments)
620 The token text collected so far could exceed the comment, so we need
623 If we find an embedded `/*` we reset to just before the '/' and report
624 an error. That way the next thing to be parsed will be the rest of
625 the comment. This requires a double unget, so we need to save/restore
626 the unget state (explained later).
630 if (is_block_comment(ignored, tk.txt)) {
633 reset_token(state, &tk);
636 save_unget_state(state);
637 ch = get_char(state);
639 while (!at_eon(state) &&
640 (prev != '/' || ch != '*') &&
641 (prev != '*' || ch != '/')) {
645 save_unget_state(state);
646 ch = get_char(state);
648 close_token(state, &tk);
654 /* embedded. Need to unget twice! */
655 restore_unget_state(state);
660 tk.num = TK_block_comment;
661 if (newlines && !(ignored & (1<<TK_newline))) {
662 /* next char must be newline */
663 ch = get_char(state);
668 if (tk.num == TK_error || state->conf->return_comments)
673 ### Indents, Newlines, and White Space.
675 Normally white space is ignored. However newlines can be important as
676 can indents, which are either after a newline or at the start of a
677 node (detected by `at_son()`);
679 ###### exported functions
680 static inline int is_newline(wchar_t ch)
682 return ch == '\n' || ch == '\f' || ch == '\v';
686 if (ch <= ' ' && !is_newline(ch)
690 If a line starts with more white-space than the previous non-blank
691 line - or if the first non-blank line in the document starts with any
692 white-space - then an "IN" is reported at the start of the line.
694 Before the next non-blank line which starts with less white space, or
695 at the latest at the end of the document, a matching "OUT" token
696 is reported. There will always be an exact match between "IN" and
699 It is possible for "OUT" to be followed (almost) immediately by an
700 "IN". This happens if, for example, the indent of three consecutive
701 lines are 0, 8, 4 spaces. Before the second line we report an
702 "IN". Before the third line we must report an "OUT", as 4 is less
703 than 8, then also an "IN" as 4 is greater than 0.
709 For the purpose of measuring the length of white space, a tab adds at
710 least one space, and rounds up to a multiple of 8.
712 ###### exported functions
713 static inline int indent_tab(int indent)
718 We need to track the current levels of indent. This requires some
719 sort of stack as indent levels are pushed on and popped off. In
720 practice this stack is unlikely to often exceed 5 so we will use a
721 fixed stack of 20 indent levels. More than this will be silently
726 int indent_sizes[20];
728 `indent_sizes[0]` will always be zero - this simplifies some code.
732 Newlines can optionally be reported. Newlines within a block comment
733 or a multi-line string are not reported separately, but each of these
734 must be followed immediately by a newline so these constructs cannot
735 hide the fact that a newline was present.
737 When indents are being reported, the Newline which would normally be
738 reported immediately before the "IN" is delayed until after the
739 matching "OUT". This makes an indented section act like a
740 continuation of the previous line to some extent.
742 A blank line would normally be reported simply as two consecutive Newline
743 tokens. However if the subsequent line is indented (and indents are being
744 reported) then the right thing to do is less obvious as Newlines should be
745 delayed - but how many Newlines?
747 The approach we will take is to report the extra Newlines immediately after
748 the IN token, so the blank line is treated as though it were an indented
754 If we find a newline or white space at the start of a block, we keep
755 collecting spaces, tabs, and newlines until we find some real text.
756 Then depending on the indent we generate some number of tokens. These
757 will be a sequence of "Newline OUT" pairs representing a decrease
758 in indent, then either a Newline or an IN depending on whether the
759 next line is indented, then zero or more Newlines representing all the
760 blank lines that have been skipped.
762 When a Newline leads to the next block of code there is a question of
763 whether the various Newline and OUT/IN tokens should appear to
764 belong to the earlier or later block. This is addressed by processing
765 the tokens in two stages based on the relative indent levels of the
766 two blocks (each block has a base indent to which the actual indents
769 Any "Newline OUT" pairs needed to reduce the current indent to the
770 maximum of the base indents of the old and new blocks are generated
771 against the old block. Then if the next block does not have an
772 increased indent, one more "Newline" is generated.
774 If further "Newline OUT" pairs are needed to get to the indent
775 level of the 'next' block, they are generated against that block,
776 though the first Newline is suppressed (it having already been
779 Finally the Newline or IN for the first line of the new block is
780 generated, unless the Newline needs to be suppressed because it
781 appeared at the end of the previous block.
783 This means that a block may start with an OUT or an IN, but
784 will only start with a Newline if it actually starts with a blank
787 We will need to represent in the `token_state` where in this sequence
788 of delayed tokens we are. As `state.col` records the target indent we
789 don't need to record how many OUTs or INs are needed. We do
790 need to record the number of blank lines, and which of Newline and
791 OUT is needed next in the initial sequence of pairs.
793 For this we store one more than the number of blank lines as
794 `delayed_lines` and a flag for `out_next`.
801 Generating these tokens involves two separate pieces of code.
803 Firstly we need to recognise white space and count the indents and
804 newlines. These are recorded in the above state fields.
806 Separately we need, on each call to `token_next`, to check if
807 there are some delayed tokens and if so we need to advance the state
808 information and return one token.
810 ###### internal functions
811 static int state_indent(struct token_state *state)
813 if (state->node == NULL)
815 return state->node->indent - state->node->needs_strip + state->col;
820 state_check_node(state);
821 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
823 int was_nl = is_newline(ch);
824 if (ignored & (1<<TK_in)) {
827 if (ignored & (1<<TK_newline))
830 close_token(state, &tk);
833 // Indents are needed, so check all white space.
834 while (ch <= ' ' && ch != WEOF) {
837 ch = get_char(state);
839 state_check_node(state);
843 state->delayed_lines = newlines;
844 state->out_next = !was_nl;
845 state->check_indent = 1;
849 ###### delayed tokens
851 if (state->check_indent || state->delayed_lines) {
852 if (state_indent(state) < state->indent_sizes[state->indent_level]) {
853 if (!state->out_next &&
854 !(ignored & (1<<TK_newline))) {
859 state->indent_level -= 1;
864 if (state_indent(state) > state->indent_sizes[state->indent_level] &&
865 state->indent_level < sizeof(state->indent_sizes)-1) {
866 state->indent_level += 1;
867 state->indent_sizes[state->indent_level] = state_indent(state);
868 if (state->delayed_lines)
869 state->delayed_lines -= 1;
873 state->check_indent = 0;
874 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
876 state->delayed_lines -= 1;
879 state->delayed_lines = 0;
885 After the last newline in the file has been processed, a special
886 end-of-file token will be returned. Any further attempts to get more
887 tokens will continue to return the same end-of-file token.
898 ### Unknown Marks, or errors.
900 We have now handled all the possible known mark-like tokens.
901 If the token we have is not empty and `TK_mark` is allowed,
902 we have an unknown mark, otherwise this must be an error.
906 /* one unknown mark character */
908 close_token(state, &tk);
909 if (ignored & (1<<TK_mark))
915 /* Completely unrecognised character is next, possibly
916 * a digit and we are ignoring numbers.
917 * Whatever it is, make it an error.
920 close_token(state, &tk);
924 ## Tools For The Task
926 You may have noticed that there are a few gaps we left in the above -
927 functions used without first defining them. Doing so above would have
930 ### Character by character
932 As we walk through the various `code_node`s we need to process whole
933 Unicode codepoints, and keep track of which line and column we are on.
934 We will assume for now that any printing character uses one column,
935 though that is not true in general.
937 As the text in a `code_node` may include an indent that identifies it as
938 being code, we need to be careful to strip that. The `code_node` has
939 a flag that tells us whether or not we need to strip.
945 struct code_node *node;
951 ###### internal functions
953 static void do_strip(struct token_state *state)
956 if (state->node->needs_strip) {
958 while (n && state->node->code.txt[state->offset] == ' ') {
963 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
964 indent = indent_tab(indent);
971 static void state_check_node(struct token_state *state)
975 if (state->node->code.len > state->offset)
979 state->node = state->node->next;
980 while (state->node && state->node->code.txt == NULL);
982 state->prev_offset = 0;
983 state->strip_offset = 0;
985 if (state->node == NULL)
987 state->line = state->node->line_no;
989 state->col = state->node->needs_strip;
990 state->strip_offset = state->offset;
993 static wint_t get_char(struct token_state *state)
999 state_check_node(state);
1000 if (state->node == NULL)
1005 memset(&mbstate, 0, sizeof(mbstate));
1007 n = mbrtowc(&next, state->node->code.txt + state->offset,
1008 state->node->code.len - state->offset,
1010 if (n == -2 || n == 0) {
1011 /* Not enough bytes - not really possible */
1012 next = '\n'; // NOTEST
1013 state->offset = state->node->code.len; // NOTEST
1014 } else if (n == -1) {
1016 state->offset += 1; // NOTEST
1017 next = 0x7f; // an illegal character // NOTEST
1023 } else if (is_newline(next)) {
1026 state->col = state->node->needs_strip;
1027 } else if (next == '\t') {
1028 state->col = indent_tab(state->col);
1033 We will sometimes want to "unget" the last character as it needs to be
1034 considered again as part of the next token. So we need to store a
1035 'previous' version of all metadata.
1042 ###### before get_char
1043 state->prev_offset = state->offset;
1044 state->prev_line = state->line;
1045 state->prev_col = state->col;
1047 ###### internal functions
1049 static void unget_char(struct token_state *state)
1052 state->offset = state->prev_offset;
1053 state->line = state->prev_line;
1054 state->col = state->prev_col;
1058 We occasionally need a double-unget, particularly for numbers and
1059 block comments. We don't impose this cost on all scanning, but
1060 require those code sections that need it to call `save_unget_state`
1061 before each `get_char`, and then `restore_unget_state` when a
1062 double-unget is needed.
1069 ###### internal functions
1070 static void save_unget_state(struct token_state *state)
1072 state->prev_offset2 = state->prev_offset;
1073 state->prev_line2 = state->prev_line;
1074 state->prev_col2 = state->prev_col;
1077 static void restore_unget_state(struct token_state *state)
1079 state->prev_offset = state->prev_offset2;
1080 state->prev_line = state->prev_line2;
1081 state->prev_col = state->prev_col2;
1084 At the start of a token we don't want to be at the end of a code block
1085 if we can help it. To avoid this possibility, we 'get' and 'unget' a
1086 single character. This will move into the next non-empty code block
1087 and leave the current pointer at the start of it.
1089 This has to happen _after_ dealing with delayed tokens as some of them
1090 must appear in the previous node. When we do this, we need to reset
1091 the data in the token.
1093 ###### delayed tokens
1094 if (at_eon(state)) {
1097 tk.node = state->node;
1099 tk.txt.txt = state->node->code.txt + state->offset;
1100 tk.line = state->line;
1101 tk.col = state->col;
1107 The current token is initialized to line up with the first character
1108 that we 'get' for each token. When we have, or might have, a full
1109 token we can call `close_token` to set the `len` of the token
1110 appropriately. This can safely be called multiple times.
1112 Finally we occasionally (for single-line strings and block comments)
1113 need to reset to the beginning of the current token as we might have
1114 parsed too much already. For that there is `reset_token`.
1117 tk.node = state->node;
1119 tk.txt.txt = state->node->code.txt + state->offset;
1120 tk.line = state->line;
1121 tk.col = state->col;
1124 ###### internal functions
1126 static void close_token(struct token_state *state,
1129 if (state->node != tk->node)
1130 tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1132 tk->txt.len = (state->node->code.txt + state->offset)
1136 static void reset_token(struct token_state *state, struct token *tok)
1138 state->prev_line = tok->line;
1139 state->prev_col = tok->col;
1140 state->prev_offset = tok->txt.txt - state->node->code.txt;
1145 Tokens may not cross into the next `code_node`, and some tokens can
1146 include the newline at the end of a `code_node`, we must be able to
1147 easily check if we have reached the end. Equally we need to know if
1148 we are at the start of a node, as white space is treated a little
1151 ###### internal functions
1153 static int at_son(struct token_state *state)
1155 return state->prev_offset <= state->strip_offset;
1158 static int at_eon(struct token_state *state)
1160 // at end-of-node ??
1161 return state->node == NULL ||
1162 state->offset >= state->node->code.len;
1165 ### Find a known word
1167 As the known-word list is sorted we can use a simple binary search.
1168 Following the pattern established in "mdcode", we will use a `struct
1169 text` with start and length to represent the code fragment we are
1172 ###### internal functions
1173 static int find_known(struct token_config *conf, struct text txt)
1176 int hi = conf->known_count;
1178 while (lo + 1 < hi) {
1179 int mid = (lo + hi) / 2;
1180 int cmp = strncmp(conf->words_marks[mid],
1182 if (cmp == 0 && conf->words_marks[mid][txt.len])
1189 if (strncmp(conf->words_marks[lo],
1190 txt.txt, txt.len) == 0
1191 && conf->words_marks[lo][txt.len] == 0)
1197 ### Bringing it all together
1199 Now we have all the bits there is just one section missing: combining
1200 all the token parsing code into one block.
1202 The handling of delayed tokens (Newlines, INs, OUTs) must come
1203 first before we try getting another character.
1205 Then we parse all the text, making sure that we check for known marks
1206 before strings and comments, but unknown marks after strings and comments.
1208 This block of code will either return a token, or will choose to
1209 ignore one, in which case it will `continue` around to the top of the
1215 ch = get_char(state);
1224 As well as getting tokens, we need to be able to create the
1225 `token_state` to start with, and discard it later.
1230 ###### main functions
1231 struct token_state *token_open(struct code_node *code, struct
1234 struct token_state *state = malloc(sizeof(*state));
1235 memset(state, 0, sizeof(*state));
1237 state->line = code->line_no;
1239 state->col = state->node->needs_strip;
1240 state->strip_offset = state->offset;
1244 void token_close(struct token_state *state)
1249 ###### exported functions
1250 struct token_state *token_open(struct code_node *code, struct
1251 token_config *conf);
1252 void token_close(struct token_state *state);
1256 Getting tokens is the main thing but it is also useful to be able to
1257 print out token information, particularly for tracing and testing.
1259 Known tokens are printed verbatim. Other tokens are printed as
1260 `type(content)` where content is truncated to a given number of characters.
1262 The function for printing a truncated string (`text_dump`) is also exported
1263 so that it can be used to tracing processed strings too.
1268 ###### exported functions
1269 void token_trace(FILE *f, struct token tok, int max);
1270 void text_dump(FILE *f, struct text t, int max);
1272 ###### main functions
1274 void text_dump(FILE *f, struct text txt, int max)
1281 for (i = 0; i < max; i++) {
1282 char c = txt.txt[i];
1283 if (c < ' ' || c > '~')
1284 fprintf(f, "\\x%02x", c & 0xff);
1288 fprintf(f, "%c", c);
1294 void token_trace(FILE *f, struct token tok, int max)
1296 static char *types[] = {
1297 [TK_ident] = "ident",
1299 [TK_number] = "number",
1300 [TK_string] = "string",
1301 [TK_multi_string] = "mstring",
1302 [TK_line_comment] = "lcomment",
1303 [TK_block_comment] = "bcomment",
1306 [TK_newline] = "newline",
1308 [TK_error] = "ERROR",
1312 default: /* known word or mark */
1313 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1319 /* No token text included */
1320 fprintf(f, "%s()", types[tok.num]);
1326 case TK_multi_string:
1327 case TK_line_comment:
1328 case TK_block_comment:
1330 fprintf(f, "%s(", types[tok.num]);
1331 text_dump(f, tok.txt, max);
1337 ### And there we have it
1339 We now have all the library functions defined for reading and printing
1340 tokens. Now we just need C files to store them, and a mk file to make them.
1342 ###### File: scanner.h
1344 ## exported functions
1346 ###### File: libscanner.c
1348 #include "scanner.h"
1350 ## internal functions
1353 ###### File: scanner.mk
1357 scanner.mk scanner.h libscanner.c : scanner.mdc
1360 libscanner.o : libscanner.c
1361 $(CC) $(CFLAGS) -c libscanner.c
1363 ## Processing numbers
1365 Converting a `TK_number` token to a numerical value is a slightly
1366 higher level task than lexical analysis, and slightly lower than
1367 grammar parsing, so put it here - as an appendix if you like.
1369 Importantly it will be used by the same testing rig that is used for
1370 testing the token scanner.
1372 The numeric value that we will convert all numbers into is the `mpq_t`
1373 from the GNU high precision number library "libgmp".
1375 ###### number includes
1379 Firstly we need to be able to parse a string of digits in a given base
1380 and possibly with a decimal marker. We store this in an `mpz_t`
1381 integer and report the number of digits after the decimal mark.
1383 On error we return zero and ensure that the `mpz_t` has been freed, or
1384 had never been initialised.
1386 ###### number functions
1388 static int parse_digits(mpz_t num, struct text tok, int base,
1391 /* Accept digits up to 'base', ignore '_' and
1392 * (for base 10) ' ' if they appear between two
1393 * legal digits, and if `placesp` is not NULL,
1394 * allow a single '.' or ',' and report the number
1395 * of digits beyond there.
1396 * Return number of characters processed (p),
1397 * or 0 if something illegal was found.
1400 int decimal = -1; // digits after marker
1401 enum {Digit, Space, Other} prev = Other;
1404 for (p = 0; p < tok.len; p++) {
1406 char c = tok.txt[p];
1408 if (c == '_' || (c == ' ' && base == 10)) {
1414 if (c == '.' || c == ',') {
1417 if (!placesp || decimal >= 0)
1425 else if (isupper(c))
1427 else if (islower(c))
1438 mpz_mul_ui(num, num, base);
1442 mpz_add_ui(num, num, dig);
1461 ###### number includes
1464 To parse a full number we need to consider the optional base, the
1465 mantissa, and the optional exponent. We will treat these one at a
1468 The base is indicated by a letter after a leading zero, which must be
1469 followed by a base letter or a period. The base also determines the
1470 character which will mark an exponent.
1478 if (tok.txt[0] == '0' && tok.len > 1) {
1480 switch(tok.txt[1]) {
1511 // another digit is not permitted
1515 // must be decimal marker or trailing
1516 // letter, which are OK;
1523 After the base is the mantissa, which may contain a decimal mark, so
1524 we need to record the number of places. We won't impose the number of
1525 places until we have the exponent as well.
1532 ###### parse mantissa
1534 d = parse_digits(mant, tok, base, &places);
1540 mpq_set_z(num, mant);
1543 After the mantissa number may come an exponent which may be positive
1544 or negative. We assume at this point that we have seen the exponent
1552 ###### parse exponent
1554 if (tok.txt[0] == '+') {
1557 } else if (tok.txt[0] == '-') {
1563 d = parse_digits(exp, tok, 10, NULL);
1568 if (!mpz_fits_slong_p(exp)) {
1573 lexp = mpz_get_si(exp) * esign;
1578 Now that we have the mantissa and the exponent we can multiply them
1579 together, also allowing for the number of digits after the decimal
1582 For base 10, we simply subtract the decimal places from the exponent.
1583 For the other bases, as the exponent is always based on 2, even for
1584 octal and hex, we need a bit more detail.
1585 We then recover the sign from the exponent, as division is quite
1586 different from multiplication.
1588 ###### calc exponent
1607 Imposing the exponent on the number is also very different for base 10
1608 than for the others. For the binary shift `gmp` provides a simple
1609 function. For base 10 we use something like Russian Peasant
1612 ###### calc exponent
1616 mpq_set_ui(tens, 10, 1);
1620 mpq_mul(num, num, tens);
1622 mpq_div(num, num, tens);
1627 mpq_mul(tens, tens, tens);
1632 mpq_mul_2exp(num, num, lexp);
1634 mpq_div_2exp(num, num, lexp);
1637 Now we are ready to parse a number: the base, mantissa, and exponent.
1638 If all goes well we check for the possible trailing letters and
1639 return. Return value is 1 for success and 0 for failure.
1641 ###### number functions
1642 int number_parse(mpq_t num, char tail[3], struct text tok)
1649 if (tok.len > 1 && (tok.txt[0] == expc ||
1650 tok.txt[0] == toupper(expc))) {
1657 for (i = 0; i < 2; i++) {
1660 if (!isalpha(tok.txt[i]))
1662 tail[i] = tok.txt[i];
1672 Number parsing goes in `libnumber.c`
1674 ###### File: libnumber.c
1682 ###### File: parse_number.h
1683 int number_parse(mpq_t num, char tail[3], struct text tok);
1685 ###### File: scanner.mk
1687 libnumber.o : libnumber.c
1688 $(CC) $(CFLAGS) -c libnumber.c
1690 ## Processing strings
1692 Both `TK_string` and `TK_multi_string` require post-processing which
1693 can be one of two types: literal or with escapes processed.
1694 Even literal processing is non-trivial as the file may contain indents
1695 which need to be stripped.
1697 Errors can only occur when processing escapes. Any unrecognised
1698 character following the escape character will cause an error.
1700 Processing escapes and stripping indents can only make the string
1701 shorter, not longer, so we allocate a buffer which is the same size as
1702 the string and process into that.
1704 To request escape processing, we pass the character we want to use for
1705 quoting, usually '`\`'. To avoid escape processing we pass a zero.
1708 int string_parse(struct token *tok, char escape,
1709 struct text *str, char tail[3])
1712 struct text t = tok->txt;
1716 if (tok->num == TK_string) {
1721 str->txt = malloc(t.len);
1734 The tail of the string can be 0, 1, or 2 letters
1737 if (i > 0 && isalpha(t.txt[i-1]))
1739 if (i > 0 && isalpha(t.txt[i-1]))
1741 strncpy(tail, t.txt+i, t.len-i);
1750 Stripping the quotes off a single-line string is trivial.
1751 The only part that is at all interesting is that the quote character must
1755 if (t.txt[t.len-1] != quote)
1765 For a multi-line string we have a little more work to do. We need to
1766 remove 3 quotes, not 1, and need to count the indent of the close
1767 quote as it will need to be stripped from all lines.
1771 t.txt[1] != quote || t.txt[2] != quote ||
1772 !is_newline(t.txt[3]))
1777 if (i <= 0 || t.txt[i-1] != quote)
1780 if (i <= 0 || t.txt[i-1] != quote)
1783 if (i <= 0 || t.txt[i-1] != quote)
1787 while (i > 0 && !is_newline(t.txt[i-1]))
1791 if (t.txt[i] == ' ')
1793 if (t.txt[i] == '\t')
1794 indent = indent_tab(indent);
1803 Now we just take one byte at a time. trans-ASCII unicode won't look
1804 like anything we are interested in so it will just be copied byte by
1809 for (i = 0; i < t.len; i++) {
1823 } else if (i+1 >= t.len) {
1824 // escape and end of string
1832 str->len = cp - str->txt;
1840 Every time we find a start of line, we strip spaces and tabs until the
1841 required indent is found.
1844 while (i < t.len && skipped < indent) {
1849 skipped = indent_tab(skipped);
1858 *cp++ = '\n'; break;
1860 *cp++ = '\r'; break;
1862 *cp++ = '\t'; break;
1864 *cp++ = '\b'; break;
1866 *cp++ = quote; break;
1868 *cp++ = '\f'; break;
1870 *cp++ = '\v'; break;
1872 *cp++ = '\a'; break;
1877 // 3 digit octal number - all three digits must be in '0'..'7'
1880 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1881 t.txt[i+2] < '0' || t.txt[i+2] > '7')
1883 n = (t.txt[i ]-'0') * 64 +
1884 (t.txt[i+1]-'0') * 8 +
1885 (t.txt[i+2]-'0') * 1;
1891 n = take_hex(2, t.txt+i+1, t.len-i-1);
1899 // 4 or 8 hex digits for unicode
1900 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1903 memset(&pstate, 0, sizeof(pstate));
1904 n = wcrtomb(cp, n, &pstate);
1908 i += c == 'u' ? 4 : 8;
1913 else if (is_newline(c))
1923 For `\x` `\u` and `\U` we need to collect a specific number of
1926 ###### string functions
1928 static long take_hex(int digits, char *cp, int l)
1940 else if (isupper(c))
1951 #### File: libstring.c
1953 String parsing goes in `libstring.c`
1962 #include "scanner.h"
1966 ###### File: parse_string.h
1967 int string_parse(struct token *tok, char escape,
1968 struct text *str, char tail[3]);
1970 ###### File: scanner.mk
1972 libstring.o : libstring.c
1973 $(CC) $(CFLAGS) -c libstring.c
1977 As "untested code is buggy code" we need a program to easily test
1978 the scanner library. This will simply parse a given file and report
1979 the tokens one per line.
1981 ###### File: scanner.c
1987 #include <sys/mman.h>
1994 #include "scanner.h"
1995 #include "parse_number.h"
1996 #include "parse_string.h"
1999 static void pr_err(char *msg)
2002 fprintf(stderr, "%s\n", msg);
2005 static int kcmp(const void *ap, const void *bp)
2007 char * const *a = ap;
2008 char * const *b = bp;
2009 return strcmp(*a, *b);
2012 int main(int argc, char *argv[])
2017 char *filename = NULL;
2018 struct token_state *state;
2019 const char *known[] = {
2028 struct token_config conf = {
2031 .words_marks = known,
2032 .number_chars = "., _+-",
2033 .known_count = sizeof(known)/sizeof(known[0]),
2036 static const struct option long_options[] = {
2037 { "word-start", 1, NULL, 'W'},
2038 { "word-cont", 1, NULL, 'w'},
2039 { "number-chars", 1, NULL, 'n'},
2040 { "ignore-numbers", 0, NULL, 'N'},
2041 { "ignore-ident", 0, NULL, 'I'},
2042 { "ignore-marks", 0, NULL, 'M'},
2043 { "ignore-strings", 0, NULL, 'S'},
2044 { "ignore-multi-strings",0, NULL, 'z'},
2045 { "ignore-line-comment",0, NULL, 'c'},
2046 { "ignore-newline", 0, NULL, 'l'},
2047 { "ignore-block-comment", 0, NULL, 'C'},
2048 { "ignore-indent", 0, NULL, 'i'},
2049 { "return-comments", 0, NULL, 'r'},
2050 { "file", 1, NULL, 'f'},
2051 { "section", 1, NULL, 's'},
2052 { NULL, 0, NULL, 0},
2054 static const char options[] = "W:w:n:NIMSzclCirf:s:";
2056 struct section *table, *s, *prev;
2058 char *section_name = NULL;
2059 int section_found = 0;
2061 setlocale(LC_ALL,"");
2062 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2065 case 'W': conf.word_start = optarg; break;
2066 case 'w': conf.word_cont = optarg; break;
2067 case 'n': conf.number_chars = optarg; break;
2068 case 'N': conf.ignored |= 1 << TK_number; break;
2069 case 'I': conf.ignored |= 1 << TK_ident; break;
2070 case 'M': conf.ignored |= 1 << TK_mark; break;
2071 case 'S': conf.ignored |= 1 << TK_string; break;
2072 case 'z': conf.ignored |= 1 << TK_multi_string; break;
2073 case 'c': conf.ignored |= 1 << TK_line_comment; break;
2074 case 'C': conf.ignored |= 1 << TK_block_comment; break;
2075 case 'l': conf.ignored |= 1 << TK_newline; break;
2076 case 'i': conf.ignored |= 1 << TK_in; break;
2077 case 'r': conf.return_comments = 1; break;
2078 case 'f': filename = optarg; break;
2079 case 's': section_name = optarg; break;
2080 default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2086 if (optind < argc) {
2087 const char **wm = calloc(argc - optind, sizeof(char*));
2089 for (i = optind; i < argc; i++)
2090 wm[i - optind] = argv[i];
2091 qsort(wm, argc-optind, sizeof(char*), kcmp);
2092 conf.words_marks = wm;
2093 conf.known_count = argc - optind;
2097 fd = open(filename, O_RDONLY);
2101 fprintf(stderr, "scanner: cannot open %s: %s\n",
2102 filename, strerror(errno));
2105 len = lseek(fd, 0, 2);
2107 fprintf(stderr,"scanner: %s is empty or not seekable\n",
2108 filename ?: "stdin");
2111 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2112 table = code_extract(file, file+len, pr_err);
2115 (code_free(s->code), prev = s, s = s->next, free(prev))) {
2117 (s->section.len != strlen(section_name) ||
2118 strncmp(s->section.txt, section_name, s->section.len) != 0))
2122 printf("Tokenizing: %.*s\n", s->section.len,
2124 state = token_open(s->code, &conf);
2126 struct token tk = token_next(state);
2127 printf("%d:%d ", tk.line, tk.col);
2128 token_trace(stdout, tk, 20);
2129 if (tk.num == TK_number) {
2132 if (number_parse(num, tail,tk.txt)) {
2133 printf(" %s ", tail);
2134 mpq_out_str(stdout, 10, num);
2137 printf(" BAD NUMBER");
2139 if (tk.num == TK_string ||
2140 tk.num == TK_multi_string) {
2144 if (tk.txt.txt[0] == '`')
2146 if (string_parse(&tk, esc,
2148 printf(" %s ", tail);
2149 text_dump(stdout, str, 20);
2152 printf(" BAD STRING");
2155 if (tk.num == TK_error)
2157 if (tk.num == TK_eof)
2162 if (conf.words_marks != known)
2163 free(conf.words_marks);
2164 if (section_name && !section_found) {
2165 fprintf(stderr, "scanner: section %s not found\n", section_name);
2170 ###### File: scanner.mk
2171 scanner.c : scanner.mdc
2174 scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2175 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2176 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2177 scanner.o : scanner.c
2178 $(CC) $(CFLAGS) -c scanner.c