5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters are assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ##
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, as typically comments
73 would be. The exact consequence of ignoring each token type varies
78 int ignored; // bit set of ignored tokens.
79 ## token config parameters
83 struct token_config *conf;
85 ###### token_next init
86 int ignored = state->conf->ignored;
88 The different tokens are numbers, words, marks, strings, comments,
89 newlines, EOF, and indents, each of which is examined in detail below.
91 There are various cases where no token can be found in part of the
92 input. All of these will be reported as a `TK_error` token.
94 It is possible to declare a number of strings which form distinct
95 tokens (rather than being grouped as e.g. 'word'). These are given
96 token numbers from `TK_reserved` upwards.
107 Numbers are the messiest tokens to parse, primarily because they can
108 contain characters that also have meaning outside of numbers and,
109 particularly, immediately after numbers.
111 The obvious example is the '`-`' sign. It can come inside a number for
112 a negative exponent, or after a number as a subtraction operator. To
113 be sure we have parsed as best as possible we need to only allow the
114 '`-`' inside a number if it is after an exponent character. This can be
115 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
116 digit, so we don't allow '`-`' after just any `e`.
118 To make matters worse, our language designer has decided to experiment
119 with allowing commas to be used as the decimal indicator, and spaces
120 to be used to separate groups of digits in large numbers. Both of
121 these can reasonably be restricted to appear between two digits, so we
122 have to add that condition to our tests.
124 So we cannot just treat numbers as starting with a digit and being
125 followed by some set of characters. We need more structure than that.
129 - Numbers must start with a digit.
130 - If the first digit is zero, the next character must be a base
131 signifier (one of `xob`) or a decimal marker (`.` or `,`).
132 In the first case the first `p` or `P` may be followed by a sign.
133 - If the number doesn't start with `0` followed by one of `xob`, the
134 first `e` may be followed by a sign.
135 - Any digit or hex digit may be followed by a space or underscore
136 providing that the subsequent character is also a (hex) digit.
137 This rule will require an extra level of 'unget' to be
138 supported when handling characters.
139 - Otherwise any digits or ASCII letters are allowed. We do not at
140 this point check that the digits given are permitted by the base.
141 That will happen when the token is converted to a number.
143 To allow easy configuration, the various non alphanumeric characters
144 are only permitted if they are listed in a configuration parameter.
146 ###### token config parameters
149 Note that numbers may not start with a period, so `.75` is not a
150 number. This is not the norm, but is not unheard of. Excluding these
151 numbers simplifies the rule at very little cost.
156 If TK_number is ignored, digits will result in an error unless they
157 are declared to be a start character for words.
165 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
166 int prev_special = 0;
168 int decimal_mark = 0;
170 wchar_t ch2 = get_char(state);
171 if (strchr("xobXOB", ch2) != NULL)
179 if (ch == 'e' || ch == 'E')
183 if (ch == 'p' || ch == 'P')
187 save_unget_state(state);
188 ch = get_char(state);
193 if (ch == '+' || ch == '-') {
198 if (ch == '.' || ch == ',') {
204 /* Don't allow that special char,
207 restore_unget_state(state);
210 if (strchr(state->conf->number_chars, ch)) {
214 /* non-number char */
217 /* We seem to have a "number" token */
219 close_token(state, &tk);
225 Words start with a "start" character followed by the longest
226 sequence of "continue" characters. The Unicode ID_START and
227 ID_CONTINUE sets are always permitted, but other ASCII characters
228 can be added to these sets.
230 ###### token config parameters
234 ###### internal functions
235 static int is_word_start(wchar_t ch, struct token_config *conf)
237 return iswalpha(ch) ||
238 strchr(conf->word_start, ch) != NULL ||
239 u_hasBinaryProperty(ch, UCHAR_ID_START);
242 static int is_word_continue(wchar_t ch, struct token_config *conf)
244 return iswalnum(ch) ||
245 strchr(conf->word_cont, ch) != NULL ||
246 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
249 Words can be either known or unknown. Known words are referred to as
250 "reserved words" and get a unique token number. Unknown words are
251 "identifiers" and are syntactically a single token.
256 A list of known words must be provided. This list is shared with the
257 "marks" which are described next. The list must be lexically sorted
258 and the length of the list must be given (`known_count`).
259 Tokens matching these known words are reported as the index of the
260 list added to `TK_reserved`.
262 If identifiers are ignored, then any word which is not listed as a
263 known word results in an error.
265 ###### token config parameters
266 const char **words_marks;
271 if (is_word_start(ch, state->conf)) {
273 /* A word: identifier or reserved */
275 ch = get_char(state);
276 while (is_word_continue(ch, state->conf));
278 close_token(state, &tk);
280 if (ignored & (1<<TK_ident))
282 n = find_known(state->conf, tk.txt);
284 tk.num = TK_reserved + n;
290 Marks are generally one or more punctuation marks joined together. It
291 would be nice to use the term "symbol" for these, but that causes
292 confusion in a subsequent discussion of the grammar, which has terminal
293 symbols and non-terminal symbols which are conceptually quite
294 different. So strings of punctuation characters will be marks.
296 A "mark" consists of ASCII characters that are not white space, are not
297 "start" characters for words, and are not digits.
298 These will collectively be called mark characters.
300 ###### internal functions
301 static int is_mark(wchar_t ch, struct token_config *conf)
306 strchr(conf->word_start, ch) == NULL;
309 As with words, there can be known and unknown marks, though the rules
310 are slightly different.
312 Two marks do not need to be separated by a non-mark character. This
313 is different from words which do need to be separated by at least one
314 non-continue character.
316 The scanner will normally prefer longer sequences of mark characters,
317 but will more strongly prefer known marks over unknown marks. So if
318 it finds a known mark where adding one more character does not result
319 in a known mark, it will return that first known mark.
321 If no known mark is found we will test against strings and comments
322 below before giving up and assuming an unknown mark.
324 If an unknown mark contains a quote character or a comment marker, and
325 that token is not being ignored, then we terminate the unknown mark
326 before that quote or comment. This ensures that an unknown mark
327 immediately before a string is handled correctly.
329 If the first character of a comment marker (i.e. '/') is a known mark,
330 the above rules would suggest that the start of a comment would be
331 parsed as that mark, which is not what is wanted. So the introductory
332 sequences for a comment ("//" and "/*") are treated as
333 partially-known. They prevent the leading "/" from being a mark by
334 itself, but do not actually constitute a stand-alone mark.
336 If `TK_mark` is ignored, then unknown marks are returned as errors.
341 Known marks are included in the same list as the list of known words.
345 while (is_mark(ch, state->conf)) {
348 close_token(state, &tk);
349 n = find_known(state->conf, tk.txt);
351 tk.num = TK_reserved + n;
352 else if (tk.num != TK_error) {
353 /* found a longest-known-mark, still need to
356 if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
357 (ch == '/' || ch == '*')) {
358 /* Yes, this is a comment, not a '/' */
359 restore_unget_state(state);
364 close_token(state, &tk);
368 save_unget_state(state);
369 ch = get_char(state);
370 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
371 /* If strings are allowed, a quote (which isn't a known mark)
372 * mustn't be treated as part of an unknown mark. It can be
373 * part of a multi-line string though.
376 if (prev == '#' && n < 0)
377 /* '#' is not a known mark, so assume it is a comment */
379 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
380 close_token(state, &tk);
381 restore_unget_state(state);
384 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
385 close_token(state, &tk);
386 restore_unget_state(state);
391 if (tk.num != TK_error) {
392 close_token(state, &tk);
396 If we don't find a known mark, we will check for strings and comments
397 before assuming that we have an unknown mark
406 Strings start with one of single quote, double quote, or back quote
407 and continue until a matching character on the same line. Any of
408 these characters can be included in the list of known marks and then
409 they will not be used for identifying strings.
411 Immediately following the close quote, one or two ASCII letters may
412 appear. These are somewhat like the arbitrary letters allowed in
413 "Numbers" above. They can be used by the language in various ways.
415 If 3 identical quote characters appear in a row and are
416 followed by a newline, then this forms a multi-line string which
417 continues until an identical triple quote appears on a line preceded
418 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
420 Multi-line strings may not extend beyond the end of the `code_node` in
423 Normal strings and multi-line strings are encoded as two different
430 ###### internal functions
431 static int is_quote(wchar_t ch)
433 return ch == '\'' || ch == '"' || ch == '`'; // "
436 #### Multi-line strings
438 The multi-line string is checked for first. If they are being
439 ignored, we fall through and treat a triple quote as an empty string
440 followed by the start of a new string.
443 if (tk.txt.len == 3 &&
444 !(ignored & (1 << TK_multi_string)) &&
445 is_quote(tk.txt.txt[0]) &&
446 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
447 is_newline(tk.txt.txt[3])) {
449 wchar_t first = tk.txt.txt[0];
452 while (!at_eon(state) && qseen < 3) {
453 ch = get_char(state);
454 if (is_newline(ch)) {
457 } else if (at_sol && ch == first) {
459 } else if (ch != ' ' && ch != '\t') {
465 /* Hit end of node - error.
466 * unget so the newline is seen,
467 * but return rest of string as an error.
471 close_token(state, &tk);
475 /* 2 letters are allowed */
476 ch = get_char(state);
478 ch = get_char(state);
480 ch = get_char(state);
481 /* Now we must have a newline, but we don't return it
484 close_token(state, &tk);
485 tk.num = TK_multi_string;
491 #### Single-line strings
493 The sequence of marks collected may be more than a single-line
494 string, so we reset to the start and collect characters until
495 we find a close quote or a newline.
497 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
500 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
501 !(ignored & (1<<TK_string))) {
502 wchar_t first = tk.txt.txt[0];
503 reset_token(state, &tk);
504 ch = get_char(state);
506 while (!at_eon(state) && !is_newline(ch)) {
507 ch = get_char(state);
512 if (is_newline(ch)) {
517 while (!at_eon(state) && (ch = get_char(state)) &&
521 close_token(state, &tk);
527 Single line comments may start with '`//`' or '`#`' providing that these
528 are not known marks. They continue to the end of the line.
530 Block comments start with '`/*`' if this is not a known mark. They
531 continue to the first occurrence of '`*/`' and may not contain any
532 occurrence of '`/*`'.
534 Block comments can be wholly within one line or can continue over
535 multiple lines. The multi-line version should be followed immediately
536 by a newline. The Linux kernel contains over 285000 multi-line
537 comments and only 34 are followed by characters other than white space
538 (which should be removed) or a backslash (only needed in macros). So
539 it would not suffer from this rule.
541 These two comment types are reported as two separate token types, and
542 consequently can be ignored separately. When ignored a comment is
543 still parsed, but is discarded.
549 ###### internal functions
550 static int is_line_comment(struct text txt)
552 return (txt.len >= 1 && txt.txt[0] == '#') ||
553 (txt.len >= 2 && txt.txt[0] == '/' &&
557 static int is_block_comment(struct text txt)
559 return txt.len >= 2 && txt.txt[0] == '/' &&
563 #### Single line comments
565 A single-line comment continues up to, but not including the newline
570 if (is_line_comment(tk.txt)) {
571 while (!is_newline(ch) && !at_eon(state))
572 ch = get_char(state);
575 close_token(state, &tk);
576 tk.num = TK_line_comment;
577 if (ignored & (1 << TK_line_comment))
584 The token text collected so far could exceed the comment, so we need
587 If we find an embedded `/*` we reset to just before the '/' and report
588 an error. That way the next thing to be parsed will be the rest of
589 the comment. This requires a double unget, so we need to save/restore
590 the unget state (explained later).
594 if (is_block_comment(tk.txt)) {
597 reset_token(state, &tk);
600 save_unget_state(state);
601 ch = get_char(state);
603 while (!at_eon(state) &&
604 (prev != '/' || ch != '*') &&
605 (prev != '*' || ch != '/')) {
609 save_unget_state(state);
610 ch = get_char(state);
612 close_token(state, &tk);
618 /* embedded. Need to unget twice! */
619 restore_unget_state(state);
624 tk.num = TK_block_comment;
625 if (newlines && !(ignored & (1<<TK_newline))) {
626 /* next char must be newline */
627 ch = get_char(state);
632 if (tk.num == TK_error ||
633 !(ignored & (1 << TK_block_comment)))
638 ### Indents, Newlines, and White Space.
640 Normally white space is ignored. However newlines can be important as
641 can indents, which are either after a newline or at the start of a
642 node (detected by `at_son()`).
644 ###### exported functions
645 static inline int is_newline(wchar_t ch)
647 return ch == '\n' || ch == '\f' || ch == '\v';
651 if (ch <= ' ' && !is_newline(ch)
655 If a line starts with more white-space than the previous non-blank
656 line - or if the first non-blank line in the document starts with any
657 white-space - then an "IN" is reported at the start of the line.
659 Before the next non-blank line which starts with less white space, or
660 at the latest at the end of the document, a matching "OUT" token
661 is reported. There will always be an exact match between "IN" and
664 It is possible for "OUT" to be followed (almost) immediately by an
665 "IN". This happens if, for example, the indent of three consecutive
666 lines are 0, 8, 4 spaces. Before the second line we report an
667 "IN". Before the third line we must report an "OUT", as 4 is less
668 than 8, then also an IN as 4 is greater than 0.
674 For the purpose of measuring the length of white space, a tab adds at
675 least one space, and rounds up to a multiple of 8.
677 ###### exported functions
678 static inline int indent_tab(int indent)
683 We need to track the current levels of indent. This requires some
684 sort of stack as indent levels are pushed on and popped off. In
685 practice this stack is unlikely to often exceed 5 so we will use a
686 fixed stack of 20 indent levels. More than this will be silently
691 int indent_sizes[20];
695 Newlines can optionally be reported. Newlines within a block comment
696 or a multi-line string are not reported separately, but each of these
697 must be followed immediately by a newline so these constructs cannot
698 hide the fact that a newline was present.
700 When indents are being reported, the Newline which would normally be
701 reported immediately before the "IN" is delayed until after the
702 matching "OUT". This makes an indented section act like a
703 continuation of the previous line to some extent.
705 A blank line would normally be reported simply as two consecutive Newline
706 tokens. However if the subsequent line is indented (and indents are being
707 reported) then the right thing to do is less obvious as Newlines should be
708 delayed - but how many Newlines?
710 The approach we will take is to report the extra Newlines immediately after
711 the IN token, so the blank line is treated as though it were an indented
717 If we find a newline or white space at the start of a block, we keep
718 collecting spaces, tabs, and newlines until we find some real text.
719 Then depending on the indent we generate some number of tokens. These
720 will be a sequence of "Newline OUT" pairs representing a decrease
721 in indent, then either a Newline or an IN depending on whether the
722 next line is indented, then zero or more Newlines representing all the
723 blank lines that have been skipped.
725 When a Newline leads to the next block of code there is a question of
726 whether the various Newline and OUT/IN tokens should appear to
727 belong to the earlier or later block. This is addressed by processing
728 the tokens in two stages based on the relative indent levels of the
729 two blocks (each block has a base indent to which the actual indents
732 Any "Newline OUT" pairs needed to reduce the current indent to the
733 maximum of the base indents of the old and new blocks are generated
734 against the old block. Then if the next block does not have an
735 increased indent, one more "Newline" is generated.
737 If further "Newline OUT" pairs are needed to get to the indent
738 level of the 'next' block, they are generated against that block,
739 though the first Newline is suppressed (it having already been
742 Finally the Newline or IN for the first line of the new block is
743 generated, unless the Newline needs to be suppressed because it
744 appeared at the end of the previous block.
746 This means that a block may start with an OUT or an IN, but
747 will only start with a Newline if it actually starts with a blank
750 We will need to represent in the `token_state` where in this sequence
751 of delayed tokens we are. As `state.col` records the target indent we
752 don't need to record how many OUTs or INs are needed. We do
753 need to record the number of blank lines, and which of Newline and
754 OUT is needed next in the initial sequence of pairs.
756 For this we store one more than the number of blank lines as
757 `delayed_lines` and a flag for `out_next`.
764 Generating these tokens involve two separate pieces of code.
766 Firstly we need to recognise white space and count the indents and
767 newlines. These are recorded in the above state fields.
769 Separately, on each call to `token_next`, we need to check if
770 there are some delayed tokens and if so we need to advance the state
771 information and return one token.
774 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
776 int was_son = at_son(state);
777 if (ignored & (1<<TK_in)) {
780 if (ignored & (1<<TK_newline))
783 close_token(state, &tk);
786 // Indents are needed, so check all white space.
787 while (ch <= ' ' && !at_eon(state)) {
790 ch = get_char(state);
794 if (state->node->next &&
795 state->node->next->indent > state->node->indent)
796 state->col = state->node->next->indent;
798 state->col = state->node->indent;
801 state->delayed_lines = newlines;
802 state->out_next = was_son;
803 state->check_indent = 1;
807 ###### delayed tokens
809 if (state->check_indent || state->delayed_lines) {
810 if (state->col < state->indent_sizes[state->indent_level]) {
811 if (!state->out_next &&
812 !(ignored & (1<<TK_newline))) {
817 state->indent_level -= 1;
822 if (state->col > state->indent_sizes[state->indent_level] &&
823 state->indent_level < sizeof(state->indent_sizes)-1) {
824 state->indent_level += 1;
825 state->indent_sizes[state->indent_level] = state->col;
826 state->delayed_lines -= 1;
830 state->check_indent = 0;
831 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
833 state->delayed_lines -= 1;
836 state->delayed_lines = 0;
842 After the last newline in the file has been processed, a special
843 end-of-file token will be returned. Any further attempts to get more
844 tokens will continue to return the same end-of-file token.
853 state->check_indent = 1;
860 ### Unknown Marks, or errors.
862 We have now handled all the possible known mark-like tokens.
863 If the token we have is not empty and `TK_mark` is allowed,
864 we have an unknown mark, otherwise this must be an error.
868 /* one unknown mark character */
870 close_token(state, &tk);
871 if (ignored & (1<<TK_mark))
877 /* Completely unrecognised character is next, possibly
878 * a digit and we are ignoring numbers.
879 * What ever it is, make it an error.
882 close_token(state, &tk);
886 ## Tools For The Task
888 You may have noticed that there are a few gaps we left in the above -
889 functions used without first defining them. Doing so above would have
892 ### Character by character
894 As we walk through the various `code_node`s we need to process whole
895 Unicode codepoints, and keep track of which line and column we are on.
896 We will assume for now that any printing character uses one column,
897 though that is not true in general.
899 As the text in a `code_node` may include an indent that identifies it as
900 being code, we need to be careful to strip that. The `code_node` has
901 a flag that tells us whether or not we need to strip.
907 struct code_node *node;
912 ###### internal functions
914 static int do_strip(struct token_state *state)
917 if (state->node->needs_strip) {
919 while (n && state->node->code.txt[state->offset] == ' ') {
924 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
925 indent = indent_tab(indent);
933 static wint_t get_char(struct token_state *state)
939 if (state->node == NULL)
941 if (state->node->code.len <= state->offset) {
943 state->node = state->node->next;
944 while (state->node && state->node->code.txt == NULL);
946 if (state->node == NULL)
948 state->line = state->node->line_no;
949 state->col = do_strip(state);
954 memset(&mbstate, 0, sizeof(mbstate));
956 n = mbrtowc(&next, state->node->code.txt + state->offset,
957 state->node->code.len - state->offset,
959 if (n == -2 || n == 0) {
960 /* Not enough bytes - not really possible */
962 state->offset = state->node->code.len;
963 } else if (n == -1) {
966 next = 0x7f; // an illegal character
972 } else if (is_newline(next)) {
974 state->col = do_strip(state);
975 } else if (next == '\t') {
976 state->col = indent_tab(state->col);
981 We will sometimes want to "unget" the last character as it needs to be
982 considered again as part of the next token. So we need to store a
983 'previous' version of all metadata.
990 ###### before get_char
991 state->prev_offset = state->offset;
992 state->prev_line = state->line;
993 state->prev_col = state->col;
995 ###### internal functions
997 static void unget_char(struct token_state *state)
1000 state->offset = state->prev_offset;
1001 state->line = state->prev_line;
1002 state->col = state->prev_col;
1006 We occasionally need a double-unget, particularly for numbers and
1007 block comments. We don't impose this cost on all scanning, but
1008 require those code sections that need it to call `save_unget_state`
1009 before each `get_char`, and then `restore_unget_state` when a
1010 double-unget is needed.
1017 ###### internal functions
1018 static void save_unget_state(struct token_state *state)
1020 state->prev_offset2 = state->prev_offset;
1021 state->prev_line2 = state->prev_line;
1022 state->prev_col2 = state->prev_col;
1025 static void restore_unget_state(struct token_state *state)
1027 state->prev_offset = state->prev_offset2;
1028 state->prev_line = state->prev_line2;
1029 state->prev_col = state->prev_col2;
1032 At the start of a token we don't want to be at the end of a code block
1033 if we can help it. To avoid this possibility, we 'get' and 'unget' a
1034 single character. This will move into the next non-empty code block
1035 and leave the current pointer at the start of it.
1037 This has to happen _after_ dealing with delayed tokens as some of them
1038 must appear in the previous node. When we do this, we need to reset
1039 the data in the token.
1041 ###### delayed tokens
1042 if (at_eon(state)) {
1045 tk.node = state->node;
1047 tk.txt.txt = state->node->code.txt + state->offset;
1048 tk.line = state->line;
1049 tk.col = state->col;
1055 The current token is initialized to line up with the first character
1056 that we 'get' for each token. When we have, or might have, a full
1057 token we can call `close_token` to set the `len` of the token
1058 appropriately. This can safely be called multiple times.
1060 Finally we occasionally (for single-line strings and block comments)
1061 need to reset to the beginning of the current token as we might have
1062 parsed too much already. For that there is `reset_token`.
1065 tk.node = state->node;
1067 tk.txt.txt = state->node->code.txt + state->offset;
1068 tk.line = state->line;
1069 tk.col = state->col;
1072 ###### internal functions
1074 static void close_token(struct token_state *state,
1077 if (state->node != tk->node)
1078 tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1080 tk->txt.len = (state->node->code.txt + state->offset)
1084 static void reset_token(struct token_state *state, struct token *tok)
1086 state->prev_line = tok->line;
1087 state->prev_col = tok->col;
1088 state->prev_offset = tok->txt.txt - state->node->code.txt;
1093 Tokens may not cross into the next `code_node`, and some tokens can
1094 include the newline at the end of a `code_node`, so we must be able to
1095 easily check if we have reached the end. Equally we need to know if
1096 we are at the start of a node, as white space is treated a little
1099 ###### internal functions
1101 static int at_son(struct token_state *state)
1103 return state->offset == 0;
1106 static int at_eon(struct token_state *state)
1108 // at end-of-node ??
1109 return state->node == NULL ||
1110 state->offset >= state->node->code.len;
1113 ### Find a known word
1115 As the known-word list is sorted we can use a simple binary search.
1116 Following the pattern established in "mdcode", we will use a `struct
1117 text` with start and length to represent the code fragment we are
1120 ###### internal functions
1121 static int find_known(struct token_config *conf, struct text txt)
1124 int hi = conf->known_count;
1126 while (lo + 1 < hi) {
1127 int mid = (lo + hi) / 2;
1128 int cmp = strncmp(conf->words_marks[mid],
1130 if (cmp == 0 && conf->words_marks[mid][txt.len])
1137 if (strncmp(conf->words_marks[lo],
1138 txt.txt, txt.len) == 0
1139 && conf->words_marks[lo][txt.len] == 0)
1145 ### Bringing it all together
1147 Now we have all the bits there is just one section missing: combining
1148 all the token parsing code into one block.
1150 The handling of delayed tokens (Newlines, INs, OUTs) must come
1151 first before we try getting another character.
1153 Then we parse all the text, making sure that we check for known marks
1154 before strings and comments, but unknown marks after strings and comments.
1156 This block of code will either return a token, or will choose to
1157 ignore one, in which case it will `continue` around to the top of the
1163 ch = get_char(state);
1172 As well as getting tokens, we need to be able to create the
1173 `token_state` to start with, and discard it later.
1178 ###### main functions
1179 struct token_state *token_open(struct code_node *code, struct
1182 struct token_state *state = malloc(sizeof(*state));
1183 memset(state, 0, sizeof(*state));
1185 state->line = code->line_no;
1186 state->col = do_strip(state);
1190 void token_close(struct token_state *state)
1195 ###### exported functions
1196 struct token_state *token_open(struct code_node *code, struct
1197 token_config *conf);
1198 void token_close(struct token_state *state);
1202 Getting tokens is the main thing but it is also useful to be able to
1203 print out token information, particularly for tracing and testing.
1205 Known tokens are printed verbatim. Other tokens are printed as
1206 `type(content)` where content is truncated to a given number of characters.
1208 The function for printing a truncated string (`text_dump`) is also exported
1209 so that it can be used to tracing processed strings too.
1214 ###### exported functions
1215 void token_trace(FILE *f, struct token tok, int max);
1216 void text_dump(FILE *f, struct text t, int max);
1218 ###### main functions
1220 void text_dump(FILE *f, struct text txt, int max)
1227 for (i = 0; i < max; i++) {
1228 char c = txt.txt[i];
1229 if (c < ' ' || c > '~')
1230 fprintf(f, "\\x%02x", c & 0xff);
1234 fprintf(f, "%c", c);
1240 void token_trace(FILE *f, struct token tok, int max)
1242 static char *types[] = {
1243 [TK_ident] = "ident",
1245 [TK_number] = "number",
1246 [TK_string] = "string",
1247 [TK_multi_string] = "mstring",
1248 [TK_line_comment] = "lcomment",
1249 [TK_block_comment] = "bcomment",
1252 [TK_newline] = "newline",
1254 [TK_error] = "ERROR",
1258 default: /* known word or mark */
1259 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1265 /* No token text included */
1266 fprintf(f, "%s()", types[tok.num]);
1272 case TK_multi_string:
1273 case TK_line_comment:
1274 case TK_block_comment:
1276 fprintf(f, "%s(", types[tok.num]);
1277 text_dump(f, tok.txt, max);
1283 ### And there we have it
1285 We now have all the library functions defined for reading and printing
1286 tokens. Now we just need C files to store them, and a mk file to make them.
1288 ###### File: scanner.h
1290 ## exported functions
1292 ###### File: libscanner.c
1294 #include "scanner.h"
1296 ## internal functions
1299 ###### File: scanner.mk
1303 scanner.mk scanner.h libscanner.c : scanner.mdc
1306 libscanner.o : libscanner.c
1307 $(CC) $(CFLAGS) -c libscanner.c
1309 ## Processing numbers
1311 Converting a `TK_number` token to a numerical value is a slightly
1312 higher level task than lexical analysis, and slightly lower than
1313 grammar parsing, so put it here - as an index if you like.
1315 Importantly it will be used by the same testing rig that is used for
1316 testing the token scanner.
1318 The numeric value that we will convert all numbers into is the `mpq_t`
1319 from the GNU high precision number library "libgmp".
1321 ###### number includes
1325 Firstly we need to be able to parse a string of digits in a given base
1326 and possibly with a decimal marker. We store this in an `mpz_t`
1327 integer and report the number of digits after the decimal mark.
1329 On error we return zero and ensure that the `mpz_t` has been freed, or
1330 had never been initialised.
1332 ###### number functions
1334 static int parse_digits(mpz_t num, struct text tok, int base,
1337 /* Accept digits up to 'base', ignore '_' and
1338 * ' ' if they appear between two legal digits,
1339 * and if `placesp` is not NULL, allow a single
1340 * '.' or ',' and report the number of digits
1342 * Return number of characters processed (p),
1343 * or 0 if something illegal was found.
1346 int decimal = -1; // digits after marker
1347 enum {Digit, Space, Other} prev = Other;
1350 for (p = 0; p < tok.len; p++) {
1352 char c = tok.txt[p];
1354 if (c == '_' || c == ' ') {
1360 if (c == '.' || c == ',') {
1363 if (!placesp || decimal >= 0)
1371 else if (isupper(c))
1373 else if (islower(c))
1384 mpz_mul_ui(num, num, base);
1388 mpz_add_ui(num, num, dig);
1407 ###### number includes
1410 To parse a full number we need to consider the optional base, the
1411 mantissa, and the optional exponent. We will treat these one at a
1414 The base is indicated by a letter after a leading zero, which must be
1415 followed by a base letter or a period. The base also determines the
1416 character which will mark an exponent.
1424 if (tok.txt[0] == '0' && tok.len > 1) {
1426 switch(tok.txt[1]) {
1457 // another digit is not permitted
1461 // must be decimal marker or trailing
1462 // letter, which are OK;
1469 After the base is the mantissa, which may contain a decimal mark, so
1470 we need to record the number of places. We won't impose the number of
1471 places until we have the exponent as well.
1478 ###### parse mantissa
1480 d = parse_digits(mant, tok, base, &places);
1486 mpq_set_z(num, mant);
1489 After the mantissa number may come an exponent which may be positive
1490 or negative. We assume at this point that we have seen the exponent
1498 ###### parse exponent
1500 if (tok.txt[0] == '+') {
1503 } else if (tok.txt[0] == '-') {
1509 d = parse_digits(exp, tok, 10, NULL);
1514 if (!mpz_fits_slong_p(exp)) {
1519 lexp = mpz_get_si(exp) * esign;
1524 Now that we have the mantissa and the exponent we can multiply them
1525 together, also allowing for the number of digits after the decimal
1528 For base 10, we simply subtract the decimal places from the exponent.
1529 For the other bases, as the exponent is always based on 2, even for
1530 octal and hex, we need a bit more detail.
1531 We then recover the sign from the exponent, as division is quite
1532 different from multiplication.
1534 ###### calc exponent
1553 Imposing the exponent on the number is also very different for base 10
1554 than for the others. For the binary shift `gmp` provides a simple
1555 function. For base 10 we use something like Russian Peasant
1558 ###### calc exponent
1562 mpq_set_ui(tens, 10, 1);
1566 mpq_mul(num, num, tens);
1568 mpq_div(num, num, tens);
1573 mpq_mul(tens, tens, tens);
1578 mpq_mul_2exp(num, num, lexp);
1580 mpq_div_2exp(num, num, lexp);
1583 Now we are ready to parse a number: the base, mantissa, and exponent.
1584 If all goes well we check for the possible trailing letters and
1585 return. Return value is 1 for success and 0 for failure.
1587 ###### number functions
1588 int number_parse(mpq_t num, char tail[3], struct text tok)
1595 if (tok.len > 1 && (tok.txt[0] == expc ||
1596 tok.txt[0] == toupper(expc))) {
1603 for (i = 0; i < 2; i++) {
1606 if (!isalpha(tok.txt[i]))
1608 tail[i] = tok.txt[i];
1618 Number parsing goes in `libnumber.c`
1620 ###### File: libnumber.c
1628 ###### File: number.h
1629 int number_parse(mpq_t num, char tail[3], struct text tok);
1631 ###### File: scanner.mk
1633 libnumber.o : libnumber.c
1634 $(CC) $(CFLAGS) -c libnumber.c
1636 ## Processing strings
1638 Both `TK_string` and `TK_multi_string` require post-processing which
1639 can be one of two types: literal or with escapes processed.
1640 Even literal processing is non-trivial as the file may contain indents
1641 which need to be stripped.
1643 Errors can only occur when processing escapes. Any unrecognised
1644 character following the escape character will cause an error.
1646 Processing escapes and stripping indents can only make the string
1647 shorter, not longer, so we allocate a buffer which is the same size as
1648 the string and process into that.
1650 To request escape processing, we pass the character we want to use for
1651 quoting, usually '`\`'. To avoid escape processing we pass a zero.
1654 int string_parse(struct token *tok, char escape,
1655 struct text *str, char tail[3])
1658 struct text t = tok->txt;
1662 if (tok->num == TK_string) {
1667 str->txt = malloc(t.len);
1680 The tail of the string can be 0, 1, or 2 letters
1683 if (i > 0 && isalpha(t.txt[i-1]))  /* i > 0: i-1 would index before the buffer when i == 0 */
1685 if (i > 0 && isalpha(t.txt[i-1]))  /* same bound fix for the optional second tail letter */
1687 strncpy(tail, t.txt+i, t.len-i);
1696 Stripping the quotes off a single-line string is trivial.
1697 The only part that is at all interesting is that the quote character must
1701 if (t.txt[t.len-1] != quote)
1711 For a multi-line string we have a little more work to do. We need to
1712 remove 3 quotes, not 1, and need to count the indent of the close
1713 quote as it will need to be stripped from all lines.
1717 t.txt[1] != quote || t.txt[2] != quote ||
1718 !is_newline(t.txt[3]))
1723 if (i <= 0 || t.txt[i-1] != quote)
1726 if (i <= 0 || t.txt[i-1] != quote)
1729 if (i <= 0 || t.txt[i-1] != quote)
1733 while (i > 0 && !is_newline(t.txt[i-1]))
1737 if (t.txt[i] == ' ')
1739 if (t.txt[i] == '\t')
1740 indent = indent_tab(indent);
1749 Now we just take one byte at a time. trans-ASCII unicode won't look
1750 like anything we are interested in so it will just be copied byte by
1755 for (i = 0; i < t.len; i++) {
1769 } else if (i+1 >= t.len) {
1770 // escape and end of string
1778 str->len = cp - str->txt;
1786 Every time we find a start of line, we strip spaces and tabs until the
1787 required indent is found.
1790 while (i < t.len && skipped < indent) {
1795 skipped = indent_tab(skipped);
1804 *cp++ = '\n'; break;
1806 *cp++ = '\r'; break;
1808 *cp++ = '\t'; break;
1810 *cp++ = '\b'; break;
1812 *cp++ = quote; break;
1814 *cp++ = '\f'; break;
1816 *cp++ = '\v'; break;
1818 *cp++ = '\a'; break;
1823 // 3 digit octal number
1826 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1827 t.txt[i+2] < '0' || t.txt[i+2] > '7')  /* was t.txt[i+1]: third octal digit never range-checked */
1829 n = (t.txt[i ]-'0') * 64 +
1830 (t.txt[i+1]-'0') * 8 +
1831 (t.txt[i+2]-'0') * 1;
1837 n = take_hex(2, t.txt+i+1, t.len-i-1);
1845 // 4 or 8 hex digits for unicode
1846 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1849 memset(&pstate, 0, sizeof(pstate));
1850 n = wcrtomb(cp, n, &pstate);
1854 i += c == 'u' ? 4 : 8;
1859 else if (is_newline(c))
1869 For `\x` `\u` and `\U` we need to collect a specific number of
1872 ###### string functions
1874 static long take_hex(int digits, char *cp, int l)
1886 else if (isupper(c))
1897 ###### File: libstring.c
1899 String parsing goes in `libstring.c`
1908 #include "scanner.h"
1912 ###### File: string.h
1913 int string_parse(struct token *tok, char escape,
1914 struct text *str, char tail[3]);
1916 ###### File: scanner.mk
1918 libstring.o : libstring.c
1919 $(CC) $(CFLAGS) -c libstring.c
1923 As "untested code is buggy code" we need a program to easily test
1924 the scanner library. This will simply parse a given file and report
1925 the tokens one per line.
1927 ###### File: scanner.c
1933 #include <sys/mman.h>
1940 #include "scanner.h"
1945 static void pr_err(char *msg)
1948 fprintf(stderr, "%s\n", msg);
1951 static int kcmp(const void *ap, const void *bp)
1953 char * const *a = ap;
1954 char * const *b = bp;
1955 return strcmp(*a, *b);
1958 int main(int argc, char *argv[])
1963 char *filename = NULL;
1964 struct token_state *state;
1965 const char *known[] = {
1974 struct token_config conf = {
1977 .words_marks = known,
1978 .number_chars = "., _+-",
1979 .known_count = sizeof(known)/sizeof(known[0]),
1982 static const struct option long_options[] = {
1983 { "word-start", 1, NULL, 'W'},
1984 { "word-cont", 1, NULL, 'w'},
1985 { "number-chars", 1, NULL, 'n'},
1986 { "ignore-numbers", 0, NULL, 'N'},
1987 { "ignore-ident", 0, NULL, 'I'},
1988 { "ignore-marks", 0, NULL, 'M'},
1989 { "ignore-strings", 0, NULL, 'S'},
1990 { "ignore-multi-strings",0, NULL, 'z'},
1991 { "ignore-line-comment",0, NULL, 'c'},
1992 { "ignore-newline", 0, NULL, 'l'},
1993 { "ignore-block-comment", 0, NULL, 'C'},
1994 { "ignore-indent", 0, NULL, 'i'},
1995 { "file", 1, NULL, 'f'},
1996 { NULL, 0, NULL, 0},
1998 static const char options[] = "W:w:n:NIMSzclCif:";
2000 struct section *table, *s, *prev;
2003 setlocale(LC_ALL,"");
2004 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2007 case 'W': conf.word_start = optarg; break;
2008 case 'w': conf.word_cont = optarg; break;
2009 case 'n': conf.number_chars = optarg; break;
2010 case 'N': conf.ignored |= 1 << TK_number; break;
2011 case 'I': conf.ignored |= 1 << TK_ident; break;
2012 case 'M': conf.ignored |= 1 << TK_mark; break;
2013 case 'S': conf.ignored |= 1 << TK_string; break;
2014 case 'z': conf.ignored |= 1 << TK_multi_string; break;
2015 case 'c': conf.ignored |= 1 << TK_line_comment; break;
2016 case 'C': conf.ignored |= 1 << TK_block_comment; break;
2017 case 'l': conf.ignored |= 1 << TK_newline; break;
2018 case 'i': conf.ignored |= 1 << TK_in; break;
2019 case 'f': filename = optarg; break;
2020 default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2026 if (optind < argc) {
2027 const char **wm = calloc(argc - optind, sizeof(char*));
2029 for (i = optind; i < argc; i++)
2030 wm[i - optind] = argv[i];
2031 qsort(wm, argc-optind, sizeof(char*), kcmp);
2032 conf.words_marks = wm;
2033 conf.known_count = argc - optind;
2037 fd = open(filename, O_RDONLY);
2041 fprintf(stderr, "scanner: cannot open %s: %s\n",
2042 filename, strerror(errno));
2045 len = lseek(fd, 0, 2);
2047 fprintf(stderr,"scanner: %s is empty or not seekable\n",
2048 filename ?: "stdin");
2051 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2052 table = code_extract(file, file+len, pr_err);
2055 (code_free(s->code), prev = s, s = s->next, free(prev))) {
2056 printf("Tokenizing: %.*s\n", s->section.len,
2058 state = token_open(s->code, &conf);
2060 struct token tk = token_next(state);
2061 printf("%d:%d ", tk.line, tk.col);
2062 token_trace(stdout, tk, 20);
2063 if (tk.num == TK_number) {
2066 if (number_parse(num, tail,tk.txt)) {
2067 printf(" %s ", tail);
2068 mpq_out_str(stdout, 10, num);
2071 printf(" BAD NUMBER");
2073 if (tk.num == TK_string ||
2074 tk.num == TK_multi_string) {
2078 if (tk.txt.txt[0] == '`')
2080 if (string_parse(&tk, esc,
2082 printf(" %s ", tail);
2083 text_dump(stdout, str, 20);
2086 printf(" BAD STRING");
2089 if (tk.num == TK_error)
2091 if (tk.num == TK_eof)
2096 if (conf.words_marks != known)
2097 free(conf.words_marks);
2100 ###### File: scanner.mk
2101 scanner.c : scanner.mdc
2104 scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2105 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2106 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2107 scanner.o : scanner.c
2108 $(CC) $(CFLAGS) -c scanner.c