5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters are assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ##
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, as typically comments
73 would be. The exact consequence of ignoring each token type varies
78 int ignored; // bit set of ignored tokens.
79 ## token config parameters
83 struct token_config *conf;
85 ###### token_next init
86 int ignored = state->conf->ignored;
88 The different tokens are numbers, words, marks, strings, comments,
89 newlines, EOF, and indents, each of which is examined in detail below.
91 There are various cases where no token can be found in part of the
92 input. All of these will be reported as a `TK_error` token.
94 It is possible to declare a number of strings which form distinct
95 tokens (rather than being grouped as e.g. 'word'). These are given
96 token numbers from `TK_reserved` upwards.
107 Numbers are the messiest tokens to parse, primarily because they can
108 contain characters that also have meaning outside of numbers and,
109 particularly, immediately after numbers.
111 The obvious example is the '`-`' sign. It can come inside a number for
112 a negative exponent, or after a number as a subtraction operator. To
113 be sure we have parsed as best as possible we need to only allow the
114 '`-`' inside a number if it is after an exponent character. This can be
115 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
116 digit, so we don't allow '`-`' after just any `e`.
118 To make matters worse, our language designer has decided to experiment
119 with allowing commas to be used as the decimal indicator, and spaces
120 to be used to separate groups of digits in large numbers. Both of
121 these can reasonably be restricted to appear between two digits, so we
122 have to add that condition to our tests. For consistency we require
123 every non-alpha-numeric to appear between two hex digits, with the
124 exception that a sign can appear only after a 'p' or 'e', and a space
125 can only appear between decimal digits. Allowing a space before a
126 letter easily leads to confusion, such as in `a < 3 and b < 4`.
128 So we cannot just treat numbers as starting with a digit and being
129 followed by some set of characters. We need more structure than that.
133 - Numbers must start with a digit.
134 - If the first digit is zero, the next character should be a base
135 signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
136 enforced at this stage)
137 In the first case only the first `p` or `P` may be followed by a sign.
138 - If the number doesn't start with `0` followed by one of `xob`, the
139 first `e` may be followed by a sign.
140 - A sign must always be followed by a digit.
141 - Any digit may be followed by a space or underscore and any hex digit
142 may be followed by an underscore, providing that the subsequent character
143 is also a digit (for space) or hex digit (for underscore).
144 This rule will require an extra level of 'unget' to be
145 supported when handling characters.
146 - Otherwise any digits or ASCII letters are allowed. We do not at
147 this point check that the digits given are permitted by the base.
148 That will happen when the token is converted to a number.
150 To allow easy configuration, the various non alphanumeric characters
151 are only permitted if they are listed in a configuration parameter.
153 ###### token config parameters
156 Note that numbers may not start with a period, so `.75` is not a
157 number. This is not the norm, but is not unheard of. Excluding these
158 numbers simplifies the rule at very little cost.
163 If TK_number is ignored, digits will result in an error unless they
164 are declared to be a start character for words.
172 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
175 int decimal_mark = 0;
177 wchar_t ch2 = get_char(state);
178 if (strchr("xobXOB", ch2) != NULL)
186 if (ch == 'e' || ch == 'E') {
192 if (ch == 'p' || ch == 'P') {
198 save_unget_state(state);
200 ch = get_char(state);
202 if (!iswalnum(prev)) {
203 /* special characters, like separators and decimal marks
204 * and signs, must be followed by a hexdigit, and the
205 * space and signs must be followed by a decimal digit.
207 if (!iswxdigit(ch) ||
208 ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
209 (prev == ' ' && !iswdigit(ch))) {
210 /* don't want the new char or the special */
211 restore_unget_state(state);
218 if (!strchr(state->conf->number_chars, ch)) {
219 /* non-number char */
222 if (ch == '+' || ch == '-') {
223 /* previous must be 'e' or 'p' in appropriate context */
227 } else if (ch == ' ') {
228 /* previous must be a digit */
232 /* previous must be a hex digit */
233 if (!iswxdigit(prev))
236 if (ch == '.' || ch == ',') {
237 /* only one of these permitted */
243 /* We seem to have a "number" token */
245 close_token(state, &tk);
251 Words start with a "start" character followed by the longest
252 sequence of "continue" characters. The Unicode ID_START and
253 ID_CONTINUE sets are always permitted, but other ASCII characters
254 can be added to these sets.
256 ###### token config parameters
260 ###### internal functions
261 static int is_word_start(wchar_t ch, struct token_config *conf)
263 return iswalpha(ch) ||
264 strchr(conf->word_start, ch) != NULL ||
265 u_hasBinaryProperty(ch, UCHAR_ID_START);
268 static int is_word_continue(wchar_t ch, struct token_config *conf)
270 return iswalnum(ch) ||
271 strchr(conf->word_cont, ch) != NULL ||
272 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
275 Words can be either known or unknown. Known words are referred to as
276 "reserved words" and get a unique token number. Unknown words are
277 "identifiers" and are syntactically a single token.
282 A list of known words must be provided. This list is shared with the
283 "marks" which are described next. The list must be lexically sorted
284 and the length of the list must be given (`known_count`).
285 Tokens matching these known words are reported as the index of the
286 list added to `TK_reserved`.
288 If identifiers are ignored, then any word which is not listed as a
289 known word results in an error.
291 ###### token config parameters
292 const char **words_marks;
297 if (is_word_start(ch, state->conf)) {
299 /* A word: identifier or reserved */
301 ch = get_char(state);
302 while (is_word_continue(ch, state->conf));
304 close_token(state, &tk);
306 if (ignored & (1<<TK_ident))
308 n = find_known(state->conf, tk.txt);
310 tk.num = TK_reserved + n;
316 Marks are generally one or more punctuation marks joined together. It
317 would be nice to use the term "symbol" for these, but that causes
318 confusion in a subsequent discussion of the grammar, which has terminal
319 symbols and non-terminal symbols which are conceptually quite
320 different. So strings of punctuation characters will be marks.
322 A "mark" consists of ASCII characters that are not white space, are not
323 "start" characters for words, and are not digits.
324 These will collectively be called mark characters.
326 ###### internal functions
327 static int is_mark(wchar_t ch, struct token_config *conf)
332 strchr(conf->word_start, ch) == NULL;
335 As with words, there can be known and unknown marks, though the rules
336 are slightly different.
338 Two marks do not need to be separated by a non-mark character. This
339 is different from words which do need to be separated by at least one
340 non-continue character.
342 The scanner will normally prefer longer sequences of mark characters,
343 but will more strongly prefer known marks over unknown marks. So if
344 it finds a known mark where adding one more character does not result
345 in a known mark, it will return that first known mark.
347 If no known mark is found we will test against strings and comments
348 below before giving up and assuming an unknown mark.
350 If an unknown mark contains a quote character or a comment marker, and
351 that token is not being ignored, then we terminate the unknown mark
352 before that quote or comment. This ensures that an unknown mark
353 immediately before a string is handled correctly.
355 If the first character of a comment marker (i.e. '/') is a known mark,
356 the above rules would suggest that the start of a comment would be
357 parsed as that mark, which is not what is wanted. So the introductory
358 sequences for a comment ("//" and "/*") are treated as
359 partially-known. They prevent the leading "/" from being a mark by
360 itself, but do not actually constitute a stand-alone mark.
362 If `TK_mark` is ignored, then unknown marks are returned as errors.
367 Known marks are included in the same list as the list of known words.
371 while (is_mark(ch, state->conf)) {
374 close_token(state, &tk);
375 n = find_known(state->conf, tk.txt);
377 tk.num = TK_reserved + n;
378 else if (tk.num != TK_error) {
379 /* found a longest-known-mark, still need to
382 if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
383 (ch == '/' || ch == '*')) {
384 /* Yes, this is a comment, not a '/' */
385 restore_unget_state(state);
390 close_token(state, &tk);
394 save_unget_state(state);
395 ch = get_char(state);
396 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
397 /* If strings are allowed, a quote (which isn't a known mark)
398 * mustn't be treated as part of an unknown mark. It can be
399 * part of a multi-line string though.
402 if (prev == '#' && n < 0)
403 /* '#' is not a known mark, so assume it is a comment */
405 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
406 close_token(state, &tk);
407 restore_unget_state(state);
410 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
411 close_token(state, &tk);
412 restore_unget_state(state);
417 if (tk.num != TK_error) {
418 close_token(state, &tk);
422 If we don't find a known mark, we will check for strings and comments
423 before assuming that we have an unknown mark
432 Strings start with one of single quote, double quote, or back quote
433 and continue until a matching character on the same line. Any of
434 these characters can be included in the list of known marks and then
435 they will not be used for identifying strings.
437 Immediately following the close quote, one or two ASCII letters may
438 appear. These are somewhat like the arbitrary letters allowed in
439 "Numbers" above. They can be used by the language in various ways.
441 If 3 identical quote characters appear in a row and are
442 followed by a newline, then this forms a multi-line string which
443 continues until an identical triple quote appears on a line preceded
444 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
446 Multi-line strings may not extend beyond the end of the `code_node` in
449 Normal strings and multi-line strings are encoded as two different
456 ###### internal functions
/* A quote is one of the three ASCII string-delimiter characters:
 * single quote, double quote, or back quote.
 */
static int is_quote(wchar_t ch)
{
	switch (ch) {
	case '\'':
	case '"':  // "
	case '`':
		return 1;
	default:
		return 0;
	}
}
462 #### Multi-line strings
464 The multi-line string is checked for first. If they are being
465 ignored, we fall through and treat a triple quote as an empty string
466 followed by the start of a new string.
469 if (tk.txt.len == 3 &&
470 !(ignored & (1 << TK_multi_string)) &&
471 is_quote(tk.txt.txt[0]) &&
472 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
473 is_newline(tk.txt.txt[3])) {
475 wchar_t first = tk.txt.txt[0];
478 while (!at_eon(state) && qseen < 3) {
479 ch = get_char(state);
480 if (is_newline(ch)) {
483 } else if (at_sol && ch == first) {
485 } else if (ch != ' ' && ch != '\t') {
491 /* Hit end of node - error.
492 * unget so the newline is seen,
493 * but return rest of string as an error.
497 close_token(state, &tk);
501 /* 2 letters are allowed */
502 ch = get_char(state);
504 ch = get_char(state);
506 ch = get_char(state);
507 /* Now we must have a newline, but we don't return it
510 close_token(state, &tk);
511 tk.num = TK_multi_string;
517 #### Single-line strings
519 The sequence of marks collected may be more than a single-line
520 string, so we reset to the start and collect characters until
521 we find a close quote or a newline.
523 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
526 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
527 !(ignored & (1<<TK_string))) {
528 wchar_t first = tk.txt.txt[0];
529 reset_token(state, &tk);
530 ch = get_char(state);
532 while (!at_eon(state) && !is_newline(ch)) {
533 ch = get_char(state);
538 if (is_newline(ch)) {
543 while (!at_eon(state) && (ch = get_char(state)) &&
547 close_token(state, &tk);
553 Single line comments may start with '`//`' or '`#`' providing that these
554 are not known marks. They continue to the end of the line.
556 Block comments start with '`/*`' if this is not a known mark. They
557 continue to the first occurrence of '`*/`' and may not contain any
558 occurrence of '`/*`'.
560 Block comments can be wholly within one line or can continue over
561 multiple lines. The multi-line version should be followed immediately
562 by a newline. The Linux kernel contains over 285000 multi-line
563 comments and only 34 are followed by characters other than white space
564 (which should be removed) or a backslash (only needed in macros). So
565 it would not suffer from this rule.
567 These two comment types are reported as two separate token types, and
568 consequently can be ignored separately. When ignored a comment is
569 still parsed, but is discarded.
575 ###### internal functions
576 static int is_line_comment(struct text txt)
578 return (txt.len >= 1 && txt.txt[0] == '#') ||
579 (txt.len >= 2 && txt.txt[0] == '/' &&
583 static int is_block_comment(struct text txt)
585 return txt.len >= 2 && txt.txt[0] == '/' &&
589 #### Single line comments
591 A single-line comment continues up to, but not including the newline
596 if (is_line_comment(tk.txt)) {
597 while (!is_newline(ch) && !at_eon(state))
598 ch = get_char(state);
601 close_token(state, &tk);
602 tk.num = TK_line_comment;
603 if (ignored & (1 << TK_line_comment))
610 The token text collected so far could exceed the comment, so we need
613 If we find an embedded `/*` we reset to just before the '/' and report
614 an error. That way the next thing to be parsed will be the rest of
615 the comment. This requires a double unget, so we need to save/restore
616 the unget state (explained later).
620 if (is_block_comment(tk.txt)) {
623 reset_token(state, &tk);
626 save_unget_state(state);
627 ch = get_char(state);
629 while (!at_eon(state) &&
630 (prev != '/' || ch != '*') &&
631 (prev != '*' || ch != '/')) {
635 save_unget_state(state);
636 ch = get_char(state);
638 close_token(state, &tk);
644 /* embedded. Need to unget twice! */
645 restore_unget_state(state);
650 tk.num = TK_block_comment;
651 if (newlines && !(ignored & (1<<TK_newline))) {
652 /* next char must be newline */
653 ch = get_char(state);
658 if (tk.num == TK_error ||
659 !(ignored & (1 << TK_block_comment)))
664 ### Indents, Newlines, and White Space.
666 Normally white space is ignored. However newlines can be important as
667 can indents, which are either after a newline or at the start of a
668 node (detected by `at_son()`).
670 ###### exported functions
/* Newline characters include form-feed and vertical-tab as well as
 * the usual '\n'.  Note '\r' is deliberately NOT a newline here.
 */
static inline int is_newline(wchar_t ch)
{
	switch (ch) {
	case '\n':
	case '\f':
	case '\v':
		return 1;
	default:
		return 0;
	}
}
677 if (ch <= ' ' && !is_newline(ch)
681 If a line starts with more white-space than the previous non-blank
682 line - or if the first non-blank line in the document starts with any
683 white-space - then an "IN" is reported at the start of the line.
685 Before the next non-blank line which starts with less white space, or
686 at the latest at the end of the document, a matching "OUT" token
687 is reported. There will always be an exact match between "IN" and
690 It is possible for "OUT" to be followed (almost) immediately by an
691 "IN". This happens if, for example, the indent of three consecutive
692 lines are 0, 8, 4 spaces. Before the second line we report an
693 "IN". Before the third line we must report an "OUT", as 4 is less
694 than 8, then also an IN as 4 is greater than 0.
700 For the purpose of measuring the length of white space, a tab adds at
701 least one space, and rounds up to a multiple of 8.
703 ###### exported functions
704 static inline int indent_tab(int indent)
709 We need to track the current levels of indent. This requires some
710 sort of stack as indent levels are pushed on and popped off. In
711 practice this stack is unlikely to often exceed 5 so we will use a
712 fixed stack of 20 indent levels. More than this will be silently
717 int indent_sizes[20];
719 `indent_sizes[0]` will always be zero - this simplifies some code.
723 Newlines can optionally be reported. Newlines within a block comment
724 or a multi-line string are not reported separately, but each of these
725 must be followed immediately by a newline so these constructs cannot
726 hide the fact that a newline was present.
728 When indents are being reported, the Newline which would normally be
729 reported immediately before the "IN" is delayed until after the
730 matching "OUT". This makes an indented section act like a
731 continuation of the previous line to some extent.
733 A blank line would normally be reported simply as two consecutive Newline
734 tokens. However if the subsequent line is indented (and indents are being
735 reported) then the right thing to do is less obvious as Newlines should be
736 delayed - but how many Newlines?
738 The approach we will take is to report the extra Newlines immediately after
739 the IN token, so the blank line is treated as though it were an indented
745 If we find a newline or white space at the start of a block, we keep
746 collecting spaces, tabs, and newlines until we find some real text.
747 Then depending on the indent we generate some number of tokens. These
748 will be a sequence of "Newline OUT" pairs representing a decrease
749 in indent, then either a Newline or an IN depending on whether the
750 next line is indented, then zero or more Newlines representing all the
751 blank lines that have been skipped.
753 When a Newline leads to the next block of code there is a question of
754 whether the various Newline and OUT/IN tokens should appear to
755 belong to the earlier or later block. This is addressed by processing
756 the tokens in two stages based on the relative indent levels of the
757 two blocks (each block has a base indent to which the actual indents
760 Any "Newline OUT" pairs needed to reduce the current indent to the
761 maximum of the base indents of the old and new blocks are generated
762 against the old block. Then if the next block does not have an
763 increased indent, one more "Newline" is generated.
765 If further "Newline OUT" pairs are needed to get to the indent
766 level of the 'next' block, they are generated against that block,
767 though the first Newline is suppressed (it having already been
770 Finally the Newline or IN for the first line of the new block is
771 generated, unless the Newline needs to be suppressed because it
772 appeared at the end of the previous block.
774 This means that a block may start with an OUT or an IN, but
775 will only start with a Newline if it actually starts with a blank
778 We will need to represent in the `token_state` where in this sequence
779 of delayed tokens we are. As `state.col` records the target indent we
780 don't need to record how many OUTs or INs are needed. We do
781 need to record the number of blank lines, and which of Newline and
782 OUT is needed next in the initial sequence of pairs.
784 For this we store one more than the number of blank lines as
785 `delayed_lines` and a flag for `out_next`.
792 Generating these tokens involves two separate pieces of code.
794 Firstly we need to recognise white space and count the indents and
795 newlines. These are recorded in the above state fields.
797 Separately we need, on each call to `token_next`, to check if
798 there are some delayed tokens and if so we need to advance the state
799 information and return one token.
801 ###### internal functions
802 static int state_indent(struct token_state *state)
804 if (state->node == NULL)
806 return state->node->indent - state->node->needs_strip + state->col;
811 state_check_node(state);
812 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
814 int was_nl = is_newline(ch);
815 if (ignored & (1<<TK_in)) {
818 if (ignored & (1<<TK_newline))
821 close_token(state, &tk);
824 // Indents are needed, so check all white space.
825 while (ch <= ' ' && ch != WEOF) {
828 ch = get_char(state);
830 state_check_node(state);
834 state->delayed_lines = newlines;
835 state->out_next = !was_nl;
836 state->check_indent = 1;
840 ###### delayed tokens
842 if (state->check_indent || state->delayed_lines) {
843 if (state_indent(state) < state->indent_sizes[state->indent_level]) {
844 if (!state->out_next &&
845 !(ignored & (1<<TK_newline))) {
850 state->indent_level -= 1;
855 if (state_indent(state) > state->indent_sizes[state->indent_level] &&
856 state->indent_level < sizeof(state->indent_sizes)-1) {
857 state->indent_level += 1;
858 state->indent_sizes[state->indent_level] = state_indent(state);
859 if (state->delayed_lines)
860 state->delayed_lines -= 1;
864 state->check_indent = 0;
865 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
867 state->delayed_lines -= 1;
870 state->delayed_lines = 0;
876 After the last newline in the file has been processed, a special
877 end-of-file token will be returned. Any further attempts to get more
878 tokens will continue to return the same end-of-file token.
889 ### Unknown Marks, or errors.
891 We have now handled all the possible known mark-like tokens.
892 If the token we have is not empty and `TK_mark` is allowed,
893 we have an unknown mark, otherwise this must be an error.
897 /* one unknown mark character */
899 close_token(state, &tk);
900 if (ignored & (1<<TK_mark))
906 /* Completely unrecognised character is next, possibly
907 * a digit and we are ignoring numbers.
908 * What ever it is, make it an error.
911 close_token(state, &tk);
915 ## Tools For The Task
917 You may have noticed that there are a few gaps we left in the above -
918 functions used without first defining them. Doing so above would have
921 ### Character by character
923 As we walk through the various `code_node`s we need to process whole
924 Unicode codepoints, and keep track of which line and column we are on.
925 We will assume for now that any printing character uses one column,
926 though that is not true in general.
928 As the text in a `code_node` may include an indent that identifies it as
929 being code, we need to be careful to strip that. The `code_node` has
930 a flag that tells us whether or not we need to strip.
936 struct code_node *node;
942 ###### internal functions
944 static void do_strip(struct token_state *state)
947 if (state->node->needs_strip) {
949 while (n && state->node->code.txt[state->offset] == ' ') {
954 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
955 indent = indent_tab(indent);
962 static void state_check_node(struct token_state *state)
966 if (state->node->code.len > state->offset)
970 state->node = state->node->next;
971 while (state->node && state->node->code.txt == NULL);
973 state->prev_offset = 0;
974 state->strip_offset = 0;
976 if (state->node == NULL)
978 state->line = state->node->line_no;
980 state->col = state->node->needs_strip;
981 state->strip_offset = state->offset;
984 static wint_t get_char(struct token_state *state)
990 state_check_node(state);
991 if (state->node == NULL)
996 memset(&mbstate, 0, sizeof(mbstate));
998 n = mbrtowc(&next, state->node->code.txt + state->offset,
999 state->node->code.len - state->offset,
1001 if (n == -2 || n == 0) {
1002 /* Not enough bytes - not really possible */
1003 next = '\n'; // NOTEST
1004 state->offset = state->node->code.len; // NOTEST
1005 } else if (n == -1) {
1007 state->offset += 1; // NOTEST
1008 next = 0x7f; // an illegal character // NOTEST
1014 } else if (is_newline(next)) {
1017 state->col = state->node->needs_strip;
1018 } else if (next == '\t') {
1019 state->col = indent_tab(state->col);
1024 We will sometimes want to "unget" the last character as it needs to be
1025 considered again as part of the next token. So we need to store a
1026 'previous' version of all metadata.
1033 ###### before get_char
1034 state->prev_offset = state->offset;
1035 state->prev_line = state->line;
1036 state->prev_col = state->col;
1038 ###### internal functions
1040 static void unget_char(struct token_state *state)
1043 state->offset = state->prev_offset;
1044 state->line = state->prev_line;
1045 state->col = state->prev_col;
1049 We occasionally need a double-unget, particularly for numbers and
1050 block comments. We don't impose this cost on all scanning, but
1051 require those code sections that need it to call `save_unget_state`
1052 before each `get_char`, and then `restore_unget_state` when a
1053 double-unget is needed.
1060 ###### internal functions
1061 static void save_unget_state(struct token_state *state)
1063 state->prev_offset2 = state->prev_offset;
1064 state->prev_line2 = state->prev_line;
1065 state->prev_col2 = state->prev_col;
1068 static void restore_unget_state(struct token_state *state)
1070 state->prev_offset = state->prev_offset2;
1071 state->prev_line = state->prev_line2;
1072 state->prev_col = state->prev_col2;
1075 At the start of a token we don't want to be at the end of a code block
1076 if we can help it. To avoid this possibility, we 'get' and 'unget' a
1077 single character. This will move into the next non-empty code block
1078 and leave the current pointer at the start of it.
1080 This has to happen _after_ dealing with delayed tokens as some of them
1081 must appear in the previous node. When we do this, we need to reset
1082 the data in the token.
1084 ###### delayed tokens
1085 if (at_eon(state)) {
1088 tk.node = state->node;
1090 tk.txt.txt = state->node->code.txt + state->offset;
1091 tk.line = state->line;
1092 tk.col = state->col;
1098 The current token is initialized to line up with the first character
1099 that we 'get' for each token. When we have, or might have, a full
1100 token we can call `close_token` to set the `len` of the token
1101 appropriately. This can safely be called multiple times.
1103 Finally we occasionally (for single-line strings and block comments)
1104 need to reset to the beginning of the current token as we might have
1105 parsed too much already. For that there is `reset_token`.
1108 tk.node = state->node;
1110 tk.txt.txt = state->node->code.txt + state->offset;
1111 tk.line = state->line;
1112 tk.col = state->col;
1115 ###### internal functions
1117 static void close_token(struct token_state *state,
1120 if (state->node != tk->node)
1121 tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1123 tk->txt.len = (state->node->code.txt + state->offset)
1127 static void reset_token(struct token_state *state, struct token *tok)
1129 state->prev_line = tok->line;
1130 state->prev_col = tok->col;
1131 state->prev_offset = tok->txt.txt - state->node->code.txt;
1136 Tokens may not cross into the next `code_node`, and some tokens can
1137 include the newline at the end of a `code_node`, we must be able to
1138 easily check if we have reached the end. Equally we need to know if
1139 we are at the start of a node, as white space is treated a little
1142 ###### internal functions
1144 static int at_son(struct token_state *state)
1146 return state->prev_offset <= state->strip_offset;
1149 static int at_eon(struct token_state *state)
1151 // at end-of-node ??
1152 return state->node == NULL ||
1153 state->offset >= state->node->code.len;
1156 ### Find a known word
1158 As the known-word list is sorted we can use a simple binary search.
1159 Following the pattern established in "mdcode", we will use a `struct
1160 text` with start and length to represent the code fragment we are
1163 ###### internal functions
1164 static int find_known(struct token_config *conf, struct text txt)
1167 int hi = conf->known_count;
1169 while (lo + 1 < hi) {
1170 int mid = (lo + hi) / 2;
1171 int cmp = strncmp(conf->words_marks[mid],
1173 if (cmp == 0 && conf->words_marks[mid][txt.len])
1180 if (strncmp(conf->words_marks[lo],
1181 txt.txt, txt.len) == 0
1182 && conf->words_marks[lo][txt.len] == 0)
1188 ### Bringing it all together
1190 Now we have all the bits there is just one section missing: combining
1191 all the token parsing code into one block.
1193 The handling of delayed tokens (Newlines, INs, OUTs) must come
1194 first before we try getting another character.
1196 Then we parse all the test, making sure that we check for known marks
1197 before strings and comments, but unknown marks after strings and comments.
1199 This block of code will either return a token, or will choose to
1200 ignore one, in which case it will `continue` around to the top of the
1206 ch = get_char(state);
1215 As well as getting tokens, we need to be able to create the
1216 `token_state` to start with, and discard it later.
1221 ###### main functions
1222 struct token_state *token_open(struct code_node *code, struct
1225 struct token_state *state = malloc(sizeof(*state));
1226 memset(state, 0, sizeof(*state));
1228 state->line = code->line_no;
1230 state->col = state->node->needs_strip;
1231 state->strip_offset = state->offset;
1235 void token_close(struct token_state *state)
1240 ###### exported functions
1241 struct token_state *token_open(struct code_node *code, struct
1242 token_config *conf);
1243 void token_close(struct token_state *state);
1247 Getting tokens is the main thing but it is also useful to be able to
1248 print out token information, particularly for tracing and testing.
1250 Known tokens are printed verbatim. Other tokens are printed as
1251 `type(content)` where content is truncated to a given number of characters.
1253 The function for printing a truncated string (`text_dump`) is also exported
1254 so that it can be used to tracing processed strings too.
1259 ###### exported functions
1260 void token_trace(FILE *f, struct token tok, int max);
1261 void text_dump(FILE *f, struct text t, int max);
1263 ###### main functions
1265 void text_dump(FILE *f, struct text txt, int max)
1272 for (i = 0; i < max; i++) {
1273 char c = txt.txt[i];
1274 if (c < ' ' || c > '~')
1275 fprintf(f, "\\x%02x", c & 0xff);
1279 fprintf(f, "%c", c);
1285 void token_trace(FILE *f, struct token tok, int max)
1287 static char *types[] = {
1288 [TK_ident] = "ident",
1290 [TK_number] = "number",
1291 [TK_string] = "string",
1292 [TK_multi_string] = "mstring",
1293 [TK_line_comment] = "lcomment",
1294 [TK_block_comment] = "bcomment",
1297 [TK_newline] = "newline",
1299 [TK_error] = "ERROR",
1303 default: /* known word or mark */
1304 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1310 /* No token text included */
1311 fprintf(f, "%s()", types[tok.num]);
1317 case TK_multi_string:
1318 case TK_line_comment:
1319 case TK_block_comment:
1321 fprintf(f, "%s(", types[tok.num]);
1322 text_dump(f, tok.txt, max);
1328 ### And there we have it
1330 We now have all the library functions defined for reading and printing
1331 tokens. Now we just need C files to store them, and a mk file to make them.
1333 ###### File: scanner.h
1335 ## exported functions
1337 ###### File: libscanner.c
1339 #include "scanner.h"
1341 ## internal functions
1344 ###### File: scanner.mk
1348 scanner.mk scanner.h libscanner.c : scanner.mdc
1351 libscanner.o : libscanner.c
1352 $(CC) $(CFLAGS) -c libscanner.c
1354 ## Processing numbers
1356 Converting a `TK_number` token to a numerical value is a slightly
1357 higher level task than lexical analysis, and slightly lower than
1358 grammar parsing, so put it here - as an appendix if you like.
1360 Importantly it will be used by the same testing rig that is used for
1361 testing the token scanner.
1363 The numeric value that we will convert all numbers into is the `mpq_t`
1364 from the GNU high precision number library "libgmp".
1366 ###### number includes
1370 Firstly we need to be able to parse a string of digits in a given base
1371 and possibly with a decimal marker. We store this in an `mpz_t`
1372 integer and report the number of digits after the decimal mark.
1374 On error we return zero and ensure that the 'mpz_t' has been freed, or
1375 had never been initialised.
1377 ###### number functions
1379 static int parse_digits(mpz_t num, struct text tok, int base,
1382 /* Accept digits up to 'base', ignore '_' and
1383 * (for base 10) ' ' if they appear between two
1384 * legal digits, and if `placesp` is not NULL,
1385 * allow a single '.' or ',' and report the number
1386 * of digits beyond there.
1387 * Return number of characters processed (p),
1388 * or 0 if something illegal was found.
1391 int decimal = -1; // digits after marker
1392 enum {Digit, Space, Other} prev = Other;
1395 for (p = 0; p < tok.len; p++) {
1397 char c = tok.txt[p];
1399 if (c == '_' || (c == ' ' && base == 10)) {
1405 if (c == '.' || c == ',') {
1408 if (!placesp || decimal >= 0)
1416 else if (isupper(c))
1418 else if (islower(c))
1429 mpz_mul_ui(num, num, base);
1433 mpz_add_ui(num, num, dig);
1452 ###### number includes
1455 To parse a full number we need to consider the optional base, the
1456 mantissa, and the optional exponent. We will treat these one at a
1459 The base is indicated by a letter after a leading zero, which must be
1460 followed by a base letter or a period. The base also determines the
1461 character which will mark an exponent.
1469 if (tok.txt[0] == '0' && tok.len > 1) {
1471 switch(tok.txt[1]) {
1502 // another digit is not permitted
1506 // must be decimal marker or trailing
1507 // letter, which are OK;
1514 After the base is the mantissa, which may contain a decimal mark, so
1515 we need to record the number of places. We won't impose the number of
1516 places until we have the exponent as well.
1523 ###### parse mantissa
1525 d = parse_digits(mant, tok, base, &places);
1531 mpq_set_z(num, mant);
1534 After the mantissa number may come an exponent which may be positive
1535 or negative. We assume at this point that we have seen the exponent
1543 ###### parse exponent
1545 if (tok.txt[0] == '+') {
1548 } else if (tok.txt[0] == '-') {
1554 d = parse_digits(exp, tok, 10, NULL);
1559 if (!mpz_fits_slong_p(exp)) {
1564 lexp = mpz_get_si(exp) * esign;
1569 Now that we have the mantissa and the exponent we can multiply them
1570 together, also allowing for the number of digits after the decimal
1573 For base 10, we simply subtract the decimal places from the exponent.
1574 For the other bases, as the exponent is always based on 2, even for
1575 octal and hex, we need a bit more detail.
1576 We then recover the sign from the exponent, as division is quite
1577 different from multiplication.
1579 ###### calc exponent
1598 Imposing the exponent on the number is also very different for base 10
1599 than for the others. For the binary shift `gmp` provides a simple
1600 function. For base 10 we use something like Russian Peasant
1603 ###### calc exponent
1607 mpq_set_ui(tens, 10, 1);
1611 mpq_mul(num, num, tens);
1613 mpq_div(num, num, tens);
1618 mpq_mul(tens, tens, tens);
1623 mpq_mul_2exp(num, num, lexp);
1625 mpq_div_2exp(num, num, lexp);
1628 Now we are ready to parse a number: the base, mantissa, and exponent.
1629 If all goes well we check for the possible trailing letters and
1630 return. Return value is 1 for success and 0 for failure.
1632 ###### number functions
1633 int number_parse(mpq_t num, char tail[3], struct text tok)
1640 if (tok.len > 1 && (tok.txt[0] == expc ||
1641 tok.txt[0] == toupper(expc))) {
1648 for (i = 0; i < 2; i++) {
1651 if (!isalpha(tok.txt[i]))
1653 tail[i] = tok.txt[i];
1663 Number parsing goes in `libnumber.c`
1665 ###### File: libnumber.c
1673 ###### File: number.h
1674 int number_parse(mpq_t num, char tail[3], struct text tok);
1676 ###### File: scanner.mk
1678 libnumber.o : libnumber.c
1679 $(CC) $(CFLAGS) -c libnumber.c
1681 ## Processing strings
1683 Both `TK_string` and `TK_multi_string` require post-processing which
1684 can be one of two types: literal or with escapes processed.
1685 Even literal processing is non-trivial as the file may contain indents
1686 which need to be stripped.
1688 Errors can only occur when processing escapes. Any unrecognised
1689 character following the escape character will cause an error.
1691 Processing escapes and stripping indents can only make the string
1692 shorter, not longer, so we allocate a buffer which is the same size as
1693 the string and process into that.
1695 To request escape processing, we pass the character we want to use for
1696 quoting, usually '`\`'. To avoid escape processing we pass a zero.
1699 int string_parse(struct token *tok, char escape,
1700 struct text *str, char tail[3])
1703 struct text t = tok->txt;
1707 if (tok->num == TK_string) {
1712 str->txt = malloc(t.len);
1725 The tail of the string can be 0, 1, or 2 letters
1728 if (i > 0 && isalpha(t.txt[i-1])) /* i > 0 (not >= 0): t.txt[i-1] would read out of bounds when i == 0 */
1730 if (i > 0 && isalpha(t.txt[i-1]))
1732 strncpy(tail, t.txt+i, t.len-i);
1741 Stripping the quotes off a single-line string is trivial.
1742 The only part that is at all interesting is that the quote character must
1746 if (t.txt[t.len-1] != quote)
1756 For a multi-line string we have a little more work to do. We need to
1757 remove 3 quotes, not 1, and need to count the indent of the close
1758 quote as it will need to be stripped from all lines.
1762 t.txt[1] != quote || t.txt[2] != quote ||
1763 !is_newline(t.txt[3]))
1768 if (i <= 0 || t.txt[i-1] != quote)
1771 if (i <= 0 || t.txt[i-1] != quote)
1774 if (i <= 0 || t.txt[i-1] != quote)
1778 while (i > 0 && !is_newline(t.txt[i-1]))
1782 if (t.txt[i] == ' ')
1784 if (t.txt[i] == '\t')
1785 indent = indent_tab(indent);
1794 Now we just take one byte at a time. Non-ASCII unicode won't look
1795 like anything we are interested in so it will just be copied byte by
1800 for (i = 0; i < t.len; i++) {
1814 } else if (i+1 >= t.len) {
1815 // escape and end of string
1823 str->len = cp - str->txt;
1831 Every time we find a start of line, we strip spaces and tabs until the
1832 required indent is found.
1835 while (i < t.len && skipped < indent) {
1840 skipped = indent_tab(skipped);
1849 *cp++ = '\n'; break;
1851 *cp++ = '\r'; break;
1853 *cp++ = '\t'; break;
1855 *cp++ = '\b'; break;
1857 *cp++ = quote; break;
1859 *cp++ = '\f'; break;
1861 *cp++ = '\v'; break;
1863 *cp++ = '\a'; break;
1868 // 3 digit octal number
1871 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1872 t.txt[i+2] < '0' || t.txt[i+2] > '7') /* was t.txt[i+1] > '7': third octal digit's upper bound never checked */
1874 n = (t.txt[i ]-'0') * 64 +
1875 (t.txt[i+1]-'0') * 8 +
1876 (t.txt[i+2]-'0') * 1;
1882 n = take_hex(2, t.txt+i+1, t.len-i-1);
1890 // 4 or 8 hex digits for unicode
1891 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1894 memset(&pstate, 0, sizeof(pstate));
1895 n = wcrtomb(cp, n, &pstate);
1899 i += c == 'u' ? 4 : 8;
1904 else if (is_newline(c))
1914 For `\x` `\u` and `\U` we need to collect a specific number of
1917 ###### string functions
1919 static long take_hex(int digits, char *cp, int l)
1931 else if (isupper(c))
1942 ###### File: libstring.c
1944 String parsing goes in `libstring.c`
1953 #include "scanner.h"
1957 ###### File: string.h
1958 int string_parse(struct token *tok, char escape,
1959 struct text *str, char tail[3]);
1961 ###### File: scanner.mk
1963 libstring.o : libstring.c
1964 $(CC) $(CFLAGS) -c libstring.c
1968 As "untested code is buggy code" we need a program to easily test
1969 the scanner library. This will simply parse a given file and report
1970 the tokens one per line.
1972 ###### File: scanner.c
1978 #include <sys/mman.h>
1985 #include "scanner.h"
1990 static void pr_err(char *msg)
1993 fprintf(stderr, "%s\n", msg);
1996 static int kcmp(const void *ap, const void *bp)
1998 char * const *a = ap;
1999 char * const *b = bp;
2000 return strcmp(*a, *b);
2003 int main(int argc, char *argv[])
2008 char *filename = NULL;
2009 struct token_state *state;
2010 const char *known[] = {
2019 struct token_config conf = {
2022 .words_marks = known,
2023 .number_chars = "., _+-",
2024 .known_count = sizeof(known)/sizeof(known[0]),
2027 static const struct option long_options[] = {
2028 { "word-start", 1, NULL, 'W'},
2029 { "word-cont", 1, NULL, 'w'},
2030 { "number-chars", 1, NULL, 'n'},
2031 { "ignore-numbers", 0, NULL, 'N'},
2032 { "ignore-ident", 0, NULL, 'I'},
2033 { "ignore-marks", 0, NULL, 'M'},
2034 { "ignore-strings", 0, NULL, 'S'},
2035 { "ignore-multi-strings",0, NULL, 'z'},
2036 { "ignore-line-comment",0, NULL, 'c'},
2037 { "ignore-newline", 0, NULL, 'l'},
2038 { "ignore-block-comment", 0, NULL, 'C'},
2039 { "ignore-indent", 0, NULL, 'i'},
2040 { "file", 1, NULL, 'f'},
2041 { "section", 1, NULL, 's'},
2042 { NULL, 0, NULL, 0},
2044 static const char options[] = "W:w:n:NIMSzclCif:s:";
2046 struct section *table, *s, *prev;
2048 char *section_name = NULL;
2049 int section_found = 0;
2051 setlocale(LC_ALL,"");
2052 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2055 case 'W': conf.word_start = optarg; break;
2056 case 'w': conf.word_cont = optarg; break;
2057 case 'n': conf.number_chars = optarg; break;
2058 case 'N': conf.ignored |= 1 << TK_number; break;
2059 case 'I': conf.ignored |= 1 << TK_ident; break;
2060 case 'M': conf.ignored |= 1 << TK_mark; break;
2061 case 'S': conf.ignored |= 1 << TK_string; break;
2062 case 'z': conf.ignored |= 1 << TK_multi_string; break;
2063 case 'c': conf.ignored |= 1 << TK_line_comment; break;
2064 case 'C': conf.ignored |= 1 << TK_block_comment; break;
2065 case 'l': conf.ignored |= 1 << TK_newline; break;
2066 case 'i': conf.ignored |= 1 << TK_in; break;
2067 case 'f': filename = optarg; break;
2068 case 's': section_name = optarg; break;
2069 default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2075 if (optind < argc) {
2076 const char **wm = calloc(argc - optind, sizeof(char*));
2078 for (i = optind; i < argc; i++)
2079 wm[i - optind] = argv[i];
2080 qsort(wm, argc-optind, sizeof(char*), kcmp);
2081 conf.words_marks = wm;
2082 conf.known_count = argc - optind;
2086 fd = open(filename, O_RDONLY);
2090 fprintf(stderr, "scanner: cannot open %s: %s\n",
2091 filename, strerror(errno));
2094 len = lseek(fd, 0, 2);
2096 fprintf(stderr,"scanner: %s is empty or not seekable\n",
2097 filename ?: "stdin");
2100 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2101 table = code_extract(file, file+len, pr_err);
2104 (code_free(s->code), prev = s, s = s->next, free(prev))) {
2106 (s->section.len != strlen(section_name) ||
2107 strncmp(s->section.txt, section_name, s->section.len) != 0))
2111 printf("Tokenizing: %.*s\n", s->section.len,
2113 state = token_open(s->code, &conf);
2115 struct token tk = token_next(state);
2116 printf("%d:%d ", tk.line, tk.col);
2117 token_trace(stdout, tk, 20);
2118 if (tk.num == TK_number) {
2121 if (number_parse(num, tail,tk.txt)) {
2122 printf(" %s ", tail);
2123 mpq_out_str(stdout, 10, num);
2126 printf(" BAD NUMBER");
2128 if (tk.num == TK_string ||
2129 tk.num == TK_multi_string) {
2133 if (tk.txt.txt[0] == '`')
2135 if (string_parse(&tk, esc,
2137 printf(" %s ", tail);
2138 text_dump(stdout, str, 20);
2141 printf(" BAD STRING");
2144 if (tk.num == TK_error)
2146 if (tk.num == TK_eof)
2151 if (conf.words_marks != known)
2152 free(conf.words_marks);
2153 if (section_name && !section_found) {
2154 fprintf(stderr, "scanner: section %s not found\n", section_name);
2159 ###### File: scanner.mk
2160 scanner.c : scanner.mdc
2163 scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h libnumber.o libstring.o
2164 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2165 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2166 scanner.o : scanner.c
2167 $(CC) $(CFLAGS) -c scanner.c