5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters is assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ##
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, so they aren't parsed.
73 Comments are typically parsed but not returned, but an option is provided to
74 return comments for further processing. The exact consequence of
75 ignoring each token type varies from token to token.
79 int ignored; // bit set of ignored tokens.
81 ## token config parameters
85 struct token_config *conf;
87 ###### token_next init
88 int ignored = state->conf->ignored;
90 The different tokens are numbers, words, marks, strings, comments,
91 newlines, EOF, and indents, each of which is examined in detail below.
93 There are various cases where no token can be found in part of the
94 input. All of these will be reported as a `TK_error` token.
96 It is possible to declare a number of strings which form distinct
97 tokens (rather than being grouped as e.g. 'word'). These are given
98 token numbers from `TK_reserved` upwards.
109 Numbers are the messiest tokens to parse, primarily because they can
110 contain characters that also have meaning outside of numbers and,
111 particularly, immediately after numbers.
113 The obvious example is the '`-`' sign. It can come inside a number for
114 a negative exponent, or after a number as a subtraction operator. To
115 be sure we have parsed as best as possible we need to only allow the
116 '`-`' inside a number if it is after an exponent character. This can be
117 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
118 digit, so we don't allow '`-`' after just any `e`.
120 To make matters worse, our language designer has decided to experiment
121 with allowing commas to be used as the decimal indicator, and spaces
122 to be used to separate groups of digits in large numbers. Both of
123 these can reasonably be restricted to appear between two digits, so we
124 have to add that condition to our tests. For consistency we require
125 every non-alpha-numeric to appear between two hex digits, with the
126 exception that a sign can appear only after a 'p' or 'e', and a space
127 can only appear between decimal digits. Allowing a space before a
128 letter easily leads to confusion, such as in `a < 3 and b < 4`.
130 So we cannot just treat numbers as starting with a digit and being
131 followed by some set of characters. We need more structure than that.
135 - Numbers must start with a digit.
136 - If the first digit is zero, the next character should be a base
137 signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
138 enforced at this stage)
139 In the first case only the first `p` or `P` may be followed by a sign.
140 - If the number doesn't start with `0` followed by one of `xob`, the
141 first `e` may be followed by a sign.
142 - A sign must always be followed by a digit.
143 - Any digit may be followed by a space or underscore and any hex digit
144 may be followed by an underscore, providing that the subsequent character
145 is also a digit (for space) or hex digit (for underscore).
146 This rule will require an extra level of 'unget' to be
147 supported when handling characters.
148 - Otherwise any digits or ASCII letters are allowed. We do not at
149 this point check that the digits given are permitted by the base.
150 That will happen when the token is converted to a number.
152 To allow easy configuration, the various non alphanumeric characters
153 are only permitted if they are listed in a configuration parameter.
155 ###### token config parameters
158 Note that numbers may not start with a period, so `.75` is not a
159 number. This is not the norm, but is not unheard of. Excluding these
160 numbers simplifies the rule at very little cost.
165 If TK_number is ignored, digits will result in an error unless they
166 are declared to be a start character for words.
174 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
177 int decimal_mark = 0;
179 wchar_t ch2 = get_char(state);
180 if (strchr("xobXOB", ch2) != NULL)
188 if (ch == 'e' || ch == 'E') {
194 if (ch == 'p' || ch == 'P') {
200 save_unget_state(state);
202 ch = get_char(state);
204 if (!iswalnum(prev)) {
205 /* special characters, like separators and decimal marks
206 * and signs, must be followed by a hexdigit, and the
207 * space and signs must be followed by a decimal digit.
209 if (!iswxdigit(ch) ||
210 ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
211 (prev == ' ' && !iswdigit(ch))) {
212 /* don't want the new char or the special */
213 restore_unget_state(state);
220 if (!strchr(state->conf->number_chars, ch)) {
221 /* non-number char */
224 if (ch == '+' || ch == '-') {
225 /* previous must be 'e' or 'p' in appropriate context */
229 } else if (ch == ' ') {
230 /* previous must be a digit */
234 /* previous must be a hex digit */
235 if (!iswxdigit(prev))
238 if (ch == '.' || ch == ',') {
239 /* only one of these permitted */
245 /* We seem to have a "number" token */
247 close_token(state, &tk);
253 Words start with a "start" character followed by the longest
254 sequence of "continue" characters. The Unicode ID_START and
255 ID_CONTINUE sets are always permitted, but other ASCII characters
256 can be added to these sets.
258 ###### token config parameters
262 ###### internal functions
263 static int is_word_start(wchar_t ch, struct token_config *conf)
265 return iswalpha(ch) ||
266 strchr(conf->word_start, ch) != NULL ||
267 u_hasBinaryProperty(ch, UCHAR_ID_START);
270 static int is_word_continue(wchar_t ch, struct token_config *conf)
272 return iswalnum(ch) ||
273 strchr(conf->word_cont, ch) != NULL ||
274 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
277 Words can be either known or unknown. Known words are referred to as
278 "reserved words" and get a unique token number. Unknown words are
279 "identifiers" and are syntactically a single token.
284 A list of known words must be provided. This list is shared with the
285 "marks" which are described next. The list must be lexically sorted
286 and the length of the list must be given (`known_count`).
287 Tokens matching these known words are reported as the index of the
288 list added to `TK_reserved`.
290 If identifiers are ignored, then any word which is not listed as a
291 known word results in an error.
293 ###### token config parameters
294 const char **words_marks;
299 if (is_word_start(ch, state->conf)) {
301 /* A word: identifier or reserved */
303 ch = get_char(state);
304 while (is_word_continue(ch, state->conf));
306 close_token(state, &tk);
308 if (ignored & (1<<TK_ident))
310 n = find_known(state->conf, tk.txt);
312 tk.num = TK_reserved + n;
318 Marks are generally one or more punctuation marks joined together. It
319 would be nice to use the term "symbol" for these, but that causes
320 confusion in a subsequent discussion of the grammar, which has terminal
321 symbols and non-terminal symbols which are conceptually quite
322 different. So strings of punctuation characters will be marks.
324 A "mark" consists of ASCII characters that are not white space, are not
325 "start" characters for words, and are not digits.
326 These will collectively be called mark characters.
328 ###### internal functions
329 static int is_mark(wchar_t ch, struct token_config *conf)
334 strchr(conf->word_start, ch) == NULL;
337 As with words, there can be known and unknown marks, though the rules
338 are slightly different.
340 Two marks do not need to be separated by a non-mark character. This
341 is different from words which do need to be separated by at least one
342 non-continue character.
344 The scanner will normally prefer longer sequences of mark characters,
345 but will more strongly prefer known marks over unknown marks. So if
346 it finds a known mark where adding one more character does not result
347 in a known mark, it will return that first known mark.
349 If no known mark is found we will test against strings and comments
350 below before giving up and assuming an unknown mark.
352 If an unknown mark contains a quote character or a comment marker, and
353 that token is not being ignored, then we terminate the unknown mark
354 before that quote or comment. This ensures that an unknown mark
355 immediately before a string is handled correctly.
357 If the first character of a comment marker (i.e. '/') is a known mark,
358 the above rules would suggest that the start of a comment would be
359 parsed as that mark, which is not what is wanted. So when comments are
360 not ignored, the introductory sequences for a comment ("//" and "/*")
361 are treated as partially-known. They prevent the leading "/" from being
362 a mark by itself, but do not actually constitute a stand-alone mark.
364 If `TK_mark` is ignored, then unknown marks are returned as errors.
369 Known marks are included in the same list as the list of known words.
373 while (is_mark(ch, state->conf)) {
376 close_token(state, &tk);
377 n = find_known(state->conf, tk.txt);
379 tk.num = TK_reserved + n;
380 else if (tk.num != TK_error) {
381 /* found a longest-known-mark, still need to
384 if (is_comment(ignored, tk.txt)) {
385 /* Yes, this is a comment, not a '/' */
386 restore_unget_state(state);
391 close_token(state, &tk);
395 save_unget_state(state);
396 ch = get_char(state);
398 /* No need to worry about other token types */
400 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
401 /* If strings are allowed, a quote (which isn't a known mark)
402 * mustn't be treated as part of an unknown mark. It can be
403 * part of a multi-line string though.
407 close_token(state, &tk);
408 if (is_comment(ignored, tk.txt)) {
409 /* looks like a permitted comment, and not a known mark,
410 * so assume it is a comment.
412 restore_unget_state(state);
417 if (tk.num != TK_error) {
418 close_token(state, &tk);
422 If we don't find a known mark, we will check for strings and comments
423 before assuming that we have an unknown mark
432 Strings start with one of single quote, double quote, or back quote
433 and continue until a matching character on the same line. Any of
434 these characters can be included in the list of known marks and then
435 they will not be used for identifying strings.
437 Immediately following the close quote, one or two ASCII letters may
438 appear. These are somewhat like the arbitrary letters allowed in
439 "Numbers" above. They can be used by the language in various ways.
441 If 3 identical quote characters appear in a row and are
442 followed by a newline, then this forms a multi-line string which
443 continues until an identical triple quote appears on a line preceded
444 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
446 Multi-line strings may not extend beyond the end of the `code_node` in
449 Normal strings and multi-line strings are encoded as two different
456 ###### internal functions
457 static int is_quote(wchar_t ch)
459 return ch == '\'' || ch == '"' || ch == '`'; // "
462 #### Multi-line strings
464 The multi-line string is checked for first. If they are being
465 ignored, we fall through and treat a triple quote as an empty string
466 followed by the start of a new string.
469 if (tk.txt.len >= 3 &&
470 !(ignored & (1 << TK_multi_string)) &&
471 is_quote(tk.txt.txt[0]) &&
472 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
473 is_newline(tk.txt.txt[3])) {
475 wchar_t first = tk.txt.txt[0];
478 while (!at_eon(state) && qseen < 3) {
479 ch = get_char(state);
480 if (is_newline(ch)) {
483 } else if (at_sol && ch == first) {
485 } else if (ch != ' ' && ch != '\t') {
491 /* Hit end of node - error.
492 * unget so the newline is seen,
493 * but return rest of string as an error.
497 close_token(state, &tk);
501 /* 2 letters are allowed */
502 ch = get_char(state);
504 ch = get_char(state);
506 ch = get_char(state);
507 /* Now we must have a newline, but we don't return it
510 close_token(state, &tk);
511 tk.num = TK_multi_string;
517 #### Single-line strings
519 The sequence of marks collected may be more than a single-line
520 string, so we reset to the start and collect characters until
521 we find a close quote or a newline.
523 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
526 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
527 !(ignored & (1<<TK_string))) {
528 wchar_t first = tk.txt.txt[0];
529 reset_token(state, &tk);
530 ch = get_char(state);
532 while (!at_eon(state) && !is_newline(ch)) {
533 ch = get_char(state);
538 if (is_newline(ch)) {
543 while (!at_eon(state) && (ch = get_char(state)) &&
547 close_token(state, &tk);
553 Single line comments may start with '`//`' or '`#`' providing that these
554 are not known marks. They continue to the end of the line.
556 Block comments start with '`/*`' if this is not a known mark. They
557 continue to the first occurrence of '`*/`' and may not contain any
558 occurrence of '`/*`'.
560 Block comments can be wholly within one line or can continue over
561 multiple lines. The multi-line version should be followed immediately
562 by a newline. The Linux kernel contains over 285000 multi-line
563 comments and only 34 are followed by characters other than white space
564 (which should be removed) or a backslash (only needed in macros). So
565 it would not suffer from this rule.
567 These two comment types are reported as two separate token types, and
568 consequently can be ignored separately. When ignored a comment is
569 still parsed, but is discarded.
575 ###### internal functions
576 static int is_line_comment(int ignored, struct text txt)
578 if (ignored & (1 << TK_line_comment))
580 return (txt.len >= 1 && txt.txt[0] == '#') ||
581 (txt.len >= 2 && txt.txt[0] == '/' &&
585 static int is_block_comment(int ignored, struct text txt)
587 if (ignored & (1 << TK_block_comment))
589 return txt.len >= 2 && txt.txt[0] == '/' &&
593 static int is_comment(int ignored, struct text txt)
595 return is_line_comment(ignored, txt) ||
596 is_block_comment(ignored, txt);
599 #### Single line comments
601 A single-line comment continues up to, but not including the newline
606 if (is_line_comment(ignored, tk.txt)) {
607 while (!is_newline(ch) && !at_eon(state))
608 ch = get_char(state);
611 close_token(state, &tk);
612 tk.num = TK_line_comment;
613 if (!state->conf->return_comments)
620 The token text collected so far could exceed the comment, so we need
623 If we find an embedded `/*` we reset to just before the '/' and report
624 an error. That way the next thing to be parsed will be the rest of
625 the comment. This requires a double unget, so we need to save/restore
626 the unget state (explained later).
630 if (is_block_comment(ignored, tk.txt)) {
633 reset_token(state, &tk);
636 save_unget_state(state);
637 ch = get_char(state);
639 while (!at_eon(state) &&
640 (prev != '/' || ch != '*') &&
641 (prev != '*' || ch != '/')) {
645 save_unget_state(state);
646 ch = get_char(state);
648 close_token(state, &tk);
654 /* embedded. Need to unget twice! */
655 restore_unget_state(state);
660 tk.num = TK_block_comment;
661 if (newlines && !(ignored & (1<<TK_newline))) {
662 /* next char must be newline */
663 ch = get_char(state);
668 if (tk.num == TK_error || state->conf->return_comments)
673 ### Indents, Newlines, and White Space.
675 Normally white space is ignored. However newlines can be important as
676 can indents, which are either after a newline or at the start of a
677 node (detected by `at_son()`);
679 ###### exported functions
680 static inline int is_newline(wchar_t ch)
682 return ch == '\n' || ch == '\f' || ch == '\v';
686 if (ch <= ' ' && !is_newline(ch)
690 If a line starts with more white-space than the previous non-blank
691 line - or if the first non-blank line in the document starts with any
692 white-space - then an "IN" is reported at the start of the line.
694 Before the next non-blank line which starts with less white space, or
695 at the latest at the end of the document, a matching "OUT" token
696 is reported. There will always be an exact match between "IN" and
699 It is possible for "OUT" to be followed (almost) immediately by an
700 "IN". This happens if, for example, the indent of three consecutive
701 lines are 0, 8, 4 spaces. Before the second line we report an
702 "IN". Before the third line we must report an "OUT", as 4 is less
703 than 8, then also an "IN" as 4 is greater than 0.
709 For the purpose of measuring the length of white space, a tab adds at
710 least one space, and rounds up to a multiple of 8.
712 ###### exported functions
713 static inline int indent_tab(int indent)
718 We need to track the current levels of indent. This requires some
719 sort of stack as indent levels are pushed on and popped off. In
720 practice this stack is unlikely to often exceed 5 so we will use a
721 fixed stack of 20 indent levels. More than this will be silently
726 int indent_sizes[20];
728 `indent_sizes[0]` will always be zero - this simplifies some code.
732 Newlines can optionally be reported. Newlines within a block comment
733 or a multi-line string are not reported separately, but each of these
734 must be followed immediately by a newline so these constructs cannot
735 hide the fact that a newline was present.
737 When indents are being reported, the Newline which would normally be
738 reported immediately before the "IN" is delayed until after the
739 matching "OUT". This makes an indented section act like a
740 continuation of the previous line to some extent.
742 A blank line would normally be reported simply as two consecutive Newline
743 tokens. However if the subsequent line is indented (and indents are being
744 reported) then the right thing to do is less obvious as Newlines should be
745 delayed - but how many Newlines?
747 The approach we will take is to report the extra Newlines immediately after
748 the IN token, so the blank line is treated as though it were an indented
754 If we find a newline or white space at the start of a block, we keep
755 collecting spaces, tabs, and newlines until we find some real text.
756 Then depending on the indent we generate some number of tokens. These
757 will be a sequence of "Newline OUT" pairs representing a decrease
758 in indent, then either a Newline or an IN depending on whether the
759 next line is indented, then zero or more Newlines representing all the
760 blank lines that have been skipped.
762 When a Newline leads to the next block of code there is a question of
763 whether the various Newline and OUT/IN tokens should appear to
764 belong to the earlier or later block. This is addressed by processing
765 the tokens in two stages based on the relative indent levels of the
766 two blocks (each block has a base indent to which the actual indents
769 Any "Newline OUT" pairs needed to reduce the current indent to the
770 maximum of the base indents of the old and new blocks are generated
771 against the old block. Then if the next block does not have an
772 increased indent, one more "Newline" is generated.
774 If further "Newline OUT" pairs are needed to get to the indent
775 level of the 'next' block, they are generated against that block,
776 though the first Newline is suppressed (it having already been
779 Finally the Newline or IN for the first line of the new block is
780 generated, unless the Newline needs to be suppressed because it
781 appeared at the end of the previous block.
783 This means that a block may start with an OUT or an IN, but
784 will only start with a Newline if it actually starts with a blank
787 We will need to represent in the `token_state` where in this sequence
788 of delayed tokens we are. As `state.col` records the target indent we
789 don't need to record how many OUTs or INs are needed. We do
790 need to record the number of blank lines, and which of Newline and
791 OUT is needed next in the initial sequence of pairs.
793 For this we store one more than the number of blank lines as
794 `delayed_lines` and a flag for `out_next`.
801 Generating these tokens involves two separate pieces of code.
803 Firstly we need to recognise white space and count the indents and
804 newlines. These are recorded in the above state fields.
806 Separately we need, on each call to `token_next`, to check if
807 there are some delayed tokens and if so we need to advance the state
808 information and return one token.
810 ###### internal functions
811 static int state_indent(struct token_state *state)
813 if (state->node == NULL)
815 return state->node->indent - state->node->needs_strip + state->col;
820 state_check_node(state);
821 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
823 int was_nl = is_newline(ch);
824 if (ignored & (1<<TK_in)) {
827 if (ignored & (1<<TK_newline))
830 close_token(state, &tk);
833 // Indents are needed, so check all white space.
834 while (ch <= ' ' && ch != WEOF) {
837 ch = get_char(state);
839 state_check_node(state);
843 state->delayed_lines = newlines;
844 state->out_next = !was_nl;
845 state->check_indent = 1;
849 ###### delayed tokens
851 if (state->check_indent || state->delayed_lines) {
852 if (state_indent(state) < state->indent_sizes[state->indent_level]) {
853 if (!state->out_next &&
854 !(ignored & (1<<TK_newline))) {
859 state->indent_level -= 1;
864 if (state_indent(state) > state->indent_sizes[state->indent_level] &&
865 state->indent_level < sizeof(state->indent_sizes)-1) {
866 state->indent_level += 1;
867 state->indent_sizes[state->indent_level] = state_indent(state);
868 if (state->delayed_lines)
869 state->delayed_lines -= 1;
873 state->check_indent = 0;
874 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
876 state->delayed_lines -= 1;
879 state->delayed_lines = 0;
885 After the last newline in the file has been processed, a special
886 end-of-file token will be returned. Any further attempts to get more
887 tokens will continue to return the same end-of-file token.
898 ### Unknown Marks, or errors.
900 We have now handled all the possible known mark-like tokens.
901 If the token we have is not empty and `TK_mark` is allowed,
902 we have an unknown mark, otherwise this must be an error.
906 /* one unknown mark character */
908 close_token(state, &tk);
909 if (ignored & (1<<TK_mark))
915 /* Completely unrecognised character is next, possibly
916 * a digit and we are ignoring numbers.
917 * Whatever it is, make it an error.
920 close_token(state, &tk);
924 ## Tools For The Task
926 You may have noticed that there are a few gaps we left in the above -
927 functions used without first defining them. Doing so above would have
930 ### Character by character
932 As we walk through the various `code_node`s we need to process whole
933 Unicode codepoints, and keep track of which line and column we are on.
934 We will assume for now that any printing character uses one column,
935 though that is not true in general.
937 As the text in a `code_node` may include an indent that identifies it as
938 being code, we need to be careful to strip that. The `code_node` has
939 a flag that tells us whether or not we need to strip.
945 struct code_node *node;
951 ###### internal functions
953 static void do_strip(struct token_state *state)
956 if (state->node->needs_strip) {
958 while (n && state->node->code.txt[state->offset] == ' ') {
963 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
964 indent = indent_tab(indent);
971 static void state_check_node(struct token_state *state)
975 if (state->node->code.len > state->offset)
979 state->node = state->node->next;
980 while (state->node && state->node->code.txt == NULL);
982 state->prev_offset = 0;
983 state->strip_offset = 0;
985 if (state->node == NULL)
987 state->line = state->node->line_no;
989 state->col = state->node->needs_strip;
990 state->strip_offset = state->offset;
993 static wint_t get_char(struct token_state *state)
999 state_check_node(state);
1000 if (state->node == NULL)
1005 memset(&mbstate, 0, sizeof(mbstate));
1007 n = mbrtowc(&next, state->node->code.txt + state->offset,
1008 state->node->code.len - state->offset,
1010 if (n == -2 || n == 0) {
1011 /* Not enough bytes - not really possible */
1012 next = '\n'; // NOTEST
1013 state->offset = state->node->code.len; // NOTEST
1014 } else if (n == -1) {
1016 state->offset += 1; // NOTEST
1017 next = 0x7f; // an illegal character // NOTEST
1023 } else if (is_newline(next)) {
1026 state->col = state->node->needs_strip;
1027 } else if (next == '\t') {
1028 state->col = indent_tab(state->col);
1033 We will sometimes want to "unget" the last character as it needs to be
1034 considered again as part of the next token. So we need to store a
1035 'previous' version of all metadata.
1042 ###### before get_char
1043 state->prev_offset = state->offset;
1044 state->prev_line = state->line;
1045 state->prev_col = state->col;
1047 ###### internal functions
1049 static void unget_char(struct token_state *state)
1052 state->offset = state->prev_offset;
1053 state->line = state->prev_line;
1054 state->col = state->prev_col;
1058 We occasionally need a double-unget, particularly for numbers and
1059 block comments. We don't impose this cost on all scanning, but
1060 require those code sections that need it to call `save_unget_state`
1061 before each `get_char`, and then `restore_unget_state` when a
1062 double-unget is needed.
1069 ###### internal functions
1070 static void save_unget_state(struct token_state *state)
1072 state->prev_offset2 = state->prev_offset;
1073 state->prev_line2 = state->prev_line;
1074 state->prev_col2 = state->prev_col;
1077 static void restore_unget_state(struct token_state *state)
1079 state->prev_offset = state->prev_offset2;
1080 state->prev_line = state->prev_line2;
1081 state->prev_col = state->prev_col2;
1084 At the start of a token we don't want to be at the end of a code block
1085 if we can help it. To avoid this possibility, we 'get' and 'unget' a
1086 single character. This will move into the next non-empty code block
1087 and leave the current pointer at the start of it.
1089 This has to happen _after_ dealing with delayed tokens as some of them
1090 must appear in the previous node. When we do this, we need to reset
1091 the data in the token.
1093 ###### delayed tokens
1094 if (at_eon(state)) {
1097 tk.node = state->node;
1099 tk.txt.txt = state->node->code.txt + state->offset;
1100 tk.line = state->line;
1101 tk.col = state->col;
1107 The current token is initialized to line up with the first character
1108 that we 'get' for each token. When we have, or might have, a full
1109 token we can call `close_token` to set the `len` of the token
1110 appropriately. This can safely be called multiple times.
1112 Finally we occasionally (for single-line strings and block comments)
1113 need to reset to the beginning of the current token as we might have
1114 parsed too much already. For that there is `reset_token`.
1117 tk.node = state->node;
1119 tk.txt.txt = state->node->code.txt + state->offset;
1120 tk.line = state->line;
1121 tk.col = state->col;
1124 ###### internal functions
1126 static void close_token(struct token_state *state,
1129 if (state->node != tk->node)
1130 tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1132 tk->txt.len = (state->node->code.txt + state->offset)
1136 static void reset_token(struct token_state *state, struct token *tok)
1138 state->prev_line = tok->line;
1139 state->prev_col = tok->col;
1140 state->prev_offset = tok->txt.txt - state->node->code.txt;
1145 Tokens may not cross into the next `code_node`, and some tokens can
1146 include the newline at the end of a `code_node`, we must be able to
1147 easily check if we have reached the end. Equally we need to know if
1148 we are at the start of a node, as white space is treated a little
1151 ###### internal functions
1153 static int at_son(struct token_state *state)
1155 return state->prev_offset <= state->strip_offset;
1158 static int at_eon(struct token_state *state)
1160 // at end-of-node ??
1161 return state->node == NULL ||
1162 state->offset >= state->node->code.len;
1165 ### Find a known word
1167 As the known-word list is sorted we can use a simple binary search.
1168 Following the pattern established in "mdcode", we will use a `struct
1169 text` with start and length to represent the code fragment we are
1172 ###### internal functions
1173 static int find_known(struct token_config *conf, struct text txt)
1176 int hi = conf->known_count;
1178 while (lo + 1 < hi) {
1179 int mid = (lo + hi) / 2;
1180 int cmp = strncmp(conf->words_marks[mid],
1182 if (cmp == 0 && conf->words_marks[mid][txt.len])
1189 if (strncmp(conf->words_marks[lo],
1190 txt.txt, txt.len) == 0
1191 && conf->words_marks[lo][txt.len] == 0)
1197 ### Bringing it all together
1199 Now we have all the bits there is just one section missing: combining
1200 all the token parsing code into one block.
1202 The handling of delayed tokens (Newlines, INs, OUTs) must come
1203 first before we try getting another character.
1205 Then we parse all the text, making sure that we check for known marks
1206 before strings and comments, but unknown marks after strings and comments.
1208 This block of code will either return a token, or will choose to
1209 ignore one, in which case it will `continue` around to the top of the
1215 ch = get_char(state);
1224 As well as getting tokens, we need to be able to create the
1225 `token_state` to start with, and discard it later.
1230 ###### main functions
1231 struct token_state *token_open(struct code_node *code, struct
1234 struct token_state *state = malloc(sizeof(*state));
1235 memset(state, 0, sizeof(*state));
1237 state->line = code->line_no;
1239 state->col = state->node->needs_strip;
1240 state->strip_offset = state->offset;
1244 void token_close(struct token_state *state)
1249 ###### exported functions
1250 struct token_state *token_open(struct code_node *code, struct
1251 token_config *conf);
1252 void token_close(struct token_state *state);
1256 Getting tokens is the main thing but it is also useful to be able to
1257 print out token information, particularly for tracing and testing.
1259 Known tokens are printed verbatim. Other tokens are printed as
1260 `type(content)` where content is truncated to a given number of characters.
1262 The function for printing a truncated string (`text_dump`) is also exported
1263 so that it can be used to tracing processed strings too.
1268 ###### exported functions
1269 void token_trace(FILE *f, struct token tok, int max);
1270 void text_dump(FILE *f, struct text t, int max);
1272 ###### main functions
1274 void text_dump(FILE *f, struct text txt, int max)
1281 for (i = 0; i < max; i++) {
1282 char c = txt.txt[i];
1283 if (c < ' ' || c > '~')
1284 fprintf(f, "\\x%02x", c & 0xff);
1288 fprintf(f, "%c", c);
1294 void token_trace(FILE *f, struct token tok, int max)
1296 static char *types[] = {
1297 [TK_ident] = "ident",
1299 [TK_number] = "number",
1300 [TK_string] = "string",
1301 [TK_multi_string] = "mstring",
1302 [TK_line_comment] = "lcomment",
1303 [TK_block_comment] = "bcomment",
1306 [TK_newline] = "newline",
1308 [TK_error] = "ERROR",
1312 default: /* known word or mark */
1313 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1319 /* No token text included */
1320 fprintf(f, "%s()", types[tok.num]);
1326 case TK_multi_string:
1327 case TK_line_comment:
1328 case TK_block_comment:
1330 fprintf(f, "%s(", types[tok.num]);
1331 text_dump(f, tok.txt, max);
1337 ### And there we have it
1339 We now have all the library functions defined for reading and printing
1340 tokens. Now we just need C files to store them, and a mk file to make them.
1342 ###### File: scanner.h
1344 ## exported functions
1346 ###### File: libscanner.c
1348 #include "scanner.h"
1350 ## internal functions
1353 ###### File: scanner.mk
1357 scanner.mk scanner.h libscanner.c : scanner.mdc
1360 libscanner.o : libscanner.c
1361 $(CC) $(CFLAGS) -c libscanner.c
1363 ## Processing numbers
1365 Converting a `TK_number` token to a numerical value is a slightly
1366 higher level task than lexical analysis, and slightly lower than
1367 grammar parsing, so put it here - as an appendix if you like.
1369 Importantly it will be used by the same testing rig that is used for
1370 testing the token scanner.
1372 The numeric value that we will convert all numbers into is the `mpq_t`
1373 from the GNU high precision number library "libgmp".
1375 ###### number includes
1379 Firstly we need to be able to parse a string of digits in a given base
1380 and possibly with a decimal marker. We store this in an `mpz_t`
1381 integer and report the number of digits after the decimal mark.
1383 On error we return zero and ensure that the `mpz_t` has been freed, or
1384 had never been initialised.
1386 ###### number functions
1388 static int parse_digits(mpz_t num, struct text tok, int base,
1391 /* Accept digits up to 'base', ignore '_' and
1392 * (for base 10) ' ' if they appear between two
1393 * legal digits, and if `placesp` is not NULL,
1394 * allow a single '.' or ',' and report the number
1395 * of digits beyond there.
1396 * Return number of characters processed (p),
1397 * or 0 if something illegal was found.
1400 int decimal = -1; // digits after marker
1401 enum {Digit, Space, Other} prev = Other;
1404 for (p = 0; p < tok.len; p++) {
1406 char c = tok.txt[p];
1408 if (c == '_' || (c == ' ' && base == 10)) {
1414 if (c == '.' || c == ',') {
1417 if (!placesp || decimal >= 0)
1425 else if (isupper(c))
1427 else if (islower(c))
1438 mpz_mul_ui(num, num, base);
1442 mpz_add_ui(num, num, dig);
1461 ###### number includes
1464 To parse a full number we need to consider the optional base, the
1465 mantissa, and the optional exponent. We will treat these one at a
1468 The base is indicated by a letter after a leading zero, which must be
1469 followed by a base letter or a period. The base also determines the
1470 character which will mark an exponent.
1478 if (tok.txt[0] == '0' && tok.len > 1) {
1480 switch(tok.txt[1]) {
1511 // another digit is not permitted
1515 // must be decimal marker or trailing
1516 // letter, which are OK;
1523 After the base is the mantissa, which may contain a decimal mark, so
1524 we need to record the number of places. We won't impose the number of
1525 places until we have the exponent as well.
1532 ###### parse mantissa
1534 d = parse_digits(mant, tok, base, &places);
1540 mpq_set_z(num, mant);
1543 After the mantissa number may come an exponent which may be positive
1544 or negative. We assume at this point that we have seen the exponent
1552 ###### parse exponent
1554 if (tok.txt[0] == '+') {
1557 } else if (tok.txt[0] == '-') {
1563 d = parse_digits(exp, tok, 10, NULL);
1568 if (!mpz_fits_slong_p(exp)) {
1573 lexp = mpz_get_si(exp) * esign;
1578 Now that we have the mantissa and the exponent we can multiply them
1579 together, also allowing for the number of digits after the decimal
1582 For base 10, we simply subtract the decimal places from the exponent.
1583 For the other bases, as the exponent is always based on 2, even for
1584 octal and hex, we need a bit more detail.
1585 We then recover the sign from the exponent, as division is quite
1586 different from multiplication.
1588 ###### calc exponent
1607 Imposing the exponent on the number is also very different for base 10
1608 than for the others. For the binary shift `gmp` provides a simple
1609 function. For base 10 we use something like Russian Peasant
1612 ###### calc exponent
1616 mpq_set_ui(tens, 10, 1);
1620 mpq_mul(num, num, tens);
1622 mpq_div(num, num, tens);
1627 mpq_mul(tens, tens, tens);
1632 mpq_mul_2exp(num, num, lexp);
1634 mpq_div_2exp(num, num, lexp);
1637 Now we are ready to parse a number: the base, mantissa, and exponent.
1638 If all goes well we check for the possible trailing letters and
1639 return. Return value is 1 for success and 0 for failure.
1641 ###### number functions
1642 int number_parse(mpq_t num, char tail[3], struct text tok)
1649 if (tok.len > 1 && (tok.txt[0] == expc ||
1650 tok.txt[0] == toupper(expc))) {
1657 for (i = 0; i < 2; i++) {
1660 if (!isalpha(tok.txt[i]))
1662 tail[i] = tok.txt[i];
1672 Number parsing goes in `libnumber.c`
1674 ###### File: libnumber.c
1682 ###### File: parse_number.h
1683 int number_parse(mpq_t num, char tail[3], struct text tok);
1685 ###### File: scanner.mk
1687 libnumber.o : libnumber.c
1688 $(CC) $(CFLAGS) -c libnumber.c
1690 ## Processing strings
1692 Both `TK_string` and `TK_multi_string` require post-processing which
1693 can be one of two types: literal or with escapes processed.
1694 Even literal processing is non-trivial as the file may contain indents
1695 which need to be stripped.
1697 Errors can only occur when processing escapes. Any unrecognised
1698 character following the escape character will cause an error.
1700 Processing escapes and stripping indents can only make the string
1701 shorter, not longer, so we allocate a buffer which is the same size as
1702 the string and process into that.
1704 To request escape processing, we pass the character we want to use for
1705 quoting, usually '`\`'. To avoid escape processing we pass a zero.
1708 int string_parse(struct token *tok, char escape,
1709 struct text *str, char tail[3])
1712 struct text t = tok->txt;
1716 if (tok->num == TK_string) {
1721 str->txt = malloc(t.len);
1734 The tail of the string can be 0, 1, or 2 letters
1737 if (i > 0 && isalpha(t.txt[i-1]))
1739 if (i > 0 && isalpha(t.txt[i-1]))
1741 strncpy(tail, t.txt+i, t.len-i);
1750 Stripping the quotes off a single-line string is trivial.
1751 The only part that is at all interesting is that the quote character must
1755 if (t.txt[t.len-1] != quote)
1765 For a multi-line string we have a little more work to do. We need to
1766 remove 3 quotes, not 1, and need to count the indent of the close
1767 quote as it will need to be stripped from all lines.
1771 t.txt[1] != quote || t.txt[2] != quote ||
1772 !is_newline(t.txt[3]))
1777 if (i <= 0 || t.txt[i-1] != quote)
1780 if (i <= 0 || t.txt[i-1] != quote)
1783 if (i <= 0 || t.txt[i-1] != quote)
1787 while (i > 0 && !is_newline(t.txt[i-1]))
1791 if (t.txt[i] == ' ')
1793 if (t.txt[i] == '\t')
1794 indent = indent_tab(indent);
1803 Now we just take one byte at a time. trans-ASCII unicode won't look
1804 like anything we are interested in so it will just be copied byte by
1809 for (i = 0; i < t.len; i++) {
1823 } else if (i+1 >= t.len) {
1824 // escape and end of string
1832 str->len = cp - str->txt;
1840 Every time we find a start of line, we strip spaces and tabs until the
1841 required indent is found.
1844 while (i < t.len && skipped < indent) {
1849 skipped = indent_tab(skipped);
1858 *cp++ = '\n'; break;
1860 *cp++ = '\r'; break;
1862 *cp++ = '\t'; break;
1864 *cp++ = '\b'; break;
1866 *cp++ = quote; break;
1868 *cp++ = '\f'; break;
1870 *cp++ = '\v'; break;
1872 *cp++ = '\a'; break;
1877 // 3 digit octal number - all three digits must be in '0'..'7'
1880 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1881 t.txt[i+2] < '0' || t.txt[i+2] > '7')
1883 n = (t.txt[i ]-'0') * 64 +
1884 (t.txt[i+1]-'0') * 8 +
1885 (t.txt[i+2]-'0') * 1;
1891 n = take_hex(2, t.txt+i+1, t.len-i-1);
1899 // 4 or 8 hex digits for unicode
1900 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1903 memset(&pstate, 0, sizeof(pstate));
1904 n = wcrtomb(cp, n, &pstate);
1908 i += c == 'u' ? 4 : 8;
1913 else if (is_newline(c))
1923 For `\x` `\u` and `\U` we need to collect a specific number of
1926 ###### string functions
1928 static long take_hex(int digits, char *cp, int l)
1940 else if (isupper(c))
1951 #### File: libstring.c
1953 String parsing goes in `libstring.c`
1962 #include "scanner.h"
1966 ###### File: parse_string.h
1967 int string_parse(struct token *tok, char escape,
1968 struct text *str, char tail[3]);
1970 ###### File: scanner.mk
1972 libstring.o : libstring.c
1973 $(CC) $(CFLAGS) -c libstring.c
1977 As "untested code is buggy code" we need a program to easily test
1978 the scanner library. This will simply parse a given file and report
1979 the tokens one per line.
1981 ###### File: scanner.c
1987 #include <sys/mman.h>
1994 #include "scanner.h"
1995 #include "parse_number.h"
1996 #include "parse_string.h"
1999 static void pr_err(char *msg)
2002 fprintf(stderr, "%s\n", msg);
2005 static int kcmp(const void *ap, const void *bp)
2007 char * const *a = ap;
2008 char * const *b = bp;
2009 return strcmp(*a, *b);
2012 int main(int argc, char *argv[])
2017 char *filename = NULL;
2018 struct token_state *state;
2019 const char *known[] = {
2028 struct token_config conf = {
2031 .words_marks = known,
2032 .number_chars = "., _+-",
2033 .known_count = sizeof(known)/sizeof(known[0]),
2036 static const struct option long_options[] = {
2037 { "word-start", 1, NULL, 'W'},
2038 { "word-cont", 1, NULL, 'w'},
2039 { "number-chars", 1, NULL, 'n'},
2040 { "ignore-numbers", 0, NULL, 'N'},
2041 { "ignore-ident", 0, NULL, 'I'},
2042 { "ignore-marks", 0, NULL, 'M'},
2043 { "ignore-strings", 0, NULL, 'S'},
2044 { "ignore-multi-strings",0, NULL, 'z'},
2045 { "ignore-line-comment",0, NULL, 'c'},
2046 { "ignore-newline", 0, NULL, 'l'},
2047 { "ignore-block-comment", 0, NULL, 'C'},
2048 { "ignore-indent", 0, NULL, 'i'},
2049 { "return-comments", 0, NULL, 'r'},
2050 { "file", 1, NULL, 'f'},
2051 { "section", 1, NULL, 's'},
2052 { NULL, 0, NULL, 0},
2054 static const char options[] = "W:w:n:NIMSzclCirf:s:";
2056 struct section *table, *s, *prev;
2058 char *section_name = NULL;
2059 int section_found = 0;
2061 setlocale(LC_ALL,"");
2062 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2065 case 'W': conf.word_start = optarg; break;
2066 case 'w': conf.word_cont = optarg; break;
2067 case 'n': conf.number_chars = optarg; break;
2068 case 'N': conf.ignored |= 1 << TK_number; break;
2069 case 'I': conf.ignored |= 1 << TK_ident; break;
2070 case 'M': conf.ignored |= 1 << TK_mark; break;
2071 case 'S': conf.ignored |= 1 << TK_string; break;
2072 case 'z': conf.ignored |= 1 << TK_multi_string; break;
2073 case 'c': conf.ignored |= 1 << TK_line_comment; break;
2074 case 'C': conf.ignored |= 1 << TK_block_comment; break;
2075 case 'l': conf.ignored |= 1 << TK_newline; break;
2076 case 'i': conf.ignored |= 1 << TK_in; break;
2077 case 'r': conf.return_comments = 1; break;
2078 case 'f': filename = optarg; break;
2079 case 's': section_name = optarg; break;
2080 default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2086 if (optind < argc) {
2087 const char **wm = calloc(argc - optind, sizeof(char*));
2089 for (i = optind; i < argc; i++)
2090 wm[i - optind] = argv[i];
2091 qsort(wm, argc-optind, sizeof(char*), kcmp);
2092 conf.words_marks = wm;
2093 conf.known_count = argc - optind;
2097 fd = open(filename, O_RDONLY);
2101 fprintf(stderr, "scanner: cannot open %s: %s\n",
2102 filename, strerror(errno));
2105 len = lseek(fd, 0, 2);
2107 fprintf(stderr,"scanner: %s is empty or not seekable\n",
2108 filename ?: "stdin");
2111 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2112 table = code_extract(file, file+len, pr_err);
2115 (code_free(s->code), prev = s, s = s->next, free(prev))) {
2117 (s->section.len != strlen(section_name) ||
2118 strncmp(s->section.txt, section_name, s->section.len) != 0))
2122 printf("Tokenizing: %.*s\n", s->section.len,
2124 state = token_open(s->code, &conf);
2126 struct token tk = token_next(state);
2127 printf("%d:%d ", tk.line, tk.col);
2128 token_trace(stdout, tk, 20);
2129 if (tk.num == TK_number) {
2132 if (number_parse(num, tail,tk.txt)) {
2133 printf(" %s ", tail);
2134 mpq_out_str(stdout, 10, num);
2137 printf(" BAD NUMBER");
2139 if (tk.num == TK_string ||
2140 tk.num == TK_multi_string) {
2144 if (tk.txt.txt[0] == '`')
2146 if (string_parse(&tk, esc,
2148 printf(" %s ", tail);
2149 text_dump(stdout, str, 20);
2152 printf(" BAD STRING");
2155 if (tk.num == TK_error)
2157 if (tk.num == TK_eof)
2162 if (conf.words_marks != known)
2163 free(conf.words_marks);
2164 if (section_name && !section_found) {
2165 fprintf(stderr, "scanner: section %s not found\n", section_name);
2170 ###### File: scanner.mk
2171 scanner.c : scanner.mdc
2174 scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2175 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2176 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2177 scanner.o : scanner.c
2178 $(CC) $(CFLAGS) -c scanner.c