5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters are assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ##
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, as typically comments
73 would be. The exact consequence of ignoring each token type varies
78 int ignored; // bit set of ignored tokens.
79 ## token config parameters
83 struct token_config *conf;
85 ###### token_next init
86 int ignored = state->conf->ignored;
89 The different tokens are numbers, words, marks, strings, comments,
90 newlines, EOF, and indents, each of which is examined in detail below.
92 There are various cases where no token can be found in part of the
93 input. All of these will be reported as a `TK_error` token.
95 It is possible to declare a number of strings which form distinct
96 tokens (rather than being grouped as e.g. 'word'). These are given
97 token numbers from `TK_reserved` upwards.
108 Numbers are the messiest tokens to parse, primarily because they can
109 contain characters that also have meaning outside of numbers and,
110 particularly, immediately after numbers.
112 The obvious example is the '`-`' sign. It can come inside a number for
113 a negative exponent, or after a number as a subtraction operator. To
114 be sure we have parsed as best as possible we need to only allow the
115 '`-`' inside a number if it is after an exponent character. This can be
116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
117 digit, so we don't allow '`-`' after just any `e`.
119 To make matters worse, our language designer has decided to experiment
120 with allowing commas to be used as the decimal indicator, and spaces
121 to be used to separate groups of digits in large numbers. Both of
122 these can reasonably be restricted to appear between two digits, so we
123 have to add that condition to our tests.
125 So we cannot just treat numbers as starting with a digit and being
126 followed by some set of characters. We need more structure than that.
130 - Numbers must start with a digit.
131 - If the first digit is zero, the next character must be a base
132 signifier (one of `xob`) or a decimal marker (`.` or `,`).
133 In the first case the first `p` or `P` may be followed by a sign.
134 - If the number doesn't start with `0` followed by one of `xob`, the
135 first `e` may be followed by a sign.
136 - Any digit or hex digit may be followed by a space or underscore
137 providing that the subsequent character is also a (hex) digit.
138 This rule will require an extra level of 'unget' to be
139 supported when handling characters.
140 - Otherwise any digits or ASCII letters are allowed. We do not at
141 this point check that the digits given are permitted by the base.
142 That will happen when the token is converted to a number.
144 To allow easy configuration, the various non alphanumeric characters
145 are only permitted if they are listed in a configuration parameter.
147 ###### token config parameters
150 Note that numbers may not start with a period, so `.75` is not a
151 number. This is not the norm, but is not unheard of. Excluding these
152 numbers simplifies the rule at very little cost.
157 If TK_number is ignored, digits will result in an error unless they
158 are declared to be a start character for words.
166 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
167 int prev_special = 0;
169 int decimal_mark = 0;
171 wchar_t ch2 = get_char(state);
172 if (strchr("xobXOB", ch2) != NULL)
180 if (ch == 'e' || ch == 'E')
184 if (ch == 'p' || ch == 'P')
188 save_unget_state(state);
189 ch = get_char(state);
194 if (ch == '+' || ch == '-') {
199 if (ch == '.' || ch == ',') {
205 /* Don't allow that special char,
208 restore_unget_state(state);
211 if (strchr(state->conf->number_chars, ch)) {
215 /* non-number char */
218 /* We seem to have a "number" token */
220 close_token(state, &tk);
226 Words start with a "start" character followed by the longest
227 sequence of "continue" characters. The Unicode ID_START and
228 ID_CONTINUE sets are always permitted, but other ASCII characters
229 can be added to these sets.
231 ###### token config parameters
235 ###### internal functions
236 static int is_word_start(wchar_t ch, struct token_config *conf)
238 return iswalpha(ch) ||
239 strchr(conf->word_start, ch) != NULL ||
240 u_hasBinaryProperty(ch, UCHAR_ID_START);
243 static int is_word_continue(wchar_t ch, struct token_config *conf)
245 return iswalnum(ch) ||
246 strchr(conf->word_cont, ch) != NULL ||
247 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
250 Words can be either known or unknown. Known words are referred to as
251 "reserved words" and get a unique token number. Unknown words are
252 "identifiers" and are syntactically a single token.
257 A list of known words must be provided. This list is shared with the
258 "marks" which are described next. The list must be lexically sorted
259 and the length of the list must be given (`known_count`).
260 Tokens matching these known words are reported as the index of the
261 list added to `TK_reserved`.
263 If identifiers are ignored, then any word which is not listed as a
264 known word results in an error.
266 ###### token config parameters
267 const char **words_marks;
272 if (is_word_start(ch, state->conf)) {
274 /* A word: identifier or reserved */
276 ch = get_char(state);
277 while (is_word_continue(ch, state->conf));
279 close_token(state, &tk);
281 if (ignored & (1<<TK_ident))
283 n = find_known(state->conf, tk.txt);
285 tk.num = TK_reserved + n;
291 Marks are generally one or more punctuation marks joined together. It
292 would be nice to use the term "symbol" for these, but that causes
293 confusion in a subsequent discussion of the grammar, which has terminal
294 symbols and non-terminal symbols which are conceptually quite
295 different. So strings of punctuation characters will be marks.
297 A "mark" consists of ASCII characters that are not white space, are not
298 "start" characters for words, and are not digits.
299 These will collectively be called mark characters.
301 ###### internal functions
302 static int is_mark(wchar_t ch, struct token_config *conf)
307 strchr(conf->word_start, ch) == NULL;
310 As with words, there can be known and unknown marks, though the rules
311 are slightly different.
313 Two marks do not need to be separated by a non-mark character. This
314 is different from words which do need to be separated by at least one
315 non-continue character.
317 The scanner will normally prefer longer sequences of mark characters,
318 but will more strongly prefer known marks over unknown marks. So if
319 it finds a known mark where adding one more character does not result
320 in a known mark, it will return that first known mark.
322 If no known mark is found we will test against strings and comments
323 below before giving up and assuming an unknown mark.
325 If an unknown mark contains a quote character or a comment marker, and
326 that token is not being ignored, then we terminate the unknown mark
327 before that quote or comment. This ensures that an unknown mark
328 immediately before a string is handled correctly.
330 If the first character of a comment marker (i.e. '/') is a known mark,
331 the above rules would suggest that the start of a comment would be
332 parsed as that mark, which is not what is wanted. So the introductory
333 sequences for a comment ("//" and "/*") are treated as
334 partially-known. They prevent the leading "/" from being a mark by
335 itself, but do not actually constitute a stand-alone mark.
337 If `TK_mark` is ignored, then unknown marks are returned as errors.
342 Known marks are included in the same list as the list of known words.
346 while (is_mark(ch, state->conf)) {
349 close_token(state, &tk);
350 n = find_known(state->conf, tk.txt);
352 tk.num = TK_reserved + n;
353 else if (tk.num != TK_error) {
354 /* found a longest-known-mark, still need to
357 if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
358 (ch == '/' || ch == '*')) {
359 /* Yes, this is a comment, not a '/' */
360 restore_unget_state(state);
365 close_token(state, &tk);
369 save_unget_state(state);
370 ch = get_char(state);
371 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
372 /* If strings are allowed, a quote (which isn't a known mark)
373 * mustn't be treated as part of an unknown mark. It can be
374 * part of a multi-line string though.
377 if (prev == '#' && n < 0)
378 /* '#' is not a known mark, so assume it is a comment */
380 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
381 close_token(state, &tk);
382 restore_unget_state(state);
385 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
386 close_token(state, &tk);
387 restore_unget_state(state);
392 if (tk.num != TK_error) {
393 close_token(state, &tk);
397 If we don't find a known mark, we will check for strings and comments
398 before assuming that we have an unknown mark
407 Strings start with one of single quote, double quote, or back quote
408 and continue until a matching character on the same line. Any of
409 these characters can be included in the list of known marks and then
410 they will not be used for identifying strings.
412 Immediately following the close quote, one or two ASCII letters may
413 appear. These are somewhat like the arbitrary letters allowed in
414 "Numbers" above. They can be used by the language in various ways.
416 If 3 identical quote characters appear in a row and are
417 followed by a newline, then this forms a multi-line string which
418 continues until an identical triple quote appears on a line preceded
419 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
421 Multi-line strings may not extend beyond the end of the `code_node` in
424 Normal strings and multi-line strings are encoded as two different
431 ###### internal functions
432 static int is_quote(wchar_t ch)
434 return ch == '\'' || ch == '"' || ch == '`'; // "
437 #### Multi-line strings
439 The multi-line string is checked for first. If they are being
440 ignored, we fall through and treat a triple quote as an empty string
441 followed by the start of a new string.
444 if (tk.txt.len == 3 &&
445 !(ignored & (1 << TK_multi_string)) &&
446 is_quote(tk.txt.txt[0]) &&
447 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
448 is_newline(tk.txt.txt[3])) {
450 wchar_t first = tk.txt.txt[0];
453 while (!at_eon(state) && qseen < 3) {
454 ch = get_char(state);
455 if (is_newline(ch)) {
458 } else if (at_sol && ch == first) {
460 } else if (ch != ' ' && ch != '\t') {
466 /* Hit end of node - error.
467 * unget so the newline is seen,
468 * but return rest of string as an error.
472 close_token(state, &tk);
476 /* 2 letters are allowed */
477 ch = get_char(state);
479 ch = get_char(state);
481 ch = get_char(state);
482 /* Now we must have a newline, but we don't return it
485 close_token(state, &tk);
486 tk.num = TK_multi_string;
492 #### Single-line strings
494 The sequence of marks collected may be more than a single-line
495 string, so we reset to the start and collect characters until
496 we find a close quote or a newline.
498 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
501 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
502 !(ignored & (1<<TK_string))) {
503 wchar_t first = tk.txt.txt[0];
504 reset_token(state, &tk);
505 ch = get_char(state);
507 while (!at_eon(state) && !is_newline(ch)) {
508 ch = get_char(state);
513 if (is_newline(ch)) {
518 while (!at_eon(state) && (ch = get_char(state)) &&
522 close_token(state, &tk);
528 Single line comments may start with '`//`' or '`#`' providing that these
529 are not known marks. They continue to the end of the line.
531 Block comments start with '`/*`' if this is not a known mark. They
532 continue to the first occurrence of '`*/`' and may not contain any
533 occurrence of '`/*`'.
535 Block comments can be wholly within one line or can continue over
536 multiple lines. The multi-line version should be followed immediately
537 by a newline. The Linux kernel contains over 285000 multi-line
538 comments and only 34 are followed by characters other than white space
539 (which should be removed) or a backslash (only needed in macros). So
540 it would not suffer from this rule.
542 These two comment types are reported as two separate token types, and
543 consequently can be ignored separately. When ignored a comment is
544 still parsed, but is discarded.
550 ###### internal functions
551 static int is_line_comment(struct text txt)
553 return (txt.len >= 1 && txt.txt[0] == '#') ||
554 (txt.len >= 2 && txt.txt[0] == '/' &&
558 static int is_block_comment(struct text txt)
560 return txt.len >= 2 && txt.txt[0] == '/' &&
564 #### Single line comments
566 A single-line comment continues up to, but not including the newline
571 if (is_line_comment(tk.txt)) {
572 while (!is_newline(ch) && !at_eon(state))
573 ch = get_char(state);
576 close_token(state, &tk);
577 tk.num = TK_line_comment;
578 if (ignored & (1 << TK_line_comment))
585 The token text collected so far could exceed the comment, so we need
588 If we find an embedded `/*` we reset to just before the '/' and report
589 an error. That way the next thing to be parsed will be the rest of
590 the comment. This requires a double unget, so we need to save/restore
591 the unget state (explained later).
595 if (is_block_comment(tk.txt)) {
598 reset_token(state, &tk);
601 save_unget_state(state);
602 ch = get_char(state);
604 while (!at_eon(state) &&
605 (prev != '/' || ch != '*') &&
606 (prev != '*' || ch != '/')) {
610 save_unget_state(state);
611 ch = get_char(state);
613 close_token(state, &tk);
619 /* embedded. Need to unget twice! */
620 restore_unget_state(state);
625 tk.num = TK_block_comment;
626 if (newlines && !(ignored & (1<<TK_newline))) {
627 /* next char must be newline */
628 ch = get_char(state);
633 if (tk.num == TK_error ||
634 !(ignored & (1 << TK_block_comment)))
639 ### Indents, Newlines, and White Space.
641 Normally white space is ignored. However newlines can be important as
642 can indents, which are either after a newline or at the start of a
643 node (detected by `at_son()`);
645 ###### exported functions
646 static inline int is_newline(wchar_t ch)
648 return ch == '\n' || ch == '\f' || ch == '\v';
652 if (ch <= ' ' && !is_newline(ch)
656 If a line starts with more white-space than the previous non-blank
657 line - or if the first non-blank line in the document starts with any
658 white-space - then an "IN" is reported at the start of the line.
660 Before the next non-blank line which starts with less white space, or
661 at the latest at the end of the document, a matching "OUT" token
662 is reported. There will always be an exact match between "IN" and
665 It is possible for "OUT" to be followed (almost) immediately by an
666 "IN". This happens if, for example, the indent of three consecutive
667 lines are 0, 8, 4 spaces. Before the second line we report an
668 "IN". Before the third line we must report an "OUT", as 4 is less
669 than 8, then also an "IN" as 4 is greater than 0.
675 For the purpose of measuring the length of white space, a tab adds at
676 least one space, and rounds up to a multiple of 8.
678 ###### exported functions
679 static inline int indent_tab(int indent)
684 We need to track the current levels of indent. This requires some
685 sort of stack as indent levels are pushed on and popped off. In
686 practice this stack is unlikely to often exceed 5 so we will use a
687 fixed stack of 20 indent levels. More than this will be silently
692 int indent_sizes[20];
696 Newlines can optionally be reported. Newlines within a block comment
697 or a multi-line string are not reported separately, but each of these
698 must be followed immediately by a newline so these constructs cannot
699 hide the fact that a newline was present.
701 When indents are being reported, the Newline which would normally be
702 reported immediately before the "IN" is delayed until after the
703 matching "OUT". This makes an indented section act like a
704 continuation of the previous line to some extent.
706 A blank line would normally be reported simply as two consecutive Newline
707 tokens. However if the subsequent line is indented (and indents are being
708 reported) then the right thing to do is less obvious as Newlines should be
709 delayed - but how many Newlines?
711 The approach we will take is to report the extra Newlines immediately after
712 the IN token, so the blank line is treated as though it were an indented
718 If we find a newline or white space at the start of a block, we keep
719 collecting spaces, tabs, and newlines until we find some real text.
720 Then depending on the indent we generate some number of tokens. These
721 will be a sequence of "Newline OUT" pairs representing a decrease
722 in indent, then either a Newline or an IN depending on whether the
723 next line is indented, then zero or more Newlines representing all the
724 blank lines that have been skipped.
726 When a Newline leads to the next block of code there is a question of
727 whether the various Newline and OUT/IN tokens should appear to
728 belong to the earlier or later block. This is addressed by processing
729 the tokens in two stages based on the relative indent levels of the
730 two blocks (each block has a base indent to which the actual indents
733 Any "Newline OUT" pairs needed to reduce the current indent to the
734 maximum of the base indents of the old and new blocks are generated
735 against the old block. Then if the next block does not have an
736 increased indent, one more "Newline" is generated.
738 If further "Newline OUT" pairs are needed to get to the indent
739 level of the 'next' block, they are generated against that block,
740 though the first Newline is suppressed (it having already been
743 Finally the Newline or IN for the first line of the new block is
744 generated, unless the Newline needs to be suppressed because it
745 appeared at the end of the previous block.
747 This means that a block may start with an OUT or an IN, but
748 will only start with a Newline if it actually starts with a blank
751 We will need to represent in the `token_state` where in this sequence
752 of delayed tokens we are. As `state.col` records the target indent we
753 don't need to record how many OUTs or INs are needed. We do
754 need to record the number of blank lines, and which of Newline and
755 OUT is needed next in the initial sequence of pairs.
757 For this we store one more than the number of blank lines as
758 `delayed_lines` and a flag for `out_next`.
765 Generating these tokens involve two separate pieces of code.
767 Firstly we need to recognise white space and count the indents and
768 newlines. These are recorded in the above state fields.
770 Separately, on each call to `token_next`, we need to check if
771 there are some delayed tokens and if so we need to advance the state
772 information and return one token.
775 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
777 int was_son = at_son(state);
778 if (ignored & (1<<TK_in)) {
781 if (ignored & (1<<TK_newline))
784 close_token(state, &tk);
787 // Indents are needed, so check all white space.
788 while (ch <= ' ' && !at_eon(state)) {
791 ch = get_char(state);
795 if (state->node->next &&
796 state->node->next->indent > state->node->indent)
797 state->col = state->node->next->indent;
799 state->col = state->node->indent;
802 state->delayed_lines = newlines;
803 state->out_next = was_son;
804 state->check_indent = 1;
809 ###### delayed tokens
811 if (state->check_indent || state->delayed_lines) {
812 if (state->col < state->indent_sizes[state->indent_level]) {
813 if (!state->out_next &&
814 !(ignored & (1<<TK_newline))) {
819 state->indent_level -= 1;
824 if (state->col > state->indent_sizes[state->indent_level] &&
825 state->indent_level < sizeof(state->indent_sizes)-1) {
826 state->indent_level += 1;
827 state->indent_sizes[state->indent_level] = state->col;
828 state->delayed_lines -= 1;
832 state->check_indent = 0;
833 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
835 state->delayed_lines -= 1;
838 state->delayed_lines = 0;
844 After the last newline in the file has been processed, a special
845 end-of-file token will be returned. Any further attempts to get more
846 tokens will continue to return the same end-of-file token.
856 state->check_indent = 1;
863 ### Unknown Marks, or errors.
865 We have now handled all the possible known mark-like tokens.
866 If the token we have is not empty and `TK_mark` is allowed,
867 we have an unknown mark, otherwise this must be an error.
871 /* one unknown mark character */
873 close_token(state, &tk);
874 if (ignored & (1<<TK_mark))
880 /* Completely unrecognised character is next, possibly
881 * a digit and we are ignoring numbers.
882 * Whatever it is, make it an error.
885 close_token(state, &tk);
889 ## Tools For The Task
891 You may have noticed that there are a few gaps we left in the above -
892 functions used without first defining them. Doing so above would have
895 ### Character by character
897 As we walk through the various `code_node`s we need to process whole
898 Unicode codepoints, and keep track of which line and column we are on.
899 We will assume for now that any printing character uses one column,
900 though that is not true in general.
902 As the text in a `code_node` may include an indent that identifies it as
903 being code, we need to be careful to strip that. The `code_node` has
904 a flag that tells us whether or not we need to strip.
910 struct code_node *node;
915 ###### internal functions
917 static int do_strip(struct token_state *state)
920 if (state->node->needs_strip) {
922 while (n && state->node->code.txt[state->offset] == ' ') {
927 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
928 indent = indent_tab(indent);
936 static wint_t get_char(struct token_state *state)
942 if (state->node == NULL)
944 if (state->node->code.len <= state->offset) {
946 state->node = state->node->next;
947 while (state->node && state->node->code.txt == NULL);
949 if (state->node == NULL)
951 state->line = state->node->line_no;
952 state->col = do_strip(state);
957 memset(&mbstate, 0, sizeof(mbstate));
959 n = mbrtowc(&next, state->node->code.txt + state->offset,
960 state->node->code.len - state->offset,
962 if (n == -2 || n == 0) {
963 /* Not enough bytes - not really possible */
965 state->offset = state->node->code.len;
966 } else if (n == -1) {
969 next = 0x7f; // an illegal character
975 } else if (is_newline(next)) {
977 state->col = do_strip(state);
978 } else if (next == '\t') {
979 state->col = indent_tab(state->col);
984 We will sometimes want to "unget" the last character as it needs to be
985 considered again as part of the next token. So we need to store a
986 'previous' version of all metadata.
993 ###### before get_char
994 state->prev_offset = state->offset;
995 state->prev_line = state->line;
996 state->prev_col = state->col;
998 ###### internal functions
1000 static void unget_char(struct token_state *state)
1003 state->offset = state->prev_offset;
1004 state->line = state->prev_line;
1005 state->col = state->prev_col;
1009 We occasionally need a double-unget, particularly for numbers and
1010 block comments. We don't impose this cost on all scanning, but
1011 require those code sections that need it to call `save_unget_state`
1012 before each `get_char`, and then `restore_unget_state` when a
1013 double-unget is needed.
1020 ###### internal functions
1021 static void save_unget_state(struct token_state *state)
1023 state->prev_offset2 = state->prev_offset;
1024 state->prev_line2 = state->prev_line;
1025 state->prev_col2 = state->prev_col;
1028 static void restore_unget_state(struct token_state *state)
1030 state->prev_offset = state->prev_offset2;
1031 state->prev_line = state->prev_line2;
1032 state->prev_col = state->prev_col2;
1035 At the start of a token we don't want to be at the end of a code block
1036 if we can help it. To avoid this possibility, we 'get' and 'unget' a
1037 single character. This will move into the next non-empty code block
1038 and leave the current pointer at the start of it.
1040 This has to happen _after_ dealing with delayed tokens as some of them
1041 must appear in the previous node. When we do this, we need to reset
1042 the data in the token.
1044 ###### delayed tokens
1045 if (at_eon(state)) {
1048 tk.node = state->node;
1050 tk.txt.txt = state->node->code.txt + state->offset;
1051 tk.line = state->line;
1052 tk.col = state->col;
1058 The current token is initialized to line up with the first character
1059 that we 'get' for each token. When we have, or might have, a full
1060 token we can call `close_token` to set the `len` of the token
1061 appropriately. This can safely be called multiple times.
1063 Finally we occasionally (for single-line strings and block comments)
1064 need to reset to the beginning of the current token as we might have
1065 parsed too much already. For that there is `reset_token`.
1068 tk.node = state->node;
1070 tk.txt.txt = state->node->code.txt + state->offset;
1071 tk.line = state->line;
1072 tk.col = state->col;
1075 ###### internal functions
1077 static void close_token(struct token_state *state,
1080 if (state->node != tk->node)
1081 tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1083 tk->txt.len = (state->node->code.txt + state->offset)
1087 static void reset_token(struct token_state *state, struct token *tok)
1089 state->prev_line = tok->line;
1090 state->prev_col = tok->col;
1091 state->prev_offset = tok->txt.txt - state->node->code.txt;
1097 Tokens may not cross into the next `code_node`, and as some tokens can
1098 include the newline at the end of a `code_node`, we must be able to
1099 easily check if we have reached the end. Equally we need to know if
1100 we are at the start of a node, as white space is treated a little
1103 ###### internal functions
1105 static int at_son(struct token_state *state)
1107 return state->offset == 0;
1110 static int at_eon(struct token_state *state)
1112 // at end-of-node ??
1113 return state->node == NULL ||
1114 state->offset >= state->node->code.len;
1117 ### Find a known word
1119 As the known-word list is sorted we can use a simple binary search.
1120 Following the pattern established in "mdcode", we will use a `struct
1121 text` with start and length to represent the code fragment we are
1124 ###### internal functions
1125 static int find_known(struct token_config *conf, struct text txt)
1128 int hi = conf->known_count;
1130 while (lo + 1 < hi) {
1131 int mid = (lo + hi) / 2;
1132 int cmp = strncmp(conf->words_marks[mid],
1134 if (cmp == 0 && conf->words_marks[mid][txt.len])
1141 if (strncmp(conf->words_marks[lo],
1142 txt.txt, txt.len) == 0
1143 && conf->words_marks[lo][txt.len] == 0)
1149 ### Bringing it all together
1151 Now we have all the bits there is just one section missing: combining
1152 all the token parsing code into one block.
1154 The handling of delayed tokens (Newlines, INs, OUTs) must come
1155 first before we try getting another character.
1157 Then we parse all the text, making sure that we check for known marks
1158 before strings and comments, but unknown marks after strings and comments.
1160 This block of code will either return a token, or will choose to
1161 ignore one, in which case it will `continue` around to the top of the
1167 ch = get_char(state);
1176 As well as getting tokens, we need to be able to create the
1177 `token_state` to start with, and discard it later.
1182 ###### main functions
1183 struct token_state *token_open(struct code_node *code, struct
1186 struct token_state *state = malloc(sizeof(*state));
1187 memset(state, 0, sizeof(*state));
1189 state->line = code->line_no;
1190 state->col = do_strip(state);
1194 void token_close(struct token_state *state)
1199 ###### exported functions
1200 struct token_state *token_open(struct code_node *code, struct
1201 token_config *conf);
1202 void token_close(struct token_state *state);
1206 Getting tokens is the main thing but it is also useful to be able to
1207 print out token information, particularly for tracing and testing.
1209 Known tokens are printed verbatim. Other tokens are printed as
1210 `type(content)` where content is truncated to a given number of characters.
1212 The function for printing a truncated string (`text_dump`) is also exported
1213 so that it can be used to tracing processed strings too.
1218 ###### exported functions
1219 void token_trace(FILE *f, struct token tok, int max);
1220 void text_dump(FILE *f, struct text t, int max);
1222 ###### main functions
1224 void text_dump(FILE *f, struct text txt, int max)
1231 for (i = 0; i < max; i++) {
1232 char c = txt.txt[i];
1233 if (c < ' ' || c > '~')
1234 fprintf(f, "\\x%02x", c & 0xff);
1238 fprintf(f, "%c", c);
1244 void token_trace(FILE *f, struct token tok, int max)
1246 static char *types[] = {
1247 [TK_ident] = "ident",
1249 [TK_number] = "number",
1250 [TK_string] = "string",
1251 [TK_multi_string] = "mstring",
1252 [TK_line_comment] = "lcomment",
1253 [TK_block_comment] = "bcomment",
1256 [TK_newline] = "newline",
1258 [TK_error] = "ERROR",
1262 default: /* known word or mark */
1263 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1269 /* No token text included */
1270 fprintf(f, "%s()", types[tok.num]);
1276 case TK_multi_string:
1277 case TK_line_comment:
1278 case TK_block_comment:
1280 fprintf(f, "%s(", types[tok.num]);
1281 text_dump(f, tok.txt, max);
1287 ### And there we have it
1289 We now have all the library functions defined for reading and printing
1290 tokens. Now we just need C files to store them, and a mk file to make them.
1292 ###### File: scanner.h
1294 ## exported functions
1296 ###### File: libscanner.c
1298 #include "scanner.h"
1300 ## internal functions
1303 ###### File: scanner.mk
1307 scanner.mk scanner.h libscanner.c : scanner.mdc
1310 libscanner.o : libscanner.c
1311 $(CC) $(CFLAGS) -c libscanner.c
1313 ## Processing numbers
1315 Converting a `TK_number` token to a numerical value is a slightly
1316 higher level task than lexical analysis, and slightly lower than
1317 grammar parsing, so put it here - as an index if you like.
1319 Importantly it will be used by the same testing rig that is used for
1320 testing the token scanner.
1322 The numeric value that we will convert all numbers into is the `mpq_t`
1323 from the GNU high precision number library "libgmp".
1325 ###### number includes
1329 Firstly we need to be able to parse a string of digits in a given base
1330 and possibly with a decimal marker. We store this in an `mpz_t`
1331 integer and report the number of digits after the decimal mark.
1333 On error we return zero and ensure that the `mpz_t` has been freed, or
1334 had never been initialised.
1336 ###### number functions
/* Parse a run of digits in the given base into 'num' (an mpz_t),
 * tolerating '_'/' ' separators between digits and, when 'placesp'
 * is non-NULL, one decimal marker ('.' or ',') whose following digit
 * count is reported through 'placesp'. */
1338 static int parse_digits(mpz_t num, struct text tok, int base,
1341 /* Accept digits up to 'base', ignore '_' and
1342 * ' ' if they appear between two legal digits,
1343 * and if `placesp` is not NULL, allow a single
1344 * '.' or ',' and report the number of digits
1346 * Return number of characters processed (p),
1347 * or 0 if something illegal was found.
1350 int decimal = -1; // digits after marker
1351 enum {Digit, Space, Other} prev = Other;
1354 for (p = 0; p < tok.len; p++) {
1356 char c = tok.txt[p];
1358 if (c == '_' || c == ' ') {
1364 if (c == '.' || c == ',') {
/* A second marker, or any marker when placesp is NULL, is illegal. */
1367 if (!placesp || decimal >= 0)
/* NOTE(review): isupper()/islower() on plain char is undefined for
 * negative values — CERT STR37-C suggests casting to unsigned char. */
1375 else if (isupper(c))
1377 else if (islower(c))
/* Accumulate: num = num * base + dig. */
1388 mpz_mul_ui(num, num, base);
1392 mpz_add_ui(num, num, dig);
1411 ###### number includes
1414 To parse a full number we need to consider the optional base, the
1415 mantissa, and the optional exponent. We will treat these one at a
1418 The base is indicated by a letter after a leading zero, which must be
1419 followed by a base letter or a period. The base also determines the
1420 character which will mark an exponent.
/* A leading '0' followed by a base letter (e.g. x, o, b) selects the
 * base; otherwise the token is decimal. */
1428 if (tok.txt[0] == '0' && tok.len > 1) {
1430 switch(tok.txt[1]) {
1461 // another digit is not permitted
1465 // must be decimal marker or trailing
1466 // letter, which are OK;
1473 After the base is the mantissa, which may contain a decimal mark, so
1474 we need to record the number of places. We won't impose the number of
1475 places until we have the exponent as well.
1482 ###### parse mantissa
/* Parse mantissa digits into 'mant', recording decimal places; the
 * places are applied later, once the exponent is known. */
1484 d = parse_digits(mant, tok, base, &places);
1490 mpq_set_z(num, mant);
1493 After the mantissa number may come an exponent which may be positive
1494 or negative. We assume at this point that we have seen the exponent
1502 ###### parse exponent
/* Optional sign, then decimal digits; the exponent must fit a long. */
1504 if (tok.txt[0] == '+') {
1507 } else if (tok.txt[0] == '-') {
1513 d = parse_digits(exp, tok, 10, NULL);
1518 if (!mpz_fits_slong_p(exp)) {
/* NOTE(review): if exp == LONG_MIN and esign == -1 this negation
 * overflows (UB) — confirm inputs cannot reach that magnitude. */
1523 lexp = mpz_get_si(exp) * esign;
1529 Now that we have the mantissa and the exponent we can multiply them
1530 together, also allowing for the number of digits after the decimal
1533 For base 10, we simply subtract the decimal places from the exponent.
1534 For the other bases, as the exponent is always based on 2, even for
1535 octal and hex, we need a bit more detail.
1536 We then recover the sign from the exponent, as division is quite
1537 different from multiplication.
1539 ###### calc exponent
1558 Imposing the exponent on the number is also very different for base 10
1559 than for the others. For the binary shift `gmp` provides a simple
1560 function. For base 10 we use something like Russian Peasant
1563 ###### calc exponent
/* Base 10: apply 10^|lexp| by repeated squaring of 'tens' ("Russian
 * Peasant" exponentiation), multiplying or dividing by sign. */
1567 mpq_set_ui(tens, 10, 1);
1571 mpq_mul(num, num, tens);
1573 mpq_div(num, num, tens);
1578 mpq_mul(tens, tens, tens);
/* Other bases: the exponent is a power of 2, so a binary shift works. */
1583 mpq_mul_2exp(num, num, lexp);
1585 mpq_div_2exp(num, num, lexp);
1588 Now we are ready to parse a number: the base, mantissa, and exponent.
1589 If all goes well we check for the possible trailing letters and
1590 return. Return value is 1 for success and 0 for failure.
1593 ###### number functions
/* Parse the token 'tok' as a number into 'num'; up to two trailing
 * letters are returned in 'tail'.  Returns 1 on success, 0 on failure. */
1594 int number_parse(mpq_t num, char tail[3], struct text tok)
/* 'expc' is the base-dependent exponent marker (e.g. 'e' or 'p'). */
1601 if (tok.len > 1 && (tok.txt[0] == expc ||
1602 tok.txt[0] == toupper(expc))) {
/* Collect at most two trailing letters into 'tail'. */
1609 for (i = 0; i < 2; i++) {
/* NOTE(review): isalpha() on plain char is UB for negative values —
 * cast to unsigned char (CERT STR37-C). */
1612 if (!isalpha(tok.txt[i]))
1614 tail[i] = tok.txt[i];
1624 Number parsing goes in `libnumber.c`
1626 ###### File: libnumber.c
1634 ###### File: number.h
1635 int number_parse(mpq_t num, char tail[3], struct text tok);
1637 ###### File: scanner.mk
1639 libnumber.o : libnumber.c
1640 $(CC) $(CFLAGS) -c libnumber.c
1642 ## Processing strings
1644 Both `TK_string` and `TK_multi_string` require post-processing which
1645 can be one of two types: literal or with escapes processed.
1646 Even literal processing is non-trivial as the file may contain indents
1647 which need to be stripped.
1649 Errors can only occur when processing escapes. Any unrecognised
1650 character following the escape character will cause an error.
1652 Processing escapes and stripping indents can only make the string
1653 shorter, not longer, so we allocate a buffer which is the same size as
1654 the string and process into that.
1656 To request escape processing, we pass the character we want to use for
1657 quoting, usually '`\`'. To avoid escape processing we pass a zero.
/* Post-process a TK_string or TK_multi_string token: strip quotes and
 * indents, optionally process escapes (when 'escape' is non-zero), and
 * return the result in 'str' (malloc'd; never longer than the input)
 * with any trailing letters in 'tail'. */
1660 int string_parse(struct token *tok, char escape,
1661 struct text *str, char tail[3])
1664 struct text t = tok->txt;
1668 if (tok->num == TK_string) {
/* Processing can only shrink the text, so the input length bounds it. */
1673 str->txt = malloc(t.len);
1686 The tail of the string can be 0, 1, or 2 letters
/* NOTE(review): when i == 0 these read t.txt[-1] — out of bounds.
 * The multi-string checks below correctly guard with 'i <= 0 ||';
 * these two look like they should test 'i > 0' instead of 'i >= 0'. */
1689 if (i >= 0 && isalpha(t.txt[i-1]))
1691 if (i >= 0 && isalpha(t.txt[i-1]))
/* NOTE(review): strncpy does not NUL-terminate when it copies exactly
 * t.len-i bytes — confirm tail[] is zeroed before this call, since
 * callers print it with "%s". */
1693 strncpy(tail, t.txt+i, t.len-i);
1702 Stripping the quote of a single-line string is trivial.
1703 The only part that is at all interesting is that quote character must
/* The closing quote must match the opening one. */
1707 if (t.txt[t.len-1] != quote)
1717 For a multi-line string we have a little more work to do. We need to
1718 remove 3 quotes, not 1, and need to count the indent of the close
1719 quote as it will need to be stripped from all lines.
/* Opening: three quotes followed immediately by a newline. */
1723 t.txt[1] != quote || t.txt[2] != quote ||
1724 !is_newline(t.txt[3]))
/* Closing: peel three quotes off the end, one at a time. */
1729 if (i <= 0 || t.txt[i-1] != quote)
1732 if (i <= 0 || t.txt[i-1] != quote)
1735 if (i <= 0 || t.txt[i-1] != quote)
/* Back up to the start of the closing-quote line to measure its indent. */
1739 while (i > 0 && !is_newline(t.txt[i-1]))
1743 if (t.txt[i] == ' ')
1745 if (t.txt[i] == '\t')
1746 indent = indent_tab(indent);
1755 Now we just take one byte at a time. Non-ASCII Unicode won't look
1756 like anything we are interested in so it will just be copied byte by
/* Copy the string body one byte at a time into the output buffer 'cp',
 * stripping indents at each start of line and translating escapes. */
1761 for (i = 0; i < t.len; i++) {
1775 } else if (i+1 >= t.len) {
1776 // escape and end of string
1784 str->len = cp - str->txt;
1792 Every time we find a start of line, we strip spaces and tabs until the
1793 required indent is found.
1796 while (i < t.len && skipped < indent) {
1801 skipped = indent_tab(skipped);
1810 *cp++ = '\n'; break;
1812 *cp++ = '\r'; break;
1814 *cp++ = '\t'; break;
1816 *cp++ = '\b'; break;
1818 *cp++ = quote; break;
1820 *cp++ = '\f'; break;
1822 *cp++ = '\v'; break;
1824 *cp++ = '\a'; break;
1829 // 3 digit octal number
/* BUG(review): the second range check repeats t.txt[i+1] — the upper
 * bound of the third digit is never tested.  The last clause should be
 * 't.txt[i+2] > '7''; as written, e.g. "\\0 78" with an 8 in the third
 * position would be accepted and mis-decoded. */
1832 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1833 t.txt[i+2] < '0' || t.txt[i+1] > '7')
1835 n = (t.txt[i ]-'0') * 64 +
1836 (t.txt[i+1]-'0') * 8 +
1837 (t.txt[i+2]-'0') * 1;
/* \xNN: two hex digits. */
1843 n = take_hex(2, t.txt+i+1, t.len-i-1);
1851 // 4 or 8 hex digits for unicode
1852 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
/* Encode the code point via the locale's multibyte encoding.
 * NOTE(review): wcrtomb takes wchar_t — on platforms where wchar_t is
 * 16 bits (e.g. Windows), code points above 0xFFFF would truncate;
 * also assumes a UTF-8 locale for correct output. */
1855 memset(&pstate, 0, sizeof(pstate));
1856 n = wcrtomb(cp, n, &pstate);
1860 i += c == 'u' ? 4 : 8;
1865 else if (is_newline(c))
1875 For `\x` `\u` and `\U` we need to collect a specific number of
1878 ###### string functions
/* Collect exactly 'digits' hex digits from 'cp' (at most 'l' bytes
 * available) and return the value; used for \x, \u, and \U escapes. */
1880 static long take_hex(int digits, char *cp, int l)
1892 else if (isupper(c))
1903 #### File: libstring.c
1905 String parsing goes in `libstring.c`
1914 #include "scanner.h"
1918 ###### File: string.h
1919 int string_parse(struct token *tok, char escape,
1920 struct text *str, char tail[3]);
1922 ###### File: scanner.mk
1924 libstring.o : libstring.c
1925 $(CC) $(CFLAGS) -c libstring.c
1930 As "untested code is buggy code" we need a program to easily test
1931 the scanner library. This will simply parse a given file and report
1932 the tokens one per line.
1934 ###### File: scanner.c
1940 #include <sys/mman.h>
1947 #include "scanner.h"
/* Error callback handed to code_extract(): report to stderr. */
1952 static void pr_err(char *msg)
1955 fprintf(stderr, "%s\n", msg);
/* qsort comparator for an array of 'char *' — compares the strings,
 * not the pointers. */
1958 static int kcmp(const void *ap, const void *bp)
1960 char * const *a = ap;
1961 char * const *b = bp;
1962 return strcmp(*a, *b);
/* Test driver: tokenize each code section of a (mmap'd) mdcode file,
 * printing every token, with numbers parsed via number_parse() and
 * strings via string_parse().  Options select ignored token types. */
1965 int main(int argc, char *argv[])
1970 char *filename = NULL;
1971 struct token_state *state;
1972 const char *known[] = {
1981 struct token_config conf = {
1984 .words_marks = known,
1985 .number_chars = "., _+-",
1986 .known_count = sizeof(known)/sizeof(known[0]),
1989 static const struct option long_options[] = {
1990 { "word-start", 1, NULL, 'W'},
1991 { "word-cont", 1, NULL, 'w'},
1992 { "number-chars", 1, NULL, 'n'},
1993 { "ignore-numbers", 0, NULL, 'N'},
1994 { "ignore-ident", 0, NULL, 'I'},
1995 { "ignore-marks", 0, NULL, 'M'},
1996 { "ignore-strings", 0, NULL, 'S'},
1997 { "ignore-multi-strings",0, NULL, 'z'},
1998 { "ignore-line-comment",0, NULL, 'c'},
1999 { "ignore-newline", 0, NULL, 'l'},
2000 { "ignore-block-comment", 0, NULL, 'C'},
2001 { "ignore-indent", 0, NULL, 'i'},
2002 { "file", 1, NULL, 'f'},
2003 { NULL, 0, NULL, 0},
2005 static const char options[] = "W:w:n:NIMSzclCif:";
2007 struct section *table, *s, *prev;
/* Locale must be set so wcrtomb() in string_parse encodes sensibly. */
2010 setlocale(LC_ALL,"");
2011 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2014 case 'W': conf.word_start = optarg; break;
2015 case 'w': conf.word_cont = optarg; break;
2016 case 'n': conf.number_chars = optarg; break;
2017 case 'N': conf.ignored |= 1 << TK_number; break;
2018 case 'I': conf.ignored |= 1 << TK_ident; break;
2019 case 'M': conf.ignored |= 1 << TK_mark; break;
2020 case 'S': conf.ignored |= 1 << TK_string; break;
2021 case 'z': conf.ignored |= 1 << TK_multi_string; break;
2022 case 'c': conf.ignored |= 1 << TK_line_comment; break;
2023 case 'C': conf.ignored |= 1 << TK_block_comment; break;
2024 case 'l': conf.ignored |= 1 << TK_newline; break;
2025 case 'i': conf.ignored |= 1 << TK_in; break;
2026 case 'f': filename = optarg; break;
2027 default: fprintf(stderr, "scanner: unknown option '%c'.\n",
/* Remaining arguments replace the built-in known-words table; it must
 * be sorted for the scanner's lookup. */
2033 if (optind < argc) {
2034 const char **wm = calloc(argc - optind, sizeof(char*));
2036 for (i = optind; i < argc; i++)
2037 wm[i - optind] = argv[i];
2038 qsort(wm, argc-optind, sizeof(char*), kcmp);
2039 conf.words_marks = wm;
2040 conf.known_count = argc - optind;
2044 fd = open(filename, O_RDONLY);
2048 fprintf(stderr, "scanner: cannot open %s: %s\n",
2049 filename, strerror(errno));
/* NOTE(review): the literal 2 is SEEK_END — the named constant would
 * be clearer. */
2052 len = lseek(fd, 0, 2);
2054 fprintf(stderr,"scanner: %s is empty or not seekable\n",
2055 filename ?: "stdin");
/* NOTE(review): no visible check for mmap() == MAP_FAILED before use —
 * confirm it is handled in the elided lines. */
2058 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2059 table = code_extract(file, file+len, pr_err);
/* Walk the section list, freeing each node after processing. */
2062 (code_free(s->code), prev = s, s = s->next, free(prev))) {
2063 printf("Tokenizing: %.*s\n", s->section.len,
2065 state = token_open(s->code, &conf);
2067 struct token tk = token_next(state);
2068 printf("%d:%d ", tk.line, tk.col);
2069 token_trace(stdout, tk, 20);
2070 if (tk.num == TK_number) {
2073 if (number_parse(num, tail,tk.txt)) {
2074 printf(" %s ", tail);
2075 mpq_out_str(stdout, 10, num);
2078 printf(" BAD NUMBER");
2080 if (tk.num == TK_string ||
2081 tk.num == TK_multi_string) {
/* Backquoted strings are literal: suppress escape processing. */
2085 if (tk.txt.txt[0] == '`')
2087 if (string_parse(&tk, esc,
2089 printf(" %s ", tail);
2090 text_dump(stdout, str, 20);
2093 printf(" BAD STRING");
2096 if (tk.num == TK_error)
2098 if (tk.num == TK_eof)
/* Only free words_marks if it was the calloc'd override, not the
 * static 'known' table (the free casts away const — presumably safe
 * here since 'wm' was heap-allocated). */
2103 if (conf.words_marks != known)
2104 free(conf.words_marks);
2107 ###### File: scanner.mk
2108 scanner.c : scanner.mdc
# NOTE(review): the link command below uses libnumber.o and libstring.o,
# but neither appears in this rule's prerequisite list, so make will not
# rebuild (or even build) them before linking.  They should be added to
# the dependency line.
2111 scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2112 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2113 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2114 scanner.o : scanner.c
2115 $(CC) $(CFLAGS) -c scanner.c