5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters are assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ###
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, as typically comments
73 would be. The exact consequence of ignoring each token type varies
78 int ignored; // bit set of ignored tokens.
79 ## token config parameters
83 struct token_config *conf;
85 ###### token_next init
86 int ignored = state->conf->ignored;
89 The different tokens are numbers, words, marks, strings, comments,
90 newlines, EOF, and indents, each of which is examined in detail below.
92 There are various cases where no token can be found in part of the
93 input. All of these will be reported as a `TK_error` token.
95 It is possible to declare a number of strings which form distinct
96 tokens (rather than being grouped as e.g. 'word'). These are given
97 token numbers from `TK_reserved` upwards.
108 Numbers are the messiest tokens to parse, primarily because they can
109 contain characters that also have meaning outside of number and,
110 particularly, immediately after numbers.
112 The obvious example is the '`-`' sign. It can come inside a number for
113 a negative exponent, or after a number as a subtraction operator. To
114 be sure we have parsed as best as possible we need to only allow the
115 '`-`' inside a number if it is after an exponent character. This can be
116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
117 digit, so we don't allow '`-`' after just any `e`.
119 To make matters worse, our language designer has decided to experiment
120 with allowing commas to be used as the decimal indicator, and spaces
121 to be used to separate groups of digits in large numbers. Both of
122 these can reasonably be restricted to appear between two digits, so we
123 have to add that condition to our tests.
125 So we cannot just treat numbers as starting with a digit and being
126 followed by some set of characters. We need more structure than that.
130 - Numbers must start with a digit.
131 - If the first digit is zero, the next character must be a base
132 signifier (one of `xob`) or a decimal marker (`.` or `,`).
133 In the first case the first `p` or `P` may be followed by a sign.
134 - If the number doesn't start with `0` followed by one of `xob`, the
135 first `e` may be followed by a sign.
136 - Any digit or hex digit may be followed by a space or underscore
137 providing that the subsequent character is also a (hex) digit.
138 This rule will require an extra level of 'unget' to be
139 supported when handling characters.
140 - Otherwise any digits or ASCII letters are allowed. We do not at
141 this point check that the digits given are permitted by the base.
142 That will happen when the token is converted to a number.
144 To allow easy configuration, the various non alphanumeric characters
145 are only permitted if they are listed in a configuration parameter.
147 ###### token config parameters
150 Note that numbers may not start with a period, so `.75` is not a
151 number. This is not the norm, but is not unheard of. Excluding these
152 numbers simplifies the rule at very little cost.
157 If `TK_number` is ignored, digits will result in an error unless they
158 are declared to be a start character for words.
166 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
167 int prev_special = 0;
169 int decimal_mark = 0;
171 wchar_t ch2 = get_char(state);
172 if (strchr("xobXOB", ch2) != NULL)
188 save_unget_state(state);
189 ch = get_char(state);
194 if (ch == '+' || ch == '-') {
199 if (ch == '.' || ch == ',') {
205 /* Don't allow that special char,
208 restore_unget_state(state);
211 if (strchr(state->conf->number_chars, ch)) {
215 /* non-number char */
218 /* We seem to have a "number" token */
220 close_token(state, &tk);
226 Words start with a "start" character followed by the longest
227 sequence of "continue" characters. The Unicode ID_START and
228 ID_CONTINUE sets are always permitted, but other ASCII characters
229 can be added to these sets.
231 ###### token config parameters
235 ###### internal functions
236 static int is_word_start(wchar_t ch, struct token_config *conf)
238 return iswalpha(ch) ||
239 strchr(conf->word_start, ch) != NULL ||
240 u_hasBinaryProperty(ch, UCHAR_ID_START);
243 static int is_word_continue(wchar_t ch, struct token_config *conf)
245 return iswalnum(ch) ||
246 strchr(conf->word_cont, ch) != NULL ||
247 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
250 Words can be either known or unknown. Known words are referred to as
251 "reserved words" and get a unique token number. Unknown words are
252 "identifiers" and are syntactically a single token.
257 A list of known words must be provided. This list is shared with the
258 "marks" which are described next. The list must be lexically sorted
259 and the length of the list must be given (`known_count`).
260 Tokens matching these known words are reported as the index of the
261 list added to `TK_reserved`.
263 ###### token config parameters
264 const char **words_marks;
269 if (is_word_start(ch, state->conf)) {
271 /* A word: identifier or reserved */
273 ch = get_char(state);
274 while (is_word_continue(ch, state->conf));
276 close_token(state, &tk);
278 if (ignored & (1<<TK_ident))
280 n = find_known(state->conf, tk.txt);
282 tk.num = TK_reserved + n;
288 Marks are generally one or more punctuation marks joined together. It
289 would be nice to use the term "symbol" for these, but that causes
290 confusion in a subsequent discussion of the grammar, which has terminal
291 symbols and non-terminal symbols which are conceptually quite
292 different. So strings of punctuation characters will be marks.
294 A "mark" consists of ASCII characters that are not white space, are not
295 "start" characters for words, and are not digits.
296 These will collectively be called mark characters.
298 ###### internal functions
299 static int is_mark(wchar_t ch, struct token_config *conf)
304 strchr(conf->word_start, ch) == NULL;
307 As with words, there can be known and unknown marks, though the rules
308 are slightly different.
310 Two marks do not need to be separated by a non-mark character. This
311 is different from words which do need to be separated by at least one
312 non-continue character.
314 The scanner will normally prefer longer sequences of mark characters,
315 but will more strongly prefer known marks over unknown marks. So if
316 it finds a known mark where adding one more character does not result
317 in a known mark, it will return that first known mark.
319 If no known mark is found we will test against strings and comments
320 below before giving up and assuming an unknown mark.
322 If an unknown mark contains a quote character or a comment marker, and
323 that token is not being ignored, then we terminate the unknown mark
324 before that quote or comment. This ensures that an unknown mark
325 immediately before a string is handled correctly.
327 If `TK_mark` is ignored, then unknown marks are returned as an error.
332 Known marks are included in the same list as the list of known words.
336 while (is_mark(ch, state->conf)) {
339 close_token(state, &tk);
340 n = find_known(state->conf, tk.txt);
342 tk.num = TK_reserved + n;
343 else if (tk.num != TK_error) {
344 /* found a longest-known-mark */
346 close_token(state, &tk);
351 save_unget_state(state);
352 ch = get_char(state);
353 if (!(ignored && (1<<TK_string)) && is_quote(ch))
355 if (!(ignored && (1<<TK_line_comment)) &&
356 prev == '/' && ch == '/') {
357 restore_unget_state(state);
360 if (!(ignored && (1<<TK_block_comment)) &&
361 prev == '/' && ch == '*') {
362 restore_unget_state(state);
367 if (tk.num != TK_error)
372 if (ignored & (1<<TK_mark))
381 Strings start with one of single quote, double quote, or back quote
382 and continue until a matching character on the same line. Any of
383 these characters can be included in the list of known marks and then
384 they will not be used for identifying strings.
386 Immediately following the close quote one or two ASCII letters may
387 appear. These are somewhat like the arbitrary letters allowed in
388 "Numbers" above. They can be used by the language in various ways.
390 If 3 identical quote characters appear in a row and are
391 followed by a newline, then this forms a multi-line string which
392 continues until an identical triple quote appears on a line preceded
393 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
395 Multi-line strings may not extend beyond the end of the `code_node` in
398 Normal strings and multi-line strings are encoded as two different
405 ###### internal functions
406 static int is_quote(wchar_t ch)
408 return ch == '\'' || ch == '"' || ch == '`';
411 #### Multi-line strings
413 The multi-line string is checked for first. If they are being
414 ignored, we fall through and treat a triple quote as an empty string
415 followed by the start of a new string.
418 if (tk.txt.len == 3 &&
419 !(ignored & (1 << TK_multi_string)) &&
420 is_quote(tk.txt.txt[0]) &&
421 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
422 is_newline(tk.txt.txt[3])) {
424 wchar_t first = tk.txt.txt[0];
427 while (!at_eon(state) && qseen < 3) {
428 ch = get_char(state);
429 if (is_newline(ch)) {
432 } else if (at_sol && ch == first) {
434 } else if (ch != ' ' && ch != '\t') {
440 /* Hit end of node - error.
441 * unget so the newline is seen,
442 * but return rest of string as an error.
445 close_token(state, &tk);
449 /* 2 letters are allowed */
450 ch = get_char(state);
452 ch = get_char(state);
454 ch = get_char(state);
455 /* Now we must have a newline, but we don't return it
458 close_token(state, &tk);
459 tk.num = TK_multi_string;
465 #### Single-line strings
467 The sequence of marks collected may be more than a single-line
468 string, so we reset to the start and collect characters until
469 we find a close quote or a newline.
471 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
474 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
475 !(ignored & (1<<TK_string))) {
476 wchar_t first = tk.txt.txt[0];
477 reset_token(state, &tk);
480 ch = get_char(state);
481 while (ch != first && !is_newline(ch));
483 if (is_newline(ch)) {
487 close_token(state, &tk);
493 Single line comments may start with '`//`' or '`#`' providing that these
494 are not known marks. They continue to the end of the line.
496 Block comments start with '`/*`' if this is not a known mark. They
497 continue to the first occurrence of '`*/`' and may not contain any
498 occurrence of '`/*`'.
500 Block comments can be wholly within one line or can continue over
501 multiple lines. The multi-line version should be followed immediately
502 by a newline. The Linux kernel contains over 285000 multi-line
503 comments, of which only 34 are followed by characters other than white space
504 (which should be removed) or a backslash (only needed in macros). So
505 it would not suffer from this rule.
507 These two comment types are reported as two separate token types, and
508 consequently can be ignored separately. When ignored a comment is
509 parsed and discarded.
515 ###### internal functions
516 static int is_line_comment(struct text txt)
518 return (txt.len >= 1 && txt.txt[0] == '#') ||
519 (txt.len >= 2 && txt.txt[0] == '/' &&
523 static int is_block_comment(struct text txt)
525 return txt.len >= 2 && txt.txt[0] == '/' &&
529 #### Single line comments
531 A single-line comment continues up to, but not including the newline.
535 if (is_line_comment(tk.txt)) {
536 while (!is_newline(ch))
537 ch = get_char(state);
539 close_token(state, &tk);
540 tk.num = TK_line_comment;
541 if (ignored & (1 << TK_line_comment))
548 The token text collected so far could exceed the comment, so we need
551 If we find an embedded `/*` we reset to just before the '/' and report
552 an error. That way the next thing to be parsed will be the rest of
553 the comment. This requires a double unget, so we need to save/restore
554 the unget state (explained later).
558 if (is_block_comment(tk.txt)) {
561 reset_token(state, &tk);
564 save_unget_state(state);
565 ch = get_char(state);
567 while (!at_eon(state) &&
568 (prev != '/' || ch != '*') &&
569 (prev != '*' || ch != '/')) {
573 save_unget_state(state);
574 ch = get_char(state);
576 close_token(state, &tk);
582 /* embedded. Need to unget twice! */
583 restore_unget_state(state);
588 tk.num = TK_block_comment;
589 if (newlines && !(ignored & (1<<TK_newline))) {
590 /* next char must be newline */
591 ch = get_char(state);
596 if (tk.num == TK_error ||
597 !(ignored & (1 << TK_block_comment)))
602 ### Indents, Newlines, and White Space.
604 Normally white space is ignored. However newlines can be important as
605 can indents, which are either after a newline or at the start of a
606 node (detected by `at_son()`).
608 ###### exported functions
609 static inline int is_newline(wchar_t ch)
611 return ch == '\n' || ch == '\f' || ch == '\v';
615 if (ch <= ' ' && !is_newline(ch)
619 If a line starts with more white-space than the previous non-blank
620 line - or if the first non-blank line in the document starts with any
621 white-space - then an Indent is reported at the start of the line.
623 Before the next non-blank line which starts with less white space, or
624 at the latest at the end of the document, a matching Undent token
625 is reported. There will always be an exact match between Indent and
628 It is possible for Undent to be followed (almost) immediately by an
629 Indent. This happens if, for example, the indent of three consecutive
630 lines are 0, 8, 4 spaces. Before the second line we report an
631 Indent. Before the third line we must report an Undent, as 4 is less
632 than 8, then also an Indent as 4 is greater than 0.
638 For the purpose of measuring the length of white space, a tab adds at
639 least one space, and rounds up to a multiple of 8.
641 ###### exported functions
642 static inline int indent_tab(int indent)
647 We need to track the current levels of indent. This requires some
648 sort of stack as indent levels are pushed on and popped off. In
649 practice this stack is unlikely to often exceed 5 so we will use a
650 fixed stack of 20 indent levels. More than this will be silently
655 int indent_sizes[20];
659 Newlines can optionally be reported. Newlines within a block comment
660 or a multi-line string are not reported separately, but each of these
661 must be followed immediately by a newline so these constructs cannot
662 hide the fact that a newline was present.
664 When Indents are being reported, the Newline which would normally be
665 reported immediately before the Indent is delayed until after the
666 matching undent. This makes an indented section act like a
667 continuation of the previous line to some extent.
669 A blank line would normally be reported simply as two consecutive Newline
670 tokens. However if the subsequent line is indented (and indents are being
671 reported) then the right thing to do is less obvious as Newlines should be
672 delayed - but how many Newlines?
674 The approach we will take is to report the extra Newlines immediately after
675 the Indent token, so the blank line is treated as though it were an indented
681 If we find a newline or white space at the start of a block, we keep
682 collecting spaces, tabs, and newlines until we find some real text.
683 Then depending on the indent we generate some number of tokens. These
684 will be a sequence of "Newline Undent" pairs representing a decrease
685 in indent, then either a Newline or an Indent depending on whether the
686 next line is indented, then zero or more Newlines representing all the
687 blank lines that have been skipped.
689 When a Newline leads to the next block of code there is a question of
690 whether the various Newline and Undent/Indent tokens should appear to
691 belong to the earlier or later block. This is addressed by processing
692 the tokens in two stages based on the relative indent levels of the
693 two blocks (each block has a base indent to which the actual indents
696 Any "Newline Undent" pairs needed to reduce the current indent to the
697 maximum of the base indents of the old and new blocks are generated
698 against the old block. Then if the next block does not have an
699 increased indent, one more "Newline" is generated.
701 If further "Newline Undent" pairs are needed to get to the indent
702 level of the 'next' block, they are generated against that block,
703 though the first Newline is suppressed (it having already been
706 Finally the Newline or Indent for the first line of the new block is
707 generated, unless the Newline needs to be suppressed because it
708 appeared at the end of the previous block.
710 This means that a block may start with an Undent or an Indent, but
711 will only start with a Newline if it actually starts with a blank
714 We will need to represent in the `token_state` where in this sequence
715 of delayed tokens we are. As `state.col` records the target indent we
716 don't need to record how many undents or indents are needed. We do
717 need to record the number of blank lines, and which of Newline and
718 Undent is needed next in the initial sequence of pairs.
720 For this we store one more than the number of blank lines as
721 `delayed_lines` and a flag for `undent_next`.
728 Generating these tokens involve two separate pieces of code.
730 Firstly we need to recognise white space and count the indents and
731 newlines. These are recorded in the above state fields.
733 Separately, on each call to `token_next`, we need to check if
734 there are some delayed tokens and if so we need to advance the state
735 information and return one token.
738 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
740 int was_son = at_son(state);
741 if (ignored & (1<<TK_indent)) {
744 if (ignored & (1<<TK_newline))
747 close_token(state, &tk);
750 // Indents are needed, so check all white space.
751 while (ch <= ' ' && !at_eon(state)) {
754 ch = get_char(state);
758 if (state->node->next &&
759 state->node->next->indent > state->node->indent)
760 state->col = state->node->next->indent;
762 state->col = state->node->indent;
765 state->delayed_lines = newlines;
766 state->undent_next = was_son;
767 state->check_indent = 1;
772 ###### delayed tokens
774 if (state->check_indent || state->delayed_lines) {
775 if (state->col < state->indent_sizes[state->indent_level]) {
776 if (!state->undent_next &&
777 !(ignored & (1<<TK_newline))) {
778 state->undent_next = 1;
782 state->indent_level -= 1;
783 state->undent_next = 0;
787 if (state->col > state->indent_sizes[state->indent_level] &&
788 state->indent_level < sizeof(state->indent_sizes)-1) {
789 state->indent_level += 1;
790 state->indent_sizes[state->indent_level] = state->col;
791 state->delayed_lines -= 1;
795 state->check_indent = 0;
796 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
798 state->delayed_lines -= 1;
801 state->delayed_lines = 0;
807 After the last newline in the file has been processed, a special
808 end-of-file token will be returned. Any further attempts to get more
809 tokens will continue to return the same end-of-file token.
821 ### Unknown Marks, or errors.
823 We have now handled all the possible known mark-like tokens.
824 If the token we have is not empty and `TK_mark` is allowed,
825 we have an unknown mark, otherwise this must be an error.
828 /* one unknown character */
829 close_token(state, &tk);
833 ## Tools For The Task
835 You may have noticed that there are a few gaps we left in the above -
836 functions used without first defining them. Doing so above would have
839 ### Character by character
841 As we walk through the various `code_node`s we need to process whole
842 Unicode codepoints, and keep track of which line and column we are on.
843 We will assume for now that any printing character uses one column,
844 though that is not true in general.
846 As the text in a `code_node` may include an indent that identifies it as
847 being code, we need to be careful to strip that. The `code_node` has
848 a flag that tells us whether or not we need to strip.
854 struct code_node *node;
859 ###### internal functions
861 static void do_strip(struct token_state *state)
863 if (state->node->needs_strip) {
865 while (n && state->node->code.txt[state->offset] == ' ') {
869 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
876 static wint_t get_char(struct token_state *state)
882 if (state->node == NULL)
884 if (state->node->code.len <= state->offset) {
886 state->node = state->node->next;
887 while (state->node && state->node->code.txt == NULL);
889 if (state->node == NULL)
892 state->line = state->node->line_no;
893 state->col = state->node->indent;
898 memset(&mbstate, 0, sizeof(mbstate));
900 n = mbrtowc(&next, state->node->code.txt + state->offset,
901 state->node->code.len - state->offset,
903 if (n == -2 || n == 0) {
904 /* Not enough bytes - not really possible */
906 state->offset = state->node->code.len;
907 } else if (n == -1) {
910 next = 0x7f; // an illegal character
916 } else if (is_newline(next)) {
918 state->col = state->node->indent;
920 } else if (next == '\t') {
921 state->col = indent_tab(state->col);
926 We will sometimes want to "unget" the last character as it needs to be
927 considered again as part of the next token. So we need to store a
928 'previous' version of all metadata.
935 ###### before get_char
936 state->prev_offset = state->offset;
937 state->prev_line = state->line;
938 state->prev_col = state->col;
940 ###### internal functions
942 static void unget_char(struct token_state *state)
945 state->offset = state->prev_offset;
946 state->line = state->prev_line;
947 state->col = state->prev_col;
951 We occasionally need a double-unget, particularly for numbers and
952 block comments. We don't impose this cost on all scanning, but
953 require those code sections that need it to call `save_unget_state`
954 before each `get_char`, and then `restore_unget_state` when a
955 double-unget is needed.
962 ###### internal functions
963 static void save_unget_state(struct token_state *state)
965 state->prev_offset2 = state->prev_offset;
966 state->prev_line2 = state->prev_line;
967 state->prev_col2 = state->prev_col;
970 static void restore_unget_state(struct token_state *state)
972 state->prev_offset = state->prev_offset2;
973 state->prev_line = state->prev_line2;
974 state->prev_col = state->prev_col2;
977 At the start of a token we don't want to be at the end of a code block
978 if we can help it. To avoid this possibility, we 'get' and 'unget' a
979 single character. This will move into the next non-empty code block
980 and leave the current pointer at the start of it.
982 This has to happen _after_ dealing with delayed tokens as some of them
983 must appear in the previous node. When we do this, we need to reset
984 the data in the token.
986 ###### delayed tokens
990 tk.node = state->node;
992 tk.txt.txt = state->node->code.txt + state->offset;
993 tk.line = state->line;
1000 The current token is initialized to line up with the first character
1001 that we 'get' for each token. When we have, or might have, a full
1002 token we can call `close_token` to set the `len` of the token
1003 appropriately. This can safely be called multiple times.
1005 Finally we occasionally (for single-line strings and block comments)
1006 need to reset to the beginning of the current token as we might have
1007 parsed too much already. For that there is `reset_token`.
1010 tk.node = state->node;
1012 tk.txt.txt = state->node->code.txt + state->offset;
1013 tk.line = state->line;
1014 tk.col = state->col;
1017 ###### internal functions
1019 static void close_token(struct token_state *state,
1022 tk->txt.len = (state->node->code.txt + state->offset)
1026 static void reset_token(struct token_state *state, struct token *tok)
1028 state->prev_line = tok->line;
1029 state->prev_col = tok->col;
1030 state->prev_offset = tok->txt.txt - state->node->code.txt;
1036 Tokens may not cross into the next `code_node`, and some tokens can
1037 include the newline at the end of a `code_node`, so we must be able to
1038 easily check if we have reached the end. Equally we need to know if
1039 we are at the start of a node, as white space is treated a little
1042 ###### internal functions
1044 static int at_son(struct token_state *state)
1046 return state->offset == 0;
1049 static int at_eon(struct token_state *state)
1051 // at end-of-node ??
1052 return state->node == NULL ||
1053 state->offset >= state->node->code.len;
1056 ### Find a known word
1058 As the known-word list is sorted we can use a simple binary search.
1059 Following the pattern established in "mdcode", we will use a `struct
1060 text` with start and length to represent the code fragment we are
1063 ###### internal functions
1064 static int find_known(struct token_config *conf, struct text txt)
1067 int hi = conf->known_count;
1069 while (lo + 1 < hi) {
1070 int mid = (lo + hi) / 2;
1071 int cmp = strncmp(conf->words_marks[mid],
1073 if (cmp == 0 && conf->words_marks[mid][txt.len])
1080 if (strncmp(conf->words_marks[lo],
1081 txt.txt, txt.len) == 0
1082 && conf->words_marks[lo][txt.len] == 0)
1088 ### Bringing it all together
1090 Now we have all the bits there is just one section missing: combining
1091 all the token parsing code into one block.
1093 The handling of delayed tokens (newlines, indents, undents) must come
1094 first before we try getting another character.
1096 Then we parse all the text, making sure that we check for known marks
1097 before strings and comments, but unknown marks after strings and comments.
1099 This block of code will either return a token, or will choose to
1100 ignore one, in which case it will `continue` around to the top of the
1106 ch = get_char(state);
1118 As well as getting tokens, we need to be able to create the
1119 `token_state` to start with, and discard it later.
1124 ###### main functions
1125 struct token_state *token_open(struct code_node *code, struct
1128 struct token_state *state = malloc(sizeof(*state));
1129 memset(state, 0, sizeof(*state));
1131 state->line = code->line_no;
1132 state->col = code->indent;
1137 void token_close(struct token_state *state)
1142 ###### exported functions
1143 struct token_state *token_open(struct code_node *code, struct
1144 token_config *conf);
1145 void token_close(struct token_state *state);
1149 Getting tokens is the main thing but it is also useful to be able to
1150 print out token information, particularly for tracing and testing.
1152 Known tokens are printed verbatim. Other tokens are printed as
1153 `type(content)` where content is truncated to a given number of characters.
1155 The function for printing a truncated string (`text_dump`) is also exported
1156 so that it can be used to tracing processed strings too.
1161 ###### exported functions
1162 void token_trace(FILE *f, struct token tok, int max);
1163 void text_dump(FILE *f, struct text t, int max);
1165 ###### main functions
1167 void text_dump(FILE *f, struct text txt, int max)
1174 for (i = 0; i < max; i++) {
1175 char c = txt.txt[i];
1176 if (c < ' ' || c > '~')
1177 fprintf(f, "\\x%02x", c & 0xff);
1181 fprintf(f, "%c", c);
1187 void token_trace(FILE *f, struct token tok, int max)
1189 static char *types[] = {
1190 [TK_ident] = "ident",
1192 [TK_number] = "number",
1193 [TK_string] = "string",
1194 [TK_multi_string] = "mstring",
1195 [TK_line_comment] = "lcomment",
1196 [TK_block_comment] = "bcomment",
1197 [TK_indent] = "indent",
1198 [TK_undent] = "undent",
1199 [TK_newline] = "newline",
1201 [TK_error] = "ERROR",
1205 default: /* known word or mark */
1206 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1212 /* No token text included */
1213 fprintf(f, "%s()", types[tok.num]);
1219 case TK_multi_string:
1220 case TK_line_comment:
1221 case TK_block_comment:
1223 fprintf(f, "%s(", types[tok.num]);
1224 text_dump(f, tok.txt, max);
1230 ### And there we have it
1232 We now have all the library functions defined for reading and printing
1233 tokens. Now we just need C files to store them, and a mk file to make them.
1235 ###### File: scanner.h
1237 ## exported functions
1239 ###### File: libscanner.c
1241 #include "scanner.h"
1243 ## internal functions
1246 ###### File: scanner.mk
1250 scanner.mk scanner.h libscanner.c : scanner.mdc
1253 libscanner.o : libscanner.c
1254 $(CC) $(CFLAGS) -c libscanner.c
1256 ## Processing numbers
1258 Converting a `TK_number` token to a numerical value is a slightly
1259 higher level task than lexical analysis, and slightly lower than
1260 grammar parsing, so put it here - as an index if you like.
1262 Importantly it will be used by the same testing rig that is used for
1263 testing the token scanner.
1265 The numeric value that we will convert all numbers into is the `mpq_t`
1266 from the GNU high precision number library "libgmp".
1268 ###### number includes
1272 Firstly we need to be able to parse a string of digits in a given base
1273 and possibly with a decimal marker. We store this in an `mpz_t`
1274 integer and report the number of digits after the decimal mark.
1276 On error we return zero and ensure that the 'mpz_t' has been freed, or
1277 had never been initialised.
1279 ###### number functions
1281 static int parse_digits(mpz_t num, struct text tok, int base,
1284 /* Accept digits up to 'base', ignore '_' and
1285 * ' ' if they appear between two legal digits,
1286 * and if `placesp` is not NULL, allow a single
1287 * '.' or ',' and report the number of digits
1289 * Return number of characters processed (p),
1290 * or 0 if something illegal was found.
1293 int decimal = -1; // digits after marker
1294 enum {Digit, Space, Other} prev = Other;
1297 for (p = 0; p < tok.len; p++) {
1299 char c = tok.txt[p];
1301 if (c == '_' || c == ' ') {
1307 if (c == '.' || c == ',') {
1310 if (!placesp || decimal >= 0)
1318 else if (isupper(c))
1320 else if (islower(c))
1331 mpz_mul_ui(num, num, base);
1335 mpz_add_ui(num, num, dig);
1354 ###### number includes
1357 To parse a full number we need to consider the optional base, the
1358 mantissa, and the optional exponent. We will treat these one at a
1361 The base is indicated by a letter after a leading zero, which must be
1362 followed by a base letter or a period. The base also determines the
1363 character which will mark an exponent.
1371 if (tok.txt[0] == '0' && tok.len > 1) {
1373 switch(tok.txt[1]) {
1404 // another digit is not permitted
1408 // must be decimal marker or trailing
1409 // letter, which are OK;
1416 After the base is the mantissa, which may contain a decimal mark, so
1417 we need to record the number of places. We won't impose the number of
1418 places until we have the exponent as well.
1425 ###### parse mantissa
1427 d = parse_digits(mant, tok, base, &places);
1433 mpq_set_z(num, mant);
1436 After the mantissa number may come an exponent which may be positive
1437 or negative. We assume at this point that we have seen the exponent
1445 ###### parse exponent
1447 if (tok.txt[0] == '+') {
1450 } else if (tok.txt[0] == '-') {
1456 d = parse_digits(exp, tok, 10, NULL);
1461 if (!mpz_fits_slong_p(exp)) {
1466 lexp = mpz_get_si(exp) * esign;
1472 Now that we have the mantissa and the exponent we can multiply them
1473 together, also allowing for the number of digits after the decimal
1476 For base 10, we simply subtract the decimal places from the exponent.
1477 For the other bases, as the exponent is always based on 2, even for
1478 octal and hex, we need a bit more detail.
1479 We then recover the sign from the exponent, as division is quite
1480 different from multiplication.
1482 ###### calc exponent
1501 Imposing the exponent on the number is also very different for base 10
1502 than for the others. For the binary shift `gmp` provides a simple
1503 function. For base 10 we use something like Russian Peasant
1506 ###### calc exponent
1510 mpq_set_ui(tens, 10, 1);
1514 mpq_mul(num, num, tens);
1516 mpq_div(num, num, tens);
1521 mpq_mul(tens, tens, tens);
1526 mpq_mul_2exp(num, num, lexp);
1528 mpq_div_2exp(num, num, lexp);
1531 Now we are ready to parse a number: the base, mantissa, and exponent.
1532 If all goes well we check for the possible trailing letters and
1533 return. Return value is 1 for success and 0 for failure.
1536 ###### number functions
1537 int number_parse(mpq_t num, char tail[3], struct text tok)
1544 if (tok.len > 1 && (tok.txt[0] == expc ||
1545 tok.txt[0] == toupper(expc))) {
1552 for (i = 0; i < 2; i++) {
1555 if (!isalpha(tok.txt[i]))
1557 tail[i] = tok.txt[i];
1567 Number parsing goes in `libnumber.c`
1569 ###### File: libnumber.c
1577 ###### File: number.h
1578 int number_parse(mpq_t num, char tail[3], struct text tok);
1580 ###### File: scanner.mk
1582 libnumber.o : libnumber.c
1583 $(CC) $(CFLAGS) -c libnumber.c
1585 ## Processing strings
1587 Both `TK_string` and `TK_multi_string` require post-processing which
1588 can be one of two types: literal or with escapes processed.
1589 Even literal processing is non-trivial as the file may contain indents
1590 which need to be stripped.
1592 Errors can only occur when processing escapes. Any unrecognised
1593 character following the escape character will cause an error.
1595 Processing escapes and stripping indents can only make the string
1596 shorter, not longer, so we allocate a buffer which is the same size as
1597 the string and process into that.
1599 To request escape processing, we pass the character we want to use for
1600 quoting, usually '`\`'. To avoid escape processing we pass a zero.
1603 int string_parse(struct token *tok, char escape,
1604 struct text *str, char tail[3])
1607 struct text t = tok->txt;
1611 if (tok->num == TK_string) {
1616 str->txt = malloc(t.len);
1629 The tail of the string can be 0, 1, or 2 letters
1632 if (i > 0 && isalpha(t.txt[i-1]))
1634 if (i > 0 && isalpha(t.txt[i-1]))
1636 strncpy(tail, t.txt+i, t.len-i);
1645 Stripping the quote of a single-line string is trivial.
1646 The only part that is at all interesting is that the quote character must
1650 if (t.txt[t.len-1] != quote)
1660 For a multi-line string we have a little more work to do. We need to
1661 remove 3 quotes, not 1, and need to count the indent of the close
1662 quote as it will need to be stripped from all lines.
1666 t.txt[1] != quote || t.txt[2] != quote ||
1667 !is_newline(t.txt[3]))
1672 if (i <= 0 || t.txt[i-1] != quote)
1675 if (i <= 0 || t.txt[i-1] != quote)
1678 if (i <= 0 || t.txt[i-1] != quote)
1682 while (i > 0 && !is_newline(t.txt[i-1]))
1686 if (t.txt[i] == ' ')
1688 if (t.txt[i] == '\t')
1689 indent = indent_tab(indent);
1698 Now we just take one byte at a time. trans-ASCII unicode won't look
1699 like anything we are interested in so it will just be copied byte by
1704 for (i = 0; i < t.len; i++) {
1718 } else if (i+1 >= t.len) {
1719 // escape and end of string
1727 str->len = cp - str->txt;
1735 Every time we find a start of line, we strip spaces and tabs until the
1736 required indent is found.
1739 while (i < t.len && skipped < indent) {
1744 skipped = indent_tab(c);
1753 *cp++ = '\n'; break;
1755 *cp++ = '\r'; break;
1757 *cp++ = '\t'; break;
1759 *cp++ = '\b'; break;
1761 *cp++ = quote; break;
1763 *cp++ = '\f'; break;
1765 *cp++ = '\v'; break;
1767 *cp++ = '\a'; break;
1772 // 3 digit octal number: each of the two following digits must be 0-7
1775 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1776 t.txt[i+2] < '0' || t.txt[i+2] > '7')
1778 n = (t.txt[i ]-'0') * 64 +
1779 (t.txt[i+1]-'0') * 8 +
1780 (t.txt[i+2]-'0') * 1;
1786 n = take_hex(2, t.txt+i+1, t.len-i-1);
1794 // 4 or 8 hex digits for unicode
1795 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1798 memset(&pstate, 0, sizeof(pstate));
1799 n = wcrtomb(cp, n, &pstate);
1803 i += c == 'u' ? 4 : 8;
1808 else if (is_newline(c))
1818 For `\x` `\u` and `\U` we need to collect a specific number of
1821 ###### string functions
1823 static long take_hex(int digits, char *cp, int l)
1835 else if (isupper(c))
1846 #### File: libstring.c
1848 String parsing goes in `libstring.c`
1857 #include "scanner.h"
1861 ###### File: string.h
1862 int string_parse(struct token *tok, char escape,
1863 struct text *str, char tail[3]);
1865 ###### File: scanner.mk
1867 libstring.o : libstring.c
1868 $(CC) $(CFLAGS) -c libstring.c
1873 As "untested code is buggy code" we need a program to easily test
1874 the scanner library. This will simply parse a given file and report
1875 the tokens one per line.
1877 ###### File: scanner.c
1883 #include <sys/mman.h>
1889 #include "scanner.h"
1894 static void pr_err(char *msg)
1897 fprintf(stderr, "%s\n", msg);
1900 int main(int argc, char *argv[])
1905 struct token_state *state;
1906 const char *known[] = {
1915 struct token_config conf = {
1918 .words_marks = known,
1919 .number_chars = "., _+-",
1920 .known_count = sizeof(known)/sizeof(known[0]),
1921 .ignored = (0 << TK_line_comment)
1922 |(0 << TK_block_comment),
1924 struct section *table, *s, *prev;
1925 setlocale(LC_ALL,"");
1927 fprintf(stderr, "Usage: scanner file\n");
1930 fd = open(argv[1], O_RDONLY);
1932 fprintf(stderr, "scanner: cannot open %s: %s\n",
1933 argv[1], strerror(errno));
1936 len = lseek(fd, 0, 2);
1937 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1938 table = code_extract(file, file+len, pr_err);
1941 (code_free(s->code), prev = s, s = s->next, free(prev))) {
1942 printf("Tokenizing: %.*s\n", s->section.len,
1944 state = token_open(s->code, &conf);
1946 struct token tk = token_next(state);
1947 printf("%d:%d ", tk.line, tk.col);
1948 token_trace(stdout, tk, 20);
1949 if (tk.num == TK_number) {
1952 if (number_parse(num, tail,tk.txt)) {
1953 printf(" %s ", tail);
1954 mpq_out_str(stdout, 10, num);
1957 printf(" BAD NUMBER");
1959 if (tk.num == TK_string ||
1960 tk.num == TK_multi_string) {
1964 if (tk.txt.txt[0] == '`')
1966 if (string_parse(&tk, esc,
1968 printf(" %s ", tail);
1969 text_dump(stdout, str, 20);
1972 printf(" BAD STRING");
1975 if (tk.num == TK_error)
1977 if (tk.num == TK_eof)
1983 ###### File: scanner.mk
1984 scanner.c : scanner.mdc
1987 scanner : scanner.o scanner.h libscanner.o libnumber.o libstring.o libmdcode.o mdcode.h
1988 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
1989 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
1990 scanner.o : scanner.c
1991 $(CC) $(CFLAGS) -c scanner.c