From 6b6ee9a12345d8e5c8fc7e27d2106b1e7fb15f67 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 22 Jun 2013 19:18:55 +1000 Subject: [PATCH] scanner.mdc: lexical scanner for Ocean. This scanner does lexical analysis and produces tokens. It also handles numbers and escapes in strings. Signed-off-by: NeilBrown --- csrc/scanner.mdc | 1967 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1967 insertions(+) create mode 100644 csrc/scanner.mdc diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc new file mode 100644 index 0000000..547a037 --- /dev/null +++ b/csrc/scanner.mdc @@ -0,0 +1,1967 @@ +# Lexical Scanner # + +## The Task at Hand ## + +The main task of the lexical scanner is to convert a stream of +characters into a stream of tokens. The tokens are then typically +used by a parser to extract the syntactic structure. + +The stream of characters are assumed to be in memory identified by a +linked list of blocks, such as provided by the "[mdcode][]" literate +program extractor. A single token may never cross a block boundary. + +[mdcode]: mdcode.html + +###### includes + #include "mdcode.h" + +The text is assumed to be UTF-8 though some matching assumes the +ASCII subset. If the text provided does not conform to UTF-8 an error +will be reported and some number of bytes will be skipped. + +###### includes + #include + #include + #include + +Tokens are returned by successive calls to the main interface +function: `token_next()` which has a `state` structure to keep track +of where it is up to. Each token carries not just a numeric +identifier but also the code block, the line and character within that +block, and the actual start and length using the `struct text` from +"mdcode". 
+ +###### public types + struct token { + int num; + struct code_node *node; + struct text txt; + int line, col; + }; + struct token_state; + +###### private types + struct token_state { + ## state fields + }; + +###### exported functions + struct token token_next(struct token_state *state); + +###### main functions + struct token token_next(struct token_state *state) + { + ## token_next init + while (1) { + wint_t ch; + struct token tk; + + ## one token + } + } + +The `line` and `col` offsets are useful for reporting errors. +The `txt` provides the content when that is important. + +### Token types and configuration ## + +The scanner is not completely general, yet not completely specified. +There are a fixed set of token types, though particular tokens within +those types can be distinguish via configuration. + +Most token types may be explicitly ignored, as typically comments +would be. The exact consequence of ignoring each token type varies +from token to token. + +###### public types + struct token_config { + int ignored; // bit set of ignored tokens. + ## token config parameters + }; + +###### state fields + struct token_config *conf; + +###### token_next init + int ignored = state->conf->ignored; + + +The different tokens are numbers, words, marks, strings, comments, +newlines, EOF, and indents, each of which is examined in detail below. + +There are various cases where no token can be found in part of the +input. All of these will be reported as an `TK_error` token. + +It is possible to declare a number of strings which form distinct +tokens (rather than being grouped as e.g. 'word'). These are given +token numbers from `TK_reserved` upwards. + +###### public types + enum token_num { + TK_error, + ## token types + TK_reserved + }; + +### Numbers + +Numbers are the messiest tokens to parse, primarily because they can +contain characters that also have meaning outside of number and, +particularly, immediately after numbers. 
+ +The obvious example is the '`-`' sign. It can come inside a number for +a negative exponent, or after a number as a subtraction operator. To +be sure we have parsed as best as possible we need to only allow the +'`-`' inside a number if it is after an exponent character. This can be +`e` or `p` (for hex exponents), but `e` can also be a hexadecimal +digit, so we don't allow '`-`' after just any `e`. + +To make matters worse, our language designer has decided to experiment +with allowing commas to be used as the decimal indicator, and spaces +to be used to separate groups of digits in large numbers. Both of +these can reasonably be restricted to appear between two digits, so we +have to add that condition to our tests. + +So we cannot just treat numbers as starting with a digit and being +followed by some set of characters. We need more structure than that. + +So: + +- Numbers must start with a digit. +- If the first digit is zero, the next character must be a base + signifier (one of `xob`) or a decimal marker (`.` or `,`). + In the first case the first `p` or `P` may be followed by a sign. +- If the number doesn't start with `0` followed by one of `xob`, the + first `e` may be followed by a sign. +- Any digit or hex digit may be followed by a space or underscore + providing that the subsequence character is also a (hex) digit. + This rule will require an extra level of 'unget' to be + supported when handling characters. +- Otherwise any digits or ASCII letters are allowed. We do not at + this point check that the digits given are permitted by the base. + That will happen when the token is converted to a number. + +To allow easy configuration, the various non alphanumeric characters +are only permitted if they are listed in a configuration parameter. + +###### token config parameters + char *number_chars; + +Note that numbers may not start with a period, so `.75` is not a +number. This is not the norm, but is not unheard of. 
Excluding these +numbers simplifies the rule at very little cost. + +###### token types + TK_number, + +If TK_number is ignored, digits will result in an error unless they +are declared to be a start character for words. + +###### includes + + #include + +###### parse number + + if (iswdigit(ch) && !(ignored & (1<conf->number_chars, ch)) { + prev_special = 1; + continue; + } + /* non-number char */ + break; + } + /* We seem to have a "number" token */ + unget_char(state); + close_token(state, &tk); + tk.num = TK_number; + return tk; + } + +### Words +Words start with a "start" character followed by the longest +sequence of "continue" characters. The Unicode ID_START and +ID_CONTINUE sets are always permitted, but other ASCII characters +can be added to these sets. + +###### token config parameters + char *word_start; + char *word_cont; + +###### internal functions + static int is_word_start(wchar_t ch, struct token_config *conf) + { + return iswalpha(ch) || + strchr(conf->word_start, ch) != NULL || + u_hasBinaryProperty(ch, UCHAR_ID_START); + } + + static int is_word_continue(wchar_t ch, struct token_config *conf) + { + return iswalnum(ch) || + strchr(conf->word_cont, ch) != NULL || + u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE); + } + +Words can be either known or unknown. Known words are referred to as +"reserved words" and get a unique token number. Unknown words are +"identifiers" and are syntactically a single token. + +###### token types + TK_ident, + +A list of known words must be provided. This list is shared with the +"marks" which are described next. The list must be lexically sorted +and the length of the list must be given (`known_count`). +Tokens matching these known words are reported as the index of the +list added to `TK_reserved`. 
+ +###### token config parameters + char **words_marks; + int known_count; + +###### parse word + + if (is_word_start(ch, state->conf)) { + int n; + /* A word: identifier or reserved */ + do + ch = get_char(state); + while (is_word_continue(ch, state->conf)); + unget_char(state); + close_token(state, &tk); + tk.num = TK_ident; + if (ignored & (1<conf, tk.txt); + if (n >= 0) + tk.num = TK_reserved + n; + return tk; + } + +### Marks + +Marks are generally one or more punctuation marks joined together. It +would be nice to use the term "symbol" for these, but that causes +confusion in a subsequent discussion of the grammar, which has terminal +symbols and non-terminal symbols which are conceptually quite +different. So strings of punctuation characters will be marks. + +A "mark" consists of ASCII characters that are not white space, are not +"start" characters for words, and are not digits. +These will collectively be called mark characters. + +###### internal functions + static int is_mark(wchar_t ch, struct token_config *conf) + { + return ch > ' ' && + ch < 0x7f && + !iswalnum(ch) && + strchr(conf->word_start, ch) == NULL; + } + +As with words, there can be known and unknown marks, though the rules +are slightly different. + +Two marks do not need to be separated by a non-mark characters. This +is different from words which do need to be separated by at least one +non-continue character. + +The scanner will normally prefer longer sequences of mark characters, +but will more strongly prefer known marks over unknown marks. So if +it finds a known mark where adding one more character does not result +in a known mark, it will return that first known mark. + +If no known mark is found we will test against strings and comments +below before giving up and assuming an unknown mark. +If `TK_mark` is ignored, then unknown marks as returned as an error. + +###### token types + TK_mark, + +Known marks are included in the same list as the list of known words. 
+ +###### parse mark + tk.num = TK_error; + while (is_mark(ch, state->conf)) { + int n; + close_token(state, &tk); + n = find_known(state->conf, tk.txt); + if (n >= 0) + tk.num = TK_reserved + n; + else if (tk.num != TK_error) { + /* found a longest-known-mark */ + unget_char(state); + close_token(state, &tk); + return tk; + } + ch = get_char(state); + } + unget_char(state); + if (tk.num != TK_error) + return tk; + +###### unknown mark + if (tk.txt.len) { + if (ignored & (1<= 1 && txt.txt[0] == '#') || + (txt.len >= 2 && txt.txt[0] == '/' && + txt.txt[1] == '/'); + } + + static int is_block_comment(struct text txt) + { + return txt.len >= 2 && txt.txt[0] == '/' && + txt.txt[1] == '*'; + } + +#### Single line comments + +A single-line comment continues up to, but not including the newline. + +###### parse comment + + if (is_line_comment(tk.txt)) { + while (!is_newline(ch)) + ch = get_char(state); + unget_char(state); + close_token(state, &tk); + tk.num = TK_line_comment; + if (ignored & (1 << TK_line_comment)) + continue; + return tk; + } + +#### Block comments + +The token text collected so far could exceed the comment, so we need +to reset it first. + +If we find an embedded `/*` we reset to just before the '/' and report +an error. That way the next thing to be parsed will be the rest of +the comment. This requires a double unget, so we need to save/restore +the unget state (explained later). + +###### parse comment + + if (is_block_comment(tk.txt)) { + wchar_t prev; + int newlines = 0; + reset_token(state, &tk); + get_char(state); + get_char(state); + save_unget_state(state); + ch = get_char(state); + prev = 0; + while (!at_eon(state) && + (prev != '/' || ch != '*') && + (prev != '*' || ch != '/')) { + if (is_newline(ch)) + newlines = 1; + prev = ch; + save_unget_state(state); + ch = get_char(state); + } + close_token(state, &tk); + if (at_eon(state)) { + tk.num = TK_error; + return tk; + } + if (prev == '/') { + /* embedded. Need to unget twice! 
*/ + restore_unget_state(state); + unget_char(state); + tk.num = TK_error; + return tk; + } + tk.num = TK_block_comment; + if (newlines && !(ignored & (1<node->next && + state->node->next->indent > state->node->indent) + state->col = state->node->next->indent; + else + state->col = state->node->indent; + } else + unget_char(state); + state->delayed_lines = newlines; + state->undent_next = was_son; + state->check_indent = 1; + continue; + } + + +###### delayed tokens + + if (state->check_indent || state->delayed_lines) { + if (state->col < state->indent_sizes[state->indent_level]) { + if (!state->undent_next && + !(ignored & (1<undent_next = 1; + tk.num = TK_newline; + return tk; + } + state->indent_level -= 1; + state->undent_next = 0; + tk.num = TK_undent; + return tk; + } + if (state->col > state->indent_sizes[state->indent_level] && + state->indent_level < sizeof(state->indent_sizes)-1) { + state->indent_level += 1; + state->indent_sizes[state->indent_level] = state->col; + state->delayed_lines -= 1; + tk.num = TK_indent; + return tk; + } + state->check_indent = 0; + if (state->delayed_lines && !(ignored & (1<delayed_lines -= 1; + return tk; + } + state->delayed_lines = 0; + continue; + } + +### End of File + +After the last newline in the file has been processed, a special +end-of-file token will be returned. any further attempts to get more +tokens will continue to return the same end-of-file token. + +###### token types + TK_eof, + + +###### white space + if (ch == WEOF) { + tk.num = TK_eof; + return tk; + } + +### Unknown Marks, or errors. + +We have now handled all the possible known mark-like tokens. +If the token we have is not empty and `TK_mark` is allowed, +we have an unknown mark, otherwise this must be an error. 
+ +###### unknown mark + /* one unknown character */ + close_token(state, &tk); + tk.num = TK_error; + return tk; + +## Tools For The Task + +You may have noticed that are few gaps we left in the above - +functions used without first defining them. Doing so above would have +broken the flow. + +### Character by character + +As we walk through the various `code_node`s we need to process whole +Unicode codepoints, and keep track of which line and column we are on. +We will assume for now that any printing character uses one column, +though that is not true in general. + +As the text in a `code_node` may include an indent that identifies it as +being code, we need to be careful to strip that. The `code_node` has +a flag that tells us whether or not we need to strip. + +###### includes + #include + +###### state fields + struct code_node *node; + int offset; + int line; + int col; + +###### internal functions + + static void do_strip(struct token_state *state) + { + if (state->node->needs_strip) { + int n = 4; + while (n && state->node->code.txt[state->offset] == ' ') { + state->offset += 1; + n -= 1; + } + while (n == 4 && state->node->code.txt[0] == '\t') { + state->offset += 1; + n -= 4; + } + } + } + + static wint_t get_char(struct token_state *state) + { + wchar_t next; + size_t n; + mbstate_t mbstate; + + if (state->node == NULL) + return WEOF; + if (state->node->code.len <= state->offset) { + do + state->node = state->node->next; + while (state->node && state->node->code.txt == NULL); + state->offset = 0; + if (state->node == NULL) + return WEOF; + do_strip(state); + state->line = state->node->line_no; + state->col = state->node->indent; + } + + ## before get_char + + memset(&mbstate, 0, sizeof(mbstate)); + + n = mbrtowc(&next, state->node->code.txt + state->offset, + state->node->code.len - state->offset, + &mbstate); + if (n == -2 || n == 0) { + /* Not enough bytes - not really possible */ + next = '\n'; + state->offset = state->node->code.len; + } else if (n 
== -1) { + /* error */ + state->offset += 1; + next = 0x7f; // an illegal character + } else + state->offset += n; + + if (next >= ' ') { + state->col += 1; + } else if (is_newline(next)) { + state->line += 1; + state->col = state->node->indent; + do_strip(state); + } else if (next == '\t') { + state->col = indent_tab(state->col); + } + return next; + } + +We will sometimes want to "unget" the last character as it needs to be +considered again as part of the next token. So we need to store a +'previous' version of all metadata. + +###### state fields + int prev_offset; + int prev_line; + int prev_col; + +###### before get_char + state->prev_offset = state->offset; + state->prev_line = state->line; + state->prev_col = state->col; + +###### internal functions + + static void unget_char(struct token_state *state) + { + if (state->node) { + state->offset = state->prev_offset; + state->line = state->prev_line; + state->col = state->prev_col; + } + } + +We occasionally need a double-unget, particularly for numbers and +block comments. We don't impose this cost on all scanning, but +require those code sections that need it to call `save_unget_state` +before each `get_char`, and then `restore_unget_state` when a +double-unget is needed. + +###### state fields + int prev_offset2; + int prev_line2; + int prev_col2; + +###### internal functions + static void save_unget_state(struct token_state *state) + { + state->prev_offset2 = state->prev_offset; + state->prev_line2 = state->prev_line; + state->prev_col2 = state->prev_col; + } + + static void restore_unget_state(struct token_state *state) + { + state->prev_offset = state->prev_offset2; + state->prev_line = state->prev_line2; + state->prev_col = state->prev_col2; + } + +At the start of a token we don't want to be at the end of a code block +if we can help it. To avoid this possibility, we 'get' and 'unget' a +single character. 
This will move into the next non-empty code block +and leave the current pointer at the start of it. + +This has to happen _after_ dealing with delayed tokens as some of them +must appear in the previous node. When we do this, we need to reset +the data in the token. + +###### delayed tokens + if (at_eon(state)) { + get_char(state); + unget_char(state); + tk.node = state->node; + if (state->node) + tk.txt.txt = state->node->code.txt + state->offset; + tk.line = state->line; + tk.col = state->col; + tk.txt.len = 0; + } + +### Managing tokens + +The current token is initialized to line up with the first character +that we 'get' for each token. When we have, or might have, a full +token we can call `close_token` to set the `len` of the token +appropriately. This can safely be called multiple times. + +Finally we occasionally (for single-line strings and block comments) +need to reset to the beginning of the current token as we might have +parsed too much already. For that there is `reset_token`. + +###### one token + tk.node = state->node; + if (state->node) + tk.txt.txt = state->node->code.txt + state->offset; + tk.line = state->line; + tk.col = state->col; + tk.txt.len = 0; + +###### internal functions + + static void close_token(struct token_state *state, + struct token *tk) + { + tk->txt.len = (state->node->code.txt + state->offset) + - tk->txt.txt; + } + + static void reset_token(struct token_state *state, struct token *tok) + { + state->prev_line = tok->line; + state->prev_col = tok->col; + state->prev_offset = tok->txt.txt - state->node->code.txt; + unget_char(state); + tok->txt.len = 0; + } + + +Tokens make not cross into the next `code_node`, and some tokens can +include the newline at the and of a `code_node`, we must be able to +easily check if we have reached the end. Equally we need to know if +we are at the start of a node, as white space is treated a little +differently there. 
+ +###### internal functions + + static int at_son(struct token_state *state) + { + return state->offset == 0; + } + + static int at_eon(struct token_state *state) + { + // at end-of-node ?? + return state->node == NULL || + state->offset >= state->node->code.len; + } + +### Find a known word + +As the known-word list is sorted we can use a simple binary search. +Following the pattern established in "mdcode", we will use a `struct +text` with start and length to represent the code fragment we are +searching for. + +###### internal functions + static int find_known(struct token_config *conf, struct text txt) + { + int lo = 0; + int hi = conf->known_count; + + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + int cmp = strncmp(conf->words_marks[mid], + txt.txt, txt.len); + if (cmp == 0 && conf->words_marks[mid][txt.len]) + cmp = 1; + if (cmp <= 0) + lo = mid; + else + hi = mid; + } + if (strncmp(conf->words_marks[lo], + txt.txt, txt.len) == 0 + && conf->words_marks[lo][txt.len] == 0) + return lo; + else + return -1; + } + +### Bringing it all together + +Now we have all the bits there is just one section missing: combining +all the token parsing code into one block. + +The handling of delayed tokens (newlines, indents, undents) must come +first before we try getting another character. + +Then we parse all the test, making sure that we check for known marks +before strings and comments, but unknown marks after strings and comments. + +This block of code will either return a token, or will choose to +ignore one, in which case it will `continue` around to the top of the +loop. + +###### one token + ## delayed tokens + + ch = get_char(state); + + ## white space + ## parse number + ## parse word + ## parse mark + ## parse string + ## parse comment + ## unknown mark + +### Start and stop + +As well as getting tokens, we need to be able to create the +`token_state` to start with, and discard it later. 
+ +###### includes + #include + +###### main functions + struct token_state *token_open(struct code_node *code, struct + token_config *conf) + { + struct token_state *state = malloc(sizeof(*state)); + memset(state, 0, sizeof(*state)); + state->node = code; + state->line = code->line_no; + state->conf = conf; + return state; + } + void token_close(struct token_state *state) + { + free(state); + } + +###### exported functions + struct token_state *token_open(struct code_node *code, struct + token_config *conf); + void token_close(struct token_state *state); + +### Trace tokens + +Getting tokens is the main thing but it is also useful to be able to +print out token information, particularly for tracing and testing. + +Known tokens are printed verbatim. Other tokens are printed as +`type(content)` where content is truncated to a given number of characters. + +The function for printing a truncated string (`text_dump`) is also exported +so that it can be used to tracing processed strings too. 
+ +###### includes + #include + +###### exported functions + void token_trace(FILE *f, struct token tok, int max); + void text_dump(FILE *f, struct text t, int max); + +###### main functions + + void text_dump(FILE *f, struct text txt, int max) + { + int i; + if (txt.len > max) + max -= 2; + else + max = txt.len; + for (i = 0; i < max; i++) { + char c = txt.txt[i]; + if (c < ' ' || c > '~') + fprintf(f, "\\x%02x", c & 0xff); + else if (c == '\\') + fprintf(f, "\\\\"); + else + fprintf(f, "%c", c); + } + if (i < txt.len) + fprintf(f, ".."); + } + + void token_trace(FILE *f, struct token tok, int max) + { + static char *types[] = { + [TK_ident] = "ident", + [TK_mark] = "mark", + [TK_number] = "number", + [TK_string] = "string", + [TK_multi_string] = "mstring", + [TK_line_comment] = "lcomment", + [TK_block_comment] = "bcomment", + [TK_indent] = "indent", + [TK_undent] = "undent", + [TK_newline] = "newline", + [TK_eof] = "eof", + [TK_error] = "ERROR", + }; + + switch (tok.num) { + default: /* known word or mark */ + fprintf(f, "%.*s", tok.txt.len, tok.txt.txt); + break; + case TK_indent: + case TK_undent: + case TK_newline: + case TK_eof: + /* No token text included */ + fprintf(f, "%s()", types[tok.num]); + break; + case TK_ident: + case TK_mark: + case TK_number: + case TK_string: + case TK_multi_string: + case TK_line_comment: + case TK_block_comment: + case TK_error: + fprintf(f, "%s(", types[tok.num]); + text_dump(f, tok.txt, max); + fprintf(f, ")"); + break; + } + } + +### And there we have it + +We now have all the library functions defined for reading and printing +tokens. Now we just need C files to store them, and a mk file to make them. 
+ +###### File: scanner.h + ## public types + ## exported functions + +###### File: libscanner.c + ## includes + #include "scanner.h" + ## private types + ## internal functions + ## main functions + +###### File: scanner.mk + + CFLAGS += -Wall -g + all :: + scanner.mk scanner.h libscanner.c : scanner.mdc + ./md2c scanner.mdc + all :: libscanner.o + libscanner.o : libscanner.c + $(CC) $(CFLAGS) -c libscanner.c + +## Processing numbers + +Converting a `TK_number` token to a numerical value is a slightly +higher level task than lexical analysis, and slightly lower than +grammar parsing, so put it here - as an index if you like. + +Importantly it will be used by the same testing rig that is used for +testing the token scanner. + +The numeric value that we will convert all numbers into is the `mpq_t` +from the GNU high precision number library "libgmp". + +###### number includes + #include + #include "mdcode.h" + +Firstly we need to be able to parse a string of digits in a given base +and possibly with a decimal marker. We store this in an `mpz_t` +integer and report the number of digits after the decimal mark. + +On error we return zero and ensure that the 'mpz_t' has been freed, or +had never been initialised. + +###### number functions + + static int parse_digits(mpz_t num, struct text tok, int base, + int *placesp) + { + /* Accept digits up to 'base', ignore '_' and + * ' ' if they appear between two legal digits, + * and if `placesp` is not NULL, allow a single + * '.' or ',' and report the number of digits + * beyond there. + * Return number of characters processed (p), + * or 0 if something illegal was found. + */ + int p; + int decimal = -1; // digits after marker + enum {Digit, Space, Other} prev = Other; + int digits = 0; + + for (p = 0; p < tok.len; p++) { + int dig; + char c = tok.txt[p]; + + if (c == '_' || c == ' ') { + if (prev != Digit) + goto bad; + prev = Space; + continue; + } + if (c == '.' 
|| c == ',') { + if (prev != Digit) + goto bad; + if (!placesp || decimal >= 0) + return p-1; + decimal = 0; + prev = Other; + continue; + } + if (isdigit(c)) + dig = c - '0'; + else if (isupper(c)) + dig = 10 + c - 'A'; + else if (islower(c)) + dig = 10 + c - 'a'; + else + dig = base; + if (dig >= base) { + if (prev == Space) + p--; + break; + } + prev = Digit; + if (digits) + mpz_mul_ui(num, num, base); + else + mpz_init(num); + digits += 1; + mpz_add_ui(num, num, dig); + if (decimal >= 0) + decimal++; + } + if (digits == 0) + return 0; + if (placesp) { + if (decimal >= 0) + *placesp = decimal; + else + *placesp = 0; + } + return p; + bad: + if (digits) + mpz_clear(num); + return 0; + } + +###### number includes + #include + +To parse a full number we need to consider the optional base, the +mantissa, and the optional exponent. We will treat these one at a +time. + +The base is indicated by a letter after a leading zero, which must be +followed by a base letter or a period. The base also determines the +character which will mark an exponent. + +###### number vars + int base = 10; + char expc = 'e'; + +###### parse base + + if (tok.txt[0] == '0' && tok.len > 1) { + int skip = 0; + switch(tok.txt[1]) { + case 'x': + case 'X': + base = 16; + skip = 2; + expc = 'p'; + break; + case 'o': + case 'O': + base = 8; + skip = 2; + expc = 'p'; + break; + case 'b': + case 'B': + base = 2; + skip = 2; + expc = 'p'; + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '_': + case ' ': + // another digit is not permitted + // after a zero. + return 0; + default: + // must be decimal marker or trailing + // letter, which are OK; + break; + } + tok.txt += skip; + tok.len -= skip; + } + +After the base is the mantissa, which may contain a decimal mark, so +we need to record the number of places. We won't impose the number of +places until we have the exponent as well. 
+ +###### number vars + int places =0; + mpz_t mant; + int d; + +###### parse mantissa + + d = parse_digits(mant, tok, base, &places); + if (d == 0) + return 0; + tok.txt += d; + tok.len -= d; + mpq_init(num); + mpq_set_z(num, mant); + mpz_clear(mant); + +After the mantissa number may come an exponent which may be positive +or negative. We assume at this point that we have seen the exponent +character `expc`. + +###### number vars + long lexp = 0; + mpz_t exp; + int esign = 1; + +###### parse exponent + if (tok.len > 1) { + if (tok.txt[0] == '+') { + tok.txt++; + tok.len--; + } else if (tok.txt[0] == '-') { + esign = -1; + tok.txt++; + tok.len--; + } + } + d = parse_digits(exp, tok, 10, NULL); + if (d == 0) { + mpq_clear(num); + return 0; + } + if (!mpz_fits_slong_p(exp)) { + mpq_clear(num); + mpz_clear(exp); + return 0; + } + lexp = mpz_get_si(exp) * esign; + mpz_clear(exp); + tok.txt += d; + tok.len -= d; + + +Now that we have the mantissa and the exponent we can multiply them +together, also allowing for the number of digits after the decimal +mark. + +For base 10, we simply subtract the decimal places from the exponent. +For the other bases, as the exponent is alway based on 2, even for +octal and hex, we need a bit more detail. +We then recover the sign from the exponent, as division is quite +different from multiplication. + +###### calc exponent + switch (base) { + case 10: + case 2: + lexp -= places; + break; + case 16: + lexp -= 4*places; + break; + case 8: + lexp -= 3*places; + break; + } + if (lexp < 0) { + lexp = -lexp; + esign = -1; + } else + esign = 1; + +Imposing the exponent on the number is also very different for base 10 +than for the others. For the binary shift `gmp` provides a simple +function. For base 10 we use something like Russian Peasant +Multiplication. 
+ +###### calc exponent + if (expc == 'e') { + mpq_t tens; + mpq_init(tens); + mpq_set_ui(tens, 10, 1); + while (1) { + if (lexp & 1) { + if (esign > 1) + mpq_mul(num, num, tens); + else + mpq_div(num, num, tens); + } + lexp >>= 1; + if (lexp == 0) + break; + mpq_mul(tens, tens, tens); + } + mpq_clear(tens); + } else { + if (esign > 0) + mpq_mul_2exp(num, num, lexp); + else + mpq_div_2exp(num, num, lexp); + } + +Now we are ready to parse a number: the base, mantissa, and exponent. +If all goes well we check for the possible trailing letters and +return. Return value is 1 for success and 0 for failure. + + +###### number functions + int number_parse(mpq_t num, char tail[3], struct text tok) + { + ## number vars + int i; + + ## parse base + ## parse mantissa + if (tok.len > 1 && (tok.txt[0] == expc || + tok.txt[0] == toupper(expc))) { + tok.txt++; + tok.len--; + ## parse exponent + } + ## calc exponent + + for (i = 0; i < 2; i++) { + if (tok.len <= i) + break; + if (!isalpha(tok.txt[i])) + goto err; + tail[i] = tok.txt[i]; + } + tail[i] = 0; + if (i == tok.len) + return 1; + err: + mpq_clear(num); + return 0; + } + +Number parsing goes in `libnumber.c` + +###### File: libnumber.c + + #include + #include + + ## number includes + ## number functions + +###### File: number.h + int number_parse(mpq_t num, char tail[3], struct text tok); + +###### File: scanner.mk + all :: libnumber.o + libnumber.o : libnumber.c + $(CC) $(CFLAGS) -c libnumber.c + +## Processing strings + +Both `TK_string` and `TK_multi_string` require post-processing which +can be one of two types: literal or with escapes processed. +Even literal processing is non-trivial as the file may contain indents +which need to be stripped. + +Errors can only occur when processing escapes. Any unrecognised +character following the escape character will cause an error. 
+ +Processing escapes and striping indents can only make the string +shorter, not longer, so we allocate a buffer which is the same size as +the string and process into that. + +To request escape processing, we pass the character we want to use for +quoting, usually '`\`'. To avoid escape processing we pass a zero. + +###### string main + int string_parse(struct token *tok, char escape, + struct text *str, char tail[3]) + { + ## string vars + struct text t = tok->txt; + + str->txt = NULL; + ## strip tail + if (tok->num == TK_string) { + ## strip single + } else { + ## strip multi + } + str->txt = malloc(t.len); + str->len = 0; + + ## process string + return 1; + err: + free(str->txt); + str->txt = NULL; + return 0; + } + +### strip tail + +The tail of the string can be 0, 1, or 2 letters + + i = t.len; + if (i >= 0 && isalpha(t.txt[i-1])) + i -= 1; + if (i >= 0 && isalpha(t.txt[i-1])) + i -= 1; + strncpy(tail, t.txt+i, t.len-i); + tail[t.len-i] = 0; + t.len = i; + +###### string vars + int i; + +### strip single + +Stripping the quote of a single-line string is trivial. +The only part that is at all interesting is that quote character must +be remembered. + + quote = t.txt[0]; + if (t.txt[t.len-1] != quote) + goto err; + t.txt += 1; + t.len -= 2; + +###### string vars + char quote; + +### strip multi + +For a multi-line string we have a little more work to do. We need to +remove 3 quotes, not 1, and need to count the indent of the close +quote as it will need to be stripped from all lines. 
+
	quote = t.txt[0];
	/* Shortest possible multi-string is quote*3, newline, quote*3:
	 * seven bytes.
	 */
	if (t.len < 7 ||
	    t.txt[1] != quote || t.txt[2] != quote ||
	    !is_newline(t.txt[3]))
		goto err;
	t.txt += 4;
	t.len -= 4;
	i = t.len;
	if (i <= 0 || t.txt[i-1] != quote)
		goto err;
	i -= 1;
	if (i <= 0 || t.txt[i-1] != quote)
		goto err;
	i -= 1;
	if (i <= 0 || t.txt[i-1] != quote)
		goto err;
	i -= 1;
	t.len = i;
	/* The close quote determines the indent: walk back to the
	 * start of its line, then measure the spaces and tabs.
	 */
	while (i > 0 && !is_newline(t.txt[i-1]))
		i--;
	indent = 0;
	while (i < t.len) {
		if (t.txt[i] == ' ')
			indent += 1;
		if (t.txt[i] == '\t')
			indent = indent_tab(indent);
		i++;
	}

###### string vars
	int indent = 0;

###### process string

Now we just take one byte at a time.  Non-ASCII unicode won't look
like anything we are interested in so it will just be copied byte by
byte.

	cp = str->txt;
	at_sol = 1;
	for (i = 0; i < t.len; i++) {
		char c;
		if (at_sol) {
			at_sol = 0;
			## strip indent
			if (i >= t.len)
				break;
		}
		c = t.txt[i];
		if (c != escape) {
			*cp = c;
			cp += 1;
			if (is_newline(c))
				at_sol = 1;
		} else if (i+1 >= t.len) {
			// escape and end of string
			goto err;
		} else {
			i += 1;
			c = t.txt[i];
			## parse escape
		}
	}
	str->len = cp - str->txt;

###### string vars
	char *cp;
	int at_sol;

###### strip indent

Every time we find a start of line, we strip spaces and tabs until the
required indent is found.
+
	int skipped = 0;
	while (i < t.len && skipped < indent) {
		c = t.txt[i];
		if (c == ' ')
			skipped += 1;
		else if (c == '\t')
			skipped = indent_tab(skipped);
		else
			break;
		i += 1;
	}

###### parse escape
	switch (c) {
	case 'n':
		*cp++ = '\n'; break;
	case 'r':
		*cp++ = '\r'; break;
	case 't':
		*cp++ = '\t'; break;
	case 'b':
		*cp++ = '\b'; break;
	case 'q':
		*cp++ = quote; break;
	case 'f':
		*cp++ = '\f'; break;
	case 'v':
		*cp++ = '\v'; break;
	case 'a':
		*cp++ = '\a'; break;
	case '0':
	case '1':
	case '2':
	case '3':
		// 3 digit octal number
		if (i+2 >= t.len)
			goto err;
		if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
		    t.txt[i+2] < '0' || t.txt[i+2] > '7')
			goto err;
		n = (t.txt[i  ]-'0') * 64 +
		    (t.txt[i+1]-'0') *  8 +
		    (t.txt[i+2]-'0') *  1;
		*cp++ = n;
		i += 2;
		break;
	case 'x':
		// 2 hex digits
		n = take_hex(2, t.txt+i+1, t.len-i-1);
		if (n < 0)
			goto err;
		*cp++ = n;
		i += 2;
		break;
	case 'u':
	case 'U':
		// 4 or 8 hex digits for unicode
		n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
		if (n < 0)
			goto err;
		memset(&pstate, 0, sizeof(pstate));
		n = wcrtomb(cp, n, &pstate);
		if (n <= 0)
			goto err;
		cp += n;
		i += c == 'u' ? 4 : 8;
		break;
	default:
		if (c == escape)
			*cp++ = c;
		else if (is_newline(c))
			// escaped newline is swallowed: a continuation line
			at_sol = 1;
		else
			goto err;
	}

###### string vars
	long n;
	mbstate_t pstate;

For `\x` `\u` and `\U` we need to collect a specific number of
hexadecimal digits

###### string functions

	/* Collect exactly "digits" hex digits from "cp" (length "l")
	 * and return the value, or -1 if there are too few digits or
	 * a non-hex character is found.
	 */
	static long take_hex(int digits, char *cp, int l)
	{
		long n = 0;
		if (l < digits)
			return -1;
		while (digits) {
			char c = *cp;
			int d;
			if (!isxdigit(c))
				return -1;
			if (isdigit(c))
				d = c - '0';
			else if (isupper(c))
				d = 10 + c - 'A';
			else
				d = 10 + c - 'a';
			n = n * 16 + d;
			digits--;
			cp++;
		}
		return n;
	}

String parsing goes in `libstring.c`

###### File: libstring.c

	#include <unistd.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <ctype.h>
	#include <wchar.h>
	#include "mdcode.h"
	#include "scanner.h"
	## string functions
	## string main

###### File: string.h
	int string_parse(struct token *tok, char escape,
	                 struct text *str, char tail[3]);

###### File: scanner.mk
	all :: libstring.o
	libstring.o : libstring.c
		$(CC) $(CFLAGS) -c libstring.c


## Testing

As "untested code is buggy code" we need a program to easily test
the scanner library.  This will simply parse a given file and report
the tokens one per line.
+
+###### File: scanner.c

	#include <unistd.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <errno.h>
	#include <sys/mman.h>
	#include <string.h>
	#include <stdio.h>
	#include <gmp.h>
	#include <locale.h>
	#include "mdcode.h"
	#include "scanner.h"
	#include "number.h"
	#include "string.h"

	static int errs;
	static void pr_err(char *msg)
	{
		errs++;
		fprintf(stderr, "%s\n", msg);
	}

	int main(int argc, char *argv[])
	{
		int fd;
		int len;
		char *file;
		struct token_state *state;
		char *known[] = {
			"==",
			"else",
			"if",
			"then",
			"while",
			"{",
			"}",
		};
		struct token_config conf = {
			.word_start = "_$",
			.word_cont = "",
			.words_marks = known,
			.number_chars = "., _+-",
			.known_count = sizeof(known)/sizeof(known[0]),
			/* neither comment type is ignored here; set a
			 * bit to suppress a token type.
			 */
			.ignored = (0 << TK_line_comment)
			          |(0 << TK_block_comment),
		};
		struct section *table, *s, *prev;
		setlocale(LC_ALL,"");
		if (argc != 2) {
			fprintf(stderr, "Usage: scanner file\n");
			exit(2);
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			fprintf(stderr, "scanner: cannot open %s: %s\n",
				argv[1], strerror(errno));
			exit(1);
		}
		len = lseek(fd, 0, SEEK_END);
		if (len < 0) {
			fprintf(stderr, "scanner: cannot seek %s: %s\n",
				argv[1], strerror(errno));
			exit(1);
		}
		file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
		if (file == MAP_FAILED) {
			fprintf(stderr, "scanner: cannot map %s: %s\n",
				argv[1], strerror(errno));
			exit(1);
		}
		table = code_extract(file, file+len, pr_err);

		for (s = table; s;
			(code_free(s->code), prev = s, s = s->next, free(prev))) {
			printf("Tokenizing: %.*s\n", s->section.len,
			       s->section.txt);
			state = token_open(s->code, &conf);
			while(1) {
				struct token tk = token_next(state);
				printf("%d:%d ", tk.line, tk.col);
				token_trace(stdout, tk, 20);
				if (tk.num == TK_number) {
					mpq_t num;
					char tail[3];
					if (number_parse(num, tail, tk.txt)) {
						printf(" %s ", tail);
						mpq_out_str(stdout, 10, num);
						mpq_clear(num);
					} else
						printf(" BAD NUMBER");
				}
				if (tk.num == TK_string ||
				    tk.num == TK_multi_string) {
					char esc = '\\';
					struct text str;
					char tail[3];
					/* back-quoted strings are reported
					 * literally, with no escape processing.
					 */
					if (tk.txt.txt[0] == '`')
						esc = 0;
					if (string_parse(&tk, esc,
							 &str, tail)) {
						printf(" %s ", tail);
						text_dump(stdout, str, 20);
						free(str.txt);
					} else
						printf(" BAD STRING");
				}
				printf("\n");
				if (tk.num == TK_error)
					errs = 1;
				if (tk.num == TK_eof)
					break;
			}
		}
		exit(!!errs);
	}
###### File: scanner.mk
	scanner.c : scanner.mdc
		./md2c scanner.mdc
	all :: scanner
	scanner : scanner.o libscanner.o libmdcode.o libnumber.o libstring.o
		$(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
			libmdcode.o libnumber.o libstring.o -licuuc -lgmp
	scanner.o : scanner.c
		$(CC) $(CFLAGS) -c scanner.c
-- 
2.43.0