X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=b57db29c0b38fcefead7e0cc4cafb41494d1264e;hb=74275b2e557383a028515ca4dacdf07cb4858aa1;hp=57f9e21b0c4c916af6b9cda625bcdc444ef0ae0a;hpb=7571ab0ca1e9d2761983903c33f5559ab871d532;p=ocean diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc index 57f9e21..b57db29 100644 --- a/csrc/scanner.mdc +++ b/csrc/scanner.mdc @@ -19,7 +19,7 @@ The text is assumed to be UTF-8 though some matching assumes the ASCII subset. If the text provided does not conform to UTF-8 an error will be reported and some number of bytes will be skipped. -###### includes +###### public types #include #include #include @@ -85,12 +85,11 @@ from token to token. ###### token_next init int ignored = state->conf->ignored; - The different tokens are numbers, words, marks, strings, comments, newlines, EOF, and indents, each of which is examined in detail below. There are various cases where no token can be found in part of the -input. All of these will be reported as an `TK_error` token. +input. All of these will be reported as a `TK_error` token. It is possible to declare a number of strings which form distinct tokens (rather than being grouped as e.g. 'word'). These are given @@ -106,7 +105,7 @@ token numbers from `TK_reserved` upwards. ### Numbers Numbers are the messiest tokens to parse, primarily because they can -contain characters that also have meaning outside of number and, +contain characters that also have meaning outside of numbers and, particularly, immediately after numbers. The obvious example is the '`-`' sign. It can come inside a number for @@ -120,7 +119,11 @@ To make matters worse, our language designer has decided to experiment with allowing commas to be used as the decimal indicator, and spaces to be used to separate groups of digits in large numbers. Both of these can reasonably be restricted to appear between two digits, so we -have to add that condition to our tests. +have to add that condition to our tests. 
For consistency we require +every non-alpha-numeric to appear between two hex digits, with the +exception that a sign can appear only after a 'p' or 'e', and a space +can only appear between decimal digits. Allowing a space before a +letter easily leads to confusion, such as in `a < 3 and b < 4`. So we cannot just treat numbers as starting with a digit and being followed by some set of characters. We need more structure than that. @@ -128,13 +131,16 @@ followed by some set of characters. We need more structure than that. So: - Numbers must start with a digit. -- If the first digit is zero, the next character must be a base - signifier (one of `xob`) or a decimal marker (`.` or `,`). - In the first case the first `p` or `P` may be followed by a sign. +- If the first digit is zero, the next character should be a base + signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't + enforced at this stage). + In the first case only the first `p` or `P` may be followed by a sign. - If the number doesn't start with `0` followed by one of `xob`, the first `e` may be followed by a sign. -- Any digit or hex digit may be followed by a space or underscore - providing that the subsequence character is also a (hex) digit. +- A sign must always be followed by a digit. +- Any digit may be followed by a space or underscore and any hex digit + may be followed by an underscore, providing that the subsequent character + is also a digit (for space) or hex digit (for underscore). This rule will require an extra level of 'unget' to be supported when handling characters. - Otherwise any digits or ASCII letters are allowed. We do not at @@ -164,7 +170,7 @@ are declared to be a start character for words. 
###### parse number if (iswdigit(ch) && !(ignored & (1<conf->number_chars, ch)) { + /* non-number char */ + break; } if (ch == '+' || ch == '-') { + /* previous must be 'e' or 'p' in appropraite context */ if (!sign_ok) break; expect_p = -1; + } else if (ch == ' ') { + /* previous must be a digit */ + if (!iswdigit(prev)) + break; + } else { + /* previous must be a hex digit */ + if (!iswxdigit(prev)) + break; } if (ch == '.' || ch == ',') { + /* only one of these permitted */ if (decimal_mark) break; decimal_mark = 1; } - if (prev_special) { - /* Don't allow that special char, - * need two 'ungets' - */ - restore_unget_state(state); - break; - } - if (strchr(state->conf->number_chars, ch)) { - prev_special = 1; - continue; - } - /* non-number char */ - break; } /* We seem to have a "number" token */ unget_char(state); @@ -260,6 +285,9 @@ and the length of the list must be given (`known_count`). Tokens matching these known words are reported as the index of the list added to `TK_reserved`. +If identifiers are ignored, then any word which is not listed as a +known word results in an error. + ###### token config parameters const char **words_marks; int known_count; @@ -318,7 +346,20 @@ in a known mark, it will return that first known mark. If no known mark is found we will test against strings and comments below before giving up and assuming an unknown mark. -If `TK_mark` is ignored, then unknown marks as returned as an error. + +If an unknown mark contains a quote character or a comment marker, and +that token is not being ignored, then we terminate the unknown mark +before that quote or comment. This ensures that an unknown mark +immediately before a string is handled correctly. + +If the first character of a comment marker (i.e. '/') is a known mark, +the above rules would suggest that the start of a comment would be +parsed as that mark, which is not what is wanted. So the introductory +sequences for a comment ("//" and "/*") are treated as +partially-known. 
They prevent the leading "/" from being a mark by +itself, but do not actually constitute a stand-alone mark. + +If `TK_mark` is ignored, then unknown marks are returned as errors. ###### token types TK_mark, @@ -329,31 +370,63 @@ Known marks are included in the same list as the list of known words. tk.num = TK_error; while (is_mark(ch, state->conf)) { int n; + wchar_t prev; close_token(state, &tk); n = find_known(state->conf, tk.txt); if (n >= 0) tk.num = TK_reserved + n; else if (tk.num != TK_error) { - /* found a longest-known-mark */ + /* found a longest-known-mark, still need to + * check for comments + */ + if (tk.txt.len == 2 && tk.txt.txt[0] == '/' && + (ch == '/' || ch == '*')) { + /* Yes, this is a comment, not a '/' */ + restore_unget_state(state); + tk.num = TK_error; + break; + } unget_char(state); close_token(state, &tk); return tk; } + prev = ch; + save_unget_state(state); ch = get_char(state); + if (!(ignored & (1<node == NULL) + return state->col; + return state->node->indent - state->node->needs_strip + state->col; + } + ###### white space + if (is_newline(ch)) + state_check_node(state); if (is_newline(ch) || (at_son(state) && ch <= ' ')) { int newlines = 0; - int was_son = at_son(state); - if (ignored & (1<node->next && - state->node->next->indent > state->node->indent) - state->col = state->node->next->indent; - else - state->col = state->node->indent; - } else + if (ch != WEOF) unget_char(state); state->delayed_lines = newlines; - state->undent_next = was_son; + state->out_next = !was_nl; state->check_indent = 1; continue; } - ###### delayed tokens if (state->check_indent || state->delayed_lines) { - if (state->col < state->indent_sizes[state->indent_level]) { - if (!state->undent_next && + if (state_indent(state) < state->indent_sizes[state->indent_level]) { + if (!state->out_next && !(ignored & (1<undent_next = 1; + state->out_next = 1; tk.num = TK_newline; return tk; } state->indent_level -= 1; - state->undent_next = 0; - tk.num = TK_undent; 
+ state->out_next = 0; + tk.num = TK_out; return tk; } - if (state->col > state->indent_sizes[state->indent_level] && + if (state_indent(state) > state->indent_sizes[state->indent_level] && state->indent_level < sizeof(state->indent_sizes)-1) { state->indent_level += 1; - state->indent_sizes[state->indent_level] = state->col; - state->delayed_lines -= 1; - tk.num = TK_indent; + state->indent_sizes[state->indent_level] = state_indent(state); + if (state->delayed_lines) + state->delayed_lines -= 1; + tk.num = TK_in; return tk; } state->check_indent = 0; @@ -789,7 +880,6 @@ tokens will continue to return the same end-of-file token. ###### token types TK_eof, - ###### white space if (ch == WEOF) { tk.num = TK_eof; @@ -803,7 +893,21 @@ If the token we have is not empty and `TK_mark` is allowed, we have an unknown mark, otherwise this must be an error. ###### unknown mark - /* one unknown character */ + + /* one unknown mark character */ + if (tk.txt.len) { + close_token(state, &tk); + if (ignored & (1<node->needs_strip) { int n = 4; while (n && state->node->code.txt[state->offset] == ' ') { + indent += 1; state->offset += 1; n -= 1; } while (n == 4 && state->node->code.txt[state->offset] == '\t') { + indent = indent_tab(indent); state->offset += 1; n -= 4; } } } + static void state_check_node(struct token_state *state) + { + if (!state->node) + return; + if (state->node->code.len > state->offset) + return; + + do + state->node = state->node->next; + while (state->node && state->node->code.txt == NULL); + state->offset = 0; + state->prev_offset = 0; + state->strip_offset = 0; + state->col = 0; + if (state->node == NULL) + return; + state->line = state->node->line_no; + do_strip(state); + state->col = state->node->needs_strip; + state->strip_offset = state->offset; + } + static wint_t get_char(struct token_state *state) { wchar_t next; size_t n; mbstate_t mbstate; + state_check_node(state); if (state->node == NULL) return WEOF; - if (state->node->code.len <= 
state->offset) { - do - state->node = state->node->next; - while (state->node && state->node->code.txt == NULL); - state->offset = 0; - if (state->node == NULL) - return WEOF; - do_strip(state); - state->line = state->node->line_no; - state->col = state->node->indent; - } ## before get_char @@ -880,12 +1000,12 @@ a flag that tells us whether or not we need to strip. &mbstate); if (n == -2 || n == 0) { /* Not enough bytes - not really possible */ - next = '\n'; - state->offset = state->node->code.len; + next = '\n'; // NOTEST + state->offset = state->node->code.len; // NOTEST } else if (n == -1) { /* error */ - state->offset += 1; - next = 0x7f; // an illegal character + state->offset += 1; // NOTEST + next = 0x7f; // an illegal character // NOTEST } else state->offset += n; @@ -893,8 +1013,8 @@ a flag that tells us whether or not we need to strip. state->col += 1; } else if (is_newline(next)) { state->line += 1; - state->col = state->node->indent; do_strip(state); + state->col = state->node->needs_strip; } else if (next == '\t') { state->col = indent_tab(state->col); } @@ -997,8 +1117,11 @@ parsed too much already. For that there is `reset_token`. static void close_token(struct token_state *state, struct token *tk) { - tk->txt.len = (state->node->code.txt + state->offset) - - tk->txt.txt; + if (state->node != tk->node) + tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt); + else + tk->txt.len = (state->node->code.txt + state->offset) + - tk->txt.txt; } static void reset_token(struct token_state *state, struct token *tok) @@ -1010,8 +1133,7 @@ parsed too much already. For that there is `reset_token`. tok->txt.len = 0; } - -Tokens make not cross into the next `code_node`, and some tokens can +Tokens may not cross into the next `code_node`, and some tokens can include the newline at the end of a `code_node`, so we must be able to easily check if we have reached the end. 
Equally we need to know if we are at the start of a node, as white space is treated a little @@ -1021,7 +1143,7 @@ differently there. static int at_son(struct token_state *state) { - return state->offset == 0; + return state->prev_offset <= state->strip_offset; } static int at_eon(struct token_state *state) @@ -1068,7 +1190,7 @@ searching for. Now we have all the bits there is just one section missing: combining all the token parsing code into one block. -The handling of delayed tokens (newlines, indents, undents) must come +The handling of delayed tokens (Newlines, INs, OUTs) must come first before we try getting another character. Then we parse all the text, making sure that we check for known marks @@ -1087,9 +1209,6 @@ loop. ## parse number ## parse word ## parse mark - ## parse string - ## parse comment - ## unknown mark ### Start and stop @@ -1107,6 +1226,9 @@ As well as getting tokens, we need to be able to create the memset(state, 0, sizeof(*state)); state->node = code; state->line = code->line_no; + do_strip(state); + state->col = state->node->needs_strip; + state->strip_offset = state->offset; state->conf = conf; return state; } @@ -1170,8 +1292,8 @@ so that it can be used for tracing processed strings too. [TK_multi_string] = "mstring", [TK_line_comment] = "lcomment", [TK_block_comment] = "bcomment", - [TK_indent] = "indent", - [TK_undent] = "undent", + [TK_in] = "in", + [TK_out] = "out", [TK_newline] = "newline", [TK_eof] = "eof", [TK_error] = "ERROR", @@ -1181,8 +1303,8 @@ so that it can be used for tracing processed strings too. default: /* known word or mark */ fprintf(f, "%.*s", tok.txt.len, tok.txt.txt); break; - case TK_indent: - case TK_undent: + case TK_in: + case TK_out: case TK_newline: case TK_eof: /* No token text included */ @@ -1233,7 +1355,7 @@ tokens. Now we just need C files to store them, and a mk file to make them. 
Converting a `TK_number` token to a numerical value is a slightly higher level task than lexical analysis, and slightly lower than -grammar parsing, so put it here - as an index if you like. +grammar parsing, so put it here - as an appendix if you like. Importantly it will be used by the same testing rig that is used for testing the token scanner. @@ -1258,10 +1380,10 @@ had never been initialised. int *placesp) { /* Accept digits up to 'base', ignore '_' and - * ' ' if they appear between two legal digits, - * and if `placesp` is not NULL, allow a single - * '.' or ',' and report the number of digits - * beyond there. + * (for base 10) ' ' if they appear between two + * legal digits, and if `placesp` is not NULL, + * allow a single '.' or ',' and report the number + * of digits beyond there. * Return number of characters processed (p), * or 0 if something illegal was found. */ @@ -1274,7 +1396,7 @@ had never been initialised. int dig; char c = tok.txt[p]; - if (c == '_' || c == ' ') { + if (c == '_' || (c == ' ' && base == 10)) { if (prev != Digit) goto bad; prev = Space; @@ -1394,7 +1516,7 @@ we need to record the number of places. We won't impose the number of places until we have the exponent as well. ###### number vars - int places =0; + int places = 0; mpz_t mant; int d; @@ -1444,7 +1566,6 @@ character `expc`. tok.txt += d; tok.len -= d; - Now that we have the mantissa and the exponent we can multiply them together, also allowing for the number of digits after the decimal mark. @@ -1508,7 +1629,6 @@ Now we are ready to parse a number: the base, mantissa, and exponent. If all goes well we check for the possible trailing letters and return. Return value is 1 for success and 0 for failure. 
- ###### number functions int number_parse(mpq_t num, char tail[3], struct text tok) { @@ -1550,7 +1670,7 @@ Number parsing goes in `libnumber.c` ## number includes ## number functions -###### File: number.h +###### File: parse_number.h int number_parse(mpq_t num, char tail[3], struct text tok); ###### File: scanner.mk @@ -1717,7 +1837,7 @@ required indent is found. if (c == ' ') skipped += 1; else if (c == '\t') - skipped = indent_tab(c); + skipped = indent_tab(skipped); else break; i+= 1; @@ -1834,7 +1954,7 @@ String parsing goes in `libstring.c` ## string functions ## string main -###### File: string.h +###### File: parse_string.h int string_parse(struct token *tok, char escape, struct text *str, char tail[3]); @@ -1843,7 +1963,6 @@ String parsing goes in `libstring.c` libstring.o : libstring.c $(CC) $(CFLAGS) -c libstring.c - ## Testing As "untested code is buggy code" we need a program to easily test @@ -1861,10 +1980,11 @@ the tokens one per line. #include #include #include + #include #include "mdcode.h" #include "scanner.h" - #include "number.h" - #include "string.h" + #include "parse_number.h" + #include "parse_string.h" static int errs; static void pr_err(char *msg) @@ -1873,11 +1993,19 @@ the tokens one per line. fprintf(stderr, "%s\n", msg); } + static int kcmp(const void *ap, const void *bp) + { + char * const *a = ap; + char * const *b = bp; + return strcmp(*a, *b); + } + int main(int argc, char *argv[]) { int fd; int len; char *file; + char *filename = NULL; struct token_state *state; const char *known[] = { "==", @@ -1894,27 +2022,92 @@ the tokens one per line. 
.words_marks = known, .number_chars = "., _+-", .known_count = sizeof(known)/sizeof(known[0]), - .ignored = (0 << TK_line_comment) - |(0 << TK_block_comment), + .ignored = 0, + }; + static const struct option long_options[] = { + { "word-start", 1, NULL, 'W'}, + { "word-cont", 1, NULL, 'w'}, + { "number-chars", 1, NULL, 'n'}, + { "ignore-numbers", 0, NULL, 'N'}, + { "ignore-ident", 0, NULL, 'I'}, + { "ignore-marks", 0, NULL, 'M'}, + { "ignore-strings", 0, NULL, 'S'}, + { "ignore-multi-strings",0, NULL, 'z'}, + { "ignore-line-comment",0, NULL, 'c'}, + { "ignore-newline", 0, NULL, 'l'}, + { "ignore-block-comment", 0, NULL, 'C'}, + { "ignore-indent", 0, NULL, 'i'}, + { "file", 1, NULL, 'f'}, + { "section", 1, NULL, 's'}, + { NULL, 0, NULL, 0}, }; + static const char options[] = "W:w:n:NIMSzclCif:s:"; + struct section *table, *s, *prev; + int opt; + char *section_name = NULL; + int section_found = 0; + setlocale(LC_ALL,""); - if (argc != 2) { - fprintf(stderr, "Usage: scanner file\n"); - exit(2); + while ((opt = getopt_long(argc, argv, options, long_options, NULL)) + != -1) { + switch(opt) { + case 'W': conf.word_start = optarg; break; + case 'w': conf.word_cont = optarg; break; + case 'n': conf.number_chars = optarg; break; + case 'N': conf.ignored |= 1 << TK_number; break; + case 'I': conf.ignored |= 1 << TK_ident; break; + case 'M': conf.ignored |= 1 << TK_mark; break; + case 'S': conf.ignored |= 1 << TK_string; break; + case 'z': conf.ignored |= 1 << TK_multi_string; break; + case 'c': conf.ignored |= 1 << TK_line_comment; break; + case 'C': conf.ignored |= 1 << TK_block_comment; break; + case 'l': conf.ignored |= 1 << TK_newline; break; + case 'i': conf.ignored |= 1 << TK_in; break; + case 'f': filename = optarg; break; + case 's': section_name = optarg; break; + default: fprintf(stderr, "scanner: unknown option '%c'.\n", + opt); + exit(1); + } + } + + if (optind < argc) { + const char **wm = calloc(argc - optind, sizeof(char*)); + int i; + for (i = optind; i < 
argc; i++) + wm[i - optind] = argv[i]; + qsort(wm, argc-optind, sizeof(char*), kcmp); + conf.words_marks = wm; + conf.known_count = argc - optind; } - fd = open(argv[1], O_RDONLY); + + if (filename) + fd = open(filename, O_RDONLY); + else + fd = 0; if (fd < 0) { fprintf(stderr, "scanner: cannot open %s: %s\n", - argv[1], strerror(errno)); + filename, strerror(errno)); exit(1); } len = lseek(fd, 0, 2); + if (len <= 0) { + fprintf(stderr,"scanner: %s is empty or not seekable\n", + filename ?: "stdin"); + exit(1); + } file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); table = code_extract(file, file+len, pr_err); for (s = table; s; (code_free(s->code), prev = s, s = s->next, free(prev))) { + if (section_name && + (s->section.len != strlen(section_name) || + strncmp(s->section.txt, section_name, s->section.len) != 0)) + continue; + if (section_name) + section_found = 1; printf("Tokenizing: %.*s\n", s->section.len, s->section.txt); state = token_open(s->code, &conf); @@ -1953,6 +2146,13 @@ the tokens one per line. if (tk.num == TK_eof) break; } + token_close(state); + } + if (conf.words_marks != known) + free(conf.words_marks); + if (section_name && !section_found) { + fprintf(stderr, "scanner: section %s not found\n", section_name); + errs = 1; } exit(!!errs); } @@ -1965,4 +2165,3 @@ the tokens one per line. libmdcode.o libnumber.o libstring.o -licuuc -lgmp scanner.o : scanner.c $(CC) $(CFLAGS) -c scanner.c -