X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=9a1ea7144b612a6624e39d799921c37682d46c99;hb=5513fc2e3fb56bdf2292e834077e0c33f9a5c2a9;hp=7e33d0cbc1a89126706d2f72ce1f2351e0d395ee;hpb=4d085c0f91408abb43eeeddd022b13569e3682a4;p=ocean diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc index 7e33d0c..9a1ea71 100644 --- a/csrc/scanner.mdc +++ b/csrc/scanner.mdc @@ -19,7 +19,7 @@ The text is assumed to be UTF-8 though some matching assumes the ASCII subset. If the text provided does not conform to UTF-8 an error will be reported and some number of bytes will be skipped. -###### includes +###### public types #include #include #include @@ -90,7 +90,7 @@ The different tokens are numbers, words, marks, strings, comments, newlines, EOF, and indents, each of which is examined in detail below. There are various cases where no token can be found in part of the -input. All of these will be reported as an `TK_error` token. +input. All of these will be reported as a `TK_error` token. It is possible to declare a number of strings which form distinct tokens (rather than being grouped as e.g. 'word'). These are given @@ -106,7 +106,7 @@ token numbers from `TK_reserved` upwards. ### Numbers Numbers are the messiest tokens to parse, primarily because they can -contain characters that also have meaning outside of number and, +contain characters that also have meaning outside of numbers and, particularly, immediately after numbers. The obvious example is the '`-`' sign. It can come inside a number for @@ -177,11 +177,11 @@ are declared to be a start character for words. int sign_ok = 0; switch(expect_p) { case 0: - if (ch == 'e') + if (ch == 'e' || ch == 'E') sign_ok = 1; break; case 1: - if (ch == 'p') + if (ch == 'p' || ch == 'P') sign_ok = 1; break; } @@ -260,6 +260,9 @@ and the length of the list must be given (`known_count`). Tokens matching these known words are reported as the index of the list added to `TK_reserved`. +If identifiers are ignored, then any word which is not listed as a +known word results in an error. + ###### token config parameters const char **words_marks; int known_count; @@ -321,10 +324,17 @@ below before giving up and assuming an unknown mark. If an unknown mark contains a quote character or a comment marker, and that token is not being ignored, then we terminate the unknown mark -before that quote or comment. This ensure that an unknown mark +before that quote or comment. This ensures that an unknown mark immediately before a string is handled correctly. -If `TK_mark` is ignored, then unknown marks as returned as an error. +If the first character of a comment marker (i.e. '/') is a known mark, +the above rules would suggest that the start of a comment would be +parsed as that mark, which is not what is wanted. So the introductory +sequences for a comment ("//" and "/*") are treated as +partially-known. They prevent the leading "/" from being a mark by +itself, but do not actually constitute a stand-alone mark. + +If `TK_mark` is ignored, then unknown marks are returned as errors. ###### token types TK_mark, @@ -341,31 +351,56 @@ Known marks are included in the same list as the list of known words. 
if (n >= 0) tk.num = TK_reserved + n; else if (tk.num != TK_error) { - /* found a longest-known-mark */ + /* found a longest-known-mark, still need to + * check for comments + */ + if (tk.txt.len == 2 && tk.txt.txt[0] == '/' && + (ch == '/' || ch == '*')) { + /* Yes, this is a comment, not a '/' */ + restore_unget_state(state); + tk.num = TK_error; + break; + } unget_char(state); close_token(state, &tk); return tk; } prev = ch; - if (prev == '/') - save_unget_state(state); + save_unget_state(state); ch = get_char(state); - if (!(ignored && (1<delayed_lines = newlines; - state->undent_next = was_son; + state->out_next = was_son; state->check_indent = 1; continue; } @@ -773,15 +819,15 @@ information and return one token. if (state->check_indent || state->delayed_lines) { if (state->col < state->indent_sizes[state->indent_level]) { - if (!state->undent_next && + if (!state->out_next && !(ignored & (1<undent_next = 1; + state->out_next = 1; tk.num = TK_newline; return tk; } state->indent_level -= 1; - state->undent_next = 0; - tk.num = TK_undent; + state->out_next = 0; + tk.num = TK_out; return tk; } if (state->col > state->indent_sizes[state->indent_level] && @@ -789,7 +835,7 @@ information and return one token. state->indent_level += 1; state->indent_sizes[state->indent_level] = state->col; state->delayed_lines -= 1; - tk.num = TK_indent; + tk.num = TK_in; return tk; } state->check_indent = 0; @@ -814,6 +860,11 @@ tokens will continue to return the same end-of-file token. ###### white space if (ch == WEOF) { + if (state->col) { + state->col = 0; + state->check_indent = 1; + continue; + } tk.num = TK_eof; return tk; } @@ -858,19 +909,23 @@ a flag that tells us whether or not we need to strip. ###### internal functions - static void do_strip(struct token_state *state) + static int do_strip(struct token_state *state) { + int indent = 0; if (state->node->needs_strip) { int n = 4; while (n && state->node->code.txt[state->offset] == ' ') { + indent += 1; state->offset += 1; n -= 1; } while (n == 4 && state->node->code.txt[state->offset] == '\t') { + indent = indent_tab(indent); state->offset += 1; n -= 4; } } + return indent; } static wint_t get_char(struct token_state *state) @@ -888,9 +943,8 @@ a flag that tells us whether or not we need to strip. state->offset = 0; if (state->node == NULL) return WEOF; - do_strip(state); state->line = state->node->line_no; - state->col = state->node->indent; + state->col = do_strip(state); } ## before get_char @@ -915,8 +969,7 @@ a flag that tells us whether or not we need to strip. state->col += 1; } else if (is_newline(next)) { state->line += 1; - state->col = state->node->indent; - do_strip(state); + state->col = do_strip(state); } else if (next == '\t') { state->col = indent_tab(state->col); } @@ -1019,8 +1072,11 @@ parsed too much already. For that there is `reset_token`. static void close_token(struct token_state *state, struct token *tk) { - tk->txt.len = (state->node->code.txt + state->offset) - - tk->txt.txt; + if (state->node != tk->node) + tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt); + else + tk->txt.len = (state->node->code.txt + state->offset) + - tk->txt.txt; } static void reset_token(struct token_state *state, struct token *tok) @@ -1090,7 +1146,7 @@ searching for. Now we have all the bits there is just one section missing: combining all the token parsing code into one block. 
-The handling of delayed tokens (newlines, indents, undents) must come +The handling of delayed tokens (Newlines, INs, OUTs) must come first before we try getting another character. Then we parse all the test, making sure that we check for known marks @@ -1109,9 +1165,6 @@ loop. ## parse number ## parse word ## parse mark - ## parse string - ## parse comment - ## unknown mark ### Start and stop @@ -1129,9 +1182,8 @@ As well as getting tokens, we need to be able to create the memset(state, 0, sizeof(*state)); state->node = code; state->line = code->line_no; - state->col = code->indent; + state->col = do_strip(state); state->conf = conf; - do_strip(state); return state; } void token_close(struct token_state *state) @@ -1194,8 +1246,8 @@ so that it can be used to tracing processed strings too. [TK_multi_string] = "mstring", [TK_line_comment] = "lcomment", [TK_block_comment] = "bcomment", - [TK_indent] = "indent", - [TK_undent] = "undent", + [TK_in] = "in", + [TK_out] = "out", [TK_newline] = "newline", [TK_eof] = "eof", [TK_error] = "ERROR", @@ -1205,8 +1257,8 @@ so that it can be used to tracing processed strings too. default: /* known word or mark */ fprintf(f, "%.*s", tok.txt.len, tok.txt.txt); break; - case TK_indent: - case TK_undent: + case TK_in: + case TK_out: case TK_newline: case TK_eof: /* No token text included */ @@ -1741,7 +1793,7 @@ required indent is found. if (c == ' ') skipped += 1; else if (c == '\t') - skipped = indent_tab(c); + skipped = indent_tab(skipped); else break; i+= 1; @@ -1885,6 +1937,7 @@ the tokens one per line. #include #include #include + #include #include "mdcode.h" #include "scanner.h" #include "number.h" @@ -1897,11 +1950,19 @@ the tokens one per line. fprintf(stderr, "%s\n", msg); } + static int kcmp(const void *ap, const void *bp) + { + char * const *a = ap; + char * const *b = bp; + return strcmp(*a, *b); + } + int main(int argc, char *argv[]) { int fd; int len; char *file; + char *filename = NULL; struct token_state *state; const char *known[] = { "==", @@ -1918,22 +1979,77 @@ the tokens one per line. 
.words_marks = known, .number_chars = "., _+-", .known_count = sizeof(known)/sizeof(known[0]), - .ignored = (0 << TK_line_comment) - |(0 << TK_block_comment), + .ignored = 0, + }; + static const struct option long_options[] = { + { "word-start", 1, NULL, 'W'}, + { "word-cont", 1, NULL, 'w'}, + { "number-chars", 1, NULL, 'n'}, + { "ignore-numbers", 0, NULL, 'N'}, + { "ignore-ident", 0, NULL, 'I'}, + { "ignore-marks", 0, NULL, 'M'}, + { "ignore-strings", 0, NULL, 'S'}, + { "ignore-multi-strings",0, NULL, 'z'}, + { "ignore-line-comment",0, NULL, 'c'}, + { "ignore-newline", 0, NULL, 'l'}, + { "ignore-block-comment", 0, NULL, 'C'}, + { "ignore-indent", 0, NULL, 'i'}, + { "file", 1, NULL, 'f'}, + { NULL, 0, NULL, 0}, }; + static const char options[] = "W:w:n:NIMSzclCif:"; + struct section *table, *s, *prev; + int opt; + setlocale(LC_ALL,""); - if (argc != 2) { - fprintf(stderr, "Usage: scanner file\n"); - exit(2); + while ((opt = getopt_long(argc, argv, options, long_options, NULL)) + != -1) { + switch(opt) { + case 'W': conf.word_start = optarg; break; + case 'w': conf.word_cont = optarg; break; + case 'n': conf.number_chars = optarg; break; + case 'N': conf.ignored |= 1 << TK_number; break; + case 'I': conf.ignored |= 1 << TK_ident; break; + case 'M': conf.ignored |= 1 << TK_mark; break; + case 'S': conf.ignored |= 1 << TK_string; break; + case 'z': conf.ignored |= 1 << TK_multi_string; break; + case 'c': conf.ignored |= 1 << TK_line_comment; break; + case 'C': conf.ignored |= 1 << TK_block_comment; break; + case 'l': conf.ignored |= 1 << TK_newline; break; + case 'i': conf.ignored |= 1 << TK_in; break; + case 'f': filename = optarg; break; + default: fprintf(stderr, "scanner: unknown option '%c'.\n", + opt); + exit(1); + } } - fd = open(argv[1], O_RDONLY); + + if (optind < argc) { + const char **wm = calloc(argc - optind, sizeof(char*)); + int i; + for (i = optind; i < argc; i++) + wm[i - optind] = argv[i]; + qsort(wm, argc-optind, sizeof(char*), kcmp); + conf.words_marks = wm; + conf.known_count = argc - optind; + } + + if (filename) + fd = open(filename, O_RDONLY); + else + fd = 0; if (fd < 0) { fprintf(stderr, "scanner: cannot open %s: %s\n", - argv[1], strerror(errno)); + filename, strerror(errno)); exit(1); } len = lseek(fd, 0, 2); + if (len <= 0) { + fprintf(stderr,"scanner: %s is empty or not seekable\n", + filename ?: "stdin"); + exit(1); + } file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); table = code_extract(file, file+len, pr_err); @@ -1977,7 +2093,10 @@ the tokens one per line. if (tk.num == TK_eof) break; } + token_close(state); } + if (conf.words_marks != known) + free(conf.words_marks); exit(!!errs); } ###### File: scanner.mk @@ -1989,4 +2108,3 @@ the tokens one per line. libmdcode.o libnumber.o libstring.o -licuuc -lgmp scanner.o : scanner.c $(CC) $(CFLAGS) -c scanner.c -
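
The hunk near the top of this patch that accepts 'E' and 'P' as well as 'e' and 'p' enforces a simple rule: a '+' or '-' only continues a number when it immediately follows an exponent introducer, 'e'/'E' in a decimal literal or 'p'/'P' in a hex literal (where 'e' is just another digit). The following stand-alone sketch is not part of scanner.mdc and the helper name is invented; it only models that rule so it can be exercised in isolation.

	#include <stdio.h>

	/* Hypothetical helper, not taken from scanner.mdc: decide whether a
	 * '+' or '-' at position i continues the number literal in txt.
	 * A sign is only allowed directly after an exponent introducer:
	 * 'e'/'E' for decimal literals, 'p'/'P' for hex literals.
	 */
	static int sign_continues_number(const char *txt, int i)
	{
		int is_hex = txt[0] == '0' && (txt[1] == 'x' || txt[1] == 'X');
		char prev;

		if (i == 0)
			return 0;
		prev = txt[i-1];
		if (is_hex)
			return prev == 'p' || prev == 'P';
		else
			return prev == 'e' || prev == 'E';
	}

	int main(void)
	{
		const char *samples[] = { "1e-5", "1E-5", "0x1p-3", "0x1e-1", "12-3" };
		int i, j;

		for (i = 0; i < 5; i++)
			for (j = 0; samples[i][j]; j++)
				if (samples[i][j] == '+' || samples[i][j] == '-')
					printf("%-7s sign at %d %s the number\n",
					       samples[i], j,
					       sign_continues_number(samples[i], j)
					       ? "continues" : "ends");
		return 0;
	}

Running it reports that the '-' in `0x1e-1` ends the number, because in a hex literal 'e' is a digit rather than an exponent marker, which matches the distinction the `expect_p` switch in the patch draws between decimal and hex numbers.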
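
Another group of hunks changes `do_strip()` to return the amount of indentation it stripped, so that `state->col` can be seeded from its return value rather than from `node->indent`. That behaviour is easy to model outside the scanner. In the sketch below, `strip_one_level()` is an invented stand-alone counterpart of `do_strip()`, and `indent_tab()` is assumed to advance to the next multiple of eight; the real `indent_tab()` is defined elsewhere in scanner.mdc, so that definition is only a stand-in.

	#include <stdio.h>

	/* Assumed for this sketch: a tab advances to the next tab stop,
	 * with a stop every eight columns.  scanner.mdc has its own
	 * indent_tab(); this one merely stands in for it.
	 */
	static int indent_tab(int indent)
	{
		return (indent | 7) + 1;
	}

	/* Stand-alone model of the reworked do_strip(): consume one level
	 * of document indentation (up to four spaces, or a single tab if
	 * no spaces were found) and report the column width it occupied,
	 * so the caller can use the return value as the starting column.
	 */
	static int strip_one_level(const char *line, int *offset)
	{
		int indent = 0;
		int n = 4;

		while (n && line[*offset] == ' ') {
			indent += 1;
			*offset += 1;
			n -= 1;
		}
		while (n == 4 && line[*offset] == '\t') {
			indent = indent_tab(indent);
			*offset += 1;
			n -= 4;
		}
		return indent;
	}

	int main(void)
	{
		const char *samples[] = { "    code", "\tcode", "  code" };
		int i;

		for (i = 0; i < 3; i++) {
			int off = 0;
			int col = strip_one_level(samples[i], &off);
			printf("stripped %d byte(s), code starts at column %d\n",
			       off, col);
		}
		return 0;
	}

With the three samples in `main()` the reported starting columns are 4, 8 and 2: a tab and four spaces occupy different widths even though both count as one stripped level, which is why returning the width from the strip step is more reliable than a fixed per-node indent.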
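
In the test program, extra known words and marks given on the command line are collected into an array and sorted with `qsort()` before being handed to the scanner, presumably so the lookup can rely on a sorted list. Because `qsort()` passes its comparator pointers to the array *elements*, and each element here is itself a `char *`, the comparator receives `char **` values and must dereference once before calling `strcmp()`; that is all `kcmp()` does. Below is a minimal stand-alone demonstration of the same idiom, with an invented word list.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Comparator for an array of strings: each argument points at an
	 * element of the array, i.e. at a char*, so dereference once
	 * before comparing the strings themselves.
	 */
	static int kcmp(const void *ap, const void *bp)
	{
		char * const *a = ap;
		char * const *b = bp;
		return strcmp(*a, *b);
	}

	int main(void)
	{
		char *words[] = { "if", "then", "==", "and", "else" };
		int n = sizeof(words) / sizeof(words[0]);
		int i;

		qsort(words, n, sizeof(words[0]), kcmp);
		for (i = 0; i < n; i++)
			printf("%s\n", words[i]);
		return 0;
	}

Passing `strcmp` itself as the comparator would treat the element pointers as the strings to compare, so the small wrapper is required whenever the array holds pointers rather than the character data itself.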