X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=6eee0892deb07e78fb3f6de7497a810515a313c7;hb=91f07aed5745363d8d612c600dd66e863088f978;hp=d15b44b88d200984552805d856d50e3ff8730e85;hpb=47a08b42df1255227f54a2846404649ee4de6e98;p=ocean diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc index d15b44b..6eee089 100644 --- a/csrc/scanner.mdc +++ b/csrc/scanner.mdc @@ -106,7 +106,7 @@ token numbers from `TK_reserved` upwards. ### Numbers Numbers are the messiest tokens to parse, primarily because they can -contain characters that also have meaning outside of number and, +contain characters that also have meaning outside of numbers and, particularly, immediately after numbers. The obvious example is the '`-`' sign. It can come inside a number for @@ -260,7 +260,7 @@ and the length of the list must be given (`known_count`). Tokens matching these known words are reported as the index of the list added to `TK_reserved`. -If identifiers are ignored, then any work which is not listed as a +If identifiers are ignored, then any word which is not listed as a known word results in an error. ###### token config parameters @@ -324,10 +324,17 @@ below before giving up and assuming an unknown mark. If an unknown mark contains a quote character or a comment marker, and that token is not being ignored, then we terminate the unknown mark -before that quote or comment. This ensure that an unknown mark +before that quote or comment. This ensures that an unknown mark immediately before a string is handled correctly. -If `TK_mark` is ignored, then unknown marks as returned as an error. +If the first character of a comment marker (i.e. '/') is a known mark, +the above rules would suggest that the start of a comment would be +parsed as that mark, which is not what is wanted. So the introductory +sequences for a comment ("//" and "/*") are treated as +partially-known. They prevent the leading "/" from being a mark by +itself, but do not actually constitute a stand-alone mark. + +If `TK_mark` is ignored, then unknown marks are returned as errors. ###### token types TK_mark, @@ -344,7 +351,16 @@ Known marks are included in the same list as the list of known words. if (n >= 0) tk.num = TK_reserved + n; else if (tk.num != TK_error) { - /* found a longest-known-mark */ + /* found a longest-known-mark, still need to + * check for comments + */ + if (tk.txt.len == 2 && tk.txt.txt[0] == '/' && + (ch == '/' || ch == '*')) { + /* Yes, this is a comment, not a '/' */ + restore_unget_state(state); + tk.num = TK_error; + break; + } unget_char(state); close_token(state, &tk); return tk; @@ -352,15 +368,22 @@ Known marks are included in the same list as the list of known words. prev = ch; save_unget_state(state); ch = get_char(state); - if (!(ignored && (1< 1) { + if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) { + close_token(state, &tk); restore_unget_state(state); break; } - if (prev == '/' && ch == '*' && tk.txt.len > 1) { + if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) { + close_token(state, &tk); restore_unget_state(state); break; } @@ -379,15 +402,6 @@ before assuming that we have an unknown mark ## parse comment ## unknown mark -###### unknown mark - if (tk.txt.len) { - if (ignored & (1<node->needs_strip) { int n = 4; while (n && state->node->code.txt[state->offset] == ' ') { + indent += 1; state->offset += 1; n -= 1; } while (n == 4 && state->node->code.txt[state->offset] == '\t') { + indent = indent_tab(indent); state->offset += 1; n -= 4; } } + return indent; } static wint_t get_char(struct token_state *state) @@ -912,9 +937,8 @@ a flag that tells us whether or not we need to strip. state->offset = 0; if (state->node == NULL) return WEOF; - do_strip(state); state->line = state->node->line_no; - state->col = state->node->indent; + state->col = do_strip(state); } ## before get_char @@ -939,8 +963,7 @@ a flag that tells us whether or not we need to strip. state->col += 1; } else if (is_newline(next)) { state->line += 1; - state->col = state->node->indent; - do_strip(state); + state->col = do_strip(state); } else if (next == '\t') { state->col = indent_tab(state->col); } @@ -1043,8 +1066,11 @@ parsed too much already. For that there is `reset_token`. static void close_token(struct token_state *state, struct token *tk) { - tk->txt.len = (state->node->code.txt + state->offset) - - tk->txt.txt; + if (state->node != tk->node) + tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt); + else + tk->txt.len = (state->node->code.txt + state->offset) + - tk->txt.txt; } static void reset_token(struct token_state *state, struct token *tok) @@ -1150,9 +1176,8 @@ As well as getting tokens, we need to be able to create the memset(state, 0, sizeof(*state)); state->node = code; state->line = code->line_no; - state->col = code->indent; + state->col = do_strip(state); state->conf = conf; - do_strip(state); return state; } void token_close(struct token_state *state) @@ -1762,7 +1787,7 @@ required indent is found. if (c == ' ') skipped += 1; else if (c == '\t') - skipped = indent_tab(c); + skipped = indent_tab(skipped); else break; i+= 1; @@ -1906,6 +1931,7 @@ the tokens one per line. #include #include #include + #include #include "mdcode.h" #include "scanner.h" #include "number.h" @@ -1918,11 +1944,19 @@ the tokens one per line. fprintf(stderr, "%s\n", msg); } + static int kcmp(const void *ap, const void *bp) + { + char * const *a = ap; + char * const *b = bp; + return strcmp(*a, *b); + } + int main(int argc, char *argv[]) { int fd; int len; char *file; + char *filename = NULL; struct token_state *state; const char *known[] = { "==", @@ -1939,22 +1973,77 @@ the tokens one per line. .words_marks = known, .number_chars = "., _+-", .known_count = sizeof(known)/sizeof(known[0]), - .ignored = (0 << TK_line_comment) - |(0 << TK_block_comment), + .ignored = 0, + }; + static const struct option long_options[] = { + { "word-start", 1, NULL, 'W'}, + { "word-cont", 1, NULL, 'w'}, + { "number-chars", 1, NULL, 'n'}, + { "ignore-numbers", 0, NULL, 'N'}, + { "ignore-ident", 0, NULL, 'I'}, + { "ignore-marks", 0, NULL, 'M'}, + { "ignore-strings", 0, NULL, 'S'}, + { "ignore-multi-strings",0, NULL, 'z'}, + { "ignore-line-comment",0, NULL, 'c'}, + { "ignore-newline", 0, NULL, 'l'}, + { "ignore-block-comment", 0, NULL, 'C'}, + { "ignore-indent", 0, NULL, 'i'}, + { "file", 1, NULL, 'f'}, + { NULL, 0, NULL, 0}, }; + static const char options[] = "W:w:n:NIMSzclCif:"; + struct section *table, *s, *prev; + int opt; + setlocale(LC_ALL,""); - if (argc != 2) { - fprintf(stderr, "Usage: scanner file\n"); - exit(2); + while ((opt = getopt_long(argc, argv, options, long_options, NULL)) + != -1) { + switch(opt) { + case 'W': conf.word_start = optarg; break; + case 'w': conf.word_cont = optarg; break; + case 'n': conf.number_chars = optarg; break; + case 'N': conf.ignored |= 1 << TK_number; break; + case 'I': conf.ignored |= 1 << TK_ident; break; + case 'M': conf.ignored |= 1 << TK_mark; break; + case 'S': conf.ignored |= 1 << TK_string; break; + case 'z': conf.ignored |= 1 << TK_multi_string; break; + case 'c': conf.ignored |= 1 << TK_line_comment; break; + case 'C': conf.ignored |= 1 << TK_block_comment; break; + case 'l': conf.ignored |= 1 << TK_newline; break; + case 'i': conf.ignored |= 1 << TK_in; break; + case 'f': filename = optarg; break; + default: fprintf(stderr, "scanner: unknown option '%c'.\n", + opt); + exit(1); + } + } + + if (optind < argc) { + const char **wm = calloc(argc - optind, sizeof(char*)); + int i; + for (i = optind; i < argc; i++) + wm[i - optind] = argv[i]; + qsort(wm, argc-optind, sizeof(char*), kcmp); + conf.words_marks = wm; + conf.known_count = argc - optind; } - fd = open(argv[1], O_RDONLY); + + if (filename) + fd = open(filename, O_RDONLY); + else + fd = 0; if (fd < 0) { fprintf(stderr, "scanner: cannot open %s: %s\n", - argv[1], strerror(errno)); + filename, strerror(errno)); exit(1); } len = lseek(fd, 0, 2); + if (len <= 0) { + fprintf(stderr,"scanner: %s is empty or not seekable\n", + filename ?: "stdin"); + exit(1); + } file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); table = code_extract(file, file+len, pr_err); @@ -1998,7 +2087,10 @@ the tokens one per line. if (tk.num == TK_eof) break; } + token_close(state); } + if (conf.words_marks != known) + free(conf.words_marks); exit(!!errs); } ###### File: scanner.mk @@ -2010,4 +2102,3 @@ the tokens one per line. libmdcode.o libnumber.o libstring.o -licuuc -lgmp scanner.o : scanner.c $(CC) $(CFLAGS) -c scanner.c -