X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=6b706411f5010e3da61b6214742f3db39e91de77;hb=850a39a0a761e0af89c15253f075ecd9e9ecc6ee;hp=fa27a89524a85d4eeb9a16019558e61a993afb5f;hpb=c040191336b755321af667a0251b97782d8eed71;p=ocean

diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc
index fa27a89..6b70641 100644
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified. There
 are a fixed set of token types, though particular tokens within those
 types can be distinguish via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be. The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned; an option is provided to
+return comments for further processing. The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
 	struct token_config {
 		int ignored;	// bit set of ignored tokens.
+		int return_comments;
 		## token config parameters
 	};
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted. So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known. They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted. So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known. They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
 			/* found a longest-known-mark, still need to
 			 * check for comments
 			 */
-			if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-			    (ch == '/' || ch == '*')) {
+			if (is_comment(ignored, tk.txt)) {
 				/* Yes, this is a comment, not a '/' */
 				restore_unget_state(state);
 				tk.num = TK_error;
@@ -393,22 +394,21 @@
 		prev = ch;
 		save_unget_state(state);
 		ch = get_char(state);
+		if (n >= 0)
+			/* No need to worry about other token types */
+			continue;
 		if (!(ignored & (1<= 3 &&
 	    !(ignored & (1 << TK_multi_string)) &&
 	    is_quote(tk.txt.txt[0]) &&
 	    memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
 	TK_block_comment,
 
 ###### internal functions
 
-	static int is_line_comment(struct text txt)
+	static int is_line_comment(int ignored, struct text txt)
 	{
+		if (ignored & (1 << TK_line_comment))
+			return 0;
 		return (txt.len >= 1 && txt.txt[0] == '#') ||
 		       (txt.len >= 2 && txt.txt[0] == '/' &&
 			txt.txt[1] == '/');
 	}
 
-	static int is_block_comment(struct text txt)
+	static int is_block_comment(int ignored, struct text txt)
 	{
+		if (ignored & (1 << TK_block_comment))
+			return 0;
 		return txt.len >= 2 && txt.txt[0] == '/' &&
 		       txt.txt[1] == '*';
 	}
 
+	static int is_comment(int ignored, struct text txt)
+	{
+		return is_line_comment(ignored, txt) ||
+		       is_block_comment(ignored, txt);
+	}
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
-	if (is_line_comment(tk.txt)) {
+	if (is_line_comment(ignored, tk.txt)) {
 		while (!is_newline(ch) && !at_eon(state))
 			ch = get_char(state);
 		if (is_newline(ch))
 			unget_char(state);
 		close_token(state, &tk);
 		tk.num = TK_line_comment;
-		if (ignored & (1 << TK_line_comment))
+		if (!state->conf->return_comments)
 			continue;
 		return tk;
 	}
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
-	if (is_block_comment(tk.txt)) {
+	if (is_block_comment(ignored, tk.txt)) {
 		wchar_t prev;
 		int newlines = 0;
 		reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
 			if (!is_newline(ch))
 				tk.num = TK_error;
 		}
-		if (tk.num == TK_error ||
-		    !(ignored & (1 << TK_block_comment)))
+		if (tk.num == TK_error || state->conf->return_comments)
 			return tk;
 		continue;
 	}
@@ -716,6 +725,8 @@ ignored.
 	int	indent_level;
 	int	indent_sizes[20];
 
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
 #### Newlines
 
 Newlines can optionally be reported. Newlines within a block comment
@@ -796,10 +807,20 @@ Separately we need, on each call to `token_next`, to check if there
 are some delayed tokens and if so we need to advance the state
 information and return one token.
 
+###### internal functions
+
+	static int state_indent(struct token_state *state)
+	{
+		if (state->node == NULL)
+			return state->col;
+		return state->node->indent - state->node->needs_strip + state->col;
+	}
+
 ###### white space
 
+	if (is_newline(ch))
+		state_check_node(state);
 	if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 		int newlines = 0;
-		int was_son = at_son(state);
+		int was_nl = is_newline(ch);
 		if (ignored & (1<node->next &&
-			    state->node->next->indent > state->node->indent)
-				state->col = state->node->next->indent;
-			else
-				state->col = state->node->indent;
-		} else
+		if (ch != WEOF)
 			unget_char(state);
 		state->delayed_lines = newlines;
-		state->out_next = was_son;
+		state->out_next = !was_nl;
 		state->check_indent = 1;
 		continue;
 	}
@@ -833,7 +849,7 @@ information and return one token.
 
 ###### delayed tokens
 
 	if (state->check_indent || state->delayed_lines) {
-		if (state->col < state->indent_sizes[state->indent_level]) {
+		if (state_indent(state) < state->indent_sizes[state->indent_level]) {
 			if (!state->out_next &&
 			    !(ignored & (1<<TK_newline))) {
 				state->out_next = 1;
@@ -845,11 +861,12 @@ information and return one token.
tk.num = TK_out; return tk; } - if (state->col > state->indent_sizes[state->indent_level] && + if (state_indent(state) > state->indent_sizes[state->indent_level] && state->indent_level < sizeof(state->indent_sizes)-1) { state->indent_level += 1; - state->indent_sizes[state->indent_level] = state->col; - state->delayed_lines -= 1; + state->indent_sizes[state->indent_level] = state_indent(state); + if (state->delayed_lines) + state->delayed_lines -= 1; tk.num = TK_in; return tk; } @@ -874,11 +891,6 @@ tokens will continue to return the same end-of-file token. ###### white space if (ch == WEOF) { - if (state->col) { - state->col = 0; - state->check_indent = 1; - continue; - } tk.num = TK_eof; return tk; } @@ -938,7 +950,7 @@ a flag that tells us whether or not we need to strip. ###### internal functions - static int do_strip(struct token_state *state) + static void do_strip(struct token_state *state) { int indent = 0; if (state->node->needs_strip) { @@ -954,7 +966,28 @@ a flag that tells us whether or not we need to strip. n -= 4; } } - return indent; + } + + static void state_check_node(struct token_state *state) + { + if (!state->node) + return; + if (state->node->code.len > state->offset) + return; + + do + state->node = state->node->next; + while (state->node && state->node->code.txt == NULL); + state->offset = 0; + state->prev_offset = 0; + state->strip_offset = 0; + state->col = 0; + if (state->node == NULL) + return; + state->line = state->node->line_no; + do_strip(state); + state->col = state->node->needs_strip; + state->strip_offset = state->offset; } static wint_t get_char(struct token_state *state) @@ -963,19 +996,9 @@ a flag that tells us whether or not we need to strip. size_t n; mbstate_t mbstate; + state_check_node(state); if (state->node == NULL) return WEOF; - if (state->node->code.len <= state->offset) { - do - state->node = state->node->next; - while (state->node && state->node->code.txt == NULL); - state->offset = 0; - if (state->node == NULL) - return WEOF; - state->line = state->node->line_no; - state->col = do_strip(state); - state->strip_offset = state->offset; - } ## before get_char @@ -986,12 +1009,12 @@ a flag that tells us whether or not we need to strip. &mbstate); if (n == -2 || n == 0) { /* Not enough bytes - not really possible */ - next = '\n'; - state->offset = state->node->code.len; + next = '\n'; // NOTEST + state->offset = state->node->code.len; // NOTEST } else if (n == -1) { /* error */ - state->offset += 1; - next = 0x7f; // an illegal character + state->offset += 1; // NOTEST + next = 0x7f; // an illegal character // NOTEST } else state->offset += n; @@ -999,7 +1022,8 @@ a flag that tells us whether or not we need to strip. 
state->col += 1; } else if (is_newline(next)) { state->line += 1; - state->col = do_strip(state); + do_strip(state); + state->col = state->node->needs_strip; } else if (next == '\t') { state->col = indent_tab(state->col); } @@ -1211,7 +1235,8 @@ As well as getting tokens, we need to be able to create the memset(state, 0, sizeof(*state)); state->node = code; state->line = code->line_no; - state->col = do_strip(state); + do_strip(state); + state->col = state->node->needs_strip; state->strip_offset = state->offset; state->conf = conf; return state; @@ -1654,7 +1679,7 @@ Number parsing goes in `libnumber.c` ## number includes ## number functions -###### File: number.h +###### File: parse_number.h int number_parse(mpq_t num, char tail[3], struct text tok); ###### File: scanner.mk @@ -1938,7 +1963,7 @@ String parsing goes in `libstring.c` ## string functions ## string main -###### File: string.h +###### File: parse_string.h int string_parse(struct token *tok, char escape, struct text *str, char tail[3]); @@ -1967,8 +1992,8 @@ the tokens one per line. #include #include "mdcode.h" #include "scanner.h" - #include "number.h" - #include "string.h" + #include "parse_number.h" + #include "parse_string.h" static int errs; static void pr_err(char *msg) @@ -2021,13 +2046,17 @@ the tokens one per line. { "ignore-newline", 0, NULL, 'l'}, { "ignore-block-comment", 0, NULL, 'C'}, { "ignore-indent", 0, NULL, 'i'}, + { "return-comments", 0, NULL, 'r'}, { "file", 1, NULL, 'f'}, + { "section", 1, NULL, 's'}, { NULL, 0, NULL, 0}, }; - static const char options[] = "W:w:n:NIMSzclCif:"; + static const char options[] = "W:w:n:NIMSzclCirf:s:"; struct section *table, *s, *prev; int opt; + char *section_name = NULL; + int section_found = 0; setlocale(LC_ALL,""); while ((opt = getopt_long(argc, argv, options, long_options, NULL)) @@ -2045,7 +2074,9 @@ the tokens one per line. case 'C': conf.ignored |= 1 << TK_block_comment; break; case 'l': conf.ignored |= 1 << TK_newline; break; case 'i': conf.ignored |= 1 << TK_in; break; + case 'r': conf.return_comments = 1; break; case 'f': filename = optarg; break; + case 's': section_name = optarg; break; default: fprintf(stderr, "scanner: unknown option '%c'.\n", opt); exit(1); @@ -2082,6 +2113,12 @@ the tokens one per line. for (s = table; s; (code_free(s->code), prev = s, s = s->next, free(prev))) { + if (section_name && + (s->section.len != strlen(section_name) || + strncmp(s->section.txt, section_name, s->section.len) != 0)) + continue; + if (section_name) + section_found = 1; printf("Tokenizing: %.*s\n", s->section.len, s->section.txt); state = token_open(s->code, &conf); @@ -2124,6 +2161,10 @@ the tokens one per line. } if (conf.words_marks != known) free(conf.words_marks); + if (section_name && !section_found) { + fprintf(stderr, "scanner: section %s not found\n", section_name); + errs = 1; + } exit(!!errs); } ###### File: scanner.mk
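
As an illustration (this is not part of the patch above): the sketch below shows how a caller might use the new `return_comments` flag together with `token_open()`. It is a minimal sketch only - the helper name `dump_comments` is invented for the example, mdcode's `struct code_node` is assumed as the input type, `token_next()` is assumed to return a `struct token` by value as the test program in this file suggests, and error handling and teardown are omitted. Note that the comment token types must not also be set in `ignored`: with this patch, `is_line_comment()` and `is_block_comment()` refuse to recognise ignored comment types, so the text would instead be scanned as marks.

	#include <stdio.h>
	#include "mdcode.h"
	#include "scanner.h"

	/* Tokenize one section of code and print every comment found,
	 * quietly skipping all other tokens until end-of-file.
	 */
	static void dump_comments(struct code_node *code)
	{
		struct token_config conf = {
			.ignored         = 0,	/* keep comments recognisable */
			.return_comments = 1,	/* new option added by this patch */
		};
		struct token_state *state = token_open(code, &conf);
		struct token tk;

		do {
			tk = token_next(state);
			if (tk.num == TK_line_comment ||
			    tk.num == TK_block_comment)
				printf("%.*s\n", tk.txt.len, tk.txt.txt);
		} while (tk.num != TK_eof);
		/* teardown of "state" omitted for brevity */
	}

With `return_comments` left at zero, comments are still parsed (so the leading '/' of a comment introducer is never mis-read as a mark) but are silently discarded, matching the behaviour described in the hunks above.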