X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=6b706411f5010e3da61b6214742f3db39e91de77;hp=b57db29c0b38fcefead7e0cc4cafb41494d1264e;hb=850a39a0a761e0af89c15253f075ecd9e9ecc6ee;hpb=86e54542650f7f72a04c70618e07091f905398c8 diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc index b57db29..6b70641 100644 --- a/csrc/scanner.mdc +++ b/csrc/scanner.mdc @@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified. There are a fixed set of token types, though particular tokens within those types can be distinguish via configuration. -Most token types may be explicitly ignored, as typically comments -would be. The exact consequence of ignoring each token type varies -from token to token. +Most token types may be explicitly ignored, so they aren't parsed. +Comments are typically parsed but not returned, but an option is provided to +return comments for further processing. The exact consequence of +ignoring each token type varies from token to token. ###### public types struct token_config { int ignored; // bit set of ignored tokens. + int return_comments; ## token config parameters }; @@ -354,10 +356,10 @@ immediately before a string is handled correctly. If the first character of a comment marker (i.e. '/') is a known mark, the above rules would suggest that the start of a comment would be -parsed as that mark, which is not what is wanted. So the introductory -sequences for a comment ("//" and "/*") are treated as -partially-known. They prevent the leading "/" from being a mark by -itself, but do not actually constitute a stand-alone mark. +parsed as that mark, which is not what is wanted. So when comments are +not ignored, the introductory sequences for a comment ("//" and "/*") +are treated as partially-known. They prevent the leading "/" from being +a mark by itself, but do not actually constitute a stand-alone mark. If `TK_mark` is ignored, then unknown marks are returned as errors. 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words. /* found a longest-known-mark, still need to * check for comments */ - if (tk.txt.len == 2 && tk.txt.txt[0] == '/' && - (ch == '/' || ch == '*')) { + if (is_comment(ignored, tk.txt)) { /* Yes, this is a comment, not a '/' */ restore_unget_state(state); tk.num = TK_error; @@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words. prev = ch; save_unget_state(state); ch = get_char(state); + if (n >= 0) + /* No need to worry about other token types */ + continue; if (!(ignored & (1<= 3 && !(ignored & (1 << TK_multi_string)) && is_quote(tk.txt.txt[0]) && memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 && @@ -573,19 +573,29 @@ still parsed, but is discarded. TK_block_comment, ###### internal functions - static int is_line_comment(struct text txt) + static int is_line_comment(int ignored, struct text txt) { + if (ignored & (1 << TK_line_comment)) + return 0; return (txt.len >= 1 && txt.txt[0] == '#') || (txt.len >= 2 && txt.txt[0] == '/' && txt.txt[1] == '/'); } - static int is_block_comment(struct text txt) + static int is_block_comment(int ignored, struct text txt) { + if (ignored & (1 << TK_block_comment)) + return 0; return txt.len >= 2 && txt.txt[0] == '/' && txt.txt[1] == '*'; } + static int is_comment(int ignored, struct text txt) + { + return is_line_comment(ignored, txt) || + is_block_comment(ignored, txt); + } + #### Single line comments A single-line comment continues up to, but not including the newline @@ -593,14 +603,14 @@ or end of node. ###### parse comment - if (is_line_comment(tk.txt)) { + if (is_line_comment(ignored, tk.txt)) { while (!is_newline(ch) && !at_eon(state)) ch = get_char(state); if (is_newline(ch)) unget_char(state); close_token(state, &tk); tk.num = TK_line_comment; - if (ignored & (1 << TK_line_comment)) + if (!state->conf->return_comments) continue; return tk; } @@ -617,7 +627,7 @@ the unget state (explained later). 
###### parse comment - if (is_block_comment(tk.txt)) { + if (is_block_comment(ignored, tk.txt)) { wchar_t prev; int newlines = 0; reset_token(state, &tk); @@ -655,8 +665,7 @@ the unget state (explained later). if (!is_newline(ch)) tk.num = TK_error; } - if (tk.num == TK_error || - !(ignored & (1 << TK_block_comment))) + if (tk.num == TK_error || state->conf->return_comments) return tk; continue; } @@ -2037,11 +2046,12 @@ the tokens one per line. { "ignore-newline", 0, NULL, 'l'}, { "ignore-block-comment", 0, NULL, 'C'}, { "ignore-indent", 0, NULL, 'i'}, + { "return-comments", 0, NULL, 'r'}, { "file", 1, NULL, 'f'}, { "section", 1, NULL, 's'}, { NULL, 0, NULL, 0}, }; - static const char options[] = "W:w:n:NIMSzclCif:s:"; + static const char options[] = "W:w:n:NIMSzclCirf:s:"; struct section *table, *s, *prev; int opt; @@ -2064,6 +2074,7 @@ the tokens one per line. case 'C': conf.ignored |= 1 << TK_block_comment; break; case 'l': conf.ignored |= 1 << TK_newline; break; case 'i': conf.ignored |= 1 << TK_in; break; + case 'r': conf.return_comments = 1; break; case 'f': filename = optarg; break; case 's': section_name = optarg; break; default: fprintf(stderr, "scanner: unknown option '%c'.\n",