]> ocean-lang.org Git - ocean/commitdiff
scanner: change the meaning of ignoring comment tokens.
authorNeilBrown <neil@brown.name>
Tue, 6 Oct 2020 04:44:46 +0000 (15:44 +1100)
committerNeilBrown <neil@brown.name>
Tue, 6 Oct 2020 04:53:50 +0000 (15:53 +1100)
Previously ignoring comment tokens meant they were still parsed, but not
returned.  The only way to stop them being parsed was to declare
known marks for the start symbols.

This made it impossible for parsergen to define a language that had
a known mark that would otherwise start a comment.

So change the ignoring of comment tokens to mean they aren't parsed.  If
you want to parse comments but not return them, leave the new
"return_comments" field as 0.  In the unusual case that you want to
return comments, set return_comments to 1.

Confirm that this has the desired effect by adding "//" as an
integer-division operator to the sample calculator.

Signed-off-by: NeilBrown <neil@brown.name>
csrc/indent_test.mdc
csrc/oceani.mdc
csrc/parsergen.mdc
csrc/scanner-tests.mdc
csrc/scanner.mdc

index 960f63eb8122a2b8791952498e5c7b42832d2029..3df78d08a937245deb0fbbc6f821ec65a9736c05 100644 (file)
@@ -114,8 +114,6 @@ with complete bracketing and indenting.
                char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                struct section *s = code_extract(file, file+len, NULL);
                struct token_config config = {
                char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                struct section *s = code_extract(file, file+len, NULL);
                struct token_config config = {
-                       .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment),
                        .number_chars = ".,_+-",
                        .word_start = "",
                        .word_cont = "",
                        .number_chars = ".,_+-",
                        .word_start = "",
                        .word_cont = "",
index 2d17598658b51ecfb20da1878fd1190f493d3f8a..ab6abe91f67156e03a0e81926897d320e652b04a 100644 (file)
@@ -171,9 +171,7 @@ structures can be used.
                char *section = NULL;
                struct parse_context context = {
                        .config = {
                char *section = NULL;
                struct parse_context context = {
                        .config = {
-                               .ignored = (1 << TK_line_comment)
-                                        | (1 << TK_block_comment)
-                                        | (1 << TK_mark),
+                               .ignored = (1 << TK_mark),
                                .number_chars = ".,_+- ",
                                .word_start = "_",
                                .word_cont = "_",
                                .number_chars = ".,_+- ",
                                .word_start = "_",
                                .word_cont = "_",
index 803d9b6e87e0d21ac7029617bd7b8fa79c292cff..3f36df9a6873c3c94f2f127d8a0ca6419e50603c 100644 (file)
@@ -636,6 +636,11 @@ to produce errors that the parser is better positioned to handle.
                        } else if (tk.num == TK_mark
                                   && text_is(tk.txt, "$*")) {
                                err = dollar_line(state, g, 1);
                        } else if (tk.num == TK_mark
                                   && text_is(tk.txt, "$*")) {
                                err = dollar_line(state, g, 1);
+                       } else if (tk.num == TK_mark
+                                  && text_is(tk.txt, "//")) {
+                               while (tk.num != TK_newline &&
+                                      tk.num != TK_eof)
+                                       tk = token_next(state);
                        } else {
                                err = "Unrecognised token at start of line.";
                        }
                        } else {
                                err = "Unrecognised token at start of line.";
                        }
@@ -1911,7 +1916,6 @@ pieces of code provided in the grammar file, so they are generated first.
                fprintf(f, "\tstruct token_state *tokens;\n");
                fprintf(f, "\tconfig->words_marks = known;\n");
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                fprintf(f, "\tstruct token_state *tokens;\n");
                fprintf(f, "\tconfig->words_marks = known;\n");
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-               fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
                fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
                fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
@@ -3105,7 +3109,6 @@ an error.
                struct section *s;
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
                struct section *s;
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment)
                                 | (1 << TK_in)
                                 | (1 << TK_out),
                        .number_chars = ".,_+-",
                                 | (1 << TK_in)
                                 | (1 << TK_out),
                        .number_chars = ".,_+-",
@@ -3127,7 +3130,7 @@ an error.
 # calc: grammar
 
        $LEFT + -
 # calc: grammar
 
        $LEFT + -
-       $LEFT * /
+       $LEFT * / //
 
        Session -> Session Line
                | Line
 
        Session -> Session Line
                | Line
@@ -3155,6 +3158,16 @@ an error.
                | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
                | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | Expression // Expression ${ {
+                       mpz_t z0, z1, z2;
+                       mpq_init($0.val);
+                       mpz_init(z0); mpz_init(z1); mpz_init(z2);
+                       mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+                       mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+                       mpz_tdiv_q(z0, z1, z2);
+                       mpq_set_z($0.val, z0);
+                       mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+               } }$
                | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
                | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3167,4 +3180,6 @@ an error.
        10 * 9 / 2
        1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
        10 * 9 / 2
        1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+       355//113
+
        error
        error
index ec34812d0cf9313c847364e7e86805425e786553..d527ed0442fc9c68bbbe2f0e586bfa65409ce2c5 100644 (file)
@@ -53,12 +53,12 @@ Some simple tests... maybe all tests are simple.
 Include a special test for numbers, as they are interesting.
 
 ###### test list
 Include a special test for numbers, as they are interesting.
 
 ###### test list
-       scanner_tests += "test1,if,then,+,-"
-       scanner_tests += "test1,if,then,+,-,/"
-       scanner_tests += "test1,--ignore-indent,if,then,+,-,/"
+       scanner_tests += "test1,-r,if,then,+,-"
+       scanner_tests += "test1,-r,if,then,+,-,/"
+       scanner_tests += "test1,-r,--ignore-indent,if,then,+,-,/"
+       scanner_tests += "test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/"
        scanner_tests += "test1,--ignore-indent,--ignore-newline,if,then,+,-,/"
        scanner_tests += "test1,--ignore-indent,--ignore-newline,if,then,+,-,/"
-       scanner_tests += "test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/"
-       scanner_tests += "test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/"
+       scanner_tests += "test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/"
 
 ###### test: test1
 
 
 ###### test: test1
 
@@ -93,7 +93,7 @@ Include a special test for numbers, as they are interesting.
        lines */
        divident /+ divisor
 
        lines */
        divident /+ divisor
 
-###### output: test1,if,then,+,-
+###### output: test1,-r,if,then,+,-
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -180,7 +180,7 @@ Include a special test for numbers, as they are interesting.
        32:0 newline()
        32:0 eof()
 
        32:0 newline()
        32:0 eof()
 
-###### output: test1,if,then,+,-,/
+###### output: test1,-r,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -268,7 +268,7 @@ Include a special test for numbers, as they are interesting.
        32:0 newline()
        32:0 eof()
 
        32:0 newline()
        32:0 eof()
 
-###### output: test1,--ignore-indent,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -344,7 +344,7 @@ Include a special test for numbers, as they are interesting.
        31:19 newline()
        32:0 eof()
 
        31:19 newline()
        32:0 eof()
 
-###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -396,7 +396,7 @@ Include a special test for numbers, as they are interesting.
        31:12 ident(divisor)
        32:0 eof()
 
        31:12 ident(divisor)
        32:0 eof()
 
-###### output: test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -445,7 +445,7 @@ Include a special test for numbers, as they are interesting.
        31:12 ident(divisor)
        32:0 eof()
 
        31:12 ident(divisor)
        32:0 eof()
 
-###### output: test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -607,8 +607,8 @@ Now to test for some errors ... though things I thought would be errors
 sometimes aren't.
 
 ###### test list
 sometimes aren't.
 
 ###### test list
-       scanner_tests += "errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
-       scanner_tests += "errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-"
+       scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
+       scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-"
 
 ###### test: errtest
 
 
 ###### test: errtest
 
@@ -626,7 +626,7 @@ sometimes aren't.
 
        "  \\ \t \n special chars in strings"
 
 
        "  \\ \t \n special chars in strings"
 
-###### output: errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
 
        Tokenizing: 
        2:0 ERROR(multiple)
 
        Tokenizing: 
        2:0 ERROR(multiple)
@@ -664,7 +664,7 @@ sometimes aren't.
        15:0 newline()
        15:0 eof()
 
        15:0 newline()
        15:0 eof()
 
-###### output: errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-
        Tokenizing: 
        2:0 ERROR(multiple)
        2:9 ERROR(decimal)
        Tokenizing: 
        2:0 ERROR(multiple)
        2:9 ERROR(decimal)
index b57db29c0b38fcefead7e0cc4cafb41494d1264e..6b706411f5010e3da61b6214742f3db39e91de77 100644 (file)
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
 There is a fixed set of token types, though particular tokens within
 those types can be distinguished via configuration.
 
 There is a fixed set of token types, though particular tokens within
 those types can be distinguished via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided
+to return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
        struct token_config {
                int ignored;    // bit set of ignored tokens.
 
 ###### public types
        struct token_config {
                int ignored;    // bit set of ignored tokens.
+               int return_comments;
                ## token config parameters
        };
 
                ## token config parameters
        };
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted.  So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known.  They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
                        /* found a longest-known-mark, still need to
                         * check for comments
                         */
                        /* found a longest-known-mark, still need to
                         * check for comments
                         */
-                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-                           (ch == '/' || ch == '*')) {
+                       if (is_comment(ignored, tk.txt)) {
                                /* Yes, this is a comment, not a '/' */
                                restore_unget_state(state);
                                tk.num = TK_error;
                                /* Yes, this is a comment, not a '/' */
                                restore_unget_state(state);
                                tk.num = TK_error;
@@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words.
                prev = ch;
                save_unget_state(state);
                ch = get_char(state);
                prev = ch;
                save_unget_state(state);
                ch = get_char(state);
+               if (n >= 0)
+                       /* No need to worry about other token types */
+                       continue;
                if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                        /* If strings are allowed, a quote (Which isn't a known mark)
                         * mustn't be treated as part of an unknown mark.  It can be
                if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                        /* If strings are allowed, a quote (Which isn't a known mark)
                         * mustn't be treated as part of an unknown mark.  It can be
-                        * part of a multi-line srtings though.
+                        * part of a multi-line string though.
                         */
                        break;
                         */
                        break;
-               if (prev == '#' && n < 0)
-                       /* '#' is not a known mark, so assume it is a comment */
-                       break;
-               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
-                       restore_unget_state(state);
-                       break;
-               }
-               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
+
+               close_token(state, &tk);
+               if (is_comment(ignored, tk.txt)) {
+                       /* looks like a permitted comment, and not a known mark,
+                        * so assume it is a comment.
+                        */
                        restore_unget_state(state);
                        break;
                }
                        restore_unget_state(state);
                        break;
                }
@@ -466,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
 followed by the start of a new string.
 
 ###### parse string
 followed by the start of a new string.
 
 ###### parse string
-       if (tk.txt.len == 3 &&
+       if (tk.txt.len >= 3 &&
            !(ignored & (1 << TK_multi_string)) &&
            is_quote(tk.txt.txt[0]) &&
            memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
            !(ignored & (1 << TK_multi_string)) &&
            is_quote(tk.txt.txt[0]) &&
            memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
        TK_block_comment,
 
 ###### internal functions
        TK_block_comment,
 
 ###### internal functions
-       static int is_line_comment(struct text txt)
+       static int is_line_comment(int ignored, struct text txt)
        {
        {
+               if (ignored & (1 << TK_line_comment))
+                       return 0;
                return (txt.len >= 1 && txt.txt[0] == '#') ||
                       (txt.len >= 2 && txt.txt[0] == '/' &&
                                        txt.txt[1] == '/');
        }
 
                return (txt.len >= 1 && txt.txt[0] == '#') ||
                       (txt.len >= 2 && txt.txt[0] == '/' &&
                                        txt.txt[1] == '/');
        }
 
-       static int is_block_comment(struct text txt)
+       static int is_block_comment(int ignored, struct text txt)
        {
        {
+               if (ignored & (1 << TK_block_comment))
+                       return 0;
                return txt.len >= 2 && txt.txt[0] == '/' &&
                       txt.txt[1] == '*';
        }
 
                return txt.len >= 2 && txt.txt[0] == '/' &&
                       txt.txt[1] == '*';
        }
 
+       static int is_comment(int ignored, struct text txt)
+       {
+               return is_line_comment(ignored, txt) ||
+                      is_block_comment(ignored, txt);
+       }
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
 
 ###### parse comment
 
-       if (is_line_comment(tk.txt)) {
+       if (is_line_comment(ignored, tk.txt)) {
                while (!is_newline(ch) && !at_eon(state))
                        ch = get_char(state);
                if (is_newline(ch))
                        unget_char(state);
                close_token(state, &tk);
                tk.num = TK_line_comment;
                while (!is_newline(ch) && !at_eon(state))
                        ch = get_char(state);
                if (is_newline(ch))
                        unget_char(state);
                close_token(state, &tk);
                tk.num = TK_line_comment;
-               if (ignored & (1 << TK_line_comment))
+               if (!state->conf->return_comments)
                        continue;
                return tk;
        }
                        continue;
                return tk;
        }
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
 
 ###### parse comment
 
-       if (is_block_comment(tk.txt)) {
+       if (is_block_comment(ignored, tk.txt)) {
                wchar_t prev;
                int newlines = 0;
                reset_token(state, &tk);
                wchar_t prev;
                int newlines = 0;
                reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
                        if (!is_newline(ch))
                                tk.num = TK_error;
                }
                        if (!is_newline(ch))
                                tk.num = TK_error;
                }
-               if (tk.num == TK_error ||
-                   !(ignored & (1 << TK_block_comment)))
+               if (tk.num == TK_error || state->conf->return_comments)
                        return tk;
                continue;
        }
                        return tk;
                continue;
        }
@@ -2037,11 +2046,12 @@ the tokens one per line.
                        { "ignore-newline",     0, NULL, 'l'},
                        { "ignore-block-comment", 0, NULL, 'C'},
                        { "ignore-indent",      0, NULL, 'i'},
                        { "ignore-newline",     0, NULL, 'l'},
                        { "ignore-block-comment", 0, NULL, 'C'},
                        { "ignore-indent",      0, NULL, 'i'},
+                       { "return-comments",    0, NULL, 'r'},
                        { "file",               1, NULL, 'f'},
                        { "section",            1, NULL, 's'},
                        { NULL,                 0, NULL, 0},
                };
                        { "file",               1, NULL, 'f'},
                        { "section",            1, NULL, 's'},
                        { NULL,                 0, NULL, 0},
                };
-               static const char options[] = "W:w:n:NIMSzclCif:s:";
+               static const char options[] = "W:w:n:NIMSzclCirf:s:";
 
                struct section *table, *s, *prev;
                int opt;
 
                struct section *table, *s, *prev;
                int opt;
@@ -2064,6 +2074,7 @@ the tokens one per line.
                        case 'C': conf.ignored |= 1 << TK_block_comment; break;
                        case 'l': conf.ignored |= 1 << TK_newline; break;
                        case 'i': conf.ignored |= 1 << TK_in; break;
                        case 'C': conf.ignored |= 1 << TK_block_comment; break;
                        case 'l': conf.ignored |= 1 << TK_newline; break;
                        case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'r': conf.return_comments = 1; break;
                        case 'f': filename = optarg; break;
                        case 's': section_name = optarg; break;
                        default: fprintf(stderr, "scanner: unknown option '%c'.\n",
                        case 'f': filename = optarg; break;
                        case 's': section_name = optarg; break;
                        default: fprintf(stderr, "scanner: unknown option '%c'.\n",