ocean-lang.org Git - ocean/commitdiff
scanner: change the meaning of ignoring comment tokens.
author NeilBrown <neil@brown.name>
Tue, 6 Oct 2020 04:44:46 +0000 (15:44 +1100)
committer NeilBrown <neil@brown.name>
Tue, 6 Oct 2020 04:53:50 +0000 (15:53 +1100)
Previously ignoring comment tokens meant they were still parsed, but not
returned.  The only way to stop them being parsed was to declare
known marks for the start symbols.

This made it impossible for parsergen to define a language that had
a known mark that would otherwise start a comment.

So change the ignoring of comment tokens to mean they aren't parsed.  If
you want comments to be parsed but not returned, leave the new
"return_comments" field as zero.  In the unusual case that you want
comments returned, set return_comments to 1.

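For orientation, here is a minimal sketch (not part of this commit) of how
a caller might use the new field, assuming the token_open()/token_next()/
token_close() interface used elsewhere in this tree; "code" stands in for
whatever code_extract() produced.

	struct token_config config = {
		.ignored = 0,		/* comments not ignored, so they are parsed */
		.return_comments = 1,	/* unusual case: hand comments back to the caller */
	};
	struct token_state *state = token_open(code, &config);
	struct token tk;
	int comments = 0;
	do {
		tk = token_next(state);
		if (tk.num == TK_line_comment || tk.num == TK_block_comment)
			comments += 1;	/* comment text is available in tk.txt */
	} while (tk.num != TK_eof);
	token_close(state);
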
Confirm that this has the desired effect by adding "//" as an
integer-division operator to the sample calculator.

Signed-off-by: NeilBrown <neil@brown.name>
csrc/indent_test.mdc
csrc/oceani.mdc
csrc/parsergen.mdc
csrc/scanner-tests.mdc
csrc/scanner.mdc

index 960f63eb8122a2b8791952498e5c7b42832d2029..3df78d08a937245deb0fbbc6f821ec65a9736c05 100644 (file)
@@ -114,8 +114,6 @@ with complete bracketing and indenting.
                char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                struct section *s = code_extract(file, file+len, NULL);
                struct token_config config = {
-                       .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment),
                        .number_chars = ".,_+-",
                        .word_start = "",
                        .word_cont = "",
index 2d17598658b51ecfb20da1878fd1190f493d3f8a..ab6abe91f67156e03a0e81926897d320e652b04a 100644 (file)
@@ -171,9 +171,7 @@ structures can be used.
                char *section = NULL;
                struct parse_context context = {
                        .config = {
-                               .ignored = (1 << TK_line_comment)
-                                        | (1 << TK_block_comment)
-                                        | (1 << TK_mark),
+                               .ignored = (1 << TK_mark),
                                .number_chars = ".,_+- ",
                                .word_start = "_",
                                .word_cont = "_",
index 803d9b6e87e0d21ac7029617bd7b8fa79c292cff..3f36df9a6873c3c94f2f127d8a0ca6419e50603c 100644 (file)
@@ -636,6 +636,11 @@ to produce errors that the parser is better positioned to handle.
                        } else if (tk.num == TK_mark
                                   && text_is(tk.txt, "$*")) {
                                err = dollar_line(state, g, 1);
+                       } else if (tk.num == TK_mark
+                                  && text_is(tk.txt, "//")) {
+                               while (tk.num != TK_newline &&
+                                      tk.num != TK_eof)
+                                       tk = token_next(state);
                        } else {
                                err = "Unrecognised token at start of line.";
                        }
@@ -1911,7 +1916,6 @@ pieces of code provided in the grammar file, so they are generated first.
                fprintf(f, "\tstruct token_state *tokens;\n");
                fprintf(f, "\tconfig->words_marks = known;\n");
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-               fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
                fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
@@ -3105,7 +3109,6 @@ an error.
                struct section *s;
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment)
                                 | (1 << TK_in)
                                 | (1 << TK_out),
                        .number_chars = ".,_+-",
@@ -3127,7 +3130,7 @@ an error.
 # calc: grammar
 
        $LEFT + -
-       $LEFT * /
+       $LEFT * / //
 
        Session -> Session Line
                | Line
@@ -3155,6 +3158,16 @@ an error.
                | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | Expression // Expression ${ {
+                       mpz_t z0, z1, z2;
+                       mpq_init($0.val);
+                       mpz_init(z0); mpz_init(z1); mpz_init(z2);
+                       mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+                       mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+                       mpz_tdiv_q(z0, z1, z2);
+                       mpq_set_z($0.val, z0);
+                       mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+               } }$
                | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3167,4 +3180,6 @@ an error.
        10 * 9 / 2
        1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+       355//113
+
        error
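
As an aside on the new "//" action above: each operand is truncated to an
integer and the quotient is then truncated again, so 355//113 evaluates
to 3.  The following standalone sketch (hypothetical, not part of this
commit) performs the same GMP steps; compile with -lgmp.

	#include <stdio.h>
	#include <gmp.h>

	int main(void)
	{
		mpq_t a, b, res;
		mpz_t z0, z1, z2;

		mpq_inits(a, b, res, NULL);
		mpz_inits(z0, z1, z2, NULL);
		mpq_set_ui(a, 355, 1);		/* a = 355/1 */
		mpq_set_ui(b, 113, 1);		/* b = 113/1 */

		/* Truncate each operand, then truncated integer division,
		 * as in the grammar action for "//".
		 */
		mpz_tdiv_q(z1, mpq_numref(a), mpq_denref(a));
		mpz_tdiv_q(z2, mpq_numref(b), mpq_denref(b));
		mpz_tdiv_q(z0, z1, z2);
		mpq_set_z(res, z0);

		gmp_printf("%Qd\n", res);	/* prints 3 */

		mpz_clears(z0, z1, z2, NULL);
		mpq_clears(a, b, res, NULL);
		return 0;
	}
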
index ec34812d0cf9313c847364e7e86805425e786553..d527ed0442fc9c68bbbe2f0e586bfa65409ce2c5 100644 (file)
@@ -53,12 +53,12 @@ Some simple tests... maybe all tests are simple.
 Include a special test for numbers, as they are interesting.
 
 ###### test list
-       scanner_tests += "test1,if,then,+,-"
-       scanner_tests += "test1,if,then,+,-,/"
-       scanner_tests += "test1,--ignore-indent,if,then,+,-,/"
+       scanner_tests += "test1,-r,if,then,+,-"
+       scanner_tests += "test1,-r,if,then,+,-,/"
+       scanner_tests += "test1,-r,--ignore-indent,if,then,+,-,/"
+       scanner_tests += "test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/"
        scanner_tests += "test1,--ignore-indent,--ignore-newline,if,then,+,-,/"
-       scanner_tests += "test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/"
-       scanner_tests += "test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/"
+       scanner_tests += "test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/"
 
 ###### test: test1
 
@@ -93,7 +93,7 @@ Include a special test for numbers, as they are interesting.
        lines */
        divident /+ divisor
 
-###### output: test1,if,then,+,-
+###### output: test1,-r,if,then,+,-
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -180,7 +180,7 @@ Include a special test for numbers, as they are interesting.
        32:0 newline()
        32:0 eof()
 
-###### output: test1,if,then,+,-,/
+###### output: test1,-r,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -268,7 +268,7 @@ Include a special test for numbers, as they are interesting.
        32:0 newline()
        32:0 eof()
 
-###### output: test1,--ignore-indent,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -344,7 +344,7 @@ Include a special test for numbers, as they are interesting.
        31:19 newline()
        32:0 eof()
 
-###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -396,7 +396,7 @@ Include a special test for numbers, as they are interesting.
        31:12 ident(divisor)
        32:0 eof()
 
-###### output: test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -445,7 +445,7 @@ Include a special test for numbers, as they are interesting.
        31:12 ident(divisor)
        32:0 eof()
 
-###### output: test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -607,8 +607,8 @@ Now to test for some errors ... though things I thought would be errors
 sometimes aren't.
 
 ###### test list
-       scanner_tests += "errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
-       scanner_tests += "errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-"
+       scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
+       scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-"
 
 ###### test: errtest
 
@@ -626,7 +626,7 @@ sometimes aren't.
 
        "  \\ \t \n special chars in strings"
 
-###### output: errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
 
        Tokenizing: 
        2:0 ERROR(multiple)
@@ -664,7 +664,7 @@ sometimes aren't.
        15:0 newline()
        15:0 eof()
 
-###### output: errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-
        Tokenizing: 
        2:0 ERROR(multiple)
        2:9 ERROR(decimal)
index b57db29c0b38fcefead7e0cc4cafb41494d1264e..6b706411f5010e3da61b6214742f3db39e91de77 100644 (file)
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
 There are a fixed set of token types, though particular tokens within
 those types can be distinguished via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned; an option is provided to
+return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
        struct token_config {
                int ignored;    // bit set of ignored tokens.
+               int return_comments;
                ## token config parameters
        };
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted.  So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known.  They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
                        /* found a longest-known-mark, still need to
                         * check for comments
                         */
-                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-                           (ch == '/' || ch == '*')) {
+                       if (is_comment(ignored, tk.txt)) {
                                /* Yes, this is a comment, not a '/' */
                                restore_unget_state(state);
                                tk.num = TK_error;
@@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words.
                prev = ch;
                save_unget_state(state);
                ch = get_char(state);
+               if (n >= 0)
+                       /* No need to worry about other token types */
+                       continue;
                if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                        /* If strings are allowed, a quote (Which isn't a known mark)
                         * mustn't be treated as part of an unknown mark.  It can be
-                        * part of a multi-line srtings though.
+                        * part of a multi-line string though.
                         */
                        break;
-               if (prev == '#' && n < 0)
-                       /* '#' is not a known mark, so assume it is a comment */
-                       break;
-               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
-                       restore_unget_state(state);
-                       break;
-               }
-               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
+
+               close_token(state, &tk);
+               if (is_comment(ignored, tk.txt)) {
+                       /* looks like a permitted comment, and not a known mark,
+                        * so assume it is a comment.
+                        */
                        restore_unget_state(state);
                        break;
                }
@@ -466,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
 followed by the start of a new string.
 
 ###### parse string
-       if (tk.txt.len == 3 &&
+       if (tk.txt.len >= 3 &&
            !(ignored & (1 << TK_multi_string)) &&
            is_quote(tk.txt.txt[0]) &&
            memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
        TK_block_comment,
 
 ###### internal functions
-       static int is_line_comment(struct text txt)
+       static int is_line_comment(int ignored, struct text txt)
        {
+               if (ignored & (1 << TK_line_comment))
+                       return 0;
                return (txt.len >= 1 && txt.txt[0] == '#') ||
                       (txt.len >= 2 && txt.txt[0] == '/' &&
                                        txt.txt[1] == '/');
        }
 
-       static int is_block_comment(struct text txt)
+       static int is_block_comment(int ignored, struct text txt)
        {
+               if (ignored & (1 << TK_block_comment))
+                       return 0;
                return txt.len >= 2 && txt.txt[0] == '/' &&
                       txt.txt[1] == '*';
        }
 
+       static int is_comment(int ignored, struct text txt)
+       {
+               return is_line_comment(ignored, txt) ||
+                      is_block_comment(ignored, txt);
+       }
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
-       if (is_line_comment(tk.txt)) {
+       if (is_line_comment(ignored, tk.txt)) {
                while (!is_newline(ch) && !at_eon(state))
                        ch = get_char(state);
                if (is_newline(ch))
                        unget_char(state);
                close_token(state, &tk);
                tk.num = TK_line_comment;
-               if (ignored & (1 << TK_line_comment))
+               if (!state->conf->return_comments)
                        continue;
                return tk;
        }
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
-       if (is_block_comment(tk.txt)) {
+       if (is_block_comment(ignored, tk.txt)) {
                wchar_t prev;
                int newlines = 0;
                reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
                        if (!is_newline(ch))
                                tk.num = TK_error;
                }
-               if (tk.num == TK_error ||
-                   !(ignored & (1 << TK_block_comment)))
+               if (tk.num == TK_error || state->conf->return_comments)
                        return tk;
                continue;
        }
@@ -2037,11 +2046,12 @@ the tokens one per line.
                        { "ignore-newline",     0, NULL, 'l'},
                        { "ignore-block-comment", 0, NULL, 'C'},
                        { "ignore-indent",      0, NULL, 'i'},
+                       { "return-comments",    0, NULL, 'r'},
                        { "file",               1, NULL, 'f'},
                        { "section",            1, NULL, 's'},
                        { NULL,                 0, NULL, 0},
                };
-               static const char options[] = "W:w:n:NIMSzclCif:s:";
+               static const char options[] = "W:w:n:NIMSzclCirf:s:";
 
                struct section *table, *s, *prev;
                int opt;
@@ -2064,6 +2074,7 @@ the tokens one per line.
                        case 'C': conf.ignored |= 1 << TK_block_comment; break;
                        case 'l': conf.ignored |= 1 << TK_newline; break;
                        case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'r': conf.return_comments = 1; break;
                        case 'f': filename = optarg; break;
                        case 's': section_name = optarg; break;
                        default: fprintf(stderr, "scanner: unknown option '%c'.\n",