ocean-lang.org Git - ocean/commitdiff
scanner: change the meaning of ignoring comment tokens.
author NeilBrown <neil@brown.name>
Tue, 6 Oct 2020 04:44:46 +0000 (15:44 +1100)
committer NeilBrown <neil@brown.name>
Tue, 6 Oct 2020 04:53:50 +0000 (15:53 +1100)
Previously ignoring comment tokens meant they were still parsed, but not
returned.  The only way to stop them being parsed was to declare
known marks for the start symbols.

This made it impossible for parsergen to define a language that had
a known mark that would otherwise start a comment.

So change the ignoring of comment tokens to mean they aren't parsed.  If
you want comments to be parsed but not returned, leave the new
"return_comments" field as zero.  In the unusual case that you want
comments returned, set return_comments to 1.

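For orientation, here is a minimal sketch (not part of this commit) of how
a caller might use the new field, assuming the token_open()/token_next()/
token_close() interface used elsewhere in this tree; "code" stands in for
whatever code_extract() produced.

	struct token_config config = {
		.ignored = 0,		/* comments not ignored, so they are parsed */
		.return_comments = 1,	/* unusual case: hand comments back to the caller */
	};
	struct token_state *state = token_open(code, &config);
	struct token tk;
	int comments = 0;
	do {
		tk = token_next(state);
		if (tk.num == TK_line_comment || tk.num == TK_block_comment)
			comments += 1;	/* comment text is available in tk.txt */
	} while (tk.num != TK_eof);
	token_close(state);
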
Confirm that this has the desired effect by adding "//" as an
integer-division operator to the sample calculator.

Signed-off-by: NeilBrown <neil@brown.name>
csrc/indent_test.mdc
csrc/oceani.mdc
csrc/parsergen.mdc
csrc/scanner-tests.mdc
csrc/scanner.mdc

index 960f63eb8122a2b8791952498e5c7b42832d2029..3df78d08a937245deb0fbbc6f821ec65a9736c05 100644 (file)
@@ -114,8 +114,6 @@ with complete bracketing and indenting.
                char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                struct section *s = code_extract(file, file+len, NULL);
                struct token_config config = {
-                       .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment),
                        .number_chars = ".,_+-",
                        .word_start = "",
                        .word_cont = "",
index 2d17598658b51ecfb20da1878fd1190f493d3f8a..ab6abe91f67156e03a0e81926897d320e652b04a 100644 (file)
@@ -171,9 +171,7 @@ structures can be used.
                char *section = NULL;
                struct parse_context context = {
                        .config = {
-                               .ignored = (1 << TK_line_comment)
-                                        | (1 << TK_block_comment)
-                                        | (1 << TK_mark),
+                               .ignored = (1 << TK_mark),
                                .number_chars = ".,_+- ",
                                .word_start = "_",
                                .word_cont = "_",
index 803d9b6e87e0d21ac7029617bd7b8fa79c292cff..3f36df9a6873c3c94f2f127d8a0ca6419e50603c 100644 (file)
@@ -636,6 +636,11 @@ to produce errors that the parser is better positioned to handle.
                        } else if (tk.num == TK_mark
                                   && text_is(tk.txt, "$*")) {
                                err = dollar_line(state, g, 1);
+                       } else if (tk.num == TK_mark
+                                  && text_is(tk.txt, "//")) {
+                               while (tk.num != TK_newline &&
+                                      tk.num != TK_eof)
+                                       tk = token_next(state);
                        } else {
                                err = "Unrecognised token at start of line.";
                        }
@@ -1911,7 +1916,6 @@ pieces of code provided in the grammar file, so they are generated first.
                fprintf(f, "\tstruct token_state *tokens;\n");
                fprintf(f, "\tconfig->words_marks = known;\n");
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-               fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
                fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
@@ -3105,7 +3109,6 @@ an error.
                struct section *s;
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment)
                                 | (1 << TK_in)
                                 | (1 << TK_out),
                        .number_chars = ".,_+-",
@@ -3127,7 +3130,7 @@ an error.
 # calc: grammar
 
        $LEFT + -
-       $LEFT * /
+       $LEFT * / //
 
        Session -> Session Line
                | Line
@@ -3155,6 +3158,16 @@ an error.
                | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | Expression // Expression ${ {
+                       mpz_t z0, z1, z2;
+                       mpq_init($0.val);
+                       mpz_init(z0); mpz_init(z1); mpz_init(z2);
+                       mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+                       mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+                       mpz_tdiv_q(z0, z1, z2);
+                       mpq_set_z($0.val, z0);
+                       mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+               } }$
                | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3167,4 +3180,6 @@ an error.
        10 * 9 / 2
        1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+       355//113
+
        error
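
As an aside on the new "//" action above: each operand is truncated to an
integer and the quotient is then truncated again, so 355//113 evaluates
to 3.  The following standalone sketch (hypothetical, not part of this
commit) performs the same GMP steps; compile with -lgmp.

	#include <stdio.h>
	#include <gmp.h>

	int main(void)
	{
		mpq_t a, b, res;
		mpz_t z0, z1, z2;

		mpq_inits(a, b, res, NULL);
		mpz_inits(z0, z1, z2, NULL);
		mpq_set_ui(a, 355, 1);		/* a = 355/1 */
		mpq_set_ui(b, 113, 1);		/* b = 113/1 */

		/* Truncate each operand, then truncated integer division,
		 * as in the grammar action for "//".
		 */
		mpz_tdiv_q(z1, mpq_numref(a), mpq_denref(a));
		mpz_tdiv_q(z2, mpq_numref(b), mpq_denref(b));
		mpz_tdiv_q(z0, z1, z2);
		mpq_set_z(res, z0);

		gmp_printf("%Qd\n", res);	/* prints 3 */

		mpz_clears(z0, z1, z2, NULL);
		mpq_clears(a, b, res, NULL);
		return 0;
	}
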
index ec34812d0cf9313c847364e7e86805425e786553..d527ed0442fc9c68bbbe2f0e586bfa65409ce2c5 100644 (file)
@@ -53,12 +53,12 @@ Some simple tests... maybe all tests are simple.
 Include a special test for numbers, as they are interesting.
 
 ###### test list
-       scanner_tests += "test1,if,then,+,-"
-       scanner_tests += "test1,if,then,+,-,/"
-       scanner_tests += "test1,--ignore-indent,if,then,+,-,/"
+       scanner_tests += "test1,-r,if,then,+,-"
+       scanner_tests += "test1,-r,if,then,+,-,/"
+       scanner_tests += "test1,-r,--ignore-indent,if,then,+,-,/"
+       scanner_tests += "test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/"
        scanner_tests += "test1,--ignore-indent,--ignore-newline,if,then,+,-,/"
-       scanner_tests += "test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/"
-       scanner_tests += "test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/"
+       scanner_tests += "test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/"
 
 ###### test: test1
 
@@ -93,7 +93,7 @@ Include a special test for numbers, as they are interesting.
        lines */
        divident /+ divisor
 
-###### output: test1,if,then,+,-
+###### output: test1,-r,if,then,+,-
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -180,7 +180,7 @@ Include a special test for numbers, as they are interesting.
        32:0 newline()
        32:0 eof()
 
-###### output: test1,if,then,+,-,/
+###### output: test1,-r,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -268,7 +268,7 @@ Include a special test for numbers, as they are interesting.
        32:0 newline()
        32:0 eof()
 
-###### output: test1,--ignore-indent,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -344,7 +344,7 @@ Include a special test for numbers, as they are interesting.
        31:19 newline()
        32:0 eof()
 
-###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -396,7 +396,7 @@ Include a special test for numbers, as they are interesting.
        31:12 ident(divisor)
        32:0 eof()
 
-###### output: test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -445,7 +445,7 @@ Include a special test for numbers, as they are interesting.
        31:12 ident(divisor)
        32:0 eof()
 
-###### output: test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/
        Tokenizing: 
        2:0 ident(A)
        2:2 ident(B)
@@ -607,8 +607,8 @@ Now to test for some errors ... though things I thought would be errors
 sometimes aren't.
 
 ###### test list
-       scanner_tests += "errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
-       scanner_tests += "errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-"
+       scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
+       scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-"
 
 ###### test: errtest
 
@@ -626,7 +626,7 @@ sometimes aren't.
 
        "  \\ \t \n special chars in strings"
 
-###### output: errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
 
        Tokenizing: 
        2:0 ERROR(multiple)
@@ -664,7 +664,7 @@ sometimes aren't.
        15:0 newline()
        15:0 eof()
 
-###### output: errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-
        Tokenizing: 
        2:0 ERROR(multiple)
        2:9 ERROR(decimal)
index b57db29c0b38fcefead7e0cc4cafb41494d1264e..6b706411f5010e3da61b6214742f3db39e91de77 100644 (file)
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
 There are a fixed set of token types, though particular tokens within
 those types can be distinguished via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned; an option is provided to
+return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
        struct token_config {
                int ignored;    // bit set of ignored tokens.
+               int return_comments;
                ## token config parameters
        };
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted.  So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known.  They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
                        /* found a longest-known-mark, still need to
                         * check for comments
                         */
-                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-                           (ch == '/' || ch == '*')) {
+                       if (is_comment(ignored, tk.txt)) {
                                /* Yes, this is a comment, not a '/' */
                                restore_unget_state(state);
                                tk.num = TK_error;
@@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words.
                prev = ch;
                save_unget_state(state);
                ch = get_char(state);
+               if (n >= 0)
+                       /* No need to worry about other token types */
+                       continue;
                if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                        /* If strings are allowed, a quote (Which isn't a known mark)
                         * mustn't be treated as part of an unknown mark.  It can be
-                        * part of a multi-line srtings though.
+                        * part of a multi-line string though.
                         */
                        break;
-               if (prev == '#' && n < 0)
-                       /* '#' is not a known mark, so assume it is a comment */
-                       break;
-               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
-                       restore_unget_state(state);
-                       break;
-               }
-               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
+
+               close_token(state, &tk);
+               if (is_comment(ignored, tk.txt)) {
+                       /* looks like a permitted comment, and not a known mark,
+                        * so assume it is a comment.
+                        */
                        restore_unget_state(state);
                        break;
                }
@@ -466,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
 followed by the start of a new string.
 
 ###### parse string
-       if (tk.txt.len == 3 &&
+       if (tk.txt.len >= 3 &&
            !(ignored & (1 << TK_multi_string)) &&
            is_quote(tk.txt.txt[0]) &&
            memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
        TK_block_comment,
 
 ###### internal functions
-       static int is_line_comment(struct text txt)
+       static int is_line_comment(int ignored, struct text txt)
        {
+               if (ignored & (1 << TK_line_comment))
+                       return 0;
                return (txt.len >= 1 && txt.txt[0] == '#') ||
                       (txt.len >= 2 && txt.txt[0] == '/' &&
                                        txt.txt[1] == '/');
        }
 
-       static int is_block_comment(struct text txt)
+       static int is_block_comment(int ignored, struct text txt)
        {
+               if (ignored & (1 << TK_block_comment))
+                       return 0;
                return txt.len >= 2 && txt.txt[0] == '/' &&
                       txt.txt[1] == '*';
        }
 
+       static int is_comment(int ignored, struct text txt)
+       {
+               return is_line_comment(ignored, txt) ||
+                      is_block_comment(ignored, txt);
+       }
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
-       if (is_line_comment(tk.txt)) {
+       if (is_line_comment(ignored, tk.txt)) {
                while (!is_newline(ch) && !at_eon(state))
                        ch = get_char(state);
                if (is_newline(ch))
                        unget_char(state);
                close_token(state, &tk);
                tk.num = TK_line_comment;
-               if (ignored & (1 << TK_line_comment))
+               if (!state->conf->return_comments)
                        continue;
                return tk;
        }
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
-       if (is_block_comment(tk.txt)) {
+       if (is_block_comment(ignored, tk.txt)) {
                wchar_t prev;
                int newlines = 0;
                reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
                        if (!is_newline(ch))
                                tk.num = TK_error;
                }
-               if (tk.num == TK_error ||
-                   !(ignored & (1 << TK_block_comment)))
+               if (tk.num == TK_error || state->conf->return_comments)
                        return tk;
                continue;
        }
@@ -2037,11 +2046,12 @@ the tokens one per line.
                        { "ignore-newline",     0, NULL, 'l'},
                        { "ignore-block-comment", 0, NULL, 'C'},
                        { "ignore-indent",      0, NULL, 'i'},
+                       { "return-comments",    0, NULL, 'r'},
                        { "file",               1, NULL, 'f'},
                        { "section",            1, NULL, 's'},
                        { NULL,                 0, NULL, 0},
                };
-               static const char options[] = "W:w:n:NIMSzclCif:s:";
+               static const char options[] = "W:w:n:NIMSzclCirf:s:";
 
                struct section *table, *s, *prev;
                int opt;
@@ -2064,6 +2074,7 @@ the tokens one per line.
                        case 'C': conf.ignored |= 1 << TK_block_comment; break;
                        case 'l': conf.ignored |= 1 << TK_newline; break;
                        case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'r': conf.return_comments = 1; break;
                        case 'f': filename = optarg; break;
                        case 's': section_name = optarg; break;
                        default: fprintf(stderr, "scanner: unknown option '%c'.\n",