From 850a39a0a761e0af89c15253f075ecd9e9ecc6ee Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Tue, 6 Oct 2020 15:44:46 +1100
Subject: [PATCH] scanner: change the meaning of ignoring comment tokens.

Previously ignoring comment tokens meant they were still parsed, but not
returned.  The only way to stop them being parsed was to declare
known marks for the start symbols.

This made it impossible for parsergen to define a language that had
a known mark that would otherwise start a comment.

So change the ignoring of comment tokens to mean they aren't parsed.  If
you want to parse comments but not return them, leave the new
"return_comments" field as zero.  In the unusual case that you want to
return comments, set return_comments to 1.

Confirm that this has the desired effect by adding "//" as an
integer-division operator to the sample calculator.

Signed-off-by: NeilBrown <neil@brown.name>
---
 csrc/indent_test.mdc   |  2 --
 csrc/oceani.mdc        |  4 +--
 csrc/parsergen.mdc     | 21 +++++++++++--
 csrc/scanner-tests.mdc | 30 +++++++++---------
 csrc/scanner.mdc       | 69 ++++++++++++++++++++++++------------------
 5 files changed, 74 insertions(+), 52 deletions(-)

diff --git a/csrc/indent_test.mdc b/csrc/indent_test.mdc
index 960f63e..3df78d0 100644
--- a/csrc/indent_test.mdc
+++ b/csrc/indent_test.mdc
@@ -114,8 +114,6 @@ with complete bracketing and indenting.
 		char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 		struct section *s = code_extract(file, file+len, NULL);
 		struct token_config config = {
-			.ignored = (1 << TK_line_comment)
-			         | (1 << TK_block_comment),
 			.number_chars = ".,_+-",
 			.word_start = "",
 			.word_cont = "",
diff --git a/csrc/oceani.mdc b/csrc/oceani.mdc
index 2d17598..ab6abe9 100644
--- a/csrc/oceani.mdc
+++ b/csrc/oceani.mdc
@@ -171,9 +171,7 @@ structures can be used.
 		char *section = NULL;
 		struct parse_context context = {
 			.config = {
-				.ignored = (1 << TK_line_comment)
-				         | (1 << TK_block_comment)
-				         | (1 << TK_mark),
+				.ignored = (1 << TK_mark),
 				.number_chars = ".,_+- ",
 				.word_start = "_",
 				.word_cont = "_",
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index 803d9b6..3f36df9 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -636,6 +636,11 @@ to produce errors that the parser is better positioned to handle.
 			} else if (tk.num == TK_mark
 			           && text_is(tk.txt, "$*")) {
 				err = dollar_line(state, g, 1);
+			} else if (tk.num == TK_mark
+				   && text_is(tk.txt, "//")) {
+				while (tk.num != TK_newline &&
+				       tk.num != TK_eof)
+					tk = token_next(state);
 			} else {
 				err = "Unrecognised token at start of line.";
 			}
@@ -1911,7 +1916,6 @@ pieces of code provided in the grammar file, so they are generated first.
 		fprintf(f, "\tstruct token_state *tokens;\n");
 		fprintf(f, "\tconfig->words_marks = known;\n");
 		fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-		fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
 		fprintf(f, "\ttokens = token_open(code, config);\n");
 		fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
 		fprintf(f, "\ttoken_close(tokens);\n");
@@ -3105,7 +3109,6 @@ an error.
 		struct section *s;
 		struct token_config config = {
 			.ignored = (1 << TK_line_comment)
-			         | (1 << TK_block_comment)
 			         | (1 << TK_in)
 			         | (1 << TK_out),
 			.number_chars = ".,_+-",
@@ -3127,7 +3130,7 @@ an error.
 # calc: grammar
 
 	$LEFT + -
-	$LEFT * /
+	$LEFT * / //
 
 	Session -> Session Line
 		| Line
@@ -3155,6 +3158,16 @@ an error.
 		| Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
 		| Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
 		| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+		| Expression // Expression ${ {
+			mpz_t z0, z1, z2;
+			mpq_init($0.val);
+			mpz_init(z0); mpz_init(z1); mpz_init(z2);
+			mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+			mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+			mpz_tdiv_q(z0, z1, z2);
+			mpq_set_z($0.val, z0);
+			mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+		} }$
 		| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
 		| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3167,4 +3180,6 @@ an error.
 	10 * 9 / 2
 	1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+	355//113
+
 	error
diff --git a/csrc/scanner-tests.mdc b/csrc/scanner-tests.mdc
index ec34812..d527ed0 100644
--- a/csrc/scanner-tests.mdc
+++ b/csrc/scanner-tests.mdc
@@ -53,12 +53,12 @@ Some simple tests... maybe all tests are simple.
 Include a special test for numbers, as they are interesting.
 
 ###### test list
-	scanner_tests += "test1,if,then,+,-"
-	scanner_tests += "test1,if,then,+,-,/"
-	scanner_tests += "test1,--ignore-indent,if,then,+,-,/"
+	scanner_tests += "test1,-r,if,then,+,-"
+	scanner_tests += "test1,-r,if,then,+,-,/"
+	scanner_tests += "test1,-r,--ignore-indent,if,then,+,-,/"
+	scanner_tests += "test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/"
 	scanner_tests += "test1,--ignore-indent,--ignore-newline,if,then,+,-,/"
-	scanner_tests += "test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/"
-	scanner_tests += "test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/"
+	scanner_tests += "test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/"
 
 ###### test: test1
 
@@ -93,7 +93,7 @@ Include a special test for numbers, as they are interesting.
 	lines */
 	divident /+ divisor
 
-###### output: test1,if,then,+,-
+###### output: test1,-r,if,then,+,-
 	Tokenizing: 
 	2:0 ident(A)
 	2:2 ident(B)
@@ -180,7 +180,7 @@ Include a special test for numbers, as they are interesting.
 	32:0 newline()
 	32:0 eof()
 
-###### output: test1,if,then,+,-,/
+###### output: test1,-r,if,then,+,-,/
 	Tokenizing: 
 	2:0 ident(A)
 	2:2 ident(B)
@@ -268,7 +268,7 @@ Include a special test for numbers, as they are interesting.
 	32:0 newline()
 	32:0 eof()
 
-###### output: test1,--ignore-indent,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,if,then,+,-,/
 	Tokenizing: 
 	2:0 ident(A)
 	2:2 ident(B)
@@ -344,7 +344,7 @@ Include a special test for numbers, as they are interesting.
 	31:19 newline()
 	32:0 eof()
 
-###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/
 	Tokenizing: 
 	2:0 ident(A)
 	2:2 ident(B)
@@ -396,7 +396,7 @@ Include a special test for numbers, as they are interesting.
 	31:12 ident(divisor)
 	32:0 eof()
 
-###### output: test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
 	Tokenizing: 
 	2:0 ident(A)
 	2:2 ident(B)
@@ -445,7 +445,7 @@ Include a special test for numbers, as they are interesting.
 	31:12 ident(divisor)
 	32:0 eof()
 
-###### output: test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/
 	Tokenizing: 
 	2:0 ident(A)
 	2:2 ident(B)
@@ -607,8 +607,8 @@ Now to test for some errors ... though things I thought would be errors
 sometimes aren't.
 
 ###### test list
-	scanner_tests += "errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
-	scanner_tests += "errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-"
+	scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
+	scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-"
 
 ###### test: errtest
 
@@ -626,7 +626,7 @@ sometimes aren't.
 
 	"  \\ \t \n special chars in strings"
 
-###### output: errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
 
 	Tokenizing: 
 	2:0 ERROR(multiple)
@@ -664,7 +664,7 @@ sometimes aren't.
 	15:0 newline()
 	15:0 eof()
 
-###### output: errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-
 	Tokenizing: 
 	2:0 ERROR(multiple)
 	2:9 ERROR(decimal)
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc
index b57db29..6b70641 100644
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
 There are a fixed set of token types, though particular tokens within
 those types can be distinguish via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
 	struct token_config {
 		int ignored;	// bit set of ignored tokens.
+		int return_comments;
 		## token config parameters
 	};
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted.  So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known.  They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
 			/* found a longest-known-mark, still need to
 			 * check for comments
 			 */
-			if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-			    (ch == '/' || ch == '*')) {
+			if (is_comment(ignored, tk.txt)) {
 				/* Yes, this is a comment, not a '/' */
 				restore_unget_state(state);
 				tk.num = TK_error;
@@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words.
 		prev = ch;
 		save_unget_state(state);
 		ch = get_char(state);
+		if (n >= 0)
+			/* No need to worry about other token types */
+			continue;
 		if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
 			/* If strings are allowed, a quote (Which isn't a known mark)
 			 * mustn't be treated as part of an unknown mark.  It can be
-			 * part of a multi-line srtings though.
+			 * part of a multi-line string though.
 			 */
 			break;
-		if (prev == '#' && n < 0)
-			/* '#' is not a known mark, so assume it is a comment */
-			break;
-		if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
-			close_token(state, &tk);
-			restore_unget_state(state);
-			break;
-		}
-		if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
-			close_token(state, &tk);
+
+		close_token(state, &tk);
+		if (is_comment(ignored, tk.txt)) {
+			/* looks like a permitted comment, and not a known mark,
+			 * so assume it is a comment.
+			 */
 			restore_unget_state(state);
 			break;
 		}
@@ -466,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
 followed by the start of a new string.
 
 ###### parse string
-	if (tk.txt.len == 3 &&
+	if (tk.txt.len >= 3 &&
 	    !(ignored & (1 << TK_multi_string)) &&
 	    is_quote(tk.txt.txt[0]) &&
 	    memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
 	TK_block_comment,
 
 ###### internal functions
-	static int is_line_comment(struct text txt)
+	static int is_line_comment(int ignored, struct text txt)
 	{
+		if (ignored & (1 << TK_line_comment))
+			return 0;
 		return (txt.len >= 1 && txt.txt[0] == '#') ||
 		       (txt.len >= 2 && txt.txt[0] == '/' &&
 		                        txt.txt[1] == '/');
 	}
 
-	static int is_block_comment(struct text txt)
+	static int is_block_comment(int ignored, struct text txt)
 	{
+		if (ignored & (1 << TK_block_comment))
+			return 0;
 		return txt.len >= 2 && txt.txt[0] == '/' &&
 		       txt.txt[1] == '*';
 	}
 
+	static int is_comment(int ignored, struct text txt)
+	{
+		return is_line_comment(ignored, txt) ||
+		       is_block_comment(ignored, txt);
+	}
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
-	if (is_line_comment(tk.txt)) {
+	if (is_line_comment(ignored, tk.txt)) {
 		while (!is_newline(ch) && !at_eon(state))
 			ch = get_char(state);
 		if (is_newline(ch))
 			unget_char(state);
 		close_token(state, &tk);
 		tk.num = TK_line_comment;
-		if (ignored & (1 << TK_line_comment))
+		if (!state->conf->return_comments)
 			continue;
 		return tk;
 	}
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
-	if (is_block_comment(tk.txt)) {
+	if (is_block_comment(ignored, tk.txt)) {
 		wchar_t prev;
 		int newlines = 0;
 		reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
 			if (!is_newline(ch))
 				tk.num = TK_error;
 		}
-		if (tk.num == TK_error ||
-		    !(ignored & (1 << TK_block_comment)))
+		if (tk.num == TK_error || state->conf->return_comments)
 			return tk;
 		continue;
 	}
@@ -2037,11 +2046,12 @@ the tokens one per line.
 			{ "ignore-newline",	0, NULL, 'l'},
 			{ "ignore-block-comment", 0, NULL, 'C'},
 			{ "ignore-indent",	0, NULL, 'i'},
+			{ "return-comments",	0, NULL, 'r'},
 			{ "file",		1, NULL, 'f'},
 			{ "section",		1, NULL, 's'},
 			{ NULL,			0, NULL, 0},
 		};
-		static const char options[] = "W:w:n:NIMSzclCif:s:";
+		static const char options[] = "W:w:n:NIMSzclCirf:s:";
 
 		struct section *table, *s, *prev;
 		int opt;
@@ -2064,6 +2074,7 @@ the tokens one per line.
 			case 'C': conf.ignored |= 1 << TK_block_comment; break;
 			case 'l': conf.ignored |= 1 << TK_newline; break;
 			case 'i': conf.ignored |= 1 << TK_in; break;
+			case 'r': conf.return_comments = 1; break;
 			case 'f': filename = optarg; break;
 			case 's': section_name = optarg; break;
 			default: fprintf(stderr, "scanner: unknown option '%c'.\n",
-- 
2.43.0