X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=6b706411f5010e3da61b6214742f3db39e91de77;hb=850a39a0a761e0af89c15253f075ecd9e9ecc6ee;hp=fa27a89524a85d4eeb9a16019558e61a993afb5f;hpb=c040191336b755321af667a0251b97782d8eed71;p=ocean

diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc
index fa27a89..6b70641 100644
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified. There
 are a fixed set of token types, though particular tokens within those
 types can be distinguish via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be. The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned; an option is provided to
+return comments for further processing. The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
 	struct token_config {
 		int ignored;	// bit set of ignored tokens.
+		int return_comments;
 		## token config parameters
 	};
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted. So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known. They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted. So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known. They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
 			/* found a longest-known-mark, still need to
 			 * check for comments
 			 */
-			if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-			    (ch == '/' || ch == '*')) {
+			if (is_comment(ignored, tk.txt)) {
 				/* Yes, this is a comment, not a '/' */
 				restore_unget_state(state);
 				tk.num = TK_error;
@@ -393,22 +394,21 @@
 		prev = ch;
 		save_unget_state(state);
 		ch = get_char(state);
+		if (n >= 0)
+			/* No need to worry about other token types */
+			continue;
 		if (!(ignored & (1<= 3 &&
 	    !(ignored & (1 << TK_multi_string)) &&
 	    is_quote(tk.txt.txt[0]) &&
 	    memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
 	TK_block_comment,
 
 ###### internal functions
 
-	static int is_line_comment(struct text txt)
+	static int is_line_comment(int ignored, struct text txt)
 	{
+		if (ignored & (1 << TK_line_comment))
+			return 0;
 		return (txt.len >= 1 && txt.txt[0] == '#') ||
 		       (txt.len >= 2 && txt.txt[0] == '/' &&
 			txt.txt[1] == '/');
 	}
 
-	static int is_block_comment(struct text txt)
+	static int is_block_comment(int ignored, struct text txt)
 	{
+		if (ignored & (1 << TK_block_comment))
+			return 0;
 		return txt.len >= 2 && txt.txt[0] == '/' &&
 		       txt.txt[1] == '*';
 	}
 
+	static int is_comment(int ignored, struct text txt)
+	{
+		return is_line_comment(ignored, txt) ||
+		       is_block_comment(ignored, txt);
+	}
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
-	if (is_line_comment(tk.txt)) {
+	if (is_line_comment(ignored, tk.txt)) {
 		while (!is_newline(ch) && !at_eon(state))
 			ch = get_char(state);
 		if (is_newline(ch))
 			unget_char(state);
 		close_token(state, &tk);
 		tk.num = TK_line_comment;
-		if (ignored & (1 << TK_line_comment))
+		if (!state->conf->return_comments)
 			continue;
 		return tk;
 	}
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
-	if (is_block_comment(tk.txt)) {
+	if (is_block_comment(ignored, tk.txt)) {
 		wchar_t prev;
 		int newlines = 0;
 		reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
 			if (!is_newline(ch))
 				tk.num = TK_error;
 		}
-		if (tk.num == TK_error ||
-		    !(ignored & (1 << TK_block_comment)))
+		if (tk.num == TK_error || state->conf->return_comments)
 			return tk;
 		continue;
 	}
@@ -716,6 +725,8 @@ ignored.
 	int	indent_level;
 	int	indent_sizes[20];
 
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
 #### Newlines
 
 Newlines can optionally be reported. Newlines within a block comment
@@ -796,10 +807,20 @@ Separately we need, on each call to `token_next`, to check if there
 are some delayed tokens and if so we need to advance the state
 information and return one token.
 
+###### internal functions
+
+	static int state_indent(struct token_state *state)
+	{
+		if (state->node == NULL)
+			return state->col;
+		return state->node->indent - state->node->needs_strip + state->col;
+	}
+
 ###### white space
 
+	if (is_newline(ch))
+		state_check_node(state);
 	if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 		int newlines = 0;
-		int was_son = at_son(state);
+		int was_nl = is_newline(ch);
 		if (ignored & (1<node->next &&
-			    state->node->next->indent > state->node->indent)
-				state->col = state->node->next->indent;
-			else
-				state->col = state->node->indent;
-		} else
+		if (ch != WEOF)
 			unget_char(state);
 		state->delayed_lines = newlines;
-		state->out_next = was_son;
+		state->out_next = !was_nl;
 		state->check_indent = 1;
 		continue;
 	}
@@ -833,7 +849,7 @@ information and return one token.
 
 ###### delayed tokens
 
 	if (state->check_indent || state->delayed_lines) {
-		if (state->col < state->indent_sizes[state->indent_level]) {
+		if (state_indent(state) < state->indent_sizes[state->indent_level]) {
 			if (!state->out_next &&
 			    !(ignored & (1<<TK_newline))) {
 				state->out_next = 1;
@@ -845,11 +861,12 @@ information and return one token.
tk.num = TK_out; return tk; } - if (state->col > state->indent_sizes[state->indent_level] && + if (state_indent(state) > state->indent_sizes[state->indent_level] && state->indent_level < sizeof(state->indent_sizes)-1) { state->indent_level += 1; - state->indent_sizes[state->indent_level] = state->col; - state->delayed_lines -= 1; + state->indent_sizes[state->indent_level] = state_indent(state); + if (state->delayed_lines) + state->delayed_lines -= 1; tk.num = TK_in; return tk; } @@ -874,11 +891,6 @@ tokens will continue to return the same end-of-file token. ###### white space if (ch == WEOF) { - if (state->col) { - state->col = 0; - state->check_indent = 1; - continue; - } tk.num = TK_eof; return tk; } @@ -938,7 +950,7 @@ a flag that tells us whether or not we need to strip. ###### internal functions - static int do_strip(struct token_state *state) + static void do_strip(struct token_state *state) { int indent = 0; if (state->node->needs_strip) { @@ -954,7 +966,28 @@ a flag that tells us whether or not we need to strip. n -= 4; } } - return indent; + } + + static void state_check_node(struct token_state *state) + { + if (!state->node) + return; + if (state->node->code.len > state->offset) + return; + + do + state->node = state->node->next; + while (state->node && state->node->code.txt == NULL); + state->offset = 0; + state->prev_offset = 0; + state->strip_offset = 0; + state->col = 0; + if (state->node == NULL) + return; + state->line = state->node->line_no; + do_strip(state); + state->col = state->node->needs_strip; + state->strip_offset = state->offset; } static wint_t get_char(struct token_state *state) @@ -963,19 +996,9 @@ a flag that tells us whether or not we need to strip. size_t n; mbstate_t mbstate; + state_check_node(state); if (state->node == NULL) return WEOF; - if (state->node->code.len <= state->offset) { - do - state->node = state->node->next; - while (state->node && state->node->code.txt == NULL); - state->offset = 0; - if (state->node == NULL) - return WEOF; - state->line = state->node->line_no; - state->col = do_strip(state); - state->strip_offset = state->offset; - } ## before get_char @@ -986,12 +1009,12 @@ a flag that tells us whether or not we need to strip. &mbstate); if (n == -2 || n == 0) { /* Not enough bytes - not really possible */ - next = '\n'; - state->offset = state->node->code.len; + next = '\n'; // NOTEST + state->offset = state->node->code.len; // NOTEST } else if (n == -1) { /* error */ - state->offset += 1; - next = 0x7f; // an illegal character + state->offset += 1; // NOTEST + next = 0x7f; // an illegal character // NOTEST } else state->offset += n; @@ -999,7 +1022,8 @@ a flag that tells us whether or not we need to strip. 
state->col += 1; } else if (is_newline(next)) { state->line += 1; - state->col = do_strip(state); + do_strip(state); + state->col = state->node->needs_strip; } else if (next == '\t') { state->col = indent_tab(state->col); } @@ -1211,7 +1235,8 @@ As well as getting tokens, we need to be able to create the memset(state, 0, sizeof(*state)); state->node = code; state->line = code->line_no; - state->col = do_strip(state); + do_strip(state); + state->col = state->node->needs_strip; state->strip_offset = state->offset; state->conf = conf; return state; @@ -1654,7 +1679,7 @@ Number parsing goes in `libnumber.c` ## number includes ## number functions -###### File: number.h +###### File: parse_number.h int number_parse(mpq_t num, char tail[3], struct text tok); ###### File: scanner.mk @@ -1938,7 +1963,7 @@ String parsing goes in `libstring.c` ## string functions ## string main -###### File: string.h +###### File: parse_string.h int string_parse(struct token *tok, char escape, struct text *str, char tail[3]); @@ -1967,8 +1992,8 @@ the tokens one per line. #include #include "mdcode.h" #include "scanner.h" - #include "number.h" - #include "string.h" + #include "parse_number.h" + #include "parse_string.h" static int errs; static void pr_err(char *msg) @@ -2021,13 +2046,17 @@ the tokens one per line. { "ignore-newline", 0, NULL, 'l'}, { "ignore-block-comment", 0, NULL, 'C'}, { "ignore-indent", 0, NULL, 'i'}, + { "return-comments", 0, NULL, 'r'}, { "file", 1, NULL, 'f'}, + { "section", 1, NULL, 's'}, { NULL, 0, NULL, 0}, }; - static const char options[] = "W:w:n:NIMSzclCif:"; + static const char options[] = "W:w:n:NIMSzclCirf:s:"; struct section *table, *s, *prev; int opt; + char *section_name = NULL; + int section_found = 0; setlocale(LC_ALL,""); while ((opt = getopt_long(argc, argv, options, long_options, NULL)) @@ -2045,7 +2074,9 @@ the tokens one per line. case 'C': conf.ignored |= 1 << TK_block_comment; break; case 'l': conf.ignored |= 1 << TK_newline; break; case 'i': conf.ignored |= 1 << TK_in; break; + case 'r': conf.return_comments = 1; break; case 'f': filename = optarg; break; + case 's': section_name = optarg; break; default: fprintf(stderr, "scanner: unknown option '%c'.\n", opt); exit(1); @@ -2082,6 +2113,12 @@ the tokens one per line. for (s = table; s; (code_free(s->code), prev = s, s = s->next, free(prev))) { + if (section_name && + (s->section.len != strlen(section_name) || + strncmp(s->section.txt, section_name, s->section.len) != 0)) + continue; + if (section_name) + section_found = 1; printf("Tokenizing: %.*s\n", s->section.len, s->section.txt); state = token_open(s->code, &conf); @@ -2124,6 +2161,10 @@ the tokens one per line. } if (conf.words_marks != known) free(conf.words_marks); + if (section_name && !section_found) { + fprintf(stderr, "scanner: section %s not found\n", section_name); + errs = 1; + } exit(!!errs); } ###### File: scanner.mk
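
As an illustration (this is not part of the patch above): the sketch below shows how a caller might use the new `return_comments` flag together with `token_open()`. It is a minimal sketch only - the helper name `dump_comments` is invented for the example, mdcode's `struct code_node` is assumed as the input type, `token_next()` is assumed to return a `struct token` by value as the test program in this file suggests, and error handling and teardown are omitted. Note that the comment token types must not also be set in `ignored`: with this patch, `is_line_comment()` and `is_block_comment()` refuse to recognise ignored comment types, so the text would instead be scanned as marks.

	#include <stdio.h>
	#include "mdcode.h"
	#include "scanner.h"

	/* Tokenize one section of code and print every comment found,
	 * quietly skipping all other tokens until end-of-file.
	 */
	static void dump_comments(struct code_node *code)
	{
		struct token_config conf = {
			.ignored         = 0,	/* keep comments recognisable */
			.return_comments = 1,	/* new option added by this patch */
		};
		struct token_state *state = token_open(code, &conf);
		struct token tk;

		do {
			tk = token_next(state);
			if (tk.num == TK_line_comment ||
			    tk.num == TK_block_comment)
				printf("%.*s\n", tk.txt.len, tk.txt.txt);
		} while (tk.num != TK_eof);
		/* teardown of "state" omitted for brevity */
	}

With `return_comments` left at zero, comments are still parsed (so the leading '/' of a comment introducer is never mis-read as a mark) but are silently discarded, matching the behaviour described in the hunks above.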