X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fscanner.mdc;h=9a1ea7144b612a6624e39d799921c37682d46c99;hb=5513fc2e3fb56bdf2292e834077e0c33f9a5c2a9;hp=7e33d0cbc1a89126706d2f72ce1f2351e0d395ee;hpb=4d085c0f91408abb43eeeddd022b13569e3682a4;p=ocean diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc index 7e33d0c..9a1ea71 100644 --- a/csrc/scanner.mdc +++ b/csrc/scanner.mdc @@ -19,7 +19,7 @@ The text is assumed to be UTF-8 though some matching assumes the ASCII subset. If the text provided does not conform to UTF-8 an error will be reported and some number of bytes will be skipped. -###### includes +###### public types #include #include #include @@ -90,7 +90,7 @@ The different tokens are numbers, words, marks, strings, comments, newlines, EOF, and indents, each of which is examined in detail below. There are various cases where no token can be found in part of the -input. All of these will be reported as an `TK_error` token. +input. All of these will be reported as a `TK_error` token. It is possible to declare a number of strings which form distinct tokens (rather than being grouped as e.g. 'word'). These are given @@ -106,7 +106,7 @@ token numbers from `TK_reserved` upwards. ### Numbers Numbers are the messiest tokens to parse, primarily because they can -contain characters that also have meaning outside of number and, +contain characters that also have meaning outside of numbers and, particularly, immediately after numbers. The obvious example is the '`-`' sign. It can come inside a number for @@ -177,11 +177,11 @@ are declared to be a start character for words. int sign_ok = 0; switch(expect_p) { case 0: - if (ch == 'e') + if (ch == 'e' || ch == 'E') sign_ok = 1; break; case 1: - if (ch == 'p') + if (ch == 'p' || ch == 'P') sign_ok = 1; break; } @@ -260,6 +260,9 @@ and the length of the list must be given (`known_count`). Tokens matching these known words are reported as the index of the list added to `TK_reserved`. +If identifiers are ignored, then any word which is not listed as a +known word results in an error. + ###### token config parameters const char **words_marks; int known_count; @@ -321,10 +324,17 @@ below before giving up and assuming an unknown mark. If an unknown mark contains a quote character or a comment marker, and that token is not being ignored, then we terminate the unknown mark -before that quote or comment. This ensure that an unknown mark +before that quote or comment. This ensures that an unknown mark immediately before a string is handled correctly. -If `TK_mark` is ignored, then unknown marks as returned as an error. +If the first character of a comment marker (i.e. '/') is a known mark, +the above rules would suggest that the start of a comment would be +parsed as that mark, which is not what is wanted. So the introductory +sequences for a comment ("//" and "/*") are treated as +partially-known. They prevent the leading "/" from being a mark by +itself, but do not actually constitute a stand-alone mark. + +If `TK_mark` is ignored, then unknown marks are returned as errors. ###### token types TK_mark, @@ -341,31 +351,56 @@ Known marks are included in the same list as the list of known words. 
if (n >= 0) tk.num = TK_reserved + n; else if (tk.num != TK_error) { - /* found a longest-known-mark */ + /* found a longest-known-mark, still need to + * check for comments + */ + if (tk.txt.len == 2 && tk.txt.txt[0] == '/' && + (ch == '/' || ch == '*')) { + /* Yes, this is a comment, not a '/' */ + restore_unget_state(state); + tk.num = TK_error; + break; + } unget_char(state); close_token(state, &tk); return tk; } prev = ch; - if (prev == '/') - save_unget_state(state); + save_unget_state(state); ch = get_char(state); - if (!(ignored && (1<delayed_lines = newlines; - state->undent_next = was_son; + state->out_next = was_son; state->check_indent = 1; continue; } @@ -773,15 +819,15 @@ information and return one token. if (state->check_indent || state->delayed_lines) { if (state->col < state->indent_sizes[state->indent_level]) { - if (!state->undent_next && + if (!state->out_next && !(ignored & (1<undent_next = 1; + state->out_next = 1; tk.num = TK_newline; return tk; } state->indent_level -= 1; - state->undent_next = 0; - tk.num = TK_undent; + state->out_next = 0; + tk.num = TK_out; return tk; } if (state->col > state->indent_sizes[state->indent_level] && @@ -789,7 +835,7 @@ information and return one token. state->indent_level += 1; state->indent_sizes[state->indent_level] = state->col; state->delayed_lines -= 1; - tk.num = TK_indent; + tk.num = TK_in; return tk; } state->check_indent = 0; @@ -814,6 +860,11 @@ tokens will continue to return the same end-of-file token. ###### white space if (ch == WEOF) { + if (state->col) { + state->col = 0; + state->check_indent = 1; + continue; + } tk.num = TK_eof; return tk; } @@ -858,19 +909,23 @@ a flag that tells us whether or not we need to strip. ###### internal functions - static void do_strip(struct token_state *state) + static int do_strip(struct token_state *state) { + int indent = 0; if (state->node->needs_strip) { int n = 4; while (n && state->node->code.txt[state->offset] == ' ') { + indent += 1; state->offset += 1; n -= 1; } while (n == 4 && state->node->code.txt[state->offset] == '\t') { + indent = indent_tab(indent); state->offset += 1; n -= 4; } } + return indent; } static wint_t get_char(struct token_state *state) @@ -888,9 +943,8 @@ a flag that tells us whether or not we need to strip. state->offset = 0; if (state->node == NULL) return WEOF; - do_strip(state); state->line = state->node->line_no; - state->col = state->node->indent; + state->col = do_strip(state); } ## before get_char @@ -915,8 +969,7 @@ a flag that tells us whether or not we need to strip. state->col += 1; } else if (is_newline(next)) { state->line += 1; - state->col = state->node->indent; - do_strip(state); + state->col = do_strip(state); } else if (next == '\t') { state->col = indent_tab(state->col); } @@ -1019,8 +1072,11 @@ parsed too much already. For that there is `reset_token`. static void close_token(struct token_state *state, struct token *tk) { - tk->txt.len = (state->node->code.txt + state->offset) - - tk->txt.txt; + if (state->node != tk->node) + tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt); + else + tk->txt.len = (state->node->code.txt + state->offset) + - tk->txt.txt; } static void reset_token(struct token_state *state, struct token *tok) @@ -1090,7 +1146,7 @@ searching for. Now we have all the bits there is just one section missing: combining all the token parsing code into one block. 
-The handling of delayed tokens (newlines, indents, undents) must come +The handling of delayed tokens (Newlines, INs, OUTs) must come first before we try getting another character. Then we parse all the test, making sure that we check for known marks @@ -1109,9 +1165,6 @@ loop. ## parse number ## parse word ## parse mark - ## parse string - ## parse comment - ## unknown mark ### Start and stop @@ -1129,9 +1182,8 @@ As well as getting tokens, we need to be able to create the memset(state, 0, sizeof(*state)); state->node = code; state->line = code->line_no; - state->col = code->indent; + state->col = do_strip(state); state->conf = conf; - do_strip(state); return state; } void token_close(struct token_state *state) @@ -1194,8 +1246,8 @@ so that it can be used to tracing processed strings too. [TK_multi_string] = "mstring", [TK_line_comment] = "lcomment", [TK_block_comment] = "bcomment", - [TK_indent] = "indent", - [TK_undent] = "undent", + [TK_in] = "in", + [TK_out] = "out", [TK_newline] = "newline", [TK_eof] = "eof", [TK_error] = "ERROR", @@ -1205,8 +1257,8 @@ so that it can be used to tracing processed strings too. default: /* known word or mark */ fprintf(f, "%.*s", tok.txt.len, tok.txt.txt); break; - case TK_indent: - case TK_undent: + case TK_in: + case TK_out: case TK_newline: case TK_eof: /* No token text included */ @@ -1741,7 +1793,7 @@ required indent is found. if (c == ' ') skipped += 1; else if (c == '\t') - skipped = indent_tab(c); + skipped = indent_tab(skipped); else break; i+= 1; @@ -1885,6 +1937,7 @@ the tokens one per line. #include #include #include + #include #include "mdcode.h" #include "scanner.h" #include "number.h" @@ -1897,11 +1950,19 @@ the tokens one per line. fprintf(stderr, "%s\n", msg); } + static int kcmp(const void *ap, const void *bp) + { + char * const *a = ap; + char * const *b = bp; + return strcmp(*a, *b); + } + int main(int argc, char *argv[]) { int fd; int len; char *file; + char *filename = NULL; struct token_state *state; const char *known[] = { "==", @@ -1918,22 +1979,77 @@ the tokens one per line. 
.words_marks = known, .number_chars = "., _+-", .known_count = sizeof(known)/sizeof(known[0]), - .ignored = (0 << TK_line_comment) - |(0 << TK_block_comment), + .ignored = 0, + }; + static const struct option long_options[] = { + { "word-start", 1, NULL, 'W'}, + { "word-cont", 1, NULL, 'w'}, + { "number-chars", 1, NULL, 'n'}, + { "ignore-numbers", 0, NULL, 'N'}, + { "ignore-ident", 0, NULL, 'I'}, + { "ignore-marks", 0, NULL, 'M'}, + { "ignore-strings", 0, NULL, 'S'}, + { "ignore-multi-strings",0, NULL, 'z'}, + { "ignore-line-comment",0, NULL, 'c'}, + { "ignore-newline", 0, NULL, 'l'}, + { "ignore-block-comment", 0, NULL, 'C'}, + { "ignore-indent", 0, NULL, 'i'}, + { "file", 1, NULL, 'f'}, + { NULL, 0, NULL, 0}, }; + static const char options[] = "W:w:n:NIMSzclCif:"; + struct section *table, *s, *prev; + int opt; + setlocale(LC_ALL,""); - if (argc != 2) { - fprintf(stderr, "Usage: scanner file\n"); - exit(2); + while ((opt = getopt_long(argc, argv, options, long_options, NULL)) + != -1) { + switch(opt) { + case 'W': conf.word_start = optarg; break; + case 'w': conf.word_cont = optarg; break; + case 'n': conf.number_chars = optarg; break; + case 'N': conf.ignored |= 1 << TK_number; break; + case 'I': conf.ignored |= 1 << TK_ident; break; + case 'M': conf.ignored |= 1 << TK_mark; break; + case 'S': conf.ignored |= 1 << TK_string; break; + case 'z': conf.ignored |= 1 << TK_multi_string; break; + case 'c': conf.ignored |= 1 << TK_line_comment; break; + case 'C': conf.ignored |= 1 << TK_block_comment; break; + case 'l': conf.ignored |= 1 << TK_newline; break; + case 'i': conf.ignored |= 1 << TK_in; break; + case 'f': filename = optarg; break; + default: fprintf(stderr, "scanner: unknown option '%c'.\n", + opt); + exit(1); + } } - fd = open(argv[1], O_RDONLY); + + if (optind < argc) { + const char **wm = calloc(argc - optind, sizeof(char*)); + int i; + for (i = optind; i < argc; i++) + wm[i - optind] = argv[i]; + qsort(wm, argc-optind, sizeof(char*), kcmp); + conf.words_marks = wm; + conf.known_count = argc - optind; + } + + if (filename) + fd = open(filename, O_RDONLY); + else + fd = 0; if (fd < 0) { fprintf(stderr, "scanner: cannot open %s: %s\n", - argv[1], strerror(errno)); + filename, strerror(errno)); exit(1); } len = lseek(fd, 0, 2); + if (len <= 0) { + fprintf(stderr,"scanner: %s is empty or not seekable\n", + filename ?: "stdin"); + exit(1); + } file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); table = code_extract(file, file+len, pr_err); @@ -1977,7 +2093,10 @@ the tokens one per line. if (tk.num == TK_eof) break; } + token_close(state); } + if (conf.words_marks != known) + free(conf.words_marks); exit(!!errs); } ###### File: scanner.mk @@ -1989,4 +2108,3 @@ the tokens one per line. libmdcode.o libnumber.o libstring.o -licuuc -lgmp scanner.o : scanner.c $(CC) $(CFLAGS) -c scanner.c -
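
The hunk near the top of this patch that accepts 'E' and 'P' as well as 'e' and 'p' enforces a simple rule: a '+' or '-' only continues a number when it immediately follows an exponent introducer, 'e'/'E' in a decimal literal or 'p'/'P' in a hex literal (where 'e' is just another digit). The following stand-alone sketch is not part of scanner.mdc and the helper name is invented; it only models that rule so it can be exercised in isolation.

	#include <stdio.h>

	/* Hypothetical helper, not taken from scanner.mdc: decide whether a
	 * '+' or '-' at position i continues the number literal in txt.
	 * A sign is only allowed directly after an exponent introducer:
	 * 'e'/'E' for decimal literals, 'p'/'P' for hex literals.
	 */
	static int sign_continues_number(const char *txt, int i)
	{
		int is_hex = txt[0] == '0' && (txt[1] == 'x' || txt[1] == 'X');
		char prev;

		if (i == 0)
			return 0;
		prev = txt[i-1];
		if (is_hex)
			return prev == 'p' || prev == 'P';
		else
			return prev == 'e' || prev == 'E';
	}

	int main(void)
	{
		const char *samples[] = { "1e-5", "1E-5", "0x1p-3", "0x1e-1", "12-3" };
		int i, j;

		for (i = 0; i < 5; i++)
			for (j = 0; samples[i][j]; j++)
				if (samples[i][j] == '+' || samples[i][j] == '-')
					printf("%-7s sign at %d %s the number\n",
					       samples[i], j,
					       sign_continues_number(samples[i], j)
					       ? "continues" : "ends");
		return 0;
	}

Running it reports that the '-' in `0x1e-1` ends the number, because in a hex literal 'e' is a digit rather than an exponent marker, which matches the distinction the `expect_p` switch in the patch draws between decimal and hex numbers.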
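
Another group of hunks changes `do_strip()` to return the amount of indentation it stripped, so that `state->col` can be seeded from its return value rather than from `node->indent`. That behaviour is easy to model outside the scanner. In the sketch below, `strip_one_level()` is an invented stand-alone counterpart of `do_strip()`, and `indent_tab()` is assumed to advance to the next multiple of eight; the real `indent_tab()` is defined elsewhere in scanner.mdc, so that definition is only a stand-in.

	#include <stdio.h>

	/* Assumed for this sketch: a tab advances to the next tab stop,
	 * with a stop every eight columns.  scanner.mdc has its own
	 * indent_tab(); this one merely stands in for it.
	 */
	static int indent_tab(int indent)
	{
		return (indent | 7) + 1;
	}

	/* Stand-alone model of the reworked do_strip(): consume one level
	 * of document indentation (up to four spaces, or a single tab if
	 * no spaces were found) and report the column width it occupied,
	 * so the caller can use the return value as the starting column.
	 */
	static int strip_one_level(const char *line, int *offset)
	{
		int indent = 0;
		int n = 4;

		while (n && line[*offset] == ' ') {
			indent += 1;
			*offset += 1;
			n -= 1;
		}
		while (n == 4 && line[*offset] == '\t') {
			indent = indent_tab(indent);
			*offset += 1;
			n -= 4;
		}
		return indent;
	}

	int main(void)
	{
		const char *samples[] = { "    code", "\tcode", "  code" };
		int i;

		for (i = 0; i < 3; i++) {
			int off = 0;
			int col = strip_one_level(samples[i], &off);
			printf("stripped %d byte(s), code starts at column %d\n",
			       off, col);
		}
		return 0;
	}

With the three samples in `main()` the reported starting columns are 4, 8 and 2: a tab and four spaces occupy different widths even though both count as one stripped level, which is why returning the width from the strip step is more reliable than a fixed per-node indent.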
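
In the test program, extra known words and marks given on the command line are collected into an array and sorted with `qsort()` before being handed to the scanner, presumably so the lookup can rely on a sorted list. Because `qsort()` passes its comparator pointers to the array *elements*, and each element here is itself a `char *`, the comparator receives `char **` values and must dereference once before calling `strcmp()`; that is all `kcmp()` does. Below is a minimal stand-alone demonstration of the same idiom, with an invented word list.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Comparator for an array of strings: each argument points at an
	 * element of the array, i.e. at a char*, so dereference once
	 * before comparing the strings themselves.
	 */
	static int kcmp(const void *ap, const void *bp)
	{
		char * const *a = ap;
		char * const *b = bp;
		return strcmp(*a, *b);
	}

	int main(void)
	{
		char *words[] = { "if", "then", "==", "and", "else" };
		int n = sizeof(words) / sizeof(words[0]);
		int i;

		qsort(words, n, sizeof(words[0]), kcmp);
		for (i = 0; i < n; i++)
			printf("%s\n", words[i]);
		return 0;
	}

Passing `strcmp` itself as the comparator would treat the element pointers as the strings to compare, so the small wrapper is required whenever the array holds pointers rather than the character data itself.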