From: NeilBrown <neil@brown.name>
Date: Wed, 29 May 2019 11:51:22 +0000 (+1000)
Subject: scanner: improve number parsing.
X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=commitdiff_plain;h=6bf80b764821a6f1b335996b756e4844ff1bdd71

scanner: improve number parsing.

In particular, space must be preceded and followed by a digit
(not a letter).
Also '_' must be preceded and followed by a hex digit, but this
wasn't enforced.

Add tests to check on numbers more thoroughly.

Signed-off-by: NeilBrown <neil@brown.name>
---

diff --git a/csrc/scanner-tests.mdc b/csrc/scanner-tests.mdc
index 3496f79..95c3556 100644
--- a/csrc/scanner-tests.mdc
+++ b/csrc/scanner-tests.mdc
@@ -40,7 +40,7 @@ about each test.
 		        coverage/scanner.mdc.gcov
 		@rm -f .tmp*
 
-	coverage_scanner: scanner.c libscanner.c
+	coverage_scanner: scanner.c libscanner.c libmdcode.o libnumber.o libstring.o
 		$(CC) $(CFLAGS) --coverage -fprofile-dir=coverage -o coverage_scanner \
 			scanner.c libscanner.c \
 			libmdcode.o libnumber.o libstring.o -licuuc -lgmp
@@ -48,6 +48,7 @@ about each test.
 ## Basic tests
 
 Some simple tests... maybe all tests are simple.
+Include a special test for numbers, as they are interesting.
 
 ###### test list
 	scanner_tests += "test1,if,then,+,-"
@@ -144,11 +145,11 @@ Some simple tests... maybe all tests are simple.
 	18:0 newline()
 	18:0 number(1234)  1234
 	18:4 mark(,)
-	18:7 number(1.234 )  617/500
+	18:7 number(1.234)  617/500
 	18:13 -
 	18:14 number(123.456e45)  123456000000000000000000000000000000000000000000
 	19:0 newline()
-	19:0 number(0x1234 )  4660
+	19:0 number(0x1234)  4660
 	19:7 +
 	19:10 number(0x543p+3)  10776
 	20:0 newline()
@@ -231,11 +232,11 @@ Some simple tests... maybe all tests are simple.
 	18:0 newline()
 	18:0 number(1234)  1234
 	18:4 mark(,)
-	18:7 number(1.234 )  617/500
+	18:7 number(1.234)  617/500
 	18:13 -
 	18:14 number(123.456e45)  123456000000000000000000000000000000000000000000
 	19:0 newline()
-	19:0 number(0x1234 )  4660
+	19:0 number(0x1234)  4660
 	19:7 +
 	19:10 number(0x543p+3)  10776
 	20:0 newline()
@@ -307,11 +308,11 @@ Some simple tests... maybe all tests are simple.
 	17:21 newline()
 	18:0 number(1234)  1234
 	18:4 mark(,)
-	18:7 number(1.234 )  617/500
+	18:7 number(1.234)  617/500
 	18:13 -
 	18:14 number(123.456e45)  123456000000000000000000000000000000000000000000
 	18:24 newline()
-	19:0 number(0x1234 )  4660
+	19:0 number(0x1234)  4660
 	19:7 +
 	19:10 number(0x543p+3)  10776
 	19:18 newline()
@@ -367,10 +368,10 @@ Some simple tests... maybe all tests are simple.
 	17:16 ident(while)
 	18:0 number(1234)  1234
 	18:4 mark(,)
-	18:7 number(1.234 )  617/500
+	18:7 number(1.234)  617/500
 	18:13 -
 	18:14 number(123.456e45)  123456000000000000000000000000000000000000000000
-	19:0 number(0x1234 )  4660
+	19:0 number(0x1234)  4660
 	19:7 +
 	19:10 number(0x543p+3)  10776
 	20:0 string("This is a string")  This is a string
@@ -419,10 +420,10 @@ Some simple tests... maybe all tests are simple.
 	17:16 ident(while)
 	18:0 number(1234)  1234
 	18:4 mark(,)
-	18:7 number(1.234 )  617/500
+	18:7 number(1.234)  617/500
 	18:13 -
 	18:14 number(123.456e45)  123456000000000000000000000000000000000000000000
-	19:0 number(0x1234 )  4660
+	19:0 number(0x1234)  4660
 	19:7 +
 	19:10 number(0x543p+3)  10776
 	20:0 string("This is a string")  This is a string
@@ -468,10 +469,10 @@ Some simple tests... maybe all tests are simple.
 	17:16 ident(while)
 	18:0 number(1234)  1234
 	18:4 mark(,)
-	18:7 number(1.234 )  617/500
+	18:7 number(1.234)  617/500
 	18:13 -
 	18:14 number(123.456e45)  123456000000000000000000000000000000000000000000
-	19:0 number(0x1234 )  4660
+	19:0 number(0x1234)  4660
 	19:7 +
 	19:10 number(0x543p+3)  10776
 	20:0 mark(")
@@ -513,6 +514,91 @@ Some simple tests... maybe all tests are simple.
 	31:12 ident(divisor)
 	32:0 eof()
 
+###### test list
+	scanner_tests += "testnum"
+
+###### test: testnum
+	12345
+	1234.56
+	1234.56e7
+	1234.56e-7
+	0x1234
+	0x123,456
+	0o777
+	0o111.111p4
+	0b11011110p3
+
+	123 456 789
+	0x1234_5678_9abc
+
+	"Now for some non-number"
+	1234p4
+	12-34
+	01234
+	0c1234
+	123.456e1a
+	123.e4
+	0x123 456
+	0b1234
+	123_345_.34
+	.75
+
+###### output: testnum
+	Tokenizing: 
+	2:0 number(12345)  12345
+	3:0 newline()
+	3:0 number(1234.56)  30864/25
+	4:0 newline()
+	4:0 number(1234.56e7)  12345600000
+	5:0 newline()
+	5:0 number(1234.56e-7)  1929/15625000
+	6:0 newline()
+	6:0 number(0x1234)  4660
+	7:0 newline()
+	7:0 number(0x123,456)  596523/2048
+	8:0 newline()
+	8:0 number(0o777)  511
+	9:0 newline()
+	9:0 number(0o111.111p4)  37449/32
+	10:0 newline()
+	10:0 number(0b11011110p3)  1776
+	12:0 newline()
+	12:0 newline()
+	12:0 number(123 456 789)  123456789
+	13:0 newline()
+	13:0 number(0x1234_5678_9abc)  20015998343868
+	15:0 newline()
+	15:0 newline()
+	15:0 string("Now for some non-..)  Now for some non-n..
+	16:0 newline()
+	16:0 number(1234p4) BAD NUMBER
+	17:0 newline()
+	17:0 number(12)  12
+	17:2 mark(-)
+	17:3 number(34)  34
+	18:0 newline()
+	18:0 number(01234) BAD NUMBER
+	19:0 newline()
+	19:0 number(0c1234) BAD NUMBER
+	20:0 newline()
+	20:0 number(123.456e1a) a 30864/25
+	21:0 newline()
+	21:0 number(123.e4)  1230000
+	22:0 newline()
+	22:0 number(0x123 456) BAD NUMBER
+	23:0 newline()
+	23:0 number(0b1234) BAD NUMBER
+	24:0 newline()
+	24:0 number(123_345)  123345
+	24:7 ident(_)
+	24:8 mark(.)
+	24:9 number(34)  34
+	25:0 newline()
+	25:0 mark(.)
+	25:1 number(75)  75
+	26:0 newline()
+	26:0 eof()
+
 ## Error tests
 
 Now to test for some errors ... though things I thought would be errors
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc
index 42001ff..e54dac6 100644
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -119,7 +119,11 @@ To make matters worse, our language designer has decided to experiment
 with allowing commas to be used as the decimal indicator, and spaces
 to be used to separate groups of digits in large numbers.  Both of
 these can reasonably be restricted to appear between two digits, so we
-have to add that condition to our tests.
+have to add that condition to our tests.  For consistency we require
+every non-alpha-numeric to appear between two hex digits, with the
+exception that a sign can appear only after a 'p' or 'e', and a space
+can only appear between decimal digits.  Allowing a space before a
+letter easily leads to confusion, such as in `a < 3 and b < 4`.
 
 So we cannot just treat numbers as starting with a digit and being
 followed by some set of characters.  We need more structure than that.
@@ -127,13 +131,16 @@ followed by some set of characters.  We need more structure than that.
 So:
 
 - Numbers must start with a digit.
-- If the first digit is zero, the next character must be a base
-  signifier (one of `xob`) or a decimal marker (`.` or `,`).
-  In the first case the first `p` or `P` may be followed by a sign.
+- If the first digit is zero, the next character should be a base
+  signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
+  enforced at this stage).
+  In the first case only the first `p` or `P` may be followed by a sign.
 - If the number doesn't start with `0` followed by one of `xob`, the
   first `e` may be followed by a sign.
-- Any digit or hex digit may be followed by a space or underscore
-  providing that the subsequence character is also a (hex) digit.
+- A sign must always be followed by a digit.
+- Any digit may be followed by a space or underscore and any hex digit
+  may be followed by an underscore, providing that the subsequent character
+  is also a digit (for space) or hex digit (for underscore).
   This rule will require an extra level of 'unget' to be
   supported when handling characters.
 - Otherwise any digits or ASCII letters are allowed.  We do not at
@@ -163,7 +170,7 @@ are declared to be a start character for words.
 ###### parse number
 
 	if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
-		int prev_special = 0;
+		int prev = 0;
 		int expect_p = 0;
 		int decimal_mark = 0;
 		if (ch == '0') {
@@ -176,43 +183,62 @@ are declared to be a start character for words.
 			int sign_ok = 0;
 			switch(expect_p) {
 			case 0:
-				if (ch == 'e' || ch == 'E')
+				if (ch == 'e' || ch == 'E') {
 					sign_ok = 1;
+					decimal_mark = 1;
+				}
 				break;
 			case 1:
-				if (ch == 'p' || ch == 'P')
+				if (ch == 'p' || ch == 'P') {
 					sign_ok = 1;
+					decimal_mark = 1;
+				}
 				break;
 			}
 			save_unget_state(state);
+			prev = ch;
 			ch = get_char(state);
-			if (iswalnum(ch)) {
-				prev_special = 0;
+
+			if (!iswalnum(prev)) {
+				/* special characters, like separators and decimal marks
+				 * and signs, must be followed by a hexdigit, and the
+				 * space and signs must be followed by a decimal digit.
+				 */
+				if (!iswxdigit(ch) ||
+				   ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
+				   (prev == ' ' && !iswdigit(ch))) {
+					/* don't want the new char or the special */
+					restore_unget_state(state);
+					break;
+				}
+			}
+			if (iswalnum(ch))
 				continue;
+
+			if (!strchr(state->conf->number_chars, ch)) {
+				/* non-number char */
+				break;
 			}
 			if (ch == '+' || ch == '-') {
+				/* previous must be 'e' or 'p' in appropriate context */
 				if (!sign_ok)
 					break;
 				expect_p = -1;
+			} else if (ch == ' ') {
+				/* previous must be a digit */
+				if (!iswdigit(prev))
+					break;
+			} else {
+				/* previous must be a hex digit */
+				if (!iswxdigit(prev))
+					break;
 			}
 			if (ch == '.' || ch == ',') {
+				/* only one of these permitted */
 				if (decimal_mark)
 					break;
 				decimal_mark = 1;
 			}
-			if (prev_special) {
-				/* Don't allow that special char,
-				 * need two 'ungets'
-				 */
-				restore_unget_state(state);
-				break;
-			}
-			if (strchr(state->conf->number_chars, ch)) {
-				prev_special = 1;
-				continue;
-			}
-			/* non-number char */
-			break;
 		}
 		/* We seem to have a "number" token */
 		unget_char(state);
@@ -1310,7 +1336,7 @@ tokens.  Now we just need C files to store them, and a mk file to make them.
 
 Converting a `TK_number` token to a numerical value is a slightly
 higher level task than lexical analysis, and slightly lower than
-grammar parsing, so put it here - as an index if you like.
+grammar parsing, so put it here - as an appendix if you like.
 
 Importantly it will be used by the same testing rig that is used for
 testing the token scanner.
@@ -1335,10 +1361,10 @@ had never been initialised.
 	                        int *placesp)
 	{
 		/* Accept digits up to 'base', ignore '_' and
-		 * ' ' if they appear between two legal digits,
-		 * and if `placesp` is not NULL, allow a single
-		 * '.' or ',' and report the number of digits
-		 * beyond there.
+		 * (for base 10) ' ' if they appear between two
+		 * legal digits, and if `placesp` is not NULL,
+		 * allow a single '.' or ',' and report the number
+		 * of digits beyond there.
 		 * Return number of characters processed (p),
 		 * or 0 if something illegal was found.
 		 */
@@ -1351,7 +1377,7 @@ had never been initialised.
 			int dig;
 			char c = tok.txt[p];
 
-			if (c == '_' || c == ' ') {
+			if (c == '_' || (c == ' ' && base == 10)) {
 				if (prev != Digit)
 					goto bad;
 				prev = Space;
@@ -1471,7 +1497,7 @@ we need to record the number of places.  We won't impose the number of
 places until we have the exponent as well.
 
 ###### number vars
-	int places =0;
+	int places = 0;
 	mpz_t mant;
 	int d;