scanner: improve number parsing.

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index a5eeb1f128b5163b1715b263a033a3a64265c73a..e54dac686bb8906734ba3bf712df8c4aa763edc6 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -85,7 +85,6 @@ from token to token.
  ###### token_next init
         int ignored = state->conf->ignored;
  
-
  The different tokens are numbers, words, marks, strings, comments,
  newlines, EOF, and indents, each of which is examined in detail below.
  
@@ -120,7 +119,11 @@ To make matters worse, our language designer has decided to experiment
  with allowing commas to be used as the decimal indicator, and spaces
  to be used to separate groups of digits in large numbers.  Both of
  these can reasonably be restricted to appear between two digits, so we
-have to add that condition to our tests.
+have to add that condition to our tests.  For consistency we require
+every non-alphanumeric character to appear between two hex digits, with
+the exception that a sign can appear only after a 'p' or 'e', and a space
+can only appear between decimal digits.  Allowing a space before a
+letter easily leads to confusion, such as in `a < 3 and b < 4`.
  
  So we cannot just treat numbers as starting with a digit and being
  followed by some set of characters.  We need more structure than that.
@@ -128,13 +131,16 @@ followed by some set of characters.  We need more structure than that.
  So:
  
  - Numbers must start with a digit.
-- If the first digit is zero, the next character must be a base
-  signifier (one of `xob`) or a decimal marker (`.` or `,`).
-  In the first case the first `p` or `P` may be followed by a sign.
+- If the first digit is zero, the next character should be a base
+  signifier (one of `xob`) or a decimal marker (`.` or `,`), though this
+  isn't enforced at this stage.
+  In the first case only the first `p` or `P` may be followed by a sign.
  - If the number doesn't start with `0` followed by one of `xob`, the
    first `e` may be followed by a sign.
-- Any digit or hex digit may be followed by a space or underscore
-  providing that the subsequence character is also a (hex) digit.
+- A sign must always be followed by a digit.
+- Any digit may be followed by a space or underscore, and any hex digit
+  may be followed by an underscore, provided that the subsequent character
+  is also a digit (for space) or hex digit (for underscore).
    This rule will require an extra level of 'unget' to be
    supported when handling characters.
  - Otherwise any digits or ASCII letters are allowed.  We do not at
@@ -164,7 +170,7 @@ are declared to be a start character for words.
  ###### parse number
  
         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
-               int prev_special = 0;
+               int prev = 0;
                 int expect_p = 0;
                 int decimal_mark = 0;
                 if (ch == '0') {
@@ -177,43 +183,62 @@ are declared to be a start character for words.
                         int sign_ok = 0;
                         switch(expect_p) {
                         case 0:
-                               if (ch == 'e' || ch == 'E')
+                               if (ch == 'e' || ch == 'E') {
                                         sign_ok = 1;
+                                       decimal_mark = 1;
+                               }
                                 break;
                         case 1:
-                               if (ch == 'p' || ch == 'P')
+                               if (ch == 'p' || ch == 'P') {
                                         sign_ok = 1;
+                                       decimal_mark = 1;
+                               }
                                 break;
                         }
                         save_unget_state(state);
+                       prev = ch;
                         ch = get_char(state);
-                       if (iswalnum(ch)) {
-                               prev_special = 0;
+
+                       if (!iswalnum(prev)) {
+                               /* special characters, like separators and decimal marks
+                                * and signs, must be followed by a hexdigit, and the
+                                * space and signs must be followed by a decimal digit.
+                                */
+                               if (!iswxdigit(ch) ||
+                                  ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
+                                  (prev == ' ' && !iswdigit(ch))) {
+                                       /* don't want the new char or the special */
+                                       restore_unget_state(state);
+                                       break;
+                               }
+                       }
+                       if (iswalnum(ch))
                                 continue;
+
+                       if (!strchr(state->conf->number_chars, ch)) {
+                               /* non-number char */
+                               break;
                         }
                         if (ch == '+' || ch == '-') {
+                               /* previous must be 'e' or 'p' in appropriate context */
                                 if (!sign_ok)
                                         break;
                                 expect_p = -1;
+                       } else if (ch == ' ') {
+                               /* previous must be a digit */
+                               if (!iswdigit(prev))
+                                       break;
+                       } else {
+                               /* previous must be a hex digit */
+                               if (!iswxdigit(prev))
+                                       break;
                         }
                         if (ch == '.' || ch == ',') {
+                               /* only one of these permitted */
                                 if (decimal_mark)
                                         break;
                                 decimal_mark = 1;
                         }
-                       if (prev_special) {
-                               /* Don't allow that special char,
-                                * need two 'ungets'
-                                */
-                               restore_unget_state(state);
-                               break;
-                       }
-                       if (strchr(state->conf->number_chars, ch)) {
-                               prev_special = 1;
-                               continue;
-                       }
-                       /* non-number char */
-                       break;
                 }
                 /* We seem to have a "number" token */
                 unget_char(state);
@@ -368,7 +393,11 @@ Known marks are included in the same list as the list of known words.
                 prev = ch;
                 save_unget_state(state);
                 ch = get_char(state);
-               if (!(ignored && (1<<TK_string)) && is_quote(ch))
+               if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
+                       /* If strings are allowed, a quote (which isn't a known mark)
+                        * mustn't be treated as part of an unknown mark.  It can be
+                        * part of a multi-line string though.
+                        */
                         break;
                 if (prev == '#' && n < 0)
                         /* '#' is not a known mark, so assume it is a comment */
@@ -398,15 +427,6 @@ before assuming that we have an unknown mark
         ## parse comment
         ## unknown mark
  
-###### unknown mark
-       if (tk.txt.len) {
-               if (ignored & (1<<TK_mark))
-                       tk.num = TK_error;
-               else
-                       tk.num = TK_mark;
-               return tk;
-       }
-
  ### Strings
  
  Strings start with one of single quote, double quote, or back quote
@@ -436,7 +456,7 @@ token types.
  ###### internal functions
         static int is_quote(wchar_t ch)
         {
-               return ch == '\'' || ch == '"' || ch == '`';
+               return ch == '\'' || ch == '"' || ch == '`'; // "
         }
  
  #### Multi-line strings
@@ -520,6 +540,10 @@ If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
                                 break;
                         }
                 }
+               while (!at_eon(state) && (ch = get_char(state)) &&
+                                         iswalpha(ch))
+                       ;
+               unget_char(state);
                 close_token(state, &tk);
                 return tk;
         }
@@ -806,7 +830,6 @@ information and return one token.
                 continue;
         }
  
-
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
@@ -849,7 +872,6 @@ tokens will continue to return the same end-of-file token.
  ###### token types
         TK_eof,
  
-
  ###### white space
         if (ch == WEOF) {
                 if (state->col) {
@@ -868,7 +890,21 @@ If the token we have is not empty and `TK_mark` is allowed,
  we have an unknown mark, otherwise this must be an error.
  
  ###### unknown mark
-       /* one unknown character */
+
+       /* one unknown mark character */
+       if (tk.txt.len) {
+               close_token(state, &tk);
+               if (ignored & (1<<TK_mark))
+                       tk.num = TK_error;
+               else
+                       tk.num = TK_mark;
+               return tk;
+       }
+       /* Completely unrecognised character is next, possibly
+        * a digit and we are ignoring numbers.
+        * Whatever it is, make it an error.
+        */
+       get_char(state);
         close_token(state, &tk);
         tk.num = TK_error;
         return tk;
@@ -1064,8 +1100,11 @@ parsed too much already.  For that there is `reset_token`.
         static void close_token(struct token_state *state,
                                 struct token *tk)
         {
-               tk->txt.len = (state->node->code.txt + state->offset)
-                             - tk->txt.txt;
+               if (state->node != tk->node)
+                       tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
+               else
+                       tk->txt.len = (state->node->code.txt + state->offset)
+                                     - tk->txt.txt;
         }
  
         static void reset_token(struct token_state *state, struct token *tok)
@@ -1077,7 +1116,6 @@ parsed too much already.  For that there is `reset_token`.
                 tok->txt.len = 0;
         }
  
-
  Tokens make not cross into the next `code_node`, and some tokens can
  include the newline at the and of a `code_node`, we must be able to
  easily check if we have reached the end.  Equally we need to know if
@@ -1298,7 +1336,7 @@ tokens.  Now we just need C files to store them, and a mk file to make them.
  
  Converting a `TK_number` token to a numerical value is a slightly
  higher level task than lexical analysis, and slightly lower than
-grammar parsing, so put it here - as an index if you like.
+grammar parsing, so put it here - as an appendix if you like.
  
  Importantly it will be used by the same testing rig that is used for
  testing the token scanner.
@@ -1323,10 +1361,10 @@ had never been initialised.
                                 int *placesp)
         {
                 /* Accept digits up to 'base', ignore '_' and
-                * ' ' if they appear between two legal digits,
-                * and if `placesp` is not NULL, allow a single
-                * '.' or ',' and report the number of digits
-                * beyond there.
+                * (for base 10) ' ' if they appear between two
+                * legal digits, and if `placesp` is not NULL,
+                * allow a single '.' or ',' and report the number
+                * of digits beyond there.
                  * Return number of characters processed (p),
                  * or 0 if something illegal was found.
                  */
@@ -1339,7 +1377,7 @@ had never been initialised.
                         int dig;
                         char c = tok.txt[p];
  
-                       if (c == '_' || c == ' ') {
+                       if (c == '_' || (c == ' ' && base == 10)) {
                                 if (prev != Digit)
                                         goto bad;
                                 prev = Space;
@@ -1459,7 +1497,7 @@ we need to record the number of places.  We won't impose the number of
  places until we have the exponent as well.
  
  ###### number vars
-       int places =0;
+       int places = 0;
         mpz_t mant;
         int d;
  
@@ -1509,7 +1547,6 @@ character `expc`.
         tok.txt += d;
         tok.len -= d;
  
-
  Now that we have the mantissa and the exponent we can multiply them
  together, also allowing for the number of digits after the decimal
  mark.
@@ -1573,7 +1610,6 @@ Now we are ready to parse a number: the base, mantissa, and exponent.
  If all goes well we check for the possible trailing letters and
  return.  Return value is 1 for success and 0 for failure.
  
-
  ###### number functions
         int number_parse(mpq_t num, char tail[3], struct text tok)
         {
@@ -1908,7 +1944,6 @@ String parsing goes in `libstring.c`
         libstring.o : libstring.c
                 $(CC) $(CFLAGS) -c libstring.c
  
-
  ## Testing
  
  As "untested code is buggy code" we need a program to easily test
@@ -1926,6 +1961,7 @@ the tokens one per line.
         #include <stdio.h>
         #include <gmp.h>
         #include <locale.h>
+       #include <getopt.h>
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
@@ -1938,11 +1974,19 @@ the tokens one per line.
                 fprintf(stderr, "%s\n", msg);
         }
  
+       static int kcmp(const void *ap, const void *bp)
+       {
+               char * const *a = ap;
+               char * const *b = bp;
+               return strcmp(*a, *b);
+       }
+
         int main(int argc, char *argv[])
         {
                 int fd;
                 int len;
                 char *file;
+               char *filename = NULL;
                 struct token_state *state;
                 const char *known[] = {
                         "==",
@@ -1959,22 +2003,77 @@ the tokens one per line.
                         .words_marks = known,
                         .number_chars = "., _+-",
                         .known_count = sizeof(known)/sizeof(known[0]),
-                       .ignored = (0 << TK_line_comment)
-                                 |(0 << TK_block_comment),
+                       .ignored = 0,
+               };
+               static const struct option long_options[] = {
+                       { "word-start",         1, NULL, 'W'},
+                       { "word-cont",          1, NULL, 'w'},
+                       { "number-chars",       1, NULL, 'n'},
+                       { "ignore-numbers",     0, NULL, 'N'},
+                       { "ignore-ident",       0, NULL, 'I'},
+                       { "ignore-marks",       0, NULL, 'M'},
+                       { "ignore-strings",     0, NULL, 'S'},
+                       { "ignore-multi-strings",0, NULL, 'z'},
+                       { "ignore-line-comment",0, NULL, 'c'},
+                       { "ignore-newline",     0, NULL, 'l'},
+                       { "ignore-block-comment", 0, NULL, 'C'},
+                       { "ignore-indent",      0, NULL, 'i'},
+                       { "file",               1, NULL, 'f'},
+                       { NULL,                 0, NULL, 0},
                 };
+               static const char options[] = "W:w:n:NIMSzclCif:";
+
                 struct section *table, *s, *prev;
+               int opt;
+
                 setlocale(LC_ALL,"");
-               if (argc != 2) {
-                       fprintf(stderr, "Usage: scanner file\n");
-                       exit(2);
+               while ((opt = getopt_long(argc, argv, options, long_options, NULL))
+                      != -1) {
+                       switch(opt) {
+                       case 'W': conf.word_start = optarg; break;
+                       case 'w': conf.word_cont = optarg; break;
+                       case 'n': conf.number_chars = optarg; break;
+                       case 'N': conf.ignored |= 1 << TK_number; break;
+                       case 'I': conf.ignored |= 1 << TK_ident; break;
+                       case 'M': conf.ignored |= 1 << TK_mark; break;
+                       case 'S': conf.ignored |= 1 << TK_string; break;
+                       case 'z': conf.ignored |= 1 << TK_multi_string; break;
+                       case 'c': conf.ignored |= 1 << TK_line_comment; break;
+                       case 'C': conf.ignored |= 1 << TK_block_comment; break;
+                       case 'l': conf.ignored |= 1 << TK_newline; break;
+                       case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'f': filename = optarg; break;
+                       default: fprintf(stderr, "scanner: unknown option '%c'.\n",
+                                        opt);
+                               exit(1);
+                       }
+               }
+
+               if (optind < argc) {
+                       const char **wm = calloc(argc - optind, sizeof(char*));
+                       int i;
+                       for (i = optind; i < argc; i++)
+                               wm[i - optind] = argv[i];
+                       qsort(wm, argc-optind, sizeof(char*), kcmp);
+                       conf.words_marks = wm;
+                       conf.known_count = argc - optind;
                 }
-               fd = open(argv[1], O_RDONLY);
+
+               if (filename)
+                       fd = open(filename, O_RDONLY);
+               else
+                       fd = 0;
                 if (fd < 0) {
                         fprintf(stderr, "scanner: cannot open %s: %s\n",
-                               argv[1], strerror(errno));
+                               filename, strerror(errno));
                         exit(1);
                 }
                 len = lseek(fd, 0, 2);
+               if (len <= 0) {
+                       fprintf(stderr,"scanner: %s is empty or not seekable\n",
+                               filename ?: "stdin");
+                       exit(1);
+               }
                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                 table = code_extract(file, file+len, pr_err);
  
@@ -2018,7 +2117,10 @@ the tokens one per line.
                                 if (tk.num == TK_eof)
                                         break;
                         }
+                       token_close(state);
                 }
+               if (conf.words_marks != known)
+                       free(conf.words_marks);
                 exit(!!errs);
         }
  ###### File: scanner.mk