parsergen: revert the allowance for "non-critical" conflicts.

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index 42001ff895b0196a7ab83e30afb5c50b5eae1cf0..fcfadca97c7803ede6422d5bc6e06493691f58d3 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -119,7 +119,11 @@ To make matters worse, our language designer has decided to experiment
  with allowing commas to be used as the decimal indicator, and spaces
  to be used to separate groups of digits in large numbers.  Both of
  these can reasonably be restricted to appear between two digits, so we
-have to add that condition to our tests.
+have to add that condition to our tests.  For consistency we require
+every non-alpha-numeric to appear between two hex digits, with the
+exception that a sign can appear only after a 'p' or 'e', and a space
+can only appear between decimal digits.  Allowing a space before a
+letter easily leads to confusion, such as in `a < 3 and b < 4`.
  
  So we cannot just treat numbers as starting with a digit and being
  followed by some set of characters.  We need more structure than that.
@@ -127,13 +131,16 @@ followed by some set of characters.  We need more structure than that.
  So:
  
  - Numbers must start with a digit.
-- If the first digit is zero, the next character must be a base
-  signifier (one of `xob`) or a decimal marker (`.` or `,`).
-  In the first case the first `p` or `P` may be followed by a sign.
+- If the first digit is zero, the next character should be a base
+  signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
+  enforced at this stage).
+  In the first case only the first `p` or `P` may be followed by a sign.
  - If the number doesn't start with `0` followed by one of `xob`, the
    first `e` may be followed by a sign.
-- Any digit or hex digit may be followed by a space or underscore
-  providing that the subsequence character is also a (hex) digit.
+- A sign must always be followed by a digit.
+- Any digit may be followed by a space or underscore and any hex digit
+  may be followed by an underscore, providing that the subsequent character
+  is also a digit (for space) or hex digit (for underscore).
    This rule will require an extra level of 'unget' to be
    supported when handling characters.
  - Otherwise any digits or ASCII letters are allowed.  We do not at
@@ -163,7 +170,7 @@ are declared to be a start character for words.
  ###### parse number
  
         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
-               int prev_special = 0;
+               int prev = 0;
                 int expect_p = 0;
                 int decimal_mark = 0;
                 if (ch == '0') {
@@ -176,43 +183,62 @@ are declared to be a start character for words.
                         int sign_ok = 0;
                         switch(expect_p) {
                         case 0:
-                               if (ch == 'e' || ch == 'E')
+                               if (ch == 'e' || ch == 'E') {
                                         sign_ok = 1;
+                                       decimal_mark = 1;
+                               }
                                 break;
                         case 1:
-                               if (ch == 'p' || ch == 'P')
+                               if (ch == 'p' || ch == 'P') {
                                         sign_ok = 1;
+                                       decimal_mark = 1;
+                               }
                                 break;
                         }
                         save_unget_state(state);
+                       prev = ch;
                         ch = get_char(state);
-                       if (iswalnum(ch)) {
-                               prev_special = 0;
+
+                       if (!iswalnum(prev)) {
+                               /* special characters, like separators and decimal marks
+                                * and signs, must be followed by a hexdigit, and the
+                                * space and signs must be followed by a decimal digit.
+                                */
+                               if (!iswxdigit(ch) ||
+                                  ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
+                                  (prev == ' ' && !iswdigit(ch))) {
+                                       /* don't want the new char or the special */
+                                       restore_unget_state(state);
+                                       break;
+                               }
+                       }
+                       if (iswalnum(ch))
                                 continue;
+
+                       if (!strchr(state->conf->number_chars, ch)) {
+                               /* non-number char */
+                               break;
                         }
                         if (ch == '+' || ch == '-') {
+                               /* previous must be 'e' or 'p' in appropriate context */
                                 if (!sign_ok)
                                         break;
                                 expect_p = -1;
+                       } else if (ch == ' ') {
+                               /* previous must be a digit */
+                               if (!iswdigit(prev))
+                                       break;
+                       } else {
+                               /* previous must be a hex digit */
+                               if (!iswxdigit(prev))
+                                       break;
                         }
                         if (ch == '.' || ch == ',') {
+                               /* only one of these permitted */
                                 if (decimal_mark)
                                         break;
                                 decimal_mark = 1;
                         }
-                       if (prev_special) {
-                               /* Don't allow that special char,
-                                * need two 'ungets'
-                                */
-                               restore_unget_state(state);
-                               break;
-                       }
-                       if (strchr(state->conf->number_chars, ch)) {
-                               prev_special = 1;
-                               continue;
-                       }
-                       /* non-number char */
-                       break;
                 }
                 /* We seem to have a "number" token */
                 unget_char(state);
@@ -690,6 +716,8 @@ ignored.
         int     indent_level;
         int     indent_sizes[20];
  
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
  #### Newlines
  
  Newlines can optionally be reported.  Newlines within a block comment
@@ -724,7 +752,7 @@ blank lines that have been skipped.
  
  When a Newline leads to the next block of code there is a question of
  whether the various Newline and OUT/IN tokens should appear to
-pbelong to the earlier or later block.  This is addressed by processing
+belong to the earlier or later block.  This is addressed by processing
  the tokens in two stages based on the relative indent levels of the
  two blocks (each block has a base indent to which the actual indents
  are added).
@@ -761,19 +789,29 @@ For this we store one more than the number of blank lines as
         int delayed_lines;
         int out_next;
  
-Generating these tokens involve two separate pieces of code.
+Generating these tokens involves two separate pieces of code.
  
  Firstly we need to recognise white space and count the indents and
  newlines.  These are recorded in the above state fields.
  
-Separately we need, on each call to `token_next`, we need to check if
+Separately we need, on each call to `token_next`, to check if
  there are some delayed tokens and if so we need to advance the state
  information and return one token.
  
+###### internal functions
+       static int state_indent(struct token_state *state)
+       {
+               if (state->node == NULL)
+                       return state->col;
+               return state->node->indent - state->node->needs_strip + state->col;
+       }
+
  ###### white space
+       if (is_newline(ch))
+               state_check_node(state);
         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                 int newlines = 0;
-               int was_son = at_son(state);
+               int was_nl = is_newline(ch);
                 if (ignored & (1<<TK_in)) {
                         if (!is_newline(ch))
                                 continue;
@@ -784,22 +822,17 @@ information and return one token.
                         return tk;
                 }
                 // Indents are needed, so check all white space.
-               while (ch <= ' ' && !at_eon(state)) {
+               while (ch <= ' ' && ch != WEOF) {
                         if (is_newline(ch))
                                 newlines += 1;
                         ch = get_char(state);
+                       if (is_newline(ch))
+                               state_check_node(state);
                 }
-               if (at_eon(state)) {
-                       newlines += 1;
-                       if (state->node->next &&
-                           state->node->next->indent > state->node->indent)
-                               state->col = state->node->next->indent;
-                       else
-                               state->col = state->node->indent;
-               } else
+               if (ch != WEOF)
                         unget_char(state);
                 state->delayed_lines = newlines;
-               state->out_next = was_son;
+               state->out_next = !was_nl;
                 state->check_indent = 1;
                 continue;
         }
@@ -807,7 +840,7 @@ information and return one token.
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
-               if (state->col < state->indent_sizes[state->indent_level]) {
+               if (state_indent(state) < state->indent_sizes[state->indent_level]) {
                         if (!state->out_next &&
                             !(ignored & (1<<TK_newline))) {
                                 state->out_next = 1;
@@ -819,11 +852,12 @@ information and return one token.
                         tk.num = TK_out;
                         return tk;
                 }
-               if (state->col > state->indent_sizes[state->indent_level] &&
+               if (state_indent(state) > state->indent_sizes[state->indent_level] &&
                     state->indent_level < sizeof(state->indent_sizes)-1) {
                         state->indent_level += 1;
-                       state->indent_sizes[state->indent_level] = state->col;
-                       state->delayed_lines -= 1;
+                       state->indent_sizes[state->indent_level] = state_indent(state);
+                       if (state->delayed_lines)
+                               state->delayed_lines -= 1;
                         tk.num = TK_in;
                         return tk;
                 }
@@ -848,11 +882,6 @@ tokens will continue to return the same end-of-file token.
  
  ###### white space
         if (ch == WEOF) {
-               if (state->col) {
-                       state->col = 0;
-                       state->check_indent = 1;
-                       continue;
-               }
                 tk.num = TK_eof;
                 return tk;
         }
@@ -908,10 +937,11 @@ a flag that tells us whether or not we need to strip.
         int    offset;
         int    line;
         int    col;
+       int    strip_offset;
  
  ###### internal functions
  
-       static int do_strip(struct token_state *state)
+       static void do_strip(struct token_state *state)
         {
                 int indent = 0;
                 if (state->node->needs_strip) {
@@ -927,7 +957,28 @@ a flag that tells us whether or not we need to strip.
                                 n -= 4;
                         }
                 }
-               return indent;
+       }
+
+       static void state_check_node(struct token_state *state)
+       {
+               if (!state->node)
+                       return;
+               if (state->node->code.len > state->offset)
+                       return;
+
+               do
+                       state->node = state->node->next;
+               while (state->node && state->node->code.txt == NULL);
+               state->offset = 0;
+               state->prev_offset = 0;
+               state->strip_offset = 0;
+               state->col = 0;
+               if (state->node == NULL)
+                       return;
+               state->line = state->node->line_no;
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
         }
  
         static wint_t get_char(struct token_state *state)
@@ -936,18 +987,9 @@ a flag that tells us whether or not we need to strip.
                 size_t n;
                 mbstate_t mbstate;
  
+               state_check_node(state);
                 if (state->node == NULL)
                         return WEOF;
-               if (state->node->code.len <= state->offset) {
-                       do
-                               state->node = state->node->next;
-                       while (state->node && state->node->code.txt == NULL);
-                       state->offset = 0;
-                       if (state->node == NULL)
-                               return WEOF;
-                       state->line = state->node->line_no;
-                       state->col = do_strip(state);
-               }
  
                 ## before get_char
  
@@ -958,12 +1000,12 @@ a flag that tells us whether or not we need to strip.
                             &mbstate);
                 if (n == -2 || n == 0) {
                         /* Not enough bytes - not really possible */
-                       next = '\n';
-                       state->offset = state->node->code.len;
+                       next = '\n';                            // NOTEST
+                       state->offset = state->node->code.len;  // NOTEST
                 } else if (n == -1) {
                         /* error */
-                       state->offset += 1;
-                       next = 0x7f; // an illegal character
+                       state->offset += 1;                     // NOTEST
+                       next = 0x7f; // an illegal character    // NOTEST
                 } else
                         state->offset += n;
  
@@ -971,7 +1013,8 @@ a flag that tells us whether or not we need to strip.
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
-                       state->col = do_strip(state);
+                       do_strip(state);
+                       state->col = state->node->needs_strip;
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
@@ -1090,7 +1133,7 @@ parsed too much already.  For that there is `reset_token`.
                 tok->txt.len = 0;
         }
  
-Tokens make not cross into the next `code_node`, and some tokens can
+Tokens may not cross into the next `code_node`, and some tokens can
  include the newline at the and of a `code_node`, we must be able to
  easily check if we have reached the end.  Equally we need to know if
  we are at the start of a node, as white space is treated a little
@@ -1100,7 +1143,7 @@ differently there.
  
         static int at_son(struct token_state *state)
         {
-               return state->offset == 0;
+               return state->prev_offset <= state->strip_offset;
         }
  
         static int at_eon(struct token_state *state)
@@ -1183,7 +1226,9 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
-               state->col = do_strip(state);
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
                 state->conf = conf;
                 return state;
         }
@@ -1310,7 +1355,7 @@ tokens.  Now we just need C files to store them, and a mk file to make them.
  
  Converting a `TK_number` token to a numerical value is a slightly
  higher level task than lexical analysis, and slightly lower than
-grammar parsing, so put it here - as an index if you like.
+grammar parsing, so put it here - as an appendix if you like.
  
  Importantly it will be used by the same testing rig that is used for
  testing the token scanner.
@@ -1335,10 +1380,10 @@ had never been initialised.
                                 int *placesp)
         {
                 /* Accept digits up to 'base', ignore '_' and
-                * ' ' if they appear between two legal digits,
-                * and if `placesp` is not NULL, allow a single
-                * '.' or ',' and report the number of digits
-                * beyond there.
+                * (for base 10) ' ' if they appear between two
+                * legal digits, and if `placesp` is not NULL,
+                * allow a single '.' or ',' and report the number
+                * of digits beyond there.
                  * Return number of characters processed (p),
                  * or 0 if something illegal was found.
                  */
@@ -1351,7 +1396,7 @@ had never been initialised.
                         int dig;
                         char c = tok.txt[p];
  
-                       if (c == '_' || c == ' ') {
+                       if (c == '_' || (c == ' ' && base == 10)) {
                                 if (prev != Digit)
                                         goto bad;
                                 prev = Space;
@@ -1471,7 +1516,7 @@ we need to record the number of places.  We won't impose the number of
  places until we have the exponent as well.
  
  ###### number vars
-       int places =0;
+       int places = 0;
         mpz_t mant;
         int d;
  
@@ -1993,12 +2038,15 @@ the tokens one per line.
                         { "ignore-block-comment", 0, NULL, 'C'},
                         { "ignore-indent",      0, NULL, 'i'},
                         { "file",               1, NULL, 'f'},
+                       { "section",            1, NULL, 's'},
                         { NULL,                 0, NULL, 0},
                 };
-               static const char options[] = "W:w:n:NIMSzclCif:";
+               static const char options[] = "W:w:n:NIMSzclCif:s:";
  
                 struct section *table, *s, *prev;
                 int opt;
+               char *section_name = NULL;
+               int section_found = 0;
  
                 setlocale(LC_ALL,"");
                 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
@@ -2017,6 +2065,7 @@ the tokens one per line.
                         case 'l': conf.ignored |= 1 << TK_newline; break;
                         case 'i': conf.ignored |= 1 << TK_in; break;
                         case 'f': filename = optarg; break;
+                       case 's': section_name = optarg; break;
                         default: fprintf(stderr, "scanner: unknown option '%c'.\n",
                                          opt);
                                 exit(1);
@@ -2053,6 +2102,12 @@ the tokens one per line.
  
                 for (s = table; s;
                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
+                       if (section_name &&
+                           (s->section.len != strlen(section_name) ||
+                            strncmp(s->section.txt, section_name, s->section.len) != 0))
+                               continue;
+                       if (section_name)
+                               section_found = 1;
                         printf("Tokenizing: %.*s\n", s->section.len,
                                 s->section.txt);
                         state = token_open(s->code, &conf);
@@ -2095,6 +2150,10 @@ the tokens one per line.
                 }
                 if (conf.words_marks != known)
                         free(conf.words_marks);
+               if (section_name && !section_found) {
+                       fprintf(stderr, "scanner: section %s not found\n", section_name);
+                       errs = 1;
+               }
                 exit(!!errs);
         }
  ###### File: scanner.mk