Remove excess blank lines

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index 0bd30b0fe5607686cf80a04a21e6a3bd0f6a7e9c..42001ff895b0196a7ab83e30afb5c50b5eae1cf0 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -85,12 +85,11 @@ from token to token.
  ###### token_next init
         int ignored = state->conf->ignored;
  
-
  The different tokens are numbers, words, marks, strings, comments,
  newlines, EOF, and indents, each of which is examined in detail below.
  
  There are various cases where no token can be found in part of the
-input.  All of these will be reported as an `TK_error` token.
+input.  All of these will be reported as a `TK_error` token.
  
  It is possible to declare a number of strings which form distinct
  tokens (rather than being grouped as e.g. 'word').  These are given
@@ -106,7 +105,7 @@ token numbers from `TK_reserved` upwards.
  ### Numbers
  
  Numbers are the messiest tokens to parse, primarily because they can
-contain characters that also have meaning outside of number and,
+contain characters that also have meaning outside of numbers and,
  particularly, immediately after numbers.
  
  The obvious example is the '`-`' sign.  It can come inside a number for
@@ -177,11 +176,11 @@ are declared to be a start character for words.
                         int sign_ok = 0;
                         switch(expect_p) {
                         case 0:
-                               if (ch == 'e')
+                               if (ch == 'e' || ch == 'E')
                                         sign_ok = 1;
                                 break;
                         case 1:
-                               if (ch == 'p')
+                               if (ch == 'p' || ch == 'P')
                                         sign_ok = 1;
                                 break;
                         }
@@ -260,6 +259,9 @@ and the length of the list must be given (`known_count`).
  Tokens matching these known words are reported as the index of the
  list added to `TK_reserved`.
  
+If identifiers are ignored, then any word which is not listed as a
+known word results in an error.
+
  ###### token config parameters
         const char **words_marks;
         int known_count;
@@ -321,10 +323,17 @@ below before giving up and assuming an unknown mark.
  
  If an unknown mark contains a quote character or a comment marker, and
  that token is not being ignored, then we terminate the unknown mark
-before that quote or comment.  This ensure that an unknown mark
+before that quote or comment.  This ensures that an unknown mark
  immediately before a string is handled correctly.
  
-If `TK_mark` is ignored, then unknown marks as returned as an error.
+If the first character of a comment marker (i.e. '/') is a known mark,
+the above rules would suggest that the start of a comment would be
+parsed as that mark, which is not what is wanted.  So the introductory
+sequences for a comment ("//" and "/*") are treated as
+partially-known.  They prevent the leading "/" from being a mark by
+itself, but do not actually constitute a stand-alone mark.
+
+If `TK_mark` is ignored, then unknown marks are returned as errors.
  
  ###### token types
         TK_mark,
@@ -341,41 +350,57 @@ Known marks are included in the same list as the list of known words.
                 if (n >= 0)
                         tk.num = TK_reserved + n;
                 else if (tk.num != TK_error) {
-                       /* found a longest-known-mark */
+                       /* found a longest-known-mark, still need to
+                        * check for comments
+                        */
+                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
+                           (ch == '/' || ch == '*')) {
+                               /* Yes, this is a comment, not a '/' */
+                               restore_unget_state(state);
+                               tk.num = TK_error;
+                               break;
+                       }
                         unget_char(state);
                         close_token(state, &tk);
                         return tk;
                 }
                 prev = ch;
-               if (prev == '/')
-                       save_unget_state(state);
+               save_unget_state(state);
                 ch = get_char(state);
-               if (!(ignored && (1<<TK_string)) && is_quote(ch))
+               if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
+                       /* If strings are allowed, a quote (which isn't a known mark)
+                        * mustn't be treated as part of an unknown mark.  It can be
+                        * part of a multi-line string though.
+                        */
+                       break;
+               if (prev == '#' && n < 0)
+                       /* '#' is not a known mark, so assume it is a comment */
                         break;
-               if (!(ignored && (1<<TK_line_comment)) &&
-                   prev == '/' && ch == '/') {
+               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
+                       close_token(state, &tk);
                         restore_unget_state(state);
                         break;
                 }
-               if (!(ignored && (1<<TK_block_comment)) &&
-                   prev == '/' && ch == '*') {
+               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
+                       close_token(state, &tk);
                         restore_unget_state(state);
                         break;
                 }
         }
         unget_char(state);
-       if (tk.num != TK_error)
-               return tk;
-
-###### unknown mark
-       if (tk.txt.len) {
-               if (ignored & (1<<TK_mark))
-                       tk.num = TK_error;
-               else
-                       tk.num = TK_mark;
+       if (tk.num != TK_error) {
+               close_token(state, &tk);
                 return tk;
         }
  
+If we don't find a known mark, we will check for strings and comments
+before assuming that we have an unknown mark.
+
+###### parse mark
+       ## parse string
+       ## parse comment
+       ## unknown mark
+
  ### Strings
  
  Strings start with one of single quote, double quote, or back quote
@@ -383,7 +408,7 @@ and continue until a matching character on the same line.  Any of
  these characters can be included in the list of known marks and then
  they will not be used for identifying strings.
  
-Immediately following the close quote one or two ASCII letters may
+Immediately following the close quote, one or two ASCII letters may
  appear.  These are somewhat like the arbitrary letters allowed in
  "Numbers" above.  They can be used by the language in various ways.
  
@@ -405,7 +430,7 @@ token types.
  ###### internal functions
         static int is_quote(wchar_t ch)
         {
-               return ch == '\'' || ch == '"' || ch == '`';
+               return ch == '\'' || ch == '"' || ch == '`'; // "
         }
  
  #### Multi-line strings
@@ -441,7 +466,8 @@ followed by the start of a new string.
                          * unget so the newline is seen,
                          * but return rest of string as an error.
                          */
-                       unget_char(state);
+                       if (is_newline(ch))
+                               unget_char(state);
                         close_token(state, &tk);
                         tk.num = TK_error;
                         return tk;
@@ -475,15 +501,23 @@ If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
             !(ignored & (1<<TK_string))) {
                 wchar_t first = tk.txt.txt[0];
                 reset_token(state, &tk);
-               get_char(state);
-               do
+               ch = get_char(state);
+               tk.num = TK_error;
+               while (!at_eon(state) && !is_newline(ch)) {
                         ch = get_char(state);
-               while (ch != first && !is_newline(ch));
-               tk.num = TK_string;
-               if (is_newline(ch)) {
-                       unget_char(state);
-                       tk.num = TK_error;
+                       if (ch == first) {
+                               tk.num = TK_string;
+                               break;
+                       }
+                       if (is_newline(ch)) {
+                               unget_char(state);
+                               break;
+                       }
                 }
+               while (!at_eon(state) && (ch = get_char(state)) &&
+                                         iswalpha(ch))
+                       ;
+               unget_char(state);
                 close_token(state, &tk);
                 return tk;
         }
@@ -506,7 +540,7 @@ it would not suffer from this rule.
  
  These two comment types are reported as two separate token types, and
  consequently can be ignored separately.  When ignored a comment is
-parsed and discarded.
+still parsed, but is discarded.
  
  ###### token types
         TK_line_comment,
@@ -528,14 +562,16 @@ parsed and discarded.
  
  #### Single line comments
  
-A single-line comment continues up to, but not including the newline.
+A single-line comment continues up to, but not including, the newline
+or end of node.
  
  ###### parse comment
  
         if (is_line_comment(tk.txt)) {
-               while (!is_newline(ch))
+               while (!is_newline(ch) && !at_eon(state))
                         ch = get_char(state);
-               unget_char(state);
+               if (is_newline(ch))
+                       unget_char(state);
                 close_token(state, &tk);
                 tk.num = TK_line_comment;
                 if (ignored & (1 << TK_line_comment))
@@ -768,7 +804,6 @@ information and return one token.
                 continue;
         }
  
-
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
@@ -811,9 +846,13 @@ tokens will continue to return the same end-of-file token.
  ###### token types
         TK_eof,
  
-
  ###### white space
         if (ch == WEOF) {
+               if (state->col) {
+                       state->col = 0;
+                       state->check_indent = 1;
+                       continue;
+               }
                 tk.num = TK_eof;
                 return tk;
         }
@@ -825,7 +864,21 @@ If the token we have is not empty and `TK_mark` is allowed,
  we have an unknown mark, otherwise this must be an error.
  
  ###### unknown mark
-       /* one unknown character */
+
+       /* one unknown mark character */
+       if (tk.txt.len) {
+               close_token(state, &tk);
+               if (ignored & (1<<TK_mark))
+                       tk.num = TK_error;
+               else
+                       tk.num = TK_mark;
+               return tk;
+       }
+       /* Completely unrecognised character is next, possibly
+        * a digit and we are ignoring numbers.
+        * Whatever it is, make it an error.
+        */
+       get_char(state);
         close_token(state, &tk);
         tk.num = TK_error;
         return tk;
@@ -858,19 +911,23 @@ a flag that tells us whether or not we need to strip.
  
  ###### internal functions
  
-       static void do_strip(struct token_state *state)
+       static int do_strip(struct token_state *state)
         {
+               int indent = 0;
                 if (state->node->needs_strip) {
                         int n = 4;
                         while (n && state->node->code.txt[state->offset] == ' ') {
+                               indent += 1;
                                 state->offset += 1;
                                 n -= 1;
                         }
                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+                               indent = indent_tab(indent);
                                 state->offset += 1;
                                 n -= 4;
                         }
                 }
+               return indent;
         }
  
         static wint_t get_char(struct token_state *state)
@@ -888,9 +945,8 @@ a flag that tells us whether or not we need to strip.
                         state->offset = 0;
                         if (state->node == NULL)
                                 return WEOF;
-                       do_strip(state);
                         state->line = state->node->line_no;
-                       state->col = state->node->indent;
+                       state->col = do_strip(state);
                 }
  
                 ## before get_char
@@ -915,8 +971,7 @@ a flag that tells us whether or not we need to strip.
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
-                       state->col = state->node->indent;
-                       do_strip(state);
+                       state->col = do_strip(state);
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
@@ -1019,8 +1074,11 @@ parsed too much already.  For that there is `reset_token`.
         static void close_token(struct token_state *state,
                                 struct token *tk)
         {
-               tk->txt.len = (state->node->code.txt + state->offset)
-                             - tk->txt.txt;
+               if (state->node != tk->node)
+                       tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
+               else
+                       tk->txt.len = (state->node->code.txt + state->offset)
+                                     - tk->txt.txt;
         }
  
         static void reset_token(struct token_state *state, struct token *tok)
@@ -1032,7 +1090,6 @@ parsed too much already.  For that there is `reset_token`.
                 tok->txt.len = 0;
         }
  
-
  Tokens make not cross into the next `code_node`, and some tokens can
  include the newline at the and of a `code_node`, we must be able to
  easily check if we have reached the end.  Equally we need to know if
@@ -1109,9 +1166,6 @@ loop.
         ## parse number
         ## parse word
         ## parse mark
-       ## parse string
-       ## parse comment
-       ## unknown mark
  
  ### Start and stop
  
@@ -1129,9 +1183,8 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
-               state->col = code->indent;
+               state->col = do_strip(state);
                 state->conf = conf;
-               do_strip(state);
                 return state;
         }
         void token_close(struct token_state *state)
@@ -1468,7 +1521,6 @@ character `expc`.
         tok.txt += d;
         tok.len -= d;
  
-
  Now that we have the mantissa and the exponent we can multiply them
  together, also allowing for the number of digits after the decimal
  mark.
@@ -1532,7 +1584,6 @@ Now we are ready to parse a number: the base, mantissa, and exponent.
  If all goes well we check for the possible trailing letters and
  return.  Return value is 1 for success and 0 for failure.
  
-
  ###### number functions
         int number_parse(mpq_t num, char tail[3], struct text tok)
         {
@@ -1741,7 +1792,7 @@ required indent is found.
                 if (c == ' ')
                         skipped += 1;
                 else if (c == '\t')
-                       skipped = indent_tab(c);
+                       skipped = indent_tab(skipped);
                 else
                         break;
                 i+= 1;
@@ -1867,7 +1918,6 @@ String parsing goes in `libstring.c`
         libstring.o : libstring.c
                 $(CC) $(CFLAGS) -c libstring.c
  
-
  ## Testing
  
  As "untested code is buggy code" we need a program to easily test
@@ -1885,6 +1935,7 @@ the tokens one per line.
         #include <stdio.h>
         #include <gmp.h>
         #include <locale.h>
+       #include <getopt.h>
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
@@ -1897,11 +1948,19 @@ the tokens one per line.
                 fprintf(stderr, "%s\n", msg);
         }
  
+       static int kcmp(const void *ap, const void *bp)
+       {
+               char * const *a = ap;
+               char * const *b = bp;
+               return strcmp(*a, *b);
+       }
+
         int main(int argc, char *argv[])
         {
                 int fd;
                 int len;
                 char *file;
+               char *filename = NULL;
                 struct token_state *state;
                 const char *known[] = {
                         "==",
@@ -1918,22 +1977,77 @@ the tokens one per line.
                         .words_marks = known,
                         .number_chars = "., _+-",
                         .known_count = sizeof(known)/sizeof(known[0]),
-                       .ignored = (0 << TK_line_comment)
-                                 |(0 << TK_block_comment),
+                       .ignored = 0,
                 };
+               static const struct option long_options[] = {
+                       { "word-start",         1, NULL, 'W'},
+                       { "word-cont",          1, NULL, 'w'},
+                       { "number-chars",       1, NULL, 'n'},
+                       { "ignore-numbers",     0, NULL, 'N'},
+                       { "ignore-ident",       0, NULL, 'I'},
+                       { "ignore-marks",       0, NULL, 'M'},
+                       { "ignore-strings",     0, NULL, 'S'},
+                       { "ignore-multi-strings",0, NULL, 'z'},
+                       { "ignore-line-comment",0, NULL, 'c'},
+                       { "ignore-newline",     0, NULL, 'l'},
+                       { "ignore-block-comment", 0, NULL, 'C'},
+                       { "ignore-indent",      0, NULL, 'i'},
+                       { "file",               1, NULL, 'f'},
+                       { NULL,                 0, NULL, 0},
+               };
+               static const char options[] = "W:w:n:NIMSzclCif:";
+
                 struct section *table, *s, *prev;
+               int opt;
+
                 setlocale(LC_ALL,"");
-               if (argc != 2) {
-                       fprintf(stderr, "Usage: scanner file\n");
-                       exit(2);
+               while ((opt = getopt_long(argc, argv, options, long_options, NULL))
+                      != -1) {
+                       switch(opt) {
+                       case 'W': conf.word_start = optarg; break;
+                       case 'w': conf.word_cont = optarg; break;
+                       case 'n': conf.number_chars = optarg; break;
+                       case 'N': conf.ignored |= 1 << TK_number; break;
+                       case 'I': conf.ignored |= 1 << TK_ident; break;
+                       case 'M': conf.ignored |= 1 << TK_mark; break;
+                       case 'S': conf.ignored |= 1 << TK_string; break;
+                       case 'z': conf.ignored |= 1 << TK_multi_string; break;
+                       case 'c': conf.ignored |= 1 << TK_line_comment; break;
+                       case 'C': conf.ignored |= 1 << TK_block_comment; break;
+                       case 'l': conf.ignored |= 1 << TK_newline; break;
+                       case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'f': filename = optarg; break;
+                       default: fprintf(stderr, "scanner: unknown option '%c'.\n",
+                                        opt);
+                               exit(1);
+                       }
                 }
-               fd = open(argv[1], O_RDONLY);
+
+               if (optind < argc) {
+                       const char **wm = calloc(argc - optind, sizeof(char*));
+                       int i;
+                       for (i = optind; i < argc; i++)
+                               wm[i - optind] = argv[i];
+                       qsort(wm, argc-optind, sizeof(char*), kcmp);
+                       conf.words_marks = wm;
+                       conf.known_count = argc - optind;
+               }
+
+               if (filename)
+                       fd = open(filename, O_RDONLY);
+               else
+                       fd = 0;
                 if (fd < 0) {
                         fprintf(stderr, "scanner: cannot open %s: %s\n",
-                               argv[1], strerror(errno));
+                               filename, strerror(errno));
                         exit(1);
                 }
                 len = lseek(fd, 0, 2);
+               if (len <= 0) {
+                       fprintf(stderr,"scanner: %s is empty or not seekable\n",
+                               filename ?: "stdin");
+                       exit(1);
+               }
                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                 table = code_extract(file, file+len, pr_err);
  
@@ -1977,7 +2091,10 @@ the tokens one per line.
                                 if (tk.num == TK_eof)
                                         break;
                         }
+                       token_close(state);
                 }
+               if (conf.words_marks != known)
+                       free(conf.words_marks);
                 exit(!!errs);
         }
  ###### File: scanner.mk
@@ -1989,4 +2106,3 @@ the tokens one per line.
                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
         scanner.o : scanner.c
                 $(CC) $(CFLAGS) -c scanner.c
-