Remove excess blank lines

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index a5eeb1f128b5163b1715b263a033a3a64265c73a..42001ff895b0196a7ab83e30afb5c50b5eae1cf0 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -85,7 +85,6 @@ from token to token.
  ###### token_next init
         int ignored = state->conf->ignored;
  
-
  The different tokens are numbers, words, marks, strings, comments,
  newlines, EOF, and indents, each of which is examined in detail below.
  
@@ -368,7 +367,11 @@ Known marks are included in the same list as the list of known words.
                 prev = ch;
                 save_unget_state(state);
                 ch = get_char(state);
-               if (!(ignored && (1<<TK_string)) && is_quote(ch))
+               if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
+                       /* If strings are allowed, a quote (which isn't a known mark)
+                        * mustn't be treated as part of an unknown mark.  It can be
+                        * part of a multi-line string though.
+                        */
                         break;
                 if (prev == '#' && n < 0)
                         /* '#' is not a known mark, so assume it is a comment */
@@ -398,15 +401,6 @@ before assuming that we have an unknown mark
         ## parse comment
         ## unknown mark
  
-###### unknown mark
-       if (tk.txt.len) {
-               if (ignored & (1<<TK_mark))
-                       tk.num = TK_error;
-               else
-                       tk.num = TK_mark;
-               return tk;
-       }
-
  ### Strings
  
  Strings start with one of single quote, double quote, or back quote
@@ -436,7 +430,7 @@ token types.
  ###### internal functions
         static int is_quote(wchar_t ch)
         {
-               return ch == '\'' || ch == '"' || ch == '`';
+               return ch == '\'' || ch == '"' || ch == '`'; // "
         }
  
  #### Multi-line strings
@@ -520,6 +514,10 @@ If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
                                 break;
                         }
                 }
+               while (!at_eon(state) && (ch = get_char(state)) &&
+                                         iswalpha(ch))
+                       ;
+               unget_char(state);
                 close_token(state, &tk);
                 return tk;
         }
@@ -806,7 +804,6 @@ information and return one token.
                 continue;
         }
  
-
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
@@ -849,7 +846,6 @@ tokens will continue to return the same end-of-file token.
  ###### token types
         TK_eof,
  
-
  ###### white space
         if (ch == WEOF) {
                 if (state->col) {
@@ -868,7 +864,21 @@ If the token we have is not empty and `TK_mark` is allowed,
  we have an unknown mark, otherwise this must be an error.
  
  ###### unknown mark
-       /* one unknown character */
+
+       /* one unknown mark character */
+       if (tk.txt.len) {
+               close_token(state, &tk);
+               if (ignored & (1<<TK_mark))
+                       tk.num = TK_error;
+               else
+                       tk.num = TK_mark;
+               return tk;
+       }
+       /* Completely unrecognised character is next, possibly
+        * a digit and we are ignoring numbers.
+        * Whatever it is, make it an error.
+        */
+       get_char(state);
         close_token(state, &tk);
         tk.num = TK_error;
         return tk;
@@ -1064,8 +1074,11 @@ parsed too much already.  For that there is `reset_token`.
         static void close_token(struct token_state *state,
                                 struct token *tk)
         {
-               tk->txt.len = (state->node->code.txt + state->offset)
-                             - tk->txt.txt;
+               if (state->node != tk->node)
+                       tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
+               else
+                       tk->txt.len = (state->node->code.txt + state->offset)
+                                     - tk->txt.txt;
         }
  
         static void reset_token(struct token_state *state, struct token *tok)
@@ -1077,7 +1090,6 @@ parsed too much already.  For that there is `reset_token`.
                 tok->txt.len = 0;
         }
  
-
  Tokens make not cross into the next `code_node`, and some tokens can
  include the newline at the and of a `code_node`, we must be able to
  easily check if we have reached the end.  Equally we need to know if
@@ -1509,7 +1521,6 @@ character `expc`.
         tok.txt += d;
         tok.len -= d;
  
-
  Now that we have the mantissa and the exponent we can multiply them
  together, also allowing for the number of digits after the decimal
  mark.
@@ -1573,7 +1584,6 @@ Now we are ready to parse a number: the base, mantissa, and exponent.
  If all goes well we check for the possible trailing letters and
  return.  Return value is 1 for success and 0 for failure.
  
-
  ###### number functions
         int number_parse(mpq_t num, char tail[3], struct text tok)
         {
@@ -1908,7 +1918,6 @@ String parsing goes in `libstring.c`
         libstring.o : libstring.c
                 $(CC) $(CFLAGS) -c libstring.c
  
-
  ## Testing
  
  As "untested code is buggy code" we need a program to easily test
@@ -1926,6 +1935,7 @@ the tokens one per line.
         #include <stdio.h>
         #include <gmp.h>
         #include <locale.h>
+       #include <getopt.h>
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
@@ -1938,11 +1948,19 @@ the tokens one per line.
                 fprintf(stderr, "%s\n", msg);
         }
  
+       static int kcmp(const void *ap, const void *bp)
+       {
+               char * const *a = ap;
+               char * const *b = bp;
+               return strcmp(*a, *b);
+       }
+
         int main(int argc, char *argv[])
         {
                 int fd;
                 int len;
                 char *file;
+               char *filename = NULL;
                 struct token_state *state;
                 const char *known[] = {
                         "==",
@@ -1959,22 +1977,77 @@ the tokens one per line.
                         .words_marks = known,
                         .number_chars = "., _+-",
                         .known_count = sizeof(known)/sizeof(known[0]),
-                       .ignored = (0 << TK_line_comment)
-                                 |(0 << TK_block_comment),
+                       .ignored = 0,
                 };
+               static const struct option long_options[] = {
+                       { "word-start",         1, NULL, 'W'},
+                       { "word-cont",          1, NULL, 'w'},
+                       { "number-chars",       1, NULL, 'n'},
+                       { "ignore-numbers",     0, NULL, 'N'},
+                       { "ignore-ident",       0, NULL, 'I'},
+                       { "ignore-marks",       0, NULL, 'M'},
+                       { "ignore-strings",     0, NULL, 'S'},
+                       { "ignore-multi-strings",0, NULL, 'z'},
+                       { "ignore-line-comment",0, NULL, 'c'},
+                       { "ignore-newline",     0, NULL, 'l'},
+                       { "ignore-block-comment", 0, NULL, 'C'},
+                       { "ignore-indent",      0, NULL, 'i'},
+                       { "file",               1, NULL, 'f'},
+                       { NULL,                 0, NULL, 0},
+               };
+               static const char options[] = "W:w:n:NIMSzclCif:";
+
                 struct section *table, *s, *prev;
+               int opt;
+
                 setlocale(LC_ALL,"");
-               if (argc != 2) {
-                       fprintf(stderr, "Usage: scanner file\n");
-                       exit(2);
+               while ((opt = getopt_long(argc, argv, options, long_options, NULL))
+                      != -1) {
+                       switch(opt) {
+                       case 'W': conf.word_start = optarg; break;
+                       case 'w': conf.word_cont = optarg; break;
+                       case 'n': conf.number_chars = optarg; break;
+                       case 'N': conf.ignored |= 1 << TK_number; break;
+                       case 'I': conf.ignored |= 1 << TK_ident; break;
+                       case 'M': conf.ignored |= 1 << TK_mark; break;
+                       case 'S': conf.ignored |= 1 << TK_string; break;
+                       case 'z': conf.ignored |= 1 << TK_multi_string; break;
+                       case 'c': conf.ignored |= 1 << TK_line_comment; break;
+                       case 'C': conf.ignored |= 1 << TK_block_comment; break;
+                       case 'l': conf.ignored |= 1 << TK_newline; break;
+                       case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'f': filename = optarg; break;
+                       default: fprintf(stderr, "scanner: unknown option '%c'.\n",
+                                        opt);
+                               exit(1);
+                       }
+               }
+
+               if (optind < argc) {
+                       const char **wm = calloc(argc - optind, sizeof(char*));
+                       int i;
+                       for (i = optind; i < argc; i++)
+                               wm[i - optind] = argv[i];
+                       qsort(wm, argc-optind, sizeof(char*), kcmp);
+                       conf.words_marks = wm;
+                       conf.known_count = argc - optind;
                 }
-               fd = open(argv[1], O_RDONLY);
+
+               if (filename)
+                       fd = open(filename, O_RDONLY);
+               else
+                       fd = 0;
                 if (fd < 0) {
                         fprintf(stderr, "scanner: cannot open %s: %s\n",
-                               argv[1], strerror(errno));
+                               filename, strerror(errno));
                         exit(1);
                 }
                 len = lseek(fd, 0, 2);
+               if (len <= 0) {
+                       fprintf(stderr,"scanner: %s is empty or not seekable\n",
+                               filename ?: "stdin");
+                       exit(1);
+               }
                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                 table = code_extract(file, file+len, pr_err);
  
@@ -2018,7 +2091,10 @@ the tokens one per line.
                                 if (tk.num == TK_eof)
                                         break;
                         }
+                       token_close(state);
                 }
+               if (conf.words_marks != known)
+                       free(conf.words_marks);
                 exit(!!errs);
         }
  ###### File: scanner.mk