]> ocean-lang.org Git - ocean/blobdiff - csrc/scanner.mdc
oceani: simplify test in var_block_close.
[ocean] / csrc / scanner.mdc
index e54dac686bb8906734ba3bf712df8c4aa763edc6..6b706411f5010e3da61b6214742f3db39e91de77 100644 (file)
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
 There are a fixed set of token types, though particular tokens within
 those types can be distinguished via configuration.
 
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
 
 ###### public types
        struct token_config {
                int ignored;    // bit set of ignored tokens.
+               int return_comments;
                ## token config parameters
        };
 
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
 
 If the first character of a comment marker (i.e. '/') is a known mark,
 the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted.  So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known.  They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
 
 If `TK_mark` is ignored, then unknown marks are returned as errors.
 
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
                        /* found a longest-known-mark, still need to
                         * check for comments
                         */
-                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-                           (ch == '/' || ch == '*')) {
+                       if (is_comment(ignored, tk.txt)) {
                                /* Yes, this is a comment, not a '/' */
                                restore_unget_state(state);
                                tk.num = TK_error;
@@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words.
                prev = ch;
                save_unget_state(state);
                ch = get_char(state);
+               if (n >= 0)
+                       /* No need to worry about other token types */
+                       continue;
                if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                        /* If strings are allowed, a quote (Which isn't a known mark)
                         * mustn't be treated as part of an unknown mark.  It can be
-                        * part of a multi-line srtings though.
+                        * part of a multi-line string though.
                         */
                        break;
-               if (prev == '#' && n < 0)
-                       /* '#' is not a known mark, so assume it is a comment */
-                       break;
-               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
-                       restore_unget_state(state);
-                       break;
-               }
-               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
+
+               close_token(state, &tk);
+               if (is_comment(ignored, tk.txt)) {
+                       /* looks like a permitted comment, and not a known mark,
+                        * so assume it is a comment.
+                        */
                        restore_unget_state(state);
                        break;
                }
@@ -466,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
 followed by the start of a new string.
 
 ###### parse string
-       if (tk.txt.len == 3 &&
+       if (tk.txt.len >= 3 &&
            !(ignored & (1 << TK_multi_string)) &&
            is_quote(tk.txt.txt[0]) &&
            memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
        TK_block_comment,
 
 ###### internal functions
-       static int is_line_comment(struct text txt)
+       static int is_line_comment(int ignored, struct text txt)
        {
+               if (ignored & (1 << TK_line_comment))
+                       return 0;
                return (txt.len >= 1 && txt.txt[0] == '#') ||
                       (txt.len >= 2 && txt.txt[0] == '/' &&
                                        txt.txt[1] == '/');
        }
 
-       static int is_block_comment(struct text txt)
+       static int is_block_comment(int ignored, struct text txt)
        {
+               if (ignored & (1 << TK_block_comment))
+                       return 0;
                return txt.len >= 2 && txt.txt[0] == '/' &&
                       txt.txt[1] == '*';
        }
 
+       static int is_comment(int ignored, struct text txt)     /* non-zero if txt starts a comment type that is not ignored */
+       {
+               return is_line_comment(ignored, txt) ||         /* '#' or "//", unless TK_line_comment is ignored */
+                      is_block_comment(ignored, txt);          /* slash-star opener, unless TK_block_comment is ignored */
+       }
+
 #### Single line comments
 
 A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
 
 ###### parse comment
 
-       if (is_line_comment(tk.txt)) {
+       if (is_line_comment(ignored, tk.txt)) {
                while (!is_newline(ch) && !at_eon(state))
                        ch = get_char(state);
                if (is_newline(ch))
                        unget_char(state);
                close_token(state, &tk);
                tk.num = TK_line_comment;
-               if (ignored & (1 << TK_line_comment))
+               if (!state->conf->return_comments)
                        continue;
                return tk;
        }
@@ -617,7 +627,7 @@ the unget state (explained later).
 
 ###### parse comment
 
-       if (is_block_comment(tk.txt)) {
+       if (is_block_comment(ignored, tk.txt)) {
                wchar_t prev;
                int newlines = 0;
                reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
                        if (!is_newline(ch))
                                tk.num = TK_error;
                }
-               if (tk.num == TK_error ||
-                   !(ignored & (1 << TK_block_comment)))
+               if (tk.num == TK_error || state->conf->return_comments)
                        return tk;
                continue;
        }
@@ -716,6 +725,8 @@ ignored.
        int     indent_level;
        int     indent_sizes[20];
 
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
 #### Newlines
 
 Newlines can optionally be reported.  Newlines within a block comment
@@ -750,7 +761,7 @@ blank lines that have been skipped.
 
 When a Newline leads to the next block of code there is a question of
 whether the various Newline and OUT/IN tokens should appear to
-pbelong to the earlier or later block.  This is addressed by processing
+belong to the earlier or later block.  This is addressed by processing
 the tokens in two stages based on the relative indent levels of the
 two blocks (each block has a base indent to which the actual indents
 are added).
@@ -787,19 +798,29 @@ For this we store one more than the number of blank lines as
        int delayed_lines;
        int out_next;
 
-Generating these tokens involve two separate pieces of code.
+Generating these tokens involves two separate pieces of code.
 
 Firstly we need to recognise white space and count the indents and
 newlines.  These are recorded in the above state fields.
 
-Separately we need, on each call to `token_next`, we need to check if
+Separately we need, on each call to `token_next`, to check if
 there are some delayed tokens and if so we need to advance the state
 information and return one token.
 
+###### internal functions
+       static int state_indent(struct token_state *state)      /* indent value compared against indent_sizes[] by the delayed-token code */
+       {
+               if (state->node == NULL)
+                       return state->col;      /* no current node: only the column is available */
+               return state->node->indent - state->node->needs_strip + state->col;     /* node's indent, less its needs_strip, plus the current column */
+       }
+
 ###### white space
+       if (is_newline(ch))
+               state_check_node(state);
        if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                int newlines = 0;
-               int was_son = at_son(state);
+               int was_nl = is_newline(ch);
                if (ignored & (1<<TK_in)) {
                        if (!is_newline(ch))
                                continue;
@@ -810,22 +831,17 @@ information and return one token.
                        return tk;
                }
                // Indents are needed, so check all white space.
-               while (ch <= ' ' && !at_eon(state)) {
+               while (ch <= ' ' && ch != WEOF) {
                        if (is_newline(ch))
                                newlines += 1;
                        ch = get_char(state);
+                       if (is_newline(ch))
+                               state_check_node(state);
                }
-               if (at_eon(state)) {
-                       newlines += 1;
-                       if (state->node->next &&
-                           state->node->next->indent > state->node->indent)
-                               state->col = state->node->next->indent;
-                       else
-                               state->col = state->node->indent;
-               } else
+               if (ch != WEOF)
                        unget_char(state);
                state->delayed_lines = newlines;
-               state->out_next = was_son;
+               state->out_next = !was_nl;
                state->check_indent = 1;
                continue;
        }
@@ -833,7 +849,7 @@ information and return one token.
 ###### delayed tokens
 
        if (state->check_indent || state->delayed_lines) {
-               if (state->col < state->indent_sizes[state->indent_level]) {
+               if (state_indent(state) < state->indent_sizes[state->indent_level]) {
                        if (!state->out_next &&
                            !(ignored & (1<<TK_newline))) {
                                state->out_next = 1;
@@ -845,11 +861,12 @@ information and return one token.
                        tk.num = TK_out;
                        return tk;
                }
-               if (state->col > state->indent_sizes[state->indent_level] &&
+               if (state_indent(state) > state->indent_sizes[state->indent_level] &&
                    state->indent_level < sizeof(state->indent_sizes)-1) {
                        state->indent_level += 1;
-                       state->indent_sizes[state->indent_level] = state->col;
-                       state->delayed_lines -= 1;
+                       state->indent_sizes[state->indent_level] = state_indent(state);
+                       if (state->delayed_lines)
+                               state->delayed_lines -= 1;
                        tk.num = TK_in;
                        return tk;
                }
@@ -874,11 +891,6 @@ tokens will continue to return the same end-of-file token.
 
 ###### white space
        if (ch == WEOF) {
-               if (state->col) {
-                       state->col = 0;
-                       state->check_indent = 1;
-                       continue;
-               }
                tk.num = TK_eof;
                return tk;
        }
@@ -934,10 +946,11 @@ a flag that tells us whether or not we need to strip.
        int    offset;
        int    line;
        int    col;
+       int    strip_offset;
 
 ###### internal functions
 
-       static int do_strip(struct token_state *state)
+       static void do_strip(struct token_state *state)
        {
                int indent = 0;
                if (state->node->needs_strip) {
@@ -953,7 +966,28 @@ a flag that tells us whether or not we need to strip.
                                n -= 4;
                        }
                }
-               return indent;
+       }
+
+       static void state_check_node(struct token_state *state) /* once the current node is used up, move to the next node that has code */
+       {
+               if (!state->node)
+                       return; /* already past the last node */
+               if (state->node->code.len > state->offset)
+                       return; /* unread text remains in this node */
+
+               do
+                       state->node = state->node->next;
+               while (state->node && state->node->code.txt == NULL);   /* skip nodes whose code.txt is NULL */
+               state->offset = 0;
+               state->prev_offset = 0;
+               state->strip_offset = 0;
+               state->col = 0;
+               if (state->node == NULL)
+                       return; /* ran off the end of the node list; position fields stay zeroed */
+               state->line = state->node->line_no;
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;    /* offset recorded after do_strip(); at_son() compares prev_offset against it */
+       }
 
        static wint_t get_char(struct token_state *state)
@@ -962,18 +996,9 @@ a flag that tells us whether or not we need to strip.
                size_t n;
                mbstate_t mbstate;
 
+               state_check_node(state);
                if (state->node == NULL)
                        return WEOF;
-               if (state->node->code.len <= state->offset) {
-                       do
-                               state->node = state->node->next;
-                       while (state->node && state->node->code.txt == NULL);
-                       state->offset = 0;
-                       if (state->node == NULL)
-                               return WEOF;
-                       state->line = state->node->line_no;
-                       state->col = do_strip(state);
-               }
 
                ## before get_char
 
@@ -984,12 +1009,12 @@ a flag that tells us whether or not we need to strip.
                            &mbstate);
                if (n == -2 || n == 0) {
                        /* Not enough bytes - not really possible */
-                       next = '\n';
-                       state->offset = state->node->code.len;
+                       next = '\n';                            // NOTEST
+                       state->offset = state->node->code.len;  // NOTEST
                } else if (n == -1) {
                        /* error */
-                       state->offset += 1;
-                       next = 0x7f; // an illegal character
+                       state->offset += 1;                     // NOTEST
+                       next = 0x7f; // an illegal character    // NOTEST
                } else
                        state->offset += n;
 
@@ -997,7 +1022,8 @@ a flag that tells us whether or not we need to strip.
                        state->col += 1;
                } else if (is_newline(next)) {
                        state->line += 1;
-                       state->col = do_strip(state);
+                       do_strip(state);
+                       state->col = state->node->needs_strip;
                } else if (next == '\t') {
                        state->col = indent_tab(state->col);
                }
@@ -1116,7 +1142,7 @@ parsed too much already.  For that there is `reset_token`.
                tok->txt.len = 0;
        }
 
-Tokens make not cross into the next `code_node`, and some tokens can
+Tokens may not cross into the next `code_node`, and some tokens can
 include the newline at the end of a `code_node`, so we must be able to
 easily check if we have reached the end.  Equally we need to know if
 we are at the start of a node, as white space is treated a little
@@ -1126,7 +1152,7 @@ differently there.
 
        static int at_son(struct token_state *state)
        {
-               return state->offset == 0;
+               return state->prev_offset <= state->strip_offset;
        }
 
        static int at_eon(struct token_state *state)
@@ -1209,7 +1235,9 @@ As well as getting tokens, we need to be able to create the
                memset(state, 0, sizeof(*state));
                state->node = code;
                state->line = code->line_no;
-               state->col = do_strip(state);
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
                state->conf = conf;
                return state;
        }
@@ -1651,7 +1679,7 @@ Number parsing goes in `libnumber.c`
        ## number includes
        ## number functions
 
-###### File: number.h
+###### File: parse_number.h
        int number_parse(mpq_t num, char tail[3], struct text tok);
 
 ###### File: scanner.mk
@@ -1935,7 +1963,7 @@ String parsing goes in `libstring.c`
        ## string functions
        ## string main
 
-###### File: string.h
+###### File: parse_string.h
        int string_parse(struct token *tok, char escape,
                         struct text *str, char tail[3]);
 
@@ -1964,8 +1992,8 @@ the tokens one per line.
        #include <getopt.h>
        #include "mdcode.h"
        #include "scanner.h"
-       #include "number.h"
-       #include "string.h"
+       #include "parse_number.h"
+       #include "parse_string.h"
 
        static int errs;
        static void pr_err(char *msg)
@@ -2018,13 +2046,17 @@ the tokens one per line.
                        { "ignore-newline",     0, NULL, 'l'},
                        { "ignore-block-comment", 0, NULL, 'C'},
                        { "ignore-indent",      0, NULL, 'i'},
+                       { "return-comments",    0, NULL, 'r'},
                        { "file",               1, NULL, 'f'},
+                       { "section",            1, NULL, 's'},
                        { NULL,                 0, NULL, 0},
                };
-               static const char options[] = "W:w:n:NIMSzclCif:";
+               static const char options[] = "W:w:n:NIMSzclCirf:s:";
 
                struct section *table, *s, *prev;
                int opt;
+               char *section_name = NULL;
+               int section_found = 0;
 
                setlocale(LC_ALL,"");
                while ((opt = getopt_long(argc, argv, options, long_options, NULL))
@@ -2042,7 +2074,9 @@ the tokens one per line.
                        case 'C': conf.ignored |= 1 << TK_block_comment; break;
                        case 'l': conf.ignored |= 1 << TK_newline; break;
                        case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'r': conf.return_comments = 1; break;
                        case 'f': filename = optarg; break;
+                       case 's': section_name = optarg; break;
                        default: fprintf(stderr, "scanner: unknown option '%c'.\n",
                                         opt);
                                exit(1);
@@ -2079,6 +2113,12 @@ the tokens one per line.
 
                for (s = table; s;
                        (code_free(s->code), prev = s, s = s->next, free(prev))) {
+                       if (section_name &&
+                           (s->section.len != strlen(section_name) ||
+                            strncmp(s->section.txt, section_name, s->section.len) != 0))
+                               continue;
+                       if (section_name)
+                               section_found = 1;
                        printf("Tokenizing: %.*s\n", s->section.len,
                                s->section.txt);
                        state = token_open(s->code, &conf);
@@ -2121,6 +2161,10 @@ the tokens one per line.
                }
                if (conf.words_marks != known)
                        free(conf.words_marks);
+               if (section_name && !section_found) {
+                       fprintf(stderr, "scanner: section %s not found\n", section_name);
+                       errs = 1;
+               }
                exit(!!errs);
        }
 ###### File: scanner.mk