###### token_next init
int ignored = state->conf->ignored;
-
The different tokens are numbers, words, marks, strings, comments,
newlines, EOF, and indents, each of which is examined in detail below.
### Numbers
Numbers are the messiest tokens to parse, primarily because they can
-contain characters that also have meaning outside of number and,
+contain characters that also have meaning outside of numbers and,
particularly, immediately after numbers.
The obvious example is the '`-`' sign. It can come inside a number for
Tokens matching these known words are reported as the index of the
list added to `TK_reserved`.
-If identifiers are ignored, then any work which is not listed as a
+If identifiers are ignored, then any word which is not listed as a
known word results in an error.
###### token config parameters
If an unknown mark contains a quote character or a comment marker, and
that token is not being ignored, then we terminate the unknown mark
-before that quote or comment. This ensure that an unknown mark
+before that quote or comment. This ensures that an unknown mark
immediately before a string is handled correctly.
-If `TK_mark` is ignored, then unknown marks as returned as an error.
+If the first character of a comment marker (i.e. '/') is a known mark,
+the above rules would suggest that the start of a comment would be
+parsed as that mark, which is not what is wanted. So the introductory
+sequences for a comment ("//" and "/*") are treated as
+partially-known. They prevent the leading "/" from being a mark by
+itself, but do not actually constitute a stand-alone mark.
+
+If `TK_mark` is ignored, then unknown marks are returned as errors.
###### token types
TK_mark,
prev = ch;
save_unget_state(state);
ch = get_char(state);
- if (!(ignored && (1<<TK_string)) && is_quote(ch))
+ if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
+ /* If strings are allowed, a quote (which isn't a known mark)
+ * mustn't be treated as part of an unknown mark. It can be
+ * part of a multi-line string though.
+ */
break;
if (prev == '#' && n < 0)
/* '#' is not a known mark, so assume it is a comment */
## parse comment
## unknown mark
-###### unknown mark
- if (tk.txt.len) {
- if (ignored & (1<<TK_mark))
- tk.num = TK_error;
- else
- tk.num = TK_mark;
- return tk;
- }
-
### Strings
Strings start with one of single quote, double quote, or back quote
these characters can be included in the list of known marks and then
they will not be used for identifying strings.
-Immediately following the close quote one or two ASCII letters may
+Immediately following the close quote, one or two ASCII letters may
appear. These are somewhat like the arbitrary letters allowed in
"Numbers" above. They can be used by the language in various ways.
###### internal functions
/* Return non-zero when `ch` is a single quote, double quote, or back quote. */
static int is_quote(wchar_t ch)
{
- return ch == '\'' || ch == '"' || ch == '`';
+ return ch == '\'' || ch == '"' || ch == '`'; // "
}
#### Multi-line strings
break;
}
}
+ while (!at_eon(state) && (ch = get_char(state)) &&
+ iswalpha(ch))
+ ;
+ unget_char(state);
close_token(state, &tk);
return tk;
}
continue;
}
-
###### delayed tokens
if (state->check_indent || state->delayed_lines) {
###### token types
TK_eof,
-
###### white space
if (ch == WEOF) {
if (state->col) {
we have an unknown mark, otherwise this must be an error.
###### unknown mark
- /* one unknown character */
+
+ /* one unknown mark character */
+ if (tk.txt.len) {
+ close_token(state, &tk);
+ if (ignored & (1<<TK_mark))
+ tk.num = TK_error;
+ else
+ tk.num = TK_mark;
+ return tk;
+ }
+ /* A completely unrecognised character is next, possibly
+ * a digit when we are ignoring numbers.
+ * Whatever it is, make it an error.
+ */
+ get_char(state);
close_token(state, &tk);
tk.num = TK_error;
return tk;
###### internal functions
/*
 * Skip the leading indentation of a code line when the current node has
 * `needs_strip` set: up to four spaces or, if no space was skipped, a
 * single tab.  state->offset is advanced past whatever was skipped.
 * The updated version returns the column reached by the skipped
 * characters (0 when nothing is stripped): each space counts as one
 * column and a tab advances as computed by indent_tab().
 */
- static void do_strip(struct token_state *state)
+ static int do_strip(struct token_state *state)
{
+ int indent = 0;
if (state->node->needs_strip) {
int n = 4; /* columns of indentation still allowed to be stripped */
while (n && state->node->code.txt[state->offset] == ' ') {
+ indent += 1;
state->offset += 1;
n -= 1;
}
/* n is still 4 only if no space was consumed; one tab then uses the whole allowance */
while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+ indent = indent_tab(indent);
state->offset += 1;
n -= 4;
}
}
+ return indent;
}
static wint_t get_char(struct token_state *state)
state->offset = 0;
if (state->node == NULL)
return WEOF;
- do_strip(state);
state->line = state->node->line_no;
- state->col = state->node->indent;
+ state->col = do_strip(state);
}
## before get_char
state->col += 1;
} else if (is_newline(next)) {
state->line += 1;
- state->col = state->node->indent;
- do_strip(state);
+ state->col = do_strip(state);
} else if (next == '\t') {
state->col = indent_tab(state->col);
}
/*
 * Record the length of the token text in tk->txt.len.
 * Normally the token ends at the scanner's current position
 * (state->node->code.txt + state->offset).  In the updated version, if
 * the scanner is no longer on the node the token belongs to, the token
 * instead runs from its start to the end of its own node's code.
 */
static void close_token(struct token_state *state,
struct token *tk)
{
- tk->txt.len = (state->node->code.txt + state->offset)
- - tk->txt.txt;
+ if (state->node != tk->node)
+ tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
+ else
+ tk->txt.len = (state->node->code.txt + state->offset)
+ - tk->txt.txt;
}
static void reset_token(struct token_state *state, struct token *tok)
tok->txt.len = 0;
}
-
Tokens may not cross into the next `code_node`, and some tokens can
include the newline at the end of a `code_node`, so we must be able to
easily check if we have reached the end. Equally we need to know if
memset(state, 0, sizeof(*state));
state->node = code;
state->line = code->line_no;
- state->col = code->indent;
+ state->col = do_strip(state);
state->conf = conf;
- do_strip(state);
return state;
}
void token_close(struct token_state *state)
tok.txt += d;
tok.len -= d;
-
Now that we have the mantissa and the exponent we can multiply them
together, also allowing for the number of digits after the decimal
mark.
If all goes well we check for the possible trailing letters and
return. Return value is 1 for success and 0 for failure.
-
###### number functions
int number_parse(mpq_t num, char tail[3], struct text tok)
{
libstring.o : libstring.c
$(CC) $(CFLAGS) -c libstring.c
-
## Testing
As "untested code is buggy code" we need a program to easily test
#include <stdio.h>
#include <gmp.h>
#include <locale.h>
+ #include <getopt.h>
#include "mdcode.h"
#include "scanner.h"
#include "number.h"
fprintf(stderr, "%s\n", msg);
}
/* qsort() comparator for an array of `char *` elements: orders the strings with strcmp(). */
+ static int kcmp(const void *ap, const void *bp)
+ {
+ char * const *a = ap;
+ char * const *b = bp;
+ return strcmp(*a, *b);
+ }
+
int main(int argc, char *argv[])
{
int fd;
int len;
char *file;
+ char *filename = NULL;
struct token_state *state;
const char *known[] = {
"==",
.words_marks = known,
.number_chars = "., _+-",
.known_count = sizeof(known)/sizeof(known[0]),
- .ignored = (0 << TK_line_comment)
- |(0 << TK_block_comment),
+ .ignored = 0,
+ };
+ static const struct option long_options[] = {
+ { "word-start", 1, NULL, 'W'},
+ { "word-cont", 1, NULL, 'w'},
+ { "number-chars", 1, NULL, 'n'},
+ { "ignore-numbers", 0, NULL, 'N'},
+ { "ignore-ident", 0, NULL, 'I'},
+ { "ignore-marks", 0, NULL, 'M'},
+ { "ignore-strings", 0, NULL, 'S'},
+ { "ignore-multi-strings",0, NULL, 'z'},
+ { "ignore-line-comment",0, NULL, 'c'},
+ { "ignore-newline", 0, NULL, 'l'},
+ { "ignore-block-comment", 0, NULL, 'C'},
+ { "ignore-indent", 0, NULL, 'i'},
+ { "file", 1, NULL, 'f'},
+ { NULL, 0, NULL, 0},
};
+ static const char options[] = "W:w:n:NIMSzclCif:";
+
struct section *table, *s, *prev;
+ int opt;
+
setlocale(LC_ALL,"");
- if (argc != 2) {
- fprintf(stderr, "Usage: scanner file\n");
- exit(2);
+ while ((opt = getopt_long(argc, argv, options, long_options, NULL))
+ != -1) {
+ switch(opt) {
+ case 'W': conf.word_start = optarg; break;
+ case 'w': conf.word_cont = optarg; break;
+ case 'n': conf.number_chars = optarg; break;
+ case 'N': conf.ignored |= 1 << TK_number; break;
+ case 'I': conf.ignored |= 1 << TK_ident; break;
+ case 'M': conf.ignored |= 1 << TK_mark; break;
+ case 'S': conf.ignored |= 1 << TK_string; break;
+ case 'z': conf.ignored |= 1 << TK_multi_string; break;
+ case 'c': conf.ignored |= 1 << TK_line_comment; break;
+ case 'C': conf.ignored |= 1 << TK_block_comment; break;
+ case 'l': conf.ignored |= 1 << TK_newline; break;
+ case 'i': conf.ignored |= 1 << TK_in; break;
+ case 'f': filename = optarg; break;
+ default: fprintf(stderr, "scanner: unknown option '%c'.\n",
+ opt);
+ exit(1);
+ }
+ }
+
+ if (optind < argc) {
+ const char **wm = calloc(argc - optind, sizeof(char*));
+ int i;
+ for (i = optind; i < argc; i++)
+ wm[i - optind] = argv[i];
+ qsort(wm, argc-optind, sizeof(char*), kcmp);
+ conf.words_marks = wm;
+ conf.known_count = argc - optind;
}
- fd = open(argv[1], O_RDONLY);
+
+ if (filename)
+ fd = open(filename, O_RDONLY);
+ else
+ fd = 0;
if (fd < 0) {
fprintf(stderr, "scanner: cannot open %s: %s\n",
- argv[1], strerror(errno));
+ filename, strerror(errno));
exit(1);
}
len = lseek(fd, 0, 2);
+ if (len <= 0) {
+ fprintf(stderr,"scanner: %s is empty or not seekable\n",
+ filename ?: "stdin");
+ exit(1);
+ }
file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
table = code_extract(file, file+len, pr_err);
if (tk.num == TK_eof)
break;
}
+ token_close(state);
}
+ if (conf.words_marks != known)
+ free(conf.words_marks);
exit(!!errs);
}
###### File: scanner.mk
libmdcode.o libnumber.o libstring.o -licuuc -lgmp
scanner.o : scanner.c
$(CC) $(CFLAGS) -c scanner.c
-