There is a fixed set of token types, though particular tokens within
those types can be distinguished via configuration.
-Most token types may be explicitly ignored, as typically comments
-would be. The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing. The exact consequence of
+ignoring each token type varies from token to token.
###### public types
struct token_config {
int ignored; // bit set of ignored tokens.
+ int return_comments;
## token config parameters
};
###### token_next init
int ignored = state->conf->ignored;
-
The different tokens are numbers, words, marks, strings, comments,
newlines, EOF, and indents, each of which is examined in detail below.
with allowing commas to be used as the decimal indicator, and spaces
to be used to separate groups of digits in large numbers. Both of
these can reasonably be restricted to appear between two digits, so we
-have to add that condition to our tests.
+have to add that condition to our tests. For consistency we require
+every non-alpha-numeric to appear between two hex digits, with the
+exception that a sign can appear only after a 'p' or 'e', and a space
+can only appear between decimal digits. Allowing a space before a
+letter easily leads to confusion, such as in `a < 3 and b < 4`.
So we cannot just treat numbers as starting with a digit and being
followed by some set of characters. We need more structure than that.
So:
- Numbers must start with a digit.
-- If the first digit is zero, the next character must be a base
- signifier (one of `xob`) or a decimal marker (`.` or `,`).
- In the first case the first `p` or `P` may be followed by a sign.
+- If the first digit is zero, the next character should be a base
+ signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
+ enforced at this stage)
+ In the first case only the first `p` or `P` may be followed by a sign.
- If the number doesn't start with `0` followed by one of `xob`, the
first `e` may be followed by a sign.
-- Any digit or hex digit may be followed by a space or underscore
- providing that the subsequence character is also a (hex) digit.
+- A sign must always be followed by a digit.
+- Any digit may be followed by a space or underscore and any hex digit
+ may be followed by an underscore, providing that the subsequent character
+ is also a digit (for space) or hex digit (for underscore).
This rule will require an extra level of 'unget' to be
supported when handling characters.
- Otherwise any digits or ASCII letters are allowed. We do not at
###### parse number
if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
- int prev_special = 0;
+ int prev = 0;
int expect_p = 0;
int decimal_mark = 0;
if (ch == '0') {
int sign_ok = 0;
switch(expect_p) {
case 0:
- if (ch == 'e' || ch == 'E')
+ if (ch == 'e' || ch == 'E') {
sign_ok = 1;
+ decimal_mark = 1;
+ }
break;
case 1:
- if (ch == 'p' || ch == 'P')
+ if (ch == 'p' || ch == 'P') {
sign_ok = 1;
+ decimal_mark = 1;
+ }
break;
}
save_unget_state(state);
+ prev = ch;
ch = get_char(state);
- if (iswalnum(ch)) {
- prev_special = 0;
+
+ if (!iswalnum(prev)) {
+ /* special characters, like separators and decimal marks
+ * and signs, must be followed by a hexdigit, and the
+ * space and signs must be followed by a decimal digit.
+ */
+ if (!iswxdigit(ch) ||
+ ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
+ (prev == ' ' && !iswdigit(ch))) {
+ /* don't want the new char or the special */
+ restore_unget_state(state);
+ break;
+ }
+ }
+ if (iswalnum(ch))
continue;
+
+ if (!strchr(state->conf->number_chars, ch)) {
+ /* non-number char */
+ break;
}
if (ch == '+' || ch == '-') {
+ /* previous must be 'e' or 'p' in appropriate context */
if (!sign_ok)
break;
expect_p = -1;
+ } else if (ch == ' ') {
+ /* previous must be a digit */
+ if (!iswdigit(prev))
+ break;
+ } else {
+ /* previous must be a hex digit */
+ if (!iswxdigit(prev))
+ break;
}
if (ch == '.' || ch == ',') {
+ /* only one of these permitted */
if (decimal_mark)
break;
decimal_mark = 1;
}
- if (prev_special) {
- /* Don't allow that special char,
- * need two 'ungets'
- */
- restore_unget_state(state);
- break;
- }
- if (strchr(state->conf->number_chars, ch)) {
- prev_special = 1;
- continue;
- }
- /* non-number char */
- break;
}
/* We seem to have a "number" token */
unget_char(state);
If the first character of a comment marker (i.e. '/') is a known mark,
the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted. So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known. They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted. So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known. They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
If `TK_mark` is ignored, then unknown marks are returned as errors.
/* found a longest-known-mark, still need to
* check for comments
*/
- if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
- (ch == '/' || ch == '*')) {
+ if (is_comment(ignored, tk.txt)) {
/* Yes, this is a comment, not a '/' */
restore_unget_state(state);
tk.num = TK_error;
prev = ch;
save_unget_state(state);
ch = get_char(state);
- if (!(ignored && (1<<TK_string)) && is_quote(ch))
- break;
- if (prev == '#' && n < 0)
- /* '#' is not a known mark, so assume it is a comment */
- break;
- if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
- close_token(state, &tk);
- restore_unget_state(state);
+ if (n >= 0)
+ /* No need to worry about other token types */
+ continue;
+ if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
+ /* If strings are allowed, a quote (which isn't a known mark)
+ * mustn't be treated as part of an unknown mark. It can be
+ * part of a multi-line string though.
+ */
break;
- }
- if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
- close_token(state, &tk);
+
+ close_token(state, &tk);
+ if (is_comment(ignored, tk.txt)) {
+ /* looks like a permitted comment, and not a known mark,
+ * so assume it is a comment.
+ */
restore_unget_state(state);
break;
}
## parse comment
## unknown mark
-###### unknown mark
- if (tk.txt.len) {
- if (ignored & (1<<TK_mark))
- tk.num = TK_error;
- else
- tk.num = TK_mark;
- return tk;
- }
-
### Strings
Strings start with one of single quote, double quote, or back quote
###### internal functions
static int is_quote(wchar_t ch)
{
- return ch == '\'' || ch == '"' || ch == '`';
+ return ch == '\'' || ch == '"' || ch == '`'; // "
}
#### Multi-line strings
followed by the start of a new string.
###### parse string
- if (tk.txt.len == 3 &&
+ if (tk.txt.len >= 3 &&
!(ignored & (1 << TK_multi_string)) &&
is_quote(tk.txt.txt[0]) &&
memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
break;
}
}
+ while (!at_eon(state) && (ch = get_char(state)) &&
+ iswalpha(ch))
+ ;
+ unget_char(state);
close_token(state, &tk);
return tk;
}
TK_block_comment,
###### internal functions
- static int is_line_comment(struct text txt)
+ static int is_line_comment(int ignored, struct text txt)
{
+ if (ignored & (1 << TK_line_comment))
+ return 0;
return (txt.len >= 1 && txt.txt[0] == '#') ||
(txt.len >= 2 && txt.txt[0] == '/' &&
txt.txt[1] == '/');
}
- static int is_block_comment(struct text txt)
+ static int is_block_comment(int ignored, struct text txt)
{
+ if (ignored & (1 << TK_block_comment))
+ return 0;
return txt.len >= 2 && txt.txt[0] == '/' &&
txt.txt[1] == '*';
}
+ static int is_comment(int ignored, struct text txt)
+ {
+ return is_line_comment(ignored, txt) ||
+ is_block_comment(ignored, txt);
+ }
+
#### Single line comments
A single-line comment continues up to, but not including, the newline
###### parse comment
- if (is_line_comment(tk.txt)) {
+ if (is_line_comment(ignored, tk.txt)) {
while (!is_newline(ch) && !at_eon(state))
ch = get_char(state);
if (is_newline(ch))
unget_char(state);
close_token(state, &tk);
tk.num = TK_line_comment;
- if (ignored & (1 << TK_line_comment))
+ if (!state->conf->return_comments)
continue;
return tk;
}
###### parse comment
- if (is_block_comment(tk.txt)) {
+ if (is_block_comment(ignored, tk.txt)) {
wchar_t prev;
int newlines = 0;
reset_token(state, &tk);
if (!is_newline(ch))
tk.num = TK_error;
}
- if (tk.num == TK_error ||
- !(ignored & (1 << TK_block_comment)))
+ if (tk.num == TK_error || state->conf->return_comments)
return tk;
continue;
}
int indent_level;
int indent_sizes[20];
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
#### Newlines
Newlines can optionally be reported. Newlines within a block comment
When a Newline leads to the next block of code there is a question of
whether the various Newline and OUT/IN tokens should appear to
-pbelong to the earlier or later block. This is addressed by processing
+belong to the earlier or later block. This is addressed by processing
the tokens in two stages based on the relative indent levels of the
two blocks (each block has a base indent to which the actual indents
are added).
int delayed_lines;
int out_next;
-Generating these tokens involve two separate pieces of code.
+Generating these tokens involves two separate pieces of code.
Firstly we need to recognise white space and count the indents and
newlines. These are recorded in the above state fields.
-Separately we need, on each call to `token_next`, we need to check if
+Separately we need, on each call to `token_next`, to check if
there are some delayed tokens and if so we need to advance the state
information and return one token.
+###### internal functions
+ static int state_indent(struct token_state *state)
+ {
+ if (state->node == NULL)
+ return state->col;
+ return state->node->indent - state->node->needs_strip + state->col;
+ }
+
###### white space
+ if (is_newline(ch))
+ state_check_node(state);
if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
int newlines = 0;
- int was_son = at_son(state);
+ int was_nl = is_newline(ch);
if (ignored & (1<<TK_in)) {
if (!is_newline(ch))
continue;
return tk;
}
// Indents are needed, so check all white space.
- while (ch <= ' ' && !at_eon(state)) {
+ while (ch <= ' ' && ch != WEOF) {
if (is_newline(ch))
newlines += 1;
ch = get_char(state);
+ if (is_newline(ch))
+ state_check_node(state);
}
- if (at_eon(state)) {
- newlines += 1;
- if (state->node->next &&
- state->node->next->indent > state->node->indent)
- state->col = state->node->next->indent;
- else
- state->col = state->node->indent;
- } else
+ if (ch != WEOF)
unget_char(state);
state->delayed_lines = newlines;
- state->out_next = was_son;
+ state->out_next = !was_nl;
state->check_indent = 1;
continue;
}
-
###### delayed tokens
if (state->check_indent || state->delayed_lines) {
- if (state->col < state->indent_sizes[state->indent_level]) {
+ if (state_indent(state) < state->indent_sizes[state->indent_level]) {
if (!state->out_next &&
!(ignored & (1<<TK_newline))) {
state->out_next = 1;
tk.num = TK_out;
return tk;
}
- if (state->col > state->indent_sizes[state->indent_level] &&
+ if (state_indent(state) > state->indent_sizes[state->indent_level] &&
state->indent_level < sizeof(state->indent_sizes)-1) {
state->indent_level += 1;
- state->indent_sizes[state->indent_level] = state->col;
- state->delayed_lines -= 1;
+ state->indent_sizes[state->indent_level] = state_indent(state);
+ if (state->delayed_lines)
+ state->delayed_lines -= 1;
tk.num = TK_in;
return tk;
}
###### token types
TK_eof,
-
###### white space
if (ch == WEOF) {
- if (state->col) {
- state->col = 0;
- state->check_indent = 1;
- continue;
- }
tk.num = TK_eof;
return tk;
}
we have an unknown mark, otherwise this must be an error.
###### unknown mark
- /* one unknown character */
+
+ /* one unknown mark character */
+ if (tk.txt.len) {
+ close_token(state, &tk);
+ if (ignored & (1<<TK_mark))
+ tk.num = TK_error;
+ else
+ tk.num = TK_mark;
+ return tk;
+ }
+ /* Completely unrecognised character is next, possibly
+ * a digit and we are ignoring numbers.
+ * Whatever it is, make it an error.
+ */
+ get_char(state);
close_token(state, &tk);
tk.num = TK_error;
return tk;
int offset;
int line;
int col;
+ int strip_offset;
###### internal functions
static void do_strip(struct token_state *state)
{
+ int indent = 0;
if (state->node->needs_strip) {
int n = 4;
while (n && state->node->code.txt[state->offset] == ' ') {
+ indent += 1;
state->offset += 1;
n -= 1;
}
while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+ indent = indent_tab(indent);
state->offset += 1;
n -= 4;
}
}
}
+ static void state_check_node(struct token_state *state)
+ {
+ if (!state->node)
+ return;
+ if (state->node->code.len > state->offset)
+ return;
+
+ do
+ state->node = state->node->next;
+ while (state->node && state->node->code.txt == NULL);
+ state->offset = 0;
+ state->prev_offset = 0;
+ state->strip_offset = 0;
+ state->col = 0;
+ if (state->node == NULL)
+ return;
+ state->line = state->node->line_no;
+ do_strip(state);
+ state->col = state->node->needs_strip;
+ state->strip_offset = state->offset;
+ }
+
static wint_t get_char(struct token_state *state)
{
wchar_t next;
size_t n;
mbstate_t mbstate;
+ state_check_node(state);
if (state->node == NULL)
return WEOF;
- if (state->node->code.len <= state->offset) {
- do
- state->node = state->node->next;
- while (state->node && state->node->code.txt == NULL);
- state->offset = 0;
- if (state->node == NULL)
- return WEOF;
- do_strip(state);
- state->line = state->node->line_no;
- state->col = state->node->indent;
- }
## before get_char
&mbstate);
if (n == -2 || n == 0) {
/* Not enough bytes - not really possible */
- next = '\n';
- state->offset = state->node->code.len;
+ next = '\n'; // NOTEST
+ state->offset = state->node->code.len; // NOTEST
} else if (n == -1) {
/* error */
- state->offset += 1;
- next = 0x7f; // an illegal character
+ state->offset += 1; // NOTEST
+ next = 0x7f; // an illegal character // NOTEST
} else
state->offset += n;
state->col += 1;
} else if (is_newline(next)) {
state->line += 1;
- state->col = state->node->indent;
do_strip(state);
+ state->col = state->node->needs_strip;
} else if (next == '\t') {
state->col = indent_tab(state->col);
}
static void close_token(struct token_state *state,
struct token *tk)
{
- tk->txt.len = (state->node->code.txt + state->offset)
- - tk->txt.txt;
+ if (state->node != tk->node)
+ tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
+ else
+ tk->txt.len = (state->node->code.txt + state->offset)
+ - tk->txt.txt;
}
static void reset_token(struct token_state *state, struct token *tok)
tok->txt.len = 0;
}
-
-Tokens make not cross into the next `code_node`, and some tokens can
+Tokens may not cross into the next `code_node`, and some tokens can
include the newline at the end of a `code_node`, so we must be able to
easily check if we have reached the end. Equally we need to know if
we are at the start of a node, as white space is treated a little
static int at_son(struct token_state *state)
{
- return state->offset == 0;
+ return state->prev_offset <= state->strip_offset;
}
static int at_eon(struct token_state *state)
memset(state, 0, sizeof(*state));
state->node = code;
state->line = code->line_no;
- state->col = code->indent;
- state->conf = conf;
do_strip(state);
+ state->col = state->node->needs_strip;
+ state->strip_offset = state->offset;
+ state->conf = conf;
return state;
}
void token_close(struct token_state *state)
Converting a `TK_number` token to a numerical value is a slightly
higher level task than lexical analysis, and slightly lower than
-grammar parsing, so put it here - as an index if you like.
+grammar parsing, so put it here - as an appendix if you like.
Importantly it will be used by the same testing rig that is used for
testing the token scanner.
int *placesp)
{
/* Accept digits up to 'base', ignore '_' and
- * ' ' if they appear between two legal digits,
- * and if `placesp` is not NULL, allow a single
- * '.' or ',' and report the number of digits
- * beyond there.
+ * (for base 10) ' ' if they appear between two
+ * legal digits, and if `placesp` is not NULL,
+ * allow a single '.' or ',' and report the number
+ * of digits beyond there.
* Return number of characters processed (p),
* or 0 if something illegal was found.
*/
int dig;
char c = tok.txt[p];
- if (c == '_' || c == ' ') {
+ if (c == '_' || (c == ' ' && base == 10)) {
if (prev != Digit)
goto bad;
prev = Space;
places until we have the exponent as well.
###### number vars
- int places =0;
+ int places = 0;
mpz_t mant;
int d;
tok.txt += d;
tok.len -= d;
-
Now that we have the mantissa and the exponent we can multiply them
together, also allowing for the number of digits after the decimal
mark.
If all goes well we check for the possible trailing letters and
return. Return value is 1 for success and 0 for failure.
-
###### number functions
int number_parse(mpq_t num, char tail[3], struct text tok)
{
## number includes
## number functions
-###### File: number.h
+###### File: parse_number.h
int number_parse(mpq_t num, char tail[3], struct text tok);
###### File: scanner.mk
## string functions
## string main
-###### File: string.h
+###### File: parse_string.h
int string_parse(struct token *tok, char escape,
struct text *str, char tail[3]);
libstring.o : libstring.c
$(CC) $(CFLAGS) -c libstring.c
-
## Testing
As "untested code is buggy code" we need a program to easily test
#include <stdio.h>
#include <gmp.h>
#include <locale.h>
+ #include <getopt.h>
#include "mdcode.h"
#include "scanner.h"
- #include "number.h"
- #include "string.h"
+ #include "parse_number.h"
+ #include "parse_string.h"
static int errs;
static void pr_err(char *msg)
fprintf(stderr, "%s\n", msg);
}
+ static int kcmp(const void *ap, const void *bp)
+ {
+ char * const *a = ap;
+ char * const *b = bp;
+ return strcmp(*a, *b);
+ }
+
int main(int argc, char *argv[])
{
int fd;
int len;
char *file;
+ char *filename = NULL;
struct token_state *state;
const char *known[] = {
"==",
.words_marks = known,
.number_chars = "., _+-",
.known_count = sizeof(known)/sizeof(known[0]),
- .ignored = (0 << TK_line_comment)
- |(0 << TK_block_comment),
+ .ignored = 0,
+ };
+ static const struct option long_options[] = {
+ { "word-start", 1, NULL, 'W'},
+ { "word-cont", 1, NULL, 'w'},
+ { "number-chars", 1, NULL, 'n'},
+ { "ignore-numbers", 0, NULL, 'N'},
+ { "ignore-ident", 0, NULL, 'I'},
+ { "ignore-marks", 0, NULL, 'M'},
+ { "ignore-strings", 0, NULL, 'S'},
+ { "ignore-multi-strings",0, NULL, 'z'},
+ { "ignore-line-comment",0, NULL, 'c'},
+ { "ignore-newline", 0, NULL, 'l'},
+ { "ignore-block-comment", 0, NULL, 'C'},
+ { "ignore-indent", 0, NULL, 'i'},
+ { "return-comments", 0, NULL, 'r'},
+ { "file", 1, NULL, 'f'},
+ { "section", 1, NULL, 's'},
+ { NULL, 0, NULL, 0},
};
+ static const char options[] = "W:w:n:NIMSzclCirf:s:";
+
struct section *table, *s, *prev;
+ int opt;
+ char *section_name = NULL;
+ int section_found = 0;
+
setlocale(LC_ALL,"");
- if (argc != 2) {
- fprintf(stderr, "Usage: scanner file\n");
- exit(2);
+ while ((opt = getopt_long(argc, argv, options, long_options, NULL))
+ != -1) {
+ switch(opt) {
+ case 'W': conf.word_start = optarg; break;
+ case 'w': conf.word_cont = optarg; break;
+ case 'n': conf.number_chars = optarg; break;
+ case 'N': conf.ignored |= 1 << TK_number; break;
+ case 'I': conf.ignored |= 1 << TK_ident; break;
+ case 'M': conf.ignored |= 1 << TK_mark; break;
+ case 'S': conf.ignored |= 1 << TK_string; break;
+ case 'z': conf.ignored |= 1 << TK_multi_string; break;
+ case 'c': conf.ignored |= 1 << TK_line_comment; break;
+ case 'C': conf.ignored |= 1 << TK_block_comment; break;
+ case 'l': conf.ignored |= 1 << TK_newline; break;
+ case 'i': conf.ignored |= 1 << TK_in; break;
+ case 'r': conf.return_comments = 1; break;
+ case 'f': filename = optarg; break;
+ case 's': section_name = optarg; break;
+ default: fprintf(stderr, "scanner: unknown option '%c'.\n",
+ opt);
+ exit(1);
+ }
+ }
+
+ if (optind < argc) {
+ const char **wm = calloc(argc - optind, sizeof(char*));
+ int i;
+ for (i = optind; i < argc; i++)
+ wm[i - optind] = argv[i];
+ qsort(wm, argc-optind, sizeof(char*), kcmp);
+ conf.words_marks = wm;
+ conf.known_count = argc - optind;
}
- fd = open(argv[1], O_RDONLY);
+
+ if (filename)
+ fd = open(filename, O_RDONLY);
+ else
+ fd = 0;
if (fd < 0) {
fprintf(stderr, "scanner: cannot open %s: %s\n",
- argv[1], strerror(errno));
+ filename, strerror(errno));
exit(1);
}
len = lseek(fd, 0, 2);
+ if (len <= 0) {
+ fprintf(stderr,"scanner: %s is empty or not seekable\n",
+ filename ?: "stdin");
+ exit(1);
+ }
file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
table = code_extract(file, file+len, pr_err);
for (s = table; s;
(code_free(s->code), prev = s, s = s->next, free(prev))) {
+ if (section_name &&
+ (s->section.len != strlen(section_name) ||
+ strncmp(s->section.txt, section_name, s->section.len) != 0))
+ continue;
+ if (section_name)
+ section_found = 1;
printf("Tokenizing: %.*s\n", s->section.len,
s->section.txt);
state = token_open(s->code, &conf);
if (tk.num == TK_eof)
break;
}
+ token_close(state);
+ }
+ if (conf.words_marks != known)
+ free(conf.words_marks);
+ if (section_name && !section_found) {
+ fprintf(stderr, "scanner: section %s not found\n", section_name);
+ errs = 1;
}
exit(!!errs);
}