newlines, EOF, and indents, each of which is examined in detail below.
There are various cases where no token can be found in part of the
-input. All of these will be reported as an `TK_error` token.
+input. All of these will be reported as a `TK_error` token.
It is possible to declare a number of strings which form distinct
tokens (rather than being grouped as e.g. 'word'). These are given
### Numbers
Numbers are the messiest tokens to parse, primarily because they can
-contain characters that also have meaning outside of number and,
+contain characters that also have meaning outside of numbers and,
particularly, immediately after numbers.
The obvious example is the '`-`' sign. It can come inside a number for
int sign_ok = 0;
switch(expect_p) {
case 0:
- if (ch == 'e')
+ if (ch == 'e' || ch == 'E')
sign_ok = 1;
break;
case 1:
- if (ch == 'p')
+ if (ch == 'p' || ch == 'P')
sign_ok = 1;
break;
}
Tokens matching these known words are reported as the index of the
list added to `TK_reserved`.
+If identifiers are ignored, then any word which is not listed as a
+known word results in an error.
+
###### token config parameters
const char **words_marks;
int known_count;
If an unknown mark contains a quote character or a comment marker, and
that token is not being ignored, then we terminate the unknown mark
-before that quote or comment. This ensure that an unknown mark
+before that quote or comment. This ensures that an unknown mark
immediately before a string is handled correctly.
-If `TK_mark` is ignored, then unknown marks as returned as an error.
+If the first character of a comment marker (i.e. '/') is a known mark,
+the above rules would suggest that the start of a comment would be
+parsed as that mark, which is not what is wanted. So the introductory
+sequences for a comment ("//" and "/*") are treated as
+partially-known. They prevent the leading "/" from being a mark by
+itself, but do not actually constitute a stand-alone mark.
+
+If `TK_mark` is ignored, then unknown marks are returned as errors.
###### token types
TK_mark,
if (n >= 0)
tk.num = TK_reserved + n;
else if (tk.num != TK_error) {
- /* found a longest-known-mark */
+ /* found a longest-known-mark, still need to
+ * check for comments
+ */
+ if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
+ (ch == '/' || ch == '*')) {
+ /* Yes, this is a comment, not a '/' */
+ restore_unget_state(state);
+ tk.num = TK_error;
+ break;
+ }
unget_char(state);
close_token(state, &tk);
return tk;
}
prev = ch;
- if (prev == '/')
- save_unget_state(state);
+ save_unget_state(state);
ch = get_char(state);
if (!(ignored && (1<<TK_string)) && is_quote(ch))
break;
- if (!(ignored && (1<<TK_line_comment)) &&
- prev == '/' && ch == '/') {
+ if (prev == '#' && n < 0)
+ /* '#' is not a known mark, so assume it is a comment */
+ break;
+ if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
+ close_token(state, &tk);
restore_unget_state(state);
break;
}
- if (!(ignored && (1<<TK_block_comment)) &&
- prev == '/' && ch == '*') {
+ if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
+ close_token(state, &tk);
restore_unget_state(state);
break;
}
}
unget_char(state);
- if (tk.num != TK_error)
+ if (tk.num != TK_error) {
+ close_token(state, &tk);
return tk;
+ }
+
+If we don't find a known mark, we will check for strings and comments
+before assuming that we have an unknown mark.
+
+###### parse mark
+ ## parse string
+ ## parse comment
+ ## unknown mark
###### unknown mark
if (tk.txt.len) {
these characters can be included in the list of known marks and then
they will not be used for identifying strings.
-Immediately following the close quote one or two ASCII letters may
+Immediately following the close quote, one or two ASCII letters may
appear. These are somewhat like the arbitrary letters allowed in
"Numbers" above. They can be used by the language in various ways.
* unget so the newline is seen,
* but return rest of string as an error.
*/
- unget_char(state);
+ if (is_newline(ch))
+ unget_char(state);
close_token(state, &tk);
tk.num = TK_error;
return tk;
!(ignored & (1<<TK_string))) {
wchar_t first = tk.txt.txt[0];
reset_token(state, &tk);
- get_char(state);
- do
+ ch = get_char(state);
+ tk.num = TK_error;
+ while (!at_eon(state) && !is_newline(ch)) {
ch = get_char(state);
- while (ch != first && !is_newline(ch));
- tk.num = TK_string;
- if (is_newline(ch)) {
- unget_char(state);
- tk.num = TK_error;
+ if (ch == first) {
+ tk.num = TK_string;
+ break;
+ }
+ if (is_newline(ch)) {
+ unget_char(state);
+ break;
+ }
}
+ while (!at_eon(state) && (ch = get_char(state)) &&
+ iswalpha(ch))
+ ;
+ unget_char(state);
close_token(state, &tk);
return tk;
}
These two comment types are reported as two separate token types, and
consequently can be ignored separately. When ignored a comment is
-parsed and discarded.
+still parsed, but is discarded.
###### token types
TK_line_comment,
#### Single line comments
-A single-line comment continues up to, but not including the newline.
+A single-line comment continues up to, but not including, the newline
+or end of node.
###### parse comment
if (is_line_comment(tk.txt)) {
- while (!is_newline(ch))
+ while (!is_newline(ch) && !at_eon(state))
ch = get_char(state);
- unget_char(state);
+ if (is_newline(ch))
+ unget_char(state);
close_token(state, &tk);
tk.num = TK_line_comment;
if (ignored & (1 << TK_line_comment))
###### white space
if (ch == WEOF) {
+ if (state->col) {
+ state->col = 0;
+ state->check_indent = 1;
+ continue;
+ }
tk.num = TK_eof;
return tk;
}
###### internal functions
- static void do_strip(struct token_state *state)
+ static int do_strip(struct token_state *state)
{
+ int indent = 0;
if (state->node->needs_strip) {
int n = 4;
while (n && state->node->code.txt[state->offset] == ' ') {
+ indent += 1;
state->offset += 1;
n -= 1;
}
while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+ indent = indent_tab(indent);
state->offset += 1;
n -= 4;
}
}
+ return indent;
}
static wint_t get_char(struct token_state *state)
state->offset = 0;
if (state->node == NULL)
return WEOF;
- do_strip(state);
state->line = state->node->line_no;
- state->col = state->node->indent;
+ state->col = do_strip(state);
}
## before get_char
state->col += 1;
} else if (is_newline(next)) {
state->line += 1;
- state->col = state->node->indent;
- do_strip(state);
+ state->col = do_strip(state);
} else if (next == '\t') {
state->col = indent_tab(state->col);
}
static void close_token(struct token_state *state,
struct token *tk)
{
- tk->txt.len = (state->node->code.txt + state->offset)
- - tk->txt.txt;
+ if (state->node != tk->node)
+ tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
+ else
+ tk->txt.len = (state->node->code.txt + state->offset)
+ - tk->txt.txt;
}
static void reset_token(struct token_state *state, struct token *tok)
## parse number
## parse word
## parse mark
- ## parse string
- ## parse comment
- ## unknown mark
### Start and stop
memset(state, 0, sizeof(*state));
state->node = code;
state->line = code->line_no;
- state->col = code->indent;
+ state->col = do_strip(state);
state->conf = conf;
- do_strip(state);
return state;
}
void token_close(struct token_state *state)
if (c == ' ')
skipped += 1;
else if (c == '\t')
- skipped = indent_tab(c);
+ skipped = indent_tab(skipped);
else
break;
i+= 1;
libmdcode.o libnumber.o libstring.o -licuuc -lgmp
scanner.o : scanner.c
$(CC) $(CFLAGS) -c scanner.c
-