There are a fixed set of token types, though particular tokens within
those types can be distinguish via configuration.
-Most token types may be explicitly ignored, as typically comments
-would be. The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing. The exact consequence of
+ignoring each token type varies from token to token.
###### public types
struct token_config {
int ignored; // bit set of ignored tokens.
+ int return_comments; // when non-zero, parsed comments are returned as tokens instead of being skipped.
## token config parameters
};
If the first character of a comment marker (i.e. '/') is a known mark,
the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted. So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known. They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted. So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known. They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
If `TK_mark` is ignored, then unknown marks are returned as errors.
/* found a longest-known-mark, still need to
* check for comments
*/
- if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
- (ch == '/' || ch == '*')) {
+ if (is_comment(ignored, tk.txt)) {
/* Yes, this is a comment, not a '/' */
restore_unget_state(state);
tk.num = TK_error;
prev = ch;
save_unget_state(state);
ch = get_char(state);
+ if (n >= 0)
+ /* No need to worry about other token types */
+ continue;
if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
/* If strings are allowed, a quote (Which isn't a known mark)
* mustn't be treated as part of an unknown mark. It can be
- * part of a multi-line srtings though.
+ * part of a multi-line string though.
*/
break;
- if (prev == '#' && n < 0)
- /* '#' is not a known mark, so assume it is a comment */
- break;
- if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
- close_token(state, &tk);
- restore_unget_state(state);
- break;
- }
- if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
- close_token(state, &tk);
+
+ close_token(state, &tk);
+ if (is_comment(ignored, tk.txt)) {
+ /* looks like a permitted comment, and not a known mark,
+ * so assume it is a comment.
+ */
restore_unget_state(state);
break;
}
followed by the start of a new string.
###### parse string
- if (tk.txt.len == 3 &&
+ if (tk.txt.len >= 3 &&
!(ignored & (1 << TK_multi_string)) &&
is_quote(tk.txt.txt[0]) &&
memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
TK_block_comment,
###### internal functions
- static int is_line_comment(struct text txt)
+ static int is_line_comment(int ignored, struct text txt) /* non-zero if txt begins with "#" or "//" and line comments are not ignored */
{
+ if (ignored & (1 << TK_line_comment)) /* TK_line_comment bit is set in the ignored mask ... */
+ return 0; /* ... so no text is recognised as the start of a line comment */
return (txt.len >= 1 && txt.txt[0] == '#') ||
(txt.len >= 2 && txt.txt[0] == '/' &&
txt.txt[1] == '/');
}
- static int is_block_comment(struct text txt)
+ static int is_block_comment(int ignored, struct text txt) /* non-zero if txt begins with the block-comment opener and block comments are not ignored */
{
+ if (ignored & (1 << TK_block_comment)) /* TK_block_comment bit is set in the ignored mask ... */
+ return 0; /* ... so no text is recognised as the start of a block comment */
return txt.len >= 2 && txt.txt[0] == '/' &&
txt.txt[1] == '*';
}
+ static int is_comment(int ignored, struct text txt) /* non-zero if txt starts either kind of comment that is not ignored */
+ {
+ return is_line_comment(ignored, txt) || /* each helper checks its own bit in the ignored mask */
+ is_block_comment(ignored, txt);
+ }
+
#### Single line comments
A single-line comment continues up to, but not including the newline
###### parse comment
- if (is_line_comment(tk.txt)) {
+ if (is_line_comment(ignored, tk.txt)) { /* false whenever TK_line_comment is in the ignored mask */
while (!is_newline(ch) && !at_eon(state))
ch = get_char(state);
if (is_newline(ch))
unget_char(state);
close_token(state, &tk);
tk.num = TK_line_comment;
- if (ignored & (1 << TK_line_comment))
+ if (!state->conf->return_comments) /* comment has been consumed; only hand it back when return_comments is set */
continue;
return tk;
}
###### parse comment
- if (is_block_comment(tk.txt)) {
+ if (is_block_comment(ignored, tk.txt)) {
wchar_t prev;
int newlines = 0;
reset_token(state, &tk);
if (!is_newline(ch))
tk.num = TK_error;
}
- if (tk.num == TK_error ||
- !(ignored & (1 << TK_block_comment)))
+ if (tk.num == TK_error || state->conf->return_comments)
return tk;
continue;
}
###### white space
if (ch == WEOF) {
- if (state->col) {
- state->col = 0;
- state->check_indent = 1;
- continue;
- }
tk.num = TK_eof;
return tk;
}
&mbstate);
if (n == -2 || n == 0) {
/* Not enough bytes - not really possible */
- next = '\n';
- state->offset = state->node->code.len;
+ next = '\n'; // NOTEST
+ state->offset = state->node->code.len; // NOTEST
} else if (n == -1) {
/* error */
- state->offset += 1;
- next = 0x7f; // an illegal character
+ state->offset += 1; // NOTEST
+ next = 0x7f; // an illegal character // NOTEST
} else
state->offset += n;
## number includes
## number functions
-###### File: number.h
+###### File: parse_number.h
int number_parse(mpq_t num, char tail[3], struct text tok);
###### File: scanner.mk
## string functions
## string main
-###### File: string.h
+###### File: parse_string.h
int string_parse(struct token *tok, char escape,
struct text *str, char tail[3]);
#include <getopt.h>
#include "mdcode.h"
#include "scanner.h"
- #include "number.h"
- #include "string.h"
+ #include "parse_number.h"
+ #include "parse_string.h"
static int errs;
static void pr_err(char *msg)
{ "ignore-newline", 0, NULL, 'l'},
{ "ignore-block-comment", 0, NULL, 'C'},
{ "ignore-indent", 0, NULL, 'i'},
+ { "return-comments", 0, NULL, 'r'},
{ "file", 1, NULL, 'f'},
{ "section", 1, NULL, 's'},
{ NULL, 0, NULL, 0},
};
- static const char options[] = "W:w:n:NIMSzclCif:s:";
+ static const char options[] = "W:w:n:NIMSzclCirf:s:";
struct section *table, *s, *prev;
int opt;
case 'C': conf.ignored |= 1 << TK_block_comment; break;
case 'l': conf.ignored |= 1 << TK_newline; break;
case 'i': conf.ignored |= 1 << TK_in; break;
+ case 'r': conf.return_comments = 1; break;
case 'f': filename = optarg; break;
case 's': section_name = optarg; break;
default: fprintf(stderr, "scanner: unknown option '%c'.\n",