char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
struct section *s = code_extract(file, file+len, NULL);
struct token_config config = {
- .ignored = (1 << TK_line_comment)
- | (1 << TK_block_comment),
.number_chars = ".,_+-",
.word_start = "",
.word_cont = "",
char *section = NULL;
struct parse_context context = {
.config = {
- .ignored = (1 << TK_line_comment)
- | (1 << TK_block_comment)
- | (1 << TK_mark),
+ .ignored = (1 << TK_mark),
.number_chars = ".,_+- ",
.word_start = "_",
.word_cont = "_",
} else if (tk.num == TK_mark
&& text_is(tk.txt, "$*")) {
err = dollar_line(state, g, 1);
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "//")) {
+ while (tk.num != TK_newline &&
+ tk.num != TK_eof)
+ tk = token_next(state);
} else {
err = "Unrecognised token at start of line.";
}
fprintf(f, "\tstruct token_state *tokens;\n");
fprintf(f, "\tconfig->words_marks = known;\n");
fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
- fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
fprintf(f, "\ttokens = token_open(code, config);\n");
fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
fprintf(f, "\ttoken_close(tokens);\n");
struct section *s;
struct token_config config = {
.ignored = (1 << TK_line_comment)
- | (1 << TK_block_comment)
| (1 << TK_in)
| (1 << TK_out),
.number_chars = ".,_+-",
# calc: grammar
$LEFT + -
- $LEFT * /
+ $LEFT * / //
Session -> Session Line
| Line
| Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
| Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+ | Expression // Expression ${ {
+ mpz_t z0, z1, z2;
+ mpq_init($0.val);
+ mpz_init(z0); mpz_init(z1); mpz_init(z2);
+ mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+ mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+ mpz_tdiv_q(z0, z1, z2);
+ mpq_set_z($0.val, z0);
+ mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+ } }$
| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
10 * 9 / 2
1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+ 355//113
+
error
Include a special test for numbers, as they are interesting.
###### test list
- scanner_tests += "test1,if,then,+,-"
- scanner_tests += "test1,if,then,+,-,/"
- scanner_tests += "test1,--ignore-indent,if,then,+,-,/"
+ scanner_tests += "test1,-r,if,then,+,-"
+ scanner_tests += "test1,-r,if,then,+,-,/"
+ scanner_tests += "test1,-r,--ignore-indent,if,then,+,-,/"
+ scanner_tests += "test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/"
scanner_tests += "test1,--ignore-indent,--ignore-newline,if,then,+,-,/"
- scanner_tests += "test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/"
- scanner_tests += "test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/"
+ scanner_tests += "test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/"
###### test: test1
lines */
divident /+ divisor
-###### output: test1,if,then,+,-
+###### output: test1,-r,if,then,+,-
Tokenizing:
2:0 ident(A)
2:2 ident(B)
32:0 newline()
32:0 eof()
-###### output: test1,if,then,+,-,/
+###### output: test1,-r,if,then,+,-,/
Tokenizing:
2:0 ident(A)
2:2 ident(B)
32:0 newline()
32:0 eof()
-###### output: test1,--ignore-indent,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,if,then,+,-,/
Tokenizing:
2:0 ident(A)
2:2 ident(B)
31:19 newline()
32:0 eof()
-###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-r,--ignore-indent,--ignore-newline,if,then,+,-,/
Tokenizing:
2:0 ident(A)
2:2 ident(B)
31:12 ident(divisor)
32:0 eof()
-###### output: test1,-Cc,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,--ignore-indent,--ignore-newline,if,then,+,-,/
Tokenizing:
2:0 ident(A)
2:2 ident(B)
31:12 ident(divisor)
32:0 eof()
-###### output: test1,-CcSz,--ignore-indent,--ignore-newline,if,then,+,-,/
+###### output: test1,-Sz,--ignore-indent,--ignore-newline,if,then,+,-,/
Tokenizing:
2:0 ident(A)
2:2 ident(B)
sometimes aren't.
###### test list
- scanner_tests += "errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
- scanner_tests += "errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-"
+ scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-"
+ scanner_tests += "errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-"
###### test: errtest
" \\ \t \n special chars in strings"
-###### output: errtest,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-W_,-w_,if,then,+,-
Tokenizing:
2:0 ERROR(multiple)
15:0 newline()
15:0 eof()
-###### output: errtest,--ignore-ident,--ignore-mark,-N,if,then,+,-
+###### output: errtest,-r,--ignore-ident,--ignore-mark,-N,if,then,+,-
Tokenizing:
2:0 ERROR(multiple)
2:9 ERROR(decimal)
There are a fixed set of token types, though particular tokens within
those types can be distinguish via configuration.
-Most token types may be explicitly ignored, as typically comments
-would be. The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing. The exact consequence of
+ignoring each token type varies from token to token.
###### public types
struct token_config {
int ignored; // bit set of ignored tokens.
+ int return_comments;
## token config parameters
};
If the first character of a comment marker (i.e. '/') is a known mark,
the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted. So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known. They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted. So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known. They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
If `TK_mark` is ignored, then unknown marks are returned as errors.
/* found a longest-known-mark, still need to
* check for comments
*/
- if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
- (ch == '/' || ch == '*')) {
+ if (is_comment(ignored, tk.txt)) {
/* Yes, this is a comment, not a '/' */
restore_unget_state(state);
tk.num = TK_error;
prev = ch;
save_unget_state(state);
ch = get_char(state);
+ if (n >= 0)
+ /* No need to worry about other token types */
+ continue;
if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
/* If strings are allowed, a quote (Which isn't a known mark)
* mustn't be treated as part of an unknown mark. It can be
- * part of a multi-line srtings though.
+ * part of a multi-line string though.
*/
break;
- if (prev == '#' && n < 0)
- /* '#' is not a known mark, so assume it is a comment */
- break;
- if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
- close_token(state, &tk);
- restore_unget_state(state);
- break;
- }
- if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
- close_token(state, &tk);
+
+ close_token(state, &tk);
+ if (is_comment(ignored, tk.txt)) {
+ /* looks like a permitted comment, and not a known mark,
+ * so assume it is a comment.
+ */
restore_unget_state(state);
break;
}
followed by the start of a new string.
###### parse string
- if (tk.txt.len == 3 &&
+ if (tk.txt.len >= 3 &&
!(ignored & (1 << TK_multi_string)) &&
is_quote(tk.txt.txt[0]) &&
memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
TK_block_comment,
###### internal functions
- static int is_line_comment(struct text txt)
+ static int is_line_comment(int ignored, struct text txt)
{
+ if (ignored & (1 << TK_line_comment))
+ return 0;
return (txt.len >= 1 && txt.txt[0] == '#') ||
(txt.len >= 2 && txt.txt[0] == '/' &&
txt.txt[1] == '/');
}
- static int is_block_comment(struct text txt)
+ static int is_block_comment(int ignored, struct text txt)
{
+ if (ignored & (1 << TK_block_comment))
+ return 0;
return txt.len >= 2 && txt.txt[0] == '/' &&
txt.txt[1] == '*';
}
+ static int is_comment(int ignored, struct text txt)
+ {
+ return is_line_comment(ignored, txt) ||
+ is_block_comment(ignored, txt);
+ }
+
#### Single line comments
A single-line comment continues up to, but not including the newline
###### parse comment
- if (is_line_comment(tk.txt)) {
+ if (is_line_comment(ignored, tk.txt)) {
while (!is_newline(ch) && !at_eon(state))
ch = get_char(state);
if (is_newline(ch))
unget_char(state);
close_token(state, &tk);
tk.num = TK_line_comment;
- if (ignored & (1 << TK_line_comment))
+ if (!state->conf->return_comments)
continue;
return tk;
}
###### parse comment
- if (is_block_comment(tk.txt)) {
+ if (is_block_comment(ignored, tk.txt)) {
wchar_t prev;
int newlines = 0;
reset_token(state, &tk);
if (!is_newline(ch))
tk.num = TK_error;
}
- if (tk.num == TK_error ||
- !(ignored & (1 << TK_block_comment)))
+ if (tk.num == TK_error || state->conf->return_comments)
return tk;
continue;
}
{ "ignore-newline", 0, NULL, 'l'},
{ "ignore-block-comment", 0, NULL, 'C'},
{ "ignore-indent", 0, NULL, 'i'},
+ { "return-comments", 0, NULL, 'r'},
{ "file", 1, NULL, 'f'},
{ "section", 1, NULL, 's'},
{ NULL, 0, NULL, 0},
};
- static const char options[] = "W:w:n:NIMSzclCif:s:";
+ static const char options[] = "W:w:n:NIMSzclCirf:s:";
struct section *table, *s, *prev;
int opt;
case 'C': conf.ignored |= 1 << TK_block_comment; break;
case 'l': conf.ignored |= 1 << TK_newline; break;
case 'i': conf.ignored |= 1 << TK_in; break;
+ case 'r': conf.return_comments = 1; break;
case 'f': filename = optarg; break;
case 's': section_name = optarg; break;
default: fprintf(stderr, "scanner: unknown option '%c'.\n",