#include <stdio.h>
## parser includes
## parser functions
- ## parser
+ ## parser_run
###### File: calc.cgm
## demo grammar
###### File: parsergen.mk
};
The strings reported by `mdcode` and `scanner` are `struct text` which have
-length rather than being null terminated. To help with printing an
+length rather than being null terminated. To help with printing and
comparing we define `text_is` and `prtxt`, which should possibly go in
`mdcode`. `scanner` does provide `text_dump` which is useful for strings
which might contain control characters.
is given, the precedence is inherited from the last symbol in the
production which has a precedence specified.
-After the optional precedence may come the `${` mark. This indicates
+After the optional precedence may come the `${` mark. This indicates
the start of a code fragment. If present, this must be on the same
line as the start of the production.
information for the particular symbol in the production. `$0` is the
head of the production, `$1` is the first symbol of the body, etc.
The type of `$N` for a terminal symbol is `struct token`. For
-non-terminal, it is whatever has been declared for that symbol.
+a non-terminal, it is whatever has been declared for that symbol.
While building productions we will need to add to an array which needs to
grow dynamically.
that we need to parse a grammar from a `code_node`.
The head of the first production will effectively be the `start` symbol of
-the grammar. However it wont _actually_ be so. Processing the grammar is
+the grammar. However it won't _actually_ be so. Processing the grammar is
greatly simplified if the real start symbol only has a single production,
-and expect `$eof` as the final terminal. So when we find the first explicit
-production we insert an extra production as production zero which looks like
+and expects `$eof` as the final terminal. So when we find the first
+explicit production we insert an extra production as production zero which
+looks like
###### Example: production 0
$start -> START $eof
-where `START` is the first non-terminal give.
+where `START` is the first non-terminal given.
###### create production zero
struct production *p = calloc(1,sizeof(*p));
p->head->type = Nonterminal;
array_add(&p->body, &p->body_size, head);
array_add(&p->body, &p->body_size, sym_find(g, eof));
- g->start = p->head->num;
+ g->start = p->head->num;
p->head->first_production = g->production_count;
array_add(&g->productions, &g->production_count, p);
else
err = "First production must have a head";
} else if (tk.num == TK_mark
- && text_is(tk.txt, "$")) {
+ && text_is(tk.txt, "$")) {
err = dollar_line(state, g);
} else {
err = "Unrecognised token at start of line.";
1. LR(0) or SLR(1), where no look-ahead is considered.
2. LALR(1) where we build look-ahead sets with each item and merge
the LA sets when we find two paths to the same "kernel" set of items.
-3. LR(1) where different look-ahead for any item in the code means
+3. LR(1) where different look-ahead for any item in the set means
a different state must be created.
###### forward declarations
itemset, so we need to ignore the offset=zero items which are added during
completion.
-To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
-the list in the symset, and then only compare items before the first "0".
+To facilitate this, we modify the "DOT" number so that "0" sorts to
+the end of the list in the symset, and then only compare items before
+the first "0".
###### declarations
static inline unsigned short item_num(int production, int index)
la = save_set(g, eof);
first = INIT_DATASET;
}
- // production 0, offset 0 (with no data)
+ // production 0, offset 0 (with no data)
symset_add(&first, item_num(0, 0), la);
add_itemset(g, first, type);
for (again = 0, is = g->items;
scanner.
`parse_XX` then calls the library function `parser_run` to actually complete
-the parse, This needs the `states` table and function to call the various
+the parse. This needs the `states` table and a function to call the various
pieces of code provided in the grammar file, so they are generated first.
###### parser_generate
### The state stack.
-The core data structure for the parser is the stack. This track all the
+The core data structure for the parser is the stack. This tracks all the
symbols that have been recognised or partially recognised.
The stack usually won't grow very large - maybe a few tens of entries. So
production, and by keeping a separate `asn` stack, we can just pass a
pointer into this stack.
-The other allocation store all other stack fields of which there are two.
+The other allocation stores all other stack fields of which there are two.
The `state` is the most important one and guides the parsing process. The
`sym` is nearly unnecessary. However when we want to free entries from the
`asn_stack`, it helps to know what type they are so we can call the right
#### Shift and pop
-The operations are needed on the stack - shift (which is like push) and pop.
+Two operations are needed on the stack - shift (which is like push) and pop.
-Shift applies no only to terminals but also to non-terminals. When we
+Shift applies not only to terminals but also to non-terminals. When we
reduce a production we will pop off entries corresponding to the body
symbols, then push on an item for the head of the production. This last is
exactly the same process as shifting in a terminal so we use the same
}
`pop` simply moves the top of stack (`tos`) back down the required amount
-and frees and `asn` entries that need to be freed. It is called _after_ we
+and frees any `asn` entries that need to be freed. It is called _after_ we
reduce a production, just before we `shift` the nonterminal in.
###### parser functions
### Memory allocation
The `scanner` returns tokens in a local variable - we want them in allocated
-memory so they can live in the `asn_stack`. Similarly the `asn` produce by
+memory so they can live in the `asn_stack`. Similarly the `asn` produced by
a reduce is in a large buffer. Both of these require some allocation and
copying, hence `memdup` and `tokcopy`.
we do. If the production we reduced was production zero, then we have
accepted the input and can finish.
+We return whatever `asn` was returned by reducing production zero.
+
If we can neither shift nor reduce we have an error to handle. We pop
-single entries off the stack until we can shift the `TK_error` symbol, the
+single entries off the stack until we can shift the `TK_error` symbol, then
drop input tokens until we find one we can shift into the new error state.
-We return whatever `asn` was returned by reducing production zero.
###### parser includes
#include "parser.h"
-###### parser
+###### parser_run
void *parser_run(struct token_state *tokens,
const struct state states[],
int (*do_reduce)(int, void**, void*),
accepted = 1;
continue;
}
- /* Error. we walk up the stack until we
+ /* Error. We walk up the stack until we
* find a state which will accept TK_error.
* We then shift in TK_error and see what state
* that takes us to.
exit(1);
}
}$
- | NEWLINE ${ printf("Blank line\n"); }$
+ | NEWLINE ${ printf("Blank line\n"); }$
| ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
$number
- Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
- | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
+ Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
+ | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
| Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
| Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
- | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$