X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=0099f5785db32072118b228473d8aa94ed9bf09f;hp=5d5ea1ebc352b13eacfd809a11416ce1ba2e5e18;hb=6a0710bfe1622b5420ba53dca034ed18376b8740;hpb=c1a39a23cb7b948b142bee9ae6ad61b40c81b142 diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 5d5ea1e..0099f57 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -40,6 +40,7 @@ There are several distinct sections. #include #include ## parser includes + ## parser functions ## parser ###### File: calc.cgm ## demo grammar @@ -234,6 +235,12 @@ The data type name is simply stored and applied to the head of all subsequent productions. It must be the name of a structure, so `$expr` maps to `struct expr`. +Any productions given before the first data type will have no data type +and can carry no information. In order to allow other non-terminals to +have no type, the data type `$void` can be given. This does *not* mean +that `struct void` will be used, but rather than no type will be +associated with future non-terminals. + The precedence line must contain a list of symbols - typically terminal symbols, but not necessarily. It can only contain symbols that have not been seen yet, so precedence declaration must precede @@ -281,6 +288,8 @@ production inherits from the last symbol which has a precedence. assoc = Non; else { g->current_type = t.txt; + if (text_is(t.txt, "void")) + g->current_type.txt = NULL; t = token_next(ts); if (t.num != TK_newline) { err = "Extra tokens after type name"; @@ -636,7 +645,7 @@ allocated space as it can be derived directly from the current `cnt` using `((cnt - 1) | 7) + 1`. ###### functions - static void symset_add(struct symset *s, int key, int val) + static void symset_add(struct symset *s, unsigned short key, unsigned short val) { int i; int current = ((s->cnt-1) | 7) + 1; @@ -663,7 +672,7 @@ Finding a symbol (or item) in a `symset` uses a simple binary search. We return the index where the value was found (so data can be accessed), or `-1` to indicate failure. - static int symset_find(struct symset *ss, int key) + static int symset_find(struct symset *ss, unsigned short key) { int lo = 0; int hi = ss->cnt; @@ -695,7 +704,7 @@ can be optimised later. int added = 0; for (i = 0; i < b->cnt; i++) if (symset_find(a, b->syms[i]) < 0) { - int data = 0; + unsigned short data = 0; if (b->data != NO_DATA) data = b->data[i]; symset_add(a, b->syms[i], data); @@ -1003,7 +1012,7 @@ the list in the symset, and then only compare items before the first "0". ###### declarations static inline unsigned short item_num(int production, int index) { - return production | (((index-1)&0x1f) << 11); + return production | ((31-index) << 11); } static inline int item_prod(unsigned short item) { @@ -1011,7 +1020,7 @@ the list in the symset, and then only compare items before the first "0". } static inline int item_index(unsigned short item) { - return ((item >> 11)+1) & 0x1f; + return (31-(item >> 11)) & 0x1f; } For LR(1) analysis we need to compare not just the itemset in a state @@ -1168,7 +1177,7 @@ though. int p2; struct symbol *s; struct symset LA = INIT_SYMSET; - int sn = 0; + unsigned short sn = 0; if (bs == pr->body_size) continue; @@ -1226,7 +1235,7 @@ with a pre-existing itemset). // if they don't exist. for (i = 0; i < done.cnt; i++) { int j; - int state; + unsigned short state; struct symset newitemset = INIT_SYMSET; if (type >= LALR) newitemset = INIT_DATASET; @@ -1236,7 +1245,7 @@ with a pre-existing itemset). int p = item_prod(itm); int bp = item_index(itm); struct production *pr = g->productions[p]; - int la = 0; + unsigned short la = 0; int pos; if (bp == pr->body_size) @@ -1277,7 +1286,7 @@ with `TK_eof` as the LA set. struct symset first = INIT_SYMSET; struct itemset *is; int again; - int la = 0; + unsigned short la = 0; if (type >= LALR) { // LA set just has eof struct symset eof = INIT_SYMSET; @@ -1296,6 +1305,7 @@ with `TK_eof` as the LA set. if (is->completed) continue; is->completed = 1; + again = 1; ## complete itemset ## derive itemsets symset_free(done); @@ -1607,7 +1617,7 @@ between the two. continue; /* First collect the shifts */ for (j = 0; j < is->items.cnt; j++) { - int itm = is->items.syms[j]; + unsigned short itm = is->items.syms[j]; int p = item_prod(itm); int bp = item_index(itm); struct production *pr = g->productions[p]; @@ -1622,7 +1632,7 @@ between the two. } /* Now look for reduction and conflicts */ for (j = 0; j < is->items.cnt; j++) { - int itm = is->items.syms[j]; + unsigned short itm = is->items.syms[j]; int p = item_prod(itm); int bp = item_index(itm); struct production *pr = g->productions[p]; @@ -1685,6 +1695,7 @@ pieces of code provided in the grammar file, so they are generated first. static void gen_parser(FILE *f, struct grammar *g, char *file, char *name) { gen_known(f, g); + gen_non_term(f, g); gen_goto(f, g); gen_states(f, g); gen_reduce(f, g, file); @@ -1699,7 +1710,7 @@ pieces of code provided in the grammar file, so they are generated first. fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n"); fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n"); fprintf(f, "\ttokens = token_open(code, config);\n"); - fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n"); + fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n"); fprintf(f, "\ttoken_close(tokens);\n"); fprintf(f, "\treturn rv;\n"); fprintf(f, "}\n\n"); @@ -1708,6 +1719,7 @@ pieces of code provided in the grammar file, so they are generated first. ### Table words table The know words is simply an array of terminal symbols. +The table of nonterminals used for tracing is a similar array. ###### functions @@ -1724,6 +1736,20 @@ The know words is simply an array of terminal symbols. fprintf(f, "};\n\n"); } + static void gen_non_term(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_non_term\"\n"); + fprintf(f, "static const char *non_term[] = {\n"); + for (i = TK_reserved; + i < g->num_syms; + i++) + if (g->symtab[i]->type == Nonterminal) + fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, + g->symtab[i]->name.txt); + fprintf(f, "};\n\n"); + } + ### States and the goto tables. For each state we record the goto table and the reducible production if @@ -1867,10 +1893,10 @@ to the appropriate type for each access. All this is handling in static void gen_reduce(FILE *f, struct grammar *g, char *file) { - int i, j; + int i; fprintf(f, "#line 0 \"gen_reduce\"\n"); fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n"); - fprintf(f, " void *ret, FILE *trace)\n"); + fprintf(f, " void *ret)\n"); fprintf(f, "{\n"); fprintf(f, "\tint ret_size = 0;\n"); @@ -1882,21 +1908,6 @@ to the appropriate type for each access. All this is handling in if (p->code.txt) gen_code(p, f, g); - fprintf(f, "\t\tif (trace) {\n"); - fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n", - p->head->name.len, p->head->name.txt); - for (j = 0; j < p->body_size; j++) { - if (p->body[j]->type == Terminal) { - fputs("\t\t\tfputs(\" \", trace);\n", f); - fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j); - } else { - fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n", - p->body[j]->name.len, - p->body[j]->name.txt); - } - } - fprintf(f, "\t\t}\n"); - if (p->head->struct_name.txt) fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n", p->head->struct_name.len, @@ -2195,7 +2206,7 @@ The parser generator has nicely provided us with goto tables sorted by symbol number. We need a binary search function to find a symbol in the table. -###### parser +###### parser functions static int search(const struct state *l, int sym) { @@ -2239,7 +2250,7 @@ The `state` is the most important one and guides the parsing process. The freeing function. The symbol leads us to the right free function through `do_free`. -###### parser +###### parser functions struct parser { int state; @@ -2270,7 +2281,7 @@ function reports if it could. So `shift` finds the next state. If that succeed it extends the allocations if needed and pushed all the information onto the stacks. -###### parser +###### parser functions static int shift(struct parser *p, int sym, void *asn, @@ -2299,7 +2310,7 @@ if needed and pushed all the information onto the stacks. and frees and `asn` entries that need to be freed. It is called _after_ we reduce a production, just before we `shift` the nonterminal in. -###### parser +###### parser functions static void pop(struct parser *p, int num, void(*do_free)(short sym, void *asn)) @@ -2323,7 +2334,7 @@ copying, hence `memdup` and `tokcopy`. ###### parser includes #include -###### parser +###### parser functions void *memdup(void *m, int len) { @@ -2359,9 +2370,9 @@ We return whatever `asn` was returned by reducing production zero. ###### parser void *parser_run(struct token_state *tokens, const struct state states[], - int (*do_reduce)(int, int, void**, void*, FILE*), + int (*do_reduce)(int, int, void**, void*), void (*do_free)(short, void*), - FILE *trace) + FILE *trace, const char *non_term[], int knowns) { struct parser p = { 0 }; struct token *tk; @@ -2370,12 +2381,10 @@ We return whatever `asn` was returned by reducing production zero. tk = tok_copy(token_next(tokens)); while (!accepted) { + if (trace) + parser_trace(trace, &p, tk, states, non_term, knowns); + if (shift(&p, tk->num, tk, states)) { - if (trace) { - fputs("Shift ", trace); - text_dump(trace, tk->txt, 20); - fputs("\n", trace); - } tk = tok_copy(token_next(tokens)); continue; } @@ -2391,9 +2400,7 @@ We return whatever `asn` was returned by reducing production zero. (p.tos - states[p.state].reduce_size); bufsize = do_reduce(prod, p.tos, body, - buf, trace); - if (trace) - fputs("\n", trace); + buf); pop(&p, size, do_free); shift(&p, sym, memdup(buf, bufsize), states); @@ -2408,7 +2415,7 @@ We return whatever `asn` was returned by reducing production zero. * Then we discard input tokens until * we find one that is acceptable. */ - while (shift(&p, TK_error, tk, states) < 0 + while (shift(&p, TK_error, tk, states) == 0 && p.tos > 0) // discard this state pop(&p, 1, do_free); @@ -2434,10 +2441,49 @@ We return whatever `asn` was returned by reducing production zero. ###### exported functions void *parser_run(struct token_state *tokens, const struct state states[], - int (*do_reduce)(int, int, void**, void*, FILE*), + int (*do_reduce)(int, int, void**, void*), void (*do_free)(short, void*), - FILE *trace); + FILE *trace, const char *non_term[], int knowns); + +### Tracing + +Being able to visualize the parser in action can be invaluable when +debugging the parser code, or trying to understand how the parse of a +particular grammar progresses. The stack contains all the important +state, so just printing out the stack every time around the parse loop +can make it possible to see exactly what is happening. +This doesn't explicitly show each SHIFT and REDUCE action. However they +are easily deduced from the change between consecutive lines, and the +details of each state can be found by cross referencing the states list +in the stack with the "report" that parsergen can generate. + +For terminal symbols, we just dump the token. For non-terminals we +print the name of the symbol. The look ahead token is reported at the +end inside square brackets. + +###### parser functions + + void parser_trace(FILE *trace, struct parser *p, + struct token *tk, const struct state states[], + const char *non_term[], int knowns) + { + int i; + for (i = 0; i < p->tos; i++) { + int sym = p->stack[i].sym; + fprintf(trace, "(%d) ", p->stack[i].state); + if (sym < TK_reserved + knowns) { + struct token *t = p->asn_stack[i]; + text_dump(trace, t->txt, 20); + } else + fputs(non_term[sym - TK_reserved - knowns], + trace); + fputs(" ", trace); + } + fprintf(trace, "(%d) [", p->state); + text_dump(trace, tk->txt, 20); + fputs("]\n", trace); + } # A Worked Example