X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=6cb5a7371aa3935663d05df35a186ef8d2a6da32;hb=69756809ef2d0324ec94b277f9750f92db4ae416;hp=0bef7934d3ec112b77eff4ecd4a007240ff92f34;hpb=77165c59bca010dba0bdb8775552d6af944046a3;p=ocean diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 0bef793..6cb5a73 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -5,9 +5,9 @@ fragments, analyses it, and can report details about the analysis and write out C code files which can be compiled to make a parser. "2D support" means that indentation and line breaks can be significant. -Indent tokens (IN and OUT) and end-of-line (EOL) tokens can be used to -describe the grammar and the parser can selectively ignore these where -they aren't relevant. +Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can +be used to describe the grammar and the parser can selectively ignore +these where they aren't relevant. There are several distinct sections. @@ -152,11 +152,11 @@ because that is what we need to detect tags. Productions are comprised primarily of symbols - terminal and non-terminal. We do not make a syntactic distinction between the two, -though non-terminals must be identifiers. Non-terminal symbols are -those which appear in the head of a production, terminal symbols are -those which don't. There are also "virtual" symbols used for precedence -marking discussed later, and sometimes we won't know what type a symbol -is yet. +though non-terminals must be identifiers while terminals can also be +marks. Non-terminal symbols are those which appear in the head of a +production, terminal symbols are those which don't. There are also +"virtual" symbols used for precedence marking discussed later, and +sometimes we won't know what type a symbol is yet. To help with code safety it is possible to declare the terminal symbols. If this is done, then any symbol used in a production that does not @@ -291,7 +291,7 @@ carry precedence information but are not included in the grammar. A production can declare that it inherits the precedence of a given virtual symbol. -This use for `$$` precludes it from being used as a symbol in the +This use of `$$` precludes it from being used as a symbol in the described language. Two other symbols: `${` and `}$` are also unavailable. @@ -393,9 +393,10 @@ here is told which was found in the `isref` argument. found += 1; t = token_next(ts); } - if (found == 0) + if (found == 0) { err = "No symbols given on precedence/TERM line"; goto abort; + } return NULL; abort: while (t.num != TK_newline && t.num != TK_eof) @@ -514,10 +515,6 @@ Now we have all the bits we need to parse a full production. tk = token_next(state); while (tk.num == TK_ident || tk.num == TK_mark) { struct symbol *bs = sym_find(g, tk.txt); - if (bs->type == Unknown) { - if (!g->terminals_declared) - bs->type = Terminal; - } if (bs->type == Virtual) { err = "Virtual symbol not permitted in production"; goto abort; @@ -691,20 +688,22 @@ used as a terminal anywhere that a terminal is expected. goto abort; } token_close(state); - if (g->terminals_declared) { - struct symbol *s; - int errs = 0; - for (s = g->syms; s; s = s->next) { - if (s->type != Unknown) - continue; - errs += 1; - fprintf(stderr, "Token %.*s not declared\n", - s->name.len, s->name.txt); - } - if (errs) { - free(g); // FIXME free content - g = NULL; + + struct symbol *s; + for (s = g->syms; s; s = s->next) { + if (s->type != Unknown) + continue; + if (!g->terminals_declared) { + s->type = Terminal; + continue; } + err = "not declared"; + fprintf(stderr, "Token %.*s not declared\n", + s->name.len, s->name.txt); + } + if (err) { + free(g); // FIXME free content + g = NULL; } return g; abort: @@ -728,8 +727,8 @@ and to simplify some comparisons of sets, these sets will be stored in a list of unique sets, each assigned a number. Once we have the data structures in hand to manage these sets and lists, -we can start setting the 'nullable' flag, build the 'FIRST' sets, and -then create the item sets which define the various states. +we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW' +sets, and then create the item sets which define the various states. ### Symbol sets. @@ -2042,9 +2041,9 @@ When the parser engine decides to reduce a production, it calls `do_reduce` which runs the code that was included with the production, if any. -This code needs to be able to store data somewhere. Rather than -requiring `do_reduce` to `malloc` that "somewhere", we pass in a large -buffer and have `do_reduce` return the size to be saved. +This code needs to be able to store data somewhere so we record the size +of the data expected with each state so it can be allocated before +`do_reduce` is called. In order for the code to access "global" context, we pass in the "config" pointer that was passed to the parser function. If the `struct @@ -2620,14 +2619,14 @@ The stack usually won't grow very large - maybe a few tens of entries. So we dynamically grow an array as required but never bother to shrink it down again. -We keep the stack as two separate allocations. One, `asn_stack` stores +We keep the stack as two separate allocations. One, `asn_stack`, stores the "abstract syntax nodes" which are created by each reduction. When we call `do_reduce` we need to pass an array of the `asn`s of the body of the production, and by keeping a separate `asn` stack, we can just pass a pointer into this stack. The other allocation stores all other stack fields of which there are -several. The `state` is the most important one and guides the parsing +two. The `state` is the most important one and guides the parsing process. The `sym` is nearly unnecessary as it is implicit in the state. However when we want to free entries from the `asn_stack`, it helps to know what type they are so we can call the right freeing @@ -2641,13 +2640,6 @@ bottom of stack holds the start state but no symbol, as nothing came before the beginning. As we need to store some value, `TK_eof` is used to mark the beginning of the file as well as the end. -Indents (IN) are sometimes shifted and sometimes only accounted. -Whatever decision is made must apply equally to the matching OUT. To -ensure this we keep a stack of bits in `ignored_indents` and -`indent_depth`. When we process an IN, we record if it was ignored. -When we see an out, we pop of the relavant bit and use it to decide how -to handle the OUT. - ###### parser functions struct parser { @@ -2793,15 +2785,15 @@ might also be ignored. Ignoring tokens is combined with shifting. } Indents are ignored unless they can be shifted onto the stack -immediately. The end of an indented section - the OUT token - is -ignored precisely when the indent was ignored. To keep track of this we -need a small stack of flags, which is easily stored as bits in an -`unsigned long`. This will never overflow and the scanner only allows -20 levels of indentation. +immediately or nothing can be shifted (in which case we reduce, and try +again). The end of an indented section - the OUT token - is ignored +precisely when the indent was ignored. To keep track of this we need a +small stack of flags, which is easily stored as bits in an `unsigned +long`. This will never overflow and the scanner only allows 20 levels +of indentation. ###### parser state unsigned long ignored_indents; - int indent_depth; NEWLINE/EOL is ignored when in an indented section of text which was not explicitly expected by the grammar. So if the most recent indent is @@ -2813,10 +2805,10 @@ we try to reduce a production. ###### try shift or ignore if ((tk->num == TK_newline || tk->num == TK_out) && - (p.ignored_indents & (1 << p.indent_depth))) { + (p.ignored_indents & 1)) { /* indented, so ignore OUT and NEWLINE */ if (tk->num == TK_out) - p.indent_depth -= 1; + p.ignored_indents >>= 1; free(tk); tk = NULL; parser_trace_action(trace, "Ignore"); @@ -2826,11 +2818,10 @@ we try to reduce a production. switch (shift(&p, tk->num, tk, states)) { case 1: if (tk->num == TK_out) - p.indent_depth -= 1; - if (tk->num == TK_in) { - p.indent_depth += 1; - p.ignored_indents &= ~(1 << p.indent_depth); - } + p.ignored_indents >>= 1; + if (tk->num == TK_in) + p.ignored_indents <<= 1; + tk = NULL; /* fallthrough */ case 2: @@ -2839,12 +2830,12 @@ we try to reduce a production. continue; } - if (tk->num == TK_in) { - /* No indent expected here, so ignore IN */ + if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) { + /* No indent expected here and reduce is not mandatory, so ignore IN */ free(tk); tk = NULL; - p.indent_depth += 1; - p.ignored_indents |= (1 << p.indent_depth); + p.ignored_indents <<= 1; + p.ignored_indents |= 1; parser_trace_action(trace, "Ignore"); continue; }