X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=b78a00d3511e7825bcbe9ef8cbc7340b8ed66fae;hp=554e63c37c921faea604399f7a9dd1f968b7cc84;hb=e9af634f13a0957cd510052483b6f9e35a986596;hpb=29614f479bd2539b7401dc2e5b0df649727a15d8 diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 554e63c..b78a00d 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -5,6 +5,9 @@ fragments, analyses it, and can report details about the analysis and write out C code files which can be compiled to make a parser. "2D support" means that indentation and line breaks can be significant. +Indent tokens (IN and OUT) and end-of-line (EOL) tokens can be used to +describe the grammar and the parser can selectively ignore these where +they aren't relevant. There are several distinct sections. @@ -2638,6 +2641,13 @@ bottom of stack holds the start state but no symbol, as nothing came before the beginning. As we need to store some value, `TK_eof` is used to mark the beginning of the file as well as the end. +Indents (IN) are sometimes shifted and sometimes only accounted. +Whatever decision is made must apply equally to the matching OUT. To +ensure this we keep a stack of bits in `ignored_indents` and +`indent_depth`. When we process an IN, we record if it was ignored. +When we see an out, we pop of the relavant bit and use it to decide how +to handle the OUT. + ###### parser functions struct parser { @@ -2648,6 +2658,8 @@ to mark the beginning of the file as well as the end. void **asn_stack; int stack_size; int tos; + + ## parser state }; #### Shift and pop @@ -2721,8 +2733,8 @@ in. ### The heart of the parser. Now we have the parser. For each token we might shift it, trigger a -reduction, or start error handling. 2D tokens (IN, OUT, EOL) also need -to be handled. +reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE) might +also be ignored. Ignoring tokens is combined with shifting. ###### parser vars @@ -2745,73 +2757,37 @@ to be handled. ## handle error } +Indents are ignored unless they can be shifted onto the stack +immediately. The end of an indented section - the OUT token - is +ignored precisely when the indent was ignored. To keep track of this we +need a small stack of flags, which is easily stored as bits in an +`unsigned long`. This will never overflow and the scanner only allows +20 levels of indentation. -We return whatever `asn` was returned by reducing production zero. - -When we find `TK_in` and `TK_out` tokens which report indents we need -to handle them directly as the grammar cannot express what we want to -do with them. - -`TK_in` tokens are easy: we simply update indent count in the top stack frame to -record how many indents there are following the previous token. - -`TK_out` tokens must be canceled against an indent count -within the stack. If we can reduce some symbols that are all since -the most recent indent, then we do that first. If the minimum prefix -of the current state then extends back before the most recent indent, -that indent can be cancelled. If the minimum prefix is shorter then -the indent had ended prematurely and we must start error handling, which -is still a work-in-progress. - -`TK_newline` tokens are ignored unless the top stack frame records -that they are permitted. In that case they will not be considered for -shifting if it is possible to reduce some symbols that are all since -the most recent start of line. This is how a newline forcibly -terminates any line-like structure - we try to reduce down to at most -one symbol for each line where newlines are allowed. -A consequence of this is that a rule like - -###### Example: newlines - broken +###### parser state + unsigned long ignored_indents; + int indent_depth; - Newlines -> - | NEWLINE Newlines - IfStatement -> Newlines if .... - -cannot work, as the NEWLINE will never be shifted as the empty string -will be reduced first. Optional sets of newlines need to be include -in the thing that preceed: - -###### Example: newlines - works - - If -> if - | NEWLINE If - IfStatement -> If .... - -Here the NEWLINE will be shifted because nothing can be reduced until -the `if` is seen. +NEWLINE is ignored when in an indented section of text which was not +explicitly expected by the grammar. So if the most recent indent is +ignored, so is any EOL token. For other tokens, we shift the next token if that is possible, otherwise we try to reduce a production. ###### try shift or ignore - if (tk->num == TK_in) { + if ((tk->num == TK_newline || tk->num == TK_out) && + (p.ignored_indents & (1 << p.indent_depth))) { + /* indented, so ignore OUT and NEWLINE */ + if (tk->num == TK_out) + p.indent_depth -= 1; free(tk); tk = NULL; - parser_trace_action(trace, "Record"); + parser_trace_action(trace, "Ignore"); continue; } - if (tk->num == TK_out) { - if (1) { - // OK to cancel - free(tk); - tk = NULL; - parser_trace_action(trace, "Cancel"); - continue; - } - // fall through to error handling as both SHIFT and REDUCE - // will fail. - } + if (tk->num == TK_newline) { if (1) { free(tk); @@ -2821,12 +2797,28 @@ we try to reduce a production. } } if (shift(&p, tk->num, tk, states)) { + if (tk->num == TK_out) + p.indent_depth -= 1; + if (tk->num == TK_in) { + p.indent_depth += 1; + p.ignored_indents &= ~(1 << p.indent_depth); + } tk = NULL; parser_trace_action(trace, "Shift"); ## did shift continue; } + if (tk->num == TK_in) { + /* No indent expected here, so ignore IN */ + free(tk); + tk = NULL; + p.indent_depth += 1; + p.ignored_indents |= (1 << p.indent_depth); + parser_trace_action(trace, "Ignore"); + continue; + } + We have already discussed the bulk of the handling of a "reduce" action, with the `pop()` and `shift()` functions doing much of the work. There is a little more complexity needed to manage storage for the asn (Abstract