X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=76fec3a70a779cefe4589a9d79e861a61acd726e;hp=aacc59810be2e13aad50d1494171609797dcd71d;hb=ca00beb39b9d02578c1f0b556a2c2f70f28cf6e7;hpb=c7719a192d6d2a4934405775a208614a56b2ce72

diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index aacc598..76fec3a 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -21,7 +21,6 @@ There are several distinct sections.
    `parsergen` program built from the C code in this file can extract
    that grammar directly from this file and process it.
 
-
 ###### File: parsergen.c
 	#include <unistd.h>
 	#include <stdlib.h>
@@ -101,6 +100,7 @@ symbol.
 	struct production {
 		unsigned short precedence;
 		enum assoc assoc;
+		char line_like;	/* 1 if the production is marked `$$NEWLINE`, 2 if marked `$$OUT` */
 		## production fields
 	};
 	struct grammar {
@@ -278,6 +278,9 @@ declares how it associates.  This level is stored in each symbol
 listed and may be inherited by any production which uses the symbol.  A
 production inherits from the last symbol which has a precedence.
 
+The symbols on the first precedence line have the lowest precedence.
+Subsequent lines introduce symbols with higher precedence.
+
 ###### grammar fields
 	struct text current_type;
 	int type_isref;
@@ -494,12 +497,17 @@ Now we have all the bits we need to parse a full production.
 				goto abort;
 			}
 			vs = sym_find(g, tk.txt);
-			if (vs->type != Virtual) {
-				err = "symbol after $$ must be virtual";
+			if (vs->num == TK_newline)
+				p.line_like = 1;
+			else if (vs->num == TK_out)
+				p.line_like = 2;
+			else if (vs->precedence == 0) {
+				err = "symbol after $$ must have precedence";
 				goto abort;
+			} else {
+				p.precedence = vs->precedence;
+				p.assoc = vs->assoc;
 			}
-			p.precedence = vs->precedence;
-			p.assoc = vs->assoc;
 			tk = token_next(state);
 		}
 		if (tk.num == TK_open) {
@@ -839,7 +847,6 @@ array like the productions.
 		return sl->ss;
 	}
 
-
 ### Setting `nullable`
 
 We set `nullable` on the head symbol for any production for which all
@@ -877,7 +884,7 @@ changes happen.
 		}
 	}
 
-### Setting `can_eol` and `line_like`
+### Setting `line_like`
 
 In order to be able to ignore newline tokens when not relevant, but
 still include them in the parse when needed, we will need to know
@@ -885,30 +892,25 @@ which states can start a "line-like" section of code.  We ignore
 newlines when there is an indent since the most recent start of a
 line-like symbol.
 
-To know which symbols are line-like, we first need to know which
-symbols start with a NEWLINE token.  Any symbol which is followed by a
-NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
-Certainly when trying to parse one of these we must take note of NEWLINEs.
+A "line_like" symbol is simply any symbol that can derive a NEWLINE.
+If a symbol cannot derive a NEWLINE, then it is only part of a line -
+so is word-like.  If it can derive a NEWLINE, then we consider it to
+be like a line.
 
-Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
-which is the head of a production that contains a starts-with-NEWLINE
-symbol preceeded only by nullable symbols is also a
-starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
-this attribute of symbols, and compute it in a repetitive manner
-similar to `set_nullable`.
-
-Once we have that, we can determine which symbols are `line_like` by
-seeing which are followed by a `can_eol` symbol in any production.
+Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
+is the head of a production that contains a line-like symbol is also a
+line-like symbol.  We use a new field `line_like` to record this
+attribute of symbols, and compute it in a repetitive manner similar to
+`set_nullable`.
 
 ###### symbol fields
-	int can_eol;
 	int line_like;
 
 ###### functions
-	static void set_can_eol(struct grammar *g)
+	static void set_line_like(struct grammar *g)
 	{
 		int check_again = 1;
-		g->symtab[TK_newline]->can_eol = 1;
+		g->symtab[TK_newline]->line_like = 1;
 		while (check_again) {
 			int p;
 			check_again = 0;
@@ -916,35 +918,20 @@ seeing which are followed by a `can_eol` symbol in any production.
 				struct production *pr = g->productions[p];
 				int s;
 
-				if (pr->head->can_eol)
+				if (pr->head->line_like)
 					continue;
 
 				for (s = 0 ; s < pr->body_size; s++) {
-					if (pr->body[s]->can_eol) {
-						pr->head->can_eol = 1;
+					if (pr->body[s]->line_like) {
+						pr->head->line_like = 1;
 						check_again = 1;
 						break;
 					}
-					if (!pr->body[s]->nullable)
-						break;
 				}
 			}
 		}
 	}
 
-	static void set_line_like(struct grammar *g)
-	{
-		int p;
-		for (p = 0; p < g->production_count; p++) {
-			struct production *pr = g->productions[p];
-			int s;
-
-			for (s = 1; s < pr->body_size; s++)
-				if (pr->body[s]->can_eol)
-					pr->body[s-1]->line_like = 1;
-		}
-	}
-
 ### Building the `first` sets
 
 When calculating what can follow a particular non-terminal, we will need to
@@ -1180,9 +1167,10 @@ need to be consider for completion again.  So  a `completed` flag is needed.
 
 For correct handling of `TK_newline` when parsing, we will need to
 know which states (itemsets) can occur at the start of a line, so we
-will record a `starts_line` flag too.
+will record a `starts_line` flag too whenever DOT is at the start of a
+`line_like` symbol.
 
-Finally, for handling `TK_out` we need to know where production in the
+Finally, for handling `TK_out` we need to know whether productions in the
 current state started *before* the most recent indent.  A state
 doesn't usually keep details of individual productions, so we need to
 add one extra detail. `min_prefix` is the smallest non-zero number of
@@ -1301,19 +1289,18 @@ be supplemented by the LA set for the item which produce the new item.
 
 We also collect a set of all symbols which follow "DOT" (in `done`) as this
 is used in the next stage.
-If any of these symbols are flagged as starting a line, then this
+If any of these symbols are flagged as `line_like`, then this
 state must be a `starts_line` state so now is a good time to record that.
 
 When itemsets are created we assign a precedence to the itemset from
 the complete item, if there is one.  We ignore the possibility of
 there being two and don't (currently) handle precedence in such
 grammars.  When completing a grammar we ignore any item where DOT is
-followed by a terminal with a precedence lower (numerically higher)
-than that for the itemset.  Unless the terminal has right
-associativity, we also ignore items where the terminal has the same
-precedence.  The result is that unwanted items are still in the
-itemset, but the terminal doesn't get into the go to set, so the item
-is ineffective.
+followed by a terminal with a precedence lower than that for the
+itemset.  Unless the terminal has right associativity, we also ignore
+items where the terminal has the same precedence.  The result is that
+unwanted items are still in the itemset, but the terminal doesn't get
+into the go to set, so the item is ineffective.
 
 ###### complete itemset
 	for (i = 0; i < is->items.cnt; i++) {
@@ -1324,6 +1311,8 @@ is ineffective.
 		struct symbol *s;
 		struct symset LA = INIT_SYMSET;
 		unsigned short sn = 0;
+		struct symset LAnl = INIT_SYMSET;
+		unsigned short snnl = 0;
 
 		if (is->min_prefix == 0 ||
 		    (bs > 0 && bs < is->min_prefix))
@@ -1332,7 +1321,7 @@ is ineffective.
 			continue;
 		s = pr->body[bs];
 		if (s->precedence && is->precedence &&
-		    is->precedence < s->precedence)
+		    is->precedence > s->precedence)
 			/* This terminal has a low precedence and
 			 * shouldn't be shifted
 			 */
@@ -1361,6 +1350,10 @@ is ineffective.
 			}
 			sn = save_set(g, LA);
 			LA = set_find(g, sn);
+			if (symset_find(&LA, TK_newline))
+				symset_add(&LAnl, TK_newline, 0);
+			snnl = save_set(g, LAnl);
+			LAnl = set_find(g, snnl);
 		}
 
 		/* Add productions for this symbol */
@@ -1371,19 +1364,25 @@ is ineffective.
 			int itm = item_num(p2, 0);
 			int pos = symset_find(&is->items, itm);
 			if (pos < 0) {
-				symset_add(&is->items, itm, sn);
+				if (g->productions[p2]->line_like)
+					symset_add(&is->items, itm, snnl);
+				else
+					symset_add(&is->items, itm, sn);
 				/* Will have re-ordered, so start
 				 * from beginning again */
 				i = -1;
 			} else if (type >= LALR) {
 				struct symset ss = set_find(g, is->items.data[pos]);
 				struct symset tmp = INIT_SYMSET;
+				struct symset *la = &LA;
 
+				if (g->productions[p2]->line_like)
+					la = &LAnl;
 				symset_union(&tmp, &ss);
-				if (symset_union(&tmp, &LA)) {
+				if (symset_union(&tmp, la)) {
 					is->items.data[pos] = save_set(g, tmp);
 					i = -1;
-				}else
+				} else
 					symset_free(tmp);
 			}
 		}
@@ -1425,8 +1424,7 @@ with a pre-existing itemset).
 			pos = symset_find(&newitemset, pr->head->num);
 			if (bp + 1 == pr->body_size &&
 			    pr->precedence > 0 &&
-			    (precedence == 0 ||
-			     pr->precedence < precedence)) {
+			    pr->precedence > precedence) {
 				// new itemset is reducible and has a precedence.
 				precedence = pr->precedence;
 				assoc = pr->assoc;
@@ -1532,7 +1530,6 @@ changeover point in `first_nonterm`.
 			g->symtab[s->num] = s;
 
 		set_nullable(g);
-		set_can_eol(g);
 		set_line_like(g);
 		if (type >= SLR)
 			build_first(g);
@@ -1583,9 +1580,8 @@ show if it can end in a newline (`>`), if it is considered to be
 			if (!s)
 				continue;
 
-			printf(" %c%c%c%3d%c: ",
+			printf(" %c%c%3d%c: ",
 			       s->nullable ? '.':' ',
-			       s->can_eol ? '>':' ',
 			       s->line_like ? '<':' ',
 			       s->num, symtypes[s->type]);
 			prtxt(s->name);
@@ -1657,6 +1653,10 @@ it up a bit.  First the items, with production number and associativity.
 			printf(" [%d%s]", s->precedence,
 			       assoc_names[s->assoc]);
 		}
+		if (pr->line_like == 1)
+			printf(" $$NEWLINE");
+		else if (pr->line_like)
+			printf(" $$OUT");
 		printf("\n");
 	}
 
@@ -1679,7 +1679,6 @@ The LA sets which are (possibly) reported with each item:
 
 Then the go to sets:
 
-
 	static void report_goto(struct grammar *g, struct symset gt)
 	{
 		int i;
@@ -1796,6 +1795,11 @@ terminals to items where that terminal could be shifted and another
 which maps terminals to items that could be reduced when the terminal
 is in look-ahead.  We report when we get conflicts between the two.
 
+As a special case, if we find a SHIFT/REDUCE conflict on the NEWLINE
+terminal, we ignore it.  NEWLINEs are handled specially, with their own
+rules for when to shift and when to reduce.  Conflicts are expected,
+but handled internally.
+
 	static int conflicts_slr(struct grammar *g, enum grammar_type type)
 	{
 		int i;
@@ -1814,16 +1818,22 @@ is in look-ahead.  We report when we get conflicts between the two.
 				int p = item_prod(itm);
 				int bp = item_index(itm);
 				struct production *pr = g->productions[p];
+				struct symbol *s;
 
-				if (bp < pr->body_size &&
-				    pr->body[bp]->type == Terminal) {
-					/* shiftable */
-					int sym = pr->body[bp]->num;
-					if (symset_find(&shifts, sym) < 0)
-						symset_add(&shifts, sym, itm);
-				}
+				if (bp >= pr->body_size ||
+				    pr->body[bp]->type != Terminal)
+					/* not shiftable */
+					continue;
+
+				s = pr->body[bp];
+				if (s->precedence && is->precedence)
+					/* Precedence resolves this, so no conflict */
+					continue;
+
+				if (symset_find(&shifts, s->num) < 0)
+					symset_add(&shifts, s->num, itm);
 			}
-			/* Now look for reduction and conflicts */
+			/* Now look for reductions and conflicts */
 			for (j = 0; j < is->items.cnt; j++) {
 				unsigned short itm = is->items.syms[j];
 				int p = item_prod(itm);
@@ -1841,13 +1851,13 @@ is in look-ahead.  We report when we get conflicts between the two.
 				int k;
 				for (k = 0; k < la.cnt; k++) {
 					int pos = symset_find(&shifts, la.syms[k]);
-					if (pos >= 0) {
+					if (pos >= 0 && la.syms[k] != TK_newline) {
 						printf("  State %d has SHIFT/REDUCE conflict on ", i);
-						prtxt(g->symtab[la.syms[k]]->name);
+						cnt++;
+							prtxt(g->symtab[la.syms[k]]->name);
 						printf(":\n");
 						report_item(g, shifts.data[pos]);
 						report_item(g, itm);
-						cnt++;
 					}
 					pos = symset_find(&reduce, la.syms[k]);
 					if (pos < 0) {
@@ -1868,7 +1878,6 @@ is in look-ahead.  We report when we get conflicts between the two.
 		return cnt;
 	}
 
-
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
@@ -1885,13 +1894,14 @@ pieces of code provided in the grammar file, so they are generated first.
 
 ###### parser_generate
 
-	static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
+	static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
+		               struct code_node *pre_reduce)
 	{
 		gen_known(f, g);
 		gen_non_term(f, g);
 		gen_goto(f, g);
 		gen_states(f, g);
-		gen_reduce(f, g, file);
+		gen_reduce(f, g, file, pre_reduce);
 		gen_free(f, g);
 
 		fprintf(f, "#line 0 \"gen_parser\"\n");
@@ -1912,7 +1922,9 @@ pieces of code provided in the grammar file, so they are generated first.
 ### Known words table
 
 The known words table is simply an array of terminal symbols.
-The table of nonterminals used for tracing is a similar array.
+The table of nonterminals used for tracing is a similar array.  We
+include virtual symbols in the table of nonterminals to keep the
+numbers right.
 
 ###### functions
 
@@ -1937,7 +1949,7 @@ The table of nonterminals used for tracing is a similar array.
 		for (i = TK_reserved;
 		     i < g->num_syms;
 		     i++)
-			if (g->symtab[i]->type == Nonterminal)
+			if (g->symtab[i]->type != Terminal)
 				fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
 					g->symtab[i]->name.txt);
 		fprintf(f, "};\n\n");
@@ -1967,11 +1979,11 @@ The go to table is stored in a simple array of `sym` and corresponding
 		short reduce_prod;
 		short reduce_size;
 		short reduce_sym;
-		short starts_line;
+		char starts_line;
+		char newline_only;
 		short min_prefix;
 	};
 
-
 ###### functions
 
 	static void gen_goto(FILE *f, struct grammar *g)
@@ -2017,13 +2029,15 @@ The go to table is stored in a simple array of `sym` and corresponding
 			}
 
 			if (prod >= 0)
-				fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n",
+				fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
 					i, is->go_to.cnt, i, prod,
 					g->productions[prod]->body_size,
 					g->productions[prod]->head->num,
-					is->starts_line, is->min_prefix);
+					is->starts_line,
+					g->productions[prod]->line_like,
+					is->min_prefix);
 			else
-				fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+				fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
 					i, is->go_to.cnt, i,
 					is->starts_line, is->min_prefix);
 		}
@@ -2116,24 +2130,31 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
 		fputs("\n", f);
 		for (i = 0; i < p->body_size; i++) {
 			if (p->body[i]->struct_name.txt &&
-			    p->body[i]->isref &&
-			    used[i])
+			    used[i]) {
 				// assume this has been copied out
-				fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+				if (p->body[i]->isref)
+					fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+				else
+					fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
+			}
 		}
 		free(used);
 	}
 
 ###### functions
 
-	static void gen_reduce(FILE *f, struct grammar *g, char *file)
+	static void gen_reduce(FILE *f, struct grammar *g, char *file,
+	                       struct code_node *code)
 	{
 		int i;
-		fprintf(f, "#line 0 \"gen_reduce\"\n");
+		fprintf(f, "#line 1 \"gen_reduce\"\n");
 		fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
 		fprintf(f, "{\n");
 		fprintf(f, "\tint ret_size = 0;\n");
+		if (code)
+			code_node_print(f, code, file);
 
+		fprintf(f, "#line 4 \"gen_reduce\"\n");
 		fprintf(f, "\tswitch(prod) {\n");
 		for (i = 0; i < g->production_count; i++) {
 			struct production *p = g->productions[i];
@@ -2330,6 +2351,7 @@ parser with neither. "grammar" must be provided.
 	struct code_node *hdr = NULL;
 	struct code_node *code = NULL;
 	struct code_node *gram = NULL;
+	struct code_node *pre_reduce = NULL;
 	for (s = table; s; s = s->next) {
 		struct text sec = s->section;
 		if (tag && !strip_tag(&sec, tag))
@@ -2340,6 +2362,8 @@ parser with neither. "grammar" must be provided.
 			code = s->code;
 		else if (text_is(sec, "grammar"))
 			gram = s->code;
+		else if (text_is(sec, "reduce"))
+			pre_reduce = s->code;
 		else {
 			fprintf(stderr, "Unknown content section: %.*s\n",
 			        s->section.len, s->section.txt);
@@ -2411,7 +2435,7 @@ file with the code section (if any) and the parser tables and function.
 		if (f) {
 			if (code)
 				code_node_print(f, code, infile);
-			gen_parser(f, g, infile, name);
+			gen_parser(f, g, infile, name, pre_reduce);
 			fclose(f);
 		} else {
 			fprintf(stderr, "Cannot create %s.c\n",
@@ -2709,9 +2733,29 @@ is still a work-in-progress.
 `TK_newline` tokens are ignored unless the top stack frame records
 that they are permitted.  In that case they will not be considered for
 shifting if it is possible to reduce some symbols that are all since
-the most recent start of line.  This is how a newline forcible
+the most recent start of line.  This is how a newline forcibly
 terminates any line-like structure - we try to reduce down to at most
 one symbol for each line where newlines are allowed.
+A consequence of this is that a rule like
+
+###### Example: newlines - broken
+
+	Newlines ->
+		| NEWLINE Newlines
+	IfStatement -> Newlines if ....
+
+cannot work, as the NEWLINE will never be shifted because the empty string
+will be reduced first.  Optional sets of newlines need to be included
+in the thing that they precede:
+
+###### Example: newlines - works
+
+	If -> if
+		| NEWLINE If
+	IfStatement -> If ....
+
+Here the NEWLINE will be shifted because nothing can be reduced until
+the `if` is seen.
 
 When, during error handling, we discard token read in, we want to keep
 discarding until we see one that is recognised.  If we had a full set
@@ -2825,7 +2869,13 @@ checks if a given token is in any of these look-ahead sets.
 				continue;
 			}
 		force_reduce:
-			if (states[tos->state].reduce_prod >= 0) {
+			if (states[tos->state].reduce_prod >= 0 &&
+			    states[tos->state].newline_only &&
+			    tk->num != TK_newline && tk->num != TK_eof && tk->num != TK_out) {
+				/* Anything other than newline is an error as this
+				 * production must end at EOL
+				 */
+			} else if (states[tos->state].reduce_prod >= 0) {
 				void **body;
 				void *res;
 				const struct state *nextstate = &states[tos->state];
@@ -2888,11 +2938,7 @@ checks if a given token is in any of these look-ahead sets.
 					// FIXME update since_indent here
 				}
 			}
-			if (p.tos == 0 && tk->num == TK_eof)
-				break;
-			tos = &p.stack[p.tos-1];
 			tos->indents += indents;
-			exit(1);
 		}
 		free(tk);
 		pop(&p, p.tos, NULL, do_free);
@@ -3004,6 +3050,9 @@ an error.
 		./parsergen --tag calc -o calc parsergen.mdc
 	calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
 		$(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+	calctest : calc
+		./calc parsergen.mdc
+	demos :: calctest
 
 # calc: header
 
@@ -3024,6 +3073,7 @@ an error.
 	#include <stdio.h>
 	#include <malloc.h>
 	#include <gmp.h>
+	#include <string.h>
 	#include "mdcode.h"
 	#include "scanner.h"
 	#include "number.h"
@@ -3037,12 +3087,19 @@ an error.
 		free(n);
 	}
 
+	static int text_is(struct text t, char *s)	/* non-zero iff text `t` equals the C string `s` */
+	{
+		return (strlen(s) == t.len &&		/* lengths must match, so a mere prefix is rejected */
+			strncmp(s, t.txt, t.len) == 0);	/* then the first t.len bytes must be equal */
+	}
+
 	int main(int argc, char *argv[])
 	{
 		int fd = open(argv[1], O_RDONLY);
 		int len = lseek(fd, 0, 2);
 		char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
-		struct section *s = code_extract(file, file+len, NULL);
+		struct section *table = code_extract(file, file+len, NULL);
+		struct section *s;
 		struct token_config config = {
 			.ignored = (1 << TK_line_comment)
 			         | (1 << TK_block_comment)
@@ -3052,20 +3109,22 @@ an error.
 			.word_start = "",
 			.word_cont = "",
 		};
-		parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
-		while (s) {
-			struct section *t = s->next;
-			code_free(s->code);
-			free(s);
-			s = t;
+		for (s = table; s; s = s->next)
+			if (text_is(s->section, "example: input"))
+				parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+		while (table) {
+			struct section *t = table->next;
+			code_free(table->code);
+			free(table);
+			table = t;
 		}
 		exit(0);
 	}
 
 # calc: grammar
 
-	$LEFT * /
 	$LEFT + -
+	$LEFT * /
 
 	Session -> Session Line
 		| Line
@@ -3095,3 +3154,14 @@ an error.
 		| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
 		| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
 		| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+
+# example: input
+
+	355/113
+	3.1415926535 - 355/113
+	2 + 4 * 5
+	1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
+	10 * 9 / 2
+	1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+
+	error