From c1e21eb4619c6e86bfebf324eaf6becbc25450bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 2 Oct 2014 20:42:04 +1000
Subject: [PATCH] parsergen: various updates.

- add starts_line flag for symbols
- add starts_newline flag for stack frames

and related changes

Signed-off-by: NeilBrown <neil@brown.name>
---
 csrc/parsergen.mdc | 83 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 64 insertions(+), 19 deletions(-)

diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index 9306592..2aa296d 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -869,7 +869,7 @@ changes happen.
 		}
 	}
 
-### Setting `can_eol`
+### Setting `can_eol` and `starts_line`
 
 In order to be able to ignore newline tokens when not relevant, but
 still include them in the parse when needed, we will need to know
@@ -891,6 +891,7 @@ compute it in a repetitive manner similar to `set_nullable`.
 
 ###### symbol fields
 	int can_eol;
+	int starts_line;
 
 ###### functions
 	static void set_can_eol(struct grammar *g)
@@ -920,6 +921,19 @@ compute it in a repetitive manner similar to `set_nullable`.
 		}
 	}
 
+	static void set_starts_line(struct grammar *g)
+	{
+		int p;
+		for (p = 0; p < g->production_count; p++) {
+			struct production *pr = g->productions[p];
+			int s;
+
+			for (s = 0; s < pr->body_size - 1; s++)
+				if (pr->body[s]->can_eol)
+					pr->body[s+1]->starts_line = 1;
+		}
+	}
+
 ### Building the `first` sets
 
 When calculating what can follow a particular non-terminal, we will need to
@@ -1198,7 +1212,7 @@ recalculated and their LA sets updated.
 them to a data structure, of freeing them.
 
 	static int add_itemset(struct grammar *g, struct symset ss,
-	                       enum grammar_type type, int starts_line)
+	                       enum grammar_type type)
 	{
 		struct itemset **where, *is;
 		int i;
@@ -1210,7 +1224,6 @@ them to a data structure, of freeing them.
 			is->items = ss;
 			is->next = *where;
 			is->go_to = INIT_DATASET;
-			is->starts_line = starts_line;
 			*where = is;
 			return is->state;
 		}
@@ -1252,6 +1265,8 @@ be supplemented by the LA set for the item which produce the new item.
 
 We also collect a set of all symbols which follow "DOT" (in `done`) as this
 is used in the next stage.
+If any of these symbols are flagged as starting a line, then this
+state must be a `starts_line` state so now is a good time to record that.
 
 NOTE: precedence handling should happen here - I haven't written this yet
 though.
@@ -1269,8 +1284,11 @@ though.
 		if (bs == pr->body_size)
 			continue;
 		s = pr->body[bs];
-		if (symset_find(&done, s->num) < 0)
+		if (symset_find(&done, s->num) < 0) {
 			symset_add(&done, s->num, 0);
+			if (s->starts_line)
+				is->starts_line = 1;
+		}
 		if (s->type != Nonterminal)
 			continue;
 		again = 1;
@@ -1324,15 +1342,11 @@ with a pre-existing itemset).
 	for (i = 0; i < done.cnt; i++) {
 		int j;
 		unsigned short state;
-		int starts_line = 0;
 		struct symbol *sym = g->symtab[done.syms[i]];
 		struct symset newitemset = INIT_SYMSET;
 		if (type >= LALR)
 			newitemset = INIT_DATASET;
 
-		if (sym->can_eol ||
-		    (sym->nullable && is->starts_line))
-			starts_line = 1;
 		for (j = 0; j < is->items.cnt; j++) {
 			int itm = is->items.syms[j];
 			int p = item_prod(itm);
@@ -1365,7 +1379,7 @@ with a pre-existing itemset).
 				}
 			}
 		}
-		state = add_itemset(g, newitemset, type, starts_line);
+		state = add_itemset(g, newitemset, type);
 		if (symset_find(&is->go_to, done.syms[i]) < 0)
 			symset_add(&is->go_to, done.syms[i], state);
 	}
@@ -1389,7 +1403,7 @@ with `TK_eof` as the LA set.
 		}
 		// production 0, offset 0 (with no data)
 		symset_add(&first, item_num(0, 0), la);
-		add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
+		add_itemset(g, first, type);
 		for (again = 0, is = g->items;
 		     is;
 		     is = is->next ?: again ? (again = 0, g->items) : NULL) {
@@ -1450,6 +1464,7 @@ changeover point in `first_nonterm`.
 
 		set_nullable(g);
 		set_can_eol(g);
+		set_starts_line(g);
 		if (type >= SLR)
 			build_first(g);
 
@@ -1481,7 +1496,8 @@ all the tables that have been generated, plus a description of any conflicts.
 
 Firstly we have the complete list of symbols, together with the
 "FIRST" set if that was generated.  We add a mark to each symbol to
-show if it can end in a newline (`>`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it implies the start of a
+line (`<`), or if it is nullable (`.`).
 
 ###### functions
 
@@ -1498,9 +1514,10 @@ show if it can end in a newline (`>`), or if it is nullable (`.`).
 			if (!s)
 				continue;
 
-			printf(" %c%c%3d%c: ",
+			printf(" %c%c%c%3d%c: ",
 			       s->nullable ? '.':' ',
 			       s->can_eol ? '>':' ',
+			       s->starts_line ? '<':' ',
 			       s->num, symtypes[s->type]);
 			prtxt(s->name);
 			if (s->precedence)
@@ -2415,10 +2432,14 @@ The `indents` count and the `starts_indented` flag track the line
 indents in the symbol.  These are used to allow indent information to
 guide parsing and error recovery.
 
+`since_newline` tracks how many stack frames since the last
+start-of-line (whether indented or not).  So if `since_newline` is
+zero, then this symbol is at the start of a line.
+
 `newline_permitted` keeps track of whether newlines should be ignored
 or not, and `starts_line` records if this state stated on a newline.
 
-The stack is more properly seen as alternating states and symbols -
+The stack is most properly seen as alternating states and symbols -
 states, like the 'DOT' in items, are between symbols.  Each frame in
 our stack holds a state and the symbol that was before it.  The
 bottom of stack holds the start state, but no symbol, as nothing came
@@ -2434,6 +2455,7 @@ before the beginning.
 			short sym;
 			short starts_indented;
 			short indents;
+			short starts_newline;
 		} *stack;
 		void **asn_stack;
 		int stack_size;
@@ -2462,6 +2484,15 @@ stack is empty, it always chooses zero as the next state.
 So `shift` finds the next state.  If that succeed it extends the allocations
 if needed and pushes all the information onto the stacks.
 
+Newlines are permitted after a starts_line state until an internal
+indent.  So we need to find the topmost state which `starts_line` and
+see if there are any indents other than immediately after it.
+
+So we walk down:
+
+-  if state starts_line, then newlines_permitted.
+-  if any non-initial indents, newlines not permitted
+
 ###### parser functions
 
 	static int shift(struct parser *p, struct frame *next,
@@ -2486,8 +2517,10 @@ if needed and pushes all the information onto the stacks.
 		next->newline_permitted = 0;
 		if (p->tos)
 			next->newline_permitted =
-				p->stack[p->tos-1].newline_permitted;
-		if (next->indents)
+				(p->stack[p->tos-1].newline_permitted?:-1)+1;
+		if (next->indents > next->starts_indented)
+			next->newline_permitted = 0;
+		if (next->indents && next->newline_permitted > 2)
 			next->newline_permitted = 0;
 		if (states[newstate].starts_line)
 			next->newline_permitted = 1;
@@ -2512,7 +2545,10 @@ removed. It is called _after_ we reduce a production, just before we
 	{
 		int i;
 		p->tos -= num;
-		next->starts_indented = p->stack[p->tos].starts_indented;
+		next->starts_indented =
+			p->stack[p->tos].starts_indented;
+		next->starts_newline =
+			p->stack[p->tos].starts_newline;
 		next->indents = 0;
 		for (i = 0; i < num; i++) {
 			next->indents += p->stack[p->tos+i].indents;
@@ -2594,6 +2630,7 @@ since the last state which could have been at the start of a line.
 		int accepted = 0;
 		void *ret = NULL;
 
+		next.starts_newline = 1;
 		shift(&p, &next, NULL, states);
 		while (!accepted) {
 			struct token *err_tk;
@@ -2606,6 +2643,7 @@ since the last state which could have been at the start of a line.
 			if (next.sym == TK_in) {
 				next.starts_indented = 1;
 				next.indents = 1;
+				next.starts_newline = 1;
 				free(tk);
 				tk = NULL;
 				parser_trace_action(trace, "Record");
@@ -2621,7 +2659,7 @@ since the last state which could have been at the start of a line.
 						if (states[tos->state].starts_line)
 							tos->newline_permitted = 1;
 						else if (p.tos > 1)
-							tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
+							tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1;
 					}
 					free(tk);
 					tk = NULL;
@@ -2640,9 +2678,11 @@ since the last state which could have been at the start of a line.
 				}
 			}
 			if (shift(&p, &next, tk, states)) {
-				tk = NULL;
+				next.starts_newline =
+					tk->num == TK_newline;
 				next.starts_indented = 0;
 				next.indents = 0;
+				tk = NULL;
 				parser_trace_action(trace, "Shift");
 				continue;
 			}
@@ -2666,6 +2706,7 @@ since the last state which could have been at the start of a line.
 				else {
 					frame.indents = next.indents;
 					frame.starts_indented = frame.indents;
+					frame.starts_newline = 0;
 					next.indents = 0;
 					next.starts_indented = 0;
 				}
@@ -2774,7 +2815,7 @@ end inside square brackets.
 		if (states[f->state].starts_line)
 			fprintf(trace, "s");
 		if (f->newline_permitted)
-			fprintf(trace, "n");
+			fprintf(trace, "n%d", f->newline_permitted);
 		fprintf(trace, ") ");
 	}
 
@@ -2801,6 +2842,8 @@ end inside square brackets.
 				if (f->indents)
 					fprintf(trace, "%c%d", f->starts_indented?':':'.',
 						f->indents);
+				if (f->starts_newline)
+					fputs("/", trace);
 				fputs(" ", trace);
 			}
 			parser_trace_state(trace, f, states);
@@ -2814,6 +2857,8 @@ end inside square brackets.
 		if (n->indents)
 			fprintf(trace, "%c%d", n->starts_indented?':':'.',
 				n->indents);
+		if (n->starts_newline)
+			fputs("/", trace);
 		fputs("]", trace);
 	}
 
-- 
2.43.0