From: NeilBrown <neilb@suse.de>
Date: Wed, 5 Jun 2013 20:20:35 +0000 (+1000)
Subject: mdcode, md2c - extract C code from a literate markdown program
X-Git-Tag: draftparser~21
X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=commitdiff_plain;h=bf502feaba385e5e8ebd5f80c005522c59660496

mdcode, md2c - extract C code from a literate markdown program

All C programs here will be written in literate style using markdown.
md2c strips out the C code so that it can be compiled.

Signed-off-by: NeilBrown <neilb@suse.de>
---

bf502feaba385e5e8ebd5f80c005522c59660496
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5761abc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.o
diff --git a/csrc/.gitignore b/csrc/.gitignore
new file mode 100644
index 0000000..3cfcc75
--- /dev/null
+++ b/csrc/.gitignore
@@ -0,0 +1,5 @@
+*.c
+*.h
+*.mk
+md2c
+boot-strap/md2c
diff --git a/csrc/Makefile b/csrc/Makefile
new file mode 100644
index 0000000..d1beada
--- /dev/null
+++ b/csrc/Makefile
@@ -0,0 +1,11 @@
+
+ifneq "$(wildcard *.mk)" ""
+include *.mk
+endif
+
+bootstrap:
+	touch boot-strap/* # make sure mdcode.mdc isn't newer
+	$(MAKE) -C boot-strap -f mdcode.mk VPATH=..
+	cp boot-strap/md2c .
+	./md2c mdcode.mdc
+
diff --git a/csrc/boot-strap/libmdcode.c b/csrc/boot-strap/libmdcode.c
new file mode 100644
index 0000000..78a4262
--- /dev/null
+++ b/csrc/boot-strap/libmdcode.c
@@ -0,0 +1,384 @@
+#line 100 "../mdcode.mdc"
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "mdcode.h"
+#line 461 "../mdcode.mdc"
+#include  <ctype.h>
+#include  <string.h>
+
+#line 179 "../mdcode.mdc"
+struct psection {
+	struct section;
+	struct code_node *last;
+	int refcnt;
+	int indent;
+};
+
+#line 216 "../mdcode.mdc"
+static void code_linearize(struct code_node *code)
+{
+	struct code_node *t;
+	for (t = code; t; t = t->next)
+		t->indent = 0;
+	for (; code; code = code->next)
+		if (code->child) {
+			struct code_node *next = code->next;
+			struct psection *pchild =
+				(struct	psection *)code->child;
+			int indent = pchild->indent;
+			code->next = code->child->code;
+			code->child->code = NULL;
+			code->child = NULL;
+			for (t = code; t->next; t = t->next)
+				t->next->indent = code->indent + indent;
+			t->next = next;
+		}
+}
+
+#line 239 "../mdcode.mdc"
+void code_free(struct code_node *code)
+{
+	while (code) {
+		struct code_node *this;
+		if (code->child)
+			code_linearize(code);
+		this = code;
+		code = code->next;
+		free(this);
+	}
+}
+
+#line 268 "../mdcode.mdc"
+static void code_add_text(struct psection *where, struct text txt,
+			  int line_no)
+{
+	struct code_node *n;
+	if (txt.len == 0)
+		return;
+	n = malloc(sizeof(*n));
+	n->code = txt;
+	n->indent = 0;
+	n->line_no = line_no;
+	n->next = NULL;
+	n->child = NULL;
+	if (where->last)
+		where->last->next = n;
+	else
+		where->code = n;
+	where->last = n;
+}
+
+#line 290 "../mdcode.mdc"
+void code_add_link(struct psection *where, struct psection *to,
+		   int indent)
+{
+	struct code_node *n;
+
+	to->indent = indent;
+	to->refcnt++;	// this will be checked elsewhere
+	if (where->last && where->last->child == NULL) {
+		where->last->child = to;
+		return;
+	}
+	n = malloc(sizeof(*n));
+	n->code.len = 0;
+	n->indent = 0;
+	n->line_no = 0;
+	n->next = NULL;
+	n->child = to;
+	if (where->last)
+		where->last->next = n;
+	else
+		where->code = n;
+	where->last = n;
+}
+
+#line 329 "../mdcode.mdc"
+static int text_cmp(struct text a, struct text b)
+{
+	if (a.len != b.len)
+		return a.len - b.len;
+	return strncmp(a.txt, b.txt, a.len);
+}
+
+static struct psection *section_find(struct psection **list, struct text name)
+{
+	struct psection *new;
+	while (*list) {
+		int cmp = text_cmp((*list)->section, name);
+		if (cmp == 0)
+			return *list;
+		if (cmp > 0)
+			break;
+		list = (struct psection **)&((*list)->next);
+	}
+	/* Add this section */
+	new = malloc(sizeof(*new));
+	new->next = *list;
+	*list = new;
+	new->section = name;
+	new->code = NULL;
+	new->last = NULL;
+	new->refcnt = 0;
+	new->indent = 0;
+	return new;
+}
+
+#line 410 "../mdcode.mdc"
+static char *skip_lws(char *pos, char *end)
+{
+	while (pos < end && (*pos == ' ' || *pos == '\t'))
+		pos++;
+	return pos;
+}
+
+static char *skip_line(char *pos, char *end)
+{
+	while (pos < end && *pos != '\n')
+		pos++;
+	if (pos < end)
+		pos++;
+	return pos;
+}
+
+static char *skip_para(char *pos, char *end, int *line_no)
+{
+	/* Might return a pointer to a blank line, as only
+	 * one trailing blank line is skipped
+	 */
+	if (*pos == '#') {
+		pos = skip_line(pos, end);
+		(*line_no) += 1;
+		return pos;
+	}
+	while (pos < end &&
+	       *pos != '#' &&
+	       *(pos = skip_lws(pos, end)) != '\n') {
+		pos = skip_line(pos, end);
+		(*line_no) += 1;
+	}
+	if (pos < end && *pos == '\n') {
+		pos++;
+		(*line_no) += 1;
+	}
+	return pos;
+}
+
+#line 466 "../mdcode.mdc"
+static struct text take_header(char *pos, char *end)
+{
+	struct text section;
+
+	while (pos < end && *pos == '#')
+		pos++;
+	while (pos < end && *pos == ' ')
+		pos++;
+	section.txt = pos;
+	while (pos < end && *pos != '\n')
+		pos++;
+	while (pos > section.txt &&
+	       (pos[-1] == '#' || pos[-1] == ' '))
+		pos--;
+	section.len = pos - section.txt;
+	return section;
+}
+
+static int is_list(char *pos, char *end)
+{
+	if (strchr("-*+", *pos))
+		return 1;
+	if (isdigit(*pos)) {
+		while (pos < end && isdigit(*pos))
+			pos += 1;
+		if  (pos < end && *pos == '.')
+			return 1;
+	}
+	return 0;
+}
+
+static int matches(char *start, char *pos, char *end)
+{
+	if (start == NULL)
+		return matches("\t", pos, end) ||
+		       matches("    ", pos, end);
+	return (pos + strlen(start) < end &&
+		strncmp(pos, start, strlen(start)) == 0);
+}
+
+#line 538 "../mdcode.mdc"
+static int count_space(char *sol, char *p)
+{
+	int c = 0;
+	while (sol < p) {
+		if (sol[0] == ' ')
+			c++;
+		if (sol[0] == '\t')
+			c+= 8;
+		sol++;
+	}
+	return c;
+}
+
+
+static char *take_code(char *pos, char *end, char *marker,
+		       struct psection **table, struct text section,
+		       int *line_nop)
+{
+	char *start = pos;
+	int line_no = *line_nop;
+	int start_line = line_no;
+	struct psection *sect;
+
+	sect = section_find(table, section);
+
+	while (pos < end) {
+		char *sol, *t;
+		struct text ref;
+
+		if (marker && matches(marker, pos, end))
+			break;
+		if (!marker &&
+		    (skip_lws(pos, end))[0] != '\n' &&
+		    !matches(NULL, pos, end))
+			/* Paragraph not indented */
+			break;
+
+		/* Still in code - check for reference */
+		sol = pos;
+		if (!marker) {
+			if (*sol == '\t')
+				sol++;
+			else if (strcmp(sol, "    ") == 0)
+				sol += 4;
+		}
+		t = skip_lws(sol, end);
+		if (t[0] != '#' || t[1] != '#') {
+			/* Just regular code here */
+			pos = skip_line(sol, end);
+			line_no++;
+			continue;
+		}
+
+		if (pos > start) {
+			struct text txt;
+			txt.txt = start;
+			txt.len = pos - start;
+			code_add_text(sect, txt, start_line);
+		}
+		ref = take_header(t, end);
+		if (ref.len) {
+			struct psection *refsec = section_find(table, ref);
+			code_add_link(sect, refsec, count_space(sol, t));
+		}
+		pos = skip_line(t, end);
+		line_no++;
+		start = pos;
+		start_line = line_no;
+	}
+	if (pos > start) {
+		struct text txt;
+		txt.txt = start;
+		txt.len = pos - start;
+		code_add_text(sect, txt, start_line);
+	}
+	if (marker) {
+		pos = skip_line(pos, end);
+		line_no++;
+	}
+	*line_nop = line_no;
+	return pos;
+}
+
+#line 630 "../mdcode.mdc"
+static struct psection *code_find(char *pos, char *end)
+{
+	struct psection *table = NULL;
+	int in_list = 0;
+	int line_no = 1;
+	struct text section = {0};
+
+	while (pos < end) {
+		if (pos[0] == '#') {
+			section = take_header(pos, end);
+			in_list = 0;
+			pos = skip_line(pos, end);
+			line_no++;
+		} else if (is_list(pos, end)) {
+			in_list = 1;
+			pos = skip_para(pos, end, &line_no);
+		} else if (!in_list && matches(NULL, pos, end)) {
+			pos = take_code(pos, end, NULL, &table,
+					section, &line_no);
+		} else if (matches("```", pos, end)) {
+			in_list = 0;
+			pos = skip_line(pos, end);
+			line_no++;
+			pos = take_code(pos, end, "```", &table,
+					section, &line_no);
+		} else if (matches("~~~", pos, end)) {
+			in_list = 0;
+			pos = skip_line(pos, end);
+			line_no++;
+			pos = take_code(pos, end, "~~~", &table,
+					section, &line_no);
+		} else {
+			if (!isspace(*pos))
+				in_list = 0;
+			pos = skip_para(pos, end, &line_no);
+		}
+	}
+	return table;
+}
+
+#line 690 "../mdcode.mdc"
+struct section *code_extract(char *pos, char *end, code_err_fn error)
+{
+	struct psection *table;
+	struct section *result = NULL;
+	struct section *tofree = NULL;
+
+	table = code_find(pos, end);
+
+	while (table) {
+		struct psection *t = (struct psection*)table->next;
+		if (table->last == NULL) {
+			char *msg;
+			asprintf(&msg,
+				"Section \"%.*s\" is referenced but not declared",
+				 table->section.len, table->section.txt);
+			error(msg);
+			free(msg);
+		}
+		if (table->refcnt == 0) {
+			/* Root-section,  return it */
+			table->next = result;
+			result = table;
+			code_linearize(result->code);
+		} else {
+			table->next = tofree;
+			tofree = table;
+			if (table->refcnt > 1) {
+				char *msg;
+				asprintf(&msg,
+					 "Section \"%.*s\" referenced multiple times (%d).",
+					 table->section.len, table->section.txt,
+					 table->refcnt);
+				error(msg);
+				free(msg);
+			}
+		}
+		table = t;
+	}
+	while (tofree) {
+		struct section *t = tofree->next;
+		free(tofree);
+		tofree = t;
+	}
+	return result;
+}
+
+#line 109 "../mdcode.mdc"
+
diff --git a/csrc/boot-strap/md2c.c b/csrc/boot-strap/md2c.c
new file mode 100644
index 0000000..2b18737
--- /dev/null
+++ b/csrc/boot-strap/md2c.c
@@ -0,0 +1,133 @@
+#line 119 "../mdcode.mdc"
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "mdcode.h"
+
+#line 849 "../mdcode.mdc"
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <stdio.h>
+
+#line 771 "../mdcode.mdc"
+static void code_print(FILE *out, struct code_node *node,
+		       char *fname)
+{
+	for (; node; node = node->next) {
+		char *c = node->code.txt;
+		int len = node->code.len;
+		int undent = 0;
+
+		if (!len)
+			continue;
+
+		fprintf(out, "#line %d \"%s\"\n",
+			node->line_no, fname);
+		if (*c == ' ' || *c == '\t')
+			undent = 1;
+		while (len && *c) {
+			fprintf(out, "%*s", node->indent, "");
+			if (undent) {
+				if (*c == '\t' && len > 1) {
+					c++;
+					len--;
+				} else if (strncmp(c, "    ", 4) == 0 && len > 4) {
+					c += 4;
+					len-= 4;
+				}
+			}
+			do {
+				fputc(*c, out);
+				c++;
+				len--;
+			} while (len && c[-1] != '\n');
+		}
+	}
+}
+
+#line 821 "../mdcode.mdc"
+static void copy_fname(char *name, int space, struct text t)
+{
+	char *sec = t.txt;
+	int len = t.len;
+	name[0] = 0;
+	if (len < 5 || strncmp(sec, "File:", 5) != 0)
+		return;
+	sec += 5;
+	len -= 5;
+	while (len && sec[0] == ' ') {
+		sec++;
+		len--;
+	}
+	if (len >= space)
+		len = space - 1;
+	strncpy(name, sec, len);
+	name[len] = 0;
+}
+
+#line 857 "../mdcode.mdc"
+static int errs;
+static void pr_err(char *msg)
+{
+	errs++;
+	fprintf(stderr, "%s\n", msg);
+}
+
+int main(int argc, char *argv[])
+{
+	int fd;
+	size_t len;
+	char *file;
+	struct section *table, *s, *prev;
+
+	errs = 0;
+	if (argc != 2) {
+		fprintf(stderr, "Usage: mdcode file.mdc\n");
+		exit(2);
+	}
+	fd = open(argv[1], O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "mdcode: cannot open %s: %s\n",
+			argv[1], strerror(errno));
+		exit(1);
+	}
+	len = lseek(fd, 0, 2);
+	file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+	table = code_extract(file, file+len, pr_err);
+
+	for (s = table; s;
+		(code_free(s->code), prev = s, s = s->next, free(prev))) {
+		FILE *fl;
+		char fname[1024];
+		if (strncmp(s->section.txt, "Example:", 8) == 0)
+			continue;
+		if (strncmp(s->section.txt, "File:", 5) != 0) {
+			fprintf(stderr, "Unreferenced section is not a file name: %.*s\n",
+				s->section.len, s->section.txt);
+			errs++;
+			continue;
+		}
+		copy_fname(fname, sizeof(fname), s->section);
+		if (fname[0] == 0) {
+			fprintf(stderr, "Missing file name at:%.*s\n",
+				s->section.len, s->section.txt);
+			errs++;
+			continue;
+		}
+		fl = fopen(fname, "w");
+		if (!fl) {
+			fprintf(stderr, "Cannot create %s: %s\n",
+				fname, strerror(errno));
+			errs++;
+			continue;
+		}
+		code_print(fl, s->code, argv[1]);
+		fclose(fl);
+	}
+	exit(!!errs);
+}
+
+#line 126 "../mdcode.mdc"
+
diff --git a/csrc/boot-strap/mdcode.h b/csrc/boot-strap/mdcode.h
new file mode 100644
index 0000000..6a17e44
--- /dev/null
+++ b/csrc/boot-strap/mdcode.h
@@ -0,0 +1,32 @@
+#line 158 "../mdcode.mdc"
+struct text {
+	char *txt;
+	int len;
+};
+
+struct section {
+	struct text section;
+	struct code_node *code;
+	struct section *next;
+};
+
+struct code_node {
+	struct text code;
+	int indent;
+	int line_no;
+	struct code_node *next;
+	struct section *child;
+};
+
+#line 687 "../mdcode.mdc"
+typedef void (*code_err_fn)(char *msg);
+
+#line 253 "../mdcode.mdc"
+void code_free(struct code_node *code);
+
+#line 738 "../mdcode.mdc"
+struct section *code_extract(char *pos, char *end, code_err_fn error);
+
+
+#line 98 "../mdcode.mdc"
+
diff --git a/csrc/boot-strap/mdcode.mk b/csrc/boot-strap/mdcode.mk
new file mode 100644
index 0000000..22a35a1
--- /dev/null
+++ b/csrc/boot-strap/mdcode.mk
@@ -0,0 +1,23 @@
+#line 88 "../mdcode.mdc"
+CFLAGS += -Wall -g
+all::
+mdcode.h libmdcode.c md2c.c mdcode.mk :  mdcode.mdc
+	./md2c mdcode.mdc
+
+
+#line 112 "../mdcode.mdc"
+all :: libmdcode.o
+libmdcode.o : libmdcode.c mdcode.h
+	$(CC) $(CFLAGS) -c libmdcode.c
+
+
+#line 129 "../mdcode.mdc"
+all :: md2c
+md2c : md2c.o libmdcode.o
+	$(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o
+md2c.o : md2c.c mdcode.h
+	$(CC) $(CFLAGS) -c md2c.c
+
+#line 192 "../mdcode.mdc"
+CFLAGS += -fplan9-extensions
+
diff --git a/csrc/mdcode.mdc b/csrc/mdcode.mdc
new file mode 100644
index 0000000..dc78a12
--- /dev/null
+++ b/csrc/mdcode.mdc
@@ -0,0 +1,917 @@
+# mdcode: extract C code from a _markdown_ file.
+
+_markdown_ is a popular format for simple text markup which can easily
+be converted to HTML.  As it allows easy indication of sections of
+code, it is quite suitable for use in literate programming.  This file
+is an example of that usage.
+
+The code included below provides two related functionalities.
+Firstly it provides a library routine for extracting code out of a
+_markdown_ file, so that other routines might make use of it.
+
+Secondly it provides a simple client of this routine which extracts
+1 or more C-language files from a markdown document so they can be
+passed to a C compiler.  These two combined to make a tool that is needed
+to compile this tool.  Yes, this is circular.  A prototype tool was
+used for the first extraction.
+
+The tool provided is described as specific to the C language as it
+generates
+
+##### Example: a _line_ command
+
+	#line __line-number__ __file-name__
+
+lines so that the C compiler will report where in the markdown file
+any error is found.  This tool is suitable for any other language
+which allows the same directive, or will treat it as a comment.
+
+## Literate Details
+
+Literate programming is more than just including comments with the
+code, even nicely formatted comments.  It also involves presenting the
+code in an order that makes sense to a human, rather than an order
+that makes sense to a compiler.  For this reason a core part of any
+literate programming tool is the ability to re-arrange the code found
+in the document into a different order in the final code file - or
+files.  This requires some form of linkage to be encoded.
+
+The approach taken here is focused around section headings - of any
+depth.
+
+All the code in any section is treated as a single sequential
+collection of code, and is named by the section that it is in.  If
+multiple sections have the same name, then the code blocks in all of
+them are joined together in the order they appear in the document.
+
+A code section can contain a special marker which starts with 2
+hashes: __##__.
+The text after the marker must be the name of some section which
+contains code.  Code from that section will be interpolated in place
+of the marker, and will be indented to match the indent of the marker.
+
+It is not permitted for the same code to be interpolated multiple
+times.  Allowing this might make some sense, but it is probably a
+mistake, and prohibiting it make some of the code a bit cleaner.
+
+Equally, every section of code should be interpolated at least once -
+with two exceptions.  These exceptions are imposed by the tool, not
+the library.  A different client could impose different rules on the
+names of top-level code sections.
+
+The first exception we have already seen.  A section name starting
+__Example:__ indicates code that is not to be included in the final product.
+
+The second exception is for the top level code sections which will be
+written to files.  Again these are identified by their section name.
+This must start with __File:__  the following text (after optional
+spaces) will be used as a file name.
+
+Any section containing code that does not start __Example:__ or
+__File:__ must be included in some other section exactly once.
+
+### Multiple files
+
+Allowing multiple top level code sections which name different files
+means that one _markdown_ document can describe several files.  This
+is very useful with the C language where a program file and a header
+file might be related.  For the present document we will have a header
+file and two code files, one with the library content and one for the
+tool.
+
+It will also be very convenient to create a `makefile` fragment to
+ensure the code is compiled correctly.  A simple `make -f mdcode.mk`
+will "do the right thing".
+
+### File: mdcode.mk
+
+	CFLAGS += -Wall -g
+	all::
+	mdcode.h libmdcode.c md2c.c mdcode.mk :  mdcode.mdc
+		./md2c mdcode.mdc
+
+
+### File: mdcode.h
+
+	## exported types
+	## exported functions
+
+### File: libmdcode.c
+	#define _GNU_SOURCE
+	#include <unistd.h>
+	#include <stdlib.h>
+	#include <stdio.h>
+
+	#include "mdcode.h"
+	## internal includes
+	## private types
+	## internal functions
+
+### File: mdcode.mk
+
+	all :: libmdcode.o
+	libmdcode.o : libmdcode.c mdcode.h
+		$(CC) $(CFLAGS) -c libmdcode.c
+
+
+### File: md2c.c
+
+	#include <unistd.h>
+	#include <stdlib.h>
+
+	#include "mdcode.h"
+
+	## client includes
+	## client functions
+
+### File: mdcode.mk
+
+	all :: md2c
+	md2c : md2c.o libmdcode.o
+		$(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o
+	md2c.o : md2c.c mdcode.h
+		$(CC) $(CFLAGS) -c md2c.c
+
+## Data Structures
+
+As the core purpose of _mdcode_ is to discover and re-arrange blocks
+of text, it makes sense to map the whole document file into memory and
+produce a data structure which lists various parts of the file in the
+appropriate order.  Each node in this structure will have some text
+from the document, a child pointer, and a next pointer, any of which
+might not be present.  The text is most easily stored as a pointer and a
+length.  We'll call this a `text`
+
+A list of these `code_nodes` will belong to each section and it will
+be useful to have a separate `section` data structure to store the
+list of `code_nodes`, the section name, and some other information.
+
+This other information will include a reference counter so we can
+ensure proper referencing, and an `indent` depth.  As referenced
+content can have an extra indent added, we need to know what that is.
+The `code_node` will also have an `indent` depth which eventually gets
+set to the sum for the indents from all references on the path from
+the root.
+
+##### exported types
+
+	struct text {
+		char *txt;
+		int len;
+	};
+
+	struct section {
+		struct text section;
+		struct code_node *code;
+		struct section *next;
+	};
+
+	struct code_node {
+		struct text code;
+		int indent;
+		int line_no;
+		struct code_node *next;
+		struct section *child;
+	};
+
+##### private types
+
+	struct psection {
+		struct section;
+		struct code_node *last;
+		int refcnt;
+		int indent;
+	};
+
+You will note that the `struct psection` contains an anonymous `struct
+section` embedded at the start.  To make this work right, GCC
+requires the `-fplan9-extensions` flag.
+
+##### File: mdcode.mk
+
+	CFLAGS += -fplan9-extensions
+
+### Manipulating the node
+
+Though a tree with `next` and `child` links is the easiest way to
+assemble the various code sections, it is not the easiest form for
+using them.  For that a simple list would be best.
+
+So once we have a fully linked File section we will want to linearize
+it, so that the `child` links become `NULL` and the `next` links will
+find everything required.  It is at this stage that the requirements
+that each section is linked only once becomes import.
+
+`code_linearize` will merge the `code_node`s from any child into the
+given `code_node`.  As it does this it sets the 'indent' field for
+each `code_node`.
+
+Note that we don't clear the section's `last` pointer, even though
+it no longer owns any code.  This allows subsequent code to see if a
+section ever had any code, and to report an error if a section is
+referenced but not defined.
+
+##### internal functions
+
+	static void code_linearize(struct code_node *code)
+	{
+		struct code_node *t;
+		for (t = code; t; t = t->next)
+			t->indent = 0;
+		for (; code; code = code->next)
+			if (code->child) {
+				struct code_node *next = code->next;
+				struct psection *pchild =
+					(struct	psection *)code->child;
+				int indent = pchild->indent;
+				code->next = code->child->code;
+				code->child->code = NULL;
+				code->child = NULL;
+				for (t = code; t->next; t = t->next)
+					t->next->indent = code->indent + indent;
+				t->next = next;
+			}
+	}
+
+Once a client has made use of a linearized code set, it will probably
+want to free it.
+
+	void code_free(struct code_node *code)
+	{
+		while (code) {
+			struct code_node *this;
+			if (code->child)
+				code_linearize(code);
+			this = code;
+			code = code->next;
+			free(this);
+		}
+	}
+
+##### exported functions
+
+	void code_free(struct code_node *code);
+
+### Building the tree
+
+As we parse the document there are two things we will want to do to
+node trees: add some text or add a reference.  We'll assume for now
+that the relevant section structures have been found, and will just
+deal with the `code_node`.
+
+Adding text simply means adding another node.  We will never have
+empty nodes, even if the last node only has a child, new text must go
+in a new node.
+
+##### internal functions
+
+	static void code_add_text(struct psection *where, struct text txt,
+				  int line_no)
+	{
+		struct code_node *n;
+		if (txt.len == 0)
+			return;
+		n = malloc(sizeof(*n));
+		n->code = txt;
+		n->indent = 0;
+		n->line_no = line_no;
+		n->next = NULL;
+		n->child = NULL;
+		if (where->last)
+			where->last->next = n;
+		else
+			where->code = n;
+		where->last = n;
+	}
+
+However when adding a link, we might be able to include it in the last
+`code_node` if it currently only has text.
+
+	void code_add_link(struct psection *where, struct psection *to,
+			   int indent)
+	{
+		struct code_node *n;
+
+		to->indent = indent;
+		to->refcnt++;	// this will be checked elsewhere
+		if (where->last && where->last->child == NULL) {
+			where->last->child = to;
+			return;
+		}
+		n = malloc(sizeof(*n));
+		n->code.len = 0;
+		n->indent = 0;
+		n->line_no = 0;
+		n->next = NULL;
+		n->child = to;
+		if (where->last)
+			where->last->next = n;
+		else
+			where->code = n;
+		where->last = n;
+	}
+
+### Finding sections
+
+Now we need a lookup table to be able to find sections by name.
+Something that provides an `n*log(N)` search time is probably
+justified, but for now I want a minimal stand-alone program so a
+linked list managed by insertion-sort will do.  As a comparison
+function it is easiest to sort based on length before content.  So
+sections won't be in standard lexical order, but that isn't important.
+
+If we cannot find a section, we simply want to create it.  This allows
+sections and references to be created in any order.  Sections with
+no references or no content will cause a warning eventually.
+
+#### internal functions
+
+	static int text_cmp(struct text a, struct text b)
+	{
+		if (a.len != b.len)
+			return a.len - b.len;
+		return strncmp(a.txt, b.txt, a.len);
+	}
+
+	static struct psection *section_find(struct psection **list, struct text name)
+	{
+		struct psection *new;
+		while (*list) {
+			int cmp = text_cmp((*list)->section, name);
+			if (cmp == 0)
+				return *list;
+			if (cmp > 0)
+				break;
+			list = (struct psection **)&((*list)->next);
+		}
+		/* Add this section */
+		new = malloc(sizeof(*new));
+		new->next = *list;
+		*list = new;
+		new->section = name;
+		new->code = NULL;
+		new->last = NULL;
+		new->refcnt = 0;
+		new->indent = 0;
+		return new;
+	}
+
+## Parsing the _markdown_
+
+Parsing markdown is fairly easy, though there are complications.
+
+The document is divided into "paragraphs" which are mostly separated by blank
+lines (which may contain white space).  The first few characters of
+the first line of a paragraph determine the type of paragraph.  For
+our purposes we are only interested in list paragraphs, code
+paragraphs, section headings, and everything else.  Section headings
+are single-line paragraphs and so do not require a preceding or
+following blank line.
+
+Section headings start with 1 or more hash characters (__#__).  List
+paragraphs start with hyphen, asterisk, plus, or digits followed by a
+period.  Code paragraphs aren't quite so easy.
+
+The "standard" code paragraph starts with 4 or more spaces, or a tab.
+However if the previous paragraph was a list paragraph, then those
+spaces indicate another  paragraph in the same list item, and 8 or
+more spaces are required.  Unless a nested list is in effect, in
+which case 12 or more are need.   Unfortunately not all _markdown_
+parsers agree on nested lists.
+
+Two alternate styles for marking code are in active use.  "Github" uses
+three backticks(_`` ``` ``_), while "pandoc" uses three or more tildes
+(_~~~_).  In these cases the code should not be indented.
+
+Trying to please everyone as much as possible, this parser will handle
+everything except for code inside lists.
+
+So an indented (4+) paragraph after a list paragraph is always a list
+paragraph, otherwise it is a code paragraph.  A paragraph that starts
+with three backticks or three tildes is code which continues until a
+matching string of backticks or tildes.
+
+### Skipping bits
+
+While walking the document looking for various markers we will *not*
+use the `struct text` introduced earlier as advancing that requires
+updating both start and length which feels clumsy.  Instead we will
+carry `pos` and `end` pointers, only the first of which needs to
+change.
+
+So to start, we need to skip various parts of the document.  `lws`
+stands for "Linear White Space" and is a term that comes from the
+Email RFCs (e.g. RFC822).  `line` and `para` are self explanatory.
+Note that `skip_para` needs to update the current line number.
+`skip_line` doesn't but every caller should.
+
+#### internal functions
+
+	static char *skip_lws(char *pos, char *end)
+	{
+		while (pos < end && (*pos == ' ' || *pos == '\t'))
+			pos++;
+		return pos;
+	}
+
+	static char *skip_line(char *pos, char *end)
+	{
+		while (pos < end && *pos != '\n')
+			pos++;
+		if (pos < end)
+			pos++;
+		return pos;
+	}
+
+	static char *skip_para(char *pos, char *end, int *line_no)
+	{
+		/* Might return a pointer to a blank line, as only
+		 * one trailing blank line is skipped
+		 */
+		if (*pos == '#') {
+			pos = skip_line(pos, end);
+			(*line_no) += 1;
+			return pos;
+		}
+		while (pos < end &&
+		       *pos != '#' &&
+		       *(pos = skip_lws(pos, end)) != '\n') {
+			pos = skip_line(pos, end);
+			(*line_no) += 1;
+		}
+		if (pos < end && *pos == '\n') {
+			pos++;
+			(*line_no) += 1;
+		}
+		return pos;
+	}
+
+### Recognising things
+
+Recognising a section header is trivial and doesn't require a
+function.  However we need to extract the content of a section header
+as a `struct text` for passing to `section_find`.
+Recognising the start of a new list is fairly easy.  Recognising the
+start (and end) of code is a little messy so we provide a function for
+matching the first few characters, which has a special case for "4
+spaces or tab".
+
+#### internal includes
+
+	#include  <ctype.h>
+	#include  <string.h>
+
+#### internal functions
+
+	static struct text take_header(char *pos, char *end)
+	{
+		struct text section;
+
+		while (pos < end && *pos == '#')
+			pos++;
+		while (pos < end && *pos == ' ')
+			pos++;
+		section.txt = pos;
+		while (pos < end && *pos != '\n')
+			pos++;
+		while (pos > section.txt &&
+		       (pos[-1] == '#' || pos[-1] == ' '))
+			pos--;
+		section.len = pos - section.txt;
+		return section;
+	}
+
+	static int is_list(char *pos, char *end)
+	{
+		if (strchr("-*+", *pos))
+			return 1;
+		if (isdigit(*pos)) {
+			while (pos < end && isdigit(*pos))
+				pos += 1;
+			if  (pos < end && *pos == '.')
+				return 1;
+		}
+		return 0;
+	}
+
+	static int matches(char *start, char *pos, char *end)
+	{
+		if (start == NULL)
+			return matches("\t", pos, end) ||
+			       matches("    ", pos, end);
+		return (pos + strlen(start) < end &&
+			strncmp(pos, start, strlen(start)) == 0);
+	}
+
+### Extracting the code
+
+Now that we can skip paragraphs and recognise what type each paragraph
+is, it is time to parse the file and extract the code.  We'll do this
+in two parts, first we look at what to do with some code once we
+find it, and then how to actually find it.
+
+When we have some code, we know where it is, what the end marker
+should look like, and which section it is in.
+
+There are two sorts of end markers: the presence of a particular
+string, or the absence of an indent.  We will use a string to
+represent a presence, and a `NULL` to represent the absence.
+
+While looking at code we don't think about paragraphs are all - just
+look for a line that starts with the right thing.
+Every line that is still code then needs to be examined to see if it
+is a section reference.
+
+When a section reference is found, all preceding code (if any) must be
+added to the current section, then the reference is added.
+
+When we do find the end of the code, all text that we have found but
+not processed needs to be saved too.
+
+When adding a reference we need to set the `indent`.  This is the
+number of spaces (counting 8 for tabs) after the natural indent of the
+code (which is a tab or 4 spaces).  We use a separate function `count_spaces`
+for that.
+
+#### internal functions
+
+	static int count_space(char *sol, char *p)
+	{
+		int c = 0;
+		while (sol < p) {
+			if (sol[0] == ' ')
+				c++;
+			if (sol[0] == '\t')
+				c+= 8;
+			sol++;
+		}
+		return c;
+	}
+
+
+	static char *take_code(char *pos, char *end, char *marker,
+			       struct psection **table, struct text section,
+			       int *line_nop)
+	{
+		char *start = pos;
+		int line_no = *line_nop;
+		int start_line = line_no;
+		struct psection *sect;
+
+		sect = section_find(table, section);
+
+		while (pos < end) {
+			char *sol, *t;
+			struct text ref;
+
+			if (marker && matches(marker, pos, end))
+				break;
+			if (!marker &&
+			    (skip_lws(pos, end))[0] != '\n' &&
+			    !matches(NULL, pos, end))
+				/* Paragraph not indented */
+				break;
+
+			/* Still in code - check for reference */
+			sol = pos;
+			if (!marker) {
+				if (*sol == '\t')
+					sol++;
+				else if (strcmp(sol, "    ") == 0)
+					sol += 4;
+			}
+			t = skip_lws(sol, end);
+			if (t[0] != '#' || t[1] != '#') {
+				/* Just regular code here */
+				pos = skip_line(sol, end);
+				line_no++;
+				continue;
+			}
+
+			if (pos > start) {
+				struct text txt;
+				txt.txt = start;
+				txt.len = pos - start;
+				code_add_text(sect, txt, start_line);
+			}
+			ref = take_header(t, end);
+			if (ref.len) {
+				struct psection *refsec = section_find(table, ref);
+				code_add_link(sect, refsec, count_space(sol, t));
+			}
+			pos = skip_line(t, end);
+			line_no++;
+			start = pos;
+			start_line = line_no;
+		}
+		if (pos > start) {
+			struct text txt;
+			txt.txt = start;
+			txt.len = pos - start;
+			code_add_text(sect, txt, start_line);
+		}
+		if (marker) {
+			pos = skip_line(pos, end);
+			line_no++;
+		}
+		*line_nop = line_no;
+		return pos;
+	}
+
+### Finding the code
+
+It is when looking for the code that we actually use the paragraph
+structure.  We need to recognise section headings so we can record the
+name, list paragraphs so we can ignore indented follow-on paragraphs,
+and the three different markings for code.
+
+#### internal functions
+
+	static struct psection *code_find(char *pos, char *end)
+	{
+		struct psection *table = NULL;
+		int in_list = 0;
+		int line_no = 1;
+		struct text section = {0};
+
+		while (pos < end) {
+			if (pos[0] == '#') {
+				section = take_header(pos, end);
+				in_list = 0;
+				pos = skip_line(pos, end);
+				line_no++;
+			} else if (is_list(pos, end)) {
+				in_list = 1;
+				pos = skip_para(pos, end, &line_no);
+			} else if (!in_list && matches(NULL, pos, end)) {
+				pos = take_code(pos, end, NULL, &table,
+						section, &line_no);
+			} else if (matches("```", pos, end)) {
+				in_list = 0;
+				pos = skip_line(pos, end);
+				line_no++;
+				pos = take_code(pos, end, "```", &table,
+						section, &line_no);
+			} else if (matches("~~~", pos, end)) {
+				in_list = 0;
+				pos = skip_line(pos, end);
+				line_no++;
+				pos = take_code(pos, end, "~~~", &table,
+						section, &line_no);
+			} else {
+				if (!isspace(*pos))
+					in_list = 0;
+				pos = skip_para(pos, end, &line_no);
+			}
+		}
+		return table;
+	}
+
+### Returning the code
+
+Having found all the code blocks and gathered them into a list of
+section, we are now ready to return them to the caller.  This is where
+to perform consistency checks, like at most one reference and at least
+one definition for each section.
+
+All the sections with no references are returned in a list for the
+caller to consider.  The are linearized first so that the substructure
+is completely hidden -- except for the small amount of structure
+displayed in the line numbers.
+
+To return errors, we have the caller pass a function which takes an
+error message - a `code_err_fn`.
+
+#### exported types
+
+	typedef void (*code_err_fn)(char *msg);
+
+#### internal functions
+	struct section *code_extract(char *pos, char *end, code_err_fn error)
+	{
+		struct psection *table;
+		struct section *result = NULL;
+		struct section *tofree = NULL;
+
+		table = code_find(pos, end);
+
+		while (table) {
+			struct psection *t = (struct psection*)table->next;
+			if (table->last == NULL) {
+				char *msg;
+				asprintf(&msg,
+					"Section \"%.*s\" is referenced but not declared",
+					 table->section.len, table->section.txt);
+				error(msg);
+				free(msg);
+			}
+			if (table->refcnt == 0) {
+				/* Root-section,  return it */
+				table->next = result;
+				result = table;
+				code_linearize(result->code);
+			} else {
+				table->next = tofree;
+				tofree = table;
+				if (table->refcnt > 1) {
+					char *msg;
+					asprintf(&msg,
+						 "Section \"%.*s\" referenced multiple times (%d).",
+						 table->section.len, table->section.txt,
+						 table->refcnt);
+					error(msg);
+					free(msg);
+				}
+			}
+			table = t;
+		}
+		while (tofree) {
+			struct section *t = tofree->next;
+			free(tofree);
+			tofree = t;
+		}
+		return result;
+	}
+
+##### exported functions
+
+	struct section *code_extract(char *pos, char *end, code_err_fn error);
+
+
+## Using the library
+
+Now that we can extract code from a document and link it all together
+it is time to do something with that code.  Firstly we need to print
+it out.
+
+### Printing the Code
+
+Printing is mostly straight forward - we just walk the list and print
+the code sections, adding whatever indent is required for each line.
+However there is a complication (isn't there always)?
+
+For code that was recognised because the paragraph was indented, we
+need to strip that indent first.  For other code, we don't.
+
+The approach taken here is simple, though it could arguably be wrong
+in some unlikely cases.  So it might need to be fixed later.
+
+If the first line of a code block is indented, then either one tab or
+4 spaces are striped from every non-blank line.
+
+This could go wrong if the first line of a code block marked by
+_`` ``` ``_ is indented.  To overcome this we would need to
+record someextra state in each `code_node`.  For now we won't bother.
+
+The indents we insert will all be spaces.  This might not work well
+for `Makefiles`.
+
+##### client functions
+
+	static void code_print(FILE *out, struct code_node *node,
+			       char *fname)
+	{
+		for (; node; node = node->next) {
+			char *c = node->code.txt;
+			int len = node->code.len;
+			int undent = 0;
+
+			if (!len)
+				continue;
+
+			fprintf(out, "#line %d \"%s\"\n",
+				node->line_no, fname);
+			if (*c == ' ' || *c == '\t')
+				undent = 1;
+			while (len && *c) {
+				fprintf(out, "%*s", node->indent, "");
+				if (undent) {
+					if (*c == '\t' && len > 1) {
+						c++;
+						len--;
+					} else if (strncmp(c, "    ", 4) == 0 && len > 4) {
+						c += 4;
+						len-= 4;
+					}
+				}
+				do {
+					fputc(*c, out);
+					c++;
+					len--;
+				} while (len && c[-1] != '\n');
+			}
+		}
+	}
+
+### Bringing it all together
+
+We are just about ready for the `main` function of the tool which will
+extract all this lovely code and compile it.  Just one helper is still
+needed.
+
+#### Handling filenames
+
+Section names are stored in `struct text` which is not `nul`
+terminated.  Filenames passed to `open` need to be null terminated.
+So we need to convert one to the other, and strip the leading `File:`
+of while we are at it.
+
+##### client functions
+
+	static void copy_fname(char *name, int space, struct text t)
+	{
+		char *sec = t.txt;
+		int len = t.len;
+		name[0] = 0;
+		if (len < 5 || strncmp(sec, "File:", 5) != 0)
+			return;
+		sec += 5;
+		len -= 5;
+		while (len && sec[0] == ' ') {
+			sec++;
+			len--;
+		}
+		if (len >= space)
+			len = space - 1;
+		strncpy(name, sec, len);
+		name[len] = 0;
+	}
+
+#### Main
+
+And now we take a single file name, extract the code, and if there are
+no error we write out a file for each appropriate code section.  And
+we are done.
+
+
+##### client includes
+
+	#include <fcntl.h>
+	#include <errno.h>
+	#include <sys/mman.h>
+	#include <string.h>
+	#include <stdio.h>
+
+##### client functions
+
+	static int errs;
+	static void pr_err(char *msg)
+	{
+		errs++;
+		fprintf(stderr, "%s\n", msg);
+	}
+
+	int main(int argc, char *argv[])
+	{
+		int fd;
+		size_t len;
+		char *file;
+		struct section *table, *s, *prev;
+
+		errs = 0;
+		if (argc != 2) {
+			fprintf(stderr, "Usage: mdcode file.mdc\n");
+			exit(2);
+		}
+		fd = open(argv[1], O_RDONLY);
+		if (fd < 0) {
+			fprintf(stderr, "mdcode: cannot open %s: %s\n",
+				argv[1], strerror(errno));
+			exit(1);
+		}
+		len = lseek(fd, 0, 2);
+		file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+		table = code_extract(file, file+len, pr_err);
+
+		for (s = table; s;
+			(code_free(s->code), prev = s, s = s->next, free(prev))) {
+			FILE *fl;
+			char fname[1024];
+			if (strncmp(s->section.txt, "Example:", 8) == 0)
+				continue;
+			if (strncmp(s->section.txt, "File:", 5) != 0) {
+				fprintf(stderr, "Unreferenced section is not a file name: %.*s\n",
+					s->section.len, s->section.txt);
+				errs++;
+				continue;
+			}
+			copy_fname(fname, sizeof(fname), s->section);
+			if (fname[0] == 0) {
+				fprintf(stderr, "Missing file name at:%.*s\n",
+					s->section.len, s->section.txt);
+				errs++;
+				continue;
+			}
+			fl = fopen(fname, "w");
+			if (!fl) {
+				fprintf(stderr, "Cannot create %s: %s\n",
+					fname, strerror(errno));
+				errs++;
+				continue;
+			}
+			code_print(fl, s->code, argv[1]);
+			fclose(fl);
+		}
+		exit(!!errs);
+	}
+