--- /dev/null
+#line 100 "../mdcode.mdc"
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "mdcode.h"
+#line 461 "../mdcode.mdc"
+#include <ctype.h>
+#include <string.h>
+
+#line 179 "../mdcode.mdc"
+struct psection {
+ struct section;
+ struct code_node *last;
+ int refcnt;
+ int indent;
+};
+
+#line 216 "../mdcode.mdc"
+static void code_linearize(struct code_node *code)
+{
+ struct code_node *t;
+ for (t = code; t; t = t->next)
+ t->indent = 0;
+ for (; code; code = code->next)
+ if (code->child) {
+ struct code_node *next = code->next;
+ struct psection *pchild =
+ (struct psection *)code->child;
+ int indent = pchild->indent;
+ code->next = code->child->code;
+ code->child->code = NULL;
+ code->child = NULL;
+ for (t = code; t->next; t = t->next)
+ t->next->indent = code->indent + indent;
+ t->next = next;
+ }
+}
+
+#line 239 "../mdcode.mdc"
+void code_free(struct code_node *code)
+{
+ while (code) {
+ struct code_node *this;
+ if (code->child)
+ code_linearize(code);
+ this = code;
+ code = code->next;
+ free(this);
+ }
+}
+
+#line 268 "../mdcode.mdc"
+static void code_add_text(struct psection *where, struct text txt,
+ int line_no)
+{
+ struct code_node *n;
+ if (txt.len == 0)
+ return;
+ n = malloc(sizeof(*n));
+ n->code = txt;
+ n->indent = 0;
+ n->line_no = line_no;
+ n->next = NULL;
+ n->child = NULL;
+ if (where->last)
+ where->last->next = n;
+ else
+ where->code = n;
+ where->last = n;
+}
+
+#line 290 "../mdcode.mdc"
+void code_add_link(struct psection *where, struct psection *to,
+ int indent)
+{
+ struct code_node *n;
+
+ to->indent = indent;
+ to->refcnt++; // this will be checked elsewhere
+ if (where->last && where->last->child == NULL) {
+ where->last->child = to;
+ return;
+ }
+ n = malloc(sizeof(*n));
+ n->code.len = 0;
+ n->indent = 0;
+ n->line_no = 0;
+ n->next = NULL;
+ n->child = to;
+ if (where->last)
+ where->last->next = n;
+ else
+ where->code = n;
+ where->last = n;
+}
+
+#line 329 "../mdcode.mdc"
+static int text_cmp(struct text a, struct text b)
+{
+ if (a.len != b.len)
+ return a.len - b.len;
+ return strncmp(a.txt, b.txt, a.len);
+}
+
+static struct psection *section_find(struct psection **list, struct text name)
+{
+ struct psection *new;
+ while (*list) {
+ int cmp = text_cmp((*list)->section, name);
+ if (cmp == 0)
+ return *list;
+ if (cmp > 0)
+ break;
+ list = (struct psection **)&((*list)->next);
+ }
+ /* Add this section */
+ new = malloc(sizeof(*new));
+ new->next = *list;
+ *list = new;
+ new->section = name;
+ new->code = NULL;
+ new->last = NULL;
+ new->refcnt = 0;
+ new->indent = 0;
+ return new;
+}
+
+#line 410 "../mdcode.mdc"
+static char *skip_lws(char *pos, char *end)
+{
+ while (pos < end && (*pos == ' ' || *pos == '\t'))
+ pos++;
+ return pos;
+}
+
+static char *skip_line(char *pos, char *end)
+{
+ while (pos < end && *pos != '\n')
+ pos++;
+ if (pos < end)
+ pos++;
+ return pos;
+}
+
+static char *skip_para(char *pos, char *end, int *line_no)
+{
+ /* Might return a pointer to a blank line, as only
+ * one trailing blank line is skipped
+ */
+ if (*pos == '#') {
+ pos = skip_line(pos, end);
+ (*line_no) += 1;
+ return pos;
+ }
+ while (pos < end &&
+ *pos != '#' &&
+ *(pos = skip_lws(pos, end)) != '\n') {
+ pos = skip_line(pos, end);
+ (*line_no) += 1;
+ }
+ if (pos < end && *pos == '\n') {
+ pos++;
+ (*line_no) += 1;
+ }
+ return pos;
+}
+
+#line 466 "../mdcode.mdc"
+static struct text take_header(char *pos, char *end)
+{
+ struct text section;
+
+ while (pos < end && *pos == '#')
+ pos++;
+ while (pos < end && *pos == ' ')
+ pos++;
+ section.txt = pos;
+ while (pos < end && *pos != '\n')
+ pos++;
+ while (pos > section.txt &&
+ (pos[-1] == '#' || pos[-1] == ' '))
+ pos--;
+ section.len = pos - section.txt;
+ return section;
+}
+
+static int is_list(char *pos, char *end)
+{
+ if (strchr("-*+", *pos))
+ return 1;
+ if (isdigit(*pos)) {
+ while (pos < end && isdigit(*pos))
+ pos += 1;
+ if (pos < end && *pos == '.')
+ return 1;
+ }
+ return 0;
+}
+
+static int matches(char *start, char *pos, char *end)
+{
+ if (start == NULL)
+ return matches("\t", pos, end) ||
+ matches(" ", pos, end);
+ return (pos + strlen(start) < end &&
+ strncmp(pos, start, strlen(start)) == 0);
+}
+
+#line 538 "../mdcode.mdc"
+static int count_space(char *sol, char *p)
+{
+ int c = 0;
+ while (sol < p) {
+ if (sol[0] == ' ')
+ c++;
+ if (sol[0] == '\t')
+ c+= 8;
+ sol++;
+ }
+ return c;
+}
+
+
+static char *take_code(char *pos, char *end, char *marker,
+ struct psection **table, struct text section,
+ int *line_nop)
+{
+ char *start = pos;
+ int line_no = *line_nop;
+ int start_line = line_no;
+ struct psection *sect;
+
+ sect = section_find(table, section);
+
+ while (pos < end) {
+ char *sol, *t;
+ struct text ref;
+
+ if (marker && matches(marker, pos, end))
+ break;
+ if (!marker &&
+ (skip_lws(pos, end))[0] != '\n' &&
+ !matches(NULL, pos, end))
+ /* Paragraph not indented */
+ break;
+
+ /* Still in code - check for reference */
+ sol = pos;
+ if (!marker) {
+ if (*sol == '\t')
+ sol++;
+ else if (strcmp(sol, " ") == 0)
+ sol += 4;
+ }
+ t = skip_lws(sol, end);
+ if (t[0] != '#' || t[1] != '#') {
+ /* Just regular code here */
+ pos = skip_line(sol, end);
+ line_no++;
+ continue;
+ }
+
+ if (pos > start) {
+ struct text txt;
+ txt.txt = start;
+ txt.len = pos - start;
+ code_add_text(sect, txt, start_line);
+ }
+ ref = take_header(t, end);
+ if (ref.len) {
+ struct psection *refsec = section_find(table, ref);
+ code_add_link(sect, refsec, count_space(sol, t));
+ }
+ pos = skip_line(t, end);
+ line_no++;
+ start = pos;
+ start_line = line_no;
+ }
+ if (pos > start) {
+ struct text txt;
+ txt.txt = start;
+ txt.len = pos - start;
+ code_add_text(sect, txt, start_line);
+ }
+ if (marker) {
+ pos = skip_line(pos, end);
+ line_no++;
+ }
+ *line_nop = line_no;
+ return pos;
+}
+
+#line 630 "../mdcode.mdc"
+static struct psection *code_find(char *pos, char *end)
+{
+ struct psection *table = NULL;
+ int in_list = 0;
+ int line_no = 1;
+ struct text section = {0};
+
+ while (pos < end) {
+ if (pos[0] == '#') {
+ section = take_header(pos, end);
+ in_list = 0;
+ pos = skip_line(pos, end);
+ line_no++;
+ } else if (is_list(pos, end)) {
+ in_list = 1;
+ pos = skip_para(pos, end, &line_no);
+ } else if (!in_list && matches(NULL, pos, end)) {
+ pos = take_code(pos, end, NULL, &table,
+ section, &line_no);
+ } else if (matches("```", pos, end)) {
+ in_list = 0;
+ pos = skip_line(pos, end);
+ line_no++;
+ pos = take_code(pos, end, "```", &table,
+ section, &line_no);
+ } else if (matches("~~~", pos, end)) {
+ in_list = 0;
+ pos = skip_line(pos, end);
+ line_no++;
+ pos = take_code(pos, end, "~~~", &table,
+ section, &line_no);
+ } else {
+ if (!isspace(*pos))
+ in_list = 0;
+ pos = skip_para(pos, end, &line_no);
+ }
+ }
+ return table;
+}
+
+#line 690 "../mdcode.mdc"
+struct section *code_extract(char *pos, char *end, code_err_fn error)
+{
+ struct psection *table;
+ struct section *result = NULL;
+ struct section *tofree = NULL;
+
+ table = code_find(pos, end);
+
+ while (table) {
+ struct psection *t = (struct psection*)table->next;
+ if (table->last == NULL) {
+ char *msg;
+ asprintf(&msg,
+ "Section \"%.*s\" is referenced but not declared",
+ table->section.len, table->section.txt);
+ error(msg);
+ free(msg);
+ }
+ if (table->refcnt == 0) {
+ /* Root-section, return it */
+ table->next = result;
+ result = table;
+ code_linearize(result->code);
+ } else {
+ table->next = tofree;
+ tofree = table;
+ if (table->refcnt > 1) {
+ char *msg;
+ asprintf(&msg,
+ "Section \"%.*s\" referenced multiple times (%d).",
+ table->section.len, table->section.txt,
+ table->refcnt);
+ error(msg);
+ free(msg);
+ }
+ }
+ table = t;
+ }
+ while (tofree) {
+ struct section *t = tofree->next;
+ free(tofree);
+ tofree = t;
+ }
+ return result;
+}
+
+#line 109 "../mdcode.mdc"
+
--- /dev/null
+# mdcode: extract C code from a _markdown_ file.
+
+_markdown_ is a popular format for simple text markup which can easily
+be converted to HTML. As it allows easy indication of sections of
+code, it is quite suitable for use in literate programming. This file
+is an example of that usage.
+
+The code included below provides two related functionalities.
+Firstly it provides a library routine for extracting code out of a
+_markdown_ file, so that other routines might make use of it.
+
+Secondly it provides a simple client of this routine which extracts
+1 or more C-language files from a markdown document so they can be
+passed to a C compiler. These two combined to make a tool that is needed
+to compile this tool. Yes, this is circular. A prototype tool was
+used for the first extraction.
+
+The tool provided is described as specific to the C language as it
+generates
+
+##### Example: a _line_ command
+
+ #line __line-number__ __file-name__
+
+lines so that the C compiler will report where in the markdown file
+any error is found. This tool is suitable for any other language
+which allows the same directive, or will treat it as a comment.
+
+## Literate Details
+
+Literate programming is more than just including comments with the
+code, even nicely formatted comments. It also involves presenting the
+code in an order that makes sense to a human, rather than an order
+that makes sense to a compiler. For this reason a core part of any
+literate programming tool is the ability to re-arrange the code found
+in the document into a different order in the final code file - or
+files. This requires some form of linkage to be encoded.
+
+The approach taken here is focused around section headings - of any
+depth.
+
+All the code in any section is treated as a single sequential
+collection of code, and is named by the section that it is in. If
+multiple sections have the same name, then the code blocks in all of
+them are joined together in the order they appear in the document.
+
+A code section can contain a special marker which starts with 2
+hashes: __##__.
+The text after the marker must be the name of some section which
+contains code. Code from that section will be interpolated in place
+of the marker, and will be indented to match the indent of the marker.
+
+It is not permitted for the same code to be interpolated multiple
+times. Allowing this might make some sense, but it is probably a
+mistake, and prohibiting it make some of the code a bit cleaner.
+
+Equally, every section of code should be interpolated at least once -
+with two exceptions. These exceptions are imposed by the tool, not
+the library. A different client could impose different rules on the
+names of top-level code sections.
+
+The first exception we have already seen. A section name starting
+__Example:__ indicates code that is not to be included in the final product.
+
+The second exception is for the top level code sections which will be
+written to files. Again these are identified by their section name.
+This must start with __File:__ the following text (after optional
+spaces) will be used as a file name.
+
+Any section containing code that does not start __Example:__ or
+__File:__ must be included in some other section exactly once.
+
+### Multiple files
+
+Allowing multiple top level code sections which name different files
+means that one _markdown_ document can describe several files. This
+is very useful with the C language where a program file and a header
+file might be related. For the present document we will have a header
+file and two code files, one with the library content and one for the
+tool.
+
+It will also be very convenient to create a `makefile` fragment to
+ensure the code is compiled correctly. A simple `make -f mdcode.mk`
+will "do the right thing".
+
+### File: mdcode.mk
+
+ CFLAGS += -Wall -g
+ all::
+ mdcode.h libmdcode.c md2c.c mdcode.mk : mdcode.mdc
+ ./md2c mdcode.mdc
+
+
+### File: mdcode.h
+
+ ## exported types
+ ## exported functions
+
+### File: libmdcode.c
+ #define _GNU_SOURCE
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+
+ #include "mdcode.h"
+ ## internal includes
+ ## private types
+ ## internal functions
+
+### File: mdcode.mk
+
+ all :: libmdcode.o
+ libmdcode.o : libmdcode.c mdcode.h
+ $(CC) $(CFLAGS) -c libmdcode.c
+
+
+### File: md2c.c
+
+ #include <unistd.h>
+ #include <stdlib.h>
+
+ #include "mdcode.h"
+
+ ## client includes
+ ## client functions
+
+### File: mdcode.mk
+
+ all :: md2c
+ md2c : md2c.o libmdcode.o
+ $(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o
+ md2c.o : md2c.c mdcode.h
+ $(CC) $(CFLAGS) -c md2c.c
+
+## Data Structures
+
+As the core purpose of _mdcode_ is to discover and re-arrange blocks
+of text, it makes sense to map the whole document file into memory and
+produce a data structure which lists various parts of the file in the
+appropriate order. Each node in this structure will have some text
+from the document, a child pointer, and a next pointer, any of which
+might not be present. The text is most easily stored as a pointer and a
+length. We'll call this a `text`
+
+A list of these `code_nodes` will belong to each section and it will
+be useful to have a separate `section` data structure to store the
+list of `code_nodes`, the section name, and some other information.
+
+This other information will include a reference counter so we can
+ensure proper referencing, and an `indent` depth. As referenced
+content can have an extra indent added, we need to know what that is.
+The `code_node` will also have an `indent` depth which eventually gets
+set to the sum for the indents from all references on the path from
+the root.
+
+##### exported types
+
+ struct text {
+ char *txt;
+ int len;
+ };
+
+ struct section {
+ struct text section;
+ struct code_node *code;
+ struct section *next;
+ };
+
+ struct code_node {
+ struct text code;
+ int indent;
+ int line_no;
+ struct code_node *next;
+ struct section *child;
+ };
+
+##### private types
+
+ struct psection {
+ struct section;
+ struct code_node *last;
+ int refcnt;
+ int indent;
+ };
+
+You will note that the `struct psection` contains an anonymous `struct
+section` embedded at the start. To make this work right, GCC
+requires the `-fplan9-extensions` flag.
+
+##### File: mdcode.mk
+
+ CFLAGS += -fplan9-extensions
+
+### Manipulating the node
+
+Though a tree with `next` and `child` links is the easiest way to
+assemble the various code sections, it is not the easiest form for
+using them. For that a simple list would be best.
+
+So once we have a fully linked File section we will want to linearize
+it, so that the `child` links become `NULL` and the `next` links will
+find everything required. It is at this stage that the requirements
+that each section is linked only once becomes import.
+
+`code_linearize` will merge the `code_node`s from any child into the
+given `code_node`. As it does this it sets the 'indent' field for
+each `code_node`.
+
+Note that we don't clear the section's `last` pointer, even though
+it no longer owns any code. This allows subsequent code to see if a
+section ever had any code, and to report an error if a section is
+referenced but not defined.
+
+##### internal functions
+
+ static void code_linearize(struct code_node *code)
+ {
+ struct code_node *t;
+ for (t = code; t; t = t->next)
+ t->indent = 0;
+ for (; code; code = code->next)
+ if (code->child) {
+ struct code_node *next = code->next;
+ struct psection *pchild =
+ (struct psection *)code->child;
+ int indent = pchild->indent;
+ code->next = code->child->code;
+ code->child->code = NULL;
+ code->child = NULL;
+ for (t = code; t->next; t = t->next)
+ t->next->indent = code->indent + indent;
+ t->next = next;
+ }
+ }
+
+Once a client has made use of a linearized code set, it will probably
+want to free it.
+
+ void code_free(struct code_node *code)
+ {
+ while (code) {
+ struct code_node *this;
+ if (code->child)
+ code_linearize(code);
+ this = code;
+ code = code->next;
+ free(this);
+ }
+ }
+
+##### exported functions
+
+ void code_free(struct code_node *code);
+
+### Building the tree
+
+As we parse the document there are two things we will want to do to
+node trees: add some text or add a reference. We'll assume for now
+that the relevant section structures have been found, and will just
+deal with the `code_node`.
+
+Adding text simply means adding another node. We will never have
+empty nodes, even if the last node only has a child, new text must go
+in a new node.
+
+##### internal functions
+
+ static void code_add_text(struct psection *where, struct text txt,
+ int line_no)
+ {
+ struct code_node *n;
+ if (txt.len == 0)
+ return;
+ n = malloc(sizeof(*n));
+ n->code = txt;
+ n->indent = 0;
+ n->line_no = line_no;
+ n->next = NULL;
+ n->child = NULL;
+ if (where->last)
+ where->last->next = n;
+ else
+ where->code = n;
+ where->last = n;
+ }
+
+However when adding a link, we might be able to include it in the last
+`code_node` if it currently only has text.
+
+ void code_add_link(struct psection *where, struct psection *to,
+ int indent)
+ {
+ struct code_node *n;
+
+ to->indent = indent;
+ to->refcnt++; // this will be checked elsewhere
+ if (where->last && where->last->child == NULL) {
+ where->last->child = to;
+ return;
+ }
+ n = malloc(sizeof(*n));
+ n->code.len = 0;
+ n->indent = 0;
+ n->line_no = 0;
+ n->next = NULL;
+ n->child = to;
+ if (where->last)
+ where->last->next = n;
+ else
+ where->code = n;
+ where->last = n;
+ }
+
+### Finding sections
+
+Now we need a lookup table to be able to find sections by name.
+Something that provides an `n*log(N)` search time is probably
+justified, but for now I want a minimal stand-alone program so a
+linked list managed by insertion-sort will do. As a comparison
+function it is easiest to sort based on length before content. So
+sections won't be in standard lexical order, but that isn't important.
+
+If we cannot find a section, we simply want to create it. This allows
+sections and references to be created in any order. Sections with
+no references or no content will cause a warning eventually.
+
+#### internal functions
+
+ static int text_cmp(struct text a, struct text b)
+ {
+ if (a.len != b.len)
+ return a.len - b.len;
+ return strncmp(a.txt, b.txt, a.len);
+ }
+
+ static struct psection *section_find(struct psection **list, struct text name)
+ {
+ struct psection *new;
+ while (*list) {
+ int cmp = text_cmp((*list)->section, name);
+ if (cmp == 0)
+ return *list;
+ if (cmp > 0)
+ break;
+ list = (struct psection **)&((*list)->next);
+ }
+ /* Add this section */
+ new = malloc(sizeof(*new));
+ new->next = *list;
+ *list = new;
+ new->section = name;
+ new->code = NULL;
+ new->last = NULL;
+ new->refcnt = 0;
+ new->indent = 0;
+ return new;
+ }
+
+## Parsing the _markdown_
+
+Parsing markdown is fairly easy, though there are complications.
+
+The document is divided into "paragraphs" which are mostly separated by blank
+lines (which may contain white space). The first few characters of
+the first line of a paragraph determine the type of paragraph. For
+our purposes we are only interested in list paragraphs, code
+paragraphs, section headings, and everything else. Section headings
+are single-line paragraphs and so do not require a preceding or
+following blank line.
+
+Section headings start with 1 or more hash characters (__#__). List
+paragraphs start with hyphen, asterisk, plus, or digits followed by a
+period. Code paragraphs aren't quite so easy.
+
+The "standard" code paragraph starts with 4 or more spaces, or a tab.
+However if the previous paragraph was a list paragraph, then those
+spaces indicate another paragraph in the same list item, and 8 or
+more spaces are required. Unless a nested list is in effect, in
+which case 12 or more are need. Unfortunately not all _markdown_
+parsers agree on nested lists.
+
+Two alternate styles for marking code are in active use. "Github" uses
+three backticks(_`` ``` ``_), while "pandoc" uses three or more tildes
+(_~~~_). In these cases the code should not be indented.
+
+Trying to please everyone as much as possible, this parser will handle
+everything except for code inside lists.
+
+So an indented (4+) paragraph after a list paragraph is always a list
+paragraph, otherwise it is a code paragraph. A paragraph that starts
+with three backticks or three tildes is code which continues until a
+matching string of backticks or tildes.
+
+### Skipping bits
+
+While walking the document looking for various markers we will *not*
+use the `struct text` introduced earlier as advancing that requires
+updating both start and length which feels clumsy. Instead we will
+carry `pos` and `end` pointers, only the first of which needs to
+change.
+
+So to start, we need to skip various parts of the document. `lws`
+stands for "Linear White Space" and is a term that comes from the
+Email RFCs (e.g. RFC822). `line` and `para` are self explanatory.
+Note that `skip_para` needs to update the current line number.
+`skip_line` doesn't but every caller should.
+
+#### internal functions
+
+ static char *skip_lws(char *pos, char *end)
+ {
+ while (pos < end && (*pos == ' ' || *pos == '\t'))
+ pos++;
+ return pos;
+ }
+
+ static char *skip_line(char *pos, char *end)
+ {
+ while (pos < end && *pos != '\n')
+ pos++;
+ if (pos < end)
+ pos++;
+ return pos;
+ }
+
+ static char *skip_para(char *pos, char *end, int *line_no)
+ {
+ /* Might return a pointer to a blank line, as only
+ * one trailing blank line is skipped
+ */
+ if (*pos == '#') {
+ pos = skip_line(pos, end);
+ (*line_no) += 1;
+ return pos;
+ }
+ while (pos < end &&
+ *pos != '#' &&
+ *(pos = skip_lws(pos, end)) != '\n') {
+ pos = skip_line(pos, end);
+ (*line_no) += 1;
+ }
+ if (pos < end && *pos == '\n') {
+ pos++;
+ (*line_no) += 1;
+ }
+ return pos;
+ }
+
+### Recognising things
+
+Recognising a section header is trivial and doesn't require a
+function. However we need to extract the content of a section header
+as a `struct text` for passing to `section_find`.
+Recognising the start of a new list is fairly easy. Recognising the
+start (and end) of code is a little messy so we provide a function for
+matching the first few characters, which has a special case for "4
+spaces or tab".
+
+#### internal includes
+
+ #include <ctype.h>
+ #include <string.h>
+
+#### internal functions
+
+ static struct text take_header(char *pos, char *end)
+ {
+ struct text section;
+
+ while (pos < end && *pos == '#')
+ pos++;
+ while (pos < end && *pos == ' ')
+ pos++;
+ section.txt = pos;
+ while (pos < end && *pos != '\n')
+ pos++;
+ while (pos > section.txt &&
+ (pos[-1] == '#' || pos[-1] == ' '))
+ pos--;
+ section.len = pos - section.txt;
+ return section;
+ }
+
+ static int is_list(char *pos, char *end)
+ {
+ if (strchr("-*+", *pos))
+ return 1;
+ if (isdigit(*pos)) {
+ while (pos < end && isdigit(*pos))
+ pos += 1;
+ if (pos < end && *pos == '.')
+ return 1;
+ }
+ return 0;
+ }
+
+ static int matches(char *start, char *pos, char *end)
+ {
+ if (start == NULL)
+ return matches("\t", pos, end) ||
+ matches(" ", pos, end);
+ return (pos + strlen(start) < end &&
+ strncmp(pos, start, strlen(start)) == 0);
+ }
+
+### Extracting the code
+
+Now that we can skip paragraphs and recognise what type each paragraph
+is, it is time to parse the file and extract the code. We'll do this
+in two parts, first we look at what to do with some code once we
+find it, and then how to actually find it.
+
+When we have some code, we know where it is, what the end marker
+should look like, and which section it is in.
+
+There are two sorts of end markers: the presence of a particular
+string, or the absence of an indent. We will use a string to
+represent a presence, and a `NULL` to represent the absence.
+
+While looking at code we don't think about paragraphs are all - just
+look for a line that starts with the right thing.
+Every line that is still code then needs to be examined to see if it
+is a section reference.
+
+When a section reference is found, all preceding code (if any) must be
+added to the current section, then the reference is added.
+
+When we do find the end of the code, all text that we have found but
+not processed needs to be saved too.
+
+When adding a reference we need to set the `indent`. This is the
+number of spaces (counting 8 for tabs) after the natural indent of the
+code (which is a tab or 4 spaces). We use a separate function `count_spaces`
+for that.
+
+#### internal functions
+
+ static int count_space(char *sol, char *p)
+ {
+ int c = 0;
+ while (sol < p) {
+ if (sol[0] == ' ')
+ c++;
+ if (sol[0] == '\t')
+ c+= 8;
+ sol++;
+ }
+ return c;
+ }
+
+
+ static char *take_code(char *pos, char *end, char *marker,
+ struct psection **table, struct text section,
+ int *line_nop)
+ {
+ char *start = pos;
+ int line_no = *line_nop;
+ int start_line = line_no;
+ struct psection *sect;
+
+ sect = section_find(table, section);
+
+ while (pos < end) {
+ char *sol, *t;
+ struct text ref;
+
+ if (marker && matches(marker, pos, end))
+ break;
+ if (!marker &&
+ (skip_lws(pos, end))[0] != '\n' &&
+ !matches(NULL, pos, end))
+ /* Paragraph not indented */
+ break;
+
+ /* Still in code - check for reference */
+ sol = pos;
+ if (!marker) {
+ if (*sol == '\t')
+ sol++;
+ else if (strcmp(sol, " ") == 0)
+ sol += 4;
+ }
+ t = skip_lws(sol, end);
+ if (t[0] != '#' || t[1] != '#') {
+ /* Just regular code here */
+ pos = skip_line(sol, end);
+ line_no++;
+ continue;
+ }
+
+ if (pos > start) {
+ struct text txt;
+ txt.txt = start;
+ txt.len = pos - start;
+ code_add_text(sect, txt, start_line);
+ }
+ ref = take_header(t, end);
+ if (ref.len) {
+ struct psection *refsec = section_find(table, ref);
+ code_add_link(sect, refsec, count_space(sol, t));
+ }
+ pos = skip_line(t, end);
+ line_no++;
+ start = pos;
+ start_line = line_no;
+ }
+ if (pos > start) {
+ struct text txt;
+ txt.txt = start;
+ txt.len = pos - start;
+ code_add_text(sect, txt, start_line);
+ }
+ if (marker) {
+ pos = skip_line(pos, end);
+ line_no++;
+ }
+ *line_nop = line_no;
+ return pos;
+ }
+
+### Finding the code
+
+It is when looking for the code that we actually use the paragraph
+structure. We need to recognise section headings so we can record the
+name, list paragraphs so we can ignore indented follow-on paragraphs,
+and the three different markings for code.
+
+#### internal functions
+
+ static struct psection *code_find(char *pos, char *end)
+ {
+ struct psection *table = NULL;
+ int in_list = 0;
+ int line_no = 1;
+ struct text section = {0};
+
+ while (pos < end) {
+ if (pos[0] == '#') {
+ section = take_header(pos, end);
+ in_list = 0;
+ pos = skip_line(pos, end);
+ line_no++;
+ } else if (is_list(pos, end)) {
+ in_list = 1;
+ pos = skip_para(pos, end, &line_no);
+ } else if (!in_list && matches(NULL, pos, end)) {
+ pos = take_code(pos, end, NULL, &table,
+ section, &line_no);
+ } else if (matches("```", pos, end)) {
+ in_list = 0;
+ pos = skip_line(pos, end);
+ line_no++;
+ pos = take_code(pos, end, "```", &table,
+ section, &line_no);
+ } else if (matches("~~~", pos, end)) {
+ in_list = 0;
+ pos = skip_line(pos, end);
+ line_no++;
+ pos = take_code(pos, end, "~~~", &table,
+ section, &line_no);
+ } else {
+ if (!isspace(*pos))
+ in_list = 0;
+ pos = skip_para(pos, end, &line_no);
+ }
+ }
+ return table;
+ }
+
+### Returning the code
+
+Having found all the code blocks and gathered them into a list of
+section, we are now ready to return them to the caller. This is where
+to perform consistency checks, like at most one reference and at least
+one definition for each section.
+
+All the sections with no references are returned in a list for the
+caller to consider. The are linearized first so that the substructure
+is completely hidden -- except for the small amount of structure
+displayed in the line numbers.
+
+To return errors, we have the caller pass a function which takes an
+error message - a `code_err_fn`.
+
+#### exported types
+
+ typedef void (*code_err_fn)(char *msg);
+
+#### internal functions
+ struct section *code_extract(char *pos, char *end, code_err_fn error)
+ {
+ struct psection *table;
+ struct section *result = NULL;
+ struct section *tofree = NULL;
+
+ table = code_find(pos, end);
+
+ while (table) {
+ struct psection *t = (struct psection*)table->next;
+ if (table->last == NULL) {
+ char *msg;
+ asprintf(&msg,
+ "Section \"%.*s\" is referenced but not declared",
+ table->section.len, table->section.txt);
+ error(msg);
+ free(msg);
+ }
+ if (table->refcnt == 0) {
+ /* Root-section, return it */
+ table->next = result;
+ result = table;
+ code_linearize(result->code);
+ } else {
+ table->next = tofree;
+ tofree = table;
+ if (table->refcnt > 1) {
+ char *msg;
+ asprintf(&msg,
+ "Section \"%.*s\" referenced multiple times (%d).",
+ table->section.len, table->section.txt,
+ table->refcnt);
+ error(msg);
+ free(msg);
+ }
+ }
+ table = t;
+ }
+ while (tofree) {
+ struct section *t = tofree->next;
+ free(tofree);
+ tofree = t;
+ }
+ return result;
+ }
+
+##### exported functions
+
+ struct section *code_extract(char *pos, char *end, code_err_fn error);
+
+
+## Using the library
+
+Now that we can extract code from a document and link it all together
+it is time to do something with that code. Firstly we need to print
+it out.
+
+### Printing the Code
+
+Printing is mostly straight forward - we just walk the list and print
+the code sections, adding whatever indent is required for each line.
+However there is a complication (isn't there always)?
+
+For code that was recognised because the paragraph was indented, we
+need to strip that indent first. For other code, we don't.
+
+The approach taken here is simple, though it could arguably be wrong
+in some unlikely cases. So it might need to be fixed later.
+
+If the first line of a code block is indented, then either one tab or
+4 spaces are striped from every non-blank line.
+
+This could go wrong if the first line of a code block marked by
+_`` ``` ``_ is indented. To overcome this we would need to
+record someextra state in each `code_node`. For now we won't bother.
+
+The indents we insert will all be spaces. This might not work well
+for `Makefiles`.
+
+##### client functions
+
+ static void code_print(FILE *out, struct code_node *node,
+ char *fname)
+ {
+ for (; node; node = node->next) {
+ char *c = node->code.txt;
+ int len = node->code.len;
+ int undent = 0;
+
+ if (!len)
+ continue;
+
+ fprintf(out, "#line %d \"%s\"\n",
+ node->line_no, fname);
+ if (*c == ' ' || *c == '\t')
+ undent = 1;
+ while (len && *c) {
+ fprintf(out, "%*s", node->indent, "");
+ if (undent) {
+ if (*c == '\t' && len > 1) {
+ c++;
+ len--;
+ } else if (strncmp(c, " ", 4) == 0 && len > 4) {
+ c += 4;
+ len-= 4;
+ }
+ }
+ do {
+ fputc(*c, out);
+ c++;
+ len--;
+ } while (len && c[-1] != '\n');
+ }
+ }
+ }
+
+### Bringing it all together
+
+We are just about ready for the `main` function of the tool which will
+extract all this lovely code and compile it. Just one helper is still
+needed.
+
+#### Handling filenames
+
+Section names are stored in `struct text` which is not `nul`
+terminated. Filenames passed to `open` need to be null terminated.
+So we need to convert one to the other, and strip the leading `File:`
+of while we are at it.
+
+##### client functions
+
+ static void copy_fname(char *name, int space, struct text t)
+ {
+ char *sec = t.txt;
+ int len = t.len;
+ name[0] = 0;
+ if (len < 5 || strncmp(sec, "File:", 5) != 0)
+ return;
+ sec += 5;
+ len -= 5;
+ while (len && sec[0] == ' ') {
+ sec++;
+ len--;
+ }
+ if (len >= space)
+ len = space - 1;
+ strncpy(name, sec, len);
+ name[len] = 0;
+ }
+
+#### Main
+
+And now we take a single file name, extract the code, and if there are
+no error we write out a file for each appropriate code section. And
+we are done.
+
+
+##### client includes
+
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <sys/mman.h>
+ #include <string.h>
+ #include <stdio.h>
+
+##### client functions
+
+ static int errs;
+ static void pr_err(char *msg)
+ {
+ errs++;
+ fprintf(stderr, "%s\n", msg);
+ }
+
+ int main(int argc, char *argv[])
+ {
+ int fd;
+ size_t len;
+ char *file;
+ struct section *table, *s, *prev;
+
+ errs = 0;
+ if (argc != 2) {
+ fprintf(stderr, "Usage: mdcode file.mdc\n");
+ exit(2);
+ }
+ fd = open(argv[1], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "mdcode: cannot open %s: %s\n",
+ argv[1], strerror(errno));
+ exit(1);
+ }
+ len = lseek(fd, 0, 2);
+ file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ table = code_extract(file, file+len, pr_err);
+
+ for (s = table; s;
+ (code_free(s->code), prev = s, s = s->next, free(prev))) {
+ FILE *fl;
+ char fname[1024];
+ if (strncmp(s->section.txt, "Example:", 8) == 0)
+ continue;
+ if (strncmp(s->section.txt, "File:", 5) != 0) {
+ fprintf(stderr, "Unreferenced section is not a file name: %.*s\n",
+ s->section.len, s->section.txt);
+ errs++;
+ continue;
+ }
+ copy_fname(fname, sizeof(fname), s->section);
+ if (fname[0] == 0) {
+ fprintf(stderr, "Missing file name at:%.*s\n",
+ s->section.len, s->section.txt);
+ errs++;
+ continue;
+ }
+ fl = fopen(fname, "w");
+ if (!fl) {
+ fprintf(stderr, "Cannot create %s: %s\n",
+ fname, strerror(errno));
+ errs++;
+ continue;
+ }
+ code_print(fl, s->code, argv[1]);
+ fclose(fl);
+ }
+ exit(!!errs);
+ }
+