mdcode, md2c - extract C code from a literate markdown program

author NeilBrown <neilb@suse.de>

Wed, 5 Jun 2013 20:20:35 +0000 (06:20 +1000)

committer NeilBrown <neilb@suse.de>

Wed, 5 Jun 2013 21:40:31 +0000 (07:40 +1000)
author NeilBrown <neilb@suse.de>
Wed, 5 Jun 2013 20:20:35 +0000 (06:20 +1000)
committer NeilBrown <neilb@suse.de>
Wed, 5 Jun 2013 21:40:31 +0000 (07:40 +1000)
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..5761abc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.o
diff --git a/csrc/.gitignore b/csrc/.gitignore

new file mode 100644 (file)

index 0000000..3cfcc75
--- /dev/null
+++ b/csrc/.gitignore
@@ -0,0 +1,5 @@
+*.c
+*.h
+*.mk
+md2c
+boot-strap/md2c
diff --git a/csrc/Makefile b/csrc/Makefile

new file mode 100644 (file)

index 0000000..d1beada
--- /dev/null
+++ b/csrc/Makefile
@@ -0,0 +1,11 @@
+
+ifneq "$(wildcard *.mk)" ""
+include *.mk
+endif
+
+# Build md2c from the checked-in boot-strap sources (using the generated
+# boot-strap/mdcode.mk with VPATH pointing back here), copy it into this
+# directory and run it on mdcode.mdc.
+bootstrap:
+       touch boot-strap/* # make sure mdcode.mdc isn't newer
+       $(MAKE) -C boot-strap -f mdcode.mk VPATH=..
+       cp boot-strap/md2c .
+       ./md2c mdcode.mdc
+
diff --git a/csrc/boot-strap/libmdcode.c b/csrc/boot-strap/libmdcode.c

new file mode 100644 (file)

index 0000000..78a4262
--- /dev/null
+++ b/csrc/boot-strap/libmdcode.c
@@ -0,0 +1,384 @@
+#line 100 "../mdcode.mdc"
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "mdcode.h"
+#line 461 "../mdcode.mdc"
+#include  <ctype.h>
+#include  <string.h>
+
+#line 179 "../mdcode.mdc"
+/* Private view of a section: the public struct section plus the
+ * book-keeping that is only needed while the code tree is being built.
+ */
+struct psection {
+       struct section;         /* anonymous embed, kept first: code casts struct section * to this type */
+       struct code_node *last; /* tail of the code list; NULL until a node is added */
+       int refcnt;             /* incremented by code_add_link() for every reference */
+       int indent;             /* indent recorded by the most recent reference */
+};
+
+#line 216 "../mdcode.mdc"
+/*
+ * Flatten the tree hanging off 'code' into a single 'next' chain.
+ * Every child reference is replaced by the child section's own nodes,
+ * spliced in directly after the referring node.  Each spliced node gets
+ * the referring node's indent plus the indent recorded on the child
+ * section, and the child section gives up its node list.
+ */
+static void code_linearize(struct code_node *code)
+{
+       struct code_node *node;
+
+       /* Indents are recomputed from scratch on every call. */
+       node = code;
+       while (node != NULL) {
+               node->indent = 0;
+               node = node->next;
+       }
+
+       /* Spliced nodes are visited later by this same walk, so nested
+        * references are flattened too.
+        */
+       for (node = code; node != NULL; node = node->next) {
+               struct psection *sect = (struct psection *)node->child;
+               struct code_node *rest, *tail;
+               int depth;
+
+               if (sect == NULL)
+                       continue;
+
+               rest = node->next;
+               depth = node->indent + sect->indent;
+
+               /* Move the child's list in after this node. */
+               node->next = sect->code;
+               sect->code = NULL;
+               node->child = NULL;
+
+               /* Indent each spliced node, then re-attach what used
+                * to follow this node.
+                */
+               tail = node;
+               while (tail->next != NULL) {
+                       tail = tail->next;
+                       tail->indent = depth;
+               }
+               tail->next = rest;
+       }
+}
+
+#line 239 "../mdcode.mdc"
+/*
+ * Release every node on the list starting at 'code'.  A node that still
+ * carries a child reference is linearized first, so the nodes of the
+ * referenced section join the chain and are freed as well.
+ */
+void code_free(struct code_node *code)
+{
+       struct code_node *doomed;
+
+       for (doomed = code; doomed != NULL; doomed = code) {
+               if (doomed->child != NULL)
+                       code_linearize(doomed);
+               /* read 'next' only after any splice has happened */
+               code = doomed->next;
+               free(doomed);
+       }
+}
+
+#line 268 "../mdcode.mdc"
+/*
+ * Append a text-only node holding 'txt' (which starts on source line
+ * 'line_no') to the end of section 'where'.  Empty text is ignored.
+ * The malloc() result is not checked.
+ */
+static void code_add_text(struct psection *where, struct text txt,
+                         int line_no)
+{
+       struct code_node *node;
+
+       if (!txt.len)
+               return;
+
+       node = malloc(sizeof(*node));
+       node->next = NULL;
+       node->child = NULL;
+       node->line_no = line_no;
+       node->indent = 0;
+       node->code = txt;
+
+       /* First node becomes the head, later ones chain off the tail. */
+       if (where->last == NULL)
+               where->code = node;
+       else
+               where->last->next = node;
+       where->last = node;
+}
+
+#line 290 "../mdcode.mdc"
+/*
+ * Record that section 'where' references section 'to' at the given
+ * indent.  The reference is stored in the last node of 'where' when
+ * that node has no child yet; otherwise a new, text-less node is
+ * appended to carry it.  'to' remembers the indent and counts the
+ * reference.  The malloc() result is not checked.
+ */
+void code_add_link(struct psection *where, struct psection *to,
+                  int indent)
+{
+       struct code_node *n;
+
+       to->indent = indent;
+       to->refcnt++;   // this will be checked elsewhere
+       if (where->last && where->last->child == NULL) {
+               where->last->child = to;
+               return;
+       }
+       n = malloc(sizeof(*n));
+       /* Give the empty text a defined pointer as well as a zero
+        * length, so later readers never copy an indeterminate value.
+        */
+       n->code.txt = NULL;
+       n->code.len = 0;
+       n->indent = 0;
+       n->line_no = 0;
+       n->next = NULL;
+       n->child = to;
+       if (where->last)
+               where->last->next = n;
+       else
+               where->code = n;
+       where->last = n;
+}
+
+#line 329 "../mdcode.mdc"
+/*
+ * Order two texts by length first and by content second.
+ * Returns a negative, zero or positive value like strcmp().
+ */
+static int text_cmp(struct text a, struct text b)
+{
+       int delta = a.len - b.len;
+
+       return delta ? delta : strncmp(a.txt, b.txt, a.len);
+}
+
+/*
+ * Look 'name' up in the list at *list, which is kept sorted by
+ * text_cmp().  When no section of that name exists an empty one is
+ * created and inserted at its sorted position.  The malloc() result is
+ * not checked.
+ */
+static struct psection *section_find(struct psection **list, struct text name)
+{
+       struct psection *fresh;
+
+       for (; *list != NULL; list = (struct psection **)&((*list)->next)) {
+               int order = text_cmp((*list)->section, name);
+
+               if (order == 0)
+                       return *list;
+               if (order > 0)
+                       break;
+       }
+
+       /* Not present: insert a new, empty section in front of *list */
+       fresh = malloc(sizeof(*fresh));
+       fresh->section = name;
+       fresh->code = NULL;
+       fresh->last = NULL;
+       fresh->refcnt = 0;
+       fresh->indent = 0;
+       fresh->next = *list;
+       *list = fresh;
+       return fresh;
+}
+
+#line 410 "../mdcode.mdc"
+/*
+ * Return the first position in [pos, end) that is neither a space nor
+ * a tab, or 'end' if there is none.
+ */
+static char *skip_lws(char *pos, char *end)
+{
+       for (; pos < end; pos++) {
+               if (*pos != ' ' && *pos != '\t')
+                       break;
+       }
+       return pos;
+}
+
+/*
+ * Return the position just past the next newline in [pos, end), or
+ * 'end' when the rest of the buffer contains no newline.
+ */
+static char *skip_line(char *pos, char *end)
+{
+       while (pos < end) {
+               if (*pos++ == '\n')
+                       break;
+       }
+       return pos;
+}
+
+/*
+ * Skip one paragraph starting at 'pos' and return the position after it,
+ * adding the number of lines consumed to *line_no.
+ *
+ * A line starting with '#' is a complete paragraph on its own.  Any
+ * other paragraph runs until a blank (white-space only) line, a line
+ * starting with '#', or the end of the buffer.  Only one trailing blank
+ * line is consumed, so the result may point at a further blank line.
+ * No byte at or beyond 'end' is read.
+ */
+static char *skip_para(char *pos, char *end, int *line_no)
+{
+       if (pos < end && *pos == '#') {
+               pos = skip_line(pos, end);
+               (*line_no) += 1;
+               return pos;
+       }
+       while (pos < end && *pos != '#') {
+               pos = skip_lws(pos, end);
+               /* Stop at a blank line, or when only white space was
+                * left before the end of the buffer.
+                */
+               if (pos >= end || *pos == '\n')
+                       break;
+               pos = skip_line(pos, end);
+               (*line_no) += 1;
+       }
+       if (pos < end && *pos == '\n') {
+               pos++;
+               (*line_no) += 1;
+       }
+       return pos;
+}
+
+#line 466 "../mdcode.mdc"
+/*
+ * Extract the name from the heading (or reference) line at 'pos'.
+ * Leading '#' characters and the spaces after them are skipped, and
+ * trailing '#' and space characters before the end of the line are
+ * dropped.  The returned text points into the original buffer.
+ */
+static struct text take_header(char *pos, char *end)
+{
+       struct text name;
+       char *tail;
+
+       for (; pos < end && *pos == '#'; pos++)
+               ;
+       for (; pos < end && *pos == ' '; pos++)
+               ;
+       name.txt = pos;
+
+       for (tail = pos; tail < end && *tail != '\n'; tail++)
+               ;
+       for (; tail > name.txt; tail--) {
+               if (tail[-1] != '#' && tail[-1] != ' ')
+                       break;
+       }
+       name.len = tail - name.txt;
+       return name;
+}
+
+/*
+ * Report whether the line at 'pos' starts a list paragraph: a leading
+ * '-', '*' or '+', or a run of digits followed by '.'.
+ * Returns 1 for a list start and 0 otherwise (including an empty range).
+ */
+static int is_list(char *pos, char *end)
+{
+       if (pos >= end)
+               return 0;
+       /* Compare explicitly: strchr() would also match a NUL byte */
+       if (*pos == '-' || *pos == '*' || *pos == '+')
+               return 1;
+       /* <ctype.h> functions need an unsigned char value */
+       if (isdigit((unsigned char)*pos)) {
+               while (pos < end && isdigit((unsigned char)*pos))
+                       pos += 1;
+               if (pos < end && *pos == '.')
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * Report whether the text at 'pos' begins with 'start'.
+ *
+ * With start == NULL the test is for a code indent: a tab or four
+ * spaces which must be followed by at least one more byte.  With an
+ * explicit marker (a code fence) the marker may end exactly at 'end',
+ * so a closing fence on an unterminated last line is still recognised.
+ * Returns 1 on a match, 0 otherwise.
+ */
+static int matches(char *start, char *pos, char *end)
+{
+       size_t want;
+
+       if (pos >= end)
+               return 0;
+       if (start == NULL)
+               /* shrinking the range by one demands a byte after the indent */
+               return matches("\t", pos, end - 1) ||
+                      matches("    ", pos, end - 1);
+       want = strlen(start);
+       /* compare lengths rather than forming a pointer beyond 'end' */
+       return ((size_t)(end - pos) >= want &&
+               strncmp(pos, start, want) == 0);
+}
+
+#line 538 "../mdcode.mdc"
+/*
+ * Measure the white space in [sol, p): each space counts 1 and each
+ * tab counts 8; any other character counts nothing.
+ */
+static int count_space(char *sol, char *p)
+{
+       int width = 0;
+
+       for (; sol < p; sol++) {
+               switch (*sol) {
+               case ' ':
+                       width += 1;
+                       break;
+               case '\t':
+                       width += 8;
+                       break;
+               default:
+                       break;
+               }
+       }
+       return width;
+}
+
+
+/*
+ * Collect one block of code starting at 'pos' into the section called
+ * 'section', creating that section in *table if necessary.
+ *
+ * marker    closing fence ("```" or "~~~") for fenced code, or NULL for
+ *           indented code, which ends at the first non-blank line that
+ *           is not indented.
+ * line_nop  in: line number of 'pos'; out: line number of the result.
+ *
+ * Lines whose first non-blank text is "##" are references: the text
+ * gathered so far is stored, and a link to the named section is added
+ * with the indent of the marker (measured after the code indent itself
+ * for indented code).  Returns the position after the code, and after
+ * the closing fence line for fenced code.
+ */
+static char *take_code(char *pos, char *end, char *marker,
+                      struct psection **table, struct text section,
+                      int *line_nop)
+{
+       char *start = pos;
+       int line_no = *line_nop;
+       int start_line = line_no;
+       struct psection *sect;
+
+       sect = section_find(table, section);
+
+       while (pos < end) {
+               char *sol, *t;
+               struct text ref;
+
+               if (marker && matches(marker, pos, end))
+                       break;
+               if (!marker) {
+                       char *nb = skip_lws(pos, end);
+                       /* White space running to the end of the buffer
+                        * is treated like a blank line.
+                        */
+                       if (nb < end && *nb != '\n' &&
+                           !matches(NULL, pos, end))
+                               /* Paragraph not indented */
+                               break;
+               }
+
+               /* Still in code - check for reference */
+               sol = pos;
+               if (!marker) {
+                       /* Step over the indent that marks this as code.
+                        * This must be a bounded prefix test: the buffer
+                        * is not NUL terminated, so strcmp() is wrong.
+                        */
+                       if (*sol == '\t')
+                               sol++;
+                       else if (end - sol >= 4 &&
+                                strncmp(sol, "    ", 4) == 0)
+                               sol += 4;
+               }
+               t = skip_lws(sol, end);
+               if (end - t < 2 || t[0] != '#' || t[1] != '#') {
+                       /* Just regular code here */
+                       pos = skip_line(sol, end);
+                       line_no++;
+                       continue;
+               }
+
+               /* A reference: flush the text before it, then link */
+               if (pos > start) {
+                       struct text txt;
+                       txt.txt = start;
+                       txt.len = pos - start;
+                       code_add_text(sect, txt, start_line);
+               }
+               ref = take_header(t, end);
+               if (ref.len) {
+                       struct psection *refsec = section_find(table, ref);
+                       code_add_link(sect, refsec, count_space(sol, t));
+               }
+               pos = skip_line(t, end);
+               line_no++;
+               start = pos;
+               start_line = line_no;
+       }
+       if (pos > start) {
+               struct text txt;
+               txt.txt = start;
+               txt.len = pos - start;
+               code_add_text(sect, txt, start_line);
+       }
+       if (marker) {
+               /* step over the closing fence line */
+               pos = skip_line(pos, end);
+               line_no++;
+       }
+       *line_nop = line_no;
+       return pos;
+}
+
+#line 630 "../mdcode.mdc"
+/*
+ * Walk the markdown text in [pos, end) paragraph by paragraph and
+ * collect every code block into the section named by the most recent
+ * heading.  Returns the table of sections that were found or referenced.
+ * The order of the tests below is significant.
+ */
+static struct psection *code_find(char *pos, char *end)
+{
+       struct psection *table = NULL;
+       int in_list = 0;        /* set while inside a list: indented paragraphs are then not code */
+       int line_no = 1;
+       struct text section = {0};
+
+       while (pos < end) {
+               if (pos[0] == '#') {
+                       /* heading: names the section for following code */
+                       section = take_header(pos, end);
+                       in_list = 0;
+                       pos = skip_line(pos, end);
+                       line_no++;
+               } else if (is_list(pos, end)) {
+                       in_list = 1;
+                       pos = skip_para(pos, end, &line_no);
+               } else if (!in_list && matches(NULL, pos, end)) {
+                       /* indented code paragraph */
+                       pos = take_code(pos, end, NULL, &table,
+                                       section, &line_no);
+               } else if (matches("```", pos, end)) {
+                       /* fenced code: skip the opening fence line first */
+                       in_list = 0;
+                       pos = skip_line(pos, end);
+                       line_no++;
+                       pos = take_code(pos, end, "```", &table,
+                                       section, &line_no);
+               } else if (matches("~~~", pos, end)) {
+                       in_list = 0;
+                       pos = skip_line(pos, end);
+                       line_no++;
+                       pos = take_code(pos, end, "~~~", &table,
+                                       section, &line_no);
+               } else {
+                       /* any other paragraph; one that does not start
+                        * with white space ends a list
+                        */
+                       if (!isspace(*pos))
+                               in_list = 0;
+                       pos = skip_para(pos, end, &line_no);
+               }
+       }
+       return table;
+}
+
+#line 690 "../mdcode.mdc"
+/*
+ * Extract all code from the markdown text in [pos, end).
+ *
+ * Returns the list of root sections (those never referenced by another
+ * section), each with its code already linearized.  Referenced sections
+ * are freed here once the roots have been assembled.  'error' is called
+ * with a message for every section that is referenced but has no code,
+ * and for every section that is referenced more than once.
+ */
+struct section *code_extract(char *pos, char *end, code_err_fn error)
+{
+       struct psection *table;
+       struct section *result = NULL;
+       struct section *tofree = NULL;
+
+       table = code_find(pos, end);
+
+       while (table) {
+               struct psection *t = (struct psection*)table->next;
+               if (table->last == NULL) {
+                       char *msg;
+                       /* asprintf() leaves msg undefined on failure, so
+                        * it must not be used or freed in that case.
+                        */
+                       if (asprintf(&msg,
+                               "Section \"%.*s\" is referenced but not declared",
+                                table->section.len, table->section.txt) >= 0) {
+                               error(msg);
+                               free(msg);
+                       } else
+                               error("Out of memory reporting an undeclared section");
+               }
+               if (table->refcnt == 0) {
+                       /* Root-section,  return it */
+                       table->next = result;
+                       result = table;
+                       code_linearize(result->code);
+               } else {
+                       table->next = tofree;
+                       tofree = table;
+                       if (table->refcnt > 1) {
+                               char *msg;
+                               if (asprintf(&msg,
+                                        "Section \"%.*s\" referenced multiple times (%d).",
+                                        table->section.len, table->section.txt,
+                                        table->refcnt) >= 0) {
+                                       error(msg);
+                                       free(msg);
+                               } else
+                                       error("Out of memory reporting a repeated reference");
+                       }
+               }
+               table = t;
+       }
+       while (tofree) {
+               struct section *t = tofree->next;
+               free(tofree);
+               tofree = t;
+       }
+       return result;
+}
+
+#line 109 "../mdcode.mdc"
+
diff --git a/csrc/boot-strap/md2c.c b/csrc/boot-strap/md2c.c

new file mode 100644 (file)

index 0000000..2b18737
--- /dev/null
+++ b/csrc/boot-strap/md2c.c
@@ -0,0 +1,133 @@
+#line 119 "../mdcode.mdc"
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "mdcode.h"
+
+#line 849 "../mdcode.mdc"
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <stdio.h>
+
+#line 771 "../mdcode.mdc"
+/*
+ * Write the linearized code list 'node' to 'out'.
+ *
+ * Every non-empty node is preceded by a "#line" directive naming
+ * 'fname' and the node's line number.  Each line is prefixed with the
+ * node's indent.  When the node's text starts with white space, one
+ * leading tab or four leading spaces are removed from each line.
+ */
+static void code_print(FILE *out, struct code_node *node,
+                      char *fname)
+{
+       for (; node; node = node->next) {
+               char *c;
+               int len = node->code.len;
+               int undent = 0;
+
+               if (!len)
+                       continue;
+               /* only look at the text pointer of non-empty nodes */
+               c = node->code.txt;
+
+               fprintf(out, "#line %d \"%s\"\n",
+                       node->line_no, fname);
+               if (*c == ' ' || *c == '\t')
+                       undent = 1;
+               while (len && *c) {
+                       fprintf(out, "%*s", node->indent, "");
+                       if (undent) {
+                               if (*c == '\t' && len > 1) {
+                                       c++;
+                                       len--;
+                               } else if (len > 4 &&
+                                          strncmp(c, "    ", 4) == 0) {
+                                       /* length is checked first so the
+                                        * compare stays inside the text
+                                        */
+                                       c += 4;
+                                       len -= 4;
+                               }
+                       }
+                       /* copy one line, including its newline */
+                       do {
+                               fputc(*c, out);
+                               c++;
+                               len--;
+                       } while (len && c[-1] != '\n');
+               }
+       }
+}
+
+#line 821 "../mdcode.mdc"
+/*
+ * Copy the file name out of a "File:" section title into 'name', a
+ * buffer of 'space' bytes.  Spaces after the prefix are skipped and
+ * the name is truncated to fit.  'name' is left empty when the title
+ * does not start with "File:".
+ */
+static void copy_fname(char *name, int space, struct text t)
+{
+       char *src = t.txt;
+       int remaining = t.len;
+
+       name[0] = 0;
+       if (remaining < 5 || strncmp(src, "File:", 5) != 0)
+               return;
+
+       src += 5;
+       remaining -= 5;
+       for (; remaining && src[0] == ' '; src++)
+               remaining--;
+
+       if (remaining >= space)
+               remaining = space - 1;
+       strncpy(name, src, remaining);
+       name[remaining] = 0;
+}
+
+#line 857 "../mdcode.mdc"
+/* Number of errors reported so far; decides the exit status. */
+static int errs;
+
+/* Error callback: count the error and print 'msg' on stderr as one line. */
+static void pr_err(char *msg)
+{
+       fputs(msg, stderr);
+       fputc('\n', stderr);
+       errs += 1;
+}
+
+/*
+ * md2c entry point: map the markdown file named by argv[1], extract its
+ * code and write every root section titled "File: name" to that file.
+ * Root sections titled "Example:" are skipped; any other root section
+ * is an error.  Exits with 0 on success, 1 if any error was reported,
+ * and 2 on incorrect usage.
+ */
+int main(int argc, char *argv[])
+{
+       int fd;
+       off_t size;
+       size_t len;
+       char *file;
+       struct section *table, *s, *prev;
+
+       errs = 0;
+       if (argc != 2) {
+               fprintf(stderr, "Usage: mdcode file.mdc\n");
+               exit(2);
+       }
+       fd = open(argv[1], O_RDONLY);
+       if (fd < 0) {
+               fprintf(stderr, "mdcode: cannot open %s: %s\n",
+                       argv[1], strerror(errno));
+               exit(1);
+       }
+       size = lseek(fd, 0, SEEK_END);
+       if (size < 0) {
+               fprintf(stderr, "mdcode: cannot find size of %s: %s\n",
+                       argv[1], strerror(errno));
+               exit(1);
+       }
+       len = size;
+       if (len == 0)
+               /* mmap() rejects a zero length; nothing to extract */
+               exit(0);
+       file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+       if (file == MAP_FAILED) {
+               fprintf(stderr, "mdcode: cannot map %s: %s\n",
+                       argv[1], strerror(errno));
+               exit(1);
+       }
+       table = code_extract(file, file+len, pr_err);
+
+       /* The increment expression frees each section after use, so
+        * 'continue' below still releases the current one.
+        */
+       for (s = table; s;
+               (code_free(s->code), prev = s, s = s->next, free(prev))) {
+               FILE *fl;
+               char fname[1024];
+               /* Code found before any heading has no title text */
+               char *title = s->section.txt ? s->section.txt : "";
+               int tlen = s->section.txt ? s->section.len : 0;
+
+               /* Titles are not NUL terminated: check the length too */
+               if (tlen >= 8 && strncmp(title, "Example:", 8) == 0)
+                       continue;
+               if (tlen < 5 || strncmp(title, "File:", 5) != 0) {
+                       fprintf(stderr, "Unreferenced section is not a file name: %.*s\n",
+                               tlen, title);
+                       errs++;
+                       continue;
+               }
+               copy_fname(fname, sizeof(fname), s->section);
+               if (fname[0] == 0) {
+                       fprintf(stderr, "Missing file name at:%.*s\n",
+                               tlen, title);
+                       errs++;
+                       continue;
+               }
+               fl = fopen(fname, "w");
+               if (!fl) {
+                       fprintf(stderr, "Cannot create %s: %s\n",
+                               fname, strerror(errno));
+                       errs++;
+                       continue;
+               }
+               code_print(fl, s->code, argv[1]);
+               /* fclose() flushes, so this is where write errors show */
+               if (fclose(fl) != 0) {
+                       fprintf(stderr, "Cannot write %s: %s\n",
+                               fname, strerror(errno));
+                       errs++;
+               }
+       }
+       exit(!!errs);
+}
+
+#line 126 "../mdcode.mdc"
+
diff --git a/csrc/boot-strap/mdcode.h b/csrc/boot-strap/mdcode.h

new file mode 100644 (file)

index 0000000..6a17e44
--- /dev/null
+++ b/csrc/boot-strap/mdcode.h
@@ -0,0 +1,32 @@
+#line 158 "../mdcode.mdc"
+/* A run of characters given as pointer plus length; not NUL terminated. */
+struct text {
+       char *txt;      /* first character */
+       int len;        /* number of characters */
+};
+
+/* A named section together with the code that belongs to it. */
+struct section {
+       struct text section;            /* the section name */
+       struct code_node *code;         /* head of the section's code list */
+       struct section *next;           /* next section in the list */
+};
+
+/* One piece of a section's code: some text, a reference, or both. */
+struct code_node {
+       struct text code;               /* the text; len is 0 for a reference-only node */
+       int indent;                     /* extra indent for this text, set by linearizing */
+       int line_no;                    /* line in the document where the text starts */
+       struct code_node *next;         /* next node in the list */
+       struct section *child;          /* referenced section, or NULL */
+};
+
+#line 687 "../mdcode.mdc"
+/* Callback used by code_extract() to report one error message. */
+typedef void (*code_err_fn)(char *msg);
+
+#line 253 "../mdcode.mdc"
+/* Free a list of code nodes, including nodes reached through child references. */
+void code_free(struct code_node *code);
+
+#line 738 "../mdcode.mdc"
+/* Extract the code from the markdown text in [pos, end); returns the list
+ * of unreferenced (root) sections and reports problems through 'error'.
+ */
+struct section *code_extract(char *pos, char *end, code_err_fn error);
+
+
+#line 98 "../mdcode.mdc"
+
diff --git a/csrc/boot-strap/mdcode.mk b/csrc/boot-strap/mdcode.mk

new file mode 100644 (file)

index 0000000..22a35a1
--- /dev/null
+++ b/csrc/boot-strap/mdcode.mk
@@ -0,0 +1,23 @@
+#line 88 "../mdcode.mdc"
+CFLAGS += -Wall -g
+all::
+mdcode.h libmdcode.c md2c.c mdcode.mk :  mdcode.mdc
+       ./md2c mdcode.mdc
+
+
+#line 112 "../mdcode.mdc"
+all :: libmdcode.o
+libmdcode.o : libmdcode.c mdcode.h
+       $(CC) $(CFLAGS) -c libmdcode.c
+
+
+#line 129 "../mdcode.mdc"
+all :: md2c
+md2c : md2c.o libmdcode.o
+       $(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o
+md2c.o : md2c.c mdcode.h
+       $(CC) $(CFLAGS) -c md2c.c
+
+#line 192 "../mdcode.mdc"
+CFLAGS += -fplan9-extensions
+
diff --git a/csrc/mdcode.mdc b/csrc/mdcode.mdc

new file mode 100644 (file)

index 0000000..dc78a12
--- /dev/null
+++ b/csrc/mdcode.mdc
@@ -0,0 +1,917 @@
+# mdcode: extract C code from a _markdown_ file.
+
+_markdown_ is a popular format for simple text markup which can easily
+be converted to HTML.  As it allows easy indication of sections of
+code, it is quite suitable for use in literate programming.  This file
+is an example of that usage.
+
+The code included below provides two related functionalities.
+Firstly it provides a library routine for extracting code out of a
+_markdown_ file, so that other routines might make use of it.
+
+Secondly it provides a simple client of this routine which extracts
+1 or more C-language files from a markdown document so they can be
+passed to a C compiler.  These two combine to make a tool that is needed
+to compile this tool.  Yes, this is circular.  A prototype tool was
+used for the first extraction.
+
+The tool provided is described as specific to the C language as it
+generates
+
+##### Example: a _line_ command
+
+       #line __line-number__ __file-name__
+
+lines so that the C compiler will report where in the markdown file
+any error is found.  This tool is suitable for any other language
+which allows the same directive, or will treat it as a comment.
+
+## Literate Details
+
+Literate programming is more than just including comments with the
+code, even nicely formatted comments.  It also involves presenting the
+code in an order that makes sense to a human, rather than an order
+that makes sense to a compiler.  For this reason a core part of any
+literate programming tool is the ability to re-arrange the code found
+in the document into a different order in the final code file - or
+files.  This requires some form of linkage to be encoded.
+
+The approach taken here is focused around section headings - of any
+depth.
+
+All the code in any section is treated as a single sequential
+collection of code, and is named by the section that it is in.  If
+multiple sections have the same name, then the code blocks in all of
+them are joined together in the order they appear in the document.
+
+A code section can contain a special marker which starts with 2
+hashes: __##__.
+The text after the marker must be the name of some section which
+contains code.  Code from that section will be interpolated in place
+of the marker, and will be indented to match the indent of the marker.
+
+It is not permitted for the same code to be interpolated multiple
+times.  Allowing this might make some sense, but it is probably a
+mistake, and prohibiting it makes some of the code a bit cleaner.
+
+Equally, every section of code should be interpolated at least once -
+with two exceptions.  These exceptions are imposed by the tool, not
+the library.  A different client could impose different rules on the
+names of top-level code sections.
+
+The first exception we have already seen.  A section name starting
+__Example:__ indicates code that is not to be included in the final product.
+
+The second exception is for the top level code sections which will be
+written to files.  Again these are identified by their section name.
+This must start with __File:__; the following text (after optional
+spaces) will be used as a file name.
+
+Any section containing code that does not start __Example:__ or
+__File:__ must be included in some other section exactly once.
+
+### Multiple files
+
+Allowing multiple top level code sections which name different files
+means that one _markdown_ document can describe several files.  This
+is very useful with the C language where a program file and a header
+file might be related.  For the present document we will have a header
+file and two code files, one with the library content and one for the
+tool.
+
+It will also be very convenient to create a `makefile` fragment to
+ensure the code is compiled correctly.  A simple `make -f mdcode.mk`
+will "do the right thing".
+
+### File: mdcode.mk
+
+       CFLAGS += -Wall -g
+       all::
+       mdcode.h libmdcode.c md2c.c mdcode.mk :  mdcode.mdc
+               ./md2c mdcode.mdc
+
+
+### File: mdcode.h
+
+       ## exported types
+       ## exported functions
+
+### File: libmdcode.c
+       #define _GNU_SOURCE
+       #include <unistd.h>
+       #include <stdlib.h>
+       #include <stdio.h>
+
+       #include "mdcode.h"
+       ## internal includes
+       ## private types
+       ## internal functions
+
+### File: mdcode.mk
+
+       all :: libmdcode.o
+       libmdcode.o : libmdcode.c mdcode.h
+               $(CC) $(CFLAGS) -c libmdcode.c
+
+
+### File: md2c.c
+
+       #include <unistd.h>
+       #include <stdlib.h>
+
+       #include "mdcode.h"
+
+       ## client includes
+       ## client functions
+
+### File: mdcode.mk
+
+       all :: md2c
+       md2c : md2c.o libmdcode.o
+               $(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o
+       md2c.o : md2c.c mdcode.h
+               $(CC) $(CFLAGS) -c md2c.c
+
+## Data Structures
+
+As the core purpose of _mdcode_ is to discover and re-arrange blocks
+of text, it makes sense to map the whole document file into memory and
+produce a data structure which lists various parts of the file in the
+appropriate order.  Each node in this structure will have some text
+from the document, a child pointer, and a next pointer, any of which
+might not be present.  The text is most easily stored as a pointer and a
+length.  We'll call this a `text`.
+
+A list of these `code_nodes` will belong to each section and it will
+be useful to have a separate `section` data structure to store the
+list of `code_nodes`, the section name, and some other information.
+
+This other information will include a reference counter so we can
+ensure proper referencing, and an `indent` depth.  As referenced
+content can have an extra indent added, we need to know what that is.
+The `code_node` will also have an `indent` depth which eventually gets
+set to the sum for the indents from all references on the path from
+the root.
+
+##### exported types
+
+       /* A run of characters given as pointer plus length; not NUL terminated. */
+       struct text {
+               char *txt;
+               int len;
+       };
+
+       /* A named section together with the code that belongs to it. */
+       struct section {
+               struct text section;            /* the section name */
+               struct code_node *code;         /* head of the section's code list */
+               struct section *next;           /* next section in the list */
+       };
+
+       /* One piece of a section's code: some text, a reference, or both. */
+       struct code_node {
+               struct text code;               /* the text; len is 0 for a reference-only node */
+               int indent;                     /* extra indent for this text, set by linearizing */
+               int line_no;                    /* line in the document where the text starts */
+               struct code_node *next;         /* next node in the list */
+               struct section *child;          /* referenced section, or NULL */
+       };
+
+##### private types
+
+       struct psection {
+               struct section;         /* anonymous embed, kept first: code casts struct section * to this type */
+               struct code_node *last; /* tail of the code list; NULL until a node is added */
+               int refcnt;             /* incremented by code_add_link() for every reference */
+               int indent;             /* indent recorded by the most recent reference */
+       };
+
+You will note that the `struct psection` contains an anonymous `struct
+section` embedded at the start.  To make this work right, GCC
+requires the `-fplan9-extensions` flag.
+
+##### File: mdcode.mk
+
+       CFLAGS += -fplan9-extensions
+
+### Manipulating the node
+
+Though a tree with `next` and `child` links is the easiest way to
+assemble the various code sections, it is not the easiest form for
+using them.  For that a simple list would be best.
+
+So once we have a fully linked File section we will want to linearize
+it, so that the `child` links become `NULL` and the `next` links will
+find everything required.  It is at this stage that the requirement
+that each section is linked only once becomes important.
+
+`code_linearize` will merge the `code_node`s from any child into the
+given `code_node`.  As it does this it sets the 'indent' field for
+each `code_node`.
+
+Note that we don't clear the section's `last` pointer, even though
+it no longer owns any code.  This allows subsequent code to see if a
+section ever had any code, and to report an error if a section is
+referenced but not defined.
+
+##### internal functions
+
+       static void code_linearize(struct code_node *code)
+       {
+               struct code_node *node;
+
+               /* Indents are recomputed from scratch on every call. */
+               node = code;
+               while (node != NULL) {
+                       node->indent = 0;
+                       node = node->next;
+               }
+
+               /* Spliced nodes are visited later by this same walk, so
+                * nested references are flattened too.
+                */
+               for (node = code; node != NULL; node = node->next) {
+                       struct psection *sect = (struct psection *)node->child;
+                       struct code_node *rest, *tail;
+                       int depth;
+
+                       if (sect == NULL)
+                               continue;
+
+                       rest = node->next;
+                       depth = node->indent + sect->indent;
+
+                       /* Move the child's list in after this node. */
+                       node->next = sect->code;
+                       sect->code = NULL;
+                       node->child = NULL;
+
+                       /* Indent each spliced node, then re-attach what
+                        * used to follow this node.
+                        */
+                       tail = node;
+                       while (tail->next != NULL) {
+                               tail = tail->next;
+                               tail->indent = depth;
+                       }
+                       tail->next = rest;
+               }
+       }
+
+Once a client has made use of a linearized code set, it will probably
+want to free it.
+
+       void code_free(struct code_node *code)
+       {
+               struct code_node *doomed;
+
+               for (doomed = code; doomed != NULL; doomed = code) {
+                       /* pull in any referenced nodes so they are freed too */
+                       if (doomed->child != NULL)
+                               code_linearize(doomed);
+                       /* read 'next' only after any splice has happened */
+                       code = doomed->next;
+                       free(doomed);
+               }
+       }
+
+##### exported functions
+
+       /* Free a list of code nodes, including nodes reached through child references. */
+       void code_free(struct code_node *code);
+
+### Building the tree
+
+As we parse the document there are two things we will want to do to
+node trees: add some text or add a reference.  We'll assume for now
+that the relevant section structures have been found, and will just
+deal with the `code_node`.
+
+Adding text simply means adding another node.  We will never have
+empty nodes, even if the last node only has a child, new text must go
+in a new node.
+
+##### internal functions
+
+       static void code_add_text(struct psection *where, struct text txt,
+                                 int line_no)
+       {
+               struct code_node *node;
+
+               /* empty text never gets a node */
+               if (!txt.len)
+                       return;
+
+               node = malloc(sizeof(*node));
+               node->next = NULL;
+               node->child = NULL;
+               node->line_no = line_no;
+               node->indent = 0;
+               node->code = txt;
+
+               /* First node becomes the head, later ones chain off the tail. */
+               if (where->last == NULL)
+                       where->code = node;
+               else
+                       where->last->next = node;
+               where->last = node;
+       }
+
+However when adding a link, we might be able to include it in the last
+`code_node` if it currently only has text.
+
+       void code_add_link(struct psection *where, struct psection *to,
+                          int indent)
+       {
+               struct code_node *n;
+
+               to->indent = indent;
+               to->refcnt++;   // this will be checked elsewhere
+               if (where->last && where->last->child == NULL) {
+                       /* the last node has no reference yet: reuse it */
+                       where->last->child = to;
+                       return;
+               }
+               n = malloc(sizeof(*n));
+               /* Give the empty text a defined pointer as well as a
+                * zero length, so later readers never copy an
+                * indeterminate value.
+                */
+               n->code.txt = NULL;
+               n->code.len = 0;
+               n->indent = 0;
+               n->line_no = 0;
+               n->next = NULL;
+               n->child = to;
+               if (where->last)
+                       where->last->next = n;
+               else
+                       where->code = n;
+               where->last = n;
+       }
+
+### Finding sections
+
+Now we need a lookup table to be able to find sections by name.
+Something that provides an `n*log(N)` search time is probably
+justified, but for now I want a minimal stand-alone program so a
+linked list managed by insertion-sort will do.  As a comparison
+function it is easiest to sort based on length before content.  So
+sections won't be in standard lexical order, but that isn't important.
+
+If we cannot find a section, we simply want to create it.  This allows
+sections and references to be created in any order.  Sections with
+no references or no content will cause a warning eventually.
+
+#### internal functions
+
+       /* Order by length first, then by content; result is like strcmp(). */
+       static int text_cmp(struct text a, struct text b)
+       {
+               int delta = a.len - b.len;
+
+               return delta ? delta : strncmp(a.txt, b.txt, a.len);
+       }
+
+       static struct psection *section_find(struct psection **list, struct text name)
+       {
+               struct psection *fresh;
+
+               for (; *list != NULL; list = (struct psection **)&((*list)->next)) {
+                       int order = text_cmp((*list)->section, name);
+
+                       if (order == 0)
+                               return *list;
+                       if (order > 0)
+                               break;
+               }
+
+               /* Not present: insert a new, empty section in front of *list */
+               fresh = malloc(sizeof(*fresh));
+               fresh->section = name;
+               fresh->code = NULL;
+               fresh->last = NULL;
+               fresh->refcnt = 0;
+               fresh->indent = 0;
+               fresh->next = *list;
+               *list = fresh;
+               return fresh;
+       }
+
+## Parsing the _markdown_
+
+Parsing markdown is fairly easy, though there are complications.
+
+The document is divided into "paragraphs" which are mostly separated by blank
+lines (which may contain white space).  The first few characters of
+the first line of a paragraph determine the type of paragraph.  For
+our purposes we are only interested in list paragraphs, code
+paragraphs, section headings, and everything else.  Section headings
+are single-line paragraphs and so do not require a preceding or
+following blank line.
+
+Section headings start with 1 or more hash characters (__#__).  List
+paragraphs start with hyphen, asterisk, plus, or digits followed by a
+period.  Code paragraphs aren't quite so easy.
+
+The "standard" code paragraph starts with 4 or more spaces, or a tab.
+However if the previous paragraph was a list paragraph, then those
+spaces indicate another  paragraph in the same list item, and 8 or
+more spaces are required.  Unless a nested list is in effect, in
+which case 12 or more are needed.  Unfortunately not all _markdown_
+parsers agree on nested lists.
+
+Two alternate styles for marking code are in active use.  "Github" uses
+three backticks(_`` ``` ``_), while "pandoc" uses three or more tildes
+(_~~~_).  In these cases the code should not be indented.
+
+Trying to please everyone as much as possible, this parser will handle
+everything except for code inside lists.
+
+So an indented (4+) paragraph after a list paragraph is always a list
+paragraph, otherwise it is a code paragraph.  A paragraph that starts
+with three backticks or three tildes is code which continues until a
+matching string of backticks or tildes.
+
+### Skipping bits
+
+While walking the document looking for various markers we will *not*
+use the `struct text` introduced earlier as advancing that requires
+updating both start and length which feels clumsy.  Instead we will
+carry `pos` and `end` pointers, only the first of which needs to
+change.
+
+So to start, we need to skip various parts of the document.  `lws`
+stands for "Linear White Space" and is a term that comes from the
+Email RFCs (e.g. RFC822).  `line` and `para` are self explanatory.
+Note that `skip_para` needs to update the current line number.
+`skip_line` doesn't but every caller should.
+
+#### internal functions
+
+       /* Step over spaces and tabs starting at 'pos', never going beyond
+        * 'end'.  Returns the first position which holds something else.
+        */
+       static char *skip_lws(char *pos, char *end)
+       {
+               for (; pos < end; pos++)
+                       if (*pos != ' ' && *pos != '\t')
+                               break;
+               return pos;
+       }
+
+       /* Return the start of the line following 'pos': the position just
+        * after the next newline, or 'end' if there is no newline left.
+        */
+       static char *skip_line(char *pos, char *end)
+       {
+               for (; pos < end; pos++)
+                       if (*pos == '\n')
+                               return pos + 1;
+               return pos;
+       }
+
+       /* Skip the paragraph which starts at 'pos', adding one to '*line_no'
+        * for every line passed.  A header line is a paragraph by itself.
+        * Any other paragraph runs until a blank line, a header line, or
+        * the end of the document.
+        */
+       static char *skip_para(char *pos, char *end, int *line_no)
+       {
+               /* Might return a pointer to a blank line, as only
+                * one trailing blank line is skipped
+                */
+               if (pos < end && *pos == '#') {
+                       pos = skip_line(pos, end);
+                       (*line_no) += 1;
+                       return pos;
+               }
+               while (pos < end && *pos != '#') {
+                       pos = skip_lws(pos, end);
+                       /* White space running up to the end of the document
+                        * ends the paragraph too: there is nothing left at
+                        * 'end' that may be examined.
+                        */
+                       if (pos >= end || *pos == '\n')
+                               break;
+                       pos = skip_line(pos, end);
+                       (*line_no) += 1;
+               }
+               if (pos < end && *pos == '\n') {
+                       pos++;
+                       (*line_no) += 1;
+               }
+               return pos;
+       }
+
+### Recognising things
+
+Recognising a section header is trivial and doesn't require a
+function.  However we need to extract the content of a section header
+as a `struct text` for passing to `section_find`.
+Recognising the start of a new list is fairly easy.  Recognising the
+start (and end) of code is a little messy so we provide a function for
+matching the first few characters, which has a special case for "4
+spaces or tab".
+
+#### internal includes
+
+       #include  <ctype.h>
+       #include  <string.h>
+
+#### internal functions
+
+       /* Extract the name from the header line at 'pos'.  Leading hashes
+        * and spaces are dropped, as are trailing hashes and spaces.
+        * The result points into the document and is not nul terminated.
+        */
+       static struct text take_header(char *pos, char *end)
+       {
+               struct text name;
+               char *eol;
+
+               for (; pos < end && *pos == '#'; pos++)
+                       ;
+               for (; pos < end && *pos == ' '; pos++)
+                       ;
+               name.txt = pos;
+
+               for (eol = pos; eol < end && *eol != '\n'; eol++)
+                       ;
+               /* Walk back over any hashes and spaces closing the line */
+               while (eol > name.txt &&
+                      (eol[-1] == '#' || eol[-1] == ' '))
+                       eol--;
+               name.len = eol - name.txt;
+               return name;
+       }
+
+       /* Report whether the line at 'pos' starts a list paragraph: it must
+        * begin with a hyphen, asterisk or plus, or with digits followed
+        * by a period.  Returns 1 if so, otherwise 0.
+        */
+       static int is_list(char *pos, char *end)
+       {
+               if (pos >= end)
+                       return 0;
+               /* Compare explicitly: strchr() also "finds" a nul byte */
+               if (*pos == '-' || *pos == '*' || *pos == '+')
+                       return 1;
+               /* <ctype.h> functions must be given an unsigned char value */
+               if (isdigit((unsigned char)*pos)) {
+                       while (pos < end && isdigit((unsigned char)*pos))
+                               pos += 1;
+                       if  (pos < end && *pos == '.')
+                               return 1;
+               }
+               return 0;
+       }
+
+       /* Report whether the text at 'pos' begins with the string 'start'.
+        * A NULL 'start' stands for the indent of a code paragraph:
+        * either one tab or four spaces.  Returns 1 on a match, else 0.
+        */
+       static int matches(char *start, char *pos, char *end)
+       {
+               size_t len;
+
+               if (start == NULL)
+                       return matches("\t", pos, end) ||
+                              matches("    ", pos, end);
+               len = strlen(start);
+               /* The marker only has to fit before 'end'; it may well be
+                * the very last thing in the document.
+                */
+               return (pos <= end && (size_t)(end - pos) >= len &&
+                       strncmp(pos, start, len) == 0);
+       }
+
+### Extracting the code
+
+Now that we can skip paragraphs and recognise what type each paragraph
+is, it is time to parse the file and extract the code.  We'll do this
+in two parts, first we look at what to do with some code once we
+find it, and then how to actually find it.
+
+When we have some code, we know where it is, what the end marker
+should look like, and which section it is in.
+
+There are two sorts of end markers: the presence of a particular
+string, or the absence of an indent.  We will use a string to
+represent a presence, and a `NULL` to represent the absence.
+
+While looking at code we don't think about paragraphs at all - just
+look for a line that starts with the right thing.
+Every line that is still code then needs to be examined to see if it
+is a section reference.
+
+When a section reference is found, all preceding code (if any) must be
+added to the current section, then the reference is added.
+
+When we do find the end of the code, all text that we have found but
+not processed needs to be saved too.
+
+When adding a reference we need to set the `indent`.  This is the
+number of spaces (counting 8 for tabs) after the natural indent of the
+code (which is a tab or 4 spaces).  We use a separate function `count_space`
+for that.
+
+#### internal functions
+
+       /* Measure the white space between 'sol' and 'p': each space
+        * counts as 1, each tab as 8, and anything else as nothing.
+        */
+       static int count_space(char *sol, char *p)
+       {
+               int width = 0;
+
+               for (; sol < p; sol++) {
+                       switch (*sol) {
+                       case ' ':
+                               width += 1;
+                               break;
+                       case '\t':
+                               width += 8;
+                               break;
+                       }
+               }
+               return width;
+       }
+
+
+       /* Gather one block of code, starting at 'pos', into the section
+        * named 'section' (found or created in 'table').
+        * 'marker' is the string which ends the block, or NULL when the
+        * block ends at the first non-blank line which is not indented.
+        * A line whose first non-blank text is "##" is a reference to
+        * the section it names; the code between references is added as
+        * it stands.  '*line_nop' is advanced over every line consumed.
+        * Returns the position following the block and its end marker.
+        */
+       static char *take_code(char *pos, char *end, char *marker,
+                              struct psection **table, struct text section,
+                              int *line_nop)
+       {
+               char *start = pos;
+               int line_no = *line_nop;
+               int start_line = line_no;
+               struct psection *sect;
+
+               sect = section_find(table, section);
+
+               while (pos < end) {
+                       char *sol, *t;
+                       struct text ref;
+
+                       if (marker && matches(marker, pos, end))
+                               break;
+                       if (!marker) {
+                               /* White space which runs up to the end of
+                                * the document is a blank line as well.
+                                */
+                               t = skip_lws(pos, end);
+                               if (t < end && *t != '\n' &&
+                                   !matches(NULL, pos, end))
+                                       /* Paragraph not indented */
+                                       break;
+                       }
+
+                       /* Still in code - check for reference */
+                       sol = pos;
+                       if (!marker) {
+                               /* Step over the natural indent.  The document
+                                * is not nul terminated, so compare exactly
+                                * four bytes and only when they exist.
+                                */
+                               if (*sol == '\t')
+                                       sol++;
+                               else if (end - sol >= 4 &&
+                                        strncmp(sol, "    ", 4) == 0)
+                                       sol += 4;
+                       }
+                       t = skip_lws(sol, end);
+                       if (end - t < 2 || t[0] != '#' || t[1] != '#') {
+                               /* Just regular code here */
+                               pos = skip_line(sol, end);
+                               line_no++;
+                               continue;
+                       }
+
+                       /* Save the code seen so far ahead of the reference */
+                       if (pos > start) {
+                               struct text txt;
+                               txt.txt = start;
+                               txt.len = pos - start;
+                               code_add_text(sect, txt, start_line);
+                       }
+                       ref = take_header(t, end);
+                       if (ref.len) {
+                               struct psection *refsec = section_find(table, ref);
+                               code_add_link(sect, refsec, count_space(sol, t));
+                       }
+                       pos = skip_line(t, end);
+                       line_no++;
+                       start = pos;
+                       start_line = line_no;
+               }
+               /* Whatever follows the last reference is code too */
+               if (pos > start) {
+                       struct text txt;
+                       txt.txt = start;
+                       txt.len = pos - start;
+                       code_add_text(sect, txt, start_line);
+               }
+               if (marker) {
+                       /* Step over the line holding the end marker */
+                       pos = skip_line(pos, end);
+                       line_no++;
+               }
+               *line_nop = line_no;
+               return pos;
+       }
+
+### Finding the code
+
+It is when looking for the code that we actually use the paragraph
+structure.  We need to recognise section headings so we can record the
+name, list paragraphs so we can ignore indented follow-on paragraphs,
+and the three different markings for code.
+
+#### internal functions
+
+       /* Walk the document from 'pos' to 'end' one paragraph at a time,
+        * passing every code block to take_code() together with the most
+        * recent section heading.  Indented paragraphs which follow a list
+        * paragraph are taken to belong to the list, not to be code.
+        * Returns the table of all sections found.
+        */
+       static struct psection *code_find(char *pos, char *end)
+       {
+               struct psection *table = NULL;
+               int in_list = 0;
+               int line_no = 1;
+               struct text section = {0};
+
+               while (pos < end) {
+                       if (pos[0] == '#') {
+                               section = take_header(pos, end);
+                               in_list = 0;
+                               pos = skip_line(pos, end);
+                               line_no++;
+                       } else if (is_list(pos, end)) {
+                               in_list = 1;
+                               pos = skip_para(pos, end, &line_no);
+                       } else if (!in_list && matches(NULL, pos, end)) {
+                               pos = take_code(pos, end, NULL, &table,
+                                               section, &line_no);
+                       } else if (matches("```", pos, end)) {
+                               in_list = 0;
+                               pos = skip_line(pos, end);
+                               line_no++;
+                               pos = take_code(pos, end, "```", &table,
+                                               section, &line_no);
+                       } else if (matches("~~~", pos, end)) {
+                               in_list = 0;
+                               pos = skip_line(pos, end);
+                               line_no++;
+                               pos = take_code(pos, end, "~~~", &table,
+                                               section, &line_no);
+                       } else {
+                               /* Only a paragraph which is not indented
+                                * ends a list.  <ctype.h> functions must be
+                                * given an unsigned char value.
+                                */
+                               if (!isspace((unsigned char)*pos))
+                                       in_list = 0;
+                               pos = skip_para(pos, end, &line_no);
+                       }
+               }
+               return table;
+       }
+
+### Returning the code
+
+Having found all the code blocks and gathered them into a list of
+sections, we are now ready to return them to the caller.  This is where
+to perform consistency checks, like at most one reference and at least
+one definition for each section.
+
+All the sections with no references are returned in a list for the
+caller to consider.  They are linearized first so that the substructure
+is completely hidden -- except for the small amount of structure
+displayed in the line numbers.
+
+To return errors, we have the caller pass a function which takes an
+error message - a `code_err_fn`.
+
+#### exported types
+
+       /* Called by code_extract() once for each problem it finds.
+        * 'msg' is only valid for the duration of the call.
+        */
+       typedef void (*code_err_fn)(char *msg);
+
+#### internal functions
+       /* Extract all the code from the document between 'pos' and 'end'.
+        * Sections which are never referenced are linearized and returned
+        * as a list; all other sections are freed.  'error' is called for
+        * each section which has no code, and for each section which is
+        * referenced more than once.
+        */
+       struct section *code_extract(char *pos, char *end, code_err_fn error)
+       {
+               struct psection *table;
+               struct section *result = NULL;
+               struct section *tofree = NULL;
+
+               table = code_find(pos, end);
+
+               while (table) {
+                       struct psection *t = (struct psection*)table->next;
+                       if (table->last == NULL) {
+                               char *msg;
+                               /* When asprintf() fails 'msg' is undefined and
+                                * must be neither used nor freed.
+                                */
+                               if (asprintf(&msg,
+                                       "Section \"%.*s\" is referenced but not declared",
+                                        table->section.len, table->section.txt) < 0)
+                                       error("Out of memory while reporting an error");
+                               else {
+                                       error(msg);
+                                       free(msg);
+                               }
+                       }
+                       if (table->refcnt == 0) {
+                               /* Root-section,  return it */
+                               table->next = result;
+                               result = table;
+                               code_linearize(result->code);
+                       } else {
+                               table->next = tofree;
+                               tofree = table;
+                               if (table->refcnt > 1) {
+                                       char *msg;
+                                       if (asprintf(&msg,
+                                                "Section \"%.*s\" referenced multiple times (%d).",
+                                                table->section.len, table->section.txt,
+                                                table->refcnt) < 0)
+                                               error("Out of memory while reporting an error");
+                                       else {
+                                               error(msg);
+                                               free(msg);
+                                       }
+                               }
+                       }
+                       table = t;
+               }
+               while (tofree) {
+                       struct section *t = tofree->next;
+                       free(tofree);
+                       tofree = t;
+               }
+               return result;
+       }
+
+##### exported functions
+
+       /* Extract the code from the document between 'pos' and 'end'.
+        * Returns the list of sections which nothing refers to, linked
+        * through 'next', each with its code linearized.  'error' is
+        * called with a message for every inconsistency found.
+        */
+       struct section *code_extract(char *pos, char *end, code_err_fn error);
+
+
+## Using the library
+
+Now that we can extract code from a document and link it all together
+it is time to do something with that code.  Firstly we need to print
+it out.
+
+### Printing the Code
+
+Printing is mostly straight forward - we just walk the list and print
+the code sections, adding whatever indent is required for each line.
+However there is a complication (isn't there always)?
+
+For code that was recognised because the paragraph was indented, we
+need to strip that indent first.  For other code, we don't.
+
+The approach taken here is simple, though it could arguably be wrong
+in some unlikely cases.  So it might need to be fixed later.
+
+If the first line of a code block is indented, then either one tab or
+4 spaces are stripped from every non-blank line.
+
+This could go wrong if the first line of a code block marked by
+_`` ``` ``_ is indented.  To overcome this we would need to
+record some extra state in each `code_node`.  For now we won't bother.
+
+The indents we insert will all be spaces.  This might not work well
+for `Makefiles`.
+
+##### client functions
+
+       /* Write the code in the list starting at 'node' to 'out'.
+        * Every non-empty node is preceded by a "#line" directive naming
+        * 'fname', and each of its lines is prefixed with the indent
+        * recorded in the node.  When the first character of a node is a
+        * space or tab, one tab or four spaces are removed from the
+        * start of each of its lines which has them.
+        */
+       static void code_print(FILE *out, struct code_node *node,
+                              char *fname)
+       {
+               for (; node; node = node->next) {
+                       char *c = node->code.txt;
+                       int len = node->code.len;
+                       int undent = 0;
+
+                       if (!len)
+                               continue;
+
+                       fprintf(out, "#line %d \"%s\"\n",
+                               node->line_no, fname);
+                       if (*c == ' ' || *c == '\t')
+                               undent = 1;
+                       while (len && *c) {
+                               fprintf(out, "%*s", node->indent, "");
+                               if (undent) {
+                                       /* Check the length first so that the
+                                        * comparison never looks beyond the
+                                        * text of this node.
+                                        */
+                                       if (*c == '\t' && len > 1) {
+                                               c++;
+                                               len--;
+                                       } else if (len > 4 && strncmp(c, "    ", 4) == 0) {
+                                               c += 4;
+                                               len-= 4;
+                                       }
+                               }
+                               /* Copy the rest of the line, newline included */
+                               do {
+                                       fputc(*c, out);
+                                       c++;
+                                       len--;
+                               } while (len && c[-1] != '\n');
+                       }
+               }
+       }
+
+### Bringing it all together
+
+We are just about ready for the `main` function of the tool which will
+extract all this lovely code and compile it.  Just one helper is still
+needed.
+
+#### Handling filenames
+
+Section names are stored in `struct text` which is not `nul`
+terminated.  Filenames passed to `open` need to be null terminated.
+So we need to convert one to the other, and strip the leading `File:`
+off while we are at it.
+
+##### client functions
+
+       /* Turn the section name 't' into a nul terminated file name in
+        * 'name', which has room for 'space' bytes.  The leading "File:"
+        * and any spaces after it are removed and a name which is too
+        * long is truncated.  If 't' does not start with "File:" then
+        * 'name' is left as the empty string.
+        */
+       static void copy_fname(char *name, int space, struct text t)
+       {
+               static const char prefix[] = "File:";
+               const int plen = sizeof(prefix) - 1;
+               char *src = t.txt;
+               int n = t.len;
+
+               name[0] = 0;
+               if (n < plen || strncmp(src, prefix, plen) != 0)
+                       return;
+               for (src += plen, n -= plen; n && *src == ' '; src++)
+                       n--;
+               if (n >= space)
+                       n = space - 1;
+               strncpy(name, src, n);
+               name[n] = 0;
+       }
+
+#### Main
+
+And now we take a single file name, extract the code, and if there are
+no errors we write out a file for each appropriate code section.  And
+we are done.
+
+
+##### client includes
+
+       #include <fcntl.h>
+       #include <errno.h>
+       #include <sys/mman.h>
+       #include <string.h>
+       #include <stdio.h>
+
+##### client functions
+
+       /* Number of problems reported so far */
+       static int errs;
+
+       /* Error callback: print 'msg' on a line of its own on stderr
+        * and count it.
+        */
+       static void pr_err(char *msg)
+       {
+               fprintf(stderr, "%s\n", msg);
+               errs += 1;
+       }
+
+       /* Extract the code from the single document named on the command
+        * line and write every unreferenced "File:" section to the file
+        * it names.  Unreferenced "Example:" sections are ignored.
+        * Exits with 2 on a usage error, 1 if the document cannot be read
+        * or any problem was reported, and 0 otherwise.
+        */
+       int main(int argc, char *argv[])
+       {
+               int fd;
+               off_t size;
+               size_t len;
+               char *file = NULL;
+               struct section *table = NULL, *s, *prev;
+
+               errs = 0;
+               if (argc != 2) {
+                       fprintf(stderr, "Usage: mdcode file.mdc\n");
+                       exit(2);
+               }
+               fd = open(argv[1], O_RDONLY);
+               if (fd < 0) {
+                       fprintf(stderr, "mdcode: cannot open %s: %s\n",
+                               argv[1], strerror(errno));
+                       exit(1);
+               }
+               size = lseek(fd, 0, SEEK_END);
+               if (size < 0) {
+                       fprintf(stderr, "mdcode: cannot find size of %s: %s\n",
+                               argv[1], strerror(errno));
+                       exit(1);
+               }
+               len = size;
+               if (len > 0) {
+                       /* mmap() rejects a length of zero; an empty
+                        * document simply contains no code.
+                        */
+                       file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+                       if (file == MAP_FAILED) {
+                               fprintf(stderr, "mdcode: cannot read %s: %s\n",
+                                       argv[1], strerror(errno));
+                               exit(1);
+                       }
+                       table = code_extract(file, file+len, pr_err);
+               }
+
+               /* The increment expression frees each section, so that
+                * 'continue' releases it as well.
+                */
+               for (s = table; s;
+                       (code_free(s->code), prev = s, s = s->next, free(prev))) {
+                       FILE *fl;
+                       char fname[1024];
+                       if (strncmp(s->section.txt, "Example:", 8) == 0)
+                               continue;
+                       if (strncmp(s->section.txt, "File:", 5) != 0) {
+                               fprintf(stderr, "Unreferenced section is not a file name: %.*s\n",
+                                       s->section.len, s->section.txt);
+                               errs++;
+                               continue;
+                       }
+                       copy_fname(fname, sizeof(fname), s->section);
+                       if (fname[0] == 0) {
+                               fprintf(stderr, "Missing file name at:%.*s\n",
+                                       s->section.len, s->section.txt);
+                               errs++;
+                               continue;
+                       }
+                       fl = fopen(fname, "w");
+                       if (!fl) {
+                               fprintf(stderr, "Cannot create %s: %s\n",
+                                       fname, strerror(errno));
+                               errs++;
+                               continue;
+                       }
+                       code_print(fl, s->code, argv[1]);
+                       /* Buffered output may only fail when it is flushed */
+                       if (fclose(fl) != 0) {
+                               fprintf(stderr, "Cannot write %s: %s\n",
+                                       fname, strerror(errno));
+                               errs++;
+                       }
+               }
+               exit(!!errs);
+       }
+
author	NeilBrown <neilb@suse.de>
	Wed, 5 Jun 2013 20:20:35 +0000 (06:20 +1000)
committer	NeilBrown <neilb@suse.de>
	Wed, 5 Jun 2013 21:40:31 +0000 (07:40 +1000)
.gitignore	[new file with mode: 0644]	patch \| blob
csrc/.gitignore	[new file with mode: 0644]	patch \| blob
csrc/Makefile	[new file with mode: 0644]	patch \| blob
csrc/boot-strap/libmdcode.c	[new file with mode: 0644]	patch \| blob
csrc/boot-strap/md2c.c	[new file with mode: 0644]	patch \| blob
csrc/boot-strap/mdcode.h	[new file with mode: 0644]	patch \| blob
csrc/boot-strap/mdcode.mk	[new file with mode: 0644]	patch \| blob
csrc/mdcode.mdc	[new file with mode: 0644]	patch \| blob