From: NeilBrown Date: Wed, 5 Jun 2013 20:20:35 +0000 (+1000) Subject: mdcode, md2c - extract C code from a literate markdown program X-Git-Tag: draftparser~21 X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=commitdiff_plain;h=bf502feaba385e5e8ebd5f80c005522c59660496 mdcode, md2c - extract C code from a literate markdown program All C programs here will be written in literate style using markdown. md2c strips out the C code so that it can be compiled. Signed-off-by: NeilBrown --- bf502feaba385e5e8ebd5f80c005522c59660496 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5761abc --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.o diff --git a/csrc/.gitignore b/csrc/.gitignore new file mode 100644 index 0000000..3cfcc75 --- /dev/null +++ b/csrc/.gitignore @@ -0,0 +1,5 @@ +*.c +*.h +*.mk +md2c +boot-strap/md2c diff --git a/csrc/Makefile b/csrc/Makefile new file mode 100644 index 0000000..d1beada --- /dev/null +++ b/csrc/Makefile @@ -0,0 +1,11 @@ + +ifneq "$(wildcard *.mk)" "" +include *.mk +endif + +bootstrap: + touch boot-strap/* # make sure mdcode.mdc isn't newer + $(MAKE) -C boot-strap -f mdcode.mk VPATH=.. + cp boot-strap/md2c . + ./md2c mdcode.mdc + diff --git a/csrc/boot-strap/libmdcode.c b/csrc/boot-strap/libmdcode.c new file mode 100644 index 0000000..78a4262 --- /dev/null +++ b/csrc/boot-strap/libmdcode.c @@ -0,0 +1,384 @@ +#line 100 "../mdcode.mdc" +#define _GNU_SOURCE +#include +#include +#include + +#include "mdcode.h" +#line 461 "../mdcode.mdc" +#include +#include + +#line 179 "../mdcode.mdc" +struct psection { + struct section; + struct code_node *last; + int refcnt; + int indent; +}; + +#line 216 "../mdcode.mdc" +static void code_linearize(struct code_node *code) +{ + struct code_node *t; + for (t = code; t; t = t->next) + t->indent = 0; + for (; code; code = code->next) + if (code->child) { + struct code_node *next = code->next; + struct psection *pchild = + (struct psection *)code->child; + int indent = pchild->indent; + code->next = code->child->code; + code->child->code = NULL; + code->child = NULL; + for (t = code; t->next; t = t->next) + t->next->indent = code->indent + indent; + t->next = next; + } +} + +#line 239 "../mdcode.mdc" +void code_free(struct code_node *code) +{ + while (code) { + struct code_node *this; + if (code->child) + code_linearize(code); + this = code; + code = code->next; + free(this); + } +} + +#line 268 "../mdcode.mdc" +static void code_add_text(struct psection *where, struct text txt, + int line_no) +{ + struct code_node *n; + if (txt.len == 0) + return; + n = malloc(sizeof(*n)); + n->code = txt; + n->indent = 0; + n->line_no = line_no; + n->next = NULL; + n->child = NULL; + if (where->last) + where->last->next = n; + else + where->code = n; + where->last = n; +} + +#line 290 "../mdcode.mdc" +void code_add_link(struct psection *where, struct psection *to, + int indent) +{ + struct code_node *n; + + to->indent = indent; + to->refcnt++; // this will be checked elsewhere + if (where->last && where->last->child == NULL) { + where->last->child = to; + return; + } + n = malloc(sizeof(*n)); + n->code.len = 0; + n->indent = 0; + n->line_no = 0; + n->next = NULL; + n->child = to; + if (where->last) + where->last->next = n; + else + where->code = n; + where->last = n; +} + +#line 329 "../mdcode.mdc" +static int text_cmp(struct text a, struct text b) +{ + if (a.len != b.len) + return a.len - b.len; + return strncmp(a.txt, b.txt, a.len); +} + +static struct psection *section_find(struct psection **list, struct text name) +{ + struct psection *new; + while (*list) { + int cmp = text_cmp((*list)->section, name); + if (cmp == 0) + return *list; + if (cmp > 0) + break; + list = (struct psection **)&((*list)->next); + } + /* Add this section */ + new = malloc(sizeof(*new)); + new->next = *list; + *list = new; + new->section = name; + new->code = NULL; + new->last = NULL; + new->refcnt = 0; + new->indent = 0; + return new; +} + +#line 410 "../mdcode.mdc" +static char *skip_lws(char *pos, char *end) +{ + while (pos < end && (*pos == ' ' || *pos == '\t')) + pos++; + return pos; +} + +static char *skip_line(char *pos, char *end) +{ + while (pos < end && *pos != '\n') + pos++; + if (pos < end) + pos++; + return pos; +} + +static char *skip_para(char *pos, char *end, int *line_no) +{ + /* Might return a pointer to a blank line, as only + * one trailing blank line is skipped + */ + if (*pos == '#') { + pos = skip_line(pos, end); + (*line_no) += 1; + return pos; + } + while (pos < end && + *pos != '#' && + *(pos = skip_lws(pos, end)) != '\n') { + pos = skip_line(pos, end); + (*line_no) += 1; + } + if (pos < end && *pos == '\n') { + pos++; + (*line_no) += 1; + } + return pos; +} + +#line 466 "../mdcode.mdc" +static struct text take_header(char *pos, char *end) +{ + struct text section; + + while (pos < end && *pos == '#') + pos++; + while (pos < end && *pos == ' ') + pos++; + section.txt = pos; + while (pos < end && *pos != '\n') + pos++; + while (pos > section.txt && + (pos[-1] == '#' || pos[-1] == ' ')) + pos--; + section.len = pos - section.txt; + return section; +} + +static int is_list(char *pos, char *end) +{ + if (strchr("-*+", *pos)) + return 1; + if (isdigit(*pos)) { + while (pos < end && isdigit(*pos)) + pos += 1; + if (pos < end && *pos == '.') + return 1; + } + return 0; +} + +static int matches(char *start, char *pos, char *end) +{ + if (start == NULL) + return matches("\t", pos, end) || + matches(" ", pos, end); + return (pos + strlen(start) < end && + strncmp(pos, start, strlen(start)) == 0); +} + +#line 538 "../mdcode.mdc" +static int count_space(char *sol, char *p) +{ + int c = 0; + while (sol < p) { + if (sol[0] == ' ') + c++; + if (sol[0] == '\t') + c+= 8; + sol++; + } + return c; +} + + +static char *take_code(char *pos, char *end, char *marker, + struct psection **table, struct text section, + int *line_nop) +{ + char *start = pos; + int line_no = *line_nop; + int start_line = line_no; + struct psection *sect; + + sect = section_find(table, section); + + while (pos < end) { + char *sol, *t; + struct text ref; + + if (marker && matches(marker, pos, end)) + break; + if (!marker && + (skip_lws(pos, end))[0] != '\n' && + !matches(NULL, pos, end)) + /* Paragraph not indented */ + break; + + /* Still in code - check for reference */ + sol = pos; + if (!marker) { + if (*sol == '\t') + sol++; + else if (strcmp(sol, " ") == 0) + sol += 4; + } + t = skip_lws(sol, end); + if (t[0] != '#' || t[1] != '#') { + /* Just regular code here */ + pos = skip_line(sol, end); + line_no++; + continue; + } + + if (pos > start) { + struct text txt; + txt.txt = start; + txt.len = pos - start; + code_add_text(sect, txt, start_line); + } + ref = take_header(t, end); + if (ref.len) { + struct psection *refsec = section_find(table, ref); + code_add_link(sect, refsec, count_space(sol, t)); + } + pos = skip_line(t, end); + line_no++; + start = pos; + start_line = line_no; + } + if (pos > start) { + struct text txt; + txt.txt = start; + txt.len = pos - start; + code_add_text(sect, txt, start_line); + } + if (marker) { + pos = skip_line(pos, end); + line_no++; + } + *line_nop = line_no; + return pos; +} + +#line 630 "../mdcode.mdc" +static struct psection *code_find(char *pos, char *end) +{ + struct psection *table = NULL; + int in_list = 0; + int line_no = 1; + struct text section = {0}; + + while (pos < end) { + if (pos[0] == '#') { + section = take_header(pos, end); + in_list = 0; + pos = skip_line(pos, end); + line_no++; + } else if (is_list(pos, end)) { + in_list = 1; + pos = skip_para(pos, end, &line_no); + } else if (!in_list && matches(NULL, pos, end)) { + pos = take_code(pos, end, NULL, &table, + section, &line_no); + } else if (matches("```", pos, end)) { + in_list = 0; + pos = skip_line(pos, end); + line_no++; + pos = take_code(pos, end, "```", &table, + section, &line_no); + } else if (matches("~~~", pos, end)) { + in_list = 0; + pos = skip_line(pos, end); + line_no++; + pos = take_code(pos, end, "~~~", &table, + section, &line_no); + } else { + if (!isspace(*pos)) + in_list = 0; + pos = skip_para(pos, end, &line_no); + } + } + return table; +} + +#line 690 "../mdcode.mdc" +struct section *code_extract(char *pos, char *end, code_err_fn error) +{ + struct psection *table; + struct section *result = NULL; + struct section *tofree = NULL; + + table = code_find(pos, end); + + while (table) { + struct psection *t = (struct psection*)table->next; + if (table->last == NULL) { + char *msg; + asprintf(&msg, + "Section \"%.*s\" is referenced but not declared", + table->section.len, table->section.txt); + error(msg); + free(msg); + } + if (table->refcnt == 0) { + /* Root-section, return it */ + table->next = result; + result = table; + code_linearize(result->code); + } else { + table->next = tofree; + tofree = table; + if (table->refcnt > 1) { + char *msg; + asprintf(&msg, + "Section \"%.*s\" referenced multiple times (%d).", + table->section.len, table->section.txt, + table->refcnt); + error(msg); + free(msg); + } + } + table = t; + } + while (tofree) { + struct section *t = tofree->next; + free(tofree); + tofree = t; + } + return result; +} + +#line 109 "../mdcode.mdc" + diff --git a/csrc/boot-strap/md2c.c b/csrc/boot-strap/md2c.c new file mode 100644 index 0000000..2b18737 --- /dev/null +++ b/csrc/boot-strap/md2c.c @@ -0,0 +1,133 @@ +#line 119 "../mdcode.mdc" +#include +#include + +#include "mdcode.h" + +#line 849 "../mdcode.mdc" +#include +#include +#include +#include +#include + +#line 771 "../mdcode.mdc" +static void code_print(FILE *out, struct code_node *node, + char *fname) +{ + for (; node; node = node->next) { + char *c = node->code.txt; + int len = node->code.len; + int undent = 0; + + if (!len) + continue; + + fprintf(out, "#line %d \"%s\"\n", + node->line_no, fname); + if (*c == ' ' || *c == '\t') + undent = 1; + while (len && *c) { + fprintf(out, "%*s", node->indent, ""); + if (undent) { + if (*c == '\t' && len > 1) { + c++; + len--; + } else if (strncmp(c, " ", 4) == 0 && len > 4) { + c += 4; + len-= 4; + } + } + do { + fputc(*c, out); + c++; + len--; + } while (len && c[-1] != '\n'); + } + } +} + +#line 821 "../mdcode.mdc" +static void copy_fname(char *name, int space, struct text t) +{ + char *sec = t.txt; + int len = t.len; + name[0] = 0; + if (len < 5 || strncmp(sec, "File:", 5) != 0) + return; + sec += 5; + len -= 5; + while (len && sec[0] == ' ') { + sec++; + len--; + } + if (len >= space) + len = space - 1; + strncpy(name, sec, len); + name[len] = 0; +} + +#line 857 "../mdcode.mdc" +static int errs; +static void pr_err(char *msg) +{ + errs++; + fprintf(stderr, "%s\n", msg); +} + +int main(int argc, char *argv[]) +{ + int fd; + size_t len; + char *file; + struct section *table, *s, *prev; + + errs = 0; + if (argc != 2) { + fprintf(stderr, "Usage: mdcode file.mdc\n"); + exit(2); + } + fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "mdcode: cannot open %s: %s\n", + argv[1], strerror(errno)); + exit(1); + } + len = lseek(fd, 0, 2); + file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); + table = code_extract(file, file+len, pr_err); + + for (s = table; s; + (code_free(s->code), prev = s, s = s->next, free(prev))) { + FILE *fl; + char fname[1024]; + if (strncmp(s->section.txt, "Example:", 8) == 0) + continue; + if (strncmp(s->section.txt, "File:", 5) != 0) { + fprintf(stderr, "Unreferenced section is not a file name: %.*s\n", + s->section.len, s->section.txt); + errs++; + continue; + } + copy_fname(fname, sizeof(fname), s->section); + if (fname[0] == 0) { + fprintf(stderr, "Missing file name at:%.*s\n", + s->section.len, s->section.txt); + errs++; + continue; + } + fl = fopen(fname, "w"); + if (!fl) { + fprintf(stderr, "Cannot create %s: %s\n", + fname, strerror(errno)); + errs++; + continue; + } + code_print(fl, s->code, argv[1]); + fclose(fl); + } + exit(!!errs); +} + +#line 126 "../mdcode.mdc" + diff --git a/csrc/boot-strap/mdcode.h b/csrc/boot-strap/mdcode.h new file mode 100644 index 0000000..6a17e44 --- /dev/null +++ b/csrc/boot-strap/mdcode.h @@ -0,0 +1,32 @@ +#line 158 "../mdcode.mdc" +struct text { + char *txt; + int len; +}; + +struct section { + struct text section; + struct code_node *code; + struct section *next; +}; + +struct code_node { + struct text code; + int indent; + int line_no; + struct code_node *next; + struct section *child; +}; + +#line 687 "../mdcode.mdc" +typedef void (*code_err_fn)(char *msg); + +#line 253 "../mdcode.mdc" +void code_free(struct code_node *code); + +#line 738 "../mdcode.mdc" +struct section *code_extract(char *pos, char *end, code_err_fn error); + + +#line 98 "../mdcode.mdc" + diff --git a/csrc/boot-strap/mdcode.mk b/csrc/boot-strap/mdcode.mk new file mode 100644 index 0000000..22a35a1 --- /dev/null +++ b/csrc/boot-strap/mdcode.mk @@ -0,0 +1,23 @@ +#line 88 "../mdcode.mdc" +CFLAGS += -Wall -g +all:: +mdcode.h libmdcode.c md2c.c mdcode.mk : mdcode.mdc + ./md2c mdcode.mdc + + +#line 112 "../mdcode.mdc" +all :: libmdcode.o +libmdcode.o : libmdcode.c mdcode.h + $(CC) $(CFLAGS) -c libmdcode.c + + +#line 129 "../mdcode.mdc" +all :: md2c +md2c : md2c.o libmdcode.o + $(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o +md2c.o : md2c.c mdcode.h + $(CC) $(CFLAGS) -c md2c.c + +#line 192 "../mdcode.mdc" +CFLAGS += -fplan9-extensions + diff --git a/csrc/mdcode.mdc b/csrc/mdcode.mdc new file mode 100644 index 0000000..dc78a12 --- /dev/null +++ b/csrc/mdcode.mdc @@ -0,0 +1,917 @@ +# mdcode: extract C code from a _markdown_ file. + +_markdown_ is a popular format for simple text markup which can easily +be converted to HTML. As it allows easy indication of sections of +code, it is quite suitable for use in literate programming. This file +is an example of that usage. + +The code included below provides two related functionalities. +Firstly it provides a library routine for extracting code out of a +_markdown_ file, so that other routines might make use of it. + +Secondly it provides a simple client of this routine which extracts +1 or more C-language files from a markdown document so they can be +passed to a C compiler. These two combined to make a tool that is needed +to compile this tool. Yes, this is circular. A prototype tool was +used for the first extraction. + +The tool provided is described as specific to the C language as it +generates + +##### Example: a _line_ command + + #line __line-number__ __file-name__ + +lines so that the C compiler will report where in the markdown file +any error is found. This tool is suitable for any other language +which allows the same directive, or will treat it as a comment. + +## Literate Details + +Literate programming is more than just including comments with the +code, even nicely formatted comments. It also involves presenting the +code in an order that makes sense to a human, rather than an order +that makes sense to a compiler. For this reason a core part of any +literate programming tool is the ability to re-arrange the code found +in the document into a different order in the final code file - or +files. This requires some form of linkage to be encoded. + +The approach taken here is focused around section headings - of any +depth. + +All the code in any section is treated as a single sequential +collection of code, and is named by the section that it is in. If +multiple sections have the same name, then the code blocks in all of +them are joined together in the order they appear in the document. + +A code section can contain a special marker which starts with 2 +hashes: __##__. +The text after the marker must be the name of some section which +contains code. Code from that section will be interpolated in place +of the marker, and will be indented to match the indent of the marker. + +It is not permitted for the same code to be interpolated multiple +times. Allowing this might make some sense, but it is probably a +mistake, and prohibiting it make some of the code a bit cleaner. + +Equally, every section of code should be interpolated at least once - +with two exceptions. These exceptions are imposed by the tool, not +the library. A different client could impose different rules on the +names of top-level code sections. + +The first exception we have already seen. A section name starting +__Example:__ indicates code that is not to be included in the final product. + +The second exception is for the top level code sections which will be +written to files. Again these are identified by their section name. +This must start with __File:__ the following text (after optional +spaces) will be used as a file name. + +Any section containing code that does not start __Example:__ or +__File:__ must be included in some other section exactly once. + +### Multiple files + +Allowing multiple top level code sections which name different files +means that one _markdown_ document can describe several files. This +is very useful with the C language where a program file and a header +file might be related. For the present document we will have a header +file and two code files, one with the library content and one for the +tool. + +It will also be very convenient to create a `makefile` fragment to +ensure the code is compiled correctly. A simple `make -f mdcode.mk` +will "do the right thing". + +### File: mdcode.mk + + CFLAGS += -Wall -g + all:: + mdcode.h libmdcode.c md2c.c mdcode.mk : mdcode.mdc + ./md2c mdcode.mdc + + +### File: mdcode.h + + ## exported types + ## exported functions + +### File: libmdcode.c + #define _GNU_SOURCE + #include + #include + #include + + #include "mdcode.h" + ## internal includes + ## private types + ## internal functions + +### File: mdcode.mk + + all :: libmdcode.o + libmdcode.o : libmdcode.c mdcode.h + $(CC) $(CFLAGS) -c libmdcode.c + + +### File: md2c.c + + #include + #include + + #include "mdcode.h" + + ## client includes + ## client functions + +### File: mdcode.mk + + all :: md2c + md2c : md2c.o libmdcode.o + $(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o + md2c.o : md2c.c mdcode.h + $(CC) $(CFLAGS) -c md2c.c + +## Data Structures + +As the core purpose of _mdcode_ is to discover and re-arrange blocks +of text, it makes sense to map the whole document file into memory and +produce a data structure which lists various parts of the file in the +appropriate order. Each node in this structure will have some text +from the document, a child pointer, and a next pointer, any of which +might not be present. The text is most easily stored as a pointer and a +length. We'll call this a `text` + +A list of these `code_nodes` will belong to each section and it will +be useful to have a separate `section` data structure to store the +list of `code_nodes`, the section name, and some other information. + +This other information will include a reference counter so we can +ensure proper referencing, and an `indent` depth. As referenced +content can have an extra indent added, we need to know what that is. +The `code_node` will also have an `indent` depth which eventually gets +set to the sum for the indents from all references on the path from +the root. + +##### exported types + + struct text { + char *txt; + int len; + }; + + struct section { + struct text section; + struct code_node *code; + struct section *next; + }; + + struct code_node { + struct text code; + int indent; + int line_no; + struct code_node *next; + struct section *child; + }; + +##### private types + + struct psection { + struct section; + struct code_node *last; + int refcnt; + int indent; + }; + +You will note that the `struct psection` contains an anonymous `struct +section` embedded at the start. To make this work right, GCC +requires the `-fplan9-extensions` flag. + +##### File: mdcode.mk + + CFLAGS += -fplan9-extensions + +### Manipulating the node + +Though a tree with `next` and `child` links is the easiest way to +assemble the various code sections, it is not the easiest form for +using them. For that a simple list would be best. + +So once we have a fully linked File section we will want to linearize +it, so that the `child` links become `NULL` and the `next` links will +find everything required. It is at this stage that the requirements +that each section is linked only once becomes import. + +`code_linearize` will merge the `code_node`s from any child into the +given `code_node`. As it does this it sets the 'indent' field for +each `code_node`. + +Note that we don't clear the section's `last` pointer, even though +it no longer owns any code. This allows subsequent code to see if a +section ever had any code, and to report an error if a section is +referenced but not defined. + +##### internal functions + + static void code_linearize(struct code_node *code) + { + struct code_node *t; + for (t = code; t; t = t->next) + t->indent = 0; + for (; code; code = code->next) + if (code->child) { + struct code_node *next = code->next; + struct psection *pchild = + (struct psection *)code->child; + int indent = pchild->indent; + code->next = code->child->code; + code->child->code = NULL; + code->child = NULL; + for (t = code; t->next; t = t->next) + t->next->indent = code->indent + indent; + t->next = next; + } + } + +Once a client has made use of a linearized code set, it will probably +want to free it. + + void code_free(struct code_node *code) + { + while (code) { + struct code_node *this; + if (code->child) + code_linearize(code); + this = code; + code = code->next; + free(this); + } + } + +##### exported functions + + void code_free(struct code_node *code); + +### Building the tree + +As we parse the document there are two things we will want to do to +node trees: add some text or add a reference. We'll assume for now +that the relevant section structures have been found, and will just +deal with the `code_node`. + +Adding text simply means adding another node. We will never have +empty nodes, even if the last node only has a child, new text must go +in a new node. + +##### internal functions + + static void code_add_text(struct psection *where, struct text txt, + int line_no) + { + struct code_node *n; + if (txt.len == 0) + return; + n = malloc(sizeof(*n)); + n->code = txt; + n->indent = 0; + n->line_no = line_no; + n->next = NULL; + n->child = NULL; + if (where->last) + where->last->next = n; + else + where->code = n; + where->last = n; + } + +However when adding a link, we might be able to include it in the last +`code_node` if it currently only has text. + + void code_add_link(struct psection *where, struct psection *to, + int indent) + { + struct code_node *n; + + to->indent = indent; + to->refcnt++; // this will be checked elsewhere + if (where->last && where->last->child == NULL) { + where->last->child = to; + return; + } + n = malloc(sizeof(*n)); + n->code.len = 0; + n->indent = 0; + n->line_no = 0; + n->next = NULL; + n->child = to; + if (where->last) + where->last->next = n; + else + where->code = n; + where->last = n; + } + +### Finding sections + +Now we need a lookup table to be able to find sections by name. +Something that provides an `n*log(N)` search time is probably +justified, but for now I want a minimal stand-alone program so a +linked list managed by insertion-sort will do. As a comparison +function it is easiest to sort based on length before content. So +sections won't be in standard lexical order, but that isn't important. + +If we cannot find a section, we simply want to create it. This allows +sections and references to be created in any order. Sections with +no references or no content will cause a warning eventually. + +#### internal functions + + static int text_cmp(struct text a, struct text b) + { + if (a.len != b.len) + return a.len - b.len; + return strncmp(a.txt, b.txt, a.len); + } + + static struct psection *section_find(struct psection **list, struct text name) + { + struct psection *new; + while (*list) { + int cmp = text_cmp((*list)->section, name); + if (cmp == 0) + return *list; + if (cmp > 0) + break; + list = (struct psection **)&((*list)->next); + } + /* Add this section */ + new = malloc(sizeof(*new)); + new->next = *list; + *list = new; + new->section = name; + new->code = NULL; + new->last = NULL; + new->refcnt = 0; + new->indent = 0; + return new; + } + +## Parsing the _markdown_ + +Parsing markdown is fairly easy, though there are complications. + +The document is divided into "paragraphs" which are mostly separated by blank +lines (which may contain white space). The first few characters of +the first line of a paragraph determine the type of paragraph. For +our purposes we are only interested in list paragraphs, code +paragraphs, section headings, and everything else. Section headings +are single-line paragraphs and so do not require a preceding or +following blank line. + +Section headings start with 1 or more hash characters (__#__). List +paragraphs start with hyphen, asterisk, plus, or digits followed by a +period. Code paragraphs aren't quite so easy. + +The "standard" code paragraph starts with 4 or more spaces, or a tab. +However if the previous paragraph was a list paragraph, then those +spaces indicate another paragraph in the same list item, and 8 or +more spaces are required. Unless a nested list is in effect, in +which case 12 or more are need. Unfortunately not all _markdown_ +parsers agree on nested lists. + +Two alternate styles for marking code are in active use. "Github" uses +three backticks(_`` ``` ``_), while "pandoc" uses three or more tildes +(_~~~_). In these cases the code should not be indented. + +Trying to please everyone as much as possible, this parser will handle +everything except for code inside lists. + +So an indented (4+) paragraph after a list paragraph is always a list +paragraph, otherwise it is a code paragraph. A paragraph that starts +with three backticks or three tildes is code which continues until a +matching string of backticks or tildes. + +### Skipping bits + +While walking the document looking for various markers we will *not* +use the `struct text` introduced earlier as advancing that requires +updating both start and length which feels clumsy. Instead we will +carry `pos` and `end` pointers, only the first of which needs to +change. + +So to start, we need to skip various parts of the document. `lws` +stands for "Linear White Space" and is a term that comes from the +Email RFCs (e.g. RFC822). `line` and `para` are self explanatory. +Note that `skip_para` needs to update the current line number. +`skip_line` doesn't but every caller should. + +#### internal functions + + static char *skip_lws(char *pos, char *end) + { + while (pos < end && (*pos == ' ' || *pos == '\t')) + pos++; + return pos; + } + + static char *skip_line(char *pos, char *end) + { + while (pos < end && *pos != '\n') + pos++; + if (pos < end) + pos++; + return pos; + } + + static char *skip_para(char *pos, char *end, int *line_no) + { + /* Might return a pointer to a blank line, as only + * one trailing blank line is skipped + */ + if (*pos == '#') { + pos = skip_line(pos, end); + (*line_no) += 1; + return pos; + } + while (pos < end && + *pos != '#' && + *(pos = skip_lws(pos, end)) != '\n') { + pos = skip_line(pos, end); + (*line_no) += 1; + } + if (pos < end && *pos == '\n') { + pos++; + (*line_no) += 1; + } + return pos; + } + +### Recognising things + +Recognising a section header is trivial and doesn't require a +function. However we need to extract the content of a section header +as a `struct text` for passing to `section_find`. +Recognising the start of a new list is fairly easy. Recognising the +start (and end) of code is a little messy so we provide a function for +matching the first few characters, which has a special case for "4 +spaces or tab". + +#### internal includes + + #include + #include + +#### internal functions + + static struct text take_header(char *pos, char *end) + { + struct text section; + + while (pos < end && *pos == '#') + pos++; + while (pos < end && *pos == ' ') + pos++; + section.txt = pos; + while (pos < end && *pos != '\n') + pos++; + while (pos > section.txt && + (pos[-1] == '#' || pos[-1] == ' ')) + pos--; + section.len = pos - section.txt; + return section; + } + + static int is_list(char *pos, char *end) + { + if (strchr("-*+", *pos)) + return 1; + if (isdigit(*pos)) { + while (pos < end && isdigit(*pos)) + pos += 1; + if (pos < end && *pos == '.') + return 1; + } + return 0; + } + + static int matches(char *start, char *pos, char *end) + { + if (start == NULL) + return matches("\t", pos, end) || + matches(" ", pos, end); + return (pos + strlen(start) < end && + strncmp(pos, start, strlen(start)) == 0); + } + +### Extracting the code + +Now that we can skip paragraphs and recognise what type each paragraph +is, it is time to parse the file and extract the code. We'll do this +in two parts, first we look at what to do with some code once we +find it, and then how to actually find it. + +When we have some code, we know where it is, what the end marker +should look like, and which section it is in. + +There are two sorts of end markers: the presence of a particular +string, or the absence of an indent. We will use a string to +represent a presence, and a `NULL` to represent the absence. + +While looking at code we don't think about paragraphs are all - just +look for a line that starts with the right thing. +Every line that is still code then needs to be examined to see if it +is a section reference. + +When a section reference is found, all preceding code (if any) must be +added to the current section, then the reference is added. + +When we do find the end of the code, all text that we have found but +not processed needs to be saved too. + +When adding a reference we need to set the `indent`. This is the +number of spaces (counting 8 for tabs) after the natural indent of the +code (which is a tab or 4 spaces). We use a separate function `count_spaces` +for that. + +#### internal functions + + static int count_space(char *sol, char *p) + { + int c = 0; + while (sol < p) { + if (sol[0] == ' ') + c++; + if (sol[0] == '\t') + c+= 8; + sol++; + } + return c; + } + + + static char *take_code(char *pos, char *end, char *marker, + struct psection **table, struct text section, + int *line_nop) + { + char *start = pos; + int line_no = *line_nop; + int start_line = line_no; + struct psection *sect; + + sect = section_find(table, section); + + while (pos < end) { + char *sol, *t; + struct text ref; + + if (marker && matches(marker, pos, end)) + break; + if (!marker && + (skip_lws(pos, end))[0] != '\n' && + !matches(NULL, pos, end)) + /* Paragraph not indented */ + break; + + /* Still in code - check for reference */ + sol = pos; + if (!marker) { + if (*sol == '\t') + sol++; + else if (strcmp(sol, " ") == 0) + sol += 4; + } + t = skip_lws(sol, end); + if (t[0] != '#' || t[1] != '#') { + /* Just regular code here */ + pos = skip_line(sol, end); + line_no++; + continue; + } + + if (pos > start) { + struct text txt; + txt.txt = start; + txt.len = pos - start; + code_add_text(sect, txt, start_line); + } + ref = take_header(t, end); + if (ref.len) { + struct psection *refsec = section_find(table, ref); + code_add_link(sect, refsec, count_space(sol, t)); + } + pos = skip_line(t, end); + line_no++; + start = pos; + start_line = line_no; + } + if (pos > start) { + struct text txt; + txt.txt = start; + txt.len = pos - start; + code_add_text(sect, txt, start_line); + } + if (marker) { + pos = skip_line(pos, end); + line_no++; + } + *line_nop = line_no; + return pos; + } + +### Finding the code + +It is when looking for the code that we actually use the paragraph +structure. We need to recognise section headings so we can record the +name, list paragraphs so we can ignore indented follow-on paragraphs, +and the three different markings for code. + +#### internal functions + + static struct psection *code_find(char *pos, char *end) + { + struct psection *table = NULL; + int in_list = 0; + int line_no = 1; + struct text section = {0}; + + while (pos < end) { + if (pos[0] == '#') { + section = take_header(pos, end); + in_list = 0; + pos = skip_line(pos, end); + line_no++; + } else if (is_list(pos, end)) { + in_list = 1; + pos = skip_para(pos, end, &line_no); + } else if (!in_list && matches(NULL, pos, end)) { + pos = take_code(pos, end, NULL, &table, + section, &line_no); + } else if (matches("```", pos, end)) { + in_list = 0; + pos = skip_line(pos, end); + line_no++; + pos = take_code(pos, end, "```", &table, + section, &line_no); + } else if (matches("~~~", pos, end)) { + in_list = 0; + pos = skip_line(pos, end); + line_no++; + pos = take_code(pos, end, "~~~", &table, + section, &line_no); + } else { + if (!isspace(*pos)) + in_list = 0; + pos = skip_para(pos, end, &line_no); + } + } + return table; + } + +### Returning the code + +Having found all the code blocks and gathered them into a list of +section, we are now ready to return them to the caller. This is where +to perform consistency checks, like at most one reference and at least +one definition for each section. + +All the sections with no references are returned in a list for the +caller to consider. The are linearized first so that the substructure +is completely hidden -- except for the small amount of structure +displayed in the line numbers. + +To return errors, we have the caller pass a function which takes an +error message - a `code_err_fn`. + +#### exported types + + typedef void (*code_err_fn)(char *msg); + +#### internal functions + struct section *code_extract(char *pos, char *end, code_err_fn error) + { + struct psection *table; + struct section *result = NULL; + struct section *tofree = NULL; + + table = code_find(pos, end); + + while (table) { + struct psection *t = (struct psection*)table->next; + if (table->last == NULL) { + char *msg; + asprintf(&msg, + "Section \"%.*s\" is referenced but not declared", + table->section.len, table->section.txt); + error(msg); + free(msg); + } + if (table->refcnt == 0) { + /* Root-section, return it */ + table->next = result; + result = table; + code_linearize(result->code); + } else { + table->next = tofree; + tofree = table; + if (table->refcnt > 1) { + char *msg; + asprintf(&msg, + "Section \"%.*s\" referenced multiple times (%d).", + table->section.len, table->section.txt, + table->refcnt); + error(msg); + free(msg); + } + } + table = t; + } + while (tofree) { + struct section *t = tofree->next; + free(tofree); + tofree = t; + } + return result; + } + +##### exported functions + + struct section *code_extract(char *pos, char *end, code_err_fn error); + + +## Using the library + +Now that we can extract code from a document and link it all together +it is time to do something with that code. Firstly we need to print +it out. + +### Printing the Code + +Printing is mostly straight forward - we just walk the list and print +the code sections, adding whatever indent is required for each line. +However there is a complication (isn't there always)? + +For code that was recognised because the paragraph was indented, we +need to strip that indent first. For other code, we don't. + +The approach taken here is simple, though it could arguably be wrong +in some unlikely cases. So it might need to be fixed later. + +If the first line of a code block is indented, then either one tab or +4 spaces are striped from every non-blank line. + +This could go wrong if the first line of a code block marked by +_`` ``` ``_ is indented. To overcome this we would need to +record someextra state in each `code_node`. For now we won't bother. + +The indents we insert will all be spaces. This might not work well +for `Makefiles`. + +##### client functions + + static void code_print(FILE *out, struct code_node *node, + char *fname) + { + for (; node; node = node->next) { + char *c = node->code.txt; + int len = node->code.len; + int undent = 0; + + if (!len) + continue; + + fprintf(out, "#line %d \"%s\"\n", + node->line_no, fname); + if (*c == ' ' || *c == '\t') + undent = 1; + while (len && *c) { + fprintf(out, "%*s", node->indent, ""); + if (undent) { + if (*c == '\t' && len > 1) { + c++; + len--; + } else if (strncmp(c, " ", 4) == 0 && len > 4) { + c += 4; + len-= 4; + } + } + do { + fputc(*c, out); + c++; + len--; + } while (len && c[-1] != '\n'); + } + } + } + +### Bringing it all together + +We are just about ready for the `main` function of the tool which will +extract all this lovely code and compile it. Just one helper is still +needed. + +#### Handling filenames + +Section names are stored in `struct text` which is not `nul` +terminated. Filenames passed to `open` need to be null terminated. +So we need to convert one to the other, and strip the leading `File:` +of while we are at it. + +##### client functions + + static void copy_fname(char *name, int space, struct text t) + { + char *sec = t.txt; + int len = t.len; + name[0] = 0; + if (len < 5 || strncmp(sec, "File:", 5) != 0) + return; + sec += 5; + len -= 5; + while (len && sec[0] == ' ') { + sec++; + len--; + } + if (len >= space) + len = space - 1; + strncpy(name, sec, len); + name[len] = 0; + } + +#### Main + +And now we take a single file name, extract the code, and if there are +no error we write out a file for each appropriate code section. And +we are done. + + +##### client includes + + #include + #include + #include + #include + #include + +##### client functions + + static int errs; + static void pr_err(char *msg) + { + errs++; + fprintf(stderr, "%s\n", msg); + } + + int main(int argc, char *argv[]) + { + int fd; + size_t len; + char *file; + struct section *table, *s, *prev; + + errs = 0; + if (argc != 2) { + fprintf(stderr, "Usage: mdcode file.mdc\n"); + exit(2); + } + fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "mdcode: cannot open %s: %s\n", + argv[1], strerror(errno)); + exit(1); + } + len = lseek(fd, 0, 2); + file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); + table = code_extract(file, file+len, pr_err); + + for (s = table; s; + (code_free(s->code), prev = s, s = s->next, free(prev))) { + FILE *fl; + char fname[1024]; + if (strncmp(s->section.txt, "Example:", 8) == 0) + continue; + if (strncmp(s->section.txt, "File:", 5) != 0) { + fprintf(stderr, "Unreferenced section is not a file name: %.*s\n", + s->section.len, s->section.txt); + errs++; + continue; + } + copy_fname(fname, sizeof(fname), s->section); + if (fname[0] == 0) { + fprintf(stderr, "Missing file name at:%.*s\n", + s->section.len, s->section.txt); + errs++; + continue; + } + fl = fopen(fname, "w"); + if (!fl) { + fprintf(stderr, "Cannot create %s: %s\n", + fname, strerror(errno)); + errs++; + continue; + } + code_print(fl, s->code, argv[1]); + fclose(fl); + } + exit(!!errs); + } +