1 # mdcode: extract C code from a _markdown_ file.
3 _markdown_ is a popular format for simple text markup which can easily
4 be converted to HTML. As it allows easy indication of sections of
5 code, it is quite suitable for use in literate programming. This file
6 is an example of that usage.
8 The code included below provides two related functionalities.
9 Firstly it provides a library routine for extracting code out of a
10 _markdown_ file, so that other routines might make use of it.
12 Secondly it provides a simple client of this routine which extracts
13 1 or more C-language files from a markdown document so they can be
14 passed to a C compiler. These two combined to make a tool that is needed
15 to compile this tool. Yes, this is circular. A prototype tool was
16 used for the first extraction.
18 The tool provided is described as specific to the C language as it
21 ##### Example: a _line_ command
23 #line __line-number__ __file-name__
25 lines so that the C compiler will report where in the markdown file
26 any error is found. This tool is suitable for any other language
27 which allows the same directive, or will treat it as a comment.
31 Literate programming is more than just including comments with the
32 code, even nicely formatted comments. It also involves presenting the
33 code in an order that makes sense to a human, rather than an order
34 that makes sense to a compiler. For this reason a core part of any
35 literate programming tool is the ability to re-arrange the code found
36 in the document into a different order in the final code file - or
37 files. This requires some form of linkage to be encoded.
39 The approach taken here is focused around section headings - of any
42 All the code in any section is treated as a single sequential
43 collection of code, and is named by the section that it is in. If
44 multiple sections have the same name, then the code blocks in all of
45 them are joined together in the order they appear in the document.
47 A code section can contain a special marker which starts with 2
49 The text after the marker must be the name of some section which
50 contains code. Code from that section will be interpolated in place
51 of the marker, and will be indented to match the indent of the marker.
53 It is not permitted for the same code to be interpolated multiple
54 times. Allowing this might make some sense, but it is probably a
55 mistake, and prohibiting it make some of the code a bit cleaner.
57 Equally, every section of code should be interpolated at least once -
58 with one exception. This exception is imposed by the
59 tool, not the library. A different client could impose different
60 rules on the names of top-level code sections.
62 One example of the exception we have already seen. A section name
63 starting __Example:__ indicates code that is not to be included in the
64 final product. Any leading word will do, providing there is a space,
65 and the first space is preceded by a colon, that section name will be
68 A special case of this exception exists for the leading word
69 __File__. These sections are the top level code sections and they
70 will be written to the named file. Thus a section named
71 __File: foo__ should not be referenced by another section, and its
72 contents after all references are expanded will be written to the file
75 Any section containing code that does not start __Word:__
76 must be included in some other section exactly once.
80 Allowing multiple top level code sections which name different files
81 means that one _markdown_ document can describe several files. This
82 is very useful with the C language where a program file and a header
83 file might be related. For the present document we will have a header
84 file and two code files, one with the library content and one for the
87 It will also be very convenient to create a `makefile` fragment to
88 ensure the code is compiled correctly. A simple `make -f mdcode.mk`
89 will "do the right thing".
95 mdcode.h libmdcode.c md2c.c mdcode.mk : mdcode.mdc
103 ## exported functions
105 ### File: libmdcode.c
114 ## internal functions
119 libmdcode.o : libmdcode.c mdcode.h
120 $(CC) $(CFLAGS) -c libmdcode.c
136 md2c : md2c.o libmdcode.o
137 $(CC) $(CFLAGS) -o md2c md2c.o libmdcode.o
138 md2c.o : md2c.c mdcode.h
139 $(CC) $(CFLAGS) -c md2c.c
143 As the core purpose of _mdcode_ is to discover and re-arrange blocks
144 of text, it makes sense to map the whole document file into memory and
145 produce a data structure which lists various parts of the file in the
146 appropriate order. Each node in this structure will have some text
147 from the document, a child pointer, and a next pointer, any of which
148 might not be present. The text is most easily stored as a pointer and a
149 length. We'll call this a `text`
151 A list of these `code_nodes` will belong to each section and it will
152 be useful to have a separate `section` data structure to store the
153 list of `code_nodes`, the section name, and some other information.
155 This other information will include a reference counter so we can
156 ensure proper referencing, and an `indent` depth. As referenced
157 content can have an extra indent added, we need to know what that is.
158 The `code_node` will also have an `indent` depth which eventually gets
159 set to the sum for the indents from all references on the path from
162 Finally we need to know if the `code_node` was recognised by being
163 indented or not. If it was, the client of this data will want to
164 strip off the leading tab or 4 spaces. Hence a `needs_strip` flag is
165 needed. This will be set to 8 if a tab is found and 4 if four spaces are found.
166 This means the relative indent of text in the node
167 is `node->indent - node->needs_strip`.
168 The relative indent is needed for detecting indents in the overall file.
179 struct code_node *code;
180 struct section *next;
188 struct code_node *next;
189 struct section *child;
196 struct code_node *last;
201 You will note that the `struct psection` contains an anonymous `struct
202 section` embedded at the start. To make this work right, GCC
203 requires the `-fplan9-extensions` flag.
205 ##### File: mdcode.mk
207 CFLAGS += -fplan9-extensions
209 ### Manipulating the node
211 Though a tree with `next` and `child` links is the easiest way to
212 assemble the various code sections, it is not the easiest form for
213 using them. For that a simple list would be best.
215 So once we have a fully linked File section we will want to linearize
216 it, so that the `child` links become `NULL` and the `next` links will
217 find everything required. It is at this stage that the requirements
218 that each section is linked only once becomes import.
220 `code_linearize` will merge the `code_node`s from any child into the
221 given `code_node`. As it does this it sets the 'indent' field for
224 Note that we don't clear the section's `last` pointer, even though
225 it no longer owns any code. This allows subsequent code to see if a
226 section ever had any code, and to report an error if a section is
227 referenced but not defined.
229 ##### internal functions
231 static void code_linearize(struct code_node *code)
234 for (t = code; t; t = t->next)
236 for (; code; code = code->next)
238 struct code_node *next = code->next;
239 struct psection *pchild =
240 (struct psection *)code->child;
241 int indent = pchild->indent;
242 code->next = code->child->code;
243 code->child->code = NULL;
245 for (t = code; t->next; t = t->next)
246 t->next->indent = code->indent + indent;
251 Once a client has made use of a linearized code set, it will probably
254 void code_free(struct code_node *code)
257 struct code_node *this;
259 code_linearize(code);
266 ##### exported functions
268 void code_free(struct code_node *code);
270 ### Building the tree
272 As we parse the document there are two things we will want to do to
273 node trees: add some text or add a reference. We'll assume for now
274 that the relevant section structures have been found, and will just
275 deal with the `code_node`.
277 Adding text simply means adding another node. We will never have
278 empty nodes, even if the last node only has a child, new text must go
281 ##### internal functions
283 static void code_add_text(struct psection *where, struct text txt,
284 int line_no, int needs_strip)
289 n = malloc(sizeof(*n));
292 n->line_no = line_no;
294 if (txt.txt[0] == '\t')
303 where->last->next = n;
309 However when adding a link, we might be able to include it in the last
310 `code_node` if it currently only has text.
312 void code_add_link(struct psection *where, struct psection *to,
318 to->refcnt++; // this will be checked elsewhere
319 if (where->last && where->last->child == NULL) {
320 where->last->child = to;
323 n = malloc(sizeof(*n));
330 where->last->next = n;
338 Now we need a lookup table to be able to find sections by name.
339 Something that provides an `n*log(N)` search time is probably
340 justified, but for now I want a minimal stand-alone program so a
341 linked list managed by insertion-sort will do.
343 The text compare function will likely be useful for any clients of our
344 library, so we may as well export it.
346 If we cannot find a section, we simply want to create it. This allows
347 sections and references to be created in any order. Sections with
348 no references or no content will cause a warning eventually.
350 #### exported functions
352 int text_cmp(struct text a, struct text b);
354 #### internal functions
356 int text_cmp(struct text a, struct text b)
361 int cmp = strncmp(a.txt, b.txt, len);
365 return a.len - b.len;
368 static struct psection *section_find(struct psection **list, struct text name)
370 struct psection *new;
372 int cmp = text_cmp((*list)->section, name);
377 list = (struct psection **)&((*list)->next);
379 /* Add this section */
380 new = malloc(sizeof(*new));
391 ## Parsing the _markdown_
393 Parsing markdown is fairly easy, though there are complications.
395 The document is divided into "paragraphs" which are mostly separated by blank
396 lines (which may contain white space). The first few characters of
397 the first line of a paragraph determine the type of paragraph. For
398 our purposes we are only interested in list paragraphs, code
399 paragraphs, section headings, and everything else. Section headings
400 are single-line paragraphs and so do not require a preceding or
401 following blank line.
403 Section headings start with 1 or more hash characters (__#__). List
404 paragraphs start with hyphen, asterisk, plus, or digits followed by a
405 period. Code paragraphs aren't quite so easy.
407 The "standard" code paragraph starts with 4 or more spaces, or a tab.
408 However if the previous paragraph was a list paragraph, then those
409 spaces indicate another paragraph in the same list item, and 8 or
410 more spaces are required. Unless a nested list is in effect, in
411 which case 12 or more are need. Unfortunately not all _markdown_
412 parsers agree on nested lists.
414 Two alternate styles for marking code are in active use. "Github" uses
415 three backticks(_`` ``` ``_), while "pandoc" uses three or more tildes
416 (_~~~_). In these cases the code should not be indented.
418 Trying to please everyone as much as possible, this parser will handle
419 everything except for code inside lists.
421 So an indented (4+) paragraph after a list paragraph is always a list
422 paragraph, otherwise it is a code paragraph. A paragraph that starts
423 with three backticks or three tildes is code which continues until a
424 matching string of backticks or tildes.
428 While walking the document looking for various markers we will *not*
429 use the `struct text` introduced earlier as advancing that requires
430 updating both start and length which feels clumsy. Instead we will
431 carry `pos` and `end` pointers, only the first of which needs to
434 So to start, we need to skip various parts of the document. `lws`
435 stands for "Linear White Space" and is a term that comes from the
436 Email RFCs (e.g. RFC822). `line` and `para` are self explanatory.
437 Note that `skip_para` needs to update the current line number.
438 `skip_line` doesn't but every caller should.
440 #### internal functions
442 static char *skip_lws(char *pos, char *end)
444 while (pos < end && (*pos == ' ' || *pos == '\t'))
449 static char *skip_line(char *pos, char *end)
451 while (pos < end && *pos != '\n')
458 static char *skip_para(char *pos, char *end, int *line_no)
460 /* Might return a pointer to a blank line, as only
461 * one trailing blank line is skipped
464 pos = skip_line(pos, end);
470 *(pos = skip_lws(pos, end)) != '\n') {
471 pos = skip_line(pos, end);
474 if (pos < end && *pos == '\n') {
481 ### Recognising things
483 Recognising a section header is trivial and doesn't require a
484 function. However we need to extract the content of a section header
485 as a `struct text` for passing to `section_find`.
486 Recognising the start of a new list is fairly easy. Recognising the
487 start (and end) of code is a little messy so we provide a function for
488 matching the first few characters, which has a special case for "4
491 #### internal includes
496 #### internal functions
498 static struct text take_header(char *pos, char *end)
502 while (pos < end && *pos == '#')
504 while (pos < end && *pos == ' ')
507 while (pos < end && *pos != '\n')
509 while (pos > section.txt &&
510 (pos[-1] == '#' || pos[-1] == ' '))
512 section.len = pos - section.txt;
516 static int is_list(char *pos, char *end)
518 if (strchr("-*+", *pos))
521 while (pos < end && isdigit(*pos))
523 if (pos < end && *pos == '.')
529 static int matches(char *start, char *pos, char *end)
532 return matches("\t", pos, end) ||
533 matches(" ", pos, end);
534 return (pos + strlen(start) < end &&
535 strncmp(pos, start, strlen(start)) == 0);
538 ### Extracting the code
540 Now that we can skip paragraphs and recognise what type each paragraph
541 is, it is time to parse the file and extract the code. We'll do this
542 in two parts, first we look at what to do with some code once we
543 find it, and then how to actually find it.
545 When we have some code, we know where it is, what the end marker
546 should look like, and which section it is in.
548 There are two sorts of end markers: the presence of a particular
549 string, or the absence of an indent. We will use a string to
550 represent a presence, and a `NULL` to represent the absence.
552 While looking at code we don't think about paragraphs at all - just
553 look for a line that starts with the right thing.
554 Every line that is still code then needs to be examined to see if it
555 is a section reference.
557 When a section reference is found, all preceding code (if any) must be
558 added to the current section, then the reference is added.
560 When we do find the end of the code, all text that we have found but
561 not processed needs to be saved too.
563 When adding a reference we need to set the `indent`. This is the
564 number of spaces (counting 8 for tabs) after the natural indent of the
565 code (which is a tab or 4 spaces). We use a separate function `count_spaces`
568 If there are completely blank linkes (no indent) at the end of the found code,
569 these should be considered to be spacing between the code and the next section,
570 and so no included in the code. When a marker is used to explicitly mark the
571 end of the code, we don't need to check for these blank lines.
573 #### internal functions
575 static int count_space(char *sol, char *p)
588 static char *take_code(char *pos, char *end, char *marker,
589 struct psection **table, struct text section,
593 int line_no = *line_nop;
594 int start_line = line_no;
595 struct psection *sect;
597 sect = section_find(table, section);
603 if (marker && matches(marker, pos, end))
606 (skip_lws(pos, end))[0] != '\n' &&
607 !matches(NULL, pos, end))
608 /* Paragraph not indented */
611 /* Still in code - check for reference */
616 else if (strcmp(sol, " ") == 0)
619 t = skip_lws(sol, end);
620 if (t[0] != '#' || t[1] != '#') {
621 /* Just regular code here */
622 pos = skip_line(sol, end);
630 txt.len = pos - start;
631 code_add_text(sect, txt, start_line,
634 ref = take_header(t, end);
636 struct psection *refsec = section_find(table, ref);
637 code_add_link(sect, refsec, count_space(sol, t));
639 pos = skip_line(t, end);
642 start_line = line_no;
647 txt.len = pos - start;
648 /* strip trailing blank lines */
649 while (!marker && txt.len > 2 &&
650 start[txt.len-1] == '\n' &&
651 start[txt.len-2] == '\n')
654 code_add_text(sect, txt, start_line,
658 pos = skip_line(pos, end);
667 It is when looking for the code that we actually use the paragraph
668 structure. We need to recognise section headings so we can record the
669 name, list paragraphs so we can ignore indented follow-on paragraphs,
670 and the three different markings for code.
672 #### internal functions
674 static struct psection *code_find(char *pos, char *end)
676 struct psection *table = NULL;
679 struct text section = {0};
683 section = take_header(pos, end);
685 pos = skip_line(pos, end);
687 } else if (is_list(pos, end)) {
689 pos = skip_para(pos, end, &line_no);
690 } else if (!in_list && matches(NULL, pos, end)) {
691 pos = take_code(pos, end, NULL, &table,
693 } else if (matches("```", pos, end)) {
695 pos = skip_line(pos, end);
697 pos = take_code(pos, end, "```", &table,
699 } else if (matches("~~~", pos, end)) {
701 pos = skip_line(pos, end);
703 pos = take_code(pos, end, "~~~", &table,
708 pos = skip_para(pos, end, &line_no);
714 ### Returning the code
716 Having found all the code blocks and gathered them into a list of
717 section, we are now ready to return them to the caller. This is where
718 to perform consistency checks, like at most one reference and at least
719 one definition for each section.
721 All the sections with no references are returned in a list for the
722 caller to consider. The are linearized first so that the substructure
723 is completely hidden -- except for the small amount of structure
724 displayed in the line numbers.
726 To return errors, we have the caller pass a function which takes an
727 error message - a `code_err_fn`.
731 typedef void (*code_err_fn)(char *msg);
733 #### internal functions
734 struct section *code_extract(char *pos, char *end, code_err_fn error)
736 struct psection *table;
737 struct section *result = NULL;
738 struct section *tofree = NULL;
740 table = code_find(pos, end);
743 struct psection *t = (struct psection*)table->next;
744 if (table->last == NULL) {
747 "Section \"%.*s\" is referenced but not declared",
748 table->section.len, table->section.txt);
752 if (table->refcnt == 0) {
753 /* Root-section, return it */
754 table->next = result;
756 code_linearize(result->code);
758 table->next = tofree;
760 if (table->refcnt > 1) {
763 "Section \"%.*s\" referenced multiple times (%d).",
764 table->section.len, table->section.txt,
773 struct section *t = tofree->next;
780 ##### exported functions
782 struct section *code_extract(char *pos, char *end, code_err_fn error);
786 Now that we can extract code from a document and link it all together
787 it is time to do something with that code. Firstly we need to print
790 ### Printing the Code
792 Printing is mostly straight forward - we just walk the list and print
793 the code sections, adding whatever indent is required for each line.
794 However there is a complication (isn't there always)?
796 For code that was recognised because the paragraph was indented, we
797 need to strip that indent first. For other code, we don't.
799 The approach taken here is simple, though it could arguably be wrong
800 in some unlikely cases. So it might need to be fixed later.
802 If the first line of a code block is indented, then either one tab or
803 4 spaces are striped from every non-blank line.
805 This could go wrong if the first line of a code block marked by
806 _`` ``` ``_ is indented. To overcome this we would need to
807 record some extra state in each `code_node`. For now we won't bother.
809 The indents we insert will mostly be spaces. All-spaces doesn't work
810 for `Makefiles`, so if the indent is 8 or more, we use a TAB first.
812 ##### internal functions
814 void code_node_print(FILE *out, struct code_node *node,
817 for (; node; node = node->next) {
818 char *c = node->code.txt;
819 int len = node->code.len;
824 fprintf(out, "#line %d \"%s\"\n",
825 node->line_no, fname);
827 if (node->indent >= 8)
828 fprintf(out, "\t%*s", node->indent - 8, "");
830 fprintf(out, "%*s", node->indent, "");
831 if (node->needs_strip) {
832 if (*c == '\t' && len > 1) {
835 } else if (strncmp(c, " ", 4) == 0 && len > 4) {
844 } while (len && c[-1] != '\n');
849 ###### exported functions
850 void code_node_print(FILE *out, struct code_node *node, char *fname);
852 ### Bringing it all together
854 We are just about ready for the `main` function of the tool which will
855 extract all this lovely code and compile it. Just one helper is still
858 #### Handling filenames
860 Section names are stored in `struct text` which is not `nul`
861 terminated. Filenames passed to `open` need to be null terminated.
862 So we need to convert one to the other, and strip the leading `File:`
863 of while we are at it.
865 ##### client functions
867 static void copy_fname(char *name, int space, struct text t)
872 if (len < 5 || strncmp(sec, "File:", 5) != 0)
876 while (len && sec[0] == ' ') {
882 strncpy(name, sec, len);
888 And now we take a single file name, extract the code, and if there are
889 no error we write out a file for each appropriate code section. And
892 ##### client includes
896 #include <sys/mman.h>
899 ##### client functions
902 static void pr_err(char *msg)
905 fprintf(stderr, "%s\n", msg);
908 static char *strnchr(char *haystack, int len, char needle)
910 while (len > 0 && *haystack && *haystack != needle) {
914 return len > 0 && *haystack == needle ? haystack : NULL;
917 int main(int argc, char *argv[])
922 struct text section = {NULL, 0};
923 struct section *table, *s, *prev;
926 if (argc != 2 && argc != 3) {
927 fprintf(stderr, "Usage: mdcode file.mdc [section]\n");
931 section.txt = argv[2];
932 section.len = strlen(argv[2]);
935 fd = open(argv[1], O_RDONLY);
937 fprintf(stderr, "mdcode: cannot open %s: %s\n",
938 argv[1], strerror(errno));
941 len = lseek(fd, 0, 2);
942 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
943 table = code_extract(file, file+len, pr_err);
946 (code_free(s->code), prev = s, s = s->next, free(prev))) {
949 char *spc = strnchr(s->section.txt, s->section.len, ' ');
951 if (spc > s->section.txt && spc[-1] == ':') {
952 if (strncmp(s->section.txt, "File: ", 6) != 0 &&
953 (section.txt == NULL ||
954 text_cmp(s->section, section) != 0))
955 /* Ignore this section */
958 fprintf(stderr, "Code in unreferenced section that is not ignored or a file name: %.*s\n",
959 s->section.len, s->section.txt);
964 if (text_cmp(s->section, section) == 0)
965 code_node_print(stdout, s->code, argv[1]);
968 copy_fname(fname, sizeof(fname), s->section);
970 fprintf(stderr, "Missing file name at:%.*s\n",
971 s->section.len, s->section.txt);
975 fl = fopen(fname, "w");
977 fprintf(stderr, "Cannot create %s: %s\n",
978 fname, strerror(errno));
982 code_node_print(fl, s->code, argv[1]);