diff options
author | John MacFarlane <jgm@berkeley.edu> | 2014-12-27 21:51:30 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2014-12-28 16:27:00 -0800 |
commit | d57f3952ca8b9aac16db8243539f4c1c5dbf3c93 (patch) | |
tree | 21ff4ae66cc5d6130963172df2badb3a77a4930e | |
parent | bf44064d09afd04039058a00c32c1532fb5e2b61 (diff) |
Added xml writer, to dump the AST in XML.
This is a work-in-progress.
CommonMark.dtd gives the DTD for the generated XML.
Closes #53.
-rw-r--r-- | CommonMark.dtd | 45 | ||||
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/cmark.h | 5 | ||||
-rw-r--r-- | src/main.c | 8 | ||||
-rw-r--r-- | src/xml.c | 140 |
5 files changed, 198 insertions, 1 deletions
diff --git a/CommonMark.dtd b/CommonMark.dtd new file mode 100644 index 0000000..4ad924b --- /dev/null +++ b/CommonMark.dtd @@ -0,0 +1,45 @@ +<!-- DTD for CommonMark xml export format --> +<!ELEMENT DOCUMENT +(BLOCK_QUOTE|LIST|CODE_BLOCK|HTML|PARAGRAPH|HEADER|HRULE)*> +<!ELEMENT BLOCK_QUOTE +(BLOCK_QUOTE|LIST|CODE_BLOCK|HTML|PARAGRAPH|HEADER|HRULE)*> +<!ELEMENT LIST (LIST_ITEM)+> +<!ATTLIST LIST + type (bullet|ordered) #REQUIRED + start CDATA #IMPLIED + tight (true|false) #REQUIRED + delimiter (period|paren) #IMPLIED> +<!ELEMENT LIST_ITEM +(BLOCK_QUOTE|LIST|CODE_BLOCK|HTML|PARAGRAPH|HEADER|HRULE)*> +<!ELEMENT CODE_BLOCK (#PCDATA)> +<!ATTLIST CODE_BLOCK + xml:space CDATA #FIXED "preserve" + info CDATA #IMPLIED> +<!ELEMENT HTML (#PCDATA)> +<!ATTLIST HTML xml:space CDATA #FIXED "preserve"> +<!ELEMENT PARAGRAPH +(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*> +<!ELEMENT HEADER +(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*> +<!ATTLIST HEADER level (1|2|3|4|5|6) #REQUIRED> +<!ELEMENT HRULE EMPTY> +<!ELEMENT TEXT (#PCDATA)> +<!ELEMENT SOFTBREAK EMPTY> +<!ELEMENT LINEBREAK EMPTY> +<!ELEMENT CODE (#PCDATA)> +<!ATTLIST CODE xml:space CDATA #FIXED "preserve"> +<!ELEMENT INLINE_HTML (#PCDATA)> +<!ATTLIST INLINE_HTML xml:space CDATA #FIXED "preserve"> +<!ELEMENT EMPH +(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*> +<!ELEMENT STRONG +(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*> +<!ELEMENT LINK +(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*> +<!ATTLIST LINK url CDATA #REQUIRED> +<!ATTLIST LINK title CDATA #IMPLIED> +<!ELEMENT IMAGE +(TEXT|SOFTBREAK|LINEBREAK|CODE|INLINE_HTML|EMPH|STRONG|LINK|IMAGE)*> +<!ATTLIST IMAGE url CDATA #REQUIRED> +<!ATTLIST IMAGE title CDATA #IMPLIED> +<!ATTLIST ANY sourcepos CDATA #IMPLIED> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8eb198e..ef26bef 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,6 +29,7 @@ set(LIBRARY_SOURCES buffer.c references.c man.c + xml.c html.c html_unescape.gperf houdini_href_e.c diff --git a/src/cmark.h b/src/cmark.h index 3b60d67..1dab0dd 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -394,6 +394,11 @@ cmark_node *cmark_parse_file(FILE *f); CMARK_EXPORT char *cmark_render_ast(cmark_node *root); +/** Render a 'node' tree as XML. + */ +CMARK_EXPORT +char *cmark_render_xml(cmark_node *root); + /** Render a 'node' tree as an HTML fragment. It is up to the user * to add an appropriate header and footer. */ @@ -10,6 +10,7 @@ typedef enum { FORMAT_NONE, FORMAT_HTML, + FORMAT_XML, FORMAT_MAN, FORMAT_AST } writer_format; @@ -18,7 +19,7 @@ void print_usage() { printf("Usage: cmark [FILE*]\n"); printf("Options:\n"); - printf(" --to, -t FORMAT Specify output format (html, man, ast)\n"); + printf(" --to, -t FORMAT Specify output format (html, xml, man, ast)\n"); printf(" --help, -h Print usage information\n"); printf(" --version Print version\n"); } @@ -33,6 +34,9 @@ static void print_document(cmark_node *document, writer_format writer) case FORMAT_HTML: result = cmark_render_html(document); break; + case FORMAT_XML: + result = cmark_render_xml(document); + break; case FORMAT_MAN: result = cmark_render_man(document); break; @@ -74,6 +78,8 @@ int main(int argc, char *argv[]) writer = FORMAT_MAN; } else if (strcmp(argv[i], "html") == 0) { writer = FORMAT_HTML; + } else if (strcmp(argv[i], "xml") == 0) { + writer = FORMAT_XML; } else if (strcmp(argv[i], "ast") == 0) { writer = FORMAT_AST; } else { diff --git a/src/xml.c b/src/xml.c new file mode 100644 index 0000000..86fb6d4 --- /dev/null +++ b/src/xml.c @@ -0,0 +1,140 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "config.h" +#include "cmark.h" +#include "node.h" +#include "buffer.h" +#include "houdini.h" + +// Functions to convert cmark_nodes to XML strings. + +static void escape_xml(cmark_strbuf *dest, const unsigned char *source, int length) +{ + if (source != NULL) { + if (length < 0) + length = strlen((char *)source); + + houdini_escape_html0(dest, source, (size_t)length, 0); + } +} + +struct render_state { + cmark_strbuf* xml; + int indent; +}; + +static inline void indent(struct render_state *state) +{ + int i; + for (i = 0; i < state->indent; i++) { + cmark_strbuf_putc(state->xml, ' '); + } +} + +static int +S_render_node(cmark_node *node, cmark_event_type ev_type, void *vstate) +{ + struct render_state *state = vstate; + cmark_strbuf *xml = state->xml; + bool literal = false; + + bool entering = (ev_type == CMARK_EVENT_ENTER); + + if (entering) { + indent(state); + cmark_strbuf_printf(xml, "<%s", cmark_node_type_string(node)); + + if (node->start_line != 0) { + cmark_strbuf_printf(xml, " sourcepos=\"%d:%d-%d\"", + node->start_line, + node->start_column, + node->end_line); + } + + literal = false; + + switch (node->type) { + case CMARK_NODE_TEXT: + case CMARK_NODE_CODE: + case CMARK_NODE_HTML: + case CMARK_NODE_INLINE_HTML: + cmark_strbuf_puts(xml, ">"); + escape_xml(xml, node->as.literal.data, + node->as.literal.len); + cmark_strbuf_puts(xml, "</"); + cmark_strbuf_puts(xml, cmark_node_type_string(node)); + literal = true; + break; + case CMARK_NODE_CODE_BLOCK: + if (node->as.code.info.len > 0) { + cmark_strbuf_puts(xml, " info=\""); + escape_xml(xml, node->as.code.info.data, + node->as.code.info.len); + cmark_strbuf_putc(xml, '"'); + } + cmark_strbuf_puts(xml, ">"); + escape_xml(xml, node->as.code.literal.data, + node->as.code.literal.len); + cmark_strbuf_puts(xml, "</"); + cmark_strbuf_puts(xml, cmark_node_type_string(node)); + literal = true; + break; + case CMARK_NODE_LINK: + case CMARK_NODE_IMAGE: + cmark_strbuf_puts(xml, " url=\""); + escape_xml(xml, node->as.link.url, -1); + cmark_strbuf_putc(xml, '"'); + cmark_strbuf_puts(xml, " title=\""); + escape_xml(xml, node->as.link.title, -1); + cmark_strbuf_putc(xml, '"'); + break; + default: + break; + } + if (node->first_child) { + state->indent += 2; + } else if (!literal) { + cmark_strbuf_puts(xml, " /"); + } + + } else { + if (node->first_child) { + state->indent -= 2; + } + indent(state); + cmark_strbuf_printf(xml, "</%s", cmark_node_type_string(node)); + } + + // TODO print attributes + + cmark_strbuf_puts(xml, ">\n"); + + return 1; +} + +char *cmark_render_xml(cmark_node *root) +{ + char *result; + cmark_strbuf xml = GH_BUF_INIT; + cmark_event_type ev_type; + cmark_node *cur; + struct render_state state = { &xml, 0 }; + cmark_iter *iter = cmark_iter_new(root); + + cmark_strbuf_puts(state.xml, + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); + cmark_strbuf_puts(state.xml, + "<!DOCTYPE CommonMark SYSTEM \"CommonMark.dtd\">\n"); + while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { + cur = cmark_iter_get_node(iter); + S_render_node(cur, ev_type, &state); + } + result = (char *)cmark_strbuf_detach(&xml); + + cmark_iter_free(iter); + cmark_strbuf_free(&xml); + return result; +} |