aboutsummaryrefslogtreecommitdiff
path: root/src/html
diff options
context:
space:
mode:
Diffstat (limited to 'src/html')
-rw-r--r--src/html/houdini.h44
-rw-r--r--src/html/houdini_href_e.c115
-rw-r--r--src/html/houdini_html_e.c89
-rw-r--r--src/html/html.c212
4 files changed, 460 insertions, 0 deletions
diff --git a/src/html/houdini.h b/src/html/houdini.h
new file mode 100644
index 0000000..31fe917
--- /dev/null
+++ b/src/html/houdini.h
@@ -0,0 +1,44 @@
+#ifndef __HOUDINI_H__
+#define __HOUDINI_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include "buffer.h"
+
+#define likely(x) __builtin_expect((x),1)
+#define unlikely(x) __builtin_expect((x),0)
+
+#ifdef HOUDINI_USE_LOCALE
+# define _isxdigit(c) isxdigit(c)
+# define _isdigit(c) isdigit(c)
+#else
+/*
+ * Helper _isdigit methods -- do not trust the current locale
+ * */
+# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL)
+# define _isdigit(c) ((c) >= '0' && (c) <= '9')
+#endif
+
+#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10)
+#define HOUDINI_UNESCAPED_SIZE(x) (x)
+
+extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure);
+extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c
new file mode 100644
index 0000000..59fe850
--- /dev/null
+++ b/src/html/houdini_href_e.c
@@ -0,0 +1,115 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/*
+ * The following characters will not be escaped:
+ *
+ * -_.+!*'(),%#@?=;:/,+&$ alphanum
+ *
+ * Note that this character set is the addition of:
+ *
+ * - The characters which are safe to be in an URL
+ * - The characters which are *not* safe to be in
+ * an URL because they are RESERVED characters.
+ *
+ * We asume (lazily) that any RESERVED char that
+ * appears inside an URL is actually meant to
+ * have its native function (i.e. as an URL
+ * component/separator) and hence needs no escaping.
+ *
+ * There are two exceptions: the chacters & (amp)
+ * and ' (single quote) do not appear in the table.
+ * They are meant to appear in the URL as components,
+ * yet they require special HTML-entity escaping
+ * to generate valid HTML markup.
+ *
+ * All other characters will be escaped to %XX.
+ *
+ */
+static const char HREF_SAFE[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+int
+houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size)
+{
+ static const uint8_t hex_chars[] = "0123456789ABCDEF";
+ size_t i = 0, org;
+ uint8_t hex_str[3];
+
+ hex_str[0] = '%';
+
+ while (i < size) {
+ org = i;
+ while (i < size && HREF_SAFE[src[i]] != 0)
+ i++;
+
+ if (likely(i > org)) {
+ if (unlikely(org == 0)) {
+ if (i >= size)
+ return 0;
+
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
+ }
+
+ gh_buf_put(ob, src + org, i - org);
+ }
+
+ /* escaping */
+ if (i >= size)
+ break;
+
+ switch (src[i]) {
+ /* amp appears all the time in URLs, but needs
+ * HTML-entity escaping to be inside an href */
+ case '&':
+ gh_buf_puts(ob, "&amp;");
+ break;
+
+ /* the single quote is a valid URL character
+ * according to the standard; it needs HTML
+ * entity escaping too */
+ case '\'':
+ gh_buf_puts(ob, "&#x27;");
+ break;
+
+ /* the space can be escaped to %20 or a plus
+ * sign. we're going with the generic escape
+ * for now. the plus thing is more commonly seen
+ * when building GET strings */
+#if 0
+ case ' ':
+ gh_buf_putc(ob, '+');
+ break;
+#endif
+
+ /* every other character goes with a %XX escaping */
+ default:
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
+ hex_str[2] = hex_chars[src[i] & 0xF];
+ gh_buf_put(ob, hex_str, 3);
+ }
+
+ i++;
+ }
+
+ return 1;
+}
diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c
new file mode 100644
index 0000000..316c5ce
--- /dev/null
+++ b/src/html/houdini_html_e.c
@@ -0,0 +1,89 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/**
+ * According to the OWASP rules:
+ *
+ * & --> &amp;
+ * < --> &lt;
+ * > --> &gt;
+ * " --> &quot;
+ * ' --> &#x27; &apos; is not recommended
+ * / --> &#x2F; forward slash is included as it helps end an HTML entity
+ *
+ */
+static const char HTML_ESCAPE_TABLE[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const char *HTML_ESCAPES[] = {
+ "",
+ "&quot;",
+ "&amp;",
+ "&#39;",
+ "&#47;",
+ "&lt;",
+ "&gt;"
+};
+
+int
+houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure)
+{
+ size_t i = 0, org, esc = 0;
+
+ while (i < size) {
+ org = i;
+ while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0)
+ i++;
+
+ if (i > org) {
+ if (unlikely(org == 0)) {
+ if (i >= size)
+ return 0;
+
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
+ }
+
+ gh_buf_put(ob, src + org, i - org);
+ }
+
+ /* escaping */
+ if (unlikely(i >= size))
+ break;
+
+ /* The forward slash is only escaped in secure mode */
+ if (src[i] == '/' && !secure) {
+ gh_buf_putc(ob, '/');
+ } else {
+ gh_buf_puts(ob, HTML_ESCAPES[esc]);
+ }
+
+ i++;
+ }
+
+ return 1;
+}
+
+int
+houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size)
+{
+ return houdini_escape_html0(ob, src, size, 1);
+}
diff --git a/src/html/html.c b/src/html/html.c
new file mode 100644
index 0000000..2f160ca
--- /dev/null
+++ b/src/html/html.c
@@ -0,0 +1,212 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "stmd.h"
+#include "debug.h"
+#include "scanners.h"
+#include "html/houdini.h"
+
+// Functions to convert block and inline lists to HTML strings.
+
+static void escape_html(gh_buf *dest, const unsigned char *source, int length)
+{
+ if (length < 0)
+ length = strlen((char *)source);
+
+ houdini_escape_html0(dest, source, (size_t)length, 0);
+}
+
+static void escape_href(gh_buf *dest, const unsigned char *source, int length)
+{
+ if (length < 0)
+ length = strlen((char *)source);
+
+ houdini_escape_href(dest, source, (size_t)length);
+}
+
+static inline void cr(gh_buf *html)
+{
+ if (html->size && html->ptr[html->size - 1] != '\n')
+ gh_buf_putc(html, '\n');
+}
+
+// Convert a block list to HTML. Returns 0 on success, and sets result.
+void blocks_to_html(gh_buf *html, block *b, bool tight)
+{
+ struct ListData *data;
+
+ while(b != NULL) {
+ switch(b->tag) {
+ case document:
+ blocks_to_html(html, b->children, false);
+ break;
+
+ case paragraph:
+ if (tight) {
+ inlines_to_html(html, b->inline_content);
+ } else {
+ cr(html);
+ gh_buf_puts(html, "<p>");
+ inlines_to_html(html, b->inline_content);
+ gh_buf_puts(html, "</p>");
+ cr(html);
+ }
+ break;
+
+ case block_quote:
+ cr(html);
+ gh_buf_puts(html, "<blockquote>");
+ blocks_to_html(html, b->children, false);
+ gh_buf_puts(html, "</blockquote>");
+ cr(html);
+ break;
+
+ case list_item:
+ cr(html);
+ gh_buf_puts(html, "<li>");
+ blocks_to_html(html, b->children, tight);
+ gh_buf_trim(html);
+ gh_buf_puts(html, "</li>");
+ cr(html);
+ break;
+
+ case list:
+ // make sure a list starts at the beginning of the line:
+ cr(html);
+ data = &(b->attributes.list_data);
+
+ if (data->start > 1) {
+ gh_buf_printf(html, "<%s start=\"%d\">\n",
+ data->list_type == bullet ? "ul" : "ol",
+ data->start);
+ } else {
+ gh_buf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n");
+ }
+
+ blocks_to_html(html, b->children, data->tight);
+ gh_buf_puts(html, data->list_type == bullet ? "</ul>" : "</ol>");
+ cr(html);
+ break;
+
+ case atx_header:
+ case setext_header:
+ cr(html);
+ gh_buf_printf(html, "<h%d>", b->attributes.header_level);
+ inlines_to_html(html, b->inline_content);
+ gh_buf_printf(html, "</h%d>", b->attributes.header_level);
+ cr(html);
+ break;
+
+ case indented_code:
+ case fenced_code:
+ /* TODO: fenced code lang attributes */
+ cr(html);
+ gh_buf_puts(html, "<pre><code>");
+ escape_html(html, b->string_content.ptr, b->string_content.size);
+ gh_buf_puts(html, "</pre></code>");
+ cr(html);
+ break;
+
+ case html_block:
+ gh_buf_put(html, b->string_content.ptr, b->string_content.size);
+ break;
+
+ case hrule:
+ gh_buf_puts(html, "<hr />");
+ cr(html);
+ break;
+
+ case reference_def:
+ break;
+
+ default:
+ assert(false);
+ }
+
+ b = b->next;
+ }
+}
+
+// Convert an inline list to HTML. Returns 0 on success, and sets result.
+void inlines_to_html(gh_buf *html, inl* ils)
+{
+ gh_buf scrap = GH_BUF_INIT;
+
+ while(ils != NULL) {
+ switch(ils->tag) {
+ case INL_STRING:
+ escape_html(html, ils->content.literal.data, ils->content.literal.len);
+ break;
+
+ case INL_LINEBREAK:
+ gh_buf_puts(html, "<br />\n");
+ break;
+
+ case INL_SOFTBREAK:
+ gh_buf_putc(html, '\n');
+ break;
+
+ case INL_CODE:
+ gh_buf_puts(html, "<code>");
+ escape_html(html, ils->content.literal.data, ils->content.literal.len);
+ gh_buf_puts(html, "</code>");
+ break;
+
+ case INL_RAW_HTML:
+ case INL_ENTITY:
+ gh_buf_put(html,
+ ils->content.literal.data,
+ ils->content.literal.len);
+ break;
+
+ case INL_LINK:
+ gh_buf_puts(html, "<a href=\"");
+ escape_href(html, ils->content.linkable.url, -1);
+
+ if (ils->content.linkable.title) {
+ gh_buf_puts(html, "\" title=\"");
+ escape_html(html, ils->content.linkable.title, -1);
+ }
+
+ gh_buf_puts(html, "\">");
+ inlines_to_html(html, ils->content.inlines);
+ gh_buf_puts(html, "</a>");
+ break;
+
+ case INL_IMAGE:
+ gh_buf_puts(html, "<img src=\"");
+ escape_href(html, ils->content.linkable.url, -1);
+
+ inlines_to_html(&scrap, ils->content.inlines);
+ if (scrap.size) {
+ gh_buf_puts(html, "\" alt=\"");
+ escape_html(html, scrap.ptr, scrap.size);
+ }
+ gh_buf_clear(&scrap);
+
+ if (ils->content.linkable.title) {
+ gh_buf_puts(html, "\" title=\"");
+ escape_html(html, ils->content.linkable.title, -1);
+ }
+
+ gh_buf_puts(html, "\"/>");
+ break;
+
+ case INL_STRONG:
+ gh_buf_puts(html, "<strong>");
+ inlines_to_html(html, ils->content.inlines);
+ gh_buf_puts(html, "</strong>");
+ break;
+
+ case INL_EMPH:
+ gh_buf_puts(html, "<em>");
+ inlines_to_html(html, ils->content.inlines);
+ gh_buf_puts(html, "</em>");
+ break;
+ }
+ ils = ils->next;
+ }
+}