From c28af79329264a7cf331a1b1c414919e4ed9e9f9 Mon Sep 17 00:00:00 2001
From: Vicent Marti <tanoku@gmail.com>
Date: Tue, 2 Sep 2014 13:37:34 +0200
Subject: It buiiiilds

---
 src/blocks.c              |  11 ++-
 src/buffer.c              |   6 +-
 src/html/houdini.h        |  44 ++++++++++
 src/html/houdini_href_e.c | 115 +++++++++++++++++++++++++
 src/html/houdini_html_e.c |  89 +++++++++++++++++++
 src/html/html.c           | 212 ++++++++++++++++++++++++++++++++++++++++++++++
 src/stmd.h                |   1 +
 src/utf8.c                |   7 +-
 8 files changed, 473 insertions(+), 12 deletions(-)
 create mode 100644 src/html/houdini.h
 create mode 100644 src/html/houdini_href_e.c
 create mode 100644 src/html/houdini_html_e.c
 create mode 100644 src/html/html.c

diff --git a/src/blocks.c b/src/blocks.c
index 71dc830..42f20db 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -8,6 +8,7 @@
 #include "scanners.h"
 #include "uthash.h"
 
+static void incorporate_line(gh_buf *ln, int line_number, block** curptr);
 static void finalize(block* b, int line_number);
 
 static block* make_block(int tag, int start_line, int start_column)
@@ -390,7 +391,7 @@ static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)
 	}
 }
 
-static block *finalize_parsing(block *document, int linenum)
+static block *finalize_document(block *document, int linenum)
 {
 	while (document != document->top) {
 		finalize(document, linenum);
@@ -411,7 +412,7 @@ extern block *stmd_parse_file(FILE *f)
 	block *document = make_document();
 
 	while (fgets((char *)buffer, sizeof(buffer), f)) {
-		expand_tabs(&line, buffer, strlen(buffer));
+		expand_tabs(&line, buffer, strlen((char *)buffer));
 		incorporate_line(&line, linenum, &document);
 		gh_buf_clear(&line);
 		linenum++;
@@ -429,7 +430,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
 	block *document = make_document();
 
 	while (buffer < end) {
-		const char *eol = memchr(buffer, '\n', end - buffer);
+		const unsigned char *eol = memchr(buffer, '\n', end - buffer);
 
 		if (!eol) {
 			expand_tabs(&line, buffer, end - buffer);
@@ -449,9 +450,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
 }
 
 // Process one line at a time, modifying a block.
-// Returns 0 if successful.  curptr is changed to point to
-// the currently open block.
-extern void incorporate_line(gh_buf *ln, int line_number, block** curptr)
+static void incorporate_line(gh_buf *ln, int line_number, block** curptr)
 {
 	block* last_matched_container;
 	int offset = 0;
diff --git a/src/buffer.c b/src/buffer.c
index 17dc864..cfc6a7e 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -245,11 +245,11 @@ int gh_buf_cmp(const gh_buf *a, const gh_buf *b)
 
 int gh_buf_strchr(const gh_buf *buf, int c, int pos)
 {
-	const char *p = memchr(buf->ptr + pos, c, buf->size - pos);
+	const unsigned char *p = memchr(buf->ptr + pos, c, buf->size - pos);
 	if (!p)
 		return -1;
 
-	return (int)(p - buf->ptr);
+	return (int)(p - (const unsigned char *)buf->ptr);
 }
 
 int gh_buf_strrchr(const gh_buf *buf, int c, int pos)
@@ -264,7 +264,7 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos)
 	return -1;
 }
 
-void gh_buf_truncate(gh_buf *buf, size_t len)
+void gh_buf_truncate(gh_buf *buf, int len)
 {
 	if (len < buf->size) {
 		buf->size = len;
diff --git a/src/html/houdini.h b/src/html/houdini.h
new file mode 100644
index 0000000..31fe917
--- /dev/null
+++ b/src/html/houdini.h
@@ -0,0 +1,44 @@
+#ifndef __HOUDINI_H__
+#define __HOUDINI_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include "buffer.h"
+
+#define likely(x)       __builtin_expect((x),1)
+#define unlikely(x)     __builtin_expect((x),0)
+
+#ifdef HOUDINI_USE_LOCALE
+#	define _isxdigit(c) isxdigit(c)
+#	define _isdigit(c) isdigit(c)
+#else
+/* Helper _isdigit methods -- do not trust the current locale.
+ * memchr() over exactly the 22 digit characters is used instead of strchr()
+ * so that c == '\0' is not matched against the string terminator. */
+#	define _isxdigit(c) (memchr("0123456789ABCDEFabcdef", (c), 22) != NULL)
+#	define _isdigit(c) ((c) >= '0' && (c) <= '9')
+#endif
+
+#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10)
+#define HOUDINI_UNESCAPED_SIZE(x) (x)
+
+extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure);
+extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c
new file mode 100644
index 0000000..59fe850
--- /dev/null
+++ b/src/html/houdini_href_e.c
@@ -0,0 +1,115 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/*
+ * The following characters will not be escaped:
+ *
+ *		-_.+!*'(),%#@?=;:/,+&$ alphanum
+ *
+ * Note that this character set is the addition of:
+ *
+ *	- The characters which are safe to be in an URL
+ *	- The characters which are *not* safe to be in
+ *	an URL because they are RESERVED characters.
+ *
+ * We assume (lazily) that any RESERVED char that
+ * appears inside an URL is actually meant to
+ * have its native function (i.e. as an URL
+ * component/separator) and hence needs no escaping.
+ *
+ * There are two exceptions: the characters & (amp)
+ * and ' (single quote) do not appear in the table.
+ * They are meant to appear in the URL as components,
+ * yet they require special HTML-entity escaping
+ * to generate valid HTML markup.
+ *
+ * All other characters will be escaped to %XX.
+ *
+ */
+static const char HREF_SAFE[] = { /* indexed by byte value; nonzero = houdini_escape_href copies the byte verbatim */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x0F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1F */
+	0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F: space " & ' are 0 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /* 0x30-0x3F: < > are 0 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x50-0x5F: [ \ ] ^ are 0 */
+	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F: ` is 0 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70-0x7F: { | } ~ DEL are 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0-0xCF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0-0xDF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0-0xEF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
+};
+
+int /* 0: every byte was safe and nothing was appended to ob; 1: otherwise */
+houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size)
+{
+	static const uint8_t hex_chars[] = "0123456789ABCDEF";
+	size_t  i = 0, org;
+	uint8_t hex_str[3]; /* "%XX" scratch, written with an explicit length of 3 */
+
+	hex_str[0] = '%';
+
+	while (i < size) {
+		org = i;
+		while (i < size && HREF_SAFE[src[i]] != 0) /* scan a run of safe bytes */
+			i++;
+
+		if (likely(i > org)) {
+			if (unlikely(org == 0)) {
+				if (i >= size)
+					return 0; /* whole input is safe: ob is left untouched, caller must use src */
+
+				gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
+			}
+
+			gh_buf_put(ob, src + org, i - org); /* copy the safe run verbatim */
+		}
+
+		/* escaping */
+		if (i >= size)
+			break;
+
+		switch (src[i]) {
+		/* amp appears all the time in URLs, but needs
+		 * HTML-entity escaping to be inside an href */
+		case '&':
+			gh_buf_puts(ob, "&amp;");
+			break;
+
+		/* the single quote is a valid URL character
+		 * according to the standard; it needs HTML
+		 * entity escaping too */
+		case '\'':
+			gh_buf_puts(ob, "&#x27;");
+			break;
+
+		/* the space can be escaped to %20 or a plus
+		 * sign. we're going with the generic escape
+		 * for now. the plus thing is more commonly seen
+		 * when building GET strings */
+#if 0
+		case ' ':
+			gh_buf_putc(ob, '+');
+			break;
+#endif
+
+		/* every other character goes with a %XX escaping */
+		default:
+			hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; /* high nibble */
+			hex_str[2] = hex_chars[src[i] & 0xF]; /* low nibble */
+			gh_buf_put(ob, hex_str, 3);
+		}
+
+		i++; /* step past the byte that was just escaped */
+	}
+
+	return 1;
+}
diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c
new file mode 100644
index 0000000..316c5ce
--- /dev/null
+++ b/src/html/houdini_html_e.c
@@ -0,0 +1,89 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/**
+ * According to the OWASP rules:
+ *
+ * & --> &amp;
+ * < --> &lt;
+ * > --> &gt;
+ * " --> &quot;
+ * ' --> &#39;      (same as &#x27;) &apos; is not recommended
+ * / --> &#47;      (same as &#x2F;) forward slash is included as it helps end an HTML entity
+ *
+ */
+static const char HTML_ESCAPE_TABLE[] = { /* indexed by byte value; 0 = no escape, else index into HTML_ESCAPES */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x0F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1F */
+	0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, /* 0x20-0x2F: " = 1, & = 2, ' = 3, / = 4 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, /* 0x30-0x3F: < = 5, > = 6 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40-0x4F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50-0x5F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60-0x6F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70-0x7F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0-0xCF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0-0xDF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0-0xEF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
+};
+
+static const char *HTML_ESCAPES[] = { /* replacement text, indexed by HTML_ESCAPE_TABLE values */
+        "",       /* 0: unused, table value 0 means "no escape" */
+        "&quot;", /* 1: double quote */
+        "&amp;",  /* 2: ampersand */
+        "&#39;",  /* 3: single quote */
+        "&#47;",  /* 4: forward slash */
+        "&lt;",   /* 5: less-than */
+        "&gt;"    /* 6: greater-than */
+};
+
+int /* 0: no byte needed escaping and nothing was appended to ob; 1: otherwise */
+houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure)
+{
+	size_t  i = 0, org, esc = 0;
+
+	while (i < size) {
+		org = i;
+		while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) /* esc ends up as the HTML_ESCAPES index of src[i] */
+			i++;
+
+		if (i > org) {
+			if (unlikely(org == 0)) {
+				if (i >= size)
+					return 0; /* whole input is clean: ob is left untouched, caller must use src */
+
+				gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
+			}
+
+			gh_buf_put(ob, src + org, i - org); /* copy the clean run verbatim */
+		}
+
+		/* escaping */
+		if (unlikely(i >= size))
+			break;
+
+		/* The forward slash is only escaped in secure mode */
+		if (src[i] == '/' && !secure) {
+			gh_buf_putc(ob, '/');
+		} else {
+			gh_buf_puts(ob, HTML_ESCAPES[esc]);
+		}
+
+		i++; /* step past the byte that was just handled */
+	}
+
+	return 1;
+}
+
+int /* same return contract as houdini_escape_html0 */
+houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size)
+{
+	return houdini_escape_html0(ob, src, size, 1); /* secure = 1: '/' is escaped as well */
+}
diff --git a/src/html/html.c b/src/html/html.c
new file mode 100644
index 0000000..2f160ca
--- /dev/null
+++ b/src/html/html.c
@@ -0,0 +1,212 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "stmd.h"
+#include "debug.h"
+#include "scanners.h"
+#include "html/houdini.h"
+
+// Functions to convert block and inline lists to HTML strings.
+
+static void escape_html(gh_buf *dest, const unsigned char *source, int length)
+{
+	if (length < 0) /* negative length: source is a NUL-terminated string */
+		length = strlen((char *)source);
+	if (!houdini_escape_html0(dest, source, (size_t)length, 0))
+		gh_buf_put(dest, source, length); /* 0 = nothing needed escaping and nothing was appended: copy as-is */
+}
+
+static void escape_href(gh_buf *dest, const unsigned char *source, int length)
+{
+	if (length < 0) /* negative length: source is a NUL-terminated string */
+		length = strlen((char *)source);
+	if (!houdini_escape_href(dest, source, (size_t)length))
+		gh_buf_put(dest, source, length); /* 0 = URL was entirely safe and nothing was appended: copy as-is */
+}
+
+static inline void cr(gh_buf *html) /* append '\n' unless html is empty or already ends in '\n' */
+{
+	if (html->size && html->ptr[html->size - 1] != '\n')
+		gh_buf_putc(html, '\n');
+}
+
+// Convert the block list starting at b to HTML, appending to html; tight omits <p> around paragraphs.
+void blocks_to_html(gh_buf *html, block *b, bool tight)
+{
+	struct ListData *data;
+
+	while(b != NULL) {
+		switch(b->tag) {
+			case document:
+				blocks_to_html(html, b->children, false);
+				break;
+
+			case paragraph:
+				if (tight) { /* tight: emit the inline content without a <p> wrapper */
+					inlines_to_html(html, b->inline_content);
+				} else {
+					cr(html);
+					gh_buf_puts(html, "<p>");
+					inlines_to_html(html, b->inline_content);
+					gh_buf_puts(html, "</p>");
+					cr(html);
+				}
+				break;
+
+			case block_quote:
+				cr(html);
+				gh_buf_puts(html, "<blockquote>");
+				blocks_to_html(html, b->children, false);
+				gh_buf_puts(html, "</blockquote>");
+				cr(html);
+				break;
+
+			case list_item:
+				cr(html);
+				gh_buf_puts(html, "<li>");
+				blocks_to_html(html, b->children, tight); /* item inherits the list's tightness */
+				gh_buf_trim(html);
+				gh_buf_puts(html, "</li>");
+				cr(html);
+				break;
+
+			case list:
+				// make sure a list starts at the beginning of the line:
+				cr(html);
+				data = &(b->attributes.list_data);
+
+				if (data->start > 1) { /* only emit start="N" when numbering begins above 1 */
+					gh_buf_printf(html, "<%s start=\"%d\">\n",
+							data->list_type == bullet ? "ul" : "ol",
+							data->start);
+				} else {
+					gh_buf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n");
+				}
+
+				blocks_to_html(html, b->children, data->tight);
+				gh_buf_puts(html, data->list_type == bullet ? "</ul>" : "</ol>");
+				cr(html);
+				break;
+
+			case atx_header:
+			case setext_header:
+				cr(html);
+				gh_buf_printf(html, "<h%d>", b->attributes.header_level);
+				inlines_to_html(html, b->inline_content);
+				gh_buf_printf(html, "</h%d>", b->attributes.header_level);
+				cr(html);
+				break;
+
+			case indented_code:
+			case fenced_code:
+				/* TODO: fenced code lang attributes */
+				cr(html);
+				gh_buf_puts(html, "<pre><code>");
+				escape_html(html, b->string_content.ptr, b->string_content.size);
+				gh_buf_puts(html, "</code></pre>"); /* close in reverse order of opening */
+				cr(html);
+				break;
+
+			case html_block:
+				gh_buf_put(html, b->string_content.ptr, b->string_content.size); /* raw HTML: copied unescaped */
+				break;
+
+			case hrule:
+				gh_buf_puts(html, "<hr />");
+				cr(html);
+				break;
+
+			case reference_def: /* produces no output */
+				break;
+
+			default:
+				assert(false);
+		}
+
+		b = b->next;
+	}
+}
+
+// Convert the inline list starting at ils to HTML, appending the output to html.
+void inlines_to_html(gh_buf *html, inl* ils)
+{
+	gh_buf scrap = GH_BUF_INIT; /* scratch buffer for rendering image alt text */
+
+	while(ils != NULL) {
+		switch(ils->tag) {
+			case INL_STRING:
+				escape_html(html, ils->content.literal.data, ils->content.literal.len);
+				break;
+
+			case INL_LINEBREAK:
+				gh_buf_puts(html, "<br />\n");
+				break;
+
+			case INL_SOFTBREAK:
+				gh_buf_putc(html, '\n');
+				break;
+
+			case INL_CODE:
+				gh_buf_puts(html, "<code>");
+				escape_html(html, ils->content.literal.data, ils->content.literal.len);
+				gh_buf_puts(html, "</code>");
+				break;
+
+			case INL_RAW_HTML:
+			case INL_ENTITY:
+				gh_buf_put(html, ils->content.literal.data, ils->content.literal.len); /* copied unescaped */
+				break;
+
+			case INL_LINK:
+				gh_buf_puts(html, "<a href=\"");
+				escape_href(html, ils->content.linkable.url, -1);
+
+				if (ils->content.linkable.title) {
+					gh_buf_puts(html, "\" title=\"");
+					escape_html(html, ils->content.linkable.title, -1);
+				}
+
+				gh_buf_puts(html, "\">");
+				inlines_to_html(html, ils->content.inlines);
+				gh_buf_puts(html, "</a>");
+				break;
+
+			case INL_IMAGE:
+				gh_buf_puts(html, "<img src=\"");
+				escape_href(html, ils->content.linkable.url, -1);
+
+				inlines_to_html(&scrap, ils->content.inlines); /* render the description into scrap first */
+				if (scrap.size) {
+					gh_buf_puts(html, "\" alt=\"");
+					escape_html(html, scrap.ptr, scrap.size);
+				}
+				gh_buf_clear(&scrap);
+
+				if (ils->content.linkable.title) {
+					gh_buf_puts(html, "\" title=\"");
+					escape_html(html, ils->content.linkable.title, -1);
+				}
+
+				gh_buf_puts(html, "\"/>");
+				break;
+
+			case INL_STRONG:
+				gh_buf_puts(html, "<strong>");
+				inlines_to_html(html, ils->content.inlines);
+				gh_buf_puts(html, "</strong>");
+				break;
+
+			case INL_EMPH:
+				gh_buf_puts(html, "<em>");
+				inlines_to_html(html, ils->content.inlines);
+				gh_buf_puts(html, "</em>");
+				break;
+		}
+		ils = ils->next;
+	}
+	/* scrap may own storage from the image case above; release it before returning */
+	gh_buf_free(&scrap);
+}
diff --git a/src/stmd.h b/src/stmd.h
index 1e490d6..3e284bd 100644
--- a/src/stmd.h
+++ b/src/stmd.h
@@ -1,4 +1,5 @@
 #include <stdbool.h>
+#include <stdio.h>
 #include "buffer.h"
 #include "uthash.h"
 
diff --git a/src/utf8.c b/src/utf8.c
index e3f8dd3..32c78a4 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -1,6 +1,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <unistd.h>
+#include <assert.h>
 
 #include "stmd.h"
 
@@ -83,9 +84,9 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst)
 	return length;
 }
 
-void utf8_encode_char(int32_t uc, gh_buf *buf)
+void utf8proc_encode_char(int32_t uc, gh_buf *buf)
 {
-	char dst[4];
+	unsigned char dst[4];
 	int len = 0;
 
 	if (uc < 0x00) {
@@ -99,7 +100,7 @@ void utf8_encode_char(int32_t uc, gh_buf *buf)
 		len = 2;
 	} else if (uc == 0xFFFF) {
 		dst[0] = 0xFF;
-		return 1;
+		len = 1;
 	} else if (uc == 0xFFFE) {
 		dst[0] = 0xFE;
 		len = 1;
-- 
cgit v1.2.3