From ff9c0dcecd1314b820bf7d2584990c26c0e28909 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Mon, 24 Nov 2014 20:10:49 +0100
Subject: Validate UTF-8 input

Invalid UTF-8 byte sequences are replaced with the Unicode replacement
character U+FFFD. Fixes #213.
---
 api_test/main.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 117 insertions(+), 7 deletions(-)

(limited to 'api_test')

diff --git a/api_test/main.c b/api_test/main.c
index 06d9be2..2d65a46 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -8,6 +8,8 @@
 
 #include "harness.h"
 
+#define UTF8_REPL "\xEF\xBF\xBD"
+
 static const cmark_node_type node_types[] = {
 	CMARK_NODE_DOCUMENT,
 	CMARK_NODE_BLOCK_QUOTE,
@@ -31,10 +33,25 @@ static const cmark_node_type node_types[] = {
 };
 static const int num_node_types = sizeof(node_types) / sizeof(*node_types);
 
+static void
+test_md_to_html(test_batch_runner *runner, const char *markdown,
+		const char *expected_html, const char *msg);
+
 static void
 test_content(test_batch_runner *runner, cmark_node_type type,
 	     int allowed_content);
 
+static void
+test_char(test_batch_runner *runner, int valid, const char *utf8,
+	  const char *msg);
+
+static void
+test_incomplete_char(test_batch_runner *runner, const char *utf8,
+		     const char *msg);
+
+static void
+test_continuation_byte(test_batch_runner *runner, const char *utf8);
+
 static void
 constructor(test_batch_runner *runner)
 {
@@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type,
 static void
 parser(test_batch_runner *runner)
 {
-	static const char markdown[] = "No newline";
-	cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1);
-	char *html = cmark_render_html(doc);
-	STR_EQ(runner, html, "<p>No newline</p>\n",
-	       "document without trailing newline");
-	free(html);
-	cmark_node_destroy(doc);
+	test_md_to_html(runner, "No newline", "<p>No newline</p>\n",
+			"document without trailing newline");
 }
 
 static void
@@ -475,6 +487,103 @@ render_html(test_batch_runner *runner)
 	cmark_node_destroy(doc);
 }
 
+static void
+utf8(test_batch_runner *runner)
+{
+	// Ranges
+	test_char(runner, 1, "\x01", "valid utf8 01");
+	test_char(runner, 1, "\x7F", "valid utf8 7F");
+	test_char(runner, 0, "\x80", "invalid utf8 80");
+	test_char(runner, 0, "\xBF", "invalid utf8 BF");
+	test_char(runner, 0, "\xC0\x80", "invalid utf8 C080");
+	test_char(runner, 0, "\xC1\xBF", "invalid utf8 C1BF");
+	test_char(runner, 1, "\xC2\x80", "valid utf8 C280");
+	test_char(runner, 1, "\xDF\xBF", "valid utf8 DFBF");
+	test_char(runner, 0, "\xE0\x80\x80", "invalid utf8 E08080");
+	test_char(runner, 0, "\xE0\x9F\xBF", "invalid utf8 E09FBF");
+	test_char(runner, 1, "\xE0\xA0\x80", "valid utf8 E0A080");
+	test_char(runner, 1, "\xED\x9F\xBF", "valid utf8 ED9FBF");
+	test_char(runner, 0, "\xED\xA0\x80", "invalid utf8 EDA080");
+	test_char(runner, 0, "\xED\xBF\xBF", "invalid utf8 EDBFBF");
+	test_char(runner, 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080");
+	test_char(runner, 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF");
+	test_char(runner, 1, "\xF0\x90\x80\x80", "valid utf8 F0908080");
+	test_char(runner, 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF");
+	test_char(runner, 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080");
+	test_char(runner, 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF");
+	test_char(runner, 0, "\xF8", "invalid utf8 F8");
+	test_char(runner, 0, "\xFF", "invalid utf8 FF");
+
+	// Incomplete byte sequences at end of input
+	test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0");
+	test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080");
+
+	// Invalid continuation bytes
+	test_continuation_byte(runner, "\xC2\x80");
+	test_continuation_byte(runner, "\xE0\xA0\x80");
+	test_continuation_byte(runner, "\xF0\x90\x80\x80");
+}
+
+static void
+test_char(test_batch_runner *runner, int valid, const char *utf8,
+	  const char *msg)
+{
+	char buf[20];
+	sprintf(buf, "((((%s))))", utf8);
+
+	if (valid) {
+		char expected[30];
+		sprintf(expected, "<p>((((%s))))</p>\n", utf8);
+		test_md_to_html(runner, buf, expected, msg);
+	}
+	else {
+		test_md_to_html(runner, buf, "<p>((((" UTF8_REPL "))))</p>\n",
+				msg);
+	}
+}
+
+static void
+test_incomplete_char(test_batch_runner *runner, const char *utf8,
+		     const char *msg)
+{
+	char buf[20];
+	sprintf(buf, "----%s", utf8);
+	test_md_to_html(runner, buf, "<p>----" UTF8_REPL "</p>\n", msg);
+}
+
+static void
+test_continuation_byte(test_batch_runner *runner, const char *utf8)
+{
+	int len = strlen(utf8);
+
+	for (int pos = 1; pos < len; ++pos) {
+		char buf[20];
+		sprintf(buf, "((((%s))))", utf8);
+		buf[4+pos] = '\x20';
+
+		char expected[50];
+		strcpy(expected, "<p>((((" UTF8_REPL "\x20");
+		for (int i = pos + 1; i < len; ++i) {
+			strcat(expected, UTF8_REPL);
+		}
+		strcat(expected, "))))</p>\n");
+
+		char *html = cmark_markdown_to_html(buf, strlen(buf));
+		STR_EQ(runner, html, expected,
+		       "invalid utf8 continuation byte %d/%d", pos, len);
+		free(html);
+	}
+}
+
+static void
+test_md_to_html(test_batch_runner *runner, const char *markdown,
+		const char *expected_html, const char *msg)
+{
+	char *html = cmark_markdown_to_html(markdown, strlen(markdown));
+	STR_EQ(runner, html, expected_html, msg);
+	free(html);
+}
+
 int main() {
 	int retval;
 	test_batch_runner *runner = test_batch_runner_new();
@@ -486,6 +595,7 @@ int main() {
 	hierarchy(runner);
 	parser(runner);
 	render_html(runner);
+	utf8(runner);
 
 	test_print_summary(runner);
 	retval =  test_ok(runner) ? 0 : 1;
-- 
cgit v1.2.3


From a5ba5add1d72874fd40168eac54ed39e7b82bf49 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Mon, 24 Nov 2014 22:46:20 +0100
Subject: Add test for input containing null character

---
 api_test/main.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'api_test')

diff --git a/api_test/main.c b/api_test/main.c
index 2d65a46..9931581 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -522,6 +522,14 @@ utf8(test_batch_runner *runner)
 	test_continuation_byte(runner, "\xC2\x80");
 	test_continuation_byte(runner, "\xE0\xA0\x80");
 	test_continuation_byte(runner, "\xF0\x90\x80\x80");
+
+	// Test string containing null character
+	static const char string_with_null[] = "((((\0))))";
+	char *html = cmark_markdown_to_html(string_with_null,
+					    sizeof(string_with_null) - 1);
+	STR_EQ(runner, html, "<p>((((" UTF8_REPL "))))</p>\n",
+	       "utf8 with U+0000");
+	free(html);
 }
 
 static void
-- 
cgit v1.2.3