From 635bb48bad16a846a7d5a513800ff08d393b0651 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 24 Nov 2014 18:22:42 +0100 Subject: Add sentence about U+0000 to spec Fixes #212. --- spec.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spec.txt b/spec.txt index 6d90d07..9f150dd 100644 --- a/spec.txt +++ b/spec.txt @@ -223,6 +223,9 @@ Line endings are replaced by newline characters (LF). A line containing no characters, or a line containing only spaces (after tab expansion), is called a [blank line](@blank-line). +For security reasons, a conforming parser must strip or replace the +Unicode character `U+0000`. + # Blocks and inlines We can think of a document as a sequence of -- cgit v1.2.3 From 82b360db50dfcb889e4488dfb3e5bcfc52d91857 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 24 Nov 2014 18:23:13 +0100 Subject: Off-by-one error in utf8proc_detab --- src/utf8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utf8.c b/src/utf8.c index e144c72..b343175 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -60,7 +60,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) while (i < size) { size_t org = i; - while (i < size && line[i] != '\t' && line[i] <= 0x80) { + while (i < size && line[i] != '\t' && line[i] < 0x80) { i++; tab++; } -- cgit v1.2.3 From ff9c0dcecd1314b820bf7d2584990c26c0e28909 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 24 Nov 2014 20:10:49 +0100 Subject: Validate UTF-8 input Invalid UTF-8 byte sequences are replaced with the Unicode replacement character U+FFFD. Fixes #213. 
--- api_test/main.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- src/utf8.c | 65 +++++++++++++++++++++++++++-- src/utf8.h | 1 - 3 files changed, 179 insertions(+), 11 deletions(-) diff --git a/api_test/main.c b/api_test/main.c index 06d9be2..2d65a46 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -8,6 +8,8 @@ #include "harness.h" +#define UTF8_REPL "\xEF\xBF\xBD" + static const cmark_node_type node_types[] = { CMARK_NODE_DOCUMENT, CMARK_NODE_BLOCK_QUOTE, @@ -31,10 +33,25 @@ static const cmark_node_type node_types[] = { }; static const int num_node_types = sizeof(node_types) / sizeof(*node_types); +static void +test_md_to_html(test_batch_runner *runner, const char *markdown, + const char *expected_html, const char *msg); + static void test_content(test_batch_runner *runner, cmark_node_type type, int allowed_content); +static void +test_char(test_batch_runner *runner, int valid, const char *utf8, + const char *msg); + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, + const char *msg); + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8); + static void constructor(test_batch_runner *runner) { @@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type, static void parser(test_batch_runner *runner) { - static const char markdown[] = "No newline"; - cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1); - char *html = cmark_render_html(doc); - STR_EQ(runner, html, "

<p>No newline</p>

\n", - "document without trailing newline"); - free(html); - cmark_node_destroy(doc); + test_md_to_html(runner, "No newline", "

<p>No newline</p>

\n", + "document without trailing newline"); } static void @@ -475,6 +487,103 @@ render_html(test_batch_runner *runner) cmark_node_destroy(doc); } +static void +utf8(test_batch_runner *runner) +{ + // Ranges + test_char(runner, 1, "\x01", "valid utf8 01"); + test_char(runner, 1, "\x7F", "valid utf8 7F"); + test_char(runner, 0, "\x80", "invalid utf8 80"); + test_char(runner, 0, "\xBF", "invalid utf8 BF"); + test_char(runner, 0, "\xC0\x80", "invalid utf8 C080"); + test_char(runner, 0, "\xC1\xBF", "invalid utf8 C1BF"); + test_char(runner, 1, "\xC2\x80", "valid utf8 C280"); + test_char(runner, 1, "\xDF\xBF", "valid utf8 DFBF"); + test_char(runner, 0, "\xE0\x80\x80", "invalid utf8 E08080"); + test_char(runner, 0, "\xE0\x9F\xBF", "invalid utf8 E09FBF"); + test_char(runner, 1, "\xE0\xA0\x80", "valid utf8 E0A080"); + test_char(runner, 1, "\xED\x9F\xBF", "valid utf8 ED9FBF"); + test_char(runner, 0, "\xED\xA0\x80", "invalid utf8 EDA080"); + test_char(runner, 0, "\xED\xBF\xBF", "invalid utf8 EDBFBF"); + test_char(runner, 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080"); + test_char(runner, 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF"); + test_char(runner, 1, "\xF0\x90\x80\x80", "valid utf8 F0908080"); + test_char(runner, 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF"); + test_char(runner, 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080"); + test_char(runner, 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF"); + test_char(runner, 0, "\xF8", "invalid utf8 F8"); + test_char(runner, 0, "\xFF", "invalid utf8 FF"); + + // Incomplete byte sequences at end of input + test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0"); + test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080"); + + // Invalid continuation bytes + test_continuation_byte(runner, "\xC2\x80"); + test_continuation_byte(runner, "\xE0\xA0\x80"); + test_continuation_byte(runner, "\xF0\x90\x80\x80"); +} + +static void +test_char(test_batch_runner *runner, int valid, const char *utf8, + const char *msg) +{ + 
char buf[20]; + sprintf(buf, "((((%s))))", utf8); + + if (valid) { + char expected[30]; + sprintf(expected, "

<p>((((%s))))</p>

\n", utf8); + test_md_to_html(runner, buf, expected, msg); + } + else { + test_md_to_html(runner, buf, "

<p>((((" UTF8_REPL "))))</p>

\n", + msg); + } +} + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, + const char *msg) +{ + char buf[20]; + sprintf(buf, "----%s", utf8); + test_md_to_html(runner, buf, "

<p>----" UTF8_REPL "</p>

\n", msg); +} + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8) +{ + int len = strlen(utf8); + + for (int pos = 1; pos < len; ++pos) { + char buf[20]; + sprintf(buf, "((((%s))))", utf8); + buf[4+pos] = '\x20'; + + char expected[50]; + strcpy(expected, "

<p>((((" UTF8_REPL "\x20"); + for (int i = pos + 1; i < len; ++i) { + strcat(expected, UTF8_REPL); + } + strcat(expected, "))))</p>

\n"); + + char *html = cmark_markdown_to_html(buf, strlen(buf)); + STR_EQ(runner, html, expected, + "invalid utf8 continuation byte %d/%d", pos, len); + free(html); + } +} + +static void +test_md_to_html(test_batch_runner *runner, const char *markdown, + const char *expected_html, const char *msg) +{ + char *html = cmark_markdown_to_html(markdown, strlen(markdown)); + STR_EQ(runner, html, expected_html, msg); + free(html); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -486,6 +595,7 @@ int main() { hierarchy(runner); parser(runner); render_html(runner); + utf8(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; diff --git a/src/utf8.c b/src/utf8.c index b343175..e4ea8e2 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf) strbuf_put(buf, repl, 3); } -int utf8proc_charlen(const uint8_t *str, int str_len) +static int utf8proc_charlen(const uint8_t *str, int str_len) { int length, i; @@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len) return length; } +// Validate a single UTF-8 character according to RFC 3629. +static int utf8proc_valid(const uint8_t *str, int str_len) +{ + int length = utf8proc_charlen(str, str_len); + + if (length <= 0) + return length; + + switch (length) { + case 1: + if (str[0] == 0x00) { + // ASCII NUL is technically valid but rejected + // for security reasons. 
+ return -length; + } + break; + + case 2: + if (str[0] < 0xC2) { + // Overlong + return -length; + } + break; + + case 3: + if (str[0] == 0xE0) { + if (str[1] < 0xA0) { + // Overlong + return -length; + } + } + else if (str[0] == 0xED) { + if (str[1] >= 0xA0) { + // Surrogate + return -length; + } + } + break; + + case 4: + if (str[0] == 0xF0) { + if (str[1] < 0x90) { + // Overlong + return -length; + } + } + else if (str[0] >= 0xF4) { + if (str[0] > 0xF4 || str[1] >= 0x90) { + // Above 0x10FFFF + return -length; + } + } + break; + } + + return length; +} + void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) { static const uint8_t whitespace[] = " "; @@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) while (i < size) { size_t org = i; - while (i < size && line[i] != '\t' && line[i] < 0x80) { + while (i < size && line[i] != '\t' && line[i] != '\0' + && line[i] < 0x80) { i++; tab++; } @@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) i += 1; tab += numspaces; } else { - int charlen = utf8proc_charlen(line + i, size - i); + int charlen = utf8proc_valid(line + i, size - i); if (charlen >= 0) { strbuf_put(ob, line + i, charlen); diff --git a/src/utf8.h b/src/utf8.h index 319e39a..7df1573 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -11,7 +11,6 @@ extern "C" { void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len); void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf); int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst); -int utf8proc_charlen(const uint8_t *str, int str_len); void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size); int utf8proc_is_space(int32_t uc); int utf8proc_is_punctuation(int32_t uc); -- cgit v1.2.3 From a5ba5add1d72874fd40168eac54ed39e7b82bf49 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 24 Nov 2014 22:46:20 +0100 Subject: Add test for input containing null character --- api_test/main.c | 8 ++++++++ 
1 file changed, 8 insertions(+) diff --git a/api_test/main.c b/api_test/main.c index 2d65a46..9931581 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -522,6 +522,14 @@ utf8(test_batch_runner *runner) test_continuation_byte(runner, "\xC2\x80"); test_continuation_byte(runner, "\xE0\xA0\x80"); test_continuation_byte(runner, "\xF0\x90\x80\x80"); + + // Test string containing null character + static const char string_with_null[] = "((((\0))))"; + char *html = cmark_markdown_to_html(string_with_null, + sizeof(string_with_null) - 1); + STR_EQ(runner, html, "

<p>((((" UTF8_REPL "))))</p>

\n", + "utf8 with U+0000"); + free(html); } static void -- cgit v1.2.3