diff options
-rw-r--r-- | api_test/main.c | 132 | ||||
-rw-r--r-- | spec.txt | 3 | ||||
-rw-r--r-- | src/utf8.c | 65 | ||||
-rw-r--r-- | src/utf8.h | 1 |
4 files changed, 190 insertions, 11 deletions
diff --git a/api_test/main.c b/api_test/main.c
index 06d9be2..9931581 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -8,6 +8,8 @@
 
 #include "harness.h"
 
+#define UTF8_REPL "\xEF\xBF\xBD"
+
 static const cmark_node_type node_types[] = {
 	CMARK_NODE_DOCUMENT,
 	CMARK_NODE_BLOCK_QUOTE,
@@ -32,10 +34,25 @@ static const cmark_node_type node_types[] = {
 static const int num_node_types = sizeof(node_types) / sizeof(*node_types);
 
 static void
+test_md_to_html(test_batch_runner *runner, const char *markdown,
+		const char *expected_html, const char *msg);
+
+static void
 test_content(test_batch_runner *runner, cmark_node_type type,
 	     int allowed_content);
 
 static void
+test_char(test_batch_runner *runner, int valid, const char *utf8,
+	  const char *msg);
+
+static void
+test_incomplete_char(test_batch_runner *runner, const char *utf8,
+		     const char *msg);
+
+static void
+test_continuation_byte(test_batch_runner *runner, const char *utf8);
+
+static void
 constructor(test_batch_runner *runner)
 {
 	for (int i = 0; i < num_node_types; ++i) {
@@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type,
 static void
 parser(test_batch_runner *runner)
 {
-	static const char markdown[] = "No newline";
-	cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1);
-	char *html = cmark_render_html(doc);
-	STR_EQ(runner, html, "<p>No newline</p>\n",
-	       "document without trailing newline");
-	free(html);
-	cmark_node_destroy(doc);
+	test_md_to_html(runner, "No newline", "<p>No newline</p>\n",
+			"document without trailing newline");
 }
 
 static void
@@ -475,6 +487,111 @@ render_html(test_batch_runner *runner)
 	cmark_node_destroy(doc);
 }
 
+static void
+utf8(test_batch_runner *runner)
+{
+	// Ranges
+	test_char(runner, 1, "\x01", "valid utf8 01");
+	test_char(runner, 1, "\x7F", "valid utf8 7F");
+	test_char(runner, 0, "\x80", "invalid utf8 80");
+	test_char(runner, 0, "\xBF", "invalid utf8 BF");
+	test_char(runner, 0, "\xC0\x80", "invalid utf8 C080");
+	test_char(runner, 0, "\xC1\xBF", "invalid utf8 C1BF");
+	test_char(runner, 1, "\xC2\x80", "valid utf8 C280");
+	test_char(runner, 1, "\xDF\xBF", "valid utf8 DFBF");
+	test_char(runner, 0, "\xE0\x80\x80", "invalid utf8 E08080");
+	test_char(runner, 0, "\xE0\x9F\xBF", "invalid utf8 E09FBF");
+	test_char(runner, 1, "\xE0\xA0\x80", "valid utf8 E0A080");
+	test_char(runner, 1, "\xED\x9F\xBF", "valid utf8 ED9FBF");
+	test_char(runner, 0, "\xED\xA0\x80", "invalid utf8 EDA080");
+	test_char(runner, 0, "\xED\xBF\xBF", "invalid utf8 EDBFBF");
+	test_char(runner, 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080");
+	test_char(runner, 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF");
+	test_char(runner, 1, "\xF0\x90\x80\x80", "valid utf8 F0908080");
+	test_char(runner, 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF");
+	test_char(runner, 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080");
+	test_char(runner, 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF");
+	test_char(runner, 0, "\xF8", "invalid utf8 F8");
+	test_char(runner, 0, "\xFF", "invalid utf8 FF");
+
+	// Incomplete byte sequences at end of input
+	test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0");
+	test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080");
+
+	// Invalid continuation bytes
+	test_continuation_byte(runner, "\xC2\x80");
+	test_continuation_byte(runner, "\xE0\xA0\x80");
+	test_continuation_byte(runner, "\xF0\x90\x80\x80");
+
+	// Test string containing null character
+	static const char string_with_null[] = "((((\0))))";
+	char *html = cmark_markdown_to_html(string_with_null,
+					    sizeof(string_with_null) - 1);
+	STR_EQ(runner, html, "<p>((((" UTF8_REPL "))))</p>\n",
+	       "utf8 with U+0000");
+	free(html);
+}
+
+static void
+test_char(test_batch_runner *runner, int valid, const char *utf8,
+	  const char *msg)
+{
+	char buf[20];
+	sprintf(buf, "((((%s))))", utf8);
+
+	if (valid) {
+		char expected[30];
+		sprintf(expected, "<p>((((%s))))</p>\n", utf8);
+		test_md_to_html(runner, buf, expected, msg);
+	}
+	else {
+		test_md_to_html(runner, buf, "<p>((((" UTF8_REPL "))))</p>\n",
+				msg);
+	}
+}
+
+static void
+test_incomplete_char(test_batch_runner *runner, const char *utf8,
+		     const char *msg)
+{
+	char buf[20];
+	sprintf(buf, "----%s", utf8);
+	test_md_to_html(runner, buf, "<p>----" UTF8_REPL "</p>\n", msg);
+}
+
+static void
+test_continuation_byte(test_batch_runner *runner, const char *utf8)
+{
+	int len = strlen(utf8);
+
+	for (int pos = 1; pos < len; ++pos) {
+		char buf[20];
+		sprintf(buf, "((((%s))))", utf8);
+		buf[4+pos] = '\x20';
+
+		char expected[50];
+		strcpy(expected, "<p>((((" UTF8_REPL "\x20");
+		for (int i = pos + 1; i < len; ++i) {
+			strcat(expected, UTF8_REPL);
+		}
+		strcat(expected, "))))</p>\n");
+
+		char *html = cmark_markdown_to_html(buf, strlen(buf));
+		STR_EQ(runner, html, expected,
+		       "invalid utf8 continuation byte %d/%d", pos, len);
+		free(html);
+	}
+}
+
+static void
+test_md_to_html(test_batch_runner *runner, const char *markdown,
+		const char *expected_html, const char *msg)
+{
+	char *html = cmark_markdown_to_html(markdown, strlen(markdown));
+	STR_EQ(runner, html, expected_html, msg);
+	free(html);
+}
+
 int main() {
 	int retval;
 	test_batch_runner *runner = test_batch_runner_new();
@@ -486,6 +603,7 @@ int main() {
 	hierarchy(runner);
 	parser(runner);
 	render_html(runner);
+	utf8(runner);
 
 	test_print_summary(runner);
 	retval = test_ok(runner) ? 0 : 1;
diff --git a/spec.txt b/spec.txt
--- a/spec.txt
+++ b/spec.txt
@@ -223,6 +223,9 @@ Line endings are replaced by newline characters (LF).
 A line containing no characters, or a line containing only spaces
 (after tab expansion), is called a [blank line](@blank-line).
 
+For security reasons, a conforming parser must strip or replace the
+Unicode character `U+0000`.
+
 # Blocks and inlines
 
 We can think of a document as a sequence of
diff --git a/src/utf8.c b/src/utf8.c
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf)
 	strbuf_put(buf, repl, 3);
 }
 
-int utf8proc_charlen(const uint8_t *str, int str_len)
+static int utf8proc_charlen(const uint8_t *str, int str_len)
 {
 	int length, i;
 
@@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len)
 	return length;
 }
 
+// Validate a single UTF-8 character according to RFC 3629.
+static int utf8proc_valid(const uint8_t *str, int str_len)
+{
+	int length = utf8proc_charlen(str, str_len);
+
+	if (length <= 0)
+		return length;
+
+	switch (length) {
+	case 1:
+		if (str[0] == 0x00) {
+			// ASCII NUL is technically valid but rejected
+			// for security reasons.
+			return -length;
+		}
+		break;
+
+	case 2:
+		if (str[0] < 0xC2) {
+			// Overlong
+			return -length;
+		}
+		break;
+
+	case 3:
+		if (str[0] == 0xE0) {
+			if (str[1] < 0xA0) {
+				// Overlong
+				return -length;
+			}
+		}
+		else if (str[0] == 0xED) {
+			if (str[1] >= 0xA0) {
+				// Surrogate
+				return -length;
+			}
+		}
+		break;
+
+	case 4:
+		if (str[0] == 0xF0) {
+			if (str[1] < 0x90) {
+				// Overlong
+				return -length;
+			}
+		}
+		else if (str[0] >= 0xF4) {
+			if (str[0] > 0xF4 || str[1] >= 0x90) {
+				// Above 0x10FFFF
+				return -length;
+			}
+		}
+		break;
+	}
+
+	return length;
+}
+
 void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 {
 	static const uint8_t whitespace[] = " ";
@@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 	while (i < size) {
 		size_t org = i;
 
-		while (i < size && line[i] != '\t' && line[i] <= 0x80) {
+		while (i < size && line[i] != '\t' && line[i] != '\0'
+		       && line[i] < 0x80) {
 			i++; tab++;
 		}
 
@@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_charlen(line + i, size - i);
+			int charlen = utf8proc_valid(line + i, size - i);
 
 			if (charlen >= 0) {
 				strbuf_put(ob, line + i, charlen);
diff --git a/src/utf8.h b/src/utf8.h
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -11,7 +11,6 @@ extern "C" {
 void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);
 void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
 int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst);
-int utf8proc_charlen(const uint8_t *str, int str_len);
 void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);
 int utf8proc_is_space(int32_t uc);
 int utf8proc_is_punctuation(int32_t uc);