From c28af79329264a7cf331a1b1c414919e4ed9e9f9 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 13:37:34 +0200 Subject: It buiiiilds --- src/blocks.c | 11 ++- src/buffer.c | 6 +- src/html/houdini.h | 44 ++++++++++ src/html/houdini_href_e.c | 115 +++++++++++++++++++++++++ src/html/houdini_html_e.c | 89 +++++++++++++++++++ src/html/html.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++ src/stmd.h | 1 + src/utf8.c | 7 +- 8 files changed, 473 insertions(+), 12 deletions(-) create mode 100644 src/html/houdini.h create mode 100644 src/html/houdini_href_e.c create mode 100644 src/html/houdini_html_e.c create mode 100644 src/html/html.c diff --git a/src/blocks.c b/src/blocks.c index 71dc830..42f20db 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -8,6 +8,7 @@ #include "scanners.h" #include "uthash.h" +static void incorporate_line(gh_buf *ln, int line_number, block** curptr); static void finalize(block* b, int line_number); static block* make_block(int tag, int start_line, int start_column) @@ -390,7 +391,7 @@ static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size) } } -static block *finalize_parsing(block *document, int linenum) +static block *finalize_document(block *document, int linenum) { while (document != document->top) { finalize(document, linenum); @@ -411,7 +412,7 @@ extern block *stmd_parse_file(FILE *f) block *document = make_document(); while (fgets((char *)buffer, sizeof(buffer), f)) { - expand_tabs(&line, buffer, strlen(buffer)); + expand_tabs(&line, buffer, strlen((char *)buffer)); incorporate_line(&line, linenum, &document); gh_buf_clear(&line); linenum++; @@ -429,7 +430,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len) block *document = make_document(); while (buffer < end) { - const char *eol = memchr(buffer, '\n', end - buffer); + const unsigned char *eol = memchr(buffer, '\n', end - buffer); if (!eol) { expand_tabs(&line, buffer, end - buffer); @@ -449,9 +450,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len) } // Process one line at a time, modifying a block. -// Returns 0 if successful. curptr is changed to point to -// the currently open block. -extern void incorporate_line(gh_buf *ln, int line_number, block** curptr) +static void incorporate_line(gh_buf *ln, int line_number, block** curptr) { block* last_matched_container; int offset = 0; diff --git a/src/buffer.c b/src/buffer.c index 17dc864..cfc6a7e 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -245,11 +245,11 @@ int gh_buf_cmp(const gh_buf *a, const gh_buf *b) int gh_buf_strchr(const gh_buf *buf, int c, int pos) { - const char *p = memchr(buf->ptr + pos, c, buf->size - pos); + const unsigned char *p = memchr(buf->ptr + pos, c, buf->size - pos); if (!p) return -1; - return (int)(p - buf->ptr); + return (int)(p - (const unsigned char *)buf->ptr); } int gh_buf_strrchr(const gh_buf *buf, int c, int pos) @@ -264,7 +264,7 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos) return -1; } -void gh_buf_truncate(gh_buf *buf, size_t len) +void gh_buf_truncate(gh_buf *buf, int len) { if (len < buf->size) { buf->size = len; diff --git a/src/html/houdini.h b/src/html/houdini.h new file mode 100644 index 0000000..31fe917 --- /dev/null +++ b/src/html/houdini.h @@ -0,0 +1,44 @@ +#ifndef __HOUDINI_H__ +#define __HOUDINI_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "buffer.h" + +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#ifdef HOUDINI_USE_LOCALE +# define _isxdigit(c) isxdigit(c) +# define _isdigit(c) isdigit(c) +#else +/* + * Helper _isdigit methods -- do not trust the current locale + * */ +# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL) +# define _isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) +#define HOUDINI_UNESCAPED_SIZE(x) (x) + +extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure); +extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c new file mode 100644 index 0000000..59fe850 --- /dev/null +++ b/src/html/houdini_href_e.c @@ -0,0 +1,115 @@ +#include +#include +#include + +#include "html/houdini.h" + +/* + * The following characters will not be escaped: + * + * -_.+!*'(),%#@?=;:/,+&$ alphanum + * + * Note that this character set is the addition of: + * + * - The characters which are safe to be in an URL + * - The characters which are *not* safe to be in + * an URL because they are RESERVED characters. + * + * We asume (lazily) that any RESERVED char that + * appears inside an URL is actually meant to + * have its native function (i.e. as an URL + * component/separator) and hence needs no escaping. + * + * There are two exceptions: the chacters & (amp) + * and ' (single quote) do not appear in the table. + * They are meant to appear in the URL as components, + * yet they require special HTML-entity escaping + * to generate valid HTML markup. + * + * All other characters will be escaped to %XX. + * + */ +static const char HREF_SAFE[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +int +houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) +{ + static const uint8_t hex_chars[] = "0123456789ABCDEF"; + size_t i = 0, org; + uint8_t hex_str[3]; + + hex_str[0] = '%'; + + while (i < size) { + org = i; + while (i < size && HREF_SAFE[src[i]] != 0) + i++; + + if (likely(i > org)) { + if (unlikely(org == 0)) { + if (i >= size) + return 0; + + gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); + } + + gh_buf_put(ob, src + org, i - org); + } + + /* escaping */ + if (i >= size) + break; + + switch (src[i]) { + /* amp appears all the time in URLs, but needs + * HTML-entity escaping to be inside an href */ + case '&': + gh_buf_puts(ob, "&"); + break; + + /* the single quote is a valid URL character + * according to the standard; it needs HTML + * entity escaping too */ + case '\'': + gh_buf_puts(ob, "'"); + break; + + /* the space can be escaped to %20 or a plus + * sign. we're going with the generic escape + * for now. the plus thing is more commonly seen + * when building GET strings */ +#if 0 + case ' ': + gh_buf_putc(ob, '+'); + break; +#endif + + /* every other character goes with a %XX escaping */ + default: + hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; + hex_str[2] = hex_chars[src[i] & 0xF]; + gh_buf_put(ob, hex_str, 3); + } + + i++; + } + + return 1; +} diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c new file mode 100644 index 0000000..316c5ce --- /dev/null +++ b/src/html/houdini_html_e.c @@ -0,0 +1,89 @@ +#include +#include +#include + +#include "html/houdini.h" + +/** + * According to the OWASP rules: + * + * & --> & + * < --> < + * > --> > + * " --> " + * ' --> ' ' is not recommended + * / --> / forward slash is included as it helps end an HTML entity + * + */ +static const char HTML_ESCAPE_TABLE[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const char *HTML_ESCAPES[] = { + "", + """, + "&", + "'", + "/", + "<", + ">" +}; + +int +houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) +{ + size_t i = 0, org, esc = 0; + + while (i < size) { + org = i; + while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) + i++; + + if (i > org) { + if (unlikely(org == 0)) { + if (i >= size) + return 0; + + gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); + } + + gh_buf_put(ob, src + org, i - org); + } + + /* escaping */ + if (unlikely(i >= size)) + break; + + /* The forward slash is only escaped in secure mode */ + if (src[i] == '/' && !secure) { + gh_buf_putc(ob, '/'); + } else { + gh_buf_puts(ob, HTML_ESCAPES[esc]); + } + + i++; + } + + return 1; +} + +int +houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size) +{ + return houdini_escape_html0(ob, src, size, 1); +} diff --git a/src/html/html.c b/src/html/html.c new file mode 100644 index 0000000..2f160ca --- /dev/null +++ b/src/html/html.c @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include + +#include "stmd.h" +#include "debug.h" +#include "scanners.h" +#include "html/houdini.h" + +// Functions to convert block and inline lists to HTML strings. + +static void escape_html(gh_buf *dest, const unsigned char *source, int length) +{ + if (length < 0) + length = strlen((char *)source); + + houdini_escape_html0(dest, source, (size_t)length, 0); +} + +static void escape_href(gh_buf *dest, const unsigned char *source, int length) +{ + if (length < 0) + length = strlen((char *)source); + + houdini_escape_href(dest, source, (size_t)length); +} + +static inline void cr(gh_buf *html) +{ + if (html->size && html->ptr[html->size - 1] != '\n') + gh_buf_putc(html, '\n'); +} + +// Convert a block list to HTML. Returns 0 on success, and sets result. +void blocks_to_html(gh_buf *html, block *b, bool tight) +{ + struct ListData *data; + + while(b != NULL) { + switch(b->tag) { + case document: + blocks_to_html(html, b->children, false); + break; + + case paragraph: + if (tight) { + inlines_to_html(html, b->inline_content); + } else { + cr(html); + gh_buf_puts(html, "

"); + inlines_to_html(html, b->inline_content); + gh_buf_puts(html, "

"); + cr(html); + } + break; + + case block_quote: + cr(html); + gh_buf_puts(html, "
"); + blocks_to_html(html, b->children, false); + gh_buf_puts(html, "
"); + cr(html); + break; + + case list_item: + cr(html); + gh_buf_puts(html, "
  • "); + blocks_to_html(html, b->children, tight); + gh_buf_trim(html); + gh_buf_puts(html, "
  • "); + cr(html); + break; + + case list: + // make sure a list starts at the beginning of the line: + cr(html); + data = &(b->attributes.list_data); + + if (data->start > 1) { + gh_buf_printf(html, "<%s start=\"%d\">\n", + data->list_type == bullet ? "ul" : "ol", + data->start); + } else { + gh_buf_puts(html, data->list_type == bullet ? "
      \n" : "
        \n"); + } + + blocks_to_html(html, b->children, data->tight); + gh_buf_puts(html, data->list_type == bullet ? "
    " : ""); + cr(html); + break; + + case atx_header: + case setext_header: + cr(html); + gh_buf_printf(html, "", b->attributes.header_level); + inlines_to_html(html, b->inline_content); + gh_buf_printf(html, "", b->attributes.header_level); + cr(html); + break; + + case indented_code: + case fenced_code: + /* TODO: fenced code lang attributes */ + cr(html); + gh_buf_puts(html, "
    ");
    +				escape_html(html, b->string_content.ptr, b->string_content.size);
    +				gh_buf_puts(html, "
    "); + cr(html); + break; + + case html_block: + gh_buf_put(html, b->string_content.ptr, b->string_content.size); + break; + + case hrule: + gh_buf_puts(html, "
    "); + cr(html); + break; + + case reference_def: + break; + + default: + assert(false); + } + + b = b->next; + } +} + +// Convert an inline list to HTML. Returns 0 on success, and sets result. +void inlines_to_html(gh_buf *html, inl* ils) +{ + gh_buf scrap = GH_BUF_INIT; + + while(ils != NULL) { + switch(ils->tag) { + case INL_STRING: + escape_html(html, ils->content.literal.data, ils->content.literal.len); + break; + + case INL_LINEBREAK: + gh_buf_puts(html, "
    \n"); + break; + + case INL_SOFTBREAK: + gh_buf_putc(html, '\n'); + break; + + case INL_CODE: + gh_buf_puts(html, ""); + escape_html(html, ils->content.literal.data, ils->content.literal.len); + gh_buf_puts(html, ""); + break; + + case INL_RAW_HTML: + case INL_ENTITY: + gh_buf_put(html, + ils->content.literal.data, + ils->content.literal.len); + break; + + case INL_LINK: + gh_buf_puts(html, "content.linkable.url, -1); + + if (ils->content.linkable.title) { + gh_buf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + gh_buf_puts(html, "\">"); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, ""); + break; + + case INL_IMAGE: + gh_buf_puts(html, "content.linkable.url, -1); + + inlines_to_html(&scrap, ils->content.inlines); + if (scrap.size) { + gh_buf_puts(html, "\" alt=\""); + escape_html(html, scrap.ptr, scrap.size); + } + gh_buf_clear(&scrap); + + if (ils->content.linkable.title) { + gh_buf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + gh_buf_puts(html, "\"/>"); + break; + + case INL_STRONG: + gh_buf_puts(html, ""); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, ""); + break; + + case INL_EMPH: + gh_buf_puts(html, ""); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, ""); + break; + } + ils = ils->next; + } +} diff --git a/src/stmd.h b/src/stmd.h index 1e490d6..3e284bd 100644 --- a/src/stmd.h +++ b/src/stmd.h @@ -1,4 +1,5 @@ #include +#include #include "buffer.h" #include "uthash.h" diff --git a/src/utf8.c b/src/utf8.c index e3f8dd3..32c78a4 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "stmd.h" @@ -83,9 +84,9 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) return length; } -void utf8_encode_char(int32_t uc, gh_buf *buf) +void utf8proc_encode_char(int32_t uc, gh_buf *buf) { - char dst[4]; + unsigned char dst[4]; int len = 0; if (uc < 0x00) { @@ -99,7 +100,7 @@ void utf8_encode_char(int32_t uc, gh_buf *buf) len = 2; } else if (uc == 0xFFFF) { dst[0] = 0xFF; - return 1; + len = 1; } else if (uc == 0xFFFE) { dst[0] = 0xFE; len = 1; -- cgit v1.2.3