diff options
Diffstat (limited to 'src/html')
-rw-r--r-- | src/html/houdini.h | 44 | ||||
-rw-r--r-- | src/html/houdini_href_e.c | 115 | ||||
-rw-r--r-- | src/html/houdini_html_e.c | 89 | ||||
-rw-r--r-- | src/html/html.c | 212 |
4 files changed, 460 insertions, 0 deletions
diff --git a/src/html/houdini.h b/src/html/houdini.h new file mode 100644 index 0000000..31fe917 --- /dev/null +++ b/src/html/houdini.h @@ -0,0 +1,44 @@ +#ifndef __HOUDINI_H__ +#define __HOUDINI_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include "buffer.h" + +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#ifdef HOUDINI_USE_LOCALE +# define _isxdigit(c) isxdigit(c) +# define _isdigit(c) isdigit(c) +#else +/* + * Helper _isdigit methods -- do not trust the current locale + * */ +# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL) +# define _isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) +#define HOUDINI_UNESCAPED_SIZE(x) (x) + +extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure); +extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c new file mode 100644 index 0000000..59fe850 --- /dev/null +++ b/src/html/houdini_href_e.c @@ -0,0 +1,115 @@ +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "html/houdini.h" + +/* + * The following characters will not be escaped: + * + * -_.+!*'(),%#@?=;:/,+&$ alphanum + * + * Note that this character set is the addition of: + * + * - The characters which are safe to be in an URL + * - The characters which are *not* safe to be in + * an URL because they are RESERVED characters. + * + * We asume (lazily) that any RESERVED char that + * appears inside an URL is actually meant to + * have its native function (i.e. as an URL + * component/separator) and hence needs no escaping. + * + * There are two exceptions: the chacters & (amp) + * and ' (single quote) do not appear in the table. + * They are meant to appear in the URL as components, + * yet they require special HTML-entity escaping + * to generate valid HTML markup. + * + * All other characters will be escaped to %XX. + * + */ +static const char HREF_SAFE[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +int +houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) +{ + static const uint8_t hex_chars[] = "0123456789ABCDEF"; + size_t i = 0, org; + uint8_t hex_str[3]; + + hex_str[0] = '%'; + + while (i < size) { + org = i; + while (i < size && HREF_SAFE[src[i]] != 0) + i++; + + if (likely(i > org)) { + if (unlikely(org == 0)) { + if (i >= size) + return 0; + + gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); + } + + gh_buf_put(ob, src + org, i - org); + } + + /* escaping */ + if (i >= size) + break; + + switch (src[i]) { + /* amp appears all the time in URLs, but needs + * HTML-entity escaping to be inside an href */ + case '&': + gh_buf_puts(ob, "&"); + break; + + /* the single quote is a valid URL character + * according to the standard; it needs HTML + * entity escaping too */ + case '\'': + gh_buf_puts(ob, "'"); + break; + + /* the space can be escaped to %20 or a plus + * sign. we're going with the generic escape + * for now. the plus thing is more commonly seen + * when building GET strings */ +#if 0 + case ' ': + gh_buf_putc(ob, '+'); + break; +#endif + + /* every other character goes with a %XX escaping */ + default: + hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; + hex_str[2] = hex_chars[src[i] & 0xF]; + gh_buf_put(ob, hex_str, 3); + } + + i++; + } + + return 1; +} diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c new file mode 100644 index 0000000..316c5ce --- /dev/null +++ b/src/html/houdini_html_e.c @@ -0,0 +1,89 @@ +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "html/houdini.h" + +/** + * According to the OWASP rules: + * + * & --> & + * < --> < + * > --> > + * " --> " + * ' --> ' ' is not recommended + * / --> / forward slash is included as it helps end an HTML entity + * + */ +static const char HTML_ESCAPE_TABLE[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const char *HTML_ESCAPES[] = { + "", + """, + "&", + "'", + "/", + "<", + ">" +}; + +int +houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) +{ + size_t i = 0, org, esc = 0; + + while (i < size) { + org = i; + while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) + i++; + + if (i > org) { + if (unlikely(org == 0)) { + if (i >= size) + return 0; + + gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); + } + + gh_buf_put(ob, src + org, i - org); + } + + /* escaping */ + if (unlikely(i >= size)) + break; + + /* The forward slash is only escaped in secure mode */ + if (src[i] == '/' && !secure) { + gh_buf_putc(ob, '/'); + } else { + gh_buf_puts(ob, HTML_ESCAPES[esc]); + } + + i++; + } + + return 1; +} + +int +houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size) +{ + return houdini_escape_html0(ob, src, size, 1); +} diff --git a/src/html/html.c b/src/html/html.c new file mode 100644 index 0000000..2f160ca --- /dev/null +++ b/src/html/html.c @@ -0,0 +1,212 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +#include "stmd.h" +#include "debug.h" +#include "scanners.h" +#include "html/houdini.h" + +// Functions to convert block and inline lists to HTML strings. + +static void escape_html(gh_buf *dest, const unsigned char *source, int length) +{ + if (length < 0) + length = strlen((char *)source); + + houdini_escape_html0(dest, source, (size_t)length, 0); +} + +static void escape_href(gh_buf *dest, const unsigned char *source, int length) +{ + if (length < 0) + length = strlen((char *)source); + + houdini_escape_href(dest, source, (size_t)length); +} + +static inline void cr(gh_buf *html) +{ + if (html->size && html->ptr[html->size - 1] != '\n') + gh_buf_putc(html, '\n'); +} + +// Convert a block list to HTML. Returns 0 on success, and sets result. +void blocks_to_html(gh_buf *html, block *b, bool tight) +{ + struct ListData *data; + + while(b != NULL) { + switch(b->tag) { + case document: + blocks_to_html(html, b->children, false); + break; + + case paragraph: + if (tight) { + inlines_to_html(html, b->inline_content); + } else { + cr(html); + gh_buf_puts(html, "<p>"); + inlines_to_html(html, b->inline_content); + gh_buf_puts(html, "</p>"); + cr(html); + } + break; + + case block_quote: + cr(html); + gh_buf_puts(html, "<blockquote>"); + blocks_to_html(html, b->children, false); + gh_buf_puts(html, "</blockquote>"); + cr(html); + break; + + case list_item: + cr(html); + gh_buf_puts(html, "<li>"); + blocks_to_html(html, b->children, tight); + gh_buf_trim(html); + gh_buf_puts(html, "</li>"); + cr(html); + break; + + case list: + // make sure a list starts at the beginning of the line: + cr(html); + data = &(b->attributes.list_data); + + if (data->start > 1) { + gh_buf_printf(html, "<%s start=\"%d\">\n", + data->list_type == bullet ? "ul" : "ol", + data->start); + } else { + gh_buf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n"); + } + + blocks_to_html(html, b->children, data->tight); + gh_buf_puts(html, data->list_type == bullet ? "</ul>" : "</ol>"); + cr(html); + break; + + case atx_header: + case setext_header: + cr(html); + gh_buf_printf(html, "<h%d>", b->attributes.header_level); + inlines_to_html(html, b->inline_content); + gh_buf_printf(html, "</h%d>", b->attributes.header_level); + cr(html); + break; + + case indented_code: + case fenced_code: + /* TODO: fenced code lang attributes */ + cr(html); + gh_buf_puts(html, "<pre><code>"); + escape_html(html, b->string_content.ptr, b->string_content.size); + gh_buf_puts(html, "</pre></code>"); + cr(html); + break; + + case html_block: + gh_buf_put(html, b->string_content.ptr, b->string_content.size); + break; + + case hrule: + gh_buf_puts(html, "<hr />"); + cr(html); + break; + + case reference_def: + break; + + default: + assert(false); + } + + b = b->next; + } +} + +// Convert an inline list to HTML. Returns 0 on success, and sets result. +void inlines_to_html(gh_buf *html, inl* ils) +{ + gh_buf scrap = GH_BUF_INIT; + + while(ils != NULL) { + switch(ils->tag) { + case INL_STRING: + escape_html(html, ils->content.literal.data, ils->content.literal.len); + break; + + case INL_LINEBREAK: + gh_buf_puts(html, "<br />\n"); + break; + + case INL_SOFTBREAK: + gh_buf_putc(html, '\n'); + break; + + case INL_CODE: + gh_buf_puts(html, "<code>"); + escape_html(html, ils->content.literal.data, ils->content.literal.len); + gh_buf_puts(html, "</code>"); + break; + + case INL_RAW_HTML: + case INL_ENTITY: + gh_buf_put(html, + ils->content.literal.data, + ils->content.literal.len); + break; + + case INL_LINK: + gh_buf_puts(html, "<a href=\""); + escape_href(html, ils->content.linkable.url, -1); + + if (ils->content.linkable.title) { + gh_buf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + gh_buf_puts(html, "\">"); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, "</a>"); + break; + + case INL_IMAGE: + gh_buf_puts(html, "<img src=\""); + escape_href(html, ils->content.linkable.url, -1); + + inlines_to_html(&scrap, ils->content.inlines); + if (scrap.size) { + gh_buf_puts(html, "\" alt=\""); + escape_html(html, scrap.ptr, scrap.size); + } + gh_buf_clear(&scrap); + + if (ils->content.linkable.title) { + gh_buf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + gh_buf_puts(html, "\"/>"); + break; + + case INL_STRONG: + gh_buf_puts(html, "<strong>"); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, "</strong>"); + break; + + case INL_EMPH: + gh_buf_puts(html, "<em>"); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, "</em>"); + break; + } + ils = ils->next; + } +} |