diff options
-rw-r--r-- | spec.txt | 35 | ||||
-rw-r--r-- | src/inlines.c | 68 |
2 files changed, 64 insertions, 39 deletions
@@ -3688,7 +3688,7 @@ raw HTML: . <http://google.com?find=\*> . -<p><a href="http://google.com?find=\*">http://google.com?find=\*</a></p> +<p><a href="http://google.com?find=%5C*">http://google.com?find=\*</a></p> . . @@ -3727,25 +3727,37 @@ foo ## Entities -Entities are parsed as entities, not as literal text, in all contexts -except code spans and code blocks. Three kinds of entities are recognized. +With the goal of making this standard as HTML-agnostic as possible, all valid HTML entities in any +context are recognized as such and converted into their actual values (i.e. the UTF8 characters representing +the entity itself) before they are stored in the AST. + +This allows implementations that target HTML output to trivially escape the entities when generating HTML, +and simplifies the job of implementations targeting other languages, as these will only need to handle the +UTF8 chars and need not be HTML-entity aware. [Named entities](#named-entities) <a id="named-entities"></a> consist of `&` -+ a string of 2-32 alphanumerics beginning with a letter + `;`. ++ any of the valid HTML5 entity names + `;`. The [following document](http://www.whatwg.org/specs/web-apps/current-work/multipage/entities.json) +is used as an authoritative source of the valid entity names and their corresponding codepoints. + +Conforming implementations that target Markdown don't need to generate entities for all the valid +named entities that exist, with the exception of `&quot;` (`"`), `&amp;` (`&`), `&lt;` (`<`) and `&gt;` (`>`), +which always need to be written as entities for security reasons. . & © Æ Ď ¾ ℋ ⅆ ∲ . -<p> & © Æ Ď ¾ ℋ ⅆ ∲</p> +<p> & © Æ Ď ¾ ℋ ⅆ ∲</p> . [Decimal entities](#decimal-entities) <a id="decimal-entities"></a> -consist of `&#` + a string of 1--8 arabic digits + `;`. +consist of `&#` + a string of 1--8 arabic digits + `;`. Again, these entities need to be recognized +and transformed into their corresponding UTF8 codepoints. 
Invalid Unicode codepoints will be written +as the "unknown codepoint" character (`0xFFFD`) . - # Ӓ Ϡ � +# Ӓ Ϡ � . -<p> # Ӓ Ϡ �</p> +<p># Ӓ Ϡ �</p> . [Hexadecimal entities](#hexadecimal-entities) <a id="hexadecimal-entities"></a> @@ -3767,7 +3779,7 @@ Here are some nonentities: . Although HTML5 does accept some entities without a trailing semicolon -(such as `&copy`), these are not recognized as entities here: +(such as `&copy`), these are not recognized as entities here, because doing so would make the grammar too ambiguous: . &copy @@ -3775,13 +3787,12 @@ Although HTML5 does accept some entities without a trailing semicolon <p>&amp;copy</p> . -On the other hand, many strings that are not on the list of HTML5 -named entities are recognized as entities here: +Strings that are not on the list of HTML5 named entities are not recognized as entities either: . &MadeUpEntity; . -<p>&MadeUpEntity;</p> +<p>&amp;MadeUpEntity;</p> . Entities are recognized in any context besides code spans or diff --git a/src/inlines.c b/src/inlines.c index 7b27150..aa0e13e 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -20,8 +20,9 @@ typedef struct Subject { reference* lookup_reference(reference** refmap, chunk *label); reference* make_reference(chunk *label, chunk *url, chunk *title); -static unsigned char *clean_url(chunk *url, int is_email); +static unsigned char *clean_url(chunk *url); static unsigned char *clean_title(chunk *title); +static unsigned char *clean_autolink(chunk *url, int is_email); inline static void chunk_free(chunk *c); inline static void chunk_trim(chunk *c); @@ -91,7 +92,7 @@ extern reference* make_reference(chunk *label, chunk *url, chunk *title) reference *ref; ref = malloc(sizeof(reference)); ref->label = normalize_reference(label); - ref->url = clean_url(url, 0); + ref->url = clean_url(url); ref->title = clean_title(title); return ref; } @@ -123,27 +124,31 @@ static unsigned char *bufdup(const unsigned char *buf) return new; } -inline static node_inl* make_link_from_reference(node_inl* 
label, reference *ref) +static inline node_inl *make_link_(node_inl *label, unsigned char *url, unsigned char *title) { node_inl* e = (node_inl*) malloc(sizeof(node_inl)); e->tag = INL_LINK; e->content.linkable.label = label; - e->content.linkable.url = bufdup(ref->url); - e->content.linkable.title = bufdup(ref->title); + e->content.linkable.url = url; + e->content.linkable.title = title; e->next = NULL; return e; } +inline static node_inl* make_ref_link(node_inl* label, reference *ref) +{ + return make_link_(label, bufdup(ref->url), bufdup(ref->title)); +} + +inline static node_inl* make_autolink(node_inl* label, chunk url, int is_email) +{ + return make_link_(label, clean_autolink(&url, is_email), NULL); +} + // Create an inline with a linkable string value. -inline static node_inl* make_link(node_inl* label, chunk url, chunk title, int is_email) +inline static node_inl* make_link(node_inl* label, chunk url, chunk title) { - node_inl* e = (node_inl*) malloc(sizeof(node_inl)); - e->tag = INL_LINK; - e->content.linkable.label = label; - e->content.linkable.url = clean_url(&url, is_email); - e->content.linkable.title = clean_title(&title); - e->next = NULL; - return e; + return make_link_(label, clean_url(&url), clean_title(&title)); } inline static node_inl* make_inlines(int t, node_inl* contents) @@ -587,7 +592,7 @@ extern void unescape_buffer(strbuf *buf) // Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. 
-static unsigned char *clean_url(chunk *url, int is_email) +static unsigned char *clean_url(chunk *url) { strbuf buf = GH_BUF_INIT; @@ -596,9 +601,6 @@ static unsigned char *clean_url(chunk *url, int is_email) if (url->len == 0) return NULL; - if (is_email) - strbuf_puts(&buf, "mailto:"); - if (url->data[0] == '<' && url->data[url->len - 1] == '>') { houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); } else { @@ -609,6 +611,22 @@ static unsigned char *clean_url(chunk *url, int is_email) return strbuf_detach(&buf); } +static unsigned char *clean_autolink(chunk *url, int is_email) +{ + strbuf buf = GH_BUF_INIT; + + chunk_trim(url); + + if (url->len == 0) + return NULL; + + if (is_email) + strbuf_puts(&buf, "mailto:"); + + houdini_unescape_html_f(&buf, url->data, url->len); + return strbuf_detach(&buf); +} + // Clean a title: remove surrounding quotes and remove \ that escape punctuation. static unsigned char *clean_title(chunk *title) { @@ -649,11 +667,9 @@ static node_inl* handle_pointy_brace(subject* subj) contents = chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; - return make_link( + return make_autolink( make_str_with_entities(&contents), - contents, - chunk_literal(""), - 0 + contents, 0 ); } @@ -663,11 +679,9 @@ static node_inl* handle_pointy_brace(subject* subj) contents = chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; - return make_link( + return make_autolink( make_str_with_entities(&contents), - contents, - chunk_literal(""), - 1 + contents, 1 ); } @@ -792,7 +806,7 @@ static node_inl* handle_left_bracket(subject* subj) title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); lab = parse_chunk_inlines(&rawlabel, NULL); - return make_link(lab, url, title, 0); + return make_link(lab, url, title); } else { // if we get here, we matched a label but didn't get further: subj->pos = endlabel; @@ -823,7 +837,7 @@ static node_inl* handle_left_bracket(subject* subj) ref = 
lookup_reference(subj->reference_map, &reflabel); if (ref != NULL) { // found lab = parse_chunk_inlines(&rawlabel, NULL); - result = make_link_from_reference(lab, ref); + result = make_ref_link(lab, ref); } else { subj->pos = endlabel; lab = parse_chunk_inlines(&rawlabel, subj->reference_map); |