aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--spec.txt35
-rw-r--r--src/inlines.c68
2 files changed, 64 insertions, 39 deletions
diff --git a/spec.txt b/spec.txt
index 616cb96..ebd6d98 100644
--- a/spec.txt
+++ b/spec.txt
@@ -3688,7 +3688,7 @@ raw HTML:
.
<http://google.com?find=\*>
.
-<p><a href="http://google.com?find=\*">http://google.com?find=\*</a></p>
+<p><a href="http://google.com?find=%5C*">http://google.com?find=\*</a></p>
.
.
@@ -3727,25 +3727,37 @@ foo
## Entities
-Entities are parsed as entities, not as literal text, in all contexts
-except code spans and code blocks. Three kinds of entities are recognized.
+With the goal of making this standard as HTML-agnostic as possible, all valid HTML entities in any
+context are recognized as such and converted into their actual values (i.e. the UTF8 characters representing
+the entity itself) before they are stored in the AST.
+
+This allows implementations that target HTML output to trivially escape the entities when generating HTML,
+and simplifies the job of implementations targeting other languages, as these will only need to handle the
+UTF8 chars and need not be HTML-entity aware.
[Named entities](#name-entities) <a id="named-entities"></a> consist of `&`
-+ a string of 2-32 alphanumerics beginning with a letter + `;`.
++ any of the valid HTML5 entity names + `;`. The [following document](http://www.whatwg.org/specs/web-apps/current-work/multipage/entities.json)
+is used as an authoritative source of the valid entity names and their corresponding codepoints.
+
+Conforming implementations that target HTML don't need to generate entities for all the valid
+named entities that exist, with the exception of `"` (`&quot;`), `&` (`&amp;`), `<` (`&lt;`) and `>` (`&gt;`),
+which always need to be written as entities for security reasons.
.
&nbsp; &amp; &copy; &AElig; &Dcaron; &frac34; &HilbertSpace; &DifferentialD; &ClockwiseContourIntegral;
.
-<p>&nbsp; &amp; &copy; &AElig; &Dcaron; &frac34; &HilbertSpace; &DifferentialD; &ClockwiseContourIntegral;</p>
+<p>  &amp; © Æ Ď ¾ ℋ ⅆ ∲</p>
.
[Decimal entities](#decimal-entities) <a id="decimal-entities"></a>
-consist of `&#` + a string of 1--8 arabic digits + `;`.
+consist of `&#` + a string of 1--8 arabic digits + `;`. Again, these entities need to be recognized
+and transformed into their corresponding UTF8 codepoints. Invalid Unicode codepoints will be written
+as the "unknown codepoint" character (`0xFFFD`).
.
-&#1; &#35; &#1234; &#992; &#98765432;
+&#35; &#1234; &#992; &#98765432;
.
-<p>&#1; &#35; &#1234; &#992; &#98765432;</p>
+<p># Ӓ Ϡ �</p>
.
[Hexadecimal entities](#hexadecimal-entities) <a id="hexadecimal-entities"></a>
@@ -3767,7 +3779,7 @@ Here are some nonentities:
.
Although HTML5 does accept some entities without a trailing semicolon
-(such as `&copy`), these are not recognized as entities here:
+(such as `&copy`), these are not recognized as entities here, because accepting them would make the grammar too ambiguous:
.
&copy
@@ -3775,13 +3787,12 @@ Although HTML5 does accept some entities without a trailing semicolon
<p>&amp;copy</p>
.
-On the other hand, many strings that are not on the list of HTML5
-named entities are recognized as entities here:
+Strings that are not on the list of HTML5 named entities are not recognized as entities either:
.
&MadeUpEntity;
.
-<p>&MadeUpEntity;</p>
+<p>&amp;MadeUpEntity;</p>
.
Entities are recognized in any context besides code spans or
diff --git a/src/inlines.c b/src/inlines.c
index 7b27150..aa0e13e 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -20,8 +20,9 @@ typedef struct Subject {
reference* lookup_reference(reference** refmap, chunk *label);
reference* make_reference(chunk *label, chunk *url, chunk *title);
-static unsigned char *clean_url(chunk *url, int is_email);
+static unsigned char *clean_url(chunk *url);
static unsigned char *clean_title(chunk *title);
+static unsigned char *clean_autolink(chunk *url, int is_email);
inline static void chunk_free(chunk *c);
inline static void chunk_trim(chunk *c);
@@ -91,7 +92,7 @@ extern reference* make_reference(chunk *label, chunk *url, chunk *title)
reference *ref;
ref = malloc(sizeof(reference));
ref->label = normalize_reference(label);
- ref->url = clean_url(url, 0);
+ ref->url = clean_url(url);
ref->title = clean_title(title);
return ref;
}
@@ -123,27 +124,31 @@ static unsigned char *bufdup(const unsigned char *buf)
return new;
}
-inline static node_inl* make_link_from_reference(node_inl* label, reference *ref)
+static inline node_inl *make_link_(node_inl *label, unsigned char *url, unsigned char *title)
{
node_inl* e = (node_inl*) malloc(sizeof(node_inl));
e->tag = INL_LINK;
e->content.linkable.label = label;
- e->content.linkable.url = bufdup(ref->url);
- e->content.linkable.title = bufdup(ref->title);
+ e->content.linkable.url = url;
+ e->content.linkable.title = title;
e->next = NULL;
return e;
}
+inline static node_inl* make_ref_link(node_inl* label, reference *ref)
+{
+ return make_link_(label, bufdup(ref->url), bufdup(ref->title));
+}
+
+inline static node_inl* make_autolink(node_inl* label, chunk url, int is_email)
+{
+ return make_link_(label, clean_autolink(&url, is_email), NULL);
+}
+
// Create an inline with a linkable string value.
-inline static node_inl* make_link(node_inl* label, chunk url, chunk title, int is_email)
+inline static node_inl* make_link(node_inl* label, chunk url, chunk title)
{
- node_inl* e = (node_inl*) malloc(sizeof(node_inl));
- e->tag = INL_LINK;
- e->content.linkable.label = label;
- e->content.linkable.url = clean_url(&url, is_email);
- e->content.linkable.title = clean_title(&title);
- e->next = NULL;
- return e;
+ return make_link_(label, clean_url(&url), clean_title(&title));
}
inline static node_inl* make_inlines(int t, node_inl* contents)
@@ -587,7 +592,7 @@ extern void unescape_buffer(strbuf *buf)
// Clean a URL: remove surrounding whitespace and surrounding <>,
// and remove \ that escape punctuation.
-static unsigned char *clean_url(chunk *url, int is_email)
+static unsigned char *clean_url(chunk *url)
{
strbuf buf = GH_BUF_INIT;
@@ -596,9 +601,6 @@ static unsigned char *clean_url(chunk *url, int is_email)
if (url->len == 0)
return NULL;
- if (is_email)
- strbuf_puts(&buf, "mailto:");
-
if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
} else {
@@ -609,6 +611,22 @@ static unsigned char *clean_url(chunk *url, int is_email)
return strbuf_detach(&buf);
}
+static unsigned char *clean_autolink(chunk *url, int is_email)
+{
+ strbuf buf = GH_BUF_INIT;
+
+ chunk_trim(url);
+
+ if (url->len == 0)
+ return NULL;
+
+ if (is_email)
+ strbuf_puts(&buf, "mailto:");
+
+ houdini_unescape_html_f(&buf, url->data, url->len);
+ return strbuf_detach(&buf);
+}
+
// Clean a title: remove surrounding quotes and remove \ that escape punctuation.
static unsigned char *clean_title(chunk *title)
{
@@ -649,11 +667,9 @@ static node_inl* handle_pointy_brace(subject* subj)
contents = chunk_dup(&subj->input, subj->pos, matchlen - 1);
subj->pos += matchlen;
- return make_link(
+ return make_autolink(
make_str_with_entities(&contents),
- contents,
- chunk_literal(""),
- 0
+ contents, 0
);
}
@@ -663,11 +679,9 @@ static node_inl* handle_pointy_brace(subject* subj)
contents = chunk_dup(&subj->input, subj->pos, matchlen - 1);
subj->pos += matchlen;
- return make_link(
+ return make_autolink(
make_str_with_entities(&contents),
- contents,
- chunk_literal(""),
- 1
+ contents, 1
);
}
@@ -792,7 +806,7 @@ static node_inl* handle_left_bracket(subject* subj)
title = chunk_dup(&subj->input, starttitle, endtitle - starttitle);
lab = parse_chunk_inlines(&rawlabel, NULL);
- return make_link(lab, url, title, 0);
+ return make_link(lab, url, title);
} else {
// if we get here, we matched a label but didn't get further:
subj->pos = endlabel;
@@ -823,7 +837,7 @@ static node_inl* handle_left_bracket(subject* subj)
ref = lookup_reference(subj->reference_map, &reflabel);
if (ref != NULL) { // found
lab = parse_chunk_inlines(&rawlabel, NULL);
- result = make_link_from_reference(lab, ref);
+ result = make_ref_link(lab, ref);
} else {
subj->pos = endlabel;
lab = parse_chunk_inlines(&rawlabel, subj->reference_map);