aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVicent Marti <tanoku@gmail.com>2014-09-06 20:48:05 +0200
committerVicent Marti <tanoku@gmail.com>2014-09-09 03:39:16 +0200
commit61e3e606e64221eaa5cf3d83dc598d5a42818d10 (patch)
tree1dfb6309c0e0fd7de8094c9da6497992b156350c
parent278b89d092cae8fe9cdd6346c69512886d36abbd (diff)
UTF8-aware detabbing and entity handling
-rw-r--r--Makefile13
-rw-r--r--src/blocks.c35
-rw-r--r--src/html/houdini.h2
-rw-r--r--src/html/html.c1
-rw-r--r--src/inlines.c63
-rw-r--r--src/print.c5
-rw-r--r--src/stmd.h3
-rw-r--r--src/utf8.c59
8 files changed, 95 insertions, 86 deletions
diff --git a/Makefile b/Makefile
index 0d2eb8b..b5e487d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-CFLAGS=-g -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS)
-LDFLAGS=-g -O3 -Wall -Werror
+CFLAGS=-g -pg -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS)
+LDFLAGS=-g -pg -O3 -Wall -Werror
SRCDIR=src
DATADIR=data
@@ -41,11 +41,11 @@ testjs: spec.txt
benchjs:
node js/bench.js ${BENCHINP}
-HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o
+HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o $(SRCDIR)/html/houdini_html_u.o
STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o
-$(PROG): $(SRCDIR)/main.c $(HTML_OBJ) $(STMD_OBJ)
- $(CC) $(LDFLAGS) -o $@ $^
+$(PROG): $(SRCDIR)/html/html_unescape.h $(SRCDIR)/case_fold_switch.inc $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c
+ $(CC) $(LDFLAGS) -o $@ $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c
$(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re
re2c --case-insensitive -bis $< > $@ || (rm $@ && false)
@@ -53,6 +53,9 @@ $(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re
$(SRCDIR)/case_fold_switch.inc: $(DATADIR)/CaseFolding-3.2.0.txt
perl mkcasefold.pl < $< > $@
+$(SRCDIR)/html/html_unescape.h: $(SRCDIR)/html/html_unescape.gperf
+ gperf -I -t -N find_entity -H hash_entity -K entity -C -l --null-strings -m5 $< > $@
+
.PHONY: leakcheck clean fuzztest dingus upload
dingus:
diff --git a/src/blocks.c b/src/blocks.c
index f671b5e..8c7d49c 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -5,6 +5,8 @@
#include <ctype.h>
#include "stmd.h"
+#include "utf8.h"
+#include "html/houdini.h"
#include "scanners.h"
#include "uthash.h"
@@ -184,7 +186,7 @@ static void finalize(node_block* b, int line_number)
firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
strbuf_init(&b->attributes.fenced_code_data.info, 0);
- strbuf_set(
+ houdini_unescape_html_f(
&b->attributes.fenced_code_data.info,
b->string_content.ptr,
firstlinelen
@@ -369,31 +371,6 @@ static int lists_match(struct ListData list_data,
list_data.bullet_char == item_data.bullet_char);
}
-static void expand_tabs(strbuf *ob, const unsigned char *line, size_t size)
-{
- size_t i = 0, tab = 0;
-
- while (i < size) {
- size_t org = i;
-
- while (i < size && line[i] != '\t') {
- i++; tab++;
- }
-
- if (i > org)
- strbuf_put(ob, line + org, i - org);
-
- if (i >= size)
- break;
-
- do {
- strbuf_putc(ob, ' '); tab++;
- } while (tab % 4);
-
- i++;
- }
-}
-
static node_block *finalize_document(node_block *document, int linenum)
{
while (document != document->top) {
@@ -415,7 +392,7 @@ extern node_block *stmd_parse_file(FILE *f)
node_block *document = make_document();
while (fgets((char *)buffer, sizeof(buffer), f)) {
- expand_tabs(&line, buffer, strlen((char *)buffer));
+ utf8proc_detab(&line, buffer, strlen((char *)buffer));
incorporate_line(&line, linenum, &document);
strbuf_clear(&line);
linenum++;
@@ -436,10 +413,10 @@ extern node_block *stmd_parse_document(const unsigned char *buffer, size_t len)
const unsigned char *eol = memchr(buffer, '\n', end - buffer);
if (!eol) {
- expand_tabs(&line, buffer, end - buffer);
+ utf8proc_detab(&line, buffer, end - buffer);
buffer = end;
} else {
- expand_tabs(&line, buffer, (eol - buffer) + 1);
+ utf8proc_detab(&line, buffer, (eol - buffer) + 1);
buffer += (eol - buffer) + 1;
}
diff --git a/src/html/houdini.h b/src/html/houdini.h
index 1e54d20..5fd690d 100644
--- a/src/html/houdini.h
+++ b/src/html/houdini.h
@@ -25,9 +25,11 @@ extern "C" {
#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10)
#define HOUDINI_UNESCAPED_SIZE(x) (x)
+extern size_t houdini_unescape_ent(strbuf *ob, const uint8_t *src, size_t size);
extern int houdini_escape_html(strbuf *ob, const uint8_t *src, size_t size);
extern int houdini_escape_html0(strbuf *ob, const uint8_t *src, size_t size, int secure);
extern int houdini_unescape_html(strbuf *ob, const uint8_t *src, size_t size);
+extern void houdini_unescape_html_f(strbuf *ob, const uint8_t *src, size_t size);
extern int houdini_escape_xml(strbuf *ob, const uint8_t *src, size_t size);
extern int houdini_escape_uri(strbuf *ob, const uint8_t *src, size_t size);
extern int houdini_escape_url(strbuf *ob, const uint8_t *src, size_t size);
diff --git a/src/html/html.c b/src/html/html.c
index 758ec80..595dfcd 100644
--- a/src/html/html.c
+++ b/src/html/html.c
@@ -166,7 +166,6 @@ void inlines_to_html(strbuf *html, node_inl* ils)
break;
case INL_RAW_HTML:
- case INL_ENTITY:
strbuf_put(html,
ils->content.literal.data,
ils->content.literal.len);
diff --git a/src/inlines.c b/src/inlines.c
index 6b17027..7b27150 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -5,6 +5,8 @@
#include <ctype.h>
#include "stmd.h"
+#include "html/houdini.h"
+#include "utf8.h"
#include "uthash.h"
#include "scanners.h"
@@ -176,7 +178,6 @@ inline static node_inl* make_simple(int t)
#define make_str(s) make_literal(INL_STRING, s)
#define make_code(s) make_literal(INL_CODE, s)
#define make_raw_html(s) make_literal(INL_RAW_HTML, s)
-#define make_entity(s) make_literal(INL_ENTITY, s)
#define make_linebreak() make_simple(INL_LINEBREAK)
#define make_softbreak() make_simple(INL_SOFTBREAK)
#define make_emph(contents) make_inlines(INL_EMPH, contents)
@@ -191,7 +192,6 @@ extern void free_inlines(node_inl* e)
case INL_STRING:
case INL_RAW_HTML:
case INL_CODE:
- case INL_ENTITY:
chunk_free(&e->content.literal);
break;
case INL_LINEBREAK:
@@ -540,45 +540,34 @@ static node_inl* handle_backslash(subject *subj)
// Assumes the subject has an '&' character at the current position.
static node_inl* handle_entity(subject* subj)
{
- int match;
- node_inl *result;
- match = scan_entity(&subj->input, subj->pos);
- if (match) {
- result = make_entity(chunk_dup(&subj->input, subj->pos, match));
- subj->pos += match;
- } else {
- advance(subj);
- result = make_str(chunk_literal("&"));
- }
- return result;
+ strbuf ent = GH_BUF_INIT;
+ size_t len;
+
+ advance(subj);
+
+ len = houdini_unescape_ent(&ent,
+ subj->input.data + subj->pos,
+ subj->input.len - subj->pos
+ );
+
+ if (len == 0)
+ return make_str(chunk_literal("&"));
+
+ subj->pos += len;
+ return make_str(chunk_buf_detach(&ent));
}
// Like make_str, but parses entities.
// Returns an inline sequence consisting of str and entity elements.
static node_inl *make_str_with_entities(chunk *content)
{
- node_inl *result = NULL;
- node_inl *new;
- int searchpos;
- char c;
- subject subj;
-
- subject_from_chunk(&subj, content, NULL);
+ strbuf unescaped = GH_BUF_INIT;
- while ((c = peek_char(&subj))) {
- switch (c) {
- case '&':
- new = handle_entity(&subj);
- break;
- default:
- searchpos = chunk_strchr(&subj.input, '&', subj.pos);
- new = make_str(chunk_dup(&subj.input, subj.pos, searchpos - subj.pos));
- subj.pos = searchpos;
- }
- result = append_inlines(result, new);
+ if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) {
+ return make_str(chunk_buf_detach(&unescaped));
+ } else {
+ return make_str(*content);
}
-
- return result;
}
// Destructively unescape a string: remove backslashes before punctuation chars.
@@ -611,9 +600,9 @@ static unsigned char *clean_url(chunk *url, int is_email)
strbuf_puts(&buf, "mailto:");
if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
- strbuf_put(&buf, url->data + 1, url->len - 2);
+ houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
} else {
- strbuf_put(&buf, url->data, url->len);
+ houdini_unescape_html_f(&buf, url->data, url->len);
}
unescape_buffer(&buf);
@@ -636,9 +625,9 @@ static unsigned char *clean_title(chunk *title)
if ((first == '\'' && last == '\'') ||
(first == '(' && last == ')') ||
(first == '"' && last == '"')) {
- strbuf_set(&buf, title->data + 1, title->len - 2);
+ houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
} else {
- strbuf_set(&buf, title->data, title->len);
+ houdini_unescape_html_f(&buf, title->data, title->len);
}
unescape_buffer(&buf);
diff --git a/src/print.c b/src/print.c
index 0ff86fa..9240dac 100644
--- a/src/print.c
+++ b/src/print.c
@@ -145,11 +145,6 @@ extern void print_inlines(node_inl* ils, int indent)
print_str(ils->content.literal.data, ils->content.literal.len);
putchar('\n');
break;
- case INL_ENTITY:
- printf("entity ");
- print_str(ils->content.literal.data, ils->content.literal.len);
- putchar('\n');
- break;
case INL_LINK:
case INL_IMAGE:
printf("%s url=", ils->tag == INL_LINK ? "link" : "image");
diff --git a/src/stmd.h b/src/stmd.h
index be65371..c80eeda 100644
--- a/src/stmd.h
+++ b/src/stmd.h
@@ -17,7 +17,6 @@ struct node_inl {
INL_LINEBREAK,
INL_CODE,
INL_RAW_HTML,
- INL_ENTITY,
INL_EMPH,
INL_STRONG,
INL_LINK,
@@ -133,6 +132,4 @@ void print_blocks(node_block* blk, int indent);
void blocks_to_html(strbuf *html, node_block *b, bool tight);
void inlines_to_html(strbuf *html, node_inl *b);
-void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len);
-
#endif
diff --git a/src/utf8.c b/src/utf8.c
index cebd872..12d7ba5 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -3,7 +3,7 @@
#include <unistd.h>
#include <assert.h>
-#include "stmd.h"
+#include "utf8.h"
static const int8_t utf8proc_utf8class[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -23,6 +23,12 @@ static const int8_t utf8proc_utf8class[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
+static void encode_unknown(strbuf *buf)
+{
+ static const unsigned char repl[] = {239, 191, 189};
+ strbuf_put(buf, repl, 3);
+}
+
ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len)
{
ssize_t length, i;
@@ -46,6 +52,46 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len)
return length;
}
+void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size)
+{
+ static const unsigned char whitespace[] = " ";
+
+ size_t i = 0, tab = 0;
+
+ while (i < size) {
+ size_t org = i;
+
+ while (i < size && line[i] != '\t' && line[i] <= 0x80) {
+ i++; tab++;
+ }
+
+ if (i > org)
+ strbuf_put(ob, line + org, i - org);
+
+ if (i >= size)
+ break;
+
+ if (line[i] == '\t') {
+ int numspaces = 4 - (tab % 4);
+ strbuf_put(ob, whitespace, numspaces);
+ i += 1;
+ tab += numspaces;
+ } else {
+ ssize_t charlen = utf8proc_charlen(line + i, size - i);
+
+ if (charlen < 0) {
+ encode_unknown(ob);
+ i++;
+ } else {
+ strbuf_put(ob, line + i, charlen);
+ i += charlen;
+ }
+
+ tab += 1;
+ }
+ }
+}
+
ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst)
{
ssize_t length;
@@ -89,9 +135,9 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf)
unsigned char dst[4];
int len = 0;
- if (uc < 0x00) {
- assert(false);
- } else if (uc < 0x80) {
+ assert(uc >= 0);
+
+ if (uc < 0x80) {
dst[0] = uc;
len = 1;
} else if (uc < 0x800) {
@@ -116,7 +162,8 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf)
dst[3] = 0x80 + (uc & 0x3F);
len = 4;
} else {
- assert(false);
+ encode_unknown(buf);
+ return;
}
strbuf_put(buf, dst, len);
@@ -133,7 +180,7 @@ void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len)
ssize_t char_len = utf8proc_iterate(str, len, &c);
if (char_len < 0) {
- bufpush(0xFFFD);
+ encode_unknown(dest);
continue;
}