From e216094e2192c05ddbd0988458eb8c0012e7baf8 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 01:10:54 +0200 Subject: lol --- src/utf8.c | 221 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 129 insertions(+), 92 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 4bb3b35..1a5df9e 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -2,105 +2,142 @@ #include "bstrlib.h" #include "debug.h" -#define advance(s) \ - s++; \ - check(*s >> 6 == 0x02, "UTF-8 decode error on byte %x", *s); - -// Reads a unicode code point from a UTF8-encoded string, and -// puts it in the pointer n. If something illegal -// is encountered, 0xFFFD is emitted. -// Returns a pointer to next position in string, or NULL if no -// more characters remain. -extern unsigned char * from_utf8(unsigned char * s, unsigned int *n) +static const int8_t utf8proc_utf8class[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; + +ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) { - int x = 0; - - if (*s == 0) { - return NULL; - } else if (*s < 0x80) { - x = *s; - } else if (*s >> 5 == 0x06) { - x = *s & 0x1F; - advance(s); - x = (x << 6) + (*s & 0x3F); - } else if (*s >> 4 == 0x0E) { - x = *s & 0x0F; - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - } else if (*s >> 3 == 0x1E) { - x = *s & 0x07; - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - } else if (*s >> 2 == 0x3E) { - x = *s & 0x03; - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - } else { - log_err("UTF-8 decode error on byte %x", *s); - goto error; - } - *n = x; - s++; - return s; - error: - *n = 0xFFFD; - return s; + ssize_t length, i; + + if (!str_len) + return 0; + + length = utf8proc_utf8class[str[0]]; + + if (!length) + return -1; + + if (str_len >= 0 && length > str_len) + return -1; + + for (i = 1; i < length; i++) { + if ((str[i] & 0xC0) != 0x80) + return -1; + } + + return length; +} + +ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) +{ + ssize_t length; + int32_t uc = -1; + + *dst = -1; + length = utf8proc_charlen(str, str_len); + if (length < 0) + return -1; + + switch (length) { + case 1: + uc = str[0]; + break; + case 2: + uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); + if (uc < 0x80) uc = -1; + break; + case 3: + uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + + (str[2] & 0x3F); + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || + (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; + break; + case 4: + uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) + + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); + if (uc < 0x10000 || uc >= 0x110000) uc = -1; + break; + } + + if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) + return -1; + + *dst = uc; + return length; } -// Converts the unicode code point c to UTF-8, -// putting the result in dest. Returns 0 on success, -1 on error. -extern int to_utf8(unsigned int c, bstring dest) +void utf8_encode_char(int32_t uc, gh_buf *buf) { - if (c < 0x80) { - bconchar(dest, c); - } else if (c < 0x800) { - bconchar(dest, 192 + c/64); - bconchar(dest, 128 + c%64); - } else if (c - 0xd800u < 0x800) { - goto error; - } else if (c < 0x10000) { - bconchar(dest, 224 + c / 4096); - bconchar(dest, 128 + c /64%64); - bconchar(dest, 128 + c%64); - } else if (c < 0x110000) { - bconchar(dest, 240 + c/262144); - bconchar(dest, 128 + c/4096%64); - bconchar(dest, 128 + c/64%64); - bconchar(dest, 128 + c%64); - } else { - goto error; - } - return 0; -error: - return -1; + char dst[4]; + int len = 0; + + if (uc < 0x00) { + assert(false); + } else if (uc < 0x80) { + dst[0] = uc; + len = 1; + } else if (uc < 0x800) { + dst[0] = 0xC0 + (uc >> 6); + dst[1] = 0x80 + (uc & 0x3F); + len = 2; + } else if (uc == 0xFFFF) { + dst[0] = 0xFF; + return 1; + } else if (uc == 0xFFFE) { + dst[0] = 0xFE; + len = 1; + } else if (uc < 0x10000) { + dst[0] = 0xE0 + (uc >> 12); + dst[1] = 0x80 + ((uc >> 6) & 0x3F); + dst[2] = 0x80 + (uc & 0x3F); + len = 3; + } else if (uc < 0x110000) { + dst[0] = 0xF0 + (uc >> 18); + dst[1] = 0x80 + ((uc >> 12) & 0x3F); + dst[2] = 0x80 + ((uc >> 6) & 0x3F); + dst[3] = 0x80 + (uc & 0x3F); + len = 4; + } else { + assert(false); + } + + gh_buf_put(buf, dst, len); } +void utf8proc_case_fold(gh_buf *dest, const unsigned char *str, int len) +{ + int32_t c; + #define bufpush(x) \ - check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x) + utf8proc_encode_char(x, dest) -// Returns the case-folded version of the source string, or NULL on error. -extern bstring case_fold(bstring source) -{ - unsigned char * s = source->data; - unsigned int c = 0; - bstring buf = bfromcstr(""); - while ((s = from_utf8(s, &c))) { -#include "case_fold_switch.c" - } - return buf; -error: - return NULL; + while (len > 0) { + ssize_t char_len = utf8proc_iterate(str, len, &c); + + if (char_len < 0) { + bufpush(0xFFFD); + continue; + } + +#include "case_fold_switch.inc" + + str += char_len; + len -= char_len; + } } -- cgit v1.2.3 From 582674e662d1f8757350c51486a5e0a837195e15 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 13:18:04 +0200 Subject: ffffix --- Makefile | 11 ++- src/blocks.c | 58 +++++++---- src/buffer.c | 69 +++++-------- src/buffer.h | 19 ++-- src/html.c | 276 ---------------------------------------------------- src/inlines.c | 4 +- src/main.c | 142 ++++++++++++--------------- src/print.c | 307 ++++++++++++++++++++++++++++++---------------------------- src/stmd.h | 13 +-- src/utf8.c | 6 +- src/utf8.h | 6 -- 11 files changed, 304 insertions(+), 607 deletions(-) delete mode 100644 src/html.c delete mode 100644 src/utf8.h (limited to 'src/utf8.c') diff --git a/Makefile b/Makefile index cb5938d..d14a928 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ DATADIR=data PROG=./stmd .PHONY: all oldtests test spec benchjs testjs -all: $(SRCDIR)/case_fold_switch.c $(PROG) +all: $(SRCDIR)/case_fold_switch.inc $(PROG) README.html: README.md template.html pandoc --template template.html -S -s -t html5 -o $@ $< @@ -41,13 +41,16 @@ testjs: spec.txt benchjs: node js/bench.js ${BENCHINP} -$(PROG): $(SRCDIR)/main.c $(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/html.o $(SRCDIR)/utf8.o +HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o +STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o + +$(PROG): $(SRCDIR)/main.c $(HTML_OBJ) $(STMD_OBJ) $(CC) $(LDFLAGS) -o $@ $^ $(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re re2c --case-insensitive -bis $< > $@ || (rm $@ && false) -$(SRCDIR)/case_fold_switch.inc $(DATADIR)/CaseFolding-3.2.0.txt +$(SRCDIR)/case_fold_switch.inc: $(DATADIR)/CaseFolding-3.2.0.txt perl mkcasefold.pl < $< > $@ .PHONY: leakcheck clean fuzztest dingus upload @@ -72,7 +75,7 @@ update-site: spec.html narrative.html (cd _site ; git pull ; git commit -a -m "Updated site for latest spec, narrative, js" ; git push; cd ..) clean: - -rm -f test $(SRCDIR)/*.o $(SRCDIR)/scanners.c + -rm -f test $(SRCDIR)/*.o $(SRCDIR)/scanners.c $(SRCDIR)/html/*.o -rm -rf *.dSYM -rm -f README.html -rm -f spec.md fuzz.txt spec.html diff --git a/src/blocks.c b/src/blocks.c index eabac03..71dc830 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -3,11 +3,12 @@ #include #include #include -#include "bstrlib.h" + #include "stmd.h" -#include "uthash.h" -#include "debug.h" #include "scanners.h" +#include "uthash.h" + +static void finalize(block* b, int line_number); static block* make_block(int tag, int start_line, int start_column) { @@ -140,7 +141,7 @@ static int break_out_of_lists(block ** bptr, int line_number) } -extern void finalize(block* b, int line_number) +static void finalize(block* b, int line_number) { int firstlinelen; int pos; @@ -364,7 +365,7 @@ static int lists_match(struct ListData list_data, list_data.bullet_char == item_data.bullet_char); } -static void expand_tabs(gh_buf *ob, const char *line, size_t size) +static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size) { size_t i = 0, tab = 0; @@ -389,13 +390,43 @@ static void expand_tabs(gh_buf *ob, const char *line, size_t size) } } -extern block *stmd_parse_document(const char *buffer, size_t len) +static block *finalize_parsing(block *document, int linenum) { - gh_buf line = GH_BUF_INIT; + while (document != document->top) { + finalize(document, linenum); + document = document->parent; + } + + finalize(document, linenum); + process_inlines(document, document->attributes.refmap); + + return document; +} +extern block *stmd_parse_file(FILE *f) +{ + gh_buf line = GH_BUF_INIT; + unsigned char buffer[4096]; + int linenum = 1; block *document = make_document(); + + while (fgets((char *)buffer, sizeof(buffer), f)) { + expand_tabs(&line, buffer, strlen(buffer)); + incorporate_line(&line, linenum, &document); + gh_buf_clear(&line); + linenum++; + } + + gh_buf_free(&line); + return finalize_document(document, linenum); +} + +extern block *stmd_parse_document(const unsigned char *buffer, size_t len) +{ + gh_buf line = GH_BUF_INIT; int linenum = 1; - const char *end = buffer + len; + const unsigned char *end = buffer + len; + block *document = make_document(); while (buffer < end) { const char *eol = memchr(buffer, '\n', end - buffer); @@ -414,16 +445,7 @@ extern block *stmd_parse_document(const char *buffer, size_t len) } gh_buf_free(&line); - - while (document != document->top) { - finalize(document, linenum); - document = document->parent; - } - - finalize(document, linenum); - process_inlines(document, document->attributes.refmap); - - return document; + return finalize_document(document, linenum); } // Process one line at a time, modifying a block. diff --git a/src/buffer.c b/src/buffer.c index b81e7fa..17dc864 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -31,10 +31,10 @@ void gh_buf_init(gh_buf *buf, int initial_size) int gh_buf_try_grow(gh_buf *buf, int target_size, bool mark_oom) { - char *new_ptr; - size_t new_size; + unsigned char *new_ptr; + int new_size; - if (buf->ptr == gh_buf__oom || buf->asize < 0) + if (buf->ptr == gh_buf__oom) return -1; if (target_size <= buf->asize) @@ -79,7 +79,7 @@ void gh_buf_free(gh_buf *buf) { if (!buf) return; - if (buf->asize > 0 && buf->ptr != gh_buf__initbuf && buf->ptr != gh_buf__oom) + if (buf->ptr != gh_buf__initbuf && buf->ptr != gh_buf__oom) free(buf->ptr); gh_buf_init(buf, 0); @@ -91,14 +91,9 @@ void gh_buf_clear(gh_buf *buf) if (buf->asize > 0) buf->ptr[0] = '\0'; - - if (buf->asize < 0) { - buf->ptr = gh_buf__initbuf; - buf->asize = 0; - } } -int gh_buf_set(gh_buf *buf, const char *data, int len) +int gh_buf_set(gh_buf *buf, const unsigned char *data, int len) { if (len == 0 || data == NULL) { gh_buf_clear(buf); @@ -115,10 +110,12 @@ int gh_buf_set(gh_buf *buf, const char *data, int len) int gh_buf_sets(gh_buf *buf, const char *string) { - return gh_buf_set(buf, string, string ? strlen(string) : 0); + return gh_buf_set(buf, + (const unsigned char *)string, + string ? strlen(string) : 0); } -int gh_buf_putc(gh_buf *buf, char c) +int gh_buf_putc(gh_buf *buf, int c) { ENSURE_SIZE(buf, buf->size + 2); buf->ptr[buf->size++] = c; @@ -126,7 +123,7 @@ int gh_buf_putc(gh_buf *buf, char c) return 0; } -int gh_buf_put(gh_buf *buf, const char *data, int len) +int gh_buf_put(gh_buf *buf, const unsigned char *data, int len) { ENSURE_SIZE(buf, buf->size + len + 1); memmove(buf->ptr + buf->size, data, len); @@ -137,8 +134,7 @@ int gh_buf_put(gh_buf *buf, const char *data, int len) int gh_buf_puts(gh_buf *buf, const char *string) { - assert(string); - return gh_buf_put(buf, string, strlen(string)); + return gh_buf_put(buf, (const unsigned char *)string, strlen(string)); } int gh_buf_vprintf(gh_buf *buf, const char *format, va_list ap) @@ -153,7 +149,7 @@ int gh_buf_vprintf(gh_buf *buf, const char *format, va_list ap) va_copy(args, ap); len = vsnprintf( - buf->ptr + buf->size, + (char *)buf->ptr + buf->size, buf->asize - buf->size, format, args ); @@ -187,9 +183,9 @@ int gh_buf_printf(gh_buf *buf, const char *format, ...) return r; } -void gh_buf_copy_cstr(char *data, size_t datasize, const gh_buf *buf) +void gh_buf_copy_cstr(char *data, int datasize, const gh_buf *buf) { - size_t copylen; + int copylen; assert(data && datasize && buf); @@ -212,9 +208,9 @@ void gh_buf_swap(gh_buf *buf_a, gh_buf *buf_b) *buf_b = t; } -char *gh_buf_detach(gh_buf *buf) +unsigned char *gh_buf_detach(gh_buf *buf) { - char *data = buf->ptr; + unsigned char *data = buf->ptr; if (buf->asize == 0 || buf->ptr == gh_buf__oom) return NULL; @@ -224,13 +220,13 @@ char *gh_buf_detach(gh_buf *buf) return data; } -void gh_buf_attach(gh_buf *buf, char *ptr, int asize) +void gh_buf_attach(gh_buf *buf, unsigned char *ptr, int asize) { gh_buf_free(buf); if (ptr) { buf->ptr = ptr; - buf->size = strlen(ptr); + buf->size = strlen((char *)ptr); if (asize) buf->asize = (asize < buf->size) ? buf->size + 1 : asize; else /* pass 0 to fall back on strlen + 1 */ @@ -249,11 +245,11 @@ int gh_buf_cmp(const gh_buf *a, const gh_buf *b) int gh_buf_strchr(const gh_buf *buf, int c, int pos) { - const char *p = memchr(buf->ptr + pos, c, buf->size - pos); - if (!p) - return -1; + const char *p = memchr(buf->ptr + pos, c, buf->size - pos); + if (!p) + return -1; - return (int)(p - p->ptr); + return (int)(p - buf->ptr); } int gh_buf_strrchr(const gh_buf *buf, int c, int pos) @@ -270,36 +266,21 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos) void gh_buf_truncate(gh_buf *buf, size_t len) { - assert(buf->asize >= 0); - if (len < buf->size) { buf->size = len; buf->ptr[buf->size] = '\0'; } } -void gh_buf_ltruncate(gh_buf *buf, size_t len) -{ - assert(buf->asize >= 0); - - if (len && len < buf->size) { - memmove(buf->ptr, buf->ptr + len, buf->size - len); - buf->size -= len; - buf->ptr[buf->size] = '\0'; - } -} - void gh_buf_trim(gh_buf *buf) { - size_t i = 0; - - assert(buf->asize >= 0); - - /* ltrim */ + /* TODO: leading whitespace? */ + /* while (i < buf->size && isspace(buf->ptr[i])) i++; gh_buf_truncate(buf, i); + */ /* rtrim */ while (buf->size > 0) { diff --git a/src/buffer.h b/src/buffer.h index 2581ee3..422ef02 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -24,13 +24,6 @@ extern unsigned char gh_buf__oom[]; */ extern void gh_buf_init(gh_buf *buf, int initial_size); -static inline void gh_buf_static(gh_buf *buf, unsigned char *source) -{ - buf->ptr = source; - buf->size = strlen(source); - buf->asize = -1; -} - /** * Attempt to grow the buffer to hold at least `target_size` bytes. * @@ -81,13 +74,13 @@ static inline size_t gh_buf_len(const gh_buf *buf) extern int gh_buf_cmp(const gh_buf *a, const gh_buf *b); -extern void gh_buf_attach(gh_buf *buf, char *ptr, int asize); -extern char *gh_buf_detach(gh_buf *buf); +extern void gh_buf_attach(gh_buf *buf, unsigned char *ptr, int asize); +extern unsigned char *gh_buf_detach(gh_buf *buf); extern void gh_buf_copy_cstr(char *data, int datasize, const gh_buf *buf); static inline const char *gh_buf_cstr(const gh_buf *buf) { - return buf->ptr; + return (char *)buf->ptr; } #define gh_buf_at(buf, n) ((buf)->ptr[n]) @@ -100,10 +93,10 @@ static inline const char *gh_buf_cstr(const gh_buf *buf) * return code of these functions and call them in a series then just call * gh_buf_oom at the end. */ -extern int gh_buf_set(gh_buf *buf, const char *data, int len); +extern int gh_buf_set(gh_buf *buf, const unsigned char *data, int len); extern int gh_buf_sets(gh_buf *buf, const char *string); -extern int gh_buf_putc(gh_buf *buf, char c); -extern int gh_buf_put(gh_buf *buf, const char *data, int len); +extern int gh_buf_putc(gh_buf *buf, int c); +extern int gh_buf_put(gh_buf *buf, const unsigned char *data, int len); extern int gh_buf_puts(gh_buf *buf, const char *string); extern int gh_buf_printf(gh_buf *buf, const char *format, ...) __attribute__((format (printf, 2, 3))); diff --git a/src/html.c b/src/html.c deleted file mode 100644 index aeec5f1..0000000 --- a/src/html.c +++ /dev/null @@ -1,276 +0,0 @@ -#include -#include -#include -#include "bstrlib.h" -#include "stmd.h" -#include "debug.h" -#include "scanners.h" - -// Functions to convert block and inline lists to HTML strings. - -// Escape special characters in HTML. More efficient than -// three calls to bfindreplace. If preserve_entities is set, -// existing entities are left alone. -static bstring escape_html(bstring inp, bool preserve_entities) -{ - int pos = 0; - int match; - char c; - bstring escapable = blk2bstr("&<>\"", 4); - bstring ent; - bstring s = bstrcpy(inp); - while ((pos = binchr(s, pos, escapable)) != BSTR_ERR) { - c = bchar(s,pos); - switch (c) { - case '<': - bdelete(s, pos, 1); - ent = blk2bstr("<", 4); - binsert(s, pos, ent, ' '); - bdestroy(ent); - pos += 4; - break; - case '>': - bdelete(s, pos, 1); - ent = blk2bstr(">", 4); - binsert(s, pos, ent, ' '); - bdestroy(ent); - pos += 4; - break; - case '&': - if (preserve_entities && (match = scan_entity(s, pos))) { - pos += match; - } else { - bdelete(s, pos, 1); - ent = blk2bstr("&", 5); - binsert(s, pos, ent, ' '); - bdestroy(ent); - pos += 5; - } - break; - case '"': - bdelete(s, pos, 1); - ent = blk2bstr(""", 6); - binsert(s, pos, ent, ' '); - bdestroy(ent); - pos += 6; - break; - default: - bdelete(s, pos, 1); - log_err("unexpected character %02x", c); - } - } - bdestroy(escapable); - return s; -} - -static inline void cr(bstring buffer) -{ - int c = bchar(buffer, blength(buffer) - 1); - if (c != '\n' && c) { - bconchar(buffer, '\n'); - } -} - -// Convert a block list to HTML. Returns 0 on success, and sets result. -extern int blocks_to_html(block* b, bstring* result, bool tight) -{ - bstring contents = NULL; - bstring escaped, escaped2; - struct bstrList * info_words; - struct ListData * data; - bstring mbstart; - bstring html = blk2bstr("", 0); - - while(b != NULL) { - switch(b->tag) { - case document: - check(blocks_to_html(b->children, &contents, false) == 0, - "error converting blocks to html"); - bformata(html, "%s", contents->data); - bdestroy(contents); - break; - case paragraph: - check(inlines_to_html(b->inline_content, &contents) == 0, - "error converting inlines to html"); - if (tight) { - bformata(html, "%s", contents->data); - } else { - cr(html); - bformata(html, "

%s

", contents->data); - cr(html); - } - bdestroy(contents); - break; - case block_quote: - check(blocks_to_html(b->children, &contents, false) == 0, - "error converting blocks to html"); - cr(html); - bformata(html, "
\n%s
", contents->data); - cr(html); - bdestroy(contents); - break; - case list_item: - check(blocks_to_html(b->children, &contents, tight) == 0, - "error converting blocks to html"); - brtrimws(contents); - cr(html); - bformata(html, "
  • %s
  • ", contents->data); - cr(html); - bdestroy(contents); - break; - case list: - // make sure a list starts at the beginning of the line: - cr(html); - data = &(b->attributes.list_data); - check(blocks_to_html(b->children, &contents, data->tight) == 0, - "error converting blocks to html"); - mbstart = bformat(" start=\"%d\"", data->start); - bformata(html, "<%s%s>\n%s", - data->list_type == bullet ? "ul" : "ol", - data->start == 1 ? "" : (char*) mbstart->data, - contents->data, - data->list_type == bullet ? "ul" : "ol"); - cr(html); - bdestroy(contents); - bdestroy(mbstart); - break; - case atx_header: - case setext_header: - check(inlines_to_html(b->inline_content, &contents) == 0, - "error converting inlines to html"); - cr(html); - bformata(html, "%s", - b->attributes.header_level, - contents->data, - b->attributes.header_level); - cr(html); - bdestroy(contents); - break; - case indented_code: - escaped = escape_html(b->string_content, false); - cr(html); - bformata(html, "
    %s
    ", escaped->data); - cr(html); - bdestroy(escaped); - break; - case fenced_code: - escaped = escape_html(b->string_content, false); - cr(html); - bformata(html, "
    attributes.fenced_code_data.info) > 0) {
    -        escaped2 = escape_html(b->attributes.fenced_code_data.info, true);
    -        info_words = bsplit(escaped2, ' ');
    -        bformata(html, " class=\"language-%s\"", info_words->entry[0]->data);
    -        bdestroy(escaped2);
    -        bstrListDestroy(info_words);
    -      }
    -      bformata(html, ">%s
    ", escaped->data); - cr(html); - bdestroy(escaped); - break; - case html_block: - bformata(html, "%s", b->string_content->data); - break; - case hrule: - bformata(html, "
    "); - cr(html); - break; - case reference_def: - break; - default: - log_warn("block type %d not implemented\n", b->tag); - break; - } - b = b->next; - } - *result = html; - return 0; - error: - return -1; -} - -// Convert an inline list to HTML. Returns 0 on success, and sets result. -extern int inlines_to_html(inl* ils, bstring* result) -{ - bstring contents = NULL; - bstring html = blk2bstr("", 0); - bstring mbtitle, escaped, escaped2; - - while(ils != NULL) { - switch(ils->tag) { - case str: - escaped = escape_html(ils->content.literal, false); - bformata(html, "%s", escaped->data); - bdestroy(escaped); - break; - case linebreak: - bformata(html, "
    \n"); - break; - case softbreak: - bformata(html, "\n"); - break; - case code: - escaped = escape_html(ils->content.literal, false); - bformata(html, "%s", escaped->data); - bdestroy(escaped); - break; - case raw_html: - case entity: - bformata(html, "%s", ils->content.literal->data); - break; - case link: - check(inlines_to_html(ils->content.inlines, &contents) == 0, - "error converting inlines to html"); - if (blength(ils->content.linkable.title) > 0) { - escaped = escape_html(ils->content.linkable.title, true); - mbtitle = bformat(" title=\"%s\"", escaped->data); - bdestroy(escaped); - } else { - mbtitle = blk2bstr("",0); - } - escaped = escape_html(ils->content.linkable.url, true); - bformata(html, "%s", - escaped->data, - mbtitle->data, - contents->data); - bdestroy(escaped); - bdestroy(mbtitle); - bdestroy(contents); - break; - case image: - check(inlines_to_html(ils->content.inlines, &contents) == 0, - "error converting inlines to html"); - escaped = escape_html(ils->content.linkable.url, true); - escaped2 = escape_html(contents, false); - bdestroy(contents); - bformata(html, "\"%s\"",data, escaped2->data); - bdestroy(escaped); - bdestroy(escaped2); - if (blength(ils->content.linkable.title) > 0) { - escaped = escape_html(ils->content.linkable.title, true); - bformata(html, " title=\"%s\"", escaped->data); - bdestroy(escaped); - } - bformata(html, " />"); - break; - case strong: - check(inlines_to_html(ils->content.inlines, &contents) == 0, - "error converting inlines to html"); - bformata(html, "%s", contents->data); - bdestroy(contents); - break; - case emph: - check(inlines_to_html(ils->content.inlines, &contents) == 0, - "error converting inlines to html"); - bformata(html, "%s", contents->data); - bdestroy(contents); - break; - } - ils = ils->next; - } - *result = html; - return 0; - error: - return -1; -} diff --git a/src/inlines.c b/src/inlines.c index 4ff45ad..82c7219 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -6,9 +6,7 @@ #include "stmd.h" #include "uthash.h" -#include "debug.h" #include "scanners.h" -#include "utf8.h" typedef struct Subject { const gh_buf *buffer; @@ -119,7 +117,7 @@ inline static inl* make_linkable(int t, inl* label, chunk url, chunk title) e->tag = t; e->content.linkable.label = label; e->content.linkable.url = chunk_to_cstr(&url); - e->content.linkable.title = chunk_to_cstr(&title); + e->content.linkable.title = url.len ? chunk_to_cstr(&title) : NULL; e->next = NULL; return e; } diff --git a/src/main.c b/src/main.c index 9e0a3c8..e1abedc 100644 --- a/src/main.c +++ b/src/main.c @@ -1,99 +1,77 @@ #include #include -#include "bstrlib.h" +#include #include "stmd.h" #include "debug.h" void print_usage() { - printf("Usage: stmd [FILE*]\n"); - printf("Options: --help, -h Print usage information\n"); - printf(" --ast Print AST instead of HTML\n"); - printf(" --version Print version\n"); + printf("Usage: stmd [FILE*]\n"); + printf("Options: --help, -h Print usage information\n"); + printf(" --ast Print AST instead of HTML\n"); + printf(" --version Print version\n"); } -int main(int argc, char *argv[]) { - int i; - bool ast = false; - int g = 0; - int numfps = 0; - int files[argc]; +static void print_document(block *document, bool ast) +{ + gh_buf html = GH_BUF_INIT; + + if (ast) { + print_blocks(document, 0); + } else { + blocks_to_html(&html, document, false); + printf("%s", html.ptr); + gh_buf_free(&html); + } +} - for (i=1; i < argc; i++) { - if (strcmp(argv[i], "--version") == 0) { - printf("stmd %s", VERSION); - printf(" - CommonMark converter (c) 2014 John MacFarlane\n"); - exit(0); - } else if ((strcmp(argv[i], "--help") == 0) || - (strcmp(argv[i], "-h") == 0)) { - print_usage(); - exit(0); - } else if (strcmp(argv[i], "--ast") == 0) { - ast = true; - } else if (*argv[i] == '-') { - print_usage(); - exit(1); - } else { // treat as file argument - files[g] = i; - g++; - } - } +int main(int argc, char *argv[]) +{ + int i, numfps = 0; + bool ast = false; + int files[argc]; + block *document = NULL; - numfps = g; - bstring s = NULL; - bstring html; - g = 0; - block * cur = make_document(); - int linenum = 1; - extern int errno; - FILE * fp = NULL; + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "--version") == 0) { + printf("stmd %s", VERSION); + printf(" - CommonMark converter (c) 2014 John MacFarlane\n"); + exit(0); + } else if ((strcmp(argv[i], "--help") == 0) || + (strcmp(argv[i], "-h") == 0)) { + print_usage(); + exit(0); + } else if (strcmp(argv[i], "--ast") == 0) { + ast = true; + } else if (*argv[i] == '-') { + print_usage(); + exit(1); + } else { // treat as file argument + files[numfps++] = i; + } + } - if (numfps == 0) { - // read from stdin - while ((s = bgets((bNgetc) fgetc, stdin, '\n'))) { - check(incorporate_line(s, linenum, &cur) == 0, - "error incorporating line %d", linenum); - bdestroy(s); - linenum++; - } - } else { - // iterate over input file pointers - for (g=0; g < numfps; g++) { + if (numfps == 0) { + document = stmd_parse_file(stdin); + print_document(document, ast); + free_blocks(document); + } else { + for (i = 0; i < numfps; i++) { + FILE *fp = fopen(argv[files[i]], "r"); - fp = fopen(argv[files[g]], "r"); - if (fp == NULL) { - fprintf(stderr, "Error opening file %s: %s\n", - argv[files[g]], strerror(errno)); - exit(1); - } + if (fp == NULL) { + fprintf(stderr, "Error opening file %s: %s\n", + argv[files[i]], strerror(errno)); + exit(1); + } - while ((s = bgets((bNgetc) fgetc, fp, '\n'))) { - check(incorporate_line(s, linenum, &cur) == 0, - "error incorporating line %d", linenum); - bdestroy(s); - linenum++; - } - fclose(fp); - } - } + document = stmd_parse_file(fp); + print_document(document, ast); + free_blocks(document); + fclose(fp); + } + } - while (cur != cur->top) { - finalize(cur, linenum); - cur = cur->parent; - } - check(cur == cur->top, "problems finalizing open containers"); - finalize(cur, linenum); - process_inlines(cur, cur->attributes.refmap); - if (ast) { - print_blocks(cur, 0); - } else { - check(blocks_to_html(cur, &html, false) == 0, "could not format as HTML"); - // printf("%s", html->data); - bdestroy(html); - } - free_blocks(cur); - return 0; -error: - return -1; + return 0; } diff --git a/src/print.c b/src/print.c index a924870..3ebde16 100644 --- a/src/print.c +++ b/src/print.c @@ -1,168 +1,175 @@ #include #include -#include "bstrlib.h" +#include #include "stmd.h" #include "debug.h" -static bstring format_str(bstring s) +static void print_str(const unsigned char *s, int len) { - int pos = 0; - int len = blength(s); - bstring result = bfromcstr(""); - char c; - bformata(result, "\""); - while (pos < len) { - c = bchar(s, pos); - switch (c) { - case '\n': - bformata(result, "\\n"); - break; - case '"': - bformata(result, "\\\""); - break; - case '\\': - bformata(result, "\\\\"); - break; - default: - bformata(result, "%c", c); - } - pos++; - } - bformata(result, "\""); - return result; + int i; + + if (len < 0) + len = strlen(s); + + putchar('"'); + for (i = 0; i < len; ++i) { + unsigned char c = s[i]; + + switch (c) { + case '\n': + printf("\\n"); + break; + case '"': + printf("\\\""); + break; + case '\\': + printf("\\\\"); + break; + default: + putchar((int)c); + } + } + putchar('"'); } // Functions to pretty-print inline and block lists, for debugging. // Prettyprint an inline list, for debugging. extern void print_blocks(block* b, int indent) { - struct ListData * data; - while(b != NULL) { - // printf("%3d %3d %3d| ", b->start_line, b->start_column, b->end_line); - for (int i=0; i < indent; i++) { - putchar(' '); - } - switch(b->tag) { - case document: - printf("document\n"); - print_blocks(b->children, indent + 2); - break; - case block_quote: - printf("block_quote\n"); - print_blocks(b->children, indent + 2); - break; - case list_item: - data = &(b->attributes.list_data); - printf("list_item\n"); - print_blocks(b->children, indent + 2); - break; - case list: - data = &(b->attributes.list_data); - if (data->list_type == ordered) { - printf("list (type=ordered tight=%s start=%d delim=%s)\n", - (data->tight ? "true" : "false"), - data->start, - (data->delimiter == parens ? "parens" : "period")); - } else { - printf("list (type=bullet tight=%s bullet_char=%c)\n", - (data->tight ? "true" : "false"), - data->bullet_char); - } - print_blocks(b->children, indent + 2); - break; - case atx_header: - printf("atx_header (level=%d)\n", b->attributes.header_level); - print_inlines(b->inline_content, indent + 2); - break; - case setext_header: - printf("setext_header (level=%d)\n", b->attributes.header_level); - print_inlines(b->inline_content, indent + 2); - break; - case paragraph: - printf("paragraph\n"); - print_inlines(b->inline_content, indent + 2); - break; - case hrule: - printf("hrule\n"); - break; - case indented_code: - printf("indented_code %s\n", format_str(b->string_content)->data); - break; - case fenced_code: - printf("fenced_code length=%d info=%s %s\n", - b->attributes.fenced_code_data.fence_length, - format_str(b->attributes.fenced_code_data.info)->data, - format_str(b->string_content)->data); - break; - case html_block: - printf("html_block %s\n", format_str(b->string_content)->data); - break; - case reference_def: - printf("reference_def\n"); - break; - default: - log_warn("block type %d not implemented\n", b->tag); - break; - } - b = b->next; - } + struct ListData *data; + + while(b != NULL) { + // printf("%3d %3d %3d| ", b->start_line, b->start_column, b->end_line); + for (int i=0; i < indent; i++) { + putchar(' '); + } + + switch(b->tag) { + case document: + printf("document\n"); + print_blocks(b->children, indent + 2); + break; + case block_quote: + printf("block_quote\n"); + print_blocks(b->children, indent + 2); + break; + case list_item: + data = &(b->attributes.list_data); + printf("list_item\n"); + print_blocks(b->children, indent + 2); + break; + case list: + data = &(b->attributes.list_data); + if (data->list_type == ordered) { + printf("list (type=ordered tight=%s start=%d delim=%s)\n", + (data->tight ? "true" : "false"), + data->start, + (data->delimiter == parens ? "parens" : "period")); + } else { + printf("list (type=bullet tight=%s bullet_char=%c)\n", + (data->tight ? "true" : "false"), + data->bullet_char); + } + print_blocks(b->children, indent + 2); + break; + case atx_header: + printf("atx_header (level=%d)\n", b->attributes.header_level); + print_inlines(b->inline_content, indent + 2); + break; + case setext_header: + printf("setext_header (level=%d)\n", b->attributes.header_level); + print_inlines(b->inline_content, indent + 2); + break; + case paragraph: + printf("paragraph\n"); + print_inlines(b->inline_content, indent + 2); + break; + case hrule: + printf("hrule\n"); + break; + case indented_code: + printf("indented_code "); + print_str(b->string_content.ptr, -1); + putchar('\n'); + break; + case fenced_code: + printf("fenced_code length=%d info=", + b->attributes.fenced_code_data.fence_length); + print_str(b->attributes.fenced_code_data.info.ptr, -1); + putchar(' '); + print_str(b->string_content.ptr, -1); + putchar('\n'); + break; + case html_block: + printf("html_block "); + print_str(b->string_content.ptr, -1); + putchar('\n'); + break; + case reference_def: + printf("reference_def\n"); + break; + default: + printf("# NOT IMPLEMENTED (%d)\n", b->tag); + break; + } + b = b->next; + } } // Prettyprint an inline list, for debugging. extern void print_inlines(inl* ils, int indent) { - while(ils != NULL) { - /* - // we add 11 extra spaces for the line/column info - for (int i=0; i < 11; i++) { - putchar(' '); - } - putchar('|'); - putchar(' '); - */ - for (int i=0; i < indent; i++) { - putchar(' '); - } - switch(ils->tag) { - case str: - printf("str %s\n", format_str(ils->content.literal)->data); - break; - case linebreak: - printf("linebreak\n"); - break; - case softbreak: - printf("softbreak\n"); - break; - case code: - printf("code %s\n", format_str(ils->content.literal)->data); - break; - case raw_html: - printf("html %s\n", format_str(ils->content.literal)->data); - break; - case entity: - printf("entity %s\n", format_str(ils->content.literal)->data); - break; - case link: - printf("link url=%s title=%s\n", - format_str(ils->content.linkable.url)->data, - format_str(ils->content.linkable.title)->data); - print_inlines(ils->content.linkable.label, indent + 2); - break; - case image: - printf("image url=%s title=%s\n", - format_str(ils->content.linkable.url)->data, - format_str(ils->content.linkable.title)->data); - print_inlines(ils->content.linkable.label, indent + 2); - break; - case strong: - printf("strong\n"); - print_inlines(ils->content.linkable.label, indent + 2); - break; - case emph: - printf("emph\n"); - print_inlines(ils->content.linkable.label, indent + 2); - break; - } - ils = ils->next; - } + while(ils != NULL) { + for (int i=0; i < indent; i++) { + putchar(' '); + } + switch(ils->tag) { + case str: + printf("str "); + print_str(ils->content.literal.data, ils->content.literal.len); + putchar('\n'); + break; + case linebreak: + printf("linebreak\n"); + break; + case softbreak: + printf("softbreak\n"); + break; + case code: + printf("code "); + print_str(ils->content.literal.data, ils->content.literal.len); + putchar('\n'); + break; + case raw_html: + printf("html "); + print_str(ils->content.literal.data, ils->content.literal.len); + putchar('\n'); + break; + case entity: + printf("entity "); + print_str(ils->content.literal.data, ils->content.literal.len); + putchar('\n'); + break; + case link: + case image: + printf("%s url=", ils->tag == link ? "link" : "image"); + print_str(ils->content.linkable.url, -1); + if (ils->content.linkable.title) { + printf(" title="); + print_str(ils->content.linkable.title, -1); + } + putchar('\n'); + print_inlines(ils->content.linkable.label, indent + 2); + break; + case strong: + printf("strong\n"); + print_inlines(ils->content.linkable.label, indent + 2); + break; + case emph: + printf("emph\n"); + print_inlines(ils->content.linkable.label, indent + 2); + break; + } + ils = ils->next; + } } diff --git a/src/stmd.h b/src/stmd.h index eb1b989..dc24235 100644 --- a/src/stmd.h +++ b/src/stmd.h @@ -105,19 +105,14 @@ extern block* add_child(block* parent, int block_type, int start_line, int start_column); void free_blocks(block* e); -block *stmd_parse_document(const char *buffer, size_t len); - -// FOR NOW: -void process_inlines(block* cur, reference** refmap); -void incorporate_line(gh_buf *ln, int line_number, block** curptr); -void finalize(block* b, int line_number); +extern block *stmd_parse_document(const unsigned char *buffer, size_t len); +extern block *stmd_parse_file(FILE *f); void print_inlines(inl* ils, int indent); void print_blocks(block* blk, int indent); -/* TODO */ -// int blocks_to_html(block* b, bstring* result, bool tight); -// int inlines_to_html(inl* b, bstring* result); +void blocks_to_html(gh_buf *html, block *b, bool tight); +void inlines_to_html(gh_buf *html, inl *b); void utf8proc_case_fold(gh_buf *dest, const unsigned char *str, int len); diff --git a/src/utf8.c b/src/utf8.c index 1a5df9e..e3f8dd3 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -1,6 +1,8 @@ #include -#include "bstrlib.h" -#include "debug.h" +#include +#include + +#include "stmd.h" static const int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, diff --git a/src/utf8.h b/src/utf8.h deleted file mode 100644 index fe59a90..0000000 --- a/src/utf8.h +++ /dev/null @@ -1,6 +0,0 @@ -#include -#include "bstrlib.h" - -extern unsigned char * from_utf8(unsigned char * s, unsigned int *n); -extern int to_utf8(unsigned int c, bstring dest); -extern bstring case_fold(bstring source); -- cgit v1.2.3 From c28af79329264a7cf331a1b1c414919e4ed9e9f9 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 13:37:34 +0200 Subject: It buiiiilds --- src/blocks.c | 11 ++- src/buffer.c | 6 +- src/html/houdini.h | 44 ++++++++++ src/html/houdini_href_e.c | 115 +++++++++++++++++++++++++ src/html/houdini_html_e.c | 89 +++++++++++++++++++ src/html/html.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++ src/stmd.h | 1 + src/utf8.c | 7 +- 8 files changed, 473 insertions(+), 12 deletions(-) create mode 100644 src/html/houdini.h create mode 100644 src/html/houdini_href_e.c create mode 100644 src/html/houdini_html_e.c create mode 100644 src/html/html.c (limited to 'src/utf8.c') diff --git a/src/blocks.c b/src/blocks.c index 71dc830..42f20db 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -8,6 +8,7 @@ #include "scanners.h" #include "uthash.h" +static void incorporate_line(gh_buf *ln, int line_number, block** curptr); static void finalize(block* b, int line_number); static block* make_block(int tag, int start_line, int start_column) @@ -390,7 +391,7 @@ static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size) } } -static block *finalize_parsing(block *document, int linenum) +static block *finalize_document(block *document, int linenum) { while (document != document->top) { finalize(document, linenum); @@ -411,7 +412,7 @@ extern block *stmd_parse_file(FILE *f) block *document = make_document(); while (fgets((char *)buffer, sizeof(buffer), f)) { - expand_tabs(&line, buffer, strlen(buffer)); + expand_tabs(&line, buffer, strlen((char *)buffer)); incorporate_line(&line, linenum, &document); gh_buf_clear(&line); linenum++; @@ -429,7 +430,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len) block *document = make_document(); while (buffer < end) { - const char *eol = memchr(buffer, '\n', end - buffer); + const unsigned char *eol = memchr(buffer, '\n', end - buffer); if (!eol) { expand_tabs(&line, buffer, end - buffer); @@ -449,9 +450,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len) } // Process one line at a time, modifying a block. -// Returns 0 if successful. curptr is changed to point to -// the currently open block. -extern void incorporate_line(gh_buf *ln, int line_number, block** curptr) +static void incorporate_line(gh_buf *ln, int line_number, block** curptr) { block* last_matched_container; int offset = 0; diff --git a/src/buffer.c b/src/buffer.c index 17dc864..cfc6a7e 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -245,11 +245,11 @@ int gh_buf_cmp(const gh_buf *a, const gh_buf *b) int gh_buf_strchr(const gh_buf *buf, int c, int pos) { - const char *p = memchr(buf->ptr + pos, c, buf->size - pos); + const unsigned char *p = memchr(buf->ptr + pos, c, buf->size - pos); if (!p) return -1; - return (int)(p - buf->ptr); + return (int)(p - (const unsigned char *)buf->ptr); } int gh_buf_strrchr(const gh_buf *buf, int c, int pos) @@ -264,7 +264,7 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos) return -1; } -void gh_buf_truncate(gh_buf *buf, size_t len) +void gh_buf_truncate(gh_buf *buf, int len) { if (len < buf->size) { buf->size = len; diff --git a/src/html/houdini.h b/src/html/houdini.h new file mode 100644 index 0000000..31fe917 --- /dev/null +++ b/src/html/houdini.h @@ -0,0 +1,44 @@ +#ifndef __HOUDINI_H__ +#define __HOUDINI_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "buffer.h" + +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#ifdef HOUDINI_USE_LOCALE +# define _isxdigit(c) isxdigit(c) +# define _isdigit(c) isdigit(c) +#else +/* + * Helper _isdigit methods -- do not trust the current locale + * */ +# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL) +# define _isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) +#define HOUDINI_UNESCAPED_SIZE(x) (x) + +extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure); +extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c new file mode 100644 index 0000000..59fe850 --- /dev/null +++ b/src/html/houdini_href_e.c @@ -0,0 +1,115 @@ +#include +#include +#include + +#include "html/houdini.h" + +/* + * The following characters will not be escaped: + * + * -_.+!*'(),%#@?=;:/,+&$ alphanum + * + * Note that this character set is the addition of: + * + * - The characters which are safe to be in an URL + * - The characters which are *not* safe to be in + * an URL because they are RESERVED characters. + * + * We asume (lazily) that any RESERVED char that + * appears inside an URL is actually meant to + * have its native function (i.e. as an URL + * component/separator) and hence needs no escaping. + * + * There are two exceptions: the chacters & (amp) + * and ' (single quote) do not appear in the table. + * They are meant to appear in the URL as components, + * yet they require special HTML-entity escaping + * to generate valid HTML markup. + * + * All other characters will be escaped to %XX. + * + */ +static const char HREF_SAFE[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +int +houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) +{ + static const uint8_t hex_chars[] = "0123456789ABCDEF"; + size_t i = 0, org; + uint8_t hex_str[3]; + + hex_str[0] = '%'; + + while (i < size) { + org = i; + while (i < size && HREF_SAFE[src[i]] != 0) + i++; + + if (likely(i > org)) { + if (unlikely(org == 0)) { + if (i >= size) + return 0; + + gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); + } + + gh_buf_put(ob, src + org, i - org); + } + + /* escaping */ + if (i >= size) + break; + + switch (src[i]) { + /* amp appears all the time in URLs, but needs + * HTML-entity escaping to be inside an href */ + case '&': + gh_buf_puts(ob, "&"); + break; + + /* the single quote is a valid URL character + * according to the standard; it needs HTML + * entity escaping too */ + case '\'': + gh_buf_puts(ob, "'"); + break; + + /* the space can be escaped to %20 or a plus + * sign. we're going with the generic escape + * for now. the plus thing is more commonly seen + * when building GET strings */ +#if 0 + case ' ': + gh_buf_putc(ob, '+'); + break; +#endif + + /* every other character goes with a %XX escaping */ + default: + hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; + hex_str[2] = hex_chars[src[i] & 0xF]; + gh_buf_put(ob, hex_str, 3); + } + + i++; + } + + return 1; +} diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c new file mode 100644 index 0000000..316c5ce --- /dev/null +++ b/src/html/houdini_html_e.c @@ -0,0 +1,89 @@ +#include +#include +#include + +#include "html/houdini.h" + +/** + * According to the OWASP rules: + * + * & --> & + * < --> < + * > --> > + * " --> " + * ' --> ' ' is not recommended + * / --> / forward slash is included as it helps end an HTML entity + * + */ +static const char HTML_ESCAPE_TABLE[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const char *HTML_ESCAPES[] = { + "", + """, + "&", + "'", + "/", + "<", + ">" +}; + +int +houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) +{ + size_t i = 0, org, esc = 0; + + while (i < size) { + org = i; + while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) + i++; + + if (i > org) { + if (unlikely(org == 0)) { + if (i >= size) + return 0; + + gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); + } + + gh_buf_put(ob, src + org, i - org); + } + + /* escaping */ + if (unlikely(i >= size)) + break; + + /* The forward slash is only escaped in secure mode */ + if (src[i] == '/' && !secure) { + gh_buf_putc(ob, '/'); + } else { + gh_buf_puts(ob, HTML_ESCAPES[esc]); + } + + i++; + } + + return 1; +} + +int +houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size) +{ + return houdini_escape_html0(ob, src, size, 1); +} diff --git a/src/html/html.c b/src/html/html.c new file mode 100644 index 0000000..2f160ca --- /dev/null +++ b/src/html/html.c @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include + +#include "stmd.h" +#include "debug.h" +#include "scanners.h" +#include "html/houdini.h" + +// Functions to convert block and inline lists to HTML strings. + +static void escape_html(gh_buf *dest, const unsigned char *source, int length) +{ + if (length < 0) + length = strlen((char *)source); + + houdini_escape_html0(dest, source, (size_t)length, 0); +} + +static void escape_href(gh_buf *dest, const unsigned char *source, int length) +{ + if (length < 0) + length = strlen((char *)source); + + houdini_escape_href(dest, source, (size_t)length); +} + +static inline void cr(gh_buf *html) +{ + if (html->size && html->ptr[html->size - 1] != '\n') + gh_buf_putc(html, '\n'); +} + +// Convert a block list to HTML. Returns 0 on success, and sets result. +void blocks_to_html(gh_buf *html, block *b, bool tight) +{ + struct ListData *data; + + while(b != NULL) { + switch(b->tag) { + case document: + blocks_to_html(html, b->children, false); + break; + + case paragraph: + if (tight) { + inlines_to_html(html, b->inline_content); + } else { + cr(html); + gh_buf_puts(html, "

    "); + inlines_to_html(html, b->inline_content); + gh_buf_puts(html, "

    "); + cr(html); + } + break; + + case block_quote: + cr(html); + gh_buf_puts(html, "
    "); + blocks_to_html(html, b->children, false); + gh_buf_puts(html, "
    "); + cr(html); + break; + + case list_item: + cr(html); + gh_buf_puts(html, "
  • "); + blocks_to_html(html, b->children, tight); + gh_buf_trim(html); + gh_buf_puts(html, "
  • "); + cr(html); + break; + + case list: + // make sure a list starts at the beginning of the line: + cr(html); + data = &(b->attributes.list_data); + + if (data->start > 1) { + gh_buf_printf(html, "<%s start=\"%d\">\n", + data->list_type == bullet ? "ul" : "ol", + data->start); + } else { + gh_buf_puts(html, data->list_type == bullet ? "
      \n" : "
        \n"); + } + + blocks_to_html(html, b->children, data->tight); + gh_buf_puts(html, data->list_type == bullet ? "
    " : ""); + cr(html); + break; + + case atx_header: + case setext_header: + cr(html); + gh_buf_printf(html, "", b->attributes.header_level); + inlines_to_html(html, b->inline_content); + gh_buf_printf(html, "", b->attributes.header_level); + cr(html); + break; + + case indented_code: + case fenced_code: + /* TODO: fenced code lang attributes */ + cr(html); + gh_buf_puts(html, "
    ");
    +				escape_html(html, b->string_content.ptr, b->string_content.size);
    +				gh_buf_puts(html, "
    "); + cr(html); + break; + + case html_block: + gh_buf_put(html, b->string_content.ptr, b->string_content.size); + break; + + case hrule: + gh_buf_puts(html, "
    "); + cr(html); + break; + + case reference_def: + break; + + default: + assert(false); + } + + b = b->next; + } +} + +// Convert an inline list to HTML. Returns 0 on success, and sets result. +void inlines_to_html(gh_buf *html, inl* ils) +{ + gh_buf scrap = GH_BUF_INIT; + + while(ils != NULL) { + switch(ils->tag) { + case INL_STRING: + escape_html(html, ils->content.literal.data, ils->content.literal.len); + break; + + case INL_LINEBREAK: + gh_buf_puts(html, "
    \n"); + break; + + case INL_SOFTBREAK: + gh_buf_putc(html, '\n'); + break; + + case INL_CODE: + gh_buf_puts(html, ""); + escape_html(html, ils->content.literal.data, ils->content.literal.len); + gh_buf_puts(html, ""); + break; + + case INL_RAW_HTML: + case INL_ENTITY: + gh_buf_put(html, + ils->content.literal.data, + ils->content.literal.len); + break; + + case INL_LINK: + gh_buf_puts(html, "content.linkable.url, -1); + + if (ils->content.linkable.title) { + gh_buf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + gh_buf_puts(html, "\">"); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, ""); + break; + + case INL_IMAGE: + gh_buf_puts(html, "content.linkable.url, -1); + + inlines_to_html(&scrap, ils->content.inlines); + if (scrap.size) { + gh_buf_puts(html, "\" alt=\""); + escape_html(html, scrap.ptr, scrap.size); + } + gh_buf_clear(&scrap); + + if (ils->content.linkable.title) { + gh_buf_puts(html, "\" title=\""); + escape_html(html, ils->content.linkable.title, -1); + } + + gh_buf_puts(html, "\"/>"); + break; + + case INL_STRONG: + gh_buf_puts(html, ""); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, ""); + break; + + case INL_EMPH: + gh_buf_puts(html, ""); + inlines_to_html(html, ils->content.inlines); + gh_buf_puts(html, ""); + break; + } + ils = ils->next; + } +} diff --git a/src/stmd.h b/src/stmd.h index 1e490d6..3e284bd 100644 --- a/src/stmd.h +++ b/src/stmd.h @@ -1,4 +1,5 @@ #include +#include #include "buffer.h" #include "uthash.h" diff --git a/src/utf8.c b/src/utf8.c index e3f8dd3..32c78a4 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "stmd.h" @@ -83,9 +84,9 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) return length; } -void utf8_encode_char(int32_t uc, gh_buf *buf) +void utf8proc_encode_char(int32_t uc, gh_buf *buf) { - char dst[4]; + unsigned char dst[4]; int len = 0; if (uc < 0x00) { @@ -99,7 +100,7 @@ void utf8_encode_char(int32_t uc, gh_buf *buf) len = 2; } else if (uc == 0xFFFF) { dst[0] = 0xFF; - return 1; + len = 1; } else if (uc == 0xFFFE) { dst[0] = 0xFE; len = 1; -- cgit v1.2.3 From 543c2c94d71adee42c7bd2f8027d75c87ed8120d Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 4 Sep 2014 18:38:14 +0200 Subject: Rename to strbuf --- src/blocks.c | 64 +++++++++++++++---------------- src/buffer.c | 86 ++++++++++++++++++++--------------------- src/buffer.h | 80 +++++++++++++++++++------------------- src/chunk.h | 4 +- src/html/houdini.h | 22 +++++------ src/html/houdini_href_e.c | 12 +++--- src/html/houdini_html_e.c | 10 ++--- src/html/html.c | 98 +++++++++++++++++++++++------------------------ src/inlines.c | 50 ++++++++++++------------ src/main.c | 4 +- src/stmd.h | 16 ++++---- src/utf8.c | 6 +-- 12 files changed, 226 insertions(+), 226 deletions(-) (limited to 'src/utf8.c') diff --git a/src/blocks.c b/src/blocks.c index cf0e9e4..9faccd9 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -10,7 +10,7 @@ #define peek_at(i, n) (i)->data[n] -static void incorporate_line(gh_buf *ln, int line_number, block** curptr); +static void incorporate_line(strbuf *ln, int line_number, block** curptr); static void finalize(block* b, int line_number); static block* make_block(int tag, int start_line, int start_column) @@ -28,7 +28,7 @@ static block* make_block(int tag, int start_line, int start_column) e->parent = NULL; e->top = NULL; e->attributes.refmap = NULL; - gh_buf_init(&e->string_content, 32); + strbuf_init(&e->string_content, 32); e->inline_content = NULL; e->next = NULL; e->prev = NULL; @@ -49,7 +49,7 @@ extern block* make_document() } // Returns true if line has only space characters, else false. -bool is_blank(gh_buf *s, int offset) +bool is_blank(strbuf *s, int offset) { while (offset < s->size) { switch (s->ptr[offset]) { @@ -85,10 +85,10 @@ static inline bool accepts_lines(int block_type) static void add_line(block* block, chunk *ch, int offset) { assert(block->open); - gh_buf_put(&block->string_content, ch->data + offset, ch->len - offset); + strbuf_put(&block->string_content, ch->data + offset, ch->len - offset); } -static void remove_trailing_blank_lines(gh_buf *ln) +static void remove_trailing_blank_lines(strbuf *ln) { int i; @@ -100,13 +100,13 @@ static void remove_trailing_blank_lines(gh_buf *ln) } if (i < 0) { - gh_buf_clear(ln); + strbuf_clear(ln); return; } - i = gh_buf_strchr(ln, '\n', i); + i = strbuf_strchr(ln, '\n', i); if (i >= 0) - gh_buf_truncate(ln, i); + strbuf_truncate(ln, i); } // Check to see if a block ends with a blank line, descending @@ -164,10 +164,10 @@ static void finalize(block* b, int line_number) switch (b->tag) { case paragraph: pos = 0; - while (gh_buf_at(&b->string_content, 0) == '[' && + while (strbuf_at(&b->string_content, 0) == '[' && (pos = parse_reference(&b->string_content, b->top->attributes.refmap))) { - gh_buf_drop(&b->string_content, pos); + strbuf_drop(&b->string_content, pos); } if (is_blank(&b->string_content, 0)) { b->tag = reference_def; @@ -176,23 +176,23 @@ static void finalize(block* b, int line_number) case indented_code: remove_trailing_blank_lines(&b->string_content); - gh_buf_putc(&b->string_content, '\n'); + strbuf_putc(&b->string_content, '\n'); break; case fenced_code: // first line of contents becomes info - firstlinelen = gh_buf_strchr(&b->string_content, '\n', 0); + firstlinelen = strbuf_strchr(&b->string_content, '\n', 0); - gh_buf_init(&b->attributes.fenced_code_data.info, 0); - gh_buf_set( + strbuf_init(&b->attributes.fenced_code_data.info, 0); + strbuf_set( &b->attributes.fenced_code_data.info, b->string_content.ptr, firstlinelen ); - gh_buf_drop(&b->string_content, firstlinelen + 1); + strbuf_drop(&b->string_content, firstlinelen + 1); - gh_buf_trim(&b->attributes.fenced_code_data.info); + strbuf_trim(&b->attributes.fenced_code_data.info); unescape_buffer(&b->attributes.fenced_code_data.info); break; @@ -265,9 +265,9 @@ extern void free_blocks(block* e) while (e != NULL) { next = e->next; free_inlines(e->inline_content); - gh_buf_free(&e->string_content); + strbuf_free(&e->string_content); if (e->tag == fenced_code) { - gh_buf_free(&e->attributes.fenced_code_data.info); + strbuf_free(&e->attributes.fenced_code_data.info); } else if (e->tag == document) { free_reference_map(e->attributes.refmap); } @@ -287,7 +287,7 @@ void process_inlines(block* cur, reference** refmap) case setext_header: cur->inline_content = parse_inlines(&cur->string_content, refmap); // MEM - // gh_buf_free(&cur->string_content); + // strbuf_free(&cur->string_content); break; default: @@ -369,7 +369,7 @@ static int lists_match(struct ListData list_data, list_data.bullet_char == item_data.bullet_char); } -static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size) +static void expand_tabs(strbuf *ob, const unsigned char *line, size_t size) { size_t i = 0, tab = 0; @@ -381,13 +381,13 @@ static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size) } if (i > org) - gh_buf_put(ob, line + org, i - org); + strbuf_put(ob, line + org, i - org); if (i >= size) break; do { - gh_buf_putc(ob, ' '); tab++; + strbuf_putc(ob, ' '); tab++; } while (tab % 4); i++; @@ -409,7 +409,7 @@ static block *finalize_document(block *document, int linenum) extern block *stmd_parse_file(FILE *f) { - gh_buf line = GH_BUF_INIT; + strbuf line = GH_BUF_INIT; unsigned char buffer[4096]; int linenum = 1; block *document = make_document(); @@ -417,17 +417,17 @@ extern block *stmd_parse_file(FILE *f) while (fgets((char *)buffer, sizeof(buffer), f)) { expand_tabs(&line, buffer, strlen((char *)buffer)); incorporate_line(&line, linenum, &document); - gh_buf_clear(&line); + strbuf_clear(&line); linenum++; } - gh_buf_free(&line); + strbuf_free(&line); return finalize_document(document, linenum); } extern block *stmd_parse_document(const unsigned char *buffer, size_t len) { - gh_buf line = GH_BUF_INIT; + strbuf line = GH_BUF_INIT; int linenum = 1; const unsigned char *end = buffer + len; block *document = make_document(); @@ -444,11 +444,11 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len) } incorporate_line(&line, linenum, &document); - gh_buf_clear(&line); + strbuf_clear(&line); linenum++; } - gh_buf_free(&line); + strbuf_free(&line); return finalize_document(document, linenum); } @@ -471,7 +471,7 @@ static void chop_trailing_hashtags(chunk *ch) } // Process one line at a time, modifying a block. -static void incorporate_line(gh_buf *line, int line_number, block** curptr) +static void incorporate_line(strbuf *line, int line_number, block** curptr) { block* last_matched_container; int offset = 0; @@ -639,8 +639,8 @@ static void incorporate_line(gh_buf *line, int line_number, block** curptr) } else if (container->tag == paragraph && (lev = scan_setext_header_line(&input, first_nonspace)) && // check that there is only one line in the paragraph: - gh_buf_strrchr(&container->string_content, '\n', - gh_buf_len(&container->string_content) - 2) < 0) { + strbuf_strrchr(&container->string_content, '\n', + strbuf_len(&container->string_content) - 2) < 0) { container->tag = setext_header; container->attributes.header_level = lev; @@ -734,7 +734,7 @@ static void incorporate_line(gh_buf *line, int line_number, block** curptr) container == last_matched_container && !blank && cur->tag == paragraph && - gh_buf_len(&cur->string_content) > 0) { + strbuf_len(&cur->string_content) > 0) { add_line(cur, &input, offset); diff --git a/src/buffer.c b/src/buffer.c index dc4a405..90c2186 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -9,32 +9,32 @@ #include "buffer.h" -/* Used as default value for gh_buf->ptr so that people can always - * assume ptr is non-NULL and zero terminated even for new gh_bufs. +/* Used as default value for strbuf->ptr so that people can always + * assume ptr is non-NULL and zero terminated even for new strbufs. */ -unsigned char gh_buf__initbuf[1]; -unsigned char gh_buf__oom[1]; +unsigned char strbuf__initbuf[1]; +unsigned char strbuf__oom[1]; #define ENSURE_SIZE(b, d) \ - if ((d) > buf->asize && gh_buf_grow(b, (d)) < 0)\ + if ((d) > buf->asize && strbuf_grow(b, (d)) < 0)\ return -1; -void gh_buf_init(gh_buf *buf, int initial_size) +void strbuf_init(strbuf *buf, int initial_size) { buf->asize = 0; buf->size = 0; - buf->ptr = gh_buf__initbuf; + buf->ptr = strbuf__initbuf; if (initial_size) - gh_buf_grow(buf, initial_size); + strbuf_grow(buf, initial_size); } -int gh_buf_try_grow(gh_buf *buf, int target_size, bool mark_oom) +int strbuf_try_grow(strbuf *buf, int target_size, bool mark_oom) { unsigned char *new_ptr; int new_size; - if (buf->ptr == gh_buf__oom) + if (buf->ptr == strbuf__oom) return -1; if (target_size <= buf->asize) @@ -60,7 +60,7 @@ int gh_buf_try_grow(gh_buf *buf, int target_size, bool mark_oom) if (!new_ptr) { if (mark_oom) - buf->ptr = gh_buf__oom; + buf->ptr = strbuf__oom; return -1; } @@ -75,17 +75,17 @@ int gh_buf_try_grow(gh_buf *buf, int target_size, bool mark_oom) return 0; } -void gh_buf_free(gh_buf *buf) +void strbuf_free(strbuf *buf) { if (!buf) return; - if (buf->ptr != gh_buf__initbuf && buf->ptr != gh_buf__oom) + if (buf->ptr != strbuf__initbuf && buf->ptr != strbuf__oom) free(buf->ptr); - gh_buf_init(buf, 0); + strbuf_init(buf, 0); } -void gh_buf_clear(gh_buf *buf) +void strbuf_clear(strbuf *buf) { buf->size = 0; @@ -93,10 +93,10 @@ void gh_buf_clear(gh_buf *buf) buf->ptr[0] = '\0'; } -int gh_buf_set(gh_buf *buf, const unsigned char *data, int len) +int strbuf_set(strbuf *buf, const unsigned char *data, int len) { if (len <= 0 || data == NULL) { - gh_buf_clear(buf); + strbuf_clear(buf); } else { if (data != buf->ptr) { ENSURE_SIZE(buf, len + 1); @@ -108,14 +108,14 @@ int gh_buf_set(gh_buf *buf, const unsigned char *data, int len) return 0; } -int gh_buf_sets(gh_buf *buf, const char *string) +int strbuf_sets(strbuf *buf, const char *string) { - return gh_buf_set(buf, + return strbuf_set(buf, (const unsigned char *)string, string ? strlen(string) : 0); } -int gh_buf_putc(gh_buf *buf, int c) +int strbuf_putc(strbuf *buf, int c) { ENSURE_SIZE(buf, buf->size + 2); buf->ptr[buf->size++] = c; @@ -123,7 +123,7 @@ int gh_buf_putc(gh_buf *buf, int c) return 0; } -int gh_buf_put(gh_buf *buf, const unsigned char *data, int len) +int strbuf_put(strbuf *buf, const unsigned char *data, int len) { if (len <= 0) return 0; @@ -135,12 +135,12 @@ int gh_buf_put(gh_buf *buf, const unsigned char *data, int len) return 0; } -int gh_buf_puts(gh_buf *buf, const char *string) +int strbuf_puts(strbuf *buf, const char *string) { - return gh_buf_put(buf, (const unsigned char *)string, strlen(string)); + return strbuf_put(buf, (const unsigned char *)string, strlen(string)); } -int gh_buf_vprintf(gh_buf *buf, const char *format, va_list ap) +int strbuf_vprintf(strbuf *buf, const char *format, va_list ap) { const int expected_size = buf->size + (strlen(format) * 2); int len; @@ -159,7 +159,7 @@ int gh_buf_vprintf(gh_buf *buf, const char *format, va_list ap) if (len < 0) { free(buf->ptr); - buf->ptr = gh_buf__oom; + buf->ptr = strbuf__oom; return -1; } @@ -174,19 +174,19 @@ int gh_buf_vprintf(gh_buf *buf, const char *format, va_list ap) return 0; } -int gh_buf_printf(gh_buf *buf, const char *format, ...) +int strbuf_printf(strbuf *buf, const char *format, ...) { int r; va_list ap; va_start(ap, format); - r = gh_buf_vprintf(buf, format, ap); + r = strbuf_vprintf(buf, format, ap); va_end(ap); return r; } -void gh_buf_copy_cstr(char *data, int datasize, const gh_buf *buf) +void strbuf_copy_cstr(char *data, int datasize, const strbuf *buf) { int copylen; @@ -204,28 +204,28 @@ void gh_buf_copy_cstr(char *data, int datasize, const gh_buf *buf) data[copylen] = '\0'; } -void gh_buf_swap(gh_buf *buf_a, gh_buf *buf_b) +void strbuf_swap(strbuf *buf_a, strbuf *buf_b) { - gh_buf t = *buf_a; + strbuf t = *buf_a; *buf_a = *buf_b; *buf_b = t; } -unsigned char *gh_buf_detach(gh_buf *buf) +unsigned char *strbuf_detach(strbuf *buf) { unsigned char *data = buf->ptr; - if (buf->asize == 0 || buf->ptr == gh_buf__oom) + if (buf->asize == 0 || buf->ptr == strbuf__oom) return NULL; - gh_buf_init(buf, 0); + strbuf_init(buf, 0); return data; } -void gh_buf_attach(gh_buf *buf, unsigned char *ptr, int asize) +void strbuf_attach(strbuf *buf, unsigned char *ptr, int asize) { - gh_buf_free(buf); + strbuf_free(buf); if (ptr) { buf->ptr = ptr; @@ -235,18 +235,18 @@ void gh_buf_attach(gh_buf *buf, unsigned char *ptr, int asize) else /* pass 0 to fall back on strlen + 1 */ buf->asize = buf->size + 1; } else { - gh_buf_grow(buf, asize); + strbuf_grow(buf, asize); } } -int gh_buf_cmp(const gh_buf *a, const gh_buf *b) +int strbuf_cmp(const strbuf *a, const strbuf *b) { int result = memcmp(a->ptr, b->ptr, MIN(a->size, b->size)); return (result != 0) ? result : (a->size < b->size) ? -1 : (a->size > b->size) ? 1 : 0; } -int gh_buf_strchr(const gh_buf *buf, int c, int pos) +int strbuf_strchr(const strbuf *buf, int c, int pos) { const unsigned char *p = memchr(buf->ptr + pos, c, buf->size - pos); if (!p) @@ -255,7 +255,7 @@ int gh_buf_strchr(const gh_buf *buf, int c, int pos) return (int)(p - (const unsigned char *)buf->ptr); } -int gh_buf_strrchr(const gh_buf *buf, int c, int pos) +int strbuf_strrchr(const strbuf *buf, int c, int pos) { int i; @@ -267,7 +267,7 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos) return -1; } -void gh_buf_truncate(gh_buf *buf, int len) +void strbuf_truncate(strbuf *buf, int len) { if (len < buf->size) { buf->size = len; @@ -275,7 +275,7 @@ void gh_buf_truncate(gh_buf *buf, int len) } } -void gh_buf_drop(gh_buf *buf, int n) +void strbuf_drop(strbuf *buf, int n) { if (n > 0) { buf->size = buf->size - n; @@ -286,7 +286,7 @@ void gh_buf_drop(gh_buf *buf, int n) } } -void gh_buf_trim(gh_buf *buf) +void strbuf_trim(strbuf *buf) { int i = 0; @@ -296,7 +296,7 @@ void gh_buf_trim(gh_buf *buf) while (i < buf->size && isspace(buf->ptr[i])) i++; - gh_buf_drop(buf, i); + strbuf_drop(buf, i); /* rtrim */ while (buf->size > 0) { diff --git a/src/buffer.h b/src/buffer.h index 0d5143e..6f45cbb 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -9,20 +9,20 @@ typedef struct { unsigned char *ptr; int asize, size; -} gh_buf; +} strbuf; -extern unsigned char gh_buf__initbuf[]; -extern unsigned char gh_buf__oom[]; +extern unsigned char strbuf__initbuf[]; +extern unsigned char strbuf__oom[]; -#define GH_BUF_INIT { gh_buf__initbuf, 0, 0 } +#define GH_BUF_INIT { strbuf__initbuf, 0, 0 } /** - * Initialize a gh_buf structure. + * Initialize a strbuf structure. * * For the cases where GH_BUF_INIT cannot be used to do static * initialization. */ -extern void gh_buf_init(gh_buf *buf, int initial_size); +extern void strbuf_init(strbuf *buf, int initial_size); /** * Attempt to grow the buffer to hold at least `target_size` bytes. @@ -32,7 +32,7 @@ extern void gh_buf_init(gh_buf *buf, int initial_size); * existing buffer content will be preserved, but calling code must handle * that buffer was not expanded. */ -extern int gh_buf_try_grow(gh_buf *buf, int target_size, bool mark_oom); +extern int strbuf_try_grow(strbuf *buf, int target_size, bool mark_oom); /** * Grow the buffer to hold at least `target_size` bytes. @@ -42,71 +42,71 @@ extern int gh_buf_try_grow(gh_buf *buf, int target_size, bool mark_oom); * * @return 0 on success or -1 on failure */ -static inline int gh_buf_grow(gh_buf *buf, int target_size) +static inline int strbuf_grow(strbuf *buf, int target_size) { - return gh_buf_try_grow(buf, target_size, true); + return strbuf_try_grow(buf, target_size, true); } -extern void gh_buf_free(gh_buf *buf); -extern void gh_buf_swap(gh_buf *buf_a, gh_buf *buf_b); +extern void strbuf_free(strbuf *buf); +extern void strbuf_swap(strbuf *buf_a, strbuf *buf_b); /** - * Test if there have been any reallocation failures with this gh_buf. + * Test if there have been any reallocation failures with this strbuf. * - * Any function that writes to a gh_buf can fail due to memory allocation - * issues. If one fails, the gh_buf will be marked with an OOM error and - * further calls to modify the buffer will fail. Check gh_buf_oom() at the + * Any function that writes to a strbuf can fail due to memory allocation + * issues. If one fails, the strbuf will be marked with an OOM error and + * further calls to modify the buffer will fail. Check strbuf_oom() at the * end of your sequence and it will be true if you ran out of memory at any * point with that buffer. * * @return false if no error, true if allocation error */ -static inline bool gh_buf_oom(const gh_buf *buf) +static inline bool strbuf_oom(const strbuf *buf) { - return (buf->ptr == gh_buf__oom); + return (buf->ptr == strbuf__oom); } -static inline size_t gh_buf_len(const gh_buf *buf) +static inline size_t strbuf_len(const strbuf *buf) { return buf->size; } -extern int gh_buf_cmp(const gh_buf *a, const gh_buf *b); +extern int strbuf_cmp(const strbuf *a, const strbuf *b); -extern void gh_buf_attach(gh_buf *buf, unsigned char *ptr, int asize); -extern unsigned char *gh_buf_detach(gh_buf *buf); -extern void gh_buf_copy_cstr(char *data, int datasize, const gh_buf *buf); +extern void strbuf_attach(strbuf *buf, unsigned char *ptr, int asize); +extern unsigned char *strbuf_detach(strbuf *buf); +extern void strbuf_copy_cstr(char *data, int datasize, const strbuf *buf); -static inline const char *gh_buf_cstr(const gh_buf *buf) +static inline const char *strbuf_cstr(const strbuf *buf) { return (char *)buf->ptr; } -#define gh_buf_at(buf, n) ((buf)->ptr[n]) +#define strbuf_at(buf, n) ((buf)->ptr[n]) /* * Functions below that return int value error codes will return 0 on * success or -1 on failure (which generally means an allocation failed). - * Using a gh_buf where the allocation has failed with result in -1 from + * Using a strbuf where the allocation has failed with result in -1 from * all further calls using that buffer. As a result, you can ignore the * return code of these functions and call them in a series then just call - * gh_buf_oom at the end. + * strbuf_oom at the end. */ -extern int gh_buf_set(gh_buf *buf, const unsigned char *data, int len); -extern int gh_buf_sets(gh_buf *buf, const char *string); -extern int gh_buf_putc(gh_buf *buf, int c); -extern int gh_buf_put(gh_buf *buf, const unsigned char *data, int len); -extern int gh_buf_puts(gh_buf *buf, const char *string); -extern int gh_buf_printf(gh_buf *buf, const char *format, ...) +extern int strbuf_set(strbuf *buf, const unsigned char *data, int len); +extern int strbuf_sets(strbuf *buf, const char *string); +extern int strbuf_putc(strbuf *buf, int c); +extern int strbuf_put(strbuf *buf, const unsigned char *data, int len); +extern int strbuf_puts(strbuf *buf, const char *string); +extern int strbuf_printf(strbuf *buf, const char *format, ...) __attribute__((format (printf, 2, 3))); -extern int gh_buf_vprintf(gh_buf *buf, const char *format, va_list ap); -extern void gh_buf_clear(gh_buf *buf); - -int gh_buf_strchr(const gh_buf *buf, int c, int pos); -int gh_buf_strrchr(const gh_buf *buf, int c, int pos); -void gh_buf_drop(gh_buf *buf, int n); -void gh_buf_truncate(gh_buf *buf, int len); -void gh_buf_trim(gh_buf *buf); +extern int strbuf_vprintf(strbuf *buf, const char *format, va_list ap); +extern void strbuf_clear(strbuf *buf); + +int strbuf_strchr(const strbuf *buf, int c, int pos); +int strbuf_strrchr(const strbuf *buf, int c, int pos); +void strbuf_drop(strbuf *buf, int n); +void strbuf_truncate(strbuf *buf, int len); +void strbuf_trim(strbuf *buf); #endif diff --git a/src/chunk.h b/src/chunk.h index f3841ed..f37a2f3 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -78,12 +78,12 @@ static inline chunk chunk_dup(const chunk *ch, int pos, int len) return c; } -static inline chunk chunk_buf_detach(gh_buf *buf) +static inline chunk chunk_buf_detach(strbuf *buf) { chunk c; c.len = buf->size; - c.data = gh_buf_detach(buf); + c.data = strbuf_detach(buf); c.alloc = 1; return c; diff --git a/src/html/houdini.h b/src/html/houdini.h index 31fe917..1e54d20 100644 --- a/src/html/houdini.h +++ b/src/html/houdini.h @@ -25,17 +25,17 @@ extern "C" { #define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) #define HOUDINI_UNESCAPED_SIZE(x) (x) -extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure); -extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_html(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_html0(strbuf *ob, const uint8_t *src, size_t size, int secure); +extern int houdini_unescape_html(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_xml(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_uri(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_url(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_href(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_uri(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_url(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_escape_js(strbuf *ob, const uint8_t *src, size_t size); +extern int houdini_unescape_js(strbuf *ob, const uint8_t *src, size_t size); #ifdef __cplusplus } diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c index b2a7d79..12456ce 100644 --- a/src/html/houdini_href_e.c +++ b/src/html/houdini_href_e.c @@ -49,7 +49,7 @@ static const char HREF_SAFE[] = { }; int -houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) +houdini_escape_href(strbuf *ob, const uint8_t *src, size_t size) { static const uint8_t hex_chars[] = "0123456789ABCDEF"; size_t i = 0, org; @@ -63,7 +63,7 @@ houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) i++; if (likely(i > org)) - gh_buf_put(ob, src + org, i - org); + strbuf_put(ob, src + org, i - org); /* escaping */ if (i >= size) @@ -73,14 +73,14 @@ houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) /* amp appears all the time in URLs, but needs * HTML-entity escaping to be inside an href */ case '&': - gh_buf_puts(ob, "&"); + strbuf_puts(ob, "&"); break; /* the single quote is a valid URL character * according to the standard; it needs HTML * entity escaping too */ case '\'': - gh_buf_puts(ob, "'"); + strbuf_puts(ob, "'"); break; /* the space can be escaped to %20 or a plus @@ -89,7 +89,7 @@ houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) * when building GET strings */ #if 0 case ' ': - gh_buf_putc(ob, '+'); + strbuf_putc(ob, '+'); break; #endif @@ -97,7 +97,7 @@ houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size) default: hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; hex_str[2] = hex_chars[src[i] & 0xF]; - gh_buf_put(ob, hex_str, 3); + strbuf_put(ob, hex_str, 3); } i++; diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c index 95b6c41..f2e86fe 100644 --- a/src/html/houdini_html_e.c +++ b/src/html/houdini_html_e.c @@ -45,7 +45,7 @@ static const char *HTML_ESCAPES[] = { }; int -houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) +houdini_escape_html0(strbuf *ob, const uint8_t *src, size_t size, int secure) { size_t i = 0, org, esc = 0; @@ -55,7 +55,7 @@ houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) i++; if (i > org) - gh_buf_put(ob, src + org, i - org); + strbuf_put(ob, src + org, i - org); /* escaping */ if (unlikely(i >= size)) @@ -63,9 +63,9 @@ houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) /* The forward slash is only escaped in secure mode */ if ((src[i] == '/' || src[i] == '\'') && !secure) { - gh_buf_putc(ob, src[i]); + strbuf_putc(ob, src[i]); } else { - gh_buf_puts(ob, HTML_ESCAPES[esc]); + strbuf_puts(ob, HTML_ESCAPES[esc]); } i++; @@ -75,7 +75,7 @@ houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) } int -houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size) +houdini_escape_html(strbuf *ob, const uint8_t *src, size_t size) { return houdini_escape_html0(ob, src, size, 1); } diff --git a/src/html/html.c b/src/html/html.c index 41b8fda..a9356dd 100644 --- a/src/html/html.c +++ b/src/html/html.c @@ -10,7 +10,7 @@ // Functions to convert block and inline lists to HTML strings. -static void escape_html(gh_buf *dest, const unsigned char *source, int length) +static void escape_html(strbuf *dest, const unsigned char *source, int length) { if (length < 0) length = strlen((char *)source); @@ -18,7 +18,7 @@ static void escape_html(gh_buf *dest, const unsigned char *source, int length) houdini_escape_html0(dest, source, (size_t)length, 0); } -static void escape_href(gh_buf *dest, const unsigned char *source, int length) +static void escape_href(strbuf *dest, const unsigned char *source, int length) { if (length < 0) length = strlen((char *)source); @@ -26,14 +26,14 @@ static void escape_href(gh_buf *dest, const unsigned char *source, int length) houdini_escape_href(dest, source, (size_t)length); } -static inline void cr(gh_buf *html) +static inline void cr(strbuf *html) { if (html->size && html->ptr[html->size - 1] != '\n') - gh_buf_putc(html, '\n'); + strbuf_putc(html, '\n'); } // Convert a block list to HTML. Returns 0 on success, and sets result. -void blocks_to_html(gh_buf *html, block *b, bool tight) +void blocks_to_html(strbuf *html, block *b, bool tight) { struct ListData *data; @@ -48,25 +48,25 @@ void blocks_to_html(gh_buf *html, block *b, bool tight) inlines_to_html(html, b->inline_content); } else { cr(html); - gh_buf_puts(html, "

    "); + strbuf_puts(html, "

    "); inlines_to_html(html, b->inline_content); - gh_buf_puts(html, "

    \n"); + strbuf_puts(html, "

    \n"); } break; case block_quote: cr(html); - gh_buf_puts(html, "
    \n"); + strbuf_puts(html, "
    \n"); blocks_to_html(html, b->children, false); - gh_buf_puts(html, "
    \n"); + strbuf_puts(html, "
    \n"); break; case list_item: cr(html); - gh_buf_puts(html, "
  • "); + strbuf_puts(html, "
  • "); blocks_to_html(html, b->children, tight); - gh_buf_trim(html); /* TODO: rtrim */ - gh_buf_puts(html, "
  • \n"); + strbuf_trim(html); /* TODO: rtrim */ + strbuf_puts(html, "\n"); break; case list: @@ -75,58 +75,58 @@ void blocks_to_html(gh_buf *html, block *b, bool tight) data = &(b->attributes.list_data); if (data->start > 1) { - gh_buf_printf(html, "<%s start=\"%d\">\n", + strbuf_printf(html, "<%s start=\"%d\">\n", data->list_type == bullet ? "ul" : "ol", data->start); } else { - gh_buf_puts(html, data->list_type == bullet ? "
      \n" : "
        \n"); + strbuf_puts(html, data->list_type == bullet ? "
          \n" : "
            \n"); } blocks_to_html(html, b->children, data->tight); - gh_buf_puts(html, data->list_type == bullet ? "
        " : "
      "); - gh_buf_putc(html, '\n'); + strbuf_puts(html, data->list_type == bullet ? "
    " : ""); + strbuf_putc(html, '\n'); break; case atx_header: case setext_header: cr(html); - gh_buf_printf(html, "", b->attributes.header_level); + strbuf_printf(html, "", b->attributes.header_level); inlines_to_html(html, b->inline_content); - gh_buf_printf(html, "\n", b->attributes.header_level); + strbuf_printf(html, "\n", b->attributes.header_level); break; case indented_code: case fenced_code: cr(html); - gh_buf_puts(html, "tag == fenced_code) { - gh_buf *info = &b->attributes.fenced_code_data.info; + strbuf *info = &b->attributes.fenced_code_data.info; - if (gh_buf_len(info) > 0) { - int first_tag = gh_buf_strchr(info, ' ', 0); + if (strbuf_len(info) > 0) { + int first_tag = strbuf_strchr(info, ' ', 0); if (first_tag < 0) - first_tag = gh_buf_len(info); + first_tag = strbuf_len(info); - gh_buf_puts(html, " class=\""); + strbuf_puts(html, " class=\""); escape_html(html, info->ptr, first_tag); - gh_buf_putc(html, '"'); + strbuf_putc(html, '"'); } } - gh_buf_puts(html, ">"); + strbuf_puts(html, ">"); escape_html(html, b->string_content.ptr, b->string_content.size); - gh_buf_puts(html, "\n"); + strbuf_puts(html, "\n"); break; case html_block: - gh_buf_put(html, b->string_content.ptr, b->string_content.size); + strbuf_put(html, b->string_content.ptr, b->string_content.size); break; case hrule: - gh_buf_puts(html, "
    \n"); + strbuf_puts(html, "
    \n"); break; case reference_def: @@ -141,9 +141,9 @@ void blocks_to_html(gh_buf *html, block *b, bool tight) } // Convert an inline list to HTML. Returns 0 on success, and sets result. -void inlines_to_html(gh_buf *html, inl* ils) +void inlines_to_html(strbuf *html, inl* ils) { - gh_buf scrap = GH_BUF_INIT; + strbuf scrap = GH_BUF_INIT; while(ils != NULL) { switch(ils->tag) { @@ -152,70 +152,70 @@ void inlines_to_html(gh_buf *html, inl* ils) break; case INL_LINEBREAK: - gh_buf_puts(html, "
    \n"); + strbuf_puts(html, "
    \n"); break; case INL_SOFTBREAK: - gh_buf_putc(html, '\n'); + strbuf_putc(html, '\n'); break; case INL_CODE: - gh_buf_puts(html, ""); + strbuf_puts(html, ""); escape_html(html, ils->content.literal.data, ils->content.literal.len); - gh_buf_puts(html, ""); + strbuf_puts(html, ""); break; case INL_RAW_HTML: case INL_ENTITY: - gh_buf_put(html, + strbuf_put(html, ils->content.literal.data, ils->content.literal.len); break; case INL_LINK: - gh_buf_puts(html, "content.linkable.url) escape_href(html, ils->content.linkable.url, -1); if (ils->content.linkable.title) { - gh_buf_puts(html, "\" title=\""); + strbuf_puts(html, "\" title=\""); escape_html(html, ils->content.linkable.title, -1); } - gh_buf_puts(html, "\">"); + strbuf_puts(html, "\">"); inlines_to_html(html, ils->content.inlines); - gh_buf_puts(html, ""); + strbuf_puts(html, ""); break; case INL_IMAGE: - gh_buf_puts(html, "content.linkable.url) escape_href(html, ils->content.linkable.url, -1); inlines_to_html(&scrap, ils->content.inlines); - gh_buf_puts(html, "\" alt=\""); + strbuf_puts(html, "\" alt=\""); if (scrap.size) escape_html(html, scrap.ptr, scrap.size); - gh_buf_clear(&scrap); + strbuf_clear(&scrap); if (ils->content.linkable.title) { - gh_buf_puts(html, "\" title=\""); + strbuf_puts(html, "\" title=\""); escape_html(html, ils->content.linkable.title, -1); } - gh_buf_puts(html, "\"/>"); + strbuf_puts(html, "\"/>"); break; case INL_STRONG: - gh_buf_puts(html, ""); + strbuf_puts(html, ""); inlines_to_html(html, ils->content.inlines); - gh_buf_puts(html, ""); + strbuf_puts(html, ""); break; case INL_EMPH: - gh_buf_puts(html, ""); + strbuf_puts(html, ""); inlines_to_html(html, ils->content.inlines); - gh_buf_puts(html, ""); + strbuf_puts(html, ""); break; } ils = ils->next; diff --git a/src/inlines.c b/src/inlines.c index 8e2e683..33973df 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -25,7 +25,7 @@ inline static void chunk_free(chunk *c); inline static void chunk_trim(chunk *c); inline static chunk chunk_literal(const char *data); -inline static chunk chunk_buf_detach(gh_buf *buf); +inline static chunk chunk_buf_detach(strbuf *buf); inline static chunk chunk_dup(const chunk *ch, int pos, int len); static inl *parse_chunk_inlines(chunk *chunk, reference** refmap); @@ -33,10 +33,10 @@ static inl *parse_inlines_while(subject* subj, int (*f)(subject*)); static int parse_inline(subject* subj, inl ** last); static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap); -static void subject_from_buf(subject *e, gh_buf *buffer, reference** refmap); +static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap); static int subject_find_special_char(subject *subj); -static void normalize_whitespace(gh_buf *s); +static void normalize_whitespace(strbuf *s); extern void free_reference(reference *ref) { free(ref->label); @@ -62,13 +62,13 @@ extern void free_reference_map(reference **refmap) { // remove leading/trailing whitespace, case fold static unsigned char *normalize_reference(chunk *ref) { - gh_buf normalized = GH_BUF_INIT; + strbuf normalized = GH_BUF_INIT; utf8proc_case_fold(&normalized, ref->data, ref->len); - gh_buf_trim(&normalized); + strbuf_trim(&normalized); normalize_whitespace(&normalized); - return gh_buf_detach(&normalized); + return strbuf_detach(&normalized); } // Returns reference if refmap contains a reference with matching @@ -218,7 +218,7 @@ inline static inl* append_inlines(inl* a, inl* b) return a; } -static void subject_from_buf(subject *e, gh_buf *buffer, reference** refmap) +static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap) { e->input.data = buffer->ptr; e->input.len = buffer->size; @@ -309,7 +309,7 @@ static int scan_to_closing_backticks(subject* subj, int openticklength) // Destructively modify string, collapsing consecutive // space and newline characters into a single space. -static void normalize_whitespace(gh_buf *s) +static void normalize_whitespace(strbuf *s) { bool last_char_was_space = false; int r, w; @@ -331,7 +331,7 @@ static void normalize_whitespace(gh_buf *s) } } - gh_buf_truncate(s, w); + strbuf_truncate(s, w); } // Parse backtick code section or raw backticks, return an inline. @@ -346,10 +346,10 @@ static inl* handle_backticks(subject *subj) subj->pos = startpos; // rewind return make_str(openticks); } else { - gh_buf buf = GH_BUF_INIT; + strbuf buf = GH_BUF_INIT; - gh_buf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len); - gh_buf_trim(&buf); + strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len); + strbuf_trim(&buf); normalize_whitespace(&buf); return make_code(chunk_buf_detach(&buf)); @@ -569,7 +569,7 @@ static inl *make_str_with_entities(chunk *content) } // Destructively unescape a string: remove backslashes before punctuation chars. -extern void unescape_buffer(gh_buf *buf) +extern void unescape_buffer(strbuf *buf) { int r, w; @@ -580,14 +580,14 @@ extern void unescape_buffer(gh_buf *buf) buf->ptr[w++] = buf->ptr[r]; } - gh_buf_truncate(buf, w); + strbuf_truncate(buf, w); } // Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. static unsigned char *clean_url(chunk *url, int is_email) { - gh_buf buf = GH_BUF_INIT; + strbuf buf = GH_BUF_INIT; chunk_trim(url); @@ -595,22 +595,22 @@ static unsigned char *clean_url(chunk *url, int is_email) return NULL; if (is_email) - gh_buf_puts(&buf, "mailto:"); + strbuf_puts(&buf, "mailto:"); if (url->data[0] == '<' && url->data[url->len - 1] == '>') { - gh_buf_put(&buf, url->data + 1, url->len - 2); + strbuf_put(&buf, url->data + 1, url->len - 2); } else { - gh_buf_put(&buf, url->data, url->len); + strbuf_put(&buf, url->data, url->len); } unescape_buffer(&buf); - return gh_buf_detach(&buf); + return strbuf_detach(&buf); } // Clean a title: remove surrounding quotes and remove \ that escape punctuation. static unsigned char *clean_title(chunk *title) { - gh_buf buf = GH_BUF_INIT; + strbuf buf = GH_BUF_INIT; unsigned char first, last; if (title->len == 0) @@ -623,13 +623,13 @@ static unsigned char *clean_title(chunk *title) if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || (first == '"' && last == '"')) { - gh_buf_set(&buf, title->data + 1, title->len - 2); + strbuf_set(&buf, title->data + 1, title->len - 2); } else { - gh_buf_set(&buf, title->data, title->len); + strbuf_set(&buf, title->data, title->len); } unescape_buffer(&buf); - return gh_buf_detach(&buf); + return strbuf_detach(&buf); } // Parse an autolink or HTML tag. @@ -971,7 +971,7 @@ static int parse_inline(subject* subj, inl ** last) return 1; } -extern inl* parse_inlines(gh_buf *input, reference** refmap) +extern inl* parse_inlines(strbuf *input, reference** refmap) { subject subj; subject_from_buf(&subj, input, refmap); @@ -993,7 +993,7 @@ void spnl(subject* subj) // Modify refmap if a reference is encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. -extern int parse_reference(gh_buf *input, reference** refmap) +extern int parse_reference(strbuf *input, reference** refmap) { subject subj; diff --git a/src/main.c b/src/main.c index e1abedc..7cf67e2 100644 --- a/src/main.c +++ b/src/main.c @@ -14,14 +14,14 @@ void print_usage() static void print_document(block *document, bool ast) { - gh_buf html = GH_BUF_INIT; + strbuf html = GH_BUF_INIT; if (ast) { print_blocks(document, 0); } else { blocks_to_html(&html, document, false); printf("%s", html.ptr); - gh_buf_free(&html); + strbuf_free(&html); } } diff --git a/src/stmd.h b/src/stmd.h index 4a3c399..2e86f3a 100644 --- a/src/stmd.h +++ b/src/stmd.h @@ -50,7 +50,7 @@ struct FencedCodeData { int fence_length; int fence_offset; char fence_char; - gh_buf info; + strbuf info; }; typedef struct Block { @@ -76,7 +76,7 @@ typedef struct Block { struct Block* last_child; struct Block* parent; struct Block* top; - gh_buf string_content; + strbuf string_content; inl* inline_content; union { struct ListData list_data; @@ -88,15 +88,15 @@ typedef struct Block { struct Block * prev; } block; -inl* parse_inlines(gh_buf *input, reference** refmap); +inl* parse_inlines(strbuf *input, reference** refmap); void free_inlines(inl* e); -int parse_reference(gh_buf *input, reference** refmap); +int parse_reference(strbuf *input, reference** refmap); void free_reference(reference *ref); void free_reference_map(reference **refmap); void add_reference(reference** refmap, reference* ref); -void unescape_buffer(gh_buf *buf); +void unescape_buffer(strbuf *buf); extern block* make_document(); extern block* add_child(block* parent, @@ -109,9 +109,9 @@ extern block *stmd_parse_file(FILE *f); void print_inlines(inl* ils, int indent); void print_blocks(block* blk, int indent); -void blocks_to_html(gh_buf *html, block *b, bool tight); -void inlines_to_html(gh_buf *html, inl *b); +void blocks_to_html(strbuf *html, block *b, bool tight); +void inlines_to_html(strbuf *html, inl *b); -void utf8proc_case_fold(gh_buf *dest, const unsigned char *str, int len); +void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len); #endif diff --git a/src/utf8.c b/src/utf8.c index 32c78a4..cebd872 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -84,7 +84,7 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) return length; } -void utf8proc_encode_char(int32_t uc, gh_buf *buf) +void utf8proc_encode_char(int32_t uc, strbuf *buf) { unsigned char dst[4]; int len = 0; @@ -119,10 +119,10 @@ void utf8proc_encode_char(int32_t uc, gh_buf *buf) assert(false); } - gh_buf_put(buf, dst, len); + strbuf_put(buf, dst, len); } -void utf8proc_case_fold(gh_buf *dest, const unsigned char *str, int len) +void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) { int32_t c; -- cgit v1.2.3 From 61e3e606e64221eaa5cf3d83dc598d5a42818d10 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Sat, 6 Sep 2014 20:48:05 +0200 Subject: UTF8-aware detabbing and entity handling --- Makefile | 13 ++++++----- src/blocks.c | 35 ++++++------------------------ src/html/houdini.h | 2 ++ src/html/html.c | 1 - src/inlines.c | 63 ++++++++++++++++++++++-------------------------------- src/print.c | 5 ----- src/stmd.h | 3 --- src/utf8.c | 59 ++++++++++++++++++++++++++++++++++++++++++++------ 8 files changed, 95 insertions(+), 86 deletions(-) (limited to 'src/utf8.c') diff --git a/Makefile b/Makefile index 0d2eb8b..b5e487d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ -CFLAGS=-g -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS) -LDFLAGS=-g -O3 -Wall -Werror +CFLAGS=-g -pg -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS) +LDFLAGS=-g -pg -O3 -Wall -Werror SRCDIR=src DATADIR=data @@ -41,11 +41,11 @@ testjs: spec.txt benchjs: node js/bench.js ${BENCHINP} -HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o +HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o $(SRCDIR)/html/houdini_html_u.o STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o -$(PROG): $(SRCDIR)/main.c $(HTML_OBJ) $(STMD_OBJ) - $(CC) $(LDFLAGS) -o $@ $^ +$(PROG): $(SRCDIR)/html/html_unescape.h $(SRCDIR)/case_fold_switch.inc $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c + $(CC) $(LDFLAGS) -o $@ $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c $(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re re2c --case-insensitive -bis $< > $@ || (rm $@ && false) @@ -53,6 +53,9 @@ $(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re $(SRCDIR)/case_fold_switch.inc: $(DATADIR)/CaseFolding-3.2.0.txt perl mkcasefold.pl < $< > $@ +$(SRCDIR)/html/html_unescape.h: $(SRCDIR)/html/html_unescape.gperf + gperf -I -t -N find_entity -H hash_entity -K entity -C -l --null-strings -m5 $< > $@ + .PHONY: leakcheck clean fuzztest dingus upload dingus: diff --git a/src/blocks.c b/src/blocks.c index f671b5e..8c7d49c 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -5,6 +5,8 @@ #include #include "stmd.h" +#include "utf8.h" +#include "html/houdini.h" #include "scanners.h" #include "uthash.h" @@ -184,7 +186,7 @@ static void finalize(node_block* b, int line_number) firstlinelen = strbuf_strchr(&b->string_content, '\n', 0); strbuf_init(&b->attributes.fenced_code_data.info, 0); - strbuf_set( + houdini_unescape_html_f( &b->attributes.fenced_code_data.info, b->string_content.ptr, firstlinelen @@ -369,31 +371,6 @@ static int lists_match(struct ListData list_data, list_data.bullet_char == item_data.bullet_char); } -static void expand_tabs(strbuf *ob, const unsigned char *line, size_t size) -{ - size_t i = 0, tab = 0; - - while (i < size) { - size_t org = i; - - while (i < size && line[i] != '\t') { - i++; tab++; - } - - if (i > org) - strbuf_put(ob, line + org, i - org); - - if (i >= size) - break; - - do { - strbuf_putc(ob, ' '); tab++; - } while (tab % 4); - - i++; - } -} - static node_block *finalize_document(node_block *document, int linenum) { while (document != document->top) { @@ -415,7 +392,7 @@ extern node_block *stmd_parse_file(FILE *f) node_block *document = make_document(); while (fgets((char *)buffer, sizeof(buffer), f)) { - expand_tabs(&line, buffer, strlen((char *)buffer)); + utf8proc_detab(&line, buffer, strlen((char *)buffer)); incorporate_line(&line, linenum, &document); strbuf_clear(&line); linenum++; @@ -436,10 +413,10 @@ extern node_block *stmd_parse_document(const unsigned char *buffer, size_t len) const unsigned char *eol = memchr(buffer, '\n', end - buffer); if (!eol) { - expand_tabs(&line, buffer, end - buffer); + utf8proc_detab(&line, buffer, end - buffer); buffer = end; } else { - expand_tabs(&line, buffer, (eol - buffer) + 1); + utf8proc_detab(&line, buffer, (eol - buffer) + 1); buffer += (eol - buffer) + 1; } diff --git a/src/html/houdini.h b/src/html/houdini.h index 1e54d20..5fd690d 100644 --- a/src/html/houdini.h +++ b/src/html/houdini.h @@ -25,9 +25,11 @@ extern "C" { #define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) #define HOUDINI_UNESCAPED_SIZE(x) (x) +extern size_t houdini_unescape_ent(strbuf *ob, const uint8_t *src, size_t size); extern int houdini_escape_html(strbuf *ob, const uint8_t *src, size_t size); extern int houdini_escape_html0(strbuf *ob, const uint8_t *src, size_t size, int secure); extern int houdini_unescape_html(strbuf *ob, const uint8_t *src, size_t size); +extern void houdini_unescape_html_f(strbuf *ob, const uint8_t *src, size_t size); extern int houdini_escape_xml(strbuf *ob, const uint8_t *src, size_t size); extern int houdini_escape_uri(strbuf *ob, const uint8_t *src, size_t size); extern int houdini_escape_url(strbuf *ob, const uint8_t *src, size_t size); diff --git a/src/html/html.c b/src/html/html.c index 758ec80..595dfcd 100644 --- a/src/html/html.c +++ b/src/html/html.c @@ -166,7 +166,6 @@ void inlines_to_html(strbuf *html, node_inl* ils) break; case INL_RAW_HTML: - case INL_ENTITY: strbuf_put(html, ils->content.literal.data, ils->content.literal.len); diff --git a/src/inlines.c b/src/inlines.c index 6b17027..7b27150 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -5,6 +5,8 @@ #include #include "stmd.h" +#include "html/houdini.h" +#include "utf8.h" #include "uthash.h" #include "scanners.h" @@ -176,7 +178,6 @@ inline static node_inl* make_simple(int t) #define make_str(s) make_literal(INL_STRING, s) #define make_code(s) make_literal(INL_CODE, s) #define make_raw_html(s) make_literal(INL_RAW_HTML, s) -#define make_entity(s) make_literal(INL_ENTITY, s) #define make_linebreak() make_simple(INL_LINEBREAK) #define make_softbreak() make_simple(INL_SOFTBREAK) #define make_emph(contents) make_inlines(INL_EMPH, contents) @@ -191,7 +192,6 @@ extern void free_inlines(node_inl* e) case INL_STRING: case INL_RAW_HTML: case INL_CODE: - case INL_ENTITY: chunk_free(&e->content.literal); break; case INL_LINEBREAK: @@ -540,45 +540,34 @@ static node_inl* handle_backslash(subject *subj) // Assumes the subject has an '&' character at the current position. static node_inl* handle_entity(subject* subj) { - int match; - node_inl *result; - match = scan_entity(&subj->input, subj->pos); - if (match) { - result = make_entity(chunk_dup(&subj->input, subj->pos, match)); - subj->pos += match; - } else { - advance(subj); - result = make_str(chunk_literal("&")); - } - return result; + strbuf ent = GH_BUF_INIT; + size_t len; + + advance(subj); + + len = houdini_unescape_ent(&ent, + subj->input.data + subj->pos, + subj->input.len - subj->pos + ); + + if (len == 0) + return make_str(chunk_literal("&")); + + subj->pos += len; + return make_str(chunk_buf_detach(&ent)); } // Like make_str, but parses entities. // Returns an inline sequence consisting of str and entity elements. static node_inl *make_str_with_entities(chunk *content) { - node_inl *result = NULL; - node_inl *new; - int searchpos; - char c; - subject subj; - - subject_from_chunk(&subj, content, NULL); + strbuf unescaped = GH_BUF_INIT; - while ((c = peek_char(&subj))) { - switch (c) { - case '&': - new = handle_entity(&subj); - break; - default: - searchpos = chunk_strchr(&subj.input, '&', subj.pos); - new = make_str(chunk_dup(&subj.input, subj.pos, searchpos - subj.pos)); - subj.pos = searchpos; - } - result = append_inlines(result, new); + if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) { + return make_str(chunk_buf_detach(&unescaped)); + } else { + return make_str(*content); } - - return result; } // Destructively unescape a string: remove backslashes before punctuation chars. @@ -611,9 +600,9 @@ static unsigned char *clean_url(chunk *url, int is_email) strbuf_puts(&buf, "mailto:"); if (url->data[0] == '<' && url->data[url->len - 1] == '>') { - strbuf_put(&buf, url->data + 1, url->len - 2); + houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); } else { - strbuf_put(&buf, url->data, url->len); + houdini_unescape_html_f(&buf, url->data, url->len); } unescape_buffer(&buf); @@ -636,9 +625,9 @@ static unsigned char *clean_title(chunk *title) if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || (first == '"' && last == '"')) { - strbuf_set(&buf, title->data + 1, title->len - 2); + houdini_unescape_html_f(&buf, title->data + 1, title->len - 2); } else { - strbuf_set(&buf, title->data, title->len); + houdini_unescape_html_f(&buf, title->data, title->len); } unescape_buffer(&buf); diff --git a/src/print.c b/src/print.c index 0ff86fa..9240dac 100644 --- a/src/print.c +++ b/src/print.c @@ -145,11 +145,6 @@ extern void print_inlines(node_inl* ils, int indent) print_str(ils->content.literal.data, ils->content.literal.len); putchar('\n'); break; - case INL_ENTITY: - printf("entity "); - print_str(ils->content.literal.data, ils->content.literal.len); - putchar('\n'); - break; case INL_LINK: case INL_IMAGE: printf("%s url=", ils->tag == INL_LINK ? "link" : "image"); diff --git a/src/stmd.h b/src/stmd.h index be65371..c80eeda 100644 --- a/src/stmd.h +++ b/src/stmd.h @@ -17,7 +17,6 @@ struct node_inl { INL_LINEBREAK, INL_CODE, INL_RAW_HTML, - INL_ENTITY, INL_EMPH, INL_STRONG, INL_LINK, @@ -133,6 +132,4 @@ void print_blocks(node_block* blk, int indent); void blocks_to_html(strbuf *html, node_block *b, bool tight); void inlines_to_html(strbuf *html, node_inl *b); -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len); - #endif diff --git a/src/utf8.c b/src/utf8.c index cebd872..12d7ba5 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -3,7 +3,7 @@ #include #include -#include "stmd.h" +#include "utf8.h" static const int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -23,6 +23,12 @@ static const int8_t utf8proc_utf8class[256] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; +static void encode_unknown(strbuf *buf) +{ + static const unsigned char repl[] = {239, 191, 189}; + strbuf_put(buf, repl, 3); +} + ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) { ssize_t length, i; @@ -46,6 +52,46 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return length; } +void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size) +{ + static const unsigned char whitespace[] = " "; + + size_t i = 0, tab = 0; + + while (i < size) { + size_t org = i; + + while (i < size && line[i] != '\t' && line[i] <= 0x80) { + i++; tab++; + } + + if (i > org) + strbuf_put(ob, line + org, i - org); + + if (i >= size) + break; + + if (line[i] == '\t') { + int numspaces = 4 - (tab % 4); + strbuf_put(ob, whitespace, numspaces); + i += 1; + tab += numspaces; + } else { + ssize_t charlen = utf8proc_charlen(line + i, size - i); + + if (charlen < 0) { + encode_unknown(ob); + i++; + } else { + strbuf_put(ob, line + i, charlen); + i += charlen; + } + + tab += 1; + } + } +} + ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) { ssize_t length; @@ -89,9 +135,9 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) unsigned char dst[4]; int len = 0; - if (uc < 0x00) { - assert(false); - } else if (uc < 0x80) { + assert(uc >= 0); + + if (uc < 0x80) { dst[0] = uc; len = 1; } else if (uc < 0x800) { @@ -116,7 +162,8 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) dst[3] = 0x80 + (uc & 0x3F); len = 4; } else { - assert(false); + encode_unknown(buf); + return; } strbuf_put(buf, dst, len); @@ -133,7 +180,7 @@ void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) ssize_t char_len = utf8proc_iterate(str, len, &c); if (char_len < 0) { - bufpush(0xFFFD); + encode_unknown(dest); continue; } -- cgit v1.2.3 From 94a79a605f3e76a43f1f87a5044f6761b99e5ca5 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 18:33:27 +0200 Subject: Cleanup reference implementation --- Makefile | 2 +- src/blocks.c | 16 ++--- src/buffer.c | 43 ++++++++++++++ src/buffer.h | 2 + src/inlines.c | 176 +++++++------------------------------------------------ src/references.c | 109 ++++++++++++++++++++++++++++++++++ src/references.h | 27 +++++++++ src/stmd.h | 26 +++----- src/utf8.c | 10 ++-- src/utf8.h | 5 +- 10 files changed, 225 insertions(+), 191 deletions(-) create mode 100644 src/references.c create mode 100644 src/references.h (limited to 'src/utf8.c') diff --git a/Makefile b/Makefile index 5d13272..11e2141 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ benchjs: node js/bench.js ${BENCHINP} HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o $(SRCDIR)/html/houdini_html_u.o -STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o +STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o $(SRCDIR)/references.c $(PROG): $(SRCDIR)/html/html_unescape.h $(SRCDIR)/case_fold_switch.inc $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c $(CC) $(LDFLAGS) -o $@ $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c diff --git a/src/blocks.c b/src/blocks.c index 72b2dc2..30a8284 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -8,7 +8,6 @@ #include "utf8.h" #include "html/houdini.h" #include "scanners.h" -#include "uthash.h" #define peek_at(i, n) (i)->data[n] @@ -36,12 +35,7 @@ static node_block* make_block(int tag, int start_line, int start_column) extern node_block* make_document() { node_block *e = make_block(BLOCK_DOCUMENT, 1, 1); - reference *map = NULL; - reference ** refmap; - - refmap = (reference**) malloc(sizeof(reference*)); - *refmap = map; - e->as.document.refmap = refmap; + e->as.document.refmap = reference_map_new(); e->top = e; return e; @@ -164,7 +158,7 @@ static void finalize(node_block* b, int line_number) case BLOCK_PARAGRAPH: pos = 0; while (strbuf_at(&b->string_content, 0) == '[' && - (pos = parse_reference(&b->string_content, b->top->as.document.refmap))) { + (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) { strbuf_drop(&b->string_content, pos); } @@ -192,7 +186,7 @@ static void finalize(node_block* b, int line_number) strbuf_drop(&b->string_content, firstlinelen + 1); strbuf_trim(&b->as.code.info); - unescape_buffer(&b->as.code.info); + strbuf_unescape(&b->as.code.info); break; case BLOCK_LIST: // determine tight/loose status @@ -268,7 +262,7 @@ extern void free_blocks(node_block* e) if (e->tag == BLOCK_FENCED_CODE) { strbuf_free(&e->as.code.info); } else if (e->tag == BLOCK_DOCUMENT) { - free_reference_map(e->as.document.refmap); + reference_map_free(e->as.document.refmap); } free_blocks(e->children); free(e); @@ -278,7 +272,7 @@ extern void free_blocks(node_block* e) // Walk through node_block and all children, recursively, parsing // string content into inline content where appropriate. -void process_inlines(node_block* cur, reference** refmap) +void process_inlines(node_block* cur, reference_map *refmap) { switch (cur->tag) { case BLOCK_PARAGRAPH: diff --git a/src/buffer.c b/src/buffer.c index 90c2186..cdf8ca0 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -308,3 +308,46 @@ void strbuf_trim(strbuf *buf) buf->ptr[buf->size] = '\0'; } + +// Destructively modify string, collapsing consecutive +// space and newline characters into a single space. +void strbuf_normalize_whitespace(strbuf *s) +{ + bool last_char_was_space = false; + int r, w; + + for (r = 0, w = 0; r < s->size; ++r) { + switch (s->ptr[r]) { + case ' ': + case '\n': + if (last_char_was_space) + break; + + s->ptr[w++] = ' '; + last_char_was_space = true; + break; + + default: + s->ptr[w++] = s->ptr[r]; + last_char_was_space = false; + } + } + + strbuf_truncate(s, w); +} + +// Destructively unescape a string: remove backslashes before punctuation chars. +extern void strbuf_unescape(strbuf *buf) +{ + int r, w; + + for (r = 0, w = 0; r < buf->size; ++r) { + if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1])) + continue; + + buf->ptr[w++] = buf->ptr[r]; + } + + strbuf_truncate(buf, w); +} + diff --git a/src/buffer.h b/src/buffer.h index 6f45cbb..1bc1eee 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -108,5 +108,7 @@ int strbuf_strrchr(const strbuf *buf, int c, int pos); void strbuf_drop(strbuf *buf, int n); void strbuf_truncate(strbuf *buf, int len); void strbuf_trim(strbuf *buf); +void strbuf_normalize_whitespace(strbuf *s); +void strbuf_unescape(strbuf *s); #endif diff --git a/src/inlines.c b/src/inlines.c index aa0e13e..3040f09 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -7,110 +7,23 @@ #include "stmd.h" #include "html/houdini.h" #include "utf8.h" -#include "uthash.h" #include "scanners.h" typedef struct Subject { chunk input; int pos; - int label_nestlevel; - reference** reference_map; + int label_nestlevel; + reference_map *refmap; } subject; -reference* lookup_reference(reference** refmap, chunk *label); -reference* make_reference(chunk *label, chunk *url, chunk *title); - -static unsigned char *clean_url(chunk *url); -static unsigned char *clean_title(chunk *title); -static unsigned char *clean_autolink(chunk *url, int is_email); - -inline static void chunk_free(chunk *c); -inline static void chunk_trim(chunk *c); - -inline static chunk chunk_literal(const char *data); -inline static chunk chunk_buf_detach(strbuf *buf); -inline static chunk chunk_dup(const chunk *ch, int pos, int len); - -static node_inl *parse_chunk_inlines(chunk *chunk, reference** refmap); +static node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap); static node_inl *parse_inlines_while(subject* subj, int (*f)(subject*)); static int parse_inline(subject* subj, node_inl ** last); -static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap); -static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap); +static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap); +static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap); static int subject_find_special_char(subject *subj); -static void normalize_whitespace(strbuf *s); - -extern void free_reference(reference *ref) { - free(ref->label); - free(ref->url); - free(ref->title); - free(ref); -} - -extern void free_reference_map(reference **refmap) { - /* free the hash table contents */ - reference *s; - reference *tmp; - if (refmap != NULL) { - HASH_ITER(hh, *refmap, s, tmp) { - HASH_DEL(*refmap, s); - free_reference(s); - } - free(refmap); - } -} - -// normalize reference: collapse internal whitespace to single space, -// remove leading/trailing whitespace, case fold -static unsigned char *normalize_reference(chunk *ref) -{ - strbuf normalized = GH_BUF_INIT; - - utf8proc_case_fold(&normalized, ref->data, ref->len); - strbuf_trim(&normalized); - normalize_whitespace(&normalized); - - return strbuf_detach(&normalized); -} - -// Returns reference if refmap contains a reference with matching -// label, otherwise NULL. -extern reference* lookup_reference(reference** refmap, chunk *label) -{ - reference *ref = NULL; - unsigned char *norm = normalize_reference(label); - if (refmap != NULL) { - HASH_FIND_STR(*refmap, (char*)norm, ref); - } - free(norm); - return ref; -} - -extern reference* make_reference(chunk *label, chunk *url, chunk *title) -{ - reference *ref; - ref = malloc(sizeof(reference)); - ref->label = normalize_reference(label); - ref->url = clean_url(url); - ref->title = clean_title(title); - return ref; -} - -extern void add_reference(reference** refmap, reference* ref) -{ - reference * t = NULL; - const char *label = (const char *)ref->label; - - HASH_FIND(hh, *refmap, label, strlen(label), t); - - if (t == NULL) { - HASH_ADD_KEYPTR(hh, *refmap, label, strlen(label), ref); - } else { - free_reference(ref); // we free this now since it won't be in the refmap - } -} - static unsigned char *bufdup(const unsigned char *buf) { unsigned char *new = NULL; @@ -236,26 +149,26 @@ inline static node_inl* append_inlines(node_inl* a, node_inl* b) return a; } -static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap) +static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) { e->input.data = buffer->ptr; e->input.len = buffer->size; e->input.alloc = 0; e->pos = 0; e->label_nestlevel = 0; - e->reference_map = refmap; + e->refmap = refmap; chunk_rtrim(&e->input); } -static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap) +static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap) { e->input.data = chunk->data; e->input.len = chunk->len; e->input.alloc = 0; e->pos = 0; e->label_nestlevel = 0; - e->reference_map = refmap; + e->refmap = refmap; chunk_rtrim(&e->input); } @@ -325,33 +238,6 @@ static int scan_to_closing_backticks(subject* subj, int openticklength) return (subj->pos); } -// Destructively modify string, collapsing consecutive -// space and newline characters into a single space. -static void normalize_whitespace(strbuf *s) -{ - bool last_char_was_space = false; - int r, w; - - for (r = 0, w = 0; r < s->size; ++r) { - switch (s->ptr[r]) { - case ' ': - case '\n': - if (last_char_was_space) - break; - - s->ptr[w++] = ' '; - last_char_was_space = true; - break; - - default: - s->ptr[w++] = s->ptr[r]; - last_char_was_space = false; - } - } - - strbuf_truncate(s, w); -} - // Parse backtick code section or raw backticks, return an inline. // Assumes that the subject has a backtick at the current position. static node_inl* handle_backticks(subject *subj) @@ -368,7 +254,7 @@ static node_inl* handle_backticks(subject *subj) strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len); strbuf_trim(&buf); - normalize_whitespace(&buf); + strbuf_normalize_whitespace(&buf); return make_code(chunk_buf_detach(&buf)); } @@ -575,24 +461,9 @@ static node_inl *make_str_with_entities(chunk *content) } } -// Destructively unescape a string: remove backslashes before punctuation chars. -extern void unescape_buffer(strbuf *buf) -{ - int r, w; - - for (r = 0, w = 0; r < buf->size; ++r) { - if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1])) - continue; - - buf->ptr[w++] = buf->ptr[r]; - } - - strbuf_truncate(buf, w); -} - // Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. -static unsigned char *clean_url(chunk *url) +unsigned char *clean_url(chunk *url) { strbuf buf = GH_BUF_INIT; @@ -607,11 +478,11 @@ static unsigned char *clean_url(chunk *url) houdini_unescape_html_f(&buf, url->data, url->len); } - unescape_buffer(&buf); + strbuf_unescape(&buf); return strbuf_detach(&buf); } -static unsigned char *clean_autolink(chunk *url, int is_email) +unsigned char *clean_autolink(chunk *url, int is_email) { strbuf buf = GH_BUF_INIT; @@ -628,7 +499,7 @@ static unsigned char *clean_autolink(chunk *url, int is_email) } // Clean a title: remove surrounding quotes and remove \ that escape punctuation. -static unsigned char *clean_title(chunk *title) +unsigned char *clean_title(chunk *title) { strbuf buf = GH_BUF_INIT; unsigned char first, last; @@ -648,7 +519,7 @@ static unsigned char *clean_title(chunk *title) houdini_unescape_html_f(&buf, title->data, title->len); } - unescape_buffer(&buf); + strbuf_unescape(&buf); return strbuf_detach(&buf); } @@ -810,7 +681,7 @@ static node_inl* handle_left_bracket(subject* subj) } else { // if we get here, we matched a label but didn't get further: subj->pos = endlabel; - lab = parse_chunk_inlines(&rawlabel, subj->reference_map); + lab = parse_chunk_inlines(&rawlabel, subj->refmap); result = append_inlines(make_str(chunk_literal("[")), append_inlines(lab, make_str(chunk_literal("]")))); @@ -834,13 +705,13 @@ static node_inl* handle_left_bracket(subject* subj) } // lookup rawlabel in subject->reference_map: - ref = lookup_reference(subj->reference_map, &reflabel); + ref = reference_lookup(subj->refmap, &reflabel); if (ref != NULL) { // found lab = parse_chunk_inlines(&rawlabel, NULL); result = make_ref_link(lab, ref); } else { subj->pos = endlabel; - lab = parse_chunk_inlines(&rawlabel, subj->reference_map); + lab = parse_chunk_inlines(&rawlabel, subj->refmap); result = append_inlines(make_str(chunk_literal("[")), append_inlines(lab, make_str(chunk_literal("]")))); } @@ -887,7 +758,7 @@ extern node_inl* parse_inlines_while(subject* subj, int (*f)(subject*)) return result; } -node_inl *parse_chunk_inlines(chunk *chunk, reference** refmap) +node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap) { subject subj; subject_from_chunk(&subj, chunk, refmap); @@ -987,7 +858,7 @@ static int parse_inline(subject* subj, node_inl ** last) return 1; } -extern node_inl* parse_inlines(strbuf *input, reference** refmap) +extern node_inl* parse_inlines(strbuf *input, reference_map *refmap) { subject subj; subject_from_buf(&subj, input, refmap); @@ -1009,7 +880,7 @@ void spnl(subject* subj) // Modify refmap if a reference is encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. -extern int parse_reference(strbuf *input, reference** refmap) +int parse_reference_inline(strbuf *input, reference_map *refmap) { subject subj; @@ -1019,7 +890,6 @@ extern int parse_reference(strbuf *input, reference** refmap) int matchlen = 0; int beforetitle; - reference *new = NULL; subject_from_buf(&subj, input, NULL); @@ -1065,9 +935,7 @@ extern int parse_reference(strbuf *input, reference** refmap) return 0; } // insert reference into refmap - new = make_reference(&lab, &url, &title); - add_reference(refmap, new); - + reference_create(refmap, &lab, &url, &title); return subj.pos; } diff --git a/src/references.c b/src/references.c new file mode 100644 index 0000000..ff64b00 --- /dev/null +++ b/src/references.c @@ -0,0 +1,109 @@ +#include "stmd.h" +#include "utf8.h" +#include "references.h" + +static unsigned int +refhash(const unsigned char *link_ref) +{ + unsigned int hash = 0; + + while (*link_ref) + hash = (*link_ref++) + (hash << 6) + (hash << 16) - hash; + + return hash; +} + +// normalize reference: collapse internal whitespace to single space, +// remove leading/trailing whitespace, case fold +static unsigned char *normalize_reference(chunk *ref) +{ + strbuf normalized = GH_BUF_INIT; + + utf8proc_case_fold(&normalized, ref->data, ref->len); + strbuf_trim(&normalized); + strbuf_normalize_whitespace(&normalized); + + return strbuf_detach(&normalized); +} + +static void add_reference(reference_map *map, reference* ref) +{ + ref->next = map->table[ref->hash % REFMAP_SIZE]; + map->table[ref->hash % REFMAP_SIZE] = ref; +} + +extern reference *reference_create(reference_map *map, chunk *label, chunk *url, chunk *title) +{ + reference *ref; + ref = malloc(sizeof(reference)); + ref->label = normalize_reference(label); + ref->hash = refhash(ref->label); + ref->url = clean_url(url); + ref->title = clean_title(title); + ref->next = NULL; + + add_reference(map, ref); + + return ref; +} + +// Returns reference if refmap contains a reference with matching +// label, otherwise NULL. +reference* reference_lookup(reference_map *map, chunk *label) +{ + reference *ref = NULL; + unsigned char *norm; + unsigned int hash; + + if (map == NULL) + return NULL; + + norm = normalize_reference(label); + hash = refhash(norm); + ref = map->table[hash % REFMAP_SIZE]; + + while (ref) { + if (ref->label[0] == norm[0] && + !strcmp((char *)ref->label, (char *)norm)) + break; + ref = ref->next; + } + + free(norm); + return ref; +} + +static void reference_free(reference *ref) +{ + free(ref->label); + free(ref->url); + free(ref->title); + free(ref); +} + +void reference_map_free(reference_map *map) +{ + unsigned int i; + + for (i = 0; i < REFMAP_SIZE; ++i) { + reference *ref = map->table[i]; + reference *next; + + while (ref) { + next = ref->next; + reference_free(ref); + ref = next; + } + } + + free(map->table); + free(map); +} + +reference_map *reference_map_new(void) +{ + reference_map *map = malloc(sizeof(reference_map)); + memset(map, 0x0, sizeof(reference_map)); + return map; +} + diff --git a/src/references.h b/src/references.h new file mode 100644 index 0000000..78fffe7 --- /dev/null +++ b/src/references.h @@ -0,0 +1,27 @@ +#ifndef _REFERENCES_H_ +#define _REFERENCES_H_ + +#define REFMAP_SIZE 16 + +struct reference { + struct reference *next; + unsigned char *label; + unsigned char *url; + unsigned char *title; + unsigned int hash; +}; + +typedef struct reference reference; + +struct reference_map { + reference *table[REFMAP_SIZE]; +}; + +typedef struct reference_map reference_map; + +reference_map *reference_map_new(void); +void reference_map_free(reference_map *map); +reference* reference_lookup(reference_map *map, chunk *label); +extern reference *reference_create(reference_map *map, chunk *label, chunk *url, chunk *title); + +#endif diff --git a/src/stmd.h b/src/stmd.h index 21a86b0..4e21e6c 100644 --- a/src/stmd.h +++ b/src/stmd.h @@ -5,7 +5,7 @@ #include #include "buffer.h" #include "chunk.h" -#include "uthash.h" +#include "references.h" #define VERSION "0.1" #define CODE_INDENT 4 @@ -36,17 +36,7 @@ struct node_inl { typedef struct node_inl node_inl; -struct reference { - unsigned char *label; - unsigned char *url; - unsigned char *title; - UT_hash_handle hh; // used by uthash -}; - -typedef struct reference reference; - // Types for blocks - struct ListData { enum { bullet, @@ -104,7 +94,7 @@ struct node_block { int level; } header; struct { - reference** refmap; + reference_map *refmap; } document; } as; @@ -114,14 +104,10 @@ struct node_block { typedef struct node_block node_block; -node_inl* parse_inlines(strbuf *input, reference** refmap); +node_inl* parse_inlines(strbuf *input, reference_map *refmap); void free_inlines(node_inl* e); -int parse_reference(strbuf *input, reference** refmap); -void free_reference(reference *ref); -void free_reference_map(reference **refmap); - -void add_reference(reference** refmap, reference* ref); +int parse_reference_inline(strbuf *input, reference_map *refmap); void unescape_buffer(strbuf *buf); extern node_block* make_document(); @@ -138,4 +124,8 @@ void print_blocks(node_block* blk, int indent); void blocks_to_html(strbuf *html, node_block *b, bool tight); void inlines_to_html(strbuf *html, node_inl *b); +unsigned char *clean_url(chunk *url); +unsigned char *clean_autolink(chunk *url, int is_email); +unsigned char *clean_title(chunk *title); + #endif diff --git a/src/utf8.c b/src/utf8.c index 12d7ba5..c65aec6 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -25,7 +25,7 @@ static const int8_t utf8proc_utf8class[256] = { static void encode_unknown(strbuf *buf) { - static const unsigned char repl[] = {239, 191, 189}; + static const uint8_t repl[] = {239, 191, 189}; strbuf_put(buf, repl, 3); } @@ -52,9 +52,9 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return length; } -void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size) +void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) { - static const unsigned char whitespace[] = " "; + static const uint8_t whitespace[] = " "; size_t i = 0, tab = 0; @@ -132,7 +132,7 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) void utf8proc_encode_char(int32_t uc, strbuf *buf) { - unsigned char dst[4]; + uint8_t dst[4]; int len = 0; assert(uc >= 0); @@ -169,7 +169,7 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) strbuf_put(buf, dst, len); } -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len) { int32_t c; diff --git a/src/utf8.h b/src/utf8.h index 1e4e556..9506b75 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -1,12 +1,13 @@ #ifndef _H_STMD_UTF8_ #define _H_STMD_UTF8_ +#include #include "buffer.h" -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len); +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len); void utf8proc_encode_char(int32_t uc, strbuf *buf); ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst); ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len); -void utf8proc_detab(strbuf *dest, const unsigned char *line, size_t size); +void utf8proc_detab(strbuf *dest, const uint8_t *line, size_t size); #endif -- cgit v1.2.3 From c47e3a34adac00a262f72c6d17a1c87deefa33c4 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 19:39:03 +0200 Subject: Fix infinite loop when case folding invalid UTF8 chars --- src/utf8.c | 24 ++++++++++++------------ src/utf8.h | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index c65aec6..1b0224b 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -29,9 +29,9 @@ static void encode_unknown(strbuf *buf) strbuf_put(buf, repl, 3); } -ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) +int utf8proc_charlen(const uint8_t *str, int str_len) { - ssize_t length, i; + int length, i; if (!str_len) return 0; @@ -42,11 +42,11 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return -1; if (str_len >= 0 && length > str_len) - return -1; + return -str_len; for (i = 1; i < length; i++) { if ((str[i] & 0xC0) != 0x80) - return -1; + return -i; } return length; @@ -77,7 +77,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) i += 1; tab += numspaces; } else { - ssize_t charlen = utf8proc_charlen(line + i, size - i); + int charlen = utf8proc_charlen(line + i, size - i); if (charlen < 0) { encode_unknown(ob); @@ -92,9 +92,9 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) } } -ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) +int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) { - ssize_t length; + int length; int32_t uc = -1; *dst = -1; @@ -177,15 +177,15 @@ void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len) utf8proc_encode_char(x, dest) while (len > 0) { - ssize_t char_len = utf8proc_iterate(str, len, &c); + int char_len = utf8proc_iterate(str, len, &c); - if (char_len < 0) { + if (char_len >= 0) { +#include "case_fold_switch.inc" + } else { encode_unknown(dest); - continue; + char_len = -char_len; } -#include "case_fold_switch.inc" - str += char_len; len -= char_len; } diff --git a/src/utf8.h b/src/utf8.h index 9506b75..c971250 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -6,8 +6,8 @@ void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len); void utf8proc_encode_char(int32_t uc, strbuf *buf); -ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst); -ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len); +int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst); +int utf8proc_charlen(const uint8_t *str, int str_len); void utf8proc_detab(strbuf *dest, const uint8_t *line, size_t size); #endif -- cgit v1.2.3 From 79e7a4bbf7055e33b346564db769f03e85f98988 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 19:40:40 +0200 Subject: Improve invalid UTF8 codepoint skipping --- src/utf8.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 1b0224b..6b34831 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -79,14 +79,14 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) } else { int charlen = utf8proc_charlen(line + i, size - i); - if (charlen < 0) { - encode_unknown(ob); - i++; - } else { + if (charlen >= 0) { strbuf_put(ob, line + i, charlen); - i += charlen; + } else { + encode_unknown(ob); + charlen = -charlen; } + i += charlen; tab += 1; } } -- cgit v1.2.3