- #include <stdlib.h>
- #include "bstrlib.h"
- #include "debug.h"
// Steps s to the next byte and verifies that it is a UTF-8 continuation
// byte (bit pattern 10xxxxxx).
//
// check() comes from debug.h; on a failed condition it is expected to log
// the message and jump to the enclosing function's `error` label, so this
// macro may only be used inside a function that defines such a label.
//
// The expansion is wrapped in do/while(0) and the argument is parenthesized
// so that the macro behaves as one statement (safe as the unbraced body of
// an `if`) and works with any lvalue expression. Note that s is evaluated
// more than once, so it must not have side effects.
#define advance(s) \
    do { \
        (s)++; \
        check(*(s) >> 6 == 0x02, "UTF-8 decode error on byte %x", *(s)); \
    } while (0)
// Decodes one Unicode code point from the NUL-terminated UTF-8 string s
// and stores it in *n.
//
// Returns a pointer to the position where decoding should resume, or NULL
// (leaving *n untouched) when s points at the terminating NUL.
//
// Ill-formed input never stops the caller: *n is set to U+FFFD (the
// replacement character) and the returned pointer is always beyond the
// incoming s, so a loop that runs until NULL is guaranteed to terminate.
// The following are treated as ill-formed:
//   - a byte that cannot start a sequence (a stray continuation byte, or
//     0xF8..0xFF, which RFC 3629 does not allow);
//   - a missing or malformed continuation byte; decoding resumes at the
//     offending byte, so a NUL or a fresh lead byte there is not swallowed;
//   - an overlong encoding, a UTF-16 surrogate (U+D800..U+DFFF), or a
//     value above U+10FFFF.
//
// check() and log_err() come from debug.h; check() is expected to log its
// message and jump to the local `error` label when its condition is false.
extern unsigned char * from_utf8(unsigned char * s, unsigned int *n)
{
    unsigned int x = 0;       // code point being assembled
    unsigned int lowest = 0;  // smallest value that really needs this length
    int pending = 0;          // continuation bytes still to be read

    if (*s == 0) {
        return NULL;
    } else if (*s < 0x80) {
        // 0xxxxxxx: plain ASCII.
        x = *s;
    } else if (*s >> 5 == 0x06) {
        // 110xxxxx: two-byte sequence.
        x = *s & 0x1F;
        pending = 1;
        lowest = 0x80;
    } else if (*s >> 4 == 0x0E) {
        // 1110xxxx: three-byte sequence.
        x = *s & 0x0F;
        pending = 2;
        lowest = 0x800;
    } else if (*s >> 3 == 0x1E) {
        // 11110xxx: four-byte sequence.
        x = *s & 0x07;
        pending = 3;
        lowest = 0x10000;
    } else {
        log_err("UTF-8 decode error on byte %x", *s);
        // Skip the bad byte; returning the same position would make a
        // caller that loops until NULL spin forever.
        s++;
        goto error;
    }

    while (pending-- > 0) {
        // On a bad continuation byte this jumps to `error` with s left
        // pointing at that byte, which is where decoding resumes.
        advance(s);
        x = (x << 6) + (*s & 0x3F);
    }

    // Step past the last byte before validating, so that a rejected
    // sequence is consumed as a whole and reported exactly once.
    s++;

    check(x >= lowest && x < 0x110000 && x - 0xD800u >= 0x800,
          "UTF-8 decode error on code point %x", x);

    *n = x;
    return s;

error:
    *n = 0xFFFD;
    return s;
}
- // Converts the unicode code point c to UTF-8,
- // putting the result in dest. Returns 0 on success, -1 on error.
- extern int to_utf8(unsigned int c, bstring dest)
- {
- if (c < 0x80) {
- bconchar(dest, c);
- } else if (c < 0x800) {
- bconchar(dest, 192 + c/64);
- bconchar(dest, 128 + c%64);
- } else if (c - 0xd800u < 0x800) {
- goto error;
- } else if (c < 0x10000) {
- bconchar(dest, 224 + c / 4096);
- bconchar(dest, 128 + c /64%64);
- bconchar(dest, 128 + c%64);
- } else if (c < 0x110000) {
- bconchar(dest, 240 + c/262144);
- bconchar(dest, 128 + c/4096%64);
- bconchar(dest, 128 + c/64%64);
- bconchar(dest, 128 + c%64);
- } else {
- goto error;
- }
- return 0;
- error:
- return -1;
- }
- #define bufpush(x) \
- check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x)
- // Returns the case-folded version of the source string, or NULL on error.
- extern bstring case_fold(bstring source)
- {
- unsigned char * s = source->data;
- unsigned int c = 0;
- bstring buf = bfromcstr("");
- while ((s = from_utf8(s, &c))) {
- #include "case_fold_switch.c"
- }
- return buf;
- error:
- return NULL;
- }
|