aboutsummaryrefslogtreecommitdiff
path: root/src/utf8.c
blob: 4bb3b35ae38870fb44ce310a484becc283bc7369 (plain)
  1. #include <stdlib.h>
  2. #include "bstrlib.h"
  3. #include "debug.h"
  4. #define advance(s) \
  5. s++; \
  6. check(*s >> 6 == 0x02, "UTF-8 decode error on byte %x", *s);
  7. // Reads a unicode code point from a UTF8-encoded string, and
  8. // puts it in the pointer n. If something illegal
  9. // is encountered, 0xFFFD is emitted.
  10. // Returns a pointer to next position in string, or NULL if no
  11. // more characters remain.
  12. extern unsigned char * from_utf8(unsigned char * s, unsigned int *n)
  13. {
  14. int x = 0;
  15. if (*s == 0) {
  16. return NULL;
  17. } else if (*s < 0x80) {
  18. x = *s;
  19. } else if (*s >> 5 == 0x06) {
  20. x = *s & 0x1F;
  21. advance(s);
  22. x = (x << 6) + (*s & 0x3F);
  23. } else if (*s >> 4 == 0x0E) {
  24. x = *s & 0x0F;
  25. advance(s);
  26. x = (x << 6) + (*s & 0x3F);
  27. advance(s);
  28. x = (x << 6) + (*s & 0x3F);
  29. } else if (*s >> 3 == 0x1E) {
  30. x = *s & 0x07;
  31. advance(s);
  32. x = (x << 6) + (*s & 0x3F);
  33. advance(s);
  34. x = (x << 6) + (*s & 0x3F);
  35. advance(s);
  36. x = (x << 6) + (*s & 0x3F);
  37. } else if (*s >> 2 == 0x3E) {
  38. x = *s & 0x03;
  39. advance(s);
  40. x = (x << 6) + (*s & 0x3F);
  41. advance(s);
  42. x = (x << 6) + (*s & 0x3F);
  43. advance(s);
  44. x = (x << 6) + (*s & 0x3F);
  45. advance(s);
  46. x = (x << 6) + (*s & 0x3F);
  47. } else {
  48. log_err("UTF-8 decode error on byte %x", *s);
  49. goto error;
  50. }
  51. *n = x;
  52. s++;
  53. return s;
  54. error:
  55. *n = 0xFFFD;
  56. return s;
  57. }
  58. // Converts the unicode code point c to UTF-8,
  59. // putting the result in dest. Returns 0 on success, -1 on error.
  60. extern int to_utf8(unsigned int c, bstring dest)
  61. {
  62. if (c < 0x80) {
  63. bconchar(dest, c);
  64. } else if (c < 0x800) {
  65. bconchar(dest, 192 + c/64);
  66. bconchar(dest, 128 + c%64);
  67. } else if (c - 0xd800u < 0x800) {
  68. goto error;
  69. } else if (c < 0x10000) {
  70. bconchar(dest, 224 + c / 4096);
  71. bconchar(dest, 128 + c /64%64);
  72. bconchar(dest, 128 + c%64);
  73. } else if (c < 0x110000) {
  74. bconchar(dest, 240 + c/262144);
  75. bconchar(dest, 128 + c/4096%64);
  76. bconchar(dest, 128 + c/64%64);
  77. bconchar(dest, 128 + c%64);
  78. } else {
  79. goto error;
  80. }
  81. return 0;
  82. error:
  83. return -1;
  84. }
  85. #define bufpush(x) \
  86. check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x)
  87. // Returns the case-folded version of the source string, or NULL on error.
  88. extern bstring case_fold(bstring source)
  89. {
  90. unsigned char * s = source->data;
  91. unsigned int c = 0;
  92. bstring buf = bfromcstr("");
  93. while ((s = from_utf8(s, &c))) {
  94. #include "case_fold_switch.c"
  95. }
  96. return buf;
  97. error:
  98. return NULL;
  99. }