summaryrefslogtreecommitdiff
path: root/code/fe310/eos/eve/unicode.c
diff options
context:
space:
mode:
author Uros Majstorovic <majstor@majstor.org> 2020-07-29 09:21:45 +0200
committer Uros Majstorovic <majstor@majstor.org> 2020-07-29 09:21:45 +0200
commit 0e518d5117b73fd54081decf1c0eb9f9d3173ff6 (patch)
tree 9ef8f4529e3b24cd9ca16e6c46a89781abc8c670 /code/fe310/eos/eve/unicode.c
parent 5a39774e6ed9002de3f0ec1f2cdbba2ebbe9fbde (diff)
unicode support
Diffstat (limited to 'code/fe310/eos/eve/unicode.c')
-rw-r--r--code/fe310/eos/eve/unicode.c103
1 files changed, 103 insertions, 0 deletions
diff --git a/code/fe310/eos/eve/unicode.c b/code/fe310/eos/eve/unicode.c
new file mode 100644
index 0000000..62b1714
--- /dev/null
+++ b/code/fe310/eos/eve/unicode.c
@@ -0,0 +1,103 @@
+#include "unicode.h"
+
+/*
+ * Encode the code point `ch` as UTF-8 into `str`.
+ *
+ * Returns the number of bytes written (1 to 4). Returns 0 and writes
+ * nothing when `ch` lies in the surrogate range 0xd800-0xdfff or is
+ * greater than 0x10ffff. No terminator is appended; `str` must have room
+ * for the bytes written (at most 4).
+ */
+uint8_t utf8_enc(utf32_t ch, utf8_t *str) {
+    /* Reject values that have no UTF-8 encoding before touching str. */
+    if (ch > 0x10ffff) return 0;
+    if ((ch >= 0xd800) && (ch <= 0xdfff)) return 0;
+
+    /* One byte: 0xxxxxxx */
+    if (ch < 0x80) {
+        str[0] = ch;
+        return 1;
+    }
+
+    /* Two bytes: 110xxxxx 10xxxxxx */
+    if (ch < 0x800) {
+        str[0] = 0xc0 | (ch >> 6);
+        str[1] = 0x80 | (ch & 0x3f);
+        return 2;
+    }
+
+    /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+    if (ch < 0x10000) {
+        str[0] = 0xe0 | (ch >> 12);
+        str[1] = 0x80 | ((ch >> 6) & 0x3f);
+        str[2] = 0x80 | (ch & 0x3f);
+        return 3;
+    }
+
+    /* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    str[0] = 0xf0 | (ch >> 18);
+    str[1] = 0x80 | ((ch >> 12) & 0x3f);
+    str[2] = 0x80 | ((ch >> 6) & 0x3f);
+    str[3] = 0x80 | (ch & 0x3f);
+    return 4;
+}
+
+/*
+ * Decode the UTF-8 sequence that starts at str[0] and store the resulting
+ * code point in *ch.
+ *
+ * Returns the number of bytes the sequence occupies (1 to 4), or 0 when
+ * the bytes do not form a valid sequence: an invalid lead byte, a byte
+ * that is not a 10xxxxxx continuation where one is required, an overlong
+ * encoding, a surrogate (0xd800-0xdfff) or a value above 0x10ffff.
+ * When 0 is returned *ch may already have been overwritten and must not
+ * be used.
+ *
+ * A zero byte decodes as code point 0 with length 1. The function has no
+ * length argument: it reads as many bytes as the lead byte calls for,
+ * stopping early at the first byte that is not a continuation byte.
+ */
+uint8_t utf8_dec(utf8_t *str, utf32_t *ch) {
+    /* 0xxxxxxx: single byte */
+    if ((str[0] & 0x80) == 0x00) {
+        *ch = str[0];
+        return 1;
+    }
+
+    /* 110xxxxx 10xxxxxx */
+    if ((str[0] & 0xe0) == 0xc0) {
+        if ((str[1] & 0xc0) != 0x80) return 0;
+        *ch = (utf32_t)(str[0] & 0x1f) << 6;
+        *ch |= (utf32_t)(str[1] & 0x3f);
+        if (*ch < 0x80) return 0;               /* overlong */
+        return 2;
+    }
+
+    /* 1110xxxx 10xxxxxx 10xxxxxx */
+    if ((str[0] & 0xf0) == 0xe0) {
+        if (((str[1] & 0xc0) != 0x80) || ((str[2] & 0xc0) != 0x80)) return 0;
+        *ch = (utf32_t)(str[0] & 0x0f) << 12;
+        *ch |= (utf32_t)(str[1] & 0x3f) << 6;
+        *ch |= (utf32_t)(str[2] & 0x3f);
+        if ((*ch >= 0xd800) && (*ch <= 0xdfff)) return 0;   /* surrogate */
+        if (*ch < 0x800) return 0;              /* overlong */
+        return 3;
+    }
+
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    if ((str[0] & 0xf8) == 0xf0) {
+        if (((str[1] & 0xc0) != 0x80) || ((str[2] & 0xc0) != 0x80) || ((str[3] & 0xc0) != 0x80)) return 0;
+        *ch = (utf32_t)(str[0] & 0x07) << 18;
+        /*
+         * Every continuation byte carries six payload bits, so the mask
+         * is 0x3f. Masking with 0x0f here would drop bits 16-17 of the
+         * code point and break most supplementary-plane characters.
+         */
+        *ch |= (utf32_t)(str[1] & 0x3f) << 12;
+        *ch |= (utf32_t)(str[2] & 0x3f) << 6;
+        *ch |= (utf32_t)(str[3] & 0x3f);
+        if (*ch < 0x010000) return 0;           /* overlong */
+        if (*ch > 0x10ffff) return 0;           /* beyond Unicode range */
+        return 4;
+    }
+
+    /* Stray continuation byte or a lead byte of the form 11111xxx. */
+    return 0;
+}
+
+/*
+ * Move `off` characters away from `str` and return the resulting byte
+ * offset relative to `str`.
+ *
+ * off >= 0: decodes `off` sequences forward with utf8_dec() and returns
+ *           the sum of their lengths. *ch receives the last character
+ *           decoded; it is left untouched when off is 0. A sequence that
+ *           utf8_dec() rejects contributes 0 bytes, so the position does
+ *           not advance past it.
+ * off <  0: steps back over `-off` characters by skipping 10xxxxxx
+ *           continuation bytes, returns a negative offset, and decodes
+ *           the character at the new position into *ch.
+ *
+ * No bounds are checked in either direction; the caller must make sure
+ * the requested range lies inside the buffer.
+ */
+int utf8_seek(utf8_t *str, int off, utf32_t *ch) {
+    int pos = 0;
+    int n;
+
+    if (off >= 0) {
+        for (n = off; n > 0; n--) {
+            pos += utf8_dec(str + pos, ch);
+        }
+        return pos;
+    }
+
+    for (n = -off; n > 0; n--) {
+        /* Step back at least one byte, then on to the lead byte. */
+        do {
+            pos--;
+        } while ((str[pos] & 0xc0) == 0x80);
+    }
+    utf8_dec(str + pos, ch);
+
+    return pos;
+}
+
+/*
+ * Check that the first `sz` bytes of `str` are valid UTF-8.
+ *
+ * Scans from the start and stops at the first of:
+ *   - a sequence that would extend beyond `sz` bytes,
+ *   - a sequence that utf8_dec() rejects,
+ *   - a decoded code point of 0,
+ *   - the end of the buffer.
+ * In the first two cases the offending lead byte is overwritten with
+ * '\0', so the buffer is modified in place.
+ *
+ * Returns the length in bytes of the valid prefix.
+ */
+int utf8_verify(utf8_t *str, int sz) {
+    int pos = 0;
+
+    while (pos < sz) {
+        utf32_t cp;
+        uint8_t cp_len;
+        int left = sz - pos;
+        int need;
+
+        /* Bytes the lead byte announces; anything else counts as one. */
+        if ((str[pos] & 0xf8) == 0xf0) {
+            need = 4;
+        } else if ((str[pos] & 0xf0) == 0xe0) {
+            need = 3;
+        } else if ((str[pos] & 0xe0) == 0xc0) {
+            need = 2;
+        } else {
+            need = 1;
+        }
+
+        /* Truncated tail: cut it off without decoding past the end. */
+        if (need > left) {
+            str[pos] = '\0';
+            break;
+        }
+
+        cp_len = utf8_dec(str + pos, &cp);
+        if (cp_len == 0) {
+            str[pos] = '\0';
+            break;
+        }
+        if (cp == 0) break;
+
+        pos += cp_len;
+    }
+
+    return pos;
+}