summary | refs | log | tree | commit | diff | stats
path: root/src/lib/unichar.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/unichar.c')
-rw-r--r-- src/lib/unichar.c 447
1 files changed, 447 insertions, 0 deletions
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
new file mode 100644
index 0000000..7036e73
--- /dev/null
+++ b/src/lib/unichar.c
@@ -0,0 +1,447 @@
+/* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "bsearch-insert-pos.h"
+#include "unichar.h"
+
+#include "unicodemap.c"
+
+/* Inclusive code point range that uni_utf8_to_decomposed_titlecase()
+   routes to the arithmetic Hangul decomposition instead of the lookup
+   tables. */
+#define HANGUL_FIRST 0xac00
+#define HANGUL_LAST 0xd7a3
+
+/* UTF-8 byte sequence of the replacement character; appended to output
+   buffers in place of invalid input. */
+const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
+ { 0xef, 0xbf, 0xbd }; /* 0xfffd */
+
+/* Sequence lengths (2..6, and 1 for the final two entries) for 62 lead
+   byte values.
+   NOTE(review): the array size 256 - 192 - 2 suggests the table is indexed
+   by (lead byte - 0xc2); the lookup itself is not in this file - confirm
+   against uni_utf8_char_bytes() in unichar.h. */
+static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+/* Externally visible pointer to the (otherwise file-local) table above. */
+const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
+
+/* Returns the number of elements in str that precede the first 0 element. */
+unsigned int uni_strlen(const unichar_t *str)
+{
+	unsigned int count = 0;
+
+	while (str[count] != 0)
+		count++;
+	return count;
+}
+
+/* Decodes the first UTF-8 character of input into *chr_r. This is
+   uni_utf8_get_char_n() with SIZE_MAX as the length limit; the return
+   value is passed through unchanged. */
+int uni_utf8_get_char(const char *input, unichar_t *chr_r)
+{
+	const unsigned char *bytes = (const unsigned char *)input;
+
+	return uni_utf8_get_char_n(bytes, SIZE_MAX, chr_r);
+}
+
+/* Decodes one UTF-8 sequence from the start of _input, looking at no more
+   than max_len bytes (max_len must be non-zero).
+
+   Returns:
+     >0  length in bytes of the decoded sequence; *chr_r holds the value
+      0  nothing invalid was seen, but the sequence is incomplete: either
+         max_len is shorter than the sequence or a NUL byte stands where a
+         continuation byte was expected
+     -1  invalid lead byte, invalid continuation byte, value rejected by
+         uni_is_valid_ucs4(), or overlong encoding
+   *chr_r is written only when the return value is positive. */
+int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
+{
+	/* smallest value that may be encoded with N bytes, indexed by N */
+	static unichar_t min_chr_for_len[] =
+		{ 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
+	/* value bits carried by the lead byte of an N byte sequence */
+	static const unsigned char lead_mask[] =
+		{ 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+	const unsigned char *bytes = _input;
+	unichar_t value, min_chr;
+	unsigned int pos, seq_len;
+	int result;
+
+	i_assert(max_len > 0);
+
+	/* 7bit characters are stored as themselves */
+	if (bytes[0] < 0x80) {
+		*chr_r = bytes[0];
+		return 1;
+	}
+
+	/* the lead byte has seq_len highest bits set, followed by a zero
+	   bit. its remaining bits are the highest bits of the value. */
+	seq_len = uni_utf8_char_bytes(bytes[0]);
+	if (seq_len < 2 || seq_len > 6) {
+		/* only 7bit chars should have len==1 */
+		i_assert(seq_len == 1);
+		return -1;
+	}
+	value = bytes[0] & lead_mask[seq_len];
+
+	if (seq_len > max_len) {
+		/* not enough input for the whole sequence. still verify the
+		   bytes that are available before returning 0. the partial
+		   value can't be checked for overlong encoding. */
+		min_chr = 0;
+		result = 0;
+		seq_len = max_len;
+	} else {
+		min_chr = min_chr_for_len[seq_len];
+		result = seq_len;
+	}
+
+	/* every following byte must be 10xxxxxx */
+	for (pos = 1; pos < seq_len; pos++) {
+		if ((bytes[pos] & 0xc0) != 0x80) {
+			if (bytes[pos] == '\0')
+				return 0;
+			return -1;
+		}
+		value = (value << 6) | (bytes[pos] & 0x3f);
+	}
+
+	/* these are specified as invalid encodings by standards
+	   see RFC3629 */
+	if (!uni_is_valid_ucs4(value))
+		return -1;
+	/* overlong encoding */
+	if (value < min_chr)
+		return -1;
+
+	*chr_r = value;
+	return result;
+}
+
+/* Decodes the NUL-terminated UTF-8 string input and appends each decoded
+   character to output. Returns 0 on success, -1 as soon as a character
+   fails to decode; characters decoded before the failure stay in output. */
+int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output)
+{
+	const char *p;
+	unichar_t decoded;
+	int nbytes;
+
+	for (p = input; *p != '\0'; p += nbytes) {
+		nbytes = uni_utf8_get_char(p, &decoded);
+		if (nbytes <= 0) {
+			/* invalid input */
+			return -1;
+		}
+		array_push_back(output, &decoded);
+	}
+	return 0;
+}
+
+/* Decodes size bytes of UTF-8 from input and appends each decoded character
+   to output. Returns 0 on success, -1 if a character is invalid or cut
+   short by the end of the data; characters decoded before the failure stay
+   in output. */
+int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
+		       ARRAY_TYPE(unichars) *output)
+{
+	unichar_t decoded;
+
+	while (size != 0) {
+		int nbytes = uni_utf8_get_char_n(input, size, &decoded);
+
+		if (nbytes <= 0) {
+			/* invalid input */
+			return -1;
+		}
+		array_push_back(output, &decoded);
+
+		input += nbytes;
+		size -= nbytes;
+	}
+	return 0;
+}
+
+/* Appends the UTF-8 encoding of input to output. Encoding stops after len
+   characters or at the first 0 character, whichever comes first. */
+void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
+{
+	size_t i;
+
+	for (i = 0; i < len && input[i] != '\0'; i++)
+		uni_ucs4_to_utf8_c(input[i], output);
+}
+
+/* Appends the UTF-8 encoding of the single character chr to output.
+   Values below 0x80 are appended as one byte; anything else must pass
+   uni_is_valid_ucs4() (asserted). */
+void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
+{
+	unsigned char lead;
+	int shift;
+
+	if (chr < 0x80) {
+		buffer_append_c(output, chr);
+		return;
+	}
+
+	i_assert(uni_is_valid_ucs4(chr));
+
+	/* pick the lead byte marker and the position of the value bits that
+	   go into the lead byte. each continuation byte carries 6 bits. */
+	if (chr < 0x800) {
+		/* 110xxxxx: 5+6 bits */
+		lead = 0xc0;
+		shift = 6;
+	} else if (chr < 0x10000) {
+		/* 1110xxxx: 4+2*6 bits */
+		lead = 0xe0;
+		shift = 12;
+	} else if (chr < 0x200000) {
+		/* 11110xxx: 3+3*6 bits */
+		lead = 0xf0;
+		shift = 18;
+	} else if (chr < 0x4000000) {
+		/* 111110xx: 2+4*6 bits */
+		lead = 0xf8;
+		shift = 24;
+	} else {
+		/* 1111110x: 1+5*6 bits */
+		lead = 0xfc;
+		shift = 30;
+	}
+	buffer_append_c(output, lead | (chr >> shift));
+
+	/* continuation bytes, highest bits first. shift is at least 6 here,
+	   so at least one continuation byte is always written. */
+	while (shift > 0) {
+		shift -= 6;
+		buffer_append_c(output, 0x80 | ((chr >> shift) & 0x3f));
+	}
+}
+
+/* Returns the character count of the NUL-terminated string input, as
+   computed by uni_utf8_strlen_n() over its strlen() bytes. */
+unsigned int uni_utf8_strlen(const char *input)
+{
+	size_t nbytes = strlen(input);
+
+	return uni_utf8_strlen_n(input, nbytes);
+}
+
+/* Returns the number of complete characters in the first size bytes of
+   input. Same as uni_utf8_partial_strlen_n(), with the position of a
+   trailing partial character discarded. */
+unsigned int uni_utf8_strlen_n(const void *input, size_t size)
+{
+	size_t unused_pos;
+	unsigned int nchars;
+
+	nchars = uni_utf8_partial_strlen_n(input, size, &unused_pos);
+	return nchars;
+}
+
+/* Counts the characters that fit completely within the first size bytes of
+   _input. Character lengths are taken from uni_utf8_char_bytes() of each
+   lead byte; the bytes in between are not inspected. *partial_pos_r is set
+   to the offset where counting stopped: size if everything was consumed,
+   otherwise the start of a character that extends past size. */
+unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size,
+				       size_t *partial_pos_r)
+{
+	const unsigned char *bytes = _input;
+	unsigned int nchars = 0;
+	size_t pos = 0;
+
+	while (pos < size) {
+		unsigned int seq_len = uni_utf8_char_bytes(bytes[pos]);
+
+		if (pos + seq_len > size) {
+			/* the last character is only partially present */
+			break;
+		}
+		pos += seq_len;
+		nchars++;
+	}
+	*partial_pos_r = pos;
+	return nchars;
+}
+
+/* Searches the count elements of data for value; callers read *idx_r as the
+   index of the match after a TRUE return.
+   The entire body, including the return statement, comes from the
+   BINARY_NUMBER_SEARCH macro, which is not defined in this file.
+   NOTE(review): presumably data must be sorted ascending and TRUE means an
+   exact match - confirm against the macro definition. */
+static bool uint16_find(const uint16_t *data, unsigned int count,
+ uint16_t value, unsigned int *idx_r)
+{
+ BINARY_NUMBER_SEARCH(data, count, value, idx_r);
+}
+
+/* 32bit counterpart of uint16_find(): searches the count elements of data
+   for value; callers read *idx_r as the index of the match after a TRUE
+   return.
+   The entire body, including the return statement, comes from the
+   BINARY_NUMBER_SEARCH macro, which is not defined in this file.
+   NOTE(review): presumably data must be sorted ascending and TRUE means an
+   exact match - confirm against the macro definition. */
+static bool uint32_find(const uint32_t *data, unsigned int count,
+ uint32_t value, unsigned int *idx_r)
+{
+ BINARY_NUMBER_SEARCH(data, count, value, idx_r);
+}
+
+/* Returns the titlecase mapping of chr from the unicodemap tables, or chr
+   itself when the 16bit/32bit key tables have no entry for it. Three
+   tables are used depending on the magnitude of chr: a direct 256-entry
+   map, 16bit keys and 32bit keys. */
+unichar_t uni_ucs4_to_titlecase(unichar_t chr)
+{
+	unsigned int pos;
+
+	if (chr <= 0xff) {
+		/* direct lookup */
+		return titlecase8_map[chr];
+	}
+
+	if (chr <= 0xffff) {
+		if (uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
+				chr, &pos))
+			return titlecase16_values[pos];
+		return chr;
+	}
+
+	if (uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
+			chr, &pos))
+		return titlecase32_values[pos];
+	return chr;
+}
+
+/* Replaces *chr with its single-character decomposition from the
+   unicodemap tables. Returns TRUE if *chr was replaced, FALSE (leaving
+   *chr untouched) if the tables have no differing mapping for it. */
+static bool uni_ucs4_decompose_uni(unichar_t *chr)
+{
+	const unichar_t c = *chr;
+	unsigned int pos;
+
+	if (c <= 0xff) {
+		/* direct map; an identity entry means "no decomposition" */
+		if (uni8_decomp_map[c] == c)
+			return FALSE;
+		*chr = uni8_decomp_map[c];
+		return TRUE;
+	}
+
+	if (c <= 0xffff) {
+		/* skip the search for values below the first key */
+		if (c < uni16_decomp_keys[0])
+			return FALSE;
+		if (!uint16_find(uni16_decomp_keys,
+				 N_ELEMENTS(uni16_decomp_keys), c, &pos))
+			return FALSE;
+		*chr = uni16_decomp_values[pos];
+		return TRUE;
+	}
+
+	if (!uint32_find(uni32_decomp_keys,
+			 N_ELEMENTS(uni32_decomp_keys), c, &pos))
+		return FALSE;
+	*chr = uni32_decomp_values[pos];
+	return TRUE;
+}
+
+/* Splits chr arithmetically into two or three component characters and
+   appends their UTF-8 encodings to output. The caller only passes values
+   in the range HANGUL_FIRST..HANGUL_LAST. The third component is omitted
+   when its index is 0. */
+static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
+{
+	enum {
+		/* first character of each component range */
+		L_BASE = 0x1100,
+		V_BASE = 0x1161,
+		T_BASE = 0x11A7,
+		/* number of second and third components */
+		V_COUNT = 21,
+		T_COUNT = 28,
+		/* input characters sharing one first component */
+		N_COUNT = V_COUNT * T_COUNT
+	};
+	unsigned int s_index = chr - HANGUL_FIRST;
+	unichar_t lead = L_BASE + s_index / N_COUNT;
+	unichar_t vowel = V_BASE + (s_index % N_COUNT) / T_COUNT;
+	unichar_t tail = T_BASE + s_index % T_COUNT;
+
+	uni_ucs4_to_utf8_c(lead, output);
+	uni_ucs4_to_utf8_c(vowel, output);
+	if (tail != T_BASE)
+		uni_ucs4_to_utf8_c(tail, output);
+}
+
+/* Looks chr up in the multi-character decomposition table and, if found,
+   appends the UTF-8 encoding of every character of the decomposition to
+   output. Returns TRUE if something was written, FALSE if chr has no entry
+   (output is then left untouched). Only values up to 0xffff are looked
+   up. */
+static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
+{
+	const uint32_t *cp;
+	unsigned int pos;
+
+	/* quick rejection: outside the range covered by the key table */
+	if (chr > 0xffff || chr < multidecomp_keys[0])
+		return FALSE;
+
+	if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
+			 chr, &pos))
+		return FALSE;
+
+	/* the decomposition is a 0-terminated run inside
+	   multidecomp_values, starting at the offset stored for this key */
+	cp = multidecomp_values + multidecomp_offsets[pos];
+	while (*cp != 0) {
+		uni_ucs4_to_utf8_c(*cp, output);
+		cp++;
+	}
+	return TRUE;
+}
+
+/* Appends utf8_replacement_char to output, unless output already ends with
+   it. A run of invalid bytes therefore produces a single replacement
+   character. */
+static void output_add_replacement_char(buffer_t *output)
+{
+	if (output->used >= UTF8_REPLACEMENT_CHAR_LEN) {
+		const void *tail = CONST_PTR_OFFSET(output->data,
+			output->used - UTF8_REPLACEMENT_CHAR_LEN);
+
+		if (memcmp(tail, utf8_replacement_char,
+			   UTF8_REPLACEMENT_CHAR_LEN) == 0) {
+			/* don't add the replacement char multiple times */
+			return;
+		}
+	}
+	buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
+}
+
+/* Converts size bytes of UTF-8 in _input to titlecase, decomposes each
+   character and appends the result as UTF-8 to output.
+
+   Bytes that can't be decoded are skipped one at a time and represented
+   by the replacement character. Returns 0 if all input was decoded, -1 if
+   any byte had to be replaced; output is filled in either case. */
+int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
+				     buffer_t *output)
+{
+	const unsigned char *p = _input;
+	unichar_t chr;
+	int result = 0;
+
+	while (size > 0) {
+		int nbytes = uni_utf8_get_char_n(p, size, &chr);
+
+		if (nbytes > 0) {
+			p += nbytes;
+			size -= nbytes;
+
+			chr = uni_ucs4_to_titlecase(chr);
+			if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) {
+				uni_ucs4_decompose_hangul_utf8(chr, output);
+				continue;
+			}
+			/* write chr itself if it was replaced by a
+			   single-character decomposition, or if it has no
+			   multi-character decomposition either. a
+			   multi-character decomposition is written by the
+			   lookup function. */
+			if (uni_ucs4_decompose_uni(&chr) ||
+			    !uni_ucs4_decompose_multi_utf8(chr, output))
+				uni_ucs4_to_utf8_c(chr, output);
+		} else {
+			/* invalid input. try the next byte. */
+			result = -1;
+			p++;
+			size--;
+			output_add_replacement_char(output);
+		}
+	}
+	return result;
+}
+
+/* Returns the byte length of the UTF-8 sequence at the start of input if
+   uni_utf8_get_char_n() decodes it successfully within size bytes, or 0
+   if the sequence is invalid or incomplete. size must be non-zero.
+
+   size is a size_t because callers pass a size_t difference: a narrower
+   parameter would silently truncate lengths above UINT_MAX, and a length
+   that truncates to 0 would trip the max_len assertion in
+   uni_utf8_get_char_n(). */
+static inline unsigned int
+is_valid_utf8_seq(const unsigned char *input, size_t size)
+{
+	unichar_t chr;
+	int len = uni_utf8_get_char_n(input, size, &chr);
+
+	return len <= 0 ? 0 : (unsigned int)len;
+}
+
+/* Scans size bytes of input for the first sequence that isn't valid UTF-8.
+   Returns 0 if everything is valid (*pos_r is not written), or -1 with
+   *pos_r set to the offset of the first invalid sequence. */
+static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,
+				     size_t *pos_r)
+{
+	size_t pos = 0;
+
+	while (pos < size) {
+		size_t seq_len;
+
+		if (input[pos] < 0x80) {
+			/* 7bit characters need no further checking */
+			pos++;
+			continue;
+		}
+
+		seq_len = is_valid_utf8_seq(input + pos, size - pos);
+		if (unlikely(seq_len == 0)) {
+			*pos_r = pos;
+			return -1;
+		}
+		pos += seq_len;
+	}
+	return 0;
+}
+
+/* Checks whether the size bytes in input are valid UTF-8.
+
+   Returns TRUE if they are; buf is not touched in that case. Otherwise
+   returns FALSE and appends a sanitized copy of input to buf: valid
+   sequences are copied as they are, and every run of invalid bytes is
+   represented by the replacement character. */
+bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
+			     buffer_t *buf)
+{
+	size_t pos, seq_len;
+
+	if (uni_utf8_find_invalid_pos(input, size, &pos) == 0)
+		return TRUE;
+
+	/* broken utf-8 input - copy the valid prefix and step over the
+	   first broken byte */
+	buffer_append(buf, input, pos);
+	pos++;
+	output_add_replacement_char(buf);
+
+	while (pos < size) {
+		if (input[pos] < 0x80) {
+			buffer_append_c(buf, input[pos]);
+			pos++;
+			continue;
+		}
+
+		seq_len = is_valid_utf8_seq(input + pos, size - pos);
+		if (seq_len != 0) {
+			buffer_append(buf, input + pos, seq_len);
+			pos += seq_len;
+		} else {
+			/* skip the broken byte */
+			pos++;
+			output_add_replacement_char(buf);
+		}
+	}
+	return FALSE;
+}
+
+/* Returns TRUE if the NUL-terminated string str contains only valid UTF-8
+   sequences. */
+bool uni_utf8_str_is_valid(const char *str)
+{
+	const unsigned char *bytes = (const unsigned char *)str;
+	size_t invalid_pos;
+	int ret;
+
+	ret = uni_utf8_find_invalid_pos(bytes, strlen(str), &invalid_pos);
+	return ret == 0;
+}
+
+/* Returns TRUE if the size bytes in data contain only valid UTF-8
+   sequences. */
+bool uni_utf8_data_is_valid(const unsigned char *data, size_t size)
+{
+	size_t invalid_pos;
+	int ret;
+
+	ret = uni_utf8_find_invalid_pos(data, size, &invalid_pos);
+	return ret == 0;
+}
+
+/* Returns the largest length <= max_new_size at which the old_size bytes in
+   data can be cut without splitting a multibyte UTF-8 character. Returns
+   old_size if no truncation is needed. The data itself is not modified.
+
+   Only lead/continuation bit patterns are examined; the data is not
+   validated. */
+size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
+			      size_t max_new_size)
+{
+	if (max_new_size >= old_size)
+		return old_size;
+	if (max_new_size == 0)
+		return 0;
+
+	/* data[max_new_size] is the first byte that would be dropped. unless
+	   it's a continuation byte (10xxxxxx) it begins a new character,
+	   so the cut is already on a character boundary. this covers
+	   multibyte lead bytes as well as 7bit characters - backing up
+	   before a lead byte would needlessly drop a complete character. */
+	if ((data[max_new_size] & 0xc0) != 0x80)
+		return max_new_size;
+
+	/* the cut is inside a multibyte character. drop the continuation
+	   bytes that were kept and then the lead byte before them. */
+	while (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0x80)
+		max_new_size--;
+	if (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0xc0)
+		max_new_size--;
+	return max_new_size;
+}