diff options
Diffstat (limited to 'src/lib/unichar.c')
-rw-r--r-- | src/lib/unichar.c | 447 |
1 files changed, 447 insertions, 0 deletions
diff --git a/src/lib/unichar.c b/src/lib/unichar.c new file mode 100644 index 0000000..7036e73 --- /dev/null +++ b/src/lib/unichar.c @@ -0,0 +1,447 @@ +/* Copyright (c) 2005-2018 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "array.h" +#include "bsearch-insert-pos.h" +#include "unichar.h" + +#include "unicodemap.c" + +#define HANGUL_FIRST 0xac00 +#define HANGUL_LAST 0xd7a3 + +const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] = + { 0xef, 0xbf, 0xbd }; /* 0xfffd */ + +static const uint8_t utf8_non1_bytes[256 - 192 - 2] = { + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +}; + +const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes; + +unsigned int uni_strlen(const unichar_t *str) +{ + unsigned int len = 0; + + for (len = 0; str[len] != 0; len++) ; + + return len; +} + +int uni_utf8_get_char(const char *input, unichar_t *chr_r) +{ + return uni_utf8_get_char_n((const unsigned char *)input, SIZE_MAX, + chr_r); +} + +int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r) +{ + static unichar_t lowest_valid_chr_table[] = + { 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 }; + const unsigned char *input = _input; + unichar_t chr, lowest_valid_chr; + unsigned int i, len; + int ret; + + i_assert(max_len > 0); + + if (*input < 0x80) { + *chr_r = *input; + return 1; + } + + /* first byte has len highest bits set, followed by zero bit. + the rest of the bits are used as the highest bits of the value. */ + chr = *input; + len = uni_utf8_char_bytes(*input); + switch (len) { + case 2: + chr &= 0x1f; + break; + case 3: + chr &= 0x0f; + break; + case 4: + chr &= 0x07; + break; + case 5: + chr &= 0x03; + break; + case 6: + chr &= 0x01; + break; + default: + /* only 7bit chars should have len==1 */ + i_assert(len == 1); + return -1; + } + + if (len <= max_len) { + lowest_valid_chr = lowest_valid_chr_table[len]; + ret = len; + } else { + /* check first if the input is invalid before returning 0 */ + lowest_valid_chr = 0; + ret = 0; + len = max_len; + } + + /* the following bytes must all be 10xxxxxx */ + for (i = 1; i < len; i++) { + if ((input[i] & 0xc0) != 0x80) + return input[i] == '\0' ? 0 : -1; + + chr <<= 6; + chr |= input[i] & 0x3f; + } + /* these are specified as invalid encodings by standards + see RFC3629 */ + if (!uni_is_valid_ucs4(chr)) + return -1; + if (chr < lowest_valid_chr) { + /* overlong encoding */ + return -1; + } + + *chr_r = chr; + return ret; +} + +int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output) +{ + unichar_t chr; + + while (*input != '\0') { + int len = uni_utf8_get_char(input, &chr); + if (len <= 0) { + /* invalid input */ + return -1; + } + input += len; + + array_push_back(output, &chr); + } + return 0; +} + +int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size, + ARRAY_TYPE(unichars) *output) +{ + unichar_t chr; + + while (size > 0) { + int len = uni_utf8_get_char_n(input, size, &chr); + if (len <= 0) + return -1; /* invalid input */ + input += len; size -= len; + + array_push_back(output, &chr); + } + return 0; +} + +void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output) +{ + for (; len > 0 && *input != '\0'; input++, len--) + uni_ucs4_to_utf8_c(*input, output); +} + +void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output) +{ + unsigned char first; + int bitpos; + + if (chr < 0x80) { + buffer_append_c(output, chr); + return; + } + + i_assert(uni_is_valid_ucs4(chr)); + + if (chr < (1 << (6 + 5))) { + /* 110xxxxx */ + bitpos = 6; + first = 0x80 | 0x40; + } else if (chr < (1 << ((2*6) + 4))) { + /* 1110xxxx */ + bitpos = 2*6; + first = 0x80 | 0x40 | 0x20; + } else if (chr < (1 << ((3*6) + 3))) { + /* 11110xxx */ + bitpos = 3*6; + first = 0x80 | 0x40 | 0x20 | 0x10; + } else if (chr < (1 << ((4*6) + 2))) { + /* 111110xx */ + bitpos = 4*6; + first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08; + } else { + /* 1111110x */ + bitpos = 5*6; + first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04; + } + buffer_append_c(output, first | (chr >> bitpos)); + + do { + bitpos -= 6; + buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f)); + } while (bitpos > 0); +} + +unsigned int uni_utf8_strlen(const char *input) +{ + return uni_utf8_strlen_n(input, strlen(input)); +} + +unsigned int uni_utf8_strlen_n(const void *input, size_t size) +{ + size_t partial_pos; + + return uni_utf8_partial_strlen_n(input, size, &partial_pos); +} + +unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size, + size_t *partial_pos_r) +{ + const unsigned char *input = _input; + unsigned int count, len = 0; + size_t i; + + for (i = 0; i < size; ) { + count = uni_utf8_char_bytes(input[i]); + if (i + count > size) + break; + i += count; + len++; + } + *partial_pos_r = i; + return len; +} + +static bool uint16_find(const uint16_t *data, unsigned int count, + uint16_t value, unsigned int *idx_r) +{ + BINARY_NUMBER_SEARCH(data, count, value, idx_r); +} + +static bool uint32_find(const uint32_t *data, unsigned int count, + uint32_t value, unsigned int *idx_r) +{ + BINARY_NUMBER_SEARCH(data, count, value, idx_r); +} + +unichar_t uni_ucs4_to_titlecase(unichar_t chr) +{ + unsigned int idx; + + if (chr <= 0xff) + return titlecase8_map[chr]; + else if (chr <= 0xffff) { + if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys), + chr, &idx)) + return chr; + else + return titlecase16_values[idx]; + } else { + if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys), + chr, &idx)) + return chr; + else + return titlecase32_values[idx]; + } +} + +static bool uni_ucs4_decompose_uni(unichar_t *chr) +{ + unsigned int idx; + + if (*chr <= 0xff) { + if (uni8_decomp_map[*chr] == *chr) + return FALSE; + *chr = uni8_decomp_map[*chr]; + } else if (*chr <= 0xffff) { + if (*chr < uni16_decomp_keys[0]) + return FALSE; + + if (!uint16_find(uni16_decomp_keys, + N_ELEMENTS(uni16_decomp_keys), *chr, &idx)) + return FALSE; + *chr = uni16_decomp_values[idx]; + } else { + if (!uint32_find(uni32_decomp_keys, + N_ELEMENTS(uni32_decomp_keys), *chr, &idx)) + return FALSE; + *chr = uni32_decomp_values[idx]; + } + return TRUE; +} + +static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output) +{ +#define SBase HANGUL_FIRST +#define LBase 0x1100 +#define VBase 0x1161 +#define TBase 0x11A7 +#define LCount 19 +#define VCount 21 +#define TCount 28 +#define NCount (VCount * TCount) + unsigned int SIndex = chr - SBase; + unichar_t L = LBase + SIndex / NCount; + unichar_t V = VBase + (SIndex % NCount) / TCount; + unichar_t T = TBase + SIndex % TCount; + + uni_ucs4_to_utf8_c(L, output); + uni_ucs4_to_utf8_c(V, output); + if (T != TBase) uni_ucs4_to_utf8_c(T, output); +} + +static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output) +{ + const uint32_t *value; + unsigned int idx; + + if (chr < multidecomp_keys[0] || chr > 0xffff) + return FALSE; + + if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys), + chr, &idx)) + return FALSE; + + value = &multidecomp_values[multidecomp_offsets[idx]]; + for (; *value != 0; value++) + uni_ucs4_to_utf8_c(*value, output); + return TRUE; +} + +static void output_add_replacement_char(buffer_t *output) +{ + if (output->used >= UTF8_REPLACEMENT_CHAR_LEN && + memcmp(CONST_PTR_OFFSET(output->data, + output->used - UTF8_REPLACEMENT_CHAR_LEN), + utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN) == 0) { + /* don't add the replacement char multiple times */ + return; + } + buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN); +} + +int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size, + buffer_t *output) +{ + const unsigned char *input = _input; + unichar_t chr; + int ret = 0; + + while (size > 0) { + int bytes = uni_utf8_get_char_n(input, size, &chr); + if (bytes <= 0) { + /* invalid input. try the next byte. */ + ret = -1; + input++; size--; + output_add_replacement_char(output); + continue; + } + input += bytes; + size -= bytes; + + chr = uni_ucs4_to_titlecase(chr); + if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) + uni_ucs4_decompose_hangul_utf8(chr, output); + else if (uni_ucs4_decompose_uni(&chr) || + !uni_ucs4_decompose_multi_utf8(chr, output)) + uni_ucs4_to_utf8_c(chr, output); + } + return ret; +} + +static inline unsigned int +is_valid_utf8_seq(const unsigned char *input, unsigned int size) +{ + unichar_t chr; + int len = uni_utf8_get_char_n(input, size, &chr); + return len <= 0 ? 0 : len; +} + +static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size, + size_t *pos_r) +{ + size_t i, len; + + /* find the first invalid utf8 sequence */ + for (i = 0; i < size;) { + if (input[i] < 0x80) + i++; + else { + len = is_valid_utf8_seq(input + i, size-i); + if (unlikely(len == 0)) { + *pos_r = i; + return -1; + } + i += len; + } + } + return 0; +} + +bool uni_utf8_get_valid_data(const unsigned char *input, size_t size, + buffer_t *buf) +{ + size_t i, len; + + if (uni_utf8_find_invalid_pos(input, size, &i) == 0) + return TRUE; + + /* broken utf-8 input - skip the broken characters */ + buffer_append(buf, input, i++); + + output_add_replacement_char(buf); + while (i < size) { + if (input[i] < 0x80) { + buffer_append_c(buf, input[i++]); + continue; + } + + len = is_valid_utf8_seq(input + i, size-i); + if (len == 0) { + i++; + output_add_replacement_char(buf); + continue; + } + buffer_append(buf, input + i, len); + i += len; + } + return FALSE; +} + +bool uni_utf8_str_is_valid(const char *str) +{ + size_t i; + + return uni_utf8_find_invalid_pos((const unsigned char *)str, + strlen(str), &i) == 0; +} + +bool uni_utf8_data_is_valid(const unsigned char *data, size_t size) +{ + size_t i; + + return uni_utf8_find_invalid_pos(data, size, &i) == 0; +} + +size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size, + size_t max_new_size) +{ + if (max_new_size >= old_size) + return old_size; + if (max_new_size == 0) + return 0; + + if ((data[max_new_size] & 0x80) == 0) + return max_new_size; + while (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0x80) + max_new_size--; + if (max_new_size > 0 && (data[max_new_size-1] & 0xc0) == 0xc0) + max_new_size--; + return max_new_size; +} |