diff options
Diffstat (limited to 'third_party/heimdal/lib/wind/normalize.c')
-rw-r--r-- | third_party/heimdal/lib/wind/normalize.c | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/third_party/heimdal/lib/wind/normalize.c b/third_party/heimdal/lib/wind/normalize.c new file mode 100644 index 0000000..8f3991d --- /dev/null +++ b/third_party/heimdal/lib/wind/normalize.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2004 Kungliga Tekniska Högskolan + * (Royal Institute of Technology, Stockholm, Sweden). + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Institute nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "windlocl.h" + +#include <assert.h> +#include <stdlib.h> +#include <errno.h> +#include <stdio.h> + +#include "roken.h" + +#include "normalize_table.h" + +static int +translation_cmp(const void *key, const void *data) +{ + const struct translation *t1 = (const struct translation *)key; + const struct translation *t2 = (const struct translation *)data; + + return t1->key - t2->key; +} + +enum { s_base = 0xAC00}; +enum { s_count = 11172}; +enum { l_base = 0x1100}; +enum { l_count = 19}; +enum { v_base = 0x1161}; +enum { v_count = 21}; +enum { t_base = 0x11A7}; +enum { t_count = 28}; +enum { n_count = v_count * t_count}; + +static int +hangul_decomp(const uint32_t *in, size_t in_len, + uint32_t *out, size_t *out_len) +{ + uint32_t u = *in; + unsigned s_index; + unsigned l, v, t; + unsigned o; + + if (u < s_base || u >= s_base + s_count) + return 0; + s_index = u - s_base; + l = l_base + s_index / n_count; + v = v_base + (s_index % n_count) / t_count; + t = t_base + s_index % t_count; + o = 2; + if (t != t_base) + ++o; + if (*out_len < o) + return WIND_ERR_OVERRUN; + out[0] = l; + out[1] = v; + if (t != t_base) + out[2] = t; + *out_len = o; + return 1; +} + +static uint32_t +hangul_composition(const uint32_t *in, size_t in_len) +{ + if (in_len < 2) + return 0; + if (in[0] >= l_base && in[0] < l_base + l_count) { + unsigned l_index = in[0] - l_base; + unsigned v_index; + + if (in[1] < v_base || in[1] >= v_base + v_count) + return 0; + v_index = in[1] - v_base; + return (l_index * v_count + v_index) * t_count + s_base; + } else if (in[0] >= s_base && in[0] < s_base + s_count) { + unsigned s_index = in[0] - s_base; + unsigned t_index; + + if (s_index % t_count != 0) + return 0; + if (in[1] < t_base || in[1] >= t_base + t_count) + return 0; + t_index = in[1] - t_base; + return in[0] + t_index; + } + return 0; +} + +static int +compat_decomp(const uint32_t *in, size_t in_len, + uint32_t *out, size_t *out_len) +{ + unsigned i; + unsigned o = 0; + + for (i = 0; i < in_len; ++i) { + struct translation ts = {in[i], 0, 0}; + size_t sub_len = *out_len - o; + int ret; + + ret = hangul_decomp(in + i, in_len - i, + out + o, &sub_len); + if (ret) { + if (ret == WIND_ERR_OVERRUN) + return ret; + o += sub_len; + } else { + void *s = bsearch(&ts, + _wind_normalize_table, + _wind_normalize_table_size, + sizeof(_wind_normalize_table[0]), + translation_cmp); + if (s != NULL) { + const struct translation *t = (const struct translation *)s; + + ret = compat_decomp(_wind_normalize_val_table + t->val_offset, + t->val_len, + out + o, &sub_len); + if (ret) + return ret; + o += sub_len; + } else { + if (o >= *out_len) + return WIND_ERR_OVERRUN; + out[o++] = in[i]; + + } + } + } + *out_len = o; + return 0; +} + +static void +swap_char(uint32_t * a, uint32_t * b) +{ + uint32_t t; + t = *a; + *a = *b; + *b = t; +} + +/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points + * that all have Canonical_Combining_Class > 0 */ +static void +canonical_reorder_sequence(uint32_t * a, size_t len) +{ + size_t i, j; + + if (len <= 1) + return; + + for (i = 1; i < len; i++) { + for (j = i; + j > 0 && + _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]); + j--) + swap_char(&a[j], &a[j-1]); + } +} + +static void +canonical_reorder(uint32_t *tmp, size_t tmp_len) +{ + size_t i; + + for (i = 0; i < tmp_len; ++i) { + int cc = _wind_combining_class(tmp[i]); + if (cc) { + size_t j; + for (j = i + 1; + j < tmp_len && _wind_combining_class(tmp[j]); + ++j) + ; + canonical_reorder_sequence(&tmp[i], j - i); + i = j; + } + } +} + +static uint32_t +find_composition(const uint32_t *in, unsigned in_len) +{ + unsigned short canon_index = 0; + uint32_t cur; + unsigned n = 0; + + cur = hangul_composition(in, in_len); + if (cur) + return cur; + + do { + const struct canon_node *c = &_wind_canon_table[canon_index]; + unsigned i; + + if (n % 5 == 0) { + if (in_len-- == 0) + return c->val; + cur = *in++; + } + + i = cur >> 16; + if (i < c->next_start || i >= c->next_end) + canon_index = 0; + else + canon_index = + _wind_canon_next_table[c->next_offset + i - c->next_start]; + if (canon_index != 0) { + cur = (cur << 4) & 0xFFFFF; + ++n; + } + } while (canon_index != 0); + return 0; +} + +static int +combine(const uint32_t *in, size_t in_len, + uint32_t *out, size_t *out_len) +{ + unsigned i; + int ostarter; + unsigned o = 0; + int old_cc; + + for (i = 0; i < in_len;) { + while (i < in_len && _wind_combining_class(in[i]) != 0) { + out[o++] = in[i++]; + } + if (i < in_len) { + if (o >= *out_len) + return WIND_ERR_OVERRUN; + ostarter = o; + out[o++] = in[i++]; + old_cc = -1; + + while (i < in_len) { + uint32_t comb; + uint32_t v[2]; + int cc; + + v[0] = out[ostarter]; + v[1] = in[i]; + + cc = _wind_combining_class(in[i]); + if (old_cc != cc && (comb = find_composition(v, 2))) { + out[ostarter] = comb; + } else if (cc == 0) { + break; + } else { + if (o >= *out_len) + return WIND_ERR_OVERRUN; + out[o++] = in[i]; + old_cc = cc; + } + ++i; + } + } + } + *out_len = o; + return 0; +} + +int +_wind_stringprep_normalize(const uint32_t *in, size_t in_len, + uint32_t *out, size_t *out_len) +{ + size_t tmp_len; + uint32_t *tmp; + int ret; + + if (in_len == 0) { + *out_len = 0; + return 0; + } + + tmp_len = in_len * 4; + if (tmp_len < MAX_LENGTH_CANON) + tmp_len = MAX_LENGTH_CANON; + tmp = malloc(tmp_len * sizeof(uint32_t)); + if (tmp == NULL) + return ENOMEM; + + ret = compat_decomp(in, in_len, tmp, &tmp_len); + if (ret) { + free(tmp); + return ret; + } + canonical_reorder(tmp, tmp_len); + ret = combine(tmp, tmp_len, out, out_len); + free(tmp); + return ret; +} |