diff options
Diffstat (limited to 'src/share/utf8/charset.c')
-rw-r--r-- | src/share/utf8/charset.c | 534 |
1 files changed, 534 insertions, 0 deletions
diff --git a/src/share/utf8/charset.c b/src/share/utf8/charset.c new file mode 100644 index 0000000..5c5693d --- /dev/null +++ b/src/share/utf8/charset.c @@ -0,0 +1,534 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * See the corresponding header file for a description of the functions + * that this file provides. + * + * This was first written for Ogg Vorbis but could be of general use. + * + * The only deliberate assumption about data sizes is that a short has + * at least 16 bits, but this code has only been tested on systems with + * 8-bit char, 16-bit short and 32-bit int. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */ + +#include <stdlib.h> + +#include "share/alloc.h" +#include "charset.h" + +#include "charmaps.h" + +/* + * This is like the standard strcasecmp, but it does not depend + * on the locale. Locale-dependent functions can be dangerous: + * we once had a bug involving strcasecmp("iso", "ISO") in a + * Turkish locale! + * + * (I'm not really sure what the official standard says + * about the sign of strcasecmp("Z", "["), but usually + * we're only interested in whether it's zero.) + */ + +static int ascii_strcasecmp(const char *s1, const char *s2) +{ + char c1, c2; + + for (;; s1++, s2++) { + if (!*s1 || !*s2) + break; + if (*s1 == *s2) + continue; + c1 = *s1; + if ('a' <= c1 && c1 <= 'z') + c1 += 'A' - 'a'; + c2 = *s2; + if ('a' <= c2 && c2 <= 'z') + c2 += 'A' - 'a'; + if (c1 != c2) + break; + } + return (uint8_t)*s1 - (uint8_t)*s2; +} + +/* + * UTF-8 equivalents of the C library's wctomb() and mbtowc(). + */ + +int utf8_mbtowc(int *pwc, const char *s, size_t n) +{ + uint8_t c; + int wc, i, k; + + if (!n || !s) + return 0; + + c = *s; + if (c < 0x80) { + if (pwc) + *pwc = c; + return c ? 1 : 0; + } + else if (c < 0xc2) + return -1; + else if (c < 0xe0) { + if (n >= 2 && (s[1] & 0xc0) == 0x80) { + if (pwc) + *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f); + return 2; + } + else + return -1; + } + else if (c < 0xf0) + k = 3; + else if (c < 0xf8) + k = 4; + else if (c < 0xfc) + k = 5; + else if (c < 0xfe) + k = 6; + else + return -1; + + if (n < (size_t)k) + return -1; + wc = *s++ & ((1 << (7 - k)) - 1); + for (i = 1; i < k; i++) { + if ((*s & 0xc0) != 0x80) + return -1; + wc = (wc << 6) | (*s++ & 0x3f); + } + if (wc < (1 << (5 * k - 4))) + return -1; + if (pwc) + *pwc = wc; + return k; +} + +int utf8_wctomb(char *s, int wc1) +{ + uint32_t wc = wc1; + + if (!s) + return 0; + if (wc < (1u << 7)) { + *s++ = wc; + return 1; + } + else if (wc < (1u << 11)) { + *s++ = 0xc0 | (wc >> 6); + *s++ = 0x80 | (wc & 0x3f); + return 2; + } + else if (wc < (1u << 16)) { + *s++ = 0xe0 | (wc >> 12); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 3; + } + else if (wc < (1u << 21)) { + *s++ = 0xf0 | (wc >> 18); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 4; + } + else if (wc < (1u << 26)) { + *s++ = 0xf8 | (wc >> 24); + *s++ = 0x80 | ((wc >> 18) & 0x3f); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 5; + } + else if (wc < (1u << 31)) { + *s++ = 0xfc | (wc >> 30); + *s++ = 0x80 | ((wc >> 24) & 0x3f); + *s++ = 0x80 | ((wc >> 18) & 0x3f); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 6; + } + else + return -1; +} + +/* + * The charset "object" and methods. + */ + +struct charset { + int max; + int (*mbtowc)(void *table, int *pwc, const char *s, size_t n); + int (*wctomb)(void *table, char *s, int wc); + void *map; +}; + +int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n) +{ + return (*charset->mbtowc)(charset->map, pwc, s, n); +} + +int charset_wctomb(struct charset *charset, char *s, int wc) +{ + return (*charset->wctomb)(charset->map, s, wc); +} + +int charset_max(struct charset *charset) +{ + return charset->max; +} + +/* + * Implementation of UTF-8. + */ + +static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n) +{ + (void)map; + return utf8_mbtowc(pwc, s, n); +} + +static int wctomb_utf8(void *map, char *s, int wc) +{ + (void)map; + return utf8_wctomb(s, wc); +} + +/* + * Implementation of US-ASCII. + * Probably on most architectures this compiles to less than 256 bytes + * of code, so we can save space by not having a table for this one. + */ + +static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n) +{ + int wc; + + (void)map; + if (!n || !s) + return 0; + wc = (uint8_t)*s; + if (wc & ~0x7f) + return -1; + if (pwc) + *pwc = wc; + return wc ? 1 : 0; +} + +static int wctomb_ascii(void *map, char *s, int wc) +{ + (void)map; + if (!s) + return 0; + if (wc & ~0x7f) + return -1; + *s = wc; + return 1; +} + +/* + * Implementation of ISO-8859-1. + * Probably on most architectures this compiles to less than 256 bytes + * of code, so we can save space by not having a table for this one. + */ + +static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n) +{ + int wc; + + (void)map; + if (!n || !s) + return 0; + wc = (uint8_t)*s; + if (wc & ~0xff) + return -1; + if (pwc) + *pwc = wc; + return wc ? 1 : 0; +} + +static int wctomb_iso1(void *map, char *s, int wc) +{ + (void)map; + if (!s) + return 0; + if (wc & ~0xff) + return -1; + *s = wc; + return 1; +} + +/* + * Implementation of any 8-bit charset. + */ + +struct map { + const uint16_t *from; + struct inverse_map *to; +}; + +static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n) +{ + struct map *map = map1; + uint16_t wc; + + if (!n || !s) + return 0; + wc = map->from[(uint8_t)*s]; + if (wc == 0xffff) + return -1; + if (pwc) + *pwc = (int)wc; + return wc ? 1 : 0; +} + +/* + * For the inverse map we use a hash table, which has the advantages + * of small constant memory requirement and simple memory allocation, + * but the disadvantage of slow conversion in the worst case. + * If you need real-time performance while letting a potentially + * malicious user define their own map, then the method used in + * linux/drivers/char/consolemap.c would be more appropriate. + */ + +struct inverse_map { + uint8_t first[256]; + uint8_t next[256]; +}; + +/* + * The simple hash is good enough for this application. + * Use the alternative trivial hashes for testing. + */ +#define HASH(i) ((i) & 0xff) +/* #define HASH(i) 0 */ +/* #define HASH(i) 99 */ + +static struct inverse_map *make_inverse_map(const uint16_t *from) +{ + struct inverse_map *to; + char used[256]; + int i, j, k; + + to = malloc(sizeof(struct inverse_map)); + if (!to) + return 0; + for (i = 0; i < 256; i++) + to->first[i] = to->next[i] = used[i] = 0; + for (i = 255; i >= 0; i--) + if (from[i] != 0xffff) { + k = HASH(from[i]); + to->next[i] = to->first[k]; + to->first[k] = i; + used[k] = 1; + } + + /* Point the empty buckets at an empty list. */ + for (i = 0; i < 256; i++) + if (!to->next[i]) + break; + if (i < 256) + for (j = 0; j < 256; j++) + if (!used[j]) + to->first[j] = i; + + return to; +} + +static int wctomb_8bit(void *map1, char *s, int wc1) +{ + struct map *map = map1; + uint16_t wc = wc1; + int i; + + if (!s) + return 0; + + if (wc1 & ~0xffff) + return -1; + + if (1) /* Change 1 to 0 to test the case where malloc fails. */ + if (!map->to) + map->to = make_inverse_map(map->from); + + if (map->to) { + /* Use the inverse map. */ + i = map->to->first[HASH(wc)]; + for (;;) { + if (map->from[i] == wc) { + *s = i; + return 1; + } + if (!(i = map->to->next[i])) + break; + } + } + else { + /* We don't have an inverse map, so do a linear search. */ + for (i = 0; i < 256; i++) + if (map->from[i] == wc) { + *s = i; + return 1; + } + } + + return -1; +} + +/* + * The "constructor" charset_find(). + */ + +struct charset charset_utf8 = { + 6, + &mbtowc_utf8, + &wctomb_utf8, + 0 +}; + +struct charset charset_iso1 = { + 1, + &mbtowc_iso1, + &wctomb_iso1, + 0 +}; + +struct charset charset_ascii = { + 1, + &mbtowc_ascii, + &wctomb_ascii, + 0 +}; + +struct charset *charset_find(const char *code) +{ + int i; + + /* Find good (MIME) name. */ + for (i = 0; names[i].bad; i++) + if (!ascii_strcasecmp(code, names[i].bad)) { + code = names[i].good; + break; + } + + /* Recognise some charsets for which we avoid using a table. */ + if (!ascii_strcasecmp(code, "UTF-8")) + return &charset_utf8; + if (!ascii_strcasecmp(code, "US-ASCII")) + return &charset_ascii; + if (!ascii_strcasecmp(code, "ISO-8859-1")) + return &charset_iso1; + + /* Look for a mapping for a simple 8-bit encoding. */ + for (i = 0; maps[i].name; i++) + if (!ascii_strcasecmp(code, maps[i].name)) { + if (!maps[i].charset) { + maps[i].charset = malloc(sizeof(struct charset)); + if (maps[i].charset) { + struct map *map = malloc(sizeof(struct map)); + if (!map) { + free(maps[i].charset); + maps[i].charset = 0; + } + else { + maps[i].charset->max = 1; + maps[i].charset->mbtowc = &mbtowc_8bit; + maps[i].charset->wctomb = &wctomb_8bit; + maps[i].charset->map = map; + map->from = maps[i].map; + map->to = 0; /* inverse mapping is created when required */ + } + } + } + return maps[i].charset; + } + + return 0; +} + +/* + * Function to convert a buffer from one encoding to another. + * Invalid bytes are replaced by '#', and characters that are + * not available in the target encoding are replaced by '?'. + * Each of TO and TOLEN may be zero, if the result is not needed. + * The output buffer is null-terminated, so it is all right to + * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). + */ + +int charset_convert(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen) +{ + int ret = 0; + struct charset *charset1, *charset2; + char *tobuf, *p; + int i, j, wc; + + charset1 = charset_find(fromcode); + charset2 = charset_find(tocode); + if (!charset1 || !charset2 ) + return -1; + + tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1); + if (!tobuf) + return -2; + + for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) { + i = charset_mbtowc(charset1, &wc, from, fromlen); + if (!i) + i = 1; + else if (i == -1) { + i = 1; + wc = '#'; + ret = 2; + } + j = charset_wctomb(charset2, p, wc); + if (j == -1) { + if (!ret) + ret = 1; + j = charset_wctomb(charset2, p, '?'); + if (j == -1) + j = 0; + } + } + + if (tolen) + *tolen = p - tobuf; + *p++ = '\0'; + if (to) { + char *tobuf_saved = tobuf; + *to = realloc(tobuf, p - tobuf); + if (*to == NULL) + *to = tobuf_saved; + } + else + free(tobuf); + + return ret; +} + +#endif /* USE_CHARSET_ICONV */ |