diff options
Diffstat (limited to 'src/share/utf8')
-rw-r--r-- | src/share/utf8/CMakeLists.txt | 8 | ||||
-rw-r--r-- | src/share/utf8/charmaps.h | 57 | ||||
-rw-r--r-- | src/share/utf8/charset.c | 534 | ||||
-rw-r--r-- | src/share/utf8/charset.h | 72 | ||||
-rw-r--r-- | src/share/utf8/charset_test.c | 263 | ||||
-rw-r--r-- | src/share/utf8/iconvert.c | 257 | ||||
-rw-r--r-- | src/share/utf8/iconvert.h | 49 | ||||
-rw-r--r-- | src/share/utf8/makemap.c | 81 | ||||
-rw-r--r-- | src/share/utf8/utf8.c | 202 |
9 files changed, 1523 insertions, 0 deletions
diff --git a/src/share/utf8/CMakeLists.txt b/src/share/utf8/CMakeLists.txt new file mode 100644 index 0000000..389b09e --- /dev/null +++ b/src/share/utf8/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.12) + +add_library(utf8 STATIC + charset.c + iconvert.c + utf8.c) + +target_link_libraries(utf8 PUBLIC grabbag $<TARGET_NAME_IF_EXISTS:Iconv::Iconv>) diff --git a/src/share/utf8/charmaps.h b/src/share/utf8/charmaps.h new file mode 100644 index 0000000..16d049a --- /dev/null +++ b/src/share/utf8/charmaps.h @@ -0,0 +1,57 @@ + +/* + * If you need to generate more maps, use makemap.c on a system + * with a decent iconv. + */ + +static const uint16_t mapping_iso_8859_2[256] = { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, + 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, + 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, + 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, + 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, + 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, + 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, + 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, + 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, + 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, + 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, + 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, + 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, + 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, + 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, + 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, + 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 +}; + +static struct { + const char *name; + const uint16_t *map; + struct charset *charset; +} maps[] = { + { "ISO-8859-2", mapping_iso_8859_2, 0 }, + { 0, 0, 0 } +}; + +static const struct { + const char *bad; + const char *good; +} names[] = { + { "ANSI_X3.4-1968", "us-ascii" }, + { 0, 0 } +}; diff --git a/src/share/utf8/charset.c b/src/share/utf8/charset.c new file mode 100644 index 0000000..5c5693d --- /dev/null +++ b/src/share/utf8/charset.c @@ -0,0 +1,534 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * See the corresponding header file for a description of the functions + * that this file provides. + * + * This was first written for Ogg Vorbis but could be of general use. + * + * The only deliberate assumption about data sizes is that a short has + * at least 16 bits, but this code has only been tested on systems with + * 8-bit char, 16-bit short and 32-bit int. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */ + +#include <stdlib.h> + +#include "share/alloc.h" +#include "charset.h" + +#include "charmaps.h" + +/* + * This is like the standard strcasecmp, but it does not depend + * on the locale. Locale-dependent functions can be dangerous: + * we once had a bug involving strcasecmp("iso", "ISO") in a + * Turkish locale! + * + * (I'm not really sure what the official standard says + * about the sign of strcasecmp("Z", "["), but usually + * we're only interested in whether it's zero.) + */ + +static int ascii_strcasecmp(const char *s1, const char *s2) +{ + char c1, c2; + + for (;; s1++, s2++) { + if (!*s1 || !*s2) + break; + if (*s1 == *s2) + continue; + c1 = *s1; + if ('a' <= c1 && c1 <= 'z') + c1 += 'A' - 'a'; + c2 = *s2; + if ('a' <= c2 && c2 <= 'z') + c2 += 'A' - 'a'; + if (c1 != c2) + break; + } + return (uint8_t)*s1 - (uint8_t)*s2; +} + +/* + * UTF-8 equivalents of the C library's wctomb() and mbtowc(). + */ + +int utf8_mbtowc(int *pwc, const char *s, size_t n) +{ + uint8_t c; + int wc, i, k; + + if (!n || !s) + return 0; + + c = *s; + if (c < 0x80) { + if (pwc) + *pwc = c; + return c ? 1 : 0; + } + else if (c < 0xc2) + return -1; + else if (c < 0xe0) { + if (n >= 2 && (s[1] & 0xc0) == 0x80) { + if (pwc) + *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f); + return 2; + } + else + return -1; + } + else if (c < 0xf0) + k = 3; + else if (c < 0xf8) + k = 4; + else if (c < 0xfc) + k = 5; + else if (c < 0xfe) + k = 6; + else + return -1; + + if (n < (size_t)k) + return -1; + wc = *s++ & ((1 << (7 - k)) - 1); + for (i = 1; i < k; i++) { + if ((*s & 0xc0) != 0x80) + return -1; + wc = (wc << 6) | (*s++ & 0x3f); + } + if (wc < (1 << (5 * k - 4))) + return -1; + if (pwc) + *pwc = wc; + return k; +} + +int utf8_wctomb(char *s, int wc1) +{ + uint32_t wc = wc1; + + if (!s) + return 0; + if (wc < (1u << 7)) { + *s++ = wc; + return 1; + } + else if (wc < (1u << 11)) { + *s++ = 0xc0 | (wc >> 6); + *s++ = 0x80 | (wc & 0x3f); + return 2; + } + else if (wc < (1u << 16)) { + *s++ = 0xe0 | (wc >> 12); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 3; + } + else if (wc < (1u << 21)) { + *s++ = 0xf0 | (wc >> 18); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 4; + } + else if (wc < (1u << 26)) { + *s++ = 0xf8 | (wc >> 24); + *s++ = 0x80 | ((wc >> 18) & 0x3f); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 5; + } + else if (wc < (1u << 31)) { + *s++ = 0xfc | (wc >> 30); + *s++ = 0x80 | ((wc >> 24) & 0x3f); + *s++ = 0x80 | ((wc >> 18) & 0x3f); + *s++ = 0x80 | ((wc >> 12) & 0x3f); + *s++ = 0x80 | ((wc >> 6) & 0x3f); + *s++ = 0x80 | (wc & 0x3f); + return 6; + } + else + return -1; +} + +/* + * The charset "object" and methods. + */ + +struct charset { + int max; + int (*mbtowc)(void *table, int *pwc, const char *s, size_t n); + int (*wctomb)(void *table, char *s, int wc); + void *map; +}; + +int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n) +{ + return (*charset->mbtowc)(charset->map, pwc, s, n); +} + +int charset_wctomb(struct charset *charset, char *s, int wc) +{ + return (*charset->wctomb)(charset->map, s, wc); +} + +int charset_max(struct charset *charset) +{ + return charset->max; +} + +/* + * Implementation of UTF-8. + */ + +static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n) +{ + (void)map; + return utf8_mbtowc(pwc, s, n); +} + +static int wctomb_utf8(void *map, char *s, int wc) +{ + (void)map; + return utf8_wctomb(s, wc); +} + +/* + * Implementation of US-ASCII. + * Probably on most architectures this compiles to less than 256 bytes + * of code, so we can save space by not having a table for this one. + */ + +static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n) +{ + int wc; + + (void)map; + if (!n || !s) + return 0; + wc = (uint8_t)*s; + if (wc & ~0x7f) + return -1; + if (pwc) + *pwc = wc; + return wc ? 1 : 0; +} + +static int wctomb_ascii(void *map, char *s, int wc) +{ + (void)map; + if (!s) + return 0; + if (wc & ~0x7f) + return -1; + *s = wc; + return 1; +} + +/* + * Implementation of ISO-8859-1. + * Probably on most architectures this compiles to less than 256 bytes + * of code, so we can save space by not having a table for this one. + */ + +static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n) +{ + int wc; + + (void)map; + if (!n || !s) + return 0; + wc = (uint8_t)*s; + if (wc & ~0xff) + return -1; + if (pwc) + *pwc = wc; + return wc ? 1 : 0; +} + +static int wctomb_iso1(void *map, char *s, int wc) +{ + (void)map; + if (!s) + return 0; + if (wc & ~0xff) + return -1; + *s = wc; + return 1; +} + +/* + * Implementation of any 8-bit charset. + */ + +struct map { + const uint16_t *from; + struct inverse_map *to; +}; + +static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n) +{ + struct map *map = map1; + uint16_t wc; + + if (!n || !s) + return 0; + wc = map->from[(uint8_t)*s]; + if (wc == 0xffff) + return -1; + if (pwc) + *pwc = (int)wc; + return wc ? 1 : 0; +} + +/* + * For the inverse map we use a hash table, which has the advantages + * of small constant memory requirement and simple memory allocation, + * but the disadvantage of slow conversion in the worst case. + * If you need real-time performance while letting a potentially + * malicious user define their own map, then the method used in + * linux/drivers/char/consolemap.c would be more appropriate. + */ + +struct inverse_map { + uint8_t first[256]; + uint8_t next[256]; +}; + +/* + * The simple hash is good enough for this application. + * Use the alternative trivial hashes for testing. + */ +#define HASH(i) ((i) & 0xff) +/* #define HASH(i) 0 */ +/* #define HASH(i) 99 */ + +static struct inverse_map *make_inverse_map(const uint16_t *from) +{ + struct inverse_map *to; + char used[256]; + int i, j, k; + + to = malloc(sizeof(struct inverse_map)); + if (!to) + return 0; + for (i = 0; i < 256; i++) + to->first[i] = to->next[i] = used[i] = 0; + for (i = 255; i >= 0; i--) + if (from[i] != 0xffff) { + k = HASH(from[i]); + to->next[i] = to->first[k]; + to->first[k] = i; + used[k] = 1; + } + + /* Point the empty buckets at an empty list. */ + for (i = 0; i < 256; i++) + if (!to->next[i]) + break; + if (i < 256) + for (j = 0; j < 256; j++) + if (!used[j]) + to->first[j] = i; + + return to; +} + +static int wctomb_8bit(void *map1, char *s, int wc1) +{ + struct map *map = map1; + uint16_t wc = wc1; + int i; + + if (!s) + return 0; + + if (wc1 & ~0xffff) + return -1; + + if (1) /* Change 1 to 0 to test the case where malloc fails. */ + if (!map->to) + map->to = make_inverse_map(map->from); + + if (map->to) { + /* Use the inverse map. */ + i = map->to->first[HASH(wc)]; + for (;;) { + if (map->from[i] == wc) { + *s = i; + return 1; + } + if (!(i = map->to->next[i])) + break; + } + } + else { + /* We don't have an inverse map, so do a linear search. */ + for (i = 0; i < 256; i++) + if (map->from[i] == wc) { + *s = i; + return 1; + } + } + + return -1; +} + +/* + * The "constructor" charset_find(). + */ + +struct charset charset_utf8 = { + 6, + &mbtowc_utf8, + &wctomb_utf8, + 0 +}; + +struct charset charset_iso1 = { + 1, + &mbtowc_iso1, + &wctomb_iso1, + 0 +}; + +struct charset charset_ascii = { + 1, + &mbtowc_ascii, + &wctomb_ascii, + 0 +}; + +struct charset *charset_find(const char *code) +{ + int i; + + /* Find good (MIME) name. */ + for (i = 0; names[i].bad; i++) + if (!ascii_strcasecmp(code, names[i].bad)) { + code = names[i].good; + break; + } + + /* Recognise some charsets for which we avoid using a table. */ + if (!ascii_strcasecmp(code, "UTF-8")) + return &charset_utf8; + if (!ascii_strcasecmp(code, "US-ASCII")) + return &charset_ascii; + if (!ascii_strcasecmp(code, "ISO-8859-1")) + return &charset_iso1; + + /* Look for a mapping for a simple 8-bit encoding. */ + for (i = 0; maps[i].name; i++) + if (!ascii_strcasecmp(code, maps[i].name)) { + if (!maps[i].charset) { + maps[i].charset = malloc(sizeof(struct charset)); + if (maps[i].charset) { + struct map *map = malloc(sizeof(struct map)); + if (!map) { + free(maps[i].charset); + maps[i].charset = 0; + } + else { + maps[i].charset->max = 1; + maps[i].charset->mbtowc = &mbtowc_8bit; + maps[i].charset->wctomb = &wctomb_8bit; + maps[i].charset->map = map; + map->from = maps[i].map; + map->to = 0; /* inverse mapping is created when required */ + } + } + } + return maps[i].charset; + } + + return 0; +} + +/* + * Function to convert a buffer from one encoding to another. + * Invalid bytes are replaced by '#', and characters that are + * not available in the target encoding are replaced by '?'. + * Each of TO and TOLEN may be zero, if the result is not needed. + * The output buffer is null-terminated, so it is all right to + * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). + */ + +int charset_convert(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen) +{ + int ret = 0; + struct charset *charset1, *charset2; + char *tobuf, *p; + int i, j, wc; + + charset1 = charset_find(fromcode); + charset2 = charset_find(tocode); + if (!charset1 || !charset2 ) + return -1; + + tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1); + if (!tobuf) + return -2; + + for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) { + i = charset_mbtowc(charset1, &wc, from, fromlen); + if (!i) + i = 1; + else if (i == -1) { + i = 1; + wc = '#'; + ret = 2; + } + j = charset_wctomb(charset2, p, wc); + if (j == -1) { + if (!ret) + ret = 1; + j = charset_wctomb(charset2, p, '?'); + if (j == -1) + j = 0; + } + } + + if (tolen) + *tolen = p - tobuf; + *p++ = '\0'; + if (to) { + char *tobuf_saved = tobuf; + *to = realloc(tobuf, p - tobuf); + if (*to == NULL) + *to = tobuf_saved; + } + else + free(tobuf); + + return ret; +} + +#endif /* USE_CHARSET_ICONV */ diff --git a/src/share/utf8/charset.h b/src/share/utf8/charset.h new file mode 100644 index 0000000..ea8e31e --- /dev/null +++ b/src/share/utf8/charset.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <stdlib.h> + +/* + * These functions are like the C library's mbtowc() and wctomb(), + * but instead of depending on the locale they always work in UTF-8, + * and they use int instead of wchar_t. + */ + +int utf8_mbtowc(int *pwc, const char *s, size_t n); +int utf8_wctomb(char *s, int wc); + +/* + * This is an object-oriented version of mbtowc() and wctomb(). + * The caller first uses charset_find() to get a pointer to struct + * charset, then uses the mbtowc() and wctomb() methods on it. + * The function charset_max() gives the maximum length of a + * multibyte character in that encoding. + * This API is only appropriate for stateless encodings like UTF-8 + * or ISO-8859-3, but I have no intention of implementing anything + * other than UTF-8 and 8-bit encodings. + * + * MINOR BUG: If there is no memory charset_find() may return 0 and + * there is no way to distinguish this case from an unknown encoding. + */ + +struct charset; + +struct charset *charset_find(const char *code); + +int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n); +int charset_wctomb(struct charset *charset, char *s, int wc); +int charset_max(struct charset *charset); + +/* + * Function to convert a buffer from one encoding to another. + * Invalid bytes are replaced by '#', and characters that are + * not available in the target encoding are replaced by '?'. + * Each of TO and TOLEN may be zero if the result is not wanted. + * The input or output may contain null bytes, but the output + * buffer is also null-terminated, so it is all right to + * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). + * + * Return value: + * + * -2 : memory allocation failed + * -1 : unknown encoding + * 0 : data was converted exactly + * 1 : valid data was converted approximately (using '?') + * 2 : input was invalid (but still converted, using '#') + */ + +int charset_convert(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen); diff --git a/src/share/utf8/charset_test.c b/src/share/utf8/charset_test.c new file mode 100644 index 0000000..6761100 --- /dev/null +++ b/src/share/utf8/charset_test.c @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <assert.h> +#include <string.h> + +#include "charset.h" + +void test_any(struct charset *charset) +{ + int wc; + char s[2]; + + assert(charset); + + /* Decoder */ + + assert(charset_mbtowc(charset, 0, 0, 0) == 0); + assert(charset_mbtowc(charset, 0, 0, 1) == 0); + assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0); + + assert(charset_mbtowc(charset, 0, "a", 0) == 0); + assert(charset_mbtowc(charset, 0, "", 1) == 0); + assert(charset_mbtowc(charset, 0, "b", 1) == 1); + assert(charset_mbtowc(charset, 0, "", 2) == 0); + assert(charset_mbtowc(charset, 0, "c", 2) == 1); + + wc = 'x'; + assert(charset_mbtowc(charset, &wc, "a", 0) == 0 && wc == 'x'); + assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0); + assert(charset_mbtowc(charset, &wc, "b", 1) == 1 && wc == 'b'); + assert(charset_mbtowc(charset, &wc, "", 2) == 0 && wc == 0); + assert(charset_mbtowc(charset, &wc, "c", 2) == 1 && wc == 'c'); + + /* Encoder */ + + assert(charset_wctomb(charset, 0, 0) == 0); + + s[0] = s[1] = '.'; + assert(charset_wctomb(charset, s, 0) == 1 && + s[0] == '\0' && s[1] == '.'); + assert(charset_wctomb(charset, s, 'x') == 1 && + s[0] == 'x' && s[1] == '.'); +} + +void test_utf8() +{ + struct charset *charset; + int wc; + char s[8]; + + charset = charset_find("UTF-8"); + test_any(charset); + + /* Decoder */ + wc = 0; + assert(charset_mbtowc(charset, &wc, "\177", 1) == 1 && wc == 127); + assert(charset_mbtowc(charset, &wc, "\200", 2) == -1); + assert(charset_mbtowc(charset, &wc, "\301\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\302\200", 1) == -1); + assert(charset_mbtowc(charset, &wc, "\302\200", 2) == 2 && wc == 128); + assert(charset_mbtowc(charset, &wc, "\302\200", 3) == 2 && wc == 128); + assert(charset_mbtowc(charset, &wc, "\340\237\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\340\240\200", 9) == 3 && + wc == 1 << 11); + assert(charset_mbtowc(charset, &wc, "\360\217\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\220\200\200", 9) == 4 && + wc == 1 << 16); + assert(charset_mbtowc(charset, &wc, "\370\207\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\370\210\200\200\200", 9) == 5 && + wc == 1 << 21); + assert(charset_mbtowc(charset, &wc, "\374\203\277\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\374\204\200\200\200\200", 9) == 6 && + wc == 1 << 26); + assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 && + wc == 0x7fffffff); + + assert(charset_mbtowc(charset, &wc, "\302\000", 2) == -1); + assert(charset_mbtowc(charset, &wc, "\302\300", 2) == -1); + assert(charset_mbtowc(charset, &wc, "\340\040\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\340\340\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\340\240\000", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\340\240\300", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\020\200\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\320\200\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\220\000\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\220\300\200", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\220\200\000", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\360\220\200\300", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\077\277\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\377\277\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\277\077\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\277\377\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\277\277\277\077\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\277\277\277\377\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\077", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\377", 9) == -1); + + assert(charset_mbtowc(charset, &wc, "\376\277\277\277\277\277", 9) == -1); + assert(charset_mbtowc(charset, &wc, "\377\277\277\277\277\277", 9) == -1); + + /* Encoder */ + safe_strncpy(s, ".......", sizeof(s)); + assert(charset_wctomb(charset, s, 1u << 31) == -1 && + !strcmp(s, ".......")); + assert(charset_wctomb(charset, s, 127) == 1 && + !strcmp(s, "\177......")); + assert(charset_wctomb(charset, s, 128) == 2 && + !strcmp(s, "\302\200.....")); + assert(charset_wctomb(charset, s, 0x7ff) == 2 && + !strcmp(s, "\337\277.....")); + assert(charset_wctomb(charset, s, 0x800) == 3 && + !strcmp(s, "\340\240\200....")); + assert(charset_wctomb(charset, s, 0xffff) == 3 && + !strcmp(s, "\357\277\277....")); + assert(charset_wctomb(charset, s, 0x10000) == 4 && + !strcmp(s, "\360\220\200\200...")); + assert(charset_wctomb(charset, s, 0x1fffff) == 4 && + !strcmp(s, "\367\277\277\277...")); + assert(charset_wctomb(charset, s, 0x200000) == 5 && + !strcmp(s, "\370\210\200\200\200..")); + assert(charset_wctomb(charset, s, 0x3ffffff) == 5 && + !strcmp(s, "\373\277\277\277\277..")); + assert(charset_wctomb(charset, s, 0x4000000) == 6 && + !strcmp(s, "\374\204\200\200\200\200.")); + assert(charset_wctomb(charset, s, 0x7fffffff) == 6 && + !strcmp(s, "\375\277\277\277\277\277.")); +} + +void test_ascii() +{ + struct charset *charset; + int wc; + char s[3]; + + charset = charset_find("us-ascii"); + test_any(charset); + + /* Decoder */ + wc = 0; + assert(charset_mbtowc(charset, &wc, "\177", 2) == 1 && wc == 127); + assert(charset_mbtowc(charset, &wc, "\200", 2) == -1); + + /* Encoder */ + safe_strncpy(s, "..", sizeof(s)); + assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, "..")); + assert(charset_wctomb(charset, s, 255) == -1); + assert(charset_wctomb(charset, s, 128) == -1); + assert(charset_wctomb(charset, s, 127) == 1 && !strcmp(s, "\177.")); +} + +void test_iso1() +{ + struct charset *charset; + int wc; + char s[3]; + + charset = charset_find("iso-8859-1"); + test_any(charset); + + /* Decoder */ + wc = 0; + assert(charset_mbtowc(charset, &wc, "\302\200", 9) == 1 && wc == 0xc2); + + /* Encoder */ + safe_strncpy(s, "..", sizeof(s)); + assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, "..")); + assert(charset_wctomb(charset, s, 255) == 1 && !strcmp(s, "\377.")); + assert(charset_wctomb(charset, s, 128) == 1 && !strcmp(s, "\200.")); +} + +void test_iso2() +{ + struct charset *charset; + int wc; + char s[3]; + + charset = charset_find("iso-8859-2"); + test_any(charset); + + /* Decoder */ + wc = 0; + assert(charset_mbtowc(charset, &wc, "\302\200", 9) == 1 && wc == 0xc2); + assert(charset_mbtowc(charset, &wc, "\377", 2) == 1 && wc == 0x2d9); + + /* Encoder */ + safe_strncpy(s, "..", sizeof(s)); + assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, "..")); + assert(charset_wctomb(charset, s, 255) == -1 && !strcmp(s, "..")); + assert(charset_wctomb(charset, s, 258) == 1 && !strcmp(s, "\303.")); + assert(charset_wctomb(charset, s, 128) == 1 && !strcmp(s, "\200.")); +} + +void test_convert() +{ + const char *p; + char *q, *r; + char s[256]; + size_t n, n2; + int i; + + p = "\000x\302\200\375\277\277\277\277\277"; + assert(charset_convert("UTF-8", "UTF-8", p, 10, &q, &n) == 0 && + n == 10 && !strcmp(p, q)); + assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, &n) == 2 && + n == 4 && !strcmp(q, "x##y")); + assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, 0, &n) == 2 && + n == 4); + assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, 0) == 2 && + !strcmp(q, "x##y")); + assert(charset_convert("UTF-8", "iso-8859-1", + "\302\200\304\200x", 5, &q, &n) == 1 && + n == 3 && !strcmp(q, "\200?x")); + assert(charset_convert("iso-8859-1", "UTF-8", + "\000\200\377", 3, &q, &n) == 0 && + n == 5 && !memcmp(q, "\000\302\200\303\277", 5)); + assert(charset_convert("iso-8859-1", "iso-8859-1", + "\000\200\377", 3, &q, &n) == 0 && + n == 3 && !memcmp(q, "\000\200\377", 3)); + + assert(charset_convert("iso-8859-2", "utf-8", "\300", 1, &q, &n) == 0 && + n == 2 && !strcmp(q, "\305\224")); + assert(charset_convert("utf-8", "iso-8859-2", "\305\224", 2, &q, &n) == 0 && + n == 1 && !strcmp(q, "\300")); + + for (i = 0; i < 256; i++) + s[i] = i; + + assert(charset_convert("iso-8859-2", "utf-8", s, 256, &q, &n) == 0); + assert(charset_convert("utf-8", "iso-8859-2", q, n, &r, &n2) == 0); + assert(n2 == 256 && !memcmp(r, s, n2)); +} + +int main() +{ + test_utf8(); + test_ascii(); + test_iso1(); + test_iso2(); + + test_convert(); + + return 0; +} diff --git a/src/share/utf8/iconvert.c b/src/share/utf8/iconvert.c new file mode 100644 index 0000000..9a1e3f6 --- /dev/null +++ b/src/share/utf8/iconvert.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#if !defined _WIN32 && defined HAVE_ICONV + +#include <assert.h> +#include <errno.h> +#include <iconv.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "iconvert.h" +#include "share/alloc.h" +#include "share/safe_str.h" + +/* + * Convert data from one encoding to another. Return: + * + * -2 : memory allocation failed + * -1 : unknown encoding + * 0 : data was converted exactly + * 1 : data was converted inexactly + * 2 : data was invalid (but still converted) + * + * We convert in two steps, via UTF-8, as this is the only + * reliable way of distinguishing between invalid input + * and valid input which iconv refuses to transliterate. + * We convert from UTF-8 twice, because we have no way of + * knowing whether the conversion was exact if iconv returns + * E2BIG (due to a bug in the specification of iconv). + * An alternative approach is to assume that the output of + * iconv is never more than 4 times as long as the input, + * but I prefer to avoid that assumption if possible. + */ + +int iconvert(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen) +{ + int ret = 0; + iconv_t cd1, cd2; + char *ib; + char *ob; + char *utfbuf = 0, *outbuf, *newbuf; + size_t utflen, outlen, ibl, obl, obp, k; + char tbuf[2048]; + + cd1 = iconv_open("UTF-8", fromcode); + if (cd1 == (iconv_t)(-1)) + return -1; + + cd2 = (iconv_t)(-1); + /* Don't use strcasecmp() as it's locale-dependent. */ + if (!strchr("Uu", tocode[0]) || + !strchr("Tt", tocode[1]) || + !strchr("Ff", tocode[2]) || + tocode[3] != '-' || + tocode[4] != '8' || + tocode[5] != '\0') { + char *tocode1; + int rc; + /* + * Try using this non-standard feature of glibc and libiconv. + * This is deliberately not a config option as people often + * change their iconv library without rebuilding applications. + */ + + rc = asprintf(&tocode1, "%s//TRANSLIT", tocode); + if (rc < 0 || ! tocode1) + goto fail; + + cd2 = iconv_open(tocode1, "UTF-8"); + free(tocode1); + + if (cd2 == (iconv_t)(-1)) + cd2 = iconv_open(tocode, fromcode); + + if (cd2 == (iconv_t)(-1)) { + iconv_close(cd1); + return -1; + } + } + + utflen = 1; /*fromlen * 2 + 1; XXX */ + utfbuf = malloc(utflen); + if (!utfbuf) + goto fail; + + /* Convert to UTF-8 */ + ib = (char *)from; + ibl = fromlen; + ob = utfbuf; + obl = utflen; + for (;;) { + k = iconv(cd1, &ib, &ibl, &ob, &obl); + assert((!k && !ibl) || + (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) || + (k == (size_t)(-1) && + (errno == EILSEQ || errno == EINVAL) && ibl)); + if (!ibl) + break; + if (obl < 6) { + /* Enlarge the buffer */ + if(utflen*2 < utflen) /* overflow check */ + goto fail; + utflen *= 2; + obp = ob - utfbuf; /* save position */ + newbuf = realloc(utfbuf, utflen); + if (!newbuf) + goto fail; + ob = newbuf + obp; + obl = utflen - obp; + utfbuf = newbuf; + } + else { + /* Invalid input */ + ib++, ibl--; + *ob++ = '#', obl--; + ret = 2; + iconv(cd1, 0, 0, 0, 0); + } + } + + if (cd2 == (iconv_t)(-1)) { + /* The target encoding was UTF-8 */ + if (tolen) + *tolen = ob - utfbuf; + if (!to) { + free(utfbuf); + iconv_close(cd1); + return ret; + } + newbuf = safe_realloc_nofree_add_2op_(utfbuf, (ob - utfbuf), /*+*/1); + if (!newbuf) + goto fail; + ob = (ob - utfbuf) + newbuf; + *ob = '\0'; + *to = newbuf; + iconv_close(cd1); + return ret; + } + + /* Truncate the buffer to be tidy */ + utflen = ob - utfbuf; + if (utflen == 0) + goto fail; + newbuf = realloc(utfbuf, utflen); + if (!newbuf) + goto fail; + utfbuf = newbuf; + + /* Convert from UTF-8 to discover how long the output is */ + outlen = 0; + ib = utfbuf; + ibl = utflen; + while (ibl) { + ob = tbuf; + obl = sizeof(tbuf); + k = iconv(cd2, &ib, &ibl, &ob, &obl); + assert((k != (size_t)(-1) && !ibl) || + (k == (size_t)(-1) && errno == E2BIG && ibl) || + (k == (size_t)(-1) && errno == EILSEQ && ibl)); + if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) { + /* Replace one character */ + char *tb = "?"; + size_t tbl = 1; + + outlen += ob - tbuf; + ob = tbuf; + obl = sizeof(tbuf); + k = iconv(cd2, &tb, &tbl, &ob, &obl); + assert((!k && !tbl) || + (k == (size_t)(-1) && errno == EILSEQ && tbl)); + for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--) + ; + } + outlen += ob - tbuf; + } + ob = tbuf; + obl = sizeof(tbuf); + k = iconv(cd2, 0, 0, &ob, &obl); + assert(!k); + outlen += ob - tbuf; + + /* Convert from UTF-8 for real */ + outbuf = safe_malloc_add_2op_(outlen, /*+*/1); + if (!outbuf) + goto fail; + ib = utfbuf; + ibl = utflen; + ob = outbuf; + obl = outlen; + while (ibl) { + k = iconv(cd2, &ib, &ibl, &ob, &obl); + assert((k != (size_t)(-1) && !ibl) || + (k == (size_t)(-1) && errno == EILSEQ && ibl)); + if (k && !ret) + ret = 1; + if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) { + /* Replace one character */ + char *tb = "?"; + size_t tbl = 1; + + k = iconv(cd2, &tb, &tbl, &ob, &obl); + assert((!k && !tbl) || + (k == (size_t)(-1) && errno == EILSEQ && tbl)); + for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--) + ; + } + } + k = iconv(cd2, 0, 0, &ob, &obl); + assert(!k); + assert(!obl); + *ob = '\0'; + + free(utfbuf); + iconv_close(cd1); + iconv_close(cd2); + if (tolen) + *tolen = outlen; + if (!to) { + free(outbuf); + return ret; + } + *to = outbuf; + return ret; + + fail: + if(0 != utfbuf) + free(utfbuf); + iconv_close(cd1); + if (cd2 != (iconv_t)(-1)) + iconv_close(cd2); + return -2; +} + +#endif /* HAVE_ICONV */ diff --git a/src/share/utf8/iconvert.h b/src/share/utf8/iconvert.h new file mode 100644 index 0000000..a2d75a2 --- /dev/null +++ b/src/share/utf8/iconvert.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#ifdef HAVE_ICONV + +/* + * Convert data from one encoding to another. Return: + * + * -2 : memory allocation failed + * -1 : unknown encoding + * 0 : data was converted exactly + * 1 : data was converted inexactly + * 2 : data was invalid (but still converted) + * + * We convert in two steps, via UTF-8, as this is the only + * reliable way of distinguishing between invalid input + * and valid input which iconv refuses to transliterate. + * We convert from UTF-8 twice, because we have no way of + * knowing whether the conversion was exact if iconv returns + * E2BIG (due to a bug in the specification of iconv). + * An alternative approach is to assume that the output of + * iconv is never more than 4 times as long as the input, + * but I prefer to avoid that assumption if possible. + */ + +int iconvert(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen) ; + +#endif /* HAVE_ICONV */ diff --git a/src/share/utf8/makemap.c b/src/share/utf8/makemap.c new file mode 100644 index 0000000..790021c --- /dev/null +++ b/src/share/utf8/makemap.c @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <errno.h> +#include <iconv.h> +#include <stdio.h> + +int main(int argc, char *argv[]) +{ + iconv_t cd; + const char *ib; + char *ob; + size_t ibl, obl, k; + uint8_t c, buf[4]; + int i, wc; + + if (argc != 2) { + printf("Usage: %s ENCODING\n", argv[0]); + printf("Output a charset map for the 8-bit ENCODING.\n"); + return 1; + } + + cd = iconv_open("UCS-4", argv[1]); + if (cd == (iconv_t)(-1)) { + perror("iconv_open"); + return 1; + } + + for (i = 0; i < 256; i++) { + c = i; + ib = &c; + ibl = 1; + ob = buf; + obl = 4; + k = iconv(cd, &ib, &ibl, &ob, &obl); + if (!k && !ibl && !obl) { + wc = (buf[0] << 24) + (buf[1] << 16) + (buf[2] << 8) + buf[3]; + if (wc >= 0xffff) { + printf("Dodgy value.\n"); + return 1; + } + } + else if (k == (size_t)(-1) && errno == EILSEQ) + wc = 0xffff; + else { + printf("Non-standard iconv.\n"); + return 1; + } + + if (i % 8 == 0) + printf(" "); + printf("0x%04x", wc); + if (i == 255) + printf("\n"); + else if (i % 8 == 7) + printf(",\n"); + else + printf(", "); + } + + return 0; +} diff --git a/src/share/utf8/utf8.c b/src/share/utf8/utf8.c new file mode 100644 index 0000000..34af187 --- /dev/null +++ b/src/share/utf8/utf8.c @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com> + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * Buffer overflow checking added: Josh Coalson, 9/9/2007 + * + * Win32 part rewritten: lvqcl, 2/2/2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * Convert a string between UTF-8 and the locale's charset. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> + +#include "share/alloc.h" +#include "share/utf8.h" + +#ifdef _WIN32 + +#include <windows.h> + +int utf8_encode(const char *from, char **to) +{ + wchar_t *unicode = NULL; + char *utf8 = NULL; + int ret = -1; + + do { + int len; + + len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, -1, NULL, 0); + if(len == 0) break; + unicode = (wchar_t*) safe_malloc_mul_2op_((size_t)len, sizeof(wchar_t)); + if(unicode == NULL) break; + len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, -1, unicode, len); + if(len == 0) break; + + len = WideCharToMultiByte(CP_UTF8, 0, unicode, -1, NULL, 0, NULL, NULL); + if(len == 0) break; + utf8 = (char*) safe_malloc_mul_2op_((size_t)len, sizeof(char)); + if(utf8 == NULL) break; + len = WideCharToMultiByte(CP_UTF8, 0, unicode, -1, utf8, len, NULL, NULL); + if(len == 0) break; + + ret = 0; + + } while(0); + + free(unicode); + + if(ret == 0) { + *to = utf8; + } else { + free(utf8); + *to = NULL; + } + + return ret; +} + +int utf8_decode(const char *from, char **to) +{ + wchar_t *unicode = NULL; + char *acp = NULL; + int ret = -1; + + do { + int len; + + len = MultiByteToWideChar(CP_UTF8, 0, from, -1, NULL, 0); + if(len == 0) break; + unicode = (wchar_t*) safe_malloc_mul_2op_((size_t)len, sizeof(wchar_t)); + if(unicode == NULL) break; + len = MultiByteToWideChar(CP_UTF8, 0, from, -1, unicode, len); + if(len == 0) break; + + len = WideCharToMultiByte(CP_ACP, WC_COMPOSITECHECK, unicode, -1, NULL, 0, NULL, NULL); + if(len == 0) break; + acp = (char*) safe_malloc_mul_2op_((size_t)len, sizeof(char)); + if(acp == NULL) break; + len = WideCharToMultiByte(CP_ACP, WC_COMPOSITECHECK, unicode, -1, acp, len, NULL, NULL); + if(len == 0) break; + + ret = 0; + + } while(0); + + free(unicode); + + if(ret == 0) { + *to = acp; + } else { + free(acp); + *to = NULL; + } + + return ret; +} + +#else /* End win32. Rest is for real operating systems */ + + +#ifdef HAVE_LANGINFO_CODESET +#include <langinfo.h> +#endif + +#include <string.h> + +#include "share/safe_str.h" +#include "iconvert.h" +#include "charset.h" + +static const char *current_charset(void) +{ + const char *c = 0; +#ifdef HAVE_LANGINFO_CODESET + c = nl_langinfo(CODESET); +#endif + + if (!c) + c = getenv("CHARSET"); + + return c? c : "US-ASCII"; +} + +static int convert_buffer(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen) +{ + int ret = -1; + +#ifdef HAVE_ICONV + ret = iconvert(fromcode, tocode, from, fromlen, to, tolen); + if (ret != -1) + return ret; +#endif + +#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */ + ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen); + if (ret != -1) + return ret; +#endif + + return ret; +} + +static int convert_string(const char *fromcode, const char *tocode, + const char *from, char **to, char replace) +{ + int ret; + size_t fromlen; + char *s; + + fromlen = strlen(from); + ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0); + if (ret == -2) + return -1; + if (ret != -1) + return ret; + + s = safe_malloc_add_2op_(fromlen, /*+*/1); + if (!s) + return -1; + snprintf(s, fromlen + 1, "%s", from); + *to = s; + for (; *s; s++) + if (*s & ~0x7f) + *s = replace; + return 3; +} + +int utf8_encode(const char *from, char **to) +{ + return convert_string(current_charset(), "UTF-8", from, to, '#'); +} + +int utf8_decode(const char *from, char **to) +{ + return convert_string("UTF-8", current_charset(), from, to, '?'); +} + +#endif |