summaryrefslogtreecommitdiffstats
path: root/src/share/utf8
diff options
context:
space:
mode:
Diffstat (limited to 'src/share/utf8')
-rw-r--r--src/share/utf8/CMakeLists.txt8
-rw-r--r--src/share/utf8/charmaps.h57
-rw-r--r--src/share/utf8/charset.c534
-rw-r--r--src/share/utf8/charset.h72
-rw-r--r--src/share/utf8/charset_test.c263
-rw-r--r--src/share/utf8/iconvert.c257
-rw-r--r--src/share/utf8/iconvert.h49
-rw-r--r--src/share/utf8/makemap.c81
-rw-r--r--src/share/utf8/utf8.c202
9 files changed, 1523 insertions, 0 deletions
diff --git a/src/share/utf8/CMakeLists.txt b/src/share/utf8/CMakeLists.txt
new file mode 100644
index 0000000..389b09e
--- /dev/null
+++ b/src/share/utf8/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Builds the three conversion sources in this directory as the static library "utf8".
+cmake_minimum_required(VERSION 3.12)
+
+add_library(utf8 STATIC
+ charset.c
+ iconvert.c
+ utf8.c)
+
+# grabbag is linked PUBLIC, so it propagates to consumers of utf8.
+# Iconv::Iconv is added only when a target of that name exists.
+target_link_libraries(utf8 PUBLIC grabbag $<TARGET_NAME_IF_EXISTS:Iconv::Iconv>)
diff --git a/src/share/utf8/charmaps.h b/src/share/utf8/charmaps.h
new file mode 100644
index 0000000..16d049a
--- /dev/null
+++ b/src/share/utf8/charmaps.h
@@ -0,0 +1,57 @@
+
+/*
+ * If you need to generate more maps, use makemap.c on a system
+ * with a decent iconv.
+ */
+
+/*
+ * Forward table for ISO-8859-2: the index is the byte value and the
+ * entry is the 16-bit character code it decodes to.  Bytes 0x00-0xa0
+ * map to themselves; the remaining rows hold the charset-specific codes.
+ */
+static const uint16_t mapping_iso_8859_2[256] = {
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+ 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
+ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
+ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
+ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
+ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
+ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
+ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+ 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
+ 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
+ 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
+ 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
+ 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
+ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
+ 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
+ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
+ 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
+ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
+ 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
+ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
+ 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9
+};
+
+/*
+ * Registry of table-driven 8-bit charsets, terminated by an all-zero
+ * entry.  The charset member starts out as 0 and is filled in by
+ * charset_find() the first time the name is requested.
+ */
+static struct {
+ const char *name;
+ const uint16_t *map;
+ struct charset *charset;
+} maps[] = {
+ { "ISO-8859-2", mapping_iso_8859_2, 0 },
+ { 0, 0, 0 }
+};
+
+/*
+ * Alias table, terminated by an all-zero entry: charset_find() replaces
+ * a requested name equal to "bad" with the corresponding "good" name
+ * before looking the charset up.
+ */
+static const struct {
+ const char *bad;
+ const char *good;
+} names[] = {
+ { "ANSI_X3.4-1968", "us-ascii" },
+ { 0, 0 }
+};
diff --git a/src/share/utf8/charset.c b/src/share/utf8/charset.c
new file mode 100644
index 0000000..5c5693d
--- /dev/null
+++ b/src/share/utf8/charset.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * See the corresponding header file for a description of the functions
+ * that this file provides.
+ *
+ * This was first written for Ogg Vorbis but could be of general use.
+ *
+ * The only deliberate assumption about data sizes is that a short has
+ * at least 16 bits, but this code has only been tested on systems with
+ * 8-bit char, 16-bit short and 32-bit int.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#if !defined _WIN32 && !defined HAVE_ICONV /* should be && defined USE_CHARSET_CONVERT */
+
+#include <stdlib.h>
+
+#include "share/alloc.h"
+#include "charset.h"
+
+#include "charmaps.h"
+
+/*
+ * This is like the standard strcasecmp, but it does not depend
+ * on the locale. Locale-dependent functions can be dangerous:
+ * we once had a bug involving strcasecmp("iso", "ISO") in a
+ * Turkish locale!
+ *
+ * (I'm not really sure what the official standard says
+ * about the sign of strcasecmp("Z", "["), but usually
+ * we're only interested in whether it's zero.)
+ */
+
+/*
+ * Compare S1 and S2, treating the ASCII letters A-Z and a-z as equal
+ * and every other byte literally, independent of the current locale.
+ *
+ * Returns 0 when the strings match, otherwise the difference between
+ * the first pair of differing bytes after both have been folded to
+ * lower case, so the sign is a consistent case-insensitive ordering.
+ */
+static int ascii_strcasecmp(const char *s1, const char *s2)
+{
+	for (;; s1++, s2++) {
+		uint8_t c1 = (uint8_t)*s1;
+		uint8_t c2 = (uint8_t)*s2;
+
+		if ('A' <= c1 && c1 <= 'Z')
+			c1 += 'a' - 'A';
+		if ('A' <= c2 && c2 <= 'Z')
+			c2 += 'a' - 'A';
+
+		/* Stop at the first mismatch, or at the shared terminator. */
+		if (c1 != c2 || !c1)
+			return c1 - c2;
+	}
+}
+
+/*
+ * UTF-8 equivalents of the C library's wctomb() and mbtowc().
+ */
+
+/*
+ * Decode one UTF-8 sequence from the first N bytes of S.
+ *
+ * Returns 0 if S is null, N is zero or the sequence is a NUL byte;
+ * -1 if the bytes are not a valid, shortest-form sequence that fits
+ * in N bytes; otherwise the sequence length (1 to 6).  When PWC is
+ * non-null the decoded value is stored there for non-negative results
+ * that consumed input (a NUL byte stores 0).
+ */
+int utf8_mbtowc(int *pwc, const char *s, size_t n)
+{
+	const uint8_t *bytes = (const uint8_t *)s;
+	uint8_t lead;
+	int len, pos, value;
+
+	if (!s || !n)
+		return 0;
+
+	lead = bytes[0];
+	if (lead < 0x80) {
+		if (pwc)
+			*pwc = lead;
+		return lead != 0;
+	}
+
+	/* 0x80-0xc1 are stray continuation bytes or overlong 2-byte leads. */
+	if (lead < 0xc2 || lead >= 0xfe)
+		return -1;
+
+	if (lead < 0xe0)
+		len = 2;
+	else if (lead < 0xf0)
+		len = 3;
+	else if (lead < 0xf8)
+		len = 4;
+	else if (lead < 0xfc)
+		len = 5;
+	else
+		len = 6;
+
+	if (n < (size_t)len)
+		return -1;
+
+	/* The lead byte contributes its low (7 - len) bits. */
+	value = lead & ((1 << (7 - len)) - 1);
+	for (pos = 1; pos < len; pos++) {
+		if ((bytes[pos] & 0xc0) != 0x80)
+			return -1;
+		value = (value << 6) | (bytes[pos] & 0x3f);
+	}
+
+	/*
+	 * Reject overlong forms.  Two-byte sequences need no check here:
+	 * the lead byte range above already excludes them.
+	 */
+	if (len > 2 && value < (1 << (5 * len - 4)))
+		return -1;
+
+	if (pwc)
+		*pwc = value;
+	return len;
+}
+
+/*
+ * Encode WC1 as a UTF-8 sequence of 1 to 6 bytes at S.
+ *
+ * Returns 0 if S is null, -1 (writing nothing) if WC1 does not fit in
+ * 31 bits, otherwise the number of bytes stored.  No terminator is
+ * appended.
+ */
+int utf8_wctomb(char *s, int wc1)
+{
+	uint32_t wc = wc1;
+	int len, pos;
+
+	if (!s)
+		return 0;
+
+	if (wc < (1u << 7)) {
+		s[0] = wc;
+		return 1;
+	}
+	if (wc >= (1u << 31))
+		return -1;
+
+	if (wc < (1u << 11))
+		len = 2;
+	else if (wc < (1u << 16))
+		len = 3;
+	else if (wc < (1u << 21))
+		len = 4;
+	else if (wc < (1u << 26))
+		len = 5;
+	else
+		len = 6;
+
+	/* Fill the continuation bytes from the last one backwards. */
+	for (pos = len - 1; pos > 0; pos--) {
+		s[pos] = 0x80 | (wc & 0x3f);
+		wc >>= 6;
+	}
+
+	/* Lead byte: LEN one-bits, a zero bit, then the remaining payload. */
+	s[0] = ((0xff00u >> len) & 0xff) | wc;
+	return len;
+}
+
+/*
+ * The charset "object" and methods.
+ */
+
+/*
+ * One character set: its two conversion methods plus the data they
+ * need.  The methods follow the conventions of utf8_mbtowc() and
+ * utf8_wctomb(), with the map pointer passed as an extra first argument.
+ */
+struct charset {
+ int max; /* value reported by charset_max() */
+ int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
+ int (*wctomb)(void *table, char *s, int wc);
+ void *map; /* handed to both methods; 0 for the built-in charsets */
+};
+
+/* Decode one character from S using CHARSET's decoder method. */
+int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
+{
+	void *table = charset->map;
+
+	return charset->mbtowc(table, pwc, s, n);
+}
+
+/* Encode WC into S using CHARSET's encoder method. */
+int charset_wctomb(struct charset *charset, char *s, int wc)
+{
+	void *table = charset->map;
+
+	return charset->wctomb(table, s, wc);
+}
+
+/* Report the max field recorded for CHARSET. */
+int charset_max(struct charset *charset)
+{
+	int longest = charset->max;
+
+	return longest;
+}
+
+/*
+ * Implementation of UTF-8.
+ */
+
+/* Decoder method for UTF-8: forwards to utf8_mbtowc(). */
+static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
+{
+	(void)map; /* no table is needed for UTF-8 */
+
+	return utf8_mbtowc(pwc, s, n);
+}
+
+/* Encoder method for UTF-8: forwards to utf8_wctomb(). */
+static int wctomb_utf8(void *map, char *s, int wc)
+{
+	(void)map; /* no table is needed for UTF-8 */
+
+	return utf8_wctomb(s, wc);
+}
+
+/*
+ * Implementation of US-ASCII.
+ * Probably on most architectures this compiles to less than 256 bytes
+ * of code, so we can save space by not having a table for this one.
+ */
+
+/*
+ * Decoder method for US-ASCII.  Returns 0 for a null S, zero N or a
+ * NUL byte, -1 for a byte above 0x7f, and 1 otherwise; the byte value
+ * is stored through PWC when PWC is non-null and the byte is valid.
+ */
+static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
+{
+	uint8_t byte;
+
+	(void)map;
+	if (!s || !n)
+		return 0;
+
+	byte = (uint8_t)*s;
+	if (byte > 0x7f)
+		return -1;
+	if (pwc)
+		*pwc = byte;
+	return byte != 0;
+}
+
+/*
+ * Encoder method for US-ASCII.  Returns 0 for a null S, -1 (writing
+ * nothing) when WC is outside 0..0x7f, and 1 after storing the byte.
+ */
+static int wctomb_ascii(void *map, char *s, int wc)
+{
+	(void)map;
+	if (!s)
+		return 0;
+	if (wc < 0 || wc > 0x7f)
+		return -1;
+
+	s[0] = wc;
+	return 1;
+}
+
+/*
+ * Implementation of ISO-8859-1.
+ * Probably on most architectures this compiles to less than 256 bytes
+ * of code, so we can save space by not having a table for this one.
+ */
+
+/*
+ * Decoder method for ISO-8859-1, where every byte value is its own
+ * character code.  Returns 0 for a null S, zero N or a NUL byte, and 1
+ * otherwise; the byte value is stored through PWC when PWC is non-null.
+ * A single byte can never be out of range, so this decoder has no
+ * failure case.
+ */
+static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
+{
+	int wc;
+
+	(void)map;
+	if (!n || !s)
+		return 0;
+	wc = (uint8_t)*s;
+	if (pwc)
+		*pwc = wc;
+	return wc ? 1 : 0;
+}
+
+/*
+ * Encoder method for ISO-8859-1.  Returns 0 for a null S, -1 (writing
+ * nothing) when WC is outside 0..0xff, and 1 after storing the byte.
+ */
+static int wctomb_iso1(void *map, char *s, int wc)
+{
+	(void)map;
+	if (!s)
+		return 0;
+	if (wc < 0 || wc > 0xff)
+		return -1;
+
+	s[0] = wc;
+	return 1;
+}
+
+/*
+ * Implementation of any 8-bit charset.
+ */
+
+/* Per-charset data for the table-driven 8-bit implementation. */
+struct map {
+ const uint16_t *from; /* indexed by byte value; 0xffff marks a byte with no character */
+ struct inverse_map *to; /* 0 until wctomb_8bit() builds it on first use */
+};
+
+/*
+ * Decoder method for a table-driven 8-bit charset; MAP1 is a struct map.
+ * Returns 0 for a null S, zero N or a byte that decodes to 0, -1 for a
+ * byte whose table entry is 0xffff, and 1 otherwise.  The decoded value
+ * is stored through PWC when PWC is non-null and the byte is mapped.
+ */
+static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
+{
+	struct map *table = map1;
+	uint16_t code;
+
+	if (!s || !n)
+		return 0;
+
+	code = table->from[(uint8_t)*s];
+	if (code == 0xffff)
+		return -1;
+	if (pwc)
+		*pwc = (int)code;
+	return code != 0;
+}
+
+/*
+ * For the inverse map we use a hash table, which has the advantages
+ * of small constant memory requirement and simple memory allocation,
+ * but the disadvantage of slow conversion in the worst case.
+ * If you need real-time performance while letting a potentially
+ * malicious user define their own map, then the method used in
+ * linux/drivers/char/consolemap.c would be more appropriate.
+ */
+
+/*
+ * Chained hash table over byte values.  Both arrays hold byte values
+ * (indices into the forward table), not character codes.
+ */
+struct inverse_map {
+ uint8_t first[256]; /* indexed by HASH(code): first byte value to try */
+ uint8_t next[256]; /* indexed by byte value: next one to try, 0 ends the chain */
+};
+
+/*
+ * The simple hash is good enough for this application.
+ * Use the alternative trivial hashes for testing.
+ */
+/* Bucket number for a character code: its low eight bits. */
+#define HASH(i) ((i) & 0xff)
+/* #define HASH(i) 0 */
+/* #define HASH(i) 99 */
+
+/*
+ * Build the hash table used by wctomb_8bit() from the forward table
+ * FROM (256 entries, 0xffff meaning "no character for this byte").
+ * Returns a malloc'd struct inverse_map, or 0 if the allocation fails.
+ */
+static struct inverse_map *make_inverse_map(const uint16_t *from)
+{
+ struct inverse_map *to;
+ char used[256];
+ int i, j, k;
+
+ to = malloc(sizeof(struct inverse_map));
+ if (!to)
+ return 0;
+ for (i = 0; i < 256; i++)
+ to->first[i] = to->next[i] = used[i] = 0;
+ /*
+ * Walk the bytes from 255 down to 0 and push each mapped one onto the
+ * front of its bucket's chain.  Byte 0 is pushed last, so it is never
+ * another entry's successor and next[] can use 0 as the end mark.
+ */
+ for (i = 255; i >= 0; i--)
+ if (from[i] != 0xffff) {
+ k = HASH(from[i]);
+ to->next[i] = to->first[k];
+ to->first[k] = i;
+ used[k] = 1;
+ }
+
+ /* Point the empty buckets at an empty list. */
+ /* That is: find a byte with no successor and start every unused bucket there. */
+ for (i = 0; i < 256; i++)
+ if (!to->next[i])
+ break;
+ if (i < 256)
+ for (j = 0; j < 256; j++)
+ if (!used[j])
+ to->first[j] = i;
+
+ return to;
+}
+
+/*
+ * Encoder method for a table-driven 8-bit charset; MAP1 is a struct map.
+ *
+ * Returns 0 for a null S, 1 after storing the byte whose forward-table
+ * entry equals WC1, and -1 (writing nothing) when no byte maps to WC1.
+ * The inverse hash table is created on the first call; if that
+ * allocation fails the forward table is searched linearly instead.
+ */
+static int wctomb_8bit(void *map1, char *s, int wc1)
+{
+	struct map *map = map1;
+	uint16_t wc = wc1;
+	int i;
+
+	if (!s)
+		return 0;
+
+	/*
+	 * Only 16-bit codes can appear in the table.  0xffff is excluded as
+	 * well: it is the "no character" marker in the forward table, so
+	 * searching for it would match an unmapped byte.
+	 */
+	if ((wc1 & ~0xffff) || wc1 == 0xffff)
+		return -1;
+
+	if (1) { /* Change 1 to 0 to test the case where malloc fails. */
+		if (!map->to)
+			map->to = make_inverse_map(map->from);
+	}
+
+	if (map->to) {
+		/* Use the inverse map. */
+		i = map->to->first[HASH(wc)];
+		for (;;) {
+			if (map->from[i] == wc) {
+				*s = i;
+				return 1;
+			}
+			/* A successor of 0 ends the chain. */
+			if (!(i = map->to->next[i]))
+				break;
+		}
+	}
+	else {
+		/* We don't have an inverse map, so do a linear search. */
+		for (i = 0; i < 256; i++)
+			if (map->from[i] == wc) {
+				*s = i;
+				return 1;
+			}
+	}
+
+	return -1;
+}
+
+/*
+ * The "constructor" charset_find().
+ */
+
+/* Built-in charsets that need no lookup table. */
+struct charset charset_utf8 = {
+	.max = 6,
+	.mbtowc = &mbtowc_utf8,
+	.wctomb = &wctomb_utf8,
+	.map = 0
+};
+
+struct charset charset_iso1 = {
+	.max = 1,
+	.mbtowc = &mbtowc_iso1,
+	.wctomb = &wctomb_iso1,
+	.map = 0
+};
+
+struct charset charset_ascii = {
+	.max = 1,
+	.mbtowc = &mbtowc_ascii,
+	.wctomb = &wctomb_ascii,
+	.map = 0
+};
+
+/*
+ * Look up the charset called CODE (compared without regard to ASCII
+ * case).  Known aliases are replaced by their preferred name first.
+ * UTF-8, US-ASCII and ISO-8859-1 resolve to the built-in objects; a
+ * name found in maps[] gets a charset object that is allocated on the
+ * first request and reused afterwards.
+ *
+ * Returns 0 if the name is unknown or an allocation fails.
+ */
+struct charset *charset_find(const char *code)
+{
+	struct charset *created;
+	struct map *table;
+	int n;
+
+	/* Find good (MIME) name. */
+	for (n = 0; names[n].bad; n++) {
+		if (ascii_strcasecmp(code, names[n].bad) == 0) {
+			code = names[n].good;
+			break;
+		}
+	}
+
+	/* Recognise some charsets for which we avoid using a table. */
+	if (ascii_strcasecmp(code, "UTF-8") == 0)
+		return &charset_utf8;
+	if (ascii_strcasecmp(code, "US-ASCII") == 0)
+		return &charset_ascii;
+	if (ascii_strcasecmp(code, "ISO-8859-1") == 0)
+		return &charset_iso1;
+
+	/* Look for a mapping for a simple 8-bit encoding. */
+	for (n = 0; maps[n].name; n++) {
+		if (ascii_strcasecmp(code, maps[n].name) != 0)
+			continue;
+
+		/* Already built by an earlier call. */
+		if (maps[n].charset)
+			return maps[n].charset;
+
+		created = malloc(sizeof(struct charset));
+		if (!created)
+			return 0;
+
+		table = malloc(sizeof(struct map));
+		if (!table) {
+			free(created);
+			return 0;
+		}
+
+		table->from = maps[n].map;
+		table->to = 0; /* inverse mapping is created when required */
+
+		created->max = 1;
+		created->mbtowc = &mbtowc_8bit;
+		created->wctomb = &wctomb_8bit;
+		created->map = table;
+
+		maps[n].charset = created;
+		return created;
+	}
+
+	return 0;
+}
+
+/*
+ * Function to convert a buffer from one encoding to another.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ * Each of TO and TOLEN may be zero, if the result is not needed.
+ * The output buffer is null-terminated, so it is all right to
+ * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
+ */
+
+/*
+ * Convert FROMLEN bytes at FROM from charset FROMCODE to TOCODE.
+ *
+ * Returns -1 if either charset is unknown, -2 if the output buffer
+ * cannot be allocated, 2 if some input byte was invalid (replaced by
+ * '#'), 1 if some character had no encoding in TOCODE (replaced by
+ * '?'), and 0 otherwise.  When TOLEN is non-null it receives the output
+ * length without the terminator; when TO is non-null it receives the
+ * heap-allocated, null-terminated result, otherwise the buffer is freed.
+ */
+int charset_convert(const char *fromcode, const char *tocode,
+		    const char *from, size_t fromlen,
+		    char **to, size_t *tolen)
+{
+	struct charset *src, *dst;
+	char *outbuf, *out, *shrunk;
+	int status = 0;
+	int consumed, produced, wc;
+
+	src = charset_find(fromcode);
+	dst = charset_find(tocode);
+	if (!src || !dst)
+		return -1;
+
+	outbuf = safe_malloc_mul2add_(fromlen, /*times*/dst->max, /*+*/1);
+	if (!outbuf)
+		return -2;
+
+	out = outbuf;
+	while (fromlen) {
+		consumed = charset_mbtowc(src, &wc, from, fromlen);
+		if (consumed == -1) {
+			/* Invalid input: skip one byte and emit '#'. */
+			wc = '#';
+			status = 2;
+			consumed = 1;
+		}
+		else if (consumed == 0) {
+			/* A zero result still uses up one input byte. */
+			consumed = 1;
+		}
+
+		produced = charset_wctomb(dst, out, wc);
+		if (produced == -1) {
+			/* Not representable: fall back to '?', or to nothing. */
+			if (status == 0)
+				status = 1;
+			produced = charset_wctomb(dst, out, '?');
+			if (produced == -1)
+				produced = 0;
+		}
+
+		from += consumed;
+		fromlen -= consumed;
+		out += produced;
+	}
+
+	if (tolen)
+		*tolen = out - outbuf;
+	*out++ = '\0';
+
+	if (!to) {
+		free(outbuf);
+		return status;
+	}
+
+	/* Trim the buffer; keep the untrimmed one if realloc fails. */
+	shrunk = realloc(outbuf, out - outbuf);
+	*to = shrunk ? shrunk : outbuf;
+	return status;
+}
+
+#endif /* !defined _WIN32 && !defined HAVE_ICONV */
diff --git a/src/share/utf8/charset.h b/src/share/utf8/charset.h
new file mode 100644
index 0000000..ea8e31e
--- /dev/null
+++ b/src/share/utf8/charset.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdlib.h>
+
+/*
+ * These functions are like the C library's mbtowc() and wctomb(),
+ * but instead of depending on the locale they always work in UTF-8,
+ * and they use int instead of wchar_t.
+ */
+
+int utf8_mbtowc(int *pwc, const char *s, size_t n);
+int utf8_wctomb(char *s, int wc);
+
+/*
+ * This is an object-oriented version of mbtowc() and wctomb().
+ * The caller first uses charset_find() to get a pointer to struct
+ * charset, then uses the mbtowc() and wctomb() methods on it.
+ * The function charset_max() gives the maximum length of a
+ * multibyte character in that encoding.
+ * This API is only appropriate for stateless encodings like UTF-8
+ * or ISO-8859-3, but I have no intention of implementing anything
+ * other than UTF-8 and 8-bit encodings.
+ *
+ * MINOR BUG: If there is no memory charset_find() may return 0 and
+ * there is no way to distinguish this case from an unknown encoding.
+ */
+
+struct charset;
+
+struct charset *charset_find(const char *code);
+
+int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n);
+int charset_wctomb(struct charset *charset, char *s, int wc);
+int charset_max(struct charset *charset);
+
+/*
+ * Function to convert a buffer from one encoding to another.
+ * Invalid bytes are replaced by '#', and characters that are
+ * not available in the target encoding are replaced by '?'.
+ * Each of TO and TOLEN may be zero if the result is not wanted.
+ * The input or output may contain null bytes, but the output
+ * buffer is also null-terminated, so it is all right to
+ * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
+ *
+ * Return value:
+ *
+ * -2 : memory allocation failed
+ * -1 : unknown encoding
+ * 0 : data was converted exactly
+ * 1 : valid data was converted approximately (using '?')
+ * 2 : input was invalid (but still converted, using '#')
+ */
+
+int charset_convert(const char *fromcode, const char *tocode,
+ const char *from, size_t fromlen,
+ char **to, size_t *tolen);
diff --git a/src/share/utf8/charset_test.c b/src/share/utf8/charset_test.c
new file mode 100644
index 0000000..6761100
--- /dev/null
+++ b/src/share/utf8/charset_test.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <assert.h>
+#include <string.h>
+
+#include "charset.h"
+
+/*
+ * Checks that apply to every charset: handling of null and empty
+ * input, and decoding/encoding of NUL and of plain ASCII letters.
+ */
+void test_any(struct charset *charset)
+{
+ int wc;
+ char s[2];
+
+ assert(charset);
+
+ /* Decoder */
+
+ /* A null string or a zero length gives 0, whatever the other argument is. */
+ assert(charset_mbtowc(charset, 0, 0, 0) == 0);
+ assert(charset_mbtowc(charset, 0, 0, 1) == 0);
+ assert(charset_mbtowc(charset, 0, (char *)(-1), 0) == 0);
+
+ assert(charset_mbtowc(charset, 0, "a", 0) == 0);
+ assert(charset_mbtowc(charset, 0, "", 1) == 0);
+ assert(charset_mbtowc(charset, 0, "b", 1) == 1);
+ assert(charset_mbtowc(charset, 0, "", 2) == 0);
+ assert(charset_mbtowc(charset, 0, "c", 2) == 1);
+
+ /* Same inputs again, now checking what is stored through pwc. */
+ wc = 'x';
+ assert(charset_mbtowc(charset, &wc, "a", 0) == 0 && wc == 'x');
+ assert(charset_mbtowc(charset, &wc, "", 1) == 0 && wc == 0);
+ assert(charset_mbtowc(charset, &wc, "b", 1) == 1 && wc == 'b');
+ assert(charset_mbtowc(charset, &wc, "", 2) == 0 && wc == 0);
+ assert(charset_mbtowc(charset, &wc, "c", 2) == 1 && wc == 'c');
+
+ /* Encoder */
+
+ assert(charset_wctomb(charset, 0, 0) == 0);
+
+ /* s[1] acts as a guard: exactly one byte must be written. */
+ s[0] = s[1] = '.';
+ assert(charset_wctomb(charset, s, 0) == 1 &&
+ s[0] == '\0' && s[1] == '.');
+ assert(charset_wctomb(charset, s, 'x') == 1 &&
+ s[0] == 'x' && s[1] == '.');
+}
+
+/*
+ * UTF-8 specific checks: boundaries between sequence lengths, overlong
+ * forms, bad continuation bytes, invalid lead bytes, and the encoder's
+ * output at each length boundary.
+ *
+ * NOTE(review): safe_strncpy() is not declared by any header this file
+ * includes; presumably share/safe_str.h is needed -- confirm.
+ */
+void test_utf8()
+{
+ struct charset *charset;
+ int wc;
+ char s[8];
+
+ charset = charset_find("UTF-8");
+ test_any(charset);
+
+ /* Decoder */
+ /* Smallest and largest value at each sequence length, plus overlong forms. */
+ wc = 0;
+ assert(charset_mbtowc(charset, &wc, "\177", 1) == 1 && wc == 127);
+ assert(charset_mbtowc(charset, &wc, "\200", 2) == -1);
+ assert(charset_mbtowc(charset, &wc, "\301\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\302\200", 1) == -1);
+ assert(charset_mbtowc(charset, &wc, "\302\200", 2) == 2 && wc == 128);
+ assert(charset_mbtowc(charset, &wc, "\302\200", 3) == 2 && wc == 128);
+ assert(charset_mbtowc(charset, &wc, "\340\237\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\240\200", 9) == 3 &&
+ wc == 1 << 11);
+ assert(charset_mbtowc(charset, &wc, "\360\217\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\200\200", 9) == 4 &&
+ wc == 1 << 16);
+ assert(charset_mbtowc(charset, &wc, "\370\207\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\370\210\200\200\200", 9) == 5 &&
+ wc == 1 << 21);
+ assert(charset_mbtowc(charset, &wc, "\374\203\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\374\204\200\200\200\200", 9) == 6 &&
+ wc == 1 << 26);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\277", 9) == 6 &&
+ wc == 0x7fffffff);
+
+ /* A continuation byte outside 0x80-0xbf is rejected at every position. */
+ assert(charset_mbtowc(charset, &wc, "\302\000", 2) == -1);
+ assert(charset_mbtowc(charset, &wc, "\302\300", 2) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\040\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\340\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\240\000", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\340\240\300", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\020\200\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\320\200\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\000\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\300\200", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\200\000", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\360\220\200\300", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\077\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\377\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\077\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\377\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\077\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\377\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\077", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\375\277\277\277\277\377", 9) == -1);
+
+ /* 0xfe and 0xff are never valid lead bytes. */
+ assert(charset_mbtowc(charset, &wc, "\376\277\277\277\277\277", 9) == -1);
+ assert(charset_mbtowc(charset, &wc, "\377\277\277\277\277\277", 9) == -1);
+
+ /* Encoder */
+ /*
+ * The buffer is filled with dots once; each check relies on the bytes
+ * left by the previous ones, so the order of these asserts matters.
+ */
+ safe_strncpy(s, ".......", sizeof(s));
+ assert(charset_wctomb(charset, s, 1u << 31) == -1 &&
+ !strcmp(s, "......."));
+ assert(charset_wctomb(charset, s, 127) == 1 &&
+ !strcmp(s, "\177......"));
+ assert(charset_wctomb(charset, s, 128) == 2 &&
+ !strcmp(s, "\302\200....."));
+ assert(charset_wctomb(charset, s, 0x7ff) == 2 &&
+ !strcmp(s, "\337\277....."));
+ assert(charset_wctomb(charset, s, 0x800) == 3 &&
+ !strcmp(s, "\340\240\200...."));
+ assert(charset_wctomb(charset, s, 0xffff) == 3 &&
+ !strcmp(s, "\357\277\277...."));
+ assert(charset_wctomb(charset, s, 0x10000) == 4 &&
+ !strcmp(s, "\360\220\200\200..."));
+ assert(charset_wctomb(charset, s, 0x1fffff) == 4 &&
+ !strcmp(s, "\367\277\277\277..."));
+ assert(charset_wctomb(charset, s, 0x200000) == 5 &&
+ !strcmp(s, "\370\210\200\200\200.."));
+ assert(charset_wctomb(charset, s, 0x3ffffff) == 5 &&
+ !strcmp(s, "\373\277\277\277\277.."));
+ assert(charset_wctomb(charset, s, 0x4000000) == 6 &&
+ !strcmp(s, "\374\204\200\200\200\200."));
+ assert(charset_wctomb(charset, s, 0x7fffffff) == 6 &&
+ !strcmp(s, "\375\277\277\277\277\277."));
+}
+
+/* US-ASCII specific checks: 127 is the last valid value in both directions. */
+void test_ascii()
+{
+ struct charset *charset;
+ int wc;
+ char s[3];
+
+ charset = charset_find("us-ascii");
+ test_any(charset);
+
+ /* Decoder */
+ wc = 0;
+ assert(charset_mbtowc(charset, &wc, "\177", 2) == 1 && wc == 127);
+ assert(charset_mbtowc(charset, &wc, "\200", 2) == -1);
+
+ /* Encoder */
+ /* The first check also verifies that a failed encode leaves the buffer alone. */
+ safe_strncpy(s, "..", sizeof(s));
+ assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, ".."));
+ assert(charset_wctomb(charset, s, 255) == -1);
+ assert(charset_wctomb(charset, s, 128) == -1);
+ assert(charset_wctomb(charset, s, 127) == 1 && !strcmp(s, "\177."));
+}
+
+/* ISO-8859-1 specific checks: every byte decodes to itself and 255 is the last encodable value. */
+void test_iso1()
+{
+ struct charset *charset;
+ int wc;
+ char s[3];
+
+ charset = charset_find("iso-8859-1");
+ test_any(charset);
+
+ /* Decoder */
+ /* Only the first byte is consumed, even though more input is offered. */
+ wc = 0;
+ assert(charset_mbtowc(charset, &wc, "\302\200", 9) == 1 && wc == 0xc2);
+
+ /* Encoder */
+ safe_strncpy(s, "..", sizeof(s));
+ assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, ".."));
+ assert(charset_wctomb(charset, s, 255) == 1 && !strcmp(s, "\377."));
+ assert(charset_wctomb(charset, s, 128) == 1 && !strcmp(s, "\200."));
+}
+
+/* ISO-8859-2 specific checks, exercising the table-driven 8-bit implementation. */
+void test_iso2()
+{
+ struct charset *charset;
+ int wc;
+ char s[3];
+
+ charset = charset_find("iso-8859-2");
+ test_any(charset);
+
+ /* Decoder */
+ wc = 0;
+ assert(charset_mbtowc(charset, &wc, "\302\200", 9) == 1 && wc == 0xc2);
+ assert(charset_mbtowc(charset, &wc, "\377", 2) == 1 && wc == 0x2d9);
+
+ /* Encoder */
+ /* 255 has no byte in this charset; 258 (0x102) is found at byte 0xc3. */
+ safe_strncpy(s, "..", sizeof(s));
+ assert(charset_wctomb(charset, s, 256) == -1 && !strcmp(s, ".."));
+ assert(charset_wctomb(charset, s, 255) == -1 && !strcmp(s, ".."));
+ assert(charset_wctomb(charset, s, 258) == 1 && !strcmp(s, "\303."));
+ assert(charset_wctomb(charset, s, 128) == 1 && !strcmp(s, "\200."));
+}
+
+/*
+ * Checks for charset_convert(): exact conversion, replacement of
+ * invalid input by '#' and of unavailable characters by '?', optional
+ * TO/TOLEN arguments, and a full ISO-8859-2 round trip through UTF-8.
+ * The converted buffers are not freed; the program exits right after.
+ */
+void test_convert()
+{
+	const char *p;
+	char *q, *r;
+	char s[256];
+	size_t n, n2;
+	int i;
+
+	/*
+	 * The input starts with a NUL byte, so strcmp() would stop before
+	 * comparing anything: compare all ten bytes and check the
+	 * terminator that charset_convert() appends.
+	 */
+	p = "\000x\302\200\375\277\277\277\277\277";
+	assert(charset_convert("UTF-8", "UTF-8", p, 10, &q, &n) == 0 &&
+	       n == 10 && !memcmp(p, q, 10) && q[10] == '\0');
+	assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, &n) == 2 &&
+	       n == 4 && !strcmp(q, "x##y"));
+	assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, 0, &n) == 2 &&
+	       n == 4);
+	assert(charset_convert("UTF-8", "UTF-8", "x\301\277y", 4, &q, 0) == 2 &&
+	       !strcmp(q, "x##y"));
+	assert(charset_convert("UTF-8", "iso-8859-1",
+			       "\302\200\304\200x", 5, &q, &n) == 1 &&
+	       n == 3 && !strcmp(q, "\200?x"));
+	assert(charset_convert("iso-8859-1", "UTF-8",
+			       "\000\200\377", 3, &q, &n) == 0 &&
+	       n == 5 && !memcmp(q, "\000\302\200\303\277", 5));
+	assert(charset_convert("iso-8859-1", "iso-8859-1",
+			       "\000\200\377", 3, &q, &n) == 0 &&
+	       n == 3 && !memcmp(q, "\000\200\377", 3));
+
+	assert(charset_convert("iso-8859-2", "utf-8", "\300", 1, &q, &n) == 0 &&
+	       n == 2 && !strcmp(q, "\305\224"));
+	assert(charset_convert("utf-8", "iso-8859-2", "\305\224", 2, &q, &n) == 0 &&
+	       n == 1 && !strcmp(q, "\300"));
+
+	/* Round trip of every byte value through UTF-8 and back. */
+	for (i = 0; i < 256; i++)
+		s[i] = i;
+
+	assert(charset_convert("iso-8859-2", "utf-8", s, 256, &q, &n) == 0);
+	assert(charset_convert("utf-8", "iso-8859-2", q, n, &r, &n2) == 0);
+	assert(n2 == 256 && !memcmp(r, s, n2));
+}
+
+/* Runs the per-charset test groups, then the conversion tests, and returns 0. */
+int main()
+{
+	/* One group per charset implementation. */
+	test_utf8();
+	test_ascii();
+	test_iso1();
+	test_iso2();
+
+	/* Whole-buffer conversion between charsets. */
+	test_convert();
+
+	return 0;
+}
diff --git a/src/share/utf8/iconvert.c b/src/share/utf8/iconvert.c
new file mode 100644
index 0000000..9a1e3f6
--- /dev/null
+++ b/src/share/utf8/iconvert.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#if !defined _WIN32 && defined HAVE_ICONV
+
+#include <assert.h>
+#include <errno.h>
+#include <iconv.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iconvert.h"
+#include "share/alloc.h"
+#include "share/safe_str.h"
+
+/*
+ * Convert data from one encoding to another. Return:
+ *
+ * -2 : memory allocation failed
+ * -1 : unknown encoding
+ * 0 : data was converted exactly
+ * 1 : data was converted inexactly
+ * 2 : data was invalid (but still converted)
+ *
+ * We convert in two steps, via UTF-8, as this is the only
+ * reliable way of distinguishing between invalid input
+ * and valid input which iconv refuses to transliterate.
+ * We convert from UTF-8 twice, because we have no way of
+ * knowing whether the conversion was exact if iconv returns
+ * E2BIG (due to a bug in the specification of iconv).
+ * An alternative approach is to assume that the output of
+ * iconv is never more than 4 times as long as the input,
+ * but I prefer to avoid that assumption if possible.
+ */
+
+/*
+ * Convert FROMLEN bytes at FROM from FROMCODE to TOCODE using iconv,
+ * going through an intermediate UTF-8 buffer.
+ *
+ * Invalid input bytes are replaced by '#' and characters that iconv
+ * cannot convert to TOCODE by '?'.  When TOLEN is non-null it receives
+ * the output length without the terminator.  When TO is non-null it
+ * receives a heap-allocated, null-terminated buffer that this function
+ * does not free; otherwise the output is discarded.
+ *
+ * Returns -2, -1, 0, 1 or 2 as described in the comment above.  Note
+ * that an empty intermediate buffer with a non-UTF-8 target is reported
+ * as -2.
+ */
+int iconvert(const char *fromcode, const char *tocode,
+	     const char *from, size_t fromlen,
+	     char **to, size_t *tolen)
+{
+	int ret = 0;
+	iconv_t cd1, cd2;
+	char *ib;
+	char *ob;
+	char *utfbuf = 0, *outbuf, *newbuf;
+	size_t utflen, outlen, ibl, obl, obp, k;
+	char tbuf[2048];
+
+	/* cd1: FROMCODE -> UTF-8 */
+	cd1 = iconv_open("UTF-8", fromcode);
+	if (cd1 == (iconv_t)(-1))
+		return -1;
+
+	/* cd2: UTF-8 -> TOCODE, left unopened when TOCODE is UTF-8 itself. */
+	cd2 = (iconv_t)(-1);
+	/* Don't use strcasecmp() as it's locale-dependent. */
+	if (!strchr("Uu", tocode[0]) ||
+	    !strchr("Tt", tocode[1]) ||
+	    !strchr("Ff", tocode[2]) ||
+	    tocode[3] != '-' ||
+	    tocode[4] != '8' ||
+	    tocode[5] != '\0') {
+		char *tocode1;
+		int rc;
+		/*
+		 * Try using this non-standard feature of glibc and libiconv.
+		 * This is deliberately not a config option as people often
+		 * change their iconv library without rebuilding applications.
+		 */
+
+		rc = asprintf(&tocode1, "%s//TRANSLIT", tocode);
+		if (rc < 0 || ! tocode1)
+			goto fail;
+
+		cd2 = iconv_open(tocode1, "UTF-8");
+		free(tocode1);
+
+		/*
+		 * Plain fallback when //TRANSLIT is not accepted.  This
+		 * descriptor is only ever fed the intermediate UTF-8 buffer,
+		 * so its source encoding has to be UTF-8, not FROMCODE.
+		 */
+		if (cd2 == (iconv_t)(-1))
+			cd2 = iconv_open(tocode, "UTF-8");
+
+		if (cd2 == (iconv_t)(-1)) {
+			iconv_close(cd1);
+			return -1;
+		}
+	}
+
+	utflen = 1; /*fromlen * 2 + 1; XXX */
+	utfbuf = malloc(utflen);
+	if (!utfbuf)
+		goto fail;
+
+	/* Convert to UTF-8 */
+	ib = (char *)from;
+	ibl = fromlen;
+	ob = utfbuf;
+	obl = utflen;
+	for (;;) {
+		k = iconv(cd1, &ib, &ibl, &ob, &obl);
+		assert((!k && !ibl) ||
+		       (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
+		       (k == (size_t)(-1) &&
+			(errno == EILSEQ || errno == EINVAL) && ibl));
+		if (!ibl)
+			break;
+		if (obl < 6) {
+			/* Enlarge the buffer */
+			if(utflen*2 < utflen) /* overflow check */
+				goto fail;
+			utflen *= 2;
+			obp = ob - utfbuf; /* save position */
+			newbuf = realloc(utfbuf, utflen);
+			if (!newbuf)
+				goto fail;
+			ob = newbuf + obp;
+			obl = utflen - obp;
+			utfbuf = newbuf;
+		}
+		else {
+			/* Invalid input */
+			ib++, ibl--;
+			*ob++ = '#', obl--;
+			ret = 2;
+			/* Reset the conversion state before carrying on. */
+			iconv(cd1, 0, 0, 0, 0);
+		}
+	}
+
+	if (cd2 == (iconv_t)(-1)) {
+		/* The target encoding was UTF-8 */
+		if (tolen)
+			*tolen = ob - utfbuf;
+		if (!to) {
+			free(utfbuf);
+			iconv_close(cd1);
+			return ret;
+		}
+		/* Resize to the exact length plus room for the terminator. */
+		newbuf = safe_realloc_nofree_add_2op_(utfbuf, (ob - utfbuf), /*+*/1);
+		if (!newbuf)
+			goto fail;
+		ob = (ob - utfbuf) + newbuf;
+		*ob = '\0';
+		*to = newbuf;
+		iconv_close(cd1);
+		return ret;
+	}
+
+	/* Truncate the buffer to be tidy */
+	utflen = ob - utfbuf;
+	if (utflen == 0)
+		goto fail;
+	newbuf = realloc(utfbuf, utflen);
+	if (!newbuf)
+		goto fail;
+	utfbuf = newbuf;
+
+	/* Convert from UTF-8 to discover how long the output is */
+	outlen = 0;
+	ib = utfbuf;
+	ibl = utflen;
+	while (ibl) {
+		ob = tbuf;
+		obl = sizeof(tbuf);
+		k = iconv(cd2, &ib, &ibl, &ob, &obl);
+		assert((k != (size_t)(-1) && !ibl) ||
+		       (k == (size_t)(-1) && errno == E2BIG && ibl) ||
+		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
+		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
+			/* Replace one character */
+			char *tb = "?";
+			size_t tbl = 1;
+
+			outlen += ob - tbuf;
+			ob = tbuf;
+			obl = sizeof(tbuf);
+			k = iconv(cd2, &tb, &tbl, &ob, &obl);
+			assert((!k && !tbl) ||
+			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
+			/* Skip the rest of the UTF-8 sequence that was replaced. */
+			for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--)
+				;
+		}
+		outlen += ob - tbuf;
+	}
+	/* Count whatever iconv emits when asked to finish the output. */
+	ob = tbuf;
+	obl = sizeof(tbuf);
+	k = iconv(cd2, 0, 0, &ob, &obl);
+	assert(!k);
+	outlen += ob - tbuf;
+
+	/* Convert from UTF-8 for real */
+	outbuf = safe_malloc_add_2op_(outlen, /*+*/1);
+	if (!outbuf)
+		goto fail;
+	ib = utfbuf;
+	ibl = utflen;
+	ob = outbuf;
+	obl = outlen;
+	while (ibl) {
+		k = iconv(cd2, &ib, &ibl, &ob, &obl);
+		assert((k != (size_t)(-1) && !ibl) ||
+		       (k == (size_t)(-1) && errno == EILSEQ && ibl));
+		/* Any non-zero result means the conversion was not exact. */
+		if (k && !ret)
+			ret = 1;
+		if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
+			/* Replace one character */
+			char *tb = "?";
+			size_t tbl = 1;
+
+			k = iconv(cd2, &tb, &tbl, &ob, &obl);
+			assert((!k && !tbl) ||
+			       (k == (size_t)(-1) && errno == EILSEQ && tbl));
+			for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--)
+				;
+		}
+	}
+	k = iconv(cd2, 0, 0, &ob, &obl);
+	assert(!k);
+	assert(!obl);
+	*ob = '\0';
+
+	free(utfbuf);
+	iconv_close(cd1);
+	iconv_close(cd2);
+	if (tolen)
+		*tolen = outlen;
+	if (!to) {
+		free(outbuf);
+		return ret;
+	}
+	*to = outbuf;
+	return ret;
+
+ fail:
+	if(0 != utfbuf)
+		free(utfbuf);
+	iconv_close(cd1);
+	if (cd2 != (iconv_t)(-1))
+		iconv_close(cd2);
+	return -2;
+}
+
+#endif /* HAVE_ICONV */
diff --git a/src/share/utf8/iconvert.h b/src/share/utf8/iconvert.h
new file mode 100644
index 0000000..a2d75a2
--- /dev/null
+++ b/src/share/utf8/iconvert.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#ifdef HAVE_ICONV
+
+/*
+ * Convert data from one encoding to another. Return:
+ *
+ * -2 : memory allocation failed
+ * -1 : unknown encoding
+ * 0 : data was converted exactly
+ * 1 : data was converted inexactly
+ * 2 : data was invalid (but still converted)
+ *
+ * Parameters:
+ *
+ * fromcode, tocode : names of the source and target encodings
+ * from, fromlen : input bytes and their count (the input does not
+ * need to be NUL-terminated)
+ * to : if non-NULL, receives a pointer to the converted data on
+ * success. The buffer is heap-allocated and a '\0' byte is
+ * stored just past the converted data. Ownership appears to
+ * pass to the caller, who would release it with free() --
+ * confirm against callers. If NULL, the converted data is
+ * discarded and only the return value (and *tolen) is useful.
+ * tolen : if non-NULL, receives the length of the converted data
+ * in bytes, not counting the terminating '\0'.
+ *
+ * We convert in two steps, via UTF-8, as this is the only
+ * reliable way of distinguishing between invalid input
+ * and valid input which iconv refuses to transliterate.
+ * We convert from UTF-8 twice, because we have no way of
+ * knowing whether the conversion was exact if iconv returns
+ * E2BIG (due to a bug in the specification of iconv).
+ * An alternative approach is to assume that the output of
+ * iconv is never more than 4 times as long as the input,
+ * but I prefer to avoid that assumption if possible.
+ */
+
+int iconvert(const char *fromcode, const char *tocode,
+ const char *from, size_t fromlen,
+ char **to, size_t *tolen) ;
+
+#endif /* HAVE_ICONV */
diff --git a/src/share/utf8/makemap.c b/src/share/utf8/makemap.c
new file mode 100644
index 0000000..790021c
--- /dev/null
+++ b/src/share/utf8/makemap.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <errno.h>
+#include <iconv.h>
+#include <stdio.h>
+
+/*
+ * Print a 256-entry charset map for the 8-bit encoding named by argv[1].
+ *
+ * Each byte value 0..255 is converted to UCS-4 with iconv and printed as
+ * a 4-digit hexadecimal C initializer, eight entries per line.  Bytes
+ * that iconv rejects with EILSEQ are printed as 0xffff.
+ *
+ * Returns 0 on success.  Returns 1 on a usage error, when the encoding
+ * cannot be opened, when a byte maps to a code point >= 0xffff, or when
+ * iconv reports a result this tool does not understand.
+ */
+int main(int argc, char *argv[])
+{
+    iconv_t cd;
+    int status = 0;
+    int i;
+
+    if (argc != 2) {
+        printf("Usage: %s ENCODING\n", argv[0]);
+        printf("Output a charset map for the 8-bit ENCODING.\n");
+        return 1;
+    }
+
+    cd = iconv_open("UCS-4", argv[1]);
+    if (cd == (iconv_t)(-1)) {
+        perror("iconv_open");
+        return 1;
+    }
+
+    for (i = 0; i < 256; i++) {
+        /*
+         * Raw bytes are kept in unsigned char so values >= 0x80 never
+         * sign-extend; POSIX iconv() takes char ** for both buffers,
+         * hence the pointer casts.
+         */
+        unsigned char c = (unsigned char)i;
+        unsigned char buf[4];
+        char *ib = (char *)&c;
+        char *ob = (char *)buf;
+        size_t ibl = 1;
+        size_t obl = sizeof(buf);
+        unsigned long wc;
+        size_t k;
+
+        k = iconv(cd, &ib, &ibl, &ob, &obl);
+        if (!k && !ibl && !obl) {
+            /*
+             * Assemble the big-endian code point in an unsigned type:
+             * shifting a promoted int left by 24 overflows when the
+             * top byte is >= 0x80.
+             */
+            wc = ((unsigned long)buf[0] << 24) |
+                 ((unsigned long)buf[1] << 16) |
+                 ((unsigned long)buf[2] << 8) |
+                 (unsigned long)buf[3];
+            /* 0xffff is reserved as the "no mapping" marker below. */
+            if (wc >= 0xffff) {
+                printf("Dodgy value.\n");
+                status = 1;
+                break;
+            }
+        }
+        else if (k == (size_t)(-1) && errno == EILSEQ)
+            wc = 0xffff;
+        else {
+            printf("Non-standard iconv.\n");
+            status = 1;
+            break;
+        }
+
+        if (i % 8 == 0)
+            printf(" ");
+        printf("0x%04lx", wc);
+        if (i == 255)
+            printf("\n");
+        else if (i % 8 == 7)
+            printf(",\n");
+        else
+            printf(", ");
+    }
+
+    /* Release the conversion descriptor on every path past iconv_open. */
+    iconv_close(cd);
+    return status;
+}
diff --git a/src/share/utf8/utf8.c b/src/share/utf8/utf8.c
new file mode 100644
index 0000000..34af187
--- /dev/null
+++ b/src/share/utf8/utf8.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
+ * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ * Buffer overflow checking added: Josh Coalson, 9/9/2007
+ *
+ * Win32 part rewritten: lvqcl, 2/2/2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * Convert a string between UTF-8 and the locale's charset.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "share/alloc.h"
+#include "share/utf8.h"
+
+#ifdef _WIN32
+
+#include <windows.h>
+
+/*
+ * Win32: convert a NUL-terminated string from the system ANSI code page
+ * (CP_ACP) to UTF-8, going through a temporary wide-character buffer.
+ *
+ * On success stores the converted string in *to and returns 0.  On any
+ * failure stores NULL in *to and returns -1.
+ */
+int utf8_encode(const char *from, char **to)
+{
+    wchar_t *wide = NULL;
+    char *result = NULL;
+    int status = -1;
+    int n;
+
+    /* ANSI -> wide: the first call only measures, the second converts. */
+    n = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, -1, NULL, 0);
+    if(n == 0)
+        goto done;
+    wide = (wchar_t*) safe_malloc_mul_2op_((size_t)n, sizeof(wchar_t));
+    if(wide == NULL)
+        goto done;
+    n = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from, -1, wide, n);
+    if(n == 0)
+        goto done;
+
+    /* wide -> UTF-8: same measure-then-convert sequence. */
+    n = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
+    if(n == 0)
+        goto done;
+    result = (char*) safe_malloc_mul_2op_((size_t)n, sizeof(char));
+    if(result == NULL)
+        goto done;
+    n = WideCharToMultiByte(CP_UTF8, 0, wide, -1, result, n, NULL, NULL);
+    if(n == 0)
+        goto done;
+
+    status = 0;
+
+done:
+    free(wide);
+
+    /* Never hand a partially converted buffer back to the caller. */
+    if(status != 0) {
+        free(result);
+        result = NULL;
+    }
+    *to = result;
+
+    return status;
+}
+
+/*
+ * Win32: convert a NUL-terminated UTF-8 string to the system ANSI code
+ * page (CP_ACP), going through a temporary wide-character buffer.
+ *
+ * On success stores the converted string in *to and returns 0.  On any
+ * failure stores NULL in *to and returns -1.
+ */
+int utf8_decode(const char *from, char **to)
+{
+    wchar_t *wide = NULL;
+    char *result = NULL;
+    int status = -1;
+    int n;
+
+    /* UTF-8 -> wide: the first call only measures, the second converts. */
+    n = MultiByteToWideChar(CP_UTF8, 0, from, -1, NULL, 0);
+    if(n == 0)
+        goto done;
+    wide = (wchar_t*) safe_malloc_mul_2op_((size_t)n, sizeof(wchar_t));
+    if(wide == NULL)
+        goto done;
+    n = MultiByteToWideChar(CP_UTF8, 0, from, -1, wide, n);
+    if(n == 0)
+        goto done;
+
+    /* wide -> ANSI: same measure-then-convert sequence. */
+    n = WideCharToMultiByte(CP_ACP, WC_COMPOSITECHECK, wide, -1, NULL, 0, NULL, NULL);
+    if(n == 0)
+        goto done;
+    result = (char*) safe_malloc_mul_2op_((size_t)n, sizeof(char));
+    if(result == NULL)
+        goto done;
+    n = WideCharToMultiByte(CP_ACP, WC_COMPOSITECHECK, wide, -1, result, n, NULL, NULL);
+    if(n == 0)
+        goto done;
+
+    status = 0;
+
+done:
+    free(wide);
+
+    /* Never hand a partially converted buffer back to the caller. */
+    if(status != 0) {
+        free(result);
+        result = NULL;
+    }
+    *to = result;
+
+    return status;
+}
+
+#else /* End win32. Rest is for real operating systems */
+
+
+#ifdef HAVE_LANGINFO_CODESET
+#include <langinfo.h>
+#endif
+
+#include <string.h>
+
+#include "share/safe_str.h"
+#include "iconvert.h"
+#include "charset.h"
+
+/*
+ * Return the charset name used for conversions to and from UTF-8.
+ *
+ * Sources are tried in order: nl_langinfo(CODESET) when
+ * HAVE_LANGINFO_CODESET is defined, then the CHARSET environment
+ * variable, and finally the literal "US-ASCII".  Never returns NULL.
+ */
+static const char *current_charset(void)
+{
+    const char *name = NULL;
+
+#ifdef HAVE_LANGINFO_CODESET
+    name = nl_langinfo(CODESET);
+#endif
+
+    if (name == NULL)
+        name = getenv("CHARSET");
+    if (name == NULL)
+        name = "US-ASCII";
+
+    return name;
+}
+
+/*
+ * Convert fromlen bytes at `from` from encoding `fromcode` to encoding
+ * `tocode`, using whichever backend was compiled in: iconvert() when
+ * HAVE_ICONV is defined, charset_convert() otherwise.
+ *
+ * The backend's return value is passed through unchanged; `to` and
+ * `tolen` are forwarded to the backend as-is.
+ */
+static int convert_buffer(const char *fromcode, const char *tocode,
+                          const char *from, size_t fromlen,
+                          char **to, size_t *tolen)
+{
+#ifdef HAVE_ICONV
+    return iconvert(fromcode, tocode, from, fromlen, to, tolen);
+#else /* should be ifdef USE_CHARSET_CONVERT */
+    return charset_convert(fromcode, tocode, from, fromlen, to, tolen);
+#endif
+}
+
+/*
+ * Convert the NUL-terminated string `from` from `fromcode` to `tocode`.
+ *
+ * Returns:
+ *   -1 : memory allocation failed
+ *    3 : the backend reported -1, so *to is a copy of `from` in which
+ *        every byte with a bit set outside 0x7f is overwritten by
+ *        `replace`
+ *   otherwise the value returned by convert_buffer(), with *to set by it
+ */
+static int convert_string(const char *fromcode, const char *tocode,
+                          const char *from, char **to, char replace)
+{
+    size_t fromlen = strlen(from);
+    char *copy;
+    char *p;
+    int rc;
+
+    rc = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
+    switch (rc) {
+    case -2:
+        /* Fold the backend's out-of-memory code into our single error. */
+        return -1;
+    case -1:
+        /* Conversion unavailable: fall through to the ASCII-only copy. */
+        break;
+    default:
+        return rc;
+    }
+
+    copy = safe_malloc_add_2op_(fromlen, /*+*/1);
+    if (!copy)
+        return -1;
+    snprintf(copy, fromlen + 1, "%s", from);
+    *to = copy;
+
+    p = copy;
+    while (*p) {
+        if (*p & ~0x7f)
+            *p = replace;
+        p++;
+    }
+    return 3;
+}
+
+/*
+ * Convert `from` from the current charset to UTF-8, storing the result
+ * in *to.  Returns whatever convert_string() returns; '#' is the
+ * replacement byte used by its fallback copy.
+ */
+int utf8_encode(const char *from, char **to)
+{
+    const char *source_charset = current_charset();
+
+    return convert_string(source_charset, "UTF-8", from, to, '#');
+}
+
+/*
+ * Convert `from` from UTF-8 to the current charset, storing the result
+ * in *to.  Returns whatever convert_string() returns; '?' is the
+ * replacement byte used by its fallback copy.
+ */
+int utf8_decode(const char *from, char **to)
+{
+    const char *target_charset = current_charset();
+
+    return convert_string("UTF-8", target_charset, from, to, '?');
+}
+
+#endif