diff options
Diffstat (limited to 'src/share/utf8/iconvert.c')
-rw-r--r-- | src/share/utf8/iconvert.c | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/src/share/utf8/iconvert.c b/src/share/utf8/iconvert.c new file mode 100644 index 0000000..9a1e3f6 --- /dev/null +++ b/src/share/utf8/iconvert.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#if !defined _WIN32 && defined HAVE_ICONV + +#include <assert.h> +#include <errno.h> +#include <iconv.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "iconvert.h" +#include "share/alloc.h" +#include "share/safe_str.h" + +/* + * Convert data from one encoding to another. Return: + * + * -2 : memory allocation failed + * -1 : unknown encoding + * 0 : data was converted exactly + * 1 : data was converted inexactly + * 2 : data was invalid (but still converted) + * + * We convert in two steps, via UTF-8, as this is the only + * reliable way of distinguishing between invalid input + * and valid input which iconv refuses to transliterate. + * We convert from UTF-8 twice, because we have no way of + * knowing whether the conversion was exact if iconv returns + * E2BIG (due to a bug in the specification of iconv). + * An alternative approach is to assume that the output of + * iconv is never more than 4 times as long as the input, + * but I prefer to avoid that assumption if possible. + */ + +int iconvert(const char *fromcode, const char *tocode, + const char *from, size_t fromlen, + char **to, size_t *tolen) +{ + int ret = 0; + iconv_t cd1, cd2; + char *ib; + char *ob; + char *utfbuf = 0, *outbuf, *newbuf; + size_t utflen, outlen, ibl, obl, obp, k; + char tbuf[2048]; + + cd1 = iconv_open("UTF-8", fromcode); + if (cd1 == (iconv_t)(-1)) + return -1; + + cd2 = (iconv_t)(-1); + /* Don't use strcasecmp() as it's locale-dependent. */ + if (!strchr("Uu", tocode[0]) || + !strchr("Tt", tocode[1]) || + !strchr("Ff", tocode[2]) || + tocode[3] != '-' || + tocode[4] != '8' || + tocode[5] != '\0') { + char *tocode1; + int rc; + /* + * Try using this non-standard feature of glibc and libiconv. + * This is deliberately not a config option as people often + * change their iconv library without rebuilding applications. + */ + + rc = asprintf(&tocode1, "%s//TRANSLIT", tocode); + if (rc < 0 || ! tocode1) + goto fail; + + cd2 = iconv_open(tocode1, "UTF-8"); + free(tocode1); + + if (cd2 == (iconv_t)(-1)) + cd2 = iconv_open(tocode, fromcode); + + if (cd2 == (iconv_t)(-1)) { + iconv_close(cd1); + return -1; + } + } + + utflen = 1; /*fromlen * 2 + 1; XXX */ + utfbuf = malloc(utflen); + if (!utfbuf) + goto fail; + + /* Convert to UTF-8 */ + ib = (char *)from; + ibl = fromlen; + ob = utfbuf; + obl = utflen; + for (;;) { + k = iconv(cd1, &ib, &ibl, &ob, &obl); + assert((!k && !ibl) || + (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) || + (k == (size_t)(-1) && + (errno == EILSEQ || errno == EINVAL) && ibl)); + if (!ibl) + break; + if (obl < 6) { + /* Enlarge the buffer */ + if(utflen*2 < utflen) /* overflow check */ + goto fail; + utflen *= 2; + obp = ob - utfbuf; /* save position */ + newbuf = realloc(utfbuf, utflen); + if (!newbuf) + goto fail; + ob = newbuf + obp; + obl = utflen - obp; + utfbuf = newbuf; + } + else { + /* Invalid input */ + ib++, ibl--; + *ob++ = '#', obl--; + ret = 2; + iconv(cd1, 0, 0, 0, 0); + } + } + + if (cd2 == (iconv_t)(-1)) { + /* The target encoding was UTF-8 */ + if (tolen) + *tolen = ob - utfbuf; + if (!to) { + free(utfbuf); + iconv_close(cd1); + return ret; + } + newbuf = safe_realloc_nofree_add_2op_(utfbuf, (ob - utfbuf), /*+*/1); + if (!newbuf) + goto fail; + ob = (ob - utfbuf) + newbuf; + *ob = '\0'; + *to = newbuf; + iconv_close(cd1); + return ret; + } + + /* Truncate the buffer to be tidy */ + utflen = ob - utfbuf; + if (utflen == 0) + goto fail; + newbuf = realloc(utfbuf, utflen); + if (!newbuf) + goto fail; + utfbuf = newbuf; + + /* Convert from UTF-8 to discover how long the output is */ + outlen = 0; + ib = utfbuf; + ibl = utflen; + while (ibl) { + ob = tbuf; + obl = sizeof(tbuf); + k = iconv(cd2, &ib, &ibl, &ob, &obl); + assert((k != (size_t)(-1) && !ibl) || + (k == (size_t)(-1) && errno == E2BIG && ibl) || + (k == (size_t)(-1) && errno == EILSEQ && ibl)); + if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) { + /* Replace one character */ + char *tb = "?"; + size_t tbl = 1; + + outlen += ob - tbuf; + ob = tbuf; + obl = sizeof(tbuf); + k = iconv(cd2, &tb, &tbl, &ob, &obl); + assert((!k && !tbl) || + (k == (size_t)(-1) && errno == EILSEQ && tbl)); + for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--) + ; + } + outlen += ob - tbuf; + } + ob = tbuf; + obl = sizeof(tbuf); + k = iconv(cd2, 0, 0, &ob, &obl); + assert(!k); + outlen += ob - tbuf; + + /* Convert from UTF-8 for real */ + outbuf = safe_malloc_add_2op_(outlen, /*+*/1); + if (!outbuf) + goto fail; + ib = utfbuf; + ibl = utflen; + ob = outbuf; + obl = outlen; + while (ibl) { + k = iconv(cd2, &ib, &ibl, &ob, &obl); + assert((k != (size_t)(-1) && !ibl) || + (k == (size_t)(-1) && errno == EILSEQ && ibl)); + if (k && !ret) + ret = 1; + if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) { + /* Replace one character */ + char *tb = "?"; + size_t tbl = 1; + + k = iconv(cd2, &tb, &tbl, &ob, &obl); + assert((!k && !tbl) || + (k == (size_t)(-1) && errno == EILSEQ && tbl)); + for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--) + ; + } + } + k = iconv(cd2, 0, 0, &ob, &obl); + assert(!k); + assert(!obl); + *ob = '\0'; + + free(utfbuf); + iconv_close(cd1); + iconv_close(cd2); + if (tolen) + *tolen = outlen; + if (!to) { + free(outbuf); + return ret; + } + *to = outbuf; + return ret; + + fail: + if(0 != utfbuf) + free(utfbuf); + iconv_close(cd1); + if (cd2 != (iconv_t)(-1)) + iconv_close(cd2); + return -2; +} + +#endif /* HAVE_ICONV */ |