diff options
Diffstat (limited to 'libraries/libldap/utf-8-conv.c')
-rw-r--r-- | libraries/libldap/utf-8-conv.c | 485 |
1 files changed, 485 insertions, 0 deletions
diff --git a/libraries/libldap/utf-8-conv.c b/libraries/libldap/utf-8-conv.c new file mode 100644 index 0000000..9d8f9c1 --- /dev/null +++ b/libraries/libldap/utf-8-conv.c @@ -0,0 +1,485 @@ +/* $OpenLDAP$ */ +/* This work is part of OpenLDAP Software <http://www.openldap.org/>. + * + * Copyright 1998-2022 The OpenLDAP Foundation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * <http://www.OpenLDAP.org/license.html>. + */ +/* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. + * + * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND + * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT + * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS + * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" + * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION + * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP + * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT + * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. + *--- + * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License + * can be found in the file "build/LICENSE-2.0.1" in this distribution + * of OpenLDAP Software. + */ + +/* + * UTF-8 Conversion Routines + * + * These routines convert between Wide Character and UTF-8, + * or between MultiByte and UTF-8 encodings. + * + * Both single character and string versions of the functions are provided. + * All functions return -1 if the character or string cannot be converted. + */ + +#include "portable.h" + +#if SIZEOF_WCHAR_T >= 4 +/* These routines assume ( sizeof(wchar_t) >= 4 ) */ + +#include <stdio.h> +#include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */ +#include <ac/string.h> +#include <ac/time.h> /* for time_t */ + +#include "ldap-int.h" + +#include <ldap_utf8.h> + +static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + + +/*----------------------------------------------------------------------------- + UTF-8 Format Summary + +ASCII chars 7 bits + 0xxxxxxx + +2-character UTF-8 sequence: 11 bits + 110xxxxx 10xxxxxx + +3-character UTF-8 16 bits + 1110xxxx 10xxxxxx 10xxxxxx + +4-char UTF-8 21 bits + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +5-char UTF-8 26 bits + 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + +6-char UTF-8 31 bits + 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + +Unicode address space (0 - 0x10FFFF) 21 bits +ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits + +Note: This code does not prevent UTF-8 sequences which are longer than + necessary from being decoded. +*/ + +/*----------------------------------------------------------------------------- + Convert a UTF-8 character to a wide char. + Return the length of the UTF-8 input character in bytes. +*/ +int +ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char ) +{ + int utflen, i; + wchar_t ch; + + if (utf8char == NULL) return -1; + + /* Get UTF-8 sequence length from 1st byte */ + utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen); + + if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; + + /* First byte minus length tag */ + ch = (wchar_t)(utf8char[0] & mask[utflen]); + + for(i=1; i < utflen; i++) { + /* Subsequent bytes must start with 10 */ + if ((utf8char[i] & 0xc0) != 0x80) return -1; + + ch <<= 6; /* 6 bits of data in each subsequent byte */ + ch |= (wchar_t)(utf8char[i] & 0x3f); + } + + if (wchar) *wchar = ch; + + return utflen; +} + +/*----------------------------------------------------------------------------- + Convert a UTF-8 string to a wide char string. + No more than 'count' wide chars will be written to the output buffer. + Return the size of the converted string in wide chars, excl null terminator. +*/ +int +ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count ) +{ + size_t wclen = 0; + int utflen, i; + wchar_t ch; + + + /* If input ptr is NULL or empty... */ + if (utf8str == NULL || !*utf8str) { + if ( wcstr ) + *wcstr = 0; + return 0; + } + + /* Examine next UTF-8 character. If output buffer is NULL, ignore count */ + while ( *utf8str && (wcstr==NULL || wclen<count) ) { + /* Get UTF-8 sequence length from 1st byte */ + utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen); + + if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; + + /* First byte minus length tag */ + ch = (wchar_t)(utf8str[0] & mask[utflen]); + + for(i=1; i < utflen; i++) { + /* Subsequent bytes must start with 10 */ + if ((utf8str[i] & 0xc0) != 0x80) return -1; + + ch <<= 6; /* 6 bits of data in each subsequent byte */ + ch |= (wchar_t)(utf8str[i] & 0x3f); + } + + if (wcstr) wcstr[wclen] = ch; + + utf8str += utflen; /* Move to next UTF-8 character */ + wclen++; /* Count number of wide chars stored/required */ + } + + /* Add null terminator if there's room in the buffer. */ + if (wcstr && wclen < count) wcstr[wclen] = 0; + + return wclen; +} + + +/*----------------------------------------------------------------------------- + Convert one wide char to a UTF-8 character. + Return the length of the converted UTF-8 character in bytes. + No more than 'count' bytes will be written to the output buffer. +*/ +int +ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count ) +{ + int len=0; + + if (utf8char == NULL) /* Just determine the required UTF-8 char length. */ + { /* Ignore count */ + if( wchar < 0 ) + return -1; + if( wchar < 0x80 ) + return 1; + if( wchar < 0x800 ) + return 2; + if( wchar < 0x10000 ) + return 3; + if( wchar < 0x200000 ) + return 4; + if( wchar < 0x4000000 ) + return 5; +#if SIZEOF_WCHAR_T > 4 + /* UL is not strictly needed by ANSI C */ + if( wchar < (wchar_t)0x80000000UL ) +#endif /* SIZEOF_WCHAR_T > 4 */ + return 6; + return -1; + } + + + if ( wchar < 0 ) { /* Invalid wide character */ + len = -1; + + } else if( wchar < 0x80 ) { + if (count >= 1) { + utf8char[len++] = (char)wchar; + } + + } else if( wchar < 0x800 ) { + if (count >=2) { + utf8char[len++] = 0xc0 | ( wchar >> 6 ); + utf8char[len++] = 0x80 | ( wchar & 0x3f ); + } + + } else if( wchar < 0x10000 ) { + if (count >= 3) { + utf8char[len++] = 0xe0 | ( wchar >> 12 ); + utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); + utf8char[len++] = 0x80 | ( wchar & 0x3f ); + } + + } else if( wchar < 0x200000 ) { + if (count >= 4) { + utf8char[len++] = 0xf0 | ( wchar >> 18 ); + utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); + utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); + utf8char[len++] = 0x80 | ( wchar & 0x3f ); + } + + } else if( wchar < 0x4000000 ) { + if (count >= 5) { + utf8char[len++] = 0xf8 | ( wchar >> 24 ); + utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); + utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); + utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); + utf8char[len++] = 0x80 | ( wchar & 0x3f ); + } + + } else +#if SIZEOF_WCHAR_T > 4 + /* UL is not strictly needed by ANSI C */ + if( wchar < (wchar_t)0x80000000UL ) +#endif /* SIZEOF_WCHAR_T > 4 */ + { + if (count >= 6) { + utf8char[len++] = 0xfc | ( wchar >> 30 ); + utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f ); + utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); + utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); + utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); + utf8char[len++] = 0x80 | ( wchar & 0x3f ); + } + +#if SIZEOF_WCHAR_T > 4 + } else { + len = -1; +#endif /* SIZEOF_WCHAR_T > 4 */ + } + + return len; + +} + + +/*----------------------------------------------------------------------------- + Convert a wide char string to a UTF-8 string. + No more than 'count' bytes will be written to the output buffer. + Return the # of bytes written to the output buffer, excl null terminator. +*/ +int +ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count ) +{ + int len = 0; + int n; + char *p = utf8str; + wchar_t empty = 0; /* To avoid use of L"" construct */ + + if (wcstr == NULL) /* Treat input ptr NULL as an empty string */ + wcstr = ∅ + + if (utf8str == NULL) /* Just compute size of output, excl null */ + { + while (*wcstr) + { + /* Get UTF-8 size of next wide char */ + n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN); + if (n == -1) + return -1; + len += n; + } + + return len; + } + + + /* Do the actual conversion. */ + + n = 1; /* In case of empty wcstr */ + while (*wcstr) + { + n = ldap_x_wc_to_utf8( p, *wcstr++, count); + + if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */ + break; + + p += n; + count -= n; /* Space left in output buffer */ + } + + /* If not enough room for last character, pad remainder with null + so that return value = original count, indicating buffer full. */ + if (n == 0) + { + while (count--) + *p++ = 0; + } + + /* Add a null terminator if there's room. */ + else if (count) + *p = 0; + + if (n == -1) /* Conversion encountered invalid wide char. */ + return -1; + + /* Return the number of bytes written to output buffer, excl null. */ + return (p - utf8str); +} + +#ifdef ANDROID +int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); } +int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); } +#endif + +/*----------------------------------------------------------------------------- + Convert a UTF-8 character to a MultiByte character. + Return the size of the converted character in bytes. +*/ +int +ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char, + int (*f_wctomb)(char *mbchar, wchar_t wchar) ) +{ + wchar_t wchar; + int n; + char tmp[6]; /* Large enough for biggest multibyte char */ + + if (f_wctomb == NULL) /* If no conversion function was given... */ + f_wctomb = wctomb; /* use the local ANSI C function */ + + /* First convert UTF-8 char to a wide char */ + n = ldap_x_utf8_to_wc( &wchar, utf8char); + + if (n == -1) + return -1; /* Invalid UTF-8 character */ + + if (mbchar == NULL) + n = f_wctomb( tmp, wchar ); + else + n = f_wctomb( mbchar, wchar); + + return n; +} + +/*----------------------------------------------------------------------------- + Convert a UTF-8 string to a MultiByte string. + No more than 'count' bytes will be written to the output buffer. + Return the size of the converted string in bytes, excl null terminator. +*/ +int +ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count, + size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) ) +{ + wchar_t *wcs; + size_t wcsize; + int n; + + if (f_wcstombs == NULL) /* If no conversion function was given... */ + f_wcstombs = wcstombs; /* use the local ANSI C function */ + + if (utf8str == NULL || *utf8str == 0) /* NULL or empty input string */ + { + if (mbstr) + *mbstr = 0; + return 0; + } + +/* Allocate memory for the maximum size wchar string that we could get. */ + wcsize = strlen(utf8str) + 1; + wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t)); + if (wcs == NULL) + return -1; /* Memory allocation failure. */ + + /* First convert the UTF-8 string to a wide char string */ + n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize); + + /* Then convert wide char string to multi-byte string */ + if (n != -1) + { + n = f_wcstombs(mbstr, wcs, count); + } + + LDAP_FREE(wcs); + + return n; +} + +/*----------------------------------------------------------------------------- + Convert a MultiByte character to a UTF-8 character. + 'mbsize' indicates the number of bytes of 'mbchar' to check. + Returns the number of bytes written to the output character. +*/ +int +ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, + int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) ) +{ + wchar_t wchar; + int n; + + if (f_mbtowc == NULL) /* If no conversion function was given... */ + f_mbtowc = mbtowc; /* use the local ANSI C function */ + + if (mbsize == 0) /* 0 is not valid. */ + return -1; + + if (mbchar == NULL || *mbchar == 0) + { + if (utf8char) + *utf8char = 0; + return 1; + } + + /* First convert the MB char to a Wide Char */ + n = f_mbtowc( &wchar, mbchar, mbsize); + + if (n == -1) + return -1; + + /* Convert the Wide Char to a UTF-8 character. */ + n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN); + + return n; +} + + +/*----------------------------------------------------------------------------- + Convert a MultiByte string to a UTF-8 string. + No more than 'count' bytes will be written to the output buffer. + Return the size of the converted string in bytes, excl null terminator. +*/ +int +ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count, + size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) ) +{ + wchar_t *wcs; + int n; + size_t wcsize; + + if (mbstr == NULL) /* Treat NULL input string as an empty string */ + mbstr = ""; + + if (f_mbstowcs == NULL) /* If no conversion function was given... */ + f_mbstowcs = mbstowcs; /* use the local ANSI C function */ + + /* Allocate memory for the maximum size wchar string that we could get. */ + wcsize = strlen(mbstr) + 1; + wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) ); + if (wcs == NULL) + return -1; + + /* First convert multi-byte string to a wide char string */ + n = f_mbstowcs(wcs, mbstr, wcsize); + + /* Convert wide char string to UTF-8 string */ + if (n != -1) + { + n = ldap_x_wcs_to_utf8s( utf8str, wcs, count); + } + + LDAP_FREE(wcs); + + return n; +} + +#endif /* SIZEOF_WCHAR_T >= 4 */ |