diff options
Diffstat (limited to 'winpr/libwinpr/crt/unicode.c')
-rw-r--r-- | winpr/libwinpr/crt/unicode.c | 657 |
1 files changed, 657 insertions, 0 deletions
diff --git a/winpr/libwinpr/crt/unicode.c b/winpr/libwinpr/crt/unicode.c new file mode 100644 index 0000000..123a488 --- /dev/null +++ b/winpr/libwinpr/crt/unicode.c @@ -0,0 +1,657 @@ +/** + * WinPR: Windows Portable Runtime + * Unicode Conversion (CRT) + * + * Copyright 2012 Marc-Andre Moreau <marcandre.moreau@gmail.com> + * Copyright 2022 Armin Novak <anovak@thincast.com> + * Copyright 2022 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <winpr/config.h> +#include <winpr/assert.h> + +#include <errno.h> +#include <wctype.h> + +#include <winpr/crt.h> +#include <winpr/error.h> +#include <winpr/print.h> + +#ifndef MIN +#define MIN(a, b) (a) < (b) ? (a) : (b) +#endif + +#ifndef _WIN32 + +#include "unicode.h" + +#include "../log.h" +#define TAG WINPR_TAG("unicode") + +/** + * Notes on cross-platform Unicode portability: + * + * Unicode has many possible Unicode Transformation Format (UTF) encodings, + * where some of the most commonly used are UTF-8, UTF-16 and sometimes UTF-32. + * + * The number in the UTF encoding name (8, 16, 32) refers to the number of bits + * per code unit. A code unit is the minimal bit combination that can represent + * a unit of encoded text in the given encoding. For instance, UTF-8 encodes + * the English alphabet using 8 bits (or one byte) each, just like in ASCII. + * + * However, the total number of code points (values in the Unicode codespace) + * only fits completely within 32 bits. This means that for UTF-8 and UTF-16, + * more than one code unit may be required to fully encode a specific value. + * UTF-8 and UTF-16 are variable-width encodings, while UTF-32 is fixed-width. + * + * UTF-8 has the advantage of being backwards compatible with ASCII, and is + * one of the most commonly used Unicode encoding. + * + * UTF-16 is used everywhere in the Windows API. The strategy employed by + * Microsoft to provide backwards compatibility in their API was to create + * an ANSI and a Unicode version of the same function, ending with A (ANSI) + * and W (Wide character, or UTF-16 Unicode). In headers, the original + * function name is replaced by a macro that defines to either the ANSI + * or Unicode version based on the definition of the _UNICODE macro. + * + * UTF-32 has the advantage of being fixed width, but wastes a lot of space + * for English text (4x more than UTF-8, 2x more than UTF-16). + * + * In C, wide character strings are often defined with the wchar_t type. + * Many functions are provided to deal with those wide character strings, + * such as wcslen (strlen equivalent) or wprintf (printf equivalent). + * + * This may lead to some confusion, since many of these functions exist + * on both Windows and Linux, but they are *not* the same! + * + * This sample hello world is a good example: + * + * #include <wchar.h> + * + * wchar_t hello[] = L"Hello, World!\n"; + * + * int main(int argc, char** argv) + * { + * wprintf(hello); + * wprintf(L"sizeof(wchar_t): %d\n", sizeof(wchar_t)); + * return 0; + * } + * + * There is a reason why the sample prints the size of the wchar_t type: + * On Windows, wchar_t is two bytes (UTF-16), while on most other systems + * it is 4 bytes (UTF-32). This means that if you write code on Windows, + * use L"" to define a string which is meant to be UTF-16 and not UTF-32, + * you will have a little surprise when trying to port your code to Linux. + * + * Since the Windows API uses UTF-16, not UTF-32, WinPR defines the WCHAR + * type to always be 2-bytes long and uses it instead of wchar_t. Do not + * ever use wchar_t with WinPR unless you know what you are doing. + * + * As for L"", it is unfortunately unusable in a portable way, unless a + * special option is passed to GCC to define wchar_t as being two bytes. + * For string constants that must be UTF-16, it is a pain, but they can + * be defined in a portable way like this: + * + * WCHAR hello[] = { 'H','e','l','l','o','\0' }; + * + * Such strings cannot be passed to native functions like wcslen(), which + * may expect a different wchar_t size. For this reason, WinPR provides + * _wcslen, which expects UTF-16 WCHAR strings on all platforms. + * + */ + +/** \deprecated We no longer export this function, see ConvertUtf8ToWChar family of functions for a + * replacement + * + * Conversion to Unicode (UTF-16) + * MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/ + * + * cbMultiByte is an input size in bytes (BYTE) + * cchWideChar is an output size in wide characters (WCHAR) + * + * Null-terminated UTF-8 strings: + * + * cchWideChar *cannot* be assumed to be cbMultiByte since UTF-8 is variable-width! + * + * Instead, obtain the required cchWideChar output size like this: + * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0); + * + * A value of -1 for cbMultiByte indicates that the input string is null-terminated, + * and the null terminator *will* be processed. The size returned by MultiByteToWideChar + * will therefore include the null terminator. Equivalent behavior can be obtained by + * computing the length in bytes of the input buffer, including the null terminator: + * + * cbMultiByte = strlen((char*) lpMultiByteStr) + 1; + * + * An output buffer of the proper size can then be allocated: + * + * lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR)); + * + * Since cchWideChar is an output size in wide characters, the actual buffer size is: + * (cchWideChar * sizeof(WCHAR)) or (cchWideChar * 2) + * + * Finally, perform the conversion: + * + * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, lpWideCharStr, + * cchWideChar); + * + * The value returned by MultiByteToWideChar corresponds to the number of wide characters written + * to the output buffer, and should match the value obtained on the first call to + * MultiByteToWideChar. + * + */ + +#if !defined(WITH_WINPR_DEPRECATED) +static +#endif + int + MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte, + LPWSTR lpWideCharStr, int cchWideChar) +{ + return int_MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, lpWideCharStr, + cchWideChar); +} + +/** \deprecated We no longer export this function, see ConvertWCharToUtf8 family of functions for a + * replacement + * + * Conversion from Unicode (UTF-16) + * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/ + * + * cchWideChar is an input size in wide characters (WCHAR) + * cbMultiByte is an output size in bytes (BYTE) + * + * Null-terminated UTF-16 strings: + * + * cbMultiByte *cannot* be assumed to be cchWideChar since UTF-8 is variable-width! + * + * Instead, obtain the required cbMultiByte output size like this: + * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, NULL, 0, NULL, NULL); + * + * A value of -1 for cbMultiByte indicates that the input string is null-terminated, + * and the null terminator *will* be processed. The size returned by WideCharToMultiByte + * will therefore include the null terminator. Equivalent behavior can be obtained by + * computing the length in bytes of the input buffer, including the null terminator: + * + * cchWideChar = _wcslen((WCHAR*) lpWideCharStr) + 1; + * + * An output buffer of the proper size can then be allocated: + * lpMultiByteStr = (LPSTR) malloc(cbMultiByte); + * + * Since cbMultiByte is an output size in bytes, it is the same as the buffer size + * + * Finally, perform the conversion: + * + * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, lpMultiByteStr, + * cbMultiByte, NULL, NULL); + * + * The value returned by WideCharToMultiByte corresponds to the number of bytes written + * to the output buffer, and should match the value obtained on the first call to + * WideCharToMultiByte. + * + */ + +#if !defined(WITH_WINPR_DEPRECATED) +static +#endif + int + WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, + LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, + LPBOOL lpUsedDefaultChar) +{ + return int_WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr, + cbMultiByte, lpDefaultChar, lpUsedDefaultChar); +} + +#endif + +/** + * ConvertToUnicode is a convenience wrapper for MultiByteToWideChar: + * + * If the lpWideCharStr parameter for the converted string points to NULL + * or if the cchWideChar parameter is set to 0 this function will automatically + * allocate the required memory which is guaranteed to be null-terminated + * after the conversion, even if the source c string isn't. + * + * If the cbMultiByte parameter is set to -1 the passed lpMultiByteStr must + * be null-terminated and the required length for the converted string will be + * calculated accordingly. + */ +#if defined(WITH_WINPR_DEPRECATED) +int ConvertToUnicode(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte, + LPWSTR* lpWideCharStr, int cchWideChar) +{ + int status = 0; + BOOL allocate = FALSE; + + if (!lpMultiByteStr) + return 0; + + if (!lpWideCharStr) + return 0; + + if (cbMultiByte == -1) + { + size_t len = strnlen(lpMultiByteStr, INT_MAX); + if (len >= INT_MAX) + return 0; + cbMultiByte = (int)(len + 1); + } + + if (cchWideChar == 0) + { + cchWideChar = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, NULL, 0); + allocate = TRUE; + } + else if (!(*lpWideCharStr)) + allocate = TRUE; + + if (cchWideChar < 1) + return 0; + + if (allocate) + { + *lpWideCharStr = (LPWSTR)calloc(cchWideChar + 1, sizeof(WCHAR)); + + if (!(*lpWideCharStr)) + { + // SetLastError(ERROR_INSUFFICIENT_BUFFER); + return 0; + } + } + + status = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, *lpWideCharStr, + cchWideChar); + + if (status != cchWideChar) + { + if (allocate) + { + free(*lpWideCharStr); + *lpWideCharStr = NULL; + status = 0; + } + } + + return status; +} +#endif + +/** + * ConvertFromUnicode is a convenience wrapper for WideCharToMultiByte: + * + * If the lpMultiByteStr parameter for the converted string points to NULL + * or if the cbMultiByte parameter is set to 0 this function will automatically + * allocate the required memory which is guaranteed to be null-terminated + * after the conversion, even if the source unicode string isn't. + * + * If the cchWideChar parameter is set to -1 the passed lpWideCharStr must + * be null-terminated and the required length for the converted string will be + * calculated accordingly. + */ +#if defined(WITH_WINPR_DEPRECATED) +int ConvertFromUnicode(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, + LPSTR* lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, + LPBOOL lpUsedDefaultChar) +{ + int status = 0; + BOOL allocate = FALSE; + + if (!lpWideCharStr) + return 0; + + if (!lpMultiByteStr) + return 0; + + if (cchWideChar == -1) + cchWideChar = (int)(_wcslen(lpWideCharStr) + 1); + + if (cbMultiByte == 0) + { + cbMultiByte = + WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, NULL, 0, NULL, NULL); + allocate = TRUE; + } + else if (!(*lpMultiByteStr)) + allocate = TRUE; + + if (cbMultiByte < 1) + return 0; + + if (allocate) + { + *lpMultiByteStr = (LPSTR)calloc(1, cbMultiByte + 1); + + if (!(*lpMultiByteStr)) + { + // SetLastError(ERROR_INSUFFICIENT_BUFFER); + return 0; + } + } + + status = WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, *lpMultiByteStr, + cbMultiByte, lpDefaultChar, lpUsedDefaultChar); + + if ((status != cbMultiByte) && allocate) + { + status = 0; + } + + if ((status <= 0) && allocate) + { + free(*lpMultiByteStr); + *lpMultiByteStr = NULL; + } + + return status; +} +#endif + +/** + * Swap Unicode byte order (UTF16LE <-> UTF16BE) + */ + +const WCHAR* ByteSwapUnicode(WCHAR* wstr, size_t length) +{ + WINPR_ASSERT(wstr || (length == 0)); + + for (size_t x = 0; x < length; x++) + wstr[x] = _byteswap_ushort(wstr[x]); + return wstr; +} + +SSIZE_T ConvertWCharToUtf8(const WCHAR* wstr, char* str, size_t len) +{ + if (!wstr) + { + if (str && len) + str[0] = 0; + return 0; + } + + const size_t wlen = _wcslen(wstr); + return ConvertWCharNToUtf8(wstr, wlen + 1, str, len); +} + +SSIZE_T ConvertWCharNToUtf8(const WCHAR* wstr, size_t wlen, char* str, size_t len) +{ + BOOL isNullTerminated = FALSE; + if (wlen == 0) + return 0; + + WINPR_ASSERT(wstr); + size_t iwlen = _wcsnlen(wstr, wlen); + + if (wlen > INT32_MAX) + { + SetLastError(ERROR_INVALID_PARAMETER); + return -1; + } + + if (iwlen < wlen) + { + isNullTerminated = TRUE; + iwlen++; + } + const int rc = WideCharToMultiByte(CP_UTF8, 0, wstr, (int)iwlen, str, (int)MIN(INT32_MAX, len), + NULL, NULL); + if ((rc <= 0) || ((len > 0) && ((size_t)rc > len))) + return -1; + else if (!isNullTerminated) + { + if (str && ((size_t)rc < len)) + str[rc] = '\0'; + return rc; + } + else if ((size_t)rc == len) + { + if (str && (str[rc - 1] != '\0')) + return rc; + } + return rc - 1; +} + +SSIZE_T ConvertMszWCharNToUtf8(const WCHAR* wstr, size_t wlen, char* str, size_t len) +{ + if (wlen == 0) + return 0; + + WINPR_ASSERT(wstr); + + if (wlen > INT32_MAX) + { + SetLastError(ERROR_INVALID_PARAMETER); + return -1; + } + + const int iwlen = MIN(INT32_MAX, len); + const int rc = WideCharToMultiByte(CP_UTF8, 0, wstr, (int)wlen, str, (int)iwlen, NULL, NULL); + if ((rc <= 0) || ((len > 0) && (rc > iwlen))) + return -1; + + return rc; +} + +SSIZE_T ConvertUtf8ToWChar(const char* str, WCHAR* wstr, size_t wlen) +{ + if (!str) + { + if (wstr && wlen) + wstr[0] = 0; + return 0; + } + + const size_t len = strlen(str); + return ConvertUtf8NToWChar(str, len + 1, wstr, wlen); +} + +SSIZE_T ConvertUtf8NToWChar(const char* str, size_t len, WCHAR* wstr, size_t wlen) +{ + size_t ilen = strnlen(str, len); + BOOL isNullTerminated = FALSE; + if (len == 0) + return 0; + + WINPR_ASSERT(str); + + if (len > INT32_MAX) + { + SetLastError(ERROR_INVALID_PARAMETER); + return -1; + } + if (ilen < len) + { + isNullTerminated = TRUE; + ilen++; + } + + const int iwlen = MIN(INT32_MAX, wlen); + const int rc = MultiByteToWideChar(CP_UTF8, 0, str, (int)ilen, wstr, (int)iwlen); + if ((rc <= 0) || ((wlen > 0) && (rc > iwlen))) + return -1; + if (!isNullTerminated) + { + if (wstr && (rc < iwlen)) + wstr[rc] = '\0'; + return rc; + } + else if (rc == iwlen) + { + if (wstr && (wstr[rc - 1] != '\0')) + return rc; + } + return rc - 1; +} + +SSIZE_T ConvertMszUtf8NToWChar(const char* str, size_t len, WCHAR* wstr, size_t wlen) +{ + if (len == 0) + return 0; + + WINPR_ASSERT(str); + + if (len > INT32_MAX) + { + SetLastError(ERROR_INVALID_PARAMETER); + return -1; + } + + const int iwlen = MIN(INT32_MAX, wlen); + const int rc = MultiByteToWideChar(CP_UTF8, 0, str, (int)len, wstr, (int)iwlen); + if ((rc <= 0) || ((wlen > 0) && (rc > iwlen))) + return -1; + + return rc; +} + +char* ConvertWCharToUtf8Alloc(const WCHAR* wstr, size_t* pUtfCharLength) +{ + char* tmp = NULL; + const SSIZE_T rc = ConvertWCharToUtf8(wstr, NULL, 0); + if (pUtfCharLength) + *pUtfCharLength = 0; + if (rc < 0) + return NULL; + tmp = calloc((size_t)rc + 1ull, sizeof(char)); + if (!tmp) + return NULL; + const SSIZE_T rc2 = ConvertWCharToUtf8(wstr, tmp, (size_t)rc + 1ull); + if (rc2 < 0) + { + free(tmp); + return NULL; + } + WINPR_ASSERT(rc == rc2); + if (pUtfCharLength) + *pUtfCharLength = (size_t)rc2; + return tmp; +} + +char* ConvertWCharNToUtf8Alloc(const WCHAR* wstr, size_t wlen, size_t* pUtfCharLength) +{ + char* tmp = NULL; + const SSIZE_T rc = ConvertWCharNToUtf8(wstr, wlen, NULL, 0); + + if (pUtfCharLength) + *pUtfCharLength = 0; + if (rc < 0) + return NULL; + tmp = calloc((size_t)rc + 1ull, sizeof(char)); + if (!tmp) + return NULL; + const SSIZE_T rc2 = ConvertWCharNToUtf8(wstr, wlen, tmp, (size_t)rc + 1ull); + if (rc2 < 0) + { + free(tmp); + return NULL; + } + WINPR_ASSERT(rc == rc2); + if (pUtfCharLength) + *pUtfCharLength = (size_t)rc2; + return tmp; +} + +char* ConvertMszWCharNToUtf8Alloc(const WCHAR* wstr, size_t wlen, size_t* pUtfCharLength) +{ + char* tmp = NULL; + const SSIZE_T rc = ConvertMszWCharNToUtf8(wstr, wlen, NULL, 0); + + if (pUtfCharLength) + *pUtfCharLength = 0; + if (rc < 0) + return NULL; + tmp = calloc((size_t)rc + 1ull, sizeof(char)); + if (!tmp) + return NULL; + const SSIZE_T rc2 = ConvertMszWCharNToUtf8(wstr, wlen, tmp, (size_t)rc + 1ull); + if (rc2 < 0) + { + free(tmp); + return NULL; + } + WINPR_ASSERT(rc == rc2); + if (pUtfCharLength) + *pUtfCharLength = (size_t)rc2; + return tmp; +} + +WCHAR* ConvertUtf8ToWCharAlloc(const char* str, size_t* pSize) +{ + WCHAR* tmp = NULL; + const SSIZE_T rc = ConvertUtf8ToWChar(str, NULL, 0); + if (pSize) + *pSize = 0; + if (rc < 0) + return NULL; + tmp = calloc((size_t)rc + 1ull, sizeof(WCHAR)); + if (!tmp) + return NULL; + const SSIZE_T rc2 = ConvertUtf8ToWChar(str, tmp, (size_t)rc + 1ull); + if (rc2 < 0) + { + free(tmp); + return NULL; + } + WINPR_ASSERT(rc == rc2); + if (pSize) + *pSize = (size_t)rc2; + return tmp; +} + +WCHAR* ConvertUtf8NToWCharAlloc(const char* str, size_t len, size_t* pSize) +{ + WCHAR* tmp = NULL; + const SSIZE_T rc = ConvertUtf8NToWChar(str, len, NULL, 0); + if (pSize) + *pSize = 0; + if (rc < 0) + return NULL; + tmp = calloc((size_t)rc + 1ull, sizeof(WCHAR)); + if (!tmp) + return NULL; + const SSIZE_T rc2 = ConvertUtf8NToWChar(str, len, tmp, (size_t)rc + 1ull); + if (rc2 < 0) + { + free(tmp); + return NULL; + } + WINPR_ASSERT(rc == rc2); + if (pSize) + *pSize = (size_t)rc2; + return tmp; +} + +WCHAR* ConvertMszUtf8NToWCharAlloc(const char* str, size_t len, size_t* pSize) +{ + WCHAR* tmp = NULL; + const SSIZE_T rc = ConvertMszUtf8NToWChar(str, len, NULL, 0); + if (pSize) + *pSize = 0; + if (rc < 0) + return NULL; + tmp = calloc((size_t)rc + 1ull, sizeof(WCHAR)); + if (!tmp) + return NULL; + const SSIZE_T rc2 = ConvertMszUtf8NToWChar(str, len, tmp, (size_t)rc + 1ull); + if (rc2 < 0) + { + free(tmp); + return NULL; + } + WINPR_ASSERT(rc == rc2); + if (pSize) + *pSize = (size_t)rc2; + return tmp; +} |