diff options
Diffstat (limited to 'src/libnetdata/string/utf8.c')
-rw-r--r-- | src/libnetdata/string/utf8.c | 408 |
1 files changed, 408 insertions, 0 deletions
diff --git a/src/libnetdata/string/utf8.c b/src/libnetdata/string/utf8.c new file mode 100644 index 000000000..0b4f138a6 --- /dev/null +++ b/src/libnetdata/string/utf8.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#if defined(OS_WINDOWS) +/* + * Convert any CodePage to UTF16 + * Goals: + * 1. Destination is always NULL terminated + * 2. If the destination buffer is not enough, return as much as possible data (truncate) + * 3. Always return the number of wide characters written, including the null terminator + */ + +size_t any_to_utf16(uint32_t CodePage, wchar_t *dst, size_t dst_size, const char *src, int src_len, bool *truncated) { + if(!src || src_len == 0) { + // invalid input + if(truncated) + *truncated = true; + + if(dst && dst_size) + *dst = L'\0'; + return 0; + } + + if(!dst || !dst_size) { + // the caller wants to know the buffer to allocate for the conversion + + if(truncated) + *truncated = true; + + int required = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0); + if(required <= 0) return 0; // error in the conversion + + // Add 1 for null terminator only if src_len is not -1 + // so that the caller can call us again to get the entire string (not truncated) + return (size_t)required + ((src_len != -1) ? 1 : 0); + } + + // do the conversion directly to the destination buffer + int rc = MultiByteToWideChar(CodePage, 0, src, src_len, dst, (int)dst_size); + if(rc <= 0) { + if(truncated) + *truncated = true; + + // conversion failed, let's see why... + DWORD status = GetLastError(); + if(status == ERROR_INSUFFICIENT_BUFFER) { + // it cannot fit entirely, let's allocate a new buffer to convert it + // and then truncate it to the destination buffer + + // clear errno and LastError to clear the error of the + // MultiByteToWideChar() that failed + errno_clear(); + + // get the required size + int required_size = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0); + + // mallocz() never fails (exits the program on NULL) + wchar_t *tmp = mallocz(required_size * sizeof(wchar_t)); + + // convert it, now it should fit + rc = MultiByteToWideChar(CodePage, 0, src, src_len, tmp, required_size); + if (rc <= 0) { + // it failed! + *dst = L'\0'; + freez(tmp); + return 0; + } + + size_t len = rc; + + // copy as much as we can + memcpy(dst, tmp, MIN(len, (dst_size - 1)) * sizeof(wchar_t)); + + // null terminate it + dst[MIN(len, (dst_size - 1))] = L'\0'; + + // free the temporary buffer + freez(tmp); + + // return the actual bytes written + return MIN(len, dst_size); + } + + // empty the destination + *dst = L'\0'; + return 0; + } + + size_t len = rc; + + if(truncated) + *truncated = false; + + if(len >= dst_size) { + if(dst[dst_size - 1] != L'\0') { + if (truncated) + *truncated = true; + + // Truncate it to fit the null terminator + dst[dst_size - 1] = L'\0'; + } + return dst_size; + } + + if(dst[len - 1] != L'\0') { + // the result is not null terminated + // append the null + dst[len] = L'\0'; + return len + 1; + } + + // the result is already null terminated + return len; +} + +/* + * Convert UTF16 (wide-character string) to UTF8 + * Goals: + * 1. Destination is always NULL terminated + * 2. If the destination buffer is not enough, return as much as possible data (truncate) + * 3. Always return the number of bytes written, including the null terminator + */ + +size_t utf16_to_utf8(char *dst, size_t dst_size, const wchar_t *src, int src_len, bool *truncated) { + if (!src || src_len == 0) { + // invalid input + if(truncated) + *truncated = true; + + if(dst && dst_size) + *dst = '\0'; + + return 0; + } + + if (!dst || dst_size == 0) { + // The caller wants to know the buffer size required for the conversion + + if(truncated) + *truncated = true; + + int required = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL); + if (required <= 0) return 0; // error in the conversion + + // Add 1 for null terminator only if src_len is not -1 + return (size_t)required + ((src_len != -1) ? 1 : 0); + } + + // Perform the conversion directly into the destination buffer + int rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, dst, (int)dst_size, NULL, NULL); + if (rc <= 0) { + if(truncated) + *truncated = true; + + // Conversion failed, let's see why... + DWORD status = GetLastError(); + if (status == ERROR_INSUFFICIENT_BUFFER) { + // It cannot fit entirely, let's allocate a new buffer to convert it + // and then truncate it to the destination buffer + + // Clear errno and LastError to clear the error of the + // WideCharToMultiByte() that failed + errno_clear(); + + // Get the required size + int required_size = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL); + + // mallocz() never fails (exits the program on NULL) + char *tmp = mallocz(required_size * sizeof(char)); + + // Convert it, now it should fit + rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, tmp, required_size, NULL, NULL); + if (rc <= 0) { + // Conversion failed + *dst = '\0'; + freez(tmp); + return 0; + } + + size_t len = rc; + + // Copy as much as we can + memcpy(dst, tmp, MIN(len, (dst_size - 1)) * sizeof(char)); + + // Null-terminate it + dst[MIN(len, (dst_size - 1))] = '\0'; + + // Free the temporary buffer + freez(tmp); + + // Return the actual bytes written + return MIN(len, dst_size); + } + + // Empty the destination + *dst = '\0'; + return 0; + } + + size_t len = rc; + + if(truncated) + *truncated = false; + + if (len >= dst_size) { + if(dst[dst_size - 1] != '\0') { + if (truncated) + *truncated = true; + + // Truncate it to fit the null terminator + dst[dst_size - 1] = '\0'; + } + return dst_size; + } + + if (dst[len - 1] != '\0') { + // The result is not null-terminated + // Append the null terminator + dst[len] = '\0'; + return len + 1; + } + + // The result is already null-terminated + return len; +} + +// -------------------------------------------------------------------------------------------------------------------- + +size_t txt_compute_new_size(size_t old_size, size_t required_size) { + size_t size = (required_size % 2048 == 0) ? required_size : required_size + 2048; + size = (size / 2048) * 2048; + + if(size < old_size * 2) + size = old_size * 2; + + return size; +} + +// -------------------------------------------------------------------------------------------------------------------- +// TXT_UTF8 + +void txt_utf8_cleanup(TXT_UTF8 *dst) { + freez(dst->data); + dst->data = NULL; + dst->used = 0; +} + +void txt_utf8_resize(TXT_UTF8 *dst, size_t required_size, bool keep) { + if(required_size <= dst->size) + return; + + size_t new_size = txt_compute_new_size(dst->size, required_size); + + if(keep && dst->data) + dst->data = reallocz(dst->data, new_size); + else { + txt_utf8_cleanup(dst); + dst->data = mallocz(new_size); + dst->used = 0; + } + + dst->size = new_size; +} + +void txt_utf8_empty(TXT_UTF8 *dst) { + txt_utf8_resize(dst, 1, false); + dst->data[0] = '\0'; + dst->used = 1; +} + +void txt_utf8_set(TXT_UTF8 *dst, const char *txt, size_t txt_len) { + txt_utf8_resize(dst, txt_len + 1, false); + memcpy(dst->data, txt, txt_len); + dst->used = txt_len + 1; + dst->data[dst->used - 1] = '\0'; +} + +void txt_utf8_append(TXT_UTF8 *dst, const char *txt, size_t txt_len) { + if(dst->used <= 1) { + // the destination is empty + txt_utf8_set(dst, txt, txt_len); + } + else { + // there is something already in the buffer + txt_utf8_resize(dst, dst->used + txt_len, true); + memcpy(&dst->data[dst->used - 1], txt, txt_len); + dst->used += txt_len; // the null was already counted + dst->data[dst->used - 1] = '\0'; + } +} + +// -------------------------------------------------------------------------------------------------------------------- +// TXT_UTF16 + +void txt_utf16_cleanup(TXT_UTF16 *dst) { + freez(dst->data); +} + +void txt_utf16_resize(TXT_UTF16 *dst, size_t required_size, bool keep) { + if(required_size <= dst->size) + return; + + size_t new_size = txt_compute_new_size(dst->size, required_size); + + if (keep && dst->data) { + dst->data = reallocz(dst->data, new_size * sizeof(wchar_t)); + } else { + txt_utf16_cleanup(dst); + dst->data = mallocz(new_size * sizeof(wchar_t)); + dst->used = 0; + } + + dst->size = new_size; +} + +void txt_utf16_set(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) { + txt_utf16_resize(dst, dst->used + txt_len + 1, true); + memcpy(dst->data, txt, txt_len * sizeof(wchar_t)); + dst->used = txt_len + 1; + dst->data[dst->used - 1] = '\0'; +} + +void txt_utf16_append(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) { + if(dst->used <= 1) { + // the destination is empty + txt_utf16_set(dst, txt, txt_len); + } + else { + // there is something already in the buffer + txt_utf16_resize(dst, dst->used + txt_len, true); + memcpy(&dst->data[dst->used - 1], txt, txt_len * sizeof(wchar_t)); + dst->used += txt_len; // the null was already counted + dst->data[dst->used - 1] = '\0'; + } +} + +// -------------------------------------------------------------------------------------------------------------------- + +bool wchar_to_txt_utf8(TXT_UTF8 *dst, const wchar_t *src, int src_len) { + if(!src || !src_len) { + txt_utf8_empty(dst); + return false; + } + + if(!dst->data && !dst->size) { + size_t size = utf16_to_utf8(NULL, 0, src, src_len, NULL); + if(!size) { + txt_utf8_empty(dst); + return false; + } + + // we +1 here to avoid entering the next condition below + txt_utf8_resize(dst, size, false); + } + + bool truncated = false; + dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, &truncated); + if(truncated) { + // we need to resize + size_t needed = utf16_to_utf8(NULL, 0, src, src_len, NULL); // find the size needed + if(!needed) { + txt_utf8_empty(dst); + return false; + } + + txt_utf8_resize(dst, needed, false); + dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, NULL); + } + + // Make sure it is not zero padded at the end + while(dst->used >= 2 && dst->data[dst->used - 2] == 0) + dst->used--; + + internal_fatal(strlen(dst->data) + 1 != dst->used, + "Wrong UTF8 string length"); + + return true; +} + +bool txt_utf16_to_utf8(TXT_UTF8 *utf8, TXT_UTF16 *utf16) { + fatal_assert(utf8 && ((utf8->data && utf8->size) || (!utf8->data && !utf8->size))); + fatal_assert(utf16 && ((utf16->data && utf16->size) || (!utf16->data && !utf16->size))); + + // pass the entire utf16 size, including the null terminator + // so that the resulting utf8 message will be null terminated too. + return wchar_to_txt_utf8(utf8, utf16->data, (int)utf16->used - 1); +} + +char *utf16_to_utf8_strdupz(const wchar_t *src, size_t *dst_len) { + size_t size = utf16_to_utf8(NULL, 0, src, -1, NULL); + if (size) { + char *dst = mallocz(size); + + size = utf16_to_utf8(dst, size, src, -1, NULL); + if(dst_len) + *dst_len = size - 1; + + return dst; + } + + if(dst_len) + *dst_len = 0; + + return NULL; +} + +#endif |