summaryrefslogtreecommitdiffstats
path: root/src/libnetdata/string/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libnetdata/string/utf8.c')
-rw-r--r--src/libnetdata/string/utf8.c408
1 files changed, 408 insertions, 0 deletions
diff --git a/src/libnetdata/string/utf8.c b/src/libnetdata/string/utf8.c
new file mode 100644
index 000000000..0b4f138a6
--- /dev/null
+++ b/src/libnetdata/string/utf8.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "../libnetdata.h"
+
+#if defined(OS_WINDOWS)
+/*
+ * Convert a string from any Windows code page to UTF-16.
+ *
+ * Goals:
+ *  1. The destination is always null terminated (when a destination is given).
+ *  2. If the destination buffer is too small, as much data as fits is returned (truncated).
+ *  3. The return value is the number of wide characters written, including the null terminator.
+ *
+ * Parameters:
+ *  CodePage  - code page of src, passed unchanged to MultiByteToWideChar().
+ *  dst       - destination buffer; NULL (or dst_size == 0) to query the size to allocate.
+ *  dst_size  - capacity of dst, in wide characters.
+ *  src       - source string.
+ *  src_len   - length of src in bytes, or -1 when src is null terminated.
+ *  truncated - optional; set to false only when the whole conversion fit in dst,
+ *              set to true in every other case (invalid input, size query, error, truncation).
+ *
+ * Returns:
+ *  - 0 on invalid input or conversion error (dst, when given, becomes an empty string);
+ *  - in size-query mode, the number of wide characters the caller should allocate;
+ *  - otherwise the number of wide characters stored in dst, including the null terminator.
+ *
+ * Note: truncation cuts at an arbitrary UTF-16 code unit, so a surrogate pair may be split.
+ */
+
+size_t any_to_utf16(uint32_t CodePage, wchar_t *dst, size_t dst_size, const char *src, int src_len, bool *truncated) {
+    if(!src || src_len == 0) {
+        // invalid input
+        if(truncated)
+            *truncated = true;
+
+        if(dst && dst_size)
+            *dst = L'\0';
+
+        return 0;
+    }
+
+    if(!dst || !dst_size) {
+        // the caller wants to know the buffer to allocate for the conversion
+
+        if(truncated)
+            *truncated = true;
+
+        int required = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0);
+        if(required <= 0)
+            return 0; // error in the conversion
+
+        // Add 1 for the null terminator only if src_len is not -1
+        // (with -1 the terminator is already part of the converted string),
+        // so that the caller can call us again to get the entire string (not truncated)
+        return (size_t)required + ((src_len != -1) ? 1 : 0);
+    }
+
+    // do the conversion directly to the destination buffer
+    int rc = MultiByteToWideChar(CodePage, 0, src, src_len, dst, (int)dst_size);
+    if(rc <= 0) {
+        if(truncated)
+            *truncated = true;
+
+        // conversion failed, let's see why...
+        DWORD status = GetLastError();
+        if(status != ERROR_INSUFFICIENT_BUFFER) {
+            // a real conversion error: empty the destination
+            *dst = L'\0';
+            return 0;
+        }
+
+        // it cannot fit entirely, let's allocate a new buffer to convert it
+        // and then truncate it to the destination buffer
+
+        // clear errno and LastError to clear the error of the
+        // MultiByteToWideChar() that failed
+        errno_clear();
+
+        // get the required size
+        int required_size = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0);
+        if(required_size <= 0) {
+            // the size query failed; never feed a zero/negative size to the allocator
+            *dst = L'\0';
+            return 0;
+        }
+
+        // mallocz() never fails (exits the program on NULL)
+        wchar_t *tmp = mallocz((size_t)required_size * sizeof(wchar_t));
+
+        // convert it, now it should fit
+        rc = MultiByteToWideChar(CodePage, 0, src, src_len, tmp, required_size);
+        if(rc <= 0) {
+            // it failed!
+            *dst = L'\0';
+            freez(tmp);
+            return 0;
+        }
+
+        size_t len = rc;
+
+        // copy as much as we can, leaving room for the null terminator
+        size_t copied = MIN(len, (dst_size - 1));
+        memcpy(dst, tmp, copied * sizeof(wchar_t));
+
+        // null terminate it
+        dst[copied] = L'\0';
+
+        // free the temporary buffer
+        freez(tmp);
+
+        // return the actual wide characters written
+        return MIN(len, dst_size);
+    }
+
+    size_t len = rc;
+
+    if(truncated)
+        *truncated = false;
+
+    if(len >= dst_size) {
+        // the conversion filled the whole buffer
+        if(dst[dst_size - 1] != L'\0') {
+            if(truncated)
+                *truncated = true;
+
+            // Truncate it to fit the null terminator
+            dst[dst_size - 1] = L'\0';
+        }
+
+        return dst_size;
+    }
+
+    if(dst[len - 1] != L'\0') {
+        // the result is not null terminated
+        // append the null
+        dst[len] = L'\0';
+        return len + 1;
+    }
+
+    // the result is already null terminated
+    return len;
+}
+
+/*
+ * Convert UTF-16 (wide-character string) to UTF-8.
+ *
+ * Goals:
+ *  1. The destination is always null terminated (when a destination is given).
+ *  2. If the destination buffer is too small, as much data as fits is returned (truncated).
+ *  3. The return value is the number of bytes written, including the null terminator.
+ *
+ * Parameters:
+ *  dst       - destination buffer; NULL (or dst_size == 0) to query the size to allocate.
+ *  dst_size  - capacity of dst, in bytes.
+ *  src       - source wide-character string.
+ *  src_len   - length of src in wide characters, or -1 when src is null terminated.
+ *  truncated - optional; set to false only when the whole conversion fit in dst,
+ *              set to true in every other case (invalid input, size query, error, truncation).
+ *
+ * Returns:
+ *  - 0 on invalid input or conversion error (dst, when given, becomes an empty string);
+ *  - in size-query mode, the number of bytes the caller should allocate;
+ *  - otherwise the number of bytes stored in dst, including the null terminator.
+ *
+ * Note: truncation cuts at an arbitrary byte, so a multi-byte UTF-8 sequence may be split.
+ */
+
+size_t utf16_to_utf8(char *dst, size_t dst_size, const wchar_t *src, int src_len, bool *truncated) {
+    if(!src || src_len == 0) {
+        // invalid input
+        if(truncated)
+            *truncated = true;
+
+        if(dst && dst_size)
+            *dst = '\0';
+
+        return 0;
+    }
+
+    if(!dst || dst_size == 0) {
+        // The caller wants to know the buffer size required for the conversion
+
+        if(truncated)
+            *truncated = true;
+
+        int required = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+        if(required <= 0)
+            return 0; // error in the conversion
+
+        // Add 1 for the null terminator only if src_len is not -1
+        // (with -1 the terminator is already part of the converted string)
+        return (size_t)required + ((src_len != -1) ? 1 : 0);
+    }
+
+    // Perform the conversion directly into the destination buffer
+    int rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, dst, (int)dst_size, NULL, NULL);
+    if(rc <= 0) {
+        if(truncated)
+            *truncated = true;
+
+        // Conversion failed, let's see why...
+        DWORD status = GetLastError();
+        if(status != ERROR_INSUFFICIENT_BUFFER) {
+            // a real conversion error: empty the destination
+            *dst = '\0';
+            return 0;
+        }
+
+        // It cannot fit entirely, let's allocate a new buffer to convert it
+        // and then truncate it to the destination buffer
+
+        // Clear errno and LastError to clear the error of the
+        // WideCharToMultiByte() that failed
+        errno_clear();
+
+        // Get the required size
+        int required_size = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+        if(required_size <= 0) {
+            // the size query failed; never feed a zero/negative size to the allocator
+            *dst = '\0';
+            return 0;
+        }
+
+        // mallocz() never fails (exits the program on NULL)
+        char *tmp = mallocz((size_t)required_size * sizeof(char));
+
+        // Convert it, now it should fit
+        rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, tmp, required_size, NULL, NULL);
+        if(rc <= 0) {
+            // Conversion failed
+            *dst = '\0';
+            freez(tmp);
+            return 0;
+        }
+
+        size_t len = rc;
+
+        // Copy as much as we can, leaving room for the null terminator
+        size_t copied = MIN(len, (dst_size - 1));
+        memcpy(dst, tmp, copied * sizeof(char));
+
+        // Null-terminate it
+        dst[copied] = '\0';
+
+        // Free the temporary buffer
+        freez(tmp);
+
+        // Return the actual bytes written
+        return MIN(len, dst_size);
+    }
+
+    size_t len = rc;
+
+    if(truncated)
+        *truncated = false;
+
+    if(len >= dst_size) {
+        // the conversion filled the whole buffer
+        if(dst[dst_size - 1] != '\0') {
+            if(truncated)
+                *truncated = true;
+
+            // Truncate it to fit the null terminator
+            dst[dst_size - 1] = '\0';
+        }
+
+        return dst_size;
+    }
+
+    if(dst[len - 1] != '\0') {
+        // The result is not null-terminated
+        // Append the null terminator
+        dst[len] = '\0';
+        return len + 1;
+    }
+
+    // The result is already null-terminated
+    return len;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// Compute the size to allocate for a text buffer that must hold required_size units.
+// The result is required_size rounded up to a multiple of 2048, but never less than
+// twice the old_size, so that repeated growth does not reallocate too often.
+size_t txt_compute_new_size(size_t old_size, size_t required_size) {
+    const size_t step = 2048;
+
+    // round up to the next multiple of step (exact multiples are kept as they are)
+    size_t rounded = required_size;
+    if(rounded % step != 0)
+        rounded = ((rounded + step) / step) * step;
+
+    // grow at least geometrically
+    size_t doubled = old_size * 2;
+
+    return (rounded < doubled) ? doubled : rounded;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF8
+
+// Free the buffer of a TXT_UTF8 and reset it to the unallocated state.
+// size is reset together with data: txt_utf8_resize() returns early when the
+// requested size is <= dst->size, so a stale size with a NULL data pointer
+// would make later writers dereference NULL.
+void txt_utf8_cleanup(TXT_UTF8 *dst) {
+    freez(dst->data);
+    dst->data = NULL;
+    dst->used = 0;
+    dst->size = 0;
+}
+
+// Make sure dst can hold at least required_size bytes.
+// Nothing happens when the buffer is already big enough. Otherwise the buffer grows
+// to the size given by txt_compute_new_size(); the existing content is preserved only
+// when keep is true and a buffer already exists, else a fresh buffer is allocated
+// and used is reset to 0.
+void txt_utf8_resize(TXT_UTF8 *dst, size_t required_size, bool keep) {
+    if(dst->size >= required_size)
+        return;
+
+    size_t grown = txt_compute_new_size(dst->size, required_size);
+
+    if(!keep || !dst->data) {
+        // the old content is not needed: drop it and start over
+        txt_utf8_cleanup(dst);
+        dst->data = mallocz(grown);
+        dst->used = 0;
+    }
+    else
+        dst->data = reallocz(dst->data, grown);
+
+    dst->size = grown;
+}
+
+// Set dst to the empty string: a single null terminator, so used becomes 1.
+void txt_utf8_empty(TXT_UTF8 *dst) {
+    // make sure there is room for the terminator
+    txt_utf8_resize(dst, 1, false);
+
+    dst->used = 1;
+    *dst->data = '\0';
+}
+
+// Replace the content of dst with the first txt_len bytes of txt,
+// followed by a null terminator. used counts the terminator.
+void txt_utf8_set(TXT_UTF8 *dst, const char *txt, size_t txt_len) {
+    const size_t with_null = txt_len + 1;
+
+    // the previous content is not needed
+    txt_utf8_resize(dst, with_null, false);
+
+    memcpy(dst->data, txt, txt_len);
+    dst->data[txt_len] = '\0';
+    dst->used = with_null;
+}
+
+// Append the first txt_len bytes of txt to dst, keeping it null terminated.
+void txt_utf8_append(TXT_UTF8 *dst, const char *txt, size_t txt_len) {
+    if(dst->used <= 1) {
+        // nothing but (at most) a terminator in the buffer: appending is the same as setting
+        txt_utf8_set(dst, txt, txt_len);
+        return;
+    }
+
+    // used already counts the terminator, so used + txt_len is the total needed
+    txt_utf8_resize(dst, dst->used + txt_len, true);
+
+    // write over the existing terminator
+    char *tail = &dst->data[dst->used - 1];
+    memcpy(tail, txt, txt_len);
+    tail[txt_len] = '\0';
+
+    dst->used += txt_len;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF16
+
+// Free the buffer of a TXT_UTF16 and reset it to the unallocated state,
+// so that no dangling pointer is left behind and a repeated cleanup is harmless
+// (same contract as txt_utf8_cleanup()).
+void txt_utf16_cleanup(TXT_UTF16 *dst) {
+    freez(dst->data);
+    dst->data = NULL;
+    dst->used = 0;
+    dst->size = 0;
+}
+
+// Make sure dst can hold at least required_size wide characters.
+// Nothing happens when the buffer is already big enough. Otherwise the buffer grows
+// to the size given by txt_compute_new_size(); the existing content is preserved only
+// when keep is true and a buffer already exists, else a fresh buffer is allocated
+// and used is reset to 0. Sizes are in wide characters, allocations in bytes.
+void txt_utf16_resize(TXT_UTF16 *dst, size_t required_size, bool keep) {
+    if(dst->size >= required_size)
+        return;
+
+    size_t grown = txt_compute_new_size(dst->size, required_size);
+    size_t grown_bytes = grown * sizeof(wchar_t);
+
+    if(!keep || !dst->data) {
+        // the old content is not needed: drop it and start over
+        txt_utf16_cleanup(dst);
+        dst->data = mallocz(grown_bytes);
+        dst->used = 0;
+    }
+    else
+        dst->data = reallocz(dst->data, grown_bytes);
+
+    dst->size = grown;
+}
+
+// Replace the content of dst with the first txt_len wide characters of txt,
+// followed by a null terminator. used counts the terminator.
+void txt_utf16_set(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) {
+    // The previous content is overwritten, so only txt_len + 1 wide characters are
+    // needed and there is no reason to preserve (copy) the old data on growth.
+    txt_utf16_resize(dst, txt_len + 1, false);
+
+    memcpy(dst->data, txt, txt_len * sizeof(wchar_t));
+    dst->data[txt_len] = L'\0';
+    dst->used = txt_len + 1;
+}
+
+// Append the first txt_len wide characters of txt to dst, keeping it null terminated.
+void txt_utf16_append(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) {
+    if(dst->used <= 1) {
+        // nothing but (at most) a terminator in the buffer: appending is the same as setting
+        txt_utf16_set(dst, txt, txt_len);
+        return;
+    }
+
+    // used already counts the terminator, so used + txt_len is the total needed
+    txt_utf16_resize(dst, dst->used + txt_len, true);
+
+    // write over the existing terminator
+    wchar_t *tail = &dst->data[dst->used - 1];
+    memcpy(tail, txt, txt_len * sizeof(wchar_t));
+    tail[txt_len] = '\0';
+
+    dst->used += txt_len;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// Convert a UTF-16 string into the growable UTF-8 buffer dst.
+//
+// src_len is the number of wide characters of src to convert, or -1 when src is
+// null terminated. The buffer of dst is allocated or grown as needed.
+//
+// Returns true when dst holds the converted, null terminated string (used includes
+// the terminator). Returns false, with dst set to the empty string, when src is
+// NULL, src_len is 0, or the conversion fails.
+bool wchar_to_txt_utf8(TXT_UTF8 *dst, const wchar_t *src, int src_len) {
+    bool truncated = false;
+
+    if(!src || !src_len)
+        goto failed;
+
+    if(!dst->data && !dst->size) {
+        // nothing allocated yet: size the buffer so that the conversion fits at once
+        size_t size = utf16_to_utf8(NULL, 0, src, src_len, NULL);
+        if(!size)
+            goto failed;
+
+        txt_utf8_resize(dst, size, false);
+    }
+
+    dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, &truncated);
+    if(truncated) {
+        // the existing buffer was too small (or the conversion failed):
+        // find the size needed, resize and convert again
+        size_t needed = utf16_to_utf8(NULL, 0, src, src_len, NULL);
+        if(!needed)
+            goto failed;
+
+        txt_utf8_resize(dst, needed, false);
+        dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, NULL);
+
+        // utf16_to_utf8() returns 0 on failure; do not report success with used == 0
+        if(!dst->used)
+            goto failed;
+    }
+
+    // Make sure it is not zero padded at the end
+    while(dst->used >= 2 && dst->data[dst->used - 2] == 0)
+        dst->used--;
+
+    internal_fatal(strlen(dst->data) + 1 != dst->used,
+                   "Wrong UTF8 string length");
+
+    return true;
+
+failed:
+    txt_utf8_empty(dst);
+    return false;
+}
+
+// Convert the content of a TXT_UTF16 buffer into a TXT_UTF8 buffer.
+// Both buffers must be consistent: either allocated (data and size set) or
+// completely unallocated (no data, zero size).
+// Returns the result of wchar_to_txt_utf8(): false (with utf8 emptied) when
+// there is nothing to convert or the conversion fails.
+bool txt_utf16_to_utf8(TXT_UTF8 *utf8, TXT_UTF16 *utf16) {
+    fatal_assert(utf8 && ((utf8->data && utf8->size) || (!utf8->data && !utf8->size)));
+    fatal_assert(utf16 && ((utf16->data && utf16->size) || (!utf16->data && !utf16->size)));
+
+    // used counts the null terminator, so pass only the characters before it;
+    // the converter appends the terminator of the utf8 result by itself.
+    // When used is 0 the length must be 0 (nothing to convert) and not -1,
+    // because -1 means "null terminated" and would scan a buffer that is not.
+    int src_len = (utf16->used > 0) ? (int)utf16->used - 1 : 0;
+
+    return wchar_to_txt_utf8(utf8, utf16->data, src_len);
+}
+
+// Convert a null terminated UTF-16 string to a newly allocated UTF-8 string.
+//
+// Returns the allocated string (to be released with freez()), or NULL when src
+// cannot be converted. When dst_len is given, it receives the length of the
+// result in bytes, without the null terminator (0 on failure).
+char *utf16_to_utf8_strdupz(const wchar_t *src, size_t *dst_len) {
+    char *dst = NULL;
+    size_t len = 0;
+
+    // first pass: find the size to allocate (includes the null terminator)
+    size_t size = utf16_to_utf8(NULL, 0, src, -1, NULL);
+    if(size) {
+        dst = mallocz(size);
+
+        // second pass: the actual conversion
+        size = utf16_to_utf8(dst, size, src, -1, NULL);
+        if(size)
+            len = size - 1;
+        else {
+            // the conversion failed; do not return a buffer with a bogus length
+            freez(dst);
+            dst = NULL;
+        }
+    }
+
+    if(dst_len)
+        *dst_len = len;
+
+    return dst;
+}
+
+#endif