5 files changed, 519 insertions, 15 deletions
diff --git a/src/libnetdata/string/README.md b/src/libnetdata/string/README.md
index 54c905946..c23160233 100644
--- a/src/libnetdata/string/README.md
+++ b/src/libnetdata/string/README.md
@@ -1,12 +1,3 @@
-<!--
-title: "String"
-custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/string/README.md
-sidebar_label: "String"
-learn_status: "Published"
-learn_topic_type: "Tasks"
-learn_rel_path: "Developers/libnetdata"
--->
-
 # STRING
 
 STRING provides a way to allocate and free text strings, while de-duplicating them.
diff --git a/src/libnetdata/string/string.c b/src/libnetdata/string/string.c
index 257a3cc4b..107c7eea5 100644
--- a/src/libnetdata/string/string.c
+++ b/src/libnetdata/string/string.c
@@ -347,16 +347,34 @@ void string_freez(STRING *string) {
     string_stats_atomic_increment(partition, releases);
 }
 
-inline size_t string_strlen(STRING *string) {
+inline size_t string_strlen(const STRING *string) {
     if(unlikely(!string)) return 0;
     return string->length - 1;
 }
 
-inline const char *string2str(STRING *string) {
+inline const char *string2str(const STRING *string) {
     if(unlikely(!string)) return "";
     return string->str;
 }
 
+// Returns true when 'whole' finishes with the text of 'end'.
+// Identical pointers (including two NULLs) always match; a single NULL never does.
+bool string_ends_with_string(const STRING *whole, const STRING *end) {
+    if(whole == end)
+        return true;
+
+    if(!whole || !end || end->length > whole->length)
+        return false;
+
+    const char *haystack = string2str(whole);
+    const char *needle = string2str(end);
+
+    if(end->length == whole->length)
+        return strcmp(haystack, needle) == 0;
+
+    // skip the leading part of 'whole', so that only its last strlen(end) characters are compared
+    size_t needle_len = string_strlen(end);
+    const char *tail = haystack + (string_strlen(whole) - needle_len);
+    return strncmp(tail, needle, needle_len) == 0;
+}
+
+// Returns true when 'whole' begins with the text of 'end'
+// (despite its name, 'end' is the prefix being looked for here).
+// Identical pointers (including two NULLs) always match; a single NULL never does.
+bool string_starts_with_string(const STRING *whole, const STRING *end) {
+    if(whole == end)
+        return true;
+
+    if(!whole || !end || end->length > whole->length)
+        return false;
+
+    const char *text = string2str(whole);
+    const char *prefix = string2str(end);
+
+    if(end->length == whole->length)
+        return strcmp(text, prefix) == 0;
+
+    return strncmp(text, prefix, string_strlen(end)) == 0;
+}
+
 STRING *string_2way_merge(STRING *a, STRING *b) {
     static STRING *X = NULL;
 
diff --git a/src/libnetdata/string/string.h b/src/libnetdata/string/string.h
index c44696be2..e86ac6fb5 100644
--- a/src/libnetdata/string/string.h
+++ b/src/libnetdata/string/string.h
@@ -14,8 +14,10 @@ STRING *string_strndupz(const char *str, size_t len);
 
 STRING *string_dup(STRING *string);
 void string_freez(STRING *string);
-size_t string_strlen(STRING *string);
-const char *string2str(STRING *string) NEVERNULL;
+size_t string_strlen(const STRING *string);
+const char *string2str(const STRING *string) NEVERNULL;
+bool string_ends_with_string(const STRING *whole, const STRING *end);
+bool string_starts_with_string(const STRING *whole, const STRING *end);
 
 // keep common prefix/suffix and replace everything else with [x]
 STRING *string_2way_merge(STRING *a, STRING *b);
@@ -30,10 +32,21 @@ static inline int string_strcmp(STRING *string, const char *s) {
     return strcmp(string2str(string), s);
 }
 
+// Compare at most n bytes of the STRING's text against the C string s, with strncmp() semantics.
+// A NULL STRING is compared as the empty string, since string2str() maps NULL to "".
+// The STRING is only read, so it is accepted as const (string2str() takes a const STRING *).
+static inline int string_strncmp(const STRING *string, const char *s, size_t n) {
+    return strncmp(string2str(string), s, n);
+}
+
 void string_statistics(size_t *inserts, size_t *deletes, size_t *searches, size_t *entries, size_t *references, size_t *memory, size_t *duplications, size_t *releases);
 
 int string_unittest(size_t entries);
 
 void string_init(void);
 
+// Handler referenced by the CLEAN_STRING macro through _cleanup_():
+// releases the STRING that stringpp points to.
+static inline void cleanup_string_pp(STRING **stringpp) {
+    if(!stringpp)
+        return;
+
+    string_freez(*stringpp);
+}
+
+#define CLEAN_STRING _cleanup_(cleanup_string_pp) STRING
+
 #endif
diff --git a/src/libnetdata/string/utf8.c b/src/libnetdata/string/utf8.c
new file mode 100644
index 000000000..0b4f138a6
--- /dev/null
+++ b/src/libnetdata/string/utf8.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "../libnetdata.h"
+
+#if defined(OS_WINDOWS)
+/*
+ * Convert any CodePage to UTF16
+ * Goals:
+ *   1. Destination is always NULL terminated
+ *   2. If the destination buffer is not enough, return as much as possible data (truncate)
+ *   3. Always return the number of wide characters written, including the null terminator
+ */
+
+// Parameters:
+//   CodePage  - the code page of src, handed to MultiByteToWideChar() unchanged
+//   dst       - destination buffer; NULL (or dst_size == 0) turns the call into a size query
+//   dst_size  - capacity of dst, in wide characters
+//   src       - the text to convert; NULL is treated as invalid input
+//   src_len   - length of src as expected by MultiByteToWideChar() (-1 for null terminated input);
+//               0 is treated as invalid input
+//   truncated - optional; set to false only when the whole result fitted in dst,
+//               set to true on invalid input, on size queries, on failures and on truncation
+// Returns the number of wide characters stored in dst (or required, for size queries), 0 on failure.
+size_t any_to_utf16(uint32_t CodePage, wchar_t *dst, size_t dst_size, const char *src, int src_len, bool *truncated) {
+    if(!src || src_len == 0) {
+        // invalid input: report it as truncated and hand back an empty string, when there is a buffer
+        if(truncated)
+            *truncated = true;
+
+        if(dst && dst_size)
+            *dst = L'\0';
+        return 0;
+    }
+
+    if(!dst || !dst_size) {
+        // the caller wants to know the buffer to allocate for the conversion
+
+        if(truncated)
+            *truncated = true;
+
+        int required = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0);
+        if(required <= 0) return 0; // error in the conversion
+
+        // Add 1 for null terminator only if src_len is not -1
+        // so that the caller can call us again to get the entire string (not truncated)
+        return (size_t)required + ((src_len != -1) ? 1 : 0);
+    }
+
+    // do the conversion directly to the destination buffer
+    // NOTE(review): dst_size is narrowed to int here - confirm callers never pass more than INT_MAX
+    int rc = MultiByteToWideChar(CodePage, 0, src, src_len, dst, (int)dst_size);
+    if(rc <= 0) {
+        if(truncated)
+            *truncated = true;
+
+        // conversion failed, let's see why...
+        // (the error code has to be read before any other call can overwrite it)
+        DWORD status = GetLastError();
+        if(status == ERROR_INSUFFICIENT_BUFFER) {
+            // it cannot fit entirely, let's allocate a new buffer to convert it
+            // and then truncate it to the destination buffer
+
+            // clear errno and LastError to clear the error of the
+            // MultiByteToWideChar() that failed
+            errno_clear();
+
+            // get the required size
+            // NOTE(review): required_size is used without checking for <= 0 - confirm this cannot fail here
+            int required_size = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0);
+
+            // mallocz() never fails (exits the program on NULL)
+            wchar_t *tmp = mallocz(required_size * sizeof(wchar_t));
+
+            // convert it, now it should fit
+            rc = MultiByteToWideChar(CodePage, 0, src, src_len, tmp, required_size);
+            if (rc <= 0) {
+                // it failed!
+                *dst = L'\0';
+                freez(tmp);
+                return 0;
+            }
+
+            size_t len = rc;
+
+            // copy as much as we can, leaving room for the null terminator
+            memcpy(dst, tmp, MIN(len, (dst_size - 1)) * sizeof(wchar_t));
+
+            // null terminate it
+            dst[MIN(len, (dst_size - 1))] = L'\0';
+
+            // free the temporary buffer
+            freez(tmp);
+
+            // return the number of wide characters stored in dst (capped at dst_size)
+            return MIN(len, dst_size);
+        }
+
+        // any other failure: empty the destination
+        *dst = L'\0';
+        return 0;
+    }
+
+    size_t len = rc;
+
+    if(truncated)
+        *truncated = false;
+
+    if(len >= dst_size) {
+        // the conversion filled the whole buffer; the last slot must hold the null terminator
+        if(dst[dst_size - 1] != L'\0') {
+            if (truncated)
+                *truncated = true;
+
+            // Truncate it to fit the null terminator
+            dst[dst_size - 1] = L'\0';
+        }
+        return dst_size;
+    }
+
+    if(dst[len - 1] != L'\0') {
+        // the result is not null terminated
+        // append the null (there is room for it, since len < dst_size)
+        dst[len] = L'\0';
+        return len + 1;
+    }
+
+    // the result is already null terminated
+    return len;
+}
+
+/*
+ * Convert UTF16 (wide-character string) to UTF8
+ * Goals:
+ *   1. Destination is always NULL terminated
+ *   2. If the destination buffer is not enough, return as much as possible data (truncate)
+ *   3. Always return the number of bytes written, including the null terminator
+ */
+
+// Parameters:
+//   dst       - destination buffer; NULL (or dst_size == 0) turns the call into a size query
+//   dst_size  - capacity of dst, in bytes
+//   src       - the wide-character text to convert; NULL is treated as invalid input
+//   src_len   - length of src as expected by WideCharToMultiByte() (-1 for null terminated input);
+//               0 is treated as invalid input
+//   truncated - optional; set to false only when the whole result fitted in dst,
+//               set to true on invalid input, on size queries, on failures and on truncation
+// Returns the number of bytes stored in dst (or required, for size queries), 0 on failure.
+size_t utf16_to_utf8(char *dst, size_t dst_size, const wchar_t *src, int src_len, bool *truncated) {
+    if (!src || src_len == 0) {
+        // invalid input: report it as truncated and hand back an empty string, when there is a buffer
+        if(truncated)
+            *truncated = true;
+
+        if(dst && dst_size)
+            *dst = '\0';
+
+        return 0;
+    }
+
+    if (!dst || dst_size == 0) {
+        // The caller wants to know the buffer size required for the conversion
+
+        if(truncated)
+            *truncated = true;
+
+        int required = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+        if (required <= 0) return 0; // error in the conversion
+
+        // Add 1 for null terminator only if src_len is not -1
+        return (size_t)required + ((src_len != -1) ? 1 : 0);
+    }
+
+    // Perform the conversion directly into the destination buffer
+    // NOTE(review): dst_size is narrowed to int here - confirm callers never pass more than INT_MAX
+    int rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, dst, (int)dst_size, NULL, NULL);
+    if (rc <= 0) {
+        if(truncated)
+            *truncated = true;
+
+        // Conversion failed, let's see why...
+        // (the error code has to be read before any other call can overwrite it)
+        DWORD status = GetLastError();
+        if (status == ERROR_INSUFFICIENT_BUFFER) {
+            // It cannot fit entirely, let's allocate a new buffer to convert it
+            // and then truncate it to the destination buffer
+
+            // Clear errno and LastError to clear the error of the
+            // WideCharToMultiByte() that failed
+            errno_clear();
+
+            // Get the required size
+            // NOTE(review): required_size is used without checking for <= 0 - confirm this cannot fail here
+            int required_size = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+
+            // mallocz() never fails (exits the program on NULL)
+            char *tmp = mallocz(required_size * sizeof(char));
+
+            // Convert it, now it should fit
+            rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, tmp, required_size, NULL, NULL);
+            if (rc <= 0) {
+                // Conversion failed
+                *dst = '\0';
+                freez(tmp);
+                return 0;
+            }
+
+            size_t len = rc;
+
+            // Copy as much as we can, leaving room for the null terminator
+            // NOTE(review): the cut is made at a byte offset, so it may fall inside a
+            // multi-byte UTF-8 sequence - confirm callers tolerate a partial last character
+            memcpy(dst, tmp, MIN(len, (dst_size - 1)) * sizeof(char));
+
+            // Null-terminate it
+            dst[MIN(len, (dst_size - 1))] = '\0';
+
+            // Free the temporary buffer
+            freez(tmp);
+
+            // Return the number of bytes stored in dst (capped at dst_size)
+            return MIN(len, dst_size);
+        }
+
+        // Any other failure: empty the destination
+        *dst = '\0';
+        return 0;
+    }
+
+    size_t len = rc;
+
+    if(truncated)
+        *truncated = false;
+
+    if (len >= dst_size) {
+        // The conversion filled the whole buffer; the last byte must hold the null terminator
+        if(dst[dst_size - 1] != '\0') {
+            if (truncated)
+                *truncated = true;
+
+            // Truncate it to fit the null terminator
+            dst[dst_size - 1] = '\0';
+        }
+        return dst_size;
+    }
+
+    if (dst[len - 1] != '\0') {
+        // The result is not null-terminated
+        // Append the null terminator (there is room for it, since len < dst_size)
+        dst[len] = '\0';
+        return len + 1;
+    }
+
+    // The result is already null-terminated
+    return len;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// Pick the capacity for a growing text buffer: required_size rounded up to
+// a multiple of 2048, but never less than double the old_size.
+size_t txt_compute_new_size(size_t old_size, size_t required_size) {
+    const size_t chunk = 2048;
+
+    // round up to the next multiple of chunk (exact multiples are kept as they are)
+    size_t rounded = required_size;
+    if(rounded % chunk != 0)
+        rounded += chunk;
+    rounded -= rounded % chunk;
+
+    size_t doubled = old_size * 2;
+    return (rounded < doubled) ? doubled : rounded;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF8
+
+// Release the buffer owned by dst and return it to the "never allocated" state
+// (data == NULL, size == 0, used == 0).
+// size has to be cleared together with data: txt_utf8_resize() returns early
+// whenever the requested size fits in dst->size, so a stale size would leave
+// callers writing through a NULL data pointer.
+void txt_utf8_cleanup(TXT_UTF8 *dst) {
+    freez(dst->data);
+    dst->data = NULL;
+    dst->size = 0;
+    dst->used = 0;
+}
+
+// Make sure dst can hold at least required_size bytes.
+// When the buffer has to grow and keep is true, the existing content is preserved (realloc);
+// otherwise the old buffer is dropped and dst->used is reset to zero.
+void txt_utf8_resize(TXT_UTF8 *dst, size_t required_size, bool keep) {
+    if(dst->size >= required_size)
+        return; // already big enough
+
+    size_t capacity = txt_compute_new_size(dst->size, required_size);
+
+    if(!keep || !dst->data) {
+        // nothing worth preserving, start over with a fresh allocation
+        txt_utf8_cleanup(dst);
+        dst->data = mallocz(capacity);
+        dst->used = 0;
+    }
+    else
+        dst->data = reallocz(dst->data, capacity);
+
+    dst->size = capacity;
+}
+
+// Turn dst into an empty, null terminated string (used counts only the terminator).
+void txt_utf8_empty(TXT_UTF8 *dst) {
+    txt_utf8_resize(dst, 1, false);
+    dst->used = 1;
+    dst->data[dst->used - 1] = '\0';
+}
+
+// Replace the content of dst with the first txt_len bytes of txt, adding a null terminator.
+void txt_utf8_set(TXT_UTF8 *dst, const char *txt, size_t txt_len) {
+    size_t with_null = txt_len + 1;
+
+    txt_utf8_resize(dst, with_null, false);
+    memcpy(dst->data, txt, txt_len);
+
+    dst->used = with_null;
+    dst->data[dst->used - 1] = '\0';
+}
+
+// Append txt_len bytes of txt to the string already in dst, keeping it null terminated.
+void txt_utf8_append(TXT_UTF8 *dst, const char *txt, size_t txt_len) {
+    if(dst->used <= 1) {
+        // nothing but (at most) a terminator in there: appending is the same as setting
+        txt_utf8_set(dst, txt, txt_len);
+        return;
+    }
+
+    // the terminator is already included in dst->used, so used + txt_len is the total needed
+    txt_utf8_resize(dst, dst->used + txt_len, true);
+
+    // overwrite the old terminator with the new text ...
+    char *tail = &dst->data[dst->used - 1];
+    memcpy(tail, txt, txt_len);
+
+    // ... and terminate again at the new end
+    dst->used += txt_len;
+    dst->data[dst->used - 1] = '\0';
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF16
+
+// Release the buffer owned by dst and return it to the "never allocated" state
+// (data == NULL, size == 0, used == 0).
+// Clearing the fields prevents a second cleanup from freeing the same pointer again
+// and stops txt_utf16_resize() from trusting a capacity that no longer exists.
+void txt_utf16_cleanup(TXT_UTF16 *dst) {
+    freez(dst->data);
+    dst->data = NULL;
+    dst->size = 0;
+    dst->used = 0;
+}
+
+// Make sure dst can hold at least required_size wide characters.
+// When the buffer has to grow and keep is true, the existing content is preserved (realloc);
+// otherwise the old buffer is dropped and dst->used is reset to zero.
+void txt_utf16_resize(TXT_UTF16 *dst, size_t required_size, bool keep) {
+    if(dst->size >= required_size)
+        return; // already big enough
+
+    // sizes are counted in wchar_t elements, allocations in bytes
+    size_t capacity = txt_compute_new_size(dst->size, required_size);
+    size_t bytes = capacity * sizeof(wchar_t);
+
+    if(!keep || !dst->data) {
+        // nothing worth preserving, start over with a fresh allocation
+        txt_utf16_cleanup(dst);
+        dst->data = mallocz(bytes);
+        dst->used = 0;
+    }
+    else
+        dst->data = reallocz(dst->data, bytes);
+
+    dst->size = capacity;
+}
+
+// Replace the content of dst with the first txt_len wide characters of txt,
+// adding a null terminator.
+void txt_utf16_set(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) {
+    // the old content is overwritten from the start, so only txt_len + 1
+    // characters are needed and there is nothing to preserve while growing
+    txt_utf16_resize(dst, txt_len + 1, false);
+    memcpy(dst->data, txt, txt_len * sizeof(wchar_t));
+    dst->used = txt_len + 1;
+    dst->data[dst->used - 1] = L'\0';
+}
+
+// Append txt_len wide characters of txt to the string already in dst, keeping it null terminated.
+void txt_utf16_append(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) {
+    if(dst->used <= 1) {
+        // nothing but (at most) a terminator in there: appending is the same as setting
+        txt_utf16_set(dst, txt, txt_len);
+        return;
+    }
+
+    // the terminator is already included in dst->used, so used + txt_len is the total needed
+    txt_utf16_resize(dst, dst->used + txt_len, true);
+
+    // overwrite the old terminator with the new text ...
+    wchar_t *tail = &dst->data[dst->used - 1];
+    memcpy(tail, txt, txt_len * sizeof(wchar_t));
+
+    // ... and terminate again at the new end
+    dst->used += txt_len;
+    dst->data[dst->used - 1] = '\0';
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// Convert the wide-character string src (src_len as accepted by utf16_to_utf8())
+// into dst, growing dst as needed.
+// Returns true on success; on invalid input or conversion failure dst is set
+// to the empty string and false is returned.
+bool wchar_to_txt_utf8(TXT_UTF8 *dst, const wchar_t *src, int src_len) {
+    bool was_truncated = false;
+
+    if(!src || !src_len)
+        goto failed;
+
+    if(!dst->data && !dst->size) {
+        // nothing allocated yet: size the buffer for this conversion
+        size_t initial = utf16_to_utf8(NULL, 0, src, src_len, NULL);
+        if(!initial)
+            goto failed;
+
+        txt_utf8_resize(dst, initial, false);
+    }
+
+    dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, &was_truncated);
+    if(was_truncated) {
+        // the existing buffer was not enough: find the size needed, grow and convert again
+        size_t full = utf16_to_utf8(NULL, 0, src, src_len, NULL);
+        if(!full)
+            goto failed;
+
+        txt_utf8_resize(dst, full, false);
+        dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, NULL);
+    }
+
+    // drop any extra null bytes, so that used counts exactly one terminator
+    while(dst->used >= 2 && dst->data[dst->used - 2] == 0)
+        dst->used--;
+
+    internal_fatal(strlen(dst->data) + 1 != dst->used,
+                   "Wrong UTF8 string length");
+
+    return true;
+
+failed:
+    txt_utf8_empty(dst);
+    return false;
+}
+
+// Convert the content of a TXT_UTF16 into a TXT_UTF8.
+// Both structures must be consistent: either allocated (data and size set) or untouched (both zero).
+// Returns what wchar_to_txt_utf8() returns: false (with utf8 emptied) when there is nothing to convert.
+bool txt_utf16_to_utf8(TXT_UTF8 *utf8, TXT_UTF16 *utf16) {
+    fatal_assert(utf8 && ((utf8->data && utf8->size) || (!utf8->data && !utf8->size)));
+    fatal_assert(utf16 && ((utf16->data && utf16->size) || (!utf16->data && !utf16->size)));
+
+    // pass the number of characters in use, without the null terminator
+    // (the converter appends the terminator to the utf8 result itself).
+    // used == 0 must not become -1: that would ask the converter to scan for a
+    // terminator in a buffer that is not guaranteed to have one.
+    int src_len = (utf16->used > 1) ? (int)(utf16->used - 1) : 0;
+
+    return wchar_to_txt_utf8(utf8, utf16->data, src_len);
+}
+
+// Convert a null terminated UTF-16 string to a newly allocated UTF-8 string.
+// Returns NULL when the conversion is not possible (including a NULL src);
+// otherwise the caller owns the returned buffer, which was allocated with mallocz().
+// dst_len, when given, receives the length of the result without the null terminator (0 on failure).
+char *utf16_to_utf8_strdupz(const wchar_t *src, size_t *dst_len) {
+    char *dst = NULL;
+    size_t written = 0;
+
+    size_t size = utf16_to_utf8(NULL, 0, src, -1, NULL);
+    if (size) {
+        dst = mallocz(size);
+
+        // the conversion itself can still fail and report 0; without this check
+        // the length below would underflow to SIZE_MAX
+        written = utf16_to_utf8(dst, size, src, -1, NULL);
+        if(!written) {
+            freez(dst);
+            dst = NULL;
+        }
+    }
+
+    if(dst_len)
+        *dst_len = written ? written - 1 : 0;
+
+    return dst;
+}
+
+#endif
diff --git a/src/libnetdata/string/utf8.h b/src/libnetdata/string/utf8.h
index 3e6c8c288..f27ba5447 100644
--- a/src/libnetdata/string/utf8.h
+++ b/src/libnetdata/string/utf8.h
@@ -3,7 +3,81 @@
 #ifndef NETDATA_STRING_UTF8_H
 #define NETDATA_STRING_UTF8_H 1
 
-#define IS_UTF8_BYTE(x) ((x) & 0x80)
-#define IS_UTF8_STARTBYTE(x) (IS_UTF8_BYTE(x)&&((x) & 0x40))
+#include "../libnetdata.h"
+
+#define IS_UTF8_BYTE(x) ((uint8_t)(x) & (uint8_t)0x80)
+#define IS_UTF8_STARTBYTE(x) (IS_UTF8_BYTE(x) && ((uint8_t)(x) & (uint8_t)0x40))
+
+#ifndef _countof
+#define _countof(x) (sizeof(x) / sizeof(*(x)))
+#endif
+
+#if defined(OS_WINDOWS)
+
+// return an always null terminated wide string, truncate to given size if destination is not big enough,
+// src_len can be -1 to use all of it.
+// returns zero on errors, > 0 otherwise (including the null, even if src is not null terminated).
+size_t any_to_utf16(uint32_t CodePage, wchar_t *dst, size_t dst_size, const char *src, int src_len, bool *truncated);
+
+// always null terminated, truncated if it does not fit, src_len can be -1 to use all of it.
+// returns zero on errors, > 0 otherwise (including the null, even if src is not null terminated).
+#define utf8_to_utf16(utf16, utf16_count, src, src_len) any_to_utf16(CP_UTF8, utf16, utf16_count, src, src_len, NULL)
+
+// always null terminated, truncated if it does not fit, src_len can be -1 to use all of it.
+// returns zero on errors, > 0 otherwise (including the null, even if src is not null terminated).
+size_t utf16_to_utf8(char *dst, size_t dst_size, const wchar_t *src, int src_len, bool *truncated);
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF8
+
+// Values for the 'src' field of TXT_UTF8.
+// NOTE(review): presumably they identify where the text came from - confirm
+// against the code that assigns them, it is not part of this file.
+typedef enum __attribute__((packed)) {
+    TXT_SOURCE_UNKNOWN = 0,
+    TXT_SOURCE_PROVIDER,
+    TXT_SOURCE_FIELD_CACHE,
+    TXT_SOURCE_EVENT_LOG,
+    TXT_SOURCE_HARDCODED,
+
+    // terminator
+    TXT_SOURCE_MAX,
+} TXT_SOURCE;
+
+// A growable buffer of UTF-8 text, managed by the txt_utf8_*() functions.
+typedef struct {
+    char *data;     // the buffer itself; NULL while nothing is allocated
+    uint32_t size; // the allocated size of data buffer, in bytes
+    uint32_t used;  // the used size of the data buffer, in bytes (including null terminators, if any)
+    TXT_SOURCE src; // one of the TXT_SOURCE values; not touched by the txt_utf8_*() functions in utf8.c
+} TXT_UTF8;
+
+void txt_utf8_append(TXT_UTF8 *dst, const char *txt, size_t txt_len);
+void txt_utf8_set(TXT_UTF8 *dst, const char *txt, size_t txt_len);
+void txt_utf8_empty(TXT_UTF8 *dst);
+void txt_utf8_resize(TXT_UTF8 *dst, size_t required_size, bool keep);
+void txt_utf8_cleanup(TXT_UTF8 *dst);
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF16
+
+// A growable buffer of wide-character text, managed by the txt_utf16_*() functions.
+typedef struct {
+    wchar_t *data; // the buffer itself
+    uint32_t size; // the allocated size of data buffer, in wchar_t elements (not bytes)
+    uint32_t used;  // the used size of the data buffer, in wchar_t elements (including null terminators, if any)
+} TXT_UTF16;
+
+void txt_utf16_cleanup(TXT_UTF16 *dst);
+void txt_utf16_resize(TXT_UTF16 *dst, size_t required_size, bool keep);
+void txt_utf16_set(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len);
+void txt_utf16_append(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len);
+
+// --------------------------------------------------------------------------------------------------------------------
+
+size_t txt_compute_new_size(size_t old_size, size_t required_size);
+
+bool txt_utf16_to_utf8(TXT_UTF8 *utf8, TXT_UTF16 *utf16);
+bool wchar_to_txt_utf8(TXT_UTF8 *dst, const wchar_t *src, int src_len);
+char *utf16_to_utf8_strdupz(const wchar_t *src, size_t *dst_len);
+
+// --------------------------------------------------------------------------------------------------------------------
+
+#endif // OS_WINDOWS
 
 #endif /* NETDATA_STRING_UTF8_H */