diff options
Diffstat (limited to 'src/libnetdata/string')
-rw-r--r-- | src/libnetdata/string/README.md | 9 | ||||
-rw-r--r-- | src/libnetdata/string/string.c | 24 | ||||
-rw-r--r-- | src/libnetdata/string/string.h | 17 | ||||
-rw-r--r-- | src/libnetdata/string/utf8.c | 452 | ||||
-rw-r--r-- | src/libnetdata/string/utf8.h | 78 |
5 files changed, 565 insertions, 15 deletions
diff --git a/src/libnetdata/string/README.md b/src/libnetdata/string/README.md
index 54c905946..c23160233 100644
--- a/src/libnetdata/string/README.md
+++ b/src/libnetdata/string/README.md
@@ -1,12 +1,3 @@
-<!--
-title: "String"
-custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/string/README.md
-sidebar_label: "String"
-learn_status: "Published"
-learn_topic_type: "Tasks"
-learn_rel_path: "Developers/libnetdata"
--->
-
 # STRING
 
 STRING provides a way to allocate and free text strings, while de-duplicating them.
diff --git a/src/libnetdata/string/string.c b/src/libnetdata/string/string.c
index 257a3cc4b..107c7eea5 100644
--- a/src/libnetdata/string/string.c
+++ b/src/libnetdata/string/string.c
@@ -347,16 +347,36 @@ void string_freez(STRING *string) {
     string_stats_atomic_increment(partition, releases);
 }
 
-inline size_t string_strlen(STRING *string) {
+inline size_t string_strlen(const STRING *string) {
     if(unlikely(!string)) return 0;
     return string->length - 1;
 }
 
-inline const char *string2str(STRING *string) {
+inline const char *string2str(const STRING *string) {
     if(unlikely(!string)) return "";
     return string->str;
 }
 
+// true when 'whole' ends with 'end'; the same pointer (including two NULLs) matches, a single NULL does not
+bool string_ends_with_string(const STRING *whole, const STRING *end) {
+    if(whole == end) return true;
+    if(!whole || !end) return false;
+    if(end->length > whole->length) return false;
+    if(end->length == whole->length) return strcmp(string2str(whole), string2str(end)) == 0;
+    const char *we = string2str(whole);
+    we = &we[string_strlen(whole) - string_strlen(end)];
+    return strncmp(we, end->str, string_strlen(end)) == 0;
+}
+
+// true when 'whole' begins with 'end' (here 'end' is the prefix being looked for); NULL handling as above
+bool string_starts_with_string(const STRING *whole, const STRING *end) {
+    if(whole == end) return true;
+    if(!whole || !end) return false;
+    if(end->length > whole->length) return false;
+    if(end->length == whole->length) return strcmp(string2str(whole), string2str(end)) == 0;
+    return strncmp(string2str(whole), string2str(end), string_strlen(end)) == 0;
+}
+
 STRING *string_2way_merge(STRING *a, STRING *b) {
     static STRING *X = NULL;
 
diff --git a/src/libnetdata/string/string.h b/src/libnetdata/string/string.h
index c44696be2..e86ac6fb5 100644
--- a/src/libnetdata/string/string.h
+++ b/src/libnetdata/string/string.h
@@ -14,8 +14,10 @@ STRING *string_strndupz(const char *str, size_t len);
 STRING *string_dup(STRING *string);
 void string_freez(STRING *string);
 
-size_t string_strlen(STRING *string);
-const char *string2str(STRING *string) NEVERNULL;
+size_t string_strlen(const STRING *string);
+const char *string2str(const STRING *string) NEVERNULL;
+bool string_ends_with_string(const STRING *whole, const STRING *end);
+bool string_starts_with_string(const STRING *whole, const STRING *end);
 
 // keep common prefix/suffix and replace everything else with [x]
 STRING *string_2way_merge(STRING *a, STRING *b);
@@ -30,10 +32,21 @@ static inline int string_strcmp(STRING *string, const char *s) {
     return strcmp(string2str(string), s);
 }
 
+static inline int string_strncmp(const STRING *string, const char *s, size_t n) {
+    return strncmp(string2str(string), s, n);
+}
+
 void string_statistics(size_t *inserts, size_t *deletes, size_t *searches, size_t *entries, size_t *references, size_t *memory, size_t *duplications, size_t *releases);
 
 int string_unittest(size_t entries);
 
 void string_init(void);
 
+static inline void cleanup_string_pp(STRING **stringpp) {
+    if(stringpp)
+        string_freez(*stringpp);
+}
+
+#define CLEAN_STRING _cleanup_(cleanup_string_pp) STRING
+
 #endif
diff --git a/src/libnetdata/string/utf8.c b/src/libnetdata/string/utf8.c
new file mode 100644
index 000000000..0b4f138a6
--- /dev/null
+++ b/src/libnetdata/string/utf8.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "../libnetdata.h"
+
+#if defined(OS_WINDOWS)
+/*
+ * Convert any CodePage to UTF16
+ * Goals:
+ *   1. Destination is always NULL terminated
+ *   2. If the destination buffer is not enough, return as much as possible data (truncate)
+ *   3. Always return the number of wide characters written, including the null terminator
+ */
+
+size_t any_to_utf16(uint32_t CodePage, wchar_t *dst, size_t dst_size, const char *src, int src_len, bool *truncated) {
+    if(!src || src_len == 0) {
+        // invalid input
+        if(truncated)
+            *truncated = true;
+
+        if(dst && dst_size)
+            *dst = L'\0';
+        return 0;
+    }
+
+    if(!dst || !dst_size) {
+        // the caller wants to know the buffer to allocate for the conversion
+
+        if(truncated)
+            *truncated = true;
+
+        int required = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0);
+        if(required <= 0) return 0; // error in the conversion
+
+        // Add 1 for null terminator only if src_len is not -1
+        // so that the caller can call us again to get the entire string (not truncated)
+        return (size_t)required + ((src_len != -1) ? 1 : 0);
+    }
+
+    // do the conversion directly to the destination buffer
+    int rc = MultiByteToWideChar(CodePage, 0, src, src_len, dst, (int)dst_size);
+    if(rc <= 0) {
+        if(truncated)
+            *truncated = true;
+
+        // conversion failed, let's see why...
+        DWORD status = GetLastError();
+        if(status == ERROR_INSUFFICIENT_BUFFER) {
+            // it cannot fit entirely, let's allocate a new buffer to convert it
+            // and then truncate it to the destination buffer
+
+            // clear errno and LastError to clear the error of the
+            // MultiByteToWideChar() that failed
+            errno_clear();
+
+            // get the required size
+            int required_size = MultiByteToWideChar(CodePage, 0, src, src_len, NULL, 0);
+            if(required_size <= 0) {
+                // the sizing call failed too, so there is nothing to truncate
+                *dst = L'\0';
+                return 0;
+            }
+
+            // mallocz() never fails (exits the program on NULL)
+            wchar_t *tmp = mallocz(required_size * sizeof(wchar_t));
+
+            // convert it, now it should fit
+            rc = MultiByteToWideChar(CodePage, 0, src, src_len, tmp, required_size);
+            if (rc <= 0) {
+                // it failed!
+                *dst = L'\0';
+                freez(tmp);
+                return 0;
+            }
+
+            size_t len = rc;
+
+            // copy as much as we can
+            memcpy(dst, tmp, MIN(len, (dst_size - 1)) * sizeof(wchar_t));
+
+            // null terminate it
+            dst[MIN(len, (dst_size - 1))] = L'\0';
+
+            // free the temporary buffer
+            freez(tmp);
+
+            // return the number of wide characters written, including the null terminator
+            return MIN(len, dst_size);
+        }
+
+        // empty the destination
+        *dst = L'\0';
+        return 0;
+    }
+
+    size_t len = rc;
+
+    if(truncated)
+        *truncated = false;
+
+    if(len >= dst_size) {
+        if(dst[dst_size - 1] != L'\0') {
+            if (truncated)
+                *truncated = true;
+
+            // Truncate it to fit the null terminator
+            dst[dst_size - 1] = L'\0';
+        }
+        return dst_size;
+    }
+
+    if(dst[len - 1] != L'\0') {
+        // the result is not null terminated
+        // append the null
+        dst[len] = L'\0';
+        return len + 1;
+    }
+
+    // the result is already null terminated
+    return len;
+}
+
+/*
+ * Convert UTF16 (wide-character string) to UTF8
+ * Goals:
+ *   1. Destination is always NULL terminated
+ *   2. If the destination buffer is not enough, return as much as possible data (truncate)
+ *   3. Always return the number of bytes written, including the null terminator
+ */
+
+size_t utf16_to_utf8(char *dst, size_t dst_size, const wchar_t *src, int src_len, bool *truncated) {
+    if (!src || src_len == 0) {
+        // invalid input
+        if(truncated)
+            *truncated = true;
+
+        if(dst && dst_size)
+            *dst = '\0';
+
+        return 0;
+    }
+
+    if (!dst || dst_size == 0) {
+        // The caller wants to know the buffer size required for the conversion
+
+        if(truncated)
+            *truncated = true;
+
+        int required = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+        if (required <= 0) return 0; // error in the conversion
+
+        // Add 1 for null terminator only if src_len is not -1
+        return (size_t)required + ((src_len != -1) ? 1 : 0);
+    }
+
+    // Perform the conversion directly into the destination buffer
+    int rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, dst, (int)dst_size, NULL, NULL);
+    if (rc <= 0) {
+        if(truncated)
+            *truncated = true;
+
+        // Conversion failed, let's see why...
+        DWORD status = GetLastError();
+        if (status == ERROR_INSUFFICIENT_BUFFER) {
+            // It cannot fit entirely, let's allocate a new buffer to convert it
+            // and then truncate it to the destination buffer
+
+            // Clear errno and LastError to clear the error of the
+            // WideCharToMultiByte() that failed
+            errno_clear();
+
+            // Get the required size
+            int required_size = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+            if (required_size <= 0) {
+                // The sizing call failed too, so there is nothing to truncate
+                *dst = '\0';
+                return 0;
+            }
+
+            // mallocz() never fails (exits the program on NULL)
+            char *tmp = mallocz(required_size * sizeof(char));
+
+            // Convert it, now it should fit
+            rc = WideCharToMultiByte(CP_UTF8, 0, src, src_len, tmp, required_size, NULL, NULL);
+            if (rc <= 0) {
+                // Conversion failed
+                *dst = '\0';
+                freez(tmp);
+                return 0;
+            }
+
+            size_t len = rc;
+
+            // Copy as much as we can
+            memcpy(dst, tmp, MIN(len, (dst_size - 1)) * sizeof(char));
+
+            // Null-terminate it
+            dst[MIN(len, (dst_size - 1))] = '\0';
+
+            // Free the temporary buffer
+            freez(tmp);
+
+            // Return the actual bytes written
+            return MIN(len, dst_size);
+        }
+
+        // Empty the destination
+        *dst = '\0';
+        return 0;
+    }
+
+    size_t len = rc;
+
+    if(truncated)
+        *truncated = false;
+
+    if (len >= dst_size) {
+        if(dst[dst_size - 1] != '\0') {
+            if (truncated)
+                *truncated = true;
+
+            // Truncate it to fit the null terminator
+            dst[dst_size - 1] = '\0';
+        }
+        return dst_size;
+    }
+
+    if (dst[len - 1] != '\0') {
+        // The result is not null-terminated
+        // Append the null terminator
+        dst[len] = '\0';
+        return len + 1;
+    }
+
+    // The result is already null-terminated
+    return len;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// round required_size up to a multiple of 2048, but never return less than twice the old size
+size_t txt_compute_new_size(size_t old_size, size_t required_size) {
+    size_t size = (required_size % 2048 == 0) ? required_size : required_size + 2048;
+    size = (size / 2048) * 2048;
+
+    if(size < old_size * 2)
+        size = old_size * 2;
+
+    return size;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF8
+
+void txt_utf8_cleanup(TXT_UTF8 *dst) {
+    freez(dst->data);
+    dst->data = NULL;
+    dst->used = 0;
+    // a NULL buffer must report zero capacity, otherwise txt_utf8_resize() may skip the allocation
+    dst->size = 0;
+}
+
+// grow the buffer to hold at least required_size bytes; when keep is false the old contents are discarded
+void txt_utf8_resize(TXT_UTF8 *dst, size_t required_size, bool keep) {
+    if(required_size <= dst->size)
+        return;
+
+    size_t new_size = txt_compute_new_size(dst->size, required_size);
+
+    if(keep && dst->data)
+        dst->data = reallocz(dst->data, new_size);
+    else {
+        txt_utf8_cleanup(dst);
+        dst->data = mallocz(new_size);
+        dst->used = 0;
+    }
+
+    dst->size = new_size;
+}
+
+// make dst an empty, null terminated string
+void txt_utf8_empty(TXT_UTF8 *dst) {
+    txt_utf8_resize(dst, 1, false);
+    dst->data[0] = '\0';
+    dst->used = 1;
+}
+
+// replace the contents of dst with txt_len bytes of txt, null terminated
+void txt_utf8_set(TXT_UTF8 *dst, const char *txt, size_t txt_len) {
+    txt_utf8_resize(dst, txt_len + 1, false);
+    memcpy(dst->data, txt, txt_len);
+    dst->used = txt_len + 1;
+    dst->data[dst->used - 1] = '\0';
+}
+
+// append txt_len bytes of txt to dst, keeping it null terminated
+void txt_utf8_append(TXT_UTF8 *dst, const char *txt, size_t txt_len) {
+    if(dst->used <= 1) {
+        // the destination is empty
+        txt_utf8_set(dst, txt, txt_len);
+    }
+    else {
+        // there is something already in the buffer
+        txt_utf8_resize(dst, dst->used + txt_len, true);
+        memcpy(&dst->data[dst->used - 1], txt, txt_len);
+        dst->used += txt_len; // the null was already counted
+        dst->data[dst->used - 1] = '\0';
+    }
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF16
+
+void txt_utf16_cleanup(TXT_UTF16 *dst) {
+    freez(dst->data);
+    // reset everything, so that the structure can be reused or cleaned up again
+    dst->data = NULL;
+    dst->used = 0;
+    dst->size = 0;
+}
+
+// grow the buffer to hold at least required_size wide characters; when keep is false the old contents are discarded
+void txt_utf16_resize(TXT_UTF16 *dst, size_t required_size, bool keep) {
+    if(required_size <= dst->size)
+        return;
+
+    size_t new_size = txt_compute_new_size(dst->size, required_size);
+
+    if (keep && dst->data) {
+        dst->data = reallocz(dst->data, new_size * sizeof(wchar_t));
+    } else {
+        txt_utf16_cleanup(dst);
+        dst->data = mallocz(new_size * sizeof(wchar_t));
+        dst->used = 0;
+    }
+
+    dst->size = new_size;
+}
+
+// replace the contents of dst with txt_len wide characters of txt, null terminated
+void txt_utf16_set(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) {
+    txt_utf16_resize(dst, dst->used + txt_len + 1, true);
+    memcpy(dst->data, txt, txt_len * sizeof(wchar_t));
+    dst->used = txt_len + 1;
+    dst->data[dst->used - 1] = '\0';
+}
+
+// append txt_len wide characters of txt to dst, keeping it null terminated
+void txt_utf16_append(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len) {
+    if(dst->used <= 1) {
+        // the destination is empty
+        txt_utf16_set(dst, txt, txt_len);
+    }
+    else {
+        // there is something already in the buffer
+        txt_utf16_resize(dst, dst->used + txt_len, true);
+        memcpy(&dst->data[dst->used - 1], txt, txt_len * sizeof(wchar_t));
+        dst->used += txt_len; // the null was already counted
+        dst->data[dst->used - 1] = '\0';
+    }
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// convert src_len wide characters of src into dst, growing dst as needed; returns false when dst was left empty
+bool wchar_to_txt_utf8(TXT_UTF8 *dst, const wchar_t *src, int src_len) {
+    if(!src || !src_len) {
+        txt_utf8_empty(dst);
+        return false;
+    }
+
+    if(!dst->data && !dst->size) {
+        size_t size = utf16_to_utf8(NULL, 0, src, src_len, NULL);
+        if(!size) {
+            txt_utf8_empty(dst);
+            return false;
+        }
+
+        // allocate at least what the sizing call reported, so that the conversion below should fit
+        txt_utf8_resize(dst, size, false);
+    }
+
+    bool truncated = false;
+    dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, &truncated);
+    if(truncated) {
+        // we need to resize
+        size_t needed = utf16_to_utf8(NULL, 0, src, src_len, NULL); // find the size needed
+        if(!needed) {
+            txt_utf8_empty(dst);
+            return false;
+        }
+
+        txt_utf8_resize(dst, needed, false);
+        dst->used = utf16_to_utf8(dst->data, dst->size, src, src_len, NULL);
+        if(!dst->used) {
+            // the conversion failed even with the bigger buffer
+            txt_utf8_empty(dst);
+            return false;
+        }
+    }
+
+    // Make sure it is not zero padded at the end
+    while(dst->used >= 2 && dst->data[dst->used - 2] == 0)
+        dst->used--;
+
+    internal_fatal(strlen(dst->data) + 1 != dst->used,
+                   "Wrong UTF8 string length");
+
+    return true;
+}
+
+// convert the contents of utf16 into utf8; returns false when utf8 was left empty
+bool txt_utf16_to_utf8(TXT_UTF8 *utf8, TXT_UTF16 *utf16) {
+    fatal_assert(utf8 && ((utf8->data && utf8->size) || (!utf8->data && !utf8->size)));
+    fatal_assert(utf16 && ((utf16->data && utf16->size) || (!utf16->data && !utf16->size)));
+
+    if(utf16->used <= 1) {
+        // nothing but (at most) a null terminator; without this check, used == 0 would become
+        // src_len == -1, asking the converter to scan for a terminator that may not be there
+        txt_utf8_empty(utf8);
+        return false;
+    }
+
+    // pass the text without its null terminator;
+    // utf16_to_utf8() appends one when the result is not null terminated.
+    return wchar_to_txt_utf8(utf8, utf16->data, (int)utf16->used - 1);
+}
+
+// convert a null terminated wide string to a newly allocated UTF8 string; returns NULL (and length 0) on failure
+char *utf16_to_utf8_strdupz(const wchar_t *src, size_t *dst_len) {
+    size_t size = utf16_to_utf8(NULL, 0, src, -1, NULL);
+    if (size) {
+        char *dst = mallocz(size);
+
+        size = utf16_to_utf8(dst, size, src, -1, NULL);
+        if(size) {
+            if(dst_len)
+                *dst_len = size - 1;
+
+            return dst;
+        }
+
+        // the conversion failed; release the buffer instead of reporting a wrapped-around length
+        freez(dst);
+    }
+
+    if(dst_len)
+        *dst_len = 0;
+
+    return NULL;
+}
+
+#endif
diff --git a/src/libnetdata/string/utf8.h b/src/libnetdata/string/utf8.h
index 3e6c8c288..f27ba5447 100644
--- a/src/libnetdata/string/utf8.h
+++ b/src/libnetdata/string/utf8.h
@@ -3,7 +3,81 @@
 #ifndef NETDATA_STRING_UTF8_H
 #define NETDATA_STRING_UTF8_H 1
 
-#define IS_UTF8_BYTE(x) ((x) & 0x80)
-#define IS_UTF8_STARTBYTE(x) (IS_UTF8_BYTE(x)&&((x) & 0x40))
+#include "../libnetdata.h"
+
+#define IS_UTF8_BYTE(x) ((uint8_t)(x) & (uint8_t)0x80)
+#define IS_UTF8_STARTBYTE(x) (IS_UTF8_BYTE(x) && ((uint8_t)(x) & (uint8_t)0x40))
+
+#ifndef _countof
+#define _countof(x) (sizeof(x) / sizeof(*(x)))
+#endif
+
+#if defined(OS_WINDOWS)
+
+// return an always null terminated wide string, truncate to given size if destination is not big enough,
+// src_len can be -1 to use all of it.
+// returns zero on errors, > 0 otherwise (including the null, even if src is not null terminated).
+size_t any_to_utf16(uint32_t CodePage, wchar_t *dst, size_t dst_size, const char *src, int src_len, bool *truncated);
+
+// always null terminated, truncated if it does not fit, src_len can be -1 to use all of it.
+// returns zero on errors, > 0 otherwise (including the null, even if src is not null terminated).
+#define utf8_to_utf16(utf16, utf16_count, src, src_len) any_to_utf16(CP_UTF8, utf16, utf16_count, src, src_len, NULL)
+
+// always null terminated, truncated if it does not fit, src_len can be -1 to use all of it.
+// returns zero on errors, > 0 otherwise (including the null, even if src is not null terminated).
+size_t utf16_to_utf8(char *dst, size_t dst_size, const wchar_t *src, int src_len, bool *truncated);
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF8
+
+typedef enum __attribute__((packed)) {
+    TXT_SOURCE_UNKNOWN = 0,
+    TXT_SOURCE_PROVIDER,
+    TXT_SOURCE_FIELD_CACHE,
+    TXT_SOURCE_EVENT_LOG,
+    TXT_SOURCE_HARDCODED,
+
+    // terminator
+    TXT_SOURCE_MAX,
+} TXT_SOURCE;
+
+typedef struct {
+    char *data;
+    uint32_t size; // the allocated size of data buffer
+    uint32_t used; // the used size of the data buffer (including null terminators, if any)
+    TXT_SOURCE src;
+} TXT_UTF8;
+
+void txt_utf8_append(TXT_UTF8 *dst, const char *txt, size_t txt_len);
+void txt_utf8_set(TXT_UTF8 *dst, const char *txt, size_t txt_len);
+void txt_utf8_empty(TXT_UTF8 *dst);
+void txt_utf8_resize(TXT_UTF8 *dst, size_t required_size, bool keep);
+void txt_utf8_cleanup(TXT_UTF8 *dst);
+
+// --------------------------------------------------------------------------------------------------------------------
+// TXT_UTF16
+
+typedef struct {
+    wchar_t *data;
+    uint32_t size; // the allocated size of data buffer
+    uint32_t used; // the used size of the data buffer (including null terminators, if any)
+} TXT_UTF16;
+
+void txt_utf16_cleanup(TXT_UTF16 *dst);
+void txt_utf16_resize(TXT_UTF16 *dst, size_t required_size, bool keep);
+void txt_utf16_set(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len);
+void txt_utf16_append(TXT_UTF16 *dst, const wchar_t *txt, size_t txt_len);
+
+// --------------------------------------------------------------------------------------------------------------------
+
+size_t txt_compute_new_size(size_t old_size, size_t required_size);
+
+bool txt_utf16_to_utf8(TXT_UTF8 *utf8, TXT_UTF16 *utf16);
+bool wchar_to_txt_utf8(TXT_UTF8 *dst, const wchar_t *src, int src_len);
+char *utf16_to_utf8_strdupz(const wchar_t *src, size_t *dst_len);
+
+// --------------------------------------------------------------------------------------------------------------------
+
+#endif // OS_WINDOWS
 
 #endif /* NETDATA_STRING_UTF8_H */