1 files changed, 392 insertions, 0 deletions
diff --git a/wsutil/str_util.h b/wsutil/str_util.h
new file mode 100644
index 0000000..7f1362f
--- /dev/null
+++ b/wsutil/str_util.h
@@ -0,0 +1,392 @@
+/** @file
+ * String utility definitions
+ *
+ * Wireshark - Network traffic analyzer
+ * By Gerald Combs <gerald@wireshark.org>
+ * Copyright 1998 Gerald Combs
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef __STR_UTIL_H__
+#define __STR_UTIL_H__
+
+#include <wireshark.h>
+#include <wsutil/wmem/wmem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+WS_DLL_PUBLIC
+char *
+wmem_strconcat(wmem_allocator_t *allocator, const char *first, ...)
+G_GNUC_MALLOC G_GNUC_NULL_TERMINATED;
+
+WS_DLL_PUBLIC
+char *
+wmem_strjoin(wmem_allocator_t *allocator,
+             const char *separator, const char *first, ...)
+G_GNUC_MALLOC G_GNUC_NULL_TERMINATED;
+
+/**
+ * As g_strjoinv, with the returned string wmem allocated.
+ * Joins a number of strings together to form one long string,
+ * with the optional separator inserted between each of them.
+ *
+ * @param allocator  The wmem scope to use to allocate the returned string
+ * @param separator A string to insert between each of the strings, or NULL.
+ * @param str_array A NULL-terminated array of strings to join
+ *
+ * @note If str_array has no items, the return value is an empty string.
+ * str_array should not be NULL (NULL is returned with an warning.)
+ * NULL as a separator is equivalent to the empty string.
+ */
+WS_DLL_PUBLIC
+char *
+wmem_strjoinv(wmem_allocator_t *allocator,
+              const char *separator, char **str_array)
+G_GNUC_MALLOC;
+
+/**
+ * Splits a string into a maximum of max_tokens pieces, using the given
+ * delimiter. If max_tokens is reached, the remainder of string is appended
+ * to the last token. Successive tokens are not folded and will instead result
+ * in an empty string as element.
+ *
+ * If src or delimiter are NULL, or if delimiter is empty, this will return
+ * NULL.
+ *
+ * Do not use with a NULL allocator, use g_strsplit instead.
+ */
+WS_DLL_PUBLIC
+char **
+wmem_strsplit(wmem_allocator_t *allocator, const char *src,
+        const char *delimiter, int max_tokens);
+
+/**
+ * wmem_ascii_strdown:
+ * Based on g_ascii_strdown
+ * @param allocator  An enumeration of the different types of available allocators.
+ * @param str a string.
+ * @param len length of str in bytes, or -1 if str is nul-terminated.
+ *
+ * Converts all upper case ASCII letters to lower case ASCII letters.
+ *
+ * Return value: a newly-allocated string, with all the upper case
+ *               characters in str converted to lower case, with
+ *               semantics that exactly match g_ascii_tolower(). (Note
+ *               that this is unlike the old g_strdown(), which modified
+ *               the string in place.)
+ **/
+WS_DLL_PUBLIC
+char*
+wmem_ascii_strdown(wmem_allocator_t *allocator, const char *str, ssize_t len);
+
+/** Convert all upper-case ASCII letters to their ASCII lower-case
+ *  equivalents, in place, with a simple non-locale-dependent
+ *  ASCII mapping (A-Z -> a-z).
+ *  All other characters are left unchanged, as the mapping to
+ *  lower case may be locale-dependent.
+ *
+ *  The string is assumed to be in a character encoding, such as
+ *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
+ *  bytes in the range 0x00 through 0x7F are ASCII characters and
+ *  non-ASCII characters are constructed from one or more bytes in
+ *  the range 0x80 through 0xFF.
+ *
+ * @param str The string to be lower-cased.
+ * @return    ptr to the string
+ */
+WS_DLL_PUBLIC
+char *ascii_strdown_inplace(char *str);
+
+/** Convert all lower-case ASCII letters to their ASCII upper-case
+ *  equivalents, in place, with a simple non-locale-dependent
+ *  ASCII mapping (a-z -> A-Z).
+ *  All other characters are left unchanged, as the mapping to
+ *  lower case may be locale-dependent.
+ *
+ *  The string is assumed to be in a character encoding, such as
+ *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
+ *  bytes in the range 0x00 through 0x7F are ASCII characters and
+ *  non-ASCII characters are constructed from one or more bytes in
+ *  the range 0x80 through 0xFF.
+ *
+ * @param str The string to be upper-cased.
+ * @return    ptr to the string
+ */
+WS_DLL_PUBLIC
+char *ascii_strup_inplace(char *str);
+
+/** Check if an entire string consists of printable characters
+ *
+ * @param str    The string to be checked
+ * @return       true if the entire string is printable, otherwise false
+ */
+WS_DLL_PUBLIC
+bool isprint_string(const char *str);
+
+/** Given a not-necessarily-null-terminated string, expected to be in
+ *  UTF-8 but possibly containing invalid sequences (as it may have come
+ *  from packet data), and the length of the string, deterimine if the
+ *  string is valid UTF-8 consisting entirely of printable characters.
+ *
+ *  This means that it:
+ *
+ *   does not contain an illegal UTF-8 sequence (including overlong encodings,
+ *   the sequences reserved for UTF-16 surrogate halves, and the values for
+ *   code points above U+10FFFF that are no longer in Unicode)
+ *
+ *   does not contain a non-printable Unicode character such as control
+ *   characters (including internal NULL bytes)
+ *
+ *   does not end in a partial sequence that could begin a valid character;
+ *
+ *   does not start with a partial sequence that could end a valid character;
+ *
+ * and thus guarantees that the result of format_text() would be the same as
+ * that of wmem_strndup() with the same parameters.
+ *
+ * @param str    The string to be checked
+ * @param length The number of bytes to validate
+ * @return       true if the entire string is valid and printable UTF-8,
+ *               otherwise false
+ */
+WS_DLL_PUBLIC
+bool isprint_utf8_string(const char *str, const unsigned length);
+
+/** Check if an entire string consists of digits
+ *
+ * @param str    The string to be checked
+ * @return       true if the entire string is digits, otherwise false
+ */
+WS_DLL_PUBLIC
+bool isdigit_string(const unsigned char *str);
+
+/** Finds the first occurrence of string 'needle' in string 'haystack'.
+ *  The matching is done ignoring the case of ASCII characters in a
+ *  non-locale-dependent way.
+ *
+ *  The string is assumed to be in a character encoding, such as
+ *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
+ *  bytes in the range 0x00 through 0x7F are ASCII characters and
+ *  non-ASCII characters are constructed from one or more bytes in
+ *  the range 0x80 through 0xFF.
+ *
+ * @param haystack The string possibly containing the substring
+ * @param needle The substring to be searched
+ * @return A pointer into 'haystack' where 'needle' is first found.
+ *   Otherwise it returns NULL.
+ */
+WS_DLL_PUBLIC
+const char *ws_ascii_strcasestr(const char *haystack, const char *needle);
+
+WS_DLL_PUBLIC
+char *ws_escape_string(wmem_allocator_t *alloc, const char *string, bool add_quotes);
+
+WS_DLL_PUBLIC
+char *ws_escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, bool add_quotes);
+
+/* Replace null bytes with "\0". */
+WS_DLL_PUBLIC
+char *ws_escape_null(wmem_allocator_t *alloc, const char *string, size_t len, bool add_quotes);
+
+WS_DLL_PUBLIC
+int ws_xton(char ch);
+
+typedef enum {
+    FORMAT_SIZE_UNIT_NONE,          /**< No unit will be appended. You must supply your own. */
+    FORMAT_SIZE_UNIT_BYTES,         /**< "bytes" for un-prefixed sizes, "B" otherwise. */
+    FORMAT_SIZE_UNIT_BITS,          /**< "bits" for un-prefixed sizes, "b" otherwise. */
+    FORMAT_SIZE_UNIT_BITS_S,        /**< "bits/s" for un-prefixed sizes, "bps" otherwise. */
+    FORMAT_SIZE_UNIT_BYTES_S,       /**< "bytes/s" for un-prefixed sizes, "Bps" otherwise. */
+    FORMAT_SIZE_UNIT_PACKETS,       /**< "packets" */
+    FORMAT_SIZE_UNIT_PACKETS_S,     /**< "packets/s" */
+} format_size_units_e;
+
+#define FORMAT_SIZE_PREFIX_SI   (1 << 0)    /**< SI (power of 1000) prefixes will be used. */
+#define FORMAT_SIZE_PREFIX_IEC  (1 << 1)    /**< IEC (power of 1024) prefixes will be used. */
+
+/** Given a size, return its value in a human-readable format
+ *
+ * Prefixes up to "T/Ti" (tera, tebi) are currently supported.
+ *
+ * @param size The size value
+ * @param flags Flags to control the output (unit of measurement,
+ * SI vs IEC, etc). Unit and prefix flags may be ORed together.
+ * @return A newly-allocated string representing the value.
+ */
+WS_DLL_PUBLIC
+char *format_size_wmem(wmem_allocator_t *allocator, int64_t size,
+                        format_size_units_e unit, uint16_t flags);
+
+#define format_size(size, unit, flags) \
+    format_size_wmem(NULL, size, unit, flags)
+
+WS_DLL_PUBLIC
+char printable_char_or_period(char c);
+
+WS_DLL_PUBLIC WS_RETNONNULL
+const char *ws_strerrorname_r(int errnum, char *buf, size_t buf_size);
+
+WS_DLL_PUBLIC
+char *ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len);
+
+/** Given a wmem scope, a not-necessarily-null-terminated string,
+ *  expected to be in UTF-8 but possibly containing invalid sequences
+ *  (as it may have come from packet data), and the length of the string,
+ *  generate a valid UTF-8 string from it, allocated in the specified
+ *  wmem scope, that:
+ *
+ *   shows printable Unicode characters as themselves;
+ *
+ *   shows non-printable ASCII characters as C-style escapes (octal
+ *   if not one of the standard ones such as LF -> '\n');
+ *
+ *   shows non-printable Unicode-but-not-ASCII characters as
+ *   their universal character names;
+ *
+ *   Replaces illegal UTF-8 sequences with U+FFFD (replacement character) ;
+ *
+ *  and return a pointer to it.
+ *
+ * @param allocator The wmem scope
+ * @param string A pointer to the input string
+ * @param len The length of the input string
+ * @return A pointer to the formatted string
+ *
+ * @see tvb_format_text()
+ */
+WS_DLL_PUBLIC
+char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);
+
+/** Same as format_text() but accepts a nul-terminated string.
+ *
+ * @param allocator The wmem scope
+ * @param string A pointer to the input string
+ * @return A pointer to the formatted string
+ *
+ * @see tvb_format_text()
+ */
+WS_DLL_PUBLIC
+char *format_text_string(wmem_allocator_t* allocator, const char *string);
+
+/**
+ * Same as format_text() but replaces any whitespace characters
+ * (space, tab, carriage return, new line, vertical tab, or formfeed)
+ * with a space.
+ *
+ * @param allocator The wmem scope
+ * @param line A pointer to the input string
+ * @param len The length of the input string
+ * @return A pointer to the formatted string
+ *
+ */
+WS_DLL_PUBLIC
+char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);
+
+/**
+ * Given a string, generate a string from it that shows non-printable
+ * characters as the chr parameter passed, except a whitespace character
+ * (space, tab, carriage return, new line, vertical tab, or formfeed)
+ * which will be replaced by a space, and return a pointer to it.
+ *
+ * This does *not* treat the input string as UTF-8.
+ *
+ * This is useful for displaying binary data that frequently but not always
+ * contains text; otherwise the number of C escape codes makes it unreadable.
+ *
+ * @param allocator The wmem scope
+ * @param string A pointer to the input string
+ * @param len The length of the input string
+ * @param chr The character to use to replace non-printable characters
+ * @return A pointer to the formatted string
+ *
+ */
+WS_DLL_PUBLIC
+char *format_text_chr(wmem_allocator_t *allocator,
+                        const char *string, size_t len, char chr);
+
+/** Given a wmem scope and an 8-bit character
+ *  generate a valid UTF-8 string from it, allocated in the specified
+ *  wmem scope, that:
+ *
+ *   shows printable Unicode characters as themselves;
+ *
+ *   shows non-printable ASCII characters as C-style escapes (hex
+ *   if not one of the standard ones such as LF -> '\n');
+ *
+ *  and return a pointer to it.
+ *
+ * @param allocator The wmem scope
+ * @param c A character to format
+ * @return A pointer to the formatted string
+ */
+WS_DLL_PUBLIC
+char *format_char(wmem_allocator_t *allocator, char c);
+
+/**
+ * Truncate a UTF-8 string in place so that it is no larger than len bytes,
+ * ensuring that the string is null terminated and ends with a complete
+ * character instead of a partial sequence (e.g., possibly truncating up
+ * to 3 additional bytes if the terminal character is 4 bytes long).
+ *
+ * The buffer holding the string must be large enough (at least len + 1
+ * including the null terminator), and the first len bytes of the buffer
+ * must be a valid UTF-8 string, except for possibly ending in a partial
+ * sequence or not being null terminated. This is a convenience function
+ * that for speed does not check either of those conditions.
+ *
+ * A common use case is when a valid UTF-8 string has been copied into a
+ * buffer of length len+1 via snprintf, strlcpy, or strlcat and truncated,
+ * to ensure that the final UTF-8 character is not a partial sequence.
+ *
+ * @param string A pointer to the input string
+ * @param len The maximum length to truncate to
+ * @return    ptr to the string
+ */
+WS_DLL_PUBLIC
+char* ws_utf8_truncate(char *string, size_t len);
+
+WS_DLL_PUBLIC
+void EBCDIC_to_ASCII(uint8_t *buf, unsigned bytes);
+
+WS_DLL_PUBLIC
+uint8_t EBCDIC_to_ASCII1(uint8_t c);
+
+/* Types of character encodings */
+typedef enum {
+    HEXDUMP_ENC_ASCII     = 0, /* ASCII */
+    HEXDUMP_ENC_EBCDIC    = 1  /* EBCDIC */
+} hex_dump_enc;
+
+/*
+ * Hexdump options for ASCII:
+ */
+
+#define HEXDUMP_ASCII_MASK            (0x0003U)
+#define HEXDUMP_ASCII_OPTION(option)  ((option) & HEXDUMP_ASCII_MASK)
+
+#define HEXDUMP_ASCII_INCLUDE         (0x0000U) /* include ASCII section no delimiters (legacy tshark behavior) */
+#define HEXDUMP_ASCII_DELIMIT         (0x0001U) /* include ASCII section with delimiters, useful for reliable detection of last hexdata */
+#define HEXDUMP_ASCII_EXCLUDE         (0x0002U) /* exclude ASCII section from hexdump reports, if we really don't want or need it */
+
+WS_DLL_PUBLIC
+bool hex_dump_buffer(bool (*print_line)(void *, const char *), void *fp,
+                                    const unsigned char *cp, unsigned length,
+                                    hex_dump_enc encoding,
+                                    unsigned ascii_option);
+
+/* To pass one of two strings, singular or plural */
+#define plurality(d,s,p) ((d) == 1 ? (s) : (p))
+
+#define true_or_false(val) ((val) ? "TRUE" : "FALSE")
+
+#define string_or_null(val) ((val) ? (val) : "[NULL]")
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __STR_UTIL_H__ */