summaryrefslogtreecommitdiffstats
path: root/wsutil/str_util.h
blob: 7f1362f4e099b371bba11151c9882a92e02aad53 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/** @file
 * String utility definitions
 *
 * Wireshark - Network traffic analyzer
 * By Gerald Combs <gerald@wireshark.org>
 * Copyright 1998 Gerald Combs
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef __STR_UTIL_H__
#define __STR_UTIL_H__

#include <wireshark.h>
#include <wsutil/wmem/wmem.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/**
 * NOTE(review): only the prototype is visible in this header. By analogy
 * with wmem_strjoinv() (documented as the wmem counterpart of g_strjoinv),
 * this is presumably the wmem-allocated counterpart of g_strconcat(),
 * i.e. it concatenates all of its string arguments -- confirm against the
 * implementation.
 *
 * @param allocator The wmem scope to use to allocate the returned string
 * @param first     The first string argument
 * @param ...       Further string arguments; the declaration is marked
 *                  G_GNUC_NULL_TERMINATED, so the list must end with NULL
 * @return The resulting string (the declaration is marked G_GNUC_MALLOC)
 */
WS_DLL_PUBLIC
char *
wmem_strconcat(wmem_allocator_t *allocator, const char *first, ...)
G_GNUC_MALLOC G_GNUC_NULL_TERMINATED;

/**
 * NOTE(review): only the prototype is visible in this header. By analogy
 * with wmem_strjoinv() (documented as the wmem counterpart of g_strjoinv),
 * this is presumably the wmem-allocated counterpart of g_strjoin(), i.e.
 * it joins its string arguments with the separator between them --
 * confirm against the implementation.
 *
 * @param allocator The wmem scope to use to allocate the returned string
 * @param separator The separator string (whether NULL is accepted here,
 *                  as it is for wmem_strjoinv(), is not visible from this
 *                  header -- TODO confirm)
 * @param first     The first string argument
 * @param ...       Further string arguments; the declaration is marked
 *                  G_GNUC_NULL_TERMINATED, so the list must end with NULL
 * @return The resulting string (the declaration is marked G_GNUC_MALLOC)
 */
WS_DLL_PUBLIC
char *
wmem_strjoin(wmem_allocator_t *allocator,
             const char *separator, const char *first, ...)
G_GNUC_MALLOC G_GNUC_NULL_TERMINATED;

/**
 * As g_strjoinv, with the returned string wmem allocated.
 * Joins a number of strings together to form one long string,
 * with the optional separator inserted between each of them.
 *
 * @param allocator The wmem scope to use to allocate the returned string
 * @param separator A string to insert between each of the strings, or NULL.
 * @param str_array A NULL-terminated array of strings to join
 * @return The joined string, allocated in the given wmem scope
 *
 * @note If str_array has no items, the return value is an empty string.
 * str_array should not be NULL (NULL is returned with a warning.)
 * NULL as a separator is equivalent to the empty string.
 */
WS_DLL_PUBLIC
char *
wmem_strjoinv(wmem_allocator_t *allocator,
              const char *separator, char **str_array)
G_GNUC_MALLOC;

/**
 * Splits a string into a maximum of max_tokens pieces, using the given
 * delimiter. If max_tokens is reached, the remainder of string is appended
 * to the last token. Successive delimiters are not folded and will instead
 * result in an empty string as an element.
 *
 * If src or delimiter are NULL, or if delimiter is empty, this will return
 * NULL.
 *
 * Do not use with a NULL allocator, use g_strsplit instead.
 *
 * @param allocator  The wmem scope to allocate from; must not be NULL
 * @param src        The string to split
 * @param delimiter  The delimiter string; must be non-NULL and non-empty
 * @param max_tokens The maximum number of pieces to produce
 * @return The resulting tokens, or NULL in the cases described above
 */
WS_DLL_PUBLIC
char **
wmem_strsplit(wmem_allocator_t *allocator, const char *src,
        const char *delimiter, int max_tokens);

/**
 * wmem_ascii_strdown:
 * Based on g_ascii_strdown
 *
 * Converts all upper case ASCII letters to lower case ASCII letters.
 *
 * @param allocator The wmem allocator (scope) to use for the returned string.
 * @param str a string.
 * @param len length of str in bytes, or -1 if str is nul-terminated.
 *
 * @return a newly-allocated string, with all the upper case
 *         characters in str converted to lower case, with
 *         semantics that exactly match g_ascii_tolower(). (Note
 *         that this is unlike the old g_strdown(), which modified
 *         the string in place.)
 **/
WS_DLL_PUBLIC
char*
wmem_ascii_strdown(wmem_allocator_t *allocator, const char *str, ssize_t len);

/** Convert all upper-case ASCII letters to their ASCII lower-case
 *  equivalents, in place, with a simple non-locale-dependent
 *  ASCII mapping (A-Z -> a-z).
 *  All other characters are left unchanged, as the mapping to
 *  lower case may be locale-dependent.
 *
 *  The string is assumed to be in a character encoding, such as
 *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
 *  bytes in the range 0x00 through 0x7F are ASCII characters and
 *  non-ASCII characters are constructed from one or more bytes in
 *  the range 0x80 through 0xFF.
 *
 * @param str The string to be lower-cased; it is modified in place.
 * @return    A pointer to the string.
 */
WS_DLL_PUBLIC
char *ascii_strdown_inplace(char *str);

/** Convert all lower-case ASCII letters to their ASCII upper-case
 *  equivalents, in place, with a simple non-locale-dependent
 *  ASCII mapping (a-z -> A-Z).
 *  All other characters are left unchanged, as the mapping to
 *  upper case may be locale-dependent.
 *
 *  The string is assumed to be in a character encoding, such as
 *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
 *  bytes in the range 0x00 through 0x7F are ASCII characters and
 *  non-ASCII characters are constructed from one or more bytes in
 *  the range 0x80 through 0xFF.
 *
 * @param str The string to be upper-cased; it is modified in place.
 * @return    A pointer to the string.
 */
WS_DLL_PUBLIC
char *ascii_strup_inplace(char *str);

/** Check if an entire string consists of printable characters.
 *
 * No length is passed, so the string is presumably NUL-terminated;
 * for length-delimited UTF-8 input see isprint_utf8_string().
 *
 * @param str    The string to be checked
 * @return       true if the entire string is printable, otherwise false
 */
WS_DLL_PUBLIC
bool isprint_string(const char *str);

/** Given a not-necessarily-null-terminated string, expected to be in
 *  UTF-8 but possibly containing invalid sequences (as it may have come
 *  from packet data), and the length of the string, determine if the
 *  string is valid UTF-8 consisting entirely of printable characters.
 *
 *  This means that it:
 *
 *   does not contain an illegal UTF-8 sequence (including overlong encodings,
 *   the sequences reserved for UTF-16 surrogate halves, and the values for
 *   code points above U+10FFFF that are no longer in Unicode);
 *
 *   does not contain a non-printable Unicode character such as control
 *   characters (including internal NULL bytes);
 *
 *   does not end in a partial sequence that could begin a valid character;
 *
 *   does not start with a partial sequence that could end a valid character;
 *
 * and thus guarantees that the result of format_text() would be the same as
 * that of wmem_strndup() with the same parameters.
 *
 * @param str    The string to be checked
 * @param length The number of bytes to validate
 * @return       true if the entire string is valid and printable UTF-8,
 *               otherwise false
 */
WS_DLL_PUBLIC
bool isprint_utf8_string(const char *str, const unsigned length);

/** Check if an entire string consists of digits.
 *
 * No length is passed, so the string is presumably NUL-terminated.
 * NOTE(review): the result for an empty string is not visible from this
 * header -- confirm against the implementation if it matters to a caller.
 *
 * @param str    The string to be checked
 * @return       true if the entire string is digits, otherwise false
 */
WS_DLL_PUBLIC
bool isdigit_string(const unsigned char *str);

/** Finds the first occurrence of string 'needle' in string 'haystack'.
 *  The matching is done ignoring the case of ASCII characters in a
 *  non-locale-dependent way.
 *
 *  The strings are assumed to be in a character encoding, such as
 *  an ISO 8859 or other EUC encoding, or UTF-8, in which all
 *  bytes in the range 0x00 through 0x7F are ASCII characters and
 *  non-ASCII characters are constructed from one or more bytes in
 *  the range 0x80 through 0xFF.
 *
 * @param haystack The string possibly containing the substring
 * @param needle The substring to be searched for
 * @return A pointer into 'haystack' where 'needle' is first found.
 *   Otherwise it returns NULL.
 */
WS_DLL_PUBLIC
const char *ws_ascii_strcasestr(const char *haystack, const char *needle);

/**
 * NOTE(review): only the prototype is visible in this header. Presumably
 * returns an escaped copy of 'string'; which characters are escaped, and
 * how, is defined by the implementation -- confirm there.
 *
 * @param alloc      The wmem allocator (presumably used for the returned string)
 * @param string     The input string; no length is passed, so it is
 *                   presumably NUL-terminated (see ws_escape_string_len()
 *                   for the variant that takes a length)
 * @param add_quotes Presumably whether the result is wrapped in quote
 *                   characters -- TODO confirm
 * @return The resulting string
 */
WS_DLL_PUBLIC
char *ws_escape_string(wmem_allocator_t *alloc, const char *string, bool add_quotes);

/**
 * Variant of ws_escape_string() that additionally takes the input length.
 *
 * NOTE(review): only the prototype is visible in this header; the exact
 * escaping rules are defined by the implementation -- confirm there.
 *
 * @param alloc      The wmem allocator (presumably used for the returned string)
 * @param string     The input string
 * @param len        The length of 'string'. The type is signed (ssize_t);
 *                   a negative value possibly means "NUL-terminated", as
 *                   documented for wmem_ascii_strdown() -- TODO confirm
 * @param add_quotes Presumably whether the result is wrapped in quote
 *                   characters -- TODO confirm
 * @return The resulting string
 */
WS_DLL_PUBLIC
char *ws_escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, bool add_quotes);

/**
 * Replace null bytes with "\0".
 *
 * Because the input may contain null bytes, its length is passed
 * explicitly instead of relying on a terminator.
 *
 * @param alloc      The wmem allocator (presumably used for the returned string)
 * @param string     The input buffer, which may contain null bytes
 * @param len        The length of the input in bytes
 * @param add_quotes Presumably whether the result is wrapped in quote
 *                   characters -- TODO confirm
 * @return The resulting string
 */
WS_DLL_PUBLIC
char *ws_escape_null(wmem_allocator_t *alloc, const char *string, size_t len, bool add_quotes);

/**
 * NOTE(review): only the prototype is visible in this header. Judging by
 * the name ("x to n"), this presumably converts a single hexadecimal digit
 * character to its numeric value; the value returned for a character that
 * is not a hex digit is not visible from here -- TODO confirm against the
 * implementation.
 *
 * @param ch The character to convert
 * @return The converted value
 */
WS_DLL_PUBLIC
int ws_xton(char ch);

/** Unit selector passed as the 'unit' argument of format_size_wmem(). */
typedef enum {
    /** No unit will be appended. You must supply your own. */
    FORMAT_SIZE_UNIT_NONE      = 0,
    /** "bytes" for un-prefixed sizes, "B" otherwise. */
    FORMAT_SIZE_UNIT_BYTES     = 1,
    /** "bits" for un-prefixed sizes, "b" otherwise. */
    FORMAT_SIZE_UNIT_BITS      = 2,
    /** "bits/s" for un-prefixed sizes, "bps" otherwise. */
    FORMAT_SIZE_UNIT_BITS_S    = 3,
    /** "bytes/s" for un-prefixed sizes, "Bps" otherwise. */
    FORMAT_SIZE_UNIT_BYTES_S   = 4,
    /** "packets" */
    FORMAT_SIZE_UNIT_PACKETS   = 5,
    /** "packets/s" */
    FORMAT_SIZE_UNIT_PACKETS_S = 6
} format_size_units_e;

/* Prefix-selection bits for the 'flags' argument of format_size_wmem(). */
/** SI (power of 1000) prefixes will be used. */
#define FORMAT_SIZE_PREFIX_SI   (0x1)
/** IEC (power of 1024) prefixes will be used. */
#define FORMAT_SIZE_PREFIX_IEC  (0x2)

/** Given a size, return its value in a human-readable format
 *
 * Prefixes up to "T/Ti" (tera, tebi) are currently supported.
 *
 * @param allocator The wmem allocator to use for the returned string
 *  (the format_size() convenience macro passes NULL here).
 * @param size The size value
 * @param unit The unit of measurement to append; see format_size_units_e.
 * @param flags Flags to control the output: FORMAT_SIZE_PREFIX_SI selects
 *  SI (power of 1000) prefixes, FORMAT_SIZE_PREFIX_IEC selects IEC
 *  (power of 1024) prefixes.
 * @return A newly-allocated string representing the value.
 */
WS_DLL_PUBLIC
char *format_size_wmem(wmem_allocator_t *allocator, int64_t size,
                        format_size_units_e unit, uint16_t flags);

/**
 * Convenience wrapper around format_size_wmem() that passes NULL as the
 * allocator; 'size', 'unit' and 'flags' are forwarded unchanged.
 *
 * NOTE(review): with a NULL allocator the returned string presumably has
 * to be released by the caller by hand -- confirm against the wmem
 * documentation.
 */
#define format_size(size, unit, flags) \
    format_size_wmem(NULL, size, unit, flags)

/**
 * NOTE(review): only the prototype is visible in this header. Judging by
 * the name, this presumably returns 'c' itself when it is printable and a
 * period ('.') otherwise -- TODO confirm against the implementation.
 *
 * @param c The character to examine
 * @return The character to display
 */
WS_DLL_PUBLIC
char printable_char_or_period(char c);

/**
 * NOTE(review): only the prototype is visible in this header. Judging by
 * the name, this presumably produces the symbolic name of the error number
 * 'errnum', using 'buf' as storage when needed -- TODO confirm against the
 * implementation, including whether the returned pointer is always 'buf'.
 *
 * @param errnum   The error number
 * @param buf      A caller-supplied buffer
 * @param buf_size The size of 'buf' in bytes
 * @return A string; the declaration is marked WS_RETNONNULL, so the result
 *         is never NULL
 */
WS_DLL_PUBLIC WS_RETNONNULL
const char *ws_strerrorname_r(int errnum, char *buf, size_t buf_size);

/**
 * NOTE(review): only the prototype is visible in this header. Judging by
 * the name and parameters, this presumably builds a string that, printed
 * on the line below some text, underlines 'len' characters starting at
 * column 'offset' -- TODO confirm against the implementation, including
 * how a negative 'offset' is treated.
 *
 * @param allocator The wmem allocator (presumably used for the returned string)
 * @param offset    The starting position
 * @param len       The length
 * @return The resulting string
 */
WS_DLL_PUBLIC
char *ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len);

/** Given a wmem scope, a not-necessarily-null-terminated string,
 *  expected to be in UTF-8 but possibly containing invalid sequences
 *  (as it may have come from packet data), and the length of the string,
 *  generate a valid UTF-8 string from it, allocated in the specified
 *  wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names;
 *
 *   replaces illegal UTF-8 sequences with U+FFFD (replacement character);
 *
 *  and return a pointer to it.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the input string
 * @param len The length of the input string, in bytes
 * @return A pointer to the formatted string
 *
 * @see tvb_format_text()
 */
WS_DLL_PUBLIC
char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);

/** Same as format_text() but accepts a nul-terminated string, so no
 *  length has to be passed.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the nul-terminated input string
 * @return A pointer to the formatted string
 *
 * @see format_text()
 * @see tvb_format_text()
 */
WS_DLL_PUBLIC
char *format_text_string(wmem_allocator_t* allocator, const char *string);

/**
 * Same as format_text() but replaces any whitespace characters
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * with a space.
 *
 * @param allocator The wmem scope
 * @param line A pointer to the input string
 * @param len The length of the input string
 * @return A pointer to the formatted string
 *
 * @see format_text()
 */
WS_DLL_PUBLIC
char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);

/**
 * Given a string, generate a string from it that shows non-printable
 * characters as the chr parameter passed, except for whitespace characters
 * (space, tab, carriage return, new line, vertical tab, or formfeed),
 * which will be replaced by a space, and return a pointer to it.
 *
 * This does *not* treat the input string as UTF-8.
 *
 * This is useful for displaying binary data that frequently but not always
 * contains text; otherwise the number of C escape codes makes it unreadable.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the input string
 * @param len The length of the input string
 * @param chr The character to use to replace non-printable characters
 * @return A pointer to the formatted string
 *
 */
WS_DLL_PUBLIC
char *format_text_chr(wmem_allocator_t *allocator,
                        const char *string, size_t len, char chr);

/** Given a wmem scope and an 8-bit character,
 *  generate a valid UTF-8 string from it, allocated in the specified
 *  wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (hex
 *   if not one of the standard ones such as LF -> '\n');
 *
 *  and return a pointer to it.
 *
 *  Note that, as documented, the fallback escapes here are hex, whereas
 *  format_text() is documented as using octal.
 *
 * @param allocator The wmem scope
 * @param c A character to format
 * @return A pointer to the formatted string
 */
WS_DLL_PUBLIC
char *format_char(wmem_allocator_t *allocator, char c);

/**
 * Truncate a UTF-8 string in place so that it is no larger than len bytes,
 * ensuring that the string is null terminated and ends with a complete
 * character instead of a partial sequence (e.g., possibly truncating up
 * to 3 additional bytes if the terminal character is 4 bytes long).
 *
 * The buffer holding the string must be large enough (at least len + 1
 * including the null terminator), and the first len bytes of the buffer
 * must be a valid UTF-8 string, except for possibly ending in a partial
 * sequence or not being null terminated. This is a convenience function
 * that for speed does not check either of those conditions.
 *
 * A common use case is when a valid UTF-8 string has been copied into a
 * buffer of length len+1 via snprintf, strlcpy, or strlcat and truncated,
 * to ensure that the final UTF-8 character is not a partial sequence.
 *
 * @param string A pointer to the input string; it is modified in place
 * @param len The maximum length, in bytes, to truncate to
 * @return    A pointer to the string
 */
WS_DLL_PUBLIC
char* ws_utf8_truncate(char *string, size_t len);

/**
 * NOTE(review): only the prototype is visible in this header. Since 'buf'
 * is non-const and nothing is returned, this presumably converts 'bytes'
 * bytes of 'buf' from EBCDIC to ASCII in place -- TODO confirm against
 * the implementation.
 *
 * @param buf   The buffer to convert
 * @param bytes The number of bytes in 'buf'
 */
WS_DLL_PUBLIC
void EBCDIC_to_ASCII(uint8_t *buf, unsigned bytes);

/**
 * Single-byte counterpart of EBCDIC_to_ASCII().
 *
 * NOTE(review): only the prototype is visible in this header; presumably
 * returns the ASCII equivalent of the EBCDIC byte 'c' -- TODO confirm
 * against the implementation.
 *
 * @param c The byte to convert
 * @return The converted byte
 */
WS_DLL_PUBLIC
uint8_t EBCDIC_to_ASCII1(uint8_t c);

/** Types of character encodings (the 'encoding' argument of hex_dump_buffer()). */
typedef enum {
    HEXDUMP_ENC_ASCII,  /**< ASCII (value 0) */
    HEXDUMP_ENC_EBCDIC  /**< EBCDIC (value 1) */
} hex_dump_enc;

/*
 * Hexdump options for ASCII.
 *
 * The option occupies the bits selected by HEXDUMP_ASCII_MASK;
 * HEXDUMP_ASCII_OPTION() extracts it from a combined option word.
 */

/** Bits of the option word that hold the ASCII option. */
#define HEXDUMP_ASCII_MASK            (0x3U)
/** Extract the ASCII option from an option word. */
#define HEXDUMP_ASCII_OPTION(option)  (HEXDUMP_ASCII_MASK & (option))

/** Include ASCII section, no delimiters (legacy tshark behavior). */
#define HEXDUMP_ASCII_INCLUDE         (0x0U)
/** Include ASCII section with delimiters, useful for reliable detection of last hexdata. */
#define HEXDUMP_ASCII_DELIMIT         (0x1U)
/** Exclude ASCII section from hexdump reports, if we really don't want or need it. */
#define HEXDUMP_ASCII_EXCLUDE         (0x2U)

/**
 * Produce a hex dump of a buffer, emitting it through a caller-supplied
 * callback.
 *
 * NOTE(review): only the prototype is visible in this header; the exact
 * line layout, and the meaning of the return values, are defined by the
 * implementation -- confirm there.
 *
 * @param print_line   Callback taking a 'void *' and a 'const char *' and
 *                     returning bool; presumably called once per output
 *                     line with 'fp' as its first argument, and presumably
 *                     a false return aborts the dump -- TODO confirm
 * @param fp           Opaque pointer for the callback (presumably passed
 *                     through unchanged)
 * @param cp           The bytes to dump
 * @param length       The number of bytes at 'cp'
 * @param encoding     The character encoding of the data, one of the
 *                     hex_dump_enc values
 * @param ascii_option One of the HEXDUMP_ASCII_* options (include, delimit
 *                     or exclude the ASCII section)
 * @return A bool (presumably true on success -- TODO confirm)
 */
WS_DLL_PUBLIC
bool hex_dump_buffer(bool (*print_line)(void *, const char *), void *fp,
                                    const unsigned char *cp, unsigned length,
                                    hex_dump_enc encoding,
                                    unsigned ascii_option);

/*
 * Select between a singular and a plural form: yields 's' when the
 * count 'd' equals 1 and 'p' for every other count.
 */
#define plurality(d,s,p) ((d) != 1 ? (p) : (s))

/* Yields the literal "TRUE" for a non-zero 'val' and "FALSE" for a zero one. */
#define true_or_false(val) (!(val) ? "FALSE" : "TRUE")

/*
 * Yields 'val' itself when it is non-NULL and the literal "[NULL]"
 * otherwise. 'val' is evaluated twice when it is non-NULL, so do not
 * pass an expression with side effects.
 */
#define string_or_null(val) (!(val) ? "[NULL]" : (val))

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* __STR_UTIL_H__ */