diff options
Diffstat (limited to 'wsutil/str_util.c')
-rw-r--r-- | wsutil/str_util.c | 296 |
1 files changed, 283 insertions, 13 deletions
/* Return a pointer to the last occurrence of ch in the first n bytes of
 * _haystack, or NULL if it is not found or n is 0.
 *
 * ch is converted to a byte before comparing, as memrchr() does.
 */
const uint8_t *
ws_memrchr(const void *_haystack, int ch, size_t n)
{
#ifdef HAVE_MEMRCHR
    return memrchr(_haystack, ch, n);
#else
    /* A generic implementation. This could be optimized considerably,
     * e.g. by fetching a word at a time.
     */
    const uint8_t *haystack = _haystack;
    const uint8_t c = (uint8_t)ch;

    /* Count an index down from n instead of walking a pointer down from
     * the last byte: a "p >= haystack" loop has to step the pointer to
     * one element before the buffer in order to terminate, and forming
     * that pointer is undefined behaviour.
     */
    while (n > 0) {
        n--;
        if (haystack[n] == c) {
            return &haystack[n];
        }
    }

    return NULL;
#endif /* HAVE_MEMRCHR */
}
if (p != NULL) { + count = n; + while (count >= 0) { + count--; + if (*p == '\0') + break; + p++; + } + + p--; + while (*p == '0') { + p--; + } + + if (*p != decimal_point[0]) { + p++; + } + wmem_strbuf_truncate(strbuf, p - s); + } +} + +/* Given a floating point value, return it in a human-readable format, + * using units with metric prefixes (falling back to scientific notation + * with the base units if outside the range.) + */ +char * +format_units(wmem_allocator_t *allocator, double size, + format_size_units_e unit, uint16_t flags, + int precision) +{ + wmem_strbuf_t *human_str = wmem_strbuf_new(allocator, NULL); + double power = 1000.0; + int pfx_off = 6; + bool is_small = false; + /* is_small is when to use the longer, spelled out unit. + * We use it for inf, NaN, 0, and unprefixed small values, + * but not for unprefixed values using scientific notation + * the value is outside the supported prefix range. + */ + bool scientific = false; + double abs_size = fabs(size); + int exponent = 0; + static const char * const si_prefix[] = {" a", " f", " p", " n", " μ", " m", " ", " k", " M", " G", " T", " P", " E"}; + static const char * const iec_prefix[] = {" ", " Ki", " Mi", " Gi", " Ti", " Pi", " Ei"}; + const char * const *prefix = si_prefix; + int max_exp = (int)G_N_ELEMENTS(si_prefix) - 1; + + char *ret_val; + + if (thousands_grouping_fmt == NULL) + test_printf_thousands_grouping(); + + if (flags & FORMAT_SIZE_PREFIX_IEC) { + prefix = iec_prefix; + max_exp = (int)G_N_ELEMENTS(iec_prefix) - 1; + power = 1024.0; + } + + if (isfinite(size) && size != 0.0) { + + double comp = precision == 0 ? 10.0 : 1.0; + + /* For precision 0, use the range [10, 10*power) because only + * one significant digit is not as useful. This is what format_size + * does for integers. ("ls -h" uses one digit after the decimal + * point only for the [1, 10) range, g_format_size() always displays + * tenths.) Prefer non-prefixed units for the range [1,10), though. 
+ * + * We have a limited number of units to check, so this (which + * can be unrolled) is presumably faster than log + floor + pow/exp + */ + if (abs_size < 1.0) { + while (abs_size < comp) { + abs_size *= power; + exponent--; + if ((exponent + pfx_off) < 0) { + scientific = true; + break; + } + } + } else { + while (abs_size >= comp*power) { + abs_size *= 1/power; + exponent++; + if ((exponent + pfx_off) > max_exp) { + scientific = true; + break; + } + } + } + } + + if (scientific) { + wmem_strbuf_append_printf(human_str, "%.*g", precision + 1, size); + exponent = 0; + } else { + if (exponent == 0) { + is_small = true; + } + size = copysign(abs_size, size); + // Truncate trailing zeros, but do it this way because we know + // we don't want scientific notation, and we don't want %g to + // switch to that if precision is small. (We could always use + // %g when precision is large.) + wmem_strbuf_append_printf(human_str, thousands_grouping_fmt_flt, precision, size); + truncate_numeric_strbuf(human_str, precision); + // XXX - when rounding to a certain precision, printf might + // round up to "power" from something like 999.99999995, which + // looks a little odd on a graph when transitioning from 1,000 bytes + // (for values just under 1 kB) to 1 kB (for values 1 kB and larger.) + // Due to edge cases in binary fp representation and how printf might + // round things, the right way to handle it is taking the printf output + // and comparing it to "1000" and "1024" and adjusting the exponent + // if so - though we need to compare to the version with the thousands + // separator if we have that (which makes it harder to use strnatcmp + // as is.) + } + + if ((size_t)(pfx_off + exponent) < G_N_ELEMENTS(si_prefix)) { + wmem_strbuf_append(human_str, prefix[pfx_off+exponent]); + } + + switch (unit) { + case FORMAT_SIZE_UNIT_NONE: + break; + case FORMAT_SIZE_UNIT_BYTES: + wmem_strbuf_append(human_str, is_small ? 
"bytes" : "B"); + break; + case FORMAT_SIZE_UNIT_BITS: + wmem_strbuf_append(human_str, is_small ? "bits" : "b"); + break; + case FORMAT_SIZE_UNIT_BITS_S: + wmem_strbuf_append(human_str, is_small ? "bits/s" : "bps"); + break; + case FORMAT_SIZE_UNIT_BYTES_S: + wmem_strbuf_append(human_str, is_small ? "bytes/s" : "Bps"); + break; + case FORMAT_SIZE_UNIT_PACKETS: + wmem_strbuf_append(human_str, is_small ? "packets" : "packets"); + break; + case FORMAT_SIZE_UNIT_PACKETS_S: + wmem_strbuf_append(human_str, is_small ? "packets/s" : "packets/s"); + break; + case FORMAT_SIZE_UNIT_EVENTS: + wmem_strbuf_append(human_str, is_small ? "events" : "events"); + break; + case FORMAT_SIZE_UNIT_EVENTS_S: + wmem_strbuf_append(human_str, is_small ? "events/s" : "events/s"); + break; + case FORMAT_SIZE_UNIT_FIELDS: + wmem_strbuf_append(human_str, is_small ? "fields" : "fields"); + break; + case FORMAT_SIZE_UNIT_SECONDS: + wmem_strbuf_append(human_str, is_small ? "seconds" : "s"); + break; + case FORMAT_SIZE_UNIT_ERLANGS: + wmem_strbuf_append(human_str, is_small ? "erlangs" : "E"); + break; + default: + ws_assert_not_reached(); + } + + ret_val = wmem_strbuf_finalize(human_str); + /* Convention is a space between the value and the units. If we have + * a prefix, the space is before the prefix. There are two possible + * uses of FORMAT_SIZE_UNIT_NONE: + * 1. Add a unit immediately after the string returned. In this case, + * we would want the string to end with a space if there's no prefix. + * 2. The unit appears somewhere else, e.g. in a legend, header, or + * different column. In this case, we don't want the string to end + * with a space if there's no prefix. + * chomping the string here, as we've traditionally done, optimizes for + * the latter case but makes the former case harder. + * Perhaps the right approach is to distinguish the cases with a new + * enum value. 
+ */ + return g_strchomp(ret_val); +} + /* Given a size, return its value in a human-readable format */ -/* This doesn't handle fractional values. We might want to make size a double. */ +/* This doesn't handle fractional values. We might want to just + * call the version with the double and precision 0 (possibly + * slower due to the use of floating point math, but do we care?) + */ char * format_size_wmem(wmem_allocator_t *allocator, int64_t size, format_size_units_e unit, uint16_t flags) @@ -418,6 +648,18 @@ format_size_wmem(wmem_allocator_t *allocator, int64_t size, case FORMAT_SIZE_UNIT_PACKETS_S: wmem_strbuf_append(human_str, is_small ? " packets/s" : "packets/s"); break; + case FORMAT_SIZE_UNIT_FIELDS: + wmem_strbuf_append(human_str, is_small ? " fields" : "fields"); + break; + /* These aren't that practical to use with integers, but + * perhaps better than asserting. + */ + case FORMAT_SIZE_UNIT_SECONDS: + wmem_strbuf_append(human_str, is_small ? " seconds" : "s"); + break; + case FORMAT_SIZE_UNIT_ERLANGS: + wmem_strbuf_append(human_str, is_small ? " erlangs" : "E"); + break; default: ws_assert_not_reached(); } @@ -443,8 +685,9 @@ escape_char(char c, char *p) ws_assert(p); /* - * Backslashes and double-quotes must - * be escaped. Whitespace is also escaped. + * backslashes and double-quotes must be escaped (double-quotes + * are escaped by passing '"' as quote_char in escape_string_len) + * whitespace is also escaped. 
*/ switch (c) { case '\a': r = 'a'; break; @@ -454,7 +697,6 @@ escape_char(char c, char *p) case '\r': r = 'r'; break; case '\t': r = 't'; break; case '\v': r = 'v'; break; - case '"': r = '"'; break; case '\\': r = '\\'; break; case '\0': r = '0'; break; } @@ -479,7 +721,8 @@ escape_null(char c, char *p) static char * escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, - bool (*escape_func)(char c, char *p), bool add_quotes) + bool (*escape_func)(char c, char *p), bool add_quotes, + char quote_char, bool double_quote) { char c, r; wmem_strbuf_t *buf; @@ -495,8 +738,8 @@ escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, buf = wmem_strbuf_new_sized(alloc, alloc_size); - if (add_quotes) - wmem_strbuf_append_c(buf, '"'); + if (add_quotes && quote_char != '\0') + wmem_strbuf_append_c(buf, quote_char); for (i = 0; i < len; i++) { c = string[i]; @@ -504,14 +747,30 @@ escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, wmem_strbuf_append_c(buf, '\\'); wmem_strbuf_append_c(buf, r); } + else if (c == quote_char && quote_char != '\0') { + /* If quoting, we must escape the quote_char somehow. */ + if (double_quote) { + wmem_strbuf_append_c(buf, c); + wmem_strbuf_append_c(buf, c); + } else { + wmem_strbuf_append_c(buf, '\\'); + wmem_strbuf_append_c(buf, c); + } + } + else if (c == '\\' && quote_char != '\0' && !double_quote) { + /* If quoting, and escaping the quote_char with a backslash, + * then backslash must be escaped, even if escape_func doesn't. */ + wmem_strbuf_append_c(buf, '\\'); + wmem_strbuf_append_c(buf, '\\'); + } else { /* Other UTF-8 bytes are passed through. 
*/ wmem_strbuf_append_c(buf, c); } } - if (add_quotes) - wmem_strbuf_append_c(buf, '"'); + if (add_quotes && quote_char != '\0') + wmem_strbuf_append_c(buf, quote_char); return wmem_strbuf_finalize(buf); } @@ -519,18 +778,29 @@ escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, char * ws_escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, bool add_quotes) { - return escape_string_len(alloc, string, len, escape_char, add_quotes); + return escape_string_len(alloc, string, len, escape_char, add_quotes, '"', false); } char * ws_escape_string(wmem_allocator_t *alloc, const char *string, bool add_quotes) { - return escape_string_len(alloc, string, -1, escape_char, add_quotes); + return escape_string_len(alloc, string, -1, escape_char, add_quotes, '"', false); } char *ws_escape_null(wmem_allocator_t *alloc, const char *string, size_t len, bool add_quotes) { - return escape_string_len(alloc, string, len, escape_null, add_quotes); + /* XXX: The existing behavior (maintained) here is not to escape + * backslashes even though NUL is escaped. + */ + return escape_string_len(alloc, string, len, escape_null, add_quotes, add_quotes ? '"' : '\0', false); +} + +char *ws_escape_csv(wmem_allocator_t *alloc, const char *string, bool add_quotes, char quote_char, bool double_quote, bool escape_whitespace) +{ + if (escape_whitespace) + return escape_string_len(alloc, string, -1, escape_char, add_quotes, quote_char, double_quote); + else + return escape_string_len(alloc, string, -1, escape_null, add_quotes, quote_char, double_quote); } const char * |