1 files changed, 283 insertions, 13 deletions
diff --git a/wsutil/str_util.c b/wsutil/str_util.c
index 4243b22b..556ef5a9 100644
--- a/wsutil/str_util.c
+++ b/wsutil/str_util.c
@@ -13,6 +13,8 @@
 #include "str_util.h"
 
 #include <string.h>
+#include <locale.h>
+#include <math.h>
 
 #include <ws_codepoints.h>
 
@@ -339,10 +341,41 @@ ws_ascii_strcasestr(const char *haystack, const char *needle)
     return NULL;
 }
 
+/* Return the last occurrence of ch in the n bytes of haystack.
+ * If not found or n is 0, return NULL. */
+const uint8_t *
+ws_memrchr(const void *_haystack, int ch, size_t n)
+{
+#ifdef HAVE_MEMRCHR
+    return memrchr(_haystack, ch, n);
+#else
+    /* A generic implementation. This could be optimized considerably,
+     * e.g. by fetching a word at a time.
+     */
+    if (n == 0) {
+        return NULL;
+    }
+    const uint8_t *haystack = _haystack;
+    const uint8_t *p;
+    uint8_t c = (uint8_t)ch;
+
+    const uint8_t *const end = haystack + n - 1;
+
+    for (p = end; p >= haystack; --p) {
+        if (*p == c) {
+            return p;
+        }
+    }
+
+    return NULL;
+#endif /* HAVE_MEMRCHR */
+}
+
 #define FORMAT_SIZE_UNIT_MASK 0x00ff
 #define FORMAT_SIZE_PFX_MASK 0xff00
 
-static const char *thousands_grouping_fmt = NULL;
+static const char *thousands_grouping_fmt;
+static const char *thousands_grouping_fmt_flt;
 
 DIAG_OFF(format)
 static void test_printf_thousands_grouping(void) {
@@ -351,16 +384,213 @@ static void test_printf_thousands_grouping(void) {
     wmem_strbuf_append_printf(buf, "%'d", 22);
     if (g_strcmp0(wmem_strbuf_get_str(buf), "22") == 0) {
         thousands_grouping_fmt = "%'"PRId64;
+        thousands_grouping_fmt_flt = "%'.*f";
     } else {
         /* Don't use */
         thousands_grouping_fmt = "%"PRId64;
+        thousands_grouping_fmt_flt = "%.*f";
     }
     wmem_strbuf_destroy(buf);
 }
 DIAG_ON(format)
 
+static const char* decimal_point = NULL;
+
+static void truncate_numeric_strbuf(wmem_strbuf_t *strbuf, int n) {
+
+    const char *s = wmem_strbuf_get_str(strbuf);
+    char *p;
+    int count;
+
+    if (decimal_point == NULL) {
+        decimal_point = localeconv()->decimal_point;
+    }
+
+    p = strchr(s, decimal_point[0]);
+    if (p != NULL) {
+        count = n;
+        while (count >= 0) {
+            count--;
+            if (*p == '\0')
+                break;
+            p++;
+        }
+
+        p--;
+        while (*p == '0') {
+            p--;
+        }
+
+        if (*p != decimal_point[0]) {
+            p++;
+        }
+        wmem_strbuf_truncate(strbuf, p - s);
+    }
+}
+
+/* Given a floating point value, return it in a human-readable format,
+ * using units with metric prefixes (falling back to scientific notation
+ * with the base units if outside the range.)
+ */
+char *
+format_units(wmem_allocator_t *allocator, double size,
+             format_size_units_e unit, uint16_t flags,
+             int precision)
+{
+    wmem_strbuf_t *human_str = wmem_strbuf_new(allocator, NULL);
+    double power = 1000.0;
+    int pfx_off = 6;
+    bool is_small = false;
+    /* is_small is when to use the longer, spelled out unit.
+     * We use it for inf, NaN, 0, and unprefixed small values,
+     * but not for unprefixed values using scientific notation
+     * the value is outside the supported prefix range.
+     */
+    bool scientific = false;
+    double abs_size = fabs(size);
+    int exponent = 0;
+    static const char * const si_prefix[] = {" a", " f", " p", " n", " μ", " m", " ", " k", " M", " G", " T", " P", " E"};
+    static const char * const iec_prefix[] = {" ", " Ki", " Mi", " Gi", " Ti", " Pi", " Ei"};
+    const char * const *prefix = si_prefix;
+    int max_exp = (int)G_N_ELEMENTS(si_prefix) - 1;
+
+    char *ret_val;
+
+    if (thousands_grouping_fmt == NULL)
+        test_printf_thousands_grouping();
+
+    if (flags & FORMAT_SIZE_PREFIX_IEC) {
+        prefix = iec_prefix;
+        max_exp = (int)G_N_ELEMENTS(iec_prefix) - 1;
+        power = 1024.0;
+    }
+
+    if (isfinite(size) && size != 0.0) {
+
+        double comp = precision == 0 ? 10.0 : 1.0;
+
+        /* For precision 0, use the range [10, 10*power) because only
+         * one significant digit is not as useful. This is what format_size
+         * does for integers. ("ls -h" uses one digit after the decimal
+         * point only for the [1, 10) range, g_format_size() always displays
+         * tenths.) Prefer non-prefixed units for the range [1,10), though.
+         *
+         * We have a limited number of units to check, so this (which
+         * can be unrolled) is presumably faster than log + floor + pow/exp
+         */
+        if (abs_size < 1.0) {
+            while (abs_size < comp) {
+                abs_size *= power;
+                exponent--;
+                if ((exponent + pfx_off) < 0) {
+                    scientific = true;
+                    break;
+                }
+            }
+        } else {
+            while (abs_size >= comp*power) {
+                abs_size *= 1/power;
+                exponent++;
+                if ((exponent + pfx_off) > max_exp) {
+                    scientific = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (scientific) {
+        wmem_strbuf_append_printf(human_str, "%.*g", precision + 1, size);
+        exponent = 0;
+    } else {
+        if (exponent == 0) {
+            is_small = true;
+        }
+        size = copysign(abs_size, size);
+        // Truncate trailing zeros, but do it this way because we know
+        // we don't want scientific notation, and we don't want %g to
+        // switch to that if precision is small. (We could always use
+        // %g when precision is large.)
+        wmem_strbuf_append_printf(human_str, thousands_grouping_fmt_flt, precision, size);
+        truncate_numeric_strbuf(human_str, precision);
+        // XXX - when rounding to a certain precision, printf might
+        // round up to "power" from something like 999.99999995, which
+        // looks a little odd on a graph when transitioning from 1,000 bytes
+        // (for values just under 1 kB) to 1 kB (for values 1 kB and larger.)
+        // Due to edge cases in binary fp representation and how printf might
+        // round things, the right way to handle it is taking the printf output
+        // and comparing it to "1000" and "1024" and adjusting the exponent
+        // if so - though we need to compare to the version with the thousands
+        // separator if we have that (which makes it harder to use strnatcmp
+        // as is.)
+    }
+
+    if ((size_t)(pfx_off + exponent) < G_N_ELEMENTS(si_prefix)) {
+        wmem_strbuf_append(human_str, prefix[pfx_off+exponent]);
+    }
+
+    switch (unit) {
+        case FORMAT_SIZE_UNIT_NONE:
+            break;
+        case FORMAT_SIZE_UNIT_BYTES:
+            wmem_strbuf_append(human_str, is_small ? "bytes" : "B");
+            break;
+        case FORMAT_SIZE_UNIT_BITS:
+            wmem_strbuf_append(human_str, is_small ? "bits" : "b");
+            break;
+        case FORMAT_SIZE_UNIT_BITS_S:
+            wmem_strbuf_append(human_str, is_small ? "bits/s" : "bps");
+            break;
+        case FORMAT_SIZE_UNIT_BYTES_S:
+            wmem_strbuf_append(human_str, is_small ? "bytes/s" : "Bps");
+            break;
+        case FORMAT_SIZE_UNIT_PACKETS:
+            wmem_strbuf_append(human_str, is_small ? "packets" : "packets");
+            break;
+        case FORMAT_SIZE_UNIT_PACKETS_S:
+            wmem_strbuf_append(human_str, is_small ? "packets/s" : "packets/s");
+            break;
+        case FORMAT_SIZE_UNIT_EVENTS:
+            wmem_strbuf_append(human_str, is_small ? "events" : "events");
+            break;
+        case FORMAT_SIZE_UNIT_EVENTS_S:
+            wmem_strbuf_append(human_str, is_small ? "events/s" : "events/s");
+            break;
+        case FORMAT_SIZE_UNIT_FIELDS:
+            wmem_strbuf_append(human_str, is_small ? "fields" : "fields");
+            break;
+        case FORMAT_SIZE_UNIT_SECONDS:
+            wmem_strbuf_append(human_str, is_small ? "seconds" : "s");
+            break;
+        case FORMAT_SIZE_UNIT_ERLANGS:
+            wmem_strbuf_append(human_str, is_small ? "erlangs" : "E");
+            break;
+        default:
+            ws_assert_not_reached();
+    }
+
+    ret_val = wmem_strbuf_finalize(human_str);
+    /* Convention is a space between the value and the units. If we have
+     * a prefix, the space is before the prefix. There are two possible
+     * uses of FORMAT_SIZE_UNIT_NONE:
+     * 1. Add a unit immediately after the string returned. In this case,
+     *    we would want the string to end with a space if there's no prefix.
+     * 2. The unit appears somewhere else, e.g. in a legend, header, or
+     *    different column. In this case, we don't want the string to end
+     *    with a space if there's no prefix.
+     * chomping the string here, as we've traditionally done, optimizes for
+     * the latter case but makes the former case harder.
+     * Perhaps the right approach is to distinguish the cases with a new
+     * enum value.
+     */
+    return g_strchomp(ret_val);
+}
+
 /* Given a size, return its value in a human-readable format */
-/* This doesn't handle fractional values. We might want to make size a double. */
+/* This doesn't handle fractional values. We might want to just
+ * call the version with the double and precision 0 (possibly
+ * slower due to the use of floating point math, but do we care?)
+ */
 char *
 format_size_wmem(wmem_allocator_t *allocator, int64_t size,
                         format_size_units_e unit, uint16_t flags)
@@ -418,6 +648,18 @@ format_size_wmem(wmem_allocator_t *allocator, int64_t size,
         case FORMAT_SIZE_UNIT_PACKETS_S:
             wmem_strbuf_append(human_str, is_small ? " packets/s" : "packets/s");
             break;
+        case FORMAT_SIZE_UNIT_FIELDS:
+            wmem_strbuf_append(human_str, is_small ? " fields" : "fields");
+            break;
+        /* These aren't that practical to use with integers, but
+         * perhaps better than asserting.
+         */
+        case FORMAT_SIZE_UNIT_SECONDS:
+            wmem_strbuf_append(human_str, is_small ? " seconds" : "s");
+            break;
+        case FORMAT_SIZE_UNIT_ERLANGS:
+            wmem_strbuf_append(human_str, is_small ? " erlangs" : "E");
+            break;
         default:
             ws_assert_not_reached();
     }
@@ -443,8 +685,9 @@ escape_char(char c, char *p)
     ws_assert(p);
 
     /*
-     * Backslashes and double-quotes must
-     * be escaped. Whitespace is also escaped.
+     * backslashes and double-quotes must be escaped (double-quotes
+     * are escaped by passing '"' as quote_char in escape_string_len)
+     * whitespace is also escaped.
      */
     switch (c) {
         case '\a': r = 'a'; break;
@@ -454,7 +697,6 @@ escape_char(char c, char *p)
         case '\r': r = 'r'; break;
         case '\t': r = 't'; break;
         case '\v': r = 'v'; break;
-        case '"':  r = '"'; break;
         case '\\': r = '\\'; break;
         case '\0': r = '0'; break;
     }
@@ -479,7 +721,8 @@ escape_null(char c, char *p)
 
 static char *
 escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len,
-                    bool (*escape_func)(char c, char *p), bool add_quotes)
+                    bool (*escape_func)(char c, char *p), bool add_quotes,
+                    char quote_char, bool double_quote)
 {
     char c, r;
     wmem_strbuf_t *buf;
@@ -495,8 +738,8 @@ escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len,
 
     buf = wmem_strbuf_new_sized(alloc, alloc_size);
 
-    if (add_quotes)
-        wmem_strbuf_append_c(buf, '"');
+    if (add_quotes && quote_char != '\0')
+        wmem_strbuf_append_c(buf, quote_char);
 
     for (i = 0; i < len; i++) {
         c = string[i];
@@ -504,14 +747,30 @@ escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len,
             wmem_strbuf_append_c(buf, '\\');
             wmem_strbuf_append_c(buf, r);
         }
+        else if (c == quote_char && quote_char != '\0') {
+            /* If quoting, we must escape the quote_char somehow. */
+            if (double_quote) {
+                wmem_strbuf_append_c(buf, c);
+                wmem_strbuf_append_c(buf, c);
+            } else {
+                wmem_strbuf_append_c(buf, '\\');
+                wmem_strbuf_append_c(buf, c);
+            }
+        }
+        else if (c == '\\' && quote_char != '\0' && !double_quote) {
+            /* If quoting, and escaping the quote_char with a backslash,
+             * then backslash must be escaped, even if escape_func doesn't. */
+            wmem_strbuf_append_c(buf, '\\');
+            wmem_strbuf_append_c(buf, '\\');
+        }
         else {
             /* Other UTF-8 bytes are passed through. */
             wmem_strbuf_append_c(buf, c);
         }
     }
 
-    if (add_quotes)
-        wmem_strbuf_append_c(buf, '"');
+    if (add_quotes && quote_char != '\0')
+        wmem_strbuf_append_c(buf, quote_char);
 
     return wmem_strbuf_finalize(buf);
 }
@@ -519,18 +778,29 @@ escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len,
 char *
 ws_escape_string_len(wmem_allocator_t *alloc, const char *string, ssize_t len, bool add_quotes)
 {
-    return escape_string_len(alloc, string, len, escape_char, add_quotes);
+    return escape_string_len(alloc, string, len, escape_char, add_quotes, '"', false);
 }
 
 char *
 ws_escape_string(wmem_allocator_t *alloc, const char *string, bool add_quotes)
 {
-    return escape_string_len(alloc, string, -1, escape_char, add_quotes);
+    return escape_string_len(alloc, string, -1, escape_char, add_quotes, '"', false);
 }
 
 char *ws_escape_null(wmem_allocator_t *alloc, const char *string, size_t len, bool add_quotes)
 {
-    return escape_string_len(alloc, string, len, escape_null, add_quotes);
+    /* XXX: The existing behavior (maintained) here is not to escape
+     * backslashes even though NUL is escaped.
+     */
+    return escape_string_len(alloc, string, len, escape_null, add_quotes, add_quotes ? '"' : '\0', false);
+}
+
+char *ws_escape_csv(wmem_allocator_t *alloc, const char *string, bool add_quotes, char quote_char, bool double_quote, bool escape_whitespace)
+{
+    if (escape_whitespace)
+        return escape_string_len(alloc, string, -1, escape_char, add_quotes, quote_char, double_quote);
+    else
+        return escape_string_len(alloc, string, -1, escape_null, add_quotes, quote_char, double_quote);
 }
 
 const char *