summary refs log tree commit diff stats
path: root/src/basic/strbuf.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/basic/strbuf.c')
-rw-r--r-- src/basic/strbuf.c 181
1 files changed, 181 insertions, 0 deletions
diff --git a/src/basic/strbuf.c b/src/basic/strbuf.c
new file mode 100644
index 0000000..0617acc
--- /dev/null
+++ b/src/basic/strbuf.c
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "alloc-util.h"
+#include "sort-util.h"
+#include "strbuf.h"
+
+/*
+ * Strbuf stores given strings in a single contiguous allocated memory
+ * area. Identical strings are de-duplicated and return the same offset
+ * as the first string stored. If a string is the tail of a string that
+ * is already in the buffer, the offset of that existing tail is returned.
+ *
+ * A trie (http://en.wikipedia.org/wiki/Trie) is used to maintain the
+ * information about the stored strings.
+ *
+ * Example of udev rules:
+ * $ ./udevadm test .
+ * ...
+ * read rules file: /usr/lib/udev/rules.d/99-systemd.rules
+ * rules contain 196608 bytes tokens (16384 * 12 bytes), 39742 bytes strings
+ * 23939 strings (207859 bytes), 20404 de-duplicated (171653 bytes), 3536 trie nodes used
+ * ...
+ */
+
+/*
+ * Allocate a new, empty strbuf.
+ *
+ * The string buffer starts out as one byte obtained from new0() and the
+ * trie starts out as a single root node, which is why len and nodes_count
+ * both begin at 1.
+ *
+ * Returns the new object, or NULL if any of the allocations failed.
+ */
+struct strbuf* strbuf_new(void) {
+        struct strbuf *result = new(struct strbuf, 1);
+        if (!result)
+                return NULL;
+
+        char *buffer = new0(char, 1);
+        struct strbuf_node *trie_root = new0(struct strbuf_node, 1);
+
+        if (buffer && trie_root) {
+                *result = (struct strbuf) {
+                        .buf = buffer,
+                        .root = trie_root,
+                        .len = 1,
+                        .nodes_count = 1,
+                };
+                return result;
+        }
+
+        /* at least one allocation failed; release whatever we did get */
+        free(buffer);
+        free(trie_root);
+        return mfree(result);
+}
+
+/*
+ * Recursively free a trie node: first every child subtree, then the
+ * node's child array, then the node itself. Always returns NULL so the
+ * caller can reset its pointer in the same statement.
+ */
+static struct strbuf_node* strbuf_node_cleanup(struct strbuf_node *node) {
+        for (size_t idx = 0; idx < node->children_count; idx++) {
+                struct strbuf_child_entry *entry = &node->children[idx];
+
+                strbuf_node_cleanup(entry->child);
+        }
+
+        free(node->children);
+        free(node);
+        return NULL;
+}
+
+/*
+ * Release the trie and keep only the string buffer. Accepts NULL and is
+ * a no-op if the trie is already gone. Afterwards str->root is unset, so
+ * strbuf_add_string() will refuse further additions.
+ */
+void strbuf_complete(struct strbuf *str) {
+        if (str && str->root)
+                str->root = strbuf_node_cleanup(str->root);
+}
+
+/*
+ * Destroy a strbuf completely: the trie, the string buffer and the
+ * object itself. Accepts NULL. Always returns NULL.
+ */
+struct strbuf* strbuf_free(struct strbuf *str) {
+        if (str) {
+                strbuf_complete(str);
+                free(str->buf);
+                free(str);
+        }
+
+        return NULL;
+}
+
+/*
+ * Ordering function for child entries: compares the key characters only.
+ * Returns a negative value, zero or a positive value when n1 sorts
+ * before, equal to or after n2.
+ */
+static int strbuf_children_cmp(const struct strbuf_child_entry *n1,
+                               const struct strbuf_child_entry *n2) {
+        const int lhs = n1->c;
+        const int rhs = n2->c;
+
+        return lhs - rhs;
+}
+
+/*
+ * Insert a child entry (key c, subtree node_child) into node->children,
+ * keeping the array ordered by strbuf_children_cmp() so that it can be
+ * bisected later.
+ *
+ * The array must already have room for children_count + 1 entries; this
+ * function only shifts the existing entries and bumps children_count.
+ */
+static void bubbleinsert(struct strbuf_node *node,
+                         uint8_t c,
+                         struct strbuf_node *node_child) {
+
+        struct strbuf_child_entry entry = {
+                .c = c,
+                .child = node_child,
+        };
+        int lo = 0, hi = node->children_count;
+
+        /* bisect for the first slot whose key sorts strictly after c */
+        while (lo < hi) {
+                int mid = (hi + lo) / 2;
+
+                if (strbuf_children_cmp(&node->children[mid], &entry) > 0)
+                        hi = mid;
+                else
+                        lo = mid + 1;
+        }
+
+        /* open a gap at that slot by moving the tail up by one entry */
+        struct strbuf_child_entry *slot = node->children + lo;
+
+        memmove(slot + 1, slot,
+                sizeof(struct strbuf_child_entry) * (node->children_count - lo));
+        *slot = entry;
+
+        node->children_count++;
+}
+
+/*
+ * Add the len bytes at s to the buffer and return the offset at which the
+ * string can be found in str->buf.
+ *
+ * The trie is walked starting from the LAST character of s. At every node
+ * on the way the string stored for that node is checked: if s equals the
+ * tail of it, nothing is stored and the offset of that tail is returned.
+ * Otherwise the string is appended to the buffer (followed by a NUL byte)
+ * and a new trie node is hooked in below the last node reached.
+ *
+ * The counters in_count, in_len, dedup_count, dedup_len and nodes_count
+ * of str are updated along the way.
+ *
+ * Returns:
+ *   >= 0      offset of the string in str->buf (0 for the empty string)
+ *   -EINVAL   the trie is gone (str->root is unset)
+ *   -ENOMEM   an allocation failed; the buffer contents, str->len and
+ *             the trie are left exactly as they were before the call
+ */
+ssize_t strbuf_add_string(struct strbuf *str, const char *s, size_t len) {
+        uint8_t c;
+        char *buf_new;
+        struct strbuf_child_entry *child, *children;
+        struct strbuf_node *node;
+        ssize_t off;
+
+        if (!str->root)
+                return -EINVAL;
+
+        /* search string; start from last character to find possibly matching tails */
+
+        str->in_count++;
+        if (len == 0) {
+                /* the empty string always maps to offset 0 */
+                str->dedup_count++;
+                return 0;
+        }
+        str->in_len += len;
+
+        node = str->root;
+        for (size_t depth = 0; depth <= len; depth++) {
+                struct strbuf_child_entry search;
+
+                /* match against current node: off is where a tail of length len
+                 * would start inside this node's string; it is only used when
+                 * the node's string is long enough (or the whole of s was
+                 * already matched character by character, depth == len) */
+                off = node->value_off + node->value_len - len;
+                if (depth == len || (node->value_len >= len && memcmp(str->buf + off, s, len) == 0)) {
+                        str->dedup_len += len;
+                        str->dedup_count++;
+                        return off;
+                }
+
+                c = s[len - 1 - depth];
+
+                /* lookup child node */
+                search.c = c;
+                child = typesafe_bsearch(&search, node->children, node->children_count, strbuf_children_cmp);
+                if (!child)
+                        break;
+                node = child->child;
+        }
+
+        /* Not found: node is the deepest node reached and c the character for
+         * which it has no child yet.
+         *
+         * Do every allocation that can fail BEFORE modifying the buffer, so
+         * that an -ENOMEM return never leaves an appended string behind that
+         * no trie node refers to. */
+
+        /* new node */
+        _cleanup_free_ struct strbuf_node *node_child = NULL;
+
+        node_child = new(struct strbuf_node, 1);
+        if (!node_child)
+                return -ENOMEM;
+
+        /* extend the child array by one slot; the array may have moved, so the
+         * pointer is stored right away, while children_count stays unchanged
+         * until the entry is really inserted */
+        children = reallocarray(node->children, node->children_count + 1, sizeof(struct strbuf_child_entry));
+        if (!children)
+                return -ENOMEM;
+        node->children = children;
+
+        /* room for the string plus its terminating NUL */
+        buf_new = realloc(str->buf, str->len + len + 1);
+        if (!buf_new)
+                return -ENOMEM;
+        str->buf = buf_new;
+
+        /* nothing below can fail: add new string */
+        off = str->len;
+        memcpy(str->buf + off, s, len);
+        str->len += len;
+        str->buf[str->len++] = '\0';
+
+        *node_child = (struct strbuf_node) {
+                .value_off = off,
+                .value_len = len,
+        };
+
+        str->nodes_count++;
+
+        /* add new entry, sorted for bisection; ownership moves to the trie */
+        bubbleinsert(node, c, TAKE_PTR(node_child));
+
+        return off;
+}