5 files changed, 666 insertions, 0 deletions
diff --git a/src/libnetdata/gorilla/README.md b/src/libnetdata/gorilla/README.md
new file mode 100644
index 000000000..dc3718d13
--- /dev/null
+++ b/src/libnetdata/gorilla/README.md
@@ -0,0 +1,39 @@
+# Gorilla compression and decompression
+
+This provides an alternative way of representing values stored in database
+pages. Instead of allocating and using a page of fixed size, ie. 4096 bytes,
+the Gorilla implementation adds support for dynamically sized pages that
+contain a variable number of Gorilla buffers.
+
+Each buffer takes 512 bytes and compresses incoming data using the Gorilla
+compression:
+
+- The very first value is stored as it is.
+- For each new value, Gorilla compression doesn't store the value itself. Instead,
+it computes the difference (XOR) between the new value and the previous value.
+- If the XOR result is zero (meaning the new value is identical to the previous
+value), we store just a single bit set to `1`.
+- If the XOR result is not zero (meaning the new value differs from the previous):
+  - We store a `0` bit to indicate the change.
+  - We compute the leading-zero count (LZC) of the XOR result, and compare it
+    with the previous LZC. If the two LZCs are equal we store a `1` bit.
+  - If the LZCs are different we use 5 bits to store the new LZC, and we store
+    the rest of the value (ie. without its LZC) in the buffer.
+
+A Gorilla page can have multiple Gorilla buffers. If the values of a metric
+are highly compressible, just one Gorilla buffer is able to store all the values
+that otherwise would require a regular 4096 byte page, ie. we can use just 512
+bytes instead. In the worst case scenario (for metrics whose values are not
+compressible at all), a Gorilla page might end up having `9` Gorilla buffers,
+consuming 4608 bytes. In practice, this is pretty rare and does not negate
+the effect of compression for the metrics.
+
+When a gorilla page is full, ie. it contains 1024 slots/values, we serialize
+the linked-list of gorilla buffers directly to disk. During deserialization,
+eg. when performing a DBEngine query, the Gorilla page is loaded from the disk and
+its linked-list entries are patched to point to the new memory allocated for
+serving the query results.
+
+Overall, on a real-agent the Gorilla compression scheme reduces memory
+consumption approximately by ~30%, which can be several GiB of RAM for parents
+having hundreds, or even thousands of children streaming to them.
diff --git a/src/libnetdata/gorilla/benchmark.sh b/src/libnetdata/gorilla/benchmark.sh
new file mode 100755
index 000000000..a5d111435
--- /dev/null
+++ b/src/libnetdata/gorilla/benchmark.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+set -exu -o pipefail
+
+clang++ \
+    -std=c++11 -Wall -Wextra \
+    -DENABLE_BENCHMARK -O2 -g \
+    -lbenchmark -lbenchmark_main \
+    -o gorilla_benchmark gorilla.cc
+
+./gorilla_benchmark
diff --git a/src/libnetdata/gorilla/fuzzer.sh b/src/libnetdata/gorilla/fuzzer.sh
new file mode 100755
index 000000000..19098a615
--- /dev/null
+++ b/src/libnetdata/gorilla/fuzzer.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+set -exu -o pipefail
+
+clang++ \
+    -std=c++11 -Wall -Wextra \
+    -DENABLE_FUZZER -O2 -g \
+    -fsanitize=fuzzer \
+    -o gorilla_fuzzer gorilla.cc
+
+./gorilla_fuzzer -workers=12 -jobs=16
diff --git a/src/libnetdata/gorilla/gorilla.cc b/src/libnetdata/gorilla/gorilla.cc
new file mode 100644
index 000000000..c76018365
--- /dev/null
+++ b/src/libnetdata/gorilla/gorilla.cc
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "gorilla.h"
+
+#include <cassert>
+#include <climits>
+#include <cstdio>
+#include <cstring>
+
+using std::size_t;
+
+template <typename T>
+static constexpr size_t bit_size() noexcept
+{
+    static_assert((sizeof(T) * CHAR_BIT) == 32 || (sizeof(T) * CHAR_BIT) == 64,
+                  "Word size should be 32 or 64 bits.");
+    return (sizeof(T) * CHAR_BIT);
+}
+
+static void bit_buffer_write(uint32_t *buf, size_t pos, uint32_t v, size_t nbits)
+{
+    assert(nbits > 0 && nbits <= bit_size<uint32_t>());
+
+    const size_t index = pos / bit_size<uint32_t>();
+    const size_t offset = pos % bit_size<uint32_t>();
+
+    pos += nbits;
+
+    if (offset == 0) {
+        buf[index] = v;
+    } else {
+        const size_t remaining_bits = bit_size<uint32_t>() - offset;
+
+        // write the lower part of the value
+        const uint32_t low_bits_mask = ((uint32_t) 1 << remaining_bits) - 1;
+        const uint32_t lowest_bits_in_value = v & low_bits_mask;
+        buf[index] |= (lowest_bits_in_value << offset);
+
+        if (nbits > remaining_bits) {
+            // write the upper part of the value
+            const uint32_t high_bits_mask = ~low_bits_mask;
+            const uint32_t highest_bits_in_value = (v & high_bits_mask) >> (remaining_bits);
+            buf[index + 1] = highest_bits_in_value;
+        }
+    }
+}
+
+static void bit_buffer_read(const uint32_t *buf, size_t pos, uint32_t *v, size_t nbits)
+{
+    assert(nbits > 0 && nbits <= bit_size<uint32_t>());
+
+    const size_t index = pos / bit_size<uint32_t>();
+    const size_t offset = pos % bit_size<uint32_t>();
+
+    pos += nbits;
+
+    if (offset == 0) {
+        *v = (nbits == bit_size<uint32_t>()) ?
+                    buf[index] :
+                    buf[index] & (((uint32_t) 1 << nbits) - 1);
+    } else {
+        const size_t remaining_bits = bit_size<uint32_t>() - offset;
+
+        // extract the lower part of the value
+        if (nbits < remaining_bits) {
+            *v = (buf[index] >> offset) & (((uint32_t) 1 << nbits) - 1);
+        } else {
+            *v = (buf[index] >> offset) & (((uint32_t) 1 << remaining_bits) - 1);
+            nbits -= remaining_bits;
+            *v |= (buf[index + 1] & (((uint32_t) 1 << nbits) - 1)) << remaining_bits;
+        }
+    }
+}
+
+gorilla_writer_t gorilla_writer_init(gorilla_buffer_t *gbuf, size_t n)
+{
+    gorilla_writer_t gw = gorilla_writer_t {
+        .head_buffer = gbuf,
+        .last_buffer = NULL,
+        .prev_number = 0,
+        .prev_xor_lzc = 0,
+        .capacity = 0
+    };
+
+    gorilla_writer_add_buffer(&gw, gbuf, n);
+    return gw;
+}
+
+void gorilla_writer_add_buffer(gorilla_writer_t *gw, gorilla_buffer_t *gbuf, size_t n)
+{
+    gbuf->header.next = NULL;
+    gbuf->header.entries = 0;
+    gbuf->header.nbits = 0;
+
+    uint32_t capacity = (n * bit_size<uint32_t>()) - (sizeof(gorilla_header_t) * CHAR_BIT);
+
+    gw->prev_number = 0;
+    gw->prev_xor_lzc = 0;
+    gw->capacity = capacity;
+
+    if (gw->last_buffer)
+        gw->last_buffer->header.next = gbuf;
+
+    __atomic_store_n(&gw->last_buffer, gbuf, __ATOMIC_RELAXED);
+}
+
+uint32_t gorilla_writer_entries(const gorilla_writer_t *gw) {
+    uint32_t entries = 0;
+
+    const gorilla_buffer_t *curr_gbuf = __atomic_load_n(&gw->head_buffer, __ATOMIC_SEQ_CST);
+    do {
+        const gorilla_buffer_t *next_gbuf = __atomic_load_n(&curr_gbuf->header.next, __ATOMIC_SEQ_CST);
+
+        entries += __atomic_load_n(&curr_gbuf->header.entries, __ATOMIC_SEQ_CST);
+
+        curr_gbuf = next_gbuf;
+    } while (curr_gbuf);
+
+    return entries;
+}
+
+bool gorilla_writer_write(gorilla_writer_t *gw, uint32_t number)
+{
+    gorilla_header_t *hdr = &gw->last_buffer->header;
+    uint32_t *data = gw->last_buffer->data;
+
+    // this is the first number we are writing
+    if (hdr->entries == 0) {
+        if (hdr->nbits + bit_size<uint32_t>() >= gw->capacity)
+            return false;
+        bit_buffer_write(data, hdr->nbits, number, bit_size<uint32_t>());
+
+        __atomic_fetch_add(&hdr->nbits, bit_size<uint32_t>(), __ATOMIC_RELAXED);
+        __atomic_fetch_add(&hdr->entries, 1, __ATOMIC_RELAXED);
+        gw->prev_number = number;
+        return true;
+    }
+
+    // write true/false based on whether we got the same number or not.
+    if (number == gw->prev_number) {
+        if (hdr->nbits + 1 >= gw->capacity)
+            return false;
+
+        bit_buffer_write(data, hdr->nbits, static_cast<uint32_t>(1), 1);
+        __atomic_fetch_add(&hdr->nbits, 1, __ATOMIC_RELAXED);
+        __atomic_fetch_add(&hdr->entries, 1, __ATOMIC_RELAXED);
+        return true;
+    }
+
+    if (hdr->nbits + 1 >= gw->capacity)
+        return false;
+    bit_buffer_write(data, hdr->nbits, static_cast<uint32_t>(0), 1);
+    __atomic_fetch_add(&hdr->nbits, 1, __ATOMIC_RELAXED);
+
+    uint32_t xor_value = gw->prev_number ^ number;
+    uint32_t xor_lzc = (bit_size<uint32_t>() == 32) ? __builtin_clz(xor_value) : __builtin_clzll(xor_value);
+    uint32_t is_xor_lzc_same = (xor_lzc == gw->prev_xor_lzc) ? 1 : 0;
+
+    if (hdr->nbits + 1 >= gw->capacity)
+        return false;
+    bit_buffer_write(data, hdr->nbits, is_xor_lzc_same, 1);
+    __atomic_fetch_add(&hdr->nbits, 1, __ATOMIC_RELAXED);
+    
+    if (!is_xor_lzc_same) {
+        if (hdr->nbits + 1 >= gw->capacity)
+            return false;
+        bit_buffer_write(data, hdr->nbits, xor_lzc, (bit_size<uint32_t>() == 32) ? 5 : 6);
+        __atomic_fetch_add(&hdr->nbits, (bit_size<uint32_t>() == 32) ? 5 : 6, __ATOMIC_RELAXED);
+    }
+
+    // write the bits of the XOR'd value without the LZC prefix
+    if (hdr->nbits + (bit_size<uint32_t>() - xor_lzc) >= gw->capacity)
+        return false;
+    bit_buffer_write(data, hdr->nbits, xor_value, bit_size<uint32_t>() - xor_lzc);
+    __atomic_fetch_add(&hdr->nbits, bit_size<uint32_t>() - xor_lzc, __ATOMIC_RELAXED);
+    __atomic_fetch_add(&hdr->entries, 1, __ATOMIC_RELAXED);
+
+    gw->prev_number = number;
+    gw->prev_xor_lzc = xor_lzc;
+    return true;
+}
+
+gorilla_buffer_t *gorilla_writer_drop_head_buffer(gorilla_writer_t *gw) {
+    if (!gw->head_buffer)
+        return NULL;
+
+    gorilla_buffer_t *curr_head = gw->head_buffer;
+    gorilla_buffer_t *next_head = gw->head_buffer->header.next;
+    __atomic_store_n(&gw->head_buffer, next_head, __ATOMIC_RELAXED);
+    return curr_head;
+}
+
+uint32_t gorilla_writer_nbytes(const gorilla_writer_t *gw)
+{
+    uint32_t nbits = 0;
+
+    const gorilla_buffer_t *curr_gbuf = __atomic_load_n(&gw->head_buffer, __ATOMIC_SEQ_CST);
+    do {
+        const gorilla_buffer_t *next_gbuf = __atomic_load_n(&curr_gbuf->header.next, __ATOMIC_SEQ_CST);
+
+        nbits += __atomic_load_n(&curr_gbuf->header.nbits, __ATOMIC_SEQ_CST);
+
+        curr_gbuf = next_gbuf;
+    } while (curr_gbuf);
+
+    return (nbits + (CHAR_BIT - 1)) / CHAR_BIT;
+}
+
+bool gorilla_writer_serialize(const gorilla_writer_t *gw, uint8_t *dst, uint32_t dst_size) {
+    const gorilla_buffer_t *curr_gbuf = gw->head_buffer;
+
+    do {
+        const gorilla_buffer_t *next_gbuf = curr_gbuf->header.next;
+
+        size_t bytes = RRDENG_GORILLA_32BIT_BUFFER_SIZE;
+        if (bytes > dst_size)
+            return false;   
+
+        memcpy(dst, curr_gbuf, bytes);
+        dst += bytes;
+        dst_size -= bytes;
+
+        curr_gbuf = next_gbuf;
+    } while (curr_gbuf);
+
+    return true;
+}
+
+uint32_t gorilla_buffer_patch(gorilla_buffer_t *gbuf) {
+    gorilla_buffer_t *curr_gbuf = gbuf;
+    uint32_t n = curr_gbuf->header.entries;
+
+    while (curr_gbuf->header.next) {
+        uint32_t *buf = reinterpret_cast<uint32_t *>(gbuf);
+        gbuf = reinterpret_cast<gorilla_buffer_t *>(&buf[RRDENG_GORILLA_32BIT_BUFFER_SLOTS]);
+
+        assert(((uintptr_t) (gbuf) % sizeof(uintptr_t)) == 0 &&
+               "Gorilla buffer not aligned to uintptr_t");
+
+        curr_gbuf->header.next = gbuf;
+        curr_gbuf = curr_gbuf->header.next;
+
+        n += curr_gbuf->header.entries;
+    }
+
+    return n;
+}
+
+gorilla_reader_t gorilla_writer_get_reader(const gorilla_writer_t *gw)
+{
+    const gorilla_buffer_t *buffer = __atomic_load_n(&gw->head_buffer, __ATOMIC_SEQ_CST);
+
+    uint32_t entries = __atomic_load_n(&buffer->header.entries, __ATOMIC_SEQ_CST);
+    uint32_t capacity = __atomic_load_n(&buffer->header.nbits, __ATOMIC_SEQ_CST);
+
+    return gorilla_reader_t {
+        .buffer = buffer,
+        .entries = entries,
+        .index = 0,
+        .capacity = capacity,
+        .position = 0,
+        .prev_number = 0,
+        .prev_xor_lzc = 0,
+        .prev_xor = 0,
+    };
+}
+
+gorilla_reader_t gorilla_reader_init(gorilla_buffer_t *gbuf)
+{
+    uint32_t entries = __atomic_load_n(&gbuf->header.entries, __ATOMIC_SEQ_CST);
+    uint32_t capacity = __atomic_load_n(&gbuf->header.nbits, __ATOMIC_SEQ_CST);
+
+    return gorilla_reader_t {
+        .buffer = gbuf,
+        .entries = entries,
+        .index = 0,
+        .capacity = capacity,
+        .position = 0,
+        .prev_number = 0,
+        .prev_xor_lzc = 0,
+        .prev_xor = 0,
+    };
+}
+
+bool gorilla_reader_read(gorilla_reader_t *gr, uint32_t *number)
+{
+    const uint32_t *data = gr->buffer->data;
+
+    if (gr->index + 1 > gr->entries) {
+        // We don't have any more entries to return. However, the writer
+        // might have updated the buffer's entries. We need to check once
+        // more in case more elements were added.
+        gr->entries = __atomic_load_n(&gr->buffer->header.entries, __ATOMIC_SEQ_CST);
+        gr->capacity = __atomic_load_n(&gr->buffer->header.nbits, __ATOMIC_SEQ_CST);
+
+        // if the reader's current buffer has not been updated, we need to
+        // check if it has a pointer to a next buffer.
+        if (gr->index + 1 > gr->entries) {
+            gorilla_buffer_t *next_buffer = __atomic_load_n(&gr->buffer->header.next, __ATOMIC_SEQ_CST);
+
+            if (!next_buffer) {
+                // fprintf(stderr, "Consumed reader with %zu entries from buffer %p\n (No more buffers to read from)", gr->length, gr->buffer);
+                return false;
+            }
+
+            // fprintf(stderr, "Consumed reader with %zu entries from buffer %p\n", gr->length, gr->buffer);
+            *gr = gorilla_reader_init(next_buffer);
+            return gorilla_reader_read(gr, number);
+        }
+    }
+
+    // read the first number
+    if (gr->index == 0) {
+        bit_buffer_read(data, gr->position, number, bit_size<uint32_t>());
+
+        gr->index++;
+        gr->position += bit_size<uint32_t>();
+        gr->prev_number = *number;
+        return true;
+    }
+
+    // process same-number bit
+    uint32_t is_same_number;
+    bit_buffer_read(data, gr->position, &is_same_number, 1);
+    gr->position++;
+
+    if (is_same_number) {
+        *number = gr->prev_number;
+        gr->index++;
+        return true;
+    }
+
+    // proceess same-xor-lzc bit
+    uint32_t xor_lzc = gr->prev_xor_lzc;
+
+    uint32_t same_xor_lzc;
+    bit_buffer_read(data, gr->position, &same_xor_lzc, 1);
+    gr->position++;
+
+    if (!same_xor_lzc) {
+        bit_buffer_read(data, gr->position, &xor_lzc, (bit_size<uint32_t>() == 32) ? 5 : 6);
+        gr->position += (bit_size<uint32_t>() == 32) ? 5 : 6;
+    }
+
+    // process the non-lzc suffix
+    uint32_t xor_value = 0;
+    bit_buffer_read(data, gr->position, &xor_value, bit_size<uint32_t>() - xor_lzc);
+    gr->position += bit_size<uint32_t>() - xor_lzc;
+
+    *number = (gr->prev_number ^ xor_value);
+
+    gr->index++;
+    gr->prev_number = *number;
+    gr->prev_xor_lzc = xor_lzc;
+    gr->prev_xor = xor_value;
+
+    return true;
+}
+
+/*
+ * Internal code used for fuzzing the library
+*/
+
+#ifdef ENABLE_FUZZER
+
+#include <vector>
+
+template<typename Word>
+static std::vector<Word> random_vector(const uint8_t *data, size_t size) {
+    std::vector<Word> V;
+
+    V.reserve(1024);
+
+    while (size >= sizeof(Word)) {
+        size -= sizeof(Word);
+
+        Word w;
+        memcpy(&w, &data[size], sizeof(Word));
+        V.push_back(w);
+    }
+
+    return V;
+}
+
+class Storage {
+public:
+    gorilla_buffer_t *alloc_buffer(size_t words) {
+        uint32_t *new_buffer = new uint32_t[words]();
+        assert(((((uintptr_t) new_buffer) % 8u) == 0) && "Unaligned buffer...");
+        Buffers.push_back(new_buffer);
+        return reinterpret_cast<gorilla_buffer_t *>(new_buffer);
+    }
+
+    void free_buffers() {
+        for (uint32_t *buffer : Buffers) {
+            delete[] buffer;
+        }
+    }
+    
+private:
+    std::vector<uint32_t *> Buffers;
+};
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+    if (Size < 4)
+        return 0;
+
+    std::vector<uint32_t> RandomData = random_vector<uint32_t>(Data, Size);
+
+    Storage S;
+    size_t words_per_buffer = 8;
+
+    /*
+     * write data
+    */
+    gorilla_buffer_t *first_buffer = S.alloc_buffer(words_per_buffer);
+    gorilla_writer_t gw = gorilla_writer_init(first_buffer, words_per_buffer);
+
+    for (size_t i = 0; i != RandomData.size(); i++) {
+        bool ok = gorilla_writer_write(&gw, RandomData[i]);
+        if (ok)
+            continue;
+
+        // add new buffer
+        gorilla_buffer_t *buffer = S.alloc_buffer(words_per_buffer);
+        gorilla_writer_add_buffer(&gw, buffer, words_per_buffer);
+
+        ok = gorilla_writer_write(&gw, RandomData[i]);
+        assert(ok && "Could not write data to new buffer!!!");
+    }
+
+
+    /*
+     * read data
+    */
+    gorilla_reader_t gr = gorilla_writer_get_reader(&gw);
+
+    for (size_t i = 0; i != RandomData.size(); i++) {
+        uint32_t number = 0;
+        bool ok = gorilla_reader_read(&gr, &number);
+        assert(ok && "Failed to read number from gorilla buffer");
+
+        assert((number == RandomData[i])
+                && "Read wrong number from gorilla buffer");
+    }
+
+    S.free_buffers();
+    return 0;
+}
+
+#endif /* ENABLE_FUZZER */
+
+#ifdef ENABLE_BENCHMARK
+
+#include <benchmark/benchmark.h>
+#include <random>
+
+static size_t NumItems = 1024;
+
+static void BM_EncodeU32Numbers(benchmark::State& state) {
+    std::random_device rd;
+    std::mt19937 mt(rd());
+    std::uniform_int_distribution<uint32_t> dist(0x0, 0x0000FFFF);
+
+    std::vector<uint32_t> RandomData;
+    for (size_t idx = 0; idx != NumItems; idx++) {
+        RandomData.push_back(dist(mt));
+    }
+    std::vector<uint32_t> EncodedData(10 * RandomData.capacity(), 0);
+
+    for (auto _ : state) {
+        gorilla_writer_t gw = gorilla_writer_init(
+            reinterpret_cast<gorilla_buffer_t *>(EncodedData.data()),
+            EncodedData.size());
+
+        for (size_t i = 0; i != RandomData.size(); i++)
+            benchmark::DoNotOptimize(gorilla_writer_write(&gw, RandomData[i]));
+
+        benchmark::ClobberMemory();
+    }
+
+    state.SetItemsProcessed(NumItems * state.iterations());
+    state.SetBytesProcessed(NumItems * state.iterations() * sizeof(uint32_t));
+}
+BENCHMARK(BM_EncodeU32Numbers)->ThreadRange(1, 16)->UseRealTime();
+
+static void BM_DecodeU32Numbers(benchmark::State& state) {
+    std::random_device rd;
+    std::mt19937 mt(rd());
+    std::uniform_int_distribution<uint32_t> dist(0x0, 0xFFFFFFFF);
+
+    std::vector<uint32_t> RandomData;
+    for (size_t idx = 0; idx != NumItems; idx++) {
+        RandomData.push_back(dist(mt));
+    }
+    std::vector<uint32_t> EncodedData(10 * RandomData.capacity(), 0);
+    std::vector<uint32_t> DecodedData(10 * RandomData.capacity(), 0);
+
+    gorilla_writer_t gw = gorilla_writer_init(
+        reinterpret_cast<gorilla_buffer_t *>(EncodedData.data()),
+        EncodedData.size());
+
+    for (size_t i = 0; i != RandomData.size(); i++)
+        gorilla_writer_write(&gw, RandomData[i]);
+
+    for (auto _ : state) {
+        gorilla_reader_t gr = gorilla_reader_init(reinterpret_cast<gorilla_buffer_t *>(EncodedData.data()));
+
+        for (size_t i = 0; i != RandomData.size(); i++) {
+            uint32_t number = 0;
+            benchmark::DoNotOptimize(gorilla_reader_read(&gr, &number));
+        }
+
+        benchmark::ClobberMemory();
+    }
+
+    state.SetItemsProcessed(NumItems * state.iterations());
+    state.SetBytesProcessed(NumItems * state.iterations() * sizeof(uint32_t));
+}
+BENCHMARK(BM_DecodeU32Numbers)->ThreadRange(1, 16)->UseRealTime();
+
+#endif /* ENABLE_BENCHMARK */
diff --git a/src/libnetdata/gorilla/gorilla.h b/src/libnetdata/gorilla/gorilla.h
new file mode 100644
index 000000000..7975d85ee
--- /dev/null
+++ b/src/libnetdata/gorilla/gorilla.h
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef GORILLA_H
+#define GORILLA_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gorilla_buffer;
+
+typedef struct {
+    struct gorilla_buffer *next;
+    uint32_t entries;
+    uint32_t nbits;
+} gorilla_header_t;
+
+typedef struct gorilla_buffer {
+    gorilla_header_t header;
+    uint32_t data[];
+} gorilla_buffer_t;
+
+typedef struct {
+    gorilla_buffer_t *head_buffer;
+    gorilla_buffer_t *last_buffer;
+
+    uint32_t prev_number;
+    uint32_t prev_xor_lzc;
+
+    // in bits
+    uint32_t capacity;
+} gorilla_writer_t;
+
+typedef struct {
+    const gorilla_buffer_t *buffer;
+
+    // number of values
+    size_t entries;
+    size_t index;
+
+    // in bits
+    size_t capacity; // FIXME: this not needed on the reader's side
+    size_t position;
+
+    uint32_t prev_number;
+    uint32_t prev_xor_lzc;
+    uint32_t prev_xor;
+} gorilla_reader_t;
+
+gorilla_writer_t gorilla_writer_init(gorilla_buffer_t *gbuf, size_t n);
+void gorilla_writer_add_buffer(gorilla_writer_t *gw, gorilla_buffer_t *gbuf, size_t n);
+bool gorilla_writer_write(gorilla_writer_t *gw, uint32_t number);
+uint32_t gorilla_writer_entries(const gorilla_writer_t *gw);
+
+gorilla_reader_t gorilla_writer_get_reader(const gorilla_writer_t *gw);
+
+gorilla_buffer_t *gorilla_writer_drop_head_buffer(gorilla_writer_t *gw);
+
+uint32_t gorilla_writer_nbytes(const gorilla_writer_t *gw);
+bool gorilla_writer_serialize(const gorilla_writer_t *gw, uint8_t *dst, uint32_t dst_size);
+
+uint32_t gorilla_buffer_patch(gorilla_buffer_t *buf);
+gorilla_reader_t gorilla_reader_init(gorilla_buffer_t *buf);
+bool gorilla_reader_read(gorilla_reader_t *gr, uint32_t *number);
+
+#define RRDENG_GORILLA_32BIT_BUFFER_SLOTS 128
+#define RRDENG_GORILLA_32BIT_BUFFER_SIZE (RRDENG_GORILLA_32BIT_BUFFER_SLOTS * sizeof(uint32_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GORILLA_H */