| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-05 12:08:03 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-05 12:08:18 +0000 |
| commit | 5da14042f70711ea5cf66e034699730335462f66 (patch) | |
| tree | 0f6354ccac934ed87a2d555f45be4c831cf92f4a | /src/database/engine/rrdengineapi.c |
| parent | Releasing debian version 1.44.3-2. (diff) | |
| download | netdata-5da14042f70711ea5cf66e034699730335462f66.tar.xz, netdata-5da14042f70711ea5cf66e034699730335462f66.zip | |
Merging upstream version 1.45.3+dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/database/engine/rrdengineapi.c')
| -rwxr-xr-x | src/database/engine/rrdengineapi.c | 1392 |
1 file changed, 1392 insertions, 0 deletions
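Before the diff itself, one mechanism it adds is worth a quick orientation: for legacy (single-host) databases the engine derives a deterministic per-dimension UUID by SHA-256-hashing the dimension id and chart id and keeping the first 16 bytes (see `rrdeng_generate_legacy_uuid()` in the diff below). The sketch here is not netdata's code; it assumes OpenSSL's EVP API and libuuid (link with `-lcrypto -luuid`), and the helper name `legacy_uuid_sketch` is invented for illustration.

```c
// Minimal sketch of the legacy UUID derivation: SHA-256(dim_id || chart_id),
// truncated to the 16 bytes of a uuid_t. Assumes OpenSSL >= 1.1 and libuuid.
#include <openssl/evp.h>
#include <stdio.h>
#include <string.h>
#include <uuid/uuid.h>

static void legacy_uuid_sketch(const char *dim_id, const char *chart_id, uuid_t out) {
    unsigned char hash[EVP_MAX_MD_SIZE];
    unsigned int hash_len = 0;

    EVP_MD_CTX *mdctx = EVP_MD_CTX_new();            // the diff uses the older EVP_MD_CTX_create()
    EVP_DigestInit_ex(mdctx, EVP_sha256(), NULL);
    EVP_DigestUpdate(mdctx, dim_id, strlen(dim_id));
    EVP_DigestUpdate(mdctx, chart_id, strlen(chart_id));
    EVP_DigestFinal_ex(mdctx, hash, &hash_len);
    EVP_MD_CTX_free(mdctx);

    memcpy(out, hash, sizeof(uuid_t));               // SHA-256 yields 32 bytes; a uuid_t is 16
}

int main(void) {
    uuid_t id;
    char str[37];
    legacy_uuid_sketch("user", "system.cpu", id);    // the same dim/chart pair always maps to the same UUID
    uuid_unparse_lower(id, str);
    printf("%s\n", str);
    return 0;
}
```

As the file's own comment notes, this UUID is deterministic per chart/dimension pair but not unique across hosts.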
diff --git a/src/database/engine/rrdengineapi.c b/src/database/engine/rrdengineapi.c new file mode 100755 index 000000000..43fed492b --- /dev/null +++ b/src/database/engine/rrdengineapi.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "database/engine/rrddiskprotocol.h" +#include "rrdengine.h" +#include "dbengine-compression.h" + +/* Default global database instance */ +struct rrdengine_instance multidb_ctx_storage_tier0; +struct rrdengine_instance multidb_ctx_storage_tier1; +struct rrdengine_instance multidb_ctx_storage_tier2; +struct rrdengine_instance multidb_ctx_storage_tier3; +struct rrdengine_instance multidb_ctx_storage_tier4; + +#define mrg_metric_ctx(metric) (struct rrdengine_instance *)mrg_metric_section(main_mrg, metric) + +#if RRD_STORAGE_TIERS != 5 +#error RRD_STORAGE_TIERS is not 5 - you need to add allocations here +#endif +struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS]; +uint8_t tier_page_type[RRD_STORAGE_TIERS] = { + RRDENG_PAGE_TYPE_GORILLA_32BIT, + RRDENG_PAGE_TYPE_ARRAY_TIER1, + RRDENG_PAGE_TYPE_ARRAY_TIER1, + RRDENG_PAGE_TYPE_ARRAY_TIER1, + RRDENG_PAGE_TYPE_ARRAY_TIER1}; + +#if defined(ENV32BIT) +size_t tier_page_size[RRD_STORAGE_TIERS] = {2048, 1024, 192, 192, 192}; +#else +size_t tier_page_size[RRD_STORAGE_TIERS] = {4096, 2048, 384, 384, 384}; +#endif + +#if RRDENG_PAGE_TYPE_MAX != 2 +#error PAGE_TYPE_MAX is not 2 - you need to add allocations here +#endif + +size_t page_type_size[256] = { + [RRDENG_PAGE_TYPE_ARRAY_32BIT] = sizeof(storage_number), + [RRDENG_PAGE_TYPE_ARRAY_TIER1] = sizeof(storage_number_tier1_t), + [RRDENG_PAGE_TYPE_GORILLA_32BIT] = sizeof(storage_number) +}; + +__attribute__((constructor)) void initialize_multidb_ctx(void) { + multidb_ctx[0] = &multidb_ctx_storage_tier0; + multidb_ctx[1] = &multidb_ctx_storage_tier1; + multidb_ctx[2] = &multidb_ctx_storage_tier2; + multidb_ctx[3] = &multidb_ctx_storage_tier3; + multidb_ctx[4] = &multidb_ctx_storage_tier4; +} + +int db_engine_journal_check = 0; +int default_rrdeng_disk_quota_mb = 256; +int default_multidb_disk_quota_mb = 256; + +#if defined(ENV32BIT) +int default_rrdeng_page_cache_mb = 16; +int default_rrdeng_extent_cache_mb = 0; +#else +int default_rrdeng_page_cache_mb = 32; +int default_rrdeng_extent_cache_mb = 0; +#endif + +// ---------------------------------------------------------------------------- +// metrics groups + +static inline void rrdeng_page_alignment_acquire(struct pg_alignment *pa) { + if(unlikely(!pa)) return; + __atomic_add_fetch(&pa->refcount, 1, __ATOMIC_SEQ_CST); +} + +static inline bool rrdeng_page_alignment_release(struct pg_alignment *pa) { + if(unlikely(!pa)) return true; + + if(__atomic_sub_fetch(&pa->refcount, 1, __ATOMIC_SEQ_CST) == 0) { + freez(pa); + return true; + } + + return false; +} + +// charts call this +STORAGE_METRICS_GROUP *rrdeng_metrics_group_get(STORAGE_INSTANCE *si __maybe_unused, uuid_t *uuid __maybe_unused) { + struct pg_alignment *pa = callocz(1, sizeof(struct pg_alignment)); + rrdeng_page_alignment_acquire(pa); + return (STORAGE_METRICS_GROUP *)pa; +} + +// charts call this +void rrdeng_metrics_group_release(STORAGE_INSTANCE *si __maybe_unused, STORAGE_METRICS_GROUP *smg) { + if(unlikely(!smg)) return; + + struct pg_alignment *pa = (struct pg_alignment *)smg; + rrdeng_page_alignment_release(pa); +} + +// ---------------------------------------------------------------------------- +// metric handle for legacy dbs + +/* This UUID is not unique across hosts */ +void rrdeng_generate_legacy_uuid(const char *dim_id, 
const char *chart_id, uuid_t *ret_uuid) +{ + EVP_MD_CTX *evpctx; + unsigned char hash_value[EVP_MAX_MD_SIZE]; + unsigned int hash_len; + + evpctx = EVP_MD_CTX_create(); + EVP_DigestInit_ex(evpctx, EVP_sha256(), NULL); + EVP_DigestUpdate(evpctx, dim_id, strlen(dim_id)); + EVP_DigestUpdate(evpctx, chart_id, strlen(chart_id)); + EVP_DigestFinal_ex(evpctx, hash_value, &hash_len); + EVP_MD_CTX_destroy(evpctx); + fatal_assert(hash_len > sizeof(uuid_t)); + memcpy(ret_uuid, hash_value, sizeof(uuid_t)); +} + +static METRIC *rrdeng_metric_get_legacy(STORAGE_INSTANCE *si, const char *rd_id, const char *st_id) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + uuid_t legacy_uuid; + rrdeng_generate_legacy_uuid(rd_id, st_id, &legacy_uuid); + return mrg_metric_get_and_acquire(main_mrg, &legacy_uuid, (Word_t) ctx); +} + +// ---------------------------------------------------------------------------- +// metric handle + +void rrdeng_metric_release(STORAGE_METRIC_HANDLE *smh) { + METRIC *metric = (METRIC *)smh; + mrg_metric_release(main_mrg, metric); +} + +STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *smh) { + METRIC *metric = (METRIC *)smh; + return (STORAGE_METRIC_HANDLE *) mrg_metric_dup(main_mrg, metric); +} + +STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *si, uuid_t *uuid) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return (STORAGE_METRIC_HANDLE *) mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx); +} + +static METRIC *rrdeng_metric_create(STORAGE_INSTANCE *si, uuid_t *uuid) { + internal_fatal(!si, "DBENGINE: STORAGE_INSTANCE is NULL"); + + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + MRG_ENTRY entry = { + .uuid = uuid, + .section = (Word_t)ctx, + .first_time_s = 0, + .last_time_s = 0, + .latest_update_every_s = 0, + }; + + bool added; + METRIC *metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); + if (added) + __atomic_add_fetch(&ctx->atomic.metrics, 1, __ATOMIC_RELAXED); + return metric; +} + +STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + METRIC *metric; + + metric = mrg_metric_get_and_acquire(main_mrg, &rd->metric_uuid, (Word_t) ctx); + + if(unlikely(!metric)) { + if(unlikely(ctx->config.legacy)) { + // this is a single host database + // generate uuid from the chart and dimensions ids + // and overwrite the one supplied by rrddim + metric = rrdeng_metric_get_legacy(si, rrddim_id(rd), rrdset_id(rd->rrdset)); + if (metric) + uuid_copy(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric)); + } + + if(likely(!metric)) + metric = rrdeng_metric_create(si, &rd->metric_uuid); + } + +#ifdef NETDATA_INTERNAL_CHECKS + if(uuid_memcmp(&rd->metric_uuid, mrg_metric_uuid(main_mrg, metric)) != 0) { + char uuid1[UUID_STR_LEN + 1]; + char uuid2[UUID_STR_LEN + 1]; + + uuid_unparse(rd->metric_uuid, uuid1); + uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid2); + fatal("DBENGINE: uuids do not match, asked for metric '%s', but got metric '%s'", uuid1, uuid2); + } + + if(mrg_metric_ctx(metric) != ctx) + fatal("DBENGINE: mixed up db instances, asked for metric from %p, got from %p", + ctx, mrg_metric_ctx(metric)); +#endif + + return (STORAGE_METRIC_HANDLE *)metric; +} + + +// ---------------------------------------------------------------------------- +// collect ops + +static inline void check_and_fix_mrg_update_every(struct rrdeng_collect_handle *handle) { + 
if(unlikely((uint32_t)(handle->update_every_ut / USEC_PER_SEC) != mrg_metric_get_update_every_s(main_mrg, handle->metric))) { + internal_error(true, "DBENGINE: collection handle has update every %u, but the metric registry has %u. Fixing it.", + (uint32_t)(handle->update_every_ut / USEC_PER_SEC), mrg_metric_get_update_every_s(main_mrg, handle->metric)); + + if(unlikely(!handle->update_every_ut)) + handle->update_every_ut = (usec_t)mrg_metric_get_update_every_s(main_mrg, handle->metric) * USEC_PER_SEC; + else + mrg_metric_set_update_every(main_mrg, handle->metric, (uint32_t)(handle->update_every_ut / USEC_PER_SEC)); + } +} + +static inline bool check_completed_page_consistency(struct rrdeng_collect_handle *handle __maybe_unused) { +#ifdef NETDATA_INTERNAL_CHECKS + if (unlikely(!handle->pgc_page || !handle->page_entries_max || !handle->page_position || !handle->page_end_time_ut)) + return false; + + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + uuid_t *uuid = mrg_metric_uuid(main_mrg, handle->metric); + time_t start_time_s = pgc_page_start_time_s(handle->pgc_page); + time_t end_time_s = pgc_page_end_time_s(handle->pgc_page); + uint32_t update_every_s = pgc_page_update_every_s(handle->pgc_page); + size_t page_length = handle->page_position * CTX_POINT_SIZE_BYTES(ctx); + size_t entries = handle->page_position; + time_t overwrite_zero_update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); + + if(end_time_s > max_acceptable_collected_time()) + handle->page_flags |= RRDENG_PAGE_COMPLETED_IN_FUTURE; + + VALIDATED_PAGE_DESCRIPTOR vd = validate_page( + uuid, + start_time_s, + end_time_s, + update_every_s, + page_length, + ctx->config.page_type, + entries, + 0, // do not check for future timestamps - we inherit the timestamps of the children + overwrite_zero_update_every_s, + false, + "collected", + handle->page_flags); + + return vd.is_valid; +#else + return true; +#endif +} + +/* + * Gets a handle for storing metrics to the database. + * The handle must be released with rrdeng_store_metric_final(). + */ +STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *smh, uint32_t update_every, STORAGE_METRICS_GROUP *smg) { + METRIC *metric = (METRIC *)smh; + struct rrdengine_instance *ctx = mrg_metric_ctx(metric); + + bool is_1st_metric_writer = true; + if(!mrg_metric_set_writer(main_mrg, metric)) { + is_1st_metric_writer = false; + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid); + netdata_log_error("DBENGINE: metric '%s' is already collected and should not be collected twice - expect gaps on the charts", uuid); + } + + metric = mrg_metric_dup(main_mrg, metric); + + struct rrdeng_collect_handle *handle; + + handle = callocz(1, sizeof(struct rrdeng_collect_handle)); + handle->common.seb = STORAGE_ENGINE_BACKEND_DBENGINE; + handle->metric = metric; + + handle->pgc_page = NULL; + handle->page_data = NULL; + handle->page_data_size = 0; + + handle->page_position = 0; + handle->page_entries_max = 0; + handle->update_every_ut = (usec_t)update_every * USEC_PER_SEC; + handle->options = is_1st_metric_writer ? RRDENG_1ST_METRIC_WRITER : 0; + + __atomic_add_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED); + if(!is_1st_metric_writer) + __atomic_add_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED); + + mrg_metric_set_update_every(main_mrg, metric, update_every); + + handle->alignment = (struct pg_alignment *)smg; + rrdeng_page_alignment_acquire(handle->alignment); + + // this is important! 
+ // if we don't set the page_end_time_ut during the first collection + // data collection may be able to go back in time and during the addition of new pages + // clean pages may be found matching ours! + + time_t db_first_time_s, db_last_time_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, NULL); + handle->page_end_time_ut = (usec_t)db_last_time_s * USEC_PER_SEC; + + return (STORAGE_COLLECT_HANDLE *)handle; +} + +void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *sch) { + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)sch; + + if (unlikely(!handle->pgc_page)) + return; + + if(pgd_is_empty(handle->page_data)) + pgc_page_to_clean_evict_or_release(main_cache, handle->pgc_page); + + else { + check_completed_page_consistency(handle); + mrg_metric_set_clean_latest_time_s(main_mrg, handle->metric, pgc_page_end_time_s(handle->pgc_page)); + + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + time_t start_time_s = pgc_page_start_time_s(handle->pgc_page); + time_t end_time_s = pgc_page_end_time_s(handle->pgc_page); + uint32_t update_every_s = mrg_metric_get_update_every_s(main_mrg, handle->metric); + if (end_time_s && start_time_s && end_time_s > start_time_s && update_every_s) { + uint64_t add_samples = (end_time_s - start_time_s) / update_every_s; + __atomic_add_fetch(&ctx->atomic.samples, add_samples, __ATOMIC_RELAXED); + } + + pgc_page_hot_to_dirty_and_release(main_cache, handle->pgc_page, false); + } + + mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, 0); + + handle->pgc_page = NULL; + handle->page_flags = 0; + handle->page_position = 0; + handle->page_entries_max = 0; + handle->page_data = NULL; + handle->page_data_size = 0; + + // important! + // we should never zero page end time ut, because this will allow + // collection to go back in time + // handle->page_end_time_ut = 0; + // handle->page_start_time_ut; + + check_and_fix_mrg_update_every(handle); + + timing_step(TIMING_STEP_DBENGINE_FLUSH_PAGE); +} + +static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *handle, + struct rrdengine_instance *ctx, + usec_t point_in_time_ut, + PGD *data, + size_t data_size) { + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + const uint32_t update_every_s = (uint32_t)(handle->update_every_ut / USEC_PER_SEC); + + PGC_ENTRY page_entry = { + .section = (Word_t) ctx, + .metric_id = mrg_metric_id(main_mrg, handle->metric), + .start_time_s = point_in_time_s, + .end_time_s = point_in_time_s, + .size = data_size, + .data = data, + .update_every_s = update_every_s, + .hot = true + }; + + size_t conflicts = 0; + bool added = true; + PGC_PAGE *pgc_page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + while (unlikely(!added)) { + conflicts++; + + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); + +#ifdef NETDATA_INTERNAL_CHECKS + internal_error(true, +#else + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, +#endif + "DBENGINE: metric '%s' new page from %ld to %ld, update every %u, has a conflict in main cache " + "with existing %s%s page from %ld to %ld, update every %u - " + "is it collected more than once?", + uuid, + page_entry.start_time_s, page_entry.end_time_s, page_entry.update_every_s, + pgc_is_page_hot(pgc_page) ? "hot" : "not-hot", + pgc_page_data(pgc_page) == PGD_EMPTY ? 
" gap" : "", + pgc_page_start_time_s(pgc_page), pgc_page_end_time_s(pgc_page), pgc_page_update_every_s(pgc_page) + ); + + pgc_page_release(main_cache, pgc_page); + + point_in_time_ut -= handle->update_every_ut; + point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + page_entry.start_time_s = point_in_time_s; + page_entry.end_time_s = point_in_time_s; + pgc_page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + } + + handle->page_entries_max = data_size / CTX_POINT_SIZE_BYTES(ctx); + handle->page_start_time_ut = point_in_time_ut; + handle->page_end_time_ut = point_in_time_ut; + handle->page_position = 1; // zero is already in our data + handle->pgc_page = pgc_page; + handle->page_flags = conflicts? RRDENG_PAGE_CONFLICT : 0; + + if(point_in_time_s > max_acceptable_collected_time()) + handle->page_flags |= RRDENG_PAGE_CREATED_IN_FUTURE; + + check_and_fix_mrg_update_every(handle); + + timing_step(TIMING_STEP_DBENGINE_CREATE_NEW_PAGE); +} + +static size_t aligned_allocation_entries(size_t max_slots, size_t target_slot, time_t now_s) { + size_t slots = target_slot; + size_t pos = (now_s % max_slots); + + if(pos > slots) + slots += max_slots - pos; + + else if(pos < slots) + slots -= pos; + + else + slots = max_slots; + + return slots; +} + +static PGD *rrdeng_alloc_new_page_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + PGD *d = NULL; + + size_t max_size = tier_page_size[ctx->config.tier]; + size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx); + + size_t slots = aligned_allocation_entries( + max_slots, + indexing_partition((Word_t) handle->alignment, max_slots), + (time_t) (point_in_time_ut / USEC_PER_SEC) + ); + + if(slots < max_slots / 3) + slots = max_slots / 3; + + if(slots < 3) + slots = 3; + + size_t size = slots * CTX_POINT_SIZE_BYTES(ctx); + + // internal_error(true, "PAGE ALLOC %zu bytes (%zu max)", size, max_size); + + internal_fatal(slots < 3 || slots > max_slots, "ooops! wrong distribution of metrics across time"); + internal_fatal(size > tier_page_size[ctx->config.tier] || size < CTX_POINT_SIZE_BYTES(ctx) * 2, "ooops! wrong page size"); + + *data_size = size; + + switch (ctx->config.page_type) { + case RRDENG_PAGE_TYPE_ARRAY_32BIT: + case RRDENG_PAGE_TYPE_ARRAY_TIER1: + d = pgd_create(ctx->config.page_type, slots); + break; + case RRDENG_PAGE_TYPE_GORILLA_32BIT: + // ignore slots, and use the fixed number of slots per gorilla buffer. + // gorilla will automatically add more buffers if needed. 
+ d = pgd_create(ctx->config.page_type, RRDENG_GORILLA_32BIT_BUFFER_SLOTS); + break; + default: + fatal("Unknown page type: %uc\n", ctx->config.page_type); + } + + timing_step(TIMING_STEP_DBENGINE_PAGE_ALLOC); + return d; +} + +static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *sch, + const usec_t point_in_time_ut, + const NETDATA_DOUBLE n, + const NETDATA_DOUBLE min_value, + const NETDATA_DOUBLE max_value, + const uint16_t count, + const uint16_t anomaly_count, + const SN_FLAGS flags) +{ + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)sch; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + if(unlikely(!handle->page_data)) + handle->page_data = rrdeng_alloc_new_page_data(handle, &handle->page_data_size, point_in_time_ut); + + timing_step(TIMING_STEP_DBENGINE_CHECK_DATA); + + pgd_append_point(handle->page_data, + point_in_time_ut, + n, min_value, max_value, count, anomaly_count, flags, + handle->page_position); + + timing_step(TIMING_STEP_DBENGINE_PACK); + + if(unlikely(!handle->pgc_page)) { + rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, handle->page_data, handle->page_data_size); + // handle->position is set to 1 already + } + else { + // update an existing page + pgc_page_hot_set_end_time_s(main_cache, handle->pgc_page, (time_t) (point_in_time_ut / USEC_PER_SEC)); + handle->page_end_time_ut = point_in_time_ut; + + if(unlikely(++handle->page_position >= handle->page_entries_max)) { + internal_fatal(handle->page_position > handle->page_entries_max, "DBENGINE: exceeded page max number of points"); + handle->page_flags |= RRDENG_PAGE_FULL; + rrdeng_store_metric_flush_current_page(sch); + } + } + + timing_step(TIMING_STEP_DBENGINE_PAGE_FIN); + + // update the metric information + mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, (time_t) (point_in_time_ut / USEC_PER_SEC)); + + timing_step(TIMING_STEP_DBENGINE_MRG_UPDATE); +} + +static void store_metric_next_error_log(struct rrdeng_collect_handle *handle __maybe_unused, usec_t point_in_time_ut __maybe_unused, const char *msg __maybe_unused) { +#ifdef NETDATA_INTERNAL_CHECKS + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); + + BUFFER *wb = NULL; + if(handle->pgc_page && handle->page_flags) { + wb = buffer_create(0, NULL); + collect_page_flags_to_buffer(wb, handle->page_flags); + } + + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE, + "DBENGINE: metric '%s' collected point at %ld, %s last collection at %ld, " + "update every %ld, %s page from %ld to %ld, position %u (of %u), flags: %s", + uuid, + point_in_time_s, + msg, + (time_t)(handle->page_end_time_ut / USEC_PER_SEC), + (time_t)(handle->update_every_ut / USEC_PER_SEC), + handle->pgc_page ? "current" : "*LAST*", + (time_t)(handle->page_start_time_ut / USEC_PER_SEC), + (time_t)(handle->page_end_time_ut / USEC_PER_SEC), + handle->page_position, handle->page_entries_max, + wb ? 
buffer_tostring(wb) : "" + ); + + buffer_free(wb); +#else + ; +#endif +} + +void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *sch, + const usec_t point_in_time_ut, + const NETDATA_DOUBLE n, + const NETDATA_DOUBLE min_value, + const NETDATA_DOUBLE max_value, + const uint16_t count, + const uint16_t anomaly_count, + const SN_FLAGS flags) +{ + timing_step(TIMING_STEP_RRDSET_STORE_METRIC); + + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)sch; + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(point_in_time_ut > (usec_t)max_acceptable_collected_time() * USEC_PER_SEC)) + handle->page_flags |= RRDENG_PAGE_FUTURE_POINT; +#endif + + usec_t delta_ut = point_in_time_ut - handle->page_end_time_ut; + + if(likely(delta_ut == handle->update_every_ut)) { + // happy path + ; + } + else if(unlikely(point_in_time_ut > handle->page_end_time_ut)) { + if(handle->pgc_page) { + if (unlikely(delta_ut < handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL; + rrdeng_store_metric_flush_current_page(sch); + } + else if (unlikely(delta_ut % handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_UNALIGNED; + rrdeng_store_metric_flush_current_page(sch); + } + else { + size_t points_gap = delta_ut / handle->update_every_ut; + size_t page_remaining_points = handle->page_entries_max - handle->page_position; + + if (points_gap >= page_remaining_points) { + handle->page_flags |= RRDENG_PAGE_BIG_GAP; + rrdeng_store_metric_flush_current_page(sch); + } + else { + // loop to fill the gap + handle->page_flags |= RRDENG_PAGE_GAP; + + usec_t stop_ut = point_in_time_ut - handle->update_every_ut; + for (usec_t this_ut = handle->page_end_time_ut + handle->update_every_ut; + this_ut <= stop_ut; + this_ut = handle->page_end_time_ut + handle->update_every_ut) { + rrdeng_store_metric_append_point( + sch, + this_ut, + NAN, NAN, NAN, + 1, 0, + SN_EMPTY_SLOT); + } + } + } + } + } + else if(unlikely(point_in_time_ut < handle->page_end_time_ut)) { + handle->page_flags |= RRDENG_PAGE_PAST_COLLECTION; + store_metric_next_error_log(handle, point_in_time_ut, "is older than the"); + return; + } + + else /* if(unlikely(point_in_time_ut == handle->page_end_time_ut)) */ { + handle->page_flags |= RRDENG_PAGE_REPEATED_COLLECTION; + store_metric_next_error_log(handle, point_in_time_ut, "is at the same time as the"); + return; + } + + timing_step(TIMING_STEP_DBENGINE_FIRST_CHECK); + + rrdeng_store_metric_append_point(sch, + point_in_time_ut, + n, min_value, max_value, + count, anomaly_count, + flags); +} + +/* + * Releases the database reference from the handle for storing metrics. + * Returns 1 if it's safe to delete the dimension. 
+ */ +int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *sch) { + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)sch; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + handle->page_flags |= RRDENG_PAGE_COLLECT_FINALIZE; + rrdeng_store_metric_flush_current_page(sch); + rrdeng_page_alignment_release(handle->alignment); + + __atomic_sub_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED); + if(!(handle->options & RRDENG_1ST_METRIC_WRITER)) + __atomic_sub_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED); + + if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_clear_writer(main_mrg, handle->metric)) + internal_fatal(true, "DBENGINE: metric is already released"); + + time_t first_time_s, last_time_s; + mrg_metric_get_retention(main_mrg, handle->metric, &first_time_s, &last_time_s, NULL); + + mrg_metric_release(main_mrg, handle->metric); + freez(handle); + + if(!first_time_s && !last_time_s) + return 1; + + return 0; +} + +void rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *sch, int update_every) { + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)sch; + check_and_fix_mrg_update_every(handle); + + METRIC *metric = handle->metric; + usec_t update_every_ut = (usec_t)update_every * USEC_PER_SEC; + + if(update_every_ut == handle->update_every_ut) + return; + + handle->page_flags |= RRDENG_PAGE_UPDATE_EVERY_CHANGE; + rrdeng_store_metric_flush_current_page(sch); + mrg_metric_set_update_every(main_mrg, metric, update_every); + handle->update_every_ut = update_every_ut; +} + +// ---------------------------------------------------------------------------- +// query ops + +#ifdef NETDATA_INTERNAL_CHECKS +SPINLOCK global_query_handle_spinlock = NETDATA_SPINLOCK_INITIALIZER; +static struct rrdeng_query_handle *global_query_handle_ll = NULL; +static void register_query_handle(struct rrdeng_query_handle *handle) { + handle->query_pid = gettid(); + handle->started_time_s = now_realtime_sec(); + + spinlock_lock(&global_query_handle_spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); + spinlock_unlock(&global_query_handle_spinlock); +} +static void unregister_query_handle(struct rrdeng_query_handle *handle) { + spinlock_lock(&global_query_handle_spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); + spinlock_unlock(&global_query_handle_spinlock); +} +#else +static void register_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { + ; +} +static void unregister_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { + ; +} +#endif + +/* + * Gets a handle for loading metrics from the database. + * The handle must be released with rrdeng_load_metric_final(). 
+ */ +void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *smh, + struct storage_engine_query_handle *seqh, + time_t start_time_s, + time_t end_time_s, + STORAGE_PRIORITY priority) +{ + usec_t started_ut = now_monotonic_usec(); + + netdata_thread_disable_cancelability(); + + METRIC *metric = (METRIC *)smh; + struct rrdengine_instance *ctx = mrg_metric_ctx(metric); + struct rrdeng_query_handle *handle; + + handle = rrdeng_query_handle_get(); + register_query_handle(handle); + + if (unlikely(priority < STORAGE_PRIORITY_HIGH)) + priority = STORAGE_PRIORITY_HIGH; + else if (unlikely(priority >= STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE)) + priority = STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE - 1; + + handle->ctx = ctx; + handle->metric = metric; + handle->priority = priority; + + // IMPORTANT! + // It is crucial not to exceed the db boundaries, because dbengine + // now has gap caching, so when a gap is detected a negative page + // is inserted into the main cache, to avoid scanning the journals + // again for pages matching the gap. + + time_t db_first_time_s, db_last_time_s; + uint32_t db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + + if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) == PAGE_IS_IN_RANGE) { + handle->start_time_s = MAX(start_time_s, db_first_time_s); + handle->end_time_s = MIN(end_time_s, db_last_time_s); + handle->now_s = handle->start_time_s; + + handle->dt_s = db_update_every_s; + if (!handle->dt_s) { + handle->dt_s = default_rrd_update_every; + mrg_metric_set_update_every_s_if_zero(main_mrg, metric, default_rrd_update_every); + } + + seqh->handle = (STORAGE_QUERY_HANDLE *) handle; + seqh->start_time_s = handle->start_time_s; + seqh->end_time_s = handle->end_time_s; + seqh->priority = priority; + seqh->seb = STORAGE_ENGINE_BACKEND_DBENGINE; + + pg_cache_preload(handle); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_init, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED); + } + else { + handle->start_time_s = start_time_s; + handle->end_time_s = end_time_s; + handle->now_s = start_time_s; + handle->dt_s = db_update_every_s; + + seqh->handle = (STORAGE_QUERY_HANDLE *) handle; + seqh->start_time_s = handle->start_time_s; + seqh->end_time_s = 0; + seqh->priority = priority; + seqh->seb = STORAGE_ENGINE_BACKEND_DBENGINE; + } +} + +static bool rrdeng_load_page_next(struct storage_engine_query_handle *seqh, bool debug_this __maybe_unused) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)seqh->handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + if (likely(handle->page)) { + // we have a page to release + pgc_page_release(main_cache, handle->page); + handle->page = NULL; + pgdc_reset(&handle->pgdc, NULL, UINT32_MAX); + } + + if (unlikely(handle->now_s > seqh->end_time_s)) + return false; + + size_t entries = 0; + handle->page = pg_cache_lookup_next(ctx, handle->pdc, handle->now_s, handle->dt_s, &entries); + + internal_fatal(handle->page && (pgc_page_data(handle->page) == PGD_EMPTY || !entries), + "A page was returned, but it is empty - pg_cache_lookup_next() should be handling this case"); + + if (unlikely(!handle->page || pgc_page_data(handle->page) == PGD_EMPTY || !entries)) + return false; + + time_t page_start_time_s = pgc_page_start_time_s(handle->page); + time_t page_end_time_s = pgc_page_end_time_s(handle->page); + uint32_t page_update_every_s = pgc_page_update_every_s(handle->page); + + unsigned position; 
+ if(likely(handle->now_s >= page_start_time_s && handle->now_s <= page_end_time_s)) { + + if(unlikely(entries == 1 || page_start_time_s == page_end_time_s || !page_update_every_s)) { + position = 0; + handle->now_s = page_start_time_s; + } + else { + position = (handle->now_s - page_start_time_s) * (entries - 1) / (page_end_time_s - page_start_time_s); + time_t point_end_time_s = page_start_time_s + position * (time_t) page_update_every_s; + while(point_end_time_s < handle->now_s && position + 1 < entries) { + // https://github.com/netdata/netdata/issues/14411 + // we really need a while() here, because the delta may be + // 2 points at higher tiers + position++; + point_end_time_s = page_start_time_s + position * (time_t) page_update_every_s; + } + handle->now_s = point_end_time_s; + } + + internal_fatal(position >= entries, "DBENGINE: wrong page position calculation"); + } + else if(handle->now_s < page_start_time_s) { + handle->now_s = page_start_time_s; + position = 0; + } + else { + internal_fatal(true, "DBENGINE: this page is entirely in our past and should not be accepted for this query in the first place"); + handle->now_s = page_end_time_s; + position = entries - 1; + } + + handle->entries = entries; + handle->position = position; + handle->dt_s = page_update_every_s; + + pgdc_reset(&handle->pgdc, pgc_page_data(handle->page), handle->position); + + return true; +} + +// Returns the metric and sets its timestamp into current_time +// IT IS REQUIRED TO **ALWAYS** SET ALL RETURN VALUES (current_time, end_time, flags) +// IT IS REQUIRED TO **ALWAYS** KEEP TRACK OF TIME, EVEN OUTSIDE THE DATABASE BOUNDARIES +STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *seqh) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)seqh->handle; + STORAGE_POINT sp; + + if (unlikely(handle->now_s > seqh->end_time_s)) { + storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); + goto prepare_for_next_iteration; + } + + if (unlikely(!handle->page || handle->position >= handle->entries)) { + // We need to get a new page + + if (!rrdeng_load_page_next(seqh, false)) { + handle->now_s = seqh->end_time_s; + storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); + goto prepare_for_next_iteration; + } + } + + sp.start_time_s = handle->now_s - handle->dt_s; + sp.end_time_s = handle->now_s; + + pgdc_get_next_point(&handle->pgdc, handle->position, &sp); + +prepare_for_next_iteration: + internal_fatal(sp.end_time_s < seqh->start_time_s, "DBENGINE: this point is too old for this query"); + internal_fatal(sp.end_time_s < handle->now_s, "DBENGINE: this point is too old for this point in time"); + + handle->now_s += handle->dt_s; + handle->position++; + + return sp; +} + +int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *seqh) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)seqh->handle; + return (handle->now_s > seqh->end_time_s); +} + +/* + * Releases the database reference from the handle for loading metrics. 
+ */ +void rrdeng_load_metric_finalize(struct storage_engine_query_handle *seqh) +{ + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)seqh->handle; + + if (handle->page) { + pgc_page_release(main_cache, handle->page); + pgdc_reset(&handle->pgdc, NULL, UINT32_MAX); + } + + if(!pdc_release_and_destroy_if_unreferenced(handle->pdc, false, false)) + __atomic_store_n(&handle->pdc->workers_should_stop, true, __ATOMIC_RELAXED); + + unregister_query_handle(handle); + rrdeng_query_handle_release(handle); + seqh->handle = NULL; + netdata_thread_enable_cancelability(); +} + +time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *seqh) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)seqh->handle; + + if(handle->pdc) { + rrdeng_prep_wait(handle->pdc); + if (handle->pdc->optimal_end_time_s > seqh->end_time_s) + seqh->end_time_s = handle->pdc->optimal_end_time_s; + } + + return seqh->end_time_s; +} + +time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *smh) { + METRIC *metric = (METRIC *)smh; + time_t latest_time_s = 0; + + if (metric) + latest_time_s = mrg_metric_get_latest_time_s(main_mrg, metric); + + return latest_time_s; +} + +time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *smh) { + METRIC *metric = (METRIC *)smh; + + time_t oldest_time_s = 0; + if (metric) + oldest_time_s = mrg_metric_get_first_time_s(main_mrg, metric); + + return oldest_time_s; +} + +bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *si, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s) +{ + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + if (unlikely(!ctx)) { + netdata_log_error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); + return false; + } + + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, dim_uuid, (Word_t) ctx); + if (unlikely(!metric)) + return false; + + mrg_metric_get_retention(main_mrg, metric, first_entry_s, last_entry_s, NULL); + + mrg_metric_release(main_mrg, metric); + + return true; +} + +uint64_t rrdeng_disk_space_max(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return ctx->config.max_disk_space; +} + +uint64_t rrdeng_disk_space_used(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return __atomic_load_n(&ctx->atomic.current_disk_space, __ATOMIC_RELAXED); +} + +uint64_t rrdeng_metrics(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return __atomic_load_n(&ctx->atomic.metrics, __ATOMIC_RELAXED); +} + +uint64_t rrdeng_samples(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return __atomic_load_n(&ctx->atomic.samples, __ATOMIC_RELAXED); +} + +time_t rrdeng_global_first_time_s(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + + time_t t = __atomic_load_n(&ctx->atomic.first_time_s, __ATOMIC_RELAXED); + if(t == LONG_MAX || t < 0) + t = 0; + + return t; +} + +size_t rrdeng_currently_collected_metrics(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); +} + +/* + * Gathers Database Engine statistics. + * Careful when modifying this function. + * You must not change the indices of the statistics or user code will break. + * You must not exceed RRDENG_NR_STATS or it will crash. 
+ */ +void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long *array) +{ + if (ctx == NULL) + return; + + array[0] = (uint64_t)__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); // API producers + array[1] = (uint64_t)__atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED); // API consumers + array[2] = 0; + array[3] = 0; + array[4] = 0; + array[5] = 0; // (uint64_t)ctx->stats.pg_cache_insertions; + array[6] = 0; // (uint64_t)ctx->stats.pg_cache_deletions; + array[7] = 0; // (uint64_t)ctx->stats.pg_cache_hits; + array[8] = 0; // (uint64_t)ctx->stats.pg_cache_misses; + array[9] = 0; // (uint64_t)ctx->stats.pg_cache_backfills; + array[10] = 0; // (uint64_t)ctx->stats.pg_cache_evictions; + array[11] = (uint64_t)__atomic_load_n(&ctx->stats.before_compress_bytes, __ATOMIC_RELAXED); // used + array[12] = (uint64_t)__atomic_load_n(&ctx->stats.after_compress_bytes, __ATOMIC_RELAXED); // used + array[13] = (uint64_t)__atomic_load_n(&ctx->stats.before_decompress_bytes, __ATOMIC_RELAXED); + array[14] = (uint64_t)__atomic_load_n(&ctx->stats.after_decompress_bytes, __ATOMIC_RELAXED); + array[15] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_bytes, __ATOMIC_RELAXED); // used + array[16] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_requests, __ATOMIC_RELAXED); // used + array[17] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_bytes, __ATOMIC_RELAXED); + array[18] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_requests, __ATOMIC_RELAXED); // used + array[19] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extent_bytes, __ATOMIC_RELAXED); + array[20] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extents, __ATOMIC_RELAXED); + array[21] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extent_bytes, __ATOMIC_RELAXED); + array[22] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extents, __ATOMIC_RELAXED); + array[23] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_creations, __ATOMIC_RELAXED); + array[24] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_deletions, __ATOMIC_RELAXED); + array[25] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_creations, __ATOMIC_RELAXED); + array[26] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_deletions, __ATOMIC_RELAXED); + array[27] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.page_cache_descriptors, __ATOMIC_RELAXED); + array[28] = (uint64_t)__atomic_load_n(&ctx->stats.io_errors, __ATOMIC_RELAXED); + array[29] = (uint64_t)__atomic_load_n(&ctx->stats.fs_errors, __ATOMIC_RELAXED); + array[30] = (uint64_t)__atomic_load_n(&global_io_errors, __ATOMIC_RELAXED); // used + array[31] = (uint64_t)__atomic_load_n(&global_fs_errors, __ATOMIC_RELAXED); // used + array[32] = (uint64_t)__atomic_load_n(&rrdeng_reserved_file_descriptors, __ATOMIC_RELAXED); // used + array[33] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); + array[34] = (uint64_t)__atomic_load_n(&global_pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); // used + array[35] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.flushing_pressure_page_deletions, __ATOMIC_RELAXED); + array[36] = (uint64_t)__atomic_load_n(&global_flushing_pressure_page_deletions, __ATOMIC_RELAXED); // used + array[37] = 0; //(uint64_t)pg_cache->active_descriptors; + + fatal_assert(RRDENG_NR_STATS == 38); +} + +static void rrdeng_populate_mrg(struct rrdengine_instance *ctx) { + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + size_t datafiles = 0; + for(struct rrdengine_datafile *df = ctx->datafiles.first; df 
;df = df->next) + datafiles++; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + ssize_t cpus = (ssize_t)get_netdata_cpus() / (ssize_t)storage_tiers; + if(cpus > (ssize_t)datafiles) + cpus = (ssize_t)datafiles; + + if(cpus > (ssize_t)libuv_worker_threads) + cpus = (ssize_t)libuv_worker_threads; + + if(cpus >= (ssize_t)get_netdata_cpus() / 2) + cpus = (ssize_t)(get_netdata_cpus() / 2 - 1); + + if(cpus < 1) + cpus = 1; + + netdata_log_info("DBENGINE: populating retention to MRG from %zu journal files of tier %d, using %zd threads...", datafiles, ctx->config.tier, cpus); + + if(datafiles > 2) { + struct rrdengine_datafile *datafile; + + datafile = ctx->datafiles.first->prev; + if(!(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE)) + datafile = datafile->prev; + + if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) { + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + } + + datafile = ctx->datafiles.first; + if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) { + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + } + } + + ctx->loading.populate_mrg.size = cpus; + ctx->loading.populate_mrg.array = callocz(ctx->loading.populate_mrg.size, sizeof(struct completion)); + + for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) { + completion_init(&ctx->loading.populate_mrg.array[i]); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_POPULATE_MRG, NULL, &ctx->loading.populate_mrg.array[i], + STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + } +} + +void rrdeng_readiness_wait(struct rrdengine_instance *ctx) { + for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) { + completion_wait_for(&ctx->loading.populate_mrg.array[i]); + completion_destroy(&ctx->loading.populate_mrg.array[i]); + } + + freez(ctx->loading.populate_mrg.array); + ctx->loading.populate_mrg.array = NULL; + ctx->loading.populate_mrg.size = 0; + + netdata_log_info("DBENGINE: tier %d is ready for data collection and queries", ctx->config.tier); +} + +bool rrdeng_is_legacy(STORAGE_INSTANCE *si) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)si; + return ctx->config.legacy; +} + +void rrdeng_exit_mode(struct rrdengine_instance *ctx) { + __atomic_store_n(&ctx->quiesce.exit_mode, true, __ATOMIC_RELAXED); +} +/* + * Returns 0 on success, negative on error + */ +int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, + unsigned disk_space_mb, size_t tier) { + struct rrdengine_instance *ctx; + uint32_t max_open_files; + + max_open_files = rlimit_nofile.rlim_cur / 4; + + /* reserve RRDENG_FD_BUDGET_PER_INSTANCE file descriptors for this instance */ + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, RRDENG_FD_BUDGET_PER_INSTANCE); + if (rrdeng_reserved_file_descriptors > max_open_files) { + netdata_log_error( + "Exceeded the budget of available file descriptors (%u/%u), cannot create new dbengine instance.", + (unsigned)rrdeng_reserved_file_descriptors, + (unsigned)max_open_files); + + rrd_stat_atomic_add(&global_fs_errors, 1); + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); + return UV_EMFILE; + } + + if(NULL == ctxp) { + ctx = multidb_ctx[tier]; + memset(ctx, 0, sizeof(*ctx)); + ctx->config.legacy = false; + } + else { + *ctxp = ctx = callocz(1, sizeof(*ctx)); + ctx->config.legacy = true; + } + + ctx->config.tier = (int)tier; + ctx->config.page_type = tier_page_type[tier]; + 
ctx->config.global_compress_alg = dbengine_default_compression(); + if (disk_space_mb < RRDENG_MIN_DISK_SPACE_MB) + disk_space_mb = RRDENG_MIN_DISK_SPACE_MB; + ctx->config.max_disk_space = disk_space_mb * 1048576LLU; + strncpyz(ctx->config.dbfiles_path, dbfiles_path, sizeof(ctx->config.dbfiles_path) - 1); + ctx->config.dbfiles_path[sizeof(ctx->config.dbfiles_path) - 1] = '\0'; + + ctx->atomic.transaction_id = 1; + ctx->quiesce.enabled = false; + + rw_spinlock_init(&ctx->njfv2idx.spinlock); + ctx->atomic.first_time_s = LONG_MAX; + ctx->atomic.metrics = 0; + ctx->atomic.samples = 0; + + if (rrdeng_dbengine_spawn(ctx) && !init_rrd_files(ctx)) { + // success - we run this ctx too + rrdeng_populate_mrg(ctx); + return 0; + } + + if (ctx->config.legacy) { + freez(ctx); + if (ctxp) + *ctxp = NULL; + } + + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); + return UV_EIO; +} + +size_t rrdeng_collectors_running(struct rrdengine_instance *ctx) { + return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); +} + +/* + * Returns 0 on success, 1 on error + */ +int rrdeng_exit(struct rrdengine_instance *ctx) { + if (NULL == ctx) + return 1; + + // FIXME - ktsaou - properly cleanup ctx + // 1. make sure all collectors are stopped + // 2. make new queries will not be accepted (this is quiesce that has already run) + // 3. flush this section of the main cache + // 4. then wait for completion + + bool logged = false; + size_t count = 10; + while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && count && !unittest_running) { + if(!logged) { + netdata_log_info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); + count--; + } + + netdata_log_info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx); + + netdata_log_info("DBENGINE: shutting down tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + struct completion completion = {}; + completion_init(&completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_SHUTDOWN, NULL, &completion, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL); + completion_wait_for(&completion); + completion_destroy(&completion); + + finalize_rrd_files(ctx); + + if(ctx->config.legacy) + freez(ctx); + + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); + return 0; +} + +void rrdeng_prepare_exit(struct rrdengine_instance *ctx) { + if (NULL == ctx) + return; + + // FIXME - ktsaou - properly cleanup ctx + // 1. 
make sure all collectors are stopped + + completion_init(&ctx->quiesce.completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_QUIESCE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); +} + +static void populate_v2_statistics(struct rrdengine_datafile *datafile, RRDENG_SIZE_STATS *stats) +{ + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0); + void *data_start = (void *)j2_header; + + if(unlikely(!j2_header)) + return; + + stats->extents += j2_header->extent_count; + + unsigned entries; + struct journal_extent_list *extent_list = (void *) (data_start + j2_header->extent_offset); + for (entries = 0; entries < j2_header->extent_count; entries++) { + stats->extents_compressed_bytes += extent_list->datafile_size; + stats->extents_pages += extent_list->pages; + extent_list++; + } + + struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset); + time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + + stats->metrics += j2_header->metric_count; + for (entries = 0; entries < j2_header->metric_count; entries++) { + + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + stats->metrics_pages += metric_list_header->entries; + struct journal_page_list *descr = (void *) (data_start + metric->page_offset + sizeof(struct journal_page_header)); + for (uint32_t idx=0; idx < metric_list_header->entries; idx++) { + + time_t update_every_s; + + size_t points = descr->page_length / CTX_POINT_SIZE_BYTES(datafile->ctx); + + time_t start_time_s = journal_start_time_s + descr->delta_start_s; + time_t end_time_s = journal_start_time_s + descr->delta_end_s; + + if(likely(points > 1)) + update_every_s = (time_t) ((end_time_s - start_time_s) / (points - 1)); + else { + update_every_s = (time_t) (default_rrd_update_every * get_tier_grouping(datafile->ctx->config.tier)); + stats->single_point_pages++; + } + + time_t duration_s = (time_t)((end_time_s - start_time_s + update_every_s)); + + stats->pages_uncompressed_bytes += descr->page_length; + stats->pages_duration_secs += duration_s; + stats->points += points; + + stats->page_types[descr->type].pages++; + stats->page_types[descr->type].pages_uncompressed_bytes += descr->page_length; + stats->page_types[descr->type].pages_duration_secs += duration_s; + stats->page_types[descr->type].points += points; + + if(!stats->first_time_s || (start_time_s - update_every_s) < stats->first_time_s) + stats->first_time_s = (start_time_s - update_every_s); + + if(!stats->last_time_s || end_time_s > stats->last_time_s) + stats->last_time_s = end_time_s; + + descr++; + } + metric++; + } + + journalfile_v2_data_release(datafile->journalfile); +} + +RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { + RRDENG_SIZE_STATS stats = { 0 }; + + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) { + stats.datafiles++; + populate_v2_statistics(df, &stats); + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + stats.currently_collected_metrics = __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); + + internal_error(stats.metrics_pages != stats.extents_pages + stats.currently_collected_metrics, + "DBENGINE: metrics pages is %zu, but extents pages is %zu and API consumers is %zu", + stats.metrics_pages, stats.extents_pages, stats.currently_collected_metrics); + + stats.disk_space = ctx_current_disk_space_get(ctx); + stats.max_disk_space 
= ctx->config.max_disk_space; + + stats.database_retention_secs = (time_t)(stats.last_time_s - stats.first_time_s); + + if(stats.extents_pages) + stats.average_page_size_bytes = (double)stats.pages_uncompressed_bytes / (double)stats.extents_pages; + + if(stats.pages_uncompressed_bytes > 0) + stats.average_compression_savings = 100.0 - ((double)stats.extents_compressed_bytes * 100.0 / (double)stats.pages_uncompressed_bytes); + + if(stats.points) + stats.average_point_duration_secs = (double)stats.pages_duration_secs / (double)stats.points; + + if(stats.metrics) { + stats.average_metric_retention_secs = (double)stats.pages_duration_secs / (double)stats.metrics; + + if(stats.database_retention_secs) { + double metric_coverage = stats.average_metric_retention_secs / (double)stats.database_retention_secs; + double db_retention_days = (double)stats.database_retention_secs / 86400.0; + + stats.estimated_concurrently_collected_metrics = stats.metrics * metric_coverage; + + stats.ephemeral_metrics_per_day_percent = ((double)stats.metrics * 100.0 / (double)stats.estimated_concurrently_collected_metrics - 100.0) / (double)db_retention_days; + } + } + +// stats.sizeof_metric = 0; + stats.sizeof_datafile = struct_natural_alignment(sizeof(struct rrdengine_datafile)) + struct_natural_alignment(sizeof(struct rrdengine_journalfile)); + stats.sizeof_page_in_cache = 0; // struct_natural_alignment(sizeof(struct page_cache_descr)); + stats.sizeof_point_data = page_type_size[ctx->config.page_type]; + stats.sizeof_page_data = tier_page_size[ctx->config.tier]; + stats.pages_per_extent = rrdeng_pages_per_extent; + +// stats.sizeof_metric_in_index = 40; +// stats.sizeof_page_in_index = 24; + + stats.default_granularity_secs = (size_t)default_rrd_update_every * get_tier_grouping(ctx->config.tier); + + return stats; +} + +struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void) { + // FIXME - make cache efficiency stats atomic + return rrdeng_cache_efficiency_stats; +} |
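The page allocator in `rrdeng_alloc_new_page_data()` sizes each new page from wall-clock time so that metrics sharing an alignment group tend to finish their pages on the same boundary, spreading page flushes of different groups across the page period. Below is a self-contained restatement of that arithmetic under those assumptions; the function and the `main()` driver are illustrative only, and the real code additionally clamps the result to at least a third of the maximum slots and to a minimum of 3 entries.

```c
// Sketch of the slot-alignment arithmetic used when allocating a new page:
// roughly, the entry count is how far (now_s % max_slots) is from the
// group's target slot, so the page ends near that boundary.
#include <stddef.h>
#include <stdio.h>
#include <time.h>

static size_t aligned_allocation_entries_sketch(size_t max_slots, size_t target_slot, time_t now_s) {
    size_t slots = target_slot;
    size_t pos = (size_t)(now_s % (time_t)max_slots);

    if (pos > slots)
        slots += max_slots - pos;   // boundary already passed: run to its next occurrence
    else if (pos < slots)
        slots -= pos;               // boundary ahead: stop exactly on it
    else
        slots = max_slots;          // exactly on the boundary: allocate a full page

    return slots;
}

int main(void) {
    size_t max_slots = 1024;        // e.g. a 4096-byte tier-0 page of 4-byte points
    for (size_t target = 0; target < 4; target++)
        printf("target slot %zu -> %zu entries\n",
               target, aligned_allocation_entries_sketch(max_slots, target, time(NULL)));
    return 0;
}
```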
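`rrdeng_store_metric_next()` decides, from the delta between the incoming point and the last stored point, whether to append normally, pad a small gap with empty slots, or flush the current page and start a new one. A compact, self-contained restatement of that decision follows; the enum and function names are invented here, and the real code additionally flags future points and logs the past/repeated cases.

```c
// Illustrative classification of an incoming point relative to the last stored
// one, mirroring the branching in rrdeng_store_metric_next(). Times are in
// microseconds, as in the engine.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef enum {
    APPEND,                 // delta equals update_every: the happy path
    FILL_GAP_THEN_APPEND,   // aligned gap smaller than the page: pad with empty slots
    FLUSH_PAGE_THEN_APPEND, // step too small, unaligned, or gap too big: start a new page
    DROP_POINT              // past or repeated collection: ignored (and logged)
} next_action_t;

static next_action_t classify_point(uint64_t point_ut, uint64_t last_ut, uint64_t update_every_ut,
                                     size_t page_position, size_t page_entries_max) {
    if (point_ut <= last_ut)
        return DROP_POINT;

    uint64_t delta_ut = point_ut - last_ut;
    if (delta_ut == update_every_ut)
        return APPEND;

    if (delta_ut < update_every_ut || (delta_ut % update_every_ut))
        return FLUSH_PAGE_THEN_APPEND;

    size_t points_gap = delta_ut / update_every_ut;
    size_t remaining  = page_entries_max - page_position;
    return (points_gap >= remaining) ? FLUSH_PAGE_THEN_APPEND : FILL_GAP_THEN_APPEND;
}

int main(void) {
    const uint64_t ue = 10ULL * 1000000, last = 100ULL * 1000000;       // 10s step, last point at t=100s
    printf("%d\n", classify_point(last + ue,     last, ue, 50, 1024));  // 0: APPEND
    printf("%d\n", classify_point(last + 3 * ue, last, ue, 50, 1024));  // 1: FILL_GAP_THEN_APPEND
    printf("%d\n", classify_point(last + 5,      last, ue, 50, 1024));  // 2: FLUSH_PAGE_THEN_APPEND
    return 0;
}
```

In the gap-filling case the engine appends `SN_EMPTY_SLOT` points (NAN values) one `update_every` at a time up to one step before the incoming point, as the loop in the diff shows.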