Diffstat (limited to 'database/engine')
-rw-r--r--  database/engine/Makefile.am               11
-rw-r--r--  database/engine/README.md                192
-rw-r--r--  database/engine/cache.c                 2746
-rw-r--r--  database/engine/cache.h                  250
-rw-r--r--  database/engine/datafile.c               611
-rw-r--r--  database/engine/datafile.h                88
-rw-r--r--  database/engine/datafile.ksy              74
-rw-r--r--  database/engine/dbengine-diagram.xml       1
-rw-r--r--  database/engine/journalfile.c           1569
-rw-r--r--  database/engine/journalfile.h            177
-rw-r--r--  database/engine/journalfile_v2.ksy.in    150
-rw-r--r--  database/engine/metadata_log/README.md     0
-rw-r--r--  database/engine/metric.c                 873
-rw-r--r--  database/engine/metric.h                  94
-rw-r--r--  database/engine/page.c                   679
-rw-r--r--  database/engine/page.h                    58
-rw-r--r--  database/engine/page_test.cc             405
-rw-r--r--  database/engine/page_test.h               14
-rw-r--r--  database/engine/pagecache.c             1117
-rw-r--r--  database/engine/pagecache.h               62
-rw-r--r--  database/engine/pdc.c                   1332
-rw-r--r--  database/engine/pdc.h                     67
-rw-r--r--  database/engine/rrddiskprotocol.h        130
-rw-r--r--  database/engine/rrdengine.c             1866
-rw-r--r--  database/engine/rrdengine.h              532
-rwxr-xr-x  database/engine/rrdengineapi.c          1361
-rw-r--r--  database/engine/rrdengineapi.h           229
-rw-r--r--  database/engine/rrdenginelib.c           161
-rw-r--r--  database/engine/rrdenginelib.h            94
29 files changed, 14943 insertions, 0 deletions
diff --git a/database/engine/Makefile.am b/database/engine/Makefile.am
new file mode 100644
index 00000000..59250a99
--- /dev/null
+++ b/database/engine/Makefile.am
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+AUTOMAKE_OPTIONS = subdir-objects
+MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
+
+SUBDIRS = \
+ $(NULL)
+
+dist_noinst_DATA = \
+ README.md \
+ $(NULL)
diff --git a/database/engine/README.md b/database/engine/README.md
new file mode 100644
index 00000000..89001864
--- /dev/null
+++ b/database/engine/README.md
@@ -0,0 +1,192 @@
+# Database engine
+
+DBENGINE is the time-series database of Netdata.
+
+![image](https://user-images.githubusercontent.com/2662304/233838474-d4f8f0b9-61dc-4409-a708-97d403cd153a.png)
+
+## Design
+
+### Data Points
+
+**Data points** represent the collected values of metrics.
+
+A **data point** has:
+
+1. A **value**, the data collected for a metric. There is a special **value** to indicate that the collector failed to collect a valid value, and thus the data point is a **gap**.
+2. A **timestamp**, the time it has been collected.
+3. A **duration**, the time between this and the previous data collection.
+4. A flag which is set when machine-learning categorized the collected value as **anomalous** (an outlier based on the trained models).
+
+Using the **timestamp** and **duration**, Netdata calculates for each point its **start time**, **end time** and **update every**.
+
+For incremental metrics (counters), Netdata interpolates the collected values to align them to the expected **end time** at the microsecond level, absorbing data collection micro-latencies.
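+
+As a minimal sketch of the idea (illustrative only, not Netdata's actual code; all names and the microsecond timestamps are hypothetical), aligning a collected value to the expected end time of a slot is a simple linear interpolation:
+
+```
+// Linearly interpolate a collected value to the expected end time of the slot.
+// 'prev_ut' and 'now_ut' are the previous and current collection times in microseconds,
+// 'slot_end_ut' is the expected end time of the slot being filled.
+static double align_to_slot_end(double previous_value, double collected_value,
+                                int64_t prev_ut, int64_t now_ut, int64_t slot_end_ut) {
+    if (now_ut == prev_ut)
+        return collected_value;                    // nothing to interpolate
+    double slope = (collected_value - previous_value) / (double)(now_ut - prev_ut);
+    return previous_value + slope * (double)(slot_end_ut - prev_ut);
+}
+```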
+
+When data points are stored in higher tiers (time aggregations - see [Tiers](#Tiers) below), each data point has:
+
+1. The **sum** of the original values that have been aggregated,
+2. The **count** of all the original values aggregated,
+3. The **minimum** value among them,
+4. The **maximum** value among them,
+5. Their **anomaly rate**, i.e. the count of values that were detected as outliers based on the currently trained models for the metric,
+6. A **timestamp**, which is equal to the **end time** of the last point aggregated,
+7. A **duration**, which is the time from the **start time** of the first point aggregated to the **end time** of the last point aggregated.
+
+This design allows Netdata to accurately know the **average**, **minimum**, **maximum** and **anomaly rate** values even when using higher tiers to satisfy a query.
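+
+Conceptually, such an aggregated point can be pictured with a structure like the one below. This is a sketch based on the list above, with illustrative field names; it is not the on-disk format Netdata uses:
+
+```
+// Conceptual sketch of one higher-tier (aggregated) data point.
+struct tier_point_sketch {
+    double   sum;          // sum of the original values aggregated
+    uint32_t count;        // how many original values were aggregated
+    double   min;          // minimum of the original values
+    double   max;          // maximum of the original values
+    uint32_t anomalous;    // how many of them were flagged anomalous
+    time_t   end_time_s;   // end time of the last point aggregated
+    uint32_t duration_s;   // start of the first point to end of the last point
+};
+```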
+
+### Pages
+Data points are organized into **pages**, i.e. segments of contiguous data collections of the same metric.
+
+Each page:
+
+1. Contains contiguous **data points** of a single metric.
+2. Contains **data points** having the same **update every**. If a metric changes **update every** on the fly, the page is flushed and a new one with the new **update every** is created. If a data collection is missed, a **gap point** is inserted into the page, so that the data points in a page remain contiguous.
+3. Has a **start time**, which is equivalent to the **end time** of the first data point stored into it,
+4. Has an **end time**, which is equal to the **end time** of the last data point stored into it,
+5. Has an **update every**, common for all points in the page.
+
+A **page** is a simple array of values. Each slot in the array has a **timestamp** implied by its position in the array, and each value stored represents the **data point** for that time, for the metric the page belongs to.
+
+This simple fixed step page design allows Netdata to collect several millions of points per second and pack all the values in a compact form with minimal metadata overhead.
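+
+As an illustration of the fixed-step layout (a sketch, not the actual page implementation), the timestamp of each slot can be derived from the page metadata alone:
+
+```
+// The end time of slot 'i' in a fixed-step page is implied by its position.
+// 'page_start_time_s' and 'update_every_s' are the page metadata described above.
+static inline time_t slot_end_time_s(time_t page_start_time_s, uint32_t update_every_s, size_t i) {
+    return page_start_time_s + (time_t)(i * update_every_s);
+}
+```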
+
+#### Hot Pages
+
+While a metric is collected, there is one **hot page** in memory for each of the configured tiers. Values collected for a metric are appended to its **hot page** until that page becomes full.
+
+#### Dirty Pages
+
+Once a **hot page** is full, it becomes a **dirty page**, and it is scheduled for immediate **flushing** (saving) to disk.
+
+#### Clean Pages
+
+Flushed (saved) pages are **clean pages**, i.e. read-only pages that reside primarily on disk, and are loaded on demand to satisfy data queries.
+
+#### Pages Configuration
+
+Pages are configured like this:
+
+| Attribute | Tier0 | Tier1 | Tier2 |
+|---------------------------------------------------------------------------------------|:-------------------------------------:|:---------------------------------------------------------------:|:---------------------------------------------------------------:|
+| Point Size in Memory, in Bytes | 4 | 16 | 16 |
+| Point Size on Disk, in Bytes<br/><small>after LZ4 compression, on average</small> | 1 | 4 | 4 |
+| Page Size in Bytes | 4096<br/><small>2048 in 32bit</small> | 2048<br/><small>1024 in 32bit</small> | 384<br/><small>192 in 32bit</small> |
+| Collections per Point | 1 | 60x Tier0<br/><small>configurable in<br/>`netdata.conf`</small> | 60x Tier1<br/><small>configurable in<br/>`netdata.conf`</small> |
+| Points per Page | 1024<br/><small>512 in 32bit</small> | 128<br/><small>64 in 32bit</small> | 24<br/><small>12 in 32bit</small> |
+
+### Files
+
+To minimize the amount of data written to disk and the storage required for metrics, Netdata aggregates up to 64 **dirty pages** of independent metrics, packs them together into one bigger buffer, compresses this buffer with LZ4 (about 75% savings on average) and commits a transaction to the disk files.
+
+#### Extents
+
+This collection of up to 64 pages that is packed and compressed together is called an **extent**. Netdata tries to store together, in the same **extent**, metrics that are meant to be "close". Dimensions of the same chart are a good example: they are usually queried together, so it is beneficial to have them in the same **extent** and read them all at once at query time.
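+
+For example, an **extent** holding 64 full tier 0 pages of 4KiB each is about 256KiB before compression; with the roughly 75% average LZ4 savings mentioned above, it occupies in the order of 64KiB on disk.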
+
+#### Datafiles
+
+Multiple **extents** are appended to **datafiles** (filename suffix `.ndf`), until these **datafiles** become full. The size of each **datafile** is determined automatically by Netdata. The minimum for each **datafile** is 4MB and the maximum 512MB. Depending on the amount of disk space configured for each tier, Netdata will decide a **datafile** size trying to maintain about 50 datafiles for the whole database, within the limits mentioned (4MB min, 512MB max per file). The maximum number of datafiles supported is 65536, and therefore the maximum database size (per tier) that Netdata can support is 32TB.
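+
+The sizing rule can be sketched as follows (a simplified illustration of the logic described above, not the exact implementation):
+
+```
+// Target about 50 datafiles for the tier's configured disk space,
+// clamped to the 4MB - 512MB range per datafile.
+static uint64_t target_datafile_size(uint64_t tier_disk_space_bytes) {
+    uint64_t size = tier_disk_space_bytes / 50;
+    if (size < 4ULL * 1024 * 1024)   size = 4ULL * 1024 * 1024;
+    if (size > 512ULL * 1024 * 1024) size = 512ULL * 1024 * 1024;
+    return size;
+}
+```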
+
+#### Journal Files
+
+Each **datafile** has two **journal files** with metadata related to the stored data in the **datafile**.
+
+- **journal file v1**, with filename suffix `.njf`, holds information about the transactions in its **datafile** and provides the ability to recover as much data as possible, in case either the datafile or the journal files get corrupted. This journal file has a maximum transaction size of 4KB, so if data on disk get corrupted, only the affected 4KB transactions are lost. Each transaction holds the metadata of one **extent** (this is why DBENGINE supports up to 64 pages per extent).
+
+- **journal file v2**, with filename suffix `.njfv2`, which is a disk-based index for all the **pages** and **extents**. This file is memory mapped at runtime and is consulted to find where the data of a metric are in the datafile. This journal file is automatically re-created from **journal file v1** if it is missing. It is safe to delete these files (when Netdata does not run). Netdata will re-create them on the next run. Journal files v2 are supported in Netdata Agents with version `netdata-1.37.0-115-nightly`. Older versions maintain the journal index in memory.
+
+#### Database Rotation
+
+Database rotation is achieved by deleting the oldest **datafile** (and its journals) and creating a new one (with its journals).
+
+Data on disk are append-only. There is no way to delete, add, or update data in the middle of the database. If data are not useful for whatever reason, Netdata can be instructed to ignore these data. They will eventually be deleted from disk when the database is rotated. New data are always appended.
+
+#### Tiers
+
+Tiers are supported in Netdata Agents with version `netdata-1.35.0.138.nightly` and greater.
+
+**datafiles** and **journal files** are organized in **tiers**. All tiers share the same metrics and same collected values.
+
+- **tier 0** is the high resolution tier that stores the collected data at the frequency they are collected.
+- **tier 1** by default aggregates 60 values of **tier 0**.
+- **tier 2** by default aggregates 60 values of **tier 1**, or 3600 values of **tier 0**.
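+
+For example, with a 1-second collection interval for **tier 0** and the default aggregation factors, each **tier 1** point covers 1 minute and each **tier 2** point covers 1 hour of data.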
+
+Updating the higher **tiers** is automated, and it happens in real-time while data are being collected for **tier 0**.
+
+When the Netdata Agent starts, during the first data collection of each metric, higher tiers are automatically **backfilled** with
+data from lower tiers, so that the aggregation they provide will be accurate.
+
+Configuring the number of tiers and the disk space allocated to each tier is how you can
+[change how long netdata stores metrics](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md).
+
+### Data loss
+
+Until **hot pages** and **dirty pages** are **flushed** to disk they are at risk (e.g. due to a crash, or
+power failure), as they are stored only in memory.
+
+The supported way of ensuring high data availability is the use of Netdata Parents to stream the data in real-time to
+multiple other Netdata agents.
+
+## Memory requirements and retention
+
+See [change how long netdata stores metrics](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md).
+
+#### Exceptions
+
+Netdata has several protection mechanisms to prevent the use of more memory than the above, by incrementally fetching data from disk and aggressively evicting old data to make room for new data, but memory may still grow beyond this limit under the following conditions:
+
+1. The number of pages concurrently used by queries does not fit in the above size. This can happen when multiple queries over unreasonably long time-frames run on the lower (higher resolution) tiers. The Netdata query planner attempts to avoid such situations by gradually loading pages, but under extreme conditions the system may still use more memory to satisfy these queries.
+
+2. The disks that host the Netdata files are too slow for the workload required by the database, so data cannot be flushed to disk quickly enough to free memory. Netdata will automatically spawn more flushing workers in an attempt to parallelize and speed up flushing, but if the disks cannot keep up, the data will remain in memory until they are written to disk.
+
+### Caches
+
+DBENGINE stores metric data to disk. To achieve high performance even under severe stress, it uses several layers of caches.
+
+#### Main Cache
+
+Stores page data. It is the primary storage of hot and dirty pages (before they are saved to disk), and its clean queue is the LRU cache for speeding up queries.
+
+The entire DBENGINE is designed to use the hot queue size (the currently collected metrics) as the key for sizing all its memory consumption. We call this feature **memory ballooning**. More collected metrics, bigger main cache and vice versa.
+
+In the equation:
+
+```
+memory in KiB = METRICS x (TIERS - 1) x 4KiB x 2 + 32768 KiB
+```
+
+the part `METRICS x (TIERS - 1) x 4KiB` is an estimate for the max hot size of the main cache. Tier 0 pages are 4KiB, but tier 1 pages are 2KiB and tier 2 pages are 384 bytes. So a single metric in 3 tiers uses 4096 + 2048 + 384 = 6528 bytes. The equation estimates 8192 bytes per metric, which includes cache internal structures and leaves some spare.
+
+Then `x 2` is the worst case estimate for the dirty queue. If all collected (hot) metrics become eligible for saving at once, to avoid stopping data collection all their pages will become dirty and new hot pages will be created instantly. To save memory, when Netdata starts, DBENGINE allocates initial pages of smaller, random sizes for the metrics, to spread their completion evenly across time.
+
+The memory we saved with the above is used to improve the LRU cache. So, although we reserved 32MiB for the LRU, in bigger setups (Netdata Parents) the LRU grows a lot more, within the limits of the equation.
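+
+As a worked example, assuming 2,000 concurrently collected metrics and 3 tiers:
+
+```
+memory in KiB = 2000 x (3 - 1) x 4KiB x 2 + 32768 KiB = 64768 KiB (about 63 MiB)
+```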
+
+In practice, the main cache sizes itself with `hot x 1.5` instead of `hot x 2`. The reason is that 5% of the main cache is reserved for expanding the open cache, 5% for expanding the extent cache, and we need room for the extensive buffers that are allocated in these setups. When the main cache exceeds `hot x 1.5` it enters a mode of critical evictions, and aggressively frees pages from the LRU to maintain a healthy memory footprint within its design limits.
+
+#### Open Cache
+
+Stores metadata about on disk pages. Not the data itself. Only metadata about the location of the data on disk.
+
+Its primary use is to index information about the open datafile, the one that still accepts new pages. Once that datafile becomes full, all the hot pages of the open cache are indexed in journal v2 files.
+
+The clean queue is an LRU for reducing the journal v2 scans during querying.
+
+Open cache uses memory ballooning too, like the main cache, based on its own hot pages. Open cache hot size is mainly controlled by the size of the open datafile. This is why, on Netdata versions with journal files v2, we decreased the maximum datafile size from 1GB to 512MB and increased the target number of datafiles from 20 to 50.
+
+On bigger setups the open cache will get a bigger LRU, by automatically sizing it (the whole open cache) to 5% of the size of (the whole) main cache.
+
+#### Extent Cache
+
+Caches compressed **extent** data, to avoid repeatedly reading the same data from disk.
+
+
+### Shared Memory
+
+Journal v2 indexes are mapped into memory. Netdata attempts to minimize shared memory use by instructing the kernel about the use of these files, or even unmapping them when they are not needed.
+
+The time-ranges of the queries running control the amount of shared memory required.
+
+## Metrics Registry
+
+DBENGINE uses 150 bytes of memory for every metric for which retention is maintained but is not currently being collected.
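+
+For example, maintaining retention for 1 million metrics that are no longer being collected needs roughly 150MB of memory for this registry (1,000,000 x 150 bytes).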
+
+
+
+
diff --git a/database/engine/cache.c b/database/engine/cache.c
new file mode 100644
index 00000000..eb1c3529
--- /dev/null
+++ b/database/engine/cache.c
@@ -0,0 +1,2746 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#include "cache.h"
+
+/* STATES AND TRANSITIONS
+ *
+ * entry | entry
+ * v v
+ * HOT -> DIRTY --> CLEAN --> EVICT
+ * v | v
+ * flush | evict
+ * v | v
+ * save | free
+ * callback | callback
+ *
+ */
+
+typedef int32_t REFCOUNT;
+#define REFCOUNT_DELETING (-100)
+
+// to disable ARAL, comment out the following line:
+#define PGC_WITH_ARAL 1
+
+typedef enum __attribute__ ((__packed__)) {
+ // mutually exclusive flags
+ PGC_PAGE_CLEAN = (1 << 0), // none of the following
+ PGC_PAGE_DIRTY = (1 << 1), // contains unsaved data
+ PGC_PAGE_HOT = (1 << 2), // currently being collected
+
+ // flags related to various actions on each page
+ PGC_PAGE_IS_BEING_DELETED = (1 << 3),
+ PGC_PAGE_IS_BEING_MIGRATED_TO_V2 = (1 << 4),
+ PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES = (1 << 5),
+ PGC_PAGE_HAS_BEEN_ACCESSED = (1 << 6),
+} PGC_PAGE_FLAGS;
+
+#define page_flag_check(page, flag) (__atomic_load_n(&((page)->flags), __ATOMIC_ACQUIRE) & (flag))
+#define page_flag_set(page, flag) __atomic_or_fetch(&((page)->flags), flag, __ATOMIC_RELEASE)
+#define page_flag_clear(page, flag) __atomic_and_fetch(&((page)->flags), ~(flag), __ATOMIC_RELEASE)
+
+#define page_get_status_flags(page) page_flag_check(page, PGC_PAGE_HOT | PGC_PAGE_DIRTY | PGC_PAGE_CLEAN)
+#define is_page_hot(page) (page_get_status_flags(page) == PGC_PAGE_HOT)
+#define is_page_dirty(page) (page_get_status_flags(page) == PGC_PAGE_DIRTY)
+#define is_page_clean(page) (page_get_status_flags(page) == PGC_PAGE_CLEAN)
+
+struct pgc_page {
+ // indexing data
+ Word_t section;
+ Word_t metric_id;
+ time_t start_time_s;
+ time_t end_time_s;
+ uint32_t update_every_s;
+ uint32_t assumed_size;
+
+ REFCOUNT refcount;
+ uint16_t accesses; // counts the number of accesses on this page
+ PGC_PAGE_FLAGS flags;
+ SPINLOCK transition_spinlock; // when the page changes between HOT, DIRTY, CLEAN, we have to get this lock
+
+ struct {
+ struct pgc_page *next;
+ struct pgc_page *prev;
+ } link;
+
+ void *data;
+ uint8_t custom_data[];
+
+ // IMPORTANT!
+ // THIS STRUCTURE NEEDS TO BE INITIALIZED BY HAND!
+};
+
+struct pgc_linked_list {
+ SPINLOCK spinlock;
+ union {
+ PGC_PAGE *base;
+ Pvoid_t sections_judy;
+ };
+ PGC_PAGE_FLAGS flags;
+ size_t version;
+ size_t last_version_checked;
+ bool linked_list_in_sections_judy; // when true, we use 'sections_judy', otherwise we use 'base'
+ struct pgc_queue_statistics *stats;
+};
+
+struct pgc {
+ struct {
+ char name[PGC_NAME_MAX + 1];
+
+ size_t partitions;
+ size_t clean_size;
+ size_t max_dirty_pages_per_call;
+ size_t max_pages_per_inline_eviction;
+ size_t max_skip_pages_per_inline_eviction;
+ size_t max_flushes_inline;
+ size_t max_workers_evict_inline;
+ size_t additional_bytes_per_page;
+ free_clean_page_callback pgc_free_clean_cb;
+ save_dirty_page_callback pgc_save_dirty_cb;
+ save_dirty_init_callback pgc_save_init_cb;
+ PGC_OPTIONS options;
+
+ size_t severe_pressure_per1000;
+ size_t aggressive_evict_per1000;
+ size_t healthy_size_per1000;
+ size_t evict_low_threshold_per1000;
+
+ dynamic_target_cache_size_callback dynamic_target_size_cb;
+ } config;
+
+#ifdef PGC_WITH_ARAL
+ ARAL **aral;
+#endif
+
+ PGC_CACHE_LINE_PADDING(0);
+
+ struct pgc_index {
+ RW_SPINLOCK rw_spinlock;
+ Pvoid_t sections_judy;
+ PGC_CACHE_LINE_PADDING(0);
+ } *index;
+
+ PGC_CACHE_LINE_PADDING(1);
+
+ struct {
+ SPINLOCK spinlock;
+ size_t per1000;
+ } usage;
+
+ PGC_CACHE_LINE_PADDING(2);
+
+ struct pgc_linked_list clean; // LRU is applied here to free memory from the cache
+
+ PGC_CACHE_LINE_PADDING(3);
+
+ struct pgc_linked_list dirty; // in the dirty list, pages are ordered the way they were marked dirty
+
+ PGC_CACHE_LINE_PADDING(4);
+
+    struct pgc_linked_list hot; // in the hot list, pages are ordered the way they were marked hot
+
+ PGC_CACHE_LINE_PADDING(5);
+
+ struct pgc_statistics stats; // statistics
+
+#ifdef NETDATA_PGC_POINTER_CHECK
+ PGC_CACHE_LINE_PADDING(6);
+ netdata_mutex_t global_pointer_registry_mutex;
+ Pvoid_t global_pointer_registry;
+#endif
+};
+
+
+
+// ----------------------------------------------------------------------------
+// validate each pointer is indexed once - internal checks only
+
+static inline void pointer_index_init(PGC *cache __maybe_unused) {
+#ifdef NETDATA_PGC_POINTER_CHECK
+ netdata_mutex_init(&cache->global_pointer_registry_mutex);
+#else
+ ;
+#endif
+}
+
+static inline void pointer_destroy_index(PGC *cache __maybe_unused) {
+#ifdef NETDATA_PGC_POINTER_CHECK
+ netdata_mutex_lock(&cache->global_pointer_registry_mutex);
+ JudyHSFreeArray(&cache->global_pointer_registry, PJE0);
+ netdata_mutex_unlock(&cache->global_pointer_registry_mutex);
+#else
+ ;
+#endif
+}
+static inline void pointer_add(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) {
+#ifdef NETDATA_PGC_POINTER_CHECK
+ netdata_mutex_lock(&cache->global_pointer_registry_mutex);
+ Pvoid_t *PValue = JudyHSIns(&cache->global_pointer_registry, &page, sizeof(void *), PJE0);
+ if(*PValue != NULL)
+ fatal("pointer already exists in registry");
+ *PValue = page;
+ netdata_mutex_unlock(&cache->global_pointer_registry_mutex);
+#else
+ ;
+#endif
+}
+
+static inline void pointer_check(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) {
+#ifdef NETDATA_PGC_POINTER_CHECK
+ netdata_mutex_lock(&cache->global_pointer_registry_mutex);
+ Pvoid_t *PValue = JudyHSGet(cache->global_pointer_registry, &page, sizeof(void *));
+ if(PValue == NULL)
+ fatal("pointer is not found in registry");
+ netdata_mutex_unlock(&cache->global_pointer_registry_mutex);
+#else
+ ;
+#endif
+}
+
+static inline void pointer_del(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) {
+#ifdef NETDATA_PGC_POINTER_CHECK
+ netdata_mutex_lock(&cache->global_pointer_registry_mutex);
+ int ret = JudyHSDel(&cache->global_pointer_registry, &page, sizeof(void *), PJE0);
+ if(!ret)
+ fatal("pointer to be deleted does not exist in registry");
+ netdata_mutex_unlock(&cache->global_pointer_registry_mutex);
+#else
+ ;
+#endif
+}
+
+// ----------------------------------------------------------------------------
+// locking
+
+static inline size_t pgc_indexing_partition(PGC *cache, Word_t metric_id) {
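+    // Per-thread memoization: consecutive calls for the same metric_id (the common
+    // case while collecting or querying a single metric) reuse the last computed
+    // partition instead of recomputing it.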
+ static __thread Word_t last_metric_id = 0;
+ static __thread size_t last_partition = 0;
+
+ if(metric_id == last_metric_id || cache->config.partitions == 1)
+ return last_partition;
+
+ last_metric_id = metric_id;
+ last_partition = indexing_partition(metric_id, cache->config.partitions);
+
+ return last_partition;
+}
+
+static inline void pgc_index_read_lock(PGC *cache, size_t partition) {
+ rw_spinlock_read_lock(&cache->index[partition].rw_spinlock);
+}
+static inline void pgc_index_read_unlock(PGC *cache, size_t partition) {
+ rw_spinlock_read_unlock(&cache->index[partition].rw_spinlock);
+}
+static inline void pgc_index_write_lock(PGC *cache, size_t partition) {
+ rw_spinlock_write_lock(&cache->index[partition].rw_spinlock);
+}
+static inline void pgc_index_write_unlock(PGC *cache, size_t partition) {
+ rw_spinlock_write_unlock(&cache->index[partition].rw_spinlock);
+}
+
+static inline bool pgc_ll_trylock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) {
+ return spinlock_trylock(&ll->spinlock);
+}
+
+static inline void pgc_ll_lock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) {
+ spinlock_lock(&ll->spinlock);
+}
+
+static inline void pgc_ll_unlock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) {
+ spinlock_unlock(&ll->spinlock);
+}
+
+static inline bool page_transition_trylock(PGC *cache __maybe_unused, PGC_PAGE *page) {
+ return spinlock_trylock(&page->transition_spinlock);
+}
+
+static inline void page_transition_lock(PGC *cache __maybe_unused, PGC_PAGE *page) {
+ spinlock_lock(&page->transition_spinlock);
+}
+
+static inline void page_transition_unlock(PGC *cache __maybe_unused, PGC_PAGE *page) {
+ spinlock_unlock(&page->transition_spinlock);
+}
+
+// ----------------------------------------------------------------------------
+// evictions control
+
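+// Returns the current cache usage as a per-mille ratio of the wanted (target) cache size,
+// and optionally how many bytes should be evicted to get back under the low threshold.
+// When 'size_to_evict' is requested, we block on the usage spinlock to compute a fresh value;
+// otherwise we only try the lock and fall back to the last cached ratio.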
+static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) {
+
+ if(size_to_evict)
+ spinlock_lock(&cache->usage.spinlock);
+
+ else if(!spinlock_trylock(&cache->usage.spinlock))
+ return __atomic_load_n(&cache->usage.per1000, __ATOMIC_RELAXED);
+
+ size_t current_cache_size;
+ size_t wanted_cache_size;
+ size_t per1000;
+
+ size_t dirty = __atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED);
+ size_t hot = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED);
+
+ if(cache->config.options & PGC_OPTIONS_AUTOSCALE) {
+ size_t dirty_max = __atomic_load_n(&cache->dirty.stats->max_size, __ATOMIC_RELAXED);
+ size_t hot_max = __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED);
+
+ // our promise to users
+ size_t max_size1 = MAX(hot_max, hot) * 2;
+
+ // protection against slow flushing
+ size_t max_size2 = hot_max + ((dirty_max < hot_max / 2) ? hot_max / 2 : dirty_max * 2);
+
+ // the final wanted cache size
+ wanted_cache_size = MIN(max_size1, max_size2);
+
+ if(cache->config.dynamic_target_size_cb) {
+ size_t wanted_cache_size_cb = cache->config.dynamic_target_size_cb();
+ if(wanted_cache_size_cb > wanted_cache_size)
+ wanted_cache_size = wanted_cache_size_cb;
+ }
+
+ if (wanted_cache_size < hot + dirty + cache->config.clean_size)
+ wanted_cache_size = hot + dirty + cache->config.clean_size;
+ }
+ else
+ wanted_cache_size = hot + dirty + cache->config.clean_size;
+
+    // protection against huge queries
+ // if huge queries are running, or huge amounts need to be saved
+ // allow the cache to grow more (hot pages in main cache are also referenced)
+ size_t referenced_size = __atomic_load_n(&cache->stats.referenced_size, __ATOMIC_RELAXED);
+ if(unlikely(wanted_cache_size < referenced_size * 2 / 3))
+ wanted_cache_size = referenced_size * 2 / 3;
+
+ current_cache_size = __atomic_load_n(&cache->stats.size, __ATOMIC_RELAXED); // + pgc_aral_overhead();
+
+ per1000 = (size_t)((unsigned long long)current_cache_size * 1000ULL / (unsigned long long)wanted_cache_size);
+
+ __atomic_store_n(&cache->usage.per1000, per1000, __ATOMIC_RELAXED);
+ __atomic_store_n(&cache->stats.wanted_cache_size, wanted_cache_size, __ATOMIC_RELAXED);
+ __atomic_store_n(&cache->stats.current_cache_size, current_cache_size, __ATOMIC_RELAXED);
+
+ spinlock_unlock(&cache->usage.spinlock);
+
+ if(size_to_evict) {
+ size_t target = (size_t)((unsigned long long)wanted_cache_size * (unsigned long long)cache->config.evict_low_threshold_per1000 / 1000ULL);
+ if(current_cache_size > target)
+ *size_to_evict = current_cache_size - target;
+ else
+ *size_to_evict = 0;
+ }
+
+ if(per1000 >= cache->config.severe_pressure_per1000)
+ __atomic_add_fetch(&cache->stats.events_cache_under_severe_pressure, 1, __ATOMIC_RELAXED);
+
+ else if(per1000 >= cache->config.aggressive_evict_per1000)
+ __atomic_add_fetch(&cache->stats.events_cache_needs_space_aggressively, 1, __ATOMIC_RELAXED);
+
+ return per1000;
+}
+
+static inline bool cache_pressure(PGC *cache, size_t limit) {
+ return (cache_usage_per1000(cache, NULL) >= limit);
+}
+
+#define cache_under_severe_pressure(cache) cache_pressure(cache, (cache)->config.severe_pressure_per1000)
+#define cache_needs_space_aggressively(cache) cache_pressure(cache, (cache)->config.aggressive_evict_per1000)
+#define cache_above_healthy_limit(cache) cache_pressure(cache, (cache)->config.healthy_size_per1000)
+
+typedef bool (*evict_filter)(PGC_PAGE *page, void *data);
+static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evict, bool wait, bool all_of_them, evict_filter filter, void *data);
+#define evict_pages(cache, max_skip, max_evict, wait, all_of_them) evict_pages_with_filter(cache, max_skip, max_evict, wait, all_of_them, NULL, NULL)
+
+static inline void evict_on_clean_page_added(PGC *cache __maybe_unused) {
+ if((cache->config.options & PGC_OPTIONS_EVICT_PAGES_INLINE) || cache_needs_space_aggressively(cache)) {
+ evict_pages(cache,
+ cache->config.max_skip_pages_per_inline_eviction,
+ cache->config.max_pages_per_inline_eviction,
+ false, false);
+ }
+}
+
+static inline void evict_on_page_release_when_permitted(PGC *cache __maybe_unused) {
+ if ((cache->config.options & PGC_OPTIONS_EVICT_PAGES_INLINE) || cache_under_severe_pressure(cache)) {
+ evict_pages(cache,
+ cache->config.max_skip_pages_per_inline_eviction,
+ cache->config.max_pages_per_inline_eviction,
+ false, false);
+ }
+}
+
+// ----------------------------------------------------------------------------
+// flushing control
+
+static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, bool wait, bool all_of_them);
+
+static inline bool flushing_critical(PGC *cache) {
+ if(unlikely(__atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED) > __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED))) {
+ __atomic_add_fetch(&cache->stats.events_flush_critical, 1, __ATOMIC_RELAXED);
+ return true;
+ }
+
+ return false;
+}
+
+// ----------------------------------------------------------------------------
+// helpers
+
+static inline size_t page_assumed_size(PGC *cache, size_t size) {
+ return size + (sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page + sizeof(Word_t) * 3);
+}
+
+static inline size_t page_size_from_assumed_size(PGC *cache, size_t assumed_size) {
+ return assumed_size - (sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page + sizeof(Word_t) * 3);
+}
+
+// ----------------------------------------------------------------------------
+// Linked list management
+
+static inline void atomic_set_max(size_t *max, size_t desired) {
+ size_t expected;
+
+ expected = __atomic_load_n(max, __ATOMIC_RELAXED);
+
+ do {
+
+ if(expected >= desired)
+ return;
+
+ } while(!__atomic_compare_exchange_n(max, &expected, desired,
+ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+}
+
+struct section_pages {
+ SPINLOCK migration_to_v2_spinlock;
+ size_t entries;
+ size_t size;
+ PGC_PAGE *base;
+};
+
+static ARAL *pgc_section_pages_aral = NULL;
+static void pgc_section_pages_static_aral_init(void) {
+ static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER;
+
+ if(unlikely(!pgc_section_pages_aral)) {
+ spinlock_lock(&spinlock);
+
+ // we have to check again
+ if(!pgc_section_pages_aral)
+ pgc_section_pages_aral = aral_create(
+ "pgc_section",
+ sizeof(struct section_pages),
+ 0,
+ 65536, NULL,
+ NULL, NULL, false, false);
+
+ spinlock_unlock(&spinlock);
+ }
+}
+
+static inline void pgc_stats_ll_judy_change(PGC *cache, struct pgc_linked_list *ll, size_t mem_before_judyl, size_t mem_after_judyl) {
+ if(mem_after_judyl > mem_before_judyl) {
+ __atomic_add_fetch(&ll->stats->size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED);
+ }
+ else if(mem_after_judyl < mem_before_judyl) {
+ __atomic_sub_fetch(&ll->stats->size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED);
+ }
+}
+
+static inline void pgc_stats_index_judy_change(PGC *cache, size_t mem_before_judyl, size_t mem_after_judyl) {
+ if(mem_after_judyl > mem_before_judyl) {
+ __atomic_add_fetch(&cache->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED);
+ }
+ else if(mem_after_judyl < mem_before_judyl) {
+ __atomic_sub_fetch(&cache->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED);
+ }
+}
+
+static void pgc_ll_add(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PGC_PAGE *page, bool having_lock) {
+ if(!having_lock)
+ pgc_ll_lock(cache, ll);
+
+ internal_fatal(page_get_status_flags(page) != 0,
+                   "DBENGINE CACHE: invalid page flags, the page has %d, but it should be %d",
+ page_get_status_flags(page),
+ 0);
+
+ if(ll->linked_list_in_sections_judy) {
+ size_t mem_before_judyl, mem_after_judyl;
+
+ mem_before_judyl = JudyLMemUsed(ll->sections_judy);
+ Pvoid_t *section_pages_pptr = JudyLIns(&ll->sections_judy, page->section, PJE0);
+ mem_after_judyl = JudyLMemUsed(ll->sections_judy);
+
+ struct section_pages *sp = *section_pages_pptr;
+ if(!sp) {
+ // sp = callocz(1, sizeof(struct section_pages));
+ sp = aral_mallocz(pgc_section_pages_aral);
+ memset(sp, 0, sizeof(struct section_pages));
+
+ *section_pages_pptr = sp;
+
+ mem_after_judyl += sizeof(struct section_pages);
+ }
+ pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl);
+
+ sp->entries++;
+ sp->size += page->assumed_size;
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(sp->base, page, link.prev, link.next);
+
+ if((sp->entries % cache->config.max_dirty_pages_per_call) == 0)
+ ll->version++;
+ }
+ else {
+ // CLEAN pages end up here.
+ // - New pages created as CLEAN, always have 1 access.
+ // - DIRTY pages made CLEAN, depending on their accesses may be appended (accesses > 0) or prepended (accesses = 0).
+
+ if(page->accesses || page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED) {
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next);
+ page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED);
+ }
+ else
+ DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next);
+
+ ll->version++;
+ }
+
+ page_flag_set(page, ll->flags);
+
+ if(!having_lock)
+ pgc_ll_unlock(cache, ll);
+
+ size_t entries = __atomic_add_fetch(&ll->stats->entries, 1, __ATOMIC_RELAXED);
+ size_t size = __atomic_add_fetch(&ll->stats->size, page->assumed_size, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ll->stats->added_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ll->stats->added_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ atomic_set_max(&ll->stats->max_entries, entries);
+ atomic_set_max(&ll->stats->max_size, size);
+}
+
+static void pgc_ll_del(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PGC_PAGE *page, bool having_lock) {
+ __atomic_sub_fetch(&ll->stats->entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&ll->stats->size, page->assumed_size, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ll->stats->removed_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ll->stats->removed_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ if(!having_lock)
+ pgc_ll_lock(cache, ll);
+
+ internal_fatal(page_get_status_flags(page) != ll->flags,
+                   "DBENGINE CACHE: invalid page flags, the page has %d, but it should be %d",
+ page_get_status_flags(page),
+ ll->flags);
+
+ page_flag_clear(page, ll->flags);
+
+ if(ll->linked_list_in_sections_judy) {
+ Pvoid_t *section_pages_pptr = JudyLGet(ll->sections_judy, page->section, PJE0);
+ internal_fatal(!section_pages_pptr, "DBENGINE CACHE: page should be in Judy LL, but it is not");
+
+ struct section_pages *sp = *section_pages_pptr;
+ sp->entries--;
+ sp->size -= page->assumed_size;
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(sp->base, page, link.prev, link.next);
+
+ if(!sp->base) {
+ size_t mem_before_judyl, mem_after_judyl;
+
+ mem_before_judyl = JudyLMemUsed(ll->sections_judy);
+ int rc = JudyLDel(&ll->sections_judy, page->section, PJE0);
+ mem_after_judyl = JudyLMemUsed(ll->sections_judy);
+
+ if(!rc)
+ fatal("DBENGINE CACHE: cannot delete section from Judy LL");
+
+ // freez(sp);
+ aral_freez(pgc_section_pages_aral, sp);
+ mem_after_judyl -= sizeof(struct section_pages);
+ pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl);
+ }
+ }
+ else {
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ll->base, page, link.prev, link.next);
+ ll->version++;
+ }
+
+ if(!having_lock)
+ pgc_ll_unlock(cache, ll);
+}
+
+static inline void page_has_been_accessed(PGC *cache, PGC_PAGE *page) {
+ PGC_PAGE_FLAGS flags = page_flag_check(page, PGC_PAGE_CLEAN | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES);
+
+ if (!(flags & PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES)) {
+ __atomic_add_fetch(&page->accesses, 1, __ATOMIC_RELAXED);
+
+ if (flags & PGC_PAGE_CLEAN) {
+ if(pgc_ll_trylock(cache, &cache->clean)) {
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
+ pgc_ll_unlock(cache, &cache->clean);
+ page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED);
+ }
+ else
+ page_flag_set(page, PGC_PAGE_HAS_BEEN_ACCESSED);
+ }
+ }
+}
+
+
+// ----------------------------------------------------------------------------
+// state transitions
+
+static inline void page_set_clean(PGC *cache, PGC_PAGE *page, bool having_transition_lock, bool having_clean_lock) {
+ if(!having_transition_lock)
+ page_transition_lock(cache, page);
+
+ PGC_PAGE_FLAGS flags = page_get_status_flags(page);
+
+ if(flags & PGC_PAGE_CLEAN) {
+ if(!having_transition_lock)
+ page_transition_unlock(cache, page);
+ return;
+ }
+
+ if(flags & PGC_PAGE_HOT)
+ pgc_ll_del(cache, &cache->hot, page, false);
+
+ if(flags & PGC_PAGE_DIRTY)
+ pgc_ll_del(cache, &cache->dirty, page, false);
+
+    // first add to linked list, then set the flag (required for move_page_last())
+ pgc_ll_add(cache, &cache->clean, page, having_clean_lock);
+
+ if(!having_transition_lock)
+ page_transition_unlock(cache, page);
+}
+
+static inline void page_set_dirty(PGC *cache, PGC_PAGE *page, bool having_hot_lock) {
+ if(!having_hot_lock)
+ // to avoid deadlocks, we have to get the hot lock before the page transition
+ // since this is what all_hot_to_dirty() does
+ pgc_ll_lock(cache, &cache->hot);
+
+ page_transition_lock(cache, page);
+
+ PGC_PAGE_FLAGS flags = page_get_status_flags(page);
+
+ if(flags & PGC_PAGE_DIRTY) {
+ page_transition_unlock(cache, page);
+
+ if(!having_hot_lock)
+ // we don't need the hot lock anymore
+ pgc_ll_unlock(cache, &cache->hot);
+
+ return;
+ }
+
+ __atomic_add_fetch(&cache->stats.hot2dirty_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.hot2dirty_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ if(likely(flags & PGC_PAGE_HOT))
+ pgc_ll_del(cache, &cache->hot, page, true);
+
+ if(!having_hot_lock)
+ // we don't need the hot lock anymore
+ pgc_ll_unlock(cache, &cache->hot);
+
+ if(unlikely(flags & PGC_PAGE_CLEAN))
+ pgc_ll_del(cache, &cache->clean, page, false);
+
+    // first add to linked list, then set the flag (required for move_page_last())
+ pgc_ll_add(cache, &cache->dirty, page, false);
+
+ __atomic_sub_fetch(&cache->stats.hot2dirty_entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.hot2dirty_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ page_transition_unlock(cache, page);
+}
+
+static inline void page_set_hot(PGC *cache, PGC_PAGE *page) {
+ page_transition_lock(cache, page);
+
+ PGC_PAGE_FLAGS flags = page_get_status_flags(page);
+
+ if(flags & PGC_PAGE_HOT) {
+ page_transition_unlock(cache, page);
+ return;
+ }
+
+ if(flags & PGC_PAGE_DIRTY)
+ pgc_ll_del(cache, &cache->dirty, page, false);
+
+ if(flags & PGC_PAGE_CLEAN)
+ pgc_ll_del(cache, &cache->clean, page, false);
+
+    // first add to linked list, then set the flag (required for move_page_last())
+ pgc_ll_add(cache, &cache->hot, page, false);
+
+ page_transition_unlock(cache, page);
+}
+
+
+// ----------------------------------------------------------------------------
+// Referencing
+
+static inline size_t PGC_REFERENCED_PAGES(PGC *cache) {
+ return __atomic_load_n(&cache->stats.referenced_entries, __ATOMIC_RELAXED);
+}
+
+static inline void PGC_REFERENCED_PAGES_PLUS1(PGC *cache, PGC_PAGE *page) {
+ __atomic_add_fetch(&cache->stats.referenced_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.referenced_size, page->assumed_size, __ATOMIC_RELAXED);
+}
+
+static inline void PGC_REFERENCED_PAGES_MINUS1(PGC *cache, size_t assumed_size) {
+ __atomic_sub_fetch(&cache->stats.referenced_entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.referenced_size, assumed_size, __ATOMIC_RELAXED);
+}
+
+// If the page is not already acquired,
+// YOU HAVE TO HAVE THE QUEUE (hot, dirty, clean) THE PAGE IS IN, L O C K E D !
+// If you don't have it locked, NOTHING PREVENTS THIS PAGE FROM VANISHING WHILE THIS IS CALLED!
+static inline bool page_acquire(PGC *cache, PGC_PAGE *page) {
+ __atomic_add_fetch(&cache->stats.acquires, 1, __ATOMIC_RELAXED);
+
+ REFCOUNT expected, desired;
+
+ expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED);
+ size_t spins = 0;
+
+ do {
+ spins++;
+
+ if(unlikely(expected < 0))
+ return false;
+
+ desired = expected + 1;
+
+ } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));
+
+ if(unlikely(spins > 1))
+ __atomic_add_fetch(&cache->stats.acquire_spins, spins - 1, __ATOMIC_RELAXED);
+
+ if(desired == 1)
+ PGC_REFERENCED_PAGES_PLUS1(cache, page);
+
+ return true;
+}
+
+static inline void page_release(PGC *cache, PGC_PAGE *page, bool evict_if_necessary) {
+ __atomic_add_fetch(&cache->stats.releases, 1, __ATOMIC_RELAXED);
+
+ size_t assumed_size = page->assumed_size; // take the size before we release it
+ REFCOUNT expected, desired;
+
+ expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED);
+
+ size_t spins = 0;
+ do {
+ spins++;
+
+ internal_fatal(expected <= 0,
+ "DBENGINE CACHE: trying to release a page with reference counter %d", expected);
+
+ desired = expected - 1;
+
+ } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED));
+
+ if(unlikely(spins > 1))
+ __atomic_add_fetch(&cache->stats.release_spins, spins - 1, __ATOMIC_RELAXED);
+
+ if(desired == 0) {
+ PGC_REFERENCED_PAGES_MINUS1(cache, assumed_size);
+
+ if(evict_if_necessary)
+ evict_on_page_release_when_permitted(cache);
+ }
+}
+
+static inline bool non_acquired_page_get_for_deletion___while_having_clean_locked(PGC *cache __maybe_unused, PGC_PAGE *page) {
+ __atomic_add_fetch(&cache->stats.acquires_for_deletion, 1, __ATOMIC_RELAXED);
+
+ internal_fatal(!is_page_clean(page),
+ "DBENGINE CACHE: only clean pages can be deleted");
+
+ REFCOUNT expected, desired;
+
+ expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED);
+ size_t spins = 0;
+ bool delete_it;
+
+ do {
+ spins++;
+
+ if (expected == 0) {
+ desired = REFCOUNT_DELETING;
+ delete_it = true;
+ }
+ else {
+ delete_it = false;
+ break;
+ }
+
+ } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED));
+
+ if(delete_it) {
+ // we can delete this page
+ internal_fatal(page_flag_check(page, PGC_PAGE_IS_BEING_DELETED),
+ "DBENGINE CACHE: page is already being deleted");
+
+ page_flag_set(page, PGC_PAGE_IS_BEING_DELETED);
+ }
+
+ if(unlikely(spins > 1))
+ __atomic_add_fetch(&cache->stats.delete_spins, spins - 1, __ATOMIC_RELAXED);
+
+ return delete_it;
+}
+
+static inline bool acquired_page_get_for_deletion_or_release_it(PGC *cache __maybe_unused, PGC_PAGE *page) {
+ __atomic_add_fetch(&cache->stats.acquires_for_deletion, 1, __ATOMIC_RELAXED);
+
+ size_t assumed_size = page->assumed_size; // take the size before we release it
+
+ REFCOUNT expected, desired;
+
+ expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED);
+ size_t spins = 0;
+ bool delete_it;
+
+ do {
+ spins++;
+
+ internal_fatal(expected < 1,
+ "DBENGINE CACHE: page to be deleted should be acquired by the caller.");
+
+ if (expected == 1) {
+ // we are the only one having this page referenced
+ desired = REFCOUNT_DELETING;
+ delete_it = true;
+ }
+ else {
+ // this page cannot be deleted
+ desired = expected - 1;
+ delete_it = false;
+ }
+
+ } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED));
+
+ if(delete_it) {
+ PGC_REFERENCED_PAGES_MINUS1(cache, assumed_size);
+
+ // we can delete this page
+ internal_fatal(page_flag_check(page, PGC_PAGE_IS_BEING_DELETED),
+ "DBENGINE CACHE: page is already being deleted");
+
+ page_flag_set(page, PGC_PAGE_IS_BEING_DELETED);
+ }
+
+ if(unlikely(spins > 1))
+ __atomic_add_fetch(&cache->stats.delete_spins, spins - 1, __ATOMIC_RELAXED);
+
+ return delete_it;
+}
+
+
+// ----------------------------------------------------------------------------
+// Indexing
+
+static inline void free_this_page(PGC *cache, PGC_PAGE *page, size_t partition __maybe_unused) {
+ // call the callback to free the user supplied memory
+ cache->config.pgc_free_clean_cb(cache, (PGC_ENTRY){
+ .section = page->section,
+ .metric_id = page->metric_id,
+ .start_time_s = page->start_time_s,
+ .end_time_s = __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED),
+ .update_every_s = page->update_every_s,
+ .size = page_size_from_assumed_size(cache, page->assumed_size),
+ .hot = (is_page_hot(page)) ? true : false,
+ .data = page->data,
+ .custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL,
+ });
+
+ // update statistics
+ __atomic_add_fetch(&cache->stats.removed_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.removed_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ __atomic_sub_fetch(&cache->stats.entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.size, page->assumed_size, __ATOMIC_RELAXED);
+
+ // free our memory
+#ifdef PGC_WITH_ARAL
+ aral_freez(cache->aral[partition], page);
+#else
+ freez(page);
+#endif
+}
+
+static void remove_this_page_from_index_unsafe(PGC *cache, PGC_PAGE *page, size_t partition) {
+ // remove it from the Judy arrays
+
+ pointer_check(cache, page);
+
+ internal_fatal(page_flag_check(page, PGC_PAGE_HOT | PGC_PAGE_DIRTY | PGC_PAGE_CLEAN),
+ "DBENGINE CACHE: page to be removed from the cache is still in the linked-list");
+
+ internal_fatal(!page_flag_check(page, PGC_PAGE_IS_BEING_DELETED),
+ "DBENGINE CACHE: page to be removed from the index, is not marked for deletion");
+
+ internal_fatal(partition != pgc_indexing_partition(cache, page->metric_id),
+ "DBENGINE CACHE: attempted to remove this page from the wrong partition of the cache");
+
+ Pvoid_t *metrics_judy_pptr = JudyLGet(cache->index[partition].sections_judy, page->section, PJE0);
+ if(unlikely(!metrics_judy_pptr))
+ fatal("DBENGINE CACHE: section '%lu' should exist, but it does not.", page->section);
+
+ Pvoid_t *pages_judy_pptr = JudyLGet(*metrics_judy_pptr, page->metric_id, PJE0);
+ if(unlikely(!pages_judy_pptr))
+ fatal("DBENGINE CACHE: metric '%lu' in section '%lu' should exist, but it does not.",
+ page->metric_id, page->section);
+
+ Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, page->start_time_s, PJE0);
+ if(unlikely(!page_ptr))
+ fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' should exist, but it does not.",
+ page->start_time_s, page->metric_id, page->section);
+
+ PGC_PAGE *found_page = *page_ptr;
+ if(unlikely(found_page != page))
+ fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' should exist, but the index returned a different address.",
+ page->start_time_s, page->metric_id, page->section);
+
+ size_t mem_before_judyl = 0, mem_after_judyl = 0;
+
+ mem_before_judyl += JudyLMemUsed(*pages_judy_pptr);
+ if(unlikely(!JudyLDel(pages_judy_pptr, page->start_time_s, PJE0)))
+ fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' exists, but cannot be deleted.",
+ page->start_time_s, page->metric_id, page->section);
+ mem_after_judyl += JudyLMemUsed(*pages_judy_pptr);
+
+ mem_before_judyl += JudyLMemUsed(*metrics_judy_pptr);
+ if(!*pages_judy_pptr && !JudyLDel(metrics_judy_pptr, page->metric_id, PJE0))
+ fatal("DBENGINE CACHE: metric '%lu' in section '%lu' exists and is empty, but cannot be deleted.",
+ page->metric_id, page->section);
+ mem_after_judyl += JudyLMemUsed(*metrics_judy_pptr);
+
+ mem_before_judyl += JudyLMemUsed(cache->index[partition].sections_judy);
+ if(!*metrics_judy_pptr && !JudyLDel(&cache->index[partition].sections_judy, page->section, PJE0))
+ fatal("DBENGINE CACHE: section '%lu' exists and is empty, but cannot be deleted.", page->section);
+ mem_after_judyl += JudyLMemUsed(cache->index[partition].sections_judy);
+
+ pgc_stats_index_judy_change(cache, mem_before_judyl, mem_after_judyl);
+
+ pointer_del(cache, page);
+}
+
+static inline void remove_and_free_page_not_in_any_queue_and_acquired_for_deletion(PGC *cache, PGC_PAGE *page) {
+ size_t partition = pgc_indexing_partition(cache, page->metric_id);
+ pgc_index_write_lock(cache, partition);
+ remove_this_page_from_index_unsafe(cache, page, partition);
+ pgc_index_write_unlock(cache, partition);
+ free_this_page(cache, page, partition);
+}
+
+static inline bool make_acquired_page_clean_and_evict_or_page_release(PGC *cache, PGC_PAGE *page) {
+ pointer_check(cache, page);
+
+ page_transition_lock(cache, page);
+ pgc_ll_lock(cache, &cache->clean);
+
+ // make it clean - it does not have any accesses, so it will be prepended
+ page_set_clean(cache, page, true, true);
+
+ if(!acquired_page_get_for_deletion_or_release_it(cache, page)) {
+ pgc_ll_unlock(cache, &cache->clean);
+ page_transition_unlock(cache, page);
+ return false;
+ }
+
+ // remove it from the linked list
+ pgc_ll_del(cache, &cache->clean, page, true);
+ pgc_ll_unlock(cache, &cache->clean);
+ page_transition_unlock(cache, page);
+
+ remove_and_free_page_not_in_any_queue_and_acquired_for_deletion(cache, page);
+
+ return true;
+}
+
+// returns true, when there is more work to do
+static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evict, bool wait, bool all_of_them, evict_filter filter, void *data) {
+ size_t per1000 = cache_usage_per1000(cache, NULL);
+
+ if(!all_of_them && per1000 < cache->config.healthy_size_per1000)
+ // don't bother - not enough to do anything
+ return false;
+
+ size_t workers_running = __atomic_add_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED);
+ if(!wait && !all_of_them && workers_running > cache->config.max_workers_evict_inline && per1000 < cache->config.severe_pressure_per1000) {
+ __atomic_sub_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED);
+ return false;
+ }
+
+ internal_fatal(cache->clean.linked_list_in_sections_judy,
+ "wrong clean pages configuration - clean pages need to have a linked list, not a judy array");
+
+ if(unlikely(!max_skip))
+ max_skip = SIZE_MAX;
+ else if(unlikely(max_skip < 2))
+ max_skip = 2;
+
+ if(unlikely(!max_evict))
+ max_evict = SIZE_MAX;
+ else if(unlikely(max_evict < 2))
+ max_evict = 2;
+
+ size_t total_pages_evicted = 0;
+ size_t total_pages_skipped = 0;
+ bool stopped_before_finishing = false;
+ size_t spins = 0;
+
+ do {
+ if(++spins > 1)
+ __atomic_add_fetch(&cache->stats.evict_spins, 1, __ATOMIC_RELAXED);
+
+ bool batch;
+ size_t max_size_to_evict = 0;
+ if (unlikely(all_of_them)) {
+ max_size_to_evict = SIZE_MAX;
+ batch = true;
+ }
+ else if(unlikely(wait)) {
+ per1000 = cache_usage_per1000(cache, &max_size_to_evict);
+ batch = (wait && per1000 > cache->config.severe_pressure_per1000) ? true : false;
+ }
+ else {
+ batch = false;
+ max_size_to_evict = (cache_above_healthy_limit(cache)) ? 1 : 0;
+ }
+
+ if (!max_size_to_evict)
+ break;
+
+ // check if we have to stop
+ if(total_pages_evicted >= max_evict && !all_of_them) {
+ stopped_before_finishing = true;
+ break;
+ }
+
+ if(!all_of_them && !wait) {
+ if(!pgc_ll_trylock(cache, &cache->clean)) {
+ stopped_before_finishing = true;
+ goto premature_exit;
+ }
+
+ // at this point we have the clean lock
+ }
+ else
+ pgc_ll_lock(cache, &cache->clean);
+
+ // find a page to evict
+ PGC_PAGE *pages_to_evict = NULL;
+ size_t pages_to_evict_size = 0;
+ for(PGC_PAGE *page = cache->clean.base, *next = NULL, *first_page_we_relocated = NULL; page ; page = next) {
+ next = page->link.next;
+
+ if(unlikely(page == first_page_we_relocated))
+ // we did a complete loop on all pages
+ break;
+
+ if(unlikely(page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED)) {
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
+ page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED);
+ continue;
+ }
+
+ if(unlikely(filter && !filter(page, data)))
+ continue;
+
+ if(non_acquired_page_get_for_deletion___while_having_clean_locked(cache, page)) {
+ // we can delete this page
+
+ // remove it from the clean list
+ pgc_ll_del(cache, &cache->clean, page, true);
+
+ __atomic_add_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.evicting_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next);
+
+ pages_to_evict_size += page->assumed_size;
+
+ if(unlikely(all_of_them || (batch && pages_to_evict_size < max_size_to_evict)))
+ // get more pages
+ ;
+ else
+ // one page at a time
+ break;
+ }
+ else {
+ // we can't delete this page
+
+ if(!first_page_we_relocated)
+ first_page_we_relocated = page;
+
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next);
+
+ // check if we have to stop
+ if(unlikely(++total_pages_skipped >= max_skip && !all_of_them)) {
+ stopped_before_finishing = true;
+ break;
+ }
+ }
+ }
+ pgc_ll_unlock(cache, &cache->clean);
+
+ if(likely(pages_to_evict)) {
+ // remove them from the index
+
+ if(unlikely(pages_to_evict->link.next)) {
+ // we have many pages, let's minimize the index locks we are going to get
+
+ PGC_PAGE *pages_per_partition[cache->config.partitions];
+ memset(pages_per_partition, 0, sizeof(PGC_PAGE *) * cache->config.partitions);
+
+ // sort them by partition
+ for (PGC_PAGE *page = pages_to_evict, *next = NULL; page; page = next) {
+ next = page->link.next;
+
+ size_t partition = pgc_indexing_partition(cache, page->metric_id);
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_per_partition[partition], page, link.prev, link.next);
+ }
+
+ // remove them from the index
+ for (size_t partition = 0; partition < cache->config.partitions; partition++) {
+ if (!pages_per_partition[partition]) continue;
+
+ pgc_index_write_lock(cache, partition);
+
+ for (PGC_PAGE *page = pages_per_partition[partition]; page; page = page->link.next)
+ remove_this_page_from_index_unsafe(cache, page, partition);
+
+ pgc_index_write_unlock(cache, partition);
+ }
+
+ // free them
+ for (size_t partition = 0; partition < cache->config.partitions; partition++) {
+ if (!pages_per_partition[partition]) continue;
+
+ for (PGC_PAGE *page = pages_per_partition[partition], *next = NULL; page; page = next) {
+ next = page->link.next;
+
+ size_t page_size = page->assumed_size;
+ free_this_page(cache, page, partition);
+
+ __atomic_sub_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.evicting_size, page_size, __ATOMIC_RELAXED);
+
+ total_pages_evicted++;
+ }
+ }
+ }
+ else {
+ // just one page to be evicted
+ PGC_PAGE *page = pages_to_evict;
+
+ size_t page_size = page->assumed_size;
+
+ size_t partition = pgc_indexing_partition(cache, page->metric_id);
+ pgc_index_write_lock(cache, partition);
+ remove_this_page_from_index_unsafe(cache, page, partition);
+ pgc_index_write_unlock(cache, partition);
+ free_this_page(cache, page, partition);
+
+ __atomic_sub_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.evicting_size, page_size, __ATOMIC_RELAXED);
+
+ total_pages_evicted++;
+ }
+ }
+ else
+ break;
+
+ } while(all_of_them || (total_pages_evicted < max_evict && total_pages_skipped < max_skip));
+
+ if(all_of_them && !filter) {
+ pgc_ll_lock(cache, &cache->clean);
+ if(cache->clean.stats->entries) {
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
+ "DBENGINE CACHE: cannot free all clean pages, %zu are still in the clean queue",
+ cache->clean.stats->entries);
+ }
+ pgc_ll_unlock(cache, &cache->clean);
+ }
+
+premature_exit:
+ if(unlikely(total_pages_skipped))
+ __atomic_add_fetch(&cache->stats.evict_skipped, total_pages_skipped, __ATOMIC_RELAXED);
+
+ __atomic_sub_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED);
+
+ return stopped_before_finishing;
+}
+
+static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) {
+ internal_fatal(entry->start_time_s < 0 || entry->end_time_s < 0,
+ "DBENGINE CACHE: timestamps are negative");
+
+ __atomic_add_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED);
+
+ size_t partition = pgc_indexing_partition(cache, entry->metric_id);
+
+#ifdef PGC_WITH_ARAL
+ PGC_PAGE *allocation = aral_mallocz(cache->aral[partition]);
+#endif
+ PGC_PAGE *page;
+ size_t spins = 0;
+
+ if(unlikely(entry->start_time_s < 0))
+ entry->start_time_s = 0;
+
+ if(unlikely(entry->end_time_s < 0))
+ entry->end_time_s = 0;
+
+ do {
+ if(++spins > 1)
+ __atomic_add_fetch(&cache->stats.insert_spins, 1, __ATOMIC_RELAXED);
+
+ pgc_index_write_lock(cache, partition);
+
+ size_t mem_before_judyl = 0, mem_after_judyl = 0;
+
+ mem_before_judyl += JudyLMemUsed(cache->index[partition].sections_judy);
+ Pvoid_t *metrics_judy_pptr = JudyLIns(&cache->index[partition].sections_judy, entry->section, PJE0);
+ if(unlikely(!metrics_judy_pptr || metrics_judy_pptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted sections judy array");
+ mem_after_judyl += JudyLMemUsed(cache->index[partition].sections_judy);
+
+ mem_before_judyl += JudyLMemUsed(*metrics_judy_pptr);
+ Pvoid_t *pages_judy_pptr = JudyLIns(metrics_judy_pptr, entry->metric_id, PJE0);
+ if(unlikely(!pages_judy_pptr || pages_judy_pptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted pages judy array");
+ mem_after_judyl += JudyLMemUsed(*metrics_judy_pptr);
+
+ mem_before_judyl += JudyLMemUsed(*pages_judy_pptr);
+ Pvoid_t *page_ptr = JudyLIns(pages_judy_pptr, entry->start_time_s, PJE0);
+ if(unlikely(!page_ptr || page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in judy array");
+ mem_after_judyl += JudyLMemUsed(*pages_judy_pptr);
+
+ pgc_stats_index_judy_change(cache, mem_before_judyl, mem_after_judyl);
+
+ page = *page_ptr;
+
+ if (likely(!page)) {
+#ifdef PGC_WITH_ARAL
+ page = allocation;
+ allocation = NULL;
+#else
+ page = mallocz(sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page);
+#endif
+ page->refcount = 1;
+ page->accesses = (entry->hot) ? 0 : 1;
+ page->flags = 0;
+ page->section = entry->section;
+ page->metric_id = entry->metric_id;
+ page->start_time_s = entry->start_time_s;
+            page->end_time_s = entry->end_time_s;
+            page->update_every_s = entry->update_every_s;
+ page->data = entry->data;
+ page->assumed_size = page_assumed_size(cache, entry->size);
+ spinlock_init(&page->transition_spinlock);
+ page->link.prev = NULL;
+ page->link.next = NULL;
+
+ if(cache->config.additional_bytes_per_page) {
+ if(entry->custom_data)
+ memcpy(page->custom_data, entry->custom_data, cache->config.additional_bytes_per_page);
+ else
+ memset(page->custom_data, 0, cache->config.additional_bytes_per_page);
+ }
+
+ // put it in the index
+ *page_ptr = page;
+ pointer_add(cache, page);
+ pgc_index_write_unlock(cache, partition);
+
+ if (entry->hot)
+ page_set_hot(cache, page);
+ else
+ page_set_clean(cache, page, false, false);
+
+ PGC_REFERENCED_PAGES_PLUS1(cache, page);
+
+ // update statistics
+ __atomic_add_fetch(&cache->stats.added_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.added_size, page->assumed_size, __ATOMIC_RELAXED);
+
+ __atomic_add_fetch(&cache->stats.entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.size, page->assumed_size, __ATOMIC_RELAXED);
+
+ if(added)
+ *added = true;
+ }
+ else {
+ if (!page_acquire(cache, page))
+ page = NULL;
+
+ else if(added)
+ *added = false;
+
+ pgc_index_write_unlock(cache, partition);
+
+ if(unlikely(!page)) {
+ // now that we don't have the lock,
+ // give it some time for the old page to go away
+ struct timespec ns = { .tv_sec = 0, .tv_nsec = 1 };
+ nanosleep(&ns, NULL);
+ }
+ }
+
+ } while(!page);
+
+#ifdef PGC_WITH_ARAL
+ if(allocation)
+ aral_freez(cache->aral[partition], allocation);
+#endif
+
+ __atomic_sub_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED);
+
+ if(!entry->hot)
+ evict_on_clean_page_added(cache);
+
+ if((cache->config.options & PGC_OPTIONS_FLUSH_PAGES_INLINE) || flushing_critical(cache)) {
+ flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL,
+ false, false);
+ }
+
+ return page;
+}
+
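+// page_find_and_acquire() looks up a page of (section, metric_id) relative to start_time_s.
+// PGC_SEARCH_CLOSEST first tries an exact match, then the latest page starting before the
+// timestamp (accepted only if it still covers it), and finally the first page starting after it.
+// On a hit the page is returned acquired and its access counter is updated.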
+static PGC_PAGE *page_find_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method) {
+ __atomic_add_fetch(&cache->stats.workers_search, 1, __ATOMIC_RELAXED);
+
+ size_t *stats_hit_ptr, *stats_miss_ptr;
+
+ if(method == PGC_SEARCH_CLOSEST) {
+ __atomic_add_fetch(&cache->stats.searches_closest, 1, __ATOMIC_RELAXED);
+ stats_hit_ptr = &cache->stats.searches_closest_hits;
+ stats_miss_ptr = &cache->stats.searches_closest_misses;
+ }
+ else {
+ __atomic_add_fetch(&cache->stats.searches_exact, 1, __ATOMIC_RELAXED);
+ stats_hit_ptr = &cache->stats.searches_exact_hits;
+ stats_miss_ptr = &cache->stats.searches_exact_misses;
+ }
+
+ PGC_PAGE *page = NULL;
+ size_t partition = pgc_indexing_partition(cache, metric_id);
+
+ pgc_index_read_lock(cache, partition);
+
+ Pvoid_t *metrics_judy_pptr = JudyLGet(cache->index[partition].sections_judy, section, PJE0);
+ if(unlikely(metrics_judy_pptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted sections judy array");
+
+ if(unlikely(!metrics_judy_pptr)) {
+ // section does not exist
+ goto cleanup;
+ }
+
+ Pvoid_t *pages_judy_pptr = JudyLGet(*metrics_judy_pptr, metric_id, PJE0);
+ if(unlikely(pages_judy_pptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted pages judy array");
+
+ if(unlikely(!pages_judy_pptr)) {
+ // metric does not exist
+ goto cleanup;
+ }
+
+ switch(method) {
+ default:
+ case PGC_SEARCH_CLOSEST: {
+ Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, start_time_s, PJE0);
+ if (unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array");
+
+ if (page_ptr)
+ page = *page_ptr;
+
+ else {
+ Word_t time = start_time_s;
+
+ // find the previous page
+ page_ptr = JudyLPrev(*pages_judy_pptr, &time, PJE0);
+ if(unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array #2");
+
+ if(page_ptr) {
+ // found a page starting before our timestamp
+ // check if our timestamp is included
+ page = *page_ptr;
+ if(start_time_s > page->end_time_s)
+ // it is not good for us
+ page = NULL;
+ }
+
+ if(!page) {
+ // find the next page then...
+ time = start_time_s;
+ page_ptr = JudyLNext(*pages_judy_pptr, &time, PJE0);
+ if(page_ptr)
+ page = *page_ptr;
+ }
+ }
+ }
+ break;
+
+ case PGC_SEARCH_EXACT: {
+ Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, start_time_s, PJE0);
+ if (unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array");
+
+ if (page_ptr)
+ page = *page_ptr;
+ }
+ break;
+
+ case PGC_SEARCH_FIRST: {
+ Word_t time = start_time_s;
+ Pvoid_t *page_ptr = JudyLFirst(*pages_judy_pptr, &time, PJE0);
+ if (unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array");
+
+ if (page_ptr)
+ page = *page_ptr;
+ }
+ break;
+
+ case PGC_SEARCH_NEXT: {
+ Word_t time = start_time_s;
+ Pvoid_t *page_ptr = JudyLNext(*pages_judy_pptr, &time, PJE0);
+ if (unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array");
+
+ if (page_ptr)
+ page = *page_ptr;
+ }
+ break;
+
+ case PGC_SEARCH_LAST: {
+ Word_t time = start_time_s;
+ Pvoid_t *page_ptr = JudyLLast(*pages_judy_pptr, &time, PJE0);
+ if (unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array");
+
+ if (page_ptr)
+ page = *page_ptr;
+ }
+ break;
+
+ case PGC_SEARCH_PREV: {
+ Word_t time = start_time_s;
+ Pvoid_t *page_ptr = JudyLPrev(*pages_judy_pptr, &time, PJE0);
+ if (unlikely(page_ptr == PJERR))
+ fatal("DBENGINE CACHE: corrupted page in pages judy array");
+
+ if (page_ptr)
+ page = *page_ptr;
+ }
+ break;
+ }
+
+ if(page) {
+ pointer_check(cache, page);
+
+ if(!page_acquire(cache, page)) {
+ // this page is not good to use
+ page = NULL;
+ }
+ }
+
+cleanup:
+ pgc_index_read_unlock(cache, partition);
+
+ if(page) {
+ __atomic_add_fetch(stats_hit_ptr, 1, __ATOMIC_RELAXED);
+ page_has_been_accessed(cache, page);
+ }
+ else
+ __atomic_add_fetch(stats_miss_ptr, 1, __ATOMIC_RELAXED);
+
+ __atomic_sub_fetch(&cache->stats.workers_search, 1, __ATOMIC_RELAXED);
+
+ return page;
+}
+
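+// all_hot_pages_to_dirty() walks the hot queue of one section (or of all sections) and turns
+// every page it can acquire into a dirty page; it is used when everything has to be flushed
+// (shutdown, or a forced flush of a section).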
+static void all_hot_pages_to_dirty(PGC *cache, Word_t section) {
+ pgc_ll_lock(cache, &cache->hot);
+
+ bool first = true;
+ Word_t last_section = (section == PGC_SECTION_ALL) ? 0 : section;
+ Pvoid_t *section_pages_pptr;
+ while ((section_pages_pptr = JudyLFirstThenNext(cache->hot.sections_judy, &last_section, &first))) {
+ if(section != PGC_SECTION_ALL && last_section != section)
+ break;
+
+ struct section_pages *sp = *section_pages_pptr;
+
+ PGC_PAGE *page = sp->base;
+ while(page) {
+ PGC_PAGE *next = page->link.next;
+
+ if(page_acquire(cache, page)) {
+ page_set_dirty(cache, page, true);
+ page_release(cache, page, false);
+ // page ptr may be invalid now
+ }
+
+ page = next;
+ }
+ }
+ pgc_ll_unlock(cache, &cache->hot);
+}
+
+// returns true when there is more work to do
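+// flush_pages() walks the dirty queue section by section, collecting batches of up to
+// config.max_dirty_pages_per_call pages (each one acquired and transition-locked), removes each
+// full batch from the dirty queue, drops the dirty lock, hands the batch to pgc_save_dirty_cb(),
+// and finally marks the saved pages clean and releases them.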
+static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, bool wait, bool all_of_them) {
+ internal_fatal(!cache->dirty.linked_list_in_sections_judy,
+ "wrong dirty pages configuration - dirty pages need to have a judy array, not a linked list");
+
+ if(!all_of_them && !wait) {
+ // we have been called from a data collection thread
+ // let's not waste its time...
+
+ if(!pgc_ll_trylock(cache, &cache->dirty)) {
+ // we would block, so give up...
+ return true;
+ }
+
+ // we got the lock at this point
+ }
+ else
+ pgc_ll_lock(cache, &cache->dirty);
+
+ size_t optimal_flush_size = cache->config.max_dirty_pages_per_call;
+ size_t dirty_version_at_entry = cache->dirty.version;
+ if(!all_of_them && (cache->dirty.stats->entries < optimal_flush_size || cache->dirty.last_version_checked == dirty_version_at_entry)) {
+ pgc_ll_unlock(cache, &cache->dirty);
+ return false;
+ }
+
+ __atomic_add_fetch(&cache->stats.workers_flush, 1, __ATOMIC_RELAXED);
+
+ bool have_dirty_lock = true;
+
+ if(all_of_them || !max_flushes)
+ max_flushes = SIZE_MAX;
+
+ Word_t last_section = (section == PGC_SECTION_ALL) ? 0 : section;
+ size_t flushes_so_far = 0;
+ Pvoid_t *section_pages_pptr;
+ bool stopped_before_finishing = false;
+ size_t spins = 0;
+ bool first = true;
+
+ while (have_dirty_lock && (section_pages_pptr = JudyLFirstThenNext(cache->dirty.sections_judy, &last_section, &first))) {
+ if(section != PGC_SECTION_ALL && last_section != section)
+ break;
+
+ struct section_pages *sp = *section_pages_pptr;
+ if(!all_of_them && sp->entries < optimal_flush_size)
+ continue;
+
+ if(!all_of_them && flushes_so_far > max_flushes) {
+ stopped_before_finishing = true;
+ break;
+ }
+
+ if(++spins > 1)
+ __atomic_add_fetch(&cache->stats.flush_spins, 1, __ATOMIC_RELAXED);
+
+ PGC_ENTRY array[optimal_flush_size];
+ PGC_PAGE *pages[optimal_flush_size];
+ size_t pages_added = 0, pages_added_size = 0;
+ size_t pages_removed_dirty = 0, pages_removed_dirty_size = 0;
+ size_t pages_cancelled = 0, pages_cancelled_size = 0;
+ size_t pages_made_clean = 0, pages_made_clean_size = 0;
+
+ PGC_PAGE *page = sp->base;
+ while (page && pages_added < optimal_flush_size) {
+ PGC_PAGE *next = page->link.next;
+
+ internal_fatal(page_get_status_flags(page) != PGC_PAGE_DIRTY,
+                           "DBENGINE CACHE: page should be in the dirty list before being saved");
+
+ if (page_acquire(cache, page)) {
+ internal_fatal(page_get_status_flags(page) != PGC_PAGE_DIRTY,
+                               "DBENGINE CACHE: page should be in the dirty list before being saved");
+
+ internal_fatal(page->section != last_section,
+ "DBENGINE CACHE: dirty page is not in the right section (tier)");
+
+ if(!page_transition_trylock(cache, page)) {
+ page_release(cache, page, false);
+ // page ptr may be invalid now
+ }
+ else {
+ pages[pages_added] = page;
+ array[pages_added] = (PGC_ENTRY) {
+ .section = page->section,
+ .metric_id = page->metric_id,
+ .start_time_s = page->start_time_s,
+ .end_time_s = __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED),
+ .update_every_s = page->update_every_s,
+ .size = page_size_from_assumed_size(cache, page->assumed_size),
+ .data = page->data,
+ .custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL,
+ .hot = false,
+ };
+
+ pages_added_size += page->assumed_size;
+ pages_added++;
+ }
+ }
+
+ page = next;
+ }
+
+ // do we have enough to save?
+ if(all_of_them || pages_added == optimal_flush_size) {
+ // we should do it
+
+ for (size_t i = 0; i < pages_added; i++) {
+ PGC_PAGE *tpg = pages[i];
+
+ internal_fatal(page_get_status_flags(tpg) != PGC_PAGE_DIRTY,
+                               "DBENGINE CACHE: page should be in the dirty list before being saved");
+
+ __atomic_add_fetch(&cache->stats.flushing_entries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.flushing_size, tpg->assumed_size, __ATOMIC_RELAXED);
+
+ // remove it from the dirty list
+ pgc_ll_del(cache, &cache->dirty, tpg, true);
+
+ pages_removed_dirty_size += tpg->assumed_size;
+ pages_removed_dirty++;
+ }
+
+ // next time, repeat the same section (tier)
+ first = true;
+ }
+ else {
+ // we can't do it
+
+ for (size_t i = 0; i < pages_added; i++) {
+ PGC_PAGE *tpg = pages[i];
+
+ internal_fatal(page_get_status_flags(tpg) != PGC_PAGE_DIRTY,
+                               "DBENGINE CACHE: page should be in the dirty list before being saved");
+
+ pages_cancelled_size += tpg->assumed_size;
+ pages_cancelled++;
+
+ page_transition_unlock(cache, tpg);
+ page_release(cache, tpg, false);
+ // page ptr may be invalid now
+ }
+
+ __atomic_add_fetch(&cache->stats.flushes_cancelled, pages_cancelled, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.flushes_cancelled_size, pages_cancelled_size, __ATOMIC_RELAXED);
+
+ internal_fatal(pages_added != pages_cancelled || pages_added_size != pages_cancelled_size,
+ "DBENGINE CACHE: flushing cancel pages mismatch");
+
+ // next time, continue to the next section (tier)
+ first = false;
+ continue;
+ }
+
+ if(cache->config.pgc_save_init_cb)
+ cache->config.pgc_save_init_cb(cache, last_section);
+
+ pgc_ll_unlock(cache, &cache->dirty);
+ have_dirty_lock = false;
+
+ // call the callback to save them
+ // it may take some time, so let's release the lock
+ cache->config.pgc_save_dirty_cb(cache, array, pages, pages_added);
+ flushes_so_far++;
+
+ __atomic_add_fetch(&cache->stats.flushes_completed, pages_added, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.flushes_completed_size, pages_added_size, __ATOMIC_RELAXED);
+
+ size_t pages_to_evict = 0; (void)pages_to_evict;
+ for (size_t i = 0; i < pages_added; i++) {
+ PGC_PAGE *tpg = pages[i];
+
+ internal_fatal(page_get_status_flags(tpg) != 0,
+ "DBENGINE CACHE: page should not be in any list while it is being saved");
+
+ __atomic_sub_fetch(&cache->stats.flushing_entries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&cache->stats.flushing_size, tpg->assumed_size, __ATOMIC_RELAXED);
+
+ pages_made_clean_size += tpg->assumed_size;
+ pages_made_clean++;
+
+ if(!tpg->accesses)
+ pages_to_evict++;
+
+ page_set_clean(cache, tpg, true, false);
+ page_transition_unlock(cache, tpg);
+ page_release(cache, tpg, false);
+ // tpg ptr may be invalid now
+ }
+
+ internal_fatal(pages_added != pages_made_clean || pages_added != pages_removed_dirty ||
+ pages_added_size != pages_made_clean_size || pages_added_size != pages_removed_dirty_size
+ , "DBENGINE CACHE: flushing pages mismatch");
+
+ if(!all_of_them && !wait) {
+ if(pgc_ll_trylock(cache, &cache->dirty))
+ have_dirty_lock = true;
+
+ else {
+ stopped_before_finishing = true;
+ have_dirty_lock = false;
+ }
+ }
+ else {
+ pgc_ll_lock(cache, &cache->dirty);
+ have_dirty_lock = true;
+ }
+ }
+
+ if(have_dirty_lock) {
+ if(!stopped_before_finishing && dirty_version_at_entry > cache->dirty.last_version_checked)
+ cache->dirty.last_version_checked = dirty_version_at_entry;
+
+ pgc_ll_unlock(cache, &cache->dirty);
+ }
+
+ __atomic_sub_fetch(&cache->stats.workers_flush, 1, __ATOMIC_RELAXED);
+
+ return stopped_before_finishing;
+}
+
+void free_all_unreferenced_clean_pages(PGC *cache) {
+ evict_pages(cache, 0, 0, true, true);
+}
+
+// ----------------------------------------------------------------------------
+// public API
+
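+// A minimal usage sketch of the public API below (it mirrors pgc_unittest() at the end of this
+// file; the callback names, the data pointer and the sizes are illustrative, not prescribed):
+//
+//     PGC *cache = pgc_create("example", 32 * 1024 * 1024, my_free_clean_cb,
+//                             64, NULL, my_save_dirty_cb,
+//                             10, 10, 1000, 10,
+//                             PGC_OPTIONS_DEFAULT, 1, 0);
+//
+//     PGC_PAGE *page = pgc_page_add_and_acquire(cache, (PGC_ENTRY){
+//         .section = 1, .metric_id = 10,
+//         .start_time_s = 100, .end_time_s = 100,
+//         .update_every_s = 1, .size = 4096, .data = my_page_buffer,
+//         .hot = true,
+//     }, NULL);
+//
+//     pgc_page_hot_set_end_time_s(cache, page, 200);    // extend the page while collecting
+//     pgc_page_hot_to_dirty_and_release(cache, page);   // queue it for flushing via my_save_dirty_cb
+//     pgc_destroy(cache);                               // flushes dirty pages and frees the cache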
+PGC *pgc_create(const char *name,
+ size_t clean_size_bytes, free_clean_page_callback pgc_free_cb,
+ size_t max_dirty_pages_per_flush,
+ save_dirty_init_callback pgc_save_init_cb,
+ save_dirty_page_callback pgc_save_dirty_cb,
+ size_t max_pages_per_inline_eviction, size_t max_inline_evictors,
+ size_t max_skip_pages_per_inline_eviction,
+ size_t max_flushes_inline,
+ PGC_OPTIONS options, size_t partitions, size_t additional_bytes_per_page) {
+
+ if(max_pages_per_inline_eviction < 2)
+ max_pages_per_inline_eviction = 2;
+
+ if(max_dirty_pages_per_flush < 1)
+ max_dirty_pages_per_flush = 1;
+
+ if(max_flushes_inline * max_dirty_pages_per_flush < 2)
+ max_flushes_inline = 2;
+
+ PGC *cache = callocz(1, sizeof(PGC));
+ strncpyz(cache->config.name, name, PGC_NAME_MAX);
+ cache->config.options = options;
+ cache->config.clean_size = (clean_size_bytes < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : clean_size_bytes;
+ cache->config.pgc_free_clean_cb = pgc_free_cb;
+ cache->config.max_dirty_pages_per_call = max_dirty_pages_per_flush;
+ cache->config.pgc_save_init_cb = pgc_save_init_cb;
+ cache->config.pgc_save_dirty_cb = pgc_save_dirty_cb;
+ cache->config.max_pages_per_inline_eviction = max_pages_per_inline_eviction;
+ cache->config.max_skip_pages_per_inline_eviction = (max_skip_pages_per_inline_eviction < 2) ? 2 : max_skip_pages_per_inline_eviction;
+ cache->config.max_flushes_inline = (max_flushes_inline < 1) ? 1 : max_flushes_inline;
+ cache->config.partitions = partitions < 1 ? (size_t)get_netdata_cpus() : partitions;
+ cache->config.additional_bytes_per_page = additional_bytes_per_page;
+
+ cache->config.max_workers_evict_inline = max_inline_evictors;
+ cache->config.severe_pressure_per1000 = 1010;
+ cache->config.aggressive_evict_per1000 = 990;
+ cache->config.healthy_size_per1000 = 980;
+ cache->config.evict_low_threshold_per1000 = 970;
+
+ cache->index = callocz(cache->config.partitions, sizeof(struct pgc_index));
+
+ for(size_t part = 0; part < cache->config.partitions ; part++)
+ rw_spinlock_init(&cache->index[part].rw_spinlock);
+
+ spinlock_init(&cache->hot.spinlock);
+ spinlock_init(&cache->dirty.spinlock);
+ spinlock_init(&cache->clean.spinlock);
+
+ cache->hot.flags = PGC_PAGE_HOT;
+ cache->hot.linked_list_in_sections_judy = true;
+ cache->hot.stats = &cache->stats.queues.hot;
+
+ cache->dirty.flags = PGC_PAGE_DIRTY;
+ cache->dirty.linked_list_in_sections_judy = true;
+ cache->dirty.stats = &cache->stats.queues.dirty;
+
+ cache->clean.flags = PGC_PAGE_CLEAN;
+ cache->clean.linked_list_in_sections_judy = false;
+ cache->clean.stats = &cache->stats.queues.clean;
+
+ pgc_section_pages_static_aral_init();
+
+#ifdef PGC_WITH_ARAL
+ cache->aral = callocz(cache->config.partitions, sizeof(ARAL *));
+ for(size_t part = 0; part < cache->config.partitions ; part++) {
+ char buf[100 +1];
+ snprintfz(buf, sizeof(buf) - 1, "%s[%zu]", name, part);
+ cache->aral[part] = aral_create(
+ buf,
+ sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page,
+ 0,
+ 16384,
+ aral_statistics(pgc_section_pages_aral),
+ NULL, NULL, false, false);
+ }
+#endif
+
+ pointer_index_init(cache);
+
+ return cache;
+}
+
+struct aral_statistics *pgc_aral_statistics(void) {
+ return aral_statistics(pgc_section_pages_aral);
+}
+
+size_t pgc_aral_structures(void) {
+ return aral_structures(pgc_section_pages_aral);
+}
+
+size_t pgc_aral_overhead(void) {
+ return aral_overhead(pgc_section_pages_aral);
+}
+
+void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section) {
+ all_hot_pages_to_dirty(cache, section);
+
+ // save all dirty pages to make them clean
+ flush_pages(cache, 0, section, true, true);
+}
+
+void pgc_destroy(PGC *cache) {
+ // convert all hot pages to dirty
+ all_hot_pages_to_dirty(cache, PGC_SECTION_ALL);
+
+ // save all dirty pages to make them clean
+ flush_pages(cache, 0, PGC_SECTION_ALL, true, true);
+
+ // free all unreferenced clean pages
+ free_all_unreferenced_clean_pages(cache);
+
+ if(PGC_REFERENCED_PAGES(cache))
+ netdata_log_error("DBENGINE CACHE: there are %zu referenced cache pages - leaving the cache allocated", PGC_REFERENCED_PAGES(cache));
+ else {
+ pointer_destroy_index(cache);
+
+// for(size_t part = 0; part < cache->config.partitions ; part++)
+// netdata_rwlock_destroy(&cache->index[part].rw_spinlock);
+
+#ifdef PGC_WITH_ARAL
+ for(size_t part = 0; part < cache->config.partitions ; part++)
+ aral_destroy(cache->aral[part]);
+
+ freez(cache->aral);
+#endif
+ freez(cache->index);
+ freez(cache);
+ }
+}
+
+PGC_PAGE *pgc_page_add_and_acquire(PGC *cache, PGC_ENTRY entry, bool *added) {
+ return page_add(cache, &entry, added);
+}
+
+PGC_PAGE *pgc_page_dup(PGC *cache, PGC_PAGE *page) {
+ if(!page_acquire(cache, page))
+ fatal("DBENGINE CACHE: tried to dup a page that is not acquired!");
+
+ return page;
+}
+
+void pgc_page_release(PGC *cache, PGC_PAGE *page) {
+ page_release(cache, page, is_page_clean(page));
+}
+
+void pgc_page_hot_to_dirty_and_release(PGC *cache, PGC_PAGE *page) {
+ __atomic_add_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED);
+
+//#ifdef NETDATA_INTERNAL_CHECKS
+// page_transition_lock(cache, page);
+// internal_fatal(!is_page_hot(page), "DBENGINE CACHE: called %s() but page is not hot", __FUNCTION__ );
+// page_transition_unlock(cache, page);
+//#endif
+
+ // make page dirty
+ page_set_dirty(cache, page, false);
+
+ // release the page
+ page_release(cache, page, true);
+ // page ptr may be invalid now
+
+ __atomic_sub_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED);
+
+ // flush, if we have to
+ if((cache->config.options & PGC_OPTIONS_FLUSH_PAGES_INLINE) || flushing_critical(cache)) {
+ flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL,
+ false, false);
+ }
+}
+
+bool pgc_page_to_clean_evict_or_release(PGC *cache, PGC_PAGE *page) {
+ bool ret;
+
+ __atomic_add_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED);
+
+ // prevent accesses from increasing the accesses counter
+ page_flag_set(page, PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES);
+
+ // zero the accesses counter
+ __atomic_store_n(&page->accesses, 0, __ATOMIC_RELEASE);
+
+ // if there are no other references to it, evict it immediately
+ if(make_acquired_page_clean_and_evict_or_page_release(cache, page)) {
+ __atomic_add_fetch(&cache->stats.hot_empty_pages_evicted_immediately, 1, __ATOMIC_RELAXED);
+ ret = true;
+ }
+ else {
+ __atomic_add_fetch(&cache->stats.hot_empty_pages_evicted_later, 1, __ATOMIC_RELAXED);
+ ret = false;
+ }
+
+ __atomic_sub_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED);
+
+ return ret;
+}
+
+Word_t pgc_page_section(PGC_PAGE *page) {
+ return page->section;
+}
+
+Word_t pgc_page_metric(PGC_PAGE *page) {
+ return page->metric_id;
+}
+
+time_t pgc_page_start_time_s(PGC_PAGE *page) {
+ return page->start_time_s;
+}
+
+time_t pgc_page_end_time_s(PGC_PAGE *page) {
+ return page->end_time_s;
+}
+
+time_t pgc_page_update_every_s(PGC_PAGE *page) {
+ return page->update_every_s;
+}
+
+time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s) {
+ if(page->update_every_s == 0)
+ page->update_every_s = (uint32_t) update_every_s;
+
+ return page->update_every_s;
+}
+
+time_t pgc_page_fix_end_time_s(PGC_PAGE *page, time_t end_time_s) {
+ page->end_time_s = end_time_s;
+ return page->end_time_s;
+}
+
+void *pgc_page_data(PGC_PAGE *page) {
+ return page->data;
+}
+
+void *pgc_page_custom_data(PGC *cache, PGC_PAGE *page) {
+ if(cache->config.additional_bytes_per_page)
+ return page->custom_data;
+
+ return NULL;
+}
+
+size_t pgc_page_data_size(PGC *cache, PGC_PAGE *page) {
+ return page_size_from_assumed_size(cache, page->assumed_size);
+}
+
+bool pgc_is_page_hot(PGC_PAGE *page) {
+ return is_page_hot(page);
+}
+
+bool pgc_is_page_dirty(PGC_PAGE *page) {
+ return is_page_dirty(page);
+}
+
+bool pgc_is_page_clean(PGC_PAGE *page) {
+ return is_page_clean(page);
+}
+
+void pgc_reset_hot_max(PGC *cache) {
+ size_t entries = __atomic_load_n(&cache->hot.stats->entries, __ATOMIC_RELAXED);
+ size_t size = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED);
+
+ __atomic_store_n(&cache->hot.stats->max_entries, entries, __ATOMIC_RELAXED);
+ __atomic_store_n(&cache->hot.stats->max_size, size, __ATOMIC_RELAXED);
+
+ size_t size_to_evict = 0;
+ cache_usage_per1000(cache, &size_to_evict);
+ evict_pages(cache, 0, 0, true, false);
+}
+
+void pgc_set_dynamic_target_cache_size_callback(PGC *cache, dynamic_target_cache_size_callback callback) {
+ cache->config.dynamic_target_size_cb = callback;
+
+ size_t size_to_evict = 0;
+ cache_usage_per1000(cache, &size_to_evict);
+ evict_pages(cache, 0, 0, true, false);
+}
+
+size_t pgc_get_current_cache_size(PGC *cache) {
+ cache_usage_per1000(cache, NULL);
+ return __atomic_load_n(&cache->stats.current_cache_size, __ATOMIC_RELAXED);
+}
+
+size_t pgc_get_wanted_cache_size(PGC *cache) {
+ cache_usage_per1000(cache, NULL);
+ return __atomic_load_n(&cache->stats.wanted_cache_size, __ATOMIC_RELAXED);
+}
+
+bool pgc_evict_pages(PGC *cache, size_t max_skip, size_t max_evict) {
+ bool under_pressure = cache_needs_space_aggressively(cache);
+ return evict_pages(cache,
+ under_pressure ? 0 : max_skip,
+ under_pressure ? 0 : max_evict,
+ true, false);
+}
+
+bool pgc_flush_pages(PGC *cache, size_t max_flushes) {
+ bool under_pressure = flushing_critical(cache);
+ return flush_pages(cache, under_pressure ? 0 : max_flushes, PGC_SECTION_ALL, true, false);
+}
+
+void pgc_page_hot_set_end_time_s(PGC *cache __maybe_unused, PGC_PAGE *page, time_t end_time_s) {
+ internal_fatal(!is_page_hot(page),
+ "DBENGINE CACHE: end_time_s update on non-hot page");
+
+ internal_fatal(end_time_s < __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED),
+ "DBENGINE CACHE: end_time_s is not bigger than existing");
+
+ __atomic_store_n(&page->end_time_s, end_time_s, __ATOMIC_RELAXED);
+
+#ifdef PGC_COUNT_POINTS_COLLECTED
+ __atomic_add_fetch(&cache->stats.points_collected, 1, __ATOMIC_RELAXED);
+#endif
+}
+
+PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method) {
+ return page_find_and_acquire(cache, section, metric_id, start_time_s, method);
+}
+
+struct pgc_statistics pgc_get_statistics(PGC *cache) {
+ // FIXME - get the statistics atomically
+ return cache->stats;
+}
+
+size_t pgc_hot_and_dirty_entries(PGC *cache) {
+ size_t entries = 0;
+
+ entries += __atomic_load_n(&cache->hot.stats->entries, __ATOMIC_RELAXED);
+ entries += __atomic_load_n(&cache->dirty.stats->entries, __ATOMIC_RELAXED);
+ entries += __atomic_load_n(&cache->stats.flushing_entries, __ATOMIC_RELAXED);
+ entries += __atomic_load_n(&cache->stats.hot2dirty_entries, __ATOMIC_RELAXED);
+
+ return entries;
+}
+
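+// pgc_open_cache_to_journal_v2() collects the hot pages of one section that belong to a given
+// datafile, groups them per metric and per extent into temporary JudyL arrays, passes the result
+// to the migration callback (which builds the journal v2 index), and then releases every migrated
+// page as dirty so it can be flushed and eventually evicted.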
+void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_fileno, uint8_t type, migrate_to_v2_callback cb, void *data) {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_indexing_started, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED);
+
+ pgc_ll_lock(cache, &cache->hot);
+
+ Pvoid_t JudyL_metrics = NULL;
+ Pvoid_t JudyL_extents_pos = NULL;
+
+ size_t count_of_unique_extents = 0;
+ size_t count_of_unique_metrics = 0;
+ size_t count_of_unique_pages = 0;
+
+ size_t master_extent_index_id = 0;
+
+ Pvoid_t *section_pages_pptr = JudyLGet(cache->hot.sections_judy, section, PJE0);
+ if(!section_pages_pptr) {
+ pgc_ll_unlock(cache, &cache->hot);
+ return;
+ }
+
+ struct section_pages *sp = *section_pages_pptr;
+ if(!spinlock_trylock(&sp->migration_to_v2_spinlock)) {
+ netdata_log_info("DBENGINE: migration to journal v2 for datafile %u is postponed, another jv2 indexer is already running for this section", datafile_fileno);
+ pgc_ll_unlock(cache, &cache->hot);
+ return;
+ }
+
+ ARAL *ar_mi = aral_by_size_acquire(sizeof(struct jv2_metrics_info));
+ ARAL *ar_pi = aral_by_size_acquire(sizeof(struct jv2_page_info));
+ ARAL *ar_ei = aral_by_size_acquire(sizeof(struct jv2_extents_info));
+
+ for(PGC_PAGE *page = sp->base; page ; page = page->link.next) {
+ struct extent_io_data *xio = (struct extent_io_data *)page->custom_data;
+ if(xio->fileno != datafile_fileno) continue;
+
+ if(page_flag_check(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2)) {
+ internal_fatal(true, "Migration to journal v2: page has already been migrated to v2");
+ continue;
+ }
+
+ if(!page_transition_trylock(cache, page)) {
+ internal_fatal(true, "Migration to journal v2: cannot get page transition lock");
+ continue;
+ }
+
+ if(!page_acquire(cache, page)) {
+ internal_fatal(true, "Migration to journal v2: cannot acquire page for migration to v2");
+ continue;
+ }
+
+ page_flag_set(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2);
+
+ pgc_ll_unlock(cache, &cache->hot);
+
+ // update the extents JudyL
+
+ size_t current_extent_index_id;
+ Pvoid_t *PValue = JudyLIns(&JudyL_extents_pos, xio->pos, PJE0);
+ if(!PValue || *PValue == PJERR)
+ fatal("Corrupted JudyL extents pos");
+
+ struct jv2_extents_info *ei;
+ if(!*PValue) {
+ ei = aral_mallocz(ar_ei); // callocz(1, sizeof(struct jv2_extents_info));
+ ei->pos = xio->pos;
+ ei->bytes = xio->bytes;
+ ei->number_of_pages = 1;
+ ei->index = master_extent_index_id++;
+ *PValue = ei;
+
+ count_of_unique_extents++;
+ }
+ else {
+ ei = *PValue;
+ ei->number_of_pages++;
+ }
+
+ current_extent_index_id = ei->index;
+
+ // update the metrics JudyL
+
+ PValue = JudyLIns(&JudyL_metrics, page->metric_id, PJE0);
+ if(!PValue || *PValue == PJERR)
+ fatal("Corrupted JudyL metrics");
+
+ struct jv2_metrics_info *mi;
+ if(!*PValue) {
+ mi = aral_mallocz(ar_mi); // callocz(1, sizeof(struct jv2_metrics_info));
+ mi->uuid = mrg_metric_uuid(main_mrg, (METRIC *)page->metric_id);
+ mi->first_time_s = page->start_time_s;
+ mi->last_time_s = page->end_time_s;
+ mi->number_of_pages = 1;
+ mi->page_list_header = 0;
+ mi->JudyL_pages_by_start_time = NULL;
+ *PValue = mi;
+
+ count_of_unique_metrics++;
+ }
+ else {
+ mi = *PValue;
+ mi->number_of_pages++;
+ if(page->start_time_s < mi->first_time_s)
+ mi->first_time_s = page->start_time_s;
+ if(page->end_time_s > mi->last_time_s)
+ mi->last_time_s = page->end_time_s;
+ }
+
+ PValue = JudyLIns(&mi->JudyL_pages_by_start_time, page->start_time_s, PJE0);
+ if(!PValue || *PValue == PJERR)
+ fatal("Corrupted JudyL metric pages");
+
+ if(!*PValue) {
+ struct jv2_page_info *pi = aral_mallocz(ar_pi); // callocz(1, (sizeof(struct jv2_page_info)));
+ pi->start_time_s = page->start_time_s;
+ pi->end_time_s = page->end_time_s;
+ pi->update_every_s = page->update_every_s;
+ pi->page_length = page_size_from_assumed_size(cache, page->assumed_size);
+ pi->page = page;
+ pi->extent_index = current_extent_index_id;
+ pi->custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL;
+ *PValue = pi;
+
+ count_of_unique_pages++;
+ }
+ else {
+ // impossible situation
+ internal_fatal(true, "Page is already in JudyL metric pages");
+ page_flag_clear(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2);
+ page_transition_unlock(cache, page);
+ page_release(cache, page, false);
+ }
+
+ pgc_ll_lock(cache, &cache->hot);
+ }
+
+ spinlock_unlock(&sp->migration_to_v2_spinlock);
+ pgc_ll_unlock(cache, &cache->hot);
+
+ // callback
+ cb(section, datafile_fileno, type, JudyL_metrics, JudyL_extents_pos, count_of_unique_extents, count_of_unique_metrics, count_of_unique_pages, data);
+
+ {
+ Pvoid_t *PValue1;
+ bool metric_id_first = true;
+ Word_t metric_id = 0;
+ while ((PValue1 = JudyLFirstThenNext(JudyL_metrics, &metric_id, &metric_id_first))) {
+ struct jv2_metrics_info *mi = *PValue1;
+
+ Pvoid_t *PValue2;
+ bool start_time_first = true;
+ Word_t start_time = 0;
+ while ((PValue2 = JudyLFirstThenNext(mi->JudyL_pages_by_start_time, &start_time, &start_time_first))) {
+ struct jv2_page_info *pi = *PValue2;
+ page_transition_unlock(cache, pi->page);
+ pgc_page_hot_to_dirty_and_release(cache, pi->page);
+ // make_acquired_page_clean_and_evict_or_page_release(cache, pi->page);
+ aral_freez(ar_pi, pi);
+ }
+
+ JudyLFreeArray(&mi->JudyL_pages_by_start_time, PJE0);
+ aral_freez(ar_mi, mi);
+ }
+ JudyLFreeArray(&JudyL_metrics, PJE0);
+ }
+
+ {
+ Pvoid_t *PValue;
+ bool extent_pos_first = true;
+ Word_t extent_pos = 0;
+ while ((PValue = JudyLFirstThenNext(JudyL_extents_pos, &extent_pos, &extent_pos_first))) {
+ struct jv2_extents_info *ei = *PValue;
+ aral_freez(ar_ei, ei);
+ }
+ JudyLFreeArray(&JudyL_extents_pos, PJE0);
+ }
+
+ aral_by_size_release(ar_ei);
+ aral_by_size_release(ar_pi);
+ aral_by_size_release(ar_mi);
+
+ __atomic_sub_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED);
+}
+
+static bool match_page_data(PGC_PAGE *page, void *data) {
+ return (page->data == data);
+}
+
+void pgc_open_evict_clean_pages_of_datafile(PGC *cache, struct rrdengine_datafile *datafile) {
+ evict_pages_with_filter(cache, 0, 0, true, true, match_page_data, datafile);
+}
+
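+// The two helpers below scan the clean and the hot queues respectively, counting pages whose
+// ->data pointer matches the given ptr (typically a datafile), so callers can tell whether the
+// open cache still references that datafile.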
+size_t pgc_count_clean_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr) {
+ size_t found = 0;
+
+ pgc_ll_lock(cache, &cache->clean);
+ for(PGC_PAGE *page = cache->clean.base; page ;page = page->link.next)
+ found += (page->data == ptr && page->section == section) ? 1 : 0;
+ pgc_ll_unlock(cache, &cache->clean);
+
+ return found;
+}
+
+size_t pgc_count_hot_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr) {
+ size_t found = 0;
+
+ pgc_ll_lock(cache, &cache->hot);
+ Pvoid_t *section_pages_pptr = JudyLGet(cache->hot.sections_judy, section, PJE0);
+ if(section_pages_pptr) {
+ struct section_pages *sp = *section_pages_pptr;
+ for(PGC_PAGE *page = sp->base; page ;page = page->link.next)
+ found += (page->data == ptr) ? 1 : 0;
+ }
+ pgc_ll_unlock(cache, &cache->hot);
+
+ return found;
+}
+
+// ----------------------------------------------------------------------------
+// unittest
+
+static void unittest_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) {
+ ;
+}
+
+static void unittest_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) {
+ ;
+}
+
+#ifdef PGC_STRESS_TEST
+
+struct {
+ bool stop;
+ PGC *cache;
+ PGC_PAGE **metrics;
+ size_t clean_metrics;
+ size_t hot_metrics;
+ time_t first_time_t;
+ time_t last_time_t;
+ size_t cache_size;
+ size_t query_threads;
+ size_t collect_threads;
+ size_t partitions;
+ size_t points_per_page;
+ time_t time_per_collection_ut;
+ time_t time_per_query_ut;
+ time_t time_per_flush_ut;
+ PGC_OPTIONS options;
+ char rand_statebufs[1024];
+ struct random_data *random_data;
+} pgc_uts = {
+ .stop = false,
+ .metrics = NULL,
+ .clean_metrics = 100000,
+ .hot_metrics = 1000000,
+ .first_time_t = 100000000,
+ .last_time_t = 0,
+ .cache_size = 0, // get the default (8MB)
+ .collect_threads = 16,
+ .query_threads = 16,
+ .partitions = 0, // get the default (system cpus)
+ .options = PGC_OPTIONS_AUTOSCALE,/* PGC_OPTIONS_FLUSH_PAGES_INLINE | PGC_OPTIONS_EVICT_PAGES_INLINE,*/
+ .points_per_page = 10,
+ .time_per_collection_ut = 1000000,
+ .time_per_query_ut = 250,
+ .time_per_flush_ut = 100,
+ .rand_statebufs = {},
+ .random_data = NULL,
+};
+
+void *unittest_stress_test_collector(void *ptr) {
+ size_t id = *((size_t *)ptr);
+
+ size_t metric_start = pgc_uts.clean_metrics;
+ size_t metric_end = pgc_uts.clean_metrics + pgc_uts.hot_metrics;
+ size_t number_of_metrics = metric_end - metric_start;
+ size_t per_collector_metrics = number_of_metrics / pgc_uts.collect_threads;
+ metric_start = metric_start + per_collector_metrics * id + 1;
+ metric_end = metric_start + per_collector_metrics - 1;
+
+ time_t start_time_t = pgc_uts.first_time_t + 1;
+
+ heartbeat_t hb;
+ heartbeat_init(&hb);
+
+ while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) {
+ // netdata_log_info("COLLECTOR %zu: collecting metrics %zu to %zu, from %ld to %lu", id, metric_start, metric_end, start_time_t, start_time_t + pgc_uts.points_per_page);
+
+ netdata_thread_disable_cancelability();
+
+ for (size_t i = metric_start; i < metric_end; i++) {
+ bool added;
+
+ pgc_uts.metrics[i] = pgc_page_add_and_acquire(pgc_uts.cache, (PGC_ENTRY) {
+ .section = 1,
+ .metric_id = i,
+                .start_time_s = start_time_t,
+                .end_time_s = start_time_t,
+                .update_every_s = 1,
+ .size = 4096,
+ .data = NULL,
+ .hot = true,
+ }, &added);
+
+ if(!pgc_is_page_hot(pgc_uts.metrics[i]) || !added) {
+ pgc_page_release(pgc_uts.cache, pgc_uts.metrics[i]);
+ pgc_uts.metrics[i] = NULL;
+ }
+ }
+
+ time_t end_time_t = start_time_t + (time_t)pgc_uts.points_per_page;
+ while(++start_time_t <= end_time_t && !__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) {
+ heartbeat_next(&hb, pgc_uts.time_per_collection_ut);
+
+ for (size_t i = metric_start; i < metric_end; i++) {
+ if(pgc_uts.metrics[i])
+                    pgc_page_hot_set_end_time_s(pgc_uts.cache, pgc_uts.metrics[i], start_time_t);
+ }
+
+ __atomic_store_n(&pgc_uts.last_time_t, start_time_t, __ATOMIC_RELAXED);
+ }
+
+ for (size_t i = metric_start; i < metric_end; i++) {
+ if (pgc_uts.metrics[i]) {
+ if(i % 10 == 0)
+ pgc_page_to_clean_evict_or_release(pgc_uts.cache, pgc_uts.metrics[i]);
+ else
+ pgc_page_hot_to_dirty_and_release(pgc_uts.cache, pgc_uts.metrics[i]);
+ }
+ }
+
+ netdata_thread_enable_cancelability();
+ }
+
+ return ptr;
+}
+
+void *unittest_stress_test_queries(void *ptr) {
+ size_t id = *((size_t *)ptr);
+ struct random_data *random_data = &pgc_uts.random_data[id];
+
+ size_t start = 0;
+ size_t end = pgc_uts.clean_metrics + pgc_uts.hot_metrics;
+
+ while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) {
+ netdata_thread_disable_cancelability();
+
+ int32_t random_number;
+ random_r(random_data, &random_number);
+
+ size_t metric_id = random_number % (end - start);
+ time_t start_time_t = pgc_uts.first_time_t;
+ time_t end_time_t = __atomic_load_n(&pgc_uts.last_time_t, __ATOMIC_RELAXED);
+ if(end_time_t <= start_time_t)
+ end_time_t = start_time_t + 1;
+ size_t pages = (end_time_t - start_time_t) / pgc_uts.points_per_page + 1;
+
+ PGC_PAGE *array[pages];
+ for(size_t i = 0; i < pages ;i++)
+ array[i] = NULL;
+
+ // find the pages the cache has
+ for(size_t i = 0; i < pages ;i++) {
+ time_t page_start_time = start_time_t + (time_t)(i * pgc_uts.points_per_page);
+ array[i] = pgc_page_get_and_acquire(pgc_uts.cache, 1, metric_id,
+ page_start_time, (i < pages - 1)?PGC_SEARCH_EXACT:PGC_SEARCH_CLOSEST);
+ }
+
+ // load the rest of the pages
+ for(size_t i = 0; i < pages ;i++) {
+ if(array[i]) continue;
+
+ time_t page_start_time = start_time_t + (time_t)(i * pgc_uts.points_per_page);
+ array[i] = pgc_page_add_and_acquire(pgc_uts.cache, (PGC_ENTRY) {
+ .section = 1,
+ .metric_id = metric_id,
+                        .start_time_s = page_start_time,
+                        .end_time_s = page_start_time + (time_t)pgc_uts.points_per_page,
+                        .update_every_s = 1,
+ .size = 4096,
+ .data = NULL,
+ .hot = false,
+ }, NULL);
+ }
+
+ // do the query
+ // ...
+ struct timespec work_duration = {.tv_sec = 0, .tv_nsec = pgc_uts.time_per_query_ut * NSEC_PER_USEC };
+ nanosleep(&work_duration, NULL);
+
+ // release the pages
+ for(size_t i = 0; i < pages ;i++) {
+ if(!array[i]) continue;
+ pgc_page_release(pgc_uts.cache, array[i]);
+ array[i] = NULL;
+ }
+
+ netdata_thread_enable_cancelability();
+ }
+
+ return ptr;
+}
+
+void *unittest_stress_test_service(void *ptr) {
+ heartbeat_t hb;
+ heartbeat_init(&hb);
+ while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) {
+ heartbeat_next(&hb, 1 * USEC_PER_SEC);
+
+ pgc_flush_pages(pgc_uts.cache, 1000);
+ pgc_evict_pages(pgc_uts.cache, 0, 0);
+ }
+ return ptr;
+}
+
+static void unittest_stress_test_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) {
+ // netdata_log_info("SAVE %zu pages", entries);
+ if(!pgc_uts.stop) {
+ usec_t t = pgc_uts.time_per_flush_ut;
+
+ if(t > 0) {
+ struct timespec work_duration = {
+ .tv_sec = t / USEC_PER_SEC,
+ .tv_nsec = (long) ((t % USEC_PER_SEC) * NSEC_PER_USEC)
+ };
+
+ nanosleep(&work_duration, NULL);
+ }
+ }
+}
+
+void unittest_stress_test(void) {
+    // the cache name and the max_inline_evictors value (10) below are illustrative
+    pgc_uts.cache = pgc_create("pgc-stress-test",
+                               pgc_uts.cache_size * 1024 * 1024,
+                               unittest_free_clean_page_callback,
+                               64, NULL, unittest_stress_test_save_dirty_page_callback,
+                               1000, 10, 10000, 1,
+                               pgc_uts.options, pgc_uts.partitions, 0);
+
+ pgc_uts.metrics = callocz(pgc_uts.clean_metrics + pgc_uts.hot_metrics, sizeof(PGC_PAGE *));
+
+ pthread_t service_thread;
+ netdata_thread_create(&service_thread, "SERVICE",
+ NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG,
+ unittest_stress_test_service, NULL);
+
+ pthread_t collect_threads[pgc_uts.collect_threads];
+ size_t collect_thread_ids[pgc_uts.collect_threads];
+ for(size_t i = 0; i < pgc_uts.collect_threads ;i++) {
+ collect_thread_ids[i] = i;
+ char buffer[100 + 1];
+ snprintfz(buffer, sizeof(buffer) - 1, "COLLECT_%zu", i);
+ netdata_thread_create(&collect_threads[i], buffer,
+ NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG,
+ unittest_stress_test_collector, &collect_thread_ids[i]);
+ }
+
+ pthread_t queries_threads[pgc_uts.query_threads];
+ size_t query_thread_ids[pgc_uts.query_threads];
+ pgc_uts.random_data = callocz(pgc_uts.query_threads, sizeof(struct random_data));
+ for(size_t i = 0; i < pgc_uts.query_threads ;i++) {
+ query_thread_ids[i] = i;
+ char buffer[100 + 1];
+ snprintfz(buffer, sizeof(buffer) - 1, "QUERY_%zu", i);
+ initstate_r(1, pgc_uts.rand_statebufs, 1024, &pgc_uts.random_data[i]);
+ netdata_thread_create(&queries_threads[i], buffer,
+ NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG,
+ unittest_stress_test_queries, &query_thread_ids[i]);
+ }
+
+ heartbeat_t hb;
+ heartbeat_init(&hb);
+
+ struct {
+ size_t entries;
+ size_t added;
+ size_t deleted;
+ size_t referenced;
+
+ size_t hot_entries;
+ size_t hot_added;
+ size_t hot_deleted;
+
+ size_t dirty_entries;
+ size_t dirty_added;
+ size_t dirty_deleted;
+
+ size_t clean_entries;
+ size_t clean_added;
+ size_t clean_deleted;
+
+ size_t searches_exact;
+ size_t searches_exact_hits;
+ size_t searches_closest;
+ size_t searches_closest_hits;
+
+ size_t collections;
+
+ size_t events_cache_under_severe_pressure;
+ size_t events_cache_needs_space_90;
+ size_t events_flush_critical;
+ } stats = {}, old_stats = {};
+
+ for(int i = 0; i < 86400 ;i++) {
+ heartbeat_next(&hb, 1 * USEC_PER_SEC);
+
+ old_stats = stats;
+ stats.entries = __atomic_load_n(&pgc_uts.cache->stats.entries, __ATOMIC_RELAXED);
+ stats.added = __atomic_load_n(&pgc_uts.cache->stats.added_entries, __ATOMIC_RELAXED);
+ stats.deleted = __atomic_load_n(&pgc_uts.cache->stats.removed_entries, __ATOMIC_RELAXED);
+ stats.referenced = __atomic_load_n(&pgc_uts.cache->stats.referenced_entries, __ATOMIC_RELAXED);
+
+ stats.hot_entries = __atomic_load_n(&pgc_uts.cache->hot.stats->entries, __ATOMIC_RELAXED);
+ stats.hot_added = __atomic_load_n(&pgc_uts.cache->hot.stats->added_entries, __ATOMIC_RELAXED);
+ stats.hot_deleted = __atomic_load_n(&pgc_uts.cache->hot.stats->removed_entries, __ATOMIC_RELAXED);
+
+ stats.dirty_entries = __atomic_load_n(&pgc_uts.cache->dirty.stats->entries, __ATOMIC_RELAXED);
+ stats.dirty_added = __atomic_load_n(&pgc_uts.cache->dirty.stats->added_entries, __ATOMIC_RELAXED);
+ stats.dirty_deleted = __atomic_load_n(&pgc_uts.cache->dirty.stats->removed_entries, __ATOMIC_RELAXED);
+
+ stats.clean_entries = __atomic_load_n(&pgc_uts.cache->clean.stats->entries, __ATOMIC_RELAXED);
+ stats.clean_added = __atomic_load_n(&pgc_uts.cache->clean.stats->added_entries, __ATOMIC_RELAXED);
+ stats.clean_deleted = __atomic_load_n(&pgc_uts.cache->clean.stats->removed_entries, __ATOMIC_RELAXED);
+
+ stats.searches_exact = __atomic_load_n(&pgc_uts.cache->stats.searches_exact, __ATOMIC_RELAXED);
+ stats.searches_exact_hits = __atomic_load_n(&pgc_uts.cache->stats.searches_exact_hits, __ATOMIC_RELAXED);
+
+ stats.searches_closest = __atomic_load_n(&pgc_uts.cache->stats.searches_closest, __ATOMIC_RELAXED);
+ stats.searches_closest_hits = __atomic_load_n(&pgc_uts.cache->stats.searches_closest_hits, __ATOMIC_RELAXED);
+
+ stats.events_cache_under_severe_pressure = __atomic_load_n(&pgc_uts.cache->stats.events_cache_under_severe_pressure, __ATOMIC_RELAXED);
+ stats.events_cache_needs_space_90 = __atomic_load_n(&pgc_uts.cache->stats.events_cache_needs_space_aggressively, __ATOMIC_RELAXED);
+ stats.events_flush_critical = __atomic_load_n(&pgc_uts.cache->stats.events_flush_critical, __ATOMIC_RELAXED);
+
+ size_t searches_exact = stats.searches_exact - old_stats.searches_exact;
+ size_t searches_closest = stats.searches_closest - old_stats.searches_closest;
+
+ size_t hit_exact = stats.searches_exact_hits - old_stats.searches_exact_hits;
+ size_t hit_closest = stats.searches_closest_hits - old_stats.searches_closest_hits;
+
+ double hit_exact_pc = (searches_exact > 0) ? (double)hit_exact * 100.0 / (double)searches_exact : 0.0;
+ double hit_closest_pc = (searches_closest > 0) ? (double)hit_closest * 100.0 / (double)searches_closest : 0.0;
+
+#ifdef PGC_COUNT_POINTS_COLLECTED
+ stats.collections = __atomic_load_n(&pgc_uts.cache->stats.points_collected, __ATOMIC_RELAXED);
+#endif
+
+ char *cache_status = "N";
+ if(stats.events_cache_under_severe_pressure > old_stats.events_cache_under_severe_pressure)
+ cache_status = "F";
+ else if(stats.events_cache_needs_space_90 > old_stats.events_cache_needs_space_90)
+ cache_status = "f";
+
+ char *flushing_status = "N";
+ if(stats.events_flush_critical > old_stats.events_flush_critical)
+ flushing_status = "F";
+
+ netdata_log_info("PGS %5zuk +%4zuk/-%4zuk "
+ "| RF %5zuk "
+ "| HOT %5zuk +%4zuk -%4zuk "
+ "| DRT %s %5zuk +%4zuk -%4zuk "
+ "| CLN %s %5zuk +%4zuk -%4zuk "
+ "| SRCH %4zuk %4zuk, HIT %4.1f%% %4.1f%% "
+#ifdef PGC_COUNT_POINTS_COLLECTED
+ "| CLCT %8.4f Mps"
+#endif
+ , stats.entries / 1000
+ , (stats.added - old_stats.added) / 1000, (stats.deleted - old_stats.deleted) / 1000
+ , stats.referenced / 1000
+ , stats.hot_entries / 1000, (stats.hot_added - old_stats.hot_added) / 1000, (stats.hot_deleted - old_stats.hot_deleted) / 1000
+ , flushing_status
+ , stats.dirty_entries / 1000
+ , (stats.dirty_added - old_stats.dirty_added) / 1000, (stats.dirty_deleted - old_stats.dirty_deleted) / 1000
+ , cache_status
+ , stats.clean_entries / 1000
+ , (stats.clean_added - old_stats.clean_added) / 1000, (stats.clean_deleted - old_stats.clean_deleted) / 1000
+ , searches_exact / 1000, searches_closest / 1000
+ , hit_exact_pc, hit_closest_pc
+#ifdef PGC_COUNT_POINTS_COLLECTED
+ , (double)(stats.collections - old_stats.collections) / 1000.0 / 1000.0
+#endif
+ );
+ }
+ netdata_log_info("Waiting for threads to stop...");
+ __atomic_store_n(&pgc_uts.stop, true, __ATOMIC_RELAXED);
+
+ netdata_thread_join(service_thread, NULL);
+
+ for(size_t i = 0; i < pgc_uts.collect_threads ;i++)
+ netdata_thread_join(collect_threads[i],NULL);
+
+ for(size_t i = 0; i < pgc_uts.query_threads ;i++)
+ netdata_thread_join(queries_threads[i],NULL);
+
+ pgc_destroy(pgc_uts.cache);
+
+ freez(pgc_uts.metrics);
+ freez(pgc_uts.random_data);
+}
+#endif
+
+int pgc_unittest(void) {
+ PGC *cache = pgc_create("test",
+ 32 * 1024 * 1024, unittest_free_clean_page_callback,
+ 64, NULL, unittest_save_dirty_page_callback,
+ 10, 10, 1000, 10,
+ PGC_OPTIONS_DEFAULT, 1, 11);
+
+ // FIXME - unit tests
+ // - add clean page
+ // - add clean page again (should not add it)
+ // - release page (should decrement counters)
+ // - add hot page
+ // - add hot page again (should not add it)
+ // - turn hot page to dirty, with and without a reference counter to it
+ // - dirty pages are saved once there are enough of them
+ // - find page exact
+ // - find page (should return last)
+ // - find page (should return next)
+ // - page cache full (should evict)
+ // - on destroy, turn hot pages to dirty and save them
+
+ PGC_PAGE *page1 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){
+ .section = 1,
+ .metric_id = 10,
+ .start_time_s = 100,
+ .end_time_s = 1000,
+ .size = 4096,
+ .data = NULL,
+ .hot = false,
+ .custom_data = (uint8_t *)"0123456789",
+ }, NULL);
+
+ if(strcmp(pgc_page_custom_data(cache, page1), "0123456789") != 0)
+ fatal("custom data do not work");
+
+ memcpy(pgc_page_custom_data(cache, page1), "ABCDEFGHIJ", 11);
+ if(strcmp(pgc_page_custom_data(cache, page1), "ABCDEFGHIJ") != 0)
+ fatal("custom data do not work");
+
+ pgc_page_release(cache, page1);
+
+ PGC_PAGE *page2 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){
+ .section = 2,
+ .metric_id = 10,
+ .start_time_s = 1001,
+ .end_time_s = 2000,
+ .size = 4096,
+ .data = NULL,
+ .hot = true,
+ }, NULL);
+
+ pgc_page_hot_set_end_time_s(cache, page2, 2001);
+ pgc_page_hot_to_dirty_and_release(cache, page2);
+
+ PGC_PAGE *page3 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){
+ .section = 3,
+ .metric_id = 10,
+ .start_time_s = 1001,
+ .end_time_s = 2000,
+ .size = 4096,
+ .data = NULL,
+ .hot = true,
+ }, NULL);
+
+ pgc_page_hot_set_end_time_s(cache, page3, 2001);
+ pgc_page_hot_to_dirty_and_release(cache, page3);
+
+ pgc_destroy(cache);
+
+#ifdef PGC_STRESS_TEST
+ unittest_stress_test();
+#endif
+
+ return 0;
+}
diff --git a/database/engine/cache.h b/database/engine/cache.h
new file mode 100644
index 00000000..7cd7c063
--- /dev/null
+++ b/database/engine/cache.h
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#ifndef DBENGINE_CACHE_H
+#define DBENGINE_CACHE_H
+
+#include "../rrd.h"
+
+// CACHE COMPILE TIME CONFIGURATION
+// #define PGC_COUNT_POINTS_COLLECTED 1
+
+typedef struct pgc PGC;
+typedef struct pgc_page PGC_PAGE;
+#define PGC_NAME_MAX 23
+
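+// Roughly: the *_PAGES_INLINE options let the threads that add or release pages also perform
+// evictions/flushes inline at those call sites (see the flush_pages()/evict calls in cache.c),
+// while PGC_OPTIONS_AUTOSCALE is meant to let the cache adapt its target size dynamically
+// (the sizing logic lives in cache.c).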
+typedef enum __attribute__ ((__packed__)) {
+ PGC_OPTIONS_NONE = 0,
+ PGC_OPTIONS_EVICT_PAGES_INLINE = (1 << 0),
+ PGC_OPTIONS_FLUSH_PAGES_INLINE = (1 << 1),
+ PGC_OPTIONS_AUTOSCALE = (1 << 2),
+} PGC_OPTIONS;
+
+#define PGC_OPTIONS_DEFAULT (PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE | PGC_OPTIONS_AUTOSCALE)
+
+typedef struct pgc_entry {
+ Word_t section; // the section this belongs to
+ Word_t metric_id; // the metric this belongs to
+ time_t start_time_s; // the start time of the page
+ time_t end_time_s; // the end time of the page
+ size_t size; // the size in bytes of the allocation, outside the cache
+ void *data; // a pointer to data outside the cache
+ uint32_t update_every_s; // the update every of the page
+ bool hot; // true if this entry is currently being collected
+ uint8_t *custom_data;
+} PGC_ENTRY;
+
+#define PGC_CACHE_LINE_PADDING(x) uint8_t padding##x[64]
+
+struct pgc_queue_statistics {
+ size_t entries;
+ size_t size;
+
+ PGC_CACHE_LINE_PADDING(1);
+
+ size_t max_entries;
+ size_t max_size;
+
+ PGC_CACHE_LINE_PADDING(2);
+
+ size_t added_entries;
+ size_t added_size;
+
+ PGC_CACHE_LINE_PADDING(3);
+
+ size_t removed_entries;
+ size_t removed_size;
+
+ PGC_CACHE_LINE_PADDING(4);
+};
+
+struct pgc_statistics {
+ size_t wanted_cache_size;
+ size_t current_cache_size;
+
+ PGC_CACHE_LINE_PADDING(1);
+
+ size_t added_entries;
+ size_t added_size;
+
+ PGC_CACHE_LINE_PADDING(2);
+
+ size_t removed_entries;
+ size_t removed_size;
+
+ PGC_CACHE_LINE_PADDING(3);
+
+    size_t entries;                 // the number of all entries (clean, dirty and hot)
+    size_t size;                    // the total size of all entries (clean, dirty and hot)
+
+ size_t evicting_entries;
+ size_t evicting_size;
+
+ size_t flushing_entries;
+ size_t flushing_size;
+
+ size_t hot2dirty_entries;
+ size_t hot2dirty_size;
+
+ PGC_CACHE_LINE_PADDING(4);
+
+ size_t acquires;
+ PGC_CACHE_LINE_PADDING(4a);
+ size_t releases;
+ PGC_CACHE_LINE_PADDING(4b);
+ size_t acquires_for_deletion;
+ PGC_CACHE_LINE_PADDING(4c);
+
+    size_t referenced_entries;      // the number of entries currently referenced
+    size_t referenced_size;         // the total size of the entries currently referenced
+
+ PGC_CACHE_LINE_PADDING(5);
+
+ size_t searches_exact;
+ size_t searches_exact_hits;
+ size_t searches_exact_misses;
+
+ PGC_CACHE_LINE_PADDING(6);
+
+ size_t searches_closest;
+ size_t searches_closest_hits;
+ size_t searches_closest_misses;
+
+ PGC_CACHE_LINE_PADDING(7);
+
+ size_t flushes_completed;
+ size_t flushes_completed_size;
+ size_t flushes_cancelled;
+ size_t flushes_cancelled_size;
+
+#ifdef PGC_COUNT_POINTS_COLLECTED
+ PGC_CACHE_LINE_PADDING(8);
+ size_t points_collected;
+#endif
+
+ PGC_CACHE_LINE_PADDING(9);
+
+ size_t insert_spins;
+ size_t evict_spins;
+ size_t release_spins;
+ size_t acquire_spins;
+ size_t delete_spins;
+ size_t flush_spins;
+
+ PGC_CACHE_LINE_PADDING(10);
+
+ size_t workers_search;
+ size_t workers_add;
+ size_t workers_evict;
+ size_t workers_flush;
+ size_t workers_jv2_flush;
+ size_t workers_hot2dirty;
+
+ size_t evict_skipped;
+ size_t hot_empty_pages_evicted_immediately;
+ size_t hot_empty_pages_evicted_later;
+
+ PGC_CACHE_LINE_PADDING(11);
+
+ // events
+ size_t events_cache_under_severe_pressure;
+ size_t events_cache_needs_space_aggressively;
+ size_t events_flush_critical;
+
+ PGC_CACHE_LINE_PADDING(12);
+
+ struct {
+ PGC_CACHE_LINE_PADDING(0);
+ struct pgc_queue_statistics hot;
+ PGC_CACHE_LINE_PADDING(1);
+ struct pgc_queue_statistics dirty;
+ PGC_CACHE_LINE_PADDING(2);
+ struct pgc_queue_statistics clean;
+ PGC_CACHE_LINE_PADDING(3);
+ } queues;
+};
+
+
+typedef void (*free_clean_page_callback)(PGC *cache, PGC_ENTRY entry);
+typedef void (*save_dirty_page_callback)(PGC *cache, PGC_ENTRY *entries_array, PGC_PAGE **pages_array, size_t entries);
+typedef void (*save_dirty_init_callback)(PGC *cache, Word_t section);
+// create a cache
+PGC *pgc_create(const char *name,
+ size_t clean_size_bytes, free_clean_page_callback pgc_free_clean_cb,
+ size_t max_dirty_pages_per_flush, save_dirty_init_callback pgc_save_init_cb, save_dirty_page_callback pgc_save_dirty_cb,
+ size_t max_pages_per_inline_eviction, size_t max_inline_evictors,
+ size_t max_skip_pages_per_inline_eviction,
+ size_t max_flushes_inline,
+ PGC_OPTIONS options, size_t partitions, size_t additional_bytes_per_page);
+
+// destroy the cache
+void pgc_destroy(PGC *cache);
+
+#define PGC_SECTION_ALL ((Word_t)0)
+void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section);
+
+// add a page to the cache and return a pointer to it
+PGC_PAGE *pgc_page_add_and_acquire(PGC *cache, PGC_ENTRY entry, bool *added);
+
+// get another reference counter on an already referenced page
+PGC_PAGE *pgc_page_dup(PGC *cache, PGC_PAGE *page);
+
+// release a page (all pointers to it are now invalid)
+void pgc_page_release(PGC *cache, PGC_PAGE *page);
+
+// mark a hot page dirty, and release it
+void pgc_page_hot_to_dirty_and_release(PGC *cache, PGC_PAGE *page);
+
+// find a page from the cache
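+// PGC_SEARCH_EXACT matches only a page starting exactly at start_time_s; PGC_SEARCH_CLOSEST also
+// accepts a page covering start_time_s or, failing that, the first page after it;
+// PGC_SEARCH_FIRST/NEXT/LAST/PREV return the page starting at-or-after, strictly after,
+// at-or-before, or strictly before start_time_s respectively.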
+typedef enum {
+ PGC_SEARCH_EXACT,
+ PGC_SEARCH_CLOSEST,
+ PGC_SEARCH_FIRST,
+ PGC_SEARCH_NEXT,
+ PGC_SEARCH_LAST,
+ PGC_SEARCH_PREV,
+} PGC_SEARCH;
+
+PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method);
+
+// get information from an acquired page
+Word_t pgc_page_section(PGC_PAGE *page);
+Word_t pgc_page_metric(PGC_PAGE *page);
+time_t pgc_page_start_time_s(PGC_PAGE *page);
+time_t pgc_page_end_time_s(PGC_PAGE *page);
+time_t pgc_page_update_every_s(PGC_PAGE *page);
+time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s);
+time_t pgc_page_fix_end_time_s(PGC_PAGE *page, time_t end_time_s);
+void *pgc_page_data(PGC_PAGE *page);
+void *pgc_page_custom_data(PGC *cache, PGC_PAGE *page);
+size_t pgc_page_data_size(PGC *cache, PGC_PAGE *page);
+bool pgc_is_page_hot(PGC_PAGE *page);
+bool pgc_is_page_dirty(PGC_PAGE *page);
+bool pgc_is_page_clean(PGC_PAGE *page);
+void pgc_reset_hot_max(PGC *cache);
+size_t pgc_get_current_cache_size(PGC *cache);
+size_t pgc_get_wanted_cache_size(PGC *cache);
+
+// resetting the end time of a hot page
+void pgc_page_hot_set_end_time_s(PGC *cache, PGC_PAGE *page, time_t end_time_s);
+bool pgc_page_to_clean_evict_or_release(PGC *cache, PGC_PAGE *page);
+
+typedef void (*migrate_to_v2_callback)(Word_t section, unsigned datafile_fileno, uint8_t type, Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, size_t count_of_unique_extents, size_t count_of_unique_metrics, size_t count_of_unique_pages, void *data);
+void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_fileno, uint8_t type, migrate_to_v2_callback cb, void *data);
+void pgc_open_evict_clean_pages_of_datafile(PGC *cache, struct rrdengine_datafile *datafile);
+size_t pgc_count_clean_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr);
+size_t pgc_count_hot_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr);
+
+typedef size_t (*dynamic_target_cache_size_callback)(void);
+void pgc_set_dynamic_target_cache_size_callback(PGC *cache, dynamic_target_cache_size_callback callback);
+
+// return true when there is more work to do
+bool pgc_evict_pages(PGC *cache, size_t max_skip, size_t max_evict);
+bool pgc_flush_pages(PGC *cache, size_t max_flushes);
+
+struct pgc_statistics pgc_get_statistics(PGC *cache);
+size_t pgc_hot_and_dirty_entries(PGC *cache);
+
+struct aral_statistics *pgc_aral_statistics(void);
+size_t pgc_aral_structures(void);
+size_t pgc_aral_overhead(void);
+
+#endif // DBENGINE_CACHE_H
diff --git a/database/engine/datafile.c b/database/engine/datafile.c
new file mode 100644
index 00000000..7322039c
--- /dev/null
+++ b/database/engine/datafile.c
@@ -0,0 +1,611 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#include "rrdengine.h"
+
+void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool having_lock)
+{
+ if(!having_lock)
+ uv_rwlock_wrlock(&ctx->datafiles.rwlock);
+
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next);
+
+ if(!having_lock)
+ uv_rwlock_wrunlock(&ctx->datafiles.rwlock);
+}
+
+void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile)
+{
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next);
+}
+
+
+static struct rrdengine_datafile *datafile_alloc_and_init(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno)
+{
+ fatal_assert(tier == 1);
+
+ struct rrdengine_datafile *datafile = callocz(1, sizeof(struct rrdengine_datafile));
+
+ datafile->tier = tier;
+ datafile->fileno = fileno;
+ fatal_assert(0 == uv_rwlock_init(&datafile->extent_rwlock));
+ datafile->ctx = ctx;
+
+ datafile->users.available = true;
+
+ spinlock_init(&datafile->users.spinlock);
+ spinlock_init(&datafile->writers.spinlock);
+ spinlock_init(&datafile->extent_queries.spinlock);
+
+ return datafile;
+}
+
+bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) {
+ bool ret;
+
+ spinlock_lock(&df->users.spinlock);
+
+ if(df->users.available) {
+ ret = true;
+ df->users.lockers++;
+ df->users.lockers_by_reason[reason]++;
+ }
+ else
+ ret = false;
+
+ spinlock_unlock(&df->users.spinlock);
+
+ return ret;
+}
+
+void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) {
+ spinlock_lock(&df->users.spinlock);
+ if(!df->users.lockers)
+ fatal("DBENGINE DATAFILE: cannot release a datafile that is not acquired");
+
+ df->users.lockers--;
+ df->users.lockers_by_reason[reason]--;
+ spinlock_unlock(&df->users.spinlock);
+}
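+
+// Editor's note: a minimal pairing sketch, not in the original source. Every successful
+// datafile_acquire() is expected to be matched by a datafile_release() with the same reason;
+// a leaked reference shows up as a stale locker and delays datafile_acquire_for_deletion()
+// below. The df pointer is a hypothetical, already initialized datafile.
+//
+//     if(datafile_acquire(df, DATAFILE_ACQUIRE_OPEN_CACHE)) {
+//         // ... safe to read extents from this datafile ...
+//         datafile_release(df, DATAFILE_ACQUIRE_OPEN_CACHE);
+//     }
+//     // else: the datafile is being retired and must not be used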
+
+bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) {
+ bool can_be_deleted = false;
+
+ spinlock_lock(&df->users.spinlock);
+ df->users.available = false;
+
+ if(!df->users.lockers)
+ can_be_deleted = true;
+
+ else {
+ // there are lockers
+
+ // evict any pages referencing this in the open cache
+ spinlock_unlock(&df->users.spinlock);
+ pgc_open_evict_clean_pages_of_datafile(open_cache, df);
+ spinlock_lock(&df->users.spinlock);
+
+ if(!df->users.lockers)
+ can_be_deleted = true;
+
+ else {
+ // there are lockers still
+
+ // count the number of pages referencing this in the open cache
+ spinlock_unlock(&df->users.spinlock);
+ usec_t time_to_scan_ut = now_monotonic_usec();
+ size_t clean_pages_in_open_cache = pgc_count_clean_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df);
+ size_t hot_pages_in_open_cache = pgc_count_hot_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df);
+ time_to_scan_ut = now_monotonic_usec() - time_to_scan_ut;
+ spinlock_lock(&df->users.spinlock);
+
+ if(!df->users.lockers)
+ can_be_deleted = true;
+
+ else if(!clean_pages_in_open_cache && !hot_pages_in_open_cache) {
+ // no pages in the open cache related to this datafile
+
+ time_t now_s = now_monotonic_sec();
+
+ if(!df->users.time_to_evict) {
+ // first time we did the above
+ df->users.time_to_evict = now_s + 120;
+ internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, "
+ "but it has %u lockers (oc:%u, pd:%u), "
+ "%zu clean and %zu hot open cache pages "
+ "- will be deleted shortly "
+ "(scanned open cache in %"PRIu64" usecs)",
+ df->fileno, df->ctx->config.tier,
+ df->users.lockers,
+ df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE],
+ df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS],
+ clean_pages_in_open_cache,
+ hot_pages_in_open_cache,
+ time_to_scan_ut);
+ }
+
+ else if(now_s > df->users.time_to_evict) {
+ // time expired, lets remove it
+ can_be_deleted = true;
+ internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, "
+ "but it has %u lockers (oc:%u, pd:%u), "
+ "%zu clean and %zu hot open cache pages "
+ "- will be deleted now "
+ "(scanned open cache in %"PRIu64" usecs)",
+ df->fileno, df->ctx->config.tier,
+ df->users.lockers,
+ df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE],
+ df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS],
+ clean_pages_in_open_cache,
+ hot_pages_in_open_cache,
+ time_to_scan_ut);
+ }
+ }
+ else
+ internal_error(true, "DBENGINE: datafile %u of tier %d "
+ "has %u lockers (oc:%u, pd:%u), "
+ "%zu clean and %zu hot open cache pages "
+ "(scanned open cache in %"PRIu64" usecs)",
+ df->fileno, df->ctx->config.tier,
+ df->users.lockers,
+ df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE],
+ df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS],
+ clean_pages_in_open_cache,
+ hot_pages_in_open_cache,
+ time_to_scan_ut);
+ }
+ }
+ spinlock_unlock(&df->users.spinlock);
+
+ return can_be_deleted;
+}
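+
+// Editor's note: summarizing the flow above — datafile_acquire_for_deletion() first marks the
+// datafile unavailable (so new datafile_acquire() calls fail), then asks the open cache to evict
+// any clean pages still pointing to it, and if lockers remain while no open cache pages reference
+// the file, it only allows deletion after a 120-second grace period has expired.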
+
+void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen)
+{
+ (void) snprintfz(str, maxlen - 1, "%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION,
+ datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno);
+}
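+
+// Editor's note: for illustration only — with a dbfiles_path of "/var/cache/netdata/dbengine"
+// the generated name looks roughly like "/var/cache/netdata/dbengine/datafile-1-0000000001.ndf";
+// the exact zero-padding of the tier and file numbers depends on RRDENG_FILE_NUMBER_PRINT_TMPL,
+// which is defined outside this diff.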
+
+int close_data_file(struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ generate_datafilepath(datafile, path, sizeof(path));
+
+ ret = uv_fs_close(NULL, &req, datafile->file, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ return ret;
+}
+
+int unlink_data_file(struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ generate_datafilepath(datafile, path, sizeof(path));
+
+ ret = uv_fs_unlink(NULL, &req, path, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ __atomic_add_fetch(&ctx->stats.datafile_deletions, 1, __ATOMIC_RELAXED);
+
+ return ret;
+}
+
+int destroy_data_file_unsafe(struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ generate_datafilepath(datafile, path, sizeof(path));
+
+ ret = uv_fs_ftruncate(NULL, &req, datafile->file, 0, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ ret = uv_fs_close(NULL, &req, datafile->file, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ ret = uv_fs_unlink(NULL, &req, path, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ __atomic_add_fetch(&ctx->stats.datafile_deletions, 1, __ATOMIC_RELAXED);
+
+ return ret;
+}
+
+int create_data_file(struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ uv_file file;
+ int ret, fd;
+ struct rrdeng_df_sb *superblock;
+ uv_buf_t iov;
+ char path[RRDENG_PATH_MAX];
+
+ generate_datafilepath(datafile, path, sizeof(path));
+ fd = open_file_for_io(path, O_CREAT | O_RDWR | O_TRUNC, &file, use_direct_io);
+ if (fd < 0) {
+ ctx_fs_error(ctx);
+ return fd;
+ }
+ datafile->file = file;
+ __atomic_add_fetch(&ctx->stats.datafile_creations, 1, __ATOMIC_RELAXED);
+
+ ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock));
+ if (unlikely(ret)) {
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+ }
+ memset(superblock, 0, sizeof(*superblock));
+ (void) strncpy(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ);
+ (void) strncpy(superblock->version, RRDENG_DF_VER, RRDENG_VER_SZ);
+ superblock->tier = 1;
+
+ iov = uv_buf_init((void *)superblock, sizeof(*superblock));
+
+ ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL);
+ if (ret < 0) {
+ fatal_assert(req.result < 0);
+ netdata_log_error("DBENGINE: uv_fs_write: %s", uv_strerror(ret));
+ ctx_io_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+ posix_memfree(superblock);
+ if (ret < 0) {
+ destroy_data_file_unsafe(datafile);
+ return ret;
+ }
+
+ datafile->pos = sizeof(*superblock);
+ ctx_io_write_op_bytes(ctx, sizeof(*superblock));
+
+ return 0;
+}
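+
+// Editor's note: per the companion datafile.ksy added in this changeset, a datafile begins with
+// a 4096-byte header block (magic "netdata-data-file", version "1.0", tier) followed by extents,
+// each padded to a multiple of 4096 bytes. create_data_file() above writes only the superblock;
+// datafile->pos is presumably advanced as extents are appended by other parts of the engine,
+// which are not shown in this diff.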
+
+static int check_data_file_superblock(uv_file file)
+{
+ int ret;
+ struct rrdeng_df_sb *superblock;
+ uv_buf_t iov;
+ uv_fs_t req;
+
+ ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock));
+ if (unlikely(ret)) {
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+ }
+ iov = uv_buf_init((void *)superblock, sizeof(*superblock));
+
+ ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_read: %s", uv_strerror(ret));
+ uv_fs_req_cleanup(&req);
+ goto error;
+ }
+ fatal_assert(req.result >= 0);
+ uv_fs_req_cleanup(&req);
+
+ if (strncmp(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ) ||
+ strncmp(superblock->version, RRDENG_DF_VER, RRDENG_VER_SZ) ||
+ superblock->tier != 1) {
+ netdata_log_error("DBENGINE: file has invalid superblock.");
+ ret = UV_EINVAL;
+ } else {
+ ret = 0;
+ }
+ error:
+ posix_memfree(superblock);
+ return ret;
+}
+
+static int load_data_file(struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ uv_file file;
+ int ret, fd, error;
+ uint64_t file_size;
+ char path[RRDENG_PATH_MAX];
+
+ generate_datafilepath(datafile, path, sizeof(path));
+ fd = open_file_for_io(path, O_RDWR, &file, use_direct_io);
+ if (fd < 0) {
+ ctx_fs_error(ctx);
+ return fd;
+ }
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: initializing data file \"%s\".", path);
+
+ ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb));
+ if (ret)
+ goto error;
+ file_size = ALIGN_BYTES_CEILING(file_size);
+
+ ret = check_data_file_superblock(file);
+ if (ret)
+ goto error;
+
+ ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_df_sb));
+
+ datafile->file = file;
+ datafile->pos = file_size;
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: data file \"%s\" initialized (size:%" PRIu64 ").", path, file_size);
+
+ return 0;
+
+ error:
+ error = ret;
+ ret = uv_fs_close(NULL, &req, file, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+ return error;
+}
+
+static int scan_data_files_cmp(const void *a, const void *b)
+{
+ struct rrdengine_datafile *file1, *file2;
+ char path1[RRDENG_PATH_MAX], path2[RRDENG_PATH_MAX];
+
+ file1 = *(struct rrdengine_datafile **)a;
+ file2 = *(struct rrdengine_datafile **)b;
+ generate_datafilepath(file1, path1, sizeof(path1));
+ generate_datafilepath(file2, path2, sizeof(path2));
+ return strcmp(path1, path2);
+}
+
+/* Returns number of datafiles that were loaded or < 0 on error */
+static int scan_data_files(struct rrdengine_instance *ctx)
+{
+ int ret, matched_files, failed_to_load, i;
+ unsigned tier, no;
+ uv_fs_t req;
+ uv_dirent_t dent;
+ struct rrdengine_datafile **datafiles, *datafile;
+ struct rrdengine_journalfile *journalfile;
+
+ ret = uv_fs_scandir(NULL, &req, ctx->config.dbfiles_path, 0, NULL);
+ if (ret < 0) {
+ fatal_assert(req.result < 0);
+ uv_fs_req_cleanup(&req);
+ netdata_log_error("DBENGINE: uv_fs_scandir(%s): %s", ctx->config.dbfiles_path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ return ret;
+ }
+ netdata_log_info("DBENGINE: found %d files in path %s", ret, ctx->config.dbfiles_path);
+
+ datafiles = callocz(MIN(ret, MAX_DATAFILES), sizeof(*datafiles));
+ for (matched_files = 0 ; UV_EOF != uv_fs_scandir_next(&req, &dent) && matched_files < MAX_DATAFILES ; ) {
+ ret = sscanf(dent.name, DATAFILE_PREFIX RRDENG_FILE_NUMBER_SCAN_TMPL DATAFILE_EXTENSION, &tier, &no);
+ if (2 == ret) {
+ datafile = datafile_alloc_and_init(ctx, tier, no);
+ datafiles[matched_files++] = datafile;
+ }
+ }
+ uv_fs_req_cleanup(&req);
+
+ if (0 == matched_files) {
+ freez(datafiles);
+ return 0;
+ }
+
+ if (matched_files == MAX_DATAFILES)
+ netdata_log_error("DBENGINE: warning: hit maximum database engine file limit of %d files", MAX_DATAFILES);
+
+ qsort(datafiles, matched_files, sizeof(*datafiles), scan_data_files_cmp);
+
+ ctx->atomic.last_fileno = datafiles[matched_files - 1]->fileno;
+
+ netdata_log_info("DBENGINE: loading %d data/journal of tier %d...", matched_files, ctx->config.tier);
+ for (failed_to_load = 0, i = 0 ; i < matched_files ; ++i) {
+ uint8_t must_delete_pair = 0;
+
+ datafile = datafiles[i];
+ ret = load_data_file(datafile);
+ if (0 != ret)
+ must_delete_pair = 1;
+
+ journalfile = journalfile_alloc_and_init(datafile);
+ ret = journalfile_load(ctx, journalfile, datafile);
+ if (0 != ret) {
+ if (!must_delete_pair) /* If datafile is still open close it */
+ close_data_file(datafile);
+ must_delete_pair = 1;
+ }
+
+ if (must_delete_pair) {
+ char path[RRDENG_PATH_MAX];
+
+ netdata_log_error("DBENGINE: deleting invalid data and journal file pair.");
+ ret = journalfile_unlink(journalfile);
+ if (!ret) {
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+ netdata_log_info("DBENGINE: deleted journal file \"%s\".", path);
+ }
+ ret = unlink_data_file(datafile);
+ if (!ret) {
+ generate_datafilepath(datafile, path, sizeof(path));
+ netdata_log_info("DBENGINE: deleted data file \"%s\".", path);
+ }
+ freez(journalfile);
+ freez(datafile);
+ ++failed_to_load;
+ continue;
+ }
+
+ ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos);
+ datafile_list_insert(ctx, datafile, false);
+ }
+
+ matched_files -= failed_to_load;
+ freez(datafiles);
+
+ return matched_files;
+}
+
+/* Creates a datafile and a journalfile pair */
+int create_new_datafile_pair(struct rrdengine_instance *ctx, bool having_lock)
+{
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_creation_started, 1, __ATOMIC_RELAXED);
+
+ struct rrdengine_datafile *datafile;
+ struct rrdengine_journalfile *journalfile;
+ unsigned fileno = ctx_last_fileno_get(ctx) + 1;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "DBENGINE: creating new data and journal files in path %s",
+ ctx->config.dbfiles_path);
+
+ datafile = datafile_alloc_and_init(ctx, 1, fileno);
+ ret = create_data_file(datafile);
+ if(ret)
+ goto error_after_datafile;
+
+ generate_datafilepath(datafile, path, sizeof(path));
+ nd_log(NDLS_DAEMON, NDLP_INFO,
+ "DBENGINE: created data file \"%s\".", path);
+
+ journalfile = journalfile_alloc_and_init(datafile);
+ ret = journalfile_create(journalfile, datafile);
+ if (ret)
+ goto error_after_journalfile;
+
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+ nd_log(NDLS_DAEMON, NDLP_INFO,
+ "DBENGINE: created journal file \"%s\".", path);
+
+ ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos);
+ datafile_list_insert(ctx, datafile, having_lock);
+ ctx_last_fileno_increment(ctx);
+
+ return 0;
+
+error_after_journalfile:
+ destroy_data_file_unsafe(datafile);
+ freez(journalfile);
+
+error_after_datafile:
+ freez(datafile);
+ return ret;
+}
+
+/* Page cache must already be initialized.
+ * Return 0 on success.
+ */
+int init_data_files(struct rrdengine_instance *ctx)
+{
+ int ret;
+
+ fatal_assert(0 == uv_rwlock_init(&ctx->datafiles.rwlock));
+ ret = scan_data_files(ctx);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: failed to scan path \"%s\".", ctx->config.dbfiles_path);
+ return ret;
+ } else if (0 == ret) {
+ netdata_log_info("DBENGINE: data files not found, creating in path \"%s\".", ctx->config.dbfiles_path);
+ ctx->atomic.last_fileno = 0;
+ ret = create_new_datafile_pair(ctx, false);
+ if (ret) {
+ netdata_log_error("DBENGINE: failed to create data and journal files in path \"%s\".", ctx->config.dbfiles_path);
+ return ret;
+ }
+ }
+ else {
+ if (ctx->loading.create_new_datafile_pair)
+ create_new_datafile_pair(ctx, false);
+
+ while(rrdeng_ctx_exceeded_disk_quota(ctx))
+ datafile_delete(ctx, ctx->datafiles.first, false, false);
+ }
+
+ pgc_reset_hot_max(open_cache);
+ ctx->loading.create_new_datafile_pair = false;
+ return 0;
+}
+
+void finalize_data_files(struct rrdengine_instance *ctx)
+{
+    bool logged = false;
+
+ while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED)) {
+ if(!logged) {
+ netdata_log_info("Waiting for inflight flush to finish on tier %d...", ctx->config.tier);
+ logged = true;
+ }
+ sleep_usec(100 * USEC_PER_MS);
+ }
+
+ do {
+ struct rrdengine_datafile *datafile = ctx->datafiles.first;
+ struct rrdengine_journalfile *journalfile = datafile->journalfile;
+
+ logged = false;
+ size_t iterations = 100;
+ while(!datafile_acquire_for_deletion(datafile) && datafile != ctx->datafiles.first->prev && --iterations > 0) {
+ if(!logged) {
+ netdata_log_info("Waiting to acquire data file %u of tier %d to close it...", datafile->fileno, ctx->config.tier);
+ logged = true;
+ }
+ sleep_usec(100 * USEC_PER_MS);
+ }
+
+ logged = false;
+ bool available = false;
+ do {
+ uv_rwlock_wrlock(&ctx->datafiles.rwlock);
+ spinlock_lock(&datafile->writers.spinlock);
+ available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? false : true;
+
+ if(!available) {
+ spinlock_unlock(&datafile->writers.spinlock);
+ uv_rwlock_wrunlock(&ctx->datafiles.rwlock);
+ if(!logged) {
+ netdata_log_info("Waiting for writers to data file %u of tier %d to finish...", datafile->fileno, ctx->config.tier);
+ logged = true;
+ }
+ sleep_usec(100 * USEC_PER_MS);
+ }
+ } while(!available);
+
+ journalfile_close(journalfile, datafile);
+ close_data_file(datafile);
+ datafile_list_delete_unsafe(ctx, datafile);
+ spinlock_unlock(&datafile->writers.spinlock);
+ uv_rwlock_wrunlock(&ctx->datafiles.rwlock);
+
+ freez(journalfile);
+ freez(datafile);
+
+ } while(ctx->datafiles.first);
+}
diff --git a/database/engine/datafile.h b/database/engine/datafile.h
new file mode 100644
index 00000000..569f1b0a
--- /dev/null
+++ b/database/engine/datafile.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_DATAFILE_H
+#define NETDATA_DATAFILE_H
+
+#include "rrdengine.h"
+
+/* Forward declarations */
+struct rrdengine_datafile;
+struct rrdengine_journalfile;
+struct rrdengine_instance;
+
+#define DATAFILE_PREFIX "datafile-"
+#define DATAFILE_EXTENSION ".ndf"
+
+#ifndef MAX_DATAFILE_SIZE
+#define MAX_DATAFILE_SIZE (512LU * 1024LU * 1024LU)
+#endif
+
+#define MIN_DATAFILE_SIZE (4LU * 1024LU * 1024LU)
+#if MIN_DATAFILE_SIZE > MAX_DATAFILE_SIZE
+#error MIN_DATAFILE_SIZE > MAX_DATAFILE_SIZE
+#endif
+
+#define MAX_DATAFILES (65536 * 4) /* Supports up to 64TiB for now */
+#define TARGET_DATAFILES (50)
+
+typedef enum __attribute__ ((__packed__)) {
+ DATAFILE_ACQUIRE_OPEN_CACHE = 0,
+ DATAFILE_ACQUIRE_PAGE_DETAILS,
+ DATAFILE_ACQUIRE_RETENTION,
+
+ // terminator
+ DATAFILE_ACQUIRE_MAX,
+} DATAFILE_ACQUIRE_REASONS;
+
+/* only one event loop is supported for now */
+struct rrdengine_datafile {
+ unsigned tier;
+ unsigned fileno;
+ uv_file file;
+ uint64_t pos;
+ uv_rwlock_t extent_rwlock;
+ struct rrdengine_instance *ctx;
+ struct rrdengine_journalfile *journalfile;
+ struct rrdengine_datafile *prev;
+ struct rrdengine_datafile *next;
+
+ struct {
+ SPINLOCK spinlock;
+ bool populated;
+ } populate_mrg;
+
+ struct {
+ SPINLOCK spinlock;
+ size_t running;
+ size_t flushed_to_open_running;
+ } writers;
+
+ struct {
+ SPINLOCK spinlock;
+ unsigned lockers;
+ unsigned lockers_by_reason[DATAFILE_ACQUIRE_MAX];
+ bool available;
+ time_t time_to_evict;
+ } users;
+
+ struct {
+ SPINLOCK spinlock;
+ Pvoid_t pending_epdl_by_extent_offset_judyL;
+ } extent_queries;
+};
+
+bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason);
+void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason);
+bool datafile_acquire_for_deletion(struct rrdengine_datafile *df);
+
+void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool having_lock);
+void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile);
+void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen);
+int close_data_file(struct rrdengine_datafile *datafile);
+int unlink_data_file(struct rrdengine_datafile *datafile);
+int destroy_data_file_unsafe(struct rrdengine_datafile *datafile);
+int create_data_file(struct rrdengine_datafile *datafile);
+int create_new_datafile_pair(struct rrdengine_instance *ctx, bool having_lock);
+int init_data_files(struct rrdengine_instance *ctx);
+void finalize_data_files(struct rrdengine_instance *ctx);
+
+#endif /* NETDATA_DATAFILE_H */
\ No newline at end of file
diff --git a/database/engine/datafile.ksy b/database/engine/datafile.ksy
new file mode 100644
index 00000000..28d4b393
--- /dev/null
+++ b/database/engine/datafile.ksy
@@ -0,0 +1,74 @@
+meta:
+ id: netdata_datafile
+ endian: le
+
+seq:
+ - id: hdr
+ type: header
+ size: 4096
+ - id: extents
+ type: extent
+ repeat: eos
+
+types:
+ header:
+ seq:
+ - id: magic
+ contents: "netdata-data-file"
+ - id: reserved
+ size: 15
+ - id: version
+ contents: "1.0"
+ - id: reserved1
+ size: 13
+ - id: tier
+ type: u1
+ extent_page_descr:
+ seq:
+ - id: type
+ type: u1
+ enum: page_type
+ - id: uuid
+ size: 16
+ - id: page_len
+ type: u4
+ - id: start_time_ut
+ type: u8
+ - id: end_time_ut
+ type: u8
+ enums:
+ page_type:
+ 0: metrics
+ 1: tier
+ extent_header:
+ seq:
+ - id: payload_length
+ type: u4
+ - id: compression_algorithm
+ type: u1
+ enum: compression_algos
+ - id: number_of_pages
+ type: u1
+ - id: page_descriptors
+ type: extent_page_descr
+ repeat: expr
+ repeat-expr: number_of_pages
+ enums:
+ compression_algos:
+ 0: rrd_no_compression
+ 1: rrd_lz4
+ extent_trailer:
+ seq:
+ - id: crc32_checksum
+ type: u4
+ extent:
+ seq:
+ - id: header
+ type: extent_header
+ - id: payload
+ size: header.payload_length
+ - id: trailer
+ type: extent_trailer
+ - id: padding
+ size: (((_io.pos + 4095) / 4096) * 4096) - _io.pos
+ # the extent size is made to always be a multiple of 4096
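+  # Editor's note (worked example, not in the original): with integer division, if an extent's
+  # header, payload and trailer end at _io.pos = 5000, then ((5000 + 4095) / 4096) * 4096 = 8192,
+  # so 3192 bytes of padding follow; if _io.pos is already a multiple of 4096, the padding is 0.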
diff --git a/database/engine/dbengine-diagram.xml b/database/engine/dbengine-diagram.xml
new file mode 100644
index 00000000..793e8a35
--- /dev/null
+++ b/database/engine/dbengine-diagram.xml
@@ -0,0 +1 @@
+<mxfile host="app.diagrams.net" modified="2023-01-16T23:29:24.274Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36" etag="IzytOgui5y4srcr9Zrcm" version="20.5.1" type="device"><diagram name="Page-1" id="90a13364-a465-7bf4-72fc-28e22215d7a0">7V1rc5tI1v41rsp8MEVz52PsOJvsxkkmydTu7Je3EGpZTJDQIOTY++vf7oYG+oJoIUDIlqcmthA00H3O0+d+rszb1dM/0mCzvE/mML4y9PnTlfnuyjBM3zc1G/2Bjz3nx4DuG/mRhzSaF8eqA9+j/0F6YnF0F83hljkxS5I4izbswTBZr2GYMceCNE1+sactkpi96yZ4gMKB72EQ06P0DfDxf0fzbFkcB45fffEBRg/L4uae4eRfzILw50Oa7NbFHdfJGubfrAI6TPGW22UwT37VDpl3V+ZtmiRZ/tfq6RbGeG7pnNHrsmf6oFfmzTJbxegDQH+Sr983XAxULkbvlcJ1Vr9d03jL53+l9tvPH39EN3fr2R/2l88f/rq2i1l4DOIdvQt/WzhHc118JLNj3pD5gnhgHX3a/oRZuCw+JGm2TB6SdRB/SpJN8ax/wSx7Lsgm2GUJ+yblvJLBsjT5Wa4hfudFss6Ka4FHzgjS7C2mmuqByLH3URwXo8D1nJ4RxsF2G4X5weIUQD5l6fN/8AdNLz//ia/XHMOlB949lSPiT8/Fp3mwXZL3J1c+Rdl/yIV28elPehH6uxoBf3hmXvM2iZOUzLJ5S37ojb7CNFrBDKbF6fmi4JXg2KBafUovyS4N4b4lL1YYzdgDzPacaLkljSPsgAl6nvQZXZjCOMiiR/ZJgoKDH8rzyku/JhF6RkMvAAcYfsEXBdr4ns6Okb9CcVmdovmRfEfX9OoHUUd9XGB5Njtw/srCwOiP2gtWhwgDHcJMYGrMxJDpfs5yhuAshq1aWKrkojoPlRwl5yL8fl+DDPHJmtzO0MFe3urGRu/v9J/Jp//+988/7ufO82fvK7iPrm3/xGwE3Drt65bNEL/j6kq0L2FPljstW407+2Ii07sw0R4m0ly7Ex8Zk2UkxxyGP/Q2Qj6AI9iRbMPV7HGZwheY4n2EJFZD/4rk2u2V4cRo1m9mKcMnzt87LJoSCr3eEhJ9i1fW2jyRJaTfo78eit/BCjHIzXq2xb+iNTr7PiC/boNwCaWM+CmYIYWCYZ8gjh4wHYWIGLAIc/MI0yxC4vrb4otVNJ/HhG0heq5gRsbDdLjBM0jm1L65st9x7GURRs8QWSRrRjDC48MnhlIKVaMYuxLG6yS6B4BEeiyGv9Y1RBAWQxDXxT2PJNlrC7DDOhyVJYvFFh5LYFImRIrS1GB3fEWgxFIVcUWqAdC/VWSXNnDtgqVUqa0LJdITPUOK4ANIKR6LnUDvKu17jqHpnut6hqubvuca7LjAsjQX3c12TNOxHGANJfrLJ15B9Gd55dcyyuD3TUDW81cabFhW4GGPIbdGAhFwsBHGDINfl+Lzr8p64tFjy5rhxOS30ToRMNN6MAYZh85hF4hhUEUEnTqCKWDHEayqyKnUxNYfpx63SKawSB+SrGdBZAZD9LjYELjDT3R6oUNc5v0UrCqONLInEjIs3+1HqgCssGJw/DucUCHj54I+pORRmWKvw3xbxFQSraMsCmIpoUhIDz3prCbQluTIHplHj+0nRfwBTBZHEXbxpHq4SzEpxc/SmzNC+AxG6wfMCgh0EPC1nv8rQsiG6Q2dHmb4Cn0eZEFtevK3YN8MHRbeFh1jpoljw5YNrIRVfjfDkht+mZrks1hAJwxlMtHc9Wd6Jecft+XZrHgNLHHLA65kyzN4kaU3NDVkLDIhqVsmiQy6K5qKu2Lv4utRyyjuiTWgy9nnNufHJD0QqMQjR4NQ8SglMOiLNFmhX6sddtjFsPoiX65tI3YMiAg2/k+GCA756QcRWH9BuTPW8YCaVet4APgttD/panLGzxPjAZV+28Vkd1KIAESD3bsozZ5lgnLOTdvgkWzWaDUQ/0Xbn+ck91Ky7UHuNSzT6Unu5dh7PLnXnTYX87Y0RS6Wu7npN3Ij1wicT21eU+F8fXCtR4okZ6H3ZMsAD4TIoVTu+YcIiFsjP+NXgCYBaz9tSg8BzRkZNN4RQxkDpJNWfRZGg+rjzBy7J0HHNjlr3wRUn4mLOi0gWTxOhZClh6B74MMBuMq4PirArns/jhS83DOFX9ne2y/83sYwWA8Iv9sNGf4oPy6L4xz8Ikra4t/JmkNIOQqH2PWrYHkifuIVXCXpcwu25m94Ksid29CbWzLI9YyZ2ZtuCaYHuaJSIszzpJ1UtnNyJ5UpE+/2zuE5O6lMQ3EXMO1J7QKm6I7FJi0EBFDEsdzbJFHJz8z7VJLmlL1P5mhaOKXdUdxPx4sB9MA9zALOYdPJTDyMEoe+rx5QL2xW/CPkmlfBb1juINJEm/jwFwIjBJL0mkdEarfiVaVKqC/QCmGRBtZupqgphnGyhVRFJGpjhAD7SfYuRKipPRp+MuNq2hrlKZxpjiR+ZGTxxpS5YSakUY7vTLNVd+9p6XCmJSwk1WCU+LTg0fPZuCnhXszndbIdyYLKbGm9K/M87o9z6Qibf+tGWzf17ol3YbdfXcLHR++1jRPX/zTtXc0pygqnsD5PQVaQhbsPZyacnnowltlxL2Zw9shG3t/3+H9sK29PCue7EAqD6NswWNcttReb5HQY8czcQIPknWhlTl/nzJNDsma7axTK+SXU+DJ0fomts2mvNhgs5XvvhOwL/TtYuPAaAPZdbnV5T1zcaYtgIZVA9mwV34NHjJxcbEGOq++k4UhxHG22mPZbcJChV15F4nAQkh8ZDpqO6ZvzfnDQsVkctCS+mbK2CRP5N5hfwVLwK0wJBxWNF1182zVc1Flg1AynBRv3FM8Y1oLij4R3XO0Mr2s2nWObmuci+PQt17A9n1X2fcPUTMt0fd20HUvXjVFB1RK9N+fECy89NKR/a6GcRH1g42oHvZC7Z/ma7zs4dNuwbM81OXIHmu84vg8sAHTdB+OSu9EuQ/yTVWg+YntJVxWQbu01Rw7RvIhHBwZpuCTmRhoaos+CbWGaQTfVmu9wtPGEe8nSe0RMSKtgsykeg8gkNNBEf4M4NSXf5Id+0xolIw5DOsc6yBS4AHoLqSXFCT04W/D6YA9SjJjCK0lgkIZHAGcwdU4WYjKUPCxjit7l4oLVmkzFnNeTe6Qov3gQybk5JGQSMnUJ1SeUqSdnW3gJpYT2Ch/yOkLdZQ3LUhTA6YjDF7TgywrxIsnQ0sKlFBBFQGkpoFZvq9VbstLLLAU0vSqG4wPlgRbZEfDN6j3AsgHfTKPBRzASvtHypPskyN93kLzrMsD6wUOwUQzDf7PNy4qQyLGAPEuyuCqCyPiTU/j3LsoVi02hMRFN
hD9vTYIAF6SCcK5J8WesctwkcfS/TQE4FZHSbhICuyCl4Zis0k1z+I9FSoer+uqMBZSUdyWUqqzaOI3hFtstEZ6a9fnSI8zsyoeYBNT1Y9Cuq3TK7sfn1fUR8iNVonvM+hfqOUrN/5ZEVRnMC2o1hzz1RUqGAil92cALKR1FSs7JSQkAGSxNWH6z+pPf1PKP2oUyVS+3MZLXR1A6BTv4wEIZACJAfU02O/RWYriNIP5UdmZiI0tylAmnokYqSkMVY/UjDllcfG0/0hAnZNFSFCNIQ82REH1tYabCFsYZYL+Hwfqyk3XZybyT72T2xPM5zskQMbzZAPga66t1+YwB5bYe4lglMY5lhGhW7QQjxDaLyIATNUbohfUB34Y/u74Tn5VhoreEmSENE1yQAR+ZOKAFV6ZNnhQ4p+X36tZBo0sDjSN0ENXIs9F0EN1lBQRbsTGMGIvjOppT+2Gh3nKF5hpDg70oubIeNanTv2/nGmt7OT0Ud3OuldAzaeca4JxrVBUbAZonVxpwWtBsdMHm/fGQvSOzqnWIxvUNbx3i0nr8rpK3zyXq+cM10JNzhxiwcwIk/ufEssg7IjFFmkkjMdcdpkTmEZC4OVLx6MBEQVcsMncKlfEoc9Rg0bdf0+QxmsNSGSUTsy20UolNN0WwjhiSVGmhJcSziKi0GUZZYuvNDGkVl/J0HK6p/8jHeJPuMPsVDQw2aZSkUZ6L9Nu4xjm8zx1ej7zJbNe40xxgnOO7KBmSxM3SEDdOqXLHmJokc6aJm5pum4yi6ZK8+z35UehDjylOkj6mJxVnLF6a4YPVlaUZQ9PR3LqG7fiuoXOpJEA3LM3WfcfyfdvRDZqxOpKo4x1oo0HrisDwG5naZH1XffPCOOrqIEtNd7J3VDuN+GMF3nm6zxCoM7aPVxcDP+8jtFyyAtyFPH564VhRGvb6tEvogGu2fKTBuOw/MpbsC/Qzc631GCMyAmSUvRQnUk8N6KI7KwipQ4ln7c2ZsXZFzL14gwxa0oHao4/jbSrUcOrzaEGqjozT+1dzJaOI51OP6bdkl1V5eocm5NEjyvWKWqJKrNptZUWAhujAtYThz1LRpg7dQvFd0T2XduEKHoJoTQok1QPJr4qyqvochrnajtTdEA8AnzKIWcvQ17CqjUTqrMVJMCeHCgW8yPAlJ6DL0oC0BdvkqY4KbQMv4TP027I0+Z5S5WXaLVNaiQ9t6E9DP7NA0EHiZ1h9AuitBUQknbwVypX0p48rZwRRZB9cMbH4KvxdHb/CSI47rorjiEo33ZMijJe44Dvts3o28k/J5pMOhnHYYa3R5B9ZWOrrBkFVF2sdAocrKacc6DIe3nHGdqcz3jlcTK0LbM3iSH9oyBMDDl4A5FGmnjLkAYcbdTTzjjO5uhenCJzWdY/FPd/wuyDfqMKfamzJaGBo82kBfAMXdTDkc5j4kYZGQtHh/wKQsM9iFoMJfzZrLh8xyoMqGKNEeahmYrfZSNBKZhyYMraLAh7rBpDikDpFyiwv3AbQXF+qlVZ7qGUmMZ/YEvMJDyK9WU9cWRmUoShHNfH6QjmdegSOTDlGu/g1rTaLZXovnUPqUqmHF0nncLDabe7eQoK9Nmi4o/by3pu/9N9SoXzWA3q5zna1RixFL1dd0qyVa6YUEBlitUGyS17/c5GkK/FctvFC4VBIYTC/NFzAnMWJPrYlctbIHRe8MUWinF4vW9twuDzu3ubJ9jZ5sPP7vHxvNydvUW21GIqA2zspuL0pnKRMq8laQYqqnaSY4Hrm9X8B31dpCgWAfRFdwjrrVytHfdH8cVhAxoQNVvm+MaCRvvxGJZK4kZRazU2eakQTFWlbzU0FaeoaMB3O/knjh480Jhi672l+9WMwd/FpEGWLUUkIvrsezSThNUfkKMuLh++/nzC3pYfBcWtADB60kBS3GxhGC5ISQhm4IeljPlwzmYmVxAYesLhCFj4QERrt6xKEHiwsxDsz70CPYMsQz4F5of1h5NEACBgA5GKRhbDxBgN9X3Dmiw5GGt4WNZYzueLyLilk5JZ4qqSejRXe680Kr2uWK42UODrlnUsgG60YCd2g92kMn779Id+HGncJhB8bSDg92c3bdw11tY3bKJy55zrWlRBVuCA/0g3kLdLxe5Lwfd4qLtk9ZFl/w0n3zbLLZTEPXEzbPfViKhS5uixmk9eBK1tjn5w1RbGOjegmPm4igvO7cWkUfl2qdyMZtOrNvmrMmq/aKq1WakG3OcWZKtJdRYAR9vjm8tWNKq3cX5M+zN4YuBwPLghg4F4n5A9b/40smN69AgOc7zYxEgtJwdq7BhX1G/x7B7fZgWbLPXpyXy/e9H79Ti9/l3uYEjPsahdn0Yb0hNsgHsqzU1g1/8XOATZT5i+ccsQhvDIHm3QnxBbq71mSYpN2u5k5eoJzCnZ1aARt+2NrmZjD7Reux6Uky8pOyPwPw9mXAe1FemQu4wvcz2qxj75fhjoW0duGddUS+9hvJKOvGsmonizLqa7HRja6nEpsKbYOH3NjBUCkdjydNUf+65LZGnIaWmj7GLtff5TuqRI6a0EqyNPsxxBk8MXiDif7sigRM85oJRR9MaPnrlR0GL9xgwNCYJ3GovLlkVqvg0IKqJdilobEsLm6TaJBEu5WhOLa5IJZzryfZtJgl6EjEYDHhYGbEk9zGXXASAL+YArvAa1m962teOQDJLFSxBb9jNj/SsiFbjaENIUjlRCEV0jdZE0JZfX0gEhiqQUYH7eG9tdutfm4LtDyZo5XBeAMrTleMBPHJ5AzP8AA39O222hkEUebD6UIKk+sRk8wjyBjbJmR/64kxpZek6strvSwIaE+aRRVCXYDyKEdwqgU9NSDXcHN1Q1q2KRc2uD426ZJCLfbJD383mqBl0cXX3iTV10QK+q3OanyCMpK2SiLHi4hh/fN9QovZRP2l00wTQlny8omDKlhih7N2v5fltCgNlWy28u3+YnFLI4iKTg+57OS2QxkfWQGi1kE4MCi25U29GLr7HFlt3WvrS4GW7tSZiuoClrSVPH29MkRtK7ROuA4fPHs7u1vDLQgrusZrm76Ho0rKduIWJqL5HHbMU3HcoDJlZIavBOcLHwo38SWgO5fn2FWxOViTWwWbInRff0Qresx4NX59GAt+P/Hksjf5B880BIhBYZf+AjxQ87hFgNhvWcObiyX5ibQ4vC7m7vP//j4+U6r3VSSuKGUy7HnUokwgpH/ugDxt+SxF3IB8G3e+Ic++Ta3RkfrMN4VZue/iyTVEH2bpbswf0F8FkKkjJxzW+xL+QVMnB0pzhzvtsvyROLS1eFjFBY3zMOryQ5W3B5B2xXNGAm2z+twmSbrZLdVmcdjJmP02acJwEWx6QhNAL5glJfcxfyRWEiTKeXhDx//8UEqJ5N7IZ5In7N8iWli0CxOwp/bUnVF5BPHsCAe/Q2uvI2/xC4SLH7kl+bB8ynMHWX5qUVVMrLZ/B8iQBis8MnmO0SLjKArPv2e9/n85dv920/Nb7TbksrgefpWbmzBfBAxKs1Bd/z05d/Nt2NfOZ+GJQxi/Or
dbndz9/0Hxrz37798+9F84xXiOgKKiF6CdF2rtSq9ITooks3JmbE3qRfpiWFOXHb16QeRqK6NphyNBBH/IiZizxIJxXB9hABMxe40F0t7EYh9LlJEouFYshwNczDbhSMzXTRYKHEpwhhSQ1OL4klsTlSM1TXLQIqiZRi24egWcEi0DjmlmHJdMy3TRoIScDwTmBbwuSXptxA/q0MzRFHznfp9OU8N29O4sp9AtzRahqKu4BqmBkyRAvpQcZfP/0rtt58//ohu7tazP+wvnz/8dS2Ga3z/8RbDlP7h7tvdccYHPtLeBpi6DwjTujF0vbyCqmSyFg0NKyJZtz2L5OByzswiyWyLhoQ/+wjQl66OKGC/1tXxNN+sRbr73rQWSuaAuMCoAKNH0YCvmbZe+2F5FQCgUUtAnQxMXwP+iIAKxB31A9FXOXJoW3icRYWflUQBSeSTpiR2iWvGd+e668pWmEtwPxJAga9RTxwVcGyRMWVBQn3UjpCvhli65XOSrnA+8MDrQZOAJevhwsCBupTjaIJwPxvaBNfDENbjE8KMgRcDbWBG2LAYi8XcdxzZYsydmWP3xRwmF/J/+pUQ8zdu4FYSAnG3WCSpGFXU9xJ5IWxaIsQuM/kSzTzbsnvaXoDpTm2JxKyMr2UrNH6Zft/BnSTAcRp+J7l36bjlApYSvMliAYZbscm12z5Fuc2DOqFVviLDrDc/A7hj11W35mecg6ebW4l2hay7lfZIfP17kfgtw+K1F+XwPGHzUewK28FTJJ+hybU6PiO2sADLFUSnOwOuGMi3aiItw+6LL4SxRucMBcPBS+WMKkjA81kS162DSbwrn52ENYyBWMNuEGDPbsMwRCPKq2GLozcM13E5djqTHcMchi0s29Ysr8l0bPGWwPNhEllZ4lfHJMDjavsXaoMSq9TD0zq3djqUS/aajdt7L1sDcYm1l0sUK/RLxuWsKqWDeywuES2Or5BLTN99EVyiWoFrMC5x9nJJZ4HL4kKlrbH3kjPrAjwMlzi54nG4yDUhDjFOzSH7pS3FNn/trbgErX9oDrm00ESE7gHz7DlEtdLtifaQzvqIsIco9mfqjUNE58criXzwKDjQBHhJBbBRPYeGmBj/asIePHdqiyF6P15HzINvWHxE5ekXQwypfOVhD747wVUS3SKXyIeqXJHdjnCjhj2YF3P9MbIx0vaPk44HtuDnNR37F4Q9y9N8p0kQ9vikF2VBGI/rsX5dl9c7B5aFzYtt/hiOsJigB1zEzn09XOEPxBX+6bniYos/hiucV8wVPt4r3P65Ao/rOafliovt/Riu8EiNs3q4g/l6uMIfiCv803PFxd5+tjrFMEKRazYRun+EUORr1D5HuYY3wA9N6JecgQuhM/KIa/VP6L6FCN07LaGLTolXSOjANhhC14DZ1uJzIsSuUtd6oBBObNEZBP6t08P/JTcGw7/BlSEExoUrVKT/QfYK/+R7hXWg/RStSZQ9fyMTmqynWauzYoM6rwCWVwpuqhhFZxiFZZK2EM6TiTGGaVn1xqQ+l2XVNcalqvXbNNLQdClaMN/HEmd+S4NSWlSs9/6kym1HWSrYw4ONbshrXC6J6zhbzM7xHUqBbNheG0rIX/nMTHFWfxij3BjlFHAisH3XiroImGytVkPItvlx0bcA+9iRxmS7SijVG/GJFq/bZL3dxfKWPmcPLw1SUTH8cO2PLUc27AjgcmaGnqHAZTowopplcSiMoHEZGKEhx2MBiWhoqboWvEQkafCuvFwkkdkMuKrD9+geUYgLO3+DD9GWvMkhzVWUKqEr94hp6sBCByrLG98HaI7R/6TWOx47Wl+v4CohCxohIHyqvYasKLKswcyesshDv88V1zmGFKPP286s6Aod/D7RQc+oy5uf5gW146bXYJ56gS4y9F/LKFy2PO6p5jmFmGpIdfCiynmBRGXx+scgigm2cO82xfkn/YPyTgbF2yyiGLaRitrcj048eQ31/G3y1gLaRIlob53z8wCbEZ//oNYML+vVL88/9vMf1EKv/9f7iOW+TZo8RnPS6StaL3A+Fe0SMUt2GdsupPclk/cI6aOrQ/+zlWN9Fq0gUhTRVsC25onnJOGG2ae5DbrDLMpbcpzh/OT/YmWtZZZur4r+JC99uiIixpUzstvM8/57C9KAdx0+t8kTynNwpdJUhdOg++zASHP2mvPy9mXzNWrW6rlF11znsrIpY71lH61yytRT500pvSUXWbIuYxM2mwl+v14c5O5BxcDYFn0dkvBVbXQKsa/FarWW/8qNGr0b/a4NYNVNdbrPEXjXnudcG+yyPsVYnsjJVY5kiO6M6uWpc8VxbjEFXpGUphiVVwQDOTC0jvzBFy9W9Nj1xR626Kg/WEDxWgSUGT3wjvRX1H9Hp5Wt4StFiJo0cRu7PUrSTFFwqS7+hnusUBsRYyCqzgmw6LSGuBF3QEikJjFWZ6UwDOKQ9jVOYYbjabBqwwNKHEebLebUFqmHwQLBqcRspkwTNq4nEfmRyUOmY/rmvB+ZBwCuRB3QJY3ZDEMi9YDBGgrZl0iEPZEIJehzUYPKNe1qkF9tAIOLQp6qLJSjV/8Rg4Zpc5SuWI1RUm2I+DibPKB4YNYFqrSL9Ib+F1e7CvccGE5IuQechnusk3MPt0/4nft6A5MJHwDe/nGH5pZmdzIm0qOMObk1a71bzRrEH9E7WrQnJr680keKowMQC8X4z1WA/WNtYxFXE352/JD7BK/8HRusO5XEQ/v3hXGym7dLQC2mnrmn6655dUC7xbe2rlsVSx3Z68jTfLaIjCkpIlOqA4zMM1gZGbu5n/zRdEhF5EaCqOhL7h6uzkw2sOySfqEvKX2ZFouUsgpFcuIaTqCW2Ut6Iq6iRXnZllxPg/VDM42JX6C1Twu8K0JdLlQlK1XG7b/uyalK2j+7J6paBU91ikBEJhJEHpgzn8H1A+kdj31CkcQMwKOa8VTuoyWQLRNJGbwr48Y07ve7d09Cio7lOp57CCmawDJtaxiAk1GirAQb4KsoKFAi+pgmmDIqmQ9N1vI+meOozbv/Bw==</diagram></mxfile> \ No newline at end of file
diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c
new file mode 100644
index 00000000..9005b81c
--- /dev/null
+++ b/database/engine/journalfile.c
@@ -0,0 +1,1569 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#include "rrdengine.h"
+
+static void after_extent_write_journalfile_v1_io(uv_fs_t* req)
+{
+ worker_is_busy(RRDENG_FLUSH_TRANSACTION_BUFFER_CB);
+
+ WAL *wal = req->data;
+ struct generic_io_descriptor *io_descr = &wal->io_descr;
+ struct rrdengine_instance *ctx = io_descr->ctx;
+
+    if (req->result < 0) {
+        ctx_io_error(ctx);
+        netdata_log_error("DBENGINE: %s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
+    } else {
+        netdata_log_debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__);
+    }
+
+ uv_fs_req_cleanup(req);
+ wal_release(wal);
+
+ __atomic_sub_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED);
+
+ worker_is_idle();
+}
+
+/* Careful to always call this before creating a new journal file */
+void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, WAL *wal, uv_loop_t *loop)
+{
+ int ret;
+ struct generic_io_descriptor *io_descr;
+ struct rrdengine_journalfile *journalfile = datafile->journalfile;
+
+ io_descr = &wal->io_descr;
+ io_descr->ctx = ctx;
+ if (wal->size < wal->buf_size) {
+ /* simulate an empty transaction to skip the rest of the block */
+ *(uint8_t *) (wal->buf + wal->size) = STORE_PADDING;
+ }
+ io_descr->buf = wal->buf;
+ io_descr->bytes = wal->buf_size;
+
+ spinlock_lock(&journalfile->unsafe.spinlock);
+ io_descr->pos = journalfile->unsafe.pos;
+ journalfile->unsafe.pos += wal->buf_size;
+ spinlock_unlock(&journalfile->unsafe.spinlock);
+
+ io_descr->req.data = wal;
+ io_descr->data = journalfile;
+ io_descr->completion = NULL;
+
+ io_descr->iov = uv_buf_init((void *)io_descr->buf, wal->buf_size);
+ ret = uv_fs_write(loop, &io_descr->req, journalfile->file, &io_descr->iov, 1,
+ (int64_t)io_descr->pos, after_extent_write_journalfile_v1_io);
+ fatal_assert(-1 != ret);
+
+ ctx_current_disk_space_increase(ctx, wal->buf_size);
+ ctx_io_write_op_bytes(ctx, wal->buf_size);
+}
+
+void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen)
+{
+ (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION_V2,
+ datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno);
+}
+
+void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen)
+{
+ (void) snprintfz(str, maxlen - 1, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION,
+ datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno);
+}
+
+// ----------------------------------------------------------------------------
+
+struct rrdengine_datafile *njfv2idx_find_and_acquire_j2_header(NJFV2IDX_FIND_STATE *s) {
+ struct rrdengine_datafile *datafile = NULL;
+
+ rw_spinlock_read_lock(&s->ctx->njfv2idx.spinlock);
+
+ Pvoid_t *PValue = NULL;
+
+ if(unlikely(!s->init)) {
+ s->init = true;
+ s->last = s->wanted_start_time_s;
+
+ PValue = JudyLPrev(s->ctx->njfv2idx.JudyL, &s->last, PJE0);
+ if (unlikely(PValue == PJERR))
+ fatal("DBENGINE: NJFV2IDX corrupted judy array");
+
+ if(!PValue) {
+ s->last = 0;
+ PValue = JudyLFirst(s->ctx->njfv2idx.JudyL, &s->last, PJE0);
+ if (unlikely(PValue == PJERR))
+ fatal("DBENGINE: NJFV2IDX corrupted judy array");
+
+ if(!PValue)
+ s->last = s->wanted_start_time_s;
+ }
+ }
+
+ while(1) {
+ if (likely(!PValue)) {
+ PValue = JudyLNext(s->ctx->njfv2idx.JudyL, &s->last, PJE0);
+ if (unlikely(PValue == PJERR))
+ fatal("DBENGINE: NJFV2IDX corrupted judy array");
+
+ if(!PValue) {
+ // cannot find anything after that point
+ datafile = NULL;
+ break;
+ }
+ }
+
+ datafile = *PValue;
+ TIME_RANGE_COMPARE rc = is_page_in_time_range(datafile->journalfile->v2.first_time_s,
+ datafile->journalfile->v2.last_time_s,
+ s->wanted_start_time_s,
+ s->wanted_end_time_s);
+
+ if(rc == PAGE_IS_IN_RANGE) {
+ // this is good to return
+ break;
+ }
+ else if(rc == PAGE_IS_IN_THE_PAST) {
+ // continue to get the next
+ datafile = NULL;
+ PValue = NULL;
+ continue;
+ }
+ else /* PAGE_IS_IN_THE_FUTURE */ {
+ // we finished - no more datafiles
+ datafile = NULL;
+ PValue = NULL;
+ break;
+ }
+ }
+
+ if(datafile)
+ s->j2_header_acquired = journalfile_v2_data_acquire(datafile->journalfile, NULL,
+ s->wanted_start_time_s,
+ s->wanted_end_time_s);
+ else
+ s->j2_header_acquired = NULL;
+
+ rw_spinlock_read_unlock(&s->ctx->njfv2idx.spinlock);
+
+ return datafile;
+}
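+
+// Editor's note: a hedged usage sketch, not part of the original source, of the index lookup
+// above. Only the NJFV2IDX_FIND_STATE fields the function itself touches are initialized; ctx,
+// after_s and before_s are hypothetical inputs.
+//
+//     NJFV2IDX_FIND_STATE s = {
+//         .ctx = ctx,
+//         .wanted_start_time_s = after_s,
+//         .wanted_end_time_s = before_s,
+//     };
+//
+//     for(struct rrdengine_datafile *df = njfv2idx_find_and_acquire_j2_header(&s);
+//         df ;
+//         df = njfv2idx_find_and_acquire_j2_header(&s)) {
+//         struct journal_v2_header *j2 = s.j2_header_acquired;
+//         // ... scan j2 for pages overlapping [after_s, before_s] ...
+//         journalfile_v2_data_release(df->journalfile);
+//     }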
+
+static void njfv2idx_add(struct rrdengine_datafile *datafile) {
+    internal_fatal(datafile->journalfile->v2.last_time_s <= 0, "DBENGINE: NJFV2IDX trying to index a journal file with invalid last_time_s");
+
+ rw_spinlock_write_lock(&datafile->ctx->njfv2idx.spinlock);
+ datafile->journalfile->njfv2idx.indexed_as = datafile->journalfile->v2.last_time_s;
+
+ do {
+        internal_fatal(datafile->journalfile->njfv2idx.indexed_as <= 0, "DBENGINE: NJFV2IDX journalfile has an invalid (non-positive) index timestamp");
+
+ Pvoid_t *PValue = JudyLIns(&datafile->ctx->njfv2idx.JudyL, datafile->journalfile->njfv2idx.indexed_as, PJE0);
+ if (!PValue || PValue == PJERR)
+ fatal("DBENGINE: NJFV2IDX corrupted judy array");
+
+ if (unlikely(*PValue)) {
+ // already there
+ datafile->journalfile->njfv2idx.indexed_as++;
+ }
+ else {
+ *PValue = datafile;
+ break;
+ }
+ } while(1);
+
+ rw_spinlock_write_unlock(&datafile->ctx->njfv2idx.spinlock);
+}
+
+static void njfv2idx_remove(struct rrdengine_datafile *datafile) {
+ internal_fatal(!datafile->journalfile->njfv2idx.indexed_as, "DBENGINE: NJFV2IDX journalfile to remove is not indexed");
+
+ rw_spinlock_write_lock(&datafile->ctx->njfv2idx.spinlock);
+
+ int rc = JudyLDel(&datafile->ctx->njfv2idx.JudyL, datafile->journalfile->njfv2idx.indexed_as, PJE0);
+ (void)rc;
+ internal_fatal(!rc, "DBENGINE: NJFV2IDX cannot remove entry");
+
+ datafile->journalfile->njfv2idx.indexed_as = 0;
+
+ rw_spinlock_write_unlock(&datafile->ctx->njfv2idx.spinlock);
+}
+
+// ----------------------------------------------------------------------------
+
+static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengine_journalfile *journalfile, size_t *data_size) {
+ struct journal_v2_header *j2_header = NULL;
+
+ spinlock_lock(&journalfile->mmap.spinlock);
+
+ if(!journalfile->mmap.data) {
+ journalfile->mmap.data = mmap(NULL, journalfile->mmap.size, PROT_READ, MAP_SHARED, journalfile->mmap.fd, 0);
+ if (journalfile->mmap.data == MAP_FAILED) {
+ internal_fatal(true, "DBENGINE: failed to re-mmap() journal file v2");
+ close(journalfile->mmap.fd);
+ journalfile->mmap.fd = -1;
+ journalfile->mmap.data = NULL;
+ journalfile->mmap.size = 0;
+
+ spinlock_lock(&journalfile->v2.spinlock);
+ journalfile->v2.flags &= ~(JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED);
+ spinlock_unlock(&journalfile->v2.spinlock);
+
+ ctx_fs_error(journalfile->datafile->ctx);
+ }
+ else {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_mapped, 1, __ATOMIC_RELAXED);
+
+ madvise_dontfork(journalfile->mmap.data, journalfile->mmap.size);
+ madvise_dontdump(journalfile->mmap.data, journalfile->mmap.size);
+
+ spinlock_lock(&journalfile->v2.spinlock);
+ journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED;
+ JOURNALFILE_FLAGS flags = journalfile->v2.flags;
+ spinlock_unlock(&journalfile->v2.spinlock);
+
+ if(flags & JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION) {
+ // we need the entire metrics directory into memory to process it
+ madvise_willneed(journalfile->mmap.data, journalfile->v2.size_of_directory);
+ }
+ else {
+ // let the kernel know that we don't want read-ahead on this file
+ madvise_random(journalfile->mmap.data, journalfile->mmap.size);
+ // madvise_dontneed(journalfile->mmap.data, journalfile->mmap.size);
+ }
+ }
+ }
+
+ if(journalfile->mmap.data) {
+ j2_header = journalfile->mmap.data;
+
+ if (data_size)
+ *data_size = journalfile->mmap.size;
+ }
+
+ spinlock_unlock(&journalfile->mmap.spinlock);
+
+ return j2_header;
+}
+
+static bool journalfile_v2_mounted_data_unmount(struct rrdengine_journalfile *journalfile, bool have_locks, bool wait) {
+ bool unmounted = false;
+
+ if(!have_locks) {
+ if(!wait) {
+ if (!spinlock_trylock(&journalfile->mmap.spinlock))
+ return false;
+ }
+ else
+ spinlock_lock(&journalfile->mmap.spinlock);
+
+ if(!wait) {
+ if(!spinlock_trylock(&journalfile->v2.spinlock)) {
+ spinlock_unlock(&journalfile->mmap.spinlock);
+ return false;
+ }
+ }
+ else
+ spinlock_lock(&journalfile->v2.spinlock);
+ }
+
+ if(!journalfile->v2.refcount) {
+ if(journalfile->mmap.data) {
+ if (munmap(journalfile->mmap.data, journalfile->mmap.size)) {
+ char path[RRDENG_PATH_MAX];
+ journalfile_v2_generate_path(journalfile->datafile, path, sizeof(path));
+ netdata_log_error("DBENGINE: failed to unmap index file '%s'", path);
+ internal_fatal(true, "DBENGINE: failed to unmap file '%s'", path);
+ ctx_fs_error(journalfile->datafile->ctx);
+ }
+ else {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_unmapped, 1, __ATOMIC_RELAXED);
+ journalfile->mmap.data = NULL;
+ journalfile->v2.flags &= ~JOURNALFILE_FLAG_IS_MOUNTED;
+ }
+ }
+
+ unmounted = true;
+ }
+
+ if(!have_locks) {
+ spinlock_unlock(&journalfile->v2.spinlock);
+ spinlock_unlock(&journalfile->mmap.spinlock);
+ }
+
+ return unmounted;
+}
+
+void journalfile_v2_data_unmount_cleanup(time_t now_s) {
+ // DO NOT WAIT ON ANY LOCK!!!
+
+ for(size_t tier = 0; tier < (size_t)storage_tiers ;tier++) {
+ struct rrdengine_instance *ctx = multidb_ctx[tier];
+ if(!ctx) continue;
+
+ struct rrdengine_datafile *datafile;
+ if(uv_rwlock_tryrdlock(&ctx->datafiles.rwlock) != 0)
+ continue;
+
+ for (datafile = ctx->datafiles.first; datafile; datafile = datafile->next) {
+ struct rrdengine_journalfile *journalfile = datafile->journalfile;
+
+ if(!spinlock_trylock(&journalfile->v2.spinlock))
+ continue;
+
+ bool unmount = false;
+ if (!journalfile->v2.refcount && (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED)) {
+ // this journal has no references and it is mounted
+
+ if (!journalfile->v2.not_needed_since_s)
+ journalfile->v2.not_needed_since_s = now_s;
+
+ else if (now_s - journalfile->v2.not_needed_since_s >= 120)
+ // 2 minutes have passed since last use
+ unmount = true;
+ }
+ spinlock_unlock(&journalfile->v2.spinlock);
+
+ if (unmount)
+ journalfile_v2_mounted_data_unmount(journalfile, false, false);
+ }
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+ }
+}
+
+struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s) {
+ spinlock_lock(&journalfile->v2.spinlock);
+
+ bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE);
+ bool is_mounted = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED);
+ bool do_we_need_it = false;
+
+ if(has_data) {
+ if (!wanted_first_time_s || !wanted_last_time_s ||
+ is_page_in_time_range(journalfile->v2.first_time_s, journalfile->v2.last_time_s,
+ wanted_first_time_s, wanted_last_time_s) == PAGE_IS_IN_RANGE) {
+
+ journalfile->v2.refcount++;
+
+ do_we_need_it = true;
+
+ if (!wanted_first_time_s && !wanted_last_time_s && !is_mounted)
+ journalfile->v2.flags |= JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION;
+ else
+ journalfile->v2.flags &= ~JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION;
+
+ }
+ }
+ spinlock_unlock(&journalfile->v2.spinlock);
+
+ if(do_we_need_it)
+ return journalfile_v2_mounted_data_get(journalfile, data_size);
+
+ return NULL;
+}
+
+void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile) {
+ spinlock_lock(&journalfile->v2.spinlock);
+
+ internal_fatal(!journalfile->mmap.data, "trying to release a journalfile without data");
+ internal_fatal(journalfile->v2.refcount < 1, "trying to release a non-acquired journalfile");
+
+ bool unmount = false;
+
+ journalfile->v2.refcount--;
+
+ if(journalfile->v2.refcount == 0) {
+ journalfile->v2.not_needed_since_s = 0;
+
+ if(journalfile->v2.flags & JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION)
+ unmount = true;
+ }
+ spinlock_unlock(&journalfile->v2.spinlock);
+
+ if(unmount)
+ journalfile_v2_mounted_data_unmount(journalfile, false, true);
+}
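+
+// Editor's note: a minimal pairing sketch, not in the original source. journalfile_v2_data_acquire()
+// returns the mmap()ed v2 header, or NULL when the journal has no v2 data or does not cover the
+// wanted time range; every non-NULL result must be handed back via journalfile_v2_data_release().
+//
+//     size_t size = 0;
+//     struct journal_v2_header *j2 =
+//         journalfile_v2_data_acquire(journalfile, &size, after_s, before_s);
+//     if(j2) {
+//         // ... read the extent/metric/page directories inside the mapped area ...
+//         journalfile_v2_data_release(journalfile);
+//     }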
+
+bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile) {
+
+ spinlock_lock(&journalfile->v2.spinlock);
+ bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE);
+ spinlock_unlock(&journalfile->v2.spinlock);
+
+ return has_data;
+}
+
+size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile) {
+
+ spinlock_lock(&journalfile->mmap.spinlock);
+ size_t data_size = journalfile->mmap.size;
+ spinlock_unlock(&journalfile->mmap.spinlock);
+
+ return data_size;
+}
+
+void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size) {
+ spinlock_lock(&journalfile->mmap.spinlock);
+ spinlock_lock(&journalfile->v2.spinlock);
+
+ internal_fatal(journalfile->mmap.fd != -1, "DBENGINE JOURNALFILE: trying to re-set journal fd");
+ internal_fatal(journalfile->mmap.data, "DBENGINE JOURNALFILE: trying to re-set journal_data");
+ internal_fatal(journalfile->v2.refcount, "DBENGINE JOURNALFILE: trying to re-set journal_data of referenced journalfile");
+
+ journalfile->mmap.fd = fd;
+ journalfile->mmap.data = journal_data;
+ journalfile->mmap.size = journal_data_size;
+ journalfile->v2.not_needed_since_s = now_monotonic_sec();
+ journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED;
+
+ struct journal_v2_header *j2_header = journalfile->mmap.data;
+ journalfile->v2.first_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC);
+ journalfile->v2.last_time_s = (time_t)(j2_header->end_time_ut / USEC_PER_SEC);
+ journalfile->v2.size_of_directory = j2_header->metric_offset + j2_header->metric_count * sizeof(struct journal_metric_list);
+
+ journalfile_v2_mounted_data_unmount(journalfile, true, true);
+
+ spinlock_unlock(&journalfile->v2.spinlock);
+ spinlock_unlock(&journalfile->mmap.spinlock);
+
+ njfv2idx_add(journalfile->datafile);
+}
+
+static void journalfile_v2_data_unmap_permanently(struct rrdengine_journalfile *journalfile) {
+ njfv2idx_remove(journalfile->datafile);
+
+ bool has_references = false;
+
+ do {
+ if (has_references)
+ sleep_usec(10 * USEC_PER_MS);
+
+ spinlock_lock(&journalfile->mmap.spinlock);
+ spinlock_lock(&journalfile->v2.spinlock);
+
+ if(journalfile_v2_mounted_data_unmount(journalfile, true, true)) {
+ if(journalfile->mmap.fd != -1)
+ close(journalfile->mmap.fd);
+
+ journalfile->mmap.fd = -1;
+ journalfile->mmap.data = NULL;
+ journalfile->mmap.size = 0;
+ journalfile->v2.first_time_s = 0;
+ journalfile->v2.last_time_s = 0;
+ journalfile->v2.flags = 0;
+ }
+ else {
+ has_references = true;
+ internal_error(true, "DBENGINE JOURNALFILE: waiting for journalfile to be available to unmap...");
+ }
+
+ spinlock_unlock(&journalfile->v2.spinlock);
+ spinlock_unlock(&journalfile->mmap.spinlock);
+
+ } while(has_references);
+}
+
+struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_journalfile *journalfile = callocz(1, sizeof(struct rrdengine_journalfile));
+ journalfile->datafile = datafile;
+ spinlock_init(&journalfile->mmap.spinlock);
+ spinlock_init(&journalfile->v2.spinlock);
+ spinlock_init(&journalfile->unsafe.spinlock);
+ journalfile->mmap.fd = -1;
+ datafile->journalfile = journalfile;
+ return journalfile;
+}
+
+static int close_uv_file(struct rrdengine_datafile *datafile, uv_file file)
+{
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ uv_fs_t req;
+ ret = uv_fs_close(NULL, &req, file, NULL);
+ if (ret < 0) {
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+ netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(datafile->ctx);
+ }
+ uv_fs_req_cleanup(&req);
+ return ret;
+}
+
+int journalfile_close(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile)
+{
+ if(journalfile_v2_data_available(journalfile)) {
+ journalfile_v2_data_unmap_permanently(journalfile);
+ return 0;
+ }
+
+ return close_uv_file(datafile, journalfile->file);
+}
+
+int journalfile_unlink(struct rrdengine_journalfile *journalfile)
+{
+ struct rrdengine_datafile *datafile = journalfile->datafile;
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+
+ ret = uv_fs_unlink(NULL, &req, path, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ __atomic_add_fetch(&ctx->stats.journalfile_deletions, 1, __ATOMIC_RELAXED);
+
+ return ret;
+}
+
+int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+ char path_v2[RRDENG_PATH_MAX];
+
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+    journalfile_v2_generate_path(datafile, path_v2, sizeof(path_v2));
+
+ if (journalfile->file) {
+ ret = uv_fs_ftruncate(NULL, &req, journalfile->file, 0, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+ (void) close_uv_file(datafile, journalfile->file);
+ }
+
+ // This is the new journal v2 index file
+ ret = uv_fs_unlink(NULL, &req, path_v2, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ ret = uv_fs_unlink(NULL, &req, path, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+
+ __atomic_add_fetch(&ctx->stats.journalfile_deletions, 2, __ATOMIC_RELAXED);
+
+ if(journalfile_v2_data_available(journalfile))
+ journalfile_v2_data_unmap_permanently(journalfile);
+
+ return ret;
+}
+
+int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile)
+{
+ struct rrdengine_instance *ctx = datafile->ctx;
+ uv_fs_t req;
+ uv_file file;
+ int ret, fd;
+ struct rrdeng_jf_sb *superblock;
+ uv_buf_t iov;
+ char path[RRDENG_PATH_MAX];
+
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+ fd = open_file_for_io(path, O_CREAT | O_RDWR | O_TRUNC, &file, use_direct_io);
+ if (fd < 0) {
+ ctx_fs_error(ctx);
+ return fd;
+ }
+ journalfile->file = file;
+ __atomic_add_fetch(&ctx->stats.journalfile_creations, 1, __ATOMIC_RELAXED);
+
+ ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock));
+ if (unlikely(ret)) {
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+ }
+ memset(superblock, 0, sizeof(*superblock));
+ (void) strncpy(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ);
+ (void) strncpy(superblock->version, RRDENG_JF_VER, RRDENG_VER_SZ);
+
+ iov = uv_buf_init((void *)superblock, sizeof(*superblock));
+
+ ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL);
+ if (ret < 0) {
+ fatal_assert(req.result < 0);
+ netdata_log_error("DBENGINE: uv_fs_write: %s", uv_strerror(ret));
+ ctx_io_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+ posix_memfree(superblock);
+ if (ret < 0) {
+ journalfile_destroy_unsafe(journalfile, datafile);
+ return ret;
+ }
+
+ journalfile->unsafe.pos = sizeof(*superblock);
+
+ ctx_io_write_op_bytes(ctx, sizeof(*superblock));
+
+ return 0;
+}
+
+static int journalfile_check_superblock(uv_file file)
+{
+ int ret;
+ struct rrdeng_jf_sb *superblock;
+ uv_buf_t iov;
+ uv_fs_t req;
+
+ ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock));
+ if (unlikely(ret)) {
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+ }
+ iov = uv_buf_init((void *)superblock, sizeof(*superblock));
+
+ ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_read: %s", uv_strerror(ret));
+ uv_fs_req_cleanup(&req);
+ goto error;
+ }
+ fatal_assert(req.result >= 0);
+ uv_fs_req_cleanup(&req);
+
+ if (strncmp(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ) ||
+ strncmp(superblock->version, RRDENG_JF_VER, RRDENG_VER_SZ)) {
+ netdata_log_error("DBENGINE: File has invalid superblock.");
+ ret = UV_EINVAL;
+ } else {
+ ret = 0;
+ }
+ error:
+ posix_memfree(superblock);
+ return ret;
+}
+
+static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, void *buf, unsigned max_size)
+{
+ static BITMAP256 page_error_map = BITMAP256_INITIALIZER;
+ unsigned i, count, payload_length, descr_size;
+ struct rrdeng_jf_store_data *jf_metric_data;
+
+ jf_metric_data = buf;
+ count = jf_metric_data->number_of_pages;
+ descr_size = sizeof(*jf_metric_data->descr) * count;
+ payload_length = sizeof(*jf_metric_data) + descr_size;
+ if (payload_length > max_size) {
+ netdata_log_error("DBENGINE: corrupted transaction payload.");
+ return;
+ }
+
+ time_t now_s = max_acceptable_collected_time();
+ for (i = 0; i < count ; ++i) {
+ uuid_t *temp_id;
+ uint8_t page_type = jf_metric_data->descr[i].type;
+
+ if (page_type > PAGE_TYPE_MAX) {
+ if (!bitmap256_get_bit(&page_error_map, page_type)) {
+ netdata_log_error("DBENGINE: unknown page type %d encountered.", page_type);
+ bitmap256_set_bit(&page_error_map, page_type, 1);
+ }
+ continue;
+ }
+
+ temp_id = (uuid_t *)jf_metric_data->descr[i].uuid;
+ METRIC *metric = mrg_metric_get_and_acquire(main_mrg, temp_id, (Word_t) ctx);
+
+ struct rrdeng_extent_page_descr *descr = &jf_metric_data->descr[i];
+ VALIDATED_PAGE_DESCRIPTOR vd = validate_extent_page_descr(
+ descr, now_s,
+ (metric) ? mrg_metric_get_update_every_s(main_mrg, metric) : 0,
+ false);
+
+ if(!vd.is_valid) {
+ if(metric)
+ mrg_metric_release(main_mrg, metric);
+
+ continue;
+ }
+
+ bool update_metric_time = true;
+ if (!metric) {
+ MRG_ENTRY entry = {
+ .uuid = temp_id,
+ .section = (Word_t)ctx,
+ .first_time_s = vd.start_time_s,
+ .last_time_s = vd.end_time_s,
+ .latest_update_every_s = (uint32_t) vd.update_every_s,
+ };
+
+ bool added;
+ metric = mrg_metric_add_and_acquire(main_mrg, entry, &added);
+ if(added)
+ update_metric_time = false;
+ }
+ Word_t metric_id = mrg_metric_id(main_mrg, metric);
+
+ if (update_metric_time)
+ mrg_metric_expand_retention(main_mrg, metric, vd.start_time_s, vd.end_time_s, vd.update_every_s);
+
+ pgc_open_add_hot_page(
+ (Word_t)ctx, metric_id, vd.start_time_s, vd.end_time_s, vd.update_every_s,
+ journalfile->datafile,
+ jf_metric_data->extent_offset, jf_metric_data->extent_size, jf_metric_data->descr[i].page_length);
+
+ mrg_metric_release(main_mrg, metric);
+ }
+}
+
+/*
+ * Replays transaction by interpreting up to max_size bytes from buf.
+ * Sets id to the current transaction id or to 0 if unknown.
+ * Returns size of transaction record or 0 for unknown size.
+ */
+static unsigned journalfile_replay_transaction(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile,
+ void *buf, uint64_t *id, unsigned max_size)
+{
+ unsigned payload_length, size_bytes;
+ int ret;
+ /* persistent structures */
+ struct rrdeng_jf_transaction_header *jf_header;
+ struct rrdeng_jf_transaction_trailer *jf_trailer;
+ uLong crc;
+
+ *id = 0;
+ jf_header = buf;
+ if (STORE_PADDING == jf_header->type) {
+ netdata_log_debug(D_RRDENGINE, "Skipping padding.");
+ return 0;
+ }
+ if (sizeof(*jf_header) > max_size) {
+ netdata_log_error("DBENGINE: corrupted transaction record, skipping.");
+ return 0;
+ }
+ *id = jf_header->id;
+ payload_length = jf_header->payload_length;
+ size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer);
+ if (size_bytes > max_size) {
+ netdata_log_error("DBENGINE: corrupted transaction record, skipping.");
+ return 0;
+ }
+ jf_trailer = buf + sizeof(*jf_header) + payload_length;
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, buf, sizeof(*jf_header) + payload_length);
+ ret = crc32cmp(jf_trailer->checksum, crc);
+ netdata_log_debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED");
+ if (unlikely(ret)) {
+ netdata_log_error("DBENGINE: transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id);
+ return size_bytes;
+ }
+ switch (jf_header->type) {
+ case STORE_DATA:
+ netdata_log_debug(D_RRDENGINE, "Replaying transaction %"PRIu64"", jf_header->id);
+ journalfile_restore_extent_metadata(ctx, journalfile, buf + sizeof(*jf_header), payload_length);
+ break;
+ default:
+ netdata_log_error("DBENGINE: unknown transaction type, skipping record.");
+ break;
+ }
+
+ return size_bytes;
+}
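+
+/*
+ * Illustrative summary of the replay logic above (not an authoritative format
+ * spec): a v1 journal transaction record is expected to be laid out as
+ *
+ *   struct rrdeng_jf_transaction_header   (type, id, payload_length)
+ *   uint8_t payload[payload_length]       (e.g. STORE_DATA extent metadata)
+ *   struct rrdeng_jf_transaction_trailer  (CRC32 over header + payload)
+ *
+ * so size_bytes = sizeof(header) + payload_length + sizeof(trailer), and a
+ * STORE_PADDING header type means the rest of the current block is skipped.
+ */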
+
+
+#define READAHEAD_BYTES (RRDENG_BLOCK_SIZE * 256)
+/*
+ * Iterates over the journal file transactions and populates the page cache.
+ * The page cache must already be initialized.
+ * Returns the maximum transaction id it discovered.
+ */
+static uint64_t journalfile_iterate_transactions(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile)
+{
+ uv_file file;
+ uint64_t file_size;
+ int ret;
+ uint64_t pos, pos_i, max_id, id;
+ unsigned size_bytes;
+ void *buf;
+ uv_buf_t iov;
+ uv_fs_t req;
+
+ file = journalfile->file;
+ file_size = journalfile->unsafe.pos;
+
+ max_id = 1;
+ ret = posix_memalign((void *)&buf, RRDFILE_ALIGNMENT, READAHEAD_BYTES);
+ if (unlikely(ret))
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+
+ for (pos = sizeof(struct rrdeng_jf_sb); pos < file_size; pos += READAHEAD_BYTES) {
+ size_bytes = MIN(READAHEAD_BYTES, file_size - pos);
+ iov = uv_buf_init(buf, size_bytes);
+ ret = uv_fs_read(NULL, &req, file, &iov, 1, pos, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret));
+ uv_fs_req_cleanup(&req);
+ goto skip_file;
+ }
+ fatal_assert(req.result >= 0);
+ uv_fs_req_cleanup(&req);
+ ctx_io_read_op_bytes(ctx, size_bytes);
+
+ for (pos_i = 0; pos_i < size_bytes;) {
+ unsigned max_size;
+
+ max_size = pos + size_bytes - pos_i;
+ ret = journalfile_replay_transaction(ctx, journalfile, buf + pos_i, &id, max_size);
+ if (!ret) /* TODO: support transactions bigger than 4K */
+ /* unknown transaction size, move on to the next block */
+ pos_i = ALIGN_BYTES_FLOOR(pos_i + RRDENG_BLOCK_SIZE);
+ else
+ pos_i += ret;
+ max_id = MAX(max_id, id);
+ }
+ }
+skip_file:
+ posix_memfree(buf);
+ return max_id;
+}
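+
+// Note: transactions are scanned in READAHEAD_BYTES chunks; when a record at
+// the current offset cannot be parsed, the scan skips forward to the next
+// RRDENG_BLOCK_SIZE boundary and continues from there.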
+
+// Checks that the extent list checksum is valid
+static int journalfile_check_v2_extent_list (void *data_start, size_t file_size)
+{
+ UNUSED(file_size);
+ uLong crc;
+
+ struct journal_v2_header *j2_header = (void *) data_start;
+ struct journal_v2_block_trailer *journal_v2_trailer;
+
+ journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + j2_header->extent_trailer_offset);
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (uint8_t *) data_start + j2_header->extent_offset, j2_header->extent_count * sizeof(struct journal_extent_list));
+ if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) {
+ netdata_log_error("DBENGINE: extent list CRC32 check: FAILED");
+ return 1;
+ }
+
+ return 0;
+}
+
+// Checks that the metric list (UUIDs) checksum is valid
+static int journalfile_check_v2_metric_list(void *data_start, size_t file_size)
+{
+ UNUSED(file_size);
+ uLong crc;
+
+ struct journal_v2_header *j2_header = (void *) data_start;
+ struct journal_v2_block_trailer *journal_v2_trailer;
+
+ journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + j2_header->metric_trailer_offset);
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (uint8_t *) data_start + j2_header->metric_offset, j2_header->metric_count * sizeof(struct journal_metric_list));
+ if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) {
+ netdata_log_error("DBENGINE: metric list CRC32 check: FAILED");
+ return 1;
+ }
+ return 0;
+}
+
+//
+// Returns:
+//   0  OK
+//   1  Invalid
+//   2  Force rebuild
+//   3  Skip
+
+static int journalfile_v2_validate(void *data_start, size_t journal_v2_file_size, size_t journal_v1_file_size)
+{
+ int rc;
+ uLong crc;
+
+ struct journal_v2_header *j2_header = (void *) data_start;
+ struct journal_v2_block_trailer *journal_v2_trailer;
+
+ if (j2_header->magic == JOURVAL_V2_REBUILD_MAGIC)
+ return 2;
+
+ if (j2_header->magic == JOURVAL_V2_SKIP_MAGIC)
+ return 3;
+
+ // Magic failure
+ if (j2_header->magic != JOURVAL_V2_MAGIC)
+ return 1;
+
+ if (j2_header->journal_v2_file_size != journal_v2_file_size)
+ return 1;
+
+ if (journal_v1_file_size && j2_header->journal_v1_file_size != journal_v1_file_size)
+ return 1;
+
+ journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + journal_v2_file_size - sizeof(*journal_v2_trailer));
+
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (void *) j2_header, sizeof(*j2_header));
+
+ rc = crc32cmp(journal_v2_trailer->checksum, crc);
+ if (unlikely(rc)) {
+ netdata_log_error("DBENGINE: file CRC32 check: FAILED");
+ return 1;
+ }
+
+ rc = journalfile_check_v2_extent_list(data_start, journal_v2_file_size);
+ if (rc) return 1;
+
+ if (!db_engine_journal_check)
+ return 0;
+
+ rc = journalfile_check_v2_metric_list(data_start, journal_v2_file_size);
+ if (rc) return 1;
+
+ // Verify complete UUID chain
+
+ struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset);
+
+ unsigned verified = 0;
+ unsigned entries;
+ unsigned total_pages = 0;
+
+ netdata_log_info("DBENGINE: checking %u metrics that exist in the journal", j2_header->metric_count);
+ for (entries = 0; entries < j2_header->metric_count; entries++) {
+
+ char uuid_str[UUID_STR_LEN];
+ uuid_unparse_lower(metric->uuid, uuid_str);
+ struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset);
+ struct journal_page_header local_metric_list_header = *metric_list_header;
+
+ local_metric_list_header.crc = JOURVAL_V2_MAGIC;
+
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (void *) &local_metric_list_header, sizeof(local_metric_list_header));
+ rc = crc32cmp(metric_list_header->checksum, crc);
+
+ if (!rc) {
+ struct journal_v2_block_trailer *journal_trailer =
+ (void *) data_start + metric->page_offset + sizeof(struct journal_page_header) + (metric_list_header->entries * sizeof(struct journal_page_list));
+
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (uint8_t *) metric_list_header + sizeof(struct journal_page_header), metric_list_header->entries * sizeof(struct journal_page_list));
+ rc = crc32cmp(journal_trailer->checksum, crc);
+ internal_error(rc, "DBENGINE: index %u : %s entries %u at offset %u verified, DATA CRC computed %lu, stored %u", entries, uuid_str, metric->entries, metric->page_offset,
+ crc, metric_list_header->crc);
+ if (!rc) {
+ total_pages += metric_list_header->entries;
+ verified++;
+ }
+ }
+
+ metric++;
+ if ((uint32_t)((uint8_t *) metric - (uint8_t *) data_start) > (uint32_t) journal_v2_file_size) {
+ netdata_log_info("DBENGINE: verification failed EOF reached -- total entries %u, verified %u", entries, verified);
+ return 1;
+ }
+ }
+
+ if (entries != verified) {
+ netdata_log_info("DBENGINE: verification failed -- total entries %u, verified %u", entries, verified);
+ return 1;
+ }
+ netdata_log_info("DBENGINE: verification succeeded -- total entries %u, verified %u (%u total pages)", entries, verified, total_pages);
+
+ return 0;
+}
+
+void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) {
+ usec_t started_ut = now_monotonic_usec();
+
+ size_t data_size = 0;
+ struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, &data_size, 0, 0);
+ if(!j2_header)
+ return;
+
+ uint8_t *data_start = (uint8_t *)j2_header;
+ uint32_t entries = j2_header->metric_count;
+
+ if (journalfile->v2.flags & JOURNALFILE_FLAG_METRIC_CRC_CHECK) {
+ journalfile->v2.flags &= ~JOURNALFILE_FLAG_METRIC_CRC_CHECK;
+ if (journalfile_check_v2_metric_list(data_start, j2_header->journal_v2_file_size)) {
+ journalfile->v2.flags &= ~JOURNALFILE_FLAG_IS_AVAILABLE;
+ // needs rebuild
+ return;
+ }
+ }
+
+ struct journal_metric_list *metric = (struct journal_metric_list *) (data_start + j2_header->metric_offset);
+ time_t header_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC);
+ time_t global_first_time_s = header_start_time_s;
+ time_t now_s = max_acceptable_collected_time();
+ for (size_t i=0; i < entries; i++) {
+ time_t start_time_s = header_start_time_s + metric->delta_start_s;
+ time_t end_time_s = header_start_time_s + metric->delta_end_s;
+
+ mrg_update_metric_retention_and_granularity_by_uuid(
+ main_mrg, (Word_t)ctx, &metric->uuid, start_time_s, end_time_s, (time_t) metric->update_every_s, now_s);
+
+ metric++;
+ }
+
+ journalfile_v2_data_release(journalfile);
+ usec_t ended_ut = now_monotonic_usec();
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms"
+ , ctx->config.tier, journalfile->datafile->fileno
+ , (double)data_size / 1024 / 1024
+ , (double)entries / 1000
+ , ((double)(ended_ut - started_ut) / USEC_PER_MS)
+ );
+
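+ // Lower ctx->atomic.first_time_s to this journal's oldest timestamp using a
+ // CAS loop, so concurrent loaders can only ever move it further back in time.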
+ time_t old = __atomic_load_n(&ctx->atomic.first_time_s, __ATOMIC_RELAXED);
+ do {
+ if(old <= global_first_time_s)
+ break;
+ } while(!__atomic_compare_exchange_n(&ctx->atomic.first_time_s, &old, global_first_time_s, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+}
+
+int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile)
+{
+ int ret, fd;
+ char path_v1[RRDENG_PATH_MAX];
+ char path_v2[RRDENG_PATH_MAX];
+ struct stat statbuf;
+ size_t journal_v1_file_size = 0;
+ size_t journal_v2_file_size;
+
+ journalfile_v1_generate_path(datafile, path_v1, sizeof(path_v1));
+ ret = stat(path_v1, &statbuf);
+ if (!ret)
+ journal_v1_file_size = (uint32_t)statbuf.st_size;
+
+ journalfile_v2_generate_path(datafile, path_v2, sizeof(path_v2));
+ fd = open(path_v2, O_RDONLY);
+ if (fd < 0) {
+ if (errno == ENOENT)
+ return 1;
+ ctx_fs_error(ctx);
+ netdata_log_error("DBENGINE: failed to open '%s'", path_v2);
+ return 1;
+ }
+
+ ret = fstat(fd, &statbuf);
+ if (ret) {
+ netdata_log_error("DBENGINE: failed to get file information for '%s'", path_v2);
+ close(fd);
+ return 1;
+ }
+
+ journal_v2_file_size = (size_t)statbuf.st_size;
+
+ if (journal_v2_file_size < sizeof(struct journal_v2_header)) {
+ error_report("Invalid file %s. Not the expected size", path_v2);
+ close(fd);
+ return 1;
+ }
+
+ usec_t mmap_start_ut = now_monotonic_usec();
+ uint8_t *data_start = mmap(NULL, journal_v2_file_size, PROT_READ, MAP_SHARED, fd, 0);
+ if (data_start == MAP_FAILED) {
+ close(fd);
+ return 1;
+ }
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: checking integrity of '%s'", path_v2);
+
+ usec_t validation_start_ut = now_monotonic_usec();
+ int rc = journalfile_v2_validate(data_start, journal_v2_file_size, journal_v1_file_size);
+ if (unlikely(rc)) {
+ if (rc == 2)
+ error_report("File %s needs to be rebuilt", path_v2);
+ else if (rc == 3)
+ error_report("File %s will be skipped", path_v2);
+ else
+ error_report("File %s is invalid and it will be rebuilt", path_v2);
+
+ if (unlikely(munmap(data_start, journal_v2_file_size)))
+ netdata_log_error("DBENGINE: failed to unmap '%s'", path_v2);
+
+ close(fd);
+ return rc;
+ }
+
+ struct journal_v2_header *j2_header = (void *) data_start;
+ uint32_t entries = j2_header->metric_count;
+
+ if (unlikely(!entries)) {
+ if (unlikely(munmap(data_start, journal_v2_file_size)))
+ netdata_log_error("DBENGINE: failed to unmap '%s'", path_v2);
+
+ close(fd);
+ return 1;
+ }
+
+ usec_t finished_ut = now_monotonic_usec();
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, "
+ "mmap: %0.2f ms, validate: %0.2f ms"
+ , path_v2
+ , (double)journal_v2_file_size / 1024 / 1024
+ , (double)entries / 1000
+ , ((double)(validation_start_ut - mmap_start_ut) / USEC_PER_MS)
+ , ((double)(finished_ut - validation_start_ut) / USEC_PER_MS)
+ );
+
+ // Initialize the journal file to be able to access the data
+
+ if (!db_engine_journal_check)
+ journalfile->v2.flags |= JOURNALFILE_FLAG_METRIC_CRC_CHECK;
+ journalfile_v2_data_set(journalfile, fd, data_start, journal_v2_file_size);
+
+ ctx_current_disk_space_increase(ctx, journal_v2_file_size);
+
+ // File is OK, load it
+ return 0;
+}
+
+struct journal_metric_list_to_sort {
+ struct jv2_metrics_info *metric_info;
+};
+
+static int journalfile_metric_compare (const void *item1, const void *item2)
+{
+ const struct jv2_metrics_info *metric1 = ((struct journal_metric_list_to_sort *) item1)->metric_info;
+ const struct jv2_metrics_info *metric2 = ((struct journal_metric_list_to_sort *) item2)->metric_info;
+
+ return memcmp(metric1->uuid, metric2->uuid, sizeof(uuid_t));
+}
+
+
+// Write list of extents for the journalfile
+void *journalfile_v2_write_extent_list(Pvoid_t JudyL_extents_pos, void *data)
+{
+ Pvoid_t *PValue;
+ struct journal_extent_list *j2_extent_base = (void *) data;
+ struct jv2_extents_info *ext_info;
+
+ bool first = true;
+ Word_t pos = 0;
+ size_t count = 0;
+ while ((PValue = JudyLFirstThenNext(JudyL_extents_pos, &pos, &first))) {
+ ext_info = *PValue;
+ size_t index = ext_info->index;
+ j2_extent_base[index].file_index = 0;
+ j2_extent_base[index].datafile_offset = ext_info->pos;
+ j2_extent_base[index].datafile_size = ext_info->bytes;
+ j2_extent_base[index].pages = ext_info->number_of_pages;
+ count++;
+ }
+ return j2_extent_base + count;
+}
+
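+// Returns non-zero when writing `bytes` at `data` would overrun the space
+// available before the final journal_v2_block_trailer of the file being built.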
+static int journalfile_verify_space(struct journal_v2_header *j2_header, void *data, uint32_t bytes)
+{
+ if ((unsigned long)(((uint8_t *) data - (uint8_t *) j2_header->data) + bytes) > (j2_header->journal_v2_file_size - sizeof(struct journal_v2_block_trailer)))
+ return 1;
+
+ return 0;
+}
+
+void *journalfile_v2_write_metric_page(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info, uint32_t pages_offset)
+{
+ struct journal_metric_list *metric = (void *) data;
+
+ if (journalfile_verify_space(j2_header, data, sizeof(*metric)))
+ return NULL;
+
+ uuid_copy(metric->uuid, *metric_info->uuid);
+ metric->entries = metric_info->number_of_pages;
+ metric->page_offset = pages_offset;
+ metric->delta_start_s = (uint32_t)(metric_info->first_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC));
+ metric->delta_end_s = (uint32_t)(metric_info->last_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC));
+ metric->update_every_s = 0;
+
+ return ++metric;
+}
+
+void *journalfile_v2_write_data_page_header(struct journal_v2_header *j2_header __maybe_unused, void *data, struct jv2_metrics_info *metric_info, uint32_t uuid_offset)
+{
+ struct journal_page_header *data_page_header = (void *) data;
+ uLong crc;
+
+ uuid_copy(data_page_header->uuid, *metric_info->uuid);
+ data_page_header->entries = metric_info->number_of_pages;
+ data_page_header->uuid_offset = uuid_offset; // data header OFFSET points to METRIC in the directory
+ data_page_header->crc = JOURVAL_V2_MAGIC;
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (void *) data_page_header, sizeof(*data_page_header));
+ crc32set(data_page_header->checksum, crc);
+ return ++data_page_header;
+}
+
+void *journalfile_v2_write_data_page_trailer(struct journal_v2_header *j2_header __maybe_unused, void *data, void *page_header)
+{
+ struct journal_page_header *data_page_header = (void *) page_header;
+ struct journal_v2_block_trailer *journal_trailer = (void *) data;
+ uLong crc;
+
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (uint8_t *) page_header + sizeof(struct journal_page_header), data_page_header->entries * sizeof(struct journal_page_list));
+ crc32set(journal_trailer->checksum, crc);
+ return ++journal_trailer;
+}
+
+void *journalfile_v2_write_data_page(struct journal_v2_header *j2_header, void *data, struct jv2_page_info *page_info)
+{
+ struct journal_page_list *data_page = data;
+
+ if (journalfile_verify_space(j2_header, data, sizeof(*data_page)))
+ return NULL;
+
+ struct extent_io_data *ei = page_info->custom_data;
+
+ data_page->delta_start_s = (uint32_t) (page_info->start_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC);
+ data_page->delta_end_s = (uint32_t) (page_info->end_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC);
+ data_page->extent_index = page_info->extent_index;
+
+ data_page->update_every_s = (uint32_t) page_info->update_every_s;
+ data_page->page_length = (uint16_t) (ei ? ei->page_length : page_info->page_length);
+ data_page->type = 0;
+
+ return ++data_page;
+}
+
+// Must be recorded in metric_info->entries
+static void *journalfile_v2_write_descriptors(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info,
+ struct journal_metric_list *current_metric)
+{
+ Pvoid_t *PValue;
+
+ struct journal_page_list *data_page = (void *)data;
+ // We need to write all descriptors with index metric_info->min_index_time_s, metric_info->max_index_time_s
+ // that belong to this journal file
+ Pvoid_t JudyL_array = metric_info->JudyL_pages_by_start_time;
+
+ Word_t index_time = 0;
+ bool first = true;
+ struct jv2_page_info *page_info;
+ uint32_t update_every_s = 0;
+ while ((PValue = JudyLFirstThenNext(JudyL_array, &index_time, &first))) {
+ page_info = *PValue;
+ // Write one descriptor and return the next data page location
+ data_page = journalfile_v2_write_data_page(j2_header, (void *) data_page, page_info);
+ update_every_s = (uint32_t) page_info->update_every_s;
+ if (NULL == data_page)
+ break;
+ }
+ current_metric->update_every_s = update_every_s;
+ return data_page;
+}
+
+// Migrate the journal file pointed to by datafile
+// activate : make the new file active immediately
+// journalfile data will be set and descriptors (if deleted) will be repopulated as needed
+// startup : if the migration is done during agent startup
+// this will allow us to optimize certain things
+
+void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno __maybe_unused, uint8_t type __maybe_unused,
+ Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos,
+ size_t number_of_extents, size_t number_of_metrics, size_t number_of_pages, void *user_data)
+{
+ char path[RRDENG_PATH_MAX];
+ Pvoid_t *PValue;
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *) section;
+ struct rrdengine_journalfile *journalfile = (struct rrdengine_journalfile *) user_data;
+ struct rrdengine_datafile *datafile = journalfile->datafile;
+ time_t min_time_s = LONG_MAX;
+ time_t max_time_s = 0;
+ struct jv2_metrics_info *metric_info;
+
+ journalfile_v2_generate_path(datafile, path, sizeof(path));
+
+ netdata_log_info("DBENGINE: indexing file '%s': extents %zu, metrics %zu, pages %zu",
+ path,
+ number_of_extents,
+ number_of_metrics,
+ number_of_pages);
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ usec_t start_loading = now_monotonic_usec();
+#endif
+
+ size_t total_file_size = 0;
+ total_file_size += (sizeof(struct journal_v2_header) + JOURNAL_V2_HEADER_PADDING_SZ);
+
+ // Extents will start here
+ uint32_t extent_offset = total_file_size;
+ total_file_size += (number_of_extents * sizeof(struct journal_extent_list));
+
+ uint32_t extent_offset_trailer = total_file_size;
+ total_file_size += sizeof(struct journal_v2_block_trailer);
+
+ // UUID list will start here
+ uint32_t metrics_offset = total_file_size;
+ total_file_size += (number_of_metrics * sizeof(struct journal_metric_list));
+
+ // UUID list trailer
+ uint32_t metric_offset_trailer = total_file_size;
+ total_file_size += sizeof(struct journal_v2_block_trailer);
+
+ // descr @ time will start here
+ uint32_t pages_offset = total_file_size;
+ total_file_size += (number_of_pages * (sizeof(struct journal_page_list) + sizeof(struct journal_page_header) + sizeof(struct journal_v2_block_trailer)));
+
+ // File trailer
+ uint32_t trailer_offset = total_file_size;
+ total_file_size += sizeof(struct journal_v2_block_trailer);
+
+ int fd_v2;
+ uint8_t *data_start = netdata_mmap(path, total_file_size, MAP_SHARED, 0, false, &fd_v2);
+ uint8_t *data = data_start;
+
+ memset(data_start, 0, extent_offset);
+
+ // Write header
+ struct journal_v2_header j2_header;
+ memset(&j2_header, 0, sizeof(j2_header));
+
+ j2_header.magic = JOURVAL_V2_MAGIC;
+ j2_header.start_time_ut = 0;
+ j2_header.end_time_ut = 0;
+ j2_header.extent_count = number_of_extents;
+ j2_header.extent_offset = extent_offset;
+ j2_header.metric_count = number_of_metrics;
+ j2_header.metric_offset = metrics_offset;
+ j2_header.page_count = number_of_pages;
+ j2_header.page_offset = pages_offset;
+ j2_header.extent_trailer_offset = extent_offset_trailer;
+ j2_header.metric_trailer_offset = metric_offset_trailer;
+ j2_header.journal_v2_file_size = total_file_size;
+ j2_header.journal_v1_file_size = (uint32_t)journalfile_current_size(journalfile);
+ j2_header.data = data_start; // Used during migration
+
+ struct journal_v2_block_trailer *journal_v2_trailer;
+
+ data = journalfile_v2_write_extent_list(JudyL_extents_pos, data_start + extent_offset);
+ internal_error(true, "DBENGINE: write extent list so far %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+
+ fatal_assert(data == data_start + extent_offset_trailer);
+
+ // Calculate CRC for extents
+ journal_v2_trailer = (struct journal_v2_block_trailer *) (data_start + extent_offset_trailer);
+ uLong crc;
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (uint8_t *) data_start + extent_offset, number_of_extents * sizeof(struct journal_extent_list));
+ crc32set(journal_v2_trailer->checksum, crc);
+
+ internal_error(true, "DBENGINE: CALCULATE CRC FOR EXTENT %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+ // Skip the trailer, point to the metrics off
+ data += sizeof(struct journal_v2_block_trailer);
+
+ // Sanity check -- we must be at the metrics_offset
+ fatal_assert(data == data_start + metrics_offset);
+
+ // Allocate array to sort UUIDs and keep them sorted in the journal because we want to do binary search when we do lookups
+ struct journal_metric_list_to_sort *uuid_list = mallocz(number_of_metrics * sizeof(struct journal_metric_list_to_sort));
+
+ Word_t Index = 0;
+ size_t count = 0;
+ bool first_then_next = true;
+ while ((PValue = JudyLFirstThenNext(JudyL_metrics, &Index, &first_then_next))) {
+ metric_info = *PValue;
+
+ fatal_assert(count < number_of_metrics);
+ uuid_list[count++].metric_info = metric_info;
+ min_time_s = MIN(min_time_s, metric_info->first_time_s);
+ max_time_s = MAX(max_time_s, metric_info->last_time_s);
+ }
+
+ // Store in the header
+ j2_header.start_time_ut = min_time_s * USEC_PER_SEC;
+ j2_header.end_time_ut = max_time_s * USEC_PER_SEC;
+
+ qsort(&uuid_list[0], number_of_metrics, sizeof(struct journal_metric_list_to_sort), journalfile_metric_compare);
+ internal_error(true, "DBENGINE: traverse and qsort UUID %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+
+ uint32_t resize_file_to = total_file_size;
+
+ for (Index = 0; Index < number_of_metrics; Index++) {
+ metric_info = uuid_list[Index].metric_info;
+
+ // Calculate current UUID offset from start of file. We will store this in the data page header
+ uint32_t uuid_offset = data - data_start;
+
+ struct journal_metric_list *current_metric = (void *) data;
+ // Write the UUID we are processing
+ data = (void *) journalfile_v2_write_metric_page(&j2_header, data, metric_info, pages_offset);
+ if (unlikely(!data))
+ break;
+
+ // Next we will write
+ // Header
+ // Detailed entries (descr @ time)
+ // Trailer (checksum)
+
+ // Keep the page_list_header, to be used for migration while the agent is running
+ metric_info->page_list_header = pages_offset;
+ // Write page header
+ void *metric_page = journalfile_v2_write_data_page_header(&j2_header, data_start + pages_offset, metric_info,
+ uuid_offset);
+
+ // Start writing descr @ time
+ void *page_trailer = journalfile_v2_write_descriptors(&j2_header, metric_page, metric_info, current_metric);
+ if (unlikely(!page_trailer))
+ break;
+
+ // Trailer (checksum)
+ uint8_t *next_page_address = journalfile_v2_write_data_page_trailer(&j2_header, page_trailer,
+ data_start + pages_offset);
+
+ // Calculate the start of the pages for the next descriptor
+ pages_offset += (metric_info->number_of_pages * (sizeof(struct journal_page_list)) + sizeof(struct journal_page_header) + sizeof(struct journal_v2_block_trailer));
+ // Verify we are at the right location
+ if (pages_offset != (uint32_t)(next_page_address - data_start)) {
+ // make sure checks fail so that we abort
+ data = data_start;
+ break;
+ }
+ }
+
+ if (data == data_start + metric_offset_trailer) {
+ internal_error(true, "DBENGINE: WRITE METRICS AND PAGES %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+
+ // Calculate CRC for metrics
+ journal_v2_trailer = (struct journal_v2_block_trailer *)(data_start + metric_offset_trailer);
+ crc = crc32(0L, Z_NULL, 0);
+ crc =
+ crc32(crc, (uint8_t *)data_start + metrics_offset, number_of_metrics * sizeof(struct journal_metric_list));
+ crc32set(journal_v2_trailer->checksum, crc);
+ internal_error(true, "DBENGINE: CALCULATE CRC FOR UUIDs %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+
+ // Prepare to write checksum for the file
+ j2_header.data = NULL;
+ journal_v2_trailer = (struct journal_v2_block_trailer *)(data_start + trailer_offset);
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, (void *)&j2_header, sizeof(j2_header));
+ crc32set(journal_v2_trailer->checksum, crc);
+
+ // Write header to the file
+ memcpy(data_start, &j2_header, sizeof(j2_header));
+
+ internal_error(true, "DBENGINE: FILE COMPLETED --------> %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+
+ netdata_log_info("DBENGINE: migrated journal file '%s', file size %zu", path, total_file_size);
+
+ // msync(data_start, total_file_size, MS_SYNC);
+ journalfile_v2_data_set(journalfile, fd_v2, data_start, total_file_size);
+
+ internal_error(true, "DBENGINE: ACTIVATING NEW INDEX JNL %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS);
+ ctx_current_disk_space_increase(ctx, total_file_size);
+ freez(uuid_list);
+ return;
+ }
+ else {
+ netdata_log_info("DBENGINE: failed to build index '%s', file will be skipped", path);
+ j2_header.data = NULL;
+ j2_header.magic = JOURVAL_V2_SKIP_MAGIC;
+ memcpy(data_start, &j2_header, sizeof(j2_header));
+ resize_file_to = sizeof(j2_header);
+ }
+
+ netdata_munmap(data_start, total_file_size);
+ freez(uuid_list);
+
+ if (likely(resize_file_to == total_file_size))
+ return;
+
+ int ret = truncate(path, (long) resize_file_to);
+ if (ret < 0) {
+ ctx_current_disk_space_increase(ctx, total_file_size);
+ ctx_fs_error(ctx);
+ netdata_log_error("DBENGINE: failed to resize file '%s'", path);
+ }
+ else
+ ctx_current_disk_space_increase(ctx, resize_file_to);
+}
+
+int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile,
+ struct rrdengine_datafile *datafile)
+{
+ uv_fs_t req;
+ uv_file file;
+ int ret, fd, error;
+ uint64_t file_size, max_id;
+ char path[RRDENG_PATH_MAX];
+ bool loaded_v2 = false;
+
+ // Do not try to load jv2 of the latest file
+ if (datafile->fileno != ctx_last_fileno_get(ctx))
+ loaded_v2 = journalfile_v2_load(ctx, journalfile, datafile) == 0;
+
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+
+ fd = open_file_for_io(path, O_RDWR, &file, use_direct_io);
+ if (fd < 0) {
+ ctx_fs_error(ctx);
+
+ if(loaded_v2)
+ return 0;
+
+ return fd;
+ }
+
+ ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb));
+ if (ret) {
+ error = ret;
+ goto cleanup;
+ }
+
+ if(loaded_v2) {
+ journalfile->unsafe.pos = file_size;
+ error = 0;
+ goto cleanup;
+ }
+
+ file_size = ALIGN_BYTES_FLOOR(file_size);
+ journalfile->unsafe.pos = file_size;
+ journalfile->file = file;
+
+ ret = journalfile_check_superblock(file);
+ if (ret) {
+ netdata_log_info("DBENGINE: invalid journal file '%s' ; superblock check failed.", path);
+ error = ret;
+ goto cleanup;
+ }
+ ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_jf_sb));
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: loading journal file '%s'", path);
+
+ max_id = journalfile_iterate_transactions(ctx, journalfile);
+
+ __atomic_store_n(&ctx->atomic.transaction_id, MAX(__atomic_load_n(&ctx->atomic.transaction_id, __ATOMIC_RELAXED), max_id + 1), __ATOMIC_RELAXED);
+
+ nd_log_daemon(NDLP_DEBUG, "DBENGINE: journal file '%s' loaded (size:%" PRIu64 ").", path, file_size);
+
+ bool is_last_file = (ctx_last_fileno_get(ctx) == journalfile->datafile->fileno);
+ if (is_last_file && journalfile->datafile->pos <= rrdeng_target_data_file_size(ctx) / 3) {
+ ctx->loading.create_new_datafile_pair = false;
+ return 0;
+ }
+
+ pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type,
+ journalfile_migrate_to_v2_callback, (void *) datafile->journalfile);
+
+ if (is_last_file)
+ ctx->loading.create_new_datafile_pair = true;
+
+ return 0;
+
+cleanup:
+ ret = uv_fs_close(NULL, &req, file, NULL);
+ if (ret < 0) {
+ netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret));
+ ctx_fs_error(ctx);
+ }
+ uv_fs_req_cleanup(&req);
+ return error;
+}
diff --git a/database/engine/journalfile.h b/database/engine/journalfile.h
new file mode 100644
index 00000000..5cdf72b9
--- /dev/null
+++ b/database/engine/journalfile.h
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_JOURNALFILE_H
+#define NETDATA_JOURNALFILE_H
+
+#include "rrdengine.h"
+
+/* Forward declarations */
+struct rrdengine_instance;
+struct rrdengine_worker_config;
+struct rrdengine_datafile;
+struct rrdengine_journalfile;
+
+#define WALFILE_PREFIX "journalfile-"
+#define WALFILE_EXTENSION ".njf"
+#define WALFILE_EXTENSION_V2 ".njfv2"
+
+#define is_descr_journal_v2(descr) ((descr)->extent_entry != NULL)
+
+typedef enum __attribute__ ((__packed__)) {
+ JOURNALFILE_FLAG_IS_AVAILABLE = (1 << 0),
+ JOURNALFILE_FLAG_IS_MOUNTED = (1 << 1),
+ JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION = (1 << 2),
+ JOURNALFILE_FLAG_METRIC_CRC_CHECK = (1 << 3),
+} JOURNALFILE_FLAGS;
+
+/* only one event loop is supported for now */
+struct rrdengine_journalfile {
+ struct {
+ SPINLOCK spinlock;
+ void *data; // MMAPed file of journal v2
+ uint32_t size; // Total file size mapped
+ int fd;
+ } mmap;
+
+ struct {
+ SPINLOCK spinlock;
+ JOURNALFILE_FLAGS flags;
+ int32_t refcount;
+ time_t first_time_s;
+ time_t last_time_s;
+ time_t not_needed_since_s;
+ uint32_t size_of_directory;
+ } v2;
+
+ struct {
+ Word_t indexed_as;
+ } njfv2idx;
+
+ struct {
+ SPINLOCK spinlock;
+ uint64_t pos;
+ } unsafe;
+
+ uv_file file;
+ struct rrdengine_datafile *datafile;
+};
+
+static inline uint64_t journalfile_current_size(struct rrdengine_journalfile *journalfile) {
+ spinlock_lock(&journalfile->unsafe.spinlock);
+ uint64_t size = journalfile->unsafe.pos;
+ spinlock_unlock(&journalfile->unsafe.spinlock);
+ return size;
+}
+
+// Journal v2 structures
+
+#define JOURVAL_V2_MAGIC (0x01230317)
+#define JOURVAL_V2_REBUILD_MAGIC (0x00230317)
+#define JOURVAL_V2_SKIP_MAGIC (0x02230317)
+
+struct journal_v2_block_trailer {
+ union {
+ uint8_t checksum[CHECKSUM_SZ]; /* CRC32 */
+ uint32_t crc;
+ };
+};
+
+// Journal V2
+// 28 bytes
+struct journal_page_header {
+ union {
+ uint8_t checksum[CHECKSUM_SZ]; // CRC check
+ uint32_t crc;
+ };
+ uint32_t uuid_offset; // Points back to the UUID list which should point here (UUIDs should match)
+ uint32_t entries; // Entries
+ uuid_t uuid; // Which UUID this is
+};
+
+// 20 bytes
+struct journal_page_list {
+ uint32_t delta_start_s; // relative to the start time of journal
+ uint32_t delta_end_s; // relative to delta_start
+ uint32_t extent_index; // Index to the extent (extent list) (bytes from BASE)
+ uint32_t update_every_s;
+ uint16_t page_length;
+ uint8_t type;
+};
+
+// UUID_LIST
+// 36 bytes
+struct journal_metric_list {
+ uuid_t uuid;
+ uint32_t entries; // Number of entries
+ uint32_t page_offset; // OFFSET that contains entries * struct( journal_page_list )
+ uint32_t delta_start_s; // Min time of metric
+ uint32_t delta_end_s; // Max time of metric (to be used to populate page_index)
+ uint32_t update_every_s; // Last update every for this metric in this journal (last page collected)
+};
+
+// 16 bytes
+struct journal_extent_list {
+ uint64_t datafile_offset; // Datafile offset to find the extent
+ uint32_t datafile_size; // Size of the extent
+ uint16_t file_index; // which file index is this datafile[index]
+ uint8_t pages; // number of pages (not all are necessarily valid)
+};
+
+// 72 bytes
+struct journal_v2_header {
+ uint32_t magic;
+ usec_t start_time_ut; // Min start time of journal
+ usec_t end_time_ut; // Maximum end time of journal
+ uint32_t extent_count; // Count of extents
+ uint32_t extent_offset;
+ uint32_t metric_count; // Count of metrics (unique UUIDS)
+ uint32_t metric_offset;
+ uint32_t page_count; // Total count of pages (descriptors @ time)
+ uint32_t page_offset;
+ uint32_t extent_trailer_offset; // CRC for extent list
+ uint32_t metric_trailer_offset; // CRC for metric list
+ uint32_t journal_v1_file_size; // Size of the original (v1) journal file
+ uint32_t journal_v2_file_size; // Total size of this (v2) journal file
+ void *data; // Used when building the index
+};
+
+#define JOURNAL_V2_HEADER_PADDING_SZ (RRDENG_BLOCK_SIZE - (sizeof(struct journal_v2_header)))
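+
+/*
+ * Sketch of the on-disk layout of a v2 journal file, as implied by the offsets
+ * kept in journal_v2_header and by the migration code (illustrative only):
+ *
+ *   journal_v2_header, padded to RRDENG_BLOCK_SIZE
+ *   extent_count x journal_extent_list           @ extent_offset
+ *   journal_v2_block_trailer (extent list CRC)   @ extent_trailer_offset
+ *   metric_count x journal_metric_list           @ metric_offset
+ *   journal_v2_block_trailer (metric list CRC)   @ metric_trailer_offset
+ *   per metric: journal_page_header,
+ *               entries x journal_page_list,
+ *               journal_v2_block_trailer          starting @ page_offset
+ *   journal_v2_block_trailer (header CRC)         at the end of the file
+ */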
+
+struct wal;
+
+void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen);
+void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen);
+struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafile *datafile);
+void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, struct wal *wal, uv_loop_t *loop);
+int journalfile_close(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile);
+int journalfile_unlink(struct rrdengine_journalfile *journalfile);
+int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile);
+int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile);
+int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile,
+ struct rrdengine_datafile *datafile);
+void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile);
+
+void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno __maybe_unused, uint8_t type __maybe_unused,
+ Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos,
+ size_t number_of_extents, size_t number_of_metrics, size_t number_of_pages, void *user_data);
+
+
+bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile);
+size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile);
+void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size);
+struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s);
+void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile);
+void journalfile_v2_data_unmount_cleanup(time_t now_s);
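+
+// Typical read pattern (illustrative, mirroring journalfile_v2_populate_retention_to_mrg):
+//
+//   size_t size = 0;
+//   struct journal_v2_header *h = journalfile_v2_data_acquire(journalfile, &size, 0, 0);
+//   if (h) {
+//       // ... walk the metric / extent lists relative to (uint8_t *)h ...
+//       journalfile_v2_data_release(journalfile);
+//   }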
+
+typedef struct {
+ bool init;
+ Word_t last;
+ time_t wanted_start_time_s;
+ time_t wanted_end_time_s;
+ struct rrdengine_instance *ctx;
+ struct journal_v2_header *j2_header_acquired;
+} NJFV2IDX_FIND_STATE;
+
+struct rrdengine_datafile *njfv2idx_find_and_acquire_j2_header(NJFV2IDX_FIND_STATE *s);
+
+#endif /* NETDATA_JOURNALFILE_H */
\ No newline at end of file
diff --git a/database/engine/journalfile_v2.ksy.in b/database/engine/journalfile_v2.ksy.in
new file mode 100644
index 00000000..6a656bc4
--- /dev/null
+++ b/database/engine/journalfile_v2.ksy.in
@@ -0,0 +1,150 @@
+meta:
+ id: journalfile_v2`'ifdef(`VIRT_MEMBERS',`_virtmemb')
+ endian: le
+ application: netdata
+ file-extension: njfv2
+ license: GPL-3.0-or-later
+
+seq:
+ - id: journal_v2_header
+ type: journal_v2_header
+ size: 4096
+ - id: extent_list
+ type: journal_v2_extent_list
+ repeat: expr
+ repeat-expr: journal_v2_header.extent_count
+ - id: extent_trailer
+ type: journal_v2_block_trailer
+ - id: metric_list
+ type: journal_v2_metric_list
+ repeat: expr
+ repeat-expr: journal_v2_header.metric_count
+ - id: metric_trailer
+ type: journal_v2_block_trailer
+ - id: page_blocs
+ type: journal_v2_page_block
+ repeat: expr
+ repeat-expr: _root.journal_v2_header.metric_count
+ - id: padding
+ size: _root._io.size - _root._io.pos - 4
+ - id: journal_file_trailer
+ type: journal_v2_block_trailer
+
+types:
+ journal_v2_metric_list:
+ seq:
+ - id: uuid
+ size: 16
+ - id: entries
+ type: u4
+ - id: page_offset
+ type: u4
+ - id: delta_start_s
+ type: u4
+ - id: delta_end_s
+ type: u4
+ifdef(`VIRT_MEMBERS',
+` instances:
+ page_block:
+ type: journal_v2_page_block
+ io: _root._io
+ pos: page_offset
+')dnl
+ journal_v2_page_hdr:
+ seq:
+ - id: crc
+ type: u4
+ - id: uuid_offset
+ type: u4
+ - id: entries
+ type: u4
+ - id: uuid
+ size: 16
+ journal_v2_page_list:
+ seq:
+ - id: delta_start_s
+ type: u4
+ - id: delta_end_s
+ type: u4
+ - id: extent_idx
+ type: u4
+ - id: update_every_s
+ type: u4
+ - id: page_len
+ type: u2
+ - id: type
+ type: u1
+ - id: reserved
+ type: u1
+ifdef(`VIRT_MEMBERS',
+` instances:
+ extent:
+ io: _root._io
+ type: journal_v2_extent_list
+ pos: _root.journal_v2_header.extent_offset + (extent_idx * 16)
+')dnl
+ journal_v2_header:
+ seq:
+ - id: magic
+ contents: [ 0x19, 0x10, 0x22, 0x01 ] #0x01221019
+ - id: reserved
+ type: u4
+ - id: start_time_ut
+ type: u8
+ - id: end_time_ut
+ type: u8
+ - id: extent_count
+ type: u4
+ - id: extent_offset
+ type: u4
+ - id: metric_count
+ type: u4
+ - id: metric_offset
+ type: u4
+ - id: page_count
+ type: u4
+ - id: page_offset
+ type: u4
+ - id: extent_trailer_offset
+ type: u4
+ - id: metric_trailer_offset
+ type: u4
+ - id: original_file_size
+ type: u4
+ - id: total_file_size
+ type: u4
+ - id: data
+ type: u8
+ifdef(`VIRT_MEMBERS',
+` instances:
+ trailer:
+ io: _root._io
+ type: journal_v2_block_trailer
+ pos: _root._io.size - 4
+')dnl
+ journal_v2_block_trailer:
+ seq:
+ - id: checksum
+ type: u4
+ journal_v2_extent_list:
+ seq:
+ - id: datafile_offset
+ type: u8
+ - id: datafile_size
+ type: u4
+ - id: file_idx
+ type: u2
+ - id: page_cnt
+ type: u1
+ - id: padding
+ type: u1
+ journal_v2_page_block:
+ seq:
+ - id: hdr
+ type: journal_v2_page_hdr
+ - id: page_list
+ type: journal_v2_page_list
+ repeat: expr
+ repeat-expr: hdr.entries
+ - id: block_trailer
+ type: journal_v2_block_trailer
diff --git a/database/engine/metadata_log/README.md b/database/engine/metadata_log/README.md
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/database/engine/metadata_log/README.md
diff --git a/database/engine/metric.c b/database/engine/metric.c
new file mode 100644
index 00000000..2e132612
--- /dev/null
+++ b/database/engine/metric.c
@@ -0,0 +1,873 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#include "metric.h"
+
+typedef int32_t REFCOUNT;
+#define REFCOUNT_DELETING (-100)
+
+struct metric {
+ uuid_t uuid; // never changes
+ Word_t section; // never changes
+
+ time_t first_time_s; // the timestamp of the oldest point in the database
+ time_t latest_time_s_clean; // the timestamp of the newest point in the database
+ time_t latest_time_s_hot; // the timestamp of the latest point that has been collected (not yet stored)
+ uint32_t latest_update_every_s; // the latest data collection frequency
+ pid_t writer;
+ uint8_t partition;
+ REFCOUNT refcount;
+
+ // THIS IS allocated with malloc()
+ // YOU HAVE TO INITIALIZE IT YOURSELF !
+};
+
+#define set_metric_field_with_condition(field, value, condition) ({ \
+ typeof(field) _current = __atomic_load_n(&(field), __ATOMIC_RELAXED); \
+ typeof(field) _wanted = value; \
+ bool did_it = true; \
+ \
+ do { \
+ if((condition) && (_current != _wanted)) { \
+ ; \
+ } \
+ else { \
+ did_it = false; \
+ break; \
+ } \
+ } while(!__atomic_compare_exchange_n(&(field), &_current, _wanted, \
+ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); \
+ \
+ did_it; \
+})
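+
+// Illustrative use (as in the retention code further below): keep the smallest
+// positive first_time_s seen so far, lock-free:
+//
+//   set_metric_field_with_condition(metric->first_time_s, first_time_s,
+//                                   _current <= 0 || _wanted < _current);
+//
+// The macro evaluates to true only when the compare-and-swap actually stored
+// _wanted into the field.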
+
+static struct aral_statistics mrg_aral_statistics;
+
+struct mrg {
+ size_t partitions;
+
+ struct mrg_partition {
+ ARAL *aral; // not protected by our spinlock - it has its own
+
+ RW_SPINLOCK rw_spinlock;
+ Pvoid_t uuid_judy; // JudyHS: each UUID has a JudyL of sections (tiers)
+
+ struct mrg_statistics stats;
+ } index[];
+};
+
+static inline void MRG_STATS_DUPLICATE_ADD(MRG *mrg, size_t partition) {
+ mrg->index[partition].stats.additions_duplicate++;
+}
+
+static inline void MRG_STATS_ADDED_METRIC(MRG *mrg, size_t partition) {
+ mrg->index[partition].stats.entries++;
+ mrg->index[partition].stats.additions++;
+ mrg->index[partition].stats.size += sizeof(METRIC);
+}
+
+static inline void MRG_STATS_DELETED_METRIC(MRG *mrg, size_t partition) {
+ mrg->index[partition].stats.entries--;
+ mrg->index[partition].stats.size -= sizeof(METRIC);
+ mrg->index[partition].stats.deletions++;
+}
+
+static inline void MRG_STATS_SEARCH_HIT(MRG *mrg, size_t partition) {
+ __atomic_add_fetch(&mrg->index[partition].stats.search_hits, 1, __ATOMIC_RELAXED);
+}
+
+static inline void MRG_STATS_SEARCH_MISS(MRG *mrg, size_t partition) {
+ __atomic_add_fetch(&mrg->index[partition].stats.search_misses, 1, __ATOMIC_RELAXED);
+}
+
+static inline void MRG_STATS_DELETE_MISS(MRG *mrg, size_t partition) {
+ mrg->index[partition].stats.delete_misses++;
+}
+
+#define mrg_index_read_lock(mrg, partition) rw_spinlock_read_lock(&(mrg)->index[partition].rw_spinlock)
+#define mrg_index_read_unlock(mrg, partition) rw_spinlock_read_unlock(&(mrg)->index[partition].rw_spinlock)
+#define mrg_index_write_lock(mrg, partition) rw_spinlock_write_lock(&(mrg)->index[partition].rw_spinlock)
+#define mrg_index_write_unlock(mrg, partition) rw_spinlock_write_unlock(&(mrg)->index[partition].rw_spinlock)
+
+static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl, size_t partition) {
+ if(mem_after_judyl > mem_before_judyl)
+ __atomic_add_fetch(&mrg->index[partition].stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED);
+ else if(mem_after_judyl < mem_before_judyl)
+ __atomic_sub_fetch(&mrg->index[partition].stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED);
+}
+
+static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg, size_t partition) {
+ __atomic_add_fetch(&mrg->index[partition].stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED);
+}
+
+static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg, size_t partition) {
+ __atomic_sub_fetch(&mrg->index[partition].stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED);
+}
+
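+// Pick the MRG partition from the last machine-word of the UUID bytes, so a
+// given UUID always maps to the same partition, independently of the section.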
+static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) {
+ uint8_t *u = (uint8_t *)uuid;
+ size_t *n = (size_t *)&u[UUID_SZ - sizeof(size_t)];
+ return *n % mrg->partitions;
+}
+
+static inline time_t mrg_metric_get_first_time_s_smart(MRG *mrg __maybe_unused, METRIC *metric) {
+ time_t first_time_s = __atomic_load_n(&metric->first_time_s, __ATOMIC_RELAXED);
+
+ if(first_time_s <= 0) {
+ first_time_s = __atomic_load_n(&metric->latest_time_s_clean, __ATOMIC_RELAXED);
+ if(first_time_s <= 0)
+ first_time_s = __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED);
+
+ if(first_time_s <= 0)
+ first_time_s = 0;
+ else
+ __atomic_store_n(&metric->first_time_s, first_time_s, __ATOMIC_RELAXED);
+ }
+
+ return first_time_s;
+}
+
+static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric) {
+ size_t partition = metric->partition;
+ REFCOUNT expected = __atomic_load_n(&metric->refcount, __ATOMIC_RELAXED);
+ REFCOUNT refcount;
+
+ do {
+ if(expected < 0)
+ fatal("METRIC: refcount is %d (negative) during acquire", metric->refcount);
+
+ refcount = expected + 1;
+ } while(!__atomic_compare_exchange_n(&metric->refcount, &expected, refcount, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+
+ if(refcount == 1)
+ __atomic_add_fetch(&mrg->index[partition].stats.entries_referenced, 1, __ATOMIC_RELAXED);
+
+ __atomic_add_fetch(&mrg->index[partition].stats.current_references, 1, __ATOMIC_RELAXED);
+
+ return refcount;
+}
+
+static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, METRIC *metric) {
+ size_t partition = metric->partition;
+ REFCOUNT expected = __atomic_load_n(&metric->refcount, __ATOMIC_RELAXED);
+ REFCOUNT refcount;
+
+ do {
+ if(expected <= 0)
+ fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount);
+
+ refcount = expected - 1;
+ } while(!__atomic_compare_exchange_n(&metric->refcount, &expected, refcount, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+
+ if(unlikely(!refcount))
+ __atomic_sub_fetch(&mrg->index[partition].stats.entries_referenced, 1, __ATOMIC_RELAXED);
+
+ __atomic_sub_fetch(&mrg->index[partition].stats.current_references, 1, __ATOMIC_RELAXED);
+
+ time_t first, last, ue;
+ mrg_metric_get_retention(mrg, metric, &first, &last, &ue);
+ return (!first || !last || first > last);
+}
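+
+// Note: a metric is considered deletable only when it has no retention left
+// (no first/last time, or an inverted range); acquired_metric_del() below uses
+// this to decide whether to remove the entry from the index.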
+
+static inline METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) {
+ size_t partition = uuid_partition(mrg, entry->uuid);
+
+ METRIC *allocation = aral_mallocz(mrg->index[partition].aral);
+
+ mrg_index_write_lock(mrg, partition);
+
+ size_t mem_before_judyl, mem_after_judyl;
+
+ Pvoid_t *sections_judy_pptr = JudyHSIns(&mrg->index[partition].uuid_judy, entry->uuid, sizeof(uuid_t), PJE0);
+ if(unlikely(!sections_judy_pptr || sections_judy_pptr == PJERR))
+ fatal("DBENGINE METRIC: corrupted UUIDs JudyHS array");
+
+ if(unlikely(!*sections_judy_pptr))
+ mrg_stats_size_judyhs_added_uuid(mrg, partition);
+
+ mem_before_judyl = JudyLMemUsed(*sections_judy_pptr);
+ Pvoid_t *PValue = JudyLIns(sections_judy_pptr, entry->section, PJE0);
+ mem_after_judyl = JudyLMemUsed(*sections_judy_pptr);
+ mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl, partition);
+
+ if(unlikely(!PValue || PValue == PJERR))
+ fatal("DBENGINE METRIC: corrupted section JudyL array");
+
+ if(unlikely(*PValue != NULL)) {
+ METRIC *metric = *PValue;
+
+ metric_acquire(mrg, metric);
+
+ MRG_STATS_DUPLICATE_ADD(mrg, partition);
+
+ mrg_index_write_unlock(mrg, partition);
+
+ if(ret)
+ *ret = false;
+
+ aral_freez(mrg->index[partition].aral, allocation);
+
+ return metric;
+ }
+
+ METRIC *metric = allocation;
+ uuid_copy(metric->uuid, *entry->uuid);
+ metric->section = entry->section;
+ metric->first_time_s = MAX(0, entry->first_time_s);
+ metric->latest_time_s_clean = MAX(0, entry->last_time_s);
+ metric->latest_time_s_hot = 0;
+ metric->latest_update_every_s = entry->latest_update_every_s;
+ metric->writer = 0;
+ metric->refcount = 0;
+ metric->partition = partition;
+ metric_acquire(mrg, metric);
+ *PValue = metric;
+
+ MRG_STATS_ADDED_METRIC(mrg, partition);
+
+ mrg_index_write_unlock(mrg, partition);
+
+ if(ret)
+ *ret = true;
+
+ return metric;
+}
+
+static inline METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) {
+ size_t partition = uuid_partition(mrg, uuid);
+
+ mrg_index_read_lock(mrg, partition);
+
+ Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, uuid, sizeof(uuid_t));
+ if(unlikely(!sections_judy_pptr)) {
+ mrg_index_read_unlock(mrg, partition);
+ MRG_STATS_SEARCH_MISS(mrg, partition);
+ return NULL;
+ }
+
+ Pvoid_t *PValue = JudyLGet(*sections_judy_pptr, section, PJE0);
+ if(unlikely(!PValue)) {
+ mrg_index_read_unlock(mrg, partition);
+ MRG_STATS_SEARCH_MISS(mrg, partition);
+ return NULL;
+ }
+
+ METRIC *metric = *PValue;
+
+ metric_acquire(mrg, metric);
+
+ mrg_index_read_unlock(mrg, partition);
+
+ MRG_STATS_SEARCH_HIT(mrg, partition);
+ return metric;
+}
+
+static inline bool acquired_metric_del(MRG *mrg, METRIC *metric) {
+ size_t partition = metric->partition;
+
+ size_t mem_before_judyl, mem_after_judyl;
+
+ mrg_index_write_lock(mrg, partition);
+
+ if(!metric_release_and_can_be_deleted(mrg, metric)) {
+ mrg->index[partition].stats.delete_having_retention_or_referenced++;
+ mrg_index_write_unlock(mrg, partition);
+ return false;
+ }
+
+ Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t));
+ if(unlikely(!sections_judy_pptr || !*sections_judy_pptr)) {
+ MRG_STATS_DELETE_MISS(mrg, partition);
+ mrg_index_write_unlock(mrg, partition);
+ return false;
+ }
+
+ mem_before_judyl = JudyLMemUsed(*sections_judy_pptr);
+ int rc = JudyLDel(sections_judy_pptr, metric->section, PJE0);
+ mem_after_judyl = JudyLMemUsed(*sections_judy_pptr);
+ mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl, partition);
+
+ if(unlikely(!rc)) {
+ MRG_STATS_DELETE_MISS(mrg, partition);
+ mrg_index_write_unlock(mrg, partition);
+ return false;
+ }
+
+ if(!*sections_judy_pptr) {
+ rc = JudyHSDel(&mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t), PJE0);
+ if(unlikely(!rc))
+ fatal("DBENGINE METRIC: cannot delete UUID from JudyHS");
+ mrg_stats_size_judyhs_removed_uuid(mrg, partition);
+ }
+
+ MRG_STATS_DELETED_METRIC(mrg, partition);
+
+ mrg_index_write_unlock(mrg, partition);
+
+ aral_freez(mrg->index[partition].aral, metric);
+
+ return true;
+}
+
+// ----------------------------------------------------------------------------
+// public API
+
+inline MRG *mrg_create(ssize_t partitions) {
+ if(partitions < 1)
+ partitions = get_netdata_cpus();
+
+ MRG *mrg = callocz(1, sizeof(MRG) + sizeof(struct mrg_partition) * partitions);
+ mrg->partitions = partitions;
+
+ for(size_t i = 0; i < mrg->partitions ; i++) {
+ rw_spinlock_init(&mrg->index[i].rw_spinlock);
+
+ char buf[ARAL_MAX_NAME + 1];
+ snprintfz(buf, ARAL_MAX_NAME, "mrg[%zu]", i);
+
+ mrg->index[i].aral = aral_create(buf, sizeof(METRIC), 0, 16384, &mrg_aral_statistics, NULL, NULL, false, false);
+ }
+
+ return mrg;
+}
+
+inline size_t mrg_aral_structures(void) {
+ return aral_structures_from_stats(&mrg_aral_statistics);
+}
+
+inline size_t mrg_aral_overhead(void) {
+ return aral_overhead_from_stats(&mrg_aral_statistics);
+}
+
+inline void mrg_destroy(MRG *mrg __maybe_unused) {
+ // no destruction possible
+ // we can't traverse the metrics list
+
+ // to delete entries, the caller needs to keep pointers to them
+ // and delete them one by one
+
+ ;
+}
+
+inline METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) {
+// internal_fatal(entry.latest_time_s > max_acceptable_collected_time(),
+// "DBENGINE METRIC: metric latest time is in the future");
+
+ return metric_add_and_acquire(mrg, &entry, ret);
+}
+
+inline METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) {
+ return metric_get_and_acquire(mrg, uuid, section);
+}
+
+inline bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) {
+ return acquired_metric_del(mrg, metric);
+}
+
+inline METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) {
+ metric_acquire(mrg, metric);
+ return metric;
+}
+
+inline bool mrg_metric_release(MRG *mrg, METRIC *metric) {
+ return metric_release_and_can_be_deleted(mrg, metric);
+}
+
+inline Word_t mrg_metric_id(MRG *mrg __maybe_unused, METRIC *metric) {
+ return (Word_t)metric;
+}
+
+inline uuid_t *mrg_metric_uuid(MRG *mrg __maybe_unused, METRIC *metric) {
+ return &metric->uuid;
+}
+
+inline Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) {
+ return metric->section;
+}
+
+inline bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) {
+ internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative");
+
+ if(unlikely(first_time_s < 0))
+ return false;
+
+ __atomic_store_n(&metric->first_time_s, first_time_s, __ATOMIC_RELAXED);
+
+ return true;
+}
+
+inline void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s) {
+ internal_fatal(first_time_s < 0 || last_time_s < 0 || update_every_s < 0,
+ "DBENGINE METRIC: timestamp is negative");
+ internal_fatal(first_time_s > max_acceptable_collected_time(),
+ "DBENGINE METRIC: metric first time is in the future");
+ internal_fatal(last_time_s > max_acceptable_collected_time(),
+ "DBENGINE METRIC: metric last time is in the future");
+
+ if(first_time_s > 0)
+ set_metric_field_with_condition(metric->first_time_s, first_time_s, _current <= 0 || _wanted < _current);
+
+ if(last_time_s > 0) {
+ if(set_metric_field_with_condition(metric->latest_time_s_clean, last_time_s, _current <= 0 || _wanted > _current) &&
+ update_every_s > 0)
+ // set the latest update every too
+ set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, true);
+ }
+ else if(update_every_s > 0)
+ // set it only if it is invalid
+ set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, _current <= 0);
+}
+
+inline bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) {
+ internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative");
+ return set_metric_field_with_condition(metric->first_time_s, first_time_s, _wanted > _current);
+}
+
+inline time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) {
+ return mrg_metric_get_first_time_s_smart(mrg, metric);
+}
+
+inline void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) {
+ time_t clean = __atomic_load_n(&metric->latest_time_s_clean, __ATOMIC_RELAXED);
+ time_t hot = __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED);
+
+ *last_time_s = MAX(clean, hot);
+ *first_time_s = mrg_metric_get_first_time_s_smart(mrg, metric);
+ *update_every_s = __atomic_load_n(&metric->latest_update_every_s, __ATOMIC_RELAXED);
+}
+
+inline bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) {
+ internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative");
+
+// internal_fatal(latest_time_s > max_acceptable_collected_time(),
+// "DBENGINE METRIC: metric latest time is in the future");
+
+// internal_fatal(metric->latest_time_s_clean > latest_time_s,
+// "DBENGINE METRIC: metric new clean latest time is older than the previous one");
+
+ if(latest_time_s > 0) {
+ if(set_metric_field_with_condition(metric->latest_time_s_clean, latest_time_s, true)) {
+ set_metric_field_with_condition(metric->first_time_s, latest_time_s, _current <= 0 || _wanted < _current);
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// returns true when metric still has retention
+inline bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) {
+ Word_t section = mrg_metric_section(mrg, metric);
+ bool do_again = false;
+ size_t countdown = 5;
+
+ do {
+ time_t min_first_time_s = LONG_MAX;
+ time_t max_end_time_s = 0;
+ PGC_PAGE *page;
+ PGC_SEARCH method = PGC_SEARCH_FIRST;
+ time_t page_first_time_s = 0;
+ time_t page_end_time_s = 0;
+ while ((page = pgc_page_get_and_acquire(main_cache, section, (Word_t)metric, page_first_time_s, method))) {
+ method = PGC_SEARCH_NEXT;
+
+ bool is_hot = pgc_is_page_hot(page);
+ bool is_dirty = pgc_is_page_dirty(page);
+ page_first_time_s = pgc_page_start_time_s(page);
+ page_end_time_s = pgc_page_end_time_s(page);
+
+ if ((is_hot || is_dirty) && page_first_time_s > 0 && page_first_time_s < min_first_time_s)
+ min_first_time_s = page_first_time_s;
+
+ if (is_dirty && page_end_time_s > max_end_time_s)
+ max_end_time_s = page_end_time_s;
+
+ pgc_page_release(main_cache, page);
+ }
+
+ if (min_first_time_s == LONG_MAX)
+ min_first_time_s = 0;
+
+ if (--countdown && !min_first_time_s && __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED))
+ do_again = true;
+ else {
+ internal_error(!countdown, "METRIC: giving up on updating the retention of metric without disk retention");
+
+ do_again = false;
+ set_metric_field_with_condition(metric->first_time_s, min_first_time_s, true);
+ set_metric_field_with_condition(metric->latest_time_s_clean, max_end_time_s, true);
+ }
+ } while(do_again);
+
+ time_t first, last, ue;
+ mrg_metric_get_retention(mrg, metric, &first, &last, &ue);
+ return (first && last && first < last);
+}
+
+inline bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) {
+ internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative");
+
+// internal_fatal(latest_time_s > max_acceptable_collected_time(),
+// "DBENGINE METRIC: metric latest time is in the future");
+
+ if(likely(latest_time_s > 0)) {
+ __atomic_store_n(&metric->latest_time_s_hot, latest_time_s, __ATOMIC_RELAXED);
+ return true;
+ }
+
+ return false;
+}
+
+inline time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) {
+ time_t clean = __atomic_load_n(&metric->latest_time_s_clean, __ATOMIC_RELAXED);
+ time_t hot = __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED);
+
+ return MAX(clean, hot);
+}
+
+inline bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) {
+ internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative");
+
+ if(update_every_s > 0)
+ return set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, true);
+
+ return false;
+}
+
+inline bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) {
+ internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative");
+
+ if(update_every_s > 0)
+ return set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, _current <= 0);
+
+ return false;
+}
+
+inline time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, METRIC *metric) {
+ return __atomic_load_n(&metric->latest_update_every_s, __ATOMIC_RELAXED);
+}
+
+inline bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) {
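+    // register this thread as the exclusive writer of the metric: atomically
+    // swap the writer pid from 0 to our tid; if another thread already holds
+    // it, fail and account the attempt in writers_conflicts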
+ pid_t expected = __atomic_load_n(&metric->writer, __ATOMIC_RELAXED);
+ pid_t wanted = gettid();
+ bool done = true;
+
+ do {
+ if(expected != 0) {
+ done = false;
+ break;
+ }
+ } while(!__atomic_compare_exchange_n(&metric->writer, &expected, wanted, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+
+ if(done)
+ __atomic_add_fetch(&mrg->index[metric->partition].stats.writers, 1, __ATOMIC_RELAXED);
+ else
+ __atomic_add_fetch(&mrg->index[metric->partition].stats.writers_conflicts, 1, __ATOMIC_RELAXED);
+
+ return done;
+}
+
+inline bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) {
+    // this function can be called from a different thread than the one that acquired the writer
+
+ pid_t expected = __atomic_load_n(&metric->writer, __ATOMIC_RELAXED);
+ pid_t wanted = 0;
+ bool done = true;
+
+ do {
+ if(!expected) {
+ done = false;
+ break;
+ }
+ } while(!__atomic_compare_exchange_n(&metric->writer, &expected, wanted, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+
+ if(done)
+ __atomic_sub_fetch(&mrg->index[metric->partition].stats.writers, 1, __ATOMIC_RELAXED);
+
+ return done;
+}
+
+inline void mrg_update_metric_retention_and_granularity_by_uuid(
+ MRG *mrg, Word_t section, uuid_t *uuid,
+ time_t first_time_s, time_t last_time_s,
+ time_t update_every_s, time_t now_s)
+{
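+    // sanitize the timestamps read from a journal v2 file, then either
+    // register the metric in the MRG or expand the retention of the existing entry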
+ if(unlikely(last_time_s > now_s)) {
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING,
+ "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), "
+ "fixing last time to now",
+ first_time_s, last_time_s, now_s);
+ last_time_s = now_s;
+ }
+
+ if (unlikely(first_time_s > last_time_s)) {
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING,
+ "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), "
+ "fixing first time to last time",
+ first_time_s, last_time_s, now_s);
+
+ first_time_s = last_time_s;
+ }
+
+ if (unlikely(first_time_s == 0 || last_time_s == 0)) {
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING,
+ "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), "
+ "using them as-is",
+ first_time_s, last_time_s, now_s);
+ }
+
+ bool added = false;
+ METRIC *metric = mrg_metric_get_and_acquire(mrg, uuid, section);
+ if (!metric) {
+ MRG_ENTRY entry = {
+ .uuid = uuid,
+ .section = section,
+ .first_time_s = first_time_s,
+ .last_time_s = last_time_s,
+ .latest_update_every_s = (uint32_t) update_every_s
+ };
+ metric = mrg_metric_add_and_acquire(mrg, entry, &added);
+ }
+
+ if (likely(!added))
+ mrg_metric_expand_retention(mrg, metric, first_time_s, last_time_s, update_every_s);
+
+ mrg_metric_release(mrg, metric);
+}
+
+inline void mrg_get_statistics(MRG *mrg, struct mrg_statistics *s) {
+ memset(s, 0, sizeof(struct mrg_statistics));
+
+ for(size_t i = 0; i < mrg->partitions ;i++) {
+ s->entries += __atomic_load_n(&mrg->index[i].stats.entries, __ATOMIC_RELAXED);
+ s->entries_referenced += __atomic_load_n(&mrg->index[i].stats.entries_referenced, __ATOMIC_RELAXED);
+ s->size += __atomic_load_n(&mrg->index[i].stats.size, __ATOMIC_RELAXED);
+ s->current_references += __atomic_load_n(&mrg->index[i].stats.current_references, __ATOMIC_RELAXED);
+ s->additions += __atomic_load_n(&mrg->index[i].stats.additions, __ATOMIC_RELAXED);
+ s->additions_duplicate += __atomic_load_n(&mrg->index[i].stats.additions_duplicate, __ATOMIC_RELAXED);
+ s->deletions += __atomic_load_n(&mrg->index[i].stats.deletions, __ATOMIC_RELAXED);
+ s->delete_having_retention_or_referenced += __atomic_load_n(&mrg->index[i].stats.delete_having_retention_or_referenced, __ATOMIC_RELAXED);
+ s->delete_misses += __atomic_load_n(&mrg->index[i].stats.delete_misses, __ATOMIC_RELAXED);
+ s->search_hits += __atomic_load_n(&mrg->index[i].stats.search_hits, __ATOMIC_RELAXED);
+ s->search_misses += __atomic_load_n(&mrg->index[i].stats.search_misses, __ATOMIC_RELAXED);
+ s->writers += __atomic_load_n(&mrg->index[i].stats.writers, __ATOMIC_RELAXED);
+ s->writers_conflicts += __atomic_load_n(&mrg->index[i].stats.writers_conflicts, __ATOMIC_RELAXED);
+ }
+
+ s->size += sizeof(MRG) + sizeof(struct mrg_partition) * mrg->partitions;
+}
+
+// ----------------------------------------------------------------------------
+// unit test
+
+struct mrg_stress_entry {
+ uuid_t uuid;
+ time_t after;
+ time_t before;
+};
+
+struct mrg_stress {
+ MRG *mrg;
+ bool stop;
+ size_t entries;
+ struct mrg_stress_entry *array;
+ size_t updates;
+};
+
+static void *mrg_stress(void *ptr) {
+ struct mrg_stress *t = ptr;
+ MRG *mrg = t->mrg;
+
+ ssize_t start = 0;
+ ssize_t end = (ssize_t)t->entries;
+ ssize_t step = 1;
+
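+    // threads with an odd tid walk the array backwards, presumably to
+    // maximize contention between concurrently updating threads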
+ if(gettid() % 2) {
+ start = (ssize_t)t->entries - 1;
+ end = -1;
+ step = -1;
+ }
+
+ while(!__atomic_load_n(&t->stop, __ATOMIC_RELAXED)) {
+ for (ssize_t i = start; i != end; i += step) {
+ struct mrg_stress_entry *e = &t->array[i];
+
+ time_t after = __atomic_sub_fetch(&e->after, 1, __ATOMIC_RELAXED);
+ time_t before = __atomic_add_fetch(&e->before, 1, __ATOMIC_RELAXED);
+
+ mrg_update_metric_retention_and_granularity_by_uuid(
+ mrg, 0x01,
+ &e->uuid,
+ after,
+ before,
+ 1,
+ before);
+
+ __atomic_add_fetch(&t->updates, 1, __ATOMIC_RELAXED);
+ }
+ }
+
+ return ptr;
+}
+
+int mrg_unittest(void) {
+ MRG *mrg = mrg_create(0);
+ METRIC *m1_t0, *m2_t0, *m3_t0, *m4_t0;
+ METRIC *m1_t1, *m2_t1, *m3_t1, *m4_t1;
+ bool ret;
+
+ uuid_t test_uuid;
+ uuid_generate(test_uuid);
+ MRG_ENTRY entry = {
+ .uuid = &test_uuid,
+ .section = 0,
+ .first_time_s = 2,
+ .last_time_s = 3,
+ .latest_update_every_s = 4,
+ };
+ m1_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret);
+ if(!ret)
+ fatal("DBENGINE METRIC: failed to add metric");
+
+ // add the same metric again
+ m2_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret);
+ if(m2_t0 != m1_t0)
+ fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer");
+ if(ret)
+ fatal("DBENGINE METRIC: managed to add the same metric twice");
+
+ m3_t0 = mrg_metric_get_and_acquire(mrg, entry.uuid, entry.section);
+ if(m3_t0 != m1_t0)
+ fatal("DBENGINE METRIC: cannot find the metric added");
+
+ // add the same metric again
+ m4_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret);
+ if(m4_t0 != m1_t0)
+ fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer");
+ if(ret)
+ fatal("DBENGINE METRIC: managed to add the same metric twice");
+
+ // add the same metric in another section
+ entry.section = 1;
+ m1_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret);
+ if(!ret)
+ fatal("DBENGINE METRIC: failed to add metric in section %zu", (size_t)entry.section);
+
+ // add the same metric again
+ m2_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret);
+ if(m2_t1 != m1_t1)
+ fatal("DBENGINE METRIC: adding the same metric twice (section %zu), does not return the same pointer", (size_t)entry.section);
+ if(ret)
+        fatal("DBENGINE METRIC: managed to add the same metric twice (section %zu)", (size_t)entry.section);
+
+ m3_t1 = mrg_metric_get_and_acquire(mrg, entry.uuid, entry.section);
+ if(m3_t1 != m1_t1)
+ fatal("DBENGINE METRIC: cannot find the metric added (section %zu)", (size_t)entry.section);
+
+ // delete the first metric
+ mrg_metric_release(mrg, m2_t0);
+ mrg_metric_release(mrg, m3_t0);
+ mrg_metric_release(mrg, m4_t0);
+ mrg_metric_set_first_time_s(mrg, m1_t0, 0);
+ mrg_metric_set_clean_latest_time_s(mrg, m1_t0, 0);
+ mrg_metric_set_hot_latest_time_s(mrg, m1_t0, 0);
+ if(!mrg_metric_release_and_delete(mrg, m1_t0))
+ fatal("DBENGINE METRIC: cannot delete the first metric");
+
+ m4_t1 = mrg_metric_get_and_acquire(mrg, entry.uuid, entry.section);
+ if(m4_t1 != m1_t1)
+ fatal("DBENGINE METRIC: cannot find the metric added (section %zu), after deleting the first one", (size_t)entry.section);
+
+ // delete the second metric
+ mrg_metric_release(mrg, m2_t1);
+ mrg_metric_release(mrg, m3_t1);
+ mrg_metric_release(mrg, m4_t1);
+ mrg_metric_set_first_time_s(mrg, m1_t1, 0);
+ mrg_metric_set_clean_latest_time_s(mrg, m1_t1, 0);
+ mrg_metric_set_hot_latest_time_s(mrg, m1_t1, 0);
+ if(!mrg_metric_release_and_delete(mrg, m1_t1))
+ fatal("DBENGINE METRIC: cannot delete the second metric");
+
+ struct mrg_statistics s;
+ mrg_get_statistics(mrg, &s);
+ if(s.entries != 0)
+ fatal("DBENGINE METRIC: invalid entries counter");
+
+ size_t entries = 1000000;
+ size_t threads = mrg->partitions / 3 + 1;
+ size_t tiers = 3;
+ size_t run_for_secs = 5;
+ netdata_log_info("preparing stress test of %zu entries...", entries);
+ struct mrg_stress t = {
+ .mrg = mrg,
+ .entries = entries,
+ .array = callocz(entries, sizeof(struct mrg_stress_entry)),
+ };
+
+ time_t now = max_acceptable_collected_time();
+ for(size_t i = 0; i < entries ;i++) {
+ uuid_generate_random(t.array[i].uuid);
+ t.array[i].after = now / 3;
+ t.array[i].before = now / 2;
+ }
+ netdata_log_info("stress test is populating MRG with 3 tiers...");
+ for(size_t i = 0; i < entries ;i++) {
+ struct mrg_stress_entry *e = &t.array[i];
+ for(size_t tier = 1; tier <= tiers ;tier++) {
+ mrg_update_metric_retention_and_granularity_by_uuid(
+ mrg, tier,
+ &e->uuid,
+ e->after,
+ e->before,
+ 1,
+ e->before);
+ }
+ }
+ netdata_log_info("stress test ready to run...");
+
+ usec_t started_ut = now_monotonic_usec();
+
+ pthread_t th[threads];
+ for(size_t i = 0; i < threads ; i++) {
+ char buf[15 + 1];
+ snprintfz(buf, sizeof(buf) - 1, "TH[%zu]", i);
+ netdata_thread_create(&th[i], buf,
+ NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG,
+ mrg_stress, &t);
+ }
+
+ sleep_usec(run_for_secs * USEC_PER_SEC);
+ __atomic_store_n(&t.stop, true, __ATOMIC_RELAXED);
+
+ for(size_t i = 0; i < threads ; i++)
+ netdata_thread_cancel(th[i]);
+
+ for(size_t i = 0; i < threads ; i++)
+ netdata_thread_join(th[i], NULL);
+
+ usec_t ended_ut = now_monotonic_usec();
+
+ struct mrg_statistics stats;
+ mrg_get_statistics(mrg, &stats);
+
+ netdata_log_info("DBENGINE METRIC: did %zu additions, %zu duplicate additions, "
+ "%zu deletions, %zu wrong deletions, "
+ "%zu successful searches, %zu wrong searches, "
+ "in %"PRIu64" usecs",
+ stats.additions, stats.additions_duplicate,
+ stats.deletions, stats.delete_misses,
+ stats.search_hits, stats.search_misses,
+ ended_ut - started_ut);
+
+ netdata_log_info("DBENGINE METRIC: updates performance: %0.2fk/sec total, %0.2fk/sec/thread",
+ (double)t.updates / (double)((ended_ut - started_ut) / USEC_PER_SEC) / 1000.0,
+ (double)t.updates / (double)((ended_ut - started_ut) / USEC_PER_SEC) / 1000.0 / threads);
+
+ mrg_destroy(mrg);
+
+ netdata_log_info("DBENGINE METRIC: all tests passed!");
+
+ return 0;
+}
diff --git a/database/engine/metric.h b/database/engine/metric.h
new file mode 100644
index 00000000..dbb94930
--- /dev/null
+++ b/database/engine/metric.h
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#ifndef DBENGINE_METRIC_H
+#define DBENGINE_METRIC_H
+
+#include "../rrd.h"
+
+#define MRG_CACHE_LINE_PADDING(x) uint8_t padding##x[64]
+
+typedef struct metric METRIC;
+typedef struct mrg MRG;
+
+typedef struct mrg_entry {
+ uuid_t *uuid;
+ Word_t section;
+ time_t first_time_s;
+ time_t last_time_s;
+ uint32_t latest_update_every_s;
+} MRG_ENTRY;
+
+struct mrg_statistics {
+ // --- non-atomic --- under a write lock
+
+ size_t entries;
+ size_t size; // total memory used, with indexing
+
+ size_t additions;
+ size_t additions_duplicate;
+
+ size_t deletions;
+ size_t delete_having_retention_or_referenced;
+ size_t delete_misses;
+
+ MRG_CACHE_LINE_PADDING(0);
+
+ // --- atomic --- multiple readers / writers
+
+ size_t entries_referenced;
+
+ MRG_CACHE_LINE_PADDING(2);
+ size_t current_references;
+
+ MRG_CACHE_LINE_PADDING(3);
+ size_t search_hits;
+ size_t search_misses;
+
+ MRG_CACHE_LINE_PADDING(4);
+ size_t writers;
+ size_t writers_conflicts;
+};
+
+MRG *mrg_create(ssize_t partitions);
+void mrg_destroy(MRG *mrg);
+
+METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric);
+bool mrg_metric_release(MRG *mrg, METRIC *metric);
+
+METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret);
+METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section);
+bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric);
+
+Word_t mrg_metric_id(MRG *mrg, METRIC *metric);
+uuid_t *mrg_metric_uuid(MRG *mrg, METRIC *metric);
+Word_t mrg_metric_section(MRG *mrg, METRIC *metric);
+
+bool mrg_metric_set_first_time_s(MRG *mrg, METRIC *metric, time_t first_time_s);
+bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg, METRIC *metric, time_t first_time_s);
+time_t mrg_metric_get_first_time_s(MRG *mrg, METRIC *metric);
+
+bool mrg_metric_set_clean_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s);
+bool mrg_metric_set_hot_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s);
+time_t mrg_metric_get_latest_time_s(MRG *mrg, METRIC *metric);
+
+bool mrg_metric_set_update_every(MRG *mrg, METRIC *metric, time_t update_every_s);
+bool mrg_metric_set_update_every_s_if_zero(MRG *mrg, METRIC *metric, time_t update_every_s);
+time_t mrg_metric_get_update_every_s(MRG *mrg, METRIC *metric);
+
+void mrg_metric_expand_retention(MRG *mrg, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s);
+void mrg_metric_get_retention(MRG *mrg, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s);
+bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric);
+
+bool mrg_metric_set_writer(MRG *mrg, METRIC *metric);
+bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric);
+
+void mrg_get_statistics(MRG *mrg, struct mrg_statistics *s);
+size_t mrg_aral_structures(void);
+size_t mrg_aral_overhead(void);
+
+
+void mrg_update_metric_retention_and_granularity_by_uuid(
+ MRG *mrg, Word_t section, uuid_t *uuid,
+ time_t first_time_s, time_t last_time_s,
+ time_t update_every_s, time_t now_s);
+
+#endif // DBENGINE_METRIC_H
diff --git a/database/engine/page.c b/database/engine/page.c
new file mode 100644
index 00000000..b7a39348
--- /dev/null
+++ b/database/engine/page.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "page.h"
+
+#include "libnetdata/libnetdata.h"
+
+typedef enum __attribute__((packed)) {
+ PAGE_OPTION_ALL_VALUES_EMPTY = (1 << 0),
+} PAGE_OPTIONS;
+
+typedef enum __attribute__((packed)) {
+ PGD_STATE_CREATED_FROM_COLLECTOR = (1 << 0),
+ PGD_STATE_CREATED_FROM_DISK = (1 << 1),
+ PGD_STATE_SCHEDULED_FOR_FLUSHING = (1 << 2),
+ PGD_STATE_FLUSHED_TO_DISK = (1 << 3),
+} PGD_STATES;
+
+typedef struct {
+ uint8_t *data;
+ uint32_t size;
+} page_raw_t;
+
+
+typedef struct {
+ size_t num_buffers;
+ gorilla_writer_t *writer;
+ int aral_index;
+} page_gorilla_t;
+
+struct pgd {
+ // the page type
+ uint8_t type;
+
+ // options related to the page
+ PAGE_OPTIONS options;
+
+ PGD_STATES states;
+
+    // the used number of slots in the page
+ uint32_t used;
+
+ // the total number of slots available in the page
+ uint32_t slots;
+
+ union {
+ page_raw_t raw;
+ page_gorilla_t gorilla;
+ };
+};
+
+// ----------------------------------------------------------------------------
+// memory management
+
+struct {
+ ARAL *aral_pgd;
+ ARAL *aral_data[RRD_STORAGE_TIERS];
+ ARAL *aral_gorilla_buffer[4];
+ ARAL *aral_gorilla_writer[4];
+} pgd_alloc_globals = {};
+
+static ARAL *pgd_aral_data_lookup(size_t size)
+{
+ for (size_t tier = 0; tier < storage_tiers; tier++)
+ if (size == tier_page_size[tier])
+ return pgd_alloc_globals.aral_data[tier];
+
+ return NULL;
+}
+
+void pgd_init_arals(void)
+{
+ // pgd aral
+ {
+ char buf[20 + 1];
+ snprintfz(buf, sizeof(buf) - 1, "pgd");
+
+ // FIXME: add stats
+ pgd_alloc_globals.aral_pgd = aral_create(
+ buf,
+ sizeof(struct pgd),
+ 64,
+ 512 * (sizeof(struct pgd)),
+ pgc_aral_statistics(),
+ NULL, NULL, false, false);
+ }
+
+ // tier page aral
+ {
+ for (size_t i = storage_tiers; i > 0 ;i--)
+ {
+ size_t tier = storage_tiers - i;
+
+ char buf[20 + 1];
+ snprintfz(buf, sizeof(buf) - 1, "tier%zu-pages", tier);
+
+ pgd_alloc_globals.aral_data[tier] = aral_create(
+ buf,
+ tier_page_size[tier],
+ 64,
+ 512 * (tier_page_size[tier]),
+ pgc_aral_statistics(),
+ NULL, NULL, false, false);
+ }
+ }
+
+ // gorilla buffers aral
+ for (size_t i = 0; i != 4; i++) {
+ char buf[20 + 1];
+ snprintfz(buf, sizeof(buf) - 1, "gbuffer-%zu", i);
+
+ // FIXME: add stats
+ pgd_alloc_globals.aral_gorilla_buffer[i] = aral_create(
+ buf,
+ GORILLA_BUFFER_SIZE,
+ 64,
+ 512 * GORILLA_BUFFER_SIZE,
+ pgc_aral_statistics(),
+ NULL, NULL, false, false);
+ }
+
+ // gorilla writers aral
+ for (size_t i = 0; i != 4; i++) {
+ char buf[20 + 1];
+ snprintfz(buf, sizeof(buf) - 1, "gwriter-%zu", i);
+
+ // FIXME: add stats
+ pgd_alloc_globals.aral_gorilla_writer[i] = aral_create(
+ buf,
+ sizeof(gorilla_writer_t),
+ 64,
+ 512 * sizeof(gorilla_writer_t),
+ pgc_aral_statistics(),
+ NULL, NULL, false, false);
+ }
+}
+
+static void *pgd_data_aral_alloc(size_t size)
+{
+ ARAL *ar = pgd_aral_data_lookup(size);
+ if (!ar)
+ return mallocz(size);
+ else
+ return aral_mallocz(ar);
+}
+
+static void pgd_data_aral_free(void *page, size_t size)
+{
+ ARAL *ar = pgd_aral_data_lookup(size);
+ if (!ar)
+ freez(page);
+ else
+ aral_freez(ar, page);
+}
+
+// ----------------------------------------------------------------------------
+// management api
+
+PGD *pgd_create(uint8_t type, uint32_t slots)
+{
+ PGD *pg = aral_mallocz(pgd_alloc_globals.aral_pgd);
+ pg->type = type;
+ pg->used = 0;
+ pg->slots = slots;
+ pg->options = PAGE_OPTION_ALL_VALUES_EMPTY;
+ pg->states = PGD_STATE_CREATED_FROM_COLLECTOR;
+
+ switch (type) {
+ case PAGE_METRICS:
+ case PAGE_TIER: {
+ uint32_t size = slots * page_type_size[type];
+
+ internal_fatal(!size || slots == 1,
+ "DBENGINE: invalid number of slots (%u) or page type (%u)", slots, type);
+
+ pg->raw.size = size;
+ pg->raw.data = pgd_data_aral_alloc(size);
+ break;
+ }
+ case PAGE_GORILLA_METRICS: {
+ internal_fatal(slots == 1,
+ "DBENGINE: invalid number of slots (%u) or page type (%u)", slots, type);
+
+ pg->slots = 8 * GORILLA_BUFFER_SLOTS;
+
+ // allocate new gorilla writer
+ pg->gorilla.aral_index = gettid() % 4;
+ pg->gorilla.writer = aral_mallocz(pgd_alloc_globals.aral_gorilla_writer[pg->gorilla.aral_index]);
+
+ // allocate new gorilla buffer
+ gorilla_buffer_t *gbuf = aral_mallocz(pgd_alloc_globals.aral_gorilla_buffer[pg->gorilla.aral_index]);
+ memset(gbuf, 0, GORILLA_BUFFER_SIZE);
+ global_statistics_gorilla_buffer_add_hot();
+
+ *pg->gorilla.writer = gorilla_writer_init(gbuf, GORILLA_BUFFER_SLOTS);
+ pg->gorilla.num_buffers = 1;
+
+ break;
+ }
+ default:
+ fatal("Unknown page type: %uc", type);
+ }
+
+ return pg;
+}
+
+PGD *pgd_create_from_disk_data(uint8_t type, void *base, uint32_t size)
+{
+ if (!size)
+ return PGD_EMPTY;
+
+ if (size < page_type_size[type])
+ return PGD_EMPTY;
+
+ PGD *pg = aral_mallocz(pgd_alloc_globals.aral_pgd);
+
+ pg->type = type;
+ pg->states = PGD_STATE_CREATED_FROM_DISK;
+ pg->options = ~PAGE_OPTION_ALL_VALUES_EMPTY;
+
+ switch (type)
+ {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ pg->raw.size = size;
+ pg->used = size / page_type_size[type];
+ pg->slots = pg->used;
+
+ pg->raw.data = pgd_data_aral_alloc(size);
+ memcpy(pg->raw.data, base, size);
+ break;
+ case PAGE_GORILLA_METRICS:
+ internal_fatal(size == 0, "Asked to create page with 0 data!!!");
+ internal_fatal(size % sizeof(uint32_t), "Unaligned gorilla buffer size");
+ internal_fatal(size % GORILLA_BUFFER_SIZE, "Expected size to be a multiple of %zu-bytes", GORILLA_BUFFER_SIZE);
+
+ pg->raw.data = mallocz(size);
+ pg->raw.size = size;
+
+ // TODO: rm this
+ memset(pg->raw.data, 0, size);
+ memcpy(pg->raw.data, base, size);
+
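+            // gorilla_buffer_patch() appears to fix up the chained buffer
+            // pointers in the data just copied from disk and returns the
+            // total number of entries they hold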
+ uint32_t total_entries = gorilla_buffer_patch((void *) pg->raw.data);
+
+ pg->used = total_entries;
+ pg->slots = pg->used;
+ break;
+ default:
+ fatal("Unknown page type: %uc", type);
+ }
+
+ return pg;
+}
+
+void pgd_free(PGD *pg)
+{
+ if (!pg)
+ return;
+
+ if (pg == PGD_EMPTY)
+ return;
+
+ switch (pg->type)
+ {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ pgd_data_aral_free(pg->raw.data, pg->raw.size);
+ break;
+ case PAGE_GORILLA_METRICS: {
+ if (pg->states & PGD_STATE_CREATED_FROM_DISK)
+ {
+ internal_fatal(pg->raw.data == NULL, "Tried to free gorilla PGD loaded from disk with NULL data");
+ freez(pg->raw.data);
+ pg->raw.data = NULL;
+ }
+ else if ((pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) ||
+ (pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) ||
+ (pg->states & PGD_STATE_FLUSHED_TO_DISK))
+ {
+ internal_fatal(pg->gorilla.writer == NULL,
+ "PGD does not have an active gorilla writer");
+
+ internal_fatal(pg->gorilla.num_buffers == 0,
+ "PGD does not have any gorilla buffers allocated");
+
+ while (true) {
+ gorilla_buffer_t *gbuf = gorilla_writer_drop_head_buffer(pg->gorilla.writer);
+ if (!gbuf)
+ break;
+ aral_freez(pgd_alloc_globals.aral_gorilla_buffer[pg->gorilla.aral_index], gbuf);
+ pg->gorilla.num_buffers -= 1;
+ }
+
+ internal_fatal(pg->gorilla.num_buffers != 0,
+ "Could not free all gorilla writer buffers");
+
+ aral_freez(pgd_alloc_globals.aral_gorilla_writer[pg->gorilla.aral_index], pg->gorilla.writer);
+ pg->gorilla.writer = NULL;
+ } else {
+ fatal("pgd_free() called on gorilla page with unsupported state");
+ // TODO: should we support any other states?
+ // if (!(pg->states & PGD_STATE_FLUSHED_TO_DISK))
+ // fatal("pgd_free() is not supported yet for pages flushed to disk");
+ }
+
+ break;
+ }
+ default:
+ fatal("Unknown page type: %uc", pg->type);
+ }
+
+ aral_freez(pgd_alloc_globals.aral_pgd, pg);
+}
+
+// ----------------------------------------------------------------------------
+// utility functions
+
+uint32_t pgd_type(PGD *pg)
+{
+ return pg->type;
+}
+
+bool pgd_is_empty(PGD *pg)
+{
+ if (!pg)
+ return true;
+
+ if (pg == PGD_EMPTY)
+ return true;
+
+ if (pg->used == 0)
+ return true;
+
+ if (pg->options & PAGE_OPTION_ALL_VALUES_EMPTY)
+ return true;
+
+ return false;
+}
+
+uint32_t pgd_slots_used(PGD *pg)
+{
+ if (!pg)
+ return 0;
+
+ if (pg == PGD_EMPTY)
+ return 0;
+
+ return pg->used;
+}
+
+uint32_t pgd_memory_footprint(PGD *pg)
+{
+ if (!pg)
+ return 0;
+
+ if (pg == PGD_EMPTY)
+ return 0;
+
+ size_t footprint = 0;
+ switch (pg->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ footprint = sizeof(PGD) + pg->raw.size;
+ break;
+ case PAGE_GORILLA_METRICS: {
+ if (pg->states & PGD_STATE_CREATED_FROM_DISK)
+ footprint = sizeof(PGD) + pg->raw.size;
+ else
+ footprint = sizeof(PGD) + sizeof(gorilla_writer_t) + (pg->gorilla.num_buffers * GORILLA_BUFFER_SIZE);
+
+ break;
+ }
+ default:
+ fatal("Unknown page type: %uc", pg->type);
+ }
+
+ return footprint;
+}
+
+uint32_t pgd_disk_footprint(PGD *pg)
+{
+ if (!pgd_slots_used(pg))
+ return 0;
+
+ size_t size = 0;
+
+ switch (pg->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER: {
+ uint32_t used_size = pg->used * page_type_size[pg->type];
+ internal_fatal(used_size > pg->raw.size, "Wrong disk footprint page size");
+ size = used_size;
+
+ break;
+ }
+ case PAGE_GORILLA_METRICS: {
+ if (pg->states & PGD_STATE_CREATED_FROM_COLLECTOR ||
+ pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING ||
+ pg->states & PGD_STATE_FLUSHED_TO_DISK)
+ {
+ internal_fatal(!pg->gorilla.writer,
+ "pgd_disk_footprint() not implemented for NULL gorilla writers");
+
+ internal_fatal(pg->gorilla.num_buffers == 0,
+ "Gorilla writer does not have any buffers");
+
+ size = pg->gorilla.num_buffers * GORILLA_BUFFER_SIZE;
+
+ if (pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) {
+ global_statistics_tier0_disk_compressed_bytes(gorilla_writer_nbytes(pg->gorilla.writer));
+ global_statistics_tier0_disk_uncompressed_bytes(gorilla_writer_entries(pg->gorilla.writer) * sizeof(storage_number));
+ }
+ } else if (pg->states & PGD_STATE_CREATED_FROM_DISK) {
+ size = pg->raw.size;
+ } else {
+ fatal("Asked disk footprint on unknown page state");
+ }
+
+ break;
+ }
+ default:
+ fatal("Unknown page type: %uc", pg->type);
+ }
+
+ internal_fatal(pg->states & PGD_STATE_CREATED_FROM_DISK,
+ "Disk footprint asked for page created from disk.");
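+    // note: a successful call leaves the page in the "scheduled for flushing" state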
+ pg->states = PGD_STATE_SCHEDULED_FOR_FLUSHING;
+ return size;
+}
+
+void pgd_copy_to_extent(PGD *pg, uint8_t *dst, uint32_t dst_size)
+{
+ internal_fatal(pgd_disk_footprint(pg) != dst_size, "Wrong disk footprint size requested (need %u, available %u)",
+ pgd_disk_footprint(pg), dst_size);
+
+ switch (pg->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ memcpy(dst, pg->raw.data, dst_size);
+ break;
+ case PAGE_GORILLA_METRICS: {
+ if ((pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) == 0)
+ fatal("Copying to extent is supported only for PGDs that are scheduled for flushing.");
+
+ internal_fatal(!pg->gorilla.writer,
+ "pgd_copy_to_extent() not implemented for NULL gorilla writers");
+
+ internal_fatal(pg->gorilla.num_buffers == 0,
+ "pgd_copy_to_extent() gorilla writer does not have any buffers");
+
+ bool ok = gorilla_writer_serialize(pg->gorilla.writer, dst, dst_size);
+ UNUSED(ok);
+ internal_fatal(!ok,
+ "pgd_copy_to_extent() tried to serialize pg=%p, gw=%p (with dst_size=%u bytes, num_buffers=%zu)",
+ pg, pg->gorilla.writer, dst_size, pg->gorilla.num_buffers);
+ break;
+ }
+ default:
+ fatal("Unknown page type: %uc", pg->type);
+ }
+
+ pg->states = PGD_STATE_FLUSHED_TO_DISK;
+}
+
+// ----------------------------------------------------------------------------
+// data collection
+
+void pgd_append_point(PGD *pg,
+ usec_t point_in_time_ut __maybe_unused,
+ NETDATA_DOUBLE n,
+ NETDATA_DOUBLE min_value,
+ NETDATA_DOUBLE max_value,
+ uint16_t count,
+ uint16_t anomaly_count,
+ SN_FLAGS flags,
+ uint32_t expected_slot)
+{
+ if (unlikely(pg->used >= pg->slots))
+ fatal("DBENGINE: attempted to write beyond page size (page type %u, slots %u, used %u)",
+ pg->type, pg->slots, pg->used /* FIXME:, pg->size */);
+
+ if (unlikely(pg->used != expected_slot))
+ fatal("DBENGINE: page is not aligned to expected slot (used %u, expected %u)",
+ pg->used, expected_slot);
+
+ if (!(pg->states & PGD_STATE_CREATED_FROM_COLLECTOR))
+ fatal("DBENGINE: collection on page not created from a collector");
+
+ if (pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING)
+ fatal("Data collection on page already scheduled for flushing");
+
+ switch (pg->type) {
+ case PAGE_METRICS: {
+ storage_number *tier0_metric_data = (storage_number *)pg->raw.data;
+ storage_number t = pack_storage_number(n, flags);
+ tier0_metric_data[pg->used++] = t;
+
+ if ((pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) && does_storage_number_exist(t))
+ pg->options &= ~PAGE_OPTION_ALL_VALUES_EMPTY;
+
+ break;
+ }
+ case PAGE_TIER: {
+ storage_number_tier1_t *tier12_metric_data = (storage_number_tier1_t *)pg->raw.data;
+ storage_number_tier1_t t;
+ t.sum_value = (float) n;
+ t.min_value = (float) min_value;
+ t.max_value = (float) max_value;
+ t.anomaly_count = anomaly_count;
+ t.count = count;
+ tier12_metric_data[pg->used++] = t;
+
+ if ((pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) && fpclassify(n) != FP_NAN)
+ pg->options &= ~PAGE_OPTION_ALL_VALUES_EMPTY;
+
+ break;
+ }
+ case PAGE_GORILLA_METRICS: {
+ pg->used++;
+ storage_number t = pack_storage_number(n, flags);
+
+ if ((pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) && does_storage_number_exist(t))
+ pg->options &= ~PAGE_OPTION_ALL_VALUES_EMPTY;
+
+ bool ok = gorilla_writer_write(pg->gorilla.writer, t);
+ if (!ok) {
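+                // the current gorilla buffer is full: allocate and zero a new
+                // buffer, chain it to the writer, and retry the write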
+ gorilla_buffer_t *new_buffer = aral_mallocz(pgd_alloc_globals.aral_gorilla_buffer[pg->gorilla.aral_index]);
+ memset(new_buffer, 0, GORILLA_BUFFER_SIZE);
+
+ gorilla_writer_add_buffer(pg->gorilla.writer, new_buffer, GORILLA_BUFFER_SLOTS);
+ pg->gorilla.num_buffers += 1;
+ global_statistics_gorilla_buffer_add_hot();
+
+ ok = gorilla_writer_write(pg->gorilla.writer, t);
+                internal_fatal(ok == false, "Failed to write value in newly allocated gorilla buffer.");
+ }
+ break;
+ }
+ default:
+ fatal("DBENGINE: unknown page type id %d", pg->type);
+ break;
+ }
+}
+
+// ----------------------------------------------------------------------------
+// querying with cursor
+
+static void pgdc_seek(PGDC *pgdc, uint32_t position)
+{
+ PGD *pg = pgdc->pgd;
+
+ switch (pg->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ pgdc->slots = pgdc->pgd->used;
+ break;
+ case PAGE_GORILLA_METRICS: {
+ if (pg->states & PGD_STATE_CREATED_FROM_DISK) {
+ pgdc->slots = pgdc->pgd->slots;
+ pgdc->gr = gorilla_reader_init((void *) pg->raw.data);
+ } else {
+ if (!(pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) &&
+ !(pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) &&
+ !(pg->states & PGD_STATE_FLUSHED_TO_DISK))
+                    fatal("pgdc_seek() is not supported for pages in this state.");
+
+ if (!pg->gorilla.writer)
+ fatal("Seeking from a page without an active gorilla writer is not supported (yet).");
+
+ pgdc->slots = gorilla_writer_entries(pg->gorilla.writer);
+ pgdc->gr = gorilla_writer_get_reader(pg->gorilla.writer);
+ }
+
+ if (position > pgdc->slots)
+ position = pgdc->slots;
+
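+            // gorilla buffers are sequential-access only, so seeking means
+            // reading and discarding 'position' values from the start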
+ for (uint32_t i = 0; i != position; i++) {
+ uint32_t value;
+
+ bool ok = gorilla_reader_read(&pgdc->gr, &value);
+
+ if (!ok) {
+ // this is fine, the reader will return empty points
+ break;
+ }
+ }
+
+ break;
+ }
+ default:
+ fatal("DBENGINE: unknown page type id %d", pg->type);
+ break;
+ }
+}
+
+void pgdc_reset(PGDC *pgdc, PGD *pgd, uint32_t position)
+{
+ // pgd might be null and position equal to UINT32_MAX
+
+ pgdc->pgd = pgd;
+ pgdc->position = position;
+
+ if (!pgd)
+ return;
+
+ if (pgd == PGD_EMPTY)
+ return;
+
+ if (position == UINT32_MAX)
+ return;
+
+ pgdc_seek(pgdc, position);
+}
+
+bool pgdc_get_next_point(PGDC *pgdc, uint32_t expected_position, STORAGE_POINT *sp)
+{
+ if (!pgdc->pgd || pgdc->pgd == PGD_EMPTY || pgdc->position >= pgdc->slots)
+ {
+ storage_point_empty(*sp, sp->start_time_s, sp->end_time_s);
+ return false;
+ }
+
+ internal_fatal(pgdc->position != expected_position, "Wrong expected cursor position");
+
+ switch (pgdc->pgd->type)
+ {
+ case PAGE_METRICS: {
+ storage_number *array = (storage_number *) pgdc->pgd->raw.data;
+ storage_number n = array[pgdc->position++];
+
+ sp->min = sp->max = sp->sum = unpack_storage_number(n);
+ sp->flags = (SN_FLAGS)(n & SN_USER_FLAGS);
+ sp->count = 1;
+ sp->anomaly_count = is_storage_number_anomalous(n) ? 1 : 0;
+
+ return true;
+ }
+ case PAGE_TIER: {
+ storage_number_tier1_t *array = (storage_number_tier1_t *) pgdc->pgd->raw.data;
+ storage_number_tier1_t n = array[pgdc->position++];
+
+ sp->flags = n.anomaly_count ? SN_FLAG_NONE : SN_FLAG_NOT_ANOMALOUS;
+ sp->count = n.count;
+ sp->anomaly_count = n.anomaly_count;
+ sp->min = n.min_value;
+ sp->max = n.max_value;
+ sp->sum = n.sum_value;
+
+ return true;
+ }
+ case PAGE_GORILLA_METRICS: {
+ pgdc->position++;
+
+ uint32_t n = 666666666;
+ bool ok = gorilla_reader_read(&pgdc->gr, &n);
+ if (ok) {
+ sp->min = sp->max = sp->sum = unpack_storage_number(n);
+ sp->flags = (SN_FLAGS)(n & SN_USER_FLAGS);
+ sp->count = 1;
+ sp->anomaly_count = is_storage_number_anomalous(n) ? 1 : 0;
+ } else {
+ storage_point_empty(*sp, sp->start_time_s, sp->end_time_s);
+ }
+
+ return ok;
+ }
+ default: {
+ static bool logged = false;
+ if (!logged)
+ {
+ netdata_log_error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", pgd_type(pgdc->pgd));
+ logged = true;
+ }
+
+ storage_point_empty(*sp, sp->start_time_s, sp->end_time_s);
+ return false;
+ }
+ }
+}
diff --git a/database/engine/page.h b/database/engine/page.h
new file mode 100644
index 00000000..32c87c58
--- /dev/null
+++ b/database/engine/page.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef DBENGINE_PAGE_H
+#define DBENGINE_PAGE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "libnetdata/libnetdata.h"
+
+typedef struct pgd_cursor {
+ struct pgd *pgd;
+ uint32_t position;
+ uint32_t slots;
+
+ gorilla_reader_t gr;
+} PGDC;
+
+#include "rrdengine.h"
+
+typedef struct pgd PGD;
+
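+// sentinel used for pages that carry no data at all
+// (returned by pgd_create_from_disk_data() for zero-sized pages)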
+#define PGD_EMPTY (PGD *)(-1)
+
+void pgd_init_arals(void);
+
+PGD *pgd_create(uint8_t type, uint32_t slots);
+PGD *pgd_create_from_disk_data(uint8_t type, void *base, uint32_t size);
+void pgd_free(PGD *pg);
+
+uint32_t pgd_type(PGD *pg);
+bool pgd_is_empty(PGD *pg);
+uint32_t pgd_slots_used(PGD *pg);
+
+uint32_t pgd_memory_footprint(PGD *pg);
+uint32_t pgd_disk_footprint(PGD *pg);
+
+void pgd_copy_to_extent(PGD *pg, uint8_t *dst, uint32_t dst_size);
+
+void pgd_append_point(PGD *pg,
+ usec_t point_in_time_ut,
+ NETDATA_DOUBLE n,
+ NETDATA_DOUBLE min_value,
+ NETDATA_DOUBLE max_value,
+ uint16_t count,
+ uint16_t anomaly_count,
+ SN_FLAGS flags,
+ uint32_t expected_slot);
+
+void pgdc_reset(PGDC *pgdc, PGD *pgd, uint32_t position);
+bool pgdc_get_next_point(PGDC *pgdc, uint32_t expected_position, STORAGE_POINT *sp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // DBENGINE_PAGE_H
diff --git a/database/engine/page_test.cc b/database/engine/page_test.cc
new file mode 100644
index 00000000..d61299bc
--- /dev/null
+++ b/database/engine/page_test.cc
@@ -0,0 +1,405 @@
+#include "page.h"
+#include "page_test.h"
+
+#ifdef HAVE_GTEST
+
+#include <gtest/gtest.h>
+#include <limits>
+#include <random>
+
+bool operator==(const STORAGE_POINT lhs, const STORAGE_POINT rhs) {
+ if (lhs.min != rhs.min)
+ return false;
+
+ if (lhs.max != rhs.max)
+ return false;
+
+ if (lhs.sum != rhs.sum)
+ return false;
+
+ if (lhs.start_time_s != rhs.start_time_s)
+ return false;
+
+ if (lhs.end_time_s != rhs.end_time_s)
+ return false;
+
+ if (lhs.count != rhs.count)
+ return false;
+
+ if (lhs.flags != rhs.flags)
+ return false;
+
+ return true;
+}
+
+// TODO: use value-parameterized tests
+// http://google.github.io/googletest/advanced.html#value-parameterized-tests
+static uint8_t page_type = PAGE_GORILLA_METRICS;
+
+static size_t slots_for_page(size_t n) {
+ switch (page_type) {
+ case PAGE_METRICS:
+ return 1024;
+ case PAGE_GORILLA_METRICS:
+ return n;
+ default:
+ fatal("Slots requested for unsupported page: %uc", page_type);
+ }
+}
+
+TEST(PGD, EmptyOrNull) {
+ PGD *pg = NULL;
+
+ PGDC cursor;
+ STORAGE_POINT sp;
+
+ EXPECT_TRUE(pgd_is_empty(pg));
+ EXPECT_EQ(pgd_slots_used(pg), 0);
+ EXPECT_EQ(pgd_memory_footprint(pg), 0);
+ EXPECT_EQ(pgd_disk_footprint(pg), 0);
+
+ pgdc_reset(&cursor, pg, 0);
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, 0, &sp));
+
+ pgd_free(pg);
+
+ pg = PGD_EMPTY;
+
+ EXPECT_TRUE(pgd_is_empty(pg));
+ EXPECT_EQ(pgd_slots_used(pg), 0);
+ EXPECT_EQ(pgd_memory_footprint(pg), 0);
+ EXPECT_EQ(pgd_disk_footprint(pg), 0);
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, 0, &sp));
+
+ pgdc_reset(&cursor, pg, 0);
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, 0, &sp));
+
+ pgd_free(pg);
+}
+
+TEST(PGD, Create) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg = pgd_create(page_type, slots);
+
+ EXPECT_EQ(pgd_type(pg), page_type);
+ EXPECT_TRUE(pgd_is_empty(pg));
+ EXPECT_EQ(pgd_slots_used(pg), 0);
+
+ for (size_t i = 0; i != slots; i++) {
+ pgd_append_point(pg, i, i, 0, 0, 1, 1, SN_DEFAULT_FLAGS, i);
+ EXPECT_FALSE(pgd_is_empty(pg));
+ }
+ EXPECT_EQ(pgd_slots_used(pg), slots);
+
+ EXPECT_DEATH(
+ pgd_append_point(pg, slots, slots, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slots),
+ ".*"
+ );
+
+ pgd_free(pg);
+}
+
+TEST(PGD, CursorFullPage) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg = pgd_create(page_type, slots);
+
+ for (size_t slot = 0; slot != slots; slot++)
+ pgd_append_point(pg, slot, slot, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot);
+
+ for (size_t i = 0; i != 2; i++) {
+ PGDC cursor;
+ pgdc_reset(&cursor, pg, 0);
+
+ STORAGE_POINT sp;
+ for (size_t slot = 0; slot != slots; slot++) {
+ EXPECT_TRUE(pgdc_get_next_point(&cursor, slot, &sp));
+
+ EXPECT_EQ(slot, static_cast<size_t>(sp.min));
+ EXPECT_EQ(sp.min, sp.max);
+ EXPECT_EQ(sp.min, sp.sum);
+ EXPECT_EQ(sp.count, 1);
+ EXPECT_EQ(sp.anomaly_count, 0);
+ }
+
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, slots, &sp));
+ }
+
+ for (size_t i = 0; i != 2; i++) {
+ PGDC cursor;
+ pgdc_reset(&cursor, pg, slots / 2);
+
+ STORAGE_POINT sp;
+ for (size_t slot = slots / 2; slot != slots; slot++) {
+ EXPECT_TRUE(pgdc_get_next_point(&cursor, slot, &sp));
+
+ EXPECT_EQ(slot, static_cast<size_t>(sp.min));
+ EXPECT_EQ(sp.min, sp.max);
+ EXPECT_EQ(sp.min, sp.sum);
+ EXPECT_EQ(sp.count, 1);
+ EXPECT_EQ(sp.anomaly_count, 0);
+ }
+
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, slots, &sp));
+ }
+
+ // out of bounds seek
+ {
+ PGDC cursor;
+ pgdc_reset(&cursor, pg, 2 * slots);
+
+ STORAGE_POINT sp;
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, 2 * slots, &sp));
+ }
+
+ pgd_free(pg);
+}
+
+TEST(PGD, CursorHalfPage) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg = pgd_create(page_type, slots);
+
+ PGDC cursor;
+ STORAGE_POINT sp;
+
+ // fill the 1st half of the page
+ for (size_t slot = 0; slot != slots / 2; slot++)
+ pgd_append_point(pg, slot, slot, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot);
+
+ pgdc_reset(&cursor, pg, 0);
+
+ for (size_t slot = 0; slot != slots / 2; slot++) {
+ EXPECT_TRUE(pgdc_get_next_point(&cursor, slot, &sp));
+
+ EXPECT_EQ(slot, static_cast<size_t>(sp.min));
+ EXPECT_EQ(sp.min, sp.max);
+ EXPECT_EQ(sp.min, sp.sum);
+ EXPECT_EQ(sp.count, 1);
+ EXPECT_EQ(sp.anomaly_count, 0);
+ }
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, slots / 2, &sp));
+
+ // reset pgdc to the end of the page, we should not be getting more
+ // points even if the page has grown in between.
+
+ pgdc_reset(&cursor, pg, slots / 2);
+
+ for (size_t slot = slots / 2; slot != slots; slot++)
+ pgd_append_point(pg, slot, slot, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot);
+
+ for (size_t slot = slots / 2; slot != slots; slot++)
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, slot, &sp));
+
+ EXPECT_FALSE(pgdc_get_next_point(&cursor, slots, &sp));
+
+ pgd_free(pg);
+}
+
+TEST(PGD, MemoryFootprint) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg = pgd_create(page_type, slots);
+
+ uint32_t footprint = 0;
+ switch (pgd_type(pg)) {
+ case PAGE_METRICS:
+ footprint = slots * sizeof(uint32_t);
+ break;
+ case PAGE_GORILLA_METRICS:
+ footprint = 128 * sizeof(uint32_t);
+ break;
+ default:
+            fatal("Unknown page type: %uc", pgd_type(pg));
+ }
+ EXPECT_NEAR(pgd_memory_footprint(pg), footprint, 128);
+
+ std::random_device rand_dev;
+ std::mt19937 gen(rand_dev());
+ std::uniform_int_distribution<uint32_t> distr(std::numeric_limits<uint32_t>::min(),
+ std::numeric_limits<uint32_t>::max()); // define the range
+
+ for (size_t slot = 0; slot != slots; slot++) {
+ uint32_t n = distr(gen);
+ pgd_append_point(pg, slot, n, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot);
+ }
+
+ footprint = slots * sizeof(uint32_t);
+
+ uint32_t abs_error = 0;
+ switch (pgd_type(pg)) {
+ case PAGE_METRICS:
+ abs_error = 128;
+ break;
+ case PAGE_GORILLA_METRICS:
+ abs_error = footprint / 10;
+ break;
+ default:
+            fatal("Unknown page type: %uc", pgd_type(pg));
+ }
+
+ EXPECT_NEAR(pgd_memory_footprint(pg), footprint, abs_error);
+}
+
+TEST(PGD, DiskFootprint) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg = pgd_create(page_type, slots);
+
+ std::random_device rand_dev;
+ std::mt19937 gen(rand_dev());
+ std::uniform_int_distribution<uint32_t> distr(std::numeric_limits<uint32_t>::min(),
+ std::numeric_limits<uint32_t>::max()); // define the range
+
+ size_t used_slots = 16;
+
+ for (size_t slot = 0; slot != used_slots; slot++) {
+ uint32_t n = distr(gen);
+ pgd_append_point(pg, slot, n, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot);
+ }
+
+ uint32_t footprint = 0;
+ switch (pgd_type(pg)) {
+ case PAGE_METRICS:
+ footprint = used_slots * sizeof(uint32_t);
+ break;
+ case PAGE_GORILLA_METRICS:
+ footprint = 128 * sizeof(uint32_t);
+ break;
+ default:
+            fatal("Unknown page type: %uc", pgd_type(pg));
+ }
+ EXPECT_EQ(pgd_disk_footprint(pg), footprint);
+
+ pgd_free(pg);
+
+ pg = pgd_create(page_type, slots);
+
+ used_slots = 128 + 64;
+
+ for (size_t slot = 0; slot != used_slots; slot++) {
+ uint32_t n = distr(gen);
+ pgd_append_point(pg, slot, n, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot);
+ }
+
+ switch (pgd_type(pg)) {
+ case PAGE_METRICS:
+ footprint = used_slots * sizeof(uint32_t);
+ break;
+ case PAGE_GORILLA_METRICS:
+ footprint = 2 * (128 * sizeof(uint32_t));
+ break;
+ default:
+            fatal("Unknown page type: %uc", pgd_type(pg));
+ }
+ EXPECT_EQ(pgd_disk_footprint(pg), footprint);
+
+ pgd_free(pg);
+}
+
+TEST(PGD, CopyToExtent) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg_collector = pgd_create(page_type, slots);
+
+ uint32_t value = 666;
+ pgd_append_point(pg_collector, 0, value, 0, 0, 1, 0, SN_DEFAULT_FLAGS, 0);
+
+ uint32_t size_in_bytes = pgd_disk_footprint(pg_collector);
+ EXPECT_EQ(size_in_bytes, 512);
+
+ uint32_t size_in_words = size_in_bytes / sizeof(uint32_t);
+ alignas(sizeof(uintptr_t)) uint32_t disk_buffer[size_in_words];
+
+ for (size_t i = 0; i != size_in_words; i++) {
+ disk_buffer[i] = std::numeric_limits<uint32_t>::max();
+ }
+
+ pgd_copy_to_extent(pg_collector, (uint8_t *) &disk_buffer[0], size_in_bytes);
+
+ EXPECT_EQ(disk_buffer[0], NULL);
+ EXPECT_EQ(disk_buffer[1], NULL);
+ EXPECT_EQ(disk_buffer[2], 1);
+ EXPECT_EQ(disk_buffer[3], 32);
+ storage_number sn = pack_storage_number(value, SN_DEFAULT_FLAGS);
+ EXPECT_EQ(disk_buffer[4], sn);
+
+ // make sure the rest of the page is 0'ed so that it's amenable to compression
+ for (size_t i = 5; i != size_in_words; i++)
+ EXPECT_EQ(disk_buffer[i], 0);
+
+ pgd_free(pg_collector);
+}
+
+TEST(PGD, Roundtrip) {
+ size_t slots = slots_for_page(1024 * 1024);
+ PGD *pg_collector = pgd_create(page_type, slots);
+
+ for (size_t i = 0; i != slots; i++)
+ pgd_append_point(pg_collector, i, i, 0, 0, 1, 1, SN_DEFAULT_FLAGS, i);
+
+ uint32_t size_in_bytes = pgd_disk_footprint(pg_collector);
+ uint32_t size_in_words = size_in_bytes / sizeof(uint32_t);
+
+ alignas(sizeof(uintptr_t)) uint32_t disk_buffer[size_in_words];
+ for (size_t i = 0; i != size_in_words; i++)
+ disk_buffer[i] = std::numeric_limits<uint32_t>::max();
+
+ pgd_copy_to_extent(pg_collector, (uint8_t *) &disk_buffer[0], size_in_bytes);
+
+ PGD *pg_disk = pgd_create_from_disk_data(page_type, &disk_buffer[0], size_in_bytes);
+ EXPECT_EQ(pgd_slots_used(pg_disk), slots);
+
+ // Expected memory footprint is equal to the disk footprint + a couple
+ // bytes for the PGD metadata.
+ EXPECT_NEAR(pgd_memory_footprint(pg_disk), size_in_bytes, 128);
+
+ // Do not allow calling disk footprint for pages created from disk.
+ EXPECT_DEATH(pgd_disk_footprint(pg_disk), ".*");
+
+ for (size_t i = 0; i != 10; i++) {
+ PGDC cursor_collector;
+ PGDC cursor_disk;
+
+ pgdc_reset(&cursor_collector, pg_collector, i * 1024);
+ pgdc_reset(&cursor_disk, pg_disk, i * 1024);
+
+ STORAGE_POINT sp_collector = {};
+ STORAGE_POINT sp_disk = {};
+
+ for (size_t slot = i * 1024; slot != slots; slot++) {
+ EXPECT_TRUE(pgdc_get_next_point(&cursor_collector, slot, &sp_collector));
+ EXPECT_TRUE(pgdc_get_next_point(&cursor_disk, slot, &sp_disk));
+
+ EXPECT_EQ(sp_collector, sp_disk);
+ }
+
+ EXPECT_FALSE(pgdc_get_next_point(&cursor_collector, slots, &sp_collector));
+ EXPECT_FALSE(pgdc_get_next_point(&cursor_disk, slots, &sp_disk));
+ }
+
+ pgd_free(pg_disk);
+ pgd_free(pg_collector);
+}
+
+int pgd_test(int argc, char *argv[])
+{
+ // Dummy/necessary initialization stuff
+ PGC *dummy_cache = pgc_create("pgd-tests-cache", 32 * 1024 * 1024, NULL, 64, NULL, NULL,
+ 10, 10, 1000, 10, PGC_OPTIONS_NONE, 1, 11);
+ pgd_init_arals();
+
+ ::testing::InitGoogleTest(&argc, argv);
+ int rc = RUN_ALL_TESTS();
+
+ pgc_destroy(dummy_cache);
+
+ return rc;
+}
+
+#else // HAVE_GTEST
+
+int pgd_test(int argc, char *argv[])
+{
+ (void) argc;
+ (void) argv;
+    fprintf(stderr, "Cannot run PGD tests because the agent was not built with Google Test support.\n");
+ return 0;
+}
+
+#endif // HAVE_GTEST
diff --git a/database/engine/page_test.h b/database/engine/page_test.h
new file mode 100644
index 00000000..30837f0a
--- /dev/null
+++ b/database/engine/page_test.h
@@ -0,0 +1,14 @@
+#ifndef PAGE_TEST_H
+#define PAGE_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int pgd_test(int argc, char *argv[]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PAGE_TEST_H */
diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c
new file mode 100644
index 00000000..dab9cdd0
--- /dev/null
+++ b/database/engine/pagecache.c
@@ -0,0 +1,1117 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#define NETDATA_RRD_INTERNALS
+
+#include "rrdengine.h"
+
+MRG *main_mrg = NULL;
+PGC *main_cache = NULL;
+PGC *open_cache = NULL;
+PGC *extent_cache = NULL;
+struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats = {};
+
+static void main_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused)
+{
+ // Release storage associated with the page
+ pgd_free(entry.data);
+}
+
+static void main_cache_flush_dirty_page_init_callback(PGC *cache __maybe_unused, Word_t section) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *) section;
+
+ // mark ctx as having flushing in progress
+ __atomic_add_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED);
+}
+
+static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused)
+{
+ if(!entries)
+ return;
+
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *) entries_array[0].section;
+
+ struct page_descr_with_data *base = NULL;
+
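+    // build a doubly-linked list of descriptors for the dirty pages and hand
+    // it to the event loop as a single extent write, waiting for it to complete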
+ for (size_t Index = 0 ; Index < entries; Index++) {
+ time_t start_time_s = entries_array[Index].start_time_s;
+ time_t end_time_s = entries_array[Index].end_time_s;
+ struct page_descr_with_data *descr = page_descriptor_get();
+
+ descr->id = mrg_metric_uuid(main_mrg, (METRIC *) entries_array[Index].metric_id);
+ descr->metric_id = entries_array[Index].metric_id;
+ descr->start_time_ut = start_time_s * USEC_PER_SEC;
+ descr->end_time_ut = end_time_s * USEC_PER_SEC;
+ descr->update_every_s = entries_array[Index].update_every_s;
+
+ descr->pgd = pgc_page_data(pages_array[Index]);
+ descr->type = pgd_type(descr->pgd);
+ descr->page_length = pgd_disk_footprint(descr->pgd);
+
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, descr, link.prev, link.next);
+
+ // TODO: ask @stelfrag/@ktsaou about this.
+ // internal_fatal(descr->page_length > RRDENG_BLOCK_SIZE, "DBENGINE: faulty page length calculation");
+ }
+
+ struct completion completion;
+ completion_init(&completion);
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_EXTENT_WRITE, base, &completion, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+ completion_wait_for(&completion);
+ completion_destroy(&completion);
+}
+
+static void open_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused)
+{
+ struct rrdengine_datafile *datafile = entry.data;
+ datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE);
+}
+
+static void open_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused)
+{
+ ;
+}
+
+static void extent_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused)
+{
+ dbengine_extent_free(entry.data, entry.size);
+}
+
+static void extent_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused)
+{
+ ;
+}
+
+inline TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s) {
+ // page_first_time_s <= wanted_end_time_s && page_last_time_s >= wanted_start_time_s
+
+ if(page_last_time_s < wanted_start_time_s)
+ return PAGE_IS_IN_THE_PAST;
+
+ if(page_first_time_s > wanted_end_time_s)
+ return PAGE_IS_IN_THE_FUTURE;
+
+ return PAGE_IS_IN_RANGE;
+}
+
+static inline struct page_details *pdc_find_page_for_time(
+ Pcvoid_t PArray,
+ time_t wanted_time_s,
+ size_t *gaps,
+ PDC_PAGE_STATUS mode,
+ PDC_PAGE_STATUS skip_list
+) {
+ Word_t PIndexF = wanted_time_s, PIndexL = wanted_time_s;
+ Pvoid_t *PValueF, *PValueL;
+ struct page_details *pdF = NULL, *pdL = NULL;
+ bool firstF = true, firstL = true;
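+    // locate candidate pages around wanted_time_s (the JudyL array is keyed by
+    // page start time): pdF is the first eligible page starting at or after it,
+    // pdL the last eligible page starting at or before it; the better of the
+    // two is returned below, and *gaps is incremented when the chosen page
+    // does not actually cover the wanted time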
+
+ PDC_PAGE_STATUS ignore_list = PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | skip_list;
+
+ while ((PValueF = PDCJudyLFirstThenNext(PArray, &PIndexF, &firstF))) {
+ pdF = *PValueF;
+
+ PDC_PAGE_STATUS status = __atomic_load_n(&pdF->status, __ATOMIC_ACQUIRE);
+ if (!(status & (ignore_list | mode)))
+ break;
+
+ pdF = NULL;
+ }
+
+ while ((PValueL = PDCJudyLLastThenPrev(PArray, &PIndexL, &firstL))) {
+ pdL = *PValueL;
+
+ PDC_PAGE_STATUS status = __atomic_load_n(&pdL->status, __ATOMIC_ACQUIRE);
+ if(status & mode) {
+ // don't go all the way back to the beginning
+ // stop at the last processed
+ pdL = NULL;
+ break;
+ }
+
+ if (!(status & ignore_list))
+ break;
+
+ pdL = NULL;
+ }
+
+ TIME_RANGE_COMPARE rcF = (pdF) ? is_page_in_time_range(pdF->first_time_s, pdF->last_time_s, wanted_time_s, wanted_time_s) : PAGE_IS_IN_THE_FUTURE;
+ TIME_RANGE_COMPARE rcL = (pdL) ? is_page_in_time_range(pdL->first_time_s, pdL->last_time_s, wanted_time_s, wanted_time_s) : PAGE_IS_IN_THE_PAST;
+
+ if (!pdF || pdF == pdL) {
+ // F is missing, or they are the same
+ // return L
+ (*gaps) += (rcL == PAGE_IS_IN_RANGE) ? 0 : 1;
+ return pdL;
+ }
+
+ if (!pdL) {
+ // L is missing
+ // return F
+ (*gaps) += (rcF == PAGE_IS_IN_RANGE) ? 0 : 1;
+ return pdF;
+ }
+
+ if (rcF == rcL) {
+ // both are on the same side,
+ // but they are different pages
+
+ switch (rcF) {
+ case PAGE_IS_IN_RANGE:
+ // pick the higher resolution
+ if (pdF->update_every_s && pdF->update_every_s < pdL->update_every_s)
+ return pdF;
+
+ if (pdL->update_every_s && pdL->update_every_s < pdF->update_every_s)
+ return pdL;
+
+ // same resolution - pick the one that starts earlier
+ if (pdL->first_time_s < pdF->first_time_s)
+ return pdL;
+
+ return pdF;
+ break;
+
+ case PAGE_IS_IN_THE_FUTURE:
+ (*gaps)++;
+
+ // pick the one that starts earlier
+ if (pdL->first_time_s < pdF->first_time_s)
+ return pdL;
+
+ return pdF;
+ break;
+
+ default:
+ case PAGE_IS_IN_THE_PAST:
+ (*gaps)++;
+ return NULL;
+ break;
+ }
+ }
+
+ if(rcF == PAGE_IS_IN_RANGE) {
+ // (*gaps) += 0;
+ return pdF;
+ }
+
+ if(rcL == PAGE_IS_IN_RANGE) {
+ // (*gaps) += 0;
+ return pdL;
+ }
+
+ if(rcF == PAGE_IS_IN_THE_FUTURE) {
+ (*gaps)++;
+ return pdF;
+ }
+
+ if(rcL == PAGE_IS_IN_THE_FUTURE) {
+ (*gaps)++;
+ return pdL;
+ }
+
+ // impossible case
+ (*gaps)++;
+ return NULL;
+}
+
+static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengine_instance *ctx,
+ time_t wanted_start_time_s, time_t wanted_end_time_s,
+ Pvoid_t *JudyL_page_array, size_t *cache_gaps,
+ bool open_cache_mode, PDC_PAGE_STATUS tags) {
+
+ size_t pages_found_in_cache = 0;
+ Word_t metric_id = mrg_metric_id(main_mrg, metric);
+
+ time_t now_s = wanted_start_time_s;
+ time_t dt_s = mrg_metric_get_update_every_s(main_mrg, metric);
+
+ if(!dt_s)
+ dt_s = default_rrd_update_every;
+
+ time_t previous_page_end_time_s = now_s - dt_s;
+ bool first = true;
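+    // walk the main cache forward: start with the page closest to
+    // wanted_start_time_s, then keep asking for the NEXT page of this metric
+    // until we pass wanted_end_time_s, counting a cache gap whenever
+    // consecutive pages leave a hole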
+
+ do {
+ PGC_PAGE *page = pgc_page_get_and_acquire(
+ cache, (Word_t)ctx, (Word_t)metric_id, now_s,
+ (first) ? PGC_SEARCH_CLOSEST : PGC_SEARCH_NEXT);
+
+ first = false;
+
+ if(!page) {
+ if(previous_page_end_time_s < wanted_end_time_s)
+ (*cache_gaps)++;
+
+ break;
+ }
+
+ time_t page_start_time_s = pgc_page_start_time_s(page);
+ time_t page_end_time_s = pgc_page_end_time_s(page);
+ time_t page_update_every_s = pgc_page_update_every_s(page);
+
+ if(!page_update_every_s)
+ page_update_every_s = dt_s;
+
+ if(is_page_in_time_range(page_start_time_s, page_end_time_s, wanted_start_time_s, wanted_end_time_s) != PAGE_IS_IN_RANGE) {
+ // not a useful page for this query
+ pgc_page_release(cache, page);
+ page = NULL;
+
+ if(previous_page_end_time_s < wanted_end_time_s)
+ (*cache_gaps)++;
+
+ break;
+ }
+
+ if (page_start_time_s - previous_page_end_time_s > dt_s)
+ (*cache_gaps)++;
+
+ Pvoid_t *PValue = PDCJudyLIns(JudyL_page_array, (Word_t) page_start_time_s, PJE0);
+ if (!PValue || PValue == PJERR)
+ fatal("DBENGINE: corrupted judy array in %s()", __FUNCTION__ );
+
+ if (unlikely(*PValue))
+ // already exists in our list
+ pgc_page_release(cache, page);
+
+ else {
+
+ internal_fatal(pgc_page_metric(page) != metric_id, "Wrong metric id in page found in cache");
+ internal_fatal(pgc_page_section(page) != (Word_t)ctx, "Wrong section in page found in cache");
+
+ struct page_details *pd = page_details_get();
+ pd->metric_id = metric_id;
+ pd->first_time_s = page_start_time_s;
+ pd->last_time_s = page_end_time_s;
+ pd->update_every_s = (uint32_t) page_update_every_s;
+ pd->page = (open_cache_mode) ? NULL : page;
+ pd->status |= tags;
+
+ if((pd->page)) {
+ pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED;
+
+ if(pgd_is_empty(pgc_page_data(page)))
+ pd->status |= PDC_PAGE_EMPTY;
+ }
+
+ if(open_cache_mode) {
+ struct rrdengine_datafile *datafile = pgc_page_data(page);
+ if(datafile_acquire(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS)) { // for pd
+ struct extent_io_data *xio = (struct extent_io_data *) pgc_page_custom_data(cache, page);
+ pd->datafile.ptr = pgc_page_data(page);
+ pd->datafile.file = xio->file;
+ pd->datafile.extent.pos = xio->pos;
+ pd->datafile.extent.bytes = xio->bytes;
+ pd->datafile.fileno = pd->datafile.ptr->fileno;
+ pd->status |= PDC_PAGE_DATAFILE_ACQUIRED | PDC_PAGE_DISK_PENDING;
+ }
+ else {
+ pd->status |= PDC_PAGE_FAILED | PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE;
+ }
+ pgc_page_release(cache, page);
+ }
+
+ *PValue = pd;
+
+ pages_found_in_cache++;
+ }
+
+ // prepare for the next iteration
+ previous_page_end_time_s = page_end_time_s;
+
+ if(page_update_every_s > 0)
+ dt_s = page_update_every_s;
+
+        // we are going to ask for the NEXT page
+ // so, set this to our first time
+ now_s = page_start_time_s;
+
+ } while(now_s <= wanted_end_time_s);
+
+ return pages_found_in_cache;
+}
+
+static void pgc_inject_gap(struct rrdengine_instance *ctx, METRIC *metric, time_t start_time_s, time_t end_time_s) {
+
+ time_t db_first_time_s, db_last_time_s, db_update_every_s;
+ mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s);
+
+ if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) != PAGE_IS_IN_RANGE)
+ return;
+
+ PGC_ENTRY page_entry = {
+ .hot = false,
+ .section = (Word_t)ctx,
+ .metric_id = (Word_t)metric,
+ .start_time_s = MAX(start_time_s, db_first_time_s),
+ .end_time_s = MIN(end_time_s, db_last_time_s),
+ .update_every_s = 0,
+ .size = 0,
+ .data = PGD_EMPTY,
+ };
+
+ if(page_entry.start_time_s >= page_entry.end_time_s)
+ return;
+
+ PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, NULL);
+ pgc_page_release(main_cache, page);
+}
+
+static size_t list_has_time_gaps(
+ struct rrdengine_instance *ctx,
+ METRIC *metric,
+ Pvoid_t JudyL_page_array,
+ time_t wanted_start_time_s,
+ time_t wanted_end_time_s,
+ size_t *pages_total,
+ size_t *pages_found_pass4,
+ size_t *pages_to_load_from_disk,
+ size_t *pages_overlapping,
+ time_t *optimal_end_time_s,
+ bool populate_gaps,
+ PDC_PAGE_STATUS *common_status
+) {
+ // we will recalculate these, so zero them
+ *pages_to_load_from_disk = 0;
+ *pages_overlapping = 0;
+ *optimal_end_time_s = 0;
+ *common_status = 0;
+
+ bool first;
+ Pvoid_t *PValue;
+ Word_t this_page_start_time;
+ struct page_details *pd;
+
+ size_t gaps = 0;
+ Word_t metric_id = mrg_metric_id(main_mrg, metric);
+
+ // ------------------------------------------------------------------------
+ // PASS 1: remove the preprocessing flags from the pages in PDC
+
+ first = true;
+ this_page_start_time = 0;
+ while((PValue = PDCJudyLFirstThenNext(JudyL_page_array, &this_page_start_time, &first))) {
+ pd = *PValue;
+ pd->status &= ~(PDC_PAGE_SKIP|PDC_PAGE_PREPROCESSED);
+ }
+
+ // ------------------------------------------------------------------------
+ // PASS 2: emulate processing to find the useful pages
+
+ time_t now_s = wanted_start_time_s;
+ time_t dt_s = mrg_metric_get_update_every_s(main_mrg, metric);
+ if(!dt_s)
+ dt_s = default_rrd_update_every;
+
+ size_t pages_pass2 = 0, pages_pass3 = 0;
+ while((pd = pdc_find_page_for_time(
+ JudyL_page_array, now_s, &gaps,
+ PDC_PAGE_PREPROCESSED, 0))) {
+
+ pd->status |= PDC_PAGE_PREPROCESSED;
+ pages_pass2++;
+
+ if(pd->update_every_s)
+ dt_s = pd->update_every_s;
+
+ if(populate_gaps && pd->first_time_s > now_s)
+ pgc_inject_gap(ctx, metric, now_s, pd->first_time_s);
+
+ now_s = pd->last_time_s + dt_s;
+ if(now_s > wanted_end_time_s) {
+ *optimal_end_time_s = pd->last_time_s;
+ break;
+ }
+ }
+
+ if(populate_gaps && now_s < wanted_end_time_s)
+ pgc_inject_gap(ctx, metric, now_s, wanted_end_time_s);
+
+ // ------------------------------------------------------------------------
+ // PASS 3: mark as skipped all the pages not useful
+
+ first = true;
+ this_page_start_time = 0;
+ while((PValue = PDCJudyLFirstThenNext(JudyL_page_array, &this_page_start_time, &first))) {
+ pd = *PValue;
+
+ internal_fatal(pd->metric_id != metric_id, "pd has wrong metric_id");
+
+ if(!(pd->status & PDC_PAGE_PREPROCESSED)) {
+ (*pages_overlapping)++;
+ pd->status |= PDC_PAGE_SKIP;
+ pd->status &= ~(PDC_PAGE_READY | PDC_PAGE_DISK_PENDING);
+ *common_status |= pd->status;
+ continue;
+ }
+
+ pages_pass3++;
+
+ if(!pd->page) {
+ pd->page = pgc_page_get_and_acquire(main_cache, (Word_t) ctx, (Word_t) metric_id, pd->first_time_s, PGC_SEARCH_EXACT);
+
+ if(pd->page) {
+ (*pages_found_pass4)++;
+
+ pd->status &= ~PDC_PAGE_DISK_PENDING;
+ pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED | PDC_PAGE_PRELOADED_PASS4;
+
+ if(pgd_is_empty(pgc_page_data(pd->page)))
+ pd->status |= PDC_PAGE_EMPTY;
+
+ }
+ else if(!(pd->status & PDC_PAGE_FAILED) && (pd->status & PDC_PAGE_DATAFILE_ACQUIRED)) {
+ (*pages_to_load_from_disk)++;
+
+ pd->status |= PDC_PAGE_DISK_PENDING;
+
+ internal_fatal(pd->status & PDC_PAGE_SKIP, "page is disk pending and skipped");
+ internal_fatal(!pd->datafile.ptr, "datafile is NULL");
+ internal_fatal(!pd->datafile.extent.bytes, "datafile.extent.bytes zero");
+ internal_fatal(!pd->datafile.extent.pos, "datafile.extent.pos is zero");
+ internal_fatal(!pd->datafile.fileno, "datafile.fileno is zero");
+ }
+ }
+ else {
+ pd->status &= ~PDC_PAGE_DISK_PENDING;
+ pd->status |= (PDC_PAGE_READY | PDC_PAGE_PRELOADED);
+ }
+
+ *common_status |= pd->status;
+ }
+
+ internal_fatal(pages_pass2 != pages_pass3,
+ "DBENGINE: page count does not match");
+
+ *pages_total = pages_pass2;
+
+ return gaps;
+}
+
+// ----------------------------------------------------------------------------
+
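+// Scan the journal v2 files of this dbengine instance (through the NJFV2IDX datafile index),
+// binary-search the metric's UUID in each journal, and register every page overlapping the
+// wanted time range into the open cache, invoking the callback for each page found.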
+typedef void (*page_found_callback_t)(PGC_PAGE *page, void *data);
+static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METRIC *metric, usec_t start_time_ut, usec_t end_time_ut, page_found_callback_t callback, void *callback_data) {
+ uuid_t *uuid = mrg_metric_uuid(main_mrg, metric);
+ Word_t metric_id = mrg_metric_id(main_mrg, metric);
+
+ time_t wanted_start_time_s = (time_t)(start_time_ut / USEC_PER_SEC);
+ time_t wanted_end_time_s = (time_t)(end_time_ut / USEC_PER_SEC);
+
+ size_t pages_found = 0;
+
+ NJFV2IDX_FIND_STATE state = {
+ .init = false,
+ .last = 0,
+ .ctx = ctx,
+ .wanted_start_time_s = wanted_start_time_s,
+ .wanted_end_time_s = wanted_end_time_s,
+ .j2_header_acquired = NULL,
+ };
+
+ struct rrdengine_datafile *datafile;
+ while((datafile = njfv2idx_find_and_acquire_j2_header(&state))) {
+ struct journal_v2_header *j2_header = state.j2_header_acquired;
+
+ if (unlikely(!j2_header))
+ continue;
+
+ time_t journal_start_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC);
+
+ // the datafile possibly contains useful data for this query
+
+ size_t journal_metric_count = (size_t)j2_header->metric_count;
+ struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset);
+ struct journal_metric_list *uuid_entry = bsearch(uuid,uuid_list,journal_metric_count,sizeof(*uuid_list), journal_metric_uuid_compare);
+
+ if (unlikely(!uuid_entry)) {
+ // our UUID is not in this datafile
+ journalfile_v2_data_release(datafile->journalfile);
+ continue;
+ }
+
+ struct journal_page_header *page_list_header = (struct journal_page_header *) ((uint8_t *) j2_header + uuid_entry->page_offset);
+ struct journal_page_list *page_list = (struct journal_page_list *)((uint8_t *) page_list_header + sizeof(*page_list_header));
+ struct journal_extent_list *extent_list = (void *)((uint8_t *)j2_header + j2_header->extent_offset);
+ uint32_t uuid_page_entries = page_list_header->entries;
+
+ for (uint32_t index = 0; index < uuid_page_entries; index++) {
+ struct journal_page_list *page_entry_in_journal = &page_list[index];
+
+ time_t page_first_time_s = page_entry_in_journal->delta_start_s + journal_start_time_s;
+ time_t page_last_time_s = page_entry_in_journal->delta_end_s + journal_start_time_s;
+
+ TIME_RANGE_COMPARE prc = is_page_in_time_range(page_first_time_s, page_last_time_s, wanted_start_time_s, wanted_end_time_s);
+ if(prc == PAGE_IS_IN_THE_PAST)
+ continue;
+
+ if(prc == PAGE_IS_IN_THE_FUTURE)
+ break;
+
+ time_t page_update_every_s = page_entry_in_journal->update_every_s;
+ size_t page_length = page_entry_in_journal->page_length;
+
+ if(datafile_acquire(datafile, DATAFILE_ACQUIRE_OPEN_CACHE)) { //for open cache item
+ // add this page to open cache
+ bool added = false;
+ struct extent_io_data ei = {
+ .pos = extent_list[page_entry_in_journal->extent_index].datafile_offset,
+ .bytes = extent_list[page_entry_in_journal->extent_index].datafile_size,
+ .page_length = page_length,
+ .file = datafile->file,
+ .fileno = datafile->fileno,
+ };
+
+ PGC_PAGE *page = pgc_page_add_and_acquire(open_cache, (PGC_ENTRY) {
+ .hot = false,
+ .section = (Word_t) ctx,
+ .metric_id = metric_id,
+ .start_time_s = page_first_time_s,
+ .end_time_s = page_last_time_s,
+ .update_every_s = (uint32_t) page_update_every_s,
+ .data = datafile,
+ .size = 0,
+ .custom_data = (uint8_t *) &ei,
+ }, &added);
+
+ if(!added)
+ datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE);
+
+ callback(page, callback_data);
+
+ pgc_page_release(open_cache, page);
+
+ pages_found++;
+ }
+ }
+
+ journalfile_v2_data_release(datafile->journalfile);
+ }
+
+ return pages_found;
+}
+
+void add_page_details_from_journal_v2(PGC_PAGE *page, void *JudyL_pptr) {
+ struct rrdengine_datafile *datafile = pgc_page_data(page);
+
+ if(!datafile_acquire(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS)) // for pd
+ return;
+
+ Pvoid_t *PValue = PDCJudyLIns(JudyL_pptr, pgc_page_start_time_s(page), PJE0);
+ if (!PValue || PValue == PJERR)
+ fatal("DBENGINE: corrupted judy array");
+
+ if (unlikely(*PValue)) {
+ datafile_release(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS);
+ return;
+ }
+
+ Word_t metric_id = pgc_page_metric(page);
+
+ // let's add it to the judy
+ struct extent_io_data *ei = pgc_page_custom_data(open_cache, page);
+ struct page_details *pd = page_details_get();
+ *PValue = pd;
+
+ pd->datafile.extent.pos = ei->pos;
+ pd->datafile.extent.bytes = ei->bytes;
+ pd->datafile.file = ei->file;
+ pd->datafile.fileno = ei->fileno;
+ pd->first_time_s = pgc_page_start_time_s(page);
+ pd->last_time_s = pgc_page_end_time_s(page);
+ pd->datafile.ptr = datafile;
+ pd->update_every_s = (uint32_t) pgc_page_update_every_s(page);
+ pd->metric_id = metric_id;
+ pd->status |= PDC_PAGE_DISK_PENDING | PDC_PAGE_SOURCE_JOURNAL_V2 | PDC_PAGE_DATAFILE_ACQUIRED;
+}
+
+// Return a JudyL with all pages that overlap start_time_ut and end_time_ut.
+// The JudyL is keyed by page start time and each value points to the
+// struct page_details describing that page.
+#define time_delta(finish, pass) do { if(pass) { usec_t t = pass; (pass) = (finish) - (pass); (finish) = t; } } while(0)
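+// time_delta() is applied in reverse pass order at the end of get_page_list(): each non-zero
+// passN timestamp becomes the duration of that pass (up to the start of the pass that followed
+// it), while 'finish' walks backwards to the start time of the pass just converted.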
+static Pvoid_t get_page_list(
+ struct rrdengine_instance *ctx,
+ METRIC *metric,
+ usec_t start_time_ut,
+ usec_t end_time_ut,
+ time_t *optimal_end_time_s,
+ size_t *pages_to_load_from_disk,
+ PDC_PAGE_STATUS *common_status
+) {
+ *optimal_end_time_s = 0;
+ *pages_to_load_from_disk = 0;
+ *common_status = 0;
+
+ Pvoid_t JudyL_page_array = (Pvoid_t) NULL;
+
+ time_t wanted_start_time_s = (time_t)(start_time_ut / USEC_PER_SEC);
+ time_t wanted_end_time_s = (time_t)(end_time_ut / USEC_PER_SEC);
+
+ size_t pages_found_in_main_cache = 0,
+ pages_found_in_open_cache = 0,
+ pages_found_in_journals_v2 = 0,
+ pages_found_pass4 = 0,
+ pages_overlapping = 0,
+ pages_total = 0;
+
+ size_t cache_gaps = 0, query_gaps = 0;
+ bool done_v2 = false, done_open = false;
+
+ usec_t pass1_ut = 0, pass2_ut = 0, pass3_ut = 0, pass4_ut = 0, finish_ut = 0;
+
+ // --------------------------------------------------------------
+ // PASS 1: Check what the main page cache has available
+
+ pass1_ut = now_monotonic_usec();
+ size_t pages_pass1 = get_page_list_from_pgc(main_cache, metric, ctx, wanted_start_time_s, wanted_end_time_s,
+ &JudyL_page_array, &cache_gaps,
+ false, PDC_PAGE_SOURCE_MAIN_CACHE);
+ query_gaps += cache_gaps;
+ pages_found_in_main_cache += pages_pass1;
+ pages_total += pages_pass1;
+
+ if(pages_found_in_main_cache && !cache_gaps) {
+ query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s,
+ &pages_total, &pages_found_pass4, pages_to_load_from_disk, &pages_overlapping,
+ optimal_end_time_s, false, common_status);
+
+ if (pages_total && !query_gaps)
+ goto we_are_done;
+ }
+
+ // --------------------------------------------------------------
+ // PASS 2: Check what the open journal page cache has available
+ // these will be loaded from disk
+
+ pass2_ut = now_monotonic_usec();
+ size_t pages_pass2 = get_page_list_from_pgc(open_cache, metric, ctx, wanted_start_time_s, wanted_end_time_s,
+ &JudyL_page_array, &cache_gaps,
+ true, PDC_PAGE_SOURCE_OPEN_CACHE);
+ query_gaps += cache_gaps;
+ pages_found_in_open_cache += pages_pass2;
+ pages_total += pages_pass2;
+ done_open = true;
+
+ if(pages_found_in_open_cache) {
+ query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s,
+ &pages_total, &pages_found_pass4, pages_to_load_from_disk, &pages_overlapping,
+ optimal_end_time_s, false, common_status);
+
+ if (pages_total && !query_gaps)
+ goto we_are_done;
+ }
+
+ // --------------------------------------------------------------
+ // PASS 3: Check Journal v2 to fill the gaps
+
+ pass3_ut = now_monotonic_usec();
+ size_t pages_pass3 = get_page_list_from_journal_v2(ctx, metric, start_time_ut, end_time_ut,
+ add_page_details_from_journal_v2, &JudyL_page_array);
+ pages_found_in_journals_v2 += pages_pass3;
+ pages_total += pages_pass3;
+ done_v2 = true;
+
+ // --------------------------------------------------------------
+ // PASS 4: Check the cache again
+ // and calculate the time gaps in the query
+ // THIS IS REQUIRED AFTER JOURNAL V2 LOOKUP
+
+ pass4_ut = now_monotonic_usec();
+ query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s,
+ &pages_total, &pages_found_pass4, pages_to_load_from_disk, &pages_overlapping,
+ optimal_end_time_s, true, common_status);
+
+we_are_done:
+ finish_ut = now_monotonic_usec();
+ time_delta(finish_ut, pass4_ut);
+ time_delta(finish_ut, pass3_ut);
+ time_delta(finish_ut, pass2_ut);
+ time_delta(finish_ut, pass1_ut);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_main_cache_lookup, pass1_ut, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_open_cache_lookup, pass2_ut, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_journal_v2_lookup, pass3_ut, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_pass4_lookup, pass4_ut, __ATOMIC_RELAXED);
+
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_planned_with_gaps, (query_gaps) ? 1 : 0, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_open, done_open ? 1 : 0, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_journal_v2, done_v2 ? 1 : 0, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_total, pages_total, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_open_cache, pages_found_in_open_cache, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_journal_v2, pages_found_in_journals_v2, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache_at_pass4, pages_found_pass4, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_to_load_from_disk, *pages_to_load_from_disk, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_overlapping_skipped, pages_overlapping, __ATOMIC_RELAXED);
+
+ return JudyL_page_array;
+}
+
+inline void rrdeng_prep_wait(PDC *pdc) {
+ if (unlikely(pdc && !pdc->prep_done)) {
+ usec_t started_ut = now_monotonic_usec();
+ completion_wait_for(&pdc->prep_completion);
+ pdc->prep_done = true;
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_wait_for_prep, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED);
+ }
+}
+
+void rrdeng_prep_query(struct page_details_control *pdc, bool worker) {
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_QUERY);
+
+ pdc->page_list_JudyL = get_page_list(pdc->ctx, pdc->metric,
+ pdc->start_time_s * USEC_PER_SEC,
+ pdc->end_time_s * USEC_PER_SEC,
+ &pdc->optimal_end_time_s,
+ &pdc->pages_to_load_from_disk,
+ &pdc->common_status);
+
+ internal_fatal(pdc->pages_to_load_from_disk && !(pdc->common_status & PDC_PAGE_DISK_PENDING),
+ "DBENGINE: PDC reports there are %zu pages to load from disk, "
+ "but none of the pages has the PDC_PAGE_DISK_PENDING flag",
+ pdc->pages_to_load_from_disk);
+
+ internal_fatal(!pdc->pages_to_load_from_disk && (pdc->common_status & PDC_PAGE_DISK_PENDING),
+ "DBENGINE: PDC reports there are no pages to load from disk, "
+ "but one or more pages have the PDC_PAGE_DISK_PENDING flag");
+
+ if (pdc->pages_to_load_from_disk && pdc->page_list_JudyL) {
+ pdc_acquire(pdc); // we get 1 for the 1st worker in the chain: do_read_page_list_work()
+ usec_t start_ut = now_monotonic_usec();
+ if(likely(pdc->priority == STORAGE_PRIORITY_SYNCHRONOUS))
+ pdc_route_synchronously(pdc->ctx, pdc);
+ else
+ pdc_route_asynchronously(pdc->ctx, pdc);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_to_route, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED);
+ }
+ else
+ completion_mark_complete(&pdc->page_completion);
+
+ completion_mark_complete(&pdc->prep_completion);
+
+ pdc_release_and_destroy_if_unreferenced(pdc, true, true);
+
+ if(worker)
+ worker_is_idle();
+}
+
+/**
+ * Searches for pages in the query's time range and triggers disk I/O if necessary and possible.
+ * The DB context, metric and inclusive time range are taken from the query handle.
+ * @param handle query handle as initialized (ctx, metric, start_time_s, end_time_s, priority)
+ */
+void pg_cache_preload(struct rrdeng_query_handle *handle) {
+ if (unlikely(!handle || !handle->metric))
+ return;
+
+ __atomic_add_fetch(&handle->ctx->atomic.inflight_queries, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.currently_running_queries, 1, __ATOMIC_RELAXED);
+ handle->pdc = pdc_get();
+ handle->pdc->metric = mrg_metric_dup(main_mrg, handle->metric);
+ handle->pdc->start_time_s = handle->start_time_s;
+ handle->pdc->end_time_s = handle->end_time_s;
+ handle->pdc->priority = handle->priority;
+ handle->pdc->optimal_end_time_s = handle->end_time_s;
+ handle->pdc->ctx = handle->ctx;
+ handle->pdc->refcount = 1;
+ spinlock_init(&handle->pdc->refcount_spinlock);
+ completion_init(&handle->pdc->prep_completion);
+ completion_init(&handle->pdc->page_completion);
+
+ if(ctx_is_available_for_queries(handle->ctx)) {
+ handle->pdc->refcount++; // we get 1 for the query thread and 1 for the prep thread
+
+ if(unlikely(handle->pdc->priority == STORAGE_PRIORITY_SYNCHRONOUS))
+ rrdeng_prep_query(handle->pdc, false);
+ else
+ rrdeng_enq_cmd(handle->ctx, RRDENG_OPCODE_QUERY, handle->pdc, NULL, handle->priority, NULL, NULL);
+ }
+ else {
+ completion_mark_complete(&handle->pdc->prep_completion);
+ completion_mark_complete(&handle->pdc->page_completion);
+ }
+}
+
+/*
+ * Returns the next page of the query (the first page in the PDC page list covering now_s),
+ * with a reference acquired, waiting for pending disk I/O when the page is not loaded yet.
+ * Returns NULL when no further page is available for this query.
+ */
+struct pgc_page *pg_cache_lookup_next(
+ struct rrdengine_instance *ctx,
+ PDC *pdc,
+ time_t now_s,
+ time_t last_update_every_s,
+ size_t *entries
+) {
+ if (unlikely(!pdc))
+ return NULL;
+
+ rrdeng_prep_wait(pdc);
+
+ if (unlikely(!pdc->page_list_JudyL))
+ return NULL;
+
+ usec_t start_ut = now_monotonic_usec();
+ size_t gaps = 0;
+ bool waited = false, preloaded;
+ PGC_PAGE *page = NULL;
+
+ while(!page) {
+ bool page_from_pd = false;
+ preloaded = false;
+ struct page_details *pd = pdc_find_page_for_time(
+ pdc->page_list_JudyL, now_s, &gaps,
+ PDC_PAGE_PROCESSED, PDC_PAGE_EMPTY);
+
+ if (!pd)
+ break;
+
+ page = pd->page;
+ page_from_pd = true;
+ preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED);
+ if(!page) {
+ if(!completion_is_done(&pdc->page_completion)) {
+ page = pgc_page_get_and_acquire(main_cache, (Word_t)ctx,
+ pd->metric_id, pd->first_time_s, PGC_SEARCH_EXACT);
+ page_from_pd = false;
+ preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED);
+ }
+
+ if(!page) {
+ pdc->completed_jobs =
+ completion_wait_for_a_job(&pdc->page_completion, pdc->completed_jobs);
+
+ page = pd->page;
+ page_from_pd = true;
+ preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED);
+ waited = true;
+ }
+ }
+
+ if(page && pgd_is_empty(pgc_page_data(page)))
+ pdc_page_status_set(pd, PDC_PAGE_EMPTY);
+
+ if(!page || pdc_page_status_check(pd, PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | PDC_PAGE_EMPTY)) {
+ page = NULL;
+ continue;
+ }
+
+        // we now have a page and it is not empty
+
+ time_t page_start_time_s = pgc_page_start_time_s(page);
+ time_t page_end_time_s = pgc_page_end_time_s(page);
+ time_t page_update_every_s = pgc_page_update_every_s(page);
+
+ if(unlikely(page_start_time_s == INVALID_TIME || page_end_time_s == INVALID_TIME)) {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_zero_time_skipped, 1, __ATOMIC_RELAXED);
+ pgc_page_to_clean_evict_or_release(main_cache, page);
+ pdc_page_status_set(pd, PDC_PAGE_INVALID | PDC_PAGE_RELEASED);
+ pd->page = page = NULL;
+ continue;
+ }
+ else {
+ if (unlikely(page_update_every_s <= 0 || page_update_every_s > 86400)) {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_update_every_fixed, 1, __ATOMIC_RELAXED);
+ page_update_every_s = pgc_page_fix_update_every(page, last_update_every_s);
+ pd->update_every_s = (uint32_t) page_update_every_s;
+ }
+
+ size_t entries_by_size = pgd_slots_used(pgc_page_data(page));
+ size_t entries_by_time = page_entries_by_time(page_start_time_s, page_end_time_s, page_update_every_s);
+ if(unlikely(entries_by_size < entries_by_time)) {
+ time_t fixed_page_end_time_s = (time_t)(page_start_time_s + (entries_by_size - 1) * page_update_every_s);
+ pd->last_time_s = page_end_time_s = pgc_page_fix_end_time_s(page, fixed_page_end_time_s);
+ entries_by_time = (page_end_time_s - (page_start_time_s - page_update_every_s)) / page_update_every_s;
+
+ internal_fatal(entries_by_size != entries_by_time, "DBENGINE: wrong entries by time again!");
+
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_entries_fixed, 1, __ATOMIC_RELAXED);
+ }
+ *entries = entries_by_time;
+ }
+
+ if(unlikely(page_end_time_s < now_s)) {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_past_time_skipped, 1, __ATOMIC_RELAXED);
+ pgc_page_release(main_cache, page);
+ pdc_page_status_set(pd, PDC_PAGE_SKIP | PDC_PAGE_RELEASED);
+ pd->page = page = NULL;
+ continue;
+ }
+
+ if(page_from_pd)
+ // PDC_PAGE_RELEASED is for pdc_destroy() to not release the page twice - the caller will release it
+ pdc_page_status_set(pd, PDC_PAGE_RELEASED | PDC_PAGE_PROCESSED);
+ else
+ pdc_page_status_set(pd, PDC_PAGE_PROCESSED);
+ }
+
+ if(gaps && !pdc->executed_with_gaps)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_executed_with_gaps, 1, __ATOMIC_RELAXED);
+ pdc->executed_with_gaps = +gaps;
+
+ if(page) {
+ if(waited)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_wait_loaded, 1, __ATOMIC_RELAXED);
+ else
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_nowait_loaded, 1, __ATOMIC_RELAXED);
+ }
+ else {
+ if(waited)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_wait_failed, 1, __ATOMIC_RELAXED);
+ else
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_nowait_failed, 1, __ATOMIC_RELAXED);
+ }
+
+ if(waited) {
+ if(preloaded)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_slow_preload_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED);
+ else
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_slow_disk_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED);
+ }
+ else {
+ if(preloaded)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_fast_preload_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED);
+ else
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_fast_disk_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED);
+ }
+
+ return page;
+}
+
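+// Register a hot page's on-disk location (datafile, extent offset/size, page length) in the
+// open cache. If an entry already exists and covers a shorter time range, evict it and retry
+// a few times; otherwise keep the existing entry and release the extra datafile reference.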
+void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s, time_t end_time_s, time_t update_every_s,
+ struct rrdengine_datafile *datafile, uint64_t extent_offset, unsigned extent_size, uint32_t page_length) {
+
+ if(!datafile_acquire(datafile, DATAFILE_ACQUIRE_OPEN_CACHE)) // for open cache item
+ fatal("DBENGINE: cannot acquire datafile to put page in open cache");
+
+ struct extent_io_data ext_io_data = {
+ .file = datafile->file,
+ .fileno = datafile->fileno,
+ .pos = extent_offset,
+ .bytes = extent_size,
+ .page_length = page_length
+ };
+
+ PGC_ENTRY page_entry = {
+ .hot = true,
+ .section = section,
+ .metric_id = metric_id,
+ .start_time_s = start_time_s,
+ .end_time_s = end_time_s,
+ .update_every_s = (uint32_t) update_every_s,
+ .size = 0,
+ .data = datafile,
+ .custom_data = (uint8_t *) &ext_io_data,
+ };
+
+ internal_fatal(!datafile->fileno, "DBENGINE: datafile supplied does not have a number");
+
+ bool added = true;
+ PGC_PAGE *page = pgc_page_add_and_acquire(open_cache, page_entry, &added);
+ int tries = 100;
+ while(!added && page_entry.end_time_s > pgc_page_end_time_s(page) && tries--) {
+ pgc_page_to_clean_evict_or_release(open_cache, page);
+ page = pgc_page_add_and_acquire(open_cache, page_entry, &added);
+ }
+
+ if(!added) {
+ datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE);
+
+ internal_fatal(page_entry.end_time_s > pgc_page_end_time_s(page),
+ "DBENGINE: cannot add longer page to open cache");
+ }
+
+ pgc_page_release(open_cache, (PGC_PAGE *)page);
+}
+
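+// The open and extent caches are sized dynamically to about 5% of the main cache's wanted
+// size, with floors of 2 MiB and 3 MiB respectively.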
+size_t dynamic_open_cache_size(void) {
+ size_t main_cache_size = pgc_get_wanted_cache_size(main_cache);
+ size_t target_size = main_cache_size / 100 * 5;
+
+ if(target_size < 2 * 1024 * 1024)
+ target_size = 2 * 1024 * 1024;
+
+ return target_size;
+}
+
+size_t dynamic_extent_cache_size(void) {
+ size_t main_cache_size = pgc_get_wanted_cache_size(main_cache);
+ size_t target_size = main_cache_size / 100 * 5;
+
+ if(target_size < 3 * 1024 * 1024)
+ target_size = 3 * 1024 * 1024;
+
+ return target_size;
+}
+
+void pgc_and_mrg_initialize(void)
+{
+ main_mrg = mrg_create(0);
+
+ size_t target_cache_size = (size_t)default_rrdeng_page_cache_mb * 1024ULL * 1024ULL;
+ size_t main_cache_size = (target_cache_size / 100) * 95;
+ size_t open_cache_size = 0;
+ size_t extent_cache_size = (target_cache_size / 100) * 5;
+
+ if(extent_cache_size < 3 * 1024 * 1024) {
+ extent_cache_size = 3 * 1024 * 1024;
+ main_cache_size = target_cache_size - extent_cache_size;
+ }
+
+ extent_cache_size += (size_t)(default_rrdeng_extent_cache_mb * 1024ULL * 1024ULL);
+
+ main_cache = pgc_create(
+ "main_cache",
+ main_cache_size,
+ main_cache_free_clean_page_callback,
+ (size_t) rrdeng_pages_per_extent,
+ main_cache_flush_dirty_page_init_callback,
+ main_cache_flush_dirty_page_callback,
+ 10,
+ 10240, // if there are that many threads, evict so many at once!
+ 1000, //
+ 5, // don't delay too much other threads
+ PGC_OPTIONS_AUTOSCALE, // AUTOSCALE = 2x max hot pages
+ 0, // 0 = as many as the system cpus
+ 0
+ );
+
+ open_cache = pgc_create(
+ "open_cache",
+ open_cache_size, // the default is 1MB
+ open_cache_free_clean_page_callback,
+ 1,
+ NULL,
+ open_cache_flush_dirty_page_callback,
+ 10,
+ 10240, // if there are that many threads, evict that many at once!
+ 1000, //
+ 3, // don't delay too much other threads
+ PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE,
+ 0, // 0 = as many as the system cpus
+ sizeof(struct extent_io_data)
+ );
+ pgc_set_dynamic_target_cache_size_callback(open_cache, dynamic_open_cache_size);
+
+ extent_cache = pgc_create(
+ "extent_cache",
+ extent_cache_size,
+ extent_cache_free_clean_page_callback,
+ 1,
+ NULL,
+ extent_cache_flush_dirty_page_callback,
+ 5,
+        10, // it will evict up to that many extents at once!
+ 100, //
+ 2, // don't delay too much other threads
+ PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE,
+ 0, // 0 = as many as the system cpus
+ 0
+ );
+ pgc_set_dynamic_target_cache_size_callback(extent_cache, dynamic_extent_cache_size);
+}
diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h
new file mode 100644
index 00000000..dbcbea53
--- /dev/null
+++ b/database/engine/pagecache.h
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_PAGECACHE_H
+#define NETDATA_PAGECACHE_H
+
+#include "rrdengine.h"
+
+extern struct mrg *main_mrg;
+extern struct pgc *main_cache;
+extern struct pgc *open_cache;
+extern struct pgc *extent_cache;
+
+/* Forward declarations */
+struct rrdengine_instance;
+
+#define INVALID_TIME (0)
+#define MAX_PAGE_CACHE_FETCH_RETRIES (3)
+#define PAGE_CACHE_FETCH_WAIT_TIMEOUT (3)
+
+extern struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats;
+
+struct page_descr_with_data {
+ uuid_t *id;
+ Word_t metric_id;
+ usec_t start_time_ut;
+ usec_t end_time_ut;
+ uint8_t type;
+ uint32_t update_every_s;
+ uint32_t page_length;
+ struct pgd *pgd;
+
+ struct {
+ struct page_descr_with_data *prev;
+ struct page_descr_with_data *next;
+ } link;
+};
+
+#define PAGE_INFO_SCRATCH_SZ (8)
+struct rrdeng_page_info {
+ uint8_t scratch[PAGE_INFO_SCRATCH_SZ]; /* scratch area to be used by page-cache users */
+
+ usec_t start_time_ut;
+ usec_t end_time_ut;
+ uint32_t page_length;
+};
+
+struct pg_alignment {
+ uint32_t refcount;
+};
+
+struct rrdeng_query_handle;
+struct page_details_control;
+
+void rrdeng_prep_wait(struct page_details_control *pdc);
+void rrdeng_prep_query(struct page_details_control *pdc, bool worker);
+void pg_cache_preload(struct rrdeng_query_handle *handle);
+struct pgc_page *pg_cache_lookup_next(struct rrdengine_instance *ctx, struct page_details_control *pdc, time_t now_s, time_t last_update_every_s, size_t *entries);
+void pgc_and_mrg_initialize(void);
+
+void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s, time_t end_time_s, time_t update_every_s, struct rrdengine_datafile *datafile, uint64_t extent_offset, unsigned extent_size, uint32_t page_length);
+
+#endif /* NETDATA_PAGECACHE_H */
diff --git a/database/engine/pdc.c b/database/engine/pdc.c
new file mode 100644
index 00000000..5fe205e6
--- /dev/null
+++ b/database/engine/pdc.c
@@ -0,0 +1,1332 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#define NETDATA_RRD_INTERNALS
+#include "pdc.h"
+
+struct extent_page_details_list {
+ uv_file file;
+ uint64_t extent_offset;
+ uint32_t extent_size;
+ unsigned number_of_pages_in_JudyL;
+ Pvoid_t page_details_by_metric_id_JudyL;
+ struct page_details_control *pdc;
+ struct rrdengine_datafile *datafile;
+
+ struct rrdeng_cmd *cmd;
+ bool head_to_datafile_extent_queries_pending_for_extent;
+
+ struct {
+ struct extent_page_details_list *prev;
+ struct extent_page_details_list *next;
+ } query;
+};
+
+typedef struct datafile_extent_offset_list {
+ uv_file file;
+ unsigned fileno;
+ Pvoid_t extent_pd_list_by_extent_offset_JudyL;
+} DEOL;
+
+// ----------------------------------------------------------------------------
+// PDC cache
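+// The PDC, page_details (pd), EPDL and DEOL structures below are each allocated from a
+// dedicated ARAL pool (Netdata's array allocator) to avoid per-query malloc/free overhead;
+// the *_cache_size() helpers report the memory kept by each pool.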
+
+static struct {
+ struct {
+ ARAL *ar;
+ } pdc;
+
+ struct {
+ ARAL *ar;
+ } pd;
+
+ struct {
+ ARAL *ar;
+ } epdl;
+
+ struct {
+ ARAL *ar;
+ } deol;
+} pdc_globals = {};
+
+void pdc_init(void) {
+ pdc_globals.pdc.ar = aral_create(
+ "dbengine-pdc",
+ sizeof(PDC),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false
+ );
+}
+
+PDC *pdc_get(void) {
+ PDC *pdc = aral_mallocz(pdc_globals.pdc.ar);
+ memset(pdc, 0, sizeof(PDC));
+ return pdc;
+}
+
+static void pdc_release(PDC *pdc) {
+ aral_freez(pdc_globals.pdc.ar, pdc);
+}
+
+size_t pdc_cache_size(void) {
+ return aral_overhead(pdc_globals.pdc.ar) + aral_structures(pdc_globals.pdc.ar);
+}
+
+// ----------------------------------------------------------------------------
+// PD cache
+
+void page_details_init(void) {
+ pdc_globals.pd.ar = aral_create(
+ "dbengine-pd",
+ sizeof(struct page_details),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false
+ );
+}
+
+struct page_details *page_details_get(void) {
+ struct page_details *pd = aral_mallocz(pdc_globals.pd.ar);
+ memset(pd, 0, sizeof(struct page_details));
+ return pd;
+}
+
+static void page_details_release(struct page_details *pd) {
+ aral_freez(pdc_globals.pd.ar, pd);
+}
+
+size_t pd_cache_size(void) {
+ return aral_overhead(pdc_globals.pd.ar) + aral_structures(pdc_globals.pd.ar);
+}
+
+// ----------------------------------------------------------------------------
+// epdl cache
+
+void epdl_init(void) {
+ pdc_globals.epdl.ar = aral_create(
+ "dbengine-epdl",
+ sizeof(EPDL),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false
+ );
+}
+
+static EPDL *epdl_get(void) {
+ EPDL *epdl = aral_mallocz(pdc_globals.epdl.ar);
+ memset(epdl, 0, sizeof(EPDL));
+ return epdl;
+}
+
+static void epdl_release(EPDL *epdl) {
+ aral_freez(pdc_globals.epdl.ar, epdl);
+}
+
+size_t epdl_cache_size(void) {
+ return aral_overhead(pdc_globals.epdl.ar) + aral_structures(pdc_globals.epdl.ar);
+}
+
+// ----------------------------------------------------------------------------
+// deol cache
+
+void deol_init(void) {
+ pdc_globals.deol.ar = aral_create(
+ "dbengine-deol",
+ sizeof(DEOL),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false
+ );
+}
+
+static DEOL *deol_get(void) {
+ DEOL *deol = aral_mallocz(pdc_globals.deol.ar);
+ memset(deol, 0, sizeof(DEOL));
+ return deol;
+}
+
+static void deol_release(DEOL *deol) {
+ aral_freez(pdc_globals.deol.ar, deol);
+}
+
+size_t deol_cache_size(void) {
+ return aral_overhead(pdc_globals.deol.ar) + aral_structures(pdc_globals.deol.ar);
+}
+
+// ----------------------------------------------------------------------------
+// extent buffer cache
+
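+// A small freelist of reusable buffers for holding uncompressed extent data:
+// extent_buffer_get() reuses a cached buffer when one is large enough, otherwise allocates a
+// new one; extent_buffer_release() returns it to the list, and extent_buffer_cleanup1() frees
+// at most one cached buffer per call.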
+static struct {
+ struct {
+ SPINLOCK spinlock;
+ struct extent_buffer *available_items;
+ size_t available;
+ } protected;
+
+ struct {
+ size_t allocated;
+ size_t allocated_bytes;
+ } atomics;
+
+ size_t max_size;
+
+} extent_buffer_globals = {
+ .protected = {
+ .spinlock = NETDATA_SPINLOCK_INITIALIZER,
+ .available_items = NULL,
+ .available = 0,
+ },
+ .atomics = {
+ .allocated = 0,
+ .allocated_bytes = 0,
+ },
+ .max_size = MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE,
+};
+
+void extent_buffer_init(void) {
+ size_t max_extent_uncompressed = MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE;
+ size_t max_size = (size_t)LZ4_compressBound(MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE);
+ if(max_size < max_extent_uncompressed)
+ max_size = max_extent_uncompressed;
+
+ extent_buffer_globals.max_size = max_size;
+}
+
+void extent_buffer_cleanup1(void) {
+ struct extent_buffer *item = NULL;
+
+ if(!spinlock_trylock(&extent_buffer_globals.protected.spinlock))
+ return;
+
+ if(extent_buffer_globals.protected.available_items && extent_buffer_globals.protected.available > 1) {
+ item = extent_buffer_globals.protected.available_items;
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, item, cache.prev, cache.next);
+ extent_buffer_globals.protected.available--;
+ }
+
+ spinlock_unlock(&extent_buffer_globals.protected.spinlock);
+
+ if(item) {
+ size_t bytes = sizeof(struct extent_buffer) + item->bytes;
+ freez(item);
+ __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED);
+ }
+}
+
+struct extent_buffer *extent_buffer_get(size_t size) {
+ internal_fatal(size > extent_buffer_globals.max_size, "DBENGINE: extent size is too big");
+
+ struct extent_buffer *eb = NULL;
+
+ if(size < extent_buffer_globals.max_size)
+ size = extent_buffer_globals.max_size;
+
+ spinlock_lock(&extent_buffer_globals.protected.spinlock);
+ if(likely(extent_buffer_globals.protected.available_items)) {
+ eb = extent_buffer_globals.protected.available_items;
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next);
+ extent_buffer_globals.protected.available--;
+ }
+ spinlock_unlock(&extent_buffer_globals.protected.spinlock);
+
+ if(unlikely(eb && eb->bytes < size)) {
+ size_t bytes = sizeof(struct extent_buffer) + eb->bytes;
+ freez(eb);
+ eb = NULL;
+ __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED);
+ }
+
+ if(unlikely(!eb)) {
+ size_t bytes = sizeof(struct extent_buffer) + size;
+ eb = mallocz(bytes);
+ eb->bytes = size;
+ __atomic_add_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED);
+ }
+
+ return eb;
+}
+
+void extent_buffer_release(struct extent_buffer *eb) {
+ if(unlikely(!eb)) return;
+
+ spinlock_lock(&extent_buffer_globals.protected.spinlock);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next);
+ extent_buffer_globals.protected.available++;
+ spinlock_unlock(&extent_buffer_globals.protected.spinlock);
+}
+
+size_t extent_buffer_cache_size(void) {
+ return __atomic_load_n(&extent_buffer_globals.atomics.allocated_bytes, __ATOMIC_RELAXED);
+}
+
+// ----------------------------------------------------------------------------
+// epdl logic
+
+static void epdl_destroy(EPDL *epdl)
+{
+ Pvoid_t *pd_by_start_time_s_JudyL;
+ Word_t metric_id_index = 0;
+ bool metric_id_first = true;
+ while ((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(
+ epdl->page_details_by_metric_id_JudyL,
+ &metric_id_index, &metric_id_first)))
+ PDCJudyLFreeArray(pd_by_start_time_s_JudyL, PJE0);
+
+ PDCJudyLFreeArray(&epdl->page_details_by_metric_id_JudyL, PJE0);
+ epdl_release(epdl);
+}
+
+static void epdl_mark_all_not_loaded_pages_as_failed(EPDL *epdl, PDC_PAGE_STATUS tags, size_t *statistics_counter)
+{
+ size_t pages_matched = 0;
+
+ Word_t metric_id_index = 0;
+ bool metric_id_first = true;
+ Pvoid_t *pd_by_start_time_s_JudyL;
+ while((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(epdl->page_details_by_metric_id_JudyL, &metric_id_index, &metric_id_first))) {
+
+ Word_t start_time_index = 0;
+ bool start_time_first = true;
+ Pvoid_t *PValue;
+ while ((PValue = PDCJudyLFirstThenNext(*pd_by_start_time_s_JudyL, &start_time_index, &start_time_first))) {
+ struct page_details *pd = *PValue;
+
+ if(!pd->page && !pdc_page_status_check(pd, PDC_PAGE_FAILED|PDC_PAGE_READY)) {
+ pdc_page_status_set(pd, PDC_PAGE_FAILED | tags);
+ pages_matched++;
+ }
+ }
+ }
+
+ if(pages_matched && statistics_counter)
+ __atomic_add_fetch(statistics_counter, pages_matched, __ATOMIC_RELAXED);
+}
+/*
+static bool epdl_check_if_pages_are_already_in_cache(struct rrdengine_instance *ctx, EPDL *epdl, PDC_PAGE_STATUS tags)
+{
+ size_t count_remaining = 0;
+ size_t found = 0;
+
+ Word_t metric_id_index = 0;
+ bool metric_id_first = true;
+ Pvoid_t *pd_by_start_time_s_JudyL;
+ while((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(epdl->page_details_by_metric_id_JudyL, &metric_id_index, &metric_id_first))) {
+
+ Word_t start_time_index = 0;
+ bool start_time_first = true;
+ Pvoid_t *PValue;
+ while ((PValue = PDCJudyLFirstThenNext(*pd_by_start_time_s_JudyL, &start_time_index, &start_time_first))) {
+ struct page_details *pd = *PValue;
+ if (pd->page)
+ continue;
+
+ pd->page = pgc_page_get_and_acquire(main_cache, (Word_t) ctx, pd->metric_id, pd->first_time_s, PGC_SEARCH_EXACT);
+ if (pd->page) {
+ found++;
+ pdc_page_status_set(pd, PDC_PAGE_READY | tags);
+ }
+ else
+ count_remaining++;
+ }
+ }
+
+ if(found) {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_preloaded, found, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, found, __ATOMIC_RELAXED);
+ }
+
+ return count_remaining == 0;
+}
+*/
+
+// ----------------------------------------------------------------------------
+// PDC logic
+
+static void pdc_destroy(PDC *pdc) {
+ mrg_metric_release(main_mrg, pdc->metric);
+ completion_destroy(&pdc->prep_completion);
+ completion_destroy(&pdc->page_completion);
+
+ Pvoid_t *PValue;
+ struct page_details *pd;
+ Word_t time_index = 0;
+ bool first_then_next = true;
+ size_t unroutable = 0, cancelled = 0;
+ while((PValue = PDCJudyLFirstThenNext(pdc->page_list_JudyL, &time_index, &first_then_next))) {
+ pd = *PValue;
+
+ // no need for atomics here - we are done...
+ PDC_PAGE_STATUS status = pd->status;
+
+ if(status & PDC_PAGE_DATAFILE_ACQUIRED) {
+ datafile_release(pd->datafile.ptr, DATAFILE_ACQUIRE_PAGE_DETAILS);
+ pd->datafile.ptr = NULL;
+ }
+
+ internal_fatal(pd->datafile.ptr, "DBENGINE: page details has a datafile.ptr that is not released.");
+
+ if(!pd->page && !(status & (PDC_PAGE_READY | PDC_PAGE_FAILED | PDC_PAGE_RELEASED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_CANCELLED))) {
+ // pdc_page_status_set(pd, PDC_PAGE_FAILED);
+ unroutable++;
+ }
+ else if(!pd->page && (status & PDC_PAGE_CANCELLED))
+ cancelled++;
+
+ if(pd->page && !(status & PDC_PAGE_RELEASED)) {
+ pgc_page_release(main_cache, pd->page);
+ // pdc_page_status_set(pd, PDC_PAGE_RELEASED);
+ }
+
+ page_details_release(pd);
+ }
+
+ PDCJudyLFreeArray(&pdc->page_list_JudyL, PJE0);
+
+ __atomic_sub_fetch(&rrdeng_cache_efficiency_stats.currently_running_queries, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&pdc->ctx->atomic.inflight_queries, 1, __ATOMIC_RELAXED);
+ pdc_release(pdc);
+
+ if(unroutable)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_unroutable, unroutable, __ATOMIC_RELAXED);
+
+ if(cancelled)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_cancelled, cancelled, __ATOMIC_RELAXED);
+}
+
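+// PDCs are reference counted: the query thread, the prep worker and every extent worker hold
+// their own reference; the last pdc_release_and_destroy_if_unreferenced() destroys the PDC,
+// releasing any pages, datafiles and page_details it still holds.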
+void pdc_acquire(PDC *pdc) {
+ spinlock_lock(&pdc->refcount_spinlock);
+
+ if(pdc->refcount < 1)
+ fatal("DBENGINE: pdc is not referenced and cannot be acquired");
+
+ pdc->refcount++;
+ spinlock_unlock(&pdc->refcount_spinlock);
+}
+
+bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router __maybe_unused) {
+ if(unlikely(!pdc))
+ return true;
+
+ spinlock_lock(&pdc->refcount_spinlock);
+
+ if(pdc->refcount <= 0)
+ fatal("DBENGINE: pdc is not referenced and cannot be released");
+
+ pdc->refcount--;
+
+ if (pdc->refcount <= 1 && worker) {
+ // when 1 refcount is remaining, and we are a worker,
+ // we can mark the job completed:
+ // - if the remaining refcount is from the query caller, we will wake it up
+        // - if the remaining refcount is from another worker, the query thread has already gone away
+ completion_mark_complete(&pdc->page_completion);
+ }
+
+ if (pdc->refcount == 0) {
+ spinlock_unlock(&pdc->refcount_spinlock);
+ pdc_destroy(pdc);
+ return true;
+ }
+
+ spinlock_unlock(&pdc->refcount_spinlock);
+ return false;
+}
+
+void epdl_cmd_queued(void *epdl_ptr, struct rrdeng_cmd *cmd) {
+ EPDL *epdl = epdl_ptr;
+ epdl->cmd = cmd;
+}
+
+void epdl_cmd_dequeued(void *epdl_ptr) {
+ EPDL *epdl = epdl_ptr;
+ epdl->cmd = NULL;
+}
+
+static struct rrdeng_cmd *epdl_get_cmd(void *epdl_ptr) {
+ EPDL *epdl = epdl_ptr;
+ return epdl->cmd;
+}
+
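+// Register this EPDL under its extent offset in the datafile's pending extent queries.
+// Returns true when this is the first (head) request for the extent - the one that must issue
+// the actual disk read; later requests for the same extent are appended to the head, will be
+// satisfied by the same read, and may re-prioritize the already queued command.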
+static bool epdl_pending_add(EPDL *epdl) {
+ bool added_new;
+
+ spinlock_lock(&epdl->datafile->extent_queries.spinlock);
+ Pvoid_t *PValue = JudyLIns(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0);
+ internal_fatal(!PValue || PValue == PJERR, "DBENGINE: corrupted pending extent judy");
+
+ EPDL *base = *PValue;
+
+ if(!base) {
+ added_new = true;
+ epdl->head_to_datafile_extent_queries_pending_for_extent = true;
+ }
+ else {
+ added_new = false;
+ epdl->head_to_datafile_extent_queries_pending_for_extent = false;
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_extent_merged, 1, __ATOMIC_RELAXED);
+
+ if(base->pdc->priority > epdl->pdc->priority)
+ rrdeng_req_cmd(epdl_get_cmd, base, epdl->pdc->priority);
+ }
+
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, epdl, query.prev, query.next);
+ *PValue = base;
+
+ spinlock_unlock(&epdl->datafile->extent_queries.spinlock);
+
+ return added_new;
+}
+
+static void epdl_pending_del(EPDL *epdl) {
+ spinlock_lock(&epdl->datafile->extent_queries.spinlock);
+ if(epdl->head_to_datafile_extent_queries_pending_for_extent) {
+ epdl->head_to_datafile_extent_queries_pending_for_extent = false;
+ int rc = JudyLDel(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0);
+ (void) rc;
+ internal_fatal(!rc, "DBENGINE: epdl not found in pending list");
+ }
+ spinlock_unlock(&epdl->datafile->extent_queries.spinlock);
+}
+
+void pdc_to_epdl_router(struct rrdengine_instance *ctx, PDC *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list)
+{
+ Pvoid_t *PValue;
+ Pvoid_t *PValue1;
+ Pvoid_t *PValue2;
+ Word_t time_index = 0;
+ struct page_details *pd = NULL;
+
+ // this is the entire page list
+    // Let's do some deduplication
+ // 1. Per datafile
+ // 2. Per extent
+ // 3. Pages per extent will be added to the cache either as acquired or not
+
+ Pvoid_t JudyL_datafile_list = NULL;
+
+ DEOL *deol;
+ EPDL *epdl;
+
+ if (pdc->page_list_JudyL) {
+ bool first_then_next = true;
+ while((PValue = PDCJudyLFirstThenNext(pdc->page_list_JudyL, &time_index, &first_then_next))) {
+ pd = *PValue;
+
+ internal_fatal(!pd,
+ "DBENGINE: pdc page list has an empty page details entry");
+
+ if (!(pd->status & PDC_PAGE_DISK_PENDING))
+ continue;
+
+ internal_fatal(!(pd->status & PDC_PAGE_DATAFILE_ACQUIRED),
+ "DBENGINE: page details has not acquired the datafile");
+
+ internal_fatal((pd->status & (PDC_PAGE_READY | PDC_PAGE_FAILED)),
+ "DBENGINE: page details has disk pending flag but it is ready/failed");
+
+ internal_fatal(pd->page,
+ "DBENGINE: page details has a page linked to it, but it is marked for loading");
+
+ PValue1 = PDCJudyLIns(&JudyL_datafile_list, pd->datafile.fileno, PJE0);
+ if (PValue1 && !*PValue1) {
+ *PValue1 = deol = deol_get();
+ deol->extent_pd_list_by_extent_offset_JudyL = NULL;
+ deol->fileno = pd->datafile.fileno;
+ }
+ else
+ deol = *PValue1;
+
+ PValue2 = PDCJudyLIns(&deol->extent_pd_list_by_extent_offset_JudyL, pd->datafile.extent.pos, PJE0);
+ if (PValue2 && !*PValue2) {
+ *PValue2 = epdl = epdl_get();
+ epdl->page_details_by_metric_id_JudyL = NULL;
+ epdl->number_of_pages_in_JudyL = 0;
+ epdl->file = pd->datafile.file;
+ epdl->extent_offset = pd->datafile.extent.pos;
+ epdl->extent_size = pd->datafile.extent.bytes;
+ epdl->datafile = pd->datafile.ptr;
+ }
+ else
+ epdl = *PValue2;
+
+ epdl->number_of_pages_in_JudyL++;
+
+ Pvoid_t *pd_by_first_time_s_judyL = PDCJudyLIns(&epdl->page_details_by_metric_id_JudyL, pd->metric_id, PJE0);
+ Pvoid_t *pd_pptr = PDCJudyLIns(pd_by_first_time_s_judyL, pd->first_time_s, PJE0);
+ *pd_pptr = pd;
+ }
+
+ size_t extent_list_no = 0;
+ Word_t datafile_no = 0;
+ first_then_next = true;
+ while((PValue = PDCJudyLFirstThenNext(JudyL_datafile_list, &datafile_no, &first_then_next))) {
+ deol = *PValue;
+
+ bool first_then_next_extent = true;
+ Word_t pos = 0;
+ while ((PValue = PDCJudyLFirstThenNext(deol->extent_pd_list_by_extent_offset_JudyL, &pos, &first_then_next_extent))) {
+ epdl = *PValue;
+ internal_fatal(!epdl, "DBENGINE: extent_list is not populated properly");
+
+ // The extent page list can be dispatched to a worker
+ // It will need to populate the cache with "acquired" pages that are in the list (pd) only
+            // the rest of the extent pages will be added to the cache but not acquired
+
+ pdc_acquire(pdc); // we do this for the next worker: do_read_extent_work()
+ epdl->pdc = pdc;
+
+ if(epdl_pending_add(epdl)) {
+ if (extent_list_no++ == 0)
+ exec_first_extent_list(ctx, epdl, pdc->priority);
+ else
+ exec_rest_extent_list(ctx, epdl, pdc->priority);
+ }
+ }
+ PDCJudyLFreeArray(&deol->extent_pd_list_by_extent_offset_JudyL, PJE0);
+ deol_release(deol);
+ }
+ PDCJudyLFreeArray(&JudyL_datafile_list, PJE0);
+ }
+
+ pdc_release_and_destroy_if_unreferenced(pdc, true, true);
+}
+
+void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags) {
+ if(flags & RRDENG_PAGE_PAST_COLLECTION)
+ buffer_strcat(wb, "PAST_COLLECTION ");
+ if(flags & RRDENG_PAGE_REPEATED_COLLECTION)
+ buffer_strcat(wb, "REPEATED_COLLECTION ");
+ if(flags & RRDENG_PAGE_BIG_GAP)
+ buffer_strcat(wb, "BIG_GAP ");
+ if(flags & RRDENG_PAGE_GAP)
+ buffer_strcat(wb, "GAP ");
+ if(flags & RRDENG_PAGE_FUTURE_POINT)
+ buffer_strcat(wb, "FUTURE_POINT ");
+ if(flags & RRDENG_PAGE_CREATED_IN_FUTURE)
+ buffer_strcat(wb, "CREATED_IN_FUTURE ");
+ if(flags & RRDENG_PAGE_COMPLETED_IN_FUTURE)
+ buffer_strcat(wb, "COMPLETED_IN_FUTURE ");
+ if(flags & RRDENG_PAGE_UNALIGNED)
+ buffer_strcat(wb, "UNALIGNED ");
+ if(flags & RRDENG_PAGE_CONFLICT)
+ buffer_strcat(wb, "CONFLICT ");
+ if(flags & RRDENG_PAGE_FULL)
+ buffer_strcat(wb, "PAGE_FULL");
+ if(flags & RRDENG_PAGE_COLLECT_FINALIZE)
+ buffer_strcat(wb, "COLLECT_FINALIZE");
+ if(flags & RRDENG_PAGE_UPDATE_EVERY_CHANGE)
+ buffer_strcat(wb, "UPDATE_EVERY_CHANGE");
+ if(flags & RRDENG_PAGE_STEP_TOO_SMALL)
+ buffer_strcat(wb, "STEP_TOO_SMALL");
+ if(flags & RRDENG_PAGE_STEP_UNALIGNED)
+ buffer_strcat(wb, "STEP_UNALIGNED");
+}
+
+inline VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error) {
+ time_t start_time_s = (time_t) (descr->start_time_ut / USEC_PER_SEC);
+
+ time_t end_time_s;
+ size_t entries;
+
+ switch (descr->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ end_time_s = descr->end_time_ut / USEC_PER_SEC;
+ entries = 0;
+ break;
+ case PAGE_GORILLA_METRICS:
+ end_time_s = start_time_s + descr->gorilla.delta_time_s;
+ entries = descr->gorilla.entries;
+ break;
+ default:
+ fatal("Unknown page type: %uc\n", descr->type);
+ }
+
+ return validate_page(
+ (uuid_t *)descr->uuid,
+ start_time_s,
+ end_time_s,
+ 0,
+ descr->page_length,
+ descr->type,
+ entries,
+ now_s,
+ overwrite_zero_update_every_s,
+ have_read_error,
+ "loaded", 0);
+}
+
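+// Cross-check a page's metadata (time range, length, number of entries, update_every) and
+// return a corrected descriptor: irrecoverable inconsistencies flag the page invalid, while
+// recoverable ones are adjusted in place; both cases are logged (rate-limited in release builds).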
+VALIDATED_PAGE_DESCRIPTOR validate_page(
+ uuid_t *uuid,
+ time_t start_time_s,
+ time_t end_time_s,
+ time_t update_every_s, // can be zero, if unknown
+ size_t page_length,
+ uint8_t page_type,
+ size_t entries, // can be zero, if unknown
+ time_t now_s, // can be zero, to disable future timestamp check
+ time_t overwrite_zero_update_every_s, // can be zero, if unknown
+ bool have_read_error,
+ const char *msg,
+ RRDENG_COLLECT_PAGE_FLAGS flags) {
+
+ VALIDATED_PAGE_DESCRIPTOR vd = {
+ .start_time_s = start_time_s,
+ .end_time_s = end_time_s,
+ .update_every_s = update_every_s,
+ .page_length = page_length,
+ .type = page_type,
+ .is_valid = true,
+ };
+
+ vd.point_size = page_type_size[vd.type];
+ switch (page_type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ // always calculate entries by size
+ vd.entries = page_entries_by_size(vd.page_length, vd.point_size);
+
+ // allow to be called without entries (when loading pages from disk)
+ if(!entries)
+ entries = vd.entries;
+ break;
+ case PAGE_GORILLA_METRICS:
+ internal_fatal(entries == 0, "0 number of entries found on gorilla page");
+ vd.entries = entries;
+ break;
+ default:
+ // TODO: should set vd.is_valid false instead?
+ fatal("Unknown page type: %uc", page_type);
+ }
+
+ // allow to be called without update every (when loading pages from disk)
+ if(!update_every_s) {
+ vd.update_every_s = (vd.entries > 1) ? ((vd.end_time_s - vd.start_time_s) / (time_t) (vd.entries - 1))
+ : overwrite_zero_update_every_s;
+
+ update_every_s = vd.update_every_s;
+ }
+
+ // another such set of checks exists in
+ // update_metric_retention_and_granularity_by_uuid()
+
+ bool updated = false;
+
+ size_t max_page_length = RRDENG_BLOCK_SIZE;
+
+    // If gorilla cannot compress the data we might end up needing slightly more
+ // than 4KiB. However, gorilla pages extend the page length by increments of
+ // 512 bytes.
+ max_page_length += ((page_type == PAGE_GORILLA_METRICS) * GORILLA_BUFFER_SIZE);
+
+ if( have_read_error ||
+ vd.page_length == 0 ||
+ vd.page_length > max_page_length ||
+ vd.start_time_s > vd.end_time_s ||
+ (now_s && vd.end_time_s > now_s) ||
+ vd.start_time_s <= 0 ||
+ vd.end_time_s <= 0 ||
+ vd.update_every_s < 0 ||
+ (vd.start_time_s == vd.end_time_s && vd.entries > 1) ||
+ (vd.update_every_s == 0 && vd.entries > 1))
+ {
+ vd.is_valid = false;
+ }
+ else {
+ if(unlikely(vd.entries != entries || vd.update_every_s != update_every_s))
+ updated = true;
+
+ if (likely(vd.update_every_s)) {
+ size_t entries_by_time = page_entries_by_time(vd.start_time_s, vd.end_time_s, vd.update_every_s);
+
+ if (vd.entries != entries_by_time) {
+ if (overwrite_zero_update_every_s < vd.update_every_s)
+ vd.update_every_s = overwrite_zero_update_every_s;
+
+ time_t new_end_time_s = (time_t)(vd.start_time_s + (vd.entries - 1) * vd.update_every_s);
+
+ if(new_end_time_s <= vd.end_time_s) {
+ // end time is wrong
+ vd.end_time_s = new_end_time_s;
+ }
+ else {
+ // update every is wrong
+ vd.update_every_s = overwrite_zero_update_every_s;
+ vd.end_time_s = (time_t)(vd.start_time_s + (vd.entries - 1) * vd.update_every_s);
+ }
+
+ updated = true;
+ }
+ }
+ else if(overwrite_zero_update_every_s) {
+ vd.update_every_s = overwrite_zero_update_every_s;
+ updated = true;
+ }
+ }
+
+ if(unlikely(!vd.is_valid || updated)) {
+#ifndef NETDATA_INTERNAL_CHECKS
+ nd_log_limit_static_global_var(erl, 1, 0);
+#endif
+ char uuid_str[UUID_STR_LEN + 1];
+ uuid_unparse(*uuid, uuid_str);
+
+ BUFFER *wb = NULL;
+
+ if(flags) {
+ wb = buffer_create(0, NULL);
+ collect_page_flags_to_buffer(wb, flags);
+ }
+
+ if(!vd.is_valid) {
+#ifdef NETDATA_INTERNAL_CHECKS
+ internal_error(true,
+#else
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR,
+#endif
+ "DBENGINE: metric '%s' %s invalid page of type %u "
+ "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s)",
+ uuid_str, msg, vd.type,
+ vd.start_time_s, vd.end_time_s, now_s, vd.update_every_s, vd.page_length, vd.entries, wb?buffer_tostring(wb):""
+ );
+ }
+ else {
+ const char *err_valid = (vd.is_valid) ? "" : "found invalid, ";
+ const char *err_start = (vd.start_time_s == start_time_s) ? "" : "start time updated, ";
+ const char *err_end = (vd.end_time_s == end_time_s) ? "" : "end time updated, ";
+ const char *err_update = (vd.update_every_s == update_every_s) ? "" : "update every updated, ";
+ const char *err_length = (vd.page_length == page_length) ? "" : "page length updated, ";
+ const char *err_entries = (vd.entries == entries) ? "" : "entries updated, ";
+ const char *err_future = (now_s && vd.end_time_s <= now_s) ? "" : "future end time, ";
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ internal_error(true,
+#else
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR,
+#endif
+ "DBENGINE: metric '%s' %s page of type %u "
+ "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s), "
+ "found inconsistent - the right is "
+ "from %ld to %ld, update every %ld, page length %zu, entries %zu: "
+ "%s%s%s%s%s%s%s",
+ uuid_str, msg, vd.type,
+ start_time_s, end_time_s, now_s, update_every_s, page_length, entries, wb?buffer_tostring(wb):"",
+ vd.start_time_s, vd.end_time_s, vd.update_every_s, vd.page_length, vd.entries,
+ err_valid, err_start, err_end, err_update, err_length, err_entries, err_future
+ );
+ }
+
+ buffer_free(wb);
+ }
+
+ return vd;
+}
+
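+// Collect, from this EPDL and any EPDLs merged behind it, the page_details entries that asked
+// for the page of this metric starting at start_time_s, linking them into a load list.
+// Pages of queries whose workers were asked to stop are marked FAILED|CANCELLED instead.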
+static inline struct page_details *epdl_get_pd_load_link_list_from_metric_start_time(EPDL *epdl, Word_t metric_id, time_t start_time_s) {
+
+ if(unlikely(epdl->head_to_datafile_extent_queries_pending_for_extent))
+ // stop appending more pages to this epdl
+ epdl_pending_del(epdl);
+
+ struct page_details *pd_list = NULL;
+
+ for(EPDL *ep = epdl; ep ;ep = ep->query.next) {
+ Pvoid_t *pd_by_start_time_s_judyL = PDCJudyLGet(ep->page_details_by_metric_id_JudyL, metric_id, PJE0);
+ internal_fatal(pd_by_start_time_s_judyL == PJERR, "DBENGINE: corrupted extent metrics JudyL");
+
+ if (unlikely(pd_by_start_time_s_judyL && *pd_by_start_time_s_judyL)) {
+ Pvoid_t *pd_pptr = PDCJudyLGet(*pd_by_start_time_s_judyL, start_time_s, PJE0);
+ internal_fatal(pd_pptr == PJERR, "DBENGINE: corrupted metric page details JudyHS");
+
+ if(likely(pd_pptr && *pd_pptr)) {
+ struct page_details *pd = *pd_pptr;
+ internal_fatal(metric_id != pd->metric_id, "DBENGINE: metric ids do not match");
+
+ if(likely(!pd->page)) {
+ if (unlikely(__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED)))
+ pdc_page_status_set(pd, PDC_PAGE_FAILED | PDC_PAGE_CANCELLED);
+ else
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pd_list, pd, load.prev, load.next);
+ }
+ }
+ }
+ }
+
+ return pd_list;
+}
+
+static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL *epdl, struct rrdeng_extent_page_descr *descr, const char *msg) {
+ char uuid[UUID_STR_LEN] = "";
+ time_t start_time_s = 0;
+ time_t end_time_s = 0;
+ bool used_epdl = false;
+ bool used_descr = false;
+
+ if (descr) {
+ start_time_s = (time_t)(descr->start_time_ut / USEC_PER_SEC);
+ switch (descr->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ end_time_s = (time_t)(descr->end_time_ut / USEC_PER_SEC);
+ break;
+ case PAGE_GORILLA_METRICS:
+ end_time_s = (time_t) start_time_s + (descr->gorilla.delta_time_s);
+ break;
+ }
+ uuid_unparse_lower(descr->uuid, uuid);
+ used_descr = true;
+ }
+ else {
+ struct page_details *pd = NULL;
+
+ Word_t start = 0;
+ Pvoid_t *pd_by_start_time_s_judyL = PDCJudyLFirst(epdl->page_details_by_metric_id_JudyL, &start, PJE0);
+ if(pd_by_start_time_s_judyL) {
+ start = 0;
+ Pvoid_t *pd_pptr = PDCJudyLFirst(*pd_by_start_time_s_judyL, &start, PJE0);
+ if(pd_pptr) {
+ pd = *pd_pptr;
+ start_time_s = pd->first_time_s;
+ end_time_s = pd->last_time_s;
+ METRIC *metric = (METRIC *)pd->metric_id;
+ uuid_t *u = mrg_metric_uuid(main_mrg, metric);
+ uuid_unparse_lower(*u, uuid);
+ used_epdl = true;
+ }
+ }
+ }
+
+ if(!used_epdl && !used_descr && epdl->pdc) {
+ start_time_s = epdl->pdc->start_time_s;
+ end_time_s = epdl->pdc->end_time_s;
+ }
+
+ char start_time_str[LOG_DATE_LENGTH + 1] = "";
+ if(start_time_s)
+ log_date(start_time_str, LOG_DATE_LENGTH, start_time_s);
+
+ char end_time_str[LOG_DATE_LENGTH + 1] = "";
+ if(end_time_s)
+ log_date(end_time_str, LOG_DATE_LENGTH, end_time_s);
+
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR,
+ "DBENGINE: error while reading extent from datafile %u of tier %d, at offset %" PRIu64 " (%u bytes) "
+ "%s from %ld (%s) to %ld (%s) %s%s: "
+ "%s",
+ epdl->datafile->fileno, ctx->config.tier,
+ epdl->extent_offset, epdl->extent_size,
+ used_epdl ? "to extract page (PD)" : used_descr ? "expected page (DESCR)" : "part of a query (PDC)",
+ start_time_s, start_time_str, end_time_s, end_time_str,
+ used_epdl || used_descr ? " of metric " : "",
+ used_epdl || used_descr ? uuid : "",
+ msg);
+}
+
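+// Parse and validate an extent read from disk: check the header and the CRC32 trailer,
+// decompress the payload when the extent is LZ4-compressed, then walk the page descriptors
+// to validate and load the pages this EPDL (and the EPDLs merged behind it) asked for.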
+static bool epdl_populate_pages_from_extent_data(
+ struct rrdengine_instance *ctx,
+ void *data,
+ size_t data_length,
+ EPDL *epdl,
+ bool worker,
+ PDC_PAGE_STATUS tags,
+ bool cached_extent)
+{
+ int ret;
+ unsigned i, count;
+ void *uncompressed_buf = NULL;
+ uint32_t payload_length, payload_offset, trailer_offset, uncompressed_payload_length = 0;
+ bool have_read_error = false;
+ /* persistent structures */
+ struct rrdeng_df_extent_header *header;
+ struct rrdeng_df_extent_trailer *trailer;
+ struct extent_buffer *eb = NULL;
+ uLong crc;
+
+ bool can_use_data = true;
+ if(data_length < sizeof(*header) + sizeof(header->descr[0]) + sizeof(*trailer)) {
+ can_use_data = false;
+
+ // added to satisfy the requirements of older compilers (prevent warnings)
+ payload_length = 0;
+ payload_offset = 0;
+ trailer_offset = 0;
+ count = 0;
+ header = NULL;
+ trailer = NULL;
+ }
+ else {
+ header = data;
+ payload_length = header->payload_length;
+ count = header->number_of_pages;
+ payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count;
+ trailer_offset = data_length - sizeof(*trailer);
+ trailer = data + trailer_offset;
+ }
+
+ if( !can_use_data ||
+ count < 1 ||
+ count > MAX_PAGES_PER_EXTENT ||
+ (header->compression_algorithm != RRD_NO_COMPRESSION && header->compression_algorithm != RRD_LZ4) ||
+ (payload_length != trailer_offset - payload_offset) ||
+ (data_length != payload_offset + payload_length + sizeof(*trailer))
+ ) {
+ epdl_extent_loading_error_log(ctx, epdl, NULL, "header is INVALID");
+ return false;
+ }
+
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, data, epdl->extent_size - sizeof(*trailer));
+ ret = crc32cmp(trailer->checksum, crc);
+ if (unlikely(ret)) {
+ ctx_io_error(ctx);
+ have_read_error = true;
+ epdl_extent_loading_error_log(ctx, epdl, NULL, "CRC32 checksum FAILED");
+ }
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_DECOMPRESSION);
+
+ if (likely(!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm)) {
+ // find the uncompressed extent size
+ uncompressed_payload_length = 0;
+ for (i = 0; i < count; ++i) {
+ size_t page_length = header->descr[i].page_length;
+ if (page_length > RRDENG_BLOCK_SIZE && (header->descr[i].type != PAGE_GORILLA_METRICS ||
+ (header->descr[i].type == PAGE_GORILLA_METRICS &&
+ (page_length - RRDENG_BLOCK_SIZE) % GORILLA_BUFFER_SIZE))) {
+ have_read_error = true;
+ break;
+ }
+
+ uncompressed_payload_length += header->descr[i].page_length;
+ }
+
+ if(unlikely(uncompressed_payload_length > MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE))
+ have_read_error = true;
+
+ if(likely(!have_read_error)) {
+ eb = extent_buffer_get(uncompressed_payload_length);
+ uncompressed_buf = eb->data;
+
+ ret = LZ4_decompress_safe(data + payload_offset, uncompressed_buf,
+ (int) payload_length, (int) uncompressed_payload_length);
+
+ __atomic_add_fetch(&ctx->stats.before_decompress_bytes, payload_length, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ctx->stats.after_decompress_bytes, ret, __ATOMIC_RELAXED);
+ }
+ }
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_LOOKUP);
+
+ size_t stats_data_from_main_cache = 0;
+ size_t stats_data_from_extent = 0;
+ size_t stats_load_compressed = 0;
+ size_t stats_load_uncompressed = 0;
+ size_t stats_load_invalid_page = 0;
+ size_t stats_cache_hit_while_inserting = 0;
+
+ uint32_t page_offset = 0, page_length;
+ time_t now_s = max_acceptable_collected_time();
+ for (i = 0; i < count; i++, page_offset += page_length) {
+ page_length = header->descr[i].page_length;
+ time_t start_time_s = (time_t) (header->descr[i].start_time_ut / USEC_PER_SEC);
+
+ if(!page_length || !start_time_s) {
+ char log[200 + 1];
+ snprintfz(log, sizeof(log) - 1, "page %u (out of %u) is EMPTY", i, count);
+ epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log);
+ continue;
+ }
+
+ METRIC *metric = mrg_metric_get_and_acquire(main_mrg, &header->descr[i].uuid, (Word_t)ctx);
+ Word_t metric_id = (Word_t)metric;
+ if(!metric) {
+ char log[200 + 1];
+ snprintfz(log, sizeof(log) - 1, "page %u (out of %u) has unknown UUID", i, count);
+ epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log);
+ continue;
+ }
+ mrg_metric_release(main_mrg, metric);
+
+ struct page_details *pd_list = epdl_get_pd_load_link_list_from_metric_start_time(epdl, metric_id, start_time_s);
+ if(likely(!pd_list))
+ continue;
+
+ VALIDATED_PAGE_DESCRIPTOR vd = validate_extent_page_descr(
+ &header->descr[i], now_s,
+ (pd_list) ? pd_list->update_every_s : 0,
+ have_read_error);
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_ALLOCATION);
+
+ PGD *pgd;
+
+ if (unlikely(!vd.is_valid)) {
+ pgd = PGD_EMPTY;
+ stats_load_invalid_page++;
+ }
+ else {
+ if (RRD_NO_COMPRESSION == header->compression_algorithm) {
+ pgd = pgd_create_from_disk_data(header->descr[i].type,
+ data + payload_offset + page_offset,
+ vd.page_length);
+ stats_load_uncompressed++;
+ }
+ else {
+ if (unlikely(page_offset + vd.page_length > uncompressed_payload_length)) {
+ char log[200 + 1];
+ snprintfz(log, sizeof(log) - 1, "page %u (out of %u) offset %u + page length %zu, "
+ "exceeds the uncompressed buffer size %u",
+ i, count, page_offset, vd.page_length, uncompressed_payload_length);
+ epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log);
+
+ pgd = PGD_EMPTY;
+ stats_load_invalid_page++;
+ }
+ else {
+ pgd = pgd_create_from_disk_data(header->descr[i].type,
+ uncompressed_buf + page_offset,
+ vd.page_length);
+ stats_load_compressed++;
+ }
+ }
+ }
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_POPULATION);
+
+ PGC_ENTRY page_entry = {
+ .hot = false,
+ .section = (Word_t)ctx,
+ .metric_id = metric_id,
+ .start_time_s = vd.start_time_s,
+ .end_time_s = vd.end_time_s,
+ .update_every_s = (uint32_t) vd.update_every_s,
+ .size = pgd_memory_footprint(pgd), // the footprint of the entire PGD, for accurate memory management
+ .data = pgd,
+ };
+
+ bool added = true;
+ PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added);
+ if (false == added) {
+ pgd_free(pgd);
+ stats_cache_hit_while_inserting++;
+ stats_data_from_main_cache++;
+ }
+ else
+ stats_data_from_extent++;
+
+ struct page_details *pd = pd_list;
+ do {
+ if(pd != pd_list)
+ pgc_page_dup(main_cache, page);
+
+ pd->page = page;
+ pdc_page_status_set(pd, PDC_PAGE_READY | tags | (pgd_is_empty(pgd) ? PDC_PAGE_EMPTY : 0));
+
+ pd = pd->load.next;
+ } while(pd);
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_LOOKUP);
+ }
+
+ if(stats_data_from_main_cache)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, stats_data_from_main_cache, __ATOMIC_RELAXED);
+
+ if(cached_extent)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_extent_cache, stats_data_from_extent, __ATOMIC_RELAXED);
+ else {
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_disk, stats_data_from_extent, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.extents_loaded_from_disk, 1, __ATOMIC_RELAXED);
+ }
+
+ if(stats_cache_hit_while_inserting)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_loaded_but_cache_hit_while_inserting, stats_cache_hit_while_inserting, __ATOMIC_RELAXED);
+
+ if(stats_load_compressed)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_compressed, stats_load_compressed, __ATOMIC_RELAXED);
+
+ if(stats_load_uncompressed)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_uncompressed, stats_load_uncompressed, __ATOMIC_RELAXED);
+
+ if(stats_load_invalid_page)
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_invalid_page_in_extent, stats_load_invalid_page, __ATOMIC_RELAXED);
+
+ if(worker)
+ worker_is_idle();
+
+ extent_buffer_release(eb);
+
+ return true;
+}
+
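+// Synchronously read size_bytes from the datafile at the given position into a
+// newly allocated, block-aligned buffer (the I/O size is rounded up to the
+// alignment). Returns NULL if the read fails.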
+static inline void *datafile_extent_read(struct rrdengine_instance *ctx, uv_file file, unsigned pos, unsigned size_bytes)
+{
+ void *buffer;
+ uv_fs_t request;
+
+ unsigned real_io_size = ALIGN_BYTES_CEILING(size_bytes);
+ int ret = posix_memalign(&buffer, RRDFILE_ALIGNMENT, real_io_size);
+ if (unlikely(ret))
+ fatal("DBENGINE: posix_memalign(): %s", strerror(ret));
+
+ uv_buf_t iov = uv_buf_init(buffer, real_io_size);
+ ret = uv_fs_read(NULL, &request, file, &iov, 1, pos, NULL);
+ if (unlikely(-1 == ret)) {
+ ctx_io_error(ctx);
+ posix_memfree(buffer);
+ buffer = NULL;
+ }
+ else
+ ctx_io_read_op_bytes(ctx, real_io_size);
+
+ uv_fs_req_cleanup(&request);
+
+ return buffer;
+}
+
+static inline void datafile_extent_read_free(void *buffer) {
+ posix_memfree(buffer);
+}
+
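+// Load one extent for a set of merged queries: if all of them have been
+// cancelled, skip the work; otherwise look the extent up in the extent cache,
+// read it from disk (and cache it) on a miss, populate the requested pages,
+// and finally mark any still-pending pages as failed and complete the PDC jobs.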
+void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker) {
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP);
+
+ size_t *statistics_counter = NULL;
+ PDC_PAGE_STATUS not_loaded_pages_tag = 0, loaded_pages_tag = 0;
+
+ bool should_stop = __atomic_load_n(&epdl->pdc->workers_should_stop, __ATOMIC_RELAXED);
+ for(EPDL *ep = epdl->query.next; ep ;ep = ep->query.next) {
+ internal_fatal(ep->datafile != epdl->datafile, "DBENGINE: datafiles do not match");
+ internal_fatal(ep->extent_offset != epdl->extent_offset, "DBENGINE: extent offsets do not match");
+ internal_fatal(ep->extent_size != epdl->extent_size, "DBENGINE: extent sizes do not match");
+ internal_fatal(ep->file != epdl->file, "DBENGINE: files do not match");
+
+ if(!__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED)) {
+ should_stop = false;
+ break;
+ }
+ }
+
+ if(unlikely(should_stop)) {
+ statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_cancelled;
+ not_loaded_pages_tag = PDC_PAGE_CANCELLED;
+ goto cleanup;
+ }
+
+ bool extent_found_in_cache = false;
+
+ void *extent_compressed_data = NULL;
+ PGC_PAGE *extent_cache_page = pgc_page_get_and_acquire(
+ extent_cache, (Word_t)ctx,
+ (Word_t)epdl->datafile->fileno, (time_t)epdl->extent_offset,
+ PGC_SEARCH_EXACT);
+
+ if(extent_cache_page) {
+ extent_compressed_data = pgc_page_data(extent_cache_page);
+ internal_fatal(epdl->extent_size != pgc_page_data_size(extent_cache, extent_cache_page),
+ "DBENGINE: cache size does not match the expected size");
+
+ loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_CACHE;
+ not_loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_CACHE;
+ extent_found_in_cache = true;
+ }
+ else {
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_MMAP);
+
+ void *extent_data = datafile_extent_read(ctx, epdl->file, epdl->extent_offset, epdl->extent_size);
+ if(extent_data != NULL) {
+
+ void *copied_extent_compressed_data = dbengine_extent_alloc(epdl->extent_size);
+ memcpy(copied_extent_compressed_data, extent_data, epdl->extent_size);
+ datafile_extent_read_free(extent_data);
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP);
+
+ bool added = false;
+ extent_cache_page = pgc_page_add_and_acquire(extent_cache, (PGC_ENTRY) {
+ .hot = false,
+ .section = (Word_t) ctx,
+ .metric_id = (Word_t) epdl->datafile->fileno,
+ .start_time_s = (time_t) epdl->extent_offset,
+ .size = epdl->extent_size,
+ .end_time_s = 0,
+ .update_every_s = 0,
+ .data = copied_extent_compressed_data,
+ }, &added);
+
+ if (!added) {
+ dbengine_extent_free(copied_extent_compressed_data, epdl->extent_size);
+ internal_fatal(epdl->extent_size != pgc_page_data_size(extent_cache, extent_cache_page),
+ "DBENGINE: cache size does not match the expected size");
+ }
+
+ extent_compressed_data = pgc_page_data(extent_cache_page);
+
+ loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_DISK;
+ not_loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_DISK;
+ }
+ }
+
+ if(extent_compressed_data) {
+ // Need to decompress and then process the pagelist
+ bool extent_used = epdl_populate_pages_from_extent_data(
+ ctx, extent_compressed_data, epdl->extent_size,
+ epdl, worker, loaded_pages_tag, extent_found_in_cache);
+
+ if(extent_used) {
+ // since the extent was used, any pages that were not
+ // loaded from it were not found in the extent
+ not_loaded_pages_tag |= PDC_PAGE_FAILED_NOT_IN_EXTENT;
+ statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_not_found;
+ }
+ else {
+ not_loaded_pages_tag |= PDC_PAGE_FAILED_INVALID_EXTENT;
+ statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_invalid_extent;
+ }
+ }
+ else {
+ not_loaded_pages_tag |= PDC_PAGE_FAILED_TO_MAP_EXTENT;
+ statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_cant_mmap_extent;
+ }
+
+ if(extent_cache_page)
+ pgc_page_release(extent_cache, extent_cache_page);
+
+cleanup:
+ // remove it from the datafile extent_queries
+ // this can be called multiple times safely
+ epdl_pending_del(epdl);
+
+ // mark all pending pages as failed
+ for(EPDL *ep = epdl; ep ;ep = ep->query.next) {
+ epdl_mark_all_not_loaded_pages_as_failed(
+ ep, not_loaded_pages_tag, statistics_counter);
+ }
+
+ for(EPDL *ep = epdl, *next = NULL; ep ; ep = next) {
+ next = ep->query.next;
+
+ completion_mark_complete_a_job(&ep->pdc->page_completion);
+ pdc_release_and_destroy_if_unreferenced(ep->pdc, true, false);
+
+ // Free the Judy that holds the requested pagelist and the extents
+ epdl_destroy(ep);
+ }
+
+ if(worker)
+ worker_is_idle();
+}
diff --git a/database/engine/pdc.h b/database/engine/pdc.h
new file mode 100644
index 00000000..9bae39ad
--- /dev/null
+++ b/database/engine/pdc.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef DBENGINE_PDC_H
+#define DBENGINE_PDC_H
+
+#include "../engine/rrdengine.h"
+
+struct rrdeng_cmd;
+
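+// Judy wrappers: when PDC_USE_JULYL is defined, the PDC Judy calls are routed
+// to the JulyL implementation instead of libJudy's JudyL.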
+#ifdef PDC_USE_JULYL
+#define PDCJudyLIns JulyLIns
+#define PDCJudyLGet JulyLGet
+#define PDCJudyLFirst JulyLFirst
+#define PDCJudyLNext JulyLNext
+#define PDCJudyLLast JulyLLast
+#define PDCJudyLPrev JulyLPrev
+#define PDCJudyLFirstThenNext JulyLFirstThenNext
+#define PDCJudyLLastThenPrev JulyLLastThenPrev
+#define PDCJudyLFreeArray JulyLFreeArray
+#else
+#define PDCJudyLIns JudyLIns
+#define PDCJudyLGet JudyLGet
+#define PDCJudyLFirst JudyLFirst
+#define PDCJudyLNext JudyLNext
+#define PDCJudyLLast JudyLLast
+#define PDCJudyLPrev JudyLPrev
+#define PDCJudyLFirstThenNext JudyLFirstThenNext
+#define PDCJudyLLastThenPrev JudyLLastThenPrev
+#define PDCJudyLFreeArray JudyLFreeArray
+#endif
+
+typedef struct extent_page_details_list EPDL;
+typedef void (*execute_extent_page_details_list_t)(struct rrdengine_instance *ctx, EPDL *epdl, enum storage_priority priority);
+void pdc_to_epdl_router(struct rrdengine_instance *ctx, struct page_details_control *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list);
+void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker);
+
+size_t pdc_cache_size(void);
+size_t pd_cache_size(void);
+size_t epdl_cache_size(void);
+size_t deol_cache_size(void);
+size_t extent_buffer_cache_size(void);
+
+void pdc_init(void);
+void page_details_init(void);
+void epdl_init(void);
+void deol_init(void);
+void extent_buffer_cleanup1(void);
+
+void epdl_cmd_dequeued(void *epdl_ptr);
+void epdl_cmd_queued(void *epdl_ptr, struct rrdeng_cmd *cmd);
+
+struct extent_buffer {
+ size_t bytes;
+
+ struct {
+ struct extent_buffer *prev;
+ struct extent_buffer *next;
+ } cache;
+
+ uint8_t data[];
+};
+
+void extent_buffer_init(void);
+struct extent_buffer *extent_buffer_get(size_t size);
+void extent_buffer_release(struct extent_buffer *eb);
+
+#endif // DBENGINE_PDC_H
diff --git a/database/engine/rrddiskprotocol.h b/database/engine/rrddiskprotocol.h
new file mode 100644
index 00000000..86b41f0b
--- /dev/null
+++ b/database/engine/rrddiskprotocol.h
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_RRDDISKPROTOCOL_H
+#define NETDATA_RRDDISKPROTOCOL_H
+
+#include <stdint.h>
+
+#define RRDENG_BLOCK_SIZE (4096)
+#define RRDFILE_ALIGNMENT RRDENG_BLOCK_SIZE
+
+#define RRDENG_MAGIC_SZ (32)
+#define RRDENG_DF_MAGIC "netdata-data-file"
+#define RRDENG_JF_MAGIC "netdata-journal-file"
+
+#define RRDENG_VER_SZ (16)
+#define RRDENG_DF_VER "1.0"
+#define RRDENG_JF_VER "1.0"
+
+#define UUID_SZ (16)
+#define CHECKSUM_SZ (4) /* CRC32 */
+
+#define RRD_NO_COMPRESSION (0)
+#define RRD_LZ4 (1)
+
+#define RRDENG_DF_SB_PADDING_SZ (RRDENG_BLOCK_SIZE - (RRDENG_MAGIC_SZ + RRDENG_VER_SZ + sizeof(uint8_t)))
+/*
+ * Data file persistent super-block
+ */
+struct rrdeng_df_sb {
+ char magic_number[RRDENG_MAGIC_SZ];
+ char version[RRDENG_VER_SZ];
+ uint8_t tier;
+ uint8_t padding[RRDENG_DF_SB_PADDING_SZ];
+} __attribute__ ((packed));
+
+/*
+ * Page types
+ */
+#define PAGE_METRICS (0)
+#define PAGE_TIER (1)
+#define PAGE_GORILLA_METRICS (2)
+#define PAGE_TYPE_MAX 2 // Maximum page type (inclusive)
+
+/*
+ * Data file page descriptor
+ */
+struct rrdeng_extent_page_descr {
+ uint8_t type;
+
+ uint8_t uuid[UUID_SZ];
+ uint32_t page_length;
+ uint64_t start_time_ut;
+ union {
+ struct {
+ uint32_t entries;
+ uint32_t delta_time_s;
+ } gorilla __attribute__((packed));
+
+ uint64_t end_time_ut;
+ };
+} __attribute__ ((packed));
+
+/*
+ * Data file extent header
+ */
+struct rrdeng_df_extent_header {
+ uint32_t payload_length;
+ uint8_t compression_algorithm;
+ uint8_t number_of_pages;
+ /* #number_of_pages page descriptors follow */
+ struct rrdeng_extent_page_descr descr[];
+} __attribute__ ((packed));
+
+/*
+ * Data file extent trailer
+ */
+struct rrdeng_df_extent_trailer {
+ uint8_t checksum[CHECKSUM_SZ]; /* CRC32 */
+} __attribute__ ((packed));
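+
+/*
+ * On disk, an extent consists of an rrdeng_df_extent_header, followed by
+ * number_of_pages page descriptors, the (optionally compressed) page payloads,
+ * and a trailing CRC32 checksum covering everything before it.
+ */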
+
+#define RRDENG_JF_SB_PADDING_SZ (RRDENG_BLOCK_SIZE - (RRDENG_MAGIC_SZ + RRDENG_VER_SZ))
+/*
+ * Journal file super-block
+ */
+struct rrdeng_jf_sb {
+ char magic_number[RRDENG_MAGIC_SZ];
+ char version[RRDENG_VER_SZ];
+ uint8_t padding[RRDENG_JF_SB_PADDING_SZ];
+} __attribute__ ((packed));
+
+/*
+ * Transaction record types
+ */
+#define STORE_PADDING (0)
+#define STORE_DATA (1)
+#define STORE_LOGS (2) /* reserved */
+
+/*
+ * Journal file transaction record header
+ */
+struct rrdeng_jf_transaction_header {
+ /* when set to STORE_PADDING jump to start of next block */
+ uint8_t type;
+
+ uint32_t reserved; /* reserved for future use */
+ uint64_t id;
+ uint16_t payload_length;
+} __attribute__ ((packed));
+
+/*
+ * Journal file transaction record trailer
+ */
+struct rrdeng_jf_transaction_trailer {
+ uint8_t checksum[CHECKSUM_SZ]; /* CRC32 */
+} __attribute__ ((packed));
+
+/*
+ * Journal file STORE_DATA action
+ */
+struct rrdeng_jf_store_data {
+ /* data file extent information */
+ uint64_t extent_offset;
+ uint32_t extent_size;
+
+ uint8_t number_of_pages;
+ /* #number_of_pages page descriptors follow */
+ struct rrdeng_extent_page_descr descr[];
+} __attribute__ ((packed));
+
+#endif /* NETDATA_RRDDISKPROTOCOL_H */
\ No newline at end of file
diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c
new file mode 100644
index 00000000..b82cc1ad
--- /dev/null
+++ b/database/engine/rrdengine.c
@@ -0,0 +1,1866 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#define NETDATA_RRD_INTERNALS
+
+#include "rrdengine.h"
+#include "pdc.h"
+
+rrdeng_stats_t global_io_errors = 0;
+rrdeng_stats_t global_fs_errors = 0;
+rrdeng_stats_t rrdeng_reserved_file_descriptors = 0;
+rrdeng_stats_t global_pg_cache_over_half_dirty_events = 0;
+rrdeng_stats_t global_flushing_pressure_page_deletions = 0;
+
+unsigned rrdeng_pages_per_extent = MAX_PAGES_PER_EXTENT;
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2)
+#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_OPCODE_MAX + 2)
+#endif
+
+struct rrdeng_cmd {
+ struct rrdengine_instance *ctx;
+ enum rrdeng_opcode opcode;
+ void *data;
+ struct completion *completion;
+ enum storage_priority priority;
+ dequeue_callback_t dequeue_cb;
+
+ struct {
+ struct rrdeng_cmd *prev;
+ struct rrdeng_cmd *next;
+ } queue;
+};
+
+static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker);
+static inline void worker_dispatch_extent_read(struct rrdeng_cmd cmd, bool from_worker);
+static inline void worker_dispatch_query_prep(struct rrdeng_cmd cmd, bool from_worker);
+
+struct rrdeng_main {
+ uv_thread_t thread;
+ uv_loop_t loop;
+ uv_async_t async;
+ uv_timer_t timer;
+ pid_t tid;
+ bool shutdown;
+
+ size_t flushes_running;
+ size_t evictions_running;
+ size_t cleanup_running;
+
+ struct {
+ ARAL *ar;
+
+ struct {
+ SPINLOCK spinlock;
+
+ size_t waiting;
+ struct rrdeng_cmd *waiting_items_by_priority[STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE];
+ size_t executed_by_priority[STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE];
+ } unsafe;
+ } cmd_queue;
+
+ struct {
+ ARAL *ar;
+
+ struct {
+ size_t dispatched;
+ size_t executing;
+ } atomics;
+ } work_cmd;
+
+ struct {
+ ARAL *ar;
+ } handles;
+
+ struct {
+ ARAL *ar;
+ } descriptors;
+
+ struct {
+ ARAL *ar;
+ } xt_io_descr;
+
+} rrdeng_main = {
+ .thread = 0,
+ .loop = {},
+ .async = {},
+ .timer = {},
+ .flushes_running = 0,
+ .evictions_running = 0,
+ .cleanup_running = 0,
+
+ .cmd_queue = {
+ .unsafe = {
+ .spinlock = NETDATA_SPINLOCK_INITIALIZER,
+ },
+ }
+};
+
+static void sanity_check(void)
+{
+ BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2));
+
+ /* Magic numbers must fit in the super-blocks */
+ BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ);
+ BUILD_BUG_ON(strlen(RRDENG_JF_MAGIC) > RRDENG_MAGIC_SZ);
+
+ /* Version strings must fit in the super-blocks */
+ BUILD_BUG_ON(strlen(RRDENG_DF_VER) > RRDENG_VER_SZ);
+ BUILD_BUG_ON(strlen(RRDENG_JF_VER) > RRDENG_VER_SZ);
+
+ /* Data file super-block cannot be larger than RRDENG_BLOCK_SIZE */
+ BUILD_BUG_ON(RRDENG_DF_SB_PADDING_SZ < 0);
+
+ BUILD_BUG_ON(sizeof(uuid_t) != UUID_SZ); /* check UUID size */
+
+ /* page count must fit in 8 bits */
+ BUILD_BUG_ON(MAX_PAGES_PER_EXTENT > 255);
+
+ /* extent cache count must fit in 32 bits */
+// BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32);
+
+ /* page info scratch space must be able to hold 2 32-bit integers */
+ BUILD_BUG_ON(sizeof(((struct rrdeng_page_info *)0)->scratch) < 2 * sizeof(uint32_t));
+}
+
+// ----------------------------------------------------------------------------
+// work request cache
+
+typedef void *(*work_cb)(struct rrdengine_instance *ctx, void *data, struct completion *completion, uv_work_t* req);
+typedef void (*after_work_cb)(struct rrdengine_instance *ctx, void *data, struct completion *completion, uv_work_t* req, int status);
+
+struct rrdeng_work {
+ uv_work_t req;
+
+ struct rrdengine_instance *ctx;
+ void *data;
+ struct completion *completion;
+
+ work_cb work_cb;
+ after_work_cb after_work_cb;
+ enum rrdeng_opcode opcode;
+};
+
+static void work_request_init(void) {
+ rrdeng_main.work_cmd.ar = aral_create(
+ "dbengine-work-cmd",
+ sizeof(struct rrdeng_work),
+ 0,
+ 65536, NULL,
+ NULL, NULL, false, false
+ );
+}
+
+enum LIBUV_WORKERS_STATUS {
+ LIBUV_WORKERS_RELAXED,
+ LIBUV_WORKERS_STRESSED,
+ LIBUV_WORKERS_CRITICAL,
+};
+
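+// Classify the load of the libuv worker pool based on the number of dispatched
+// work requests: CRITICAL when all worker threads are busy, STRESSED when only
+// the reserved worker threads remain, RELAXED otherwise.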
+static inline enum LIBUV_WORKERS_STATUS work_request_full(void) {
+ size_t dispatched = __atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED);
+
+ if(dispatched >= (size_t)(libuv_worker_threads))
+ return LIBUV_WORKERS_CRITICAL;
+
+ else if(dispatched >= (size_t)(libuv_worker_threads - RESERVED_LIBUV_WORKER_THREADS))
+ return LIBUV_WORKERS_STRESSED;
+
+ return LIBUV_WORKERS_RELAXED;
+}
+
+static inline void work_done(struct rrdeng_work *work_request) {
+ aral_freez(rrdeng_main.work_cmd.ar, work_request);
+}
+
+static void work_standard_worker(uv_work_t *req) {
+ __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED);
+
+ register_libuv_worker_jobs();
+ worker_is_busy(UV_EVENT_WORKER_INIT);
+
+ struct rrdeng_work *work_request = req->data;
+
+ work_request->data = work_request->work_cb(work_request->ctx, work_request->data, work_request->completion, req);
+ worker_is_idle();
+
+ if(work_request->opcode == RRDENG_OPCODE_EXTENT_READ || work_request->opcode == RRDENG_OPCODE_QUERY) {
+ internal_fatal(work_request->after_work_cb != NULL, "DBENGINE: opcodes with a callback should not be boosted");
+
+ while(1) {
+ struct rrdeng_cmd cmd = rrdeng_deq_cmd(true);
+ if (cmd.opcode == RRDENG_OPCODE_NOOP)
+ break;
+
+ worker_is_busy(UV_EVENT_WORKER_INIT);
+ switch (cmd.opcode) {
+ case RRDENG_OPCODE_EXTENT_READ:
+ worker_dispatch_extent_read(cmd, true);
+ break;
+
+ case RRDENG_OPCODE_QUERY:
+ worker_dispatch_query_prep(cmd, true);
+ break;
+
+ default:
+ fatal("DBENGINE: Opcode should not be executed synchronously");
+ break;
+ }
+ worker_is_idle();
+ }
+ }
+
+ __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED);
+ __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED);
+
+ // signal the event loop a worker is available
+ fatal_assert(0 == uv_async_send(&rrdeng_main.async));
+}
+
+static void after_work_standard_callback(uv_work_t* req, int status) {
+ struct rrdeng_work *work_request = req->data;
+
+ worker_is_busy(RRDENG_OPCODE_MAX + work_request->opcode);
+
+ if(work_request->after_work_cb)
+ work_request->after_work_cb(work_request->ctx, work_request->data, work_request->completion, req, status);
+
+ work_done(work_request);
+
+ worker_is_idle();
+}
+
+static bool work_dispatch(struct rrdengine_instance *ctx, void *data, struct completion *completion, enum rrdeng_opcode opcode, work_cb work_cb, after_work_cb after_work_cb) {
+ struct rrdeng_work *work_request = NULL;
+
+ internal_fatal(rrdeng_main.tid != gettid(), "work_dispatch() can only be run from the event loop thread");
+
+ work_request = aral_mallocz(rrdeng_main.work_cmd.ar);
+ memset(work_request, 0, sizeof(struct rrdeng_work));
+ work_request->req.data = work_request;
+ work_request->ctx = ctx;
+ work_request->data = data;
+ work_request->completion = completion;
+ work_request->work_cb = work_cb;
+ work_request->after_work_cb = after_work_cb;
+ work_request->opcode = opcode;
+
+ if(uv_queue_work(&rrdeng_main.loop, &work_request->req, work_standard_worker, after_work_standard_callback)) {
+ internal_fatal(true, "DBENGINE: cannot queue work");
+ work_done(work_request);
+ return false;
+ }
+
+ __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED);
+
+ return true;
+}
+
+// ----------------------------------------------------------------------------
+// page descriptor cache
+
+void page_descriptors_init(void) {
+ rrdeng_main.descriptors.ar = aral_create(
+ "dbengine-descriptors",
+ sizeof(struct page_descr_with_data),
+ 0,
+ 65536 * 4,
+ NULL,
+ NULL, NULL, false, false);
+}
+
+struct page_descr_with_data *page_descriptor_get(void) {
+ struct page_descr_with_data *descr = aral_mallocz(rrdeng_main.descriptors.ar);
+ memset(descr, 0, sizeof(struct page_descr_with_data));
+ return descr;
+}
+
+static inline void page_descriptor_release(struct page_descr_with_data *descr) {
+ aral_freez(rrdeng_main.descriptors.ar, descr);
+}
+
+// ----------------------------------------------------------------------------
+// extent io descriptor cache
+
+static void extent_io_descriptor_init(void) {
+ rrdeng_main.xt_io_descr.ar = aral_create(
+ "dbengine-extent-io",
+ sizeof(struct extent_io_descriptor),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false
+ );
+}
+
+static struct extent_io_descriptor *extent_io_descriptor_get(void) {
+ struct extent_io_descriptor *xt_io_descr = aral_mallocz(rrdeng_main.xt_io_descr.ar);
+ memset(xt_io_descr, 0, sizeof(struct extent_io_descriptor));
+ return xt_io_descr;
+}
+
+static inline void extent_io_descriptor_release(struct extent_io_descriptor *xt_io_descr) {
+ aral_freez(rrdeng_main.xt_io_descr.ar, xt_io_descr);
+}
+
+// ----------------------------------------------------------------------------
+// query handle cache
+
+void rrdeng_query_handle_init(void) {
+ rrdeng_main.handles.ar = aral_create(
+ "dbengine-query-handles",
+ sizeof(struct rrdeng_query_handle),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false);
+}
+
+struct rrdeng_query_handle *rrdeng_query_handle_get(void) {
+ struct rrdeng_query_handle *handle = aral_mallocz(rrdeng_main.handles.ar);
+ memset(handle, 0, sizeof(struct rrdeng_query_handle));
+ return handle;
+}
+
+void rrdeng_query_handle_release(struct rrdeng_query_handle *handle) {
+ aral_freez(rrdeng_main.handles.ar, handle);
+}
+
+// ----------------------------------------------------------------------------
+// WAL cache
+
+static struct {
+ struct {
+ SPINLOCK spinlock;
+ WAL *available_items;
+ size_t available;
+ } protected;
+
+ struct {
+ size_t allocated;
+ } atomics;
+} wal_globals = {
+ .protected = {
+ .spinlock = NETDATA_SPINLOCK_INITIALIZER,
+ .available_items = NULL,
+ .available = 0,
+ },
+ .atomics = {
+ .allocated = 0,
+ },
+};
+
+static void wal_cleanup1(void) {
+ WAL *wal = NULL;
+
+ if(!spinlock_trylock(&wal_globals.protected.spinlock))
+ return;
+
+ if(wal_globals.protected.available_items && wal_globals.protected.available > storage_tiers) {
+ wal = wal_globals.protected.available_items;
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
+ wal_globals.protected.available--;
+ }
+
+ spinlock_unlock(&wal_globals.protected.spinlock);
+
+ if(wal) {
+ posix_memfree(wal->buf);
+ freez(wal);
+ __atomic_sub_fetch(&wal_globals.atomics.allocated, 1, __ATOMIC_RELAXED);
+ }
+}
+
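+// Get a WAL buffer for a new transaction: reuse one from the free list when
+// available, otherwise allocate a new block-aligned buffer, and assign it the
+// next transaction id of this dbengine instance.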
+WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) {
+ if(!size || size > RRDENG_BLOCK_SIZE)
+ fatal("DBENGINE: invalid WAL size requested");
+
+ WAL *wal = NULL;
+
+ spinlock_lock(&wal_globals.protected.spinlock);
+
+ if(likely(wal_globals.protected.available_items)) {
+ wal = wal_globals.protected.available_items;
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
+ wal_globals.protected.available--;
+ }
+
+ uint64_t transaction_id = __atomic_fetch_add(&ctx->atomic.transaction_id, 1, __ATOMIC_RELAXED);
+ spinlock_unlock(&wal_globals.protected.spinlock);
+
+ if(unlikely(!wal)) {
+ wal = mallocz(sizeof(WAL));
+ wal->buf_size = RRDENG_BLOCK_SIZE;
+ int ret = posix_memalign((void *)&wal->buf, RRDFILE_ALIGNMENT, wal->buf_size);
+ if (unlikely(ret))
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+ __atomic_add_fetch(&wal_globals.atomics.allocated, 1, __ATOMIC_RELAXED);
+ }
+
+ // these need to survive
+ unsigned buf_size = wal->buf_size;
+ void *buf = wal->buf;
+
+ memset(wal, 0, sizeof(WAL));
+
+ // put them back
+ wal->buf_size = buf_size;
+ wal->buf = buf;
+
+ memset(wal->buf, 0, wal->buf_size);
+
+ wal->transaction_id = transaction_id;
+ wal->size = size;
+
+ return wal;
+}
+
+void wal_release(WAL *wal) {
+ if(unlikely(!wal)) return;
+
+ spinlock_lock(&wal_globals.protected.spinlock);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next);
+ wal_globals.protected.available++;
+ spinlock_unlock(&wal_globals.protected.spinlock);
+}
+
+// ----------------------------------------------------------------------------
+// command queue cache
+
+static void rrdeng_cmd_queue_init(void) {
+ rrdeng_main.cmd_queue.ar = aral_create("dbengine-opcodes",
+ sizeof(struct rrdeng_cmd),
+ 0,
+ 65536,
+ NULL,
+ NULL, NULL, false, false);
+}
+
+static inline STORAGE_PRIORITY rrdeng_enq_cmd_map_opcode_to_priority(enum rrdeng_opcode opcode, STORAGE_PRIORITY priority) {
+ if(unlikely(priority >= STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE))
+ priority = STORAGE_PRIORITY_BEST_EFFORT;
+
+ switch(opcode) {
+ case RRDENG_OPCODE_QUERY:
+ priority = STORAGE_PRIORITY_INTERNAL_QUERY_PREP;
+ break;
+
+ default:
+ break;
+ }
+
+ return priority;
+}
+
+void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd) {
+ epdl_cmd_queued(cmd->data, cmd);
+}
+
+void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd) {
+ epdl_cmd_dequeued(cmd->data);
+}
+
+void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority) {
+ spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock);
+
+ struct rrdeng_cmd *cmd = get_cmd_cb(data);
+ if(cmd) {
+ priority = rrdeng_enq_cmd_map_opcode_to_priority(cmd->opcode, priority);
+
+ if (cmd->priority > priority) {
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[cmd->priority], cmd, queue.prev, queue.next);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next);
+ cmd->priority = priority;
+ }
+ }
+
+ spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock);
+}
+
+void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data, struct completion *completion,
+ enum storage_priority priority, enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb) {
+
+ priority = rrdeng_enq_cmd_map_opcode_to_priority(opcode, priority);
+
+ struct rrdeng_cmd *cmd = aral_mallocz(rrdeng_main.cmd_queue.ar);
+ memset(cmd, 0, sizeof(struct rrdeng_cmd));
+ cmd->ctx = ctx;
+ cmd->opcode = opcode;
+ cmd->data = data;
+ cmd->completion = completion;
+ cmd->priority = priority;
+ cmd->dequeue_cb = dequeue_cb;
+
+ spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next);
+ rrdeng_main.cmd_queue.unsafe.waiting++;
+ if(enqueue_cb)
+ enqueue_cb(cmd);
+ spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock);
+
+ fatal_assert(0 == uv_async_send(&rrdeng_main.async));
+}
+
+static inline bool rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(STORAGE_PRIORITY priority, STORAGE_PRIORITY max_priority) {
+ for(; priority <= max_priority ; priority++)
+ if(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority])
+ return true;
+
+ return false;
+}
+
+#define opcode_empty (struct rrdeng_cmd) { \
+ .ctx = NULL, \
+ .opcode = RRDENG_OPCODE_NOOP, \
+ .priority = STORAGE_PRIORITY_BEST_EFFORT, \
+ .completion = NULL, \
+ .data = NULL, \
+}
+
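+// Dequeue the next command to execute, scanning the priority queues from the
+// highest to the lowest allowed priority. Workers and the event loop use
+// different priority windows: when the libuv worker pool is not relaxed the
+// event loop dispatches only internal dbengine work, and workers get nothing
+// at all when the pool is critical.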
+static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker) {
+ struct rrdeng_cmd *cmd = NULL;
+ enum LIBUV_WORKERS_STATUS status = work_request_full();
+
+ STORAGE_PRIORITY min_priority, max_priority;
+ min_priority = STORAGE_PRIORITY_INTERNAL_DBENGINE;
+ max_priority = (status != LIBUV_WORKERS_RELAXED) ? STORAGE_PRIORITY_INTERNAL_DBENGINE : STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE - 1;
+
+ if(from_worker) {
+ if(status == LIBUV_WORKERS_CRITICAL)
+ return opcode_empty;
+
+ min_priority = STORAGE_PRIORITY_INTERNAL_QUERY_PREP;
+ max_priority = STORAGE_PRIORITY_BEST_EFFORT;
+ }
+
+ // find an opcode to execute from the queue
+ spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock);
+ for(STORAGE_PRIORITY priority = min_priority; priority <= max_priority ; priority++) {
+ cmd = rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority];
+ if(cmd) {
+
+ // avoid starvation of lower priorities
+ if(unlikely(priority >= STORAGE_PRIORITY_HIGH &&
+ priority < STORAGE_PRIORITY_BEST_EFFORT &&
+ ++rrdeng_main.cmd_queue.unsafe.executed_by_priority[priority] % 50 == 0 &&
+ rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(priority + 1, max_priority))) {
+ // let the others run 2% of the requests
+ cmd = NULL;
+ continue;
+ }
+
+ // remove it from the queue
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next);
+ rrdeng_main.cmd_queue.unsafe.waiting--;
+ break;
+ }
+ }
+
+ if(cmd && cmd->dequeue_cb) {
+ cmd->dequeue_cb(cmd);
+ cmd->dequeue_cb = NULL;
+ }
+
+ spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock);
+
+ struct rrdeng_cmd ret;
+ if(cmd) {
+ // copy it, to return it
+ ret = *cmd;
+
+ aral_freez(rrdeng_main.cmd_queue.ar, cmd);
+ }
+ else
+ ret = opcode_empty;
+
+ return ret;
+}
+
+
+// ----------------------------------------------------------------------------
+
+void *dbengine_extent_alloc(size_t size) {
+ void *extent = mallocz(size);
+ return extent;
+}
+
+void dbengine_extent_free(void *extent, size_t size __maybe_unused) {
+ freez(extent);
+}
+
+static void journalfile_extent_build(struct rrdengine_instance *ctx, struct extent_io_descriptor *xt_io_descr) {
+ unsigned count, payload_length, descr_size, size_bytes;
+ void *buf;
+ /* persistent structures */
+ struct rrdeng_df_extent_header *df_header;
+ struct rrdeng_jf_transaction_header *jf_header;
+ struct rrdeng_jf_store_data *jf_metric_data;
+ struct rrdeng_jf_transaction_trailer *jf_trailer;
+ uLong crc;
+
+ df_header = xt_io_descr->buf;
+ count = df_header->number_of_pages;
+ descr_size = sizeof(*jf_metric_data->descr) * count;
+ payload_length = sizeof(*jf_metric_data) + descr_size;
+ size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer);
+
+ xt_io_descr->wal = wal_get(ctx, size_bytes);
+ buf = xt_io_descr->wal->buf;
+
+ jf_header = buf;
+ jf_header->type = STORE_DATA;
+ jf_header->reserved = 0;
+ jf_header->id = xt_io_descr->wal->transaction_id;
+ jf_header->payload_length = payload_length;
+
+ jf_metric_data = buf + sizeof(*jf_header);
+ jf_metric_data->extent_offset = xt_io_descr->pos;
+ jf_metric_data->extent_size = xt_io_descr->bytes;
+ jf_metric_data->number_of_pages = count;
+ memcpy(jf_metric_data->descr, df_header->descr, descr_size);
+
+ jf_trailer = buf + sizeof(*jf_header) + payload_length;
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, buf, sizeof(*jf_header) + payload_length);
+ crc32set(jf_trailer->checksum, crc);
+}
+
+static void after_extent_flushed_to_open(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ if(completion)
+ completion_mark_complete(completion);
+
+ if(ctx_is_available_for_queries(ctx))
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+}
+
+static void *extent_flushed_to_open_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ worker_is_busy(UV_EVENT_DBENGINE_FLUSHED_TO_OPEN);
+
+ uv_fs_t *uv_fs_request = data;
+ struct extent_io_descriptor *xt_io_descr = uv_fs_request->data;
+ struct page_descr_with_data *descr;
+ struct rrdengine_datafile *datafile;
+ unsigned i;
+
+ datafile = xt_io_descr->datafile;
+
+ bool still_running = ctx_is_available_for_queries(ctx);
+
+ for (i = 0 ; i < xt_io_descr->descr_count ; ++i) {
+ descr = xt_io_descr->descr_array[i];
+
+ if (likely(still_running))
+ pgc_open_add_hot_page(
+ (Word_t)ctx, descr->metric_id,
+ (time_t) (descr->start_time_ut / USEC_PER_SEC),
+ (time_t) (descr->end_time_ut / USEC_PER_SEC),
+ descr->update_every_s,
+ datafile,
+ xt_io_descr->pos, xt_io_descr->bytes, descr->page_length);
+
+ page_descriptor_release(descr);
+ }
+
+ uv_fs_req_cleanup(uv_fs_request);
+ posix_memfree(xt_io_descr->buf);
+ extent_io_descriptor_release(xt_io_descr);
+
+ spinlock_lock(&datafile->writers.spinlock);
+ datafile->writers.flushed_to_open_running--;
+ spinlock_unlock(&datafile->writers.spinlock);
+
+ if(datafile->fileno != ctx_last_fileno_get(ctx) && still_running)
+ // we just finished flushing a datafile that is not the active one
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+
+ return data;
+}
+
+// Main event loop callback
+static void after_extent_write_datafile_io(uv_fs_t *uv_fs_request) {
+ worker_is_busy(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_WRITE);
+
+ struct extent_io_descriptor *xt_io_descr = uv_fs_request->data;
+ struct rrdengine_datafile *datafile = xt_io_descr->datafile;
+ struct rrdengine_instance *ctx = datafile->ctx;
+
+ if (uv_fs_request->result < 0) {
+ ctx_io_error(ctx);
+ netdata_log_error("DBENGINE: %s: uv_fs_write(): %s", __func__, uv_strerror((int)uv_fs_request->result));
+ }
+
+ journalfile_v1_extent_write(ctx, xt_io_descr->datafile, xt_io_descr->wal, &rrdeng_main.loop);
+
+ spinlock_lock(&datafile->writers.spinlock);
+ datafile->writers.running--;
+ datafile->writers.flushed_to_open_running++;
+ spinlock_unlock(&datafile->writers.spinlock);
+
+ rrdeng_enq_cmd(xt_io_descr->ctx,
+ RRDENG_OPCODE_FLUSHED_TO_OPEN,
+ uv_fs_request,
+ xt_io_descr->completion,
+ STORAGE_PRIORITY_INTERNAL_DBENGINE,
+ NULL,
+ NULL);
+
+ worker_is_idle();
+}
+
+static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) {
+ bool ret = false;
+ spinlock_lock(&datafile->writers.spinlock);
+
+ if(ctx_is_available_for_queries(ctx) && datafile->pos > rrdeng_target_data_file_size(ctx))
+ ret = true;
+
+ spinlock_unlock(&datafile->writers.spinlock);
+
+ return ret;
+}
+
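+// Pick the most recent datafile and register as a writer on it, so it cannot
+// vanish. If it has exceeded the target size, create a new datafile pair (only
+// one creation runs at a time) and move the writer registration to the new file.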
+static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_instance *ctx) {
+ struct rrdengine_datafile *datafile;
+
+ // get the latest datafile
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ datafile = ctx->datafiles.first->prev;
+ // become a writer on this datafile, to prevent it from vanishing
+ spinlock_lock(&datafile->writers.spinlock);
+ datafile->writers.running++;
+ spinlock_unlock(&datafile->writers.spinlock);
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ if(datafile_is_full(ctx, datafile)) {
+ // remember the datafile we have become writers to
+ struct rrdengine_datafile *old_datafile = datafile;
+
+ // only 1 datafile creation at a time
+ static netdata_mutex_t mutex = NETDATA_MUTEX_INITIALIZER;
+ netdata_mutex_lock(&mutex);
+
+ // take the latest datafile again - without this, multiple threads may create multiple files
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ datafile = ctx->datafiles.first->prev;
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ if(datafile_is_full(ctx, datafile) && create_new_datafile_pair(ctx, true) == 0)
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL,
+ NULL);
+
+ netdata_mutex_unlock(&mutex);
+
+ // get the new latest datafile again, like above
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ datafile = ctx->datafiles.first->prev;
+ // become a writer on this datafile, to prevent it from vanishing
+ spinlock_lock(&datafile->writers.spinlock);
+ datafile->writers.running++;
+ spinlock_unlock(&datafile->writers.spinlock);
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ // release the writers on the old datafile
+ spinlock_lock(&old_datafile->writers.spinlock);
+ old_datafile->writers.running--;
+ spinlock_unlock(&old_datafile->writers.spinlock);
+ }
+
+ return datafile;
+}
+
+/*
+ * Take a list of page descriptors and build an extent, ready to be written to a datafile
+ */
+static struct extent_io_descriptor *datafile_extent_build(struct rrdengine_instance *ctx, struct page_descr_with_data *base, struct completion *completion) {
+ int ret;
+ int compressed_size, max_compressed_size = 0;
+ unsigned i, count, size_bytes, pos, real_io_size;
+ uint32_t uncompressed_payload_length, payload_offset;
+ struct page_descr_with_data *descr, *eligible_pages[MAX_PAGES_PER_EXTENT];
+ struct extent_io_descriptor *xt_io_descr;
+ struct extent_buffer *eb = NULL;
+ void *compressed_buf = NULL;
+ Word_t Index;
+ uint8_t compression_algorithm = ctx->config.global_compress_alg;
+ struct rrdengine_datafile *datafile;
+ /* persistent structures */
+ struct rrdeng_df_extent_header *header;
+ struct rrdeng_df_extent_trailer *trailer;
+ uLong crc;
+
+ for(descr = base, Index = 0, count = 0, uncompressed_payload_length = 0;
+ descr && count != rrdeng_pages_per_extent;
+ descr = descr->link.next, Index++) {
+
+ uncompressed_payload_length += descr->page_length;
+ eligible_pages[count++] = descr;
+
+ }
+
+ if (!count) {
+ if (completion)
+ completion_mark_complete(completion);
+
+ __atomic_sub_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED);
+ return NULL;
+ }
+
+ xt_io_descr = extent_io_descriptor_get();
+ xt_io_descr->ctx = ctx;
+ payload_offset = sizeof(*header) + count * sizeof(header->descr[0]);
+ switch (compression_algorithm) {
+ case RRD_NO_COMPRESSION:
+ size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer);
+ break;
+
+ default: /* Compress */
+ fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE);
+ max_compressed_size = LZ4_compressBound(uncompressed_payload_length);
+ eb = extent_buffer_get(max_compressed_size);
+ compressed_buf = eb->data;
+ size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer);
+ break;
+ }
+
+ ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes));
+ if (unlikely(ret)) {
+ fatal("DBENGINE: posix_memalign:%s", strerror(ret));
+ /* freez(xt_io_descr);*/
+ }
+ memset(xt_io_descr->buf, 0, ALIGN_BYTES_CEILING(size_bytes));
+ (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct page_descr_with_data *) * count);
+ xt_io_descr->descr_count = count;
+
+ pos = 0;
+ header = xt_io_descr->buf;
+ header->compression_algorithm = compression_algorithm;
+ header->number_of_pages = count;
+ pos += sizeof(*header);
+
+ for (i = 0 ; i < count ; ++i) {
+ descr = xt_io_descr->descr_array[i];
+ header->descr[i].type = descr->type;
+ uuid_copy(*(uuid_t *)header->descr[i].uuid, *descr->id);
+ header->descr[i].page_length = descr->page_length;
+ header->descr[i].start_time_ut = descr->start_time_ut;
+
+ switch (descr->type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ header->descr[i].end_time_ut = descr->end_time_ut;
+ break;
+ case PAGE_GORILLA_METRICS:
+ header->descr[i].gorilla.delta_time_s = (uint32_t) ((descr->end_time_ut - descr->start_time_ut) / USEC_PER_SEC);
+ header->descr[i].gorilla.entries = pgd_slots_used(descr->pgd);
+ break;
+ default:
+ fatal("Unknown page type: %uc", descr->type);
+ }
+
+ pos += sizeof(header->descr[i]);
+ }
+ for (i = 0 ; i < count ; ++i) {
+ descr = xt_io_descr->descr_array[i];
+ pgd_copy_to_extent(descr->pgd, xt_io_descr->buf + pos, descr->page_length);
+ pos += descr->page_length;
+ }
+
+ if(likely(compression_algorithm == RRD_LZ4)) {
+ compressed_size = LZ4_compress_default(
+ xt_io_descr->buf + payload_offset,
+ compressed_buf,
+ (int)uncompressed_payload_length,
+ max_compressed_size);
+
+ __atomic_add_fetch(&ctx->stats.before_compress_bytes, uncompressed_payload_length, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ctx->stats.after_compress_bytes, compressed_size, __ATOMIC_RELAXED);
+
+ (void) memcpy(xt_io_descr->buf + payload_offset, compressed_buf, compressed_size);
+ extent_buffer_release(eb);
+ size_bytes = payload_offset + compressed_size + sizeof(*trailer);
+ header->payload_length = compressed_size;
+ }
+ else { // RRD_NO_COMPRESSION
+ header->payload_length = uncompressed_payload_length;
+ }
+
+ real_io_size = ALIGN_BYTES_CEILING(size_bytes);
+
+ datafile = get_datafile_to_write_extent(ctx);
+ spinlock_lock(&datafile->writers.spinlock);
+ xt_io_descr->datafile = datafile;
+ xt_io_descr->pos = datafile->pos;
+ datafile->pos += real_io_size;
+ spinlock_unlock(&datafile->writers.spinlock);
+
+ xt_io_descr->bytes = size_bytes;
+ xt_io_descr->uv_fs_request.data = xt_io_descr;
+ xt_io_descr->completion = completion;
+
+ trailer = xt_io_descr->buf + size_bytes - sizeof(*trailer);
+ crc = crc32(0L, Z_NULL, 0);
+ crc = crc32(crc, xt_io_descr->buf, size_bytes - sizeof(*trailer));
+ crc32set(trailer->checksum, crc);
+
+ xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size);
+ journalfile_extent_build(ctx, xt_io_descr);
+
+ ctx_last_flush_fileno_set(ctx, datafile->fileno);
+ ctx_current_disk_space_increase(ctx, real_io_size);
+ ctx_io_write_op_bytes(ctx, real_io_size);
+
+ return xt_io_descr;
+}
+
+static void after_extent_write(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* uv_work_req __maybe_unused, int status __maybe_unused) {
+ struct extent_io_descriptor *xt_io_descr = data;
+
+ if(xt_io_descr) {
+ int ret = uv_fs_write(&rrdeng_main.loop,
+ &xt_io_descr->uv_fs_request,
+ xt_io_descr->datafile->file,
+ &xt_io_descr->iov,
+ 1,
+ (int64_t) xt_io_descr->pos,
+ after_extent_write_datafile_io);
+
+ fatal_assert(-1 != ret);
+ }
+}
+
+static void *extent_write_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ worker_is_busy(UV_EVENT_DBENGINE_EXTENT_WRITE);
+ struct page_descr_with_data *base = data;
+ struct extent_io_descriptor *xt_io_descr = datafile_extent_build(ctx, base, completion);
+ return xt_io_descr;
+}
+
+static void after_database_rotate(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ __atomic_store_n(&ctx->atomic.now_deleting_files, false, __ATOMIC_RELAXED);
+}
+
+struct uuid_first_time_s {
+ uuid_t *uuid;
+ time_t first_time_s;
+ METRIC *metric;
+ size_t pages_found;
+ size_t df_matched;
+ size_t df_index_oldest;
+};
+
+struct rrdengine_datafile *datafile_release_and_acquire_next_for_retention(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) {
+
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+
+ struct rrdengine_datafile *next_datafile = datafile->next;
+
+ while(next_datafile && !datafile_acquire(next_datafile, DATAFILE_ACQUIRE_RETENTION))
+ next_datafile = next_datafile->next;
+
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ datafile_release(datafile, DATAFILE_ACQUIRE_RETENTION);
+
+ return next_datafile;
+}
+
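+// Scan the journal v2 indexes of the given datafile chain, and then the open
+// cache, to find the remaining first time of every metric in
+// uuid_first_entry_list. Returns the earliest journal start time seen across
+// the scanned journal files.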
+time_t find_uuid_first_time(
+ struct rrdengine_instance *ctx,
+ struct rrdengine_datafile *datafile,
+ struct uuid_first_time_s *uuid_first_entry_list,
+ size_t count)
+{
+ time_t global_first_time_s = LONG_MAX;
+
+ // acquire the datafile to work with it
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ while(datafile && !datafile_acquire(datafile, DATAFILE_ACQUIRE_RETENTION))
+ datafile = datafile->next;
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ if (unlikely(!datafile))
+ return global_first_time_s;
+
+ unsigned journalfile_count = 0;
+ size_t binary_match = 0;
+ size_t not_matching_bsearches = 0;
+
+ while (datafile) {
+ struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0);
+ if (!j2_header) {
+ datafile = datafile_release_and_acquire_next_for_retention(ctx, datafile);
+ continue;
+ }
+
+ time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC);
+
+ if(journal_start_time_s < global_first_time_s)
+ global_first_time_s = journal_start_time_s;
+
+ struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset);
+ struct uuid_first_time_s *uuid_original_entry;
+
+ size_t journal_metric_count = j2_header->metric_count;
+
+ for (size_t index = 0; index < count; ++index) {
+ uuid_original_entry = &uuid_first_entry_list[index];
+
+ // Skip metrics that have already been matched in enough journal files, or have enough pages found
+ if (uuid_original_entry->df_matched > 3 || uuid_original_entry->pages_found > 5)
+ continue;
+
+ struct journal_metric_list *live_entry =
+ bsearch(uuid_original_entry->uuid,uuid_list,journal_metric_count,
+ sizeof(*uuid_list), journal_metric_uuid_compare);
+
+ if (!live_entry) {
+ // Not found in this journal
+ not_matching_bsearches++;
+ continue;
+ }
+
+ uuid_original_entry->pages_found += live_entry->entries;
+ uuid_original_entry->df_matched++;
+
+ time_t old_first_time_s = uuid_original_entry->first_time_s;
+
+ // Calculate first / last for this match
+ time_t first_time_s = live_entry->delta_start_s + journal_start_time_s;
+ uuid_original_entry->first_time_s = MIN(uuid_original_entry->first_time_s, first_time_s);
+
+ if (uuid_original_entry->first_time_s != old_first_time_s)
+ uuid_original_entry->df_index_oldest = uuid_original_entry->df_matched;
+
+ binary_match++;
+ }
+
+ journalfile_count++;
+ journalfile_v2_data_release(datafile->journalfile);
+ datafile = datafile_release_and_acquire_next_for_retention(ctx, datafile);
+ }
+
+ // Let's scan the open cache for almost exact match
+ size_t open_cache_count = 0;
+
+ size_t df_index[10] = { 0 };
+ size_t without_metric = 0;
+ size_t open_cache_gave_first_time_s = 0;
+ size_t metric_count = 0;
+ size_t without_retention = 0;
+ size_t not_needed_bsearches = 0;
+
+ for (size_t index = 0; index < count; ++index) {
+ struct uuid_first_time_s *uuid_first_t_entry = &uuid_first_entry_list[index];
+
+ metric_count++;
+
+ size_t idx = uuid_first_t_entry->df_index_oldest;
+ if(idx >= 10)
+ idx = 9;
+
+ df_index[idx]++;
+
+ not_needed_bsearches += uuid_first_t_entry->df_matched - uuid_first_t_entry->df_index_oldest;
+
+ if (unlikely(!uuid_first_t_entry->metric)) {
+ without_metric++;
+ continue;
+ }
+
+ PGC_PAGE *page = pgc_page_get_and_acquire(
+ open_cache, (Word_t)ctx,
+ (Word_t)uuid_first_t_entry->metric, 0,
+ PGC_SEARCH_FIRST);
+
+ if (page) {
+ time_t old_first_time_s = uuid_first_t_entry->first_time_s;
+
+ time_t first_time_s = pgc_page_start_time_s(page);
+ uuid_first_t_entry->first_time_s = MIN(uuid_first_t_entry->first_time_s, first_time_s);
+ pgc_page_release(open_cache, page);
+ open_cache_count++;
+
+ if(uuid_first_t_entry->first_time_s != old_first_time_s) {
+ open_cache_gave_first_time_s++;
+ }
+ }
+ else {
+ if(!uuid_first_t_entry->df_index_oldest)
+ without_retention++;
+ }
+ }
+ internal_error(true,
+ "DBENGINE: analyzed the retention of %zu rotated metrics of tier %d, "
+ "did %zu jv2 matching binary searches (%zu not matching, %zu overflown) in %u journal files, "
+ "%zu metrics with entries in open cache, "
+ "metrics first time found per datafile index ([not in jv2]:%zu, [1]:%zu, [2]:%zu, [3]:%zu, [4]:%zu, [5]:%zu, [6]:%zu, [7]:%zu, [8]:%zu, [bigger]: %zu), "
+ "open cache found first time %zu, "
+ "metrics without any remaining retention %zu, "
+ "metrics not in MRG %zu",
+ metric_count,
+ ctx->config.tier,
+ binary_match,
+ not_matching_bsearches,
+ not_needed_bsearches,
+ journalfile_count,
+ open_cache_count,
+ df_index[0], df_index[1], df_index[2], df_index[3], df_index[4], df_index[5], df_index[6], df_index[7], df_index[8], df_index[9],
+ open_cache_gave_first_time_s,
+ without_retention,
+ without_metric
+ );
+
+ return global_first_time_s;
+}
+
+static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile_to_delete, struct rrdengine_datafile *first_datafile_remaining, bool worker) {
+ time_t global_first_time_s = LONG_MAX;
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_FIND_ROTATED_METRICS);
+
+ struct rrdengine_journalfile *journalfile = datafile_to_delete->journalfile;
+ struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, NULL, 0, 0);
+
+ if (unlikely(!j2_header)) {
+ if (worker)
+ worker_is_idle();
+ return;
+ }
+
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.metrics_retention_started, 1, __ATOMIC_RELAXED);
+
+ struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset);
+
+ size_t count = j2_header->metric_count;
+ struct uuid_first_time_s *uuid_first_t_entry;
+ struct uuid_first_time_s *uuid_first_entry_list = callocz(count, sizeof(struct uuid_first_time_s));
+
+ size_t added = 0;
+ for (size_t index = 0; index < count; ++index) {
+ METRIC *metric = mrg_metric_get_and_acquire(main_mrg, &uuid_list[index].uuid, (Word_t) ctx);
+ if (!metric)
+ continue;
+
+ uuid_first_entry_list[added].metric = metric;
+ uuid_first_entry_list[added].first_time_s = LONG_MAX;
+ uuid_first_entry_list[added].df_matched = 0;
+ uuid_first_entry_list[added].df_index_oldest = 0;
+ uuid_first_entry_list[added].uuid = mrg_metric_uuid(main_mrg, metric);
+ added++;
+ }
+
+ netdata_log_info("DBENGINE: recalculating tier %d retention for %zu metrics starting with datafile %u",
+ ctx->config.tier, count, first_datafile_remaining->fileno);
+
+ journalfile_v2_data_release(journalfile);
+
+ // Update the first time of all metrics in the datafile we plan to delete, based on the remaining datafiles
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_FIND_REMAINING_RETENTION);
+
+ global_first_time_s = find_uuid_first_time(ctx, first_datafile_remaining, uuid_first_entry_list, added);
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG);
+
+ netdata_log_info("DBENGINE: updating tier %d metrics registry retention for %zu metrics",
+ ctx->config.tier, added);
+
+ size_t deleted_metrics = 0, zero_retention_referenced = 0, zero_disk_retention = 0, zero_disk_but_live = 0;
+ for (size_t index = 0; index < added; ++index) {
+ uuid_first_t_entry = &uuid_first_entry_list[index];
+ if (likely(uuid_first_t_entry->first_time_s != LONG_MAX)) {
+ mrg_metric_set_first_time_s_if_bigger(main_mrg, uuid_first_t_entry->metric, uuid_first_t_entry->first_time_s);
+ mrg_metric_release(main_mrg, uuid_first_t_entry->metric);
+ }
+ else {
+ zero_disk_retention++;
+
+ // there is no retention for this metric
+ bool has_retention = mrg_metric_zero_disk_retention(main_mrg, uuid_first_t_entry->metric);
+ if (!has_retention) {
+ bool deleted = mrg_metric_release_and_delete(main_mrg, uuid_first_t_entry->metric);
+ if(deleted)
+ deleted_metrics++;
+ else
+ zero_retention_referenced++;
+ }
+ else {
+ zero_disk_but_live++;
+ mrg_metric_release(main_mrg, uuid_first_t_entry->metric);
+ }
+ }
+ }
+ freez(uuid_first_entry_list);
+
+ internal_error(zero_disk_retention,
+ "DBENGINE: deleted %zu metrics, zero retention but referenced %zu (out of %zu total, of which %zu have main cache retention) zero on-disk retention tier %d metrics from metrics registry",
+ deleted_metrics, zero_retention_referenced, zero_disk_retention, zero_disk_but_live, ctx->config.tier);
+
+ if(global_first_time_s != LONG_MAX)
+ __atomic_store_n(&ctx->atomic.first_time_s, global_first_time_s, __ATOMIC_RELAXED);
+
+ if(worker)
+ worker_is_idle();
+}
+
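+// Delete a datafile and its journal files. Optionally recalculate the retention
+// (first time) of the metrics it contained, wait until no other users hold the
+// datafile, then remove it from the datafile list, unlink the files and release
+// the reclaimed disk space from the instance accounting.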
+void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker) {
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT);
+
+ bool datafile_got_for_deletion = datafile_acquire_for_deletion(datafile);
+
+ if (update_retention)
+ update_metrics_first_time_s(ctx, datafile, datafile->next, worker);
+
+ while (!datafile_got_for_deletion) {
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT);
+
+ datafile_got_for_deletion = datafile_acquire_for_deletion(datafile);
+
+ if (!datafile_got_for_deletion) {
+ netdata_log_info("DBENGINE: waiting for data file '%s/"
+ DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION
+ "' to be available for deletion, "
+                         "it is currently in use by %u users.",
+ ctx->config.dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno, datafile->users.lockers);
+
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_spin, 1, __ATOMIC_RELAXED);
+ sleep_usec(1 * USEC_PER_SEC);
+ }
+ }
+
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_started, 1, __ATOMIC_RELAXED);
+ netdata_log_info("DBENGINE: deleting data file '%s/"
+ DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION
+ "'.",
+ ctx->config.dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno);
+
+ if(worker)
+ worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE);
+
+ struct rrdengine_journalfile *journal_file;
+ unsigned deleted_bytes, journal_file_bytes, datafile_bytes;
+ int ret;
+ char path[RRDENG_PATH_MAX];
+
+ uv_rwlock_wrlock(&ctx->datafiles.rwlock);
+ datafile_list_delete_unsafe(ctx, datafile);
+ uv_rwlock_wrunlock(&ctx->datafiles.rwlock);
+
+ journal_file = datafile->journalfile;
+ datafile_bytes = datafile->pos;
+ journal_file_bytes = journalfile_current_size(journal_file);
+ deleted_bytes = journalfile_v2_data_size_get(journal_file);
+
+ netdata_log_info("DBENGINE: deleting data and journal files to maintain disk quota");
+ ret = journalfile_destroy_unsafe(journal_file, datafile);
+ if (!ret) {
+ journalfile_v1_generate_path(datafile, path, sizeof(path));
+ netdata_log_info("DBENGINE: deleted journal file \"%s\".", path);
+ journalfile_v2_generate_path(datafile, path, sizeof(path));
+ netdata_log_info("DBENGINE: deleted journal file \"%s\".", path);
+ deleted_bytes += journal_file_bytes;
+ }
+ ret = destroy_data_file_unsafe(datafile);
+ if (!ret) {
+ generate_datafilepath(datafile, path, sizeof(path));
+ netdata_log_info("DBENGINE: deleted data file \"%s\".", path);
+ deleted_bytes += datafile_bytes;
+ }
+ freez(journal_file);
+ freez(datafile);
+
+ ctx_current_disk_space_decrease(ctx, deleted_bytes);
+ netdata_log_info("DBENGINE: reclaimed %u bytes of disk space.", deleted_bytes);
+}
+
+static void *database_rotate_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ datafile_delete(ctx, ctx->datafiles.first, ctx_is_available_for_queries(ctx), true);
+
+ if (rrdeng_ctx_exceeded_disk_quota(ctx))
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+
+ rrdcontext_db_rotation();
+
+ return data;
+}
+
+static void after_flush_all_hot_and_dirty_pages_of_section(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ ;
+}
+
+static void *flush_all_hot_and_dirty_pages_of_section_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ worker_is_busy(UV_EVENT_DBENGINE_QUIESCE);
+ pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx);
+ completion_mark_complete(&ctx->quiesce.completion);
+ return data;
+}
+
+static void after_populate_mrg(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ ;
+}
+
+static void *populate_mrg_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG);
+
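+    // a note on the loop below: several of these workers may run concurrently; each one
+    // scans the datafile list, claims an unpopulated datafile with a trylock, loads the
+    // retention of its journal into the metrics registry (MRG), and repeats until no
+    // unpopulated datafile is left.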
+ do {
+ struct rrdengine_datafile *datafile = NULL;
+
+ // find a datafile to work
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) {
+ if(!spinlock_trylock(&datafile->populate_mrg.spinlock))
+ continue;
+
+ if(datafile->populate_mrg.populated) {
+ spinlock_unlock(&datafile->populate_mrg.spinlock);
+ continue;
+ }
+
+ // we have the spinlock and it is not populated
+ break;
+ }
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ if(!datafile)
+ break;
+
+ journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile);
+ datafile->populate_mrg.populated = true;
+ spinlock_unlock(&datafile->populate_mrg.spinlock);
+
+ } while(1);
+
+ completion_mark_complete(completion);
+
+ return data;
+}
+
+static void after_ctx_shutdown(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ ;
+}
+
+static void *ctx_shutdown_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ worker_is_busy(UV_EVENT_DBENGINE_SHUTDOWN);
+
+ bool logged = false;
+ while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED) ||
+ __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED)) {
+ if(!logged) {
+ logged = true;
+            netdata_log_info("DBENGINE: waiting for %zu inflight queries to finish before shutting down tier %d...",
+ __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED),
+ (ctx->config.legacy) ? -1 : ctx->config.tier);
+ }
+ sleep_usec(1 * USEC_PER_MS);
+ }
+
+ completion_mark_complete(completion);
+
+ return data;
+}
+
+static void *cache_flush_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ if (!main_cache)
+ return data;
+
+ worker_is_busy(UV_EVENT_DBENGINE_FLUSH_MAIN_CACHE);
+ pgc_flush_pages(main_cache, 0);
+
+ return data;
+}
+
+static void *cache_evict_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) {
+ if (!main_cache)
+ return data;
+
+ worker_is_busy(UV_EVENT_DBENGINE_EVICT_MAIN_CACHE);
+ pgc_evict_pages(main_cache, 0, 0);
+
+ return data;
+}
+
+static void *query_prep_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) {
+ PDC *pdc = data;
+ rrdeng_prep_query(pdc, true);
+ return data;
+}
+
+uint64_t rrdeng_target_data_file_size(struct rrdengine_instance *ctx) {
+ uint64_t target_size = ctx->config.max_disk_space / TARGET_DATAFILES;
+ target_size = MIN(target_size, MAX_DATAFILE_SIZE);
+ target_size = MAX(target_size, MIN_DATAFILE_SIZE);
+ return target_size;
+}
+
+bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx)
+{
+ if(!ctx->datafiles.first)
+ // no datafiles available
+ return false;
+
+ if(!ctx->datafiles.first->next)
+ // only 1 datafile available
+ return false;
+
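+    // a sketch of the estimate below: project what disk usage will be once the newest
+    // datafile (datafiles.first->prev, assumed to be the list tail) grows to the full
+    // target size, i.e. current usage plus one target-sized datafile, minus the space the
+    // newest datafile already occupies.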
+ uint64_t estimated_disk_space = ctx_current_disk_space_get(ctx) + rrdeng_target_data_file_size(ctx) -
+ (ctx->datafiles.first->prev ? ctx->datafiles.first->prev->pos : 0);
+
+ return estimated_disk_space > ctx->config.max_disk_space;
+}
+
+/* return 0 on success */
+int init_rrd_files(struct rrdengine_instance *ctx)
+{
+ return init_data_files(ctx);
+}
+
+void finalize_rrd_files(struct rrdengine_instance *ctx)
+{
+ return finalize_data_files(ctx);
+}
+
+void async_cb(uv_async_t *handle)
+{
+ uv_stop(handle->loop);
+ uv_update_time(handle->loop);
+ netdata_log_debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle));
+}
+
+#define TIMER_PERIOD_MS (1000)
+
+
+static void *extent_read_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ EPDL *epdl = data;
+ epdl_find_extent_and_populate_pages(ctx, epdl, true);
+ return data;
+}
+
+static void epdl_populate_pages_asynchronously(struct rrdengine_instance *ctx, EPDL *epdl, STORAGE_PRIORITY priority) {
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_EXTENT_READ, epdl, NULL, priority,
+ rrdeng_enqueue_epdl_cmd, rrdeng_dequeue_epdl_cmd);
+}
+
+void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc) {
+ pdc_to_epdl_router(ctx, pdc, epdl_populate_pages_asynchronously, epdl_populate_pages_asynchronously);
+}
+
+void epdl_populate_pages_synchronously(struct rrdengine_instance *ctx, EPDL *epdl, enum storage_priority priority __maybe_unused) {
+ epdl_find_extent_and_populate_pages(ctx, epdl, false);
+}
+
+void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc) {
+ pdc_to_epdl_router(ctx, pdc, epdl_populate_pages_synchronously, epdl_populate_pages_synchronously);
+}
+
+#define MAX_RETRIES_TO_START_INDEX (100)
+static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ unsigned count = 0;
+ worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX_WAIT);
+
+ while (__atomic_load_n(&ctx->atomic.now_deleting_files, __ATOMIC_RELAXED) && count++ < MAX_RETRIES_TO_START_INDEX)
+ sleep_usec(100 * USEC_PER_MS);
+
+    if (count > MAX_RETRIES_TO_START_INDEX) { // the post-incremented counter exceeds the limit only when we timed out
+ worker_is_idle();
+ return data;
+ }
+
+ struct rrdengine_datafile *datafile = ctx->datafiles.first;
+ worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX);
+ count = 0;
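+    // walk the datafile list oldest to newest, stopping before the most recent files (the
+    // one currently being written and the one whose data is still being flushed); files
+    // that already have a v2 journal or still have writers on them are skipped.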
+ while (datafile && datafile->fileno != ctx_last_fileno_get(ctx) && datafile->fileno != ctx_last_flush_fileno_get(ctx)) {
+ if(journalfile_v2_data_available(datafile->journalfile)) {
+ // journal file v2 is already there for this datafile
+ datafile = datafile->next;
+ continue;
+ }
+
+ spinlock_lock(&datafile->writers.spinlock);
+        bool available = !(datafile->writers.running || datafile->writers.flushed_to_open_running);
+ spinlock_unlock(&datafile->writers.spinlock);
+
+ if(!available) {
+ nd_log(NDLS_DAEMON, NDLP_NOTICE,
+ "DBENGINE: journal file %u needs to be indexed, but it has writers working on it - "
+ "skipping it for now",
+ datafile->fileno);
+
+ datafile = datafile->next;
+ continue;
+ }
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "DBENGINE: journal file %u is ready to be indexed",
+ datafile->fileno);
+
+ pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type,
+ journalfile_migrate_to_v2_callback, (void *) datafile->journalfile);
+
+ count++;
+
+ datafile = datafile->next;
+
+ if (unlikely(!ctx_is_available_for_queries(ctx)))
+ break;
+ }
+
+ errno = 0;
+ if(count)
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "DBENGINE: journal indexing done; %u files processed",
+ count);
+
+ worker_is_idle();
+
+ return data;
+}
+
+static void after_do_cache_flush(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ rrdeng_main.flushes_running--;
+}
+
+static void after_do_cache_evict(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ rrdeng_main.evictions_running--;
+}
+
+static void after_journal_v2_indexing(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ __atomic_store_n(&ctx->atomic.migration_to_v2_running, false, __ATOMIC_RELAXED);
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+}
+
+struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void) {
+ return (struct rrdeng_buffer_sizes) {
+ .pgc = pgc_aral_overhead() + pgc_aral_structures(),
+ .mrg = mrg_aral_overhead() + mrg_aral_structures(),
+ .opcodes = aral_overhead(rrdeng_main.cmd_queue.ar) + aral_structures(rrdeng_main.cmd_queue.ar),
+ .handles = aral_overhead(rrdeng_main.handles.ar) + aral_structures(rrdeng_main.handles.ar),
+ .descriptors = aral_overhead(rrdeng_main.descriptors.ar) + aral_structures(rrdeng_main.descriptors.ar),
+ .wal = __atomic_load_n(&wal_globals.atomics.allocated, __ATOMIC_RELAXED) * (sizeof(WAL) + RRDENG_BLOCK_SIZE),
+ .workers = aral_overhead(rrdeng_main.work_cmd.ar),
+ .pdc = pdc_cache_size(),
+ .xt_io = aral_overhead(rrdeng_main.xt_io_descr.ar) + aral_structures(rrdeng_main.xt_io_descr.ar),
+ .xt_buf = extent_buffer_cache_size(),
+ .epdl = epdl_cache_size(),
+ .deol = deol_cache_size(),
+ .pd = pd_cache_size(),
+
+#ifdef PDC_USE_JULYL
+ .julyl = julyl_cache_size(),
+#endif
+ };
+}
+
+static void after_cleanup(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) {
+ rrdeng_main.cleanup_running--;
+}
+
+static void *cleanup_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) {
+ worker_is_busy(UV_EVENT_DBENGINE_BUFFERS_CLEANUP);
+
+ wal_cleanup1();
+ extent_buffer_cleanup1();
+
+ {
+ static time_t last_run_s = 0;
+ time_t now_s = now_monotonic_sec();
+ if(now_s - last_run_s >= 10) {
+ last_run_s = now_s;
+ journalfile_v2_data_unmount_cleanup(now_s);
+ }
+ }
+
+#ifdef PDC_USE_JULYL
+ julyl_cleanup1();
+#endif
+
+ return data;
+}
+
+void timer_cb(uv_timer_t* handle) {
+ worker_is_busy(RRDENG_TIMER_CB);
+ uv_stop(handle->loop);
+ uv_update_time(handle->loop);
+
+ worker_set_metric(RRDENG_OPCODES_WAITING, (NETDATA_DOUBLE)rrdeng_main.cmd_queue.unsafe.waiting);
+ worker_set_metric(RRDENG_WORKS_DISPATCHED, (NETDATA_DOUBLE)__atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED));
+ worker_set_metric(RRDENG_WORKS_EXECUTING, (NETDATA_DOUBLE)__atomic_load_n(&rrdeng_main.work_cmd.atomics.executing, __ATOMIC_RELAXED));
+
+ rrdeng_enq_cmd(NULL, RRDENG_OPCODE_FLUSH_INIT, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+ rrdeng_enq_cmd(NULL, RRDENG_OPCODE_EVICT_INIT, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+ rrdeng_enq_cmd(NULL, RRDENG_OPCODE_CLEANUP, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+
+ worker_is_idle();
+}
+
+static void dbengine_initialize_structures(void) {
+ pgc_and_mrg_initialize();
+
+ pdc_init();
+ page_details_init();
+ epdl_init();
+ deol_init();
+ rrdeng_cmd_queue_init();
+ work_request_init();
+ rrdeng_query_handle_init();
+ page_descriptors_init();
+ extent_buffer_init();
+ pgd_init_arals();
+ extent_io_descriptor_init();
+}
+
+bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) {
+ static bool spawned = false;
+ static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER;
+
+ spinlock_lock(&spinlock);
+
+ if(!spawned) {
+ int ret;
+
+ ret = uv_loop_init(&rrdeng_main.loop);
+ if (ret) {
+ netdata_log_error("DBENGINE: uv_loop_init(): %s", uv_strerror(ret));
+ return false;
+ }
+ rrdeng_main.loop.data = &rrdeng_main;
+
+ ret = uv_async_init(&rrdeng_main.loop, &rrdeng_main.async, async_cb);
+ if (ret) {
+ netdata_log_error("DBENGINE: uv_async_init(): %s", uv_strerror(ret));
+ fatal_assert(0 == uv_loop_close(&rrdeng_main.loop));
+ return false;
+ }
+ rrdeng_main.async.data = &rrdeng_main;
+
+ ret = uv_timer_init(&rrdeng_main.loop, &rrdeng_main.timer);
+ if (ret) {
+ netdata_log_error("DBENGINE: uv_timer_init(): %s", uv_strerror(ret));
+ uv_close((uv_handle_t *)&rrdeng_main.async, NULL);
+ fatal_assert(0 == uv_loop_close(&rrdeng_main.loop));
+ return false;
+ }
+ rrdeng_main.timer.data = &rrdeng_main;
+
+ dbengine_initialize_structures();
+
+ fatal_assert(0 == uv_thread_create(&rrdeng_main.thread, dbengine_event_loop, &rrdeng_main));
+ spawned = true;
+ }
+
+ spinlock_unlock(&spinlock);
+ return true;
+}
+
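+// when from_worker is true, the dispatchers below are already running on a worker thread,
+// so the work is executed inline; otherwise it is dispatched as an asynchronous work request.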
+static inline void worker_dispatch_extent_read(struct rrdeng_cmd cmd, bool from_worker) {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ EPDL *epdl = cmd.data;
+
+ if(from_worker)
+ epdl_find_extent_and_populate_pages(ctx, epdl, true);
+ else
+ work_dispatch(ctx, epdl, NULL, cmd.opcode, extent_read_tp_worker, NULL);
+}
+
+static inline void worker_dispatch_query_prep(struct rrdeng_cmd cmd, bool from_worker) {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ PDC *pdc = cmd.data;
+
+ if(from_worker)
+ rrdeng_prep_query(pdc, true);
+ else
+ work_dispatch(ctx, pdc, NULL, cmd.opcode, query_prep_tp_worker, NULL);
+}
+
+void dbengine_event_loop(void* arg) {
+ sanity_check();
+ uv_thread_set_name_np(pthread_self(), "DBENGINE");
+ service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true);
+
+ worker_register("DBENGINE");
+
+ // opcode jobs
+ worker_register_job_name(RRDENG_OPCODE_NOOP, "noop");
+
+ worker_register_job_name(RRDENG_OPCODE_QUERY, "query");
+ worker_register_job_name(RRDENG_OPCODE_EXTENT_WRITE, "extent write");
+ worker_register_job_name(RRDENG_OPCODE_EXTENT_READ, "extent read");
+ worker_register_job_name(RRDENG_OPCODE_FLUSHED_TO_OPEN, "flushed to open");
+ worker_register_job_name(RRDENG_OPCODE_DATABASE_ROTATE, "db rotate");
+ worker_register_job_name(RRDENG_OPCODE_JOURNAL_INDEX, "journal index");
+ worker_register_job_name(RRDENG_OPCODE_FLUSH_INIT, "flush init");
+ worker_register_job_name(RRDENG_OPCODE_EVICT_INIT, "evict init");
+ worker_register_job_name(RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown");
+ worker_register_job_name(RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce");
+ worker_register_job_name(RRDENG_OPCODE_SHUTDOWN_EVLOOP, "dbengine shutdown");
+
+ worker_register_job_name(RRDENG_OPCODE_MAX, "get opcode");
+
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_QUERY, "query cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_WRITE, "extent write cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_READ, "extent read cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_FLUSHED_TO_OPEN, "flushed to open cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_DATABASE_ROTATE, "db rotate cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_JOURNAL_INDEX, "journal index cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_FLUSH_INIT, "flush init cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EVICT_INIT, "evict init cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown cb");
+ worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce cb");
+
+ // special jobs
+ worker_register_job_name(RRDENG_TIMER_CB, "timer");
+ worker_register_job_name(RRDENG_FLUSH_TRANSACTION_BUFFER_CB, "transaction buffer flush cb");
+
+ worker_register_job_custom_metric(RRDENG_OPCODES_WAITING, "opcodes waiting", "opcodes", WORKER_METRIC_ABSOLUTE);
+ worker_register_job_custom_metric(RRDENG_WORKS_DISPATCHED, "works dispatched", "works", WORKER_METRIC_ABSOLUTE);
+ worker_register_job_custom_metric(RRDENG_WORKS_EXECUTING, "works executing", "works", WORKER_METRIC_ABSOLUTE);
+
+ struct rrdeng_main *main = arg;
+ enum rrdeng_opcode opcode;
+ struct rrdeng_cmd cmd;
+ main->tid = gettid();
+
+ fatal_assert(0 == uv_timer_start(&main->timer, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS));
+
+ bool shutdown = false;
+ while (likely(!shutdown)) {
+ worker_is_idle();
+ uv_run(&main->loop, UV_RUN_DEFAULT);
+
+ /* wait for commands */
+ do {
+ worker_is_busy(RRDENG_OPCODE_MAX);
+ cmd = rrdeng_deq_cmd(RRDENG_OPCODE_NOOP);
+ opcode = cmd.opcode;
+
+ worker_is_busy(opcode);
+
+ switch (opcode) {
+ case RRDENG_OPCODE_EXTENT_READ:
+ worker_dispatch_extent_read(cmd, false);
+ break;
+
+ case RRDENG_OPCODE_QUERY:
+ worker_dispatch_query_prep(cmd, false);
+ break;
+
+ case RRDENG_OPCODE_EXTENT_WRITE: {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ struct page_descr_with_data *base = cmd.data;
+ struct completion *completion = cmd.completion; // optional
+ work_dispatch(ctx, base, completion, opcode, extent_write_tp_worker, after_extent_write);
+ break;
+ }
+
+ case RRDENG_OPCODE_FLUSHED_TO_OPEN: {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ uv_fs_t *uv_fs_request = cmd.data;
+ struct extent_io_descriptor *xt_io_descr = uv_fs_request->data;
+ struct completion *completion = xt_io_descr->completion;
+ work_dispatch(ctx, uv_fs_request, completion, opcode, extent_flushed_to_open_tp_worker, after_extent_flushed_to_open);
+ break;
+ }
+
+ case RRDENG_OPCODE_FLUSH_INIT: {
+ if(rrdeng_main.flushes_running < (size_t)(libuv_worker_threads / 4)) {
+ rrdeng_main.flushes_running++;
+ work_dispatch(NULL, NULL, NULL, opcode, cache_flush_tp_worker, after_do_cache_flush);
+ }
+ break;
+ }
+
+ case RRDENG_OPCODE_EVICT_INIT: {
+ if(!rrdeng_main.evictions_running) {
+ rrdeng_main.evictions_running++;
+ work_dispatch(NULL, NULL, NULL, opcode, cache_evict_tp_worker, after_do_cache_evict);
+ }
+ break;
+ }
+
+ case RRDENG_OPCODE_CLEANUP: {
+ if(!rrdeng_main.cleanup_running) {
+ rrdeng_main.cleanup_running++;
+ work_dispatch(NULL, NULL, NULL, opcode, cleanup_tp_worker, after_cleanup);
+ }
+ break;
+ }
+
+ case RRDENG_OPCODE_JOURNAL_INDEX: {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ struct rrdengine_datafile *datafile = cmd.data;
+ if(!__atomic_load_n(&ctx->atomic.migration_to_v2_running, __ATOMIC_RELAXED)) {
+
+ __atomic_store_n(&ctx->atomic.migration_to_v2_running, true, __ATOMIC_RELAXED);
+ work_dispatch(ctx, datafile, NULL, opcode, journal_v2_indexing_tp_worker, after_journal_v2_indexing);
+ }
+ break;
+ }
+
+ case RRDENG_OPCODE_DATABASE_ROTATE: {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ if (!__atomic_load_n(&ctx->atomic.now_deleting_files, __ATOMIC_RELAXED) &&
+ ctx->datafiles.first->next != NULL &&
+ ctx->datafiles.first->next->next != NULL &&
+ rrdeng_ctx_exceeded_disk_quota(ctx)) {
+
+ __atomic_store_n(&ctx->atomic.now_deleting_files, true, __ATOMIC_RELAXED);
+ work_dispatch(ctx, NULL, NULL, opcode, database_rotate_tp_worker, after_database_rotate);
+ }
+ break;
+ }
+
+ case RRDENG_OPCODE_CTX_POPULATE_MRG: {
+ struct rrdengine_instance *ctx = cmd.ctx;
+ struct completion *completion = cmd.completion;
+ work_dispatch(ctx, NULL, completion, opcode, populate_mrg_tp_worker, after_populate_mrg);
+ break;
+ }
+
+ case RRDENG_OPCODE_CTX_QUIESCE: {
+ // a ctx will shutdown shortly
+ struct rrdengine_instance *ctx = cmd.ctx;
+ __atomic_store_n(&ctx->quiesce.enabled, true, __ATOMIC_RELEASE);
+ work_dispatch(ctx, NULL, NULL, opcode,
+ flush_all_hot_and_dirty_pages_of_section_tp_worker,
+ after_flush_all_hot_and_dirty_pages_of_section);
+ break;
+ }
+
+ case RRDENG_OPCODE_CTX_SHUTDOWN: {
+ // a ctx is shutting down
+ struct rrdengine_instance *ctx = cmd.ctx;
+ struct completion *completion = cmd.completion;
+ work_dispatch(ctx, NULL, completion, opcode, ctx_shutdown_tp_worker, after_ctx_shutdown);
+ break;
+ }
+
+ case RRDENG_OPCODE_SHUTDOWN_EVLOOP: {
+ uv_close((uv_handle_t *)&main->async, NULL);
+ (void) uv_timer_stop(&main->timer);
+ uv_close((uv_handle_t *)&main->timer, NULL);
+                    shutdown = true;
+                    break;
+                }
+
+ case RRDENG_OPCODE_NOOP: {
+ /* the command queue was empty, do nothing */
+ break;
+ }
+
+ // not opcodes
+ case RRDENG_OPCODE_MAX:
+ default: {
+ internal_fatal(true, "DBENGINE: unknown opcode");
+ break;
+ }
+ }
+
+ } while (opcode != RRDENG_OPCODE_NOOP);
+ }
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG, "Shutting down dbengine thread");
+ uv_loop_close(&main->loop);
+ worker_unregister();
+}
diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h
new file mode 100644
index 00000000..cd3352f1
--- /dev/null
+++ b/database/engine/rrdengine.h
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_RRDENGINE_H
+#define NETDATA_RRDENGINE_H
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fcntl.h>
+#include <lz4.h>
+#include <Judy.h>
+#include <openssl/sha.h>
+#include <openssl/evp.h>
+#include "daemon/common.h"
+#include "../rrd.h"
+#include "rrddiskprotocol.h"
+#include "rrdenginelib.h"
+#include "datafile.h"
+#include "journalfile.h"
+#include "rrdengineapi.h"
+#include "pagecache.h"
+#include "metric.h"
+#include "cache.h"
+#include "pdc.h"
+#include "page.h"
+
+extern unsigned rrdeng_pages_per_extent;
+
+/* Forward declarations */
+struct rrdengine_instance;
+struct rrdeng_cmd;
+
+#define MAX_PAGES_PER_EXTENT (64) /* TODO: can go higher only when journal supports bigger than 4KiB transactions */
+
+#define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u"
+#define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u"
+
+typedef enum __attribute__ ((__packed__)) {
+ // final status for all pages
+ // if a page does not have one of these, it is considered unroutable
+ PDC_PAGE_READY = (1 << 0), // ready to be processed (pd->page is not null)
+ PDC_PAGE_FAILED = (1 << 1), // failed to be loaded (pd->page is null)
+ PDC_PAGE_SKIP = (1 << 2), // don't use this page, it is not good for us
+ PDC_PAGE_INVALID = (1 << 3), // don't use this page, it is invalid
+ PDC_PAGE_EMPTY = (1 << 4), // the page is empty, does not have any data
+
+ // other statuses for tracking issues
+ PDC_PAGE_PREPROCESSED = (1 << 5), // used during preprocessing
+ PDC_PAGE_PROCESSED = (1 << 6), // processed by the query caller
+ PDC_PAGE_RELEASED = (1 << 7), // already released
+
+ // data found in cache (preloaded) or on disk?
+ PDC_PAGE_PRELOADED = (1 << 8), // data found in memory
+ PDC_PAGE_DISK_PENDING = (1 << 9), // data need to be loaded from disk
+
+ // worker related statuses
+ PDC_PAGE_FAILED_INVALID_EXTENT = (1 << 10),
+ PDC_PAGE_FAILED_NOT_IN_EXTENT = (1 << 11),
+ PDC_PAGE_FAILED_TO_MAP_EXTENT = (1 << 12),
+ PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE= (1 << 13),
+
+ PDC_PAGE_EXTENT_FROM_CACHE = (1 << 14),
+ PDC_PAGE_EXTENT_FROM_DISK = (1 << 15),
+
+    PDC_PAGE_CANCELLED = (1 << 16), // the query thread had already gone by the time we tried to load the page
+
+ PDC_PAGE_SOURCE_MAIN_CACHE = (1 << 17),
+ PDC_PAGE_SOURCE_OPEN_CACHE = (1 << 18),
+ PDC_PAGE_SOURCE_JOURNAL_V2 = (1 << 19),
+ PDC_PAGE_PRELOADED_PASS4 = (1 << 20),
+
+ // datafile acquired
+ PDC_PAGE_DATAFILE_ACQUIRED = (1 << 30),
+} PDC_PAGE_STATUS;
+
+#define PDC_PAGE_QUERY_GLOBAL_SKIP_LIST (PDC_PAGE_FAILED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_RELEASED)
+
+typedef struct page_details_control {
+ struct rrdengine_instance *ctx;
+ struct metric *metric;
+
+ struct completion prep_completion;
+ struct completion page_completion; // sync between the query thread and the workers
+
+ Pvoid_t page_list_JudyL; // the list of page details
+ unsigned completed_jobs; // the number of jobs completed last time the query thread checked
+ bool workers_should_stop; // true when the query thread left and the workers should stop
+ bool prep_done;
+
+ PDC_PAGE_STATUS common_status;
+ size_t pages_to_load_from_disk;
+
+ SPINLOCK refcount_spinlock; // spinlock to protect refcount
+ int32_t refcount; // the number of workers currently working on this request + 1 for the query thread
+ size_t executed_with_gaps;
+
+ time_t start_time_s;
+ time_t end_time_s;
+ STORAGE_PRIORITY priority;
+
+ time_t optimal_end_time_s;
+} PDC;
+
+PDC *pdc_get(void);
+
+struct page_details {
+ struct {
+ struct rrdengine_datafile *ptr;
+ uv_file file;
+ unsigned fileno;
+
+ struct {
+ uint64_t pos;
+ uint32_t bytes;
+ } extent;
+ } datafile;
+
+ struct pgc_page *page;
+ Word_t metric_id;
+ time_t first_time_s;
+ time_t last_time_s;
+ uint32_t update_every_s;
+ PDC_PAGE_STATUS status;
+
+ struct {
+ struct page_details *prev;
+ struct page_details *next;
+ } load;
+};
+
+struct page_details *page_details_get(void);
+
+#define pdc_page_status_check(pd, flag) (__atomic_load_n(&((pd)->status), __ATOMIC_ACQUIRE) & (flag))
+#define pdc_page_status_set(pd, flag) __atomic_or_fetch(&((pd)->status), flag, __ATOMIC_RELEASE)
+#define pdc_page_status_clear(pd, flag) __atomic_and_fetch(&((pd)->status), ~(flag), __ATOMIC_RELEASE)
+
+struct jv2_extents_info {
+ size_t index;
+ uint64_t pos;
+ unsigned bytes;
+ size_t number_of_pages;
+};
+
+struct jv2_metrics_info {
+ uuid_t *uuid;
+ uint32_t page_list_header;
+ time_t first_time_s;
+ time_t last_time_s;
+ size_t number_of_pages;
+ Pvoid_t JudyL_pages_by_start_time;
+};
+
+struct jv2_page_info {
+ time_t start_time_s;
+ time_t end_time_s;
+ time_t update_every_s;
+ size_t page_length;
+ uint32_t extent_index;
+ void *custom_data;
+
+ // private
+ struct pgc_page *page;
+};
+
+typedef enum __attribute__ ((__packed__)) {
+ RRDENG_1ST_METRIC_WRITER = (1 << 0),
+} RRDENG_COLLECT_HANDLE_OPTIONS;
+
+typedef enum __attribute__ ((__packed__)) {
+ RRDENG_PAGE_PAST_COLLECTION = (1 << 0),
+ RRDENG_PAGE_REPEATED_COLLECTION = (1 << 1),
+ RRDENG_PAGE_BIG_GAP = (1 << 2),
+ RRDENG_PAGE_GAP = (1 << 3),
+ RRDENG_PAGE_FUTURE_POINT = (1 << 4),
+ RRDENG_PAGE_CREATED_IN_FUTURE = (1 << 5),
+ RRDENG_PAGE_COMPLETED_IN_FUTURE = (1 << 6),
+ RRDENG_PAGE_UNALIGNED = (1 << 7),
+ RRDENG_PAGE_CONFLICT = (1 << 8),
+ RRDENG_PAGE_FULL = (1 << 9),
+ RRDENG_PAGE_COLLECT_FINALIZE = (1 << 10),
+ RRDENG_PAGE_UPDATE_EVERY_CHANGE = (1 << 11),
+ RRDENG_PAGE_STEP_TOO_SMALL = (1 << 12),
+ RRDENG_PAGE_STEP_UNALIGNED = (1 << 13),
+} RRDENG_COLLECT_PAGE_FLAGS;
+
+struct rrdeng_collect_handle {
+ struct storage_collect_handle common; // has to be first item
+
+ RRDENG_COLLECT_PAGE_FLAGS page_flags;
+ RRDENG_COLLECT_HANDLE_OPTIONS options;
+ uint8_t type;
+
+ struct rrdengine_instance *ctx;
+ struct metric *metric;
+ struct pgc_page *pgc_page;
+ struct pgd *page_data;
+ size_t page_data_size;
+ struct pg_alignment *alignment;
+ uint32_t page_entries_max;
+ uint32_t page_position; // keep track of the current page size, to make sure we don't exceed it
+ usec_t page_start_time_ut;
+ usec_t page_end_time_ut;
+ usec_t update_every_ut;
+};
+
+struct rrdeng_query_handle {
+ struct metric *metric;
+ struct pgc_page *page;
+ struct rrdengine_instance *ctx;
+ struct pgd_cursor pgdc;
+ struct page_details_control *pdc;
+
+ // the request
+ time_t start_time_s;
+ time_t end_time_s;
+ STORAGE_PRIORITY priority;
+
+ // internal data
+ time_t now_s;
+ time_t dt_s;
+
+ unsigned position;
+ unsigned entries;
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ usec_t started_time_s;
+ pid_t query_pid;
+ struct rrdeng_query_handle *prev, *next;
+#endif
+};
+
+struct rrdeng_query_handle *rrdeng_query_handle_get(void);
+void rrdeng_query_handle_release(struct rrdeng_query_handle *handle);
+
+enum rrdeng_opcode {
+ /* can be used to return empty status or flush the command queue */
+ RRDENG_OPCODE_NOOP = 0,
+
+ RRDENG_OPCODE_QUERY,
+ RRDENG_OPCODE_EXTENT_WRITE,
+ RRDENG_OPCODE_EXTENT_READ,
+ RRDENG_OPCODE_FLUSHED_TO_OPEN,
+ RRDENG_OPCODE_DATABASE_ROTATE,
+ RRDENG_OPCODE_JOURNAL_INDEX,
+ RRDENG_OPCODE_FLUSH_INIT,
+ RRDENG_OPCODE_EVICT_INIT,
+ RRDENG_OPCODE_CTX_SHUTDOWN,
+ RRDENG_OPCODE_CTX_QUIESCE,
+ RRDENG_OPCODE_CTX_POPULATE_MRG,
+ RRDENG_OPCODE_SHUTDOWN_EVLOOP,
+ RRDENG_OPCODE_CLEANUP,
+
+ RRDENG_OPCODE_MAX
+};
+
+// WORKERS IDS:
+// RRDENG_OPCODE_MAX : reserved for the cleanup
+// RRDENG_OPCODE_MAX + opcode : reserved for the callbacks of each opcode
+// RRDENG_OPCODE_MAX + RRDENG_OPCODE_MAX : reserved for the timer
+#define RRDENG_TIMER_CB (RRDENG_OPCODE_MAX + RRDENG_OPCODE_MAX)
+#define RRDENG_FLUSH_TRANSACTION_BUFFER_CB (RRDENG_TIMER_CB + 1)
+#define RRDENG_OPCODES_WAITING (RRDENG_TIMER_CB + 2)
+#define RRDENG_WORKS_DISPATCHED (RRDENG_TIMER_CB + 3)
+#define RRDENG_WORKS_EXECUTING (RRDENG_TIMER_CB + 4)
+
+struct extent_io_data {
+ unsigned fileno;
+ uv_file file;
+ uint64_t pos;
+ unsigned bytes;
+ uint16_t page_length;
+};
+
+struct extent_io_descriptor {
+ struct rrdengine_instance *ctx;
+ uv_fs_t uv_fs_request;
+ uv_buf_t iov;
+ uv_file file;
+ void *buf;
+ struct wal *wal;
+ uint64_t pos;
+ unsigned bytes;
+ struct completion *completion;
+ unsigned descr_count;
+ struct page_descr_with_data *descr_array[MAX_PAGES_PER_EXTENT];
+ struct rrdengine_datafile *datafile;
+ struct extent_io_descriptor *next; /* multiple requests to be served by the same cached extent */
+};
+
+struct generic_io_descriptor {
+ struct rrdengine_instance *ctx;
+ uv_fs_t req;
+ uv_buf_t iov;
+ void *buf;
+ void *data;
+ uint64_t pos;
+ unsigned bytes;
+ struct completion *completion;
+};
+
+typedef struct wal {
+ uint64_t transaction_id;
+ void *buf;
+ size_t size;
+ size_t buf_size;
+ struct generic_io_descriptor io_descr;
+
+ struct {
+ struct wal *prev;
+ struct wal *next;
+ } cache;
+} WAL;
+
+WAL *wal_get(struct rrdengine_instance *ctx, unsigned size);
+void wal_release(WAL *wal);
+
+/*
+ * Debug statistics not used by code logic.
+ * They only describe operations since DB engine instance load time.
+ */
+struct rrdengine_statistics {
+ rrdeng_stats_t before_decompress_bytes;
+ rrdeng_stats_t after_decompress_bytes;
+ rrdeng_stats_t before_compress_bytes;
+ rrdeng_stats_t after_compress_bytes;
+
+ rrdeng_stats_t io_write_bytes;
+ rrdeng_stats_t io_write_requests;
+ rrdeng_stats_t io_read_bytes;
+ rrdeng_stats_t io_read_requests;
+
+ rrdeng_stats_t datafile_creations;
+ rrdeng_stats_t datafile_deletions;
+ rrdeng_stats_t journalfile_creations;
+ rrdeng_stats_t journalfile_deletions;
+
+ rrdeng_stats_t io_errors;
+ rrdeng_stats_t fs_errors;
+};
+
+/* I/O errors global counter */
+extern rrdeng_stats_t global_io_errors;
+/* File-System errors global counter */
+extern rrdeng_stats_t global_fs_errors;
+/* number of File-Descriptors that have been reserved by dbengine */
+extern rrdeng_stats_t rrdeng_reserved_file_descriptors;
+/* inability to flush global counters */
+extern rrdeng_stats_t global_pg_cache_over_half_dirty_events;
+extern rrdeng_stats_t global_flushing_pressure_page_deletions; /* number of deleted pages */
+
+struct rrdengine_instance {
+ struct {
+ bool legacy; // true when the db is autonomous for a single host
+
+ int tier; // the tier of this ctx
+ uint8_t page_type; // default page type for this context
+
+ uint64_t max_disk_space; // the max disk space this ctx is allowed to use
+ uint8_t global_compress_alg; // the wanted compression algorithm
+
+ char dbfiles_path[FILENAME_MAX + 1];
+ } config;
+
+ struct {
+ uv_rwlock_t rwlock; // the linked list of datafiles is protected by this lock
+ struct rrdengine_datafile *first; // oldest - the newest with ->first->prev
+ } datafiles;
+
+ struct {
+ RW_SPINLOCK spinlock;
+ Pvoid_t JudyL;
+ } njfv2idx;
+
+ struct {
+ unsigned last_fileno; // newest index of datafile and journalfile
+ unsigned last_flush_fileno; // newest index of datafile received data
+
+ size_t collectors_running;
+ size_t collectors_running_duplicate;
+ size_t inflight_queries; // the number of queries currently running
+ uint64_t current_disk_space; // the current disk space size used
+
+ uint64_t transaction_id; // the transaction id of the next extent flushing
+
+ bool migration_to_v2_running;
+ bool now_deleting_files;
+ unsigned extents_currently_being_flushed; // non-zero until we commit data to disk (both datafile and journal file)
+
+ time_t first_time_s;
+ } atomic;
+
+ struct {
+ bool exit_mode;
+ bool enabled; // when set (before shutdown), queries are prohibited
+ struct completion completion;
+ } quiesce;
+
+ struct {
+ struct {
+ size_t size;
+ struct completion *array;
+ } populate_mrg;
+
+ bool create_new_datafile_pair;
+ } loading;
+
+ struct rrdengine_statistics stats;
+};
+
+#define ctx_current_disk_space_get(ctx) __atomic_load_n(&(ctx)->atomic.current_disk_space, __ATOMIC_RELAXED)
+#define ctx_current_disk_space_increase(ctx, size) __atomic_add_fetch(&(ctx)->atomic.current_disk_space, size, __ATOMIC_RELAXED)
+#define ctx_current_disk_space_decrease(ctx, size) __atomic_sub_fetch(&(ctx)->atomic.current_disk_space, size, __ATOMIC_RELAXED)
+
+static inline void ctx_io_read_op_bytes(struct rrdengine_instance *ctx, size_t bytes) {
+ __atomic_add_fetch(&ctx->stats.io_read_bytes, bytes, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ctx->stats.io_read_requests, 1, __ATOMIC_RELAXED);
+}
+
+static inline void ctx_io_write_op_bytes(struct rrdengine_instance *ctx, size_t bytes) {
+ __atomic_add_fetch(&ctx->stats.io_write_bytes, bytes, __ATOMIC_RELAXED);
+ __atomic_add_fetch(&ctx->stats.io_write_requests, 1, __ATOMIC_RELAXED);
+}
+
+static inline void ctx_io_error(struct rrdengine_instance *ctx) {
+ __atomic_add_fetch(&ctx->stats.io_errors, 1, __ATOMIC_RELAXED);
+ rrd_stat_atomic_add(&global_io_errors, 1);
+}
+
+static inline void ctx_fs_error(struct rrdengine_instance *ctx) {
+ __atomic_add_fetch(&ctx->stats.fs_errors, 1, __ATOMIC_RELAXED);
+ rrd_stat_atomic_add(&global_fs_errors, 1);
+}
+
+#define ctx_last_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_fileno, __ATOMIC_RELAXED)
+#define ctx_last_fileno_increment(ctx) __atomic_add_fetch(&(ctx)->atomic.last_fileno, 1, __ATOMIC_RELAXED)
+
+#define ctx_last_flush_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_flush_fileno, __ATOMIC_RELAXED)
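+// advance last_flush_fileno monotonically: the CAS loop below retries only while no other
+// thread has already moved it to a value greater than or equal to fileno.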
+static inline void ctx_last_flush_fileno_set(struct rrdengine_instance *ctx, unsigned fileno) {
+ unsigned old_fileno = ctx_last_flush_fileno_get(ctx);
+
+ do {
+ if(old_fileno >= fileno)
+ return;
+
+ } while(!__atomic_compare_exchange_n(&ctx->atomic.last_flush_fileno, &old_fileno, fileno, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+}
+
+#define ctx_is_available_for_queries(ctx) (__atomic_load_n(&(ctx)->quiesce.enabled, __ATOMIC_RELAXED) == false && __atomic_load_n(&(ctx)->quiesce.exit_mode, __ATOMIC_RELAXED) == false)
+
+void *dbengine_extent_alloc(size_t size);
+void dbengine_extent_free(void *extent, size_t size);
+
+bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx);
+int init_rrd_files(struct rrdengine_instance *ctx);
+void finalize_rrd_files(struct rrdengine_instance *ctx);
+bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx);
+void dbengine_event_loop(void *arg);
+
+typedef void (*enqueue_callback_t)(struct rrdeng_cmd *cmd);
+typedef void (*dequeue_callback_t)(struct rrdeng_cmd *cmd);
+
+void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd);
+void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd);
+
+typedef struct rrdeng_cmd *(*requeue_callback_t)(void *data);
+void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority);
+
+void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data,
+ struct completion *completion, enum storage_priority priority,
+ enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb);
+
+void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc);
+void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc);
+
+void pdc_acquire(PDC *pdc);
+bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router);
+
+uint64_t rrdeng_target_data_file_size(struct rrdengine_instance *ctx);
+
+struct page_descr_with_data *page_descriptor_get(void);
+
+typedef struct validated_page_descriptor {
+ time_t start_time_s;
+ time_t end_time_s;
+ time_t update_every_s;
+ size_t page_length;
+ size_t point_size;
+ size_t entries;
+ uint8_t type;
+ bool is_valid;
+} VALIDATED_PAGE_DESCRIPTOR;
+
+#define page_entries_by_time(start_time_s, end_time_s, update_every_s) \
+ ((update_every_s) ? (((end_time_s) - ((start_time_s) - (update_every_s))) / (update_every_s)) : 1)
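+// example: a page covering [100s, 160s] with update_every 10s holds
+// ((160 - (100 - 10)) / 10) = 7 entries (the points at 100, 110, ..., 160).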
+
+#define page_entries_by_size(page_length_in_bytes, point_size_in_bytes) \
+ ((page_length_in_bytes) / (point_size_in_bytes))
+
+VALIDATED_PAGE_DESCRIPTOR validate_page(uuid_t *uuid,
+ time_t start_time_s,
+ time_t end_time_s,
+ time_t update_every_s,
+ size_t page_length,
+ uint8_t page_type,
+ size_t entries,
+ time_t now_s,
+ time_t overwrite_zero_update_every_s,
+ bool have_read_error,
+ const char *msg,
+ RRDENG_COLLECT_PAGE_FLAGS flags);
+VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error);
+void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags);
+
+typedef enum {
+ PAGE_IS_IN_THE_PAST = -1,
+ PAGE_IS_IN_RANGE = 0,
+ PAGE_IS_IN_THE_FUTURE = 1,
+} TIME_RANGE_COMPARE;
+
+TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s);
+
+static inline time_t max_acceptable_collected_time(void) {
+ return now_realtime_sec() + 1;
+}
+
+void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker);
+
+static inline int journal_metric_uuid_compare(const void *key, const void *metric) {
+ return uuid_memcmp((uuid_t *)key, &(((struct journal_metric_list *) metric)->uuid));
+}
+
+#endif /* NETDATA_RRDENGINE_H */
diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c
new file mode 100755
index 00000000..1ddce524
--- /dev/null
+++ b/database/engine/rrdengineapi.c
@@ -0,0 +1,1361 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "database/engine/rrddiskprotocol.h"
+#include "rrdengine.h"
+
+/* Default global database instance */
+struct rrdengine_instance multidb_ctx_storage_tier0;
+struct rrdengine_instance multidb_ctx_storage_tier1;
+struct rrdengine_instance multidb_ctx_storage_tier2;
+struct rrdengine_instance multidb_ctx_storage_tier3;
+struct rrdengine_instance multidb_ctx_storage_tier4;
+
+#define mrg_metric_ctx(metric) (struct rrdengine_instance *)mrg_metric_section(main_mrg, metric)
+
+#if RRD_STORAGE_TIERS != 5
+#error RRD_STORAGE_TIERS is not 5 - you need to add allocations here
+#endif
+struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS];
+uint8_t tier_page_type[RRD_STORAGE_TIERS] = {PAGE_METRICS, PAGE_TIER, PAGE_TIER, PAGE_TIER, PAGE_TIER};
+
+#if defined(ENV32BIT)
+size_t tier_page_size[RRD_STORAGE_TIERS] = {2048, 1024, 192, 192, 192};
+#else
+size_t tier_page_size[RRD_STORAGE_TIERS] = {4096, 2048, 384, 384, 384};
+#endif
+
+#if PAGE_TYPE_MAX != 2
+#error PAGE_TYPE_MAX is not 2 - you need to add allocations here
+#endif
+
+size_t page_type_size[256] = {
+ [PAGE_METRICS] = sizeof(storage_number),
+ [PAGE_TIER] = sizeof(storage_number_tier1_t),
+ [PAGE_GORILLA_METRICS] = sizeof(storage_number)
+};
+
+__attribute__((constructor)) void initialize_multidb_ctx(void) {
+ multidb_ctx[0] = &multidb_ctx_storage_tier0;
+ multidb_ctx[1] = &multidb_ctx_storage_tier1;
+ multidb_ctx[2] = &multidb_ctx_storage_tier2;
+ multidb_ctx[3] = &multidb_ctx_storage_tier3;
+ multidb_ctx[4] = &multidb_ctx_storage_tier4;
+}
+
+int db_engine_journal_check = 0;
+int default_rrdeng_disk_quota_mb = 256;
+int default_multidb_disk_quota_mb = 256;
+
+#if defined(ENV32BIT)
+int default_rrdeng_page_cache_mb = 16;
+int default_rrdeng_extent_cache_mb = 0;
+#else
+int default_rrdeng_page_cache_mb = 32;
+int default_rrdeng_extent_cache_mb = 0;
+#endif
+
+// ----------------------------------------------------------------------------
+// metrics groups
+
+static inline void rrdeng_page_alignment_acquire(struct pg_alignment *pa) {
+ if(unlikely(!pa)) return;
+ __atomic_add_fetch(&pa->refcount, 1, __ATOMIC_SEQ_CST);
+}
+
+static inline bool rrdeng_page_alignment_release(struct pg_alignment *pa) {
+ if(unlikely(!pa)) return true;
+
+ if(__atomic_sub_fetch(&pa->refcount, 1, __ATOMIC_SEQ_CST) == 0) {
+ freez(pa);
+ return true;
+ }
+
+ return false;
+}
+
+// charts call this
+STORAGE_METRICS_GROUP *rrdeng_metrics_group_get(STORAGE_INSTANCE *db_instance __maybe_unused, uuid_t *uuid __maybe_unused) {
+ struct pg_alignment *pa = callocz(1, sizeof(struct pg_alignment));
+ rrdeng_page_alignment_acquire(pa);
+ return (STORAGE_METRICS_GROUP *)pa;
+}
+
+// charts call this
+void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance __maybe_unused, STORAGE_METRICS_GROUP *smg) {
+ if(unlikely(!smg)) return;
+
+ struct pg_alignment *pa = (struct pg_alignment *)smg;
+ rrdeng_page_alignment_release(pa);
+}
+
+// ----------------------------------------------------------------------------
+// metric handle for legacy dbs
+
+/* This UUID is not unique across hosts */
+void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_t *ret_uuid)
+{
+ EVP_MD_CTX *evpctx;
+ unsigned char hash_value[EVP_MAX_MD_SIZE];
+ unsigned int hash_len;
+
+ evpctx = EVP_MD_CTX_create();
+ EVP_DigestInit_ex(evpctx, EVP_sha256(), NULL);
+ EVP_DigestUpdate(evpctx, dim_id, strlen(dim_id));
+ EVP_DigestUpdate(evpctx, chart_id, strlen(chart_id));
+ EVP_DigestFinal_ex(evpctx, hash_value, &hash_len);
+ EVP_MD_CTX_destroy(evpctx);
+ fatal_assert(hash_len > sizeof(uuid_t));
+ memcpy(ret_uuid, hash_value, sizeof(uuid_t));
+}
+
+static METRIC *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ uuid_t legacy_uuid;
+ rrdeng_generate_legacy_uuid(rd_id, st_id, &legacy_uuid);
+ return mrg_metric_get_and_acquire(main_mrg, &legacy_uuid, (Word_t) ctx);
+}
+
+// ----------------------------------------------------------------------------
+// metric handle
+
+void rrdeng_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle) {
+ METRIC *metric = (METRIC *)db_metric_handle;
+ mrg_metric_release(main_mrg, metric);
+}
+
+STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle) {
+ METRIC *metric = (METRIC *)db_metric_handle;
+ return (STORAGE_METRIC_HANDLE *) mrg_metric_dup(main_mrg, metric);
+}
+
+STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ return (STORAGE_METRIC_HANDLE *) mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx);
+}
+
+static METRIC *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid) {
+ internal_fatal(!db_instance, "DBENGINE: db_instance is NULL");
+
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ MRG_ENTRY entry = {
+ .uuid = uuid,
+ .section = (Word_t)ctx,
+ .first_time_s = 0,
+ .last_time_s = 0,
+ .latest_update_every_s = 0,
+ };
+
+ METRIC *metric = mrg_metric_add_and_acquire(main_mrg, entry, NULL);
+ return metric;
+}
+
+STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ METRIC *metric;
+
+ metric = mrg_metric_get_and_acquire(main_mrg, &rd->metric_uuid, (Word_t) ctx);
+
+ if(unlikely(!metric)) {
+ if(unlikely(ctx->config.legacy)) {
+ // this is a single host database
+ // generate uuid from the chart and dimensions ids
+ // and overwrite the one supplied by rrddim
+ metric = rrdeng_metric_get_legacy(db_instance, rrddim_id(rd), rrdset_id(rd->rrdset));
+ if (metric)
+ uuid_copy(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric));
+ }
+
+ if(likely(!metric))
+ metric = rrdeng_metric_create(db_instance, &rd->metric_uuid);
+ }
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ if(uuid_memcmp(&rd->metric_uuid, mrg_metric_uuid(main_mrg, metric)) != 0) {
+ char uuid1[UUID_STR_LEN + 1];
+ char uuid2[UUID_STR_LEN + 1];
+
+ uuid_unparse(rd->metric_uuid, uuid1);
+ uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid2);
+ fatal("DBENGINE: uuids do not match, asked for metric '%s', but got metric '%s'", uuid1, uuid2);
+ }
+
+ if(mrg_metric_ctx(metric) != ctx)
+ fatal("DBENGINE: mixed up db instances, asked for metric from %p, got from %p",
+ ctx, mrg_metric_ctx(metric));
+#endif
+
+ return (STORAGE_METRIC_HANDLE *)metric;
+}
+
+
+// ----------------------------------------------------------------------------
+// collect ops
+
+static inline void check_and_fix_mrg_update_every(struct rrdeng_collect_handle *handle) {
+ if(unlikely((time_t)(handle->update_every_ut / USEC_PER_SEC) != mrg_metric_get_update_every_s(main_mrg, handle->metric))) {
+ internal_error(true, "DBENGINE: collection handle has update every %ld, but the metric registry has %ld. Fixing it.",
+ (time_t)(handle->update_every_ut / USEC_PER_SEC), mrg_metric_get_update_every_s(main_mrg, handle->metric));
+
+ if(unlikely(!handle->update_every_ut))
+ handle->update_every_ut = (usec_t)mrg_metric_get_update_every_s(main_mrg, handle->metric) * USEC_PER_SEC;
+ else
+ mrg_metric_set_update_every(main_mrg, handle->metric, (time_t)(handle->update_every_ut / USEC_PER_SEC));
+ }
+}
+
+static inline bool check_completed_page_consistency(struct rrdeng_collect_handle *handle __maybe_unused) {
+#ifdef NETDATA_INTERNAL_CHECKS
+ if (unlikely(!handle->pgc_page || !handle->page_entries_max || !handle->page_position || !handle->page_end_time_ut))
+ return false;
+
+ struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric);
+
+ uuid_t *uuid = mrg_metric_uuid(main_mrg, handle->metric);
+ time_t start_time_s = pgc_page_start_time_s(handle->pgc_page);
+ time_t end_time_s = pgc_page_end_time_s(handle->pgc_page);
+ time_t update_every_s = pgc_page_update_every_s(handle->pgc_page);
+ size_t page_length = handle->page_position * CTX_POINT_SIZE_BYTES(ctx);
+ size_t entries = handle->page_position;
+ time_t overwrite_zero_update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC);
+
+ if(end_time_s > max_acceptable_collected_time())
+ handle->page_flags |= RRDENG_PAGE_COMPLETED_IN_FUTURE;
+
+ VALIDATED_PAGE_DESCRIPTOR vd = validate_page(
+ uuid,
+ start_time_s,
+ end_time_s,
+ update_every_s,
+ page_length,
+ ctx->config.page_type,
+ entries,
+ 0, // do not check for future timestamps - we inherit the timestamps of the children
+ overwrite_zero_update_every_s,
+ false,
+ "collected",
+ handle->page_flags);
+
+ return vd.is_valid;
+#else
+ return true;
+#endif
+}
+
+/*
+ * Gets a handle for storing metrics to the database.
+ * The handle must be released with rrdeng_store_metric_final().
+ */
+STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg) {
+ METRIC *metric = (METRIC *)db_metric_handle;
+ struct rrdengine_instance *ctx = mrg_metric_ctx(metric);
+
+ bool is_1st_metric_writer = true;
+ if(!mrg_metric_set_writer(main_mrg, metric)) {
+ is_1st_metric_writer = false;
+ char uuid[UUID_STR_LEN + 1];
+ uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid);
+        netdata_log_error("DBENGINE: metric '%s' is already being collected and should not be collected twice - expect gaps on the charts", uuid);
+ }
+
+ metric = mrg_metric_dup(main_mrg, metric);
+
+ struct rrdeng_collect_handle *handle;
+
+ handle = callocz(1, sizeof(struct rrdeng_collect_handle));
+ handle->common.backend = STORAGE_ENGINE_BACKEND_DBENGINE;
+ handle->metric = metric;
+
+ handle->pgc_page = NULL;
+ handle->page_data = NULL;
+ handle->page_data_size = 0;
+
+ handle->page_position = 0;
+ handle->page_entries_max = 0;
+ handle->update_every_ut = (usec_t)update_every * USEC_PER_SEC;
+ handle->options = is_1st_metric_writer ? RRDENG_1ST_METRIC_WRITER : 0;
+
+ __atomic_add_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED);
+ if(!is_1st_metric_writer)
+ __atomic_add_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED);
+
+ mrg_metric_set_update_every(main_mrg, metric, update_every);
+
+ handle->alignment = (struct pg_alignment *)smg;
+ rrdeng_page_alignment_acquire(handle->alignment);
+
+    // this is important!
+    // if we don't set page_end_time_ut during the first collection,
+    // data collection may be able to go back in time and, while new pages are added,
+    // clean pages matching ours may be found!
+
+ time_t db_first_time_s, db_last_time_s, db_update_every_s;
+ mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s);
+ handle->page_end_time_ut = (usec_t)db_last_time_s * USEC_PER_SEC;
+
+ return (STORAGE_COLLECT_HANDLE *)handle;
+}
+
+void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle) {
+ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle;
+
+ if (unlikely(!handle->pgc_page))
+ return;
+
+ if(pgd_is_empty(handle->page_data))
+ pgc_page_to_clean_evict_or_release(main_cache, handle->pgc_page);
+
+ else {
+ check_completed_page_consistency(handle);
+ mrg_metric_set_clean_latest_time_s(main_mrg, handle->metric, pgc_page_end_time_s(handle->pgc_page));
+ pgc_page_hot_to_dirty_and_release(main_cache, handle->pgc_page);
+ }
+
+ mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, 0);
+
+ handle->pgc_page = NULL;
+ handle->page_flags = 0;
+ handle->page_position = 0;
+ handle->page_entries_max = 0;
+ handle->page_data = NULL;
+ handle->page_data_size = 0;
+
+ // important!
+ // we should never zero page end time ut, because this will allow
+ // collection to go back in time
+ // handle->page_end_time_ut = 0;
+ // handle->page_start_time_ut;
+
+ check_and_fix_mrg_update_every(handle);
+
+ timing_step(TIMING_STEP_DBENGINE_FLUSH_PAGE);
+}
+
+static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *handle,
+ struct rrdengine_instance *ctx,
+ usec_t point_in_time_ut,
+ PGD *data,
+ size_t data_size) {
+ time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC);
+ const time_t update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC);
+
+ PGC_ENTRY page_entry = {
+ .section = (Word_t) ctx,
+ .metric_id = mrg_metric_id(main_mrg, handle->metric),
+ .start_time_s = point_in_time_s,
+ .end_time_s = point_in_time_s,
+ .size = data_size,
+ .data = data,
+ .update_every_s = (uint32_t) update_every_s,
+ .hot = true
+ };
+
+ size_t conflicts = 0;
+ bool added = true;
+ PGC_PAGE *pgc_page = pgc_page_add_and_acquire(main_cache, page_entry, &added);
+ while (unlikely(!added)) {
+ conflicts++;
+
+ char uuid[UUID_STR_LEN + 1];
+ uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid);
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ internal_error(true,
+#else
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING,
+#endif
+ "DBENGINE: metric '%s' new page from %ld to %ld, update every %ld, has a conflict in main cache "
+ "with existing %s%s page from %ld to %ld, update every %ld - "
+ "is it collected more than once?",
+ uuid,
+ page_entry.start_time_s, page_entry.end_time_s, (time_t)page_entry.update_every_s,
+ pgc_is_page_hot(pgc_page) ? "hot" : "not-hot",
+ pgc_page_data(pgc_page) == PGD_EMPTY ? " gap" : "",
+ pgc_page_start_time_s(pgc_page), pgc_page_end_time_s(pgc_page), pgc_page_update_every_s(pgc_page)
+ );
+
+ pgc_page_release(main_cache, pgc_page);
+
+ point_in_time_ut -= handle->update_every_ut;
+ point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC);
+ page_entry.start_time_s = point_in_time_s;
+ page_entry.end_time_s = point_in_time_s;
+ pgc_page = pgc_page_add_and_acquire(main_cache, page_entry, &added);
+ }
+
+ handle->page_entries_max = data_size / CTX_POINT_SIZE_BYTES(ctx);
+ handle->page_start_time_ut = point_in_time_ut;
+ handle->page_end_time_ut = point_in_time_ut;
+ handle->page_position = 1; // zero is already in our data
+ handle->pgc_page = pgc_page;
+ handle->page_flags = conflicts? RRDENG_PAGE_CONFLICT : 0;
+
+ if(point_in_time_s > max_acceptable_collected_time())
+ handle->page_flags |= RRDENG_PAGE_CREATED_IN_FUTURE;
+
+ check_and_fix_mrg_update_every(handle);
+
+ timing_step(TIMING_STEP_DBENGINE_CREATE_NEW_PAGE);
+}
+
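+// the function below sizes a page so that it becomes full the next time
+// (now_s % max_slots) reaches target_slot, e.g. with max_slots=10 and target_slot=7:
+// now_s%10 == 3 -> 4 entries, now_s%10 == 9 -> 8 entries, now_s%10 == 7 -> 10 entries;
+// in effect this staggers page completions of different alignment groups across time.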
+static size_t aligned_allocation_entries(size_t max_slots, size_t target_slot, time_t now_s) {
+ size_t slots = target_slot;
+ size_t pos = (now_s % max_slots);
+
+ if(pos > slots)
+ slots += max_slots - pos;
+
+ else if(pos < slots)
+ slots -= pos;
+
+ else
+ slots = max_slots;
+
+ return slots;
+}
+
+static PGD *rrdeng_alloc_new_page_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) {
+ struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric);
+
+ PGD *d = NULL;
+
+ size_t max_size = tier_page_size[ctx->config.tier];
+ size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx);
+
+ size_t slots = aligned_allocation_entries(
+ max_slots,
+ indexing_partition((Word_t) handle->alignment, max_slots),
+ (time_t) (point_in_time_ut / USEC_PER_SEC)
+ );
+
+ if(slots < max_slots / 3)
+ slots = max_slots / 3;
+
+ if(slots < 3)
+ slots = 3;
+
+ size_t size = slots * CTX_POINT_SIZE_BYTES(ctx);
+
+ // internal_error(true, "PAGE ALLOC %zu bytes (%zu max)", size, max_size);
+
+ internal_fatal(slots < 3 || slots > max_slots, "ooops! wrong distribution of metrics across time");
+ internal_fatal(size > tier_page_size[ctx->config.tier] || size < CTX_POINT_SIZE_BYTES(ctx) * 2, "ooops! wrong page size");
+
+ *data_size = size;
+
+ switch (ctx->config.page_type) {
+ case PAGE_METRICS:
+ case PAGE_TIER:
+ d = pgd_create(ctx->config.page_type, slots);
+ break;
+ case PAGE_GORILLA_METRICS:
+ // ignore slots, and use the fixed number of slots per gorilla buffer.
+ // gorilla will automatically add more buffers if needed.
+ d = pgd_create(ctx->config.page_type, GORILLA_BUFFER_SLOTS);
+ break;
+ default:
+ fatal("Unknown page type: %uc\n", ctx->config.page_type);
+ }
+
+ timing_step(TIMING_STEP_DBENGINE_PAGE_ALLOC);
+ return d;
+}
+
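+// Append a single point to the collection handle's current page: allocate page data on
+// first use, attach a hot page to the main cache when none exists yet, and flush the
+// page once it becomes full.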
+static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_handle,
+ const usec_t point_in_time_ut,
+ const NETDATA_DOUBLE n,
+ const NETDATA_DOUBLE min_value,
+ const NETDATA_DOUBLE max_value,
+ const uint16_t count,
+ const uint16_t anomaly_count,
+ const SN_FLAGS flags)
+{
+ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle;
+ struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric);
+
+ if(unlikely(!handle->page_data))
+ handle->page_data = rrdeng_alloc_new_page_data(handle, &handle->page_data_size, point_in_time_ut);
+
+ timing_step(TIMING_STEP_DBENGINE_CHECK_DATA);
+
+ pgd_append_point(handle->page_data,
+ point_in_time_ut,
+ n, min_value, max_value, count, anomaly_count, flags,
+ handle->page_position);
+
+ timing_step(TIMING_STEP_DBENGINE_PACK);
+
+ if(unlikely(!handle->pgc_page)) {
+ rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, handle->page_data, handle->page_data_size);
+ // handle->page_position is already set to 1 by rrdeng_store_metric_create_new_page()
+ }
+ else {
+ // update an existing page
+ pgc_page_hot_set_end_time_s(main_cache, handle->pgc_page, (time_t) (point_in_time_ut / USEC_PER_SEC));
+ handle->page_end_time_ut = point_in_time_ut;
+
+ if(unlikely(++handle->page_position >= handle->page_entries_max)) {
+ internal_fatal(handle->page_position > handle->page_entries_max, "DBENGINE: exceeded page max number of points");
+ handle->page_flags |= RRDENG_PAGE_FULL;
+ rrdeng_store_metric_flush_current_page(collection_handle);
+ }
+ }
+
+ timing_step(TIMING_STEP_DBENGINE_PAGE_FIN);
+
+ // update the metric information
+ mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, (time_t) (point_in_time_ut / USEC_PER_SEC));
+
+ timing_step(TIMING_STEP_DBENGINE_MRG_UPDATE);
+}
+
+static void store_metric_next_error_log(struct rrdeng_collect_handle *handle __maybe_unused, usec_t point_in_time_ut __maybe_unused, const char *msg __maybe_unused) {
+#ifdef NETDATA_INTERNAL_CHECKS
+ time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC);
+ char uuid[UUID_STR_LEN + 1];
+ uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid);
+
+ BUFFER *wb = NULL;
+ if(handle->pgc_page && handle->page_flags) {
+ wb = buffer_create(0, NULL);
+ collect_page_flags_to_buffer(wb, handle->page_flags);
+ }
+
+ nd_log_limit_static_global_var(erl, 1, 0);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
+ "DBENGINE: metric '%s' collected point at %ld, %s last collection at %ld, "
+ "update every %ld, %s page from %ld to %ld, position %u (of %u), flags: %s",
+ uuid,
+ point_in_time_s,
+ msg,
+ (time_t)(handle->page_end_time_ut / USEC_PER_SEC),
+ (time_t)(handle->update_every_ut / USEC_PER_SEC),
+ handle->pgc_page ? "current" : "*LAST*",
+ (time_t)(handle->page_start_time_ut / USEC_PER_SEC),
+ (time_t)(handle->page_end_time_ut / USEC_PER_SEC),
+ handle->page_position, handle->page_entries_max,
+ wb ? buffer_tostring(wb) : ""
+ );
+
+ buffer_free(wb);
+#else
+ ;
+#endif
+}
+
+void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle,
+ const usec_t point_in_time_ut,
+ const NETDATA_DOUBLE n,
+ const NETDATA_DOUBLE min_value,
+ const NETDATA_DOUBLE max_value,
+ const uint16_t count,
+ const uint16_t anomaly_count,
+ const SN_FLAGS flags)
+{
+ timing_step(TIMING_STEP_RRDSET_STORE_METRIC);
+
+ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle;
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ if(unlikely(point_in_time_ut > (usec_t)max_acceptable_collected_time() * USEC_PER_SEC))
+ handle->page_flags |= RRDENG_PAGE_FUTURE_POINT;
+#endif
+
+ usec_t delta_ut = point_in_time_ut - handle->page_end_time_ut;
+
+ if(likely(delta_ut == handle->update_every_ut)) {
+ // happy path
+ ;
+ }
+ else if(unlikely(point_in_time_ut > handle->page_end_time_ut)) {
+ if(handle->pgc_page) {
+ if (unlikely(delta_ut < handle->update_every_ut)) {
+ handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL;
+ rrdeng_store_metric_flush_current_page(collection_handle);
+ }
+ else if (unlikely(delta_ut % handle->update_every_ut)) {
+ handle->page_flags |= RRDENG_PAGE_STEP_UNALIGNED;
+ rrdeng_store_metric_flush_current_page(collection_handle);
+ }
+ else {
+ size_t points_gap = delta_ut / handle->update_every_ut;
+ size_t page_remaining_points = handle->page_entries_max - handle->page_position;
+
+ if (points_gap >= page_remaining_points) {
+ handle->page_flags |= RRDENG_PAGE_BIG_GAP;
+ rrdeng_store_metric_flush_current_page(collection_handle);
+ }
+ else {
+ // loop to fill the gap
+ handle->page_flags |= RRDENG_PAGE_GAP;
+
+ usec_t stop_ut = point_in_time_ut - handle->update_every_ut;
+ for (usec_t this_ut = handle->page_end_time_ut + handle->update_every_ut;
+ this_ut <= stop_ut;
+ this_ut = handle->page_end_time_ut + handle->update_every_ut) {
+ rrdeng_store_metric_append_point(
+ collection_handle,
+ this_ut,
+ NAN, NAN, NAN,
+ 1, 0,
+ SN_EMPTY_SLOT);
+ }
+ }
+ }
+ }
+ }
+ else if(unlikely(point_in_time_ut < handle->page_end_time_ut)) {
+ handle->page_flags |= RRDENG_PAGE_PAST_COLLECTION;
+ store_metric_next_error_log(handle, point_in_time_ut, "is older than the");
+ return;
+ }
+
+ else /* if(unlikely(point_in_time_ut == handle->page_end_time_ut)) */ {
+ handle->page_flags |= RRDENG_PAGE_REPEATED_COLLECTION;
+ store_metric_next_error_log(handle, point_in_time_ut, "is at the same time as the");
+ return;
+ }
+
+ timing_step(TIMING_STEP_DBENGINE_FIRST_CHECK);
+
+ rrdeng_store_metric_append_point(collection_handle,
+ point_in_time_ut,
+ n, min_value, max_value,
+ count, anomaly_count,
+ flags);
+}
+
+/*
+ * Releases the database reference from the handle for storing metrics.
+ * Returns 1 if it's safe to delete the dimension.
+ */
+int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle) {
+ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle;
+ struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric);
+
+ handle->page_flags |= RRDENG_PAGE_COLLECT_FINALIZE;
+ rrdeng_store_metric_flush_current_page(collection_handle);
+ rrdeng_page_alignment_release(handle->alignment);
+
+ __atomic_sub_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED);
+ if(!(handle->options & RRDENG_1ST_METRIC_WRITER))
+ __atomic_sub_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED);
+
+ if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_clear_writer(main_mrg, handle->metric))
+ internal_fatal(true, "DBENGINE: metric is already released");
+
+ time_t first_time_s, last_time_s, update_every_s;
+ mrg_metric_get_retention(main_mrg, handle->metric, &first_time_s, &last_time_s, &update_every_s);
+
+ mrg_metric_release(main_mrg, handle->metric);
+ freez(handle);
+
+ if(!first_time_s && !last_time_s)
+ return 1;
+
+ return 0;
+}
+
+void rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every) {
+ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle;
+ check_and_fix_mrg_update_every(handle);
+
+ METRIC *metric = handle->metric;
+ usec_t update_every_ut = (usec_t)update_every * USEC_PER_SEC;
+
+ if(update_every_ut == handle->update_every_ut)
+ return;
+
+ handle->page_flags |= RRDENG_PAGE_UPDATE_EVERY_CHANGE;
+ rrdeng_store_metric_flush_current_page(collection_handle);
+ mrg_metric_set_update_every(main_mrg, metric, update_every);
+ handle->update_every_ut = update_every_ut;
+}
+
+// ----------------------------------------------------------------------------
+// query ops
+
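+// In internal-checks builds, every active query handle is kept in a global doubly-linked
+// list protected by a spinlock, presumably so that long-running queries can be inspected
+// while debugging; in release builds these helpers are no-ops.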
+#ifdef NETDATA_INTERNAL_CHECKS
+SPINLOCK global_query_handle_spinlock = NETDATA_SPINLOCK_INITIALIZER;
+static struct rrdeng_query_handle *global_query_handle_ll = NULL;
+static void register_query_handle(struct rrdeng_query_handle *handle) {
+ handle->query_pid = gettid();
+ handle->started_time_s = now_realtime_sec();
+
+ spinlock_lock(&global_query_handle_spinlock);
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next);
+ spinlock_unlock(&global_query_handle_spinlock);
+}
+static void unregister_query_handle(struct rrdeng_query_handle *handle) {
+ spinlock_lock(&global_query_handle_spinlock);
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next);
+ spinlock_unlock(&global_query_handle_spinlock);
+}
+#else
+static void register_query_handle(struct rrdeng_query_handle *handle __maybe_unused) {
+ ;
+}
+static void unregister_query_handle(struct rrdeng_query_handle *handle __maybe_unused) {
+ ;
+}
+#endif
+
+/*
+ * Gets a handle for loading metrics from the database.
+ * The handle must be released with rrdeng_load_metric_finalize().
+ */
+void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle,
+ struct storage_engine_query_handle *rrddim_handle,
+ time_t start_time_s,
+ time_t end_time_s,
+ STORAGE_PRIORITY priority)
+{
+ usec_t started_ut = now_monotonic_usec();
+
+ netdata_thread_disable_cancelability();
+
+ METRIC *metric = (METRIC *)db_metric_handle;
+ struct rrdengine_instance *ctx = mrg_metric_ctx(metric);
+ struct rrdeng_query_handle *handle;
+
+ handle = rrdeng_query_handle_get();
+ register_query_handle(handle);
+
+ if (unlikely(priority < STORAGE_PRIORITY_HIGH))
+ priority = STORAGE_PRIORITY_HIGH;
+ else if (unlikely(priority >= STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE))
+ priority = STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE - 1;
+
+ handle->ctx = ctx;
+ handle->metric = metric;
+ handle->priority = priority;
+
+ // IMPORTANT!
+ // It is crucial not to exceed the db boundaries, because dbengine
+ // now has gap caching, so when a gap is detected a negative page
+ // is inserted into the main cache, to avoid scanning the journals
+ // again for pages matching the gap.
+
+ time_t db_first_time_s, db_last_time_s, db_update_every_s;
+ mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s);
+
+ if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) == PAGE_IS_IN_RANGE) {
+ handle->start_time_s = MAX(start_time_s, db_first_time_s);
+ handle->end_time_s = MIN(end_time_s, db_last_time_s);
+ handle->now_s = handle->start_time_s;
+
+ handle->dt_s = db_update_every_s;
+ if (!handle->dt_s) {
+ handle->dt_s = default_rrd_update_every;
+ mrg_metric_set_update_every_s_if_zero(main_mrg, metric, default_rrd_update_every);
+ }
+
+ rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle;
+ rrddim_handle->start_time_s = handle->start_time_s;
+ rrddim_handle->end_time_s = handle->end_time_s;
+ rrddim_handle->priority = priority;
+ rrddim_handle->backend = STORAGE_ENGINE_BACKEND_DBENGINE;
+
+ pg_cache_preload(handle);
+
+ __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_init, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED);
+ }
+ else {
+ handle->start_time_s = start_time_s;
+ handle->end_time_s = end_time_s;
+ handle->now_s = start_time_s;
+ handle->dt_s = db_update_every_s;
+
+ rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle;
+ rrddim_handle->start_time_s = handle->start_time_s;
+ rrddim_handle->end_time_s = 0;
+ rrddim_handle->priority = priority;
+ rrddim_handle->backend = STORAGE_ENGINE_BACKEND_DBENGINE;
+ }
+}
+
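+// Release the currently held page (if any), then look up the next page that covers
+// handle->now_s and position the cursor on the first point at or after now_s.
+// Returns false when the query window is exhausted or no further page is available.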
+static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_handle, bool debug_this __maybe_unused) {
+ struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle;
+ struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric);
+
+ if (likely(handle->page)) {
+ // we have a page to release
+ pgc_page_release(main_cache, handle->page);
+ handle->page = NULL;
+ pgdc_reset(&handle->pgdc, NULL, UINT32_MAX);
+ }
+
+ if (unlikely(handle->now_s > rrddim_handle->end_time_s))
+ return false;
+
+ size_t entries = 0;
+ handle->page = pg_cache_lookup_next(ctx, handle->pdc, handle->now_s, handle->dt_s, &entries);
+
+ internal_fatal(handle->page && (pgc_page_data(handle->page) == PGD_EMPTY || !entries),
+ "A page was returned, but it is empty - pg_cache_lookup_next() should be handling this case");
+
+ if (unlikely(!handle->page || pgc_page_data(handle->page) == PGD_EMPTY || !entries))
+ return false;
+
+ time_t page_start_time_s = pgc_page_start_time_s(handle->page);
+ time_t page_end_time_s = pgc_page_end_time_s(handle->page);
+ time_t page_update_every_s = pgc_page_update_every_s(handle->page);
+
+ unsigned position;
+ if(likely(handle->now_s >= page_start_time_s && handle->now_s <= page_end_time_s)) {
+
+ if(unlikely(entries == 1 || page_start_time_s == page_end_time_s || !page_update_every_s)) {
+ position = 0;
+ handle->now_s = page_start_time_s;
+ }
+ else {
+ position = (handle->now_s - page_start_time_s) * (entries - 1) / (page_end_time_s - page_start_time_s);
+ time_t point_end_time_s = page_start_time_s + position * page_update_every_s;
+ while(point_end_time_s < handle->now_s && position + 1 < entries) {
+ // https://github.com/netdata/netdata/issues/14411
+ // we really need a while() here, because the delta may be
+ // 2 points at higher tiers
+ position++;
+ point_end_time_s = page_start_time_s + position * page_update_every_s;
+ }
+ handle->now_s = point_end_time_s;
+ }
+
+ internal_fatal(position >= entries, "DBENGINE: wrong page position calculation");
+ }
+ else if(handle->now_s < page_start_time_s) {
+ handle->now_s = page_start_time_s;
+ position = 0;
+ }
+ else {
+ internal_fatal(true, "DBENGINE: this page is entirely in our past and should not be accepted for this query in the first place");
+ handle->now_s = page_end_time_s;
+ position = entries - 1;
+ }
+
+ handle->entries = entries;
+ handle->position = position;
+ handle->dt_s = page_update_every_s;
+
+ pgdc_reset(&handle->pgdc, pgc_page_data(handle->page), handle->position);
+
+ return true;
+}
+
+// Returns the next point of the metric and sets its timestamps into the returned STORAGE_POINT
+// IT IS REQUIRED TO **ALWAYS** SET ALL RETURN VALUES (current_time, end_time, flags)
+// IT IS REQUIRED TO **ALWAYS** KEEP TRACK OF TIME, EVEN OUTSIDE THE DATABASE BOUNDARIES
+STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim_handle) {
+ struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle;
+ STORAGE_POINT sp;
+
+ if (unlikely(handle->now_s > rrddim_handle->end_time_s)) {
+ storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s);
+ goto prepare_for_next_iteration;
+ }
+
+ if (unlikely(!handle->page || handle->position >= handle->entries)) {
+ // We need to get a new page
+
+ if (!rrdeng_load_page_next(rrddim_handle, false)) {
+ handle->now_s = rrddim_handle->end_time_s;
+ storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s);
+ goto prepare_for_next_iteration;
+ }
+ }
+
+ sp.start_time_s = handle->now_s - handle->dt_s;
+ sp.end_time_s = handle->now_s;
+
+ pgdc_get_next_point(&handle->pgdc, handle->position, &sp);
+
+prepare_for_next_iteration:
+ internal_fatal(sp.end_time_s < rrddim_handle->start_time_s, "DBENGINE: this point is too old for this query");
+ internal_fatal(sp.end_time_s < handle->now_s, "DBENGINE: this point is too old for this point in time");
+
+ handle->now_s += handle->dt_s;
+ handle->position++;
+
+ return sp;
+}
+
+int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle) {
+ struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle;
+ return (handle->now_s > rrddim_handle->end_time_s);
+}
+
+/*
+ * Releases the database reference from the handle for loading metrics.
+ */
+void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle)
+{
+ struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle;
+
+ if (handle->page) {
+ pgc_page_release(main_cache, handle->page);
+ pgdc_reset(&handle->pgdc, NULL, UINT32_MAX);
+ }
+
+ if(!pdc_release_and_destroy_if_unreferenced(handle->pdc, false, false))
+ __atomic_store_n(&handle->pdc->workers_should_stop, true, __ATOMIC_RELAXED);
+
+ unregister_query_handle(handle);
+ rrdeng_query_handle_release(handle);
+ rrddim_handle->handle = NULL;
+ netdata_thread_enable_cancelability();
+}
+
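+// Wait for query preparation to finish (when a PDC exists) and extend the query's end
+// time ("before") to the optimal end time computed by the planner, if that is later
+// than the requested one.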
+time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle) {
+ struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle;
+
+ if(handle->pdc) {
+ rrdeng_prep_wait(handle->pdc);
+ if (handle->pdc->optimal_end_time_s > rrddim_handle->end_time_s)
+ rrddim_handle->end_time_s = handle->pdc->optimal_end_time_s;
+ }
+
+ return rrddim_handle->end_time_s;
+}
+
+time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle) {
+ METRIC *metric = (METRIC *)db_metric_handle;
+ time_t latest_time_s = 0;
+
+ if (metric)
+ latest_time_s = mrg_metric_get_latest_time_s(main_mrg, metric);
+
+ return latest_time_s;
+}
+
+time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle) {
+ METRIC *metric = (METRIC *)db_metric_handle;
+
+ time_t oldest_time_s = 0;
+ if (metric)
+ oldest_time_s = mrg_metric_get_first_time_s(main_mrg, metric);
+
+ return oldest_time_s;
+}
+
+bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s)
+{
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ if (unlikely(!ctx)) {
+ netdata_log_error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__);
+ return false;
+ }
+
+ METRIC *metric = mrg_metric_get_and_acquire(main_mrg, dim_uuid, (Word_t) ctx);
+ if (unlikely(!metric))
+ return false;
+
+ time_t update_every_s;
+ mrg_metric_get_retention(main_mrg, metric, first_entry_s, last_entry_s, &update_every_s);
+
+ mrg_metric_release(main_mrg, metric);
+
+ return true;
+}
+
+uint64_t rrdeng_disk_space_max(STORAGE_INSTANCE *db_instance) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ return ctx->config.max_disk_space;
+}
+
+uint64_t rrdeng_disk_space_used(STORAGE_INSTANCE *db_instance) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ return __atomic_load_n(&ctx->atomic.current_disk_space, __ATOMIC_RELAXED);
+}
+
+time_t rrdeng_global_first_time_s(STORAGE_INSTANCE *db_instance) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+
+ time_t t = __atomic_load_n(&ctx->atomic.first_time_s, __ATOMIC_RELAXED);
+ if(t == LONG_MAX || t < 0)
+ t = 0;
+
+ return t;
+}
+
+size_t rrdeng_currently_collected_metrics(STORAGE_INSTANCE *db_instance) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED);
+}
+
+/*
+ * Gathers Database Engine statistics.
+ * Careful when modifying this function.
+ * You must not change the indices of the statistics or user code will break.
+ * You must not exceed RRDENG_NR_STATS or it will crash.
+ */
+void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long *array)
+{
+ if (ctx == NULL)
+ return;
+
+ array[0] = (uint64_t)__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); // API producers
+ array[1] = (uint64_t)__atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED); // API consumers
+ array[2] = 0;
+ array[3] = 0;
+ array[4] = 0;
+ array[5] = 0; // (uint64_t)ctx->stats.pg_cache_insertions;
+ array[6] = 0; // (uint64_t)ctx->stats.pg_cache_deletions;
+ array[7] = 0; // (uint64_t)ctx->stats.pg_cache_hits;
+ array[8] = 0; // (uint64_t)ctx->stats.pg_cache_misses;
+ array[9] = 0; // (uint64_t)ctx->stats.pg_cache_backfills;
+ array[10] = 0; // (uint64_t)ctx->stats.pg_cache_evictions;
+ array[11] = (uint64_t)__atomic_load_n(&ctx->stats.before_compress_bytes, __ATOMIC_RELAXED); // used
+ array[12] = (uint64_t)__atomic_load_n(&ctx->stats.after_compress_bytes, __ATOMIC_RELAXED); // used
+ array[13] = (uint64_t)__atomic_load_n(&ctx->stats.before_decompress_bytes, __ATOMIC_RELAXED);
+ array[14] = (uint64_t)__atomic_load_n(&ctx->stats.after_decompress_bytes, __ATOMIC_RELAXED);
+ array[15] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_bytes, __ATOMIC_RELAXED); // used
+ array[16] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_requests, __ATOMIC_RELAXED); // used
+ array[17] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_bytes, __ATOMIC_RELAXED);
+ array[18] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_requests, __ATOMIC_RELAXED); // used
+ array[19] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extent_bytes, __ATOMIC_RELAXED);
+ array[20] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extents, __ATOMIC_RELAXED);
+ array[21] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extent_bytes, __ATOMIC_RELAXED);
+ array[22] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extents, __ATOMIC_RELAXED);
+ array[23] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_creations, __ATOMIC_RELAXED);
+ array[24] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_deletions, __ATOMIC_RELAXED);
+ array[25] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_creations, __ATOMIC_RELAXED);
+ array[26] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_deletions, __ATOMIC_RELAXED);
+ array[27] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.page_cache_descriptors, __ATOMIC_RELAXED);
+ array[28] = (uint64_t)__atomic_load_n(&ctx->stats.io_errors, __ATOMIC_RELAXED);
+ array[29] = (uint64_t)__atomic_load_n(&ctx->stats.fs_errors, __ATOMIC_RELAXED);
+ array[30] = (uint64_t)__atomic_load_n(&global_io_errors, __ATOMIC_RELAXED); // used
+ array[31] = (uint64_t)__atomic_load_n(&global_fs_errors, __ATOMIC_RELAXED); // used
+ array[32] = (uint64_t)__atomic_load_n(&rrdeng_reserved_file_descriptors, __ATOMIC_RELAXED); // used
+ array[33] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.pg_cache_over_half_dirty_events, __ATOMIC_RELAXED);
+ array[34] = (uint64_t)__atomic_load_n(&global_pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); // used
+ array[35] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.flushing_pressure_page_deletions, __ATOMIC_RELAXED);
+ array[36] = (uint64_t)__atomic_load_n(&global_flushing_pressure_page_deletions, __ATOMIC_RELAXED); // used
+ array[37] = 0; //(uint64_t)pg_cache->active_descriptors;
+
+ fatal_assert(RRDENG_NR_STATS == 38);
+}
+
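+// Populate the metrics registry (MRG) with the retention stored in this tier's journal
+// files. The newest available journal and the oldest one are indexed synchronously; the
+// rest are dispatched to dbengine workers, with the worker count bounded by the number
+// of datafiles, the libuv worker threads and the available CPUs.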
+static void rrdeng_populate_mrg(struct rrdengine_instance *ctx) {
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ size_t datafiles = 0;
+ for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next)
+ datafiles++;
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ ssize_t cpus = (ssize_t)get_netdata_cpus() / (ssize_t)storage_tiers;
+ if(cpus > (ssize_t)datafiles)
+ cpus = (ssize_t)datafiles;
+
+ if(cpus > (ssize_t)libuv_worker_threads)
+ cpus = (ssize_t)libuv_worker_threads;
+
+ if(cpus >= (ssize_t)get_netdata_cpus() / 2)
+ cpus = (ssize_t)(get_netdata_cpus() / 2 - 1);
+
+ if(cpus < 1)
+ cpus = 1;
+
+ netdata_log_info("DBENGINE: populating retention to MRG from %zu journal files of tier %d, using %zd threads...", datafiles, ctx->config.tier, cpus);
+
+ if(datafiles > 2) {
+ struct rrdengine_datafile *datafile;
+
+ datafile = ctx->datafiles.first->prev;
+ if(!(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE))
+ datafile = datafile->prev;
+
+ if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) {
+ journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile);
+ datafile->populate_mrg.populated = true;
+ }
+
+ datafile = ctx->datafiles.first;
+ if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) {
+ journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile);
+ datafile->populate_mrg.populated = true;
+ }
+ }
+
+ ctx->loading.populate_mrg.size = cpus;
+ ctx->loading.populate_mrg.array = callocz(ctx->loading.populate_mrg.size, sizeof(struct completion));
+
+ for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) {
+ completion_init(&ctx->loading.populate_mrg.array[i]);
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_POPULATE_MRG, NULL, &ctx->loading.populate_mrg.array[i],
+ STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+ }
+}
+
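+// Wait for all MRG population commands dispatched by rrdeng_populate_mrg() to complete
+// and release their completions; after this the tier serves data collection and queries.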
+void rrdeng_readiness_wait(struct rrdengine_instance *ctx) {
+ for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) {
+ completion_wait_for(&ctx->loading.populate_mrg.array[i]);
+ completion_destroy(&ctx->loading.populate_mrg.array[i]);
+ }
+
+ freez(ctx->loading.populate_mrg.array);
+ ctx->loading.populate_mrg.array = NULL;
+ ctx->loading.populate_mrg.size = 0;
+
+ netdata_log_info("DBENGINE: tier %d is ready for data collection and queries", ctx->config.tier);
+}
+
+bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance) {
+ struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance;
+ return ctx->config.legacy;
+}
+
+void rrdeng_exit_mode(struct rrdengine_instance *ctx) {
+ __atomic_store_n(&ctx->quiesce.exit_mode, true, __ATOMIC_RELAXED);
+}
+/*
+ * Returns 0 on success, negative on error
+ */
+int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path,
+ unsigned disk_space_mb, size_t tier) {
+ struct rrdengine_instance *ctx;
+ uint32_t max_open_files;
+
+ max_open_files = rlimit_nofile.rlim_cur / 4;
+
+ /* reserve RRDENG_FD_BUDGET_PER_INSTANCE file descriptors for this instance */
+ rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, RRDENG_FD_BUDGET_PER_INSTANCE);
+ if (rrdeng_reserved_file_descriptors > max_open_files) {
+ netdata_log_error(
+ "Exceeded the budget of available file descriptors (%u/%u), cannot create new dbengine instance.",
+ (unsigned)rrdeng_reserved_file_descriptors,
+ (unsigned)max_open_files);
+
+ rrd_stat_atomic_add(&global_fs_errors, 1);
+ rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
+ return UV_EMFILE;
+ }
+
+ if(NULL == ctxp) {
+ ctx = multidb_ctx[tier];
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->config.legacy = false;
+ }
+ else {
+ *ctxp = ctx = callocz(1, sizeof(*ctx));
+ ctx->config.legacy = true;
+ }
+
+ ctx->config.tier = (int)tier;
+ ctx->config.page_type = tier_page_type[tier];
+ ctx->config.global_compress_alg = RRD_LZ4;
+ if (disk_space_mb < RRDENG_MIN_DISK_SPACE_MB)
+ disk_space_mb = RRDENG_MIN_DISK_SPACE_MB;
+ ctx->config.max_disk_space = disk_space_mb * 1048576LLU;
+ strncpyz(ctx->config.dbfiles_path, dbfiles_path, sizeof(ctx->config.dbfiles_path) - 1);
+ ctx->config.dbfiles_path[sizeof(ctx->config.dbfiles_path) - 1] = '\0';
+
+ ctx->atomic.transaction_id = 1;
+ ctx->quiesce.enabled = false;
+
+ rw_spinlock_init(&ctx->njfv2idx.spinlock);
+ ctx->atomic.first_time_s = LONG_MAX;
+
+ if (rrdeng_dbengine_spawn(ctx) && !init_rrd_files(ctx)) {
+ // success - we run this ctx too
+ rrdeng_populate_mrg(ctx);
+ return 0;
+ }
+
+ if (ctx->config.legacy) {
+ freez(ctx);
+ if (ctxp)
+ *ctxp = NULL;
+ }
+
+ rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
+ return UV_EIO;
+}
+
+size_t rrdeng_collectors_running(struct rrdengine_instance *ctx) {
+ return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED);
+}
+
+/*
+ * Returns 0 on success, 1 on error
+ */
+int rrdeng_exit(struct rrdengine_instance *ctx) {
+ if (NULL == ctx)
+ return 1;
+
+ // FIXME - ktsaou - properly cleanup ctx
+ // 1. make sure all collectors are stopped
+ // 2. make sure new queries will not be accepted (this is the quiesce step, which has already run)
+ // 3. flush this section of the main cache
+ // 4. then wait for completion
+
+ bool logged = false;
+ size_t count = 10;
+ while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && count && !unittest_running) {
+ if(!logged) {
+ netdata_log_info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier);
+ logged = true;
+ }
+ sleep_usec(100 * USEC_PER_MS);
+ count--;
+ }
+
+ netdata_log_info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier);
+ pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx);
+
+ netdata_log_info("DBENGINE: shutting down tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier);
+ struct completion completion = {};
+ completion_init(&completion);
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_SHUTDOWN, NULL, &completion, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL);
+ completion_wait_for(&completion);
+ completion_destroy(&completion);
+
+ finalize_rrd_files(ctx);
+
+ if(ctx->config.legacy)
+ freez(ctx);
+
+ rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
+ return 0;
+}
+
+void rrdeng_prepare_exit(struct rrdengine_instance *ctx) {
+ if (NULL == ctx)
+ return;
+
+ // FIXME - ktsaou - properly cleanup ctx
+ // 1. make sure all collectors are stopped
+
+ completion_init(&ctx->quiesce.completion);
+ rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_QUIESCE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
+}
+
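+// Walk a journal v2 file and accumulate size statistics: extents and their compressed
+// bytes, pages and points per page type, and the first/last timestamps covered.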
+static void populate_v2_statistics(struct rrdengine_datafile *datafile, RRDENG_SIZE_STATS *stats)
+{
+ struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0);
+ void *data_start = (void *)j2_header;
+
+ if(unlikely(!j2_header))
+ return;
+
+ stats->extents += j2_header->extent_count;
+
+ unsigned entries;
+ struct journal_extent_list *extent_list = (void *) (data_start + j2_header->extent_offset);
+ for (entries = 0; entries < j2_header->extent_count; entries++) {
+ stats->extents_compressed_bytes += extent_list->datafile_size;
+ stats->extents_pages += extent_list->pages;
+ extent_list++;
+ }
+
+ struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset);
+ time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC);
+
+ stats->metrics += j2_header->metric_count;
+ for (entries = 0; entries < j2_header->metric_count; entries++) {
+
+ struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset);
+ stats->metrics_pages += metric_list_header->entries;
+ struct journal_page_list *descr = (void *) (data_start + metric->page_offset + sizeof(struct journal_page_header));
+ for (uint32_t idx=0; idx < metric_list_header->entries; idx++) {
+
+ time_t update_every_s;
+
+ size_t points = descr->page_length / CTX_POINT_SIZE_BYTES(datafile->ctx);
+
+ time_t start_time_s = journal_start_time_s + descr->delta_start_s;
+ time_t end_time_s = journal_start_time_s + descr->delta_end_s;
+
+ if(likely(points > 1))
+ update_every_s = (time_t) ((end_time_s - start_time_s) / (points - 1));
+ else {
+ update_every_s = (time_t) (default_rrd_update_every * get_tier_grouping(datafile->ctx->config.tier));
+ stats->single_point_pages++;
+ }
+
+ time_t duration_s = (time_t)((end_time_s - start_time_s + update_every_s));
+
+ stats->pages_uncompressed_bytes += descr->page_length;
+ stats->pages_duration_secs += duration_s;
+ stats->points += points;
+
+ stats->page_types[descr->type].pages++;
+ stats->page_types[descr->type].pages_uncompressed_bytes += descr->page_length;
+ stats->page_types[descr->type].pages_duration_secs += duration_s;
+ stats->page_types[descr->type].points += points;
+
+ if(!stats->first_time_s || (start_time_s - update_every_s) < stats->first_time_s)
+ stats->first_time_s = (start_time_s - update_every_s);
+
+ if(!stats->last_time_s || end_time_s > stats->last_time_s)
+ stats->last_time_s = end_time_s;
+
+ descr++;
+ }
+ metric++;
+ }
+
+ journalfile_v2_data_release(datafile->journalfile);
+}
+
+RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) {
+ RRDENG_SIZE_STATS stats = { 0 };
+
+ uv_rwlock_rdlock(&ctx->datafiles.rwlock);
+ for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) {
+ stats.datafiles++;
+ populate_v2_statistics(df, &stats);
+ }
+ uv_rwlock_rdunlock(&ctx->datafiles.rwlock);
+
+ stats.currently_collected_metrics = __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED);
+
+ internal_error(stats.metrics_pages != stats.extents_pages + stats.currently_collected_metrics,
+ "DBENGINE: metrics pages is %zu, but extents pages is %zu and API consumers is %zu",
+ stats.metrics_pages, stats.extents_pages, stats.currently_collected_metrics);
+
+ stats.disk_space = ctx_current_disk_space_get(ctx);
+ stats.max_disk_space = ctx->config.max_disk_space;
+
+ stats.database_retention_secs = (time_t)(stats.last_time_s - stats.first_time_s);
+
+ if(stats.extents_pages)
+ stats.average_page_size_bytes = (double)stats.pages_uncompressed_bytes / (double)stats.extents_pages;
+
+ if(stats.pages_uncompressed_bytes > 0)
+ stats.average_compression_savings = 100.0 - ((double)stats.extents_compressed_bytes * 100.0 / (double)stats.pages_uncompressed_bytes);
+
+ if(stats.points)
+ stats.average_point_duration_secs = (double)stats.pages_duration_secs / (double)stats.points;
+
+ if(stats.metrics) {
+ stats.average_metric_retention_secs = (double)stats.pages_duration_secs / (double)stats.metrics;
+
+ if(stats.database_retention_secs) {
+ double metric_coverage = stats.average_metric_retention_secs / (double)stats.database_retention_secs;
+ double db_retention_days = (double)stats.database_retention_secs / 86400.0;
+
+ stats.estimated_concurrently_collected_metrics = stats.metrics * metric_coverage;
+
+ stats.ephemeral_metrics_per_day_percent = ((double)stats.metrics * 100.0 / (double)stats.estimated_concurrently_collected_metrics - 100.0) / (double)db_retention_days;
+ }
+ }
+
+// stats.sizeof_metric = 0;
+ stats.sizeof_datafile = struct_natural_alignment(sizeof(struct rrdengine_datafile)) + struct_natural_alignment(sizeof(struct rrdengine_journalfile));
+ stats.sizeof_page_in_cache = 0; // struct_natural_alignment(sizeof(struct page_cache_descr));
+ stats.sizeof_point_data = page_type_size[ctx->config.page_type];
+ stats.sizeof_page_data = tier_page_size[ctx->config.tier];
+ stats.pages_per_extent = rrdeng_pages_per_extent;
+
+// stats.sizeof_metric_in_index = 40;
+// stats.sizeof_page_in_index = 24;
+
+ stats.default_granularity_secs = (size_t)default_rrd_update_every * get_tier_grouping(ctx->config.tier);
+
+ return stats;
+}
+
+struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void) {
+ // FIXME - make cache efficiency stats atomic
+ return rrdeng_cache_efficiency_stats;
+}
diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h
new file mode 100644
index 00000000..7ae0e707
--- /dev/null
+++ b/database/engine/rrdengineapi.h
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_RRDENGINEAPI_H
+#define NETDATA_RRDENGINEAPI_H
+
+#include "rrdengine.h"
+
+#define RRDENG_MIN_PAGE_CACHE_SIZE_MB (8)
+#define RRDENG_MIN_DISK_SPACE_MB (64)
+
+#define RRDENG_NR_STATS (38)
+
+#define RRDENG_FD_BUDGET_PER_INSTANCE (50)
+
+extern int default_rrdeng_page_cache_mb;
+extern int default_rrdeng_extent_cache_mb;
+extern int db_engine_journal_check;
+extern int default_rrdeng_disk_quota_mb;
+extern int default_multidb_disk_quota_mb;
+extern struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS];
+extern size_t page_type_size[];
+extern size_t tier_page_size[];
+extern uint8_t tier_page_type[];
+
+#define CTX_POINT_SIZE_BYTES(ctx) page_type_size[(ctx)->config.page_type]
+
+void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_t *ret_uuid);
+
+STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance);
+STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid);
+void rrdeng_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle);
+STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle);
+
+STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg);
+void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle);
+void rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every);
+void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, NETDATA_DOUBLE n,
+ NETDATA_DOUBLE min_value,
+ NETDATA_DOUBLE max_value,
+ uint16_t count,
+ uint16_t anomaly_count,
+ SN_FLAGS flags);
+int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle);
+
+void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrddim_handle,
+ time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority);
+STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim_handle);
+
+
+int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle);
+void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle);
+time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle);
+time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle);
+time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle);
+
+void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long *array);
+
+/* must call once before using anything */
+int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path,
+ unsigned disk_space_mb, size_t tier);
+
+void rrdeng_readiness_wait(struct rrdengine_instance *ctx);
+void rrdeng_exit_mode(struct rrdengine_instance *ctx);
+
+int rrdeng_exit(struct rrdengine_instance *ctx);
+void rrdeng_prepare_exit(struct rrdengine_instance *ctx);
+bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s);
+
+extern STORAGE_METRICS_GROUP *rrdeng_metrics_group_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid);
+extern void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg);
+
+typedef struct rrdengine_size_statistics {
+ size_t default_granularity_secs;
+
+ size_t sizeof_datafile;
+ size_t sizeof_page_in_cache;
+ size_t sizeof_point_data;
+ size_t sizeof_page_data;
+
+ size_t pages_per_extent;
+
+ size_t datafiles;
+ size_t extents;
+ size_t extents_pages;
+ size_t points;
+ size_t metrics;
+ size_t metrics_pages;
+
+ size_t extents_compressed_bytes;
+ size_t pages_uncompressed_bytes;
+ time_t pages_duration_secs;
+
+ struct {
+ size_t pages;
+ size_t pages_uncompressed_bytes;
+ time_t pages_duration_secs;
+ size_t points;
+ } page_types[256];
+
+ size_t single_point_pages;
+
+ time_t first_time_s;
+ time_t last_time_s;
+
+ size_t currently_collected_metrics;
+ size_t estimated_concurrently_collected_metrics;
+
+ size_t disk_space;
+ size_t max_disk_space;
+
+ time_t database_retention_secs;
+ double average_compression_savings;
+ double average_point_duration_secs;
+ double average_metric_retention_secs;
+
+ double ephemeral_metrics_per_day_percent;
+
+ double average_page_size_bytes;
+} RRDENG_SIZE_STATS;
+
+struct rrdeng_cache_efficiency_stats {
+ size_t queries;
+ size_t queries_planned_with_gaps;
+ size_t queries_executed_with_gaps;
+ size_t queries_open;
+ size_t queries_journal_v2;
+
+ size_t currently_running_queries;
+
+ // query planner output of the queries
+ size_t pages_total;
+ size_t pages_to_load_from_disk;
+ size_t extents_loaded_from_disk;
+
+ // pages metadata sources
+ size_t pages_meta_source_main_cache;
+ size_t pages_meta_source_open_cache;
+ size_t pages_meta_source_journal_v2;
+
+ // preloading
+ size_t page_next_wait_failed;
+ size_t page_next_wait_loaded;
+ size_t page_next_nowait_failed;
+ size_t page_next_nowait_loaded;
+
+ // pages data sources
+ size_t pages_data_source_main_cache;
+ size_t pages_data_source_main_cache_at_pass4;
+ size_t pages_data_source_disk;
+ size_t pages_data_source_extent_cache; // loaded by a cached extent
+
+ // cache hits at different points
+ size_t pages_load_ok_loaded_but_cache_hit_while_inserting; // found in cache while inserting it (conflict)
+
+ // loading
+ size_t pages_load_extent_merged;
+ size_t pages_load_ok_uncompressed;
+ size_t pages_load_ok_compressed;
+ size_t pages_load_fail_invalid_page_in_extent;
+ size_t pages_load_fail_cant_mmap_extent;
+ size_t pages_load_fail_datafile_not_available;
+ size_t pages_load_fail_unroutable;
+ size_t pages_load_fail_not_found;
+ size_t pages_load_fail_invalid_extent;
+ size_t pages_load_fail_cancelled;
+
+ // timings for query preparation
+ size_t prep_time_to_route;
+ size_t prep_time_in_main_cache_lookup;
+ size_t prep_time_in_open_cache_lookup;
+ size_t prep_time_in_journal_v2_lookup;
+ size_t prep_time_in_pass4_lookup;
+
+ // timings the query thread experiences
+ size_t query_time_init;
+ size_t query_time_wait_for_prep;
+ size_t query_time_to_slow_disk_next_page;
+ size_t query_time_to_fast_disk_next_page;
+ size_t query_time_to_slow_preload_next_page;
+ size_t query_time_to_fast_preload_next_page;
+
+ // query issues
+ size_t pages_zero_time_skipped;
+ size_t pages_past_time_skipped;
+ size_t pages_overlapping_skipped;
+ size_t pages_invalid_size_skipped;
+ size_t pages_invalid_update_every_fixed;
+ size_t pages_invalid_entries_fixed;
+
+ // database events
+ size_t journal_v2_mapped;
+ size_t journal_v2_unmapped;
+ size_t datafile_creation_started;
+ size_t datafile_deletion_started;
+ size_t datafile_deletion_spin;
+ size_t journal_v2_indexing_started;
+ size_t metrics_retention_started;
+};
+
+struct rrdeng_buffer_sizes {
+ size_t workers;
+ size_t pdc;
+ size_t wal;
+ size_t descriptors;
+ size_t xt_io;
+ size_t xt_buf;
+ size_t handles;
+ size_t opcodes;
+ size_t epdl;
+ size_t deol;
+ size_t pd;
+ size_t pgc;
+ size_t mrg;
+#ifdef PDC_USE_JULYL
+ size_t julyl;
+#endif
+};
+
+struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void);
+struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void);
+
+RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx);
+size_t rrdeng_collectors_running(struct rrdengine_instance *ctx);
+bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance);
+
+uint64_t rrdeng_disk_space_max(STORAGE_INSTANCE *db_instance);
+uint64_t rrdeng_disk_space_used(STORAGE_INSTANCE *db_instance);
+
+#endif /* NETDATA_RRDENGINEAPI_H */
diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c
new file mode 100644
index 00000000..dc581d98
--- /dev/null
+++ b/database/engine/rrdenginelib.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+#include "rrdengine.h"
+
+int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size)
+{
+ int ret;
+ uv_fs_t req;
+ uv_stat_t* s;
+
+ ret = uv_fs_fstat(NULL, &req, file, NULL);
+ if (ret < 0) {
+ fatal("uv_fs_fstat: %s\n", uv_strerror(ret));
+ }
+ fatal_assert(req.result == 0);
+ s = req.ptr;
+ if (!(s->st_mode & S_IFREG)) {
+ netdata_log_error("Not a regular file.\n");
+ uv_fs_req_cleanup(&req);
+ return UV_EINVAL;
+ }
+ if (s->st_size < min_size) {
+ netdata_log_error("File length is too short.\n");
+ uv_fs_req_cleanup(&req);
+ return UV_EINVAL;
+ }
+ *file_size = s->st_size;
+ uv_fs_req_cleanup(&req);
+
+ return 0;
+}
+
+/**
+ * Open file for I/O.
+ *
+ * @param path The full path of the file.
+ * @param flags Same flags as the open() system call uses.
+ * @param file On success sets (*file) to be the uv_file that was opened.
+ * @param direct Tries to open a file in direct I/O mode when direct=1, falls back to buffered mode if not possible.
+ * @return Returns a UV error number (< 0) on failure, or a non-negative value on success.
+ */
+int open_file_for_io(char *path, int flags, uv_file *file, int direct)
+{
+ uv_fs_t req;
+ int fd = -1, current_flags;
+
+ fatal_assert(0 == direct || 1 == direct);
+ for ( ; direct >= 0 ; --direct) {
+#ifdef __APPLE__
+ /* Apple OS does not support O_DIRECT */
+ direct = 0;
+#endif
+ current_flags = flags;
+ if (direct) {
+ current_flags |= O_DIRECT;
+ }
+ fd = uv_fs_open(NULL, &req, path, current_flags, S_IRUSR | S_IWUSR, NULL);
+ if (fd < 0) {
+ if ((direct) && (UV_EINVAL == fd)) {
+ netdata_log_error("File \"%s\" does not support direct I/O, falling back to buffered I/O.", path);
+ } else {
+ netdata_log_error("Failed to open file \"%s\".", path);
+ --direct; /* break the loop */
+ }
+ } else {
+ fatal_assert(req.result >= 0);
+ *file = req.result;
+#ifdef __APPLE__
+ netdata_log_info("Disabling OS X caching for file \"%s\".", path);
+ fcntl(fd, F_NOCACHE, 1);
+#endif
+ --direct; /* break the loop */
+ }
+ uv_fs_req_cleanup(&req);
+ }
+
+ return fd;
+}
+
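+// A "legacy" child is a host with its own per-host dbengine directory under the cache
+// directory (the pre-multihost layout); the unit-test GUIDs are always treated as legacy.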
+int is_legacy_child(const char *machine_guid)
+{
+ uuid_t uuid;
+ char dbengine_file[FILENAME_MAX+1];
+
+ if (unlikely(!strcmp(machine_guid, "unittest-dbengine") || !strcmp(machine_guid, "dbengine-dataset") ||
+ !strcmp(machine_guid, "dbengine-stress-test"))) {
+ return 1;
+ }
+ if (!uuid_parse(machine_guid, uuid)) {
+ uv_fs_t stat_req;
+ snprintfz(dbengine_file, FILENAME_MAX, "%s/%s/dbengine", netdata_configured_cache_dir, machine_guid);
+ int rc = uv_fs_stat(NULL, &stat_req, dbengine_file, NULL);
+ if (likely(rc == 0 && ((stat_req.statbuf.st_mode & S_IFMT) == S_IFDIR))) {
+ //netdata_log_info("Found legacy engine folder \"%s\"", dbengine_file);
+ return 1;
+ }
+ }
+ return 0;
+}
+
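+// Count the legacy per-host dbengine directories found under dbfiles_path.
+// Returns a negative UV error number if the directory cannot be scanned.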
+int count_legacy_children(char *dbfiles_path)
+{
+ int ret;
+ uv_fs_t req;
+ uv_dirent_t dent;
+ int legacy_engines = 0;
+
+ ret = uv_fs_scandir(NULL, &req, dbfiles_path, 0, NULL);
+ if (ret < 0) {
+ uv_fs_req_cleanup(&req);
+ netdata_log_error("uv_fs_scandir(%s): %s", dbfiles_path, uv_strerror(ret));
+ return ret;
+ }
+
+ while(UV_EOF != uv_fs_scandir_next(&req, &dent)) {
+ if (dent.type == UV_DIRENT_DIR) {
+ if (is_legacy_child(dent.name))
+ legacy_engines++;
+ }
+ }
+ uv_fs_req_cleanup(&req);
+ return legacy_engines;
+}
+
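+// Determine the disk quota (in MiB) for the multi-host dbengine instance. The value is
+// cached in the dbengine_multihost_size file under the varlib directory; when missing or
+// invalid, it is recomputed as (legacy children + 1) * default_rrdeng_disk_quota_mb and
+// written back.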
+int compute_multidb_diskspace(void)
+{
+ char multidb_disk_space_file[FILENAME_MAX + 1];
+ FILE *fp;
+ int computed_multidb_disk_quota_mb = -1;
+
+ snprintfz(multidb_disk_space_file, FILENAME_MAX, "%s/dbengine_multihost_size", netdata_configured_varlib_dir);
+ fp = fopen(multidb_disk_space_file, "r");
+ if (likely(fp)) {
+ int rc = fscanf(fp, "%d", &computed_multidb_disk_quota_mb);
+ fclose(fp);
+ if (unlikely(rc != 1 || computed_multidb_disk_quota_mb < RRDENG_MIN_DISK_SPACE_MB)) {
+ errno = 0;
+ netdata_log_error("File '%s' contains invalid input, it will be rebuild", multidb_disk_space_file);
+ computed_multidb_disk_quota_mb = -1;
+ }
+ }
+
+ if (computed_multidb_disk_quota_mb == -1) {
+ int rc = count_legacy_children(netdata_configured_cache_dir);
+ if (likely(rc >= 0)) {
+ computed_multidb_disk_quota_mb = (rc + 1) * default_rrdeng_disk_quota_mb;
+ netdata_log_info("Found %d legacy dbengines, setting multidb diskspace to %dMB", rc, computed_multidb_disk_quota_mb);
+
+ fp = fopen(multidb_disk_space_file, "w");
+ if (likely(fp)) {
+ fprintf(fp, "%d", computed_multidb_disk_quota_mb);
+ netdata_log_info("Created file '%s' to store the computed value", multidb_disk_space_file);
+ fclose(fp);
+ } else
+ netdata_log_error("Failed to store the default multidb disk quota size on '%s'", multidb_disk_space_file);
+ }
+ else
+ computed_multidb_disk_quota_mb = default_rrdeng_disk_quota_mb;
+ }
+
+ return computed_multidb_disk_quota_mb;
+}
diff --git a/database/engine/rrdenginelib.h b/database/engine/rrdenginelib.h
new file mode 100644
index 00000000..a0febd4f
--- /dev/null
+++ b/database/engine/rrdenginelib.h
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_RRDENGINELIB_H
+#define NETDATA_RRDENGINELIB_H
+
+#include "libnetdata/libnetdata.h"
+
+/* Forward declarations */
+struct rrdengine_instance;
+
+#define ALIGN_BYTES_FLOOR(x) (((x) / RRDENG_BLOCK_SIZE) * RRDENG_BLOCK_SIZE)
+#define ALIGN_BYTES_CEILING(x) ((((x) + RRDENG_BLOCK_SIZE - 1) / RRDENG_BLOCK_SIZE) * RRDENG_BLOCK_SIZE)
+
+typedef uintptr_t rrdeng_stats_t;
+
+#ifdef __ATOMIC_RELAXED
+#define rrd_atomic_fetch_add(p, n) __atomic_fetch_add(p, n, __ATOMIC_RELAXED)
+#define rrd_atomic_add_fetch(p, n) __atomic_add_fetch(p, n, __ATOMIC_RELAXED)
+#else
+#define rrd_atomic_fetch_add(p, n) __sync_fetch_and_add(p, n)
+#define rrd_atomic_add_fetch(p, n) __sync_add_and_fetch(p, n)
+#endif
+
+#define rrd_stat_atomic_add(p, n) rrd_atomic_fetch_add(p, n)
+
+/* returns -1 if it didn't find the first cleared bit, the position otherwise. Starts from LSB. */
+static inline int find_first_zero(unsigned x)
+{
+ return ffs((int)(~x)) - 1;
+}
+
+/* Starts from LSB. */
+static inline uint8_t check_bit(unsigned x, size_t pos)
+{
+ return !!(x & (1 << pos));
+}
+
+/* Starts from LSB. val is 0 or 1 */
+static inline void modify_bit(unsigned *x, unsigned pos, uint8_t val)
+{
+ switch(val) {
+ case 0:
+ *x &= ~(1U << pos);
+ break;
+ case 1:
+ *x |= 1U << pos;
+ break;
+ default:
+ netdata_log_error("modify_bit() called with invalid argument.");
+ break;
+ }
+}
+
+#define RRDENG_PATH_MAX (FILENAME_MAX + 1)
+
+/* returns old *ptr value */
+static inline unsigned long ulong_compare_and_swap(volatile unsigned long *ptr,
+ unsigned long oldval, unsigned long newval)
+{
+ return __sync_val_compare_and_swap(ptr, oldval, newval);
+}
+
+#ifndef O_DIRECT
+/* Workaround for OS X */
+#define O_DIRECT (0)
+#endif
+
+static inline int crc32cmp(void *crcp, uLong crc)
+{
+ uint32_t loaded_crc;
+ memcpy(&loaded_crc, crcp, sizeof(loaded_crc));
+ return (loaded_crc != crc);
+}
+
+static inline void crc32set(void *crcp, uLong crc)
+{
+ uint32_t store_crc = (uint32_t) crc;
+ memcpy(crcp, &store_crc, sizeof(store_crc));
+}
+
+int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size);
+int open_file_for_io(char *path, int flags, uv_file *file, int direct);
+static inline int open_file_direct_io(char *path, int flags, uv_file *file)
+{
+ return open_file_for_io(path, flags, file, 1);
+}
+static inline int open_file_buffered_io(char *path, int flags, uv_file *file)
+{
+ return open_file_for_io(path, flags, file, 0);
+}
+int compute_multidb_diskspace(void);
+int is_legacy_child(const char *machine_guid);
+
+#endif /* NETDATA_RRDENGINELIB_H */