author    Daniel Baumann <daniel.baumann@progress-linux.org>  2023-02-06 16:11:30 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2023-02-06 16:11:30 +0000
commit    aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7 (patch)
tree      941cbdd387b41c1a81587c20a6df9f0e5e0ff7ab /database/engine
parent    Adding upstream version 1.37.1. (diff)
download  netdata-aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7.tar.xz
          netdata-aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7.zip

Adding upstream version 1.38.0. (tag: upstream/1.38.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'database/engine')
-rw-r--r-- | database/engine/README.md            |  303
-rw-r--r-- | database/engine/cache.c              | 2737
-rw-r--r-- | database/engine/cache.h              |  249
-rw-r--r-- | database/engine/datafile.c           |  415
-rw-r--r-- | database/engine/datafile.h           |   76
-rw-r--r-- | database/engine/datafile.ksy         |   74
-rw-r--r-- | database/engine/dbengine-diagram.xml |    1
-rw-r--r-- | database/engine/journalfile.c        | 1445
-rw-r--r-- | database/engine/journalfile.h        |  150
-rw-r--r-- | database/engine/journalfile.ksy      |  144
-rw-r--r-- | database/engine/metric.c             |  875
-rw-r--r-- | database/engine/metric.h             |   79
-rw-r--r-- | database/engine/pagecache.c          | 2054
-rw-r--r-- | database/engine/pagecache.h          |  242
-rw-r--r-- | database/engine/pdc.c                | 1282
-rw-r--r-- | database/engine/pdc.h                |   67
-rw-r--r-- | database/engine/rrdengine.c          | 2634
-rw-r--r-- | database/engine/rrdengine.h          |  569
-rwxr-xr-x | database/engine/rrdengineapi.c       | 1683
-rw-r--r-- | database/engine/rrdengineapi.h       |  153
-rw-r--r-- | database/engine/rrdenginelib.c       |  208
-rw-r--r-- | database/engine/rrdenginelib.h       |    4
-rw-r--r-- | database/engine/rrdenglocking.c      |  241
-rw-r--r-- | database/engine/rrdenglocking.h      |   17
24 files changed, 11284 insertions, 4418 deletions
diff --git a/database/engine/README.md b/database/engine/README.md index c67e400f4..664d40506 100644 --- a/database/engine/README.md +++ b/database/engine/README.md @@ -1,48 +1,126 @@ <!-- title: "Database engine" description: "Netdata's highly-efficient database engine use both RAM and disk for distributed, long-term storage of per-second metrics." -custom_edit_url: https://github.com/netdata/netdata/edit/master/database/engine/README.md +custom_edit_url: "https://github.com/netdata/netdata/edit/master/database/engine/README.md" +sidebar_label: "Database engine" +learn_status: "Published" +learn_topic_type: "Concepts" +learn_rel_path: "Concepts" --> -# Database engine +# DBENGINE -The Database Engine works like a traditional time series database. Unlike other [database modes](/database/README.md), -the amount of historical metrics stored is based on the amount of disk space you allocate and the effective compression -ratio, not a fixed number of metrics collected. +DBENGINE is the time-series database of Netdata. -## Tiering +## Design -Tiering is a mechanism of providing multiple tiers of data with -different [granularity on metrics](/docs/store/distributed-data-architecture.md#granularity-of-metrics). +### Data Points -For Netdata Agents with version `netdata-1.35.0.138.nightly` and greater, `dbengine` supports Tiering, allowing almost -unlimited retention of data. +**Data points** represent the collected values of metrics. +A **data point** has: -### Metric size +1. A **value**, the data collected for a metric. There is a special **value** to indicate that the collector failed to collect a valid value, and thus the data point is a **gap**. +2. A **timestamp**, the time it has been collected. +3. A **duration**, the time between this and the previous data collection. +4. A flag which is set when machine-learning categorized the collected value as **anomalous** (an outlier based on the trained models). -Every Tier down samples the exact lower tier (lower tiers have greater resolution). You can have up to 5 -Tiers **[0. . 4]** of data (including the Tier 0, which has the highest resolution) +Using the **timestamp** and **duration**, Netdata calculates for each point its **start time**, **end time** and **update every**. -Tier 0 is the default that was always available in `dbengine` mode. Tier 1 is the first level of aggregation, Tier 2 is -the second, and so on. +For incremental metrics (counters), Netdata interpolates the collected values to align them to the expected **end time** at the microsecond level, absorbing data collection micro-latencies. -Metrics on all tiers except of the _Tier 0_ also store the following five additional values for every point for accurate -representation: +When data points are stored in higher tiers (time aggregations - see [Tiers](#Tiers) below), each data point has: -1. The `sum` of the points aggregated -2. The `min` of the points aggregated -3. The `max` of the points aggregated -4. The `count` of the points aggregated (could be constant, but it may not be due to gaps in data collection) -5. The `anomaly_count` of the points aggregated (how many of the aggregated points found anomalous) +1. The **sum** of the original values that have been aggregated, +2. The **count** of all the original values aggregated, +3. The **minimum** value among them, +4. The **maximum** value among them, +5. Their **anomaly rate**, i.e. the count of values that were detected as outliers based on the currently trained models for the metric, +6. 
A **timestamp**, which is the equal to the **end time** of the last point aggregated, +7. A **duration**, which is the duration between the **first time** of the first point aggregated to the **end time** of the last point aggregated. -Among `min`, `max` and `sum`, the correct value is chosen based on the user query. `average` is calculated on the fly at -query time. +This design allows Netdata to accurately know the **average**, **minimum**, **maximum** and **anomaly rate** values even when using higher tiers to satisfy a query. -### Tiering in a nutshell +### Pages +Data points are organized into **pages**, i.e. segments of contiguous data collections of the same metric. -The `dbengine` is capable of retaining metrics for years. To further understand the `dbengine` tiering mechanism let's -explore the following configuration. +Each page: + +1. Contains contiguous **data points** of a single metric. +2. Contains **data points** having the same **update every**. If a metric changes **update every** on the fly, the page is flushed and a new one with the new **update every** is created. If a data collection is missed, a **gap point** is inserted into the page, so that the data points in a page remain contiguous. +3. Has a **start time**, which is equivalent to the **end time** of the first data point stored into it, +4. Has an **end time**, which is equal to the **end time** of the last data point stored into it, +5. Has an **update every**, common for all points in the page. + +A **page** is a simple array of values. Each slot in the array has a **timestamp** implied by its position in the array, and each value stored represents the **data point** for that time, for the metric the page belongs to. + +This simple fixed step page design allows Netdata to collect several millions of points per second and pack all the values in a compact form with minimal metadata overhead. + +#### Hot Pages + +While a metric is collected, there is one **hot page** in memory for each of the configured tiers. Values collected for a metric are appended to its **hot page** until that page becomes full. + +#### Dirty Pages + +Once a **hot page** is full, it becomes a **dirty page**, and it is scheduled for immediate **flushing** (saving) to disk. + +#### Clean Pages + +Flushed (saved) pages are **clean pages**, i.e. read-only pages that reside primarily on disk, and are loaded on demand to satisfy data queries. 
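To make the fixed-step page layout concrete, here is a minimal sketch. It is not Netdata's actual structure; the `example_page` type and helper names are hypothetical, and the slot count and value type follow the Tier 0 numbers in the table below.

```c
#include <stdint.h>
#include <time.h>

// Hypothetical fixed-step page: the timestamp of each slot is implied by
// its position in the array, so only the values themselves are stored.
struct example_page {
    time_t   start_time_s;    // end time of the first data point stored
    uint32_t update_every_s;  // common step for all points in the page
    uint32_t used;            // number of slots filled so far
    float    values[1024];    // tier 0: 4 bytes per point, 1024 points per page
};

// Timestamp implied by a slot's position in the array.
static time_t page_slot_time(const struct example_page *p, uint32_t slot) {
    return p->start_time_s + (time_t)slot * p->update_every_s;
}

// Slot a given collection time maps to (assumes t falls inside the page).
static uint32_t page_slot_for_time(const struct example_page *p, time_t t) {
    return (uint32_t)((t - p->start_time_s) / p->update_every_s);
}
```

Because the step is fixed, locating the value for a given timestamp is a constant-time array index, which is what keeps the per-point metadata overhead minimal.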
+ +#### Pages Configuration + +Pages are configured like this: + +| Attribute | Tier0 | Tier1 | Tier2 | +|---------------------------------------------------------------------------------------|:-------------------------------------:|:---------------------------------------------------------------:|:---------------------------------------------------------------:| +| Point Size in Memory, in Bytes | 4 | 16 | 16 | +| Point Size on Disk, in Bytes<br/><small>after LZ4 compression, on the average</small> | 1 | 4 | 4 | +| Page Size in Bytes | 4096<br/><small>2048 in 32bit</small> | 2048<br/><small>1024 in 32bit</small> | 384<br/><small>192 in 32bit</small> | +| Collections per Point | 1 | 60x Tier0<br/><small>configurable in<br/>`netdata.conf`</small> | 60x Tier1<br/><small>configurable in<br/>`netdata.conf`</small> | +| Points per Page | 1024<br/><small>512 in 32bit</small> | 128<br/><small>64 in 32bit</small> | 24<br/><small>12 in 32bit</small> | + +### Files + +To minimize the amount of data written to disk and the amount of storage required for storing metrics, Netdata aggregates up to 64 **dirty pages** of independent metrics, packs them all together into one bigger buffer, compresses this buffer with LZ4 (about 75% savings on the average) and commits a transaction to the disk files. + +#### Extents + +This collection of 64 pages that is packed and compressed together is called an **extent**. Netdata tries to store together, in the same **extent**, metrics that are meant to be "close". Dimensions of the same chart are such. They are usually queried together, so it is beneficial to have them in the same **extent** to read all of them at once at query time. + +#### Datafiles + +Multiple **extents** are appended to **datafiles** (filename suffix `.ndf`), until these **datafiles** become full. The size of each **datafile** is determined automatically by Netdata. The minimum for each **datafile** is 4MB and the maximum 512MB. Depending on the amount of disk space configured for each tier, Netdata will decide a **datafile** size trying to maintain about 50 datafiles for the whole database, within the limits mentioned (4MB min, 512MB max per file). The maximum number of datafiles supported is 65536, and therefore the maximum database size (per tier) that Netdata can support is 32TB. + +#### Journal Files + +Each **datafile** has two **journal files** with metadata related to the stored data in the **datafile**. + +- **journal file v1**, with filename suffix `.njf`, holds information about the transactions in its **datafile** and provides the ability to recover as much data as possible, in case either the datafile or the journal files get corrupted. This journal file has a maximum transaction size of 4KB, so in case data are corrupted on disk transactions of 4KB are lost. Each transaction holds the metadata of one **extent** (this is why DBENGINE supports up to 64 pages per extent). + +- **journal file v2**, with filename suffix `.njfv2`, which is a disk-based index for all the **pages** and **extents**. This file is memory mapped at runtime and is consulted to find where the data of a metric are in the datafile. This journal file is automatically re-created from **journal file v1** if it is missing. It is safe to delete these files (when Netdata does not run). Netdata will re-create them on the next run. Journal files v2 are supported in Netdata Agents with version `netdata-1.37.0-115-nightly`. Older versions maintain the journal index in memory. 
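As a rough sketch of the **extent** pack-then-compress flow described above, assuming liblz4 is available; the `pack_extent` helper and its buffer handling are illustrative only, not Netdata's actual datafile code.

```c
#include <string.h>
#include <stdlib.h>
#include <lz4.h>

// Illustrative only: concatenate up to 64 page buffers into one extent
// buffer and LZ4-compress it, as it would be before being appended to a
// datafile. Returns the compressed size, or -1 on failure.
static int pack_extent(const char *pages[], const size_t sizes[], int npages,
                       char **out, int *out_capacity) {
    size_t total = 0;
    for (int i = 0; i < npages && i < 64; i++)
        total += sizes[i];

    char *uncompressed = malloc(total);
    if (!uncompressed) return -1;

    size_t offset = 0;
    for (int i = 0; i < npages && i < 64; i++) {
        memcpy(uncompressed + offset, pages[i], sizes[i]);
        offset += sizes[i];
    }

    *out_capacity = LZ4_compressBound((int)total);
    *out = malloc((size_t)*out_capacity);
    if (!*out) { free(uncompressed); return -1; }

    int compressed = LZ4_compress_default(uncompressed, *out, (int)total, *out_capacity);
    free(uncompressed);
    return compressed > 0 ? compressed : -1;
}
```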
+ +#### Database Rotation + +Database rotation is achieved by deleting the oldest **datafile** (and its journals) and creating a new one (with its journals). + +Data on disk are append-only. There is no way to delete, add, or update data in the middle of the database. If data are not useful for whatever reason, Netdata can be instructed to ignore these data. They will eventually be deleted from disk when the database is rotated. New data are always appended. + +#### Tiers + +Tiers are supported in Netdata Agents with version `netdata-1.35.0.138.nightly` and greater. + +**datafiles** and **journal files** are organized in **tiers**. All tiers share the same metrics and same collected values. + +- **tier 0** is the high resolution tier that stores the collected data at the frequency they are collected. +- **tier 1** by default aggregates 60 values of **tier 0**. +- **tier 2** by default aggregates 60 values of **tier 1**, or 3600 values of **tier 0**. + +Updating the higher **tiers** is automated, and it happens in real-time while data are being collected for **tier 0**. + +When the Netdata Agent starts, during the first data collection of each metric, higher tiers are automatically **backfilled** with data from lower tiers, so that the aggregation they provide will be accurate. + +3 tiers are enabled by default in Netdata, with the following configuration: ``` [db] @@ -51,46 +129,151 @@ explore the following configuration. # per second data collection update every = 1 - # enables Tier 1 and Tier 2, Tier 0 is always enabled in dbengine mode + # number of tiers used (1 to 5, 3 being default) storage tiers = 3 - # Tier 0, per second data for a week - dbengine multihost disk space MB = 1100 + # Tier 0, per second data + dbengine multihost disk space MB = 256 - # Tier 1, per minute data for a month - dbengine tier 1 multihost disk space MB = 330 + # Tier 1, per minute data + dbengine tier 1 multihost disk space MB = 128 + + # Tier 2, per hour data + dbengine tier 2 multihost disk space MB = 64 +``` + +The exact retention that can be achieved by each tier depends on the number of metrics collected. The more the metrics, the smaller the retention that will fit in a given size. The general rule is that Netdata needs about **1 byte per data point on disk for tier 0**, and **4 bytes per data point on disk for tier 1 and above**. + +So, for 1000 metrics collected per second and 256 MB for tier 0, Netdata will store about: + +``` +256MB on disk / 1 byte per point / 1000 metrics => 256k points per metric / 86400 seconds per day = about 3 days +``` + +At tier 1 (per minute): + +``` +128MB on disk / 4 bytes per point / 1000 metrics => 32k points per metric / (24 hours * 60 minutes) = about 22 days +``` + +At tier 2 (per hour): + +``` +64MB on disk / 4 bytes per point / 1000 metrics => 16k points per metric / 24 hours per day = about 2 years +``` + +Of course double the metrics, half the retention. There are more factors that affect retention. The number of ephemeral metrics (i.e. metrics that are collected for part of the time). The number of metrics that are usually constant over time (affecting compression efficiency). The number of restarts a Netdata Agents gets through time (because it has to break pages prematurely, increasing the metadata overhead). But the actual numbers should not deviate significantly from the above. + +### Data Loss + +Until **hot pages** and **dirty pages** are **flushed** to disk they are at risk (e.g. due to a crash, or +power failure), as they are stored only in memory. 
+ +The supported way of ensuring high data availability is the use of Netdata Parents to stream the data in real-time to +multiple other Netdata agents. + +## Memory Requirements + +DBENGINE memory is related to the number of metrics concurrently being collected, the retention of the metrics on disk in relation with the queries running, and the number of metrics for which retention is maintained. + +### Memory for concurrently collected metrics + +DBENGINE is automatically sized to use memory according to this equation: + +``` +memory in KiB = METRICS x (TIERS - 1) x 4KiB x 2 + 32768 KiB +``` + +Where: +- `METRICS`: the maximum number of concurrently collected metrics (dimensions) from the time the agent started. +- `TIERS`: the number of storage tiers configured, by default 3 ( `-1` when using 3+ tiers) +- `x 2`, to accommodate room for flushing data to disk +- `x 4KiB`, the data segment size of each metric +- `+ 32768 KiB`, 32 MB for operational caches + +So, for 2000 metrics (dimensions) in 3 storage tiers: + +``` +memory for 2k metrics = 2000 x (3 - 1) x 4 KiB x 2 + 32768 KiB = 64 MiB +``` + +For 100k concurrently collected metrics in 3 storage tiers: + +``` +memory for 100k metrics = 100000 x (3 - 1) x 4 KiB x 2 + 32768 KiB = 1.6 GiB +``` + +#### Exceptions + +Netdata has several protection mechanisms to prevent the use of more memory (than the above), by incrementally fetching data from disk and aggressively evicting old data to make room for new data, but still memory may grow beyond the above limit under the following conditions: + +1. The number of pages concurrently used in queries do not fit the in the above size. This can happen when multiple queries of unreasonably long time-frames run on lower, higher resolution, tiers. The Netdata query planner attempts to avoid such situations by gradually loading pages, but still under extreme conditions the system may use more memory to satisfy these queries. + +2. The disks that host Netdata files are extremely slow for the workload required by the database so that data cannot be flushed to disk quickly to free memory. Netdata will automatically spawn more flushing workers in an attempt to parallelize and speed up flushing, but still if the disks cannot write the data quickly enough, they will remain in memory until they are written to disk. + +### Caches + +DBENGINE stores metric data to disk. To achieve high performance even under severe stress, it uses several layers of caches. + +#### Main Cache + +Stores page data. It is the primary storage of hot and dirty pages (before they are saved to disk), and its clean queue is the LRU cache for speeding up queries. + +The entire DBENGINE is designed to use the hot queue size (the currently collected metrics) as the key for sizing all its memory consumption. We call this feature **memory ballooning**. More collected metrics, bigger main cache and vice versa. + +In the equation: - # Tier 2, per hour data for a year - dbengine tier 2 multihost disk space MB = 67 ``` +memory in KiB = METRICS x (TIERS - 1) x 4KiB x 2 + 32768 KiB +``` + +the part `METRICS x (TIERS - 1) x 4KiB` is an estimate for the max hot size of the main cache. Tier 0 pages are 4KiB, but tier 1 pages are 2 KiB and tier 2 pages are 384 bytes. So a single metric in 3 tiers uses 4096 + 2048 + 384 = 6528 bytes. The equation estimates 8192 per metric, which includes cache internal structures and leaves some spare. + +Then `x 2` is the worst case estimate for the dirty queue. 
If all collected metrics (hot) become available for saving at once, to avoid stopping data collection all their pages will become dirty and new hot pages will be created instantly. To save memory, when Netdata starts, DBENGINE allocates randomly smaller pages for metrics, to spread their completion evenly across time. + +The memory we saved with the above is used to improve the LRU cache. So, although we reserved 32MiB for the LRU, in bigger setups (Netdata Parents) the LRU grows a lot more, within the limits of the equation. + +In practice, the main cache sizes itself with `hot x 1.5` instead of `host x 2`. The reason is that 5% of main cache is reserved for expanding open cache, 5% for expanding extent cache and we need room for the extensive buffers that are allocated in these setups. When the main cache exceeds `hot x 1.5` it enters a mode of critical evictions, and aggresively frees pages from the LRU to maintain a healthy memory footprint within its design limits. + +#### Open Cache -For 2000 metrics, collected every second and retained for a week, Tier 0 needs: 1 byte x 2000 metrics x 3600 secs per -hour x 24 hours per day x 7 days per week = 1100MB. +Stores metadata about on disk pages. Not the data itself. Only metadata about the location of the data on disk. -By setting `dbengine multihost disk space MB` to `1100`, this node will start maintaining about a week of data. But pay -attention to the number of metrics. If you have more than 2000 metrics on a node, or you need more that a week of high -resolution metrics, you may need to adjust this setting accordingly. +Its primary use is to index information about the open datafile, the one that still accepts new pages. Once that datafile becomes full, all the hot pages of the open cache are indexed in journal v2 files. -Tier 1 is by default sampling the data every **60 points of Tier 0**. In our case, Tier 0 is per second, if we want to -transform this information in terms of time then the Tier 1 "resolution" is per minute. +The clean queue is an LRU for reducing the journal v2 scans during quering. -Tier 1 needs four times more storage per point compared to Tier 0. So, for 2000 metrics, with per minute resolution, -retained for a month, Tier 1 needs: 4 bytes x 2000 metrics x 60 minutes per hour x 24 hours per day x 30 days per month -= 330MB. +Open cache uses memory ballooning too, like the main cache, based on its own hot pages. Open cache hot size is mainly controlled by the size of the open datafile. This is why on netdata versions with journal files v2, we decreased the maximum datafile size from 1GB to 512MB and we increased the target number of datafiles from 20 to 50. -Tier 2 is by default sampling data every 3600 points of Tier 0 (60 of Tier 1, which is the previous exact Tier). Again -in term of "time" (Tier 0 is per second), then Tier 2 is per hour. +On bigger setups open cache will get a bigger LRU by automatically sizing it (the whole open cache) to 5% to the size of (the whole) main cache. -The storage requirements are the same to Tier 1. +#### Extent Cache + +Caches compressed **extent** data, to avoid reading too repeatedly the same data from disks. + + +### Shared Memory + +Journal v2 indexes are mapped into memory. Netdata attempts to minimize shared memory use by instructing the kernel about the use of these files, or even unmounting them when they are not needed. + +The time-ranges of the queries running control the amount of shared memory required. 
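Tying the sections above together, a small sketch of the concurrent-collection sizing equation; the function name is hypothetical and the result is only an estimate, since the real caches auto-size and balloon at runtime.

```c
#include <stddef.h>

// Sketch of the sizing equation above (assumes tiers >= 1):
//   memory in KiB = METRICS x (TIERS - 1) x 4 KiB x 2 + 32768 KiB
static size_t dbengine_memory_estimate_kib(size_t metrics, size_t tiers) {
    return metrics * (tiers - 1) * 4 * 2 + 32768;
}

// Examples from the text:
//     2000 metrics, 3 tiers ->   2000 * 2 * 8 + 32768 =   64768 KiB (~64 MiB)
//   100000 metrics, 3 tiers -> 100000 * 2 * 8 + 32768 = 1632768 KiB (~1.6 GiB)
```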
+ +## Metrics Registry + +DBENGINE uses 150 bytes of memory for every metric for which retention is maintained but is not currently being collected. + +--- + +--- OLD DOCS BELOW THIS POINT --- + +--- -For 2000 metrics, with per hour resolution, retained for a year, Tier 2 needs: 4 bytes x 2000 metrics x 24 hours per day -x 365 days per year = 67MB. ## Legacy configuration ### v1.35.1 and prior -These versions of the Agent do not support [Tiering](#Tiering). You could change the metric retention for the parent and +These versions of the Agent do not support [Tiers](#Tiers). You could change the metric retention for the parent and all of its children only with the `dbengine multihost disk space MB` setting. This setting accounts the space allocation for the parent node and all of its children. @@ -105,15 +288,9 @@ the `[db]` section of your `netdata.conf`. ### v1.23.2 and prior -_For Netdata Agents earlier than v1.23.2_, the Agent on the parent node uses one dbengine instance for itself, and -another instance for every child node it receives metrics from. If you had four streaming nodes, you would have five -instances in total (`1 parent + 4 child nodes = 5 instances`). +_For Netdata Agents earlier than v1.23.2_, the Agent on the parent node uses one dbengine instance for itself, and another instance for every child node it receives metrics from. If you had four streaming nodes, you would have five instances in total (`1 parent + 4 child nodes = 5 instances`). -The Agent allocates resources for each instance separately using the `dbengine disk space MB` (**deprecated**) setting. -If -`dbengine disk space MB`(**deprecated**) is set to the default `256`, each instance is given 256 MiB in disk space, -which means the total disk space required to store all instances is, -roughly, `256 MiB * 1 parent * 4 child nodes = 1280 MiB`. +The Agent allocates resources for each instance separately using the `dbengine disk space MB` (**deprecated**) setting. If `dbengine disk space MB`(**deprecated**) is set to the default `256`, each instance is given 256 MiB in disk space, which means the total disk space required to store all instances is, roughly, `256 MiB * 1 parent * 4 child nodes = 1280 MiB`. #### Backward compatibility @@ -128,7 +305,7 @@ Agent. ##### Information For more information about setting `[db].mode` on your nodes, in addition to other streaming configurations, see -[streaming](/streaming/README.md). +[streaming](https://github.com/netdata/netdata/blob/master/streaming/README.md). ## Requirements & limitations @@ -154,7 +331,7 @@ An important observation is that RAM usage depends on both the `page cache size` options. You can use -our [database engine calculator](/docs/store/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) +our [database engine calculator](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) to validate the memory requirements for your particular system(s) and configuration (**out-of-date**). ### Disk space @@ -208,7 +385,7 @@ You can apply the settings by running `sysctl -p` or by rebooting. ## Files -With the DB engine mode the metric data are stored in database files. These files are organized in pairs, the datafiles +With the DB engine mode the metric data are stored in database files. 
These files are organized in pairs, the datafiles and their corresponding journalfiles, e.g.: ```sh @@ -226,7 +403,7 @@ location is `/var/cache/netdata/dbengine/*`). The higher numbered filenames cont can safely delete some pairs of files when Netdata is stopped to manually free up some space. _Users should_ **back up** _their `./dbengine` folders if they consider this data to be important._ You can also set up -one or more [exporting connectors](/exporting/README.md) to send your Netdata metrics to other databases for long-term +one or more [exporting connectors](https://github.com/netdata/netdata/blob/master/exporting/README.md) to send your Netdata metrics to other databases for long-term storage at lower granularity. ## Operation @@ -298,5 +475,3 @@ An interesting observation to make is that the CPU-bound run (16 GiB page cache) and generate a read load of 1.7M/sec, whereas in the CPU-bound scenario the read load is 70 times higher at 118M/sec. Consequently, there is a significant degree of interference by the reader threads, that slow down the writer threads. This is also possible because the interference effects are greater than the SSD impact on data generation throughput. - - diff --git a/database/engine/cache.c b/database/engine/cache.c new file mode 100644 index 000000000..4091684b2 --- /dev/null +++ b/database/engine/cache.c @@ -0,0 +1,2737 @@ +#include "cache.h" + +/* STATES AND TRANSITIONS + * + * entry | entry + * v v + * HOT -> DIRTY --> CLEAN --> EVICT + * v | v + * flush | evict + * v | v + * save | free + * callback | callback + * + */ + +typedef int32_t REFCOUNT; +#define REFCOUNT_DELETING (-100) + +// to use ARAL uncomment the following line: +#define PGC_WITH_ARAL 1 + +typedef enum __attribute__ ((__packed__)) { + // mutually exclusive flags + PGC_PAGE_CLEAN = (1 << 0), // none of the following + PGC_PAGE_DIRTY = (1 << 1), // contains unsaved data + PGC_PAGE_HOT = (1 << 2), // currently being collected + + // flags related to various actions on each page + PGC_PAGE_IS_BEING_DELETED = (1 << 3), + PGC_PAGE_IS_BEING_MIGRATED_TO_V2 = (1 << 4), + PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES = (1 << 5), + PGC_PAGE_HAS_BEEN_ACCESSED = (1 << 6), +} PGC_PAGE_FLAGS; + +#define page_flag_check(page, flag) (__atomic_load_n(&((page)->flags), __ATOMIC_ACQUIRE) & (flag)) +#define page_flag_set(page, flag) __atomic_or_fetch(&((page)->flags), flag, __ATOMIC_RELEASE) +#define page_flag_clear(page, flag) __atomic_and_fetch(&((page)->flags), ~(flag), __ATOMIC_RELEASE) + +#define page_get_status_flags(page) page_flag_check(page, PGC_PAGE_HOT | PGC_PAGE_DIRTY | PGC_PAGE_CLEAN) +#define is_page_hot(page) (page_get_status_flags(page) == PGC_PAGE_HOT) +#define is_page_dirty(page) (page_get_status_flags(page) == PGC_PAGE_DIRTY) +#define is_page_clean(page) (page_get_status_flags(page) == PGC_PAGE_CLEAN) + +struct pgc_page { + // indexing data + Word_t section; + Word_t metric_id; + time_t start_time_s; + time_t end_time_s; + uint32_t update_every_s; + uint32_t assumed_size; + + REFCOUNT refcount; + uint16_t accesses; // counts the number of accesses on this page + PGC_PAGE_FLAGS flags; + SPINLOCK transition_spinlock; // when the page changes between HOT, DIRTY, CLEAN, we have to get this lock + + struct { + struct pgc_page *next; + struct pgc_page *prev; + } link; + + void *data; + uint8_t custom_data[]; + + // IMPORTANT! + // THIS STRUCTURE NEEDS TO BE INITIALIZED BY HAND! 
+}; + +struct pgc_linked_list { + SPINLOCK spinlock; + union { + PGC_PAGE *base; + Pvoid_t sections_judy; + }; + PGC_PAGE_FLAGS flags; + size_t version; + size_t last_version_checked; + bool linked_list_in_sections_judy; // when true, we use 'sections_judy', otherwise we use 'base' + struct pgc_queue_statistics *stats; +}; + +struct pgc { + struct { + char name[PGC_NAME_MAX + 1]; + + size_t partitions; + size_t clean_size; + size_t max_dirty_pages_per_call; + size_t max_pages_per_inline_eviction; + size_t max_skip_pages_per_inline_eviction; + size_t max_flushes_inline; + size_t max_workers_evict_inline; + size_t additional_bytes_per_page; + free_clean_page_callback pgc_free_clean_cb; + save_dirty_page_callback pgc_save_dirty_cb; + save_dirty_init_callback pgc_save_init_cb; + PGC_OPTIONS options; + + size_t severe_pressure_per1000; + size_t aggressive_evict_per1000; + size_t healthy_size_per1000; + size_t evict_low_threshold_per1000; + + dynamic_target_cache_size_callback dynamic_target_size_cb; + } config; + +#ifdef PGC_WITH_ARAL + ARAL **aral; +#endif + + PGC_CACHE_LINE_PADDING(0); + + struct pgc_index { + netdata_rwlock_t rwlock; + Pvoid_t sections_judy; + } *index; + + PGC_CACHE_LINE_PADDING(1); + + struct { + SPINLOCK spinlock; + size_t per1000; + } usage; + + PGC_CACHE_LINE_PADDING(2); + + struct pgc_linked_list clean; // LRU is applied here to free memory from the cache + + PGC_CACHE_LINE_PADDING(3); + + struct pgc_linked_list dirty; // in the dirty list, pages are ordered the way they were marked dirty + + PGC_CACHE_LINE_PADDING(4); + + struct pgc_linked_list hot; // in the hot list, pages are order the way they were marked hot + + PGC_CACHE_LINE_PADDING(5); + + struct pgc_statistics stats; // statistics + +#ifdef NETDATA_PGC_POINTER_CHECK + PGC_CACHE_LINE_PADDING(6); + netdata_mutex_t global_pointer_registry_mutex; + Pvoid_t global_pointer_registry; +#endif +}; + + + +// ---------------------------------------------------------------------------- +// validate each pointer is indexed once - internal checks only + +static inline void pointer_index_init(PGC *cache __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_init(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +static inline void pointer_destroy_index(PGC *cache __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + JudyHSFreeArray(&cache->global_pointer_registry, PJE0); + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} +static inline void pointer_add(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + Pvoid_t *PValue = JudyHSIns(&cache->global_pointer_registry, &page, sizeof(void *), PJE0); + if(*PValue != NULL) + fatal("pointer already exists in registry"); + *PValue = page; + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +static inline void pointer_check(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + Pvoid_t *PValue = JudyHSGet(cache->global_pointer_registry, &page, sizeof(void *)); + if(PValue == NULL) + fatal("pointer is not found in registry"); + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +static inline void pointer_del(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) { +#ifdef 
NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + int ret = JudyHSDel(&cache->global_pointer_registry, &page, sizeof(void *), PJE0); + if(!ret) + fatal("pointer to be deleted does not exist in registry"); + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +// ---------------------------------------------------------------------------- +// locking + +static inline size_t pgc_indexing_partition(PGC *cache, Word_t metric_id) { + static __thread Word_t last_metric_id = 0; + static __thread size_t last_partition = 0; + + if(metric_id == last_metric_id || cache->config.partitions == 1) + return last_partition; + + last_metric_id = metric_id; + last_partition = indexing_partition(metric_id, cache->config.partitions); + + return last_partition; +} + +static inline void pgc_index_read_lock(PGC *cache, size_t partition) { + netdata_rwlock_rdlock(&cache->index[partition].rwlock); +} +static inline void pgc_index_read_unlock(PGC *cache, size_t partition) { + netdata_rwlock_unlock(&cache->index[partition].rwlock); +} +//static inline bool pgc_index_write_trylock(PGC *cache, size_t partition) { +// return !netdata_rwlock_trywrlock(&cache->index[partition].rwlock); +//} +static inline void pgc_index_write_lock(PGC *cache, size_t partition) { + netdata_rwlock_wrlock(&cache->index[partition].rwlock); +} +static inline void pgc_index_write_unlock(PGC *cache, size_t partition) { + netdata_rwlock_unlock(&cache->index[partition].rwlock); +} + +static inline bool pgc_ll_trylock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { + return netdata_spinlock_trylock(&ll->spinlock); +} + +static inline void pgc_ll_lock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { + netdata_spinlock_lock(&ll->spinlock); +} + +static inline void pgc_ll_unlock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { + netdata_spinlock_unlock(&ll->spinlock); +} + +static inline bool page_transition_trylock(PGC *cache __maybe_unused, PGC_PAGE *page) { + return netdata_spinlock_trylock(&page->transition_spinlock); +} + +static inline void page_transition_lock(PGC *cache __maybe_unused, PGC_PAGE *page) { + netdata_spinlock_lock(&page->transition_spinlock); +} + +static inline void page_transition_unlock(PGC *cache __maybe_unused, PGC_PAGE *page) { + netdata_spinlock_unlock(&page->transition_spinlock); +} + +// ---------------------------------------------------------------------------- +// evictions control + +static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { + + if(size_to_evict) + netdata_spinlock_lock(&cache->usage.spinlock); + + else if(!netdata_spinlock_trylock(&cache->usage.spinlock)) + return __atomic_load_n(&cache->usage.per1000, __ATOMIC_RELAXED); + + size_t current_cache_size; + size_t wanted_cache_size; + size_t per1000; + + size_t dirty = __atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED); + size_t hot = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED); + + if(cache->config.options & PGC_OPTIONS_AUTOSCALE) { + size_t dirty_max = __atomic_load_n(&cache->dirty.stats->max_size, __ATOMIC_RELAXED); + size_t hot_max = __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED); + + // our promise to users + size_t max_size1 = MAX(hot_max, hot) * 2; + + // protection against slow flushing + size_t max_size2 = hot_max + ((dirty_max < hot_max / 2) ? 
hot_max / 2 : dirty_max * 2); + + // the final wanted cache size + wanted_cache_size = MIN(max_size1, max_size2); + + if(cache->config.dynamic_target_size_cb) { + size_t wanted_cache_size_cb = cache->config.dynamic_target_size_cb(); + if(wanted_cache_size_cb > wanted_cache_size) + wanted_cache_size = wanted_cache_size_cb; + } + + if (wanted_cache_size < hot + dirty + cache->config.clean_size) + wanted_cache_size = hot + dirty + cache->config.clean_size; + } + else + wanted_cache_size = hot + dirty + cache->config.clean_size; + + // protection again huge queries + // if huge queries are running, or huge amounts need to be saved + // allow the cache to grow more (hot pages in main cache are also referenced) + size_t referenced_size = __atomic_load_n(&cache->stats.referenced_size, __ATOMIC_RELAXED); + if(unlikely(wanted_cache_size < referenced_size * 2 / 3)) + wanted_cache_size = referenced_size * 2 / 3; + + current_cache_size = __atomic_load_n(&cache->stats.size, __ATOMIC_RELAXED); // + pgc_aral_overhead(); + + per1000 = (size_t)((unsigned long long)current_cache_size * 1000ULL / (unsigned long long)wanted_cache_size); + + __atomic_store_n(&cache->usage.per1000, per1000, __ATOMIC_RELAXED); + __atomic_store_n(&cache->stats.wanted_cache_size, wanted_cache_size, __ATOMIC_RELAXED); + __atomic_store_n(&cache->stats.current_cache_size, current_cache_size, __ATOMIC_RELAXED); + + netdata_spinlock_unlock(&cache->usage.spinlock); + + if(size_to_evict) { + size_t target = (size_t)((unsigned long long)wanted_cache_size * (unsigned long long)cache->config.evict_low_threshold_per1000 / 1000ULL); + if(current_cache_size > target) + *size_to_evict = current_cache_size - target; + else + *size_to_evict = 0; + } + + if(per1000 >= cache->config.severe_pressure_per1000) + __atomic_add_fetch(&cache->stats.events_cache_under_severe_pressure, 1, __ATOMIC_RELAXED); + + else if(per1000 >= cache->config.aggressive_evict_per1000) + __atomic_add_fetch(&cache->stats.events_cache_needs_space_aggressively, 1, __ATOMIC_RELAXED); + + return per1000; +} + +static inline bool cache_pressure(PGC *cache, size_t limit) { + return (cache_usage_per1000(cache, NULL) >= limit); +} + +#define cache_under_severe_pressure(cache) cache_pressure(cache, (cache)->config.severe_pressure_per1000) +#define cache_needs_space_aggressively(cache) cache_pressure(cache, (cache)->config.aggressive_evict_per1000) +#define cache_above_healthy_limit(cache) cache_pressure(cache, (cache)->config.healthy_size_per1000) + +typedef bool (*evict_filter)(PGC_PAGE *page, void *data); +static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evict, bool wait, bool all_of_them, evict_filter filter, void *data); +#define evict_pages(cache, max_skip, max_evict, wait, all_of_them) evict_pages_with_filter(cache, max_skip, max_evict, wait, all_of_them, NULL, NULL) + +static inline void evict_on_clean_page_added(PGC *cache __maybe_unused) { + if((cache->config.options & PGC_OPTIONS_EVICT_PAGES_INLINE) || cache_needs_space_aggressively(cache)) { + evict_pages(cache, + cache->config.max_skip_pages_per_inline_eviction, + cache->config.max_pages_per_inline_eviction, + false, false); + } +} + +static inline void evict_on_page_release_when_permitted(PGC *cache __maybe_unused) { + if ((cache->config.options & PGC_OPTIONS_EVICT_PAGES_INLINE) || cache_under_severe_pressure(cache)) { + evict_pages(cache, + cache->config.max_skip_pages_per_inline_eviction, + cache->config.max_pages_per_inline_eviction, + false, false); + } +} + +// 
---------------------------------------------------------------------------- +// flushing control + +static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, bool wait, bool all_of_them); + +static inline bool flushing_critical(PGC *cache) { + if(unlikely(__atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED) > __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED))) { + __atomic_add_fetch(&cache->stats.events_flush_critical, 1, __ATOMIC_RELAXED); + return true; + } + + return false; +} + +// ---------------------------------------------------------------------------- +// helpers + +static inline size_t page_assumed_size(PGC *cache, size_t size) { + return size + (sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page + sizeof(Word_t) * 3); +} + +static inline size_t page_size_from_assumed_size(PGC *cache, size_t assumed_size) { + return assumed_size - (sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page + sizeof(Word_t) * 3); +} + +// ---------------------------------------------------------------------------- +// Linked list management + +static inline void atomic_set_max(size_t *max, size_t desired) { + size_t expected; + + expected = __atomic_load_n(max, __ATOMIC_RELAXED); + + do { + + if(expected >= desired) + return; + + } while(!__atomic_compare_exchange_n(max, &expected, desired, + false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +} + +struct section_pages { + SPINLOCK migration_to_v2_spinlock; + size_t entries; + size_t size; + PGC_PAGE *base; +}; + +static ARAL *pgc_section_pages_aral = NULL; +static void pgc_section_pages_static_aral_init(void) { + static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + + if(unlikely(!pgc_section_pages_aral)) { + netdata_spinlock_lock(&spinlock); + + // we have to check again + if(!pgc_section_pages_aral) + pgc_section_pages_aral = aral_create( + "pgc_section", + sizeof(struct section_pages), + 0, + 65536, NULL, + NULL, NULL, false, false); + + netdata_spinlock_unlock(&spinlock); + } +} + +static inline void pgc_stats_ll_judy_change(PGC *cache, struct pgc_linked_list *ll, size_t mem_before_judyl, size_t mem_after_judyl) { + if(mem_after_judyl > mem_before_judyl) { + __atomic_add_fetch(&ll->stats->size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + } + else if(mem_after_judyl < mem_before_judyl) { + __atomic_sub_fetch(&ll->stats->size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + } +} + +static inline void pgc_stats_index_judy_change(PGC *cache, size_t mem_before_judyl, size_t mem_after_judyl) { + if(mem_after_judyl > mem_before_judyl) { + __atomic_add_fetch(&cache->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + } + else if(mem_after_judyl < mem_before_judyl) { + __atomic_sub_fetch(&cache->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + } +} + +static void pgc_ll_add(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PGC_PAGE *page, bool having_lock) { + if(!having_lock) + pgc_ll_lock(cache, ll); + + internal_fatal(page_get_status_flags(page) != 0, + "DBENGINE CACHE: invalid page flags, the page has %d, but it is should be %d", + page_get_status_flags(page), + 0); + + if(ll->linked_list_in_sections_judy) { + size_t mem_before_judyl, mem_after_judyl; + + mem_before_judyl = JudyLMemUsed(ll->sections_judy); + Pvoid_t *section_pages_pptr = 
JudyLIns(&ll->sections_judy, page->section, PJE0); + mem_after_judyl = JudyLMemUsed(ll->sections_judy); + + struct section_pages *sp = *section_pages_pptr; + if(!sp) { + // sp = callocz(1, sizeof(struct section_pages)); + sp = aral_mallocz(pgc_section_pages_aral); + memset(sp, 0, sizeof(struct section_pages)); + + *section_pages_pptr = sp; + + mem_after_judyl += sizeof(struct section_pages); + } + pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl); + + sp->entries++; + sp->size += page->assumed_size; + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(sp->base, page, link.prev, link.next); + + if((sp->entries % cache->config.max_dirty_pages_per_call) == 0) + ll->version++; + } + else { + // CLEAN pages end up here. + // - New pages created as CLEAN, always have 1 access. + // - DIRTY pages made CLEAN, depending on their accesses may be appended (accesses > 0) or prepended (accesses = 0). + + if(page->accesses || page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED) { + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next); + page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); + } + else + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next); + + ll->version++; + } + + page_flag_set(page, ll->flags); + + if(!having_lock) + pgc_ll_unlock(cache, ll); + + size_t entries = __atomic_add_fetch(&ll->stats->entries, 1, __ATOMIC_RELAXED); + size_t size = __atomic_add_fetch(&ll->stats->size, page->assumed_size, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->added_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->added_size, page->assumed_size, __ATOMIC_RELAXED); + + atomic_set_max(&ll->stats->max_entries, entries); + atomic_set_max(&ll->stats->max_size, size); +} + +static void pgc_ll_del(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PGC_PAGE *page, bool having_lock) { + __atomic_sub_fetch(&ll->stats->entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ll->stats->size, page->assumed_size, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->removed_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->removed_size, page->assumed_size, __ATOMIC_RELAXED); + + if(!having_lock) + pgc_ll_lock(cache, ll); + + internal_fatal(page_get_status_flags(page) != ll->flags, + "DBENGINE CACHE: invalid page flags, the page has %d, but it is should be %d", + page_get_status_flags(page), + ll->flags); + + page_flag_clear(page, ll->flags); + + if(ll->linked_list_in_sections_judy) { + Pvoid_t *section_pages_pptr = JudyLGet(ll->sections_judy, page->section, PJE0); + internal_fatal(!section_pages_pptr, "DBENGINE CACHE: page should be in Judy LL, but it is not"); + + struct section_pages *sp = *section_pages_pptr; + sp->entries--; + sp->size -= page->assumed_size; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(sp->base, page, link.prev, link.next); + + if(!sp->base) { + size_t mem_before_judyl, mem_after_judyl; + + mem_before_judyl = JudyLMemUsed(ll->sections_judy); + int rc = JudyLDel(&ll->sections_judy, page->section, PJE0); + mem_after_judyl = JudyLMemUsed(ll->sections_judy); + + if(!rc) + fatal("DBENGINE CACHE: cannot delete section from Judy LL"); + + // freez(sp); + aral_freez(pgc_section_pages_aral, sp); + mem_after_judyl -= sizeof(struct section_pages); + pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl); + } + } + else { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ll->base, page, link.prev, link.next); + ll->version++; + } + + if(!having_lock) + 
pgc_ll_unlock(cache, ll); +} + +static inline void page_has_been_accessed(PGC *cache, PGC_PAGE *page) { + PGC_PAGE_FLAGS flags = page_flag_check(page, PGC_PAGE_CLEAN | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES); + + if (!(flags & PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES)) { + __atomic_add_fetch(&page->accesses, 1, __ATOMIC_RELAXED); + + if (flags & PGC_PAGE_CLEAN) { + if(pgc_ll_trylock(cache, &cache->clean)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + pgc_ll_unlock(cache, &cache->clean); + page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); + } + else + page_flag_set(page, PGC_PAGE_HAS_BEEN_ACCESSED); + } + } +} + + +// ---------------------------------------------------------------------------- +// state transitions + +static inline void page_set_clean(PGC *cache, PGC_PAGE *page, bool having_transition_lock, bool having_clean_lock) { + if(!having_transition_lock) + page_transition_lock(cache, page); + + PGC_PAGE_FLAGS flags = page_get_status_flags(page); + + if(flags & PGC_PAGE_CLEAN) { + if(!having_transition_lock) + page_transition_unlock(cache, page); + return; + } + + if(flags & PGC_PAGE_HOT) + pgc_ll_del(cache, &cache->hot, page, false); + + if(flags & PGC_PAGE_DIRTY) + pgc_ll_del(cache, &cache->dirty, page, false); + + // first add to linked list, the set the flag (required for move_page_last()) + pgc_ll_add(cache, &cache->clean, page, having_clean_lock); + + if(!having_transition_lock) + page_transition_unlock(cache, page); +} + +static inline void page_set_dirty(PGC *cache, PGC_PAGE *page, bool having_hot_lock) { + if(!having_hot_lock) + // to avoid deadlocks, we have to get the hot lock before the page transition + // since this is what all_hot_to_dirty() does + pgc_ll_lock(cache, &cache->hot); + + page_transition_lock(cache, page); + + PGC_PAGE_FLAGS flags = page_get_status_flags(page); + + if(flags & PGC_PAGE_DIRTY) { + page_transition_unlock(cache, page); + + if(!having_hot_lock) + // we don't need the hot lock anymore + pgc_ll_unlock(cache, &cache->hot); + + return; + } + + __atomic_add_fetch(&cache->stats.hot2dirty_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.hot2dirty_size, page->assumed_size, __ATOMIC_RELAXED); + + if(likely(flags & PGC_PAGE_HOT)) + pgc_ll_del(cache, &cache->hot, page, true); + + if(!having_hot_lock) + // we don't need the hot lock anymore + pgc_ll_unlock(cache, &cache->hot); + + if(unlikely(flags & PGC_PAGE_CLEAN)) + pgc_ll_del(cache, &cache->clean, page, false); + + // first add to linked list, the set the flag (required for move_page_last()) + pgc_ll_add(cache, &cache->dirty, page, false); + + __atomic_sub_fetch(&cache->stats.hot2dirty_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.hot2dirty_size, page->assumed_size, __ATOMIC_RELAXED); + + page_transition_unlock(cache, page); +} + +static inline void page_set_hot(PGC *cache, PGC_PAGE *page) { + page_transition_lock(cache, page); + + PGC_PAGE_FLAGS flags = page_get_status_flags(page); + + if(flags & PGC_PAGE_HOT) { + page_transition_unlock(cache, page); + return; + } + + if(flags & PGC_PAGE_DIRTY) + pgc_ll_del(cache, &cache->dirty, page, false); + + if(flags & PGC_PAGE_CLEAN) + pgc_ll_del(cache, &cache->clean, page, false); + + // first add to linked list, the set the flag (required for move_page_last()) + pgc_ll_add(cache, &cache->hot, page, false); + + page_transition_unlock(cache, page); +} + + +// 
---------------------------------------------------------------------------- +// Referencing + +static inline size_t PGC_REFERENCED_PAGES(PGC *cache) { + return __atomic_load_n(&cache->stats.referenced_entries, __ATOMIC_RELAXED); +} + +static inline void PGC_REFERENCED_PAGES_PLUS1(PGC *cache, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.referenced_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.referenced_size, page->assumed_size, __ATOMIC_RELAXED); +} + +static inline void PGC_REFERENCED_PAGES_MINUS1(PGC *cache, size_t assumed_size) { + __atomic_sub_fetch(&cache->stats.referenced_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.referenced_size, assumed_size, __ATOMIC_RELAXED); +} + +// If the page is not already acquired, +// YOU HAVE TO HAVE THE QUEUE (hot, dirty, clean) THE PAGE IS IN, L O C K E D ! +// If you don't have it locked, NOTHING PREVENTS THIS PAGE FOR VANISHING WHILE THIS IS CALLED! +static inline bool page_acquire(PGC *cache, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.acquires, 1, __ATOMIC_RELAXED); + + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + size_t spins = 0; + + do { + spins++; + + if(unlikely(expected < 0)) + return false; + + desired = expected + 1; + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)); + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.acquire_spins, spins - 1, __ATOMIC_RELAXED); + + if(desired == 1) + PGC_REFERENCED_PAGES_PLUS1(cache, page); + + return true; +} + +static inline void page_release(PGC *cache, PGC_PAGE *page, bool evict_if_necessary) { + __atomic_add_fetch(&cache->stats.releases, 1, __ATOMIC_RELAXED); + + size_t assumed_size = page->assumed_size; // take the size before we release it + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + + size_t spins = 0; + do { + spins++; + + internal_fatal(expected <= 0, + "DBENGINE CACHE: trying to release a page with reference counter %d", expected); + + desired = expected - 1; + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)); + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.release_spins, spins - 1, __ATOMIC_RELAXED); + + if(desired == 0) { + PGC_REFERENCED_PAGES_MINUS1(cache, assumed_size); + + if(evict_if_necessary) + evict_on_page_release_when_permitted(cache); + } +} + +static inline bool non_acquired_page_get_for_deletion___while_having_clean_locked(PGC *cache __maybe_unused, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.acquires_for_deletion, 1, __ATOMIC_RELAXED); + + internal_fatal(!is_page_clean(page), + "DBENGINE CACHE: only clean pages can be deleted"); + + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + size_t spins = 0; + bool delete_it; + + do { + spins++; + + if (expected == 0) { + desired = REFCOUNT_DELETING; + delete_it = true; + } + else { + delete_it = false; + break; + } + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)); + + if(delete_it) { + // we can delete this page + internal_fatal(page_flag_check(page, PGC_PAGE_IS_BEING_DELETED), + "DBENGINE CACHE: page is already being deleted"); + + page_flag_set(page, PGC_PAGE_IS_BEING_DELETED); + } + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.delete_spins, spins - 1, 
__ATOMIC_RELAXED); + + return delete_it; +} + +static inline bool acquired_page_get_for_deletion_or_release_it(PGC *cache __maybe_unused, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.acquires_for_deletion, 1, __ATOMIC_RELAXED); + + size_t assumed_size = page->assumed_size; // take the size before we release it + + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + size_t spins = 0; + bool delete_it; + + do { + spins++; + + internal_fatal(expected < 1, + "DBENGINE CACHE: page to be deleted should be acquired by the caller."); + + if (expected == 1) { + // we are the only one having this page referenced + desired = REFCOUNT_DELETING; + delete_it = true; + } + else { + // this page cannot be deleted + desired = expected - 1; + delete_it = false; + } + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)); + + if(delete_it) { + PGC_REFERENCED_PAGES_MINUS1(cache, assumed_size); + + // we can delete this page + internal_fatal(page_flag_check(page, PGC_PAGE_IS_BEING_DELETED), + "DBENGINE CACHE: page is already being deleted"); + + page_flag_set(page, PGC_PAGE_IS_BEING_DELETED); + } + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.delete_spins, spins - 1, __ATOMIC_RELAXED); + + return delete_it; +} + + +// ---------------------------------------------------------------------------- +// Indexing + +static inline void free_this_page(PGC *cache, PGC_PAGE *page, size_t partition __maybe_unused) { + // call the callback to free the user supplied memory + cache->config.pgc_free_clean_cb(cache, (PGC_ENTRY){ + .section = page->section, + .metric_id = page->metric_id, + .start_time_s = page->start_time_s, + .end_time_s = __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), + .update_every_s = page->update_every_s, + .size = page_size_from_assumed_size(cache, page->assumed_size), + .hot = (is_page_hot(page)) ? true : false, + .data = page->data, + .custom_data = (cache->config.additional_bytes_per_page) ? 
page->custom_data : NULL, + }); + + // update statistics + __atomic_add_fetch(&cache->stats.removed_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.removed_size, page->assumed_size, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&cache->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.size, page->assumed_size, __ATOMIC_RELAXED); + + // free our memory +#ifdef PGC_WITH_ARAL + aral_freez(cache->aral[partition], page); +#else + freez(page); +#endif +} + +static void remove_this_page_from_index_unsafe(PGC *cache, PGC_PAGE *page, size_t partition) { + // remove it from the Judy arrays + + pointer_check(cache, page); + + internal_fatal(page_flag_check(page, PGC_PAGE_HOT | PGC_PAGE_DIRTY | PGC_PAGE_CLEAN), + "DBENGINE CACHE: page to be removed from the cache is still in the linked-list"); + + internal_fatal(!page_flag_check(page, PGC_PAGE_IS_BEING_DELETED), + "DBENGINE CACHE: page to be removed from the index, is not marked for deletion"); + + internal_fatal(partition != pgc_indexing_partition(cache, page->metric_id), + "DBENGINE CACHE: attempted to remove this page from the wrong partition of the cache"); + + Pvoid_t *metrics_judy_pptr = JudyLGet(cache->index[partition].sections_judy, page->section, PJE0); + if(unlikely(!metrics_judy_pptr)) + fatal("DBENGINE CACHE: section '%lu' should exist, but it does not.", page->section); + + Pvoid_t *pages_judy_pptr = JudyLGet(*metrics_judy_pptr, page->metric_id, PJE0); + if(unlikely(!pages_judy_pptr)) + fatal("DBENGINE CACHE: metric '%lu' in section '%lu' should exist, but it does not.", + page->metric_id, page->section); + + Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, page->start_time_s, PJE0); + if(unlikely(!page_ptr)) + fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' should exist, but it does not.", + page->start_time_s, page->metric_id, page->section); + + PGC_PAGE *found_page = *page_ptr; + if(unlikely(found_page != page)) + fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' should exist, but the index returned a different address.", + page->start_time_s, page->metric_id, page->section); + + size_t mem_before_judyl = 0, mem_after_judyl = 0; + + mem_before_judyl += JudyLMemUsed(*pages_judy_pptr); + if(unlikely(!JudyLDel(pages_judy_pptr, page->start_time_s, PJE0))) + fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' exists, but cannot be deleted.", + page->start_time_s, page->metric_id, page->section); + mem_after_judyl += JudyLMemUsed(*pages_judy_pptr); + + mem_before_judyl += JudyLMemUsed(*metrics_judy_pptr); + if(!*pages_judy_pptr && !JudyLDel(metrics_judy_pptr, page->metric_id, PJE0)) + fatal("DBENGINE CACHE: metric '%lu' in section '%lu' exists and is empty, but cannot be deleted.", + page->metric_id, page->section); + mem_after_judyl += JudyLMemUsed(*metrics_judy_pptr); + + mem_before_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + if(!*metrics_judy_pptr && !JudyLDel(&cache->index[partition].sections_judy, page->section, PJE0)) + fatal("DBENGINE CACHE: section '%lu' exists and is empty, but cannot be deleted.", page->section); + mem_after_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + + pgc_stats_index_judy_change(cache, mem_before_judyl, mem_after_judyl); + + pointer_del(cache, page); +} + +static inline void remove_and_free_page_not_in_any_queue_and_acquired_for_deletion(PGC *cache, PGC_PAGE *page) { + size_t partition = pgc_indexing_partition(cache, page->metric_id); + 
pgc_index_write_lock(cache, partition); + remove_this_page_from_index_unsafe(cache, page, partition); + pgc_index_write_unlock(cache, partition); + free_this_page(cache, page, partition); +} + +static inline bool make_acquired_page_clean_and_evict_or_page_release(PGC *cache, PGC_PAGE *page) { + pointer_check(cache, page); + + page_transition_lock(cache, page); + pgc_ll_lock(cache, &cache->clean); + + // make it clean - it does not have any accesses, so it will be prepended + page_set_clean(cache, page, true, true); + + if(!acquired_page_get_for_deletion_or_release_it(cache, page)) { + pgc_ll_unlock(cache, &cache->clean); + page_transition_unlock(cache, page); + return false; + } + + // remove it from the linked list + pgc_ll_del(cache, &cache->clean, page, true); + pgc_ll_unlock(cache, &cache->clean); + page_transition_unlock(cache, page); + + remove_and_free_page_not_in_any_queue_and_acquired_for_deletion(cache, page); + + return true; +} + +// returns true, when there is more work to do +static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evict, bool wait, bool all_of_them, evict_filter filter, void *data) { + size_t per1000 = cache_usage_per1000(cache, NULL); + + if(!all_of_them && per1000 < cache->config.healthy_size_per1000) + // don't bother - not enough to do anything + return false; + + size_t workers_running = __atomic_add_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED); + if(!wait && !all_of_them && workers_running > cache->config.max_workers_evict_inline && per1000 < cache->config.severe_pressure_per1000) { + __atomic_sub_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED); + return false; + } + + internal_fatal(cache->clean.linked_list_in_sections_judy, + "wrong clean pages configuration - clean pages need to have a linked list, not a judy array"); + + if(unlikely(!max_skip)) + max_skip = SIZE_MAX; + else if(unlikely(max_skip < 2)) + max_skip = 2; + + if(unlikely(!max_evict)) + max_evict = SIZE_MAX; + else if(unlikely(max_evict < 2)) + max_evict = 2; + + size_t total_pages_evicted = 0; + size_t total_pages_skipped = 0; + bool stopped_before_finishing = false; + size_t spins = 0; + + do { + if(++spins > 1) + __atomic_add_fetch(&cache->stats.evict_spins, 1, __ATOMIC_RELAXED); + + bool batch; + size_t max_size_to_evict = 0; + if (unlikely(all_of_them)) { + max_size_to_evict = SIZE_MAX; + batch = true; + } + else if(unlikely(wait)) { + per1000 = cache_usage_per1000(cache, &max_size_to_evict); + batch = (wait && per1000 > cache->config.severe_pressure_per1000) ? true : false; + } + else { + batch = false; + max_size_to_evict = (cache_above_healthy_limit(cache)) ? 
1 : 0; + } + + if (!max_size_to_evict) + break; + + // check if we have to stop + if(total_pages_evicted >= max_evict && !all_of_them) { + stopped_before_finishing = true; + break; + } + + if(!all_of_them && !wait) { + if(!pgc_ll_trylock(cache, &cache->clean)) { + stopped_before_finishing = true; + goto premature_exit; + } + + // at this point we have the clean lock + } + else + pgc_ll_lock(cache, &cache->clean); + + // find a page to evict + PGC_PAGE *pages_to_evict = NULL; + size_t pages_to_evict_size = 0; + for(PGC_PAGE *page = cache->clean.base, *next = NULL, *first_page_we_relocated = NULL; page ; page = next) { + next = page->link.next; + + if(unlikely(page == first_page_we_relocated)) + // we did a complete loop on all pages + break; + + if(unlikely(page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); + continue; + } + + if(unlikely(filter && !filter(page, data))) + continue; + + if(non_acquired_page_get_for_deletion___while_having_clean_locked(cache, page)) { + // we can delete this page + + // remove it from the clean list + pgc_ll_del(cache, &cache->clean, page, true); + + __atomic_add_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.evicting_size, page->assumed_size, __ATOMIC_RELAXED); + + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next); + + pages_to_evict_size += page->assumed_size; + + if(unlikely(all_of_them || (batch && pages_to_evict_size < max_size_to_evict))) + // get more pages + ; + else + // one page at a time + break; + } + else { + // we can't delete this page + + if(!first_page_we_relocated) + first_page_we_relocated = page; + + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + + // check if we have to stop + if(unlikely(++total_pages_skipped >= max_skip && !all_of_them)) { + stopped_before_finishing = true; + break; + } + } + } + pgc_ll_unlock(cache, &cache->clean); + + if(likely(pages_to_evict)) { + // remove them from the index + + if(unlikely(pages_to_evict->link.next)) { + // we have many pages, let's minimize the index locks we are going to get + + PGC_PAGE *pages_per_partition[cache->config.partitions]; + memset(pages_per_partition, 0, sizeof(PGC_PAGE *) * cache->config.partitions); + + // sort them by partition + for (PGC_PAGE *page = pages_to_evict, *next = NULL; page; page = next) { + next = page->link.next; + + size_t partition = pgc_indexing_partition(cache, page->metric_id); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_per_partition[partition], page, link.prev, link.next); + } + + // remove them from the index + for (size_t partition = 0; partition < cache->config.partitions; partition++) { + if (!pages_per_partition[partition]) continue; + + pgc_index_write_lock(cache, partition); + + for (PGC_PAGE *page = pages_per_partition[partition]; page; page = page->link.next) + remove_this_page_from_index_unsafe(cache, page, partition); + + pgc_index_write_unlock(cache, partition); + } + + // free them + for (size_t partition = 0; partition < cache->config.partitions; partition++) 
{ + if (!pages_per_partition[partition]) continue; + + for (PGC_PAGE *page = pages_per_partition[partition], *next = NULL; page; page = next) { + next = page->link.next; + + size_t page_size = page->assumed_size; + free_this_page(cache, page, partition); + + __atomic_sub_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.evicting_size, page_size, __ATOMIC_RELAXED); + + total_pages_evicted++; + } + } + } + else { + // just one page to be evicted + PGC_PAGE *page = pages_to_evict; + + size_t page_size = page->assumed_size; + + size_t partition = pgc_indexing_partition(cache, page->metric_id); + pgc_index_write_lock(cache, partition); + remove_this_page_from_index_unsafe(cache, page, partition); + pgc_index_write_unlock(cache, partition); + free_this_page(cache, page, partition); + + __atomic_sub_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.evicting_size, page_size, __ATOMIC_RELAXED); + + total_pages_evicted++; + } + } + else + break; + + } while(all_of_them || (total_pages_evicted < max_evict && total_pages_skipped < max_skip)); + + if(all_of_them && !filter) { + pgc_ll_lock(cache, &cache->clean); + if(cache->clean.stats->entries) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE CACHE: cannot free all clean pages, %zu are still in the clean queue", + cache->clean.stats->entries); + } + pgc_ll_unlock(cache, &cache->clean); + } + +premature_exit: + if(unlikely(total_pages_skipped)) + __atomic_add_fetch(&cache->stats.evict_skipped, total_pages_skipped, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED); + + return stopped_before_finishing; +} + +static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { + __atomic_add_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED); + + size_t partition = pgc_indexing_partition(cache, entry->metric_id); + +#ifdef PGC_WITH_ARAL + PGC_PAGE *allocation = aral_mallocz(cache->aral[partition]); +#endif + PGC_PAGE *page; + size_t spins = 0; + + do { + if(++spins > 1) + __atomic_add_fetch(&cache->stats.insert_spins, 1, __ATOMIC_RELAXED); + + pgc_index_write_lock(cache, partition); + + size_t mem_before_judyl = 0, mem_after_judyl = 0; + + mem_before_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + Pvoid_t *metrics_judy_pptr = JudyLIns(&cache->index[partition].sections_judy, entry->section, PJE0); + if(unlikely(!metrics_judy_pptr || metrics_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted sections judy array"); + mem_after_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + + mem_before_judyl += JudyLMemUsed(*metrics_judy_pptr); + Pvoid_t *pages_judy_pptr = JudyLIns(metrics_judy_pptr, entry->metric_id, PJE0); + if(unlikely(!pages_judy_pptr || pages_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted pages judy array"); + mem_after_judyl += JudyLMemUsed(*metrics_judy_pptr); + + mem_before_judyl += JudyLMemUsed(*pages_judy_pptr); + Pvoid_t *page_ptr = JudyLIns(pages_judy_pptr, entry->start_time_s, PJE0); + if(unlikely(!page_ptr || page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in judy array"); + mem_after_judyl += JudyLMemUsed(*pages_judy_pptr); + + pgc_stats_index_judy_change(cache, mem_before_judyl, mem_after_judyl); + + page = *page_ptr; + + if (likely(!page)) { +#ifdef PGC_WITH_ARAL + page = allocation; + allocation = NULL; +#else + page = mallocz(sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page); +#endif + page->refcount = 1; + 
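+            // descriptive note: the caller owns the only reference to the new page (refcount 1);
+            // hot pages start with zero recorded accesses, clean pages with one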
page->accesses = (entry->hot) ? 0 : 1; + page->flags = 0; + page->section = entry->section; + page->metric_id = entry->metric_id; + page->start_time_s = entry->start_time_s; + page->end_time_s = entry->end_time_s, + page->update_every_s = entry->update_every_s, + page->data = entry->data; + page->assumed_size = page_assumed_size(cache, entry->size); + netdata_spinlock_init(&page->transition_spinlock); + page->link.prev = NULL; + page->link.next = NULL; + + if(cache->config.additional_bytes_per_page) { + if(entry->custom_data) + memcpy(page->custom_data, entry->custom_data, cache->config.additional_bytes_per_page); + else + memset(page->custom_data, 0, cache->config.additional_bytes_per_page); + } + + // put it in the index + *page_ptr = page; + pointer_add(cache, page); + pgc_index_write_unlock(cache, partition); + + if (entry->hot) + page_set_hot(cache, page); + else + page_set_clean(cache, page, false, false); + + PGC_REFERENCED_PAGES_PLUS1(cache, page); + + // update statistics + __atomic_add_fetch(&cache->stats.added_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.added_size, page->assumed_size, __ATOMIC_RELAXED); + + __atomic_add_fetch(&cache->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.size, page->assumed_size, __ATOMIC_RELAXED); + + if(added) + *added = true; + } + else { + if (!page_acquire(cache, page)) + page = NULL; + + else if(added) + *added = false; + + pgc_index_write_unlock(cache, partition); + + if(unlikely(!page)) { + // now that we don't have the lock, + // give it some time for the old page to go away + struct timespec ns = { .tv_sec = 0, .tv_nsec = 1 }; + nanosleep(&ns, NULL); + } + } + + } while(!page); + +#ifdef PGC_WITH_ARAL + if(allocation) + aral_freez(cache->aral[partition], allocation); +#endif + + __atomic_sub_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED); + + if(!entry->hot) + evict_on_clean_page_added(cache); + + if((cache->config.options & PGC_OPTIONS_FLUSH_PAGES_INLINE) || flushing_critical(cache)) { + flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL, + false, false); + } + + return page; +} + +static PGC_PAGE *page_find_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method) { + __atomic_add_fetch(&cache->stats.workers_search, 1, __ATOMIC_RELAXED); + + size_t *stats_hit_ptr, *stats_miss_ptr; + + if(method == PGC_SEARCH_CLOSEST) { + __atomic_add_fetch(&cache->stats.searches_closest, 1, __ATOMIC_RELAXED); + stats_hit_ptr = &cache->stats.searches_closest_hits; + stats_miss_ptr = &cache->stats.searches_closest_misses; + } + else { + __atomic_add_fetch(&cache->stats.searches_exact, 1, __ATOMIC_RELAXED); + stats_hit_ptr = &cache->stats.searches_exact_hits; + stats_miss_ptr = &cache->stats.searches_exact_misses; + } + + PGC_PAGE *page = NULL; + size_t partition = pgc_indexing_partition(cache, metric_id); + + pgc_index_read_lock(cache, partition); + + Pvoid_t *metrics_judy_pptr = JudyLGet(cache->index[partition].sections_judy, section, PJE0); + if(unlikely(metrics_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted sections judy array"); + + if(unlikely(!metrics_judy_pptr)) { + // section does not exist + goto cleanup; + } + + Pvoid_t *pages_judy_pptr = JudyLGet(*metrics_judy_pptr, metric_id, PJE0); + if(unlikely(pages_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted pages judy array"); + + if(unlikely(!pages_judy_pptr)) { + // metric does not exist + goto cleanup; + } + + switch(method) { + default: + case PGC_SEARCH_CLOSEST: { + Pvoid_t 
*page_ptr = JudyLGet(*pages_judy_pptr, start_time_s, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + + else { + Word_t time = start_time_s; + + // find the previous page + page_ptr = JudyLLast(*pages_judy_pptr, &time, PJE0); + if(unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array #2"); + + if(page_ptr) { + // found a page starting before our timestamp + // check if our timestamp is included + page = *page_ptr; + if(start_time_s > page->end_time_s) + // it is not good for us + page = NULL; + } + + if(!page) { + // find the next page then... + time = start_time_s; + page_ptr = JudyLNext(*pages_judy_pptr, &time, PJE0); + if(page_ptr) + page = *page_ptr; + } + } + } + break; + + case PGC_SEARCH_EXACT: { + Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, start_time_s, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_FIRST: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLFirst(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_NEXT: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLNext(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_LAST: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLLast(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_PREV: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLPrev(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + } + + if(page) { + pointer_check(cache, page); + + if(!page_acquire(cache, page)) { + // this page is not good to use + page = NULL; + } + } + +cleanup: + pgc_index_read_unlock(cache, partition); + + if(page) { + __atomic_add_fetch(stats_hit_ptr, 1, __ATOMIC_RELAXED); + page_has_been_accessed(cache, page); + } + else + __atomic_add_fetch(stats_miss_ptr, 1, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&cache->stats.workers_search, 1, __ATOMIC_RELAXED); + + return page; +} + +static void all_hot_pages_to_dirty(PGC *cache, Word_t section) { + pgc_ll_lock(cache, &cache->hot); + + bool first = true; + Word_t last_section = (section == PGC_SECTION_ALL) ? 
0 : section; + Pvoid_t *section_pages_pptr; + while ((section_pages_pptr = JudyLFirstThenNext(cache->hot.sections_judy, &last_section, &first))) { + if(section != PGC_SECTION_ALL && last_section != section) + break; + + struct section_pages *sp = *section_pages_pptr; + + PGC_PAGE *page = sp->base; + while(page) { + PGC_PAGE *next = page->link.next; + + if(page_acquire(cache, page)) { + page_set_dirty(cache, page, true); + page_release(cache, page, false); + // page ptr may be invalid now + } + + page = next; + } + } + pgc_ll_unlock(cache, &cache->hot); +} + +// returns true when there is more work to do +static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, bool wait, bool all_of_them) { + internal_fatal(!cache->dirty.linked_list_in_sections_judy, + "wrong dirty pages configuration - dirty pages need to have a judy array, not a linked list"); + + if(!all_of_them && !wait) { + // we have been called from a data collection thread + // let's not waste its time... + + if(!pgc_ll_trylock(cache, &cache->dirty)) { + // we would block, so give up... + return true; + } + + // we got the lock at this point + } + else + pgc_ll_lock(cache, &cache->dirty); + + size_t optimal_flush_size = cache->config.max_dirty_pages_per_call; + size_t dirty_version_at_entry = cache->dirty.version; + if(!all_of_them && (cache->dirty.stats->entries < optimal_flush_size || cache->dirty.last_version_checked == dirty_version_at_entry)) { + pgc_ll_unlock(cache, &cache->dirty); + return false; + } + + __atomic_add_fetch(&cache->stats.workers_flush, 1, __ATOMIC_RELAXED); + + bool have_dirty_lock = true; + + if(all_of_them || !max_flushes) + max_flushes = SIZE_MAX; + + Word_t last_section = (section == PGC_SECTION_ALL) ? 0 : section; + size_t flushes_so_far = 0; + Pvoid_t *section_pages_pptr; + bool stopped_before_finishing = false; + size_t spins = 0; + bool first = true; + + while (have_dirty_lock && (section_pages_pptr = JudyLFirstThenNext(cache->dirty.sections_judy, &last_section, &first))) { + if(section != PGC_SECTION_ALL && last_section != section) + break; + + struct section_pages *sp = *section_pages_pptr; + if(!all_of_them && sp->entries < optimal_flush_size) + continue; + + if(!all_of_them && flushes_so_far > max_flushes) { + stopped_before_finishing = true; + break; + } + + if(++spins > 1) + __atomic_add_fetch(&cache->stats.flush_spins, 1, __ATOMIC_RELAXED); + + PGC_ENTRY array[optimal_flush_size]; + PGC_PAGE *pages[optimal_flush_size]; + size_t pages_added = 0, pages_added_size = 0; + size_t pages_removed_dirty = 0, pages_removed_dirty_size = 0; + size_t pages_cancelled = 0, pages_cancelled_size = 0; + size_t pages_made_clean = 0, pages_made_clean_size = 0; + + PGC_PAGE *page = sp->base; + while (page && pages_added < optimal_flush_size) { + PGC_PAGE *next = page->link.next; + + internal_fatal(page_get_status_flags(page) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + if (page_acquire(cache, page)) { + internal_fatal(page_get_status_flags(page) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + internal_fatal(page->section != last_section, + "DBENGINE CACHE: dirty page is not in the right section (tier)"); + + if(!page_transition_trylock(cache, page)) { + page_release(cache, page, false); + // page ptr may be invalid now + } + else { + pages[pages_added] = page; + array[pages_added] = (PGC_ENTRY) { + .section = page->section, + .metric_id = page->metric_id, + .start_time_s = page->start_time_s, + .end_time_s = 
__atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), + .update_every_s = page->update_every_s, + .size = page_size_from_assumed_size(cache, page->assumed_size), + .data = page->data, + .custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL, + .hot = false, + }; + + pages_added_size += page->assumed_size; + pages_added++; + } + } + + page = next; + } + + // do we have enough to save? + if(all_of_them || pages_added == optimal_flush_size) { + // we should do it + + for (size_t i = 0; i < pages_added; i++) { + PGC_PAGE *tpg = pages[i]; + + internal_fatal(page_get_status_flags(tpg) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + __atomic_add_fetch(&cache->stats.flushing_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.flushing_size, tpg->assumed_size, __ATOMIC_RELAXED); + + // remove it from the dirty list + pgc_ll_del(cache, &cache->dirty, tpg, true); + + pages_removed_dirty_size += tpg->assumed_size; + pages_removed_dirty++; + } + + // next time, repeat the same section (tier) + first = true; + } + else { + // we can't do it + + for (size_t i = 0; i < pages_added; i++) { + PGC_PAGE *tpg = pages[i]; + + internal_fatal(page_get_status_flags(tpg) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + pages_cancelled_size += tpg->assumed_size; + pages_cancelled++; + + page_transition_unlock(cache, tpg); + page_release(cache, tpg, false); + // page ptr may be invalid now + } + + __atomic_add_fetch(&cache->stats.flushes_cancelled, pages_cancelled, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.flushes_cancelled_size, pages_cancelled_size, __ATOMIC_RELAXED); + + internal_fatal(pages_added != pages_cancelled || pages_added_size != pages_cancelled_size, + "DBENGINE CACHE: flushing cancel pages mismatch"); + + // next time, continue to the next section (tier) + first = false; + continue; + } + + if(cache->config.pgc_save_init_cb) + cache->config.pgc_save_init_cb(cache, last_section); + + pgc_ll_unlock(cache, &cache->dirty); + have_dirty_lock = false; + + // call the callback to save them + // it may take some time, so let's release the lock + cache->config.pgc_save_dirty_cb(cache, array, pages, pages_added); + flushes_so_far++; + + __atomic_add_fetch(&cache->stats.flushes_completed, pages_added, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.flushes_completed_size, pages_added_size, __ATOMIC_RELAXED); + + size_t pages_to_evict = 0; (void)pages_to_evict; + for (size_t i = 0; i < pages_added; i++) { + PGC_PAGE *tpg = pages[i]; + + internal_fatal(page_get_status_flags(tpg) != 0, + "DBENGINE CACHE: page should not be in any list while it is being saved"); + + __atomic_sub_fetch(&cache->stats.flushing_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.flushing_size, tpg->assumed_size, __ATOMIC_RELAXED); + + pages_made_clean_size += tpg->assumed_size; + pages_made_clean++; + + if(!tpg->accesses) + pages_to_evict++; + + page_set_clean(cache, tpg, true, false); + page_transition_unlock(cache, tpg); + page_release(cache, tpg, false); + // tpg ptr may be invalid now + } + + internal_fatal(pages_added != pages_made_clean || pages_added != pages_removed_dirty || + pages_added_size != pages_made_clean_size || pages_added_size != pages_removed_dirty_size + , "DBENGINE CACHE: flushing pages mismatch"); + + if(!all_of_them && !wait) { + if(pgc_ll_trylock(cache, &cache->dirty)) + have_dirty_lock = true; + + else { + stopped_before_finishing = true; + have_dirty_lock 
= false; + } + } + else { + pgc_ll_lock(cache, &cache->dirty); + have_dirty_lock = true; + } + } + + if(have_dirty_lock) { + if(!stopped_before_finishing && dirty_version_at_entry > cache->dirty.last_version_checked) + cache->dirty.last_version_checked = dirty_version_at_entry; + + pgc_ll_unlock(cache, &cache->dirty); + } + + __atomic_sub_fetch(&cache->stats.workers_flush, 1, __ATOMIC_RELAXED); + + return stopped_before_finishing; +} + +void free_all_unreferenced_clean_pages(PGC *cache) { + evict_pages(cache, 0, 0, true, true); +} + +// ---------------------------------------------------------------------------- +// public API + +PGC *pgc_create(const char *name, + size_t clean_size_bytes, free_clean_page_callback pgc_free_cb, + size_t max_dirty_pages_per_flush, + save_dirty_init_callback pgc_save_init_cb, + save_dirty_page_callback pgc_save_dirty_cb, + size_t max_pages_per_inline_eviction, size_t max_inline_evictors, + size_t max_skip_pages_per_inline_eviction, + size_t max_flushes_inline, + PGC_OPTIONS options, size_t partitions, size_t additional_bytes_per_page) { + + if(max_pages_per_inline_eviction < 2) + max_pages_per_inline_eviction = 2; + + if(max_dirty_pages_per_flush < 1) + max_dirty_pages_per_flush = 1; + + if(max_flushes_inline * max_dirty_pages_per_flush < 2) + max_flushes_inline = 2; + + PGC *cache = callocz(1, sizeof(PGC)); + strncpyz(cache->config.name, name, PGC_NAME_MAX); + cache->config.options = options; + cache->config.clean_size = (clean_size_bytes < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : clean_size_bytes; + cache->config.pgc_free_clean_cb = pgc_free_cb; + cache->config.max_dirty_pages_per_call = max_dirty_pages_per_flush; + cache->config.pgc_save_init_cb = pgc_save_init_cb; + cache->config.pgc_save_dirty_cb = pgc_save_dirty_cb; + cache->config.max_pages_per_inline_eviction = (max_pages_per_inline_eviction < 2) ? 2 : max_pages_per_inline_eviction; + cache->config.max_skip_pages_per_inline_eviction = (max_skip_pages_per_inline_eviction < 2) ? 2 : max_skip_pages_per_inline_eviction; + cache->config.max_flushes_inline = (max_flushes_inline < 1) ? 1 : max_flushes_inline; + cache->config.partitions = partitions < 1 ? 
(size_t)get_netdata_cpus() : partitions; + cache->config.additional_bytes_per_page = additional_bytes_per_page; + + cache->config.max_workers_evict_inline = max_inline_evictors; + cache->config.severe_pressure_per1000 = 1010; + cache->config.aggressive_evict_per1000 = 990; + cache->config.healthy_size_per1000 = 980; + cache->config.evict_low_threshold_per1000 = 970; + + cache->index = callocz(cache->config.partitions, sizeof(struct pgc_index)); + + for(size_t part = 0; part < cache->config.partitions ; part++) + netdata_rwlock_init(&cache->index[part].rwlock); + + netdata_spinlock_init(&cache->hot.spinlock); + netdata_spinlock_init(&cache->dirty.spinlock); + netdata_spinlock_init(&cache->clean.spinlock); + + cache->hot.flags = PGC_PAGE_HOT; + cache->hot.linked_list_in_sections_judy = true; + cache->hot.stats = &cache->stats.queues.hot; + + cache->dirty.flags = PGC_PAGE_DIRTY; + cache->dirty.linked_list_in_sections_judy = true; + cache->dirty.stats = &cache->stats.queues.dirty; + + cache->clean.flags = PGC_PAGE_CLEAN; + cache->clean.linked_list_in_sections_judy = false; + cache->clean.stats = &cache->stats.queues.clean; + + pgc_section_pages_static_aral_init(); + +#ifdef PGC_WITH_ARAL + cache->aral = callocz(cache->config.partitions, sizeof(ARAL *)); + for(size_t part = 0; part < cache->config.partitions ; part++) { + char buf[100 +1]; + snprintfz(buf, 100, "%s[%zu]", name, part); + cache->aral[part] = aral_create( + buf, + sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page, + 0, + 16384, + aral_statistics(pgc_section_pages_aral), + NULL, NULL, false, false); + } +#endif + + pointer_index_init(cache); + + return cache; +} + +struct aral_statistics *pgc_aral_statistics(void) { + return aral_statistics(pgc_section_pages_aral); +} + +size_t pgc_aral_structures(void) { + return aral_structures(pgc_section_pages_aral); +} + +size_t pgc_aral_overhead(void) { + return aral_overhead(pgc_section_pages_aral); +} + +void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section) { + all_hot_pages_to_dirty(cache, section); + + // save all dirty pages to make them clean + flush_pages(cache, 0, section, true, true); +} + +void pgc_destroy(PGC *cache) { + // convert all hot pages to dirty + all_hot_pages_to_dirty(cache, PGC_SECTION_ALL); + + // save all dirty pages to make them clean + flush_pages(cache, 0, PGC_SECTION_ALL, true, true); + + // free all unreferenced clean pages + free_all_unreferenced_clean_pages(cache); + + if(PGC_REFERENCED_PAGES(cache)) + error("DBENGINE CACHE: there are %zu referenced cache pages - leaving the cache allocated", PGC_REFERENCED_PAGES(cache)); + else { + pointer_destroy_index(cache); + + for(size_t part = 0; part < cache->config.partitions ; part++) + netdata_rwlock_destroy(&cache->index[part].rwlock); + +#ifdef PGC_WITH_ARAL + for(size_t part = 0; part < cache->config.partitions ; part++) + aral_destroy(cache->aral[part]); + + freez(cache->aral); +#endif + + freez(cache); + } +} + +PGC_PAGE *pgc_page_add_and_acquire(PGC *cache, PGC_ENTRY entry, bool *added) { + return page_add(cache, &entry, added); +} + +PGC_PAGE *pgc_page_dup(PGC *cache, PGC_PAGE *page) { + if(!page_acquire(cache, page)) + fatal("DBENGINE CACHE: tried to dup a page that is not acquired!"); + + return page; +} + +void pgc_page_release(PGC *cache, PGC_PAGE *page) { + page_release(cache, page, is_page_clean(page)); +} + +void pgc_page_hot_to_dirty_and_release(PGC *cache, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + +//#ifdef 
NETDATA_INTERNAL_CHECKS +// page_transition_lock(cache, page); +// internal_fatal(!is_page_hot(page), "DBENGINE CACHE: called %s() but page is not hot", __FUNCTION__ ); +// page_transition_unlock(cache, page); +//#endif + + // make page dirty + page_set_dirty(cache, page, false); + + // release the page + page_release(cache, page, true); + // page ptr may be invalid now + + __atomic_sub_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + + // flush, if we have to + if((cache->config.options & PGC_OPTIONS_FLUSH_PAGES_INLINE) || flushing_critical(cache)) { + flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL, + false, false); + } +} + +bool pgc_page_to_clean_evict_or_release(PGC *cache, PGC_PAGE *page) { + bool ret; + + __atomic_add_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + + // prevent accesses from increasing the accesses counter + page_flag_set(page, PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES); + + // zero the accesses counter + __atomic_store_n(&page->accesses, 0, __ATOMIC_RELEASE); + + // if there are no other references to it, evict it immediately + if(make_acquired_page_clean_and_evict_or_page_release(cache, page)) { + __atomic_add_fetch(&cache->stats.hot_empty_pages_evicted_immediately, 1, __ATOMIC_RELAXED); + ret = true; + } + else { + __atomic_add_fetch(&cache->stats.hot_empty_pages_evicted_later, 1, __ATOMIC_RELAXED); + ret = false; + } + + __atomic_sub_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + + return ret; +} + +Word_t pgc_page_section(PGC_PAGE *page) { + return page->section; +} + +Word_t pgc_page_metric(PGC_PAGE *page) { + return page->metric_id; +} + +time_t pgc_page_start_time_s(PGC_PAGE *page) { + return page->start_time_s; +} + +time_t pgc_page_end_time_s(PGC_PAGE *page) { + return page->end_time_s; +} + +time_t pgc_page_update_every_s(PGC_PAGE *page) { + return page->update_every_s; +} + +time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s) { + if(page->update_every_s == 0) + page->update_every_s = update_every_s; + + return page->update_every_s; +} + +time_t pgc_page_fix_end_time_s(PGC_PAGE *page, time_t end_time_s) { + page->end_time_s = end_time_s; + return page->end_time_s; +} + +void *pgc_page_data(PGC_PAGE *page) { + return page->data; +} + +void *pgc_page_custom_data(PGC *cache, PGC_PAGE *page) { + if(cache->config.additional_bytes_per_page) + return page->custom_data; + + return NULL; +} + +size_t pgc_page_data_size(PGC *cache, PGC_PAGE *page) { + return page_size_from_assumed_size(cache, page->assumed_size); +} + +bool pgc_is_page_hot(PGC_PAGE *page) { + return is_page_hot(page); +} + +bool pgc_is_page_dirty(PGC_PAGE *page) { + return is_page_dirty(page); +} + +bool pgc_is_page_clean(PGC_PAGE *page) { + return is_page_clean(page); +} + +void pgc_reset_hot_max(PGC *cache) { + size_t entries = __atomic_load_n(&cache->hot.stats->entries, __ATOMIC_RELAXED); + size_t size = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED); + + __atomic_store_n(&cache->hot.stats->max_entries, entries, __ATOMIC_RELAXED); + __atomic_store_n(&cache->hot.stats->max_size, size, __ATOMIC_RELAXED); + + size_t size_to_evict = 0; + cache_usage_per1000(cache, &size_to_evict); + evict_pages(cache, 0, 0, true, false); +} + +void pgc_set_dynamic_target_cache_size_callback(PGC *cache, dynamic_target_cache_size_callback callback) { + cache->config.dynamic_target_size_cb = callback; + + size_t size_to_evict = 0; + cache_usage_per1000(cache, &size_to_evict); + evict_pages(cache, 0, 0, true, false); +} + 
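+// Typical lifecycle of a page through the public API. This is a minimal sketch
+// mirroring pgc_unittest() at the end of this file; free_clean_cb, save_dirty_cb,
+// buffer, and the sizes/timestamps are illustrative placeholders only:
+//
+//    PGC *cache = pgc_create("example", 32 * 1024 * 1024, free_clean_cb,
+//                            64, NULL, save_dirty_cb,
+//                            10, 10, 1000, 10,
+//                            PGC_OPTIONS_DEFAULT, 1, 0);
+//
+//    PGC_PAGE *page = pgc_page_add_and_acquire(cache, (PGC_ENTRY){
+//        .section = 1, .metric_id = 10,
+//        .start_time_s = 100, .end_time_s = 200,
+//        .update_every_s = 1, .size = 4096, .data = buffer,
+//        .hot = true,                                  // currently being collected
+//    }, NULL);
+//
+//    pgc_page_hot_set_end_time_s(cache, page, 300);    // extend it while collecting
+//    pgc_page_hot_to_dirty_and_release(cache, page);   // queue it to be flushed
+//
+//    page = pgc_page_get_and_acquire(cache, 1, 10, 100, PGC_SEARCH_CLOSEST);
+//    if(page) pgc_page_release(cache, page);
+//
+//    pgc_destroy(cache);   // turns hot pages dirty, flushes them, frees clean pages
+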
+size_t pgc_get_current_cache_size(PGC *cache) { + cache_usage_per1000(cache, NULL); + return __atomic_load_n(&cache->stats.current_cache_size, __ATOMIC_RELAXED); +} + +size_t pgc_get_wanted_cache_size(PGC *cache) { + cache_usage_per1000(cache, NULL); + return __atomic_load_n(&cache->stats.wanted_cache_size, __ATOMIC_RELAXED); +} + +bool pgc_evict_pages(PGC *cache, size_t max_skip, size_t max_evict) { + bool under_pressure = cache_needs_space_aggressively(cache); + return evict_pages(cache, + under_pressure ? 0 : max_skip, + under_pressure ? 0 : max_evict, + true, false); +} + +bool pgc_flush_pages(PGC *cache, size_t max_flushes) { + bool under_pressure = flushing_critical(cache); + return flush_pages(cache, under_pressure ? 0 : max_flushes, PGC_SECTION_ALL, true, false); +} + +void pgc_page_hot_set_end_time_s(PGC *cache __maybe_unused, PGC_PAGE *page, time_t end_time_s) { + internal_fatal(!is_page_hot(page), + "DBENGINE CACHE: end_time_s update on non-hot page"); + + internal_fatal(end_time_s < __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), + "DBENGINE CACHE: end_time_s is not bigger than existing"); + + __atomic_store_n(&page->end_time_s, end_time_s, __ATOMIC_RELAXED); + +#ifdef PGC_COUNT_POINTS_COLLECTED + __atomic_add_fetch(&cache->stats.points_collected, 1, __ATOMIC_RELAXED); +#endif +} + +PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method) { + return page_find_and_acquire(cache, section, metric_id, start_time_s, method); +} + +struct pgc_statistics pgc_get_statistics(PGC *cache) { + // FIXME - get the statistics atomically + return cache->stats; +} + +size_t pgc_hot_and_dirty_entries(PGC *cache) { + size_t entries = 0; + + entries += __atomic_load_n(&cache->hot.stats->entries, __ATOMIC_RELAXED); + entries += __atomic_load_n(&cache->dirty.stats->entries, __ATOMIC_RELAXED); + entries += __atomic_load_n(&cache->stats.flushing_entries, __ATOMIC_RELAXED); + entries += __atomic_load_n(&cache->stats.hot2dirty_entries, __ATOMIC_RELAXED); + + return entries; +} + +void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_fileno, uint8_t type, migrate_to_v2_callback cb, void *data) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_indexing_started, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED); + + pgc_ll_lock(cache, &cache->hot); + + Pvoid_t JudyL_metrics = NULL; + Pvoid_t JudyL_extents_pos = NULL; + + size_t count_of_unique_extents = 0; + size_t count_of_unique_metrics = 0; + size_t count_of_unique_pages = 0; + + size_t master_extent_index_id = 0; + + Pvoid_t *section_pages_pptr = JudyLGet(cache->hot.sections_judy, section, PJE0); + if(!section_pages_pptr) { + pgc_ll_unlock(cache, &cache->hot); + return; + } + + struct section_pages *sp = *section_pages_pptr; + if(!netdata_spinlock_trylock(&sp->migration_to_v2_spinlock)) { + internal_fatal(true, "DBENGINE: migration to journal v2 is already running for this section"); + pgc_ll_unlock(cache, &cache->hot); + return; + } + + ARAL *ar_mi = aral_by_size_acquire(sizeof(struct jv2_metrics_info)); + ARAL *ar_pi = aral_by_size_acquire(sizeof(struct jv2_page_info)); + ARAL *ar_ei = aral_by_size_acquire(sizeof(struct jv2_extents_info)); + + for(PGC_PAGE *page = sp->base; page ; page = page->link.next) { + struct extent_io_data *xio = (struct extent_io_data *)page->custom_data; + if(xio->fileno != datafile_fileno) continue; + + if(page_flag_check(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2)) { + 
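+            // descriptive note: a page already flagged for migration to v2 should never be seen here; skip it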
internal_fatal(true, "Migration to journal v2: page has already been migrated to v2"); + continue; + } + + if(!page_transition_trylock(cache, page)) { + internal_fatal(true, "Migration to journal v2: cannot get page transition lock"); + continue; + } + + if(!page_acquire(cache, page)) { + internal_fatal(true, "Migration to journal v2: cannot acquire page for migration to v2"); + continue; + } + + page_flag_set(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2); + + pgc_ll_unlock(cache, &cache->hot); + + // update the extents JudyL + + size_t current_extent_index_id; + Pvoid_t *PValue = JudyLIns(&JudyL_extents_pos, xio->pos, PJE0); + if(!PValue || *PValue == PJERR) + fatal("Corrupted JudyL extents pos"); + + struct jv2_extents_info *ei; + if(!*PValue) { + ei = aral_mallocz(ar_ei); // callocz(1, sizeof(struct jv2_extents_info)); + ei->pos = xio->pos; + ei->bytes = xio->bytes; + ei->number_of_pages = 1; + ei->index = master_extent_index_id++; + *PValue = ei; + + count_of_unique_extents++; + } + else { + ei = *PValue; + ei->number_of_pages++; + } + + current_extent_index_id = ei->index; + + // update the metrics JudyL + + PValue = JudyLIns(&JudyL_metrics, page->metric_id, PJE0); + if(!PValue || *PValue == PJERR) + fatal("Corrupted JudyL metrics"); + + struct jv2_metrics_info *mi; + if(!*PValue) { + mi = aral_mallocz(ar_mi); // callocz(1, sizeof(struct jv2_metrics_info)); + mi->uuid = mrg_metric_uuid(main_mrg, (METRIC *)page->metric_id); + mi->first_time_s = page->start_time_s; + mi->last_time_s = page->end_time_s; + mi->number_of_pages = 1; + mi->page_list_header = 0; + mi->JudyL_pages_by_start_time = NULL; + *PValue = mi; + + count_of_unique_metrics++; + } + else { + mi = *PValue; + mi->number_of_pages++; + if(page->start_time_s < mi->first_time_s) + mi->first_time_s = page->start_time_s; + if(page->end_time_s > mi->last_time_s) + mi->last_time_s = page->end_time_s; + } + + PValue = JudyLIns(&mi->JudyL_pages_by_start_time, page->start_time_s, PJE0); + if(!PValue || *PValue == PJERR) + fatal("Corrupted JudyL metric pages"); + + if(!*PValue) { + struct jv2_page_info *pi = aral_mallocz(ar_pi); // callocz(1, (sizeof(struct jv2_page_info))); + pi->start_time_s = page->start_time_s; + pi->end_time_s = page->end_time_s; + pi->update_every_s = page->update_every_s; + pi->page_length = page_size_from_assumed_size(cache, page->assumed_size); + pi->page = page; + pi->extent_index = current_extent_index_id; + pi->custom_data = (cache->config.additional_bytes_per_page) ? 
page->custom_data : NULL; + *PValue = pi; + + count_of_unique_pages++; + } + else { + // impossible situation + internal_fatal(true, "Page is already in JudyL metric pages"); + page_flag_clear(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2); + page_transition_unlock(cache, page); + page_release(cache, page, false); + } + + pgc_ll_lock(cache, &cache->hot); + } + + netdata_spinlock_unlock(&sp->migration_to_v2_spinlock); + pgc_ll_unlock(cache, &cache->hot); + + // callback + cb(section, datafile_fileno, type, JudyL_metrics, JudyL_extents_pos, count_of_unique_extents, count_of_unique_metrics, count_of_unique_pages, data); + + { + Pvoid_t *PValue1; + bool metric_id_first = true; + Word_t metric_id = 0; + while ((PValue1 = JudyLFirstThenNext(JudyL_metrics, &metric_id, &metric_id_first))) { + struct jv2_metrics_info *mi = *PValue1; + + Pvoid_t *PValue2; + bool start_time_first = true; + Word_t start_time = 0; + while ((PValue2 = JudyLFirstThenNext(mi->JudyL_pages_by_start_time, &start_time, &start_time_first))) { + struct jv2_page_info *pi = *PValue2; + page_transition_unlock(cache, pi->page); + pgc_page_hot_to_dirty_and_release(cache, pi->page); + // make_acquired_page_clean_and_evict_or_page_release(cache, pi->page); + aral_freez(ar_pi, pi); + } + + JudyLFreeArray(&mi->JudyL_pages_by_start_time, PJE0); + aral_freez(ar_mi, mi); + } + JudyLFreeArray(&JudyL_metrics, PJE0); + } + + { + Pvoid_t *PValue; + bool extent_pos_first = true; + Word_t extent_pos = 0; + while ((PValue = JudyLFirstThenNext(JudyL_extents_pos, &extent_pos, &extent_pos_first))) { + struct jv2_extents_info *ei = *PValue; + aral_freez(ar_ei, ei); + } + JudyLFreeArray(&JudyL_extents_pos, PJE0); + } + + aral_by_size_release(ar_ei); + aral_by_size_release(ar_pi); + aral_by_size_release(ar_mi); + + __atomic_sub_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED); +} + +static bool match_page_data(PGC_PAGE *page, void *data) { + return (page->data == data); +} + +void pgc_open_evict_clean_pages_of_datafile(PGC *cache, struct rrdengine_datafile *datafile) { + evict_pages_with_filter(cache, 0, 0, true, true, match_page_data, datafile); +} + +size_t pgc_count_clean_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr) { + size_t found = 0; + + pgc_ll_lock(cache, &cache->clean); + for(PGC_PAGE *page = cache->clean.base; page ;page = page->link.next) + found += (page->data == ptr && page->section == section) ? 1 : 0; + pgc_ll_unlock(cache, &cache->clean); + + return found; +} + +size_t pgc_count_hot_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr) { + size_t found = 0; + + pgc_ll_lock(cache, &cache->hot); + Pvoid_t *section_pages_pptr = JudyLGet(cache->hot.sections_judy, section, PJE0); + if(section_pages_pptr) { + struct section_pages *sp = *section_pages_pptr; + for(PGC_PAGE *page = sp->base; page ;page = page->link.next) + found += (page->data == ptr) ? 
1 : 0; + } + pgc_ll_unlock(cache, &cache->hot); + + return found; +} + +// ---------------------------------------------------------------------------- +// unittest + +static void unittest_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { + ; +} + +static void unittest_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { + ; +} + +#ifdef PGC_STRESS_TEST + +struct { + bool stop; + PGC *cache; + PGC_PAGE **metrics; + size_t clean_metrics; + size_t hot_metrics; + time_t first_time_t; + time_t last_time_t; + size_t cache_size; + size_t query_threads; + size_t collect_threads; + size_t partitions; + size_t points_per_page; + time_t time_per_collection_ut; + time_t time_per_query_ut; + time_t time_per_flush_ut; + PGC_OPTIONS options; + char rand_statebufs[1024]; + struct random_data *random_data; +} pgc_uts = { + .stop = false, + .metrics = NULL, + .clean_metrics = 100000, + .hot_metrics = 1000000, + .first_time_t = 100000000, + .last_time_t = 0, + .cache_size = 0, // get the default (8MB) + .collect_threads = 16, + .query_threads = 16, + .partitions = 0, // get the default (system cpus) + .options = PGC_OPTIONS_AUTOSCALE,/* PGC_OPTIONS_FLUSH_PAGES_INLINE | PGC_OPTIONS_EVICT_PAGES_INLINE,*/ + .points_per_page = 10, + .time_per_collection_ut = 1000000, + .time_per_query_ut = 250, + .time_per_flush_ut = 100, + .rand_statebufs = {}, + .random_data = NULL, +}; + +void *unittest_stress_test_collector(void *ptr) { + size_t id = *((size_t *)ptr); + + size_t metric_start = pgc_uts.clean_metrics; + size_t metric_end = pgc_uts.clean_metrics + pgc_uts.hot_metrics; + size_t number_of_metrics = metric_end - metric_start; + size_t per_collector_metrics = number_of_metrics / pgc_uts.collect_threads; + metric_start = metric_start + per_collector_metrics * id + 1; + metric_end = metric_start + per_collector_metrics - 1; + + time_t start_time_t = pgc_uts.first_time_t + 1; + + heartbeat_t hb; + heartbeat_init(&hb); + + while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + // info("COLLECTOR %zu: collecting metrics %zu to %zu, from %ld to %lu", id, metric_start, metric_end, start_time_t, start_time_t + pgc_uts.points_per_page); + + netdata_thread_disable_cancelability(); + + for (size_t i = metric_start; i < metric_end; i++) { + bool added; + + pgc_uts.metrics[i] = pgc_page_add_and_acquire(pgc_uts.cache, (PGC_ENTRY) { + .section = 1, + .metric_id = i, + .start_time_t = start_time_t, + .end_time_t = start_time_t, + .update_every = 1, + .size = 4096, + .data = NULL, + .hot = true, + }, &added); + + if(!pgc_is_page_hot(pgc_uts.metrics[i]) || !added) { + pgc_page_release(pgc_uts.cache, pgc_uts.metrics[i]); + pgc_uts.metrics[i] = NULL; + } + } + + time_t end_time_t = start_time_t + (time_t)pgc_uts.points_per_page; + while(++start_time_t <= end_time_t && !__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + heartbeat_next(&hb, pgc_uts.time_per_collection_ut); + + for (size_t i = metric_start; i < metric_end; i++) { + if(pgc_uts.metrics[i]) + pgc_page_hot_set_end_time_t(pgc_uts.cache, pgc_uts.metrics[i], start_time_t); + } + + __atomic_store_n(&pgc_uts.last_time_t, start_time_t, __ATOMIC_RELAXED); + } + + for (size_t i = metric_start; i < metric_end; i++) { + if (pgc_uts.metrics[i]) { + if(i % 10 == 0) + pgc_page_to_clean_evict_or_release(pgc_uts.cache, pgc_uts.metrics[i]); + else + pgc_page_hot_to_dirty_and_release(pgc_uts.cache, pgc_uts.metrics[i]); + } + } + + 
netdata_thread_enable_cancelability(); + } + + return ptr; +} + +void *unittest_stress_test_queries(void *ptr) { + size_t id = *((size_t *)ptr); + struct random_data *random_data = &pgc_uts.random_data[id]; + + size_t start = 0; + size_t end = pgc_uts.clean_metrics + pgc_uts.hot_metrics; + + while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + netdata_thread_disable_cancelability(); + + int32_t random_number; + random_r(random_data, &random_number); + + size_t metric_id = random_number % (end - start); + time_t start_time_t = pgc_uts.first_time_t; + time_t end_time_t = __atomic_load_n(&pgc_uts.last_time_t, __ATOMIC_RELAXED); + if(end_time_t <= start_time_t) + end_time_t = start_time_t + 1; + size_t pages = (end_time_t - start_time_t) / pgc_uts.points_per_page + 1; + + PGC_PAGE *array[pages]; + for(size_t i = 0; i < pages ;i++) + array[i] = NULL; + + // find the pages the cache has + for(size_t i = 0; i < pages ;i++) { + time_t page_start_time = start_time_t + (time_t)(i * pgc_uts.points_per_page); + array[i] = pgc_page_get_and_acquire(pgc_uts.cache, 1, metric_id, + page_start_time, (i < pages - 1)?PGC_SEARCH_EXACT:PGC_SEARCH_CLOSEST); + } + + // load the rest of the pages + for(size_t i = 0; i < pages ;i++) { + if(array[i]) continue; + + time_t page_start_time = start_time_t + (time_t)(i * pgc_uts.points_per_page); + array[i] = pgc_page_add_and_acquire(pgc_uts.cache, (PGC_ENTRY) { + .section = 1, + .metric_id = metric_id, + .start_time_t = page_start_time, + .end_time_t = page_start_time + (time_t)pgc_uts.points_per_page, + .update_every = 1, + .size = 4096, + .data = NULL, + .hot = false, + }, NULL); + } + + // do the query + // ... + struct timespec work_duration = {.tv_sec = 0, .tv_nsec = pgc_uts.time_per_query_ut * NSEC_PER_USEC }; + nanosleep(&work_duration, NULL); + + // release the pages + for(size_t i = 0; i < pages ;i++) { + if(!array[i]) continue; + pgc_page_release(pgc_uts.cache, array[i]); + array[i] = NULL; + } + + netdata_thread_enable_cancelability(); + } + + return ptr; +} + +void *unittest_stress_test_service(void *ptr) { + heartbeat_t hb; + heartbeat_init(&hb); + while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + heartbeat_next(&hb, 1 * USEC_PER_SEC); + + pgc_flush_pages(pgc_uts.cache, 1000); + pgc_evict_pages(pgc_uts.cache, 0, 0); + } + return ptr; +} + +static void unittest_stress_test_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { + // info("SAVE %zu pages", entries); + if(!pgc_uts.stop) { + usec_t t = pgc_uts.time_per_flush_ut; + + if(t > 0) { + struct timespec work_duration = { + .tv_sec = t / USEC_PER_SEC, + .tv_nsec = (long) ((t % USEC_PER_SEC) * NSEC_PER_USEC) + }; + + nanosleep(&work_duration, NULL); + } + } +} + +void unittest_stress_test(void) { + pgc_uts.cache = pgc_create(pgc_uts.cache_size * 1024 * 1024, + unittest_free_clean_page_callback, + 64, unittest_stress_test_save_dirty_page_callback, + 1000, 10000, 1, + pgc_uts.options, pgc_uts.partitions, 0); + + pgc_uts.metrics = callocz(pgc_uts.clean_metrics + pgc_uts.hot_metrics, sizeof(PGC_PAGE *)); + + pthread_t service_thread; + netdata_thread_create(&service_thread, "SERVICE", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + unittest_stress_test_service, NULL); + + pthread_t collect_threads[pgc_uts.collect_threads]; + size_t collect_thread_ids[pgc_uts.collect_threads]; + for(size_t i = 0; i < pgc_uts.collect_threads ;i++) { + collect_thread_ids[i] = i; + 
char buffer[100 + 1]; + snprintfz(buffer, 100, "COLLECT_%zu", i); + netdata_thread_create(&collect_threads[i], buffer, + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + unittest_stress_test_collector, &collect_thread_ids[i]); + } + + pthread_t queries_threads[pgc_uts.query_threads]; + size_t query_thread_ids[pgc_uts.query_threads]; + pgc_uts.random_data = callocz(pgc_uts.query_threads, sizeof(struct random_data)); + for(size_t i = 0; i < pgc_uts.query_threads ;i++) { + query_thread_ids[i] = i; + char buffer[100 + 1]; + snprintfz(buffer, 100, "QUERY_%zu", i); + initstate_r(1, pgc_uts.rand_statebufs, 1024, &pgc_uts.random_data[i]); + netdata_thread_create(&queries_threads[i], buffer, + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + unittest_stress_test_queries, &query_thread_ids[i]); + } + + heartbeat_t hb; + heartbeat_init(&hb); + + struct { + size_t entries; + size_t added; + size_t deleted; + size_t referenced; + + size_t hot_entries; + size_t hot_added; + size_t hot_deleted; + + size_t dirty_entries; + size_t dirty_added; + size_t dirty_deleted; + + size_t clean_entries; + size_t clean_added; + size_t clean_deleted; + + size_t searches_exact; + size_t searches_exact_hits; + size_t searches_closest; + size_t searches_closest_hits; + + size_t collections; + + size_t events_cache_under_severe_pressure; + size_t events_cache_needs_space_90; + size_t events_flush_critical; + } stats = {}, old_stats = {}; + + for(int i = 0; i < 86400 ;i++) { + heartbeat_next(&hb, 1 * USEC_PER_SEC); + + old_stats = stats; + stats.entries = __atomic_load_n(&pgc_uts.cache->stats.entries, __ATOMIC_RELAXED); + stats.added = __atomic_load_n(&pgc_uts.cache->stats.added_entries, __ATOMIC_RELAXED); + stats.deleted = __atomic_load_n(&pgc_uts.cache->stats.removed_entries, __ATOMIC_RELAXED); + stats.referenced = __atomic_load_n(&pgc_uts.cache->stats.referenced_entries, __ATOMIC_RELAXED); + + stats.hot_entries = __atomic_load_n(&pgc_uts.cache->hot.stats->entries, __ATOMIC_RELAXED); + stats.hot_added = __atomic_load_n(&pgc_uts.cache->hot.stats->added_entries, __ATOMIC_RELAXED); + stats.hot_deleted = __atomic_load_n(&pgc_uts.cache->hot.stats->removed_entries, __ATOMIC_RELAXED); + + stats.dirty_entries = __atomic_load_n(&pgc_uts.cache->dirty.stats->entries, __ATOMIC_RELAXED); + stats.dirty_added = __atomic_load_n(&pgc_uts.cache->dirty.stats->added_entries, __ATOMIC_RELAXED); + stats.dirty_deleted = __atomic_load_n(&pgc_uts.cache->dirty.stats->removed_entries, __ATOMIC_RELAXED); + + stats.clean_entries = __atomic_load_n(&pgc_uts.cache->clean.stats->entries, __ATOMIC_RELAXED); + stats.clean_added = __atomic_load_n(&pgc_uts.cache->clean.stats->added_entries, __ATOMIC_RELAXED); + stats.clean_deleted = __atomic_load_n(&pgc_uts.cache->clean.stats->removed_entries, __ATOMIC_RELAXED); + + stats.searches_exact = __atomic_load_n(&pgc_uts.cache->stats.searches_exact, __ATOMIC_RELAXED); + stats.searches_exact_hits = __atomic_load_n(&pgc_uts.cache->stats.searches_exact_hits, __ATOMIC_RELAXED); + + stats.searches_closest = __atomic_load_n(&pgc_uts.cache->stats.searches_closest, __ATOMIC_RELAXED); + stats.searches_closest_hits = __atomic_load_n(&pgc_uts.cache->stats.searches_closest_hits, __ATOMIC_RELAXED); + + stats.events_cache_under_severe_pressure = __atomic_load_n(&pgc_uts.cache->stats.events_cache_under_severe_pressure, __ATOMIC_RELAXED); + stats.events_cache_needs_space_90 = __atomic_load_n(&pgc_uts.cache->stats.events_cache_needs_space_aggressively, __ATOMIC_RELAXED); + 
stats.events_flush_critical = __atomic_load_n(&pgc_uts.cache->stats.events_flush_critical, __ATOMIC_RELAXED); + + size_t searches_exact = stats.searches_exact - old_stats.searches_exact; + size_t searches_closest = stats.searches_closest - old_stats.searches_closest; + + size_t hit_exact = stats.searches_exact_hits - old_stats.searches_exact_hits; + size_t hit_closest = stats.searches_closest_hits - old_stats.searches_closest_hits; + + double hit_exact_pc = (searches_exact > 0) ? (double)hit_exact * 100.0 / (double)searches_exact : 0.0; + double hit_closest_pc = (searches_closest > 0) ? (double)hit_closest * 100.0 / (double)searches_closest : 0.0; + +#ifdef PGC_COUNT_POINTS_COLLECTED + stats.collections = __atomic_load_n(&pgc_uts.cache->stats.points_collected, __ATOMIC_RELAXED); +#endif + + char *cache_status = "N"; + if(stats.events_cache_under_severe_pressure > old_stats.events_cache_under_severe_pressure) + cache_status = "F"; + else if(stats.events_cache_needs_space_90 > old_stats.events_cache_needs_space_90) + cache_status = "f"; + + char *flushing_status = "N"; + if(stats.events_flush_critical > old_stats.events_flush_critical) + flushing_status = "F"; + + info("PGS %5zuk +%4zuk/-%4zuk " + "| RF %5zuk " + "| HOT %5zuk +%4zuk -%4zuk " + "| DRT %s %5zuk +%4zuk -%4zuk " + "| CLN %s %5zuk +%4zuk -%4zuk " + "| SRCH %4zuk %4zuk, HIT %4.1f%% %4.1f%% " +#ifdef PGC_COUNT_POINTS_COLLECTED + "| CLCT %8.4f Mps" +#endif + , stats.entries / 1000 + , (stats.added - old_stats.added) / 1000, (stats.deleted - old_stats.deleted) / 1000 + , stats.referenced / 1000 + , stats.hot_entries / 1000, (stats.hot_added - old_stats.hot_added) / 1000, (stats.hot_deleted - old_stats.hot_deleted) / 1000 + , flushing_status + , stats.dirty_entries / 1000 + , (stats.dirty_added - old_stats.dirty_added) / 1000, (stats.dirty_deleted - old_stats.dirty_deleted) / 1000 + , cache_status + , stats.clean_entries / 1000 + , (stats.clean_added - old_stats.clean_added) / 1000, (stats.clean_deleted - old_stats.clean_deleted) / 1000 + , searches_exact / 1000, searches_closest / 1000 + , hit_exact_pc, hit_closest_pc +#ifdef PGC_COUNT_POINTS_COLLECTED + , (double)(stats.collections - old_stats.collections) / 1000.0 / 1000.0 +#endif + ); + } + info("Waiting for threads to stop..."); + __atomic_store_n(&pgc_uts.stop, true, __ATOMIC_RELAXED); + + netdata_thread_join(service_thread, NULL); + + for(size_t i = 0; i < pgc_uts.collect_threads ;i++) + netdata_thread_join(collect_threads[i],NULL); + + for(size_t i = 0; i < pgc_uts.query_threads ;i++) + netdata_thread_join(queries_threads[i],NULL); + + pgc_destroy(pgc_uts.cache); + + freez(pgc_uts.metrics); + freez(pgc_uts.random_data); +} +#endif + +int pgc_unittest(void) { + PGC *cache = pgc_create("test", + 32 * 1024 * 1024, unittest_free_clean_page_callback, + 64, NULL, unittest_save_dirty_page_callback, + 10, 10, 1000, 10, + PGC_OPTIONS_DEFAULT, 1, 11); + + // FIXME - unit tests + // - add clean page + // - add clean page again (should not add it) + // - release page (should decrement counters) + // - add hot page + // - add hot page again (should not add it) + // - turn hot page to dirty, with and without a reference counter to it + // - dirty pages are saved once there are enough of them + // - find page exact + // - find page (should return last) + // - find page (should return next) + // - page cache full (should evict) + // - on destroy, turn hot pages to dirty and save them + + PGC_PAGE *page1 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){ + .section = 1, + .metric_id = 10, + 
.start_time_s = 100, + .end_time_s = 1000, + .size = 4096, + .data = NULL, + .hot = false, + .custom_data = (uint8_t *)"0123456789", + }, NULL); + + if(strcmp(pgc_page_custom_data(cache, page1), "0123456789") != 0) + fatal("custom data do not work"); + + memcpy(pgc_page_custom_data(cache, page1), "ABCDEFGHIJ", 11); + if(strcmp(pgc_page_custom_data(cache, page1), "ABCDEFGHIJ") != 0) + fatal("custom data do not work"); + + pgc_page_release(cache, page1); + + PGC_PAGE *page2 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){ + .section = 2, + .metric_id = 10, + .start_time_s = 1001, + .end_time_s = 2000, + .size = 4096, + .data = NULL, + .hot = true, + }, NULL); + + pgc_page_hot_set_end_time_s(cache, page2, 2001); + pgc_page_hot_to_dirty_and_release(cache, page2); + + PGC_PAGE *page3 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){ + .section = 3, + .metric_id = 10, + .start_time_s = 1001, + .end_time_s = 2000, + .size = 4096, + .data = NULL, + .hot = true, + }, NULL); + + pgc_page_hot_set_end_time_s(cache, page3, 2001); + pgc_page_hot_to_dirty_and_release(cache, page3); + + pgc_destroy(cache); + +#ifdef PGC_STRESS_TEST + unittest_stress_test(); +#endif + + return 0; +} diff --git a/database/engine/cache.h b/database/engine/cache.h new file mode 100644 index 000000000..65e6a6137 --- /dev/null +++ b/database/engine/cache.h @@ -0,0 +1,249 @@ +#ifndef DBENGINE_CACHE_H +#define DBENGINE_CACHE_H + +#include "../rrd.h" + +// CACHE COMPILE TIME CONFIGURATION +// #define PGC_COUNT_POINTS_COLLECTED 1 + +typedef struct pgc PGC; +typedef struct pgc_page PGC_PAGE; +#define PGC_NAME_MAX 23 + +typedef enum __attribute__ ((__packed__)) { + PGC_OPTIONS_NONE = 0, + PGC_OPTIONS_EVICT_PAGES_INLINE = (1 << 0), + PGC_OPTIONS_FLUSH_PAGES_INLINE = (1 << 1), + PGC_OPTIONS_AUTOSCALE = (1 << 2), +} PGC_OPTIONS; + +#define PGC_OPTIONS_DEFAULT (PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE | PGC_OPTIONS_AUTOSCALE) + +typedef struct pgc_entry { + Word_t section; // the section this belongs to + Word_t metric_id; // the metric this belongs to + time_t start_time_s; // the start time of the page + time_t end_time_s; // the end time of the page + size_t size; // the size in bytes of the allocation, outside the cache + void *data; // a pointer to data outside the cache + uint32_t update_every_s; // the update every of the page + bool hot; // true if this entry is currently being collected + uint8_t *custom_data; +} PGC_ENTRY; + +#define PGC_CACHE_LINE_PADDING(x) uint8_t padding##x[128] + +struct pgc_queue_statistics { + size_t entries; + size_t size; + + PGC_CACHE_LINE_PADDING(1); + + size_t max_entries; + size_t max_size; + + PGC_CACHE_LINE_PADDING(2); + + size_t added_entries; + size_t added_size; + + PGC_CACHE_LINE_PADDING(3); + + size_t removed_entries; + size_t removed_size; + + PGC_CACHE_LINE_PADDING(4); +}; + +struct pgc_statistics { + size_t wanted_cache_size; + size_t current_cache_size; + + PGC_CACHE_LINE_PADDING(1); + + size_t added_entries; + size_t added_size; + + PGC_CACHE_LINE_PADDING(2); + + size_t removed_entries; + size_t removed_size; + + PGC_CACHE_LINE_PADDING(3); + + size_t entries; // all the entries (includes clean, dirty, host) + size_t size; // all the entries (includes clean, dirty, host) + + size_t evicting_entries; + size_t evicting_size; + + size_t flushing_entries; + size_t flushing_size; + + size_t hot2dirty_entries; + size_t hot2dirty_size; + + PGC_CACHE_LINE_PADDING(4); + + size_t acquires; + PGC_CACHE_LINE_PADDING(4a); + size_t releases; + PGC_CACHE_LINE_PADDING(4b); + size_t 
acquires_for_deletion; + PGC_CACHE_LINE_PADDING(4c); + + size_t referenced_entries; // all the entries currently referenced + size_t referenced_size; // all the entries currently referenced + + PGC_CACHE_LINE_PADDING(5); + + size_t searches_exact; + size_t searches_exact_hits; + size_t searches_exact_misses; + + PGC_CACHE_LINE_PADDING(6); + + size_t searches_closest; + size_t searches_closest_hits; + size_t searches_closest_misses; + + PGC_CACHE_LINE_PADDING(7); + + size_t flushes_completed; + size_t flushes_completed_size; + size_t flushes_cancelled; + size_t flushes_cancelled_size; + +#ifdef PGC_COUNT_POINTS_COLLECTED + PGC_CACHE_LINE_PADDING(8); + size_t points_collected; +#endif + + PGC_CACHE_LINE_PADDING(9); + + size_t insert_spins; + size_t evict_spins; + size_t release_spins; + size_t acquire_spins; + size_t delete_spins; + size_t flush_spins; + + PGC_CACHE_LINE_PADDING(10); + + size_t workers_search; + size_t workers_add; + size_t workers_evict; + size_t workers_flush; + size_t workers_jv2_flush; + size_t workers_hot2dirty; + + size_t evict_skipped; + size_t hot_empty_pages_evicted_immediately; + size_t hot_empty_pages_evicted_later; + + PGC_CACHE_LINE_PADDING(11); + + // events + size_t events_cache_under_severe_pressure; + size_t events_cache_needs_space_aggressively; + size_t events_flush_critical; + + PGC_CACHE_LINE_PADDING(12); + + struct { + PGC_CACHE_LINE_PADDING(0); + struct pgc_queue_statistics hot; + PGC_CACHE_LINE_PADDING(1); + struct pgc_queue_statistics dirty; + PGC_CACHE_LINE_PADDING(2); + struct pgc_queue_statistics clean; + PGC_CACHE_LINE_PADDING(3); + } queues; +}; + + +typedef void (*free_clean_page_callback)(PGC *cache, PGC_ENTRY entry); +typedef void (*save_dirty_page_callback)(PGC *cache, PGC_ENTRY *entries_array, PGC_PAGE **pages_array, size_t entries); +typedef void (*save_dirty_init_callback)(PGC *cache, Word_t section); +// create a cache +PGC *pgc_create(const char *name, + size_t clean_size_bytes, free_clean_page_callback pgc_free_clean_cb, + size_t max_dirty_pages_per_flush, save_dirty_init_callback pgc_save_init_cb, save_dirty_page_callback pgc_save_dirty_cb, + size_t max_pages_per_inline_eviction, size_t max_inline_evictors, + size_t max_skip_pages_per_inline_eviction, + size_t max_flushes_inline, + PGC_OPTIONS options, size_t partitions, size_t additional_bytes_per_page); + +// destroy the cache +void pgc_destroy(PGC *cache); + +#define PGC_SECTION_ALL ((Word_t)0) +void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section); + +// add a page to the cache and return a pointer to it +PGC_PAGE *pgc_page_add_and_acquire(PGC *cache, PGC_ENTRY entry, bool *added); + +// get another reference counter on an already referenced page +PGC_PAGE *pgc_page_dup(PGC *cache, PGC_PAGE *page); + +// release a page (all pointers to it are now invalid) +void pgc_page_release(PGC *cache, PGC_PAGE *page); + +// mark a hot page dirty, and release it +void pgc_page_hot_to_dirty_and_release(PGC *cache, PGC_PAGE *page); + +// find a page from the cache +typedef enum { + PGC_SEARCH_EXACT, + PGC_SEARCH_CLOSEST, + PGC_SEARCH_FIRST, + PGC_SEARCH_NEXT, + PGC_SEARCH_LAST, + PGC_SEARCH_PREV, +} PGC_SEARCH; + +PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method); + +// get information from an acquired page +Word_t pgc_page_section(PGC_PAGE *page); +Word_t pgc_page_metric(PGC_PAGE *page); +time_t pgc_page_start_time_s(PGC_PAGE *page); +time_t pgc_page_end_time_s(PGC_PAGE *page); +time_t 
pgc_page_update_every_s(PGC_PAGE *page); +time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s); +time_t pgc_page_fix_end_time_s(PGC_PAGE *page, time_t end_time_s); +void *pgc_page_data(PGC_PAGE *page); +void *pgc_page_custom_data(PGC *cache, PGC_PAGE *page); +size_t pgc_page_data_size(PGC *cache, PGC_PAGE *page); +bool pgc_is_page_hot(PGC_PAGE *page); +bool pgc_is_page_dirty(PGC_PAGE *page); +bool pgc_is_page_clean(PGC_PAGE *page); +void pgc_reset_hot_max(PGC *cache); +size_t pgc_get_current_cache_size(PGC *cache); +size_t pgc_get_wanted_cache_size(PGC *cache); + +// resetting the end time of a hot page +void pgc_page_hot_set_end_time_s(PGC *cache, PGC_PAGE *page, time_t end_time_s); +bool pgc_page_to_clean_evict_or_release(PGC *cache, PGC_PAGE *page); + +typedef void (*migrate_to_v2_callback)(Word_t section, unsigned datafile_fileno, uint8_t type, Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, size_t count_of_unique_extents, size_t count_of_unique_metrics, size_t count_of_unique_pages, void *data); +void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_fileno, uint8_t type, migrate_to_v2_callback cb, void *data); +void pgc_open_evict_clean_pages_of_datafile(PGC *cache, struct rrdengine_datafile *datafile); +size_t pgc_count_clean_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr); +size_t pgc_count_hot_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr); + +typedef size_t (*dynamic_target_cache_size_callback)(void); +void pgc_set_dynamic_target_cache_size_callback(PGC *cache, dynamic_target_cache_size_callback callback); + +// return true when there is more work to do +bool pgc_evict_pages(PGC *cache, size_t max_skip, size_t max_evict); +bool pgc_flush_pages(PGC *cache, size_t max_flushes); + +struct pgc_statistics pgc_get_statistics(PGC *cache); +size_t pgc_hot_and_dirty_entries(PGC *cache); + +struct aral_statistics *pgc_aral_statistics(void); +size_t pgc_aral_structures(void); +size_t pgc_aral_overhead(void); + +#endif // DBENGINE_CACHE_H diff --git a/database/engine/datafile.c b/database/engine/datafile.c index 9c70068d9..286ae1e30 100644 --- a/database/engine/datafile.c +++ b/database/engine/datafile.c @@ -1,58 +1,174 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -void df_extent_insert(struct extent_info *extent) -{ - struct rrdengine_datafile *datafile = extent->datafile; - - if (likely(NULL != datafile->extents.last)) { - datafile->extents.last->next = extent; - } - if (unlikely(NULL == datafile->extents.first)) { - datafile->extents.first = extent; - } - datafile->extents.last = extent; -} - void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { - if (likely(NULL != ctx->datafiles.last)) { - ctx->datafiles.last->next = datafile; - } - if (unlikely(NULL == ctx->datafiles.first)) { - ctx->datafiles.first = datafile; - } - ctx->datafiles.last = datafile; + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); } -void datafile_list_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) +void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { - struct rrdengine_datafile *next; - - next = datafile->next; - fatal_assert((NULL != next) && (ctx->datafiles.first == datafile) && (ctx->datafiles.last != datafile)); - ctx->datafiles.first = next; + 
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next); } -static void datafile_init(struct rrdengine_datafile *datafile, struct rrdengine_instance *ctx, - unsigned tier, unsigned fileno) +static struct rrdengine_datafile *datafile_alloc_and_init(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno) { fatal_assert(tier == 1); + + struct rrdengine_datafile *datafile = callocz(1, sizeof(struct rrdengine_datafile)); + datafile->tier = tier; datafile->fileno = fileno; - datafile->file = (uv_file)0; - datafile->pos = 0; - datafile->extents.first = datafile->extents.last = NULL; /* will be populated by journalfile */ - datafile->journalfile = NULL; - datafile->next = NULL; + fatal_assert(0 == uv_rwlock_init(&datafile->extent_rwlock)); datafile->ctx = ctx; + + datafile->users.available = true; + + netdata_spinlock_init(&datafile->users.spinlock); + netdata_spinlock_init(&datafile->writers.spinlock); + netdata_spinlock_init(&datafile->extent_queries.spinlock); + + return datafile; +} + +void datafile_acquire_dup(struct rrdengine_datafile *df) { + netdata_spinlock_lock(&df->users.spinlock); + + if(!df->users.lockers) + fatal("DBENGINE: datafile is not acquired to duplicate"); + + df->users.lockers++; + + netdata_spinlock_unlock(&df->users.spinlock); +} + +bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { + bool ret; + + netdata_spinlock_lock(&df->users.spinlock); + + if(df->users.available) { + ret = true; + df->users.lockers++; + df->users.lockers_by_reason[reason]++; + } + else + ret = false; + + netdata_spinlock_unlock(&df->users.spinlock); + + return ret; +} + +void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { + netdata_spinlock_lock(&df->users.spinlock); + if(!df->users.lockers) + fatal("DBENGINE DATAFILE: cannot release a datafile that is not acquired"); + + df->users.lockers--; + df->users.lockers_by_reason[reason]--; + netdata_spinlock_unlock(&df->users.spinlock); +} + +bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { + bool can_be_deleted = false; + + netdata_spinlock_lock(&df->users.spinlock); + df->users.available = false; + + if(!df->users.lockers) + can_be_deleted = true; + + else { + // there are lockers + + // evict any pages referencing this in the open cache + netdata_spinlock_unlock(&df->users.spinlock); + pgc_open_evict_clean_pages_of_datafile(open_cache, df); + netdata_spinlock_lock(&df->users.spinlock); + + if(!df->users.lockers) + can_be_deleted = true; + + else { + // there are lockers still + + // count the number of pages referencing this in the open cache + netdata_spinlock_unlock(&df->users.spinlock); + usec_t time_to_scan_ut = now_monotonic_usec(); + size_t clean_pages_in_open_cache = pgc_count_clean_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df); + size_t hot_pages_in_open_cache = pgc_count_hot_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df); + time_to_scan_ut = now_monotonic_usec() - time_to_scan_ut; + netdata_spinlock_lock(&df->users.spinlock); + + if(!df->users.lockers) + can_be_deleted = true; + + else if(!clean_pages_in_open_cache && !hot_pages_in_open_cache) { + // no pages in the open cache related to this datafile + + time_t now_s = now_monotonic_sec(); + + if(!df->users.time_to_evict) { + // first time we did the above + df->users.time_to_evict = now_s + 120; + internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, " + "but it has %u lockers (oc:%u, pd:%u), " + "%zu clean and %zu 
hot open cache pages " + "- will be deleted shortly " + "(scanned open cache in %llu usecs)", + df->fileno, df->ctx->config.tier, + df->users.lockers, + df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE], + df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS], + clean_pages_in_open_cache, + hot_pages_in_open_cache, + time_to_scan_ut); + } + + else if(now_s > df->users.time_to_evict) { + // time expired, lets remove it + can_be_deleted = true; + internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, " + "but it has %u lockers (oc:%u, pd:%u), " + "%zu clean and %zu hot open cache pages " + "- will be deleted now " + "(scanned open cache in %llu usecs)", + df->fileno, df->ctx->config.tier, + df->users.lockers, + df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE], + df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS], + clean_pages_in_open_cache, + hot_pages_in_open_cache, + time_to_scan_ut); + } + } + else + internal_error(true, "DBENGINE: datafile %u of tier %d " + "has %u lockers (oc:%u, pd:%u), " + "%zu clean and %zu hot open cache pages " + "(scanned open cache in %llu usecs)", + df->fileno, df->ctx->config.tier, + df->users.lockers, + df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE], + df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS], + clean_pages_in_open_cache, + hot_pages_in_open_cache, + time_to_scan_ut); + } + } + netdata_spinlock_unlock(&df->users.spinlock); + + return can_be_deleted; } void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { (void) snprintfz(str, maxlen, "%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION, - datafile->ctx->dbfiles_path, datafile->tier, datafile->fileno); + datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); } int close_data_file(struct rrdengine_datafile *datafile) @@ -66,9 +182,8 @@ int close_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_close(NULL, &req, datafile->file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -86,18 +201,17 @@ int unlink_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.datafile_deletions; + __atomic_add_fetch(&ctx->stats.datafile_deletions, 1, __ATOMIC_RELAXED); return ret; } -int destroy_data_file(struct rrdengine_datafile *datafile) +int destroy_data_file_unsafe(struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; @@ -108,29 +222,26 @@ int destroy_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_ftruncate(NULL, &req, datafile->file, 0, NULL); if (ret < 0) { - error("uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_close(NULL, &req, datafile->file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - 
rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.datafile_deletions; + __atomic_add_fetch(&ctx->stats.datafile_deletions, 1, __ATOMIC_RELAXED); return ret; } @@ -146,18 +257,17 @@ int create_data_file(struct rrdengine_datafile *datafile) char path[RRDENG_PATH_MAX]; generate_datafilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_CREAT | O_RDWR | O_TRUNC, &file); + fd = open_file_for_io(path, O_CREAT | O_RDWR | O_TRUNC, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); return fd; } datafile->file = file; - ++ctx->stats.datafile_creations; + __atomic_add_fetch(&ctx->stats.datafile_creations, 1, __ATOMIC_RELAXED); ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } memset(superblock, 0, sizeof(*superblock)); (void) strncpy(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ); @@ -169,20 +279,18 @@ int create_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); - error("uv_fs_write: %s", uv_strerror(ret)); - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); + error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); + ctx_io_error(ctx); } uv_fs_req_cleanup(&req); posix_memfree(superblock); if (ret < 0) { - destroy_data_file(datafile); + destroy_data_file_unsafe(datafile); return ret; } datafile->pos = sizeof(*superblock); - ctx->stats.io_write_bytes += sizeof(*superblock); - ++ctx->stats.io_write_requests; + ctx_io_write_op_bytes(ctx, sizeof(*superblock)); return 0; } @@ -196,13 +304,13 @@ static int check_data_file_superblock(uv_file file) ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } iov = uv_buf_init((void *)superblock, sizeof(*superblock)); ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - error("uv_fs_read: %s", uv_strerror(ret)); + error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); uv_fs_req_cleanup(&req); goto error; } @@ -212,7 +320,7 @@ static int check_data_file_superblock(uv_file file) if (strncmp(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ) || strncmp(superblock->version, RRDENG_DF_VER, RRDENG_VER_SZ) || superblock->tier != 1) { - error("File has invalid superblock."); + error("DBENGINE: file has invalid superblock."); ret = UV_EINVAL; } else { ret = 0; @@ -232,13 +340,12 @@ static int load_data_file(struct rrdengine_datafile *datafile) char path[RRDENG_PATH_MAX]; generate_datafilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_RDWR, &file); + fd = open_file_for_io(path, O_RDWR, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); return fd; } - info("Initializing data file \"%s\".", path); + 
info("DBENGINE: initializing data file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); if (ret) @@ -248,22 +355,21 @@ static int load_data_file(struct rrdengine_datafile *datafile) ret = check_data_file_superblock(file); if (ret) goto error; - ctx->stats.io_read_bytes += sizeof(struct rrdeng_df_sb); - ++ctx->stats.io_read_requests; + + ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_df_sb)); datafile->file = file; datafile->pos = file_size; - info("Data file \"%s\" initialized (size:%"PRIu64").", path, file_size); + info("DBENGINE: data file \"%s\" initialized (size:%"PRIu64").", path, file_size); return 0; error: error = ret; ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); return error; @@ -286,30 +392,26 @@ static int scan_data_files(struct rrdengine_instance *ctx) { int ret; unsigned tier, no, matched_files, i,failed_to_load; - static uv_fs_t req; + uv_fs_t req; uv_dirent_t dent; struct rrdengine_datafile **datafiles, *datafile; struct rrdengine_journalfile *journalfile; - ret = uv_fs_scandir(NULL, &req, ctx->dbfiles_path, 0, NULL); + ret = uv_fs_scandir(NULL, &req, ctx->config.dbfiles_path, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); uv_fs_req_cleanup(&req); - error("uv_fs_scandir(%s): %s", ctx->dbfiles_path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_scandir(%s): %s", ctx->config.dbfiles_path, uv_strerror(ret)); + ctx_fs_error(ctx); return ret; } - info("Found %d files in path %s", ret, ctx->dbfiles_path); + info("DBENGINE: found %d files in path %s", ret, ctx->config.dbfiles_path); datafiles = callocz(MIN(ret, MAX_DATAFILES), sizeof(*datafiles)); for (matched_files = 0 ; UV_EOF != uv_fs_scandir_next(&req, &dent) && matched_files < MAX_DATAFILES ; ) { - info("Scanning file \"%s/%s\"", ctx->dbfiles_path, dent.name); ret = sscanf(dent.name, DATAFILE_PREFIX RRDENG_FILE_NUMBER_SCAN_TMPL DATAFILE_EXTENSION, &tier, &no); if (2 == ret) { - info("Matched file \"%s/%s\"", ctx->dbfiles_path, dent.name); - datafile = mallocz(sizeof(*datafile)); - datafile_init(datafile, ctx, tier, no); + datafile = datafile_alloc_and_init(ctx, tier, no); datafiles[matched_files++] = datafile; } } @@ -320,11 +422,11 @@ static int scan_data_files(struct rrdengine_instance *ctx) return 0; } if (matched_files == MAX_DATAFILES) { - error("Warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); + error("DBENGINE: warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); } qsort(datafiles, matched_files, sizeof(*datafiles), scan_data_files_cmp); /* TODO: change this when tiering is implemented */ - ctx->last_fileno = datafiles[matched_files - 1]->fileno; + ctx->atomic.last_fileno = datafiles[matched_files - 1]->fileno; for (failed_to_load = 0, i = 0 ; i < matched_files ; ++i) { uint8_t must_delete_pair = 0; @@ -334,10 +436,8 @@ static int scan_data_files(struct rrdengine_instance *ctx) if (0 != ret) { must_delete_pair = 1; } - journalfile = mallocz(sizeof(*journalfile)); - datafile->journalfile = journalfile; - journalfile_init(journalfile, datafile); - ret = load_journal_file(ctx, journalfile, datafile); + journalfile = journalfile_alloc_and_init(datafile); + ret = journalfile_load(ctx, journalfile, 
datafile); if (0 != ret) { if (!must_delete_pair) /* If datafile is still open close it */ close_data_file(datafile); @@ -346,16 +446,16 @@ static int scan_data_files(struct rrdengine_instance *ctx) if (must_delete_pair) { char path[RRDENG_PATH_MAX]; - error("Deleting invalid data and journal file pair."); - ret = unlink_journal_file(journalfile); + error("DBENGINE: deleting invalid data and journal file pair."); + ret = journalfile_unlink(journalfile); if (!ret) { - generate_journalfilepath(datafile, path, sizeof(path)); - info("Deleted journal file \"%s\".", path); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: deleted journal file \"%s\".", path); } ret = unlink_data_file(datafile); if (!ret) { generate_datafilepath(datafile, path, sizeof(path)); - info("Deleted data file \"%s\".", path); + info("DBENGINE: deleted data file \"%s\".", path); } freez(journalfile); freez(datafile); @@ -363,8 +463,8 @@ static int scan_data_files(struct rrdengine_instance *ctx) continue; } + ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); datafile_list_insert(ctx, datafile); - ctx->disk_space += datafile->pos + journalfile->pos; } matched_files -= failed_to_load; freez(datafiles); @@ -373,42 +473,43 @@ static int scan_data_files(struct rrdengine_instance *ctx) } /* Creates a datafile and a journalfile pair */ -int create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno) +int create_new_datafile_pair(struct rrdengine_instance *ctx) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_creation_started, 1, __ATOMIC_RELAXED); + struct rrdengine_datafile *datafile; struct rrdengine_journalfile *journalfile; + unsigned fileno = ctx_last_fileno_get(ctx) + 1; int ret; char path[RRDENG_PATH_MAX]; - info("Creating new data and journal files in path %s", ctx->dbfiles_path); - datafile = mallocz(sizeof(*datafile)); - datafile_init(datafile, ctx, tier, fileno); + info("DBENGINE: creating new data and journal files in path %s", ctx->config.dbfiles_path); + datafile = datafile_alloc_and_init(ctx, 1, fileno); ret = create_data_file(datafile); - if (!ret) { - generate_datafilepath(datafile, path, sizeof(path)); - info("Created data file \"%s\".", path); - } else { + if(ret) goto error_after_datafile; - } - journalfile = mallocz(sizeof(*journalfile)); - datafile->journalfile = journalfile; - journalfile_init(journalfile, datafile); - ret = create_journal_file(journalfile, datafile); - if (!ret) { - generate_journalfilepath(datafile, path, sizeof(path)); - info("Created journal file \"%s\".", path); - } else { + generate_datafilepath(datafile, path, sizeof(path)); + info("DBENGINE: created data file \"%s\".", path); + + journalfile = journalfile_alloc_and_init(datafile); + ret = journalfile_create(journalfile, datafile); + if (ret) goto error_after_journalfile; - } + + journalfile_v1_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: created journal file \"%s\".", path); + + ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); datafile_list_insert(ctx, datafile); - ctx->disk_space += datafile->pos + journalfile->pos; + ctx_last_fileno_increment(ctx); return 0; error_after_journalfile: - destroy_data_file(datafile); + destroy_data_file_unsafe(datafile); freez(journalfile); + error_after_datafile: freez(datafile); return ret; @@ -421,40 +522,86 @@ int init_data_files(struct rrdengine_instance *ctx) { int ret; + fatal_assert(0 == uv_rwlock_init(&ctx->datafiles.rwlock)); ret = 
scan_data_files(ctx); if (ret < 0) { - error("Failed to scan path \"%s\".", ctx->dbfiles_path); + error("DBENGINE: failed to scan path \"%s\".", ctx->config.dbfiles_path); return ret; } else if (0 == ret) { - info("Data files not found, creating in path \"%s\".", ctx->dbfiles_path); - ret = create_new_datafile_pair(ctx, 1, 1); + info("DBENGINE: data files not found, creating in path \"%s\".", ctx->config.dbfiles_path); + ctx->atomic.last_fileno = 0; + ret = create_new_datafile_pair(ctx); if (ret) { - error("Failed to create data and journal files in path \"%s\".", ctx->dbfiles_path); + error("DBENGINE: failed to create data and journal files in path \"%s\".", ctx->config.dbfiles_path); return ret; } - ctx->last_fileno = 1; + } + else { + if (ctx->loading.create_new_datafile_pair) + create_new_datafile_pair(ctx); + + while(rrdeng_ctx_exceeded_disk_quota(ctx)) + datafile_delete(ctx, ctx->datafiles.first, false, false); } + pgc_reset_hot_max(open_cache); + ctx->loading.create_new_datafile_pair = false; return 0; } void finalize_data_files(struct rrdengine_instance *ctx) { - struct rrdengine_datafile *datafile, *next_datafile; - struct rrdengine_journalfile *journalfile; - struct extent_info *extent, *next_extent; + bool logged = false; + + logged = false; + while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED)) { + if(!logged) { + info("Waiting for inflight flush to finish on tier %d...", ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); + } - for (datafile = ctx->datafiles.first ; datafile != NULL ; datafile = next_datafile) { - journalfile = datafile->journalfile; - next_datafile = datafile->next; + do { + struct rrdengine_datafile *datafile = ctx->datafiles.first; + struct rrdengine_journalfile *journalfile = datafile->journalfile; - for (extent = datafile->extents.first ; extent != NULL ; extent = next_extent) { - next_extent = extent->next; - freez(extent); + logged = false; + size_t iterations = 100; + while(!datafile_acquire_for_deletion(datafile) && datafile != ctx->datafiles.first->prev && --iterations > 0) { + if(!logged) { + info("Waiting to acquire data file %u of tier %d to close it...", datafile->fileno, ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); } - close_journal_file(journalfile, datafile); + + logged = false; + bool available = false; + do { + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + netdata_spinlock_lock(&datafile->writers.spinlock); + available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? 
false : true; + + if(!available) { + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + if(!logged) { + info("Waiting for writers to data file %u of tier %d to finish...", datafile->fileno, ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); + } + } while(!available); + + journalfile_close(journalfile, datafile); close_data_file(datafile); + datafile_list_delete_unsafe(ctx, datafile); + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + freez(journalfile); freez(datafile); - } + + } while(ctx->datafiles.first); } diff --git a/database/engine/datafile.h b/database/engine/datafile.h index 1cf256aff..274add91e 100644 --- a/database/engine/datafile.h +++ b/database/engine/datafile.h @@ -13,27 +13,25 @@ struct rrdengine_instance; #define DATAFILE_PREFIX "datafile-" #define DATAFILE_EXTENSION ".ndf" -#define MAX_DATAFILE_SIZE (1073741824LU) -#define MIN_DATAFILE_SIZE (4194304LU) +#ifndef MAX_DATAFILE_SIZE +#define MAX_DATAFILE_SIZE (512LU * 1024LU * 1024LU) +#endif +#if MIN_DATAFILE_SIZE > MAX_DATAFILE_SIZE +#error MIN_DATAFILE_SIZE > MAX_DATAFILE_SIZE +#endif + +#define MIN_DATAFILE_SIZE (4LU * 1024LU * 1024LU) #define MAX_DATAFILES (65536) /* Supports up to 64TiB for now */ -#define TARGET_DATAFILES (20) +#define TARGET_DATAFILES (50) -#define DATAFILE_IDEAL_IO_SIZE (1048576U) +typedef enum __attribute__ ((__packed__)) { + DATAFILE_ACQUIRE_OPEN_CACHE = 0, + DATAFILE_ACQUIRE_PAGE_DETAILS, + DATAFILE_ACQUIRE_RETENTION, -struct extent_info { - uint64_t offset; - uint32_t size; - uint8_t number_of_pages; - struct rrdengine_datafile *datafile; - struct extent_info *next; - struct rrdeng_page_descr *pages[]; -}; - -struct rrdengine_df_extents { - /* the extent list is sorted based on disk offset */ - struct extent_info *first; - struct extent_info *last; -}; + // terminator + DATAFILE_ACQUIRE_MAX, +} DATAFILE_ACQUIRE_REASONS; /* only one event loop is supported for now */ struct rrdengine_datafile { @@ -41,26 +39,50 @@ struct rrdengine_datafile { unsigned fileno; uv_file file; uint64_t pos; + uv_rwlock_t extent_rwlock; struct rrdengine_instance *ctx; - struct rrdengine_df_extents extents; struct rrdengine_journalfile *journalfile; + struct rrdengine_datafile *prev; struct rrdengine_datafile *next; -}; -struct rrdengine_datafile_list { - struct rrdengine_datafile *first; /* oldest */ - struct rrdengine_datafile *last; /* newest */ + struct { + SPINLOCK spinlock; + bool populated; + } populate_mrg; + + struct { + SPINLOCK spinlock; + size_t running; + size_t flushed_to_open_running; + } writers; + + struct { + SPINLOCK spinlock; + unsigned lockers; + unsigned lockers_by_reason[DATAFILE_ACQUIRE_MAX]; + bool available; + time_t time_to_evict; + } users; + + struct { + SPINLOCK spinlock; + Pvoid_t pending_epdl_by_extent_offset_judyL; + } extent_queries; }; -void df_extent_insert(struct extent_info *extent); +void datafile_acquire_dup(struct rrdengine_datafile *df); +bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); +void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); +bool datafile_acquire_for_deletion(struct rrdengine_datafile *df); + void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); -void datafile_list_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); +void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct 
rrdengine_datafile *datafile); void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); int close_data_file(struct rrdengine_datafile *datafile); int unlink_data_file(struct rrdengine_datafile *datafile); -int destroy_data_file(struct rrdengine_datafile *datafile); +int destroy_data_file_unsafe(struct rrdengine_datafile *datafile); int create_data_file(struct rrdengine_datafile *datafile); -int create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno); +int create_new_datafile_pair(struct rrdengine_instance *ctx); int init_data_files(struct rrdengine_instance *ctx); void finalize_data_files(struct rrdengine_instance *ctx); diff --git a/database/engine/datafile.ksy b/database/engine/datafile.ksy new file mode 100644 index 000000000..28d4b3935 --- /dev/null +++ b/database/engine/datafile.ksy @@ -0,0 +1,74 @@ +meta: + id: netdata_datafile + endian: le + +seq: + - id: hdr + type: header + size: 4096 + - id: extents + type: extent + repeat: eos + +types: + header: + seq: + - id: magic + contents: "netdata-data-file" + - id: reserved + size: 15 + - id: version + contents: "1.0" + - id: reserved1 + size: 13 + - id: tier + type: u1 + extent_page_descr: + seq: + - id: type + type: u1 + enum: page_type + - id: uuid + size: 16 + - id: page_len + type: u4 + - id: start_time_ut + type: u8 + - id: end_time_ut + type: u8 + enums: + page_type: + 0: metrics + 1: tier + extent_header: + seq: + - id: payload_length + type: u4 + - id: compression_algorithm + type: u1 + enum: compression_algos + - id: number_of_pages + type: u1 + - id: page_descriptors + type: extent_page_descr + repeat: expr + repeat-expr: number_of_pages + enums: + compression_algos: + 0: rrd_no_compression + 1: rrd_lz4 + extent_trailer: + seq: + - id: crc32_checksum + type: u4 + extent: + seq: + - id: header + type: extent_header + - id: payload + size: header.payload_length + - id: trailer + type: extent_trailer + - id: padding + size: (((_io.pos + 4095) / 4096) * 4096) - _io.pos + # the extent size is made to always be a multiple of 4096 diff --git a/database/engine/dbengine-diagram.xml b/database/engine/dbengine-diagram.xml new file mode 100644 index 000000000..793e8a355 --- /dev/null +++ b/database/engine/dbengine-diagram.xml @@ -0,0 +1 @@ +<mxfile host="app.diagrams.net" modified="2023-01-16T23:29:24.274Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36" etag="IzytOgui5y4srcr9Zrcm" version="20.5.1" type="device"><diagram name="Page-1" 
id="90a13364-a465-7bf4-72fc-28e22215d7a0">7V1rc5tI1v41rsp8MEVz52PsOJvsxkkmydTu7Je3EGpZTJDQIOTY++vf7oYG+oJoIUDIlqcmthA00H3O0+d+rszb1dM/0mCzvE/mML4y9PnTlfnuyjBM3zc1G/2Bjz3nx4DuG/mRhzSaF8eqA9+j/0F6YnF0F83hljkxS5I4izbswTBZr2GYMceCNE1+sactkpi96yZ4gMKB72EQ06P0DfDxf0fzbFkcB45fffEBRg/L4uae4eRfzILw50Oa7NbFHdfJGubfrAI6TPGW22UwT37VDpl3V+ZtmiRZ/tfq6RbGeG7pnNHrsmf6oFfmzTJbxegDQH+Sr983XAxULkbvlcJ1Vr9d03jL53+l9tvPH39EN3fr2R/2l88f/rq2i1l4DOIdvQt/WzhHc118JLNj3pD5gnhgHX3a/oRZuCw+JGm2TB6SdRB/SpJN8ax/wSx7Lsgm2GUJ+yblvJLBsjT5Wa4hfudFss6Ka4FHzgjS7C2mmuqByLH3URwXo8D1nJ4RxsF2G4X5weIUQD5l6fN/8AdNLz//ia/XHMOlB949lSPiT8/Fp3mwXZL3J1c+Rdl/yIV28elPehH6uxoBf3hmXvM2iZOUzLJ5S37ojb7CNFrBDKbF6fmi4JXg2KBafUovyS4N4b4lL1YYzdgDzPacaLkljSPsgAl6nvQZXZjCOMiiR/ZJgoKDH8rzyku/JhF6RkMvAAcYfsEXBdr4ns6Okb9CcVmdovmRfEfX9OoHUUd9XGB5Njtw/srCwOiP2gtWhwgDHcJMYGrMxJDpfs5yhuAshq1aWKrkojoPlRwl5yL8fl+DDPHJmtzO0MFe3urGRu/v9J/Jp//+988/7ufO82fvK7iPrm3/xGwE3Drt65bNEL/j6kq0L2FPljstW407+2Ii07sw0R4m0ly7Ex8Zk2UkxxyGP/Q2Qj6AI9iRbMPV7HGZwheY4n2EJFZD/4rk2u2V4cRo1m9mKcMnzt87LJoSCr3eEhJ9i1fW2jyRJaTfo78eit/BCjHIzXq2xb+iNTr7PiC/boNwCaWM+CmYIYWCYZ8gjh4wHYWIGLAIc/MI0yxC4vrb4otVNJ/HhG0heq5gRsbDdLjBM0jm1L65st9x7GURRs8QWSRrRjDC48MnhlIKVaMYuxLG6yS6B4BEeiyGv9Y1RBAWQxDXxT2PJNlrC7DDOhyVJYvFFh5LYFImRIrS1GB3fEWgxFIVcUWqAdC/VWSXNnDtgqVUqa0LJdITPUOK4ANIKR6LnUDvKu17jqHpnut6hqubvuca7LjAsjQX3c12TNOxHGANJfrLJ15B9Gd55dcyyuD3TUDW81cabFhW4GGPIbdGAhFwsBHGDINfl+Lzr8p64tFjy5rhxOS30ToRMNN6MAYZh85hF4hhUEUEnTqCKWDHEayqyKnUxNYfpx63SKawSB+SrGdBZAZD9LjYELjDT3R6oUNc5v0UrCqONLInEjIs3+1HqgCssGJw/DucUCHj54I+pORRmWKvw3xbxFQSraMsCmIpoUhIDz3prCbQluTIHplHj+0nRfwBTBZHEXbxpHq4SzEpxc/SmzNC+AxG6wfMCgh0EPC1nv8rQsiG6Q2dHmb4Cn0eZEFtevK3YN8MHRbeFh1jpoljw5YNrIRVfjfDkht+mZrks1hAJwxlMtHc9Wd6Jecft+XZrHgNLHHLA65kyzN4kaU3NDVkLDIhqVsmiQy6K5qKu2Lv4utRyyjuiTWgy9nnNufHJD0QqMQjR4NQ8SglMOiLNFmhX6sddtjFsPoiX65tI3YMiAg2/k+GCA756QcRWH9BuTPW8YCaVet4APgttD/panLGzxPjAZV+28Vkd1KIAESD3bsozZ5lgnLOTdvgkWzWaDUQ/0Xbn+ck91Ky7UHuNSzT6Unu5dh7PLnXnTYX87Y0RS6Wu7npN3Ij1wicT21eU+F8fXCtR4okZ6H3ZMsAD4TIoVTu+YcIiFsjP+NXgCYBaz9tSg8BzRkZNN4RQxkDpJNWfRZGg+rjzBy7J0HHNjlr3wRUn4mLOi0gWTxOhZClh6B74MMBuMq4PirArns/jhS83DOFX9ne2y/83sYwWA8Iv9sNGf4oPy6L4xz8Ikra4t/JmkNIOQqH2PWrYHkifuIVXCXpcwu25m94Ksid29CbWzLI9YyZ2ZtuCaYHuaJSIszzpJ1UtnNyJ5UpE+/2zuE5O6lMQ3EXMO1J7QKm6I7FJi0EBFDEsdzbJFHJz8z7VJLmlL1P5mhaOKXdUdxPx4sB9MA9zALOYdPJTDyMEoe+rx5QL2xW/CPkmlfBb1juINJEm/jwFwIjBJL0mkdEarfiVaVKqC/QCmGRBtZupqgphnGyhVRFJGpjhAD7SfYuRKipPRp+MuNq2hrlKZxpjiR+ZGTxxpS5YSakUY7vTLNVd+9p6XCmJSwk1WCU+LTg0fPZuCnhXszndbIdyYLKbGm9K/M87o9z6Qibf+tGWzf17ol3YbdfXcLHR++1jRPX/zTtXc0pygqnsD5PQVaQhbsPZyacnnowltlxL2Zw9shG3t/3+H9sK29PCue7EAqD6NswWNcttReb5HQY8czcQIPknWhlTl/nzJNDsma7axTK+SXU+DJ0fomts2mvNhgs5XvvhOwL/TtYuPAaAPZdbnV5T1zcaYtgIZVA9mwV34NHjJxcbEGOq++k4UhxHG22mPZbcJChV15F4nAQkh8ZDpqO6ZvzfnDQsVkctCS+mbK2CRP5N5hfwVLwK0wJBxWNF1182zVc1Flg1AynBRv3FM8Y1oLij4R3XO0Mr2s2nWObmuci+PQt17A9n1X2fcPUTMt0fd20HUvXjVFB1RK9N+fECy89NKR/a6GcRH1g42oHvZC7Z/ma7zs4dNuwbM81OXIHmu84vg8sAHTdB+OSu9EuQ/yTVWg+YntJVxWQbu01Rw7RvIhHBwZpuCTmRhoaos+CbWGaQTfVmu9wtPGEe8nSe0RMSKtgsykeg8gkNNBEf4M4NSXf5Id+0xolIw5DOsc6yBS4AHoLqSXFCT04W/D6YA9SjJjCK0lgkIZHAGcwdU4WYjKUPCxjit7l4oLVmkzFnNeTe6Qov3gQybk5JGQSMnUJ1SeUqSdnW3gJpYT2Ch/yOkLdZQ3LUhTA6YjDF7TgywrxIsnQ0sKlFBBFQGkpoFZvq9VbstLLLAU0vSqG4wPlgRbZEfDN6j3AsgHfTKPBRzASvtHypPskyN93kLzrMsD6wUOwUQzDf7PNy4qQyLGAPEuyuCqCyPiTU/j3LsoVi02hMRFNhD9vTYIAF6SCcK5J8WesctwkcfS/TQE4FZHSbhICuyCl4Zis0k1z+I9FSoer+uqMBZSUdyWUqqzaOI3hFtstEZ6a9fnSI8zsyoeYBNT1Y9Cuq3TK7sfn1fUR8iNVonvM+hfqOUrN/5ZEVRnMC2o1hzz1RUqGAil92cALKR1FSs7JSQkAGSxNWH6z+pPf1PKP2oUyVS+3MZLXR1A6BTv4wEIZACJAfU02O/RWYriNIP5UdmZiI0tylAmnokYqS
kMVY/UjDllcfG0/0hAnZNFSFCNIQ82REH1tYabCFsYZYL+Hwfqyk3XZybyT72T2xPM5zskQMbzZAPga66t1+YwB5bYe4lglMY5lhGhW7QQjxDaLyIATNUbohfUB34Y/u74Tn5VhoreEmSENE1yQAR+ZOKAFV6ZNnhQ4p+X36tZBo0sDjSN0ENXIs9F0EN1lBQRbsTGMGIvjOppT+2Gh3nKF5hpDg70oubIeNanTv2/nGmt7OT0Ud3OuldAzaeca4JxrVBUbAZonVxpwWtBsdMHm/fGQvSOzqnWIxvUNbx3i0nr8rpK3zyXq+cM10JNzhxiwcwIk/ufEssg7IjFFmkkjMdcdpkTmEZC4OVLx6MBEQVcsMncKlfEoc9Rg0bdf0+QxmsNSGSUTsy20UolNN0WwjhiSVGmhJcSziKi0GUZZYuvNDGkVl/J0HK6p/8jHeJPuMPsVDQw2aZSkUZ6L9Nu4xjm8zx1ej7zJbNe40xxgnOO7KBmSxM3SEDdOqXLHmJokc6aJm5pum4yi6ZK8+z35UehDjylOkj6mJxVnLF6a4YPVlaUZQ9PR3LqG7fiuoXOpJEA3LM3WfcfyfdvRDZqxOpKo4x1oo0HrisDwG5naZH1XffPCOOrqIEtNd7J3VDuN+GMF3nm6zxCoM7aPVxcDP+8jtFyyAtyFPH564VhRGvb6tEvogGu2fKTBuOw/MpbsC/Qzc631GCMyAmSUvRQnUk8N6KI7KwipQ4ln7c2ZsXZFzL14gwxa0oHao4/jbSrUcOrzaEGqjozT+1dzJaOI51OP6bdkl1V5eocm5NEjyvWKWqJKrNptZUWAhujAtYThz1LRpg7dQvFd0T2XduEKHoJoTQok1QPJr4qyqvochrnajtTdEA8AnzKIWcvQ17CqjUTqrMVJMCeHCgW8yPAlJ6DL0oC0BdvkqY4KbQMv4TP027I0+Z5S5WXaLVNaiQ9t6E9DP7NA0EHiZ1h9AuitBUQknbwVypX0p48rZwRRZB9cMbH4KvxdHb/CSI47rorjiEo33ZMijJe44Dvts3o28k/J5pMOhnHYYa3R5B9ZWOrrBkFVF2sdAocrKacc6DIe3nHGdqcz3jlcTK0LbM3iSH9oyBMDDl4A5FGmnjLkAYcbdTTzjjO5uhenCJzWdY/FPd/wuyDfqMKfamzJaGBo82kBfAMXdTDkc5j4kYZGQtHh/wKQsM9iFoMJfzZrLh8xyoMqGKNEeahmYrfZSNBKZhyYMraLAh7rBpDikDpFyiwv3AbQXF+qlVZ7qGUmMZ/YEvMJDyK9WU9cWRmUoShHNfH6QjmdegSOTDlGu/g1rTaLZXovnUPqUqmHF0nncLDabe7eQoK9Nmi4o/by3pu/9N9SoXzWA3q5zna1RixFL1dd0qyVa6YUEBlitUGyS17/c5GkK/FctvFC4VBIYTC/NFzAnMWJPrYlctbIHRe8MUWinF4vW9twuDzu3ubJ9jZ5sPP7vHxvNydvUW21GIqA2zspuL0pnKRMq8laQYqqnaSY4Hrm9X8B31dpCgWAfRFdwjrrVytHfdH8cVhAxoQNVvm+MaCRvvxGJZK4kZRazU2eakQTFWlbzU0FaeoaMB3O/knjh480Jhi672l+9WMwd/FpEGWLUUkIvrsezSThNUfkKMuLh++/nzC3pYfBcWtADB60kBS3GxhGC5ISQhm4IeljPlwzmYmVxAYesLhCFj4QERrt6xKEHiwsxDsz70CPYMsQz4F5of1h5NEACBgA5GKRhbDxBgN9X3Dmiw5GGt4WNZYzueLyLilk5JZ4qqSejRXe680Kr2uWK42UODrlnUsgG60YCd2g92kMn779Id+HGncJhB8bSDg92c3bdw11tY3bKJy55zrWlRBVuCA/0g3kLdLxe5Lwfd4qLtk9ZFl/w0n3zbLLZTEPXEzbPfViKhS5uixmk9eBK1tjn5w1RbGOjegmPm4igvO7cWkUfl2qdyMZtOrNvmrMmq/aKq1WakG3OcWZKtJdRYAR9vjm8tWNKq3cX5M+zN4YuBwPLghg4F4n5A9b/40smN69AgOc7zYxEgtJwdq7BhX1G/x7B7fZgWbLPXpyXy/e9H79Ti9/l3uYEjPsahdn0Yb0hNsgHsqzU1g1/8XOATZT5i+ccsQhvDIHm3QnxBbq71mSYpN2u5k5eoJzCnZ1aARt+2NrmZjD7Reux6Uky8pOyPwPw9mXAe1FemQu4wvcz2qxj75fhjoW0duGddUS+9hvJKOvGsmonizLqa7HRja6nEpsKbYOH3NjBUCkdjydNUf+65LZGnIaWmj7GLtff5TuqRI6a0EqyNPsxxBk8MXiDif7sigRM85oJRR9MaPnrlR0GL9xgwNCYJ3GovLlkVqvg0IKqJdilobEsLm6TaJBEu5WhOLa5IJZzryfZtJgl6EjEYDHhYGbEk9zGXXASAL+YArvAa1m962teOQDJLFSxBb9jNj/SsiFbjaENIUjlRCEV0jdZE0JZfX0gEhiqQUYH7eG9tdutfm4LtDyZo5XBeAMrTleMBPHJ5AzP8AA39O222hkEUebD6UIKk+sRk8wjyBjbJmR/64kxpZek6strvSwIaE+aRRVCXYDyKEdwqgU9NSDXcHN1Q1q2KRc2uD426ZJCLfbJD383mqBl0cXX3iTV10QK+q3OanyCMpK2SiLHi4hh/fN9QovZRP2l00wTQlny8omDKlhih7N2v5fltCgNlWy28u3+YnFLI4iKTg+57OS2QxkfWQGi1kE4MCi25U29GLr7HFlt3WvrS4GW7tSZiuoClrSVPH29MkRtK7ROuA4fPHs7u1vDLQgrusZrm76Ho0rKduIWJqL5HHbMU3HcoDJlZIavBOcLHwo38SWgO5fn2FWxOViTWwWbInRff0Qresx4NX59GAt+P/Hksjf5B880BIhBYZf+AjxQ87hFgNhvWcObiyX5ibQ4vC7m7vP//j4+U6r3VSSuKGUy7HnUokwgpH/ugDxt+SxF3IB8G3e+Ic++Ta3RkfrMN4VZue/iyTVEH2bpbswf0F8FkKkjJxzW+xL+QVMnB0pzhzvtsvyROLS1eFjFBY3zMOryQ5W3B5B2xXNGAm2z+twmSbrZLdVmcdjJmP02acJwEWx6QhNAL5glJfcxfyRWEiTKeXhDx//8UEqJ5N7IZ5In7N8iWli0CxOwp/bUnVF5BPHsCAe/Q2uvI2/xC4SLH7kl+bB8ynMHWX5qUVVMrLZ/B8iQBis8MnmO0SLjKArPv2e9/n85dv920/Nb7TbksrgefpWbmzBfBAxKs1Bd/z05d/Nt2NfOZ+GJQxi/Ordbndz9/0Hxrz37798+9F84xXiOgKKiF6CdF2rtSq9ITooks3JmbE3qRfpiWFOXHb16QeRqK6NphyNBBH/IiZizxIJxXB9hABMxe40F0t7EYh9LlJEouFYshwNczDbhSMzXTRYKHEpwhhSQ1OL4klsTlSM1TXLQIqiZRi24egWcEi0DjmlmHJdMy3TRoIScDwTmBbwuSXptxA/q0MzRFHznfp9OU8N29O4sp9AtzRahqKu4BqmBkyRAvpQcZfP
/0rtt58//ohu7tazP+wvnz/8dS2Ga3z/8RbDlP7h7tvdccYHPtLeBpi6DwjTujF0vbyCqmSyFg0NKyJZtz2L5OByzswiyWyLhoQ/+wjQl66OKGC/1tXxNN+sRbr73rQWSuaAuMCoAKNH0YCvmbZe+2F5FQCgUUtAnQxMXwP+iIAKxB31A9FXOXJoW3icRYWflUQBSeSTpiR2iWvGd+e668pWmEtwPxJAga9RTxwVcGyRMWVBQn3UjpCvhli65XOSrnA+8MDrQZOAJevhwsCBupTjaIJwPxvaBNfDENbjE8KMgRcDbWBG2LAYi8XcdxzZYsydmWP3xRwmF/J/+pUQ8zdu4FYSAnG3WCSpGFXU9xJ5IWxaIsQuM/kSzTzbsnvaXoDpTm2JxKyMr2UrNH6Zft/BnSTAcRp+J7l36bjlApYSvMliAYZbscm12z5Fuc2DOqFVviLDrDc/A7hj11W35mecg6ebW4l2hay7lfZIfP17kfgtw+K1F+XwPGHzUewK28FTJJ+hybU6PiO2sADLFUSnOwOuGMi3aiItw+6LL4SxRucMBcPBS+WMKkjA81kS162DSbwrn52ENYyBWMNuEGDPbsMwRCPKq2GLozcM13E5djqTHcMchi0s29Ysr8l0bPGWwPNhEllZ4lfHJMDjavsXaoMSq9TD0zq3djqUS/aajdt7L1sDcYm1l0sUK/RLxuWsKqWDeywuES2Or5BLTN99EVyiWoFrMC5x9nJJZ4HL4kKlrbH3kjPrAjwMlzi54nG4yDUhDjFOzSH7pS3FNn/trbgErX9oDrm00ESE7gHz7DlEtdLtifaQzvqIsIco9mfqjUNE58criXzwKDjQBHhJBbBRPYeGmBj/asIePHdqiyF6P15HzINvWHxE5ekXQwypfOVhD747wVUS3SKXyIeqXJHdjnCjhj2YF3P9MbIx0vaPk44HtuDnNR37F4Q9y9N8p0kQ9vikF2VBGI/rsX5dl9c7B5aFzYtt/hiOsJigB1zEzn09XOEPxBX+6bniYos/hiucV8wVPt4r3P65Ao/rOafliovt/Riu8EiNs3q4g/l6uMIfiCv803PFxd5+tjrFMEKRazYRun+EUORr1D5HuYY3wA9N6JecgQuhM/KIa/VP6L6FCN07LaGLTolXSOjANhhC14DZ1uJzIsSuUtd6oBBObNEZBP6t08P/JTcGw7/BlSEExoUrVKT/QfYK/+R7hXWg/RStSZQ9fyMTmqynWauzYoM6rwCWVwpuqhhFZxiFZZK2EM6TiTGGaVn1xqQ+l2XVNcalqvXbNNLQdClaMN/HEmd+S4NSWlSs9/6kym1HWSrYw4ONbshrXC6J6zhbzM7xHUqBbNheG0rIX/nMTHFWfxij3BjlFHAisH3XiroImGytVkPItvlx0bcA+9iRxmS7SijVG/GJFq/bZL3dxfKWPmcPLw1SUTH8cO2PLUc27AjgcmaGnqHAZTowopplcSiMoHEZGKEhx2MBiWhoqboWvEQkafCuvFwkkdkMuKrD9+geUYgLO3+DD9GWvMkhzVWUKqEr94hp6sBCByrLG98HaI7R/6TWOx47Wl+v4CohCxohIHyqvYasKLKswcyesshDv88V1zmGFKPP286s6Aod/D7RQc+oy5uf5gW146bXYJ56gS4y9F/LKFy2PO6p5jmFmGpIdfCiynmBRGXx+scgigm2cO82xfkn/YPyTgbF2yyiGLaRitrcj048eQ31/G3y1gLaRIlob53z8wCbEZ//oNYML+vVL88/9vMf1EKv/9f7iOW+TZo8RnPS6StaL3A+Fe0SMUt2GdsupPclk/cI6aOrQ/+zlWN9Fq0gUhTRVsC25onnJOGG2ae5DbrDLMpbcpzh/OT/YmWtZZZur4r+JC99uiIixpUzstvM8/57C9KAdx0+t8kTynNwpdJUhdOg++zASHP2mvPy9mXzNWrW6rlF11znsrIpY71lH61yytRT500pvSUXWbIuYxM2mwl+v14c5O5BxcDYFn0dkvBVbXQKsa/FarWW/8qNGr0b/a4NYNVNdbrPEXjXnudcG+yyPsVYnsjJVY5kiO6M6uWpc8VxbjEFXpGUphiVVwQDOTC0jvzBFy9W9Nj1xR626Kg/WEDxWgSUGT3wjvRX1H9Hp5Wt4StFiJo0cRu7PUrSTFFwqS7+hnusUBsRYyCqzgmw6LSGuBF3QEikJjFWZ6UwDOKQ9jVOYYbjabBqwwNKHEebLebUFqmHwQLBqcRspkwTNq4nEfmRyUOmY/rmvB+ZBwCuRB3QJY3ZDEMi9YDBGgrZl0iEPZEIJehzUYPKNe1qkF9tAIOLQp6qLJSjV/8Rg4Zpc5SuWI1RUm2I+DibPKB4YNYFqrSL9Ib+F1e7CvccGE5IuQechnusk3MPt0/4nft6A5MJHwDe/nGH5pZmdzIm0qOMObk1a71bzRrEH9E7WrQnJr680keKowMQC8X4z1WA/WNtYxFXE352/JD7BK/8HRusO5XEQ/v3hXGym7dLQC2mnrmn6655dUC7xbe2rlsVSx3Z68jTfLaIjCkpIlOqA4zMM1gZGbu5n/zRdEhF5EaCqOhL7h6uzkw2sOySfqEvKX2ZFouUsgpFcuIaTqCW2Ut6Iq6iRXnZllxPg/VDM42JX6C1Twu8K0JdLlQlK1XG7b/uyalK2j+7J6paBU91ikBEJhJEHpgzn8H1A+kdj31CkcQMwKOa8VTuoyWQLRNJGbwr48Y07ve7d09Cio7lOp57CCmawDJtaxiAk1GirAQb4KsoKFAi+pgmmDIqmQ9N1vI+meOozbv/Bw==</diagram></mxfile>
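The Kaitai Struct spec above (datafile.ksy) documents the on-disk layout of .ndf data files: a 4096-byte superblock followed by 4096-aligned extents, each carrying a small header, per-page descriptors, a payload (optionally LZ4-compressed) and a CRC32 trailer. As a minimal sketch under those assumptions, the same layout could be expressed with the packed C structures below; the struct and field names are hypothetical, chosen only to mirror the .ksy fields, and are not the definitions rrdengine itself uses.

// Illustrative little-endian view of a .ndf file, mirroring datafile.ksy.
// All names below are hypothetical; netdata's own structs live in rrdengine.h.
#include <stdint.h>

#pragma pack(push, 1)
struct ndf_superblock {              // first 4096 bytes of the file
    char     magic[17];              // "netdata-data-file"
    uint8_t  reserved[15];
    char     version[3];             // "1.0"
    uint8_t  reserved1[13];
    uint8_t  tier;                   // always 1 for now
    // remainder of the 4096-byte block is padding
};

struct ndf_page_descr {              // one per page packed into an extent
    uint8_t  type;                   // 0 = metrics, 1 = tier
    uint8_t  uuid[16];               // metric UUID
    uint32_t page_len;               // page length in bytes
    uint64_t start_time_ut;          // first point, in microseconds
    uint64_t end_time_ut;            // last point, in microseconds
};

struct ndf_extent_header {
    uint32_t payload_length;         // bytes of payload following the descriptors
    uint8_t  compression_algorithm;  // 0 = uncompressed, 1 = LZ4
    uint8_t  number_of_pages;        // descriptors that follow
    struct ndf_page_descr descr[];   // number_of_pages entries
};
// After the payload comes a uint32_t CRC32 trailer, then zero padding so the
// next extent starts on a 4096-byte boundary.
#pragma pack(pop)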
\ No newline at end of file diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index 500dd7880..de2b909c0 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -1,132 +1,424 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -static void flush_transaction_buffer_cb(uv_fs_t* req) + +// DBENGINE2: Helper + +static void update_metric_retention_and_granularity_by_uuid( + struct rrdengine_instance *ctx, uuid_t *uuid, + time_t first_time_s, time_t last_time_s, + time_t update_every_s, time_t now_s) +{ + if(unlikely(last_time_s > now_s)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), " + "fixing last time to now", + first_time_s, last_time_s, now_s); + last_time_s = now_s; + } + + if (unlikely(first_time_s > last_time_s)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), " + "fixing first time to last time", + first_time_s, last_time_s, now_s); + + first_time_s = last_time_s; + } + + if (unlikely(first_time_s == 0 || last_time_s == 0)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), " + "using them as-is", + first_time_s, last_time_s, now_s); + } + + bool added = false; + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx); + if (!metric) { + MRG_ENTRY entry = { + .section = (Word_t) ctx, + .first_time_s = first_time_s, + .last_time_s = last_time_s, + .latest_update_every_s = update_every_s + }; + uuid_copy(entry.uuid, *uuid); + metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); + } + + if (likely(!added)) + mrg_metric_expand_retention(main_mrg, metric, first_time_s, last_time_s, update_every_s); + + mrg_metric_release(main_mrg, metric); +} + +static void after_extent_write_journalfile_v1_io(uv_fs_t* req) { - struct generic_io_descriptor *io_descr = req->data; - struct rrdengine_worker_config* wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; + worker_is_busy(RRDENG_FLUSH_TRANSACTION_BUFFER_CB); + + WAL *wal = req->data; + struct generic_io_descriptor *io_descr = &wal->io_descr; + struct rrdengine_instance *ctx = io_descr->ctx; debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); if (req->result < 0) { - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); + ctx_io_error(ctx); + error("DBENGINE: %s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); } else { debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); } uv_fs_req_cleanup(req); - posix_memfree(io_descr->buf); - freez(io_descr); + wal_release(wal); + + __atomic_sub_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED); + + worker_is_idle(); } /* Careful to always call this before creating a new journal file */ -void wal_flush_transaction_buffer(struct rrdengine_worker_config* wc) +void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, WAL *wal, uv_loop_t *loop) { - struct rrdengine_instance *ctx = wc->ctx; int ret; struct generic_io_descriptor *io_descr; - unsigned pos, size; - struct rrdengine_journalfile *journalfile; + struct rrdengine_journalfile *journalfile = datafile->journalfile; - if (unlikely(NULL == ctx->commit_log.buf || 0 == ctx->commit_log.buf_pos)) { - return; - } - /* care 
with outstanding transactions when switching journal files */ - journalfile = ctx->datafiles.last->journalfile; - - io_descr = mallocz(sizeof(*io_descr)); - pos = ctx->commit_log.buf_pos; - size = ctx->commit_log.buf_size; - if (pos < size) { + io_descr = &wal->io_descr; + io_descr->ctx = ctx; + if (wal->size < wal->buf_size) { /* simulate an empty transaction to skip the rest of the block */ - *(uint8_t *) (ctx->commit_log.buf + pos) = STORE_PADDING; + *(uint8_t *) (wal->buf + wal->size) = STORE_PADDING; } - io_descr->buf = ctx->commit_log.buf; - io_descr->bytes = size; - io_descr->pos = journalfile->pos; - io_descr->req.data = io_descr; + io_descr->buf = wal->buf; + io_descr->bytes = wal->buf_size; + + netdata_spinlock_lock(&journalfile->unsafe.spinlock); + io_descr->pos = journalfile->unsafe.pos; + journalfile->unsafe.pos += wal->buf_size; + netdata_spinlock_unlock(&journalfile->unsafe.spinlock); + + io_descr->req.data = wal; + io_descr->data = journalfile; io_descr->completion = NULL; - io_descr->iov = uv_buf_init((void *)io_descr->buf, size); - ret = uv_fs_write(wc->loop, &io_descr->req, journalfile->file, &io_descr->iov, 1, - journalfile->pos, flush_transaction_buffer_cb); + io_descr->iov = uv_buf_init((void *)io_descr->buf, wal->buf_size); + ret = uv_fs_write(loop, &io_descr->req, journalfile->file, &io_descr->iov, 1, + (int64_t)io_descr->pos, after_extent_write_journalfile_v1_io); fatal_assert(-1 != ret); - journalfile->pos += RRDENG_BLOCK_SIZE; - ctx->disk_space += RRDENG_BLOCK_SIZE; - ctx->commit_log.buf = NULL; - ctx->stats.io_write_bytes += RRDENG_BLOCK_SIZE; - ++ctx->stats.io_write_requests; + + ctx_current_disk_space_increase(ctx, wal->buf_size); + ctx_io_write_op_bytes(ctx, wal->buf_size); } -void * wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned size) +void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { - struct rrdengine_instance *ctx = wc->ctx; - int ret; - unsigned buf_pos = 0, buf_size; - - fatal_assert(size); - if (ctx->commit_log.buf) { - unsigned remaining; - - buf_pos = ctx->commit_log.buf_pos; - buf_size = ctx->commit_log.buf_size; - remaining = buf_size - buf_pos; - if (size > remaining) { - /* we need a new buffer */ - wal_flush_transaction_buffer(wc); + (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION_V2, + datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); +} + +void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen) +{ + (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, + datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); +} + +static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengine_journalfile *journalfile, size_t *data_size) { + struct journal_v2_header *j2_header = NULL; + + netdata_spinlock_lock(&journalfile->mmap.spinlock); + + if(!journalfile->mmap.data) { + journalfile->mmap.data = mmap(NULL, journalfile->mmap.size, PROT_READ, MAP_SHARED, journalfile->mmap.fd, 0); + if (journalfile->mmap.data == MAP_FAILED) { + internal_fatal(true, "DBENGINE: failed to re-mmap() journal file v2"); + close(journalfile->mmap.fd); + journalfile->mmap.fd = -1; + journalfile->mmap.data = NULL; + journalfile->mmap.size = 0; + + netdata_spinlock_lock(&journalfile->v2.spinlock); + journalfile->v2.flags &= ~(JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED); + 
netdata_spinlock_unlock(&journalfile->v2.spinlock); + + ctx_fs_error(journalfile->datafile->ctx); + } + else { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_mapped, 1, __ATOMIC_RELAXED); + + madvise_dontfork(journalfile->mmap.data, journalfile->mmap.size); + madvise_dontdump(journalfile->mmap.data, journalfile->mmap.size); + madvise_random(journalfile->mmap.data, journalfile->mmap.size); + madvise_dontneed(journalfile->mmap.data, journalfile->mmap.size); + + netdata_spinlock_lock(&journalfile->v2.spinlock); + journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED; + netdata_spinlock_unlock(&journalfile->v2.spinlock); + } + } + + if(journalfile->mmap.data) { + j2_header = journalfile->mmap.data; + + if (data_size) + *data_size = journalfile->mmap.size; + } + + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + + return j2_header; +} + +static bool journalfile_v2_mounted_data_unmount(struct rrdengine_journalfile *journalfile, bool have_locks, bool wait) { + bool unmounted = false; + + if(!have_locks) { + if(!wait) { + if (!netdata_spinlock_trylock(&journalfile->mmap.spinlock)) + return false; } + else + netdata_spinlock_lock(&journalfile->mmap.spinlock); + + if(!wait) { + if(!netdata_spinlock_trylock(&journalfile->v2.spinlock)) { + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + return false; + } + } + else + netdata_spinlock_lock(&journalfile->v2.spinlock); } - if (NULL == ctx->commit_log.buf) { - buf_size = ALIGN_BYTES_CEILING(size); - ret = posix_memalign((void *)&ctx->commit_log.buf, RRDFILE_ALIGNMENT, buf_size); - if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + + if(!journalfile->v2.refcount) { + if(journalfile->mmap.data) { + if (munmap(journalfile->mmap.data, journalfile->mmap.size)) { + char path[RRDENG_PATH_MAX]; + journalfile_v2_generate_path(journalfile->datafile, path, sizeof(path)); + error("DBENGINE: failed to unmap index file '%s'", path); + internal_fatal(true, "DBENGINE: failed to unmap file '%s'", path); + ctx_fs_error(journalfile->datafile->ctx); + } + else { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_unmapped, 1, __ATOMIC_RELAXED); + journalfile->mmap.data = NULL; + journalfile->v2.flags &= ~JOURNALFILE_FLAG_IS_MOUNTED; + } } - memset(ctx->commit_log.buf, 0, buf_size); - buf_pos = ctx->commit_log.buf_pos = 0; - ctx->commit_log.buf_size = buf_size; + + unmounted = true; } - ctx->commit_log.buf_pos += size; - return ctx->commit_log.buf + buf_pos; + if(!have_locks) { + netdata_spinlock_unlock(&journalfile->v2.spinlock); + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + } + + return unmounted; } -void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) -{ - (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, - datafile->ctx->dbfiles_path, datafile->tier, datafile->fileno); +void journalfile_v2_data_unmount_cleanup(time_t now_s) { + // DO NOT WAIT ON ANY LOCK!!! 
+ + for(size_t tier = 0; tier < (size_t)storage_tiers ;tier++) { + struct rrdengine_instance *ctx = multidb_ctx[tier]; + if(!ctx) continue; + + struct rrdengine_datafile *datafile; + if(uv_rwlock_tryrdlock(&ctx->datafiles.rwlock) != 0) + continue; + + for (datafile = ctx->datafiles.first; datafile; datafile = datafile->next) { + struct rrdengine_journalfile *journalfile = datafile->journalfile; + + if(!netdata_spinlock_trylock(&journalfile->v2.spinlock)) + continue; + + bool unmount = false; + if (!journalfile->v2.refcount && (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED)) { + // this journal has no references and it is mounted + + if (!journalfile->v2.not_needed_since_s) + journalfile->v2.not_needed_since_s = now_s; + + else if (now_s - journalfile->v2.not_needed_since_s >= 120) + // 2 minutes have passed since last use + unmount = true; + } + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + if (unmount) + journalfile_v2_mounted_data_unmount(journalfile, false, false); + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + } +} + +struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s) { + netdata_spinlock_lock(&journalfile->v2.spinlock); + + bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE); + bool is_mounted = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED); + bool do_we_need_it = false; + + if(has_data) { + if (!wanted_first_time_s || !wanted_last_time_s || + is_page_in_time_range(journalfile->v2.first_time_s, journalfile->v2.last_time_s, + wanted_first_time_s, wanted_last_time_s) == PAGE_IS_IN_RANGE) { + + journalfile->v2.refcount++; + + do_we_need_it = true; + + if (!wanted_first_time_s && !wanted_last_time_s && !is_mounted) + journalfile->v2.flags |= JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION; + else + journalfile->v2.flags &= ~JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION; + + } + } + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + if(do_we_need_it) + return journalfile_v2_mounted_data_get(journalfile, data_size); + + return NULL; +} + +void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile) { + netdata_spinlock_lock(&journalfile->v2.spinlock); + + internal_fatal(!journalfile->mmap.data, "trying to release a journalfile without data"); + internal_fatal(journalfile->v2.refcount < 1, "trying to release a non-acquired journalfile"); + + bool unmount = false; + + journalfile->v2.refcount--; + + if(journalfile->v2.refcount == 0) { + journalfile->v2.not_needed_since_s = 0; + + if(journalfile->v2.flags & JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION) + unmount = true; + } + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + if(unmount) + journalfile_v2_mounted_data_unmount(journalfile, false, true); +} + +bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile) { + + netdata_spinlock_lock(&journalfile->v2.spinlock); + bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE); + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + return has_data; +} + +size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile) { + + netdata_spinlock_lock(&journalfile->mmap.spinlock); + size_t data_size = journalfile->mmap.size; + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + + return data_size; +} + +void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size) { + 
netdata_spinlock_lock(&journalfile->mmap.spinlock); + netdata_spinlock_lock(&journalfile->v2.spinlock); + + internal_fatal(journalfile->mmap.fd != -1, "DBENGINE JOURNALFILE: trying to re-set journal fd"); + internal_fatal(journalfile->mmap.data, "DBENGINE JOURNALFILE: trying to re-set journal_data"); + internal_fatal(journalfile->v2.refcount, "DBENGINE JOURNALFILE: trying to re-set journal_data of referenced journalfile"); + + journalfile->mmap.fd = fd; + journalfile->mmap.data = journal_data; + journalfile->mmap.size = journal_data_size; + journalfile->v2.not_needed_since_s = now_monotonic_sec(); + journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED; + + struct journal_v2_header *j2_header = journalfile->mmap.data; + journalfile->v2.first_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC); + journalfile->v2.last_time_s = (time_t)(j2_header->end_time_ut / USEC_PER_SEC); + + journalfile_v2_mounted_data_unmount(journalfile, true, true); + + netdata_spinlock_unlock(&journalfile->v2.spinlock); + netdata_spinlock_unlock(&journalfile->mmap.spinlock); +} + +static void journalfile_v2_data_unmap_permanently(struct rrdengine_journalfile *journalfile) { + bool has_references = false; + + do { + if (has_references) + sleep_usec(10 * USEC_PER_MS); + + netdata_spinlock_lock(&journalfile->mmap.spinlock); + netdata_spinlock_lock(&journalfile->v2.spinlock); + + if(journalfile_v2_mounted_data_unmount(journalfile, true, true)) { + if(journalfile->mmap.fd != -1) + close(journalfile->mmap.fd); + + journalfile->mmap.fd = -1; + journalfile->mmap.data = NULL; + journalfile->mmap.size = 0; + journalfile->v2.first_time_s = 0; + journalfile->v2.last_time_s = 0; + journalfile->v2.flags = 0; + } + else { + has_references = true; + internal_error(true, "DBENGINE JOURNALFILE: waiting for journalfile to be available to unmap..."); + } + + netdata_spinlock_unlock(&journalfile->v2.spinlock); + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + + } while(has_references); } -void journalfile_init(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafile *datafile) { - journalfile->file = (uv_file)0; - journalfile->pos = 0; + struct rrdengine_journalfile *journalfile = callocz(1, sizeof(struct rrdengine_journalfile)); journalfile->datafile = datafile; + netdata_spinlock_init(&journalfile->mmap.spinlock); + netdata_spinlock_init(&journalfile->v2.spinlock); + netdata_spinlock_init(&journalfile->unsafe.spinlock); + journalfile->mmap.fd = -1; + datafile->journalfile = journalfile; + return journalfile; } -int close_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +static int close_uv_file(struct rrdengine_datafile *datafile, uv_file file) { - struct rrdengine_instance *ctx = datafile->ctx; - uv_fs_t req; int ret; char path[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); - - ret = uv_fs_close(NULL, &req, journalfile->file, NULL); + uv_fs_t req; + ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(datafile->ctx); } uv_fs_req_cleanup(&req); - return ret; } -int unlink_journal_file(struct rrdengine_journalfile *journalfile) +int 
journalfile_close(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +{ + if(journalfile_v2_data_available(journalfile)) { + journalfile_v2_data_unmap_permanently(journalfile); + return 0; + } + + return close_uv_file(datafile, journalfile->file); +} + +int journalfile_unlink(struct rrdengine_journalfile *journalfile) { struct rrdengine_datafile *datafile = journalfile->datafile; struct rrdengine_instance *ctx = datafile->ctx; @@ -134,60 +426,65 @@ int unlink_journal_file(struct rrdengine_journalfile *journalfile) int ret; char path[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); + journalfile_v1_generate_path(datafile, path, sizeof(path)); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.journalfile_deletions; + __atomic_add_fetch(&ctx->stats.journalfile_deletions, 1, __ATOMIC_RELAXED); return ret; } -int destroy_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; int ret; char path[RRDENG_PATH_MAX]; + char path_v2[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + journalfile_v2_generate_path(datafile, path_v2, sizeof(path)); + if (journalfile->file) { ret = uv_fs_ftruncate(NULL, &req, journalfile->file, 0, NULL); if (ret < 0) { - error("uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); + (void) close_uv_file(datafile, journalfile->file); + } - ret = uv_fs_close(NULL, &req, journalfile->file, NULL); + // This is the new journal v2 index file + ret = uv_fs_unlink(NULL, &req, path_v2, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.journalfile_deletions; + __atomic_add_fetch(&ctx->stats.journalfile_deletions, 2, __ATOMIC_RELAXED); + + if(journalfile_v2_data_available(journalfile)) + journalfile_v2_data_unmap_permanently(journalfile); return ret; } -int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; @@ -197,19 +494,18 @@ int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdeng uv_buf_t iov; char path[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_CREAT 
| O_RDWR | O_TRUNC, &file); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + fd = open_file_for_io(path, O_CREAT | O_RDWR | O_TRUNC, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); return fd; } journalfile->file = file; - ++ctx->stats.journalfile_creations; + __atomic_add_fetch(&ctx->stats.journalfile_creations, 1, __ATOMIC_RELAXED); ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } memset(superblock, 0, sizeof(*superblock)); (void) strncpy(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ); @@ -220,25 +516,24 @@ int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdeng ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); - error("uv_fs_write: %s", uv_strerror(ret)); - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); + error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); + ctx_io_error(ctx); } uv_fs_req_cleanup(&req); posix_memfree(superblock); if (ret < 0) { - destroy_journal_file(journalfile, datafile); + journalfile_destroy_unsafe(journalfile, datafile); return ret; } - journalfile->pos = sizeof(*superblock); - ctx->stats.io_write_bytes += sizeof(*superblock); - ++ctx->stats.io_write_requests; + journalfile->unsafe.pos = sizeof(*superblock); + + ctx_io_write_op_bytes(ctx, sizeof(*superblock)); return 0; } -static int check_journal_file_superblock(uv_file file) +static int journalfile_check_superblock(uv_file file) { int ret; struct rrdeng_jf_sb *superblock; @@ -247,13 +542,13 @@ static int check_journal_file_superblock(uv_file file) ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } iov = uv_buf_init((void *)superblock, sizeof(*superblock)); ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - error("uv_fs_read: %s", uv_strerror(ret)); + error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); uv_fs_req_cleanup(&req); goto error; } @@ -262,7 +557,7 @@ static int check_journal_file_superblock(uv_file file) if (strncmp(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ) || strncmp(superblock->version, RRDENG_JF_VER, RRDENG_VER_SZ)) { - error("File has invalid superblock."); + error("DBENGINE: File has invalid superblock."); ret = UV_EINVAL; } else { ret = 0; @@ -272,15 +567,10 @@ static int check_journal_file_superblock(uv_file file) return ret; } -static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - void *buf, unsigned max_size) +static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, void *buf, unsigned max_size) { static BITMAP256 page_error_map; - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned i, count, payload_length, descr_size, valid_pages; - struct rrdeng_page_descr *descr; - struct extent_info *extent; - /* persistent structures */ + unsigned i, count, payload_length, descr_size; struct rrdeng_jf_store_data *jf_metric_data; jf_metric_data = buf; @@ -288,117 +578,65 @@ static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrden descr_size = sizeof(*jf_metric_data->descr) * count; payload_length = 
sizeof(*jf_metric_data) + descr_size; if (payload_length > max_size) { - error("Corrupted transaction payload."); + error("DBENGINE: corrupted transaction payload."); return; } - extent = mallocz(sizeof(*extent) + count * sizeof(extent->pages[0])); - extent->offset = jf_metric_data->extent_offset; - extent->size = jf_metric_data->extent_size; - extent->datafile = journalfile->datafile; - extent->next = NULL; - - for (i = 0, valid_pages = 0 ; i < count ; ++i) { + time_t now_s = max_acceptable_collected_time(); + for (i = 0; i < count ; ++i) { uuid_t *temp_id; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; uint8_t page_type = jf_metric_data->descr[i].type; if (page_type > PAGE_TYPE_MAX) { if (!bitmap256_get_bit(&page_error_map, page_type)) { - error("Unknown page type %d encountered.", page_type); + error("DBENGINE: unknown page type %d encountered.", page_type); bitmap256_set_bit(&page_error_map, page_type, 1); } continue; } - uint64_t start_time_ut = jf_metric_data->descr[i].start_time_ut; - uint64_t end_time_ut = jf_metric_data->descr[i].end_time_ut; - size_t entries = jf_metric_data->descr[i].page_length / page_type_size[page_type]; - time_t update_every_s = (entries > 1) ? ((end_time_ut - start_time_ut) / USEC_PER_SEC / (entries - 1)) : 0; - - if (unlikely(start_time_ut > end_time_ut)) { - ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].latest_end_time_ut = end_time_ut; - continue; - } - if (unlikely(start_time_ut == end_time_ut && entries != 1)) { - ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].latest_end_time_ut = end_time_ut; - continue; - } - - if (unlikely(!entries)) { - ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].latest_end_time_ut = end_time_ut; - continue; - } + temp_id = (uuid_t *)jf_metric_data->descr[i].uuid; + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, temp_id, (Word_t) ctx); - if(entries > 1 && update_every_s == 0) { - ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].latest_end_time_ut = end_time_ut; - continue; - } + struct rrdeng_extent_page_descr *descr = &jf_metric_data->descr[i]; + VALIDATED_PAGE_DESCRIPTOR vd = validate_extent_page_descr( + descr, now_s, + (metric) ? 
mrg_metric_get_update_every_s(main_mrg, metric) : 0, + false); - if(start_time_ut + update_every_s * USEC_PER_SEC * (entries - 1) != end_time_ut) { - ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].latest_end_time_ut = end_time_ut; + if(!vd.is_valid) { + if(metric) + mrg_metric_release(main_mrg, metric); - // let this be - // end_time_ut = start_time_ut + update_every_s * USEC_PER_SEC * (entries - 1); + continue; } - temp_id = (uuid_t *)jf_metric_data->descr[i].uuid; - - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, temp_id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - /* First time we see the UUID */ - uv_rwlock_wrlock(&pg_cache->metrics_index.lock); - PValue = JudyHSIns(&pg_cache->metrics_index.JudyHS_array, temp_id, sizeof(uuid_t), PJE0); - fatal_assert(NULL == *PValue); /* TODO: figure out concurrency model */ - *PValue = page_index = create_page_index(temp_id, ctx); - page_index->prev = pg_cache->metrics_index.last_page_index; - pg_cache->metrics_index.last_page_index = page_index; - uv_rwlock_wrunlock(&pg_cache->metrics_index.lock); + bool update_metric_time = true; + if (!metric) { + MRG_ENTRY entry = { + .section = (Word_t)ctx, + .first_time_s = vd.start_time_s, + .last_time_s = vd.end_time_s, + .latest_update_every_s = vd.update_every_s, + }; + uuid_copy(entry.uuid, *temp_id); + + bool added; + metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); + if(added) + update_metric_time = false; } + Word_t metric_id = mrg_metric_id(main_mrg, metric); - descr = pg_cache_create_descr(); - descr->page_length = jf_metric_data->descr[i].page_length; - descr->start_time_ut = start_time_ut; - descr->end_time_ut = end_time_ut; - descr->update_every_s = (update_every_s > 0) ? (uint32_t)update_every_s : (page_index->latest_update_every_s); - descr->id = &page_index->id; - descr->extent = extent; - descr->type = page_type; - extent->pages[valid_pages++] = descr; - pg_cache_insert(ctx, page_index, descr); + if (update_metric_time) + mrg_metric_expand_retention(main_mrg, metric, vd.start_time_s, vd.end_time_s, vd.update_every_s); - if(page_index->latest_time_ut == descr->end_time_ut) - page_index->latest_update_every_s = descr->update_every_s; + pgc_open_add_hot_page( + (Word_t)ctx, metric_id, vd.start_time_s, vd.end_time_s, vd.update_every_s, + journalfile->datafile, + jf_metric_data->extent_offset, jf_metric_data->extent_size, jf_metric_data->descr[i].page_length); - if(descr->update_every_s == 0) - fatal( - "DBENGINE: page descriptor update every is zero, end_time_ut = %llu, start_time_ut = %llu, entries = %zu", - (unsigned long long)end_time_ut, (unsigned long long)start_time_ut, entries); - } - - extent->number_of_pages = valid_pages; - - if (likely(valid_pages)) - df_extent_insert(extent); - else { - freez(extent); - ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter++; + mrg_metric_release(main_mrg, metric); } } @@ -407,8 +645,8 @@ static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrden * Sets id to the current transaction id or to 0 if unknown. * Returns size of transaction record or 0 for unknown size. 
*/ -static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - void *buf, uint64_t *id, unsigned max_size) +static unsigned journalfile_replay_transaction(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, + void *buf, uint64_t *id, unsigned max_size) { unsigned payload_length, size_bytes; int ret; @@ -424,14 +662,14 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng return 0; } if (sizeof(*jf_header) > max_size) { - error("Corrupted transaction record, skipping."); + error("DBENGINE: corrupted transaction record, skipping."); return 0; } *id = jf_header->id; payload_length = jf_header->payload_length; size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer); if (size_bytes > max_size) { - error("Corrupted transaction record, skipping."); + error("DBENGINE: corrupted transaction record, skipping."); return 0; } jf_trailer = buf + sizeof(*jf_header) + payload_length; @@ -440,16 +678,16 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng ret = crc32cmp(jf_trailer->checksum, crc); debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED"); if (unlikely(ret)) { - error("Transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id); + error("DBENGINE: transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id); return size_bytes; } switch (jf_header->type) { case STORE_DATA: debug(D_RRDENGINE, "Replaying transaction %"PRIu64"", jf_header->id); - restore_extent_metadata(ctx, journalfile, buf + sizeof(*jf_header), payload_length); + journalfile_restore_extent_metadata(ctx, journalfile, buf + sizeof(*jf_header), payload_length); break; default: - error("Unknown transaction type. Skipping record."); + error("DBENGINE: unknown transaction type, skipping record."); break; } @@ -463,10 +701,10 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng * Page cache must already be initialized. * Returns the maximum transaction id it discovered. */ -static uint64_t iterate_transactions(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) +static uint64_t journalfile_iterate_transactions(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) { uv_file file; - uint64_t file_size;//, data_file_size; + uint64_t file_size; int ret; uint64_t pos, pos_i, max_id, id; unsigned size_bytes; @@ -475,39 +713,31 @@ static uint64_t iterate_transactions(struct rrdengine_instance *ctx, struct rrde uv_fs_t req; file = journalfile->file; - file_size = journalfile->pos; - //data_file_size = journalfile->datafile->pos; TODO: utilize this? 
+ file_size = journalfile->unsafe.pos; max_id = 1; - bool journal_is_mmapped = (journalfile->data != NULL); - if (unlikely(!journal_is_mmapped)) { - ret = posix_memalign((void *)&buf, RRDFILE_ALIGNMENT, READAHEAD_BYTES); - if (unlikely(ret)) - fatal("posix_memalign:%s", strerror(ret)); - } - else - buf = journalfile->data + sizeof(struct rrdeng_jf_sb); - for (pos = sizeof(struct rrdeng_jf_sb) ; pos < file_size ; pos += READAHEAD_BYTES) { + ret = posix_memalign((void *)&buf, RRDFILE_ALIGNMENT, READAHEAD_BYTES); + if (unlikely(ret)) + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); + + for (pos = sizeof(struct rrdeng_jf_sb); pos < file_size; pos += READAHEAD_BYTES) { size_bytes = MIN(READAHEAD_BYTES, file_size - pos); - if (unlikely(!journal_is_mmapped)) { - iov = uv_buf_init(buf, size_bytes); - ret = uv_fs_read(NULL, &req, file, &iov, 1, pos, NULL); - if (ret < 0) { - error("uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret)); - uv_fs_req_cleanup(&req); - goto skip_file; - } - fatal_assert(req.result >= 0); + iov = uv_buf_init(buf, size_bytes); + ret = uv_fs_read(NULL, &req, file, &iov, 1, pos, NULL); + if (ret < 0) { + error("DBENGINE: uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret)); uv_fs_req_cleanup(&req); - ++ctx->stats.io_read_requests; - ctx->stats.io_read_bytes += size_bytes; + goto skip_file; } + fatal_assert(req.result >= 0); + uv_fs_req_cleanup(&req); + ctx_io_read_op_bytes(ctx, size_bytes); - for (pos_i = 0 ; pos_i < size_bytes ; ) { + for (pos_i = 0; pos_i < size_bytes;) { unsigned max_size; max_size = pos + size_bytes - pos_i; - ret = replay_transaction(ctx, journalfile, buf + pos_i, &id, max_size); + ret = journalfile_replay_transaction(ctx, journalfile, buf + pos_i, &id, max_size); if (!ret) /* TODO: support transactions bigger than 4K */ /* unknown transaction size, move on to the next block */ pos_i = ALIGN_BYTES_FLOOR(pos_i + RRDENG_BLOCK_SIZE); @@ -515,73 +745,722 @@ static uint64_t iterate_transactions(struct rrdengine_instance *ctx, struct rrde pos_i += ret; max_id = MAX(max_id, id); } - if (likely(journal_is_mmapped)) - buf += size_bytes; } skip_file: - if (unlikely(!journal_is_mmapped)) - posix_memfree(buf); + posix_memfree(buf); return max_id; } -int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - struct rrdengine_datafile *datafile) +// Checks that the extent list checksum is valid +static int journalfile_check_v2_extent_list (void *data_start, size_t file_size) +{ + UNUSED(file_size); + uLong crc; + + struct journal_v2_header *j2_header = (void *) data_start; + struct journal_v2_block_trailer *journal_v2_trailer; + + journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + j2_header->extent_trailer_offset); + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) data_start + j2_header->extent_offset, j2_header->extent_count * sizeof(struct journal_extent_list)); + if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) { + error("DBENGINE: extent list CRC32 check: FAILED"); + return 1; + } + + return 0; +} + +// Checks that the metric list (UUIDs) checksum is valid +static int journalfile_check_v2_metric_list(void *data_start, size_t file_size) +{ + UNUSED(file_size); + uLong crc; + + struct journal_v2_header *j2_header = (void *) data_start; + struct journal_v2_block_trailer *journal_v2_trailer; + + journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + j2_header->metric_trailer_offset); + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, 
(uint8_t *) data_start + j2_header->metric_offset, j2_header->metric_count * sizeof(struct journal_metric_list)); + if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) { + error("DBENGINE: metric list CRC32 check: FAILED"); + return 1; + } + return 0; +} + +// +// Return +// 0 Ok +// 1 Invalid +// 2 Force rebuild +// 3 skip + +static int journalfile_v2_validate(void *data_start, size_t journal_v2_file_size, size_t journal_v1_file_size) +{ + int rc; + uLong crc; + + struct journal_v2_header *j2_header = (void *) data_start; + struct journal_v2_block_trailer *journal_v2_trailer; + + if (j2_header->magic == JOURVAL_V2_REBUILD_MAGIC) + return 2; + + if (j2_header->magic == JOURVAL_V2_SKIP_MAGIC) + return 3; + + // Magic failure + if (j2_header->magic != JOURVAL_V2_MAGIC) + return 1; + + if (j2_header->journal_v2_file_size != journal_v2_file_size) + return 1; + + if (journal_v1_file_size && j2_header->journal_v1_file_size != journal_v1_file_size) + return 1; + + journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + journal_v2_file_size - sizeof(*journal_v2_trailer)); + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *) j2_header, sizeof(*j2_header)); + + rc = crc32cmp(journal_v2_trailer->checksum, crc); + if (unlikely(rc)) { + error("DBENGINE: file CRC32 check: FAILED"); + return 1; + } + + rc = journalfile_check_v2_extent_list(data_start, journal_v2_file_size); + if (rc) return 1; + + rc = journalfile_check_v2_metric_list(data_start, journal_v2_file_size); + if (rc) return 1; + + if (!db_engine_journal_check) + return 0; + + // Verify complete UUID chain + + struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset); + + unsigned verified = 0; + unsigned entries; + unsigned total_pages = 0; + + info("DBENGINE: checking %u metrics that exist in the journal", j2_header->metric_count); + for (entries = 0; entries < j2_header->metric_count; entries++) { + + char uuid_str[UUID_STR_LEN]; + uuid_unparse_lower(metric->uuid, uuid_str); + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + struct journal_page_header local_metric_list_header = *metric_list_header; + + local_metric_list_header.crc = JOURVAL_V2_MAGIC; + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *) &local_metric_list_header, sizeof(local_metric_list_header)); + rc = crc32cmp(metric_list_header->checksum, crc); + + if (!rc) { + struct journal_v2_block_trailer *journal_trailer = + (void *) data_start + metric->page_offset + sizeof(struct journal_page_header) + (metric_list_header->entries * sizeof(struct journal_page_list)); + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) metric_list_header + sizeof(struct journal_page_header), metric_list_header->entries * sizeof(struct journal_page_list)); + rc = crc32cmp(journal_trailer->checksum, crc); + internal_error(rc, "DBENGINE: index %u : %s entries %u at offset %u verified, DATA CRC computed %lu, stored %u", entries, uuid_str, metric->entries, metric->page_offset, + crc, metric_list_header->crc); + if (!rc) { + total_pages += metric_list_header->entries; + verified++; + } + } + + metric++; + if ((uint32_t)((uint8_t *) metric - (uint8_t *) data_start) > (uint32_t) journal_v2_file_size) { + info("DBENGINE: verification failed EOF reached -- total entries %u, verified %u", entries, verified); + return 1; + } + } + + if (entries != verified) { + info("DBENGINE: verification failed -- total entries %u, verified %u", entries, verified); + return 1; + } + 
info("DBENGINE: verification succeeded -- total entries %u, verified %u (%u total pages)", entries, verified, total_pages); + + return 0; +} + +void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) { + usec_t started_ut = now_monotonic_usec(); + + size_t data_size = 0; + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, &data_size, 0, 0); + if(!j2_header) + return; + + uint8_t *data_start = (uint8_t *)j2_header; + uint32_t entries = j2_header->metric_count; + + struct journal_metric_list *metric = (struct journal_metric_list *) (data_start + j2_header->metric_offset); + time_t header_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + time_t now_s = max_acceptable_collected_time(); + for (size_t i=0; i < entries; i++) { + time_t start_time_s = header_start_time_s + metric->delta_start_s; + time_t end_time_s = header_start_time_s + metric->delta_end_s; + time_t update_every_s = (metric->entries > 1) ? ((end_time_s - start_time_s) / (entries - 1)) : 0; + update_metric_retention_and_granularity_by_uuid( + ctx, &metric->uuid, start_time_s, end_time_s, update_every_s, now_s); + +#ifdef NETDATA_INTERNAL_CHECKS + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + fatal_assert(uuid_compare(metric_list_header->uuid, metric->uuid) == 0); + fatal_assert(metric->entries == metric_list_header->entries); +#endif + metric++; + } + + journalfile_v2_data_release(journalfile); + usec_t ended_ut = now_monotonic_usec(); + + info("DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms" + , ctx->config.tier, journalfile->datafile->fileno + , (double)data_size / 1024 / 1024 + , (double)entries / 1000 + , ((double)(ended_ut - started_ut) / USEC_PER_MS) + ); +} + +int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +{ + int ret, fd; + char path_v1[RRDENG_PATH_MAX]; + char path_v2[RRDENG_PATH_MAX]; + struct stat statbuf; + size_t journal_v1_file_size = 0; + size_t journal_v2_file_size; + + journalfile_v1_generate_path(datafile, path_v1, sizeof(path_v1)); + ret = stat(path_v1, &statbuf); + if (!ret) + journal_v1_file_size = (uint32_t)statbuf.st_size; + + journalfile_v2_generate_path(datafile, path_v2, sizeof(path_v2)); + fd = open(path_v2, O_RDONLY); + if (fd < 0) { + if (errno == ENOENT) + return 1; + ctx_fs_error(ctx); + error("DBENGINE: failed to open '%s'", path_v2); + return 1; + } + + ret = fstat(fd, &statbuf); + if (ret) { + error("DBENGINE: failed to get file information for '%s'", path_v2); + close(fd); + return 1; + } + + journal_v2_file_size = (size_t)statbuf.st_size; + + if (journal_v2_file_size < sizeof(struct journal_v2_header)) { + error_report("Invalid file %s. 
Not the expected size", path_v2); + close(fd); + return 1; + } + + usec_t mmap_start_ut = now_monotonic_usec(); + uint8_t *data_start = mmap(NULL, journal_v2_file_size, PROT_READ, MAP_SHARED, fd, 0); + if (data_start == MAP_FAILED) { + close(fd); + return 1; + } + + info("DBENGINE: checking integrity of '%s'", path_v2); + usec_t validation_start_ut = now_monotonic_usec(); + int rc = journalfile_v2_validate(data_start, journal_v2_file_size, journal_v1_file_size); + if (unlikely(rc)) { + if (rc == 2) + error_report("File %s needs to be rebuilt", path_v2); + else if (rc == 3) + error_report("File %s will be skipped", path_v2); + else + error_report("File %s is invalid and it will be rebuilt", path_v2); + + if (unlikely(munmap(data_start, journal_v2_file_size))) + error("DBENGINE: failed to unmap '%s'", path_v2); + + close(fd); + return rc; + } + + struct journal_v2_header *j2_header = (void *) data_start; + uint32_t entries = j2_header->metric_count; + + if (unlikely(!entries)) { + if (unlikely(munmap(data_start, journal_v2_file_size))) + error("DBENGINE: failed to unmap '%s'", path_v2); + + close(fd); + return 1; + } + + usec_t finished_ut = now_monotonic_usec(); + + info("DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, " + "mmap: %0.2f ms, validate: %0.2f ms" + , path_v2 + , (double)journal_v2_file_size / 1024 / 1024 + , (double)entries / 1000 + , ((double)(validation_start_ut - mmap_start_ut) / USEC_PER_MS) + , ((double)(finished_ut - validation_start_ut) / USEC_PER_MS) + ); + + // Initialize the journal file to be able to access the data + journalfile_v2_data_set(journalfile, fd, data_start, journal_v2_file_size); + + ctx_current_disk_space_increase(ctx, journal_v2_file_size); + + // File is OK load it + return 0; +} + +struct journal_metric_list_to_sort { + struct jv2_metrics_info *metric_info; +}; + +static int journalfile_metric_compare (const void *item1, const void *item2) +{ + const struct jv2_metrics_info *metric1 = ((struct journal_metric_list_to_sort *) item1)->metric_info; + const struct jv2_metrics_info *metric2 = ((struct journal_metric_list_to_sort *) item2)->metric_info; + + return uuid_compare(*(metric1->uuid), *(metric2->uuid)); +} + + +// Write list of extents for the journalfile +void *journalfile_v2_write_extent_list(Pvoid_t JudyL_extents_pos, void *data) +{ + Pvoid_t *PValue; + struct journal_extent_list *j2_extent_base = (void *) data; + struct jv2_extents_info *ext_info; + + bool first = true; + Word_t pos = 0; + size_t count = 0; + while ((PValue = JudyLFirstThenNext(JudyL_extents_pos, &pos, &first))) { + ext_info = *PValue; + size_t index = ext_info->index; + j2_extent_base[index].file_index = 0; + j2_extent_base[index].datafile_offset = ext_info->pos; + j2_extent_base[index].datafile_size = ext_info->bytes; + j2_extent_base[index].pages = ext_info->number_of_pages; + count++; + } + return j2_extent_base + count; +} + +static int journalfile_verify_space(struct journal_v2_header *j2_header, void *data, uint32_t bytes) +{ + if ((unsigned long)(((uint8_t *) data - (uint8_t *) j2_header->data) + bytes) > (j2_header->journal_v2_file_size - sizeof(struct journal_v2_block_trailer))) + return 1; + + return 0; +} + +void *journalfile_v2_write_metric_page(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info, uint32_t pages_offset) +{ + struct journal_metric_list *metric = (void *) data; + + if (journalfile_verify_space(j2_header, data, sizeof(*metric))) + return NULL; + + uuid_copy(metric->uuid, *metric_info->uuid); + 
metric->entries = metric_info->number_of_pages; + metric->page_offset = pages_offset; + metric->delta_start_s = (uint32_t)(metric_info->first_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC)); + metric->delta_end_s = (uint32_t)(metric_info->last_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC)); + + return ++metric; +} + +void *journalfile_v2_write_data_page_header(struct journal_v2_header *j2_header __maybe_unused, void *data, struct jv2_metrics_info *metric_info, uint32_t uuid_offset) +{ + struct journal_page_header *data_page_header = (void *) data; + uLong crc; + + uuid_copy(data_page_header->uuid, *metric_info->uuid); + data_page_header->entries = metric_info->number_of_pages; + data_page_header->uuid_offset = uuid_offset; // data header OFFSET poings to METRIC in the directory + data_page_header->crc = JOURVAL_V2_MAGIC; + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *) data_page_header, sizeof(*data_page_header)); + crc32set(data_page_header->checksum, crc); + return ++data_page_header; +} + +void *journalfile_v2_write_data_page_trailer(struct journal_v2_header *j2_header __maybe_unused, void *data, void *page_header) +{ + struct journal_page_header *data_page_header = (void *) page_header; + struct journal_v2_block_trailer *journal_trailer = (void *) data; + uLong crc; + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) page_header + sizeof(struct journal_page_header), data_page_header->entries * sizeof(struct journal_page_list)); + crc32set(journal_trailer->checksum, crc); + return ++journal_trailer; +} + +void *journalfile_v2_write_data_page(struct journal_v2_header *j2_header, void *data, struct jv2_page_info *page_info) +{ + struct journal_page_list *data_page = data; + + if (journalfile_verify_space(j2_header, data, sizeof(*data_page))) + return NULL; + + struct extent_io_data *ei = page_info->custom_data; + + data_page->delta_start_s = (uint32_t) (page_info->start_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC); + data_page->delta_end_s = (uint32_t) (page_info->end_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC); + data_page->extent_index = page_info->extent_index; + + data_page->update_every_s = page_info->update_every_s; + data_page->page_length = (uint16_t) (ei ? 
ei->page_length : page_info->page_length); + data_page->type = 0; + + return ++data_page; +} + +// Must be recorded in metric_info->entries +void *journalfile_v2_write_descriptors(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info) +{ + Pvoid_t *PValue; + + struct journal_page_list *data_page = (void *)data; + // We need to write all descriptors with index metric_info->min_index_time_s, metric_info->max_index_time_s + // that belong to this journal file + Pvoid_t JudyL_array = metric_info->JudyL_pages_by_start_time; + + Word_t index_time = 0; + bool first = true; + struct jv2_page_info *page_info; + while ((PValue = JudyLFirstThenNext(JudyL_array, &index_time, &first))) { + page_info = *PValue; + // Write one descriptor and return the next data page location + data_page = journalfile_v2_write_data_page(j2_header, (void *) data_page, page_info); + if (NULL == data_page) + break; + } + return data_page; +} + +// Migrate the journalfile pointed by datafile +// activate : make the new file active immediately +// journafile data will be set and descriptors (if deleted) will be repopulated as needed +// startup : if the migration is done during agent startup +// this will allow us to optimize certain things + +void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno __maybe_unused, uint8_t type __maybe_unused, + Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, + size_t number_of_extents, size_t number_of_metrics, size_t number_of_pages, void *user_data) +{ + char path[RRDENG_PATH_MAX]; + Pvoid_t *PValue; + struct rrdengine_instance *ctx = (struct rrdengine_instance *) section; + struct rrdengine_journalfile *journalfile = (struct rrdengine_journalfile *) user_data; + struct rrdengine_datafile *datafile = journalfile->datafile; + time_t min_time_s = LONG_MAX; + time_t max_time_s = 0; + struct jv2_metrics_info *metric_info; + + journalfile_v2_generate_path(datafile, path, sizeof(path)); + + info("DBENGINE: indexing file '%s': extents %zu, metrics %zu, pages %zu", + path, + number_of_extents, + number_of_metrics, + number_of_pages); + +#ifdef NETDATA_INTERNAL_CHECKS + usec_t start_loading = now_monotonic_usec(); +#endif + + size_t total_file_size = 0; + total_file_size += (sizeof(struct journal_v2_header) + JOURNAL_V2_HEADER_PADDING_SZ); + + // Extents will start here + uint32_t extent_offset = total_file_size; + total_file_size += (number_of_extents * sizeof(struct journal_extent_list)); + + uint32_t extent_offset_trailer = total_file_size; + total_file_size += sizeof(struct journal_v2_block_trailer); + + // UUID list will start here + uint32_t metrics_offset = total_file_size; + total_file_size += (number_of_metrics * sizeof(struct journal_metric_list)); + + // UUID list trailer + uint32_t metric_offset_trailer = total_file_size; + total_file_size += sizeof(struct journal_v2_block_trailer); + + // descr @ time will start here + uint32_t pages_offset = total_file_size; + total_file_size += (number_of_pages * (sizeof(struct journal_page_list) + sizeof(struct journal_page_header) + sizeof(struct journal_v2_block_trailer))); + + // File trailer + uint32_t trailer_offset = total_file_size; + total_file_size += sizeof(struct journal_v2_block_trailer); + + int fd_v2; + uint8_t *data_start = netdata_mmap(path, total_file_size, MAP_SHARED, 0, false, &fd_v2); + uint8_t *data = data_start; + + memset(data_start, 0, extent_offset); + + // Write header + struct journal_v2_header j2_header; + memset(&j2_header, 0, sizeof(j2_header)); + + 
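+    // start_time_ut / end_time_ut are left zero at this point; they are filled
+    // in further down, once all metrics have been scanned and the min/max times
+    // of this datafile are known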
j2_header.magic = JOURVAL_V2_MAGIC; + j2_header.start_time_ut = 0; + j2_header.end_time_ut = 0; + j2_header.extent_count = number_of_extents; + j2_header.extent_offset = extent_offset; + j2_header.metric_count = number_of_metrics; + j2_header.metric_offset = metrics_offset; + j2_header.page_count = number_of_pages; + j2_header.page_offset = pages_offset; + j2_header.extent_trailer_offset = extent_offset_trailer; + j2_header.metric_trailer_offset = metric_offset_trailer; + j2_header.journal_v2_file_size = total_file_size; + j2_header.journal_v1_file_size = (uint32_t)journalfile_current_size(journalfile); + j2_header.data = data_start; // Used during migration + + struct journal_v2_block_trailer *journal_v2_trailer; + + data = journalfile_v2_write_extent_list(JudyL_extents_pos, data_start + extent_offset); + internal_error(true, "DBENGINE: write extent list so far %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + fatal_assert(data == data_start + extent_offset_trailer); + + // Calculate CRC for extents + journal_v2_trailer = (struct journal_v2_block_trailer *) (data_start + extent_offset_trailer); + uLong crc; + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) data_start + extent_offset, number_of_extents * sizeof(struct journal_extent_list)); + crc32set(journal_v2_trailer->checksum, crc); + + internal_error(true, "DBENGINE: CALCULATE CRC FOR EXTENT %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + // Skip the trailer, point to the metrics off + data += sizeof(struct journal_v2_block_trailer); + + // Sanity check -- we must be at the metrics_offset + fatal_assert(data == data_start + metrics_offset); + + // Allocate array to sort UUIDs and keep them sorted in the journal because we want to do binary search when we do lookups + struct journal_metric_list_to_sort *uuid_list = mallocz(number_of_metrics * sizeof(struct journal_metric_list_to_sort)); + + Word_t Index = 0; + size_t count = 0; + bool first_then_next = true; + while ((PValue = JudyLFirstThenNext(JudyL_metrics, &Index, &first_then_next))) { + metric_info = *PValue; + + fatal_assert(count < number_of_metrics); + uuid_list[count++].metric_info = metric_info; + min_time_s = MIN(min_time_s, metric_info->first_time_s); + max_time_s = MAX(max_time_s, metric_info->last_time_s); + } + + // Store in the header + j2_header.start_time_ut = min_time_s * USEC_PER_SEC; + j2_header.end_time_ut = max_time_s * USEC_PER_SEC; + + qsort(&uuid_list[0], number_of_metrics, sizeof(struct journal_metric_list_to_sort), journalfile_metric_compare); + internal_error(true, "DBENGINE: traverse and qsort UUID %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + uint32_t resize_file_to = total_file_size; + + for (Index = 0; Index < number_of_metrics; Index++) { + metric_info = uuid_list[Index].metric_info; + + // Calculate current UUID offset from start of file. 
We will store this in the data page header + uint32_t uuid_offset = data - data_start; + + // Write the UUID we are processing + data = (void *) journalfile_v2_write_metric_page(&j2_header, data, metric_info, pages_offset); + if (unlikely(!data)) + break; + + // Next we will write + // Header + // Detailed entries (descr @ time) + // Trailer (checksum) + + // Keep the page_list_header, to be used for migration when where agent is running + metric_info->page_list_header = pages_offset; + // Write page header + void *metric_page = journalfile_v2_write_data_page_header(&j2_header, data_start + pages_offset, metric_info, + uuid_offset); + + // Start writing descr @ time + void *page_trailer = journalfile_v2_write_descriptors(&j2_header, metric_page, metric_info); + if (unlikely(!page_trailer)) + break; + + // Trailer (checksum) + uint8_t *next_page_address = journalfile_v2_write_data_page_trailer(&j2_header, page_trailer, + data_start + pages_offset); + + // Calculate start of the pages start for next descriptor + pages_offset += (metric_info->number_of_pages * (sizeof(struct journal_page_list)) + sizeof(struct journal_page_header) + sizeof(struct journal_v2_block_trailer)); + // Verify we are at the right location + if (pages_offset != (uint32_t)(next_page_address - data_start)) { + // make sure checks fail so that we abort + data = data_start; + break; + } + } + + if (data == data_start + metric_offset_trailer) { + internal_error(true, "DBENGINE: WRITE METRICS AND PAGES %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + // Calculate CRC for metrics + journal_v2_trailer = (struct journal_v2_block_trailer *)(data_start + metric_offset_trailer); + crc = crc32(0L, Z_NULL, 0); + crc = + crc32(crc, (uint8_t *)data_start + metrics_offset, number_of_metrics * sizeof(struct journal_metric_list)); + crc32set(journal_v2_trailer->checksum, crc); + internal_error(true, "DBENGINE: CALCULATE CRC FOR UUIDs %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + // Prepare to write checksum for the file + j2_header.data = NULL; + journal_v2_trailer = (struct journal_v2_block_trailer *)(data_start + trailer_offset); + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *)&j2_header, sizeof(j2_header)); + crc32set(journal_v2_trailer->checksum, crc); + + // Write header to the file + memcpy(data_start, &j2_header, sizeof(j2_header)); + + internal_error(true, "DBENGINE: FILE COMPLETED --------> %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + info("DBENGINE: migrated journal file '%s', file size %zu", path, total_file_size); + + // msync(data_start, total_file_size, MS_SYNC); + journalfile_v2_data_set(journalfile, fd_v2, data_start, total_file_size); + + internal_error(true, "DBENGINE: ACTIVATING NEW INDEX JNL %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + ctx_current_disk_space_increase(ctx, total_file_size); + freez(uuid_list); + return; + } + else { + info("DBENGINE: failed to build index '%s', file will be skipped", path); + j2_header.data = NULL; + j2_header.magic = JOURVAL_V2_SKIP_MAGIC; + memcpy(data_start, &j2_header, sizeof(j2_header)); + resize_file_to = sizeof(j2_header); + } + + netdata_munmap(data_start, total_file_size); + freez(uuid_list); + + if (likely(resize_file_to == total_file_size)) + return; + + int ret = truncate(path, (long) resize_file_to); + if (ret < 0) { + ctx_current_disk_space_increase(ctx, total_file_size); + ctx_fs_error(ctx); + error("DBENGINE: failed to resize file '%s'", path); + } + else + 
ctx_current_disk_space_increase(ctx, resize_file_to); +} + +int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, + struct rrdengine_datafile *datafile) { uv_fs_t req; uv_file file; int ret, fd, error; uint64_t file_size, max_id; char path[RRDENG_PATH_MAX]; + bool loaded_v2 = false; + + // Do not try to load jv2 of the latest file + if (datafile->fileno != ctx_last_fileno_get(ctx)) + loaded_v2 = journalfile_v2_load(ctx, journalfile, datafile) == 0; - generate_journalfilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_RDWR, &file); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + + fd = open_file_for_io(path, O_RDWR, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); + + if(loaded_v2) + return 0; + return fd; } - info("Loading journal file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); - if (ret) - goto error; - file_size = ALIGN_BYTES_FLOOR(file_size); + if (ret) { + error = ret; + goto cleanup; + } - ret = check_journal_file_superblock(file); - if (ret) - goto error; - ctx->stats.io_read_bytes += sizeof(struct rrdeng_jf_sb); - ++ctx->stats.io_read_requests; + if(loaded_v2) { + journalfile->unsafe.pos = file_size; + error = 0; + goto cleanup; + } + file_size = ALIGN_BYTES_FLOOR(file_size); + journalfile->unsafe.pos = file_size; journalfile->file = file; - journalfile->pos = file_size; - journalfile->data = netdata_mmap(path, file_size, MAP_SHARED, 0); - info("Loading journal file \"%s\" using %s.", path, journalfile->data?"MMAP":"uv_fs_read"); - max_id = iterate_transactions(ctx, journalfile); + ret = journalfile_check_superblock(file); + if (ret) { + info("DBENGINE: invalid journal file '%s' ; superblock check failed.", path); + error = ret; + goto cleanup; + } + ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_jf_sb)); + + info("DBENGINE: loading journal file '%s'", path); - ctx->commit_log.transaction_id = MAX(ctx->commit_log.transaction_id, max_id + 1); + max_id = journalfile_iterate_transactions(ctx, journalfile); + + __atomic_store_n(&ctx->atomic.transaction_id, MAX(__atomic_load_n(&ctx->atomic.transaction_id, __ATOMIC_RELAXED), max_id + 1), __ATOMIC_RELAXED); + + info("DBENGINE: journal file '%s' loaded (size:%"PRIu64").", path, file_size); + + bool is_last_file = (ctx_last_fileno_get(ctx) == journalfile->datafile->fileno); + if (is_last_file && journalfile->datafile->pos <= rrdeng_target_data_file_size(ctx) / 3) { + ctx->loading.create_new_datafile_pair = false; + return 0; + } + + pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, + journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); + + if (is_last_file) + ctx->loading.create_new_datafile_pair = true; - info("Journal file \"%s\" loaded (size:%"PRIu64").", path, file_size); - if (likely(journalfile->data)) - netdata_munmap(journalfile->data, file_size); return 0; - error: - error = ret; +cleanup: ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); return error; } - -void init_commit_log(struct rrdengine_instance *ctx) -{ - ctx->commit_log.buf = NULL; - ctx->commit_log.buf_pos = 0; - ctx->commit_log.transaction_id = 1; -} 
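The offset arithmetic in journalfile_migrate_to_v2_callback above fixes the on-disk layout of a v2 index file: a padded header block, the extent list with its CRC trailer, the sorted metric (UUID) list with its CRC trailer, one page block per metric, and a final file trailer. The standalone sketch below reproduces that size calculation for illustration only; the per-entry sizes are assumptions taken from the byte-size comments in journalfile.h further down (and the 4096-byte header block of journalfile.ksy), not the real sizeof() values the code uses.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* assumed sizes, from the comments in journalfile.h / journalfile.ksy */
#define HEADER_BLOCK_SZ   4096u   /* journal_v2_header + JOURNAL_V2_HEADER_PADDING_SZ */
#define EXTENT_ENTRY_SZ     16u   /* struct journal_extent_list */
#define METRIC_ENTRY_SZ     32u   /* struct journal_metric_list */
#define PAGE_HEADER_SZ      28u   /* struct journal_page_header */
#define PAGE_ENTRY_SZ       20u   /* struct journal_page_list */
#define TRAILER_SZ           4u   /* struct journal_v2_block_trailer (CRC32) */

/* mirrors the total_file_size computation of journalfile_migrate_to_v2_callback */
static void journal_v2_layout(uint32_t extents, uint32_t metrics, uint32_t pages) {
    uint32_t size = HEADER_BLOCK_SZ;

    uint32_t extent_offset = size;          size += extents * EXTENT_ENTRY_SZ;
    uint32_t extent_trailer_offset = size;  size += TRAILER_SZ;
    uint32_t metrics_offset = size;         size += metrics * METRIC_ENTRY_SZ;
    uint32_t metric_trailer_offset = size;  size += TRAILER_SZ;

    /* the pages region budgets a header and trailer for every page,
       exactly as the callback does when it sizes the mmap */
    uint32_t pages_offset = size;           size += pages * (PAGE_ENTRY_SZ + PAGE_HEADER_SZ + TRAILER_SZ);
    uint32_t trailer_offset = size;         size += TRAILER_SZ;

    printf("extents@%" PRIu32 " metrics@%" PRIu32 " pages@%" PRIu32 " trailer@%" PRIu32 " total=%" PRIu32 "\n",
           extent_offset, metrics_offset, pages_offset, trailer_offset, size);

    (void) extent_trailer_offset;
    (void) metric_trailer_offset;
}

int main(void) {
    /* hypothetical counts, e.g. as reported by the "DBENGINE: indexing file" log line */
    journal_v2_layout(1000, 2000, 50000);
    return 0;
}

Feeding it the extent, metric and page counts printed by the "DBENGINE: indexing file" message should reproduce, under these assumed sizes, the total_file_size that the callback mmaps before writing the index.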
diff --git a/database/engine/journalfile.h b/database/engine/journalfile.h index 011c5065f..5fbcc90fa 100644 --- a/database/engine/journalfile.h +++ b/database/engine/journalfile.h @@ -13,37 +13,147 @@ struct rrdengine_journalfile; #define WALFILE_PREFIX "journalfile-" #define WALFILE_EXTENSION ".njf" +#define WALFILE_EXTENSION_V2 ".njfv2" +#define is_descr_journal_v2(descr) ((descr)->extent_entry != NULL) + +typedef enum __attribute__ ((__packed__)) { + JOURNALFILE_FLAG_IS_AVAILABLE = (1 << 0), + JOURNALFILE_FLAG_IS_MOUNTED = (1 << 1), + JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION = (1 << 2), +} JOURNALFILE_FLAGS; /* only one event loop is supported for now */ struct rrdengine_journalfile { + struct { + SPINLOCK spinlock; + void *data; // MMAPed file of journal v2 + uint32_t size; // Total file size mapped + int fd; + } mmap; + + struct { + SPINLOCK spinlock; + JOURNALFILE_FLAGS flags; + int32_t refcount; + time_t first_time_s; + time_t last_time_s; + time_t not_needed_since_s; + } v2; + + struct { + SPINLOCK spinlock; + uint64_t pos; + } unsafe; + uv_file file; - uint64_t pos; - void *data; struct rrdengine_datafile *datafile; }; -/* only one event loop is supported for now */ -struct transaction_commit_log { - uint64_t transaction_id; +static inline uint64_t journalfile_current_size(struct rrdengine_journalfile *journalfile) { + netdata_spinlock_lock(&journalfile->unsafe.spinlock); + uint64_t size = journalfile->unsafe.pos; + netdata_spinlock_unlock(&journalfile->unsafe.spinlock); + return size; +} + +// Journal v2 structures + +#define JOURVAL_V2_MAGIC (0x01221019) +#define JOURVAL_V2_REBUILD_MAGIC (0x00221019) +#define JOURVAL_V2_SKIP_MAGIC (0x02221019) + +struct journal_v2_block_trailer { + union { + uint8_t checksum[CHECKSUM_SZ]; /* CRC32 */ + uint32_t crc; + }; +}; + +// Journal V2 +// 28 bytes +struct journal_page_header { + union { + uint8_t checksum[CHECKSUM_SZ]; // CRC check + uint32_t crc; + }; + uint32_t uuid_offset; // Points back to the UUID list which should point here (UUIDs should much) + uint32_t entries; // Entries + uuid_t uuid; // Which UUID this is +}; - /* outstanding transaction buffer */ - void *buf; - unsigned buf_pos; - unsigned buf_size; +// 20 bytes +struct journal_page_list { + uint32_t delta_start_s; // relative to the start time of journal + uint32_t delta_end_s; // relative to delta_start + uint32_t extent_index; // Index to the extent (extent list) (bytes from BASE) + uint32_t update_every_s; + uint16_t page_length; + uint8_t type; }; -void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); -void journalfile_init(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -void *wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned size); -void wal_flush_transaction_buffer(struct rrdengine_worker_config* wc); -int close_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -int unlink_journal_file(struct rrdengine_journalfile *journalfile); -int destroy_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - struct rrdengine_datafile *datafile); -void init_commit_log(struct rrdengine_instance *ctx); +// UUID_LIST +// 32 bytes +struct journal_metric_list { + uuid_t uuid; + uint32_t entries; // Number of 
entries + uint32_t page_offset; // OFFSET that contains entries * struct( journal_page_list ) + uint32_t delta_start_s; // Min time of metric + uint32_t delta_end_s; // Max time of metric (to be used to populate page_index) +}; + +// 16 bytes +struct journal_extent_list { + uint64_t datafile_offset; // Datafile offset to find the extent + uint32_t datafile_size; // Size of the extent + uint16_t file_index; // which file index is this datafile[index] + uint8_t pages; // number of pages (not all are necesssarily valid) +}; + +// 72 bytes +struct journal_v2_header { + uint32_t magic; + usec_t start_time_ut; // Min start time of journal + usec_t end_time_ut; // Maximum end time of journal + uint32_t extent_count; // Count of extents + uint32_t extent_offset; + uint32_t metric_count; // Count of metrics (unique UUIDS) + uint32_t metric_offset; + uint32_t page_count; // Total count of pages (descriptors @ time) + uint32_t page_offset; + uint32_t extent_trailer_offset; // CRC for entent list + uint32_t metric_trailer_offset; // CRC for metric list + uint32_t journal_v1_file_size; // This is the original journal file + uint32_t journal_v2_file_size; // This is the total file size + void *data; // Used when building the index +}; + +#define JOURNAL_V2_HEADER_PADDING_SZ (RRDENG_BLOCK_SIZE - (sizeof(struct journal_v2_header))) + +struct wal; + +void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen); +void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen); +struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafile *datafile); +void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, struct wal *wal, uv_loop_t *loop); +int journalfile_close(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); +int journalfile_unlink(struct rrdengine_journalfile *journalfile); +int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); +int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); +int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, + struct rrdengine_datafile *datafile); +void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile); + +void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno __maybe_unused, uint8_t type __maybe_unused, + Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, + size_t number_of_extents, size_t number_of_metrics, size_t number_of_pages, void *user_data); + +bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile); +size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile); +void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size); +struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s); +void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile); +void journalfile_v2_data_unmount_cleanup(time_t now_s); #endif /* NETDATA_JOURNALFILE_H */
\ No newline at end of file diff --git a/database/engine/journalfile.ksy b/database/engine/journalfile.ksy new file mode 100644 index 000000000..858db83d4 --- /dev/null +++ b/database/engine/journalfile.ksy @@ -0,0 +1,144 @@ +meta: + id: netdata_journalfile_v2 + endian: le + +seq: + - id: journal_v2_header + type: journal_v2_header + size: 4096 + - id: extent_list + type: journal_v2_extent_list + repeat: expr + repeat-expr: journal_v2_header.extent_count + - id: extent_trailer + type: journal_v2_block_trailer + - id: metric_list + type: journal_v2_metric_list + repeat: expr + repeat-expr: journal_v2_header.metric_count + - id: metric_trailer + type: journal_v2_block_trailer + - id: page_blocs + type: jounral_v2_page_blocs + size: _root._io.size - _root._io.pos - 4 + - id: journal_file_trailer + type: journal_v2_block_trailer + + +types: + journal_v2_metric_list: + seq: + - id: uuid + size: 16 + - id: entries + type: u4 + - id: page_offset + type: u4 + - id: delta_start_s + type: u4 + - id: delta_end_s + type: u4 + instances: + page_block: + type: journal_v2_page_block + io: _root._io + pos: page_offset + journal_v2_page_hdr: + seq: + - id: crc + type: u4 + - id: uuid_offset + type: u4 + - id: entries + type: u4 + - id: uuid + size: 16 + journal_v2_page_list: + seq: + - id: delta_start_s + type: u4 + - id: delta_end_s + type: u4 + - id: extent_idx + type: u4 + - id: update_every_s + type: u4 + - id: page_len + type: u2 + - id: type + type: u1 + - id: reserved + type: u1 + instances: + extent: + io: _root._io + type: journal_v2_extent_list + pos: _root.journal_v2_header.extent_offset + (extent_idx * 16) + journal_v2_header: + seq: + - id: magic + contents: [ 0x19, 0x10, 0x22, 0x01 ] #0x01221019 + - id: reserved + type: u4 + - id: start_time_ut + type: u8 + - id: end_time_ut + type: u8 + - id: extent_count + type: u4 + - id: extent_offset + type: u4 + - id: metric_count + type: u4 + - id: metric_offset + type: u4 + - id: page_count + type: u4 + - id: page_offset + type: u4 + - id: extent_trailer_offset + type: u4 + - id: metric_trailer_offset + type: u4 + - id: original_file_size + type: u4 + - id: total_file_size + type: u4 + - id: data + type: u8 + instances: + trailer: + io: _root._io + type: journal_v2_block_trailer + pos: _root._io.size - 4 + journal_v2_block_trailer: + seq: + - id: checksum + type: u4 + journal_v2_extent_list: + seq: + - id: datafile_offset + type: u8 + - id: datafile_size + type: u4 + - id: file_idx + type: u2 + - id: page_cnt + type: u1 + - id: padding + type: u1 + journal_v2_page_block: + seq: + - id: hdr + type: journal_v2_page_hdr + - id: page_list + type: journal_v2_page_list + repeat: expr + repeat-expr: hdr.entries + - id: block_trailer + type: journal_v2_block_trailer + jounral_v2_page_blocs: + seq: + - id: blocs + type: journal_v2_page_block + repeat: eos diff --git a/database/engine/metric.c b/database/engine/metric.c new file mode 100644 index 000000000..9dc9d9ebc --- /dev/null +++ b/database/engine/metric.c @@ -0,0 +1,875 @@ +#include "metric.h" + +typedef int32_t REFCOUNT; +#define REFCOUNT_DELETING (-100) + +typedef enum __attribute__ ((__packed__)) { + METRIC_FLAG_HAS_RETENTION = (1 << 0), +} METRIC_FLAGS; + +struct metric { + uuid_t uuid; // never changes + Word_t section; // never changes + + time_t first_time_s; // + time_t latest_time_s_clean; // archived pages latest time + time_t latest_time_s_hot; // latest time of the currently collected page + uint32_t latest_update_every_s; // + pid_t writer; + METRIC_FLAGS flags; + REFCOUNT refcount; + 
SPINLOCK spinlock; // protects all variable members + + // THIS IS allocated with malloc() + // YOU HAVE TO INITIALIZE IT YOURSELF ! +}; + +static struct aral_statistics mrg_aral_statistics; + +struct mrg { + ARAL *aral[MRG_PARTITIONS]; + + struct pgc_index { + netdata_rwlock_t rwlock; + Pvoid_t uuid_judy; // each UUID has a JudyL of sections (tiers) + } index[MRG_PARTITIONS]; + + struct mrg_statistics stats; + + size_t entries_per_partition[MRG_PARTITIONS]; +}; + +static inline void MRG_STATS_DUPLICATE_ADD(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.additions_duplicate, 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_ADDED_METRIC(MRG *mrg, size_t partition) { + __atomic_add_fetch(&mrg->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->stats.additions, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->stats.size, sizeof(METRIC), __ATOMIC_RELAXED); + + __atomic_add_fetch(&mrg->entries_per_partition[partition], 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_DELETED_METRIC(MRG *mrg, size_t partition) { + __atomic_sub_fetch(&mrg->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->stats.size, sizeof(METRIC), __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->stats.deletions, 1, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&mrg->entries_per_partition[partition], 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_SEARCH_HIT(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.search_hits, 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_SEARCH_MISS(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.search_misses, 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_DELETE_MISS(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.delete_misses, 1, __ATOMIC_RELAXED); +} + +static inline void mrg_index_read_lock(MRG *mrg, size_t partition) { + netdata_rwlock_rdlock(&mrg->index[partition].rwlock); +} +static inline void mrg_index_read_unlock(MRG *mrg, size_t partition) { + netdata_rwlock_unlock(&mrg->index[partition].rwlock); +} +static inline void mrg_index_write_lock(MRG *mrg, size_t partition) { + netdata_rwlock_wrlock(&mrg->index[partition].rwlock); +} +static inline void mrg_index_write_unlock(MRG *mrg, size_t partition) { + netdata_rwlock_unlock(&mrg->index[partition].rwlock); +} + +static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl) { + if(mem_after_judyl > mem_before_judyl) + __atomic_add_fetch(&mrg->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + else if(mem_after_judyl < mem_before_judyl) + __atomic_sub_fetch(&mrg->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); +} + +static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); +} + +static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg) { + __atomic_sub_fetch(&mrg->stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); +} + +static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) { + uint8_t *u = (uint8_t *)uuid; + return u[UUID_SZ - 1] % MRG_PARTITIONS; +} + +static inline bool metric_has_retention_unsafe(MRG *mrg __maybe_unused, METRIC *metric) { + bool has_retention = (metric->first_time_s || metric->latest_time_s_clean || metric->latest_time_s_hot); + + if(has_retention && !(metric->flags & METRIC_FLAG_HAS_RETENTION)) { + metric->flags |= METRIC_FLAG_HAS_RETENTION; + __atomic_add_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED); + } + else 
if(!has_retention && (metric->flags & METRIC_FLAG_HAS_RETENTION)) { + metric->flags &= ~METRIC_FLAG_HAS_RETENTION; + __atomic_sub_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED); + } + + return has_retention; +} + +static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, bool having_spinlock) { + REFCOUNT refcount; + + if(!having_spinlock) + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(metric->refcount < 0)) + fatal("METRIC: refcount is %d (negative) during acquire", metric->refcount); + + refcount = ++metric->refcount; + + // update its retention flags + metric_has_retention_unsafe(mrg, metric); + + if(!having_spinlock) + netdata_spinlock_unlock(&metric->spinlock); + + if(refcount == 1) + __atomic_add_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED); + + __atomic_add_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED); + + return refcount; +} + +static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, METRIC *metric) { + bool ret = true; + REFCOUNT refcount; + + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(metric->refcount <= 0)) + fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount); + + refcount = --metric->refcount; + + if(likely(metric_has_retention_unsafe(mrg, metric) || refcount != 0)) + ret = false; + + netdata_spinlock_unlock(&metric->spinlock); + + if(unlikely(!refcount)) + __atomic_sub_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED); + + return ret; +} + +static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { + size_t partition = uuid_partition(mrg, &entry->uuid); + + METRIC *allocation = aral_mallocz(mrg->aral[partition]); + + mrg_index_write_lock(mrg, partition); + + size_t mem_before_judyl, mem_after_judyl; + + Pvoid_t *sections_judy_pptr = JudyHSIns(&mrg->index[partition].uuid_judy, &entry->uuid, sizeof(uuid_t), PJE0); + if(unlikely(!sections_judy_pptr || sections_judy_pptr == PJERR)) + fatal("DBENGINE METRIC: corrupted UUIDs JudyHS array"); + + if(unlikely(!*sections_judy_pptr)) + mrg_stats_size_judyhs_added_uuid(mrg); + + mem_before_judyl = JudyLMemUsed(*sections_judy_pptr); + Pvoid_t *PValue = JudyLIns(sections_judy_pptr, entry->section, PJE0); + mem_after_judyl = JudyLMemUsed(*sections_judy_pptr); + mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl); + + if(unlikely(!PValue || PValue == PJERR)) + fatal("DBENGINE METRIC: corrupted section JudyL array"); + + if(unlikely(*PValue != NULL)) { + METRIC *metric = *PValue; + + metric_acquire(mrg, metric, false); + mrg_index_write_unlock(mrg, partition); + + if(ret) + *ret = false; + + aral_freez(mrg->aral[partition], allocation); + + MRG_STATS_DUPLICATE_ADD(mrg); + return metric; + } + + METRIC *metric = allocation; + uuid_copy(metric->uuid, entry->uuid); + metric->section = entry->section; + metric->first_time_s = entry->first_time_s; + metric->latest_time_s_clean = entry->last_time_s; + metric->latest_time_s_hot = 0; + metric->latest_update_every_s = entry->latest_update_every_s; + metric->writer = 0; + metric->refcount = 0; + metric->flags = 0; + netdata_spinlock_init(&metric->spinlock); + metric_acquire(mrg, metric, true); // no spinlock use required here + *PValue = metric; + + mrg_index_write_unlock(mrg, partition); + + if(ret) + *ret = true; + + MRG_STATS_ADDED_METRIC(mrg, partition); + + return metric; +} + +static METRIC *metric_get_and_acquire(MRG *mrg, uuid_t 
*uuid, Word_t section) { + size_t partition = uuid_partition(mrg, uuid); + + mrg_index_read_lock(mrg, partition); + + Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, uuid, sizeof(uuid_t)); + if(unlikely(!sections_judy_pptr)) { + mrg_index_read_unlock(mrg, partition); + MRG_STATS_SEARCH_MISS(mrg); + return NULL; + } + + Pvoid_t *PValue = JudyLGet(*sections_judy_pptr, section, PJE0); + if(unlikely(!PValue)) { + mrg_index_read_unlock(mrg, partition); + MRG_STATS_SEARCH_MISS(mrg); + return NULL; + } + + METRIC *metric = *PValue; + + metric_acquire(mrg, metric, false); + + mrg_index_read_unlock(mrg, partition); + + MRG_STATS_SEARCH_HIT(mrg); + return metric; +} + +static bool acquired_metric_del(MRG *mrg, METRIC *metric) { + size_t partition = uuid_partition(mrg, &metric->uuid); + + size_t mem_before_judyl, mem_after_judyl; + + mrg_index_write_lock(mrg, partition); + + if(!metric_release_and_can_be_deleted(mrg, metric)) { + mrg_index_write_unlock(mrg, partition); + __atomic_add_fetch(&mrg->stats.delete_having_retention_or_referenced, 1, __ATOMIC_RELAXED); + return false; + } + + Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t)); + if(unlikely(!sections_judy_pptr || !*sections_judy_pptr)) { + mrg_index_write_unlock(mrg, partition); + MRG_STATS_DELETE_MISS(mrg); + return false; + } + + mem_before_judyl = JudyLMemUsed(*sections_judy_pptr); + int rc = JudyLDel(sections_judy_pptr, metric->section, PJE0); + mem_after_judyl = JudyLMemUsed(*sections_judy_pptr); + mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl); + + if(unlikely(!rc)) { + mrg_index_write_unlock(mrg, partition); + MRG_STATS_DELETE_MISS(mrg); + return false; + } + + if(!*sections_judy_pptr) { + rc = JudyHSDel(&mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t), PJE0); + if(unlikely(!rc)) + fatal("DBENGINE METRIC: cannot delete UUID from JudyHS"); + mrg_stats_size_judyhs_removed_uuid(mrg); + } + + mrg_index_write_unlock(mrg, partition); + + aral_freez(mrg->aral[partition], metric); + + MRG_STATS_DELETED_METRIC(mrg, partition); + + return true; +} + +// ---------------------------------------------------------------------------- +// public API + +MRG *mrg_create(void) { + MRG *mrg = callocz(1, sizeof(MRG)); + + for(size_t i = 0; i < MRG_PARTITIONS ; i++) { + netdata_rwlock_init(&mrg->index[i].rwlock); + + char buf[ARAL_MAX_NAME + 1]; + snprintfz(buf, ARAL_MAX_NAME, "mrg[%zu]", i); + + mrg->aral[i] = aral_create(buf, + sizeof(METRIC), + 0, + 16384, + &mrg_aral_statistics, + NULL, NULL, false, + false); + } + + mrg->stats.size = sizeof(MRG); + + return mrg; +} + +size_t mrg_aral_structures(void) { + return aral_structures_from_stats(&mrg_aral_statistics); +} + +size_t mrg_aral_overhead(void) { + return aral_overhead_from_stats(&mrg_aral_statistics); +} + +void mrg_destroy(MRG *mrg __maybe_unused) { + // no destruction possible + // we can't traverse the metrics list + + // to delete entries, the caller needs to keep pointers to them + // and delete them one by one + + ; +} + +METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) { +// internal_fatal(entry.latest_time_s > max_acceptable_collected_time(), +// "DBENGINE METRIC: metric latest time is in the future"); + + return metric_add_and_acquire(mrg, &entry, ret); +} + +METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { + return metric_get_and_acquire(mrg, uuid, section); +} + +bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) { 
+ return acquired_metric_del(mrg, metric); +} + +METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) { + metric_acquire(mrg, metric, false); + return metric; +} + +bool mrg_metric_release(MRG *mrg, METRIC *metric) { + return metric_release_and_can_be_deleted(mrg, metric); +} + +Word_t mrg_metric_id(MRG *mrg __maybe_unused, METRIC *metric) { + return (Word_t)metric; +} + +uuid_t *mrg_metric_uuid(MRG *mrg __maybe_unused, METRIC *metric) { + return &metric->uuid; +} + +Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) { + return metric->section; +} + +bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { + netdata_spinlock_lock(&metric->spinlock); + metric->first_time_s = first_time_s; + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + + return true; +} + +void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s) { + + internal_fatal(first_time_s > max_acceptable_collected_time(), + "DBENGINE METRIC: metric first time is in the future"); + internal_fatal(last_time_s > max_acceptable_collected_time(), + "DBENGINE METRIC: metric last time is in the future"); + + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(first_time_s && (!metric->first_time_s || first_time_s < metric->first_time_s))) + metric->first_time_s = first_time_s; + + if(likely(last_time_s && (!metric->latest_time_s_clean || last_time_s > metric->latest_time_s_clean))) { + metric->latest_time_s_clean = last_time_s; + + if(likely(update_every_s)) + metric->latest_update_every_s = update_every_s; + } + else if(unlikely(!metric->latest_update_every_s && update_every_s)) + metric->latest_update_every_s = update_every_s; + + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); +} + +bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { + bool ret = false; + + netdata_spinlock_lock(&metric->spinlock); + if(first_time_s > metric->first_time_s) { + metric->first_time_s = first_time_s; + ret = true; + } + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + + return ret; +} + +time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) { + time_t first_time_s; + + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(!metric->first_time_s)) { + if(metric->latest_time_s_clean) + metric->first_time_s = metric->latest_time_s_clean; + + else if(metric->latest_time_s_hot) + metric->first_time_s = metric->latest_time_s_hot; + } + + first_time_s = metric->first_time_s; + + netdata_spinlock_unlock(&metric->spinlock); + + return first_time_s; +} + +void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) { + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(!metric->first_time_s)) { + if(metric->latest_time_s_clean) + metric->first_time_s = metric->latest_time_s_clean; + + else if(metric->latest_time_s_hot) + metric->first_time_s = metric->latest_time_s_hot; + } + + *first_time_s = metric->first_time_s; + *last_time_s = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); + *update_every_s = metric->latest_update_every_s; + + netdata_spinlock_unlock(&metric->spinlock); +} + +bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { + netdata_spinlock_lock(&metric->spinlock); + +// 
internal_fatal(latest_time_s > max_acceptable_collected_time(), +// "DBENGINE METRIC: metric latest time is in the future"); + +// internal_fatal(metric->latest_time_s_clean > latest_time_s, +// "DBENGINE METRIC: metric new clean latest time is older than the previous one"); + + metric->latest_time_s_clean = latest_time_s; + + if(unlikely(!metric->first_time_s)) + metric->first_time_s = latest_time_s; + +// if(unlikely(metric->first_time_s > latest_time_s)) +// metric->first_time_s = latest_time_s; + + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + return true; +} + +// returns true when metric still has retention +bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { + Word_t section = mrg_metric_section(mrg, metric); + bool do_again = false; + size_t countdown = 5; + bool ret = true; + + do { + time_t min_first_time_s = LONG_MAX; + time_t max_end_time_s = 0; + PGC_PAGE *page; + PGC_SEARCH method = PGC_SEARCH_FIRST; + time_t page_first_time_s = 0; + time_t page_end_time_s = 0; + while ((page = pgc_page_get_and_acquire(main_cache, section, (Word_t)metric, page_first_time_s, method))) { + method = PGC_SEARCH_NEXT; + + bool is_hot = pgc_is_page_hot(page); + bool is_dirty = pgc_is_page_dirty(page); + page_first_time_s = pgc_page_start_time_s(page); + page_end_time_s = pgc_page_end_time_s(page); + + if ((is_hot || is_dirty) && page_first_time_s < min_first_time_s) + min_first_time_s = page_first_time_s; + + if (is_dirty && page_end_time_s > max_end_time_s) + max_end_time_s = page_end_time_s; + + pgc_page_release(main_cache, page); + } + + if (min_first_time_s == LONG_MAX) + min_first_time_s = 0; + + netdata_spinlock_lock(&metric->spinlock); + if (--countdown && !min_first_time_s && metric->latest_time_s_hot) + do_again = true; + else { + internal_error(!countdown, "METRIC: giving up on updating the retention of metric without disk retention"); + + do_again = false; + metric->first_time_s = min_first_time_s; + metric->latest_time_s_clean = max_end_time_s; + + ret = metric_has_retention_unsafe(mrg, metric); + } + netdata_spinlock_unlock(&metric->spinlock); + } while(do_again); + + return ret; +} + +bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { +// internal_fatal(latest_time_s > max_acceptable_collected_time(), +// "DBENGINE METRIC: metric latest time is in the future"); + + netdata_spinlock_lock(&metric->spinlock); + metric->latest_time_s_hot = latest_time_s; + + if(unlikely(!metric->first_time_s)) + metric->first_time_s = latest_time_s; + +// if(unlikely(metric->first_time_s > latest_time_s)) +// metric->first_time_s = latest_time_s; + + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + return true; +} + +time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) { + time_t max; + netdata_spinlock_lock(&metric->spinlock); + max = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); + netdata_spinlock_unlock(&metric->spinlock); + return max; +} + +bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { + if(!update_every_s) + return false; + + netdata_spinlock_lock(&metric->spinlock); + metric->latest_update_every_s = update_every_s; + netdata_spinlock_unlock(&metric->spinlock); + + return true; +} + +bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { + if(!update_every_s) + return false; + + 
netdata_spinlock_lock(&metric->spinlock); + if(!metric->latest_update_every_s) + metric->latest_update_every_s = update_every_s; + netdata_spinlock_unlock(&metric->spinlock); + + return true; +} + +time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, METRIC *metric) { + time_t update_every_s; + + netdata_spinlock_lock(&metric->spinlock); + update_every_s = metric->latest_update_every_s; + netdata_spinlock_unlock(&metric->spinlock); + + return update_every_s; +} + +bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) { + bool done = false; + netdata_spinlock_lock(&metric->spinlock); + if(!metric->writer) { + metric->writer = gettid(); + __atomic_add_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED); + done = true; + } + else + __atomic_add_fetch(&mrg->stats.writers_conflicts, 1, __ATOMIC_RELAXED); + netdata_spinlock_unlock(&metric->spinlock); + return done; +} + +bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) { + bool done = false; + netdata_spinlock_lock(&metric->spinlock); + if(metric->writer) { + metric->writer = 0; + __atomic_sub_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED); + done = true; + } + netdata_spinlock_unlock(&metric->spinlock); + return done; +} + +struct mrg_statistics mrg_get_statistics(MRG *mrg) { + // FIXME - use atomics + return mrg->stats; +} + +// ---------------------------------------------------------------------------- +// unit test + +#ifdef MRG_STRESS_TEST + +static void mrg_stress(MRG *mrg, size_t entries, size_t sections) { + bool ret; + + info("DBENGINE METRIC: stress testing %zu entries on %zu sections...", entries, sections); + + METRIC *array[entries][sections]; + for(size_t i = 0; i < entries ; i++) { + MRG_ENTRY e = { + .first_time_s = (time_t)(i + 1), + .latest_time_s = (time_t)(i + 2), + .latest_update_every_s = (time_t)(i + 3), + }; + uuid_generate_random(e.uuid); + + for(size_t section = 0; section < sections ;section++) { + e.section = section; + array[i][section] = mrg_metric_add_and_acquire(mrg, e, &ret); + if(!ret) + fatal("DBENGINE METRIC: failed to add metric %zu, section %zu", i, section); + + if(mrg_metric_add_and_acquire(mrg, e, &ret) != array[i][section]) + fatal("DBENGINE METRIC: adding the same metric twice, returns a different metric"); + + if(ret) + fatal("DBENGINE METRIC: adding the same metric twice, returns success"); + + if(mrg_metric_get_and_acquire(mrg, &e.uuid, e.section) != array[i][section]) + fatal("DBENGINE METRIC: cannot get back the same metric"); + + if(uuid_compare(*mrg_metric_uuid(mrg, array[i][section]), e.uuid) != 0) + fatal("DBENGINE METRIC: uuids do not match"); + } + } + + for(size_t i = 0; i < entries ; i++) { + for (size_t section = 0; section < sections; section++) { + uuid_t uuid; + uuid_generate_random(uuid); + + if(mrg_metric_get_and_acquire(mrg, &uuid, section)) + fatal("DBENGINE METRIC: found non-existing uuid"); + + if(mrg_metric_id(mrg, array[i][section]) != (Word_t)array[i][section]) + fatal("DBENGINE METRIC: metric id does not match"); + + if(mrg_metric_get_first_time_s(mrg, array[i][section]) != (time_t)(i + 1)) + fatal("DBENGINE METRIC: wrong first time returned"); + if(mrg_metric_get_latest_time_s(mrg, array[i][section]) != (time_t)(i + 2)) + fatal("DBENGINE METRIC: wrong latest time returned"); + if(mrg_metric_get_update_every_s(mrg, array[i][section]) != (time_t)(i + 3)) + fatal("DBENGINE METRIC: wrong latest time returned"); + + if(!mrg_metric_set_first_time_s(mrg, array[i][section], (time_t)((i + 1) * 2))) + fatal("DBENGINE METRIC: cannot set first time"); + 
if(!mrg_metric_set_clean_latest_time_s(mrg, array[i][section], (time_t) ((i + 1) * 3))) + fatal("DBENGINE METRIC: cannot set latest time"); + if(!mrg_metric_set_update_every(mrg, array[i][section], (time_t)((i + 1) * 4))) + fatal("DBENGINE METRIC: cannot set update every"); + + if(mrg_metric_get_first_time_s(mrg, array[i][section]) != (time_t)((i + 1) * 2)) + fatal("DBENGINE METRIC: wrong first time returned"); + if(mrg_metric_get_latest_time_s(mrg, array[i][section]) != (time_t)((i + 1) * 3)) + fatal("DBENGINE METRIC: wrong latest time returned"); + if(mrg_metric_get_update_every_s(mrg, array[i][section]) != (time_t)((i + 1) * 4)) + fatal("DBENGINE METRIC: wrong latest time returned"); + } + } + + for(size_t i = 0; i < entries ; i++) { + for (size_t section = 0; section < sections; section++) { + if(!mrg_metric_release_and_delete(mrg, array[i][section])) + fatal("DBENGINE METRIC: failed to delete metric"); + } + } +} + +static void *mrg_stress_test_thread1(void *ptr) { + MRG *mrg = ptr; + + for(int i = 0; i < 5 ; i++) + mrg_stress(mrg, 10000, 5); + + return ptr; +} + +static void *mrg_stress_test_thread2(void *ptr) { + MRG *mrg = ptr; + + for(int i = 0; i < 10 ; i++) + mrg_stress(mrg, 500, 50); + + return ptr; +} + +static void *mrg_stress_test_thread3(void *ptr) { + MRG *mrg = ptr; + + for(int i = 0; i < 50 ; i++) + mrg_stress(mrg, 5000, 1); + + return ptr; +} +#endif + +int mrg_unittest(void) { + MRG *mrg = mrg_create(); + METRIC *m1_t0, *m2_t0, *m3_t0, *m4_t0; + METRIC *m1_t1, *m2_t1, *m3_t1, *m4_t1; + bool ret; + + MRG_ENTRY entry = { + .section = 0, + .first_time_s = 2, + .last_time_s = 3, + .latest_update_every_s = 4, + }; + uuid_generate(entry.uuid); + m1_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(!ret) + fatal("DBENGINE METRIC: failed to add metric"); + + // add the same metric again + m2_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(m2_t0 != m1_t0) + fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer"); + if(ret) + fatal("DBENGINE METRIC: managed to add the same metric twice"); + + m3_t0 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section); + if(m3_t0 != m1_t0) + fatal("DBENGINE METRIC: cannot find the metric added"); + + // add the same metric again + m4_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(m4_t0 != m1_t0) + fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer"); + if(ret) + fatal("DBENGINE METRIC: managed to add the same metric twice"); + + // add the same metric in another section + entry.section = 1; + m1_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(!ret) + fatal("DBENGINE METRIC: failed to add metric in section %zu", (size_t)entry.section); + + // add the same metric again + m2_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(m2_t1 != m1_t1) + fatal("DBENGINE METRIC: adding the same metric twice (section %zu), does not return the same pointer", (size_t)entry.section); + if(ret) + fatal("DBENGINE METRIC: managed to add the same metric twice in (section 0)"); + + m3_t1 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section); + if(m3_t1 != m1_t1) + fatal("DBENGINE METRIC: cannot find the metric added (section %zu)", (size_t)entry.section); + + // delete the first metric + mrg_metric_release(mrg, m2_t0); + mrg_metric_release(mrg, m3_t0); + mrg_metric_release(mrg, m4_t0); + mrg_metric_set_first_time_s(mrg, m1_t0, 0); + mrg_metric_set_clean_latest_time_s(mrg, m1_t0, 0); + mrg_metric_set_hot_latest_time_s(mrg, m1_t0, 0); + 
if(!mrg_metric_release_and_delete(mrg, m1_t0)) + fatal("DBENGINE METRIC: cannot delete the first metric"); + + m4_t1 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section); + if(m4_t1 != m1_t1) + fatal("DBENGINE METRIC: cannot find the metric added (section %zu), after deleting the first one", (size_t)entry.section); + + // delete the second metric + mrg_metric_release(mrg, m2_t1); + mrg_metric_release(mrg, m3_t1); + mrg_metric_release(mrg, m4_t1); + mrg_metric_set_first_time_s(mrg, m1_t1, 0); + mrg_metric_set_clean_latest_time_s(mrg, m1_t1, 0); + mrg_metric_set_hot_latest_time_s(mrg, m1_t1, 0); + if(!mrg_metric_release_and_delete(mrg, m1_t1)) + fatal("DBENGINE METRIC: cannot delete the second metric"); + + if(mrg->stats.entries != 0) + fatal("DBENGINE METRIC: invalid entries counter"); + +#ifdef MRG_STRESS_TEST + usec_t started_ut = now_monotonic_usec(); + pthread_t thread1; + netdata_thread_create(&thread1, "TH1", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress_test_thread1, mrg); + + pthread_t thread2; + netdata_thread_create(&thread2, "TH2", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress_test_thread2, mrg); + + pthread_t thread3; + netdata_thread_create(&thread3, "TH3", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress_test_thread3, mrg); + + + sleep_usec(5 * USEC_PER_SEC); + + netdata_thread_cancel(thread1); + netdata_thread_cancel(thread2); + netdata_thread_cancel(thread3); + + netdata_thread_join(thread1, NULL); + netdata_thread_join(thread2, NULL); + netdata_thread_join(thread3, NULL); + usec_t ended_ut = now_monotonic_usec(); + + info("DBENGINE METRIC: did %zu additions, %zu duplicate additions, " + "%zu deletions, %zu wrong deletions, " + "%zu successful searches, %zu wrong searches, " + "%zu successful pointer validations, %zu wrong pointer validations " + "in %llu usecs", + mrg->stats.additions, mrg->stats.additions_duplicate, + mrg->stats.deletions, mrg->stats.delete_misses, + mrg->stats.search_hits, mrg->stats.search_misses, + mrg->stats.pointer_validation_hits, mrg->stats.pointer_validation_misses, + ended_ut - started_ut); + +#endif + + mrg_destroy(mrg); + + info("DBENGINE METRIC: all tests passed!"); + + return 0; +} diff --git a/database/engine/metric.h b/database/engine/metric.h new file mode 100644 index 000000000..82aff903a --- /dev/null +++ b/database/engine/metric.h @@ -0,0 +1,79 @@ +#ifndef DBENGINE_METRIC_H +#define DBENGINE_METRIC_H + +#include "../rrd.h" + +#define MRG_PARTITIONS 10 + +typedef struct metric METRIC; +typedef struct mrg MRG; + +typedef struct mrg_entry { + uuid_t uuid; + Word_t section; + time_t first_time_s; + time_t last_time_s; + uint32_t latest_update_every_s; +} MRG_ENTRY; + +struct mrg_statistics { + size_t entries; + size_t entries_referenced; + size_t entries_with_retention; + + size_t size; // total memory used, with indexing + + size_t current_references; + + size_t additions; + size_t additions_duplicate; + + size_t deletions; + size_t delete_having_retention_or_referenced; + size_t delete_misses; + + size_t search_hits; + size_t search_misses; + + size_t writers; + size_t writers_conflicts; +}; + +MRG *mrg_create(void); +void mrg_destroy(MRG *mrg); + +METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric); +bool mrg_metric_release(MRG *mrg, METRIC *metric); + +METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret); +METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section); +bool 
mrg_metric_release_and_delete(MRG *mrg, METRIC *metric); + +Word_t mrg_metric_id(MRG *mrg, METRIC *metric); +uuid_t *mrg_metric_uuid(MRG *mrg, METRIC *metric); +Word_t mrg_metric_section(MRG *mrg, METRIC *metric); + +bool mrg_metric_set_first_time_s(MRG *mrg, METRIC *metric, time_t first_time_s); +bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg, METRIC *metric, time_t first_time_s); +time_t mrg_metric_get_first_time_s(MRG *mrg, METRIC *metric); + +bool mrg_metric_set_clean_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s); +bool mrg_metric_set_hot_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s); +time_t mrg_metric_get_latest_time_s(MRG *mrg, METRIC *metric); + +bool mrg_metric_set_update_every(MRG *mrg, METRIC *metric, time_t update_every_s); +bool mrg_metric_set_update_every_s_if_zero(MRG *mrg, METRIC *metric, time_t update_every_s); +time_t mrg_metric_get_update_every_s(MRG *mrg, METRIC *metric); + +void mrg_metric_expand_retention(MRG *mrg, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s); +void mrg_metric_get_retention(MRG *mrg, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s); +bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric); + +bool mrg_metric_set_writer(MRG *mrg, METRIC *metric); +bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric); + +struct mrg_statistics mrg_get_statistics(MRG *mrg); +size_t mrg_aral_structures(void); +size_t mrg_aral_overhead(void); + +#endif // DBENGINE_METRIC_H diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index 4f5da7084..b4902d784 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -3,1084 +3,836 @@ #include "rrdengine.h" -ARAL page_descr_aral = { - .requested_element_size = sizeof(struct rrdeng_page_descr), - .initial_elements = 20000, - .filename = "page_descriptors", - .cache_dir = &netdata_configured_cache_dir, - .use_mmap = false, - .internal.initialized = false -}; - -void rrdeng_page_descr_aral_go_singlethreaded(void) { - page_descr_aral.internal.lockless = true; -} -void rrdeng_page_descr_aral_go_multithreaded(void) { - page_descr_aral.internal.lockless = false; -} +MRG *main_mrg = NULL; +PGC *main_cache = NULL; +PGC *open_cache = NULL; +PGC *extent_cache = NULL; +struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats = {}; -struct rrdeng_page_descr *rrdeng_page_descr_mallocz(void) { - struct rrdeng_page_descr *descr; - descr = arrayalloc_mallocz(&page_descr_aral); - return descr; +static void main_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) +{ + // Release storage associated with the page + dbengine_page_free(entry.data, entry.size); } +static void main_cache_flush_dirty_page_init_callback(PGC *cache __maybe_unused, Word_t section) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *) section; -void rrdeng_page_descr_freez(struct rrdeng_page_descr *descr) { - arrayalloc_freez(&page_descr_aral, descr); + // mark ctx as having flushing in progress + __atomic_add_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED); } -void rrdeng_page_descr_use_malloc(void) { - if(page_descr_aral.internal.initialized) - error("DBENGINE: cannot change ARAL allocation policy after it has been initialized."); - else - page_descr_aral.use_mmap = false; -} +static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array 
__maybe_unused, size_t entries __maybe_unused) +{ + if(!entries) + return; -void rrdeng_page_descr_use_mmap(void) { - if(page_descr_aral.internal.initialized) - error("DBENGINE: cannot change ARAL allocation policy after it has been initialized."); - else - page_descr_aral.use_mmap = true; -} + struct rrdengine_instance *ctx = (struct rrdengine_instance *) entries_array[0].section; -bool rrdeng_page_descr_is_mmap(void) { - return page_descr_aral.use_mmap; -} + size_t bytes_per_point = CTX_POINT_SIZE_BYTES(ctx); -/* Forward declarations */ -static int pg_cache_try_evict_one_page_unsafe(struct rrdengine_instance *ctx); + struct page_descr_with_data *base = NULL; -/* always inserts into tail */ -static inline void pg_cache_replaceQ_insert_unsafe(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + for (size_t Index = 0 ; Index < entries; Index++) { + time_t start_time_s = entries_array[Index].start_time_s; + time_t end_time_s = entries_array[Index].end_time_s; + struct page_descr_with_data *descr = page_descriptor_get(); - if (likely(NULL != pg_cache->replaceQ.tail)) { - pg_cache_descr->prev = pg_cache->replaceQ.tail; - pg_cache->replaceQ.tail->next = pg_cache_descr; - } - if (unlikely(NULL == pg_cache->replaceQ.head)) { - pg_cache->replaceQ.head = pg_cache_descr; - } - pg_cache->replaceQ.tail = pg_cache_descr; -} - -static inline void pg_cache_replaceQ_delete_unsafe(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr, *prev, *next; + descr->id = mrg_metric_uuid(main_mrg, (METRIC *) entries_array[Index].metric_id); + descr->metric_id = entries_array[Index].metric_id; + descr->start_time_ut = start_time_s * USEC_PER_SEC; + descr->end_time_ut = end_time_s * USEC_PER_SEC; + descr->update_every_s = entries_array[Index].update_every_s; + descr->type = ctx->config.page_type; - prev = pg_cache_descr->prev; - next = pg_cache_descr->next; + descr->page_length = (end_time_s - (start_time_s - descr->update_every_s)) / descr->update_every_s * bytes_per_point; - if (likely(NULL != prev)) { - prev->next = next; - } - if (likely(NULL != next)) { - next->prev = prev; - } - if (unlikely(pg_cache_descr == pg_cache->replaceQ.head)) { - pg_cache->replaceQ.head = next; - } - if (unlikely(pg_cache_descr == pg_cache->replaceQ.tail)) { - pg_cache->replaceQ.tail = prev; - } - pg_cache_descr->prev = pg_cache_descr->next = NULL; -} + if(descr->page_length > entries_array[Index].size) { + descr->page_length = entries_array[Index].size; -void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - pg_cache_replaceQ_insert_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); -} + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE: page exceeds the maximum size, adjusting it to max."); + } -void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + descr->page = pgc_page_data(pages_array[Index]); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, descr, link.prev, link.next); - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - pg_cache_replaceQ_delete_unsafe(ctx, descr); - 
uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); -} -void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + internal_fatal(descr->page_length > RRDENG_BLOCK_SIZE, "DBENGINE: faulty page length calculation"); + } - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - pg_cache_replaceQ_delete_unsafe(ctx, descr); - pg_cache_replaceQ_insert_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + struct completion completion; + completion_init(&completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_EXTENT_WRITE, base, &completion, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + completion_wait_for(&completion); + completion_destroy(&completion); } -struct rrdeng_page_descr *pg_cache_create_descr(void) +static void open_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { - struct rrdeng_page_descr *descr; - - descr = rrdeng_page_descr_mallocz(); - descr->page_length = 0; - descr->start_time_ut = INVALID_TIME; - descr->end_time_ut = INVALID_TIME; - descr->id = NULL; - descr->extent = NULL; - descr->pg_cache_descr_state = 0; - descr->pg_cache_descr = NULL; - descr->update_every_s = 0; - - return descr; + struct rrdengine_datafile *datafile = entry.data; + datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE); } -/* The caller must hold page descriptor lock. */ -void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_descr *descr) +static void open_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - if (pg_cache_descr->waiters) - uv_cond_broadcast(&pg_cache_descr->cond); + ; } -void pg_cache_wake_up_waiters(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +static void extent_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); + dbengine_extent_free(entry.data, entry.size); } -/* - * The caller must hold page descriptor lock. - * The lock will be released and re-acquired. The descriptor is not guaranteed - * to exist after this function returns. - */ -void pg_cache_wait_event_unsafe(struct rrdeng_page_descr *descr) +static void extent_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - - ++pg_cache_descr->waiters; - uv_cond_wait(&pg_cache_descr->cond, &pg_cache_descr->mutex); - --pg_cache_descr->waiters; + ; } -/* - * The caller must hold page descriptor lock. - * The lock will be released and re-acquired. The descriptor is not guaranteed - * to exist after this function returns. - * Returns UV_ETIMEDOUT if timeout_sec seconds pass. 
- */ -int pg_cache_timedwait_event_unsafe(struct rrdeng_page_descr *descr, uint64_t timeout_sec) -{ - int ret; - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; +inline TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s) { + // page_first_time_s <= wanted_end_time_s && page_last_time_s >= wanted_start_time_s + + if(page_last_time_s < wanted_start_time_s) + return PAGE_IS_IN_THE_PAST; - ++pg_cache_descr->waiters; - ret = uv_cond_timedwait(&pg_cache_descr->cond, &pg_cache_descr->mutex, timeout_sec * NSEC_PER_SEC); - --pg_cache_descr->waiters; + if(page_first_time_s > wanted_end_time_s) + return PAGE_IS_IN_THE_FUTURE; - return ret; + return PAGE_IS_IN_RANGE; } -/* - * Returns page flags. - * The lock will be released and re-acquired. The descriptor is not guaranteed - * to exist after this function returns. - */ -unsigned long pg_cache_wait_event(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +static int journal_metric_uuid_compare(const void *key, const void *metric) { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - unsigned long flags; - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_wait_event_unsafe(descr); - flags = pg_cache_descr->flags; - rrdeng_page_descr_mutex_unlock(ctx, descr); - - return flags; + return uuid_compare(*(uuid_t *) key, ((struct journal_metric_list *) metric)->uuid); } -/* - * The caller must hold page descriptor lock. - */ -int pg_cache_can_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; +static inline struct page_details *pdc_find_page_for_time( + Pcvoid_t PArray, + time_t wanted_time_s, + size_t *gaps, + PDC_PAGE_STATUS mode, + PDC_PAGE_STATUS skip_list +) { + Word_t PIndexF = wanted_time_s, PIndexL = wanted_time_s; + Pvoid_t *PValueF, *PValueL; + struct page_details *pdF = NULL, *pdL = NULL; + bool firstF = true, firstL = true; + + PDC_PAGE_STATUS ignore_list = PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | skip_list; + + while ((PValueF = PDCJudyLFirstThenNext(PArray, &PIndexF, &firstF))) { + pdF = *PValueF; + + PDC_PAGE_STATUS status = __atomic_load_n(&pdF->status, __ATOMIC_ACQUIRE); + if (!(status & (ignore_list | mode))) + break; - if ((pg_cache_descr->flags & (RRD_PAGE_LOCKED | RRD_PAGE_READ_PENDING)) || - (exclusive_access && pg_cache_descr->refcnt)) { - return 0; + pdF = NULL; } - return 1; -} + while ((PValueL = PDCJudyLLastThenPrev(PArray, &PIndexL, &firstL))) { + pdL = *PValueL; -/* - * The caller must hold page descriptor lock. - * Gets a reference to the page descriptor. - * Returns 1 on success and 0 on failure. - */ -int pg_cache_try_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + PDC_PAGE_STATUS status = __atomic_load_n(&pdL->status, __ATOMIC_ACQUIRE); + if(status & mode) { + // don't go all the way back to the beginning + // stop at the last processed + pdL = NULL; + break; + } - if (!pg_cache_can_get_unsafe(descr, exclusive_access)) - return 0; + if (!(status & ignore_list)) + break; - if (exclusive_access) - pg_cache_descr->flags |= RRD_PAGE_LOCKED; - ++pg_cache_descr->refcnt; + pdL = NULL; + } - return 1; -} + TIME_RANGE_COMPARE rcF = (pdF) ? is_page_in_time_range(pdF->first_time_s, pdF->last_time_s, wanted_time_s, wanted_time_s) : PAGE_IS_IN_THE_FUTURE; + TIME_RANGE_COMPARE rcL = (pdL) ? 
is_page_in_time_range(pdL->first_time_s, pdL->last_time_s, wanted_time_s, wanted_time_s) : PAGE_IS_IN_THE_PAST; -/* - * The caller must hold the page descriptor lock. - * This function may block doing cleanup. - */ -void pg_cache_put_unsafe(struct rrdeng_page_descr *descr) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + if (!pdF || pdF == pdL) { + // F is missing, or they are the same + // return L + (*gaps) += (rcL == PAGE_IS_IN_RANGE) ? 0 : 1; + return pdL; + } - pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; - if (0 == --pg_cache_descr->refcnt) { - pg_cache_wake_up_waiters_unsafe(descr); + if (!pdL) { + // L is missing + // return F + (*gaps) += (rcF == PAGE_IS_IN_RANGE) ? 0 : 1; + return pdF; } -} -/* - * This function may block doing cleanup. - */ -void pg_cache_put(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_put_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); -} + if (rcF == rcL) { + // both are on the same side, + // but they are different pages -/* The caller must hold the page cache lock */ -static void pg_cache_release_pages_unsafe(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + switch (rcF) { + case PAGE_IS_IN_RANGE: + // pick the higher resolution + if (pdF->update_every_s && pdF->update_every_s < pdL->update_every_s) + return pdF; - pg_cache->populated_pages -= number; -} + if (pdL->update_every_s && pdL->update_every_s < pdF->update_every_s) + return pdL; -static void pg_cache_release_pages(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + // same resolution - pick the one that starts earlier + if (pdL->first_time_s < pdF->first_time_s) + return pdL; - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - pg_cache_release_pages_unsafe(ctx, number); - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); -} + return pdF; + break; -/* - * This function returns the maximum number of pages allowed in the page cache. - */ -unsigned long pg_cache_hard_limit(struct rrdengine_instance *ctx) -{ - return ctx->max_cache_pages + (unsigned long)ctx->metric_API_max_producers; -} + case PAGE_IS_IN_THE_FUTURE: + (*gaps)++; -/* - * This function returns the low watermark number of pages in the page cache. The page cache should strive to keep the - * number of pages below that number. - */ -unsigned long pg_cache_soft_limit(struct rrdengine_instance *ctx) -{ - return ctx->cache_pages_low_watermark + (unsigned long)ctx->metric_API_max_producers; -} + // pick the one that starts earlier + if (pdL->first_time_s < pdF->first_time_s) + return pdL; -/* - * This function returns the maximum number of dirty pages that are committed to be written to disk allowed in the page - * cache. - */ -unsigned long pg_cache_committed_hard_limit(struct rrdengine_instance *ctx) -{ - /* We remove the active pages of the producers from the calculation and only allow the extra pinned pages */ - return ctx->cache_pages_low_watermark + (unsigned long)ctx->metric_API_max_producers; -} + return pdF; + break; -/* - * This function will block until it reserves #number populated pages. - * It will trigger evictions or dirty page flushing if the pg_cache_hard_limit() limit is hit. 
- */ -static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned failures = 0; - const unsigned FAILURES_CEILING = 10; /* truncates exponential backoff to (2^FAILURES_CEILING x slot) */ - unsigned long exp_backoff_slot_usec = USEC_PER_MS * 10; - - assert(number < ctx->max_cache_pages); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - if (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) - debug(D_RRDENGINE, "==Page cache full. Reserving %u pages.==", - number); - while (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) { - - if (!pg_cache_try_evict_one_page_unsafe(ctx)) { - /* failed to evict */ - struct completion compl; - struct rrdeng_cmd cmd; - - ++failures; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - - completion_init(&compl); - cmd.opcode = RRDENG_FLUSH_PAGES; - cmd.completion = &compl; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); - /* wait for some pages to be flushed */ - debug(D_RRDENGINE, "%s: waiting for pages to be written to disk before evicting.", __func__); - completion_wait_for(&compl); - completion_destroy(&compl); - - if (unlikely(failures > 1)) { - unsigned long slots, usecs_to_sleep; - /* exponential backoff */ - slots = random() % (2LU << MIN(failures, FAILURES_CEILING)); - usecs_to_sleep = slots * exp_backoff_slot_usec; - - if (usecs_to_sleep >= USEC_PER_SEC) - error("Page cache is full. Sleeping for %llu second(s).", usecs_to_sleep / USEC_PER_SEC); - - (void)sleep_usec(usecs_to_sleep); - } - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); + default: + case PAGE_IS_IN_THE_PAST: + (*gaps)++; + return NULL; + break; } } - pg_cache->populated_pages += number; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); -} -/* - * This function will attempt to reserve #number populated pages. - * It may trigger evictions if the pg_cache_soft_limit() limit is hit. - * Returns 0 on failure and 1 on success. - */ -static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned count = 0; - int ret = 0; - - assert(number < ctx->max_cache_pages); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - if (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1) { - debug(D_RRDENGINE, - "==Page cache full. 
Trying to reserve %u pages.==", - number); - do { - if (!pg_cache_try_evict_one_page_unsafe(ctx)) - break; - ++count; - } while (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1); - debug(D_RRDENGINE, "Evicted %u pages.", count); + if(rcF == PAGE_IS_IN_RANGE) { + // (*gaps) += 0; + return pdF; } - if (pg_cache->populated_pages + number < pg_cache_hard_limit(ctx) + 1) { - pg_cache->populated_pages += number; - ret = 1; /* success */ + if(rcL == PAGE_IS_IN_RANGE) { + // (*gaps) += 0; + return pdL; } - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - return ret; -} + if(rcF == PAGE_IS_IN_THE_FUTURE) { + (*gaps)++; + return pdF; + } -/* The caller must hold the page cache and the page descriptor locks in that order */ -static void pg_cache_evict_unsafe(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + if(rcL == PAGE_IS_IN_THE_FUTURE) { + (*gaps)++; + return pdL; + } - dbengine_page_free(pg_cache_descr->page); - pg_cache_descr->page = NULL; - pg_cache_descr->flags &= ~RRD_PAGE_POPULATED; - pg_cache_release_pages_unsafe(ctx, 1); - ++ctx->stats.pg_cache_evictions; + // impossible case + (*gaps)++; + return NULL; } -/* - * The caller must hold the page cache lock. - * Lock order: page cache -> replaceQ -> page descriptor - * This function iterates all pages and tries to evict one. - * If it fails it sets in_flight_descr to the oldest descriptor that has write-back in progress, - * or it sets it to NULL if no write-back is in progress. - * - * Returns 1 on success and 0 on failure. - */ -static int pg_cache_try_evict_one_page_unsafe(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned long old_flags; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr = NULL; +static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengine_instance *ctx, + time_t wanted_start_time_s, time_t wanted_end_time_s, + Pvoid_t *JudyL_page_array, size_t *cache_gaps, + bool open_cache_mode, PDC_PAGE_STATUS tags) { - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - for (pg_cache_descr = pg_cache->replaceQ.head ; NULL != pg_cache_descr ; pg_cache_descr = pg_cache_descr->next) { - descr = pg_cache_descr->descr; + size_t pages_found_in_cache = 0; + Word_t metric_id = mrg_metric_id(main_mrg, metric); - rrdeng_page_descr_mutex_lock(ctx, descr); - old_flags = pg_cache_descr->flags; - if ((old_flags & RRD_PAGE_POPULATED) && !(old_flags & RRD_PAGE_DIRTY) && pg_cache_try_get_unsafe(descr, 1)) { - /* must evict */ - pg_cache_evict_unsafe(ctx, descr); - pg_cache_put_unsafe(descr); - pg_cache_replaceQ_delete_unsafe(ctx, descr); + time_t now_s = wanted_start_time_s; + time_t dt_s = mrg_metric_get_update_every_s(main_mrg, metric); - rrdeng_page_descr_mutex_unlock(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + if(!dt_s) + dt_s = default_rrd_update_every; - rrdeng_try_deallocate_pg_cache_descr(ctx, descr); + time_t previous_page_end_time_s = now_s - dt_s; + bool first = true; - return 1; - } - rrdeng_page_descr_mutex_unlock(ctx, descr); - } - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + do { + PGC_PAGE *page = pgc_page_get_and_acquire( + cache, (Word_t)ctx, (Word_t)metric_id, now_s, + (first) ? PGC_SEARCH_CLOSEST : PGC_SEARCH_NEXT); - /* failed to evict */ - return 0; -} + first = false; -/** - * Deletes a page from the database. - * Callers of this function need to make sure they're not deleting the same descriptor concurrently. 
- * @param ctx is the database instance. - * @param descr is the page descriptor. - * @param remove_dirty must be non-zero if the page to be deleted is dirty. - * @param is_exclusive_holder must be non-zero if the caller holds an exclusive page reference. - * @param metric_id is set to the metric the page belongs to, if it's safe to delete the metric and metric_id is not - * NULL. Otherwise, metric_id is not set. - * @return 1 if it's safe to delete the metric, 0 otherwise. - */ -uint8_t pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, uint8_t remove_dirty, - uint8_t is_exclusive_holder, uuid_t *metric_id) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct page_cache_descr *pg_cache_descr = NULL; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - int ret; - uint8_t can_delete_metric = 0; - - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, descr->id, sizeof(uuid_t)); - fatal_assert(NULL != PValue); - page_index = *PValue; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - - uv_rwlock_wrlock(&page_index->lock); - ret = JudyLDel(&page_index->JudyL_array, (Word_t)(descr->start_time_ut / USEC_PER_SEC), PJE0); - if (unlikely(0 == ret)) { - uv_rwlock_wrunlock(&page_index->lock); - if (unlikely(debug_flags & D_RRDENGINE)) { - print_page_descr(descr); - } - goto destroy; - } - --page_index->page_count; - if (!page_index->writers && !page_index->page_count) { - can_delete_metric = 1; - if (metric_id) { - memcpy(metric_id, page_index->id, sizeof(uuid_t)); - } - } - uv_rwlock_wrunlock(&page_index->lock); - fatal_assert(1 == ret); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - ++ctx->stats.pg_cache_deletions; - --pg_cache->page_descriptors; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - if (!is_exclusive_holder) { - /* If we don't hold an exclusive page reference get one */ - while (!pg_cache_try_get_unsafe(descr, 1)) { - debug(D_RRDENGINE, "%s: Waiting for locked page:", __func__); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - pg_cache_wait_event_unsafe(descr); + if(!page) { + if(previous_page_end_time_s < wanted_end_time_s) + (*cache_gaps)++; + + break; } - } - if (remove_dirty) { - pg_cache_descr->flags &= ~RRD_PAGE_DIRTY; - } else { - /* even a locked page could be dirty */ - while (unlikely(pg_cache_descr->flags & RRD_PAGE_DIRTY)) { - debug(D_RRDENGINE, "%s: Found dirty page, waiting for it to be flushed:", __func__); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - pg_cache_wait_event_unsafe(descr); + + time_t page_start_time_s = pgc_page_start_time_s(page); + time_t page_end_time_s = pgc_page_end_time_s(page); + time_t page_update_every_s = pgc_page_update_every_s(page); + size_t page_length = pgc_page_data_size(cache, page); + + if(!page_update_every_s) + page_update_every_s = dt_s; + + if(is_page_in_time_range(page_start_time_s, page_end_time_s, wanted_start_time_s, wanted_end_time_s) != PAGE_IS_IN_RANGE) { + // not a useful page for this query + pgc_page_release(cache, page); + page = NULL; + + if(previous_page_end_time_s < wanted_end_time_s) + (*cache_gaps)++; + + break; } - } - rrdeng_page_descr_mutex_unlock(ctx, descr); - - while (unlikely(pg_cache_descr->flags & RRD_PAGE_READ_PENDING)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "%s: Found page with READ PENDING, 
waiting for read to complete", __func__); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - pg_cache_wait_event_unsafe(descr); - } - if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - /* only after locking can it be safely deleted from LRU */ - pg_cache_replaceQ_delete(ctx, descr); + if (page_start_time_s - previous_page_end_time_s > dt_s) + (*cache_gaps)++; + + Pvoid_t *PValue = PDCJudyLIns(JudyL_page_array, (Word_t) page_start_time_s, PJE0); + if (!PValue || PValue == PJERR) + fatal("DBENGINE: corrupted judy array in %s()", __FUNCTION__ ); + + if (unlikely(*PValue)) { + struct page_details *pd = *PValue; + UNUSED(pd); + +// internal_error( +// pd->first_time_s != page_first_time_s || +// pd->last_time_s != page_last_time_s || +// pd->update_every_s != page_update_every_s, +// "DBENGINE: duplicate page with different retention in %s cache " +// "1st: %ld to %ld, ue %u, size %u " +// "2nd: %ld to %ld, ue %ld size %zu " +// "- ignoring the second", +// cache == open_cache ? "open" : "main", +// pd->first_time_s, pd->last_time_s, pd->update_every_s, pd->page_length, +// page_first_time_s, page_last_time_s, page_update_every_s, page_length); + + pgc_page_release(cache, page); + } + else { - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - pg_cache_evict_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - } - pg_cache_put(ctx, descr); - rrdeng_try_deallocate_pg_cache_descr(ctx, descr); - while (descr->pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED) { - rrdeng_try_deallocate_pg_cache_descr(ctx, descr); /* spin */ - (void)sleep_usec(1000); /* 1 msec */ - } -destroy: - rrdeng_page_descr_freez(descr); - pg_cache_update_metric_times(page_index); + internal_fatal(pgc_page_metric(page) != metric_id, "Wrong metric id in page found in cache"); + internal_fatal(pgc_page_section(page) != (Word_t)ctx, "Wrong section in page found in cache"); - return can_delete_metric; -} + struct page_details *pd = page_details_get(); + pd->metric_id = metric_id; + pd->first_time_s = page_start_time_s; + pd->last_time_s = page_end_time_s; + pd->page_length = page_length; + pd->update_every_s = page_update_every_s; + pd->page = (open_cache_mode) ? 
NULL : page; + pd->status |= tags; -static inline int is_page_in_time_range(struct rrdeng_page_descr *descr, usec_t start_time, usec_t end_time) -{ - usec_t pg_start, pg_end; + if((pd->page)) { + pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED; - pg_start = descr->start_time_ut; - pg_end = descr->end_time_ut; + if(pgc_page_data(page) == DBENGINE_EMPTY_PAGE) + pd->status |= PDC_PAGE_EMPTY; + } - return (pg_start < start_time && pg_end >= start_time) || - (pg_start >= start_time && pg_start <= end_time); -} + if(open_cache_mode) { + struct rrdengine_datafile *datafile = pgc_page_data(page); + if(datafile_acquire(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS)) { // for pd + struct extent_io_data *xio = (struct extent_io_data *) pgc_page_custom_data(cache, page); + pd->datafile.ptr = pgc_page_data(page); + pd->datafile.file = xio->file; + pd->datafile.extent.pos = xio->pos; + pd->datafile.extent.bytes = xio->bytes; + pd->datafile.fileno = pd->datafile.ptr->fileno; + pd->status |= PDC_PAGE_DATAFILE_ACQUIRED | PDC_PAGE_DISK_PENDING; + } + else { + pd->status |= PDC_PAGE_FAILED | PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE; + } + pgc_page_release(cache, page); + } -static inline int is_point_in_time_in_page(struct rrdeng_page_descr *descr, usec_t point_in_time) -{ - return (point_in_time >= descr->start_time_ut && point_in_time <= descr->end_time_ut); -} + *PValue = pd; -/* The caller must hold the page index lock */ -static inline struct rrdeng_page_descr * - find_first_page_in_time_range(struct pg_cache_page_index *page_index, usec_t start_time, usec_t end_time) -{ - struct rrdeng_page_descr *descr = NULL; - Pvoid_t *PValue; - Word_t Index; - - Index = (Word_t)(start_time / USEC_PER_SEC); - PValue = JudyLLast(page_index->JudyL_array, &Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - if (is_page_in_time_range(descr, start_time, end_time)) { - return descr; + pages_found_in_cache++; } - } - Index = (Word_t)(start_time / USEC_PER_SEC); - PValue = JudyLFirst(page_index->JudyL_array, &Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - if (is_page_in_time_range(descr, start_time, end_time)) { - return descr; - } - } + // prepare for the next iteration + previous_page_end_time_s = page_end_time_s; - return NULL; -} + if(page_update_every_s > 0) + dt_s = page_update_every_s; -/* Update metric oldest and latest timestamps efficiently when adding new values */ -void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr) -{ - usec_t oldest_time = page_index->oldest_time_ut; - usec_t latest_time = page_index->latest_time_ut; + // we are going to as for the NEXT page + // so, set this to our first time + now_s = page_start_time_s; - if (unlikely(oldest_time == INVALID_TIME || descr->start_time_ut < oldest_time)) { - page_index->oldest_time_ut = descr->start_time_ut; - } - if (likely(descr->end_time_ut > latest_time || latest_time == INVALID_TIME)) { - page_index->latest_time_ut = descr->end_time_ut; - } + } while(now_s <= wanted_end_time_s); + + return pages_found_in_cache; } -/* Update metric oldest and latest timestamps when removing old values */ -void pg_cache_update_metric_times(struct pg_cache_page_index *page_index) -{ - Pvoid_t *firstPValue, *lastPValue; - Word_t firstIndex, lastIndex; - struct rrdeng_page_descr *descr; - usec_t oldest_time = INVALID_TIME; - usec_t latest_time = INVALID_TIME; - - uv_rwlock_rdlock(&page_index->lock); - /* Find first page in range */ - firstIndex = (Word_t)0; - firstPValue = 
JudyLFirst(page_index->JudyL_array, &firstIndex, PJE0); - if (likely(NULL != firstPValue)) { - descr = *firstPValue; - oldest_time = descr->start_time_ut; - } - lastIndex = (Word_t)-1; - lastPValue = JudyLLast(page_index->JudyL_array, &lastIndex, PJE0); - if (likely(NULL != lastPValue)) { - descr = *lastPValue; - latest_time = descr->end_time_ut; - } - uv_rwlock_rdunlock(&page_index->lock); +static void pgc_inject_gap(struct rrdengine_instance *ctx, METRIC *metric, time_t start_time_s, time_t end_time_s) { - if (unlikely(NULL == firstPValue)) { - fatal_assert(NULL == lastPValue); - page_index->oldest_time_ut = page_index->latest_time_ut = INVALID_TIME; + time_t db_first_time_s, db_last_time_s, db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + + if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) != PAGE_IS_IN_RANGE) return; - } - page_index->oldest_time_ut = oldest_time; - page_index->latest_time_ut = latest_time; + + PGC_ENTRY page_entry = { + .hot = false, + .section = (Word_t)ctx, + .metric_id = (Word_t)metric, + .start_time_s = MAX(start_time_s, db_first_time_s), + .end_time_s = MIN(end_time_s, db_last_time_s), + .update_every_s = 0, + .size = 0, + .data = DBENGINE_EMPTY_PAGE, + }; + + if(page_entry.start_time_s >= page_entry.end_time_s) + return; + + PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, NULL); + pgc_page_release(main_cache, page); } -/* If index is NULL lookup by UUID (descr->id) */ -void pg_cache_insert(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; +static size_t list_has_time_gaps( + struct rrdengine_instance *ctx, + METRIC *metric, + Pvoid_t JudyL_page_array, + time_t wanted_start_time_s, + time_t wanted_end_time_s, + size_t *pages_total, + size_t *pages_found_pass4, + size_t *pages_pending, + size_t *pages_overlapping, + time_t *optimal_end_time_s, + bool populate_gaps +) { + // we will recalculate these, so zero them + *pages_pending = 0; + *pages_overlapping = 0; + *optimal_end_time_s = 0; + + bool first; Pvoid_t *PValue; - struct pg_cache_page_index *page_index; - unsigned long pg_cache_descr_state = descr->pg_cache_descr_state; - - if (0 != pg_cache_descr_state) { - /* there is page cache descriptor pre-allocated state */ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - - fatal_assert(pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED); - if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - pg_cache_reserve_pages(ctx, 1); - if (!(pg_cache_descr->flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_insert(ctx, descr); - } - } + Word_t this_page_start_time; + struct page_details *pd; + + size_t gaps = 0; + Word_t metric_id = mrg_metric_id(main_mrg, metric); + + // ------------------------------------------------------------------------ + // PASS 1: remove the preprocessing flags from the pages in PDC - if (unlikely(NULL == index)) { - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, descr->id, sizeof(uuid_t)); - fatal_assert(NULL != PValue); - page_index = *PValue; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - } else { - page_index = index; + first = true; + this_page_start_time = 0; + while((PValue = PDCJudyLFirstThenNext(JudyL_page_array, &this_page_start_time, &first))) { + pd = *PValue; + pd->status &= ~(PDC_PAGE_SKIP|PDC_PAGE_PREPROCESSED); } - 
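// The new code above keys each page's metadata by its start time in a JudyL
// array (the PDCJudyL* calls appear to be thin wrappers around the same
// libJudy API the removed code used directly), so walking the keys in
// ascending order visits pages in time order and turns gap detection into a
// single pass. A minimal standalone sketch of that pattern, with plain end
// times as values and a hypothetical fixed step dt (not netdata code):

#include <Judy.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static void judy_gap_sketch(void) {
    Pvoid_t pages = (Pvoid_t) NULL;   // JudyL: key = page start time, value = page end time
    Pvoid_t *PValue;
    time_t dt = 1;                    // assumed collection step, in seconds

    // three pages: 0-99, 100-199, then a hole, then 300-399
    time_t starts[] = { 0, 100, 300 }, ends[] = { 99, 199, 399 };
    for (int i = 0; i < 3; i++) {
        PValue = JudyLIns(&pages, (Word_t) starts[i], PJE0);
        *PValue = (void *) (Word_t) ends[i];
    }

    size_t gaps = 0;
    bool have_previous = false;
    time_t previous_end = 0;
    Word_t Index = 0;
    for (PValue = JudyLFirst(pages, &Index, PJE0); PValue != NULL;
         PValue = JudyLNext(pages, &Index, PJE0)) {
        time_t start = (time_t) Index;
        time_t end = (time_t) (Word_t) *PValue;
        if (have_previous && start - previous_end > dt)
            gaps++;                   // consecutive pages do not line up: a retention hole
        previous_end = end;
        have_previous = true;
    }

    printf("gaps found: %zu\n", gaps); // prints 1 (the 200-299 hole)
    JudyLFreeArray(&pages, PJE0);
}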
uv_rwlock_wrlock(&page_index->lock); - PValue = JudyLIns(&page_index->JudyL_array, (Word_t)(descr->start_time_ut / USEC_PER_SEC), PJE0); - *PValue = descr; - ++page_index->page_count; - pg_cache_add_new_metric_time(page_index, descr); - uv_rwlock_wrunlock(&page_index->lock); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - ++ctx->stats.pg_cache_insertions; - ++pg_cache->page_descriptors; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); -} + // ------------------------------------------------------------------------ + // PASS 2: emulate processing to find the useful pages -usec_t pg_cache_oldest_time_in_range(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time_ut, usec_t end_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; + time_t now_s = wanted_start_time_s; + time_t dt_s = mrg_metric_get_update_every_s(main_mrg, metric); + if(!dt_s) + dt_s = default_rrd_update_every; - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - return INVALID_TIME; - } + size_t pages_pass2 = 0, pages_pass3 = 0; + while((pd = pdc_find_page_for_time( + JudyL_page_array, now_s, &gaps, + PDC_PAGE_PREPROCESSED, 0))) { - uv_rwlock_rdlock(&page_index->lock); - descr = find_first_page_in_time_range(page_index, start_time_ut, end_time_ut); - if (NULL == descr) { - uv_rwlock_rdunlock(&page_index->lock); - return INVALID_TIME; - } - uv_rwlock_rdunlock(&page_index->lock); - return descr->start_time_ut; -} + pd->status |= PDC_PAGE_PREPROCESSED; + pages_pass2++; -/** - * Return page information for the first page before point_in_time that satisfies the filter. - * @param ctx DB context - * @param page_index page index of a metric - * @param point_in_time_ut the pages that are searched must be older than this timestamp - * @param filter decides if the page satisfies the caller's criteria - * @param page_info the result of the search is set in this pointer - */ -void pg_cache_get_filtered_info_prev(struct rrdengine_instance *ctx, struct pg_cache_page_index *page_index, - usec_t point_in_time_ut, pg_cache_page_info_filter_t *filter, - struct rrdeng_page_info *page_info) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - Pvoid_t *PValue; - Word_t Index; + if(pd->update_every_s) + dt_s = pd->update_every_s; - (void)pg_cache; - fatal_assert(NULL != page_index); + if(populate_gaps && pd->first_time_s > now_s) + pgc_inject_gap(ctx, metric, now_s, pd->first_time_s); - Index = (Word_t)(point_in_time_ut / USEC_PER_SEC); - uv_rwlock_rdlock(&page_index->lock); - do { - PValue = JudyLPrev(page_index->JudyL_array, &Index, PJE0); - descr = unlikely(NULL == PValue) ? 
NULL : *PValue; - } while (descr != NULL && !filter(descr)); - if (unlikely(NULL == descr)) { - page_info->page_length = 0; - page_info->start_time_ut = INVALID_TIME; - page_info->end_time_ut = INVALID_TIME; - } else { - page_info->page_length = descr->page_length; - page_info->start_time_ut = descr->start_time_ut; - page_info->end_time_ut = descr->end_time_ut; + now_s = pd->last_time_s + dt_s; + if(now_s > wanted_end_time_s) { + *optimal_end_time_s = pd->last_time_s; + break; + } } - uv_rwlock_rdunlock(&page_index->lock); -} -/** - * Searches for an unallocated page without triggering disk I/O. Attempts to reserve the page and get a reference. - * @param ctx DB context - * @param id lookup by UUID - * @param start_time_ut exact starting time in usec - * @param ret_page_indexp Sets the page index pointer (*ret_page_indexp) for the given UUID. - * @return the page descriptor or NULL on failure. It can fail if: - * 1. The page is already allocated to the page cache. - * 2. It did not succeed to get a reference. - * 3. It did not succeed to reserve a spot in the page cache. - */ -struct rrdeng_page_descr *pg_cache_lookup_unpopulated_and_lock(struct rrdengine_instance *ctx, uuid_t *id, - usec_t start_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - Word_t Index; + if(populate_gaps && now_s < wanted_end_time_s) + pgc_inject_gap(ctx, metric, now_s, wanted_end_time_s); - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); + // ------------------------------------------------------------------------ + // PASS 3: mark as skipped all the pages not useful - if ((NULL == PValue) || !pg_cache_try_reserve_pages(ctx, 1)) { - /* Failed to find page or failed to reserve a spot in the cache */ - return NULL; - } + first = true; + this_page_start_time = 0; + while((PValue = PDCJudyLFirstThenNext(JudyL_page_array, &this_page_start_time, &first))) { + pd = *PValue; - uv_rwlock_rdlock(&page_index->lock); - Index = (Word_t)(start_time_ut / USEC_PER_SEC); - PValue = JudyLGet(page_index->JudyL_array, Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - } - if (NULL == PValue || 0 == descr->page_length) { - /* Failed to find non-empty page */ - uv_rwlock_rdunlock(&page_index->lock); + internal_fatal(pd->metric_id != metric_id, "pd has wrong metric_id"); - pg_cache_release_pages(ctx, 1); - return NULL; - } + if(!(pd->status & PDC_PAGE_PREPROCESSED)) { + (*pages_overlapping)++; + pd->status |= PDC_PAGE_SKIP; + pd->status &= ~(PDC_PAGE_READY | PDC_PAGE_DISK_PENDING); + continue; + } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - uv_rwlock_rdunlock(&page_index->lock); + pages_pass3++; - if ((flags & RRD_PAGE_POPULATED) || !pg_cache_try_get_unsafe(descr, 1)) { - /* Failed to get reference or page is already populated */ - rrdeng_page_descr_mutex_unlock(ctx, descr); + if(!pd->page) { + pd->page = pgc_page_get_and_acquire(main_cache, (Word_t) ctx, (Word_t) metric_id, pd->first_time_s, PGC_SEARCH_EXACT); - pg_cache_release_pages(ctx, 1); - return NULL; + if(pd->page) { + (*pages_found_pass4)++; + + pd->status &= ~PDC_PAGE_DISK_PENDING; + pd->status |= 
PDC_PAGE_READY | PDC_PAGE_PRELOADED | PDC_PAGE_PRELOADED_PASS4; + + if(pgc_page_data(pd->page) == DBENGINE_EMPTY_PAGE) + pd->status |= PDC_PAGE_EMPTY; + + } + else if(!(pd->status & PDC_PAGE_FAILED) && (pd->status & PDC_PAGE_DATAFILE_ACQUIRED)) { + (*pages_pending)++; + + pd->status |= PDC_PAGE_DISK_PENDING; + + internal_fatal(pd->status & PDC_PAGE_SKIP, "page is disk pending and skipped"); + internal_fatal(!pd->datafile.ptr, "datafile is NULL"); + internal_fatal(!pd->datafile.extent.bytes, "datafile.extent.bytes zero"); + internal_fatal(!pd->datafile.extent.pos, "datafile.extent.pos is zero"); + internal_fatal(!pd->datafile.fileno, "datafile.fileno is zero"); + } + } + else { + pd->status &= ~PDC_PAGE_DISK_PENDING; + pd->status |= (PDC_PAGE_READY | PDC_PAGE_PRELOADED); + } } - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - return descr; + internal_fatal(pages_pass2 != pages_pass3, + "DBENGINE: page count does not match"); + + *pages_total = pages_pass2; + + return gaps; } -/** - * Searches for pages in a time range and triggers disk I/O if necessary and possible. - * Does not get a reference. - * @param ctx DB context - * @param id UUID - * @param start_time_ut inclusive starting time in usec - * @param end_time_ut inclusive ending time in usec - * @param page_info_arrayp It allocates (*page_arrayp) and populates it with information of pages that overlap - * with the time range [start_time,end_time]. The caller must free (*page_info_arrayp) with freez(). - * If page_info_arrayp is set to NULL nothing was allocated. - * @param ret_page_indexp Sets the page index pointer (*ret_page_indexp) for the given UUID. - * @return the number of pages that overlap with the time range [start_time,end_time]. 
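// get_page_list_from_journal_v2() locates the metric inside each journal v2
// file with bsearch() over the file's metric list, which bsearch() requires to
// be ordered by the comparator's sort key (here, the metric UUID). A minimal
// standalone sketch of that lookup, with a hypothetical entry type and a
// memcmp-based comparator; the real journal_metric_uuid_compare() may differ:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct metric_entry_sketch {
    uint8_t uuid[16];        // sort key
    uint32_t page_offset;    // where this metric's page list starts in the file
};

static int uuid_key_compare_sketch(const void *key, const void *entry) {
    return memcmp(key, ((const struct metric_entry_sketch *) entry)->uuid, sizeof(uuid_t[0]) * 0 + 16);
}

static const struct metric_entry_sketch *find_metric_sketch(
        const uint8_t uuid[16],
        const struct metric_entry_sketch *list,
        size_t count) {
    // O(log n) per datafile instead of scanning every metric in the journal
    return bsearch(uuid, list, count, sizeof(*list), uuid_key_compare_sketch);
}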
- */ -unsigned pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time_ut, usec_t end_time_ut, - struct rrdeng_page_info **page_info_arrayp, struct pg_cache_page_index **ret_page_indexp) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL, *preload_array[PAGE_CACHE_MAX_PRELOAD_PAGES]; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned i, j, k, preload_count, count, page_info_array_max_size; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - Word_t Index; - uint8_t failed_to_reserve; +typedef void (*page_found_callback_t)(PGC_PAGE *page, void *data); +static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METRIC *metric, usec_t start_time_ut, usec_t end_time_ut, page_found_callback_t callback, void *callback_data) { + uuid_t *uuid = mrg_metric_uuid(main_mrg, metric); + Word_t metric_id = mrg_metric_id(main_mrg, metric); - fatal_assert(NULL != ret_page_indexp); + time_t wanted_start_time_s = (time_t)(start_time_ut / USEC_PER_SEC); + time_t wanted_end_time_s = (time_t)(end_time_ut / USEC_PER_SEC); - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - *ret_page_indexp = page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - debug(D_RRDENGINE, "%s: No page was found to attempt preload.", __func__); - *ret_page_indexp = NULL; - return 0; - } + size_t pages_found = 0; - uv_rwlock_rdlock(&page_index->lock); - descr = find_first_page_in_time_range(page_index, start_time_ut, end_time_ut); - if (NULL == descr) { - uv_rwlock_rdunlock(&page_index->lock); - debug(D_RRDENGINE, "%s: No page was found to attempt preload.", __func__); - *ret_page_indexp = NULL; - return 0; - } else { - Index = (Word_t)(descr->start_time_ut / USEC_PER_SEC); - } - if (page_info_arrayp) { - page_info_array_max_size = PAGE_CACHE_MAX_PRELOAD_PAGES * sizeof(struct rrdeng_page_info); - *page_info_arrayp = mallocz(page_info_array_max_size); - } + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + struct rrdengine_datafile *datafile; + for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) { + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, + wanted_start_time_s, + wanted_end_time_s); + if (unlikely(!j2_header)) + continue; - for (count = 0, preload_count = 0 ; - descr != NULL && is_page_in_time_range(descr, start_time_ut, end_time_ut) ; - PValue = JudyLNext(page_index->JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? 
NULL : *PValue) { - /* Iterate all pages in range */ + time_t journal_start_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC); - if (unlikely(0 == descr->page_length)) + // the datafile possibly contains useful data for this query + + size_t journal_metric_count = (size_t)j2_header->metric_count; + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); + struct journal_metric_list *uuid_entry = bsearch(uuid,uuid_list,journal_metric_count,sizeof(*uuid_list), journal_metric_uuid_compare); + + if (unlikely(!uuid_entry)) { + // our UUID is not in this datafile + journalfile_v2_data_release(datafile->journalfile); continue; - if (page_info_arrayp) { - if (unlikely(count >= page_info_array_max_size / sizeof(struct rrdeng_page_info))) { - page_info_array_max_size += PAGE_CACHE_MAX_PRELOAD_PAGES * sizeof(struct rrdeng_page_info); - *page_info_arrayp = reallocz(*page_info_arrayp, page_info_array_max_size); - } - (*page_info_arrayp)[count].start_time_ut = descr->start_time_ut; - (*page_info_arrayp)[count].end_time_ut = descr->end_time_ut; - (*page_info_arrayp)[count].page_length = descr->page_length; } - ++count; - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - if (pg_cache_can_get_unsafe(descr, 0)) { - if (flags & RRD_PAGE_POPULATED) { - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); + + struct journal_page_header *page_list_header = (struct journal_page_header *) ((uint8_t *) j2_header + uuid_entry->page_offset); + struct journal_page_list *page_list = (struct journal_page_list *)((uint8_t *) page_list_header + sizeof(*page_list_header)); + struct journal_extent_list *extent_list = (void *)((uint8_t *)j2_header + j2_header->extent_offset); + uint32_t uuid_page_entries = page_list_header->entries; + + for (uint32_t index = 0; index < uuid_page_entries; index++) { + struct journal_page_list *page_entry_in_journal = &page_list[index]; + + time_t page_first_time_s = page_entry_in_journal->delta_start_s + journal_start_time_s; + time_t page_last_time_s = page_entry_in_journal->delta_end_s + journal_start_time_s; + + TIME_RANGE_COMPARE prc = is_page_in_time_range(page_first_time_s, page_last_time_s, wanted_start_time_s, wanted_end_time_s); + if(prc == PAGE_IS_IN_THE_PAST) continue; - } - } - if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { - preload_array[preload_count++] = descr; - if (PAGE_CACHE_MAX_PRELOAD_PAGES == preload_count) { - rrdeng_page_descr_mutex_unlock(ctx, descr); + + if(prc == PAGE_IS_IN_THE_FUTURE) break; + + time_t page_update_every_s = page_entry_in_journal->update_every_s; + size_t page_length = page_entry_in_journal->page_length; + + if(datafile_acquire(datafile, DATAFILE_ACQUIRE_OPEN_CACHE)) { //for open cache item + // add this page to open cache + bool added = false; + struct extent_io_data ei = { + .pos = extent_list[page_entry_in_journal->extent_index].datafile_offset, + .bytes = extent_list[page_entry_in_journal->extent_index].datafile_size, + .page_length = page_length, + .file = datafile->file, + .fileno = datafile->fileno, + }; + + PGC_PAGE *page = pgc_page_add_and_acquire(open_cache, (PGC_ENTRY) { + .hot = false, + .section = (Word_t) ctx, + .metric_id = metric_id, + .start_time_s = page_first_time_s, + .end_time_s = page_last_time_s, + .update_every_s = page_update_every_s, + .data = datafile, + .size = 0, + .custom_data = 
(uint8_t *) &ei, + }, &added); + + if(!added) + datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE); + + callback(page, callback_data); + + pgc_page_release(open_cache, page); + + pages_found++; } } - rrdeng_page_descr_mutex_unlock(ctx, descr); + journalfile_v2_data_release(datafile->journalfile); } - uv_rwlock_rdunlock(&page_index->lock); + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); - failed_to_reserve = 0; - for (i = 0 ; i < preload_count && !failed_to_reserve ; ++i) { - struct rrdeng_cmd cmd; - struct rrdeng_page_descr *next; + return pages_found; +} - descr = preload_array[i]; - if (NULL == descr) { - continue; - } - if (!pg_cache_try_reserve_pages(ctx, 1)) { - failed_to_reserve = 1; - break; - } - cmd.opcode = RRDENG_READ_EXTENT; - cmd.read_extent.page_cache_descr[0] = descr; - /* don't use this page again */ - preload_array[i] = NULL; - for (j = 0, k = 1 ; j < preload_count ; ++j) { - next = preload_array[j]; - if (NULL == next) { - continue; - } - if (descr->extent == next->extent) { - /* same extent, consolidate */ - if (!pg_cache_try_reserve_pages(ctx, 1)) { - failed_to_reserve = 1; - break; - } - cmd.read_extent.page_cache_descr[k++] = next; - /* don't use this page again */ - preload_array[j] = NULL; - } - } - cmd.read_extent.page_count = k; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); +void add_page_details_from_journal_v2(PGC_PAGE *page, void *JudyL_pptr) { + struct rrdengine_datafile *datafile = pgc_page_data(page); + + if(!datafile_acquire(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS)) // for pd + return; + + Pvoid_t *PValue = PDCJudyLIns(JudyL_pptr, pgc_page_start_time_s(page), PJE0); + if (!PValue || PValue == PJERR) + fatal("DBENGINE: corrupted judy array"); + + if (unlikely(*PValue)) { + datafile_release(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS); + return; } - if (failed_to_reserve) { - debug(D_RRDENGINE, "%s: Failed to reserve enough memory, canceling I/O.", __func__); - for (i = 0 ; i < preload_count ; ++i) { - descr = preload_array[i]; - if (NULL == descr) { - continue; - } - pg_cache_put(ctx, descr); - } + + Word_t metric_id = pgc_page_metric(page); + + // let's add it to the judy + struct extent_io_data *ei = pgc_page_custom_data(open_cache, page); + struct page_details *pd = page_details_get(); + *PValue = pd; + + pd->datafile.extent.pos = ei->pos; + pd->datafile.extent.bytes = ei->bytes; + pd->datafile.file = ei->file; + pd->datafile.fileno = ei->fileno; + pd->first_time_s = pgc_page_start_time_s(page); + pd->last_time_s = pgc_page_end_time_s(page); + pd->datafile.ptr = datafile; + pd->page_length = ei->page_length; + pd->update_every_s = pgc_page_update_every_s(page); + pd->metric_id = metric_id; + pd->status |= PDC_PAGE_DISK_PENDING | PDC_PAGE_SOURCE_JOURNAL_V2 | PDC_PAGE_DATAFILE_ACQUIRED; +} + +// Return a judyL will all pages that have start_time_ut and end_time_ut +// Pvalue of the judy will be the end time for that page +// DBENGINE2: +#define time_delta(finish, pass) do { if(pass) { usec_t t = pass; (pass) = (finish) - (pass); (finish) = t; } } while(0) +static Pvoid_t get_page_list( + struct rrdengine_instance *ctx, + METRIC *metric, + usec_t start_time_ut, + usec_t end_time_ut, + size_t *pages_to_load, + time_t *optimal_end_time_s +) { + *optimal_end_time_s = 0; + + Pvoid_t JudyL_page_array = (Pvoid_t) NULL; + + time_t wanted_start_time_s = (time_t)(start_time_ut / USEC_PER_SEC); + time_t wanted_end_time_s = (time_t)(end_time_ut / USEC_PER_SEC); + + size_t pages_found_in_main_cache = 0, + pages_found_in_open_cache = 0, + pages_found_in_journals_v2 = 0, + 
pages_found_pass4 = 0, + pages_pending = 0, + pages_overlapping = 0, + pages_total = 0; + + size_t cache_gaps = 0, query_gaps = 0; + bool done_v2 = false, done_open = false; + + usec_t pass1_ut = 0, pass2_ut = 0, pass3_ut = 0, pass4_ut = 0; + + // -------------------------------------------------------------- + // PASS 1: Check what the main page cache has available + + pass1_ut = now_monotonic_usec(); + size_t pages_pass1 = get_page_list_from_pgc(main_cache, metric, ctx, wanted_start_time_s, wanted_end_time_s, + &JudyL_page_array, &cache_gaps, + false, PDC_PAGE_SOURCE_MAIN_CACHE); + query_gaps += cache_gaps; + pages_found_in_main_cache += pages_pass1; + pages_total += pages_pass1; + + if(pages_found_in_main_cache && !cache_gaps) { + query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, + &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, + optimal_end_time_s, false); + + if (pages_total && !query_gaps) + goto we_are_done; } - if (!preload_count) { - /* no such page */ - debug(D_RRDENGINE, "%s: No page was eligible to attempt preload.", __func__); + + // -------------------------------------------------------------- + // PASS 2: Check what the open journal page cache has available + // these will be loaded from disk + + pass2_ut = now_monotonic_usec(); + size_t pages_pass2 = get_page_list_from_pgc(open_cache, metric, ctx, wanted_start_time_s, wanted_end_time_s, + &JudyL_page_array, &cache_gaps, + true, PDC_PAGE_SOURCE_OPEN_CACHE); + query_gaps += cache_gaps; + pages_found_in_open_cache += pages_pass2; + pages_total += pages_pass2; + done_open = true; + + if(pages_found_in_open_cache) { + query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, + &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, + optimal_end_time_s, false); + + if (pages_total && !query_gaps) + goto we_are_done; } - if (unlikely(0 == count && page_info_arrayp)) { - freez(*page_info_arrayp); - *page_info_arrayp = NULL; + + // -------------------------------------------------------------- + // PASS 3: Check Journal v2 to fill the gaps + + pass3_ut = now_monotonic_usec(); + size_t pages_pass3 = get_page_list_from_journal_v2(ctx, metric, start_time_ut, end_time_ut, + add_page_details_from_journal_v2, &JudyL_page_array); + pages_found_in_journals_v2 += pages_pass3; + pages_total += pages_pass3; + done_v2 = true; + + // -------------------------------------------------------------- + // PASS 4: Check the cache again + // and calculate the time gaps in the query + // THIS IS REQUIRED AFTER JOURNAL V2 LOOKUP + + pass4_ut = now_monotonic_usec(); + query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, + &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, + optimal_end_time_s, true); + +we_are_done: + + if(pages_to_load) + *pages_to_load = pages_pending; + + usec_t finish_ut = now_monotonic_usec(); + time_delta(finish_ut, pass4_ut); + time_delta(finish_ut, pass3_ut); + time_delta(finish_ut, pass2_ut); + time_delta(finish_ut, pass1_ut); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_main_cache_lookup, pass1_ut, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_open_cache_lookup, pass2_ut, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_journal_v2_lookup, pass3_ut, __ATOMIC_RELAXED); + 
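// The time_delta() calls above convert the absolute per-pass start timestamps
// into per-pass durations by walking backwards from the finish time: each call
// replaces passN with (finish - passN) and then moves finish back to passN's
// original start, so the next call measures the pass before it. A standalone
// numeric sketch of the same macro (the microsecond values are arbitrary):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t usec_t;
#define time_delta(finish, pass) do { if(pass) { usec_t t = pass; (pass) = (finish) - (pass); (finish) = t; } } while(0)

int main(void) {
    usec_t pass1 = 1000, pass2 = 1400, pass3 = 0 /* pass skipped */, pass4 = 2500;
    usec_t finish = 2600;

    time_delta(finish, pass4);  // pass4 = 100 (2600-2500), finish becomes 2500
    time_delta(finish, pass3);  // pass3 stays 0, finish unchanged (skipped pass)
    time_delta(finish, pass2);  // pass2 = 1100 (2500-1400), finish becomes 1400
    time_delta(finish, pass1);  // pass1 = 400  (1400-1000), finish becomes 1000

    printf("%llu %llu %llu %llu\n",
           (unsigned long long) pass1, (unsigned long long) pass2,
           (unsigned long long) pass3, (unsigned long long) pass4);
    return 0;                   // prints: 400 1100 0 100
}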
__atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_pass4_lookup, pass4_ut, __ATOMIC_RELAXED); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_planned_with_gaps, (query_gaps) ? 1 : 0, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_open, done_open ? 1 : 0, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_journal_v2, done_v2 ? 1 : 0, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_total, pages_total, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_open_cache, pages_found_in_open_cache, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_journal_v2, pages_found_in_journals_v2, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache_at_pass4, pages_found_pass4, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_to_load_from_disk, pages_pending, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_overlapping_skipped, pages_overlapping, __ATOMIC_RELAXED); + + return JudyL_page_array; +} + +inline void rrdeng_prep_wait(PDC *pdc) { + if (unlikely(pdc && !pdc->prep_done)) { + usec_t started_ut = now_monotonic_usec(); + completion_wait_for(&pdc->prep_completion); + pdc->prep_done = true; + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_wait_for_prep, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED); } - return count; } -/* - * Searches for a page and gets a reference. - * When point_in_time is INVALID_TIME get any page. - * If index is NULL lookup by UUID (id). 
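// The cache-efficiency statistics above are plain counters bumped with relaxed
// atomics: no ordering between counters is needed, only that concurrent
// increments are never lost. A minimal standalone sketch of the same pattern,
// using a hypothetical counter and GCC/Clang __atomic builtins:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

static size_t queries_counter = 0;

static void *stats_worker(void *arg) {
    (void) arg;
    for (int i = 0; i < 100000; i++)
        __atomic_add_fetch(&queries_counter, 1, __ATOMIC_RELAXED);
    return NULL;
}

int main(void) {
    pthread_t t[4];
    for (int i = 0; i < 4; i++) pthread_create(&t[i], NULL, stats_worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
    printf("%zu\n", queries_counter);   // always 400000, even without a lock
    return 0;
}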
- */ -struct rrdeng_page_descr * - pg_cache_lookup(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t point_in_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - Word_t Index; - uint8_t page_not_in_cache; - - if (unlikely(NULL == index)) { - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - return NULL; - } - } else { - page_index = index; +void rrdeng_prep_query(PDC *pdc) { + size_t pages_to_load = 0; + pdc->page_list_JudyL = get_page_list(pdc->ctx, pdc->metric, + pdc->start_time_s * USEC_PER_SEC, + pdc->end_time_s * USEC_PER_SEC, + &pages_to_load, + &pdc->optimal_end_time_s); + + if (pages_to_load && pdc->page_list_JudyL) { + pdc_acquire(pdc); // we get 1 for the 1st worker in the chain: do_read_page_list_work() + usec_t start_ut = now_monotonic_usec(); +// if(likely(priority == STORAGE_PRIORITY_BEST_EFFORT)) +// dbengine_load_page_list_directly(ctx, handle->pdc); +// else + pdc_route_asynchronously(pdc->ctx, pdc); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_to_route, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); } - pg_cache_reserve_pages(ctx, 1); - - page_not_in_cache = 0; - uv_rwlock_rdlock(&page_index->lock); - while (1) { - Index = (Word_t)(point_in_time_ut / USEC_PER_SEC); - PValue = JudyLLast(page_index->JudyL_array, &Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - } - if (NULL == PValue || - 0 == descr->page_length || - (INVALID_TIME != point_in_time_ut && - !is_point_in_time_in_page(descr, point_in_time_ut))) { - /* non-empty page not found */ - uv_rwlock_rdunlock(&page_index->lock); - - pg_cache_release_pages(ctx, 1); - return NULL; - } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - if ((flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 0)) { - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); - break; - } - if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { - struct rrdeng_cmd cmd; + else + completion_mark_complete(&pdc->page_completion); - uv_rwlock_rdunlock(&page_index->lock); + completion_mark_complete(&pdc->prep_completion); - cmd.opcode = RRDENG_READ_PAGE; - cmd.read_page.page_cache_descr = descr; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); + pdc_release_and_destroy_if_unreferenced(pdc, true, true); +} - debug(D_RRDENGINE, "%s: Waiting for page to be asynchronously read from disk:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - while (!(pg_cache_descr->flags & RRD_PAGE_POPULATED)) { - pg_cache_wait_event_unsafe(descr); - } - /* success */ - /* Downgrade exclusive reference to allow other readers */ - pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - return descr; - } - uv_rwlock_rdunlock(&page_index->lock); - debug(D_RRDENGINE, "%s: Waiting for page to be unlocked:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - 
print_page_cache_descr(descr, "", true); - if (!(flags & RRD_PAGE_POPULATED)) - page_not_in_cache = 1; - pg_cache_wait_event_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - - /* reset scan to find again */ - uv_rwlock_rdlock(&page_index->lock); - } - uv_rwlock_rdunlock(&page_index->lock); +/** + * Searches for pages in a time range and triggers disk I/O if necessary and possible. + * @param ctx DB context + * @param handle query handle as initialized + * @param start_time_ut inclusive starting time in usec + * @param end_time_ut inclusive ending time in usec + * @return 1 / 0 (pages found or not found) + */ +void pg_cache_preload(struct rrdeng_query_handle *handle) { + if (unlikely(!handle || !handle->metric)) + return; - if (!(flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_set_hot(ctx, descr); - pg_cache_release_pages(ctx, 1); - if (page_not_in_cache) - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - else - rrd_stat_atomic_add(&ctx->stats.pg_cache_hits, 1); - return descr; + __atomic_add_fetch(&handle->ctx->atomic.inflight_queries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.currently_running_queries, 1, __ATOMIC_RELAXED); + handle->pdc = pdc_get(); + handle->pdc->metric = mrg_metric_dup(main_mrg, handle->metric); + handle->pdc->start_time_s = handle->start_time_s; + handle->pdc->end_time_s = handle->end_time_s; + handle->pdc->priority = handle->priority; + handle->pdc->optimal_end_time_s = handle->end_time_s; + handle->pdc->ctx = handle->ctx; + handle->pdc->refcount = 1; + netdata_spinlock_init(&handle->pdc->refcount_spinlock); + completion_init(&handle->pdc->prep_completion); + completion_init(&handle->pdc->page_completion); + + if(ctx_is_available_for_queries(handle->ctx)) { + handle->pdc->refcount++; // we get 1 for the query thread and 1 for the prep thread + rrdeng_enq_cmd(handle->ctx, RRDENG_OPCODE_QUERY, handle->pdc, NULL, handle->priority, NULL, NULL); + } + else { + completion_mark_complete(&handle->pdc->prep_completion); + completion_mark_complete(&handle->pdc->page_completion); + } } /* @@ -1088,226 +840,282 @@ struct rrdeng_page_descr * * start_time and end_time are inclusive. * If index is NULL lookup by UUID (id). 
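// pg_cache_preload() above hands the query to a preparation worker and the
// query thread later blocks on a completion object until the page list is
// ready (rrdeng_prep_wait() / completion_wait_for()). A minimal standalone
// sketch of such a completion built on pthreads; the struct and function names
// here are hypothetical, not netdata's own completion implementation:

#include <pthread.h>
#include <stdbool.h>

struct completion_sketch {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool done;
};

static void completion_sketch_init(struct completion_sketch *c) {
    pthread_mutex_init(&c->lock, NULL);
    pthread_cond_init(&c->cond, NULL);
    c->done = false;
}

static void completion_sketch_wait(struct completion_sketch *c) {
    pthread_mutex_lock(&c->lock);
    while (!c->done)                      // guard against spurious wakeups
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

static void completion_sketch_complete(struct completion_sketch *c) {
    pthread_mutex_lock(&c->lock);
    c->done = true;                       // the prep worker marks the job done
    pthread_cond_broadcast(&c->cond);     // wake every waiting query thread
    pthread_mutex_unlock(&c->lock);
}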
*/ -struct rrdeng_page_descr * -pg_cache_lookup_next(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t start_time_ut, usec_t end_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - uint8_t page_not_in_cache; - - if (unlikely(NULL == index)) { - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; +struct pgc_page *pg_cache_lookup_next( + struct rrdengine_instance *ctx, + PDC *pdc, + time_t now_s, + time_t last_update_every_s, + size_t *entries +) { + if (unlikely(!pdc)) + return NULL; + + rrdeng_prep_wait(pdc); + + if (unlikely(!pdc->page_list_JudyL)) + return NULL; + + usec_t start_ut = now_monotonic_usec(); + size_t gaps = 0; + bool waited = false, preloaded; + PGC_PAGE *page = NULL; + + while(!page) { + bool page_from_pd = false; + preloaded = false; + struct page_details *pd = pdc_find_page_for_time( + pdc->page_list_JudyL, now_s, &gaps, + PDC_PAGE_PROCESSED, PDC_PAGE_EMPTY); + + if (!pd) + break; + + page = pd->page; + page_from_pd = true; + preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED); + if(!page) { + if(!completion_is_done(&pdc->page_completion)) { + page = pgc_page_get_and_acquire(main_cache, (Word_t)ctx, + pd->metric_id, pd->first_time_s, PGC_SEARCH_EXACT); + page_from_pd = false; + preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED); + } + + if(!page) { + pdc->completed_jobs = + completion_wait_for_a_job(&pdc->page_completion, pdc->completed_jobs); + + page = pd->page; + page_from_pd = true; + preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED); + waited = true; + } } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - return NULL; + + if(page && pgc_page_data(page) == DBENGINE_EMPTY_PAGE) + pdc_page_status_set(pd, PDC_PAGE_EMPTY); + + if(!page || pdc_page_status_check(pd, PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | PDC_PAGE_EMPTY)) { + page = NULL; + continue; } - } else { - page_index = index; - } - pg_cache_reserve_pages(ctx, 1); - - page_not_in_cache = 0; - uv_rwlock_rdlock(&page_index->lock); - int retry_count = 0; - while (1) { - descr = find_first_page_in_time_range(page_index, start_time_ut, end_time_ut); - if (NULL == descr || 0 == descr->page_length || retry_count == default_rrdeng_page_fetch_retries) { - /* non-empty page not found */ - if (retry_count == default_rrdeng_page_fetch_retries) - error_report("Page cache timeout while waiting for page %p : returning FAIL", descr); - uv_rwlock_rdunlock(&page_index->lock); - - pg_cache_release_pages(ctx, 1); - return NULL; + + // we now have page and is not empty + + time_t page_start_time_s = pgc_page_start_time_s(page); + time_t page_end_time_s = pgc_page_end_time_s(page); + time_t page_update_every_s = pgc_page_update_every_s(page); + size_t page_length = pgc_page_data_size(main_cache, page); + + if(unlikely(page_start_time_s == INVALID_TIME || page_end_time_s == INVALID_TIME)) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_zero_time_skipped, 1, __ATOMIC_RELAXED); + pgc_page_to_clean_evict_or_release(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_INVALID | PDC_PAGE_RELEASED); + pd->page = page = NULL; + continue; } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - 
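// The new pg_cache_lookup_next() below cross-checks every page two ways: how
// many points fit in the page by its byte length, and how many points its
// timestamps imply; when the bytes allow fewer points than the timestamps
// claim, the page end time is shrunk until both counts agree. A standalone
// sketch of that arithmetic (helper names here are hypothetical; the real
// helpers are page_entries_by_size() and page_entries_by_time()):

#include <stddef.h>
#include <time.h>

static size_t entries_by_size_sketch(size_t page_length, size_t point_size) {
    return page_length / point_size;
}

static size_t entries_by_time_sketch(time_t start_s, time_t end_s, time_t update_every_s) {
    // both ends inclusive: a page from 10 to 19 with update_every 1s holds 10 points
    return (size_t)((end_s - (start_s - update_every_s)) / update_every_s);
}

static time_t fixed_end_time_sketch(time_t start_s, size_t entries_by_size, time_t update_every_s) {
    // keep only as many slots as the bytes can actually hold
    return start_s + (time_t)(entries_by_size - 1) * update_every_s;
}

// Example: a 16-byte page of 4-byte points holds 4 entries; if its header
// claims 10..19 with update_every 1s (10 entries), the end time is fixed to
// 10 + (4 - 1) * 1 = 13, after which both counts agree at 4.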
flags = pg_cache_descr->flags; - if ((flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 0)) { - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); - break; + else if(page_length > RRDENG_BLOCK_SIZE) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_size_skipped, 1, __ATOMIC_RELAXED); + pgc_page_to_clean_evict_or_release(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_INVALID | PDC_PAGE_RELEASED); + pd->page = page = NULL; + continue; } - if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { - struct rrdeng_cmd cmd; + else { + if (unlikely(page_update_every_s <= 0 || page_update_every_s > 86400)) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_update_every_fixed, 1, __ATOMIC_RELAXED); + pd->update_every_s = page_update_every_s = pgc_page_fix_update_every(page, last_update_every_s); + } - uv_rwlock_rdunlock(&page_index->lock); + size_t entries_by_size = page_entries_by_size(page_length, CTX_POINT_SIZE_BYTES(ctx)); + size_t entries_by_time = page_entries_by_time(page_start_time_s, page_end_time_s, page_update_every_s); + if(unlikely(entries_by_size < entries_by_time)) { + time_t fixed_page_end_time_s = (time_t)(page_start_time_s + (entries_by_size - 1) * page_update_every_s); + pd->last_time_s = page_end_time_s = pgc_page_fix_end_time_s(page, fixed_page_end_time_s); + entries_by_time = (page_end_time_s - (page_start_time_s - page_update_every_s)) / page_update_every_s; - cmd.opcode = RRDENG_READ_PAGE; - cmd.read_page.page_cache_descr = descr; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); + internal_fatal(entries_by_size != entries_by_time, "DBENGINE: wrong entries by time again!"); - debug(D_RRDENGINE, "%s: Waiting for page to be asynchronously read from disk:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - while (!(pg_cache_descr->flags & RRD_PAGE_POPULATED)) { - pg_cache_wait_event_unsafe(descr); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_entries_fixed, 1, __ATOMIC_RELAXED); } - /* success */ - /* Downgrade exclusive reference to allow other readers */ - pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - return descr; + *entries = entries_by_time; } - uv_rwlock_rdunlock(&page_index->lock); - debug(D_RRDENGINE, "%s: Waiting for page to be unlocked:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - if (!(flags & RRD_PAGE_POPULATED)) - page_not_in_cache = 1; - - if (pg_cache_timedwait_event_unsafe(descr, default_rrdeng_page_fetch_timeout) == UV_ETIMEDOUT) { - error_report("Page cache timeout while waiting for page %p : retry count = %d", descr, retry_count); - ++retry_count; + + if(unlikely(page_end_time_s < now_s)) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_past_time_skipped, 1, __ATOMIC_RELAXED); + pgc_page_release(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_SKIP | PDC_PAGE_RELEASED); + pd->page = page = NULL; + continue; } - rrdeng_page_descr_mutex_unlock(ctx, descr); - /* reset scan to find again */ - uv_rwlock_rdlock(&page_index->lock); + if(page_from_pd) + // PDC_PAGE_RELEASED is for pdc_destroy() to not release the page twice - the caller will release it + pdc_page_status_set(pd, PDC_PAGE_RELEASED | PDC_PAGE_PROCESSED); + else + pdc_page_status_set(pd, 
PDC_PAGE_PROCESSED); } - uv_rwlock_rdunlock(&page_index->lock); - if (!(flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_set_hot(ctx, descr); - pg_cache_release_pages(ctx, 1); - if (page_not_in_cache) - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - else - rrd_stat_atomic_add(&ctx->stats.pg_cache_hits, 1); - return descr; -} + if(gaps && !pdc->executed_with_gaps) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_executed_with_gaps, 1, __ATOMIC_RELAXED); + pdc->executed_with_gaps = +gaps; -struct pg_cache_page_index *create_page_index(uuid_t *id, struct rrdengine_instance *ctx) -{ - struct pg_cache_page_index *page_index; - - page_index = mallocz(sizeof(*page_index)); - page_index->JudyL_array = (Pvoid_t) NULL; - uuid_copy(page_index->id, *id); - fatal_assert(0 == uv_rwlock_init(&page_index->lock)); - page_index->oldest_time_ut = INVALID_TIME; - page_index->latest_time_ut = INVALID_TIME; - page_index->prev = NULL; - page_index->page_count = 0; - page_index->refcount = 0; - page_index->writers = 0; - page_index->ctx = ctx; - page_index->latest_update_every_s = default_rrd_update_every; - - return page_index; -} + if(page) { + if(waited) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_wait_loaded, 1, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_nowait_loaded, 1, __ATOMIC_RELAXED); + } + else { + if(waited) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_wait_failed, 1, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_nowait_failed, 1, __ATOMIC_RELAXED); + } -static void init_metrics_index(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + if(waited) { + if(preloaded) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_slow_preload_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_slow_disk_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + } + else { + if(preloaded) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_fast_preload_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_fast_disk_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + } - pg_cache->metrics_index.JudyHS_array = (Pvoid_t) NULL; - pg_cache->metrics_index.last_page_index = NULL; - fatal_assert(0 == uv_rwlock_init(&pg_cache->metrics_index.lock)); + return page; } -static void init_replaceQ(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; +void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s, time_t end_time_s, time_t update_every_s, + struct rrdengine_datafile *datafile, uint64_t extent_offset, unsigned extent_size, uint32_t page_length) { + + if(!datafile_acquire(datafile, DATAFILE_ACQUIRE_OPEN_CACHE)) // for open cache item + fatal("DBENGINE: cannot acquire datafile to put page in open cache"); + + struct extent_io_data ext_io_data = { + .file = datafile->file, + .fileno = datafile->fileno, + .pos = extent_offset, + .bytes = extent_size, + .page_length = page_length + }; + + PGC_ENTRY page_entry = { + .hot = true, + .section = section, + .metric_id = metric_id, + .start_time_s = start_time_s, + .end_time_s = end_time_s, + .update_every_s = update_every_s, + .size = 0, + .data = datafile, + .custom_data = (uint8_t *) &ext_io_data, + }; + + internal_fatal(!datafile->fileno, "DBENGINE: 
datafile supplied does not have a number"); + + bool added = true; + PGC_PAGE *page = pgc_page_add_and_acquire(open_cache, page_entry, &added); + int tries = 100; + while(!added && page_entry.end_time_s > pgc_page_end_time_s(page) && tries--) { + pgc_page_to_clean_evict_or_release(open_cache, page); + page = pgc_page_add_and_acquire(open_cache, page_entry, &added); + } - pg_cache->replaceQ.head = NULL; - pg_cache->replaceQ.tail = NULL; - fatal_assert(0 == uv_rwlock_init(&pg_cache->replaceQ.lock)); -} + if(!added) { + datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE); -static void init_committed_page_index(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + internal_fatal(page_entry.end_time_s > pgc_page_end_time_s(page), + "DBENGINE: cannot add longer page to open cache"); + } - pg_cache->committed_page_index.JudyL_array = (Pvoid_t) NULL; - fatal_assert(0 == uv_rwlock_init(&pg_cache->committed_page_index.lock)); - pg_cache->committed_page_index.latest_corr_id = 0; - pg_cache->committed_page_index.nr_committed_pages = 0; + pgc_page_release(open_cache, (PGC_PAGE *)page); } -void init_page_cache(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; +size_t dynamic_open_cache_size(void) { + size_t main_cache_size = pgc_get_wanted_cache_size(main_cache); + size_t target_size = main_cache_size / 100 * 5; - pg_cache->page_descriptors = 0; - pg_cache->populated_pages = 0; - fatal_assert(0 == uv_rwlock_init(&pg_cache->pg_cache_rwlock)); + if(target_size < 2 * 1024 * 1024) + target_size = 2 * 1024 * 1024; - init_metrics_index(ctx); - init_replaceQ(ctx); - init_committed_page_index(ctx); + return target_size; } -void free_page_cache(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index, *prev_page_index; - Word_t Index; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - - // if we are exiting, the OS will recover all memory so do not slow down the shutdown process - // Do the cleanup if we are compiling with NETDATA_INTERNAL_CHECKS - // This affects the reporting of dbengine statistics which are available in real time - // via the /api/v1/dbengine_stats endpoint -#ifndef NETDATA_DBENGINE_FREE - if (netdata_exit) - return; -#endif - Word_t metrics_index_bytes = 0, pages_index_bytes = 0, pages_dirty_index_bytes = 0; - - /* Free committed page index */ - pages_dirty_index_bytes = JudyLFreeArray(&pg_cache->committed_page_index.JudyL_array, PJE0); - fatal_assert(NULL == pg_cache->committed_page_index.JudyL_array); - - for (page_index = pg_cache->metrics_index.last_page_index ; - page_index != NULL ; - page_index = prev_page_index) { +size_t dynamic_extent_cache_size(void) { + size_t main_cache_size = pgc_get_wanted_cache_size(main_cache); + size_t target_size = main_cache_size / 100 * 5; - prev_page_index = page_index->prev; + if(target_size < 3 * 1024 * 1024) + target_size = 3 * 1024 * 1024; - /* Find first page in range */ - Index = (Word_t) 0; - PValue = JudyLFirst(page_index->JudyL_array, &Index, PJE0); - descr = unlikely(NULL == PValue) ? 
NULL : *PValue; - - while (descr != NULL) { - /* Iterate all page descriptors of this metric */ + return target_size; +} - if (descr->pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED) { - /* Check rrdenglocking.c */ - pg_cache_descr = descr->pg_cache_descr; - if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - dbengine_page_free(pg_cache_descr->page); - } - rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); - } - rrdeng_page_descr_freez(descr); +void pgc_and_mrg_initialize(void) +{ + main_mrg = mrg_create(); - PValue = JudyLNext(page_index->JudyL_array, &Index, PJE0); - descr = unlikely(NULL == PValue) ? NULL : *PValue; - } + size_t target_cache_size = (size_t)default_rrdeng_page_cache_mb * 1024ULL * 1024ULL; + size_t main_cache_size = (target_cache_size / 100) * 95; + size_t open_cache_size = 0; + size_t extent_cache_size = (target_cache_size / 100) * 5; - /* Free page index */ - pages_index_bytes += JudyLFreeArray(&page_index->JudyL_array, PJE0); - fatal_assert(NULL == page_index->JudyL_array); - freez(page_index); + if(extent_cache_size < 3 * 1024 * 1024) { + extent_cache_size = 3 * 1024 * 1024; + main_cache_size = target_cache_size - extent_cache_size; } - /* Free metrics index */ - metrics_index_bytes = JudyHSFreeArray(&pg_cache->metrics_index.JudyHS_array, PJE0); - fatal_assert(NULL == pg_cache->metrics_index.JudyHS_array); - info("Freed %lu bytes of memory from page cache.", pages_dirty_index_bytes + pages_index_bytes + metrics_index_bytes); + + main_cache = pgc_create( + "main_cache", + main_cache_size, + main_cache_free_clean_page_callback, + (size_t) rrdeng_pages_per_extent, + main_cache_flush_dirty_page_init_callback, + main_cache_flush_dirty_page_callback, + 10, + 10240, // if there are that many threads, evict so many at once! + 1000, // + 5, // don't delay too much other threads + PGC_OPTIONS_AUTOSCALE, // AUTOSCALE = 2x max hot pages + 0, // 0 = as many as the system cpus + 0 + ); + + open_cache = pgc_create( + "open_cache", + open_cache_size, // the default is 1MB + open_cache_free_clean_page_callback, + 1, + NULL, + open_cache_flush_dirty_page_callback, + 10, + 10240, // if there are that many threads, evict that many at once! + 1000, // + 3, // don't delay too much other threads + PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE, + 0, // 0 = as many as the system cpus + sizeof(struct extent_io_data) + ); + pgc_set_dynamic_target_cache_size_callback(open_cache, dynamic_open_cache_size); + + extent_cache = pgc_create( + "extent_cache", + extent_cache_size, + extent_cache_free_clean_page_callback, + 1, + NULL, + extent_cache_flush_dirty_page_callback, + 5, + 10, // it will lose up to that extents at once! 
+ 100, // + 2, // don't delay too much other threads + PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE, + 0, // 0 = as many as the system cpus + 0 + ); + pgc_set_dynamic_target_cache_size_callback(extent_cache, dynamic_extent_cache_size); } diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index 635b02123..9ab7db078 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -5,66 +5,34 @@ #include "rrdengine.h" +extern struct mrg *main_mrg; +extern struct pgc *main_cache; +extern struct pgc *open_cache; +extern struct pgc *extent_cache; + /* Forward declarations */ struct rrdengine_instance; -struct extent_info; -struct rrdeng_page_descr; #define INVALID_TIME (0) #define MAX_PAGE_CACHE_FETCH_RETRIES (3) #define PAGE_CACHE_FETCH_WAIT_TIMEOUT (3) -/* Page flags */ -#define RRD_PAGE_DIRTY (1LU << 0) -#define RRD_PAGE_LOCKED (1LU << 1) -#define RRD_PAGE_READ_PENDING (1LU << 2) -#define RRD_PAGE_WRITE_PENDING (1LU << 3) -#define RRD_PAGE_POPULATED (1LU << 4) - -struct page_cache_descr { - struct rrdeng_page_descr *descr; /* parent descriptor */ - void *page; - unsigned long flags; - struct page_cache_descr *prev; /* LRU */ - struct page_cache_descr *next; /* LRU */ - - unsigned refcnt; - uv_mutex_t mutex; /* always take it after the page cache lock or after the commit lock */ - uv_cond_t cond; - unsigned waiters; -}; - -/* Page cache descriptor flags, state = 0 means no descriptor */ -#define PG_CACHE_DESCR_ALLOCATED (1LU << 0) -#define PG_CACHE_DESCR_DESTROY (1LU << 1) -#define PG_CACHE_DESCR_LOCKED (1LU << 2) -#define PG_CACHE_DESCR_SHIFT (3) -#define PG_CACHE_DESCR_USERS_MASK (((unsigned long)-1) << PG_CACHE_DESCR_SHIFT) -#define PG_CACHE_DESCR_FLAGS_MASK (((unsigned long)-1) >> (BITS_PER_ULONG - PG_CACHE_DESCR_SHIFT)) +extern struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats; -/* - * Page cache descriptor state bits (works for both 32-bit and 64-bit architectures): - * - * 63 ... 31 ... 
3 | 2 | 1 | 0| - * -----------------------------+------------+------------+-----------| - * number of descriptor users | DESTROY | LOCKED | ALLOCATED | - */ -struct rrdeng_page_descr { - uuid_t *id; /* never changes */ - struct extent_info *extent; - - /* points to ephemeral page cache descriptor if the page resides in the cache */ - struct page_cache_descr *pg_cache_descr; - - /* Compare-And-Swap target for page cache descriptor allocation algorithm */ - volatile unsigned long pg_cache_descr_state; - - /* page information */ +struct page_descr_with_data { + uuid_t *id; + Word_t metric_id; usec_t start_time_ut; usec_t end_time_ut; - uint32_t update_every_s:24; uint8_t type; + uint32_t update_every_s; uint32_t page_length; + uint8_t *page; + + struct { + struct page_descr_with_data *prev; + struct page_descr_with_data *next; + } link; }; #define PAGE_INFO_SCRATCH_SZ (8) @@ -76,179 +44,21 @@ struct rrdeng_page_info { uint32_t page_length; }; -/* returns 1 for success, 0 for failure */ -typedef int pg_cache_page_info_filter_t(struct rrdeng_page_descr *); - -#define PAGE_CACHE_MAX_PRELOAD_PAGES (256) - struct pg_alignment { - uint32_t page_length; + uint32_t page_position; uint32_t refcount; + uint16_t initial_slots; }; -/* maps time ranges to pages */ -struct pg_cache_page_index { - uuid_t id; - /* - * care: JudyL_array indices are converted from useconds to seconds to fit in one word in 32-bit architectures - * TODO: examine if we want to support better granularity than seconds - */ - Pvoid_t JudyL_array; - Word_t page_count; - unsigned short refcount; - unsigned short writers; - uv_rwlock_t lock; - - /* - * Only one effective writer, data deletion workqueue. - * It's also written during the DB loading phase. - */ - usec_t oldest_time_ut; - - /* - * Only one effective writer, data collection thread. - * It's also written by the data deletion workqueue when data collection is disabled for this metric. - */ - usec_t latest_time_ut; - - struct rrdengine_instance *ctx; - uint32_t latest_update_every_s; - - struct pg_cache_page_index *prev; -}; - -/* maps UUIDs to page indices */ -struct pg_cache_metrics_index { - uv_rwlock_t lock; - Pvoid_t JudyHS_array; - struct pg_cache_page_index *last_page_index; -}; - -/* gathers dirty pages to be written on disk */ -struct pg_cache_committed_page_index { - uv_rwlock_t lock; - - Pvoid_t JudyL_array; - - /* - * Dirty page correlation ID is a hint. Dirty pages that are correlated should have - * a small correlation ID difference. Dirty pages in memory should never have the - * same ID at the same time for correctness. - */ - Word_t latest_corr_id; - - unsigned nr_committed_pages; -}; - -/* - * Gathers populated pages to be evicted. - * Relies on page cache descriptors being there as it uses their memory. 
- */ -struct pg_cache_replaceQ { - uv_rwlock_t lock; /* LRU lock */ - - struct page_cache_descr *head; /* LRU */ - struct page_cache_descr *tail; /* MRU */ -}; - -struct page_cache { /* TODO: add statistics */ - uv_rwlock_t pg_cache_rwlock; /* page cache lock */ - - struct pg_cache_metrics_index metrics_index; - struct pg_cache_committed_page_index committed_page_index; - struct pg_cache_replaceQ replaceQ; - - unsigned page_descriptors; - unsigned populated_pages; -}; - -void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_descr *descr); -void pg_cache_wake_up_waiters(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void pg_cache_wait_event_unsafe(struct rrdeng_page_descr *descr); -unsigned long pg_cache_wait_event(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr); -void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr); -void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr); -struct rrdeng_page_descr *pg_cache_create_descr(void); -int pg_cache_try_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access); -void pg_cache_put_unsafe(struct rrdeng_page_descr *descr); -void pg_cache_put(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void pg_cache_insert(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, - struct rrdeng_page_descr *descr); -uint8_t pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, - uint8_t remove_dirty, uint8_t is_exclusive_holder, uuid_t *metric_id); -usec_t pg_cache_oldest_time_in_range(struct rrdengine_instance *ctx, uuid_t *id, - usec_t start_time_ut, usec_t end_time_ut); -void pg_cache_get_filtered_info_prev(struct rrdengine_instance *ctx, struct pg_cache_page_index *page_index, - usec_t point_in_time_ut, pg_cache_page_info_filter_t *filter, - struct rrdeng_page_info *page_info); -struct rrdeng_page_descr *pg_cache_lookup_unpopulated_and_lock(struct rrdengine_instance *ctx, uuid_t *id, - usec_t start_time_ut); -unsigned - pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time_ut, usec_t end_time_ut, - struct rrdeng_page_info **page_info_arrayp, struct pg_cache_page_index **ret_page_indexp); -struct rrdeng_page_descr * - pg_cache_lookup(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t point_in_time_ut); -struct rrdeng_page_descr * - pg_cache_lookup_next(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t start_time_ut, usec_t end_time_ut); -struct pg_cache_page_index *create_page_index(uuid_t *id, struct rrdengine_instance *ctx); -void init_page_cache(struct rrdengine_instance *ctx); -void free_page_cache(struct rrdengine_instance *ctx); -void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr); -void pg_cache_update_metric_times(struct pg_cache_page_index *page_index); -unsigned long pg_cache_hard_limit(struct rrdengine_instance *ctx); -unsigned long pg_cache_soft_limit(struct rrdengine_instance *ctx); -unsigned long pg_cache_committed_hard_limit(struct rrdengine_instance *ctx); - -void rrdeng_page_descr_aral_go_singlethreaded(void); -void rrdeng_page_descr_aral_go_multithreaded(void); -void rrdeng_page_descr_use_malloc(void); -void rrdeng_page_descr_use_mmap(void); -bool rrdeng_page_descr_is_mmap(void); -struct rrdeng_page_descr 
*rrdeng_page_descr_mallocz(void); -void rrdeng_page_descr_freez(struct rrdeng_page_descr *descr); - -static inline void - pg_cache_atomic_get_pg_info(struct rrdeng_page_descr *descr, usec_t *end_time_ut_p, uint32_t *page_lengthp) -{ - usec_t end_time_ut, old_end_time_ut; - uint32_t page_length; - - if (NULL == descr->extent) { - /* this page is currently being modified, get consistent info locklessly */ - do { - end_time_ut = descr->end_time_ut; - __sync_synchronize(); - old_end_time_ut = end_time_ut; - page_length = descr->page_length; - __sync_synchronize(); - end_time_ut = descr->end_time_ut; - __sync_synchronize(); - } while ((end_time_ut != old_end_time_ut || (end_time_ut & 1) != 0)); +struct rrdeng_query_handle; +struct page_details_control; - *end_time_ut_p = end_time_ut; - *page_lengthp = page_length; - } else { - *end_time_ut_p = descr->end_time_ut; - *page_lengthp = descr->page_length; - } -} +void rrdeng_prep_wait(struct page_details_control *pdc); +void rrdeng_prep_query(struct page_details_control *pdc); +void pg_cache_preload(struct rrdeng_query_handle *handle); +struct pgc_page *pg_cache_lookup_next(struct rrdengine_instance *ctx, struct page_details_control *pdc, time_t now_s, time_t last_update_every_s, size_t *entries); +void pgc_and_mrg_initialize(void); -/* The caller must hold a reference to the page and must have already set the new data */ -static inline void pg_cache_atomic_set_pg_info(struct rrdeng_page_descr *descr, usec_t end_time_ut, uint32_t page_length) -{ - fatal_assert(!(end_time_ut & 1)); - __sync_synchronize(); - descr->end_time_ut |= 1; /* mark start of uncertainty period by adding 1 microsecond */ - __sync_synchronize(); - descr->page_length = page_length; - __sync_synchronize(); - descr->end_time_ut = end_time_ut; /* mark end of uncertainty period */ -} +void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s, time_t end_time_s, time_t update_every_s, struct rrdengine_datafile *datafile, uint64_t extent_offset, unsigned extent_size, uint32_t page_length); #endif /* NETDATA_PAGECACHE_H */ diff --git a/database/engine/pdc.c b/database/engine/pdc.c new file mode 100644 index 000000000..8b8e71958 --- /dev/null +++ b/database/engine/pdc.c @@ -0,0 +1,1282 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#define NETDATA_RRD_INTERNALS +#include "pdc.h" + +struct extent_page_details_list { + uv_file file; + uint64_t extent_offset; + uint32_t extent_size; + unsigned number_of_pages_in_JudyL; + Pvoid_t page_details_by_metric_id_JudyL; + struct page_details_control *pdc; + struct rrdengine_datafile *datafile; + + struct rrdeng_cmd *cmd; + bool head_to_datafile_extent_queries_pending_for_extent; + + struct { + struct extent_page_details_list *prev; + struct extent_page_details_list *next; + } query; +}; + +typedef struct datafile_extent_offset_list { + uv_file file; + unsigned fileno; + Pvoid_t extent_pd_list_by_extent_offset_JudyL; +} DEOL; + +// ---------------------------------------------------------------------------- +// PDC cache + +static struct { + struct { + ARAL *ar; + } pdc; + + struct { + ARAL *ar; + } pd; + + struct { + ARAL *ar; + } epdl; + + struct { + ARAL *ar; + } deol; +} pdc_globals = {}; + +void pdc_init(void) { + pdc_globals.pdc.ar = aral_create( + "dbengine-pdc", + sizeof(PDC), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +PDC *pdc_get(void) { + PDC *pdc = aral_mallocz(pdc_globals.pdc.ar); + memset(pdc, 0, sizeof(PDC)); + return pdc; +} + +static void pdc_release(PDC *pdc) { + 
aral_freez(pdc_globals.pdc.ar, pdc); +} + +size_t pdc_cache_size(void) { + return aral_overhead(pdc_globals.pdc.ar) + aral_structures(pdc_globals.pdc.ar); +} + +// ---------------------------------------------------------------------------- +// PD cache + +void page_details_init(void) { + pdc_globals.pd.ar = aral_create( + "dbengine-pd", + sizeof(struct page_details), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +struct page_details *page_details_get(void) { + struct page_details *pd = aral_mallocz(pdc_globals.pd.ar); + memset(pd, 0, sizeof(struct page_details)); + return pd; +} + +static void page_details_release(struct page_details *pd) { + aral_freez(pdc_globals.pd.ar, pd); +} + +size_t pd_cache_size(void) { + return aral_overhead(pdc_globals.pd.ar) + aral_structures(pdc_globals.pd.ar); +} + +// ---------------------------------------------------------------------------- +// epdl cache + +void epdl_init(void) { + pdc_globals.epdl.ar = aral_create( + "dbengine-epdl", + sizeof(EPDL), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +static EPDL *epdl_get(void) { + EPDL *epdl = aral_mallocz(pdc_globals.epdl.ar); + memset(epdl, 0, sizeof(EPDL)); + return epdl; +} + +static void epdl_release(EPDL *epdl) { + aral_freez(pdc_globals.epdl.ar, epdl); +} + +size_t epdl_cache_size(void) { + return aral_overhead(pdc_globals.epdl.ar) + aral_structures(pdc_globals.epdl.ar); +} + +// ---------------------------------------------------------------------------- +// deol cache + +void deol_init(void) { + pdc_globals.deol.ar = aral_create( + "dbengine-deol", + sizeof(DEOL), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +static DEOL *deol_get(void) { + DEOL *deol = aral_mallocz(pdc_globals.deol.ar); + memset(deol, 0, sizeof(DEOL)); + return deol; +} + +static void deol_release(DEOL *deol) { + aral_freez(pdc_globals.deol.ar, deol); +} + +size_t deol_cache_size(void) { + return aral_overhead(pdc_globals.deol.ar) + aral_structures(pdc_globals.deol.ar); +} + +// ---------------------------------------------------------------------------- +// extent with buffer cache + +static struct { + struct { + SPINLOCK spinlock; + struct extent_buffer *available_items; + size_t available; + } protected; + + struct { + size_t allocated; + size_t allocated_bytes; + } atomics; + + size_t max_size; + +} extent_buffer_globals = { + .protected = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .available_items = NULL, + .available = 0, + }, + .atomics = { + .allocated = 0, + .allocated_bytes = 0, + }, + .max_size = MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE, +}; + +void extent_buffer_init(void) { + size_t max_extent_uncompressed = MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE; + size_t max_size = (size_t)LZ4_compressBound(MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE); + if(max_size < max_extent_uncompressed) + max_size = max_extent_uncompressed; + + extent_buffer_globals.max_size = max_size; +} + +void extent_buffer_cleanup1(void) { + struct extent_buffer *item = NULL; + + if(!netdata_spinlock_trylock(&extent_buffer_globals.protected.spinlock)) + return; + + if(extent_buffer_globals.protected.available_items && extent_buffer_globals.protected.available > 1) { + item = extent_buffer_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, item, cache.prev, cache.next); + extent_buffer_globals.protected.available--; + } + + netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + + if(item) { + size_t bytes = sizeof(struct 
extent_buffer) + item->bytes; + freez(item); + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED); + } +} + +struct extent_buffer *extent_buffer_get(size_t size) { + internal_fatal(size > extent_buffer_globals.max_size, "DBENGINE: extent size is too big"); + + struct extent_buffer *eb = NULL; + + if(size < extent_buffer_globals.max_size) + size = extent_buffer_globals.max_size; + + netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock); + if(likely(extent_buffer_globals.protected.available_items)) { + eb = extent_buffer_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next); + extent_buffer_globals.protected.available--; + } + netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + + if(unlikely(eb && eb->bytes < size)) { + size_t bytes = sizeof(struct extent_buffer) + eb->bytes; + freez(eb); + eb = NULL; + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED); + } + + if(unlikely(!eb)) { + size_t bytes = sizeof(struct extent_buffer) + size; + eb = mallocz(bytes); + eb->bytes = size; + __atomic_add_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED); + } + + return eb; +} + +void extent_buffer_release(struct extent_buffer *eb) { + if(unlikely(!eb)) return; + + netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next); + extent_buffer_globals.protected.available++; + netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); +} + +size_t extent_buffer_cache_size(void) { + return __atomic_load_n(&extent_buffer_globals.atomics.allocated_bytes, __ATOMIC_RELAXED); +} + +// ---------------------------------------------------------------------------- +// epdl logic + +static void epdl_destroy(EPDL *epdl) +{ + Pvoid_t *pd_by_start_time_s_JudyL; + Word_t metric_id_index = 0; + bool metric_id_first = true; + while ((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext( + epdl->page_details_by_metric_id_JudyL, + &metric_id_index, &metric_id_first))) + PDCJudyLFreeArray(pd_by_start_time_s_JudyL, PJE0); + + PDCJudyLFreeArray(&epdl->page_details_by_metric_id_JudyL, PJE0); + epdl_release(epdl); +} + +static void epdl_mark_all_not_loaded_pages_as_failed(EPDL *epdl, PDC_PAGE_STATUS tags, size_t *statistics_counter) +{ + size_t pages_matched = 0; + + Word_t metric_id_index = 0; + bool metric_id_first = true; + Pvoid_t *pd_by_start_time_s_JudyL; + while((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(epdl->page_details_by_metric_id_JudyL, &metric_id_index, &metric_id_first))) { + + Word_t start_time_index = 0; + bool start_time_first = true; + Pvoid_t *PValue; + while ((PValue = PDCJudyLFirstThenNext(*pd_by_start_time_s_JudyL, &start_time_index, &start_time_first))) { + struct page_details *pd = *PValue; + + if(!pd->page && !pdc_page_status_check(pd, PDC_PAGE_FAILED|PDC_PAGE_READY)) { + pdc_page_status_set(pd, PDC_PAGE_FAILED | tags); + pages_matched++; + } + } + } + + if(pages_matched && statistics_counter) + __atomic_add_fetch(statistics_counter, pages_matched, __ATOMIC_RELAXED); +} +/* +static bool 
epdl_check_if_pages_are_already_in_cache(struct rrdengine_instance *ctx, EPDL *epdl, PDC_PAGE_STATUS tags) +{ + size_t count_remaining = 0; + size_t found = 0; + + Word_t metric_id_index = 0; + bool metric_id_first = true; + Pvoid_t *pd_by_start_time_s_JudyL; + while((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(epdl->page_details_by_metric_id_JudyL, &metric_id_index, &metric_id_first))) { + + Word_t start_time_index = 0; + bool start_time_first = true; + Pvoid_t *PValue; + while ((PValue = PDCJudyLFirstThenNext(*pd_by_start_time_s_JudyL, &start_time_index, &start_time_first))) { + struct page_details *pd = *PValue; + if (pd->page) + continue; + + pd->page = pgc_page_get_and_acquire(main_cache, (Word_t) ctx, pd->metric_id, pd->first_time_s, PGC_SEARCH_EXACT); + if (pd->page) { + found++; + pdc_page_status_set(pd, PDC_PAGE_READY | tags); + } + else + count_remaining++; + } + } + + if(found) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_preloaded, found, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, found, __ATOMIC_RELAXED); + } + + return count_remaining == 0; +} +*/ + +// ---------------------------------------------------------------------------- +// PDC logic + +static void pdc_destroy(PDC *pdc) { + mrg_metric_release(main_mrg, pdc->metric); + completion_destroy(&pdc->prep_completion); + completion_destroy(&pdc->page_completion); + + Pvoid_t *PValue; + struct page_details *pd; + Word_t time_index = 0; + bool first_then_next = true; + size_t unroutable = 0, cancelled = 0; + while((PValue = PDCJudyLFirstThenNext(pdc->page_list_JudyL, &time_index, &first_then_next))) { + pd = *PValue; + + // no need for atomics here - we are done... + PDC_PAGE_STATUS status = pd->status; + + if(status & PDC_PAGE_DATAFILE_ACQUIRED) { + datafile_release(pd->datafile.ptr, DATAFILE_ACQUIRE_PAGE_DETAILS); + pd->datafile.ptr = NULL; + } + + internal_fatal(pd->datafile.ptr, "DBENGINE: page details has a datafile.ptr that is not released."); + + if(!pd->page && !(status & (PDC_PAGE_READY | PDC_PAGE_FAILED | PDC_PAGE_RELEASED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_CANCELLED))) { + // pdc_page_status_set(pd, PDC_PAGE_FAILED); + unroutable++; + } + else if(!pd->page && (status & PDC_PAGE_CANCELLED)) + cancelled++; + + if(pd->page && !(status & PDC_PAGE_RELEASED)) { + pgc_page_release(main_cache, pd->page); + // pdc_page_status_set(pd, PDC_PAGE_RELEASED); + } + + page_details_release(pd); + } + + PDCJudyLFreeArray(&pdc->page_list_JudyL, PJE0); + + __atomic_sub_fetch(&rrdeng_cache_efficiency_stats.currently_running_queries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&pdc->ctx->atomic.inflight_queries, 1, __ATOMIC_RELAXED); + pdc_release(pdc); + + if(unroutable) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_unroutable, unroutable, __ATOMIC_RELAXED); + + if(cancelled) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_cancelled, cancelled, __ATOMIC_RELAXED); +} + +void pdc_acquire(PDC *pdc) { + netdata_spinlock_lock(&pdc->refcount_spinlock); + + if(pdc->refcount < 1) + fatal("DBENGINE: pdc is not referenced and cannot be acquired"); + + pdc->refcount++; + netdata_spinlock_unlock(&pdc->refcount_spinlock); +} + +bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router __maybe_unused) { + if(unlikely(!pdc)) + return true; + + netdata_spinlock_lock(&pdc->refcount_spinlock); + + if(pdc->refcount <= 0) + fatal("DBENGINE: pdc is not referenced and cannot be released"); + + 
pdc->refcount--; + + if (pdc->refcount <= 1 && worker) { + // when 1 refcount is remaining, and we are a worker, + // we can mark the job completed: + // - if the remaining refcount is from the query caller, we will wake it up + // - if the remaining refcount is from another worker, the query thread is already away + completion_mark_complete(&pdc->page_completion); + } + + if (pdc->refcount == 0) { + netdata_spinlock_unlock(&pdc->refcount_spinlock); + pdc_destroy(pdc); + return true; + } + + netdata_spinlock_unlock(&pdc->refcount_spinlock); + return false; +} + +void epdl_cmd_queued(void *epdl_ptr, struct rrdeng_cmd *cmd) { + EPDL *epdl = epdl_ptr; + epdl->cmd = cmd; +} + +void epdl_cmd_dequeued(void *epdl_ptr) { + EPDL *epdl = epdl_ptr; + epdl->cmd = NULL; +} + +static struct rrdeng_cmd *epdl_get_cmd(void *epdl_ptr) { + EPDL *epdl = epdl_ptr; + return epdl->cmd; +} + +static bool epdl_pending_add(EPDL *epdl) { + bool added_new; + + netdata_spinlock_lock(&epdl->datafile->extent_queries.spinlock); + Pvoid_t *PValue = JudyLIns(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0); + internal_fatal(!PValue || PValue == PJERR, "DBENGINE: corrupted pending extent judy"); + + EPDL *base = *PValue; + + if(!base) { + added_new = true; + epdl->head_to_datafile_extent_queries_pending_for_extent = true; + } + else { + added_new = false; + epdl->head_to_datafile_extent_queries_pending_for_extent = false; + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_extent_merged, 1, __ATOMIC_RELAXED); + + if(base->pdc->priority > epdl->pdc->priority) + rrdeng_req_cmd(epdl_get_cmd, base, epdl->pdc->priority); + } + + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, epdl, query.prev, query.next); + *PValue = base; + + netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock); + + return added_new; +} + +static void epdl_pending_del(EPDL *epdl) { + netdata_spinlock_lock(&epdl->datafile->extent_queries.spinlock); + if(epdl->head_to_datafile_extent_queries_pending_for_extent) { + epdl->head_to_datafile_extent_queries_pending_for_extent = false; + int rc = JudyLDel(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0); + (void) rc; + internal_fatal(!rc, "DBENGINE: epdl not found in pending list"); + } + netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock); +} + +void pdc_to_epdl_router(struct rrdengine_instance *ctx, PDC *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list) +{ + Pvoid_t *PValue; + Pvoid_t *PValue1; + Pvoid_t *PValue2; + Word_t time_index = 0; + struct page_details *pd = NULL; + + // this is the entire page list + // Let's do some deduplication + // 1. Per datafile + // 2. Per extent + // 3.
Pages per extent will be added to the cache either as acquired or not + + Pvoid_t JudyL_datafile_list = NULL; + + DEOL *deol; + EPDL *epdl; + + if (pdc->page_list_JudyL) { + bool first_then_next = true; + while((PValue = PDCJudyLFirstThenNext(pdc->page_list_JudyL, &time_index, &first_then_next))) { + pd = *PValue; + + internal_fatal(!pd, + "DBENGINE: pdc page list has an empty page details entry"); + + if (!(pd->status & PDC_PAGE_DISK_PENDING)) + continue; + + internal_fatal(!(pd->status & PDC_PAGE_DATAFILE_ACQUIRED), + "DBENGINE: page details has not acquired the datafile"); + + internal_fatal((pd->status & (PDC_PAGE_READY | PDC_PAGE_FAILED)), + "DBENGINE: page details has disk pending flag but it is ready/failed"); + + internal_fatal(pd->page, + "DBENGINE: page details has a page linked to it, but it is marked for loading"); + + PValue1 = PDCJudyLIns(&JudyL_datafile_list, pd->datafile.fileno, PJE0); + if (PValue1 && !*PValue1) { + *PValue1 = deol = deol_get(); + deol->extent_pd_list_by_extent_offset_JudyL = NULL; + deol->fileno = pd->datafile.fileno; + } + else + deol = *PValue1; + + PValue2 = PDCJudyLIns(&deol->extent_pd_list_by_extent_offset_JudyL, pd->datafile.extent.pos, PJE0); + if (PValue2 && !*PValue2) { + *PValue2 = epdl = epdl_get(); + epdl->page_details_by_metric_id_JudyL = NULL; + epdl->number_of_pages_in_JudyL = 0; + epdl->file = pd->datafile.file; + epdl->extent_offset = pd->datafile.extent.pos; + epdl->extent_size = pd->datafile.extent.bytes; + epdl->datafile = pd->datafile.ptr; + } + else + epdl = *PValue2; + + epdl->number_of_pages_in_JudyL++; + + Pvoid_t *pd_by_first_time_s_judyL = PDCJudyLIns(&epdl->page_details_by_metric_id_JudyL, pd->metric_id, PJE0); + Pvoid_t *pd_pptr = PDCJudyLIns(pd_by_first_time_s_judyL, pd->first_time_s, PJE0); + *pd_pptr = pd; + } + + size_t extent_list_no = 0; + Word_t datafile_no = 0; + first_then_next = true; + while((PValue = PDCJudyLFirstThenNext(JudyL_datafile_list, &datafile_no, &first_then_next))) { + deol = *PValue; + + bool first_then_next_extent = true; + Word_t pos = 0; + while ((PValue = PDCJudyLFirstThenNext(deol->extent_pd_list_by_extent_offset_JudyL, &pos, &first_then_next_extent))) { + epdl = *PValue; + internal_fatal(!epdl, "DBENGINE: extent_list is not populated properly"); + + // The extent page list can be dispatched to a worker + // It will need to populate the cache with "acquired" pages that are in the list (pd) only + // the rest of the extent pages will be added to the cache but not acquired + + pdc_acquire(pdc); // we do this for the next worker: do_read_extent_work() + epdl->pdc = pdc; + + if(epdl_pending_add(epdl)) { + if (extent_list_no++ == 0) + exec_first_extent_list(ctx, epdl, pdc->priority); + else + exec_rest_extent_list(ctx, epdl, pdc->priority); + } + } + PDCJudyLFreeArray(&deol->extent_pd_list_by_extent_offset_JudyL, PJE0); + deol_release(deol); + } + PDCJudyLFreeArray(&JudyL_datafile_list, PJE0); + } + + pdc_release_and_destroy_if_unreferenced(pdc, true, true); +} + +void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags) { + if(flags & RRDENG_PAGE_PAST_COLLECTION) + buffer_strcat(wb, "PAST_COLLECTION "); + if(flags & RRDENG_PAGE_REPEATED_COLLECTION) + buffer_strcat(wb, "REPEATED_COLLECTION "); + if(flags & RRDENG_PAGE_BIG_GAP) + buffer_strcat(wb, "BIG_GAP "); + if(flags & RRDENG_PAGE_GAP) + buffer_strcat(wb, "GAP "); + if(flags & RRDENG_PAGE_FUTURE_POINT) + buffer_strcat(wb, "FUTURE_POINT "); + if(flags & RRDENG_PAGE_CREATED_IN_FUTURE) + buffer_strcat(wb, "CREATED_IN_FUTURE "); + 
if(flags & RRDENG_PAGE_COMPLETED_IN_FUTURE) + buffer_strcat(wb, "COMPLETED_IN_FUTURE "); + if(flags & RRDENG_PAGE_UNALIGNED) + buffer_strcat(wb, "UNALIGNED "); + if(flags & RRDENG_PAGE_CONFLICT) + buffer_strcat(wb, "CONFLICT "); + if(flags & RRDENG_PAGE_FULL) + buffer_strcat(wb, "PAGE_FULL"); + if(flags & RRDENG_PAGE_COLLECT_FINALIZE) + buffer_strcat(wb, "COLLECT_FINALIZE"); + if(flags & RRDENG_PAGE_UPDATE_EVERY_CHANGE) + buffer_strcat(wb, "UPDATE_EVERY_CHANGE"); + if(flags & RRDENG_PAGE_STEP_TOO_SMALL) + buffer_strcat(wb, "STEP_TOO_SMALL"); + if(flags & RRDENG_PAGE_STEP_UNALIGNED) + buffer_strcat(wb, "STEP_UNALIGNED"); +} + +inline VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error) { + return validate_page( + (uuid_t *)descr->uuid, + (time_t) (descr->start_time_ut / USEC_PER_SEC), + (time_t) (descr->end_time_ut / USEC_PER_SEC), + 0, + descr->page_length, + descr->type, + 0, + now_s, + overwrite_zero_update_every_s, + have_read_error, + "loaded", 0); +} + +VALIDATED_PAGE_DESCRIPTOR validate_page( + uuid_t *uuid, + time_t start_time_s, + time_t end_time_s, + time_t update_every_s, // can be zero, if unknown + size_t page_length, + uint8_t page_type, + size_t entries, // can be zero, if unknown + time_t now_s, // can be zero, to disable future timestamp check + time_t overwrite_zero_update_every_s, // can be zero, if unknown + bool have_read_error, + const char *msg, + RRDENG_COLLECT_PAGE_FLAGS flags) { + + VALIDATED_PAGE_DESCRIPTOR vd = { + .start_time_s = start_time_s, + .end_time_s = end_time_s, + .update_every_s = update_every_s, + .page_length = page_length, + .type = page_type, + .is_valid = true, + }; + + // always calculate entries by size + vd.point_size = page_type_size[vd.type]; + vd.entries = page_entries_by_size(vd.page_length, vd.point_size); + + // allow to be called without entries (when loading pages from disk) + if(!entries) + entries = vd.entries; + + // allow to be called without update every (when loading pages from disk) + if(!update_every_s) { + vd.update_every_s = (vd.entries > 1) ? 
((vd.end_time_s - vd.start_time_s) / (time_t) (vd.entries - 1)) + : overwrite_zero_update_every_s; + + update_every_s = vd.update_every_s; + } + + // another such set of checks exists in + // update_metric_retention_and_granularity_by_uuid() + + bool updated = false; + + if( have_read_error || + vd.page_length == 0 || + vd.page_length > RRDENG_BLOCK_SIZE || + vd.start_time_s > vd.end_time_s || + (now_s && vd.end_time_s > now_s) || + vd.start_time_s == 0 || + vd.end_time_s == 0 || + (vd.start_time_s == vd.end_time_s && vd.entries > 1) || + (vd.update_every_s == 0 && vd.entries > 1) + ) + vd.is_valid = false; + + else { + if(unlikely(vd.entries != entries || vd.update_every_s != update_every_s)) + updated = true; + + if (likely(vd.update_every_s)) { + size_t entries_by_time = page_entries_by_time(vd.start_time_s, vd.end_time_s, vd.update_every_s); + + if (vd.entries != entries_by_time) { + if (overwrite_zero_update_every_s < vd.update_every_s) + vd.update_every_s = overwrite_zero_update_every_s; + + time_t new_end_time_s = (time_t)(vd.start_time_s + (vd.entries - 1) * vd.update_every_s); + + if(new_end_time_s <= vd.end_time_s) { + // end time is wrong + vd.end_time_s = new_end_time_s; + } + else { + // update every is wrong + vd.update_every_s = overwrite_zero_update_every_s; + vd.end_time_s = (time_t)(vd.start_time_s + (vd.entries - 1) * vd.update_every_s); + } + + updated = true; + } + } + else if(overwrite_zero_update_every_s) { + vd.update_every_s = overwrite_zero_update_every_s; + updated = true; + } + } + + if(unlikely(!vd.is_valid || updated)) { +#ifndef NETDATA_INTERNAL_CHECKS + error_limit_static_global_var(erl, 1, 0); +#endif + char uuid_str[UUID_STR_LEN + 1]; + uuid_unparse(*uuid, uuid_str); + + BUFFER *wb = NULL; + + if(flags) { + wb = buffer_create(0, NULL); + collect_page_flags_to_buffer(wb, flags); + } + + if(!vd.is_valid) { +#ifdef NETDATA_INTERNAL_CHECKS + internal_error(true, +#else + error_limit(&erl, +#endif + "DBENGINE: metric '%s' %s invalid page of type %u " + "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s)", + uuid_str, msg, vd.type, + vd.start_time_s, vd.end_time_s, now_s, vd.update_every_s, vd.page_length, vd.entries, wb?buffer_tostring(wb):"" + ); + } + else { + const char *err_valid = (vd.is_valid) ? "" : "found invalid, "; + const char *err_start = (vd.start_time_s == start_time_s) ? "" : "start time updated, "; + const char *err_end = (vd.end_time_s == end_time_s) ? "" : "end time updated, "; + const char *err_update = (vd.update_every_s == update_every_s) ? "" : "update every updated, "; + const char *err_length = (vd.page_length == page_length) ? "" : "page length updated, "; + const char *err_entries = (vd.entries == entries) ? "" : "entries updated, "; + const char *err_future = (now_s && vd.end_time_s <= now_s) ? 
"" : "future end time, "; + +#ifdef NETDATA_INTERNAL_CHECKS + internal_error(true, +#else + error_limit(&erl, +#endif + "DBENGINE: metric '%s' %s page of type %u " + "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s), " + "found inconsistent - the right is " + "from %ld to %ld, update every %ld, page length %zu, entries %zu: " + "%s%s%s%s%s%s%s", + uuid_str, msg, vd.type, + start_time_s, end_time_s, now_s, update_every_s, page_length, entries, wb?buffer_tostring(wb):"", + vd.start_time_s, vd.end_time_s, vd.update_every_s, vd.page_length, vd.entries, + err_valid, err_start, err_end, err_update, err_length, err_entries, err_future + ); + } + + buffer_free(wb); + } + + return vd; +} + +static inline struct page_details *epdl_get_pd_load_link_list_from_metric_start_time(EPDL *epdl, Word_t metric_id, time_t start_time_s) { + + if(unlikely(epdl->head_to_datafile_extent_queries_pending_for_extent)) + // stop appending more pages to this epdl + epdl_pending_del(epdl); + + struct page_details *pd_list = NULL; + + for(EPDL *ep = epdl; ep ;ep = ep->query.next) { + Pvoid_t *pd_by_start_time_s_judyL = PDCJudyLGet(ep->page_details_by_metric_id_JudyL, metric_id, PJE0); + internal_fatal(pd_by_start_time_s_judyL == PJERR, "DBENGINE: corrupted extent metrics JudyL"); + + if (unlikely(pd_by_start_time_s_judyL && *pd_by_start_time_s_judyL)) { + Pvoid_t *pd_pptr = PDCJudyLGet(*pd_by_start_time_s_judyL, start_time_s, PJE0); + internal_fatal(pd_pptr == PJERR, "DBENGINE: corrupted metric page details JudyHS"); + + if(likely(pd_pptr && *pd_pptr)) { + struct page_details *pd = *pd_pptr; + internal_fatal(metric_id != pd->metric_id, "DBENGINE: metric ids do not match"); + + if(likely(!pd->page)) { + if (unlikely(__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED))) + pdc_page_status_set(pd, PDC_PAGE_FAILED | PDC_PAGE_CANCELLED); + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pd_list, pd, load.prev, load.next); + } + } + } + } + + return pd_list; +} + +static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL *epdl, struct rrdeng_extent_page_descr *descr, const char *msg) { + char uuid[UUID_STR_LEN] = ""; + time_t start_time_s = 0; + time_t end_time_s = 0; + bool used_epdl = false; + bool used_descr = false; + + if (descr) { + start_time_s = (time_t)(descr->start_time_ut / USEC_PER_SEC); + end_time_s = (time_t)(descr->end_time_ut / USEC_PER_SEC); + uuid_unparse_lower(descr->uuid, uuid); + used_descr = true; + } + else if (epdl) { + struct page_details *pd = NULL; + + Word_t start = 0; + Pvoid_t *pd_by_start_time_s_judyL = PDCJudyLFirst(epdl->page_details_by_metric_id_JudyL, &start, PJE0); + if(pd_by_start_time_s_judyL) { + start = 0; + Pvoid_t *pd_pptr = PDCJudyLFirst(*pd_by_start_time_s_judyL, &start, PJE0); + if(pd_pptr) { + pd = *pd_pptr; + start_time_s = pd->first_time_s; + end_time_s = pd->last_time_s; + METRIC *metric = (METRIC *)pd->metric_id; + uuid_t *u = mrg_metric_uuid(main_mrg, metric); + uuid_unparse_lower(*u, uuid); + used_epdl = true; + } + } + } + + if(!used_epdl && !used_descr && epdl && epdl->pdc) { + start_time_s = epdl->pdc->start_time_s; + end_time_s = epdl->pdc->end_time_s; + } + + char start_time_str[LOG_DATE_LENGTH + 1] = ""; + if(start_time_s) + log_date(start_time_str, LOG_DATE_LENGTH, start_time_s); + + char end_time_str[LOG_DATE_LENGTH + 1] = ""; + if(end_time_s) + log_date(end_time_str, LOG_DATE_LENGTH, end_time_s); + + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, + "DBENGINE: error while reading extent 
from datafile %u of tier %d, at offset %" PRIu64 " (%u bytes) " + "%s from %ld (%s) to %ld (%s) %s%s: " + "%s", + epdl->datafile->fileno, ctx->config.tier, + epdl->extent_offset, epdl->extent_size, + used_epdl ? "to extract page (PD)" : used_descr ? "expected page (DESCR)" : "part of a query (PDC)", + start_time_s, start_time_str, end_time_s, end_time_str, + used_epdl || used_descr ? " of metric " : "", + used_epdl || used_descr ? uuid : "", + msg); +} + +static bool epdl_populate_pages_from_extent_data( + struct rrdengine_instance *ctx, + void *data, + size_t data_length, + EPDL *epdl, + bool worker, + PDC_PAGE_STATUS tags, + bool cached_extent) +{ + int ret; + unsigned i, count; + void *uncompressed_buf = NULL; + uint32_t payload_length, payload_offset, trailer_offset, uncompressed_payload_length = 0; + bool have_read_error = false; + /* persistent structures */ + struct rrdeng_df_extent_header *header; + struct rrdeng_df_extent_trailer *trailer; + struct extent_buffer *eb = NULL; + uLong crc; + + bool can_use_data = true; + if(data_length < sizeof(*header) + sizeof(header->descr[0]) + sizeof(*trailer)) { + can_use_data = false; + + // added to satisfy the requirements of older compilers (prevent warnings) + payload_length = 0; + payload_offset = 0; + trailer_offset = 0; + count = 0; + header = NULL; + trailer = NULL; + } + else { + header = data; + payload_length = header->payload_length; + count = header->number_of_pages; + payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count; + trailer_offset = data_length - sizeof(*trailer); + trailer = data + trailer_offset; + } + + if( !can_use_data || + count < 1 || + count > MAX_PAGES_PER_EXTENT || + (header->compression_algorithm != RRD_NO_COMPRESSION && header->compression_algorithm != RRD_LZ4) || + (payload_length != trailer_offset - payload_offset) || + (data_length != payload_offset + payload_length + sizeof(*trailer)) + ) { + epdl_extent_loading_error_log(ctx, epdl, NULL, "header is INVALID"); + return false; + } + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, data, epdl->extent_size - sizeof(*trailer)); + ret = crc32cmp(trailer->checksum, crc); + if (unlikely(ret)) { + ctx_io_error(ctx); + have_read_error = true; + epdl_extent_loading_error_log(ctx, epdl, NULL, "CRC32 checksum FAILED"); + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_DECOMPRESSION); + + if (likely(!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm)) { + // find the uncompressed extent size + uncompressed_payload_length = 0; + for (i = 0; i < count; ++i) { + size_t page_length = header->descr[i].page_length; + if(page_length > RRDENG_BLOCK_SIZE) { + have_read_error = true; + break; + } + + uncompressed_payload_length += header->descr[i].page_length; + } + + if(unlikely(uncompressed_payload_length > MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE)) + have_read_error = true; + + if(likely(!have_read_error)) { + eb = extent_buffer_get(uncompressed_payload_length); + uncompressed_buf = eb->data; + + ret = LZ4_decompress_safe(data + payload_offset, uncompressed_buf, + (int) payload_length, (int) uncompressed_payload_length); + + __atomic_add_fetch(&ctx->stats.before_decompress_bytes, payload_length, __ATOMIC_RELAXED); + __atomic_add_fetch(&ctx->stats.after_decompress_bytes, ret, __ATOMIC_RELAXED); + } + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_LOOKUP); + + size_t stats_data_from_main_cache = 0; + size_t stats_data_from_extent = 0; + size_t stats_load_compressed = 0; + size_t stats_load_uncompressed = 0; + size_t 
stats_load_invalid_page = 0; + size_t stats_cache_hit_while_inserting = 0; + + uint32_t page_offset = 0, page_length; + time_t now_s = max_acceptable_collected_time(); + for (i = 0; i < count; i++, page_offset += page_length) { + page_length = header->descr[i].page_length; + time_t start_time_s = (time_t) (header->descr[i].start_time_ut / USEC_PER_SEC); + + if(!page_length || !start_time_s) { + char log[200 + 1]; + snprintfz(log, 200, "page %u (out of %u) is EMPTY", i, count); + epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); + continue; + } + + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, &header->descr[i].uuid, (Word_t)ctx); + Word_t metric_id = (Word_t)metric; + if(!metric) { + char log[200 + 1]; + snprintfz(log, 200, "page %u (out of %u) has unknown UUID", i, count); + epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); + continue; + } + mrg_metric_release(main_mrg, metric); + + struct page_details *pd_list = epdl_get_pd_load_link_list_from_metric_start_time(epdl, metric_id, start_time_s); + if(likely(!pd_list)) + continue; + + VALIDATED_PAGE_DESCRIPTOR vd = validate_extent_page_descr( + &header->descr[i], now_s, + (pd_list) ? pd_list->update_every_s : 0, + have_read_error); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_ALLOCATION); + + void *page_data; + + if (unlikely(!vd.is_valid)) { + page_data = DBENGINE_EMPTY_PAGE; + stats_load_invalid_page++; + } + else { + if (RRD_NO_COMPRESSION == header->compression_algorithm) { + page_data = dbengine_page_alloc(vd.page_length); + memcpy(page_data, data + payload_offset + page_offset, (size_t) vd.page_length); + stats_load_uncompressed++; + } + else { + if (unlikely(page_offset + vd.page_length > uncompressed_payload_length)) { + char log[200 + 1]; + snprintfz(log, 200, "page %u (out of %u) offset %u + page length %zu, " + "exceeds the uncompressed buffer size %u", + i, count, page_offset, vd.page_length, uncompressed_payload_length); + epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); + + page_data = DBENGINE_EMPTY_PAGE; + stats_load_invalid_page++; + } + else { + page_data = dbengine_page_alloc(vd.page_length); + memcpy(page_data, uncompressed_buf + page_offset, vd.page_length); + stats_load_compressed++; + } + } + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_POPULATION); + + PGC_ENTRY page_entry = { + .hot = false, + .section = (Word_t)ctx, + .metric_id = metric_id, + .start_time_s = vd.start_time_s, + .end_time_s = vd.end_time_s, + .update_every_s = vd.update_every_s, + .size = (size_t) ((page_data == DBENGINE_EMPTY_PAGE) ? 0 : vd.page_length), + .data = page_data + }; + + bool added = true; + PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + if (false == added) { + dbengine_page_free(page_data, vd.page_length); + stats_cache_hit_while_inserting++; + stats_data_from_main_cache++; + } + else + stats_data_from_extent++; + + struct page_details *pd = pd_list; + do { + if(pd != pd_list) + pgc_page_dup(main_cache, page); + + pd->page = page; + pd->page_length = pgc_page_data_size(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_READY | tags | ((page_data == DBENGINE_EMPTY_PAGE) ? 
PDC_PAGE_EMPTY : 0)); + + pd = pd->load.next; + } while(pd); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_LOOKUP); + } + + if(stats_data_from_main_cache) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, stats_data_from_main_cache, __ATOMIC_RELAXED); + + if(cached_extent) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_extent_cache, stats_data_from_extent, __ATOMIC_RELAXED); + else { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_disk, stats_data_from_extent, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.extents_loaded_from_disk, 1, __ATOMIC_RELAXED); + } + + if(stats_cache_hit_while_inserting) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_loaded_but_cache_hit_while_inserting, stats_cache_hit_while_inserting, __ATOMIC_RELAXED); + + if(stats_load_compressed) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_compressed, stats_load_compressed, __ATOMIC_RELAXED); + + if(stats_load_uncompressed) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_uncompressed, stats_load_uncompressed, __ATOMIC_RELAXED); + + if(stats_load_invalid_page) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_invalid_page_in_extent, stats_load_invalid_page, __ATOMIC_RELAXED); + + if(worker) + worker_is_idle(); + + extent_buffer_release(eb); + + return true; +} + +static inline void *datafile_extent_read(struct rrdengine_instance *ctx, uv_file file, unsigned pos, unsigned size_bytes) +{ + void *buffer; + uv_fs_t request; + + unsigned real_io_size = ALIGN_BYTES_CEILING(size_bytes); + int ret = posix_memalign(&buffer, RRDFILE_ALIGNMENT, real_io_size); + if (unlikely(ret)) + fatal("DBENGINE: posix_memalign(): %s", strerror(ret)); + + uv_buf_t iov = uv_buf_init(buffer, real_io_size); + ret = uv_fs_read(NULL, &request, file, &iov, 1, pos, NULL); + if (unlikely(-1 == ret)) { + ctx_io_error(ctx); + posix_memfree(buffer); + buffer = NULL; + } + else + ctx_io_read_op_bytes(ctx, real_io_size); + + uv_fs_req_cleanup(&request); + + return buffer; +} + +static inline void datafile_extent_read_free(void *buffer) { + posix_memfree(buffer); +} + +void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker) { + size_t *statistics_counter = NULL; + PDC_PAGE_STATUS not_loaded_pages_tag = 0, loaded_pages_tag = 0; + + bool should_stop = __atomic_load_n(&epdl->pdc->workers_should_stop, __ATOMIC_RELAXED); + for(EPDL *ep = epdl->query.next; ep ;ep = ep->query.next) { + internal_fatal(ep->datafile != epdl->datafile, "DBENGINE: datafiles do not match"); + internal_fatal(ep->extent_offset != epdl->extent_offset, "DBENGINE: extent offsets do not match"); + internal_fatal(ep->extent_size != epdl->extent_size, "DBENGINE: extent sizes do not match"); + internal_fatal(ep->file != epdl->file, "DBENGINE: files do not match"); + + if(!__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED)) { + should_stop = false; + break; + } + } + + if(unlikely(should_stop)) { + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_cancelled; + not_loaded_pages_tag = PDC_PAGE_CANCELLED; + goto cleanup; + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP); + + bool extent_found_in_cache = false; + + void *extent_compressed_data = NULL; + PGC_PAGE *extent_cache_page = pgc_page_get_and_acquire( + extent_cache, (Word_t)ctx, + (Word_t)epdl->datafile->fileno, (time_t)epdl->extent_offset, + PGC_SEARCH_EXACT); + 
+ if(extent_cache_page) { + extent_compressed_data = pgc_page_data(extent_cache_page); + internal_fatal(epdl->extent_size != pgc_page_data_size(extent_cache, extent_cache_page), + "DBENGINE: cache size does not match the expected size"); + + loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_CACHE; + not_loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_CACHE; + extent_found_in_cache = true; + } + else { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_MMAP); + + void *extent_data = datafile_extent_read(ctx, epdl->file, epdl->extent_offset, epdl->extent_size); + if(extent_data != NULL) { + + void *copied_extent_compressed_data = dbengine_extent_alloc(epdl->extent_size); + memcpy(copied_extent_compressed_data, extent_data, epdl->extent_size); + datafile_extent_read_free(extent_data); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP); + + bool added = false; + extent_cache_page = pgc_page_add_and_acquire(extent_cache, (PGC_ENTRY) { + .hot = false, + .section = (Word_t) ctx, + .metric_id = (Word_t) epdl->datafile->fileno, + .start_time_s = (time_t) epdl->extent_offset, + .size = epdl->extent_size, + .end_time_s = 0, + .update_every_s = 0, + .data = copied_extent_compressed_data, + }, &added); + + if (!added) { + dbengine_extent_free(copied_extent_compressed_data, epdl->extent_size); + internal_fatal(epdl->extent_size != pgc_page_data_size(extent_cache, extent_cache_page), + "DBENGINE: cache size does not match the expected size"); + } + + extent_compressed_data = pgc_page_data(extent_cache_page); + + loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_DISK; + not_loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_DISK; + } + } + + if(extent_compressed_data) { + // Need to decompress and then process the pagelist + bool extent_used = epdl_populate_pages_from_extent_data( + ctx, extent_compressed_data, epdl->extent_size, + epdl, worker, loaded_pages_tag, extent_found_in_cache); + + if(extent_used) { + // since the extent was used, all the pages that are not + // loaded from this extent, were not found in the extent + not_loaded_pages_tag |= PDC_PAGE_FAILED_NOT_IN_EXTENT; + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_not_found; + } + else { + not_loaded_pages_tag |= PDC_PAGE_FAILED_INVALID_EXTENT; + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_invalid_extent; + } + } + else { + not_loaded_pages_tag |= PDC_PAGE_FAILED_TO_MAP_EXTENT; + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_cant_mmap_extent; + } + + if(extent_cache_page) + pgc_page_release(extent_cache, extent_cache_page); + +cleanup: + // remove it from the datafile extent_queries + // this can be called multiple times safely + epdl_pending_del(epdl); + + // mark all pending pages as failed + for(EPDL *ep = epdl; ep ;ep = ep->query.next) { + epdl_mark_all_not_loaded_pages_as_failed( + ep, not_loaded_pages_tag, statistics_counter); + } + + for(EPDL *ep = epdl, *next = NULL; ep ; ep = next) { + next = ep->query.next; + + completion_mark_complete_a_job(&ep->pdc->page_completion); + pdc_release_and_destroy_if_unreferenced(ep->pdc, true, false); + + // Free the Judy that holds the requested pagelist and the extents + epdl_destroy(ep); + } + + if(worker) + worker_is_idle(); +} diff --git a/database/engine/pdc.h b/database/engine/pdc.h new file mode 100644 index 000000000..9bae39ade --- /dev/null +++ b/database/engine/pdc.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef DBENGINE_PDC_H +#define DBENGINE_PDC_H + +#include "../engine/rrdengine.h" + +struct rrdeng_cmd; 
+ +#ifdef PDC_USE_JULYL +#define PDCJudyLIns JulyLIns +#define PDCJudyLGet JulyLGet +#define PDCJudyLFirst JulyLFirst +#define PDCJudyLNext JulyLNext +#define PDCJudyLLast JulyLLast +#define PDCJudyLPrev JulyLPrev +#define PDCJudyLFirstThenNext JulyLFirstThenNext +#define PDCJudyLLastThenPrev JulyLLastThenPrev +#define PDCJudyLFreeArray JulyLFreeArray +#else +#define PDCJudyLIns JudyLIns +#define PDCJudyLGet JudyLGet +#define PDCJudyLFirst JudyLFirst +#define PDCJudyLNext JudyLNext +#define PDCJudyLLast JudyLLast +#define PDCJudyLPrev JudyLPrev +#define PDCJudyLFirstThenNext JudyLFirstThenNext +#define PDCJudyLLastThenPrev JudyLLastThenPrev +#define PDCJudyLFreeArray JudyLFreeArray +#endif + +typedef struct extent_page_details_list EPDL; +typedef void (*execute_extent_page_details_list_t)(struct rrdengine_instance *ctx, EPDL *epdl, enum storage_priority priority); +void pdc_to_epdl_router(struct rrdengine_instance *ctx, struct page_details_control *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list); +void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker); + +size_t pdc_cache_size(void); +size_t pd_cache_size(void); +size_t epdl_cache_size(void); +size_t deol_cache_size(void); +size_t extent_buffer_cache_size(void); + +void pdc_init(void); +void page_details_init(void); +void epdl_init(void); +void deol_init(void); +void extent_buffer_cleanup1(void); + +void epdl_cmd_dequeued(void *epdl_ptr); +void epdl_cmd_queued(void *epdl_ptr, struct rrdeng_cmd *cmd); + +struct extent_buffer { + size_t bytes; + + struct { + struct extent_buffer *prev; + struct extent_buffer *next; + } cache; + + uint8_t data[]; +}; + +void extent_buffer_init(void); +struct extent_buffer *extent_buffer_get(size_t size); +void extent_buffer_release(struct extent_buffer *eb); + +#endif // DBENGINE_PDC_H diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index a6840f38c..d64868f03 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -2,6 +2,7 @@ #define NETDATA_RRD_INTERNALS #include "rrdengine.h" +#include "pdc.h" rrdeng_stats_t global_io_errors = 0; rrdeng_stats_t global_fs_errors = 0; @@ -11,31 +12,74 @@ rrdeng_stats_t global_flushing_pressure_page_deletions = 0; unsigned rrdeng_pages_per_extent = MAX_PAGES_PER_EXTENT; -#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2) +#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2) #error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2) #endif -void *dbengine_page_alloc() { - void *page = NULL; - if (unlikely(db_engine_use_malloc)) - page = mallocz(RRDENG_BLOCK_SIZE); - else { - page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm); - if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()"); - } - return page; -} - -void dbengine_page_free(void *page) { - if (unlikely(db_engine_use_malloc)) - freez(page); - else - netdata_munmap(page, RRDENG_BLOCK_SIZE); -} +struct rrdeng_main { + uv_thread_t thread; + uv_loop_t loop; + uv_async_t async; + uv_timer_t timer; + pid_t tid; + + size_t flushes_running; + size_t evictions_running; + size_t cleanup_running; + + struct { + ARAL *ar; + + struct { + SPINLOCK spinlock; + + size_t waiting; + struct rrdeng_cmd *waiting_items_by_priority[STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE]; + size_t executed_by_priority[STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE]; + } unsafe; + } cmd_queue; + + struct { + ARAL *ar; + 
+ struct { + size_t dispatched; + size_t executing; + size_t pending_cb; + } atomics; + } work_cmd; + + struct { + ARAL *ar; + } handles; + + struct { + ARAL *ar; + } descriptors; + + struct { + ARAL *ar; + } xt_io_descr; + +} rrdeng_main = { + .thread = 0, + .loop = {}, + .async = {}, + .timer = {}, + .flushes_running = 0, + .evictions_running = 0, + .cleanup_running = 0, + + .cmd_queue = { + .unsafe = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + }, + } +}; static void sanity_check(void) { - BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)); + BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2)); /* Magic numbers must fit in the super-blocks */ BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ); @@ -54,519 +98,489 @@ static void sanity_check(void) BUILD_BUG_ON(MAX_PAGES_PER_EXTENT > 255); /* extent cache count must fit in 32 bits */ - BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32); +// BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32); /* page info scratch space must be able to hold 2 32-bit integers */ BUILD_BUG_ON(sizeof(((struct rrdeng_page_info *)0)->scratch) < 2 * sizeof(uint32_t)); } -/* always inserts into tail */ -static inline void xt_cache_replaceQ_insert(struct rrdengine_worker_config* wc, - struct extent_cache_element *xt_cache_elem) -{ - struct extent_cache *xt_cache = &wc->xt_cache; +// ---------------------------------------------------------------------------- +// work request cache - xt_cache_elem->prev = NULL; - xt_cache_elem->next = NULL; +typedef void *(*work_cb)(struct rrdengine_instance *ctx, void *data, struct completion *completion, uv_work_t* req); +typedef void (*after_work_cb)(struct rrdengine_instance *ctx, void *data, struct completion *completion, uv_work_t* req, int status); - if (likely(NULL != xt_cache->replaceQ_tail)) { - xt_cache_elem->prev = xt_cache->replaceQ_tail; - xt_cache->replaceQ_tail->next = xt_cache_elem; - } - if (unlikely(NULL == xt_cache->replaceQ_head)) { - xt_cache->replaceQ_head = xt_cache_elem; - } - xt_cache->replaceQ_tail = xt_cache_elem; +struct rrdeng_work { + uv_work_t req; + + struct rrdengine_instance *ctx; + void *data; + struct completion *completion; + + work_cb work_cb; + after_work_cb after_work_cb; + enum rrdeng_opcode opcode; +}; + +static void work_request_init(void) { + rrdeng_main.work_cmd.ar = aral_create( + "dbengine-work-cmd", + sizeof(struct rrdeng_work), + 0, + 65536, NULL, + NULL, NULL, false, false + ); } -static inline void xt_cache_replaceQ_delete(struct rrdengine_worker_config* wc, - struct extent_cache_element *xt_cache_elem) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *prev, *next; +static inline bool work_request_full(void) { + return __atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED) >= (size_t)(libuv_worker_threads - RESERVED_LIBUV_WORKER_THREADS); +} - prev = xt_cache_elem->prev; - next = xt_cache_elem->next; +static inline void work_done(struct rrdeng_work *work_request) { + aral_freez(rrdeng_main.work_cmd.ar, work_request); +} - if (likely(NULL != prev)) { - prev->next = next; - } - if (likely(NULL != next)) { - next->prev = prev; - } - if (unlikely(xt_cache_elem == xt_cache->replaceQ_head)) { - xt_cache->replaceQ_head = next; - } - if (unlikely(xt_cache_elem == xt_cache->replaceQ_tail)) { - xt_cache->replaceQ_tail = prev; - } - xt_cache_elem->prev = xt_cache_elem->next = NULL; +static void work_standard_worker(uv_work_t *req) { + __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED); 
+ + register_libuv_worker_jobs(); + worker_is_busy(UV_EVENT_WORKER_INIT); + + struct rrdeng_work *work_request = req->data; + work_request->data = work_request->work_cb(work_request->ctx, work_request->data, work_request->completion, req); + worker_is_idle(); + + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.pending_cb, 1, __ATOMIC_RELAXED); + + // signal the event loop a worker is available + fatal_assert(0 == uv_async_send(&rrdeng_main.async)); } -static inline void xt_cache_replaceQ_set_hot(struct rrdengine_worker_config* wc, - struct extent_cache_element *xt_cache_elem) -{ - xt_cache_replaceQ_delete(wc, xt_cache_elem); - xt_cache_replaceQ_insert(wc, xt_cache_elem); +static void after_work_standard_callback(uv_work_t* req, int status) { + struct rrdeng_work *work_request = req->data; + + worker_is_busy(RRDENG_OPCODE_MAX + work_request->opcode); + + if(work_request->after_work_cb) + work_request->after_work_cb(work_request->ctx, work_request->data, work_request->completion, req, status); + + work_done(work_request); + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.pending_cb, 1, __ATOMIC_RELAXED); + + worker_is_idle(); } -/* Returns the index of the cached extent if it was successfully inserted in the extent cache, otherwise -1 */ -static int try_insert_into_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; - unsigned idx; - int ret; +static bool work_dispatch(struct rrdengine_instance *ctx, void *data, struct completion *completion, enum rrdeng_opcode opcode, work_cb work_cb, after_work_cb after_work_cb) { + struct rrdeng_work *work_request = NULL; - ret = find_first_zero(xt_cache->allocation_bitmap); - if (-1 == ret || ret >= MAX_CACHED_EXTENTS) { - for (xt_cache_elem = xt_cache->replaceQ_head ; NULL != xt_cache_elem ; xt_cache_elem = xt_cache_elem->next) { - idx = xt_cache_elem - xt_cache->extent_array; - if (!check_bit(xt_cache->inflight_bitmap, idx)) { - xt_cache_replaceQ_delete(wc, xt_cache_elem); - break; - } - } - if (NULL == xt_cache_elem) - return -1; - } else { - idx = (unsigned)ret; - xt_cache_elem = &xt_cache->extent_array[idx]; + internal_fatal(rrdeng_main.tid != gettid(), "work_dispatch() can only be run from the event loop thread"); + + work_request = aral_mallocz(rrdeng_main.work_cmd.ar); + memset(work_request, 0, sizeof(struct rrdeng_work)); + work_request->req.data = work_request; + work_request->ctx = ctx; + work_request->data = data; + work_request->completion = completion; + work_request->work_cb = work_cb; + work_request->after_work_cb = after_work_cb; + work_request->opcode = opcode; + + if(uv_queue_work(&rrdeng_main.loop, &work_request->req, work_standard_worker, after_work_standard_callback)) { + internal_fatal(true, "DBENGINE: cannot queue work"); + work_done(work_request); + return false; } - xt_cache_elem->extent = extent; - xt_cache_elem->fileno = extent->datafile->fileno; - xt_cache_elem->inflight_io_descr = NULL; - xt_cache_replaceQ_insert(wc, xt_cache_elem); - modify_bit(&xt_cache->allocation_bitmap, idx, 1); - return (int)idx; + __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED); + + return true; } -/** - * Returns 0 if the cached extent was found in the extent cache, 1 otherwise. 
- * Sets *idx to point to the position of the extent inside the cache. - **/ -static uint8_t lookup_in_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent, unsigned *idx) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; - unsigned i; +// ---------------------------------------------------------------------------- +// page descriptor cache + +void page_descriptors_init(void) { + rrdeng_main.descriptors.ar = aral_create( + "dbengine-descriptors", + sizeof(struct page_descr_with_data), + 0, + 65536 * 4, + NULL, + NULL, NULL, false, false); +} - for (i = 0 ; i < MAX_CACHED_EXTENTS ; ++i) { - xt_cache_elem = &xt_cache->extent_array[i]; - if (check_bit(xt_cache->allocation_bitmap, i) && xt_cache_elem->extent == extent && - xt_cache_elem->fileno == extent->datafile->fileno) { - *idx = i; - return 0; - } - } - return 1; +struct page_descr_with_data *page_descriptor_get(void) { + struct page_descr_with_data *descr = aral_mallocz(rrdeng_main.descriptors.ar); + memset(descr, 0, sizeof(struct page_descr_with_data)); + return descr; } -#if 0 /* disabled code */ -static void delete_from_xt_cache(struct rrdengine_worker_config* wc, unsigned idx) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; +static inline void page_descriptor_release(struct page_descr_with_data *descr) { + aral_freez(rrdeng_main.descriptors.ar, descr); +} - xt_cache_elem = &xt_cache->extent_array[idx]; - xt_cache_replaceQ_delete(wc, xt_cache_elem); - xt_cache_elem->extent = NULL; - modify_bit(&wc->xt_cache.allocation_bitmap, idx, 0); /* invalidate it */ - modify_bit(&wc->xt_cache.inflight_bitmap, idx, 0); /* not in-flight anymore */ +// ---------------------------------------------------------------------------- +// extent io descriptor cache + +static void extent_io_descriptor_init(void) { + rrdeng_main.xt_io_descr.ar = aral_create( + "dbengine-extent-io", + sizeof(struct extent_io_descriptor), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); } -#endif -void enqueue_inflight_read_to_xt_cache(struct rrdengine_worker_config* wc, unsigned idx, - struct extent_io_descriptor *xt_io_descr) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; - struct extent_io_descriptor *old_next; +static struct extent_io_descriptor *extent_io_descriptor_get(void) { + struct extent_io_descriptor *xt_io_descr = aral_mallocz(rrdeng_main.xt_io_descr.ar); + memset(xt_io_descr, 0, sizeof(struct extent_io_descriptor)); + return xt_io_descr; +} - xt_cache_elem = &xt_cache->extent_array[idx]; - old_next = xt_cache_elem->inflight_io_descr->next; - xt_cache_elem->inflight_io_descr->next = xt_io_descr; - xt_io_descr->next = old_next; +static inline void extent_io_descriptor_release(struct extent_io_descriptor *xt_io_descr) { + aral_freez(rrdeng_main.xt_io_descr.ar, xt_io_descr); } -void read_cached_extent_cb(struct rrdengine_worker_config* wc, unsigned idx, struct extent_io_descriptor *xt_io_descr) -{ - unsigned i, j, page_offset; - struct rrdengine_instance *ctx = wc->ctx; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - void *page; - struct extent_info *extent = xt_io_descr->descr_array[0]->extent; - - for (i = 0 ; i < xt_io_descr->descr_count; ++i) { - page = dbengine_page_alloc(); - descr = xt_io_descr->descr_array[i]; - for (j = 0, page_offset = 0 ; j < extent->number_of_pages ; ++j) { - /* care, we don't hold the descriptor mutex */ - if 
(!uuid_compare(*extent->pages[j]->id, *descr->id) && - extent->pages[j]->page_length == descr->page_length && - extent->pages[j]->start_time_ut == descr->start_time_ut && - extent->pages[j]->end_time_ut == descr->end_time_ut) { - break; - } - page_offset += extent->pages[j]->page_length; +// ---------------------------------------------------------------------------- +// query handle cache + +void rrdeng_query_handle_init(void) { + rrdeng_main.handles.ar = aral_create( + "dbengine-query-handles", + sizeof(struct rrdeng_query_handle), + 0, + 65536, + NULL, + NULL, NULL, false, false); +} - } - /* care, we don't hold the descriptor mutex */ - (void) memcpy(page, wc->xt_cache.extent_array[idx].pages + page_offset, descr->page_length); - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->page = page; - pg_cache_descr->flags |= RRD_PAGE_POPULATED; - pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING; - rrdeng_page_descr_mutex_unlock(ctx, descr); - pg_cache_replaceQ_insert(ctx, descr); - if (xt_io_descr->release_descr) { - pg_cache_put(ctx, descr); - } else { - debug(D_RRDENGINE, "%s: Waking up waiters.", __func__); - pg_cache_wake_up_waiters(ctx, descr); - } +struct rrdeng_query_handle *rrdeng_query_handle_get(void) { + struct rrdeng_query_handle *handle = aral_mallocz(rrdeng_main.handles.ar); + memset(handle, 0, sizeof(struct rrdeng_query_handle)); + return handle; +} + +void rrdeng_query_handle_release(struct rrdeng_query_handle *handle) { + aral_freez(rrdeng_main.handles.ar, handle); +} + +// ---------------------------------------------------------------------------- +// WAL cache + +static struct { + struct { + SPINLOCK spinlock; + WAL *available_items; + size_t available; + } protected; + + struct { + size_t allocated; + } atomics; +} wal_globals = { + .protected = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .available_items = NULL, + .available = 0, + }, + .atomics = { + .allocated = 0, + }, +}; + +static void wal_cleanup1(void) { + WAL *wal = NULL; + + if(!netdata_spinlock_trylock(&wal_globals.protected.spinlock)) + return; + + if(wal_globals.protected.available_items && wal_globals.protected.available > storage_tiers) { + wal = wal_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); + wal_globals.protected.available--; } - if (xt_io_descr->completion) - completion_mark_complete(xt_io_descr->completion); - freez(xt_io_descr); -} - -static void fill_page_with_nulls(void *page, uint32_t page_length, uint8_t type) { - switch(type) { - case PAGE_METRICS: { - storage_number n = pack_storage_number(NAN, SN_FLAG_NONE); - storage_number *array = (storage_number *)page; - size_t slots = page_length / sizeof(n); - for(size_t i = 0; i < slots ; i++) - array[i] = n; - } - break; - - case PAGE_TIER: { - storage_number_tier1_t n = { - .min_value = NAN, - .max_value = NAN, - .sum_value = NAN, - .count = 1, - .anomaly_count = 0, - }; - storage_number_tier1_t *array = (storage_number_tier1_t *)page; - size_t slots = page_length / sizeof(n); - for(size_t i = 0; i < slots ; i++) - array[i] = n; - } - break; - default: { - static bool logged = false; - if(!logged) { - error("DBENGINE: cannot fill page with nulls on unknown page type id %d", type); - logged = true; - } - memset(page, 0, page_length); - } + netdata_spinlock_unlock(&wal_globals.protected.spinlock); + + if(wal) { + posix_memfree(wal->buf); + freez(wal); + 
__atomic_sub_fetch(&wal_globals.atomics.allocated, 1, __ATOMIC_RELAXED); } } -struct rrdeng_page_descr *get_descriptor(struct pg_cache_page_index *page_index, time_t start_time_s) -{ - uv_rwlock_rdlock(&page_index->lock); - Pvoid_t *PValue = JudyLGet(page_index->JudyL_array, start_time_s, PJE0); - struct rrdeng_page_descr *descr = unlikely(NULL == PValue) ? NULL : *PValue; - uv_rwlock_rdunlock(&page_index->lock); - return descr; -}; +WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) { + if(!size || size > RRDENG_BLOCK_SIZE) + fatal("DBENGINE: invalid WAL size requested"); -static void do_extent_processing (struct rrdengine_worker_config *wc, struct extent_io_descriptor *xt_io_descr, bool read_failed) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - int ret; - unsigned i, j, count; - void *page, *uncompressed_buf = NULL; - uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length = 0; - uint8_t have_read_error = 0; - /* persistent structures */ - struct rrdeng_df_extent_header *header; - struct rrdeng_df_extent_trailer *trailer; - uLong crc; + WAL *wal = NULL; - header = xt_io_descr->buf; - payload_length = header->payload_length; - count = header->number_of_pages; - payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count; - trailer = xt_io_descr->buf + xt_io_descr->bytes - sizeof(*trailer); - - if (unlikely(read_failed)) { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - have_read_error = 1; - error("%s: uv_fs_read - extent at offset %"PRIu64"(%u) in datafile %u-%u.", __func__, xt_io_descr->pos, - xt_io_descr->bytes, datafile->tier, datafile->fileno); - goto after_crc_check; + netdata_spinlock_lock(&wal_globals.protected.spinlock); + + if(likely(wal_globals.protected.available_items)) { + wal = wal_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); + wal_globals.protected.available--; } - crc = crc32(0L, Z_NULL, 0); - crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer)); - ret = crc32cmp(trailer->checksum, crc); -#ifdef NETDATA_INTERNAL_CHECKS - { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: %s", __func__, - xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno, ret ? "FAILED" : "SUCCEEDED"); + + uint64_t transaction_id = __atomic_fetch_add(&ctx->atomic.transaction_id, 1, __ATOMIC_RELAXED); + netdata_spinlock_unlock(&wal_globals.protected.spinlock); + + if(unlikely(!wal)) { + wal = mallocz(sizeof(WAL)); + wal->buf_size = RRDENG_BLOCK_SIZE; + int ret = posix_memalign((void *)&wal->buf, RRDFILE_ALIGNMENT, wal->buf_size); + if (unlikely(ret)) + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); + __atomic_add_fetch(&wal_globals.atomics.allocated, 1, __ATOMIC_RELAXED); } -#endif - if (unlikely(ret)) { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - have_read_error = 1; - error("%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. 
CRC32 check: FAILED", __func__, - xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno); + // these need to survive + unsigned buf_size = wal->buf_size; + void *buf = wal->buf; + + memset(wal, 0, sizeof(WAL)); + + // put them back + wal->buf_size = buf_size; + wal->buf = buf; + + memset(wal->buf, 0, wal->buf_size); + + wal->transaction_id = transaction_id; + wal->size = size; + + return wal; +} + +void wal_release(WAL *wal) { + if(unlikely(!wal)) return; + + netdata_spinlock_lock(&wal_globals.protected.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); + wal_globals.protected.available++; + netdata_spinlock_unlock(&wal_globals.protected.spinlock); +} + +// ---------------------------------------------------------------------------- +// command queue cache + +struct rrdeng_cmd { + struct rrdengine_instance *ctx; + enum rrdeng_opcode opcode; + void *data; + struct completion *completion; + enum storage_priority priority; + dequeue_callback_t dequeue_cb; + + struct { + struct rrdeng_cmd *prev; + struct rrdeng_cmd *next; + } queue; +}; + +static void rrdeng_cmd_queue_init(void) { + rrdeng_main.cmd_queue.ar = aral_create("dbengine-opcodes", + sizeof(struct rrdeng_cmd), + 0, + 65536, + NULL, + NULL, NULL, false, false); +} + +static inline STORAGE_PRIORITY rrdeng_enq_cmd_map_opcode_to_priority(enum rrdeng_opcode opcode, STORAGE_PRIORITY priority) { + if(unlikely(priority >= STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE)) + priority = STORAGE_PRIORITY_BEST_EFFORT; + + switch(opcode) { + case RRDENG_OPCODE_QUERY: + priority = STORAGE_PRIORITY_INTERNAL_QUERY_PREP; + break; + + default: + break; } -after_crc_check: - if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) { - uncompressed_payload_length = 0; - for (i = 0 ; i < count ; ++i) { - uncompressed_payload_length += header->descr[i].page_length; + return priority; +} + +void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd) { + epdl_cmd_queued(cmd->data, cmd); +} + +void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd) { + epdl_cmd_dequeued(cmd->data); +} + +void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority) { + netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + + struct rrdeng_cmd *cmd = get_cmd_cb(data); + if(cmd) { + priority = rrdeng_enq_cmd_map_opcode_to_priority(cmd->opcode, priority); + + if (cmd->priority > priority) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[cmd->priority], cmd, queue.prev, queue.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); + cmd->priority = priority; } - uncompressed_buf = mallocz(uncompressed_payload_length); - ret = LZ4_decompress_safe(xt_io_descr->buf + payload_offset, uncompressed_buf, - payload_length, uncompressed_payload_length); - ctx->stats.before_decompress_bytes += payload_length; - ctx->stats.after_decompress_bytes += ret; - debug(D_RRDENGINE, "LZ4 decompressed %u bytes to %d bytes.", payload_length, ret); - /* care, we don't hold the descriptor mutex */ } - { - uint8_t xt_is_cached = 0; - unsigned xt_idx; - struct extent_info *extent = xt_io_descr->descr_array[0]->extent; - - xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx); - if (xt_is_cached && check_bit(wc->xt_cache.inflight_bitmap, xt_idx)) { - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem = 
&xt_cache->extent_array[xt_idx]; - struct extent_io_descriptor *curr, *next; - - if (have_read_error) { - memset(xt_cache_elem->pages, 0, sizeof(xt_cache_elem->pages)); - } else if (RRD_NO_COMPRESSION == header->compression_algorithm) { - (void)memcpy(xt_cache_elem->pages, xt_io_descr->buf + payload_offset, payload_length); - } else { - (void)memcpy(xt_cache_elem->pages, uncompressed_buf, uncompressed_payload_length); - } - /* complete all connected in-flight read requests */ - for (curr = xt_cache_elem->inflight_io_descr->next ; curr ; curr = next) { - next = curr->next; - read_cached_extent_cb(wc, xt_idx, curr); + + netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); +} + +void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data, struct completion *completion, + enum storage_priority priority, enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb) { + + priority = rrdeng_enq_cmd_map_opcode_to_priority(opcode, priority); + + struct rrdeng_cmd *cmd = aral_mallocz(rrdeng_main.cmd_queue.ar); + memset(cmd, 0, sizeof(struct rrdeng_cmd)); + cmd->ctx = ctx; + cmd->opcode = opcode; + cmd->data = data; + cmd->completion = completion; + cmd->priority = priority; + cmd->dequeue_cb = dequeue_cb; + + netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); + rrdeng_main.cmd_queue.unsafe.waiting++; + if(enqueue_cb) + enqueue_cb(cmd); + netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + + fatal_assert(0 == uv_async_send(&rrdeng_main.async)); +} + +static inline bool rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(STORAGE_PRIORITY priority, STORAGE_PRIORITY max_priority) { + for(; priority <= max_priority ; priority++) + if(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority]) + return true; + + return false; +} + +static inline struct rrdeng_cmd rrdeng_deq_cmd(void) { + struct rrdeng_cmd *cmd = NULL; + + STORAGE_PRIORITY max_priority = work_request_full() ? STORAGE_PRIORITY_INTERNAL_DBENGINE : STORAGE_PRIORITY_BEST_EFFORT; + + // find an opcode to execute from the queue + netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + for(STORAGE_PRIORITY priority = STORAGE_PRIORITY_INTERNAL_DBENGINE; priority <= max_priority ; priority++) { + cmd = rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority]; + if(cmd) { + + // avoid starvation of lower priorities + if(unlikely(priority >= STORAGE_PRIORITY_HIGH && + priority < STORAGE_PRIORITY_BEST_EFFORT && + ++rrdeng_main.cmd_queue.unsafe.executed_by_priority[priority] % 50 == 0 && + rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(priority + 1, max_priority))) { + // let the others run 2% of the requests + cmd = NULL; + continue; } - xt_cache_elem->inflight_io_descr = NULL; - modify_bit(&xt_cache->inflight_bitmap, xt_idx, 0); /* not in-flight anymore */ + + // remove it from the queue + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); + rrdeng_main.cmd_queue.unsafe.waiting--; + break; } } - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - Pvoid_t *PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, xt_io_descr->descr_array[0]->id, sizeof(uuid_t)); - struct pg_cache_page_index *page_index = likely( NULL != PValue) ? 
*PValue : NULL; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - - - for (i = 0, page_offset = 0; i < count; page_offset += header->descr[i++].page_length) { - uint8_t is_prefetched_page; - descr = NULL; - for (j = 0 ; j < xt_io_descr->descr_count; ++j) { - struct rrdeng_page_descr descrj; - - descrj = xt_io_descr->descr_read_array[j]; - /* care, we don't hold the descriptor mutex */ - if (!uuid_compare(*(uuid_t *) header->descr[i].uuid, *descrj.id) && - header->descr[i].page_length == descrj.page_length && - header->descr[i].start_time_ut == descrj.start_time_ut && - header->descr[i].end_time_ut == descrj.end_time_ut) { - //descr = descrj; - descr = get_descriptor(page_index, (time_t) (descrj.start_time_ut / USEC_PER_SEC)); - if (unlikely(!descr)) { - error_limit_static_thread_var(erl, 1, 0); - error_limit(&erl, "%s: Required descriptor is not in the page index anymore", __FUNCTION__); - } - break; - } - } - is_prefetched_page = 0; - if (!descr) { /* This extent page has not been requested. Try populating it for locality (best effort). */ - descr = pg_cache_lookup_unpopulated_and_lock(ctx, (uuid_t *)header->descr[i].uuid, - header->descr[i].start_time_ut); - if (!descr) - continue; /* Failed to reserve a suitable page */ - is_prefetched_page = 1; - } - page = dbengine_page_alloc(); - - /* care, we don't hold the descriptor mutex */ - if (have_read_error) { - fill_page_with_nulls(page, descr->page_length, descr->type); - } else if (RRD_NO_COMPRESSION == header->compression_algorithm) { - (void) memcpy(page, xt_io_descr->buf + payload_offset + page_offset, descr->page_length); - } else { - (void) memcpy(page, uncompressed_buf + page_offset, descr->page_length); - } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->page = page; - pg_cache_descr->flags |= RRD_PAGE_POPULATED; - pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING; - rrdeng_page_descr_mutex_unlock(ctx, descr); - pg_cache_replaceQ_insert(ctx, descr); - if (xt_io_descr->release_descr || is_prefetched_page) { - pg_cache_put(ctx, descr); - } else { - debug(D_RRDENGINE, "%s: Waking up waiters.", __func__); - pg_cache_wake_up_waiters(ctx, descr); - } + if(cmd && cmd->dequeue_cb) { + cmd->dequeue_cb(cmd); + cmd->dequeue_cb = NULL; } - if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) { - freez(uncompressed_buf); + + netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + + struct rrdeng_cmd ret; + if(cmd) { + // copy it, to return it + ret = *cmd; + + aral_freez(rrdeng_main.cmd_queue.ar, cmd); } - if (xt_io_descr->completion) - completion_mark_complete(xt_io_descr->completion); + else + ret = (struct rrdeng_cmd) { + .ctx = NULL, + .opcode = RRDENG_OPCODE_NOOP, + .priority = STORAGE_PRIORITY_BEST_EFFORT, + .completion = NULL, + .data = NULL, + }; + + return ret; } -static void read_extent_cb(uv_fs_t *req) -{ - struct rrdengine_worker_config *wc = req->loop->data; - struct extent_io_descriptor *xt_io_descr; - xt_io_descr = req->data; - do_extent_processing(wc, xt_io_descr, req->result < 0); - uv_fs_req_cleanup(req); - posix_memfree(xt_io_descr->buf); - freez(xt_io_descr); +// ---------------------------------------------------------------------------- + +struct { + ARAL *aral[RRD_STORAGE_TIERS]; +} dbengine_page_alloc_globals = {}; + +static inline ARAL *page_size_lookup(size_t size) { + for(size_t tier = 0; tier < storage_tiers ;tier++) + if(size == tier_page_size[tier]) + return dbengine_page_alloc_globals.aral[tier]; + + return NULL; } 
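A minimal, self-contained sketch (not part of the patch) of the starvation-avoidance policy that rrdeng_deq_cmd() implements above: serve the highest non-empty priority, but on every 50th pick at a given priority yield to lower priorities, so best-effort work still gets roughly 2% of the slots while the queue is saturated with high-priority opcodes. The names demo_queue, demo_pick, PRIO_COUNT and YIELD_EVERY are hypothetical; the real code walks per-priority doubly-linked lists under a spinlock and uses the STORAGE_PRIORITY_* constants.

#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>

#define PRIO_COUNT  4   /* hypothetical number of priority levels           */
#define YIELD_EVERY 50  /* same 1-in-50 yield ratio as rrdeng_deq_cmd()     */

struct demo_queue {
    size_t waiting[PRIO_COUNT];  /* items waiting per priority (0 = highest) */
    size_t executed[PRIO_COUNT]; /* picks attempted per priority             */
};

static bool lower_priorities_waiting(struct demo_queue *q, int below) {
    for (int p = below + 1; p < PRIO_COUNT; p++)
        if (q->waiting[p])
            return true;
    return false;
}

/* returns the priority served, or -1 when the queue is empty */
static int demo_pick(struct demo_queue *q) {
    for (int p = 0; p < PRIO_COUNT; p++) {
        if (!q->waiting[p])
            continue;

        /* every YIELD_EVERY picks at this priority, let lower priorities run */
        if (++q->executed[p] % YIELD_EVERY == 0 && lower_priorities_waiting(q, p))
            continue;

        q->waiting[p]--;
        return p;
    }
    return -1;
}

int main(void) {
    /* 1000 high-priority items competing with 40 best-effort items */
    struct demo_queue q = { .waiting = { 1000, 0, 0, 40 } };

    size_t pick = 0, first_best_effort_pick = 0;
    for (int p; (p = demo_pick(&q)) != -1; ) {
        pick++;
        if (p == PRIO_COUNT - 1 && !first_best_effort_pick)
            first_best_effort_pick = pick;
    }

    /* without the yield, the first best-effort item would be pick 1001 */
    printf("first best-effort item served at pick %zu of %zu\n",
           first_best_effort_pick, pick);
    return 0;
}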
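Similarly, a hedged sketch (again not part of the patch) of the buffer recycling pattern behind wal_get()/wal_release() earlier in this file: a lock-protected free list that hands out zeroed, aligned buffers and pushes them back for reuse instead of freeing them on every transaction. The names wal_pool, pool_get and pool_put are illustrative; the real code uses Netdata's netdata_spinlock, RRDENG_BLOCK_SIZE buffers aligned to RRDFILE_ALIGNMENT, and trims the pool with wal_cleanup1().

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE  4096   /* stand-in for RRDENG_BLOCK_SIZE  */
#define ALIGNMENT 4096   /* stand-in for RRDFILE_ALIGNMENT  */

struct wal_buf {
    void *buf;
    size_t size;              /* bytes of the buffer actually in use */
    struct wal_buf *next;     /* free-list link                      */
};

static struct {
    pthread_mutex_t lock;     /* the real code uses a spinlock       */
    struct wal_buf *free_list;
} wal_pool = { .lock = PTHREAD_MUTEX_INITIALIZER };

static struct wal_buf *pool_get(size_t size) {
    struct wal_buf *w = NULL;

    pthread_mutex_lock(&wal_pool.lock);
    if (wal_pool.free_list) {             /* reuse a cached buffer if any */
        w = wal_pool.free_list;
        wal_pool.free_list = w->next;
    }
    pthread_mutex_unlock(&wal_pool.lock);

    if (!w) {                             /* otherwise allocate an aligned one */
        w = calloc(1, sizeof(*w));
        if (!w || posix_memalign(&w->buf, ALIGNMENT, BUF_SIZE) != 0) {
            free(w);
            return NULL;
        }
    }

    memset(w->buf, 0, BUF_SIZE);          /* hand out a clean buffer */
    w->size = size;
    w->next = NULL;
    return w;
}

static void pool_put(struct wal_buf *w) {
    if (!w) return;
    pthread_mutex_lock(&wal_pool.lock);
    w->next = wal_pool.free_list;         /* push back for reuse */
    wal_pool.free_list = w;
    pthread_mutex_unlock(&wal_pool.lock);
}

int main(void) {
    struct wal_buf *a = pool_get(512);
    pool_put(a);

    struct wal_buf *b = pool_get(1024);   /* expected to reuse a's buffer */
    printf("buffer reused: %s\n", a == b ? "yes" : "no");
    pool_put(b);
    return 0;
}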
-static void read_mmap_extent_cb(uv_work_t *req, int status __maybe_unused) -{ - struct rrdengine_worker_config *wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; - struct extent_io_descriptor *xt_io_descr; - xt_io_descr = req->data; +static void dbengine_page_alloc_init(void) { + for(size_t i = storage_tiers; i > 0 ;i--) { + size_t tier = storage_tiers - i; - if (likely(xt_io_descr->map_base)) { - do_extent_processing(wc, xt_io_descr, false); - munmap(xt_io_descr->map_base, xt_io_descr->map_length); - freez(xt_io_descr); - return; - } + char buf[20 + 1]; + snprintfz(buf, 20, "tier%zu-pages", tier); - // MMAP failed, so do uv_fs_read - int ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(xt_io_descr->bytes)); - if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + dbengine_page_alloc_globals.aral[tier] = aral_create( + buf, + tier_page_size[tier], + 64, + 512 * tier_page_size[tier], + pgc_aral_statistics(), + NULL, NULL, false, false); } - unsigned real_io_size = ALIGN_BYTES_CEILING( xt_io_descr->bytes); - xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size); - xt_io_descr->req.data = xt_io_descr; - ret = uv_fs_read(req->loop, &xt_io_descr->req, xt_io_descr->file, &xt_io_descr->iov, 1, (unsigned) xt_io_descr->pos, read_extent_cb); - fatal_assert(-1 != ret); - ctx->stats.io_read_bytes += real_io_size; - ctx->stats.io_read_extent_bytes += real_io_size; } -static void do_mmap_read_extent(uv_work_t *req) -{ - struct extent_io_descriptor *xt_io_descr = (struct extent_io_descriptor * )req->data; - struct rrdengine_worker_config *wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; - - off_t map_start = ALIGN_BYTES_FLOOR(xt_io_descr->pos); - size_t length = ALIGN_BYTES_CEILING(xt_io_descr->pos + xt_io_descr->bytes) - map_start; - unsigned real_io_size = xt_io_descr->bytes; - - void *data = mmap(NULL, length, PROT_READ, MAP_SHARED, xt_io_descr->file, map_start); - if (likely(data != MAP_FAILED)) { - xt_io_descr->map_base = data; - xt_io_descr->map_length = length; - xt_io_descr->buf = data + (xt_io_descr->pos - map_start); - ctx->stats.io_read_bytes += real_io_size; - ctx->stats.io_read_extent_bytes += real_io_size; - } +void *dbengine_page_alloc(size_t size) { + ARAL *ar = page_size_lookup(size); + if(ar) return aral_mallocz(ar); + + return mallocz(size); } -static void do_read_extent(struct rrdengine_worker_config* wc, - struct rrdeng_page_descr **descr, - unsigned count, - uint8_t release_descr) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache_descr *pg_cache_descr; - int ret; - unsigned i, size_bytes, pos; - struct extent_io_descriptor *xt_io_descr; - struct rrdengine_datafile *datafile; - struct extent_info *extent = descr[0]->extent; - uint8_t xt_is_cached = 0, xt_is_inflight = 0; - unsigned xt_idx; - - datafile = extent->datafile; - pos = extent->offset; - size_bytes = extent->size; - - xt_io_descr = callocz(1, sizeof(*xt_io_descr)); - for (i = 0 ; i < count; ++i) { - rrdeng_page_descr_mutex_lock(ctx, descr[i]); - pg_cache_descr = descr[i]->pg_cache_descr; - pg_cache_descr->flags |= RRD_PAGE_READ_PENDING; - rrdeng_page_descr_mutex_unlock(ctx, descr[i]); - xt_io_descr->descr_array[i] = descr[i]; - xt_io_descr->descr_read_array[i] = *(descr[i]); - } - xt_io_descr->descr_count = count; - xt_io_descr->file = datafile->file; - xt_io_descr->bytes = size_bytes; - xt_io_descr->pos = pos; - xt_io_descr->req_worker.data = xt_io_descr; - xt_io_descr->completion = NULL; - 
xt_io_descr->release_descr = release_descr; - xt_io_descr->buf = NULL; - - xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx); - if (xt_is_cached) { - xt_cache_replaceQ_set_hot(wc, &wc->xt_cache.extent_array[xt_idx]); - xt_is_inflight = check_bit(wc->xt_cache.inflight_bitmap, xt_idx); - if (xt_is_inflight) { - enqueue_inflight_read_to_xt_cache(wc, xt_idx, xt_io_descr); - return; - } - return read_cached_extent_cb(wc, xt_idx, xt_io_descr); - } else { - ret = try_insert_into_xt_cache(wc, extent); - if (-1 != ret) { - xt_idx = (unsigned)ret; - modify_bit(&wc->xt_cache.inflight_bitmap, xt_idx, 1); - wc->xt_cache.extent_array[xt_idx].inflight_io_descr = xt_io_descr; - } - } +void dbengine_page_free(void *page, size_t size __maybe_unused) { + if(unlikely(!page || page == DBENGINE_EMPTY_PAGE)) + return; - ret = uv_queue_work(wc->loop, &xt_io_descr->req_worker, do_mmap_read_extent, read_mmap_extent_cb); - fatal_assert(-1 != ret); + ARAL *ar = page_size_lookup(size); + if(ar) + aral_freez(ar, page); + else + freez(page); +} - ++ctx->stats.io_read_requests; - ++ctx->stats.io_read_extents; - ctx->stats.pg_cache_backfills += count; +// ---------------------------------------------------------------------------- + +void *dbengine_extent_alloc(size_t size) { + void *extent = mallocz(size); + return extent; } -static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent_io_descriptor *xt_io_descr) -{ - struct rrdengine_instance *ctx = wc->ctx; +void dbengine_extent_free(void *extent, size_t size __maybe_unused) { + freez(extent); +} + +static void journalfile_extent_build(struct rrdengine_instance *ctx, struct extent_io_descriptor *xt_io_descr) { unsigned count, payload_length, descr_size, size_bytes; void *buf; /* persistent structures */ @@ -582,12 +596,13 @@ static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent payload_length = sizeof(*jf_metric_data) + descr_size; size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer); - buf = wal_get_transaction_buffer(wc, size_bytes); + xt_io_descr->wal = wal_get(ctx, size_bytes); + buf = xt_io_descr->wal->buf; jf_header = buf; jf_header->type = STORE_DATA; jf_header->reserved = 0; - jf_header->id = ctx->commit_log.transaction_id++; + jf_header->id = xt_io_descr->wal->transaction_id; jf_header->payload_length = payload_length; jf_metric_data = buf + sizeof(*jf_header); @@ -602,265 +617,210 @@ static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent crc32set(jf_trailer->checksum, crc); } -static void do_commit_transaction(struct rrdengine_worker_config* wc, uint8_t type, void *data) -{ - switch (type) { - case STORE_DATA: - commit_data_extent(wc, (struct extent_io_descriptor *)data); - break; - default: - fatal_assert(type == STORE_DATA); - break; - } -} +static void after_extent_flushed_to_open(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + if(completion) + completion_mark_complete(completion); -static void after_invalidate_oldest_committed(struct rrdengine_worker_config* wc) -{ - int error; - - error = uv_thread_join(wc->now_invalidating_dirty_pages); - if (error) { - error("uv_thread_join(): %s", uv_strerror(error)); - } - freez(wc->now_invalidating_dirty_pages); - wc->now_invalidating_dirty_pages = NULL; - wc->cleanup_thread_invalidating_dirty_pages = 0; + if(ctx_is_available_for_queries(ctx)) + rrdeng_enq_cmd(ctx, 
RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); } -static void invalidate_oldest_committed(void *arg) -{ - struct rrdengine_instance *ctx = arg; - struct rrdengine_worker_config *wc = &ctx->worker_config; - struct page_cache *pg_cache = &ctx->pg_cache; - int ret; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - Pvoid_t *PValue; - Word_t Index; - unsigned nr_committed_pages; +static void *extent_flushed_to_open_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_FLUSHED_TO_OPEN); - do { - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - for (Index = 0, - PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue; + uv_fs_t *uv_fs_request = data; + struct extent_io_descriptor *xt_io_descr = uv_fs_request->data; + struct page_descr_with_data *descr; + struct rrdengine_datafile *datafile; + unsigned i; - descr != NULL; + datafile = xt_io_descr->datafile; - PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue) { - fatal_assert(0 != descr->page_length); + bool still_running = ctx_is_available_for_queries(ctx); - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING) && pg_cache_try_get_unsafe(descr, 1)) { - rrdeng_page_descr_mutex_unlock(ctx, descr); + for (i = 0 ; i < xt_io_descr->descr_count ; ++i) { + descr = xt_io_descr->descr_array[i]; - ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0); - fatal_assert(1 == ret); - break; - } - rrdeng_page_descr_mutex_unlock(ctx, descr); - } - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); + if (likely(still_running)) + pgc_open_add_hot_page( + (Word_t)ctx, descr->metric_id, + (time_t) (descr->start_time_ut / USEC_PER_SEC), + (time_t) (descr->end_time_ut / USEC_PER_SEC), + descr->update_every_s, + datafile, + xt_io_descr->pos, xt_io_descr->bytes, descr->page_length); - if (!descr) { - info("Failed to invalidate any dirty pages to relieve page cache pressure."); + page_descriptor_release(descr); + } - goto out; - } - pg_cache_punch_hole(ctx, descr, 1, 1, NULL); + uv_fs_req_cleanup(uv_fs_request); + posix_memfree(xt_io_descr->buf); + extent_io_descriptor_release(xt_io_descr); - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - nr_committed_pages = --pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); - rrd_stat_atomic_add(&ctx->stats.flushing_pressure_page_deletions, 1); - rrd_stat_atomic_add(&global_flushing_pressure_page_deletions, 1); + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.flushed_to_open_running--; + netdata_spinlock_unlock(&datafile->writers.spinlock); - } while (nr_committed_pages >= pg_cache_committed_hard_limit(ctx)); -out: - wc->cleanup_thread_invalidating_dirty_pages = 1; - /* wake up event loop */ - fatal_assert(0 == uv_async_send(&wc->async)); -} + if(datafile->fileno != ctx_last_fileno_get(ctx) && still_running) + // we just finished a flushing on a datafile that is not the active one + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); -void rrdeng_invalidate_oldest_committed(struct 
rrdengine_worker_config* wc) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned nr_committed_pages; - int error; + return data; +} - if (unlikely(ctx->quiesce != NO_QUIESCE)) /* Shutting down */ - return; +// Main event loop callback +static void after_extent_write_datafile_io(uv_fs_t *uv_fs_request) { + worker_is_busy(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_WRITE); - uv_rwlock_rdlock(&pg_cache->committed_page_index.lock); - nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock); + struct extent_io_descriptor *xt_io_descr = uv_fs_request->data; + struct rrdengine_datafile *datafile = xt_io_descr->datafile; + struct rrdengine_instance *ctx = datafile->ctx; - if (nr_committed_pages >= pg_cache_committed_hard_limit(ctx)) { - /* delete the oldest page in memory */ - if (wc->now_invalidating_dirty_pages) { - /* already deleting a page */ - return; - } - errno = 0; - error("Failed to flush dirty buffers quickly enough in dbengine instance \"%s\". " - "Metric data are being deleted, please reduce disk load or use a faster disk.", ctx->dbfiles_path); - - wc->now_invalidating_dirty_pages = mallocz(sizeof(*wc->now_invalidating_dirty_pages)); - wc->cleanup_thread_invalidating_dirty_pages = 0; - - error = uv_thread_create(wc->now_invalidating_dirty_pages, invalidate_oldest_committed, ctx); - if (error) { - error("uv_thread_create(): %s", uv_strerror(error)); - freez(wc->now_invalidating_dirty_pages); - wc->now_invalidating_dirty_pages = NULL; - } + if (uv_fs_request->result < 0) { + ctx_io_error(ctx); + error("DBENGINE: %s: uv_fs_write(): %s", __func__, uv_strerror((int)uv_fs_request->result)); } + + journalfile_v1_extent_write(ctx, xt_io_descr->datafile, xt_io_descr->wal, &rrdeng_main.loop); + + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.running--; + datafile->writers.flushed_to_open_running++; + netdata_spinlock_unlock(&datafile->writers.spinlock); + + rrdeng_enq_cmd(xt_io_descr->ctx, + RRDENG_OPCODE_FLUSHED_TO_OPEN, + uv_fs_request, + xt_io_descr->completion, + STORAGE_PRIORITY_INTERNAL_DBENGINE, + NULL, + NULL); + + worker_is_idle(); } -void flush_pages_cb(uv_fs_t* req) -{ - struct rrdengine_worker_config* wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - struct extent_io_descriptor *xt_io_descr; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - unsigned i, count; - - xt_io_descr = req->data; - if (req->result < 0) { - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); - } -#ifdef NETDATA_INTERNAL_CHECKS - { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was written to datafile %u-%u. 
Waking up waiters.", - __func__, xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno); - } -#endif - count = xt_io_descr->descr_count; - for (i = 0 ; i < count ; ++i) { - /* care, we don't hold the descriptor mutex */ - descr = xt_io_descr->descr_array[i]; +static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { + bool ret = false; + netdata_spinlock_lock(&datafile->writers.spinlock); - pg_cache_replaceQ_insert(ctx, descr); + if(ctx_is_available_for_queries(ctx) && datafile->pos > rrdeng_target_data_file_size(ctx)) + ret = true; - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->flags &= ~(RRD_PAGE_DIRTY | RRD_PAGE_WRITE_PENDING); - /* wake up waiters, care no reference being held */ - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - } - if (xt_io_descr->completion) - completion_mark_complete(xt_io_descr->completion); - uv_fs_req_cleanup(req); - posix_memfree(xt_io_descr->buf); - freez(xt_io_descr); + netdata_spinlock_unlock(&datafile->writers.spinlock); + + return ret; +} + +static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_instance *ctx) { + struct rrdengine_datafile *datafile; - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - pg_cache->committed_page_index.nr_committed_pages -= count; - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); - wc->inflight_dirty_pages -= count; + // get the latest datafile + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + datafile = ctx->datafiles.first->prev; + // become a writer on this datafile, to prevent it from vanishing + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.running++; + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if(datafile_is_full(ctx, datafile)) { + // remember the datafile we have become writers to + struct rrdengine_datafile *old_datafile = datafile; + + // only 1 datafile creation at a time + static netdata_mutex_t mutex = NETDATA_MUTEX_INITIALIZER; + netdata_mutex_lock(&mutex); + + // take the latest datafile again - without this, multiple threads may create multiple files + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + datafile = ctx->datafiles.first->prev; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if(datafile_is_full(ctx, datafile) && create_new_datafile_pair(ctx) == 0) + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, + NULL); + + netdata_mutex_unlock(&mutex); + + // get the new latest datafile again, like above + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + datafile = ctx->datafiles.first->prev; + // become a writer on this datafile, to prevent it from vanishing + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.running++; + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + // release the writers on the old datafile + netdata_spinlock_lock(&old_datafile->writers.spinlock); + old_datafile->writers.running--; + netdata_spinlock_unlock(&old_datafile->writers.spinlock); + } + + return datafile; } /* - * completion must be NULL or valid. - * Returns 0 when no flushing can take place. - * Returns datafile bytes to be written on successful flushing initiation. 
+ * Take a page list in a judy array and write them */ -static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct completion *completion) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; +static struct extent_io_descriptor *datafile_extent_build(struct rrdengine_instance *ctx, struct page_descr_with_data *base, struct completion *completion) { int ret; int compressed_size, max_compressed_size = 0; unsigned i, count, size_bytes, pos, real_io_size; uint32_t uncompressed_payload_length, payload_offset; - struct rrdeng_page_descr *descr, *eligible_pages[MAX_PAGES_PER_EXTENT]; - struct page_cache_descr *pg_cache_descr; + struct page_descr_with_data *descr, *eligible_pages[MAX_PAGES_PER_EXTENT]; struct extent_io_descriptor *xt_io_descr; + struct extent_buffer *eb = NULL; void *compressed_buf = NULL; - Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT]; - Pvoid_t *PValue; Word_t Index; - uint8_t compression_algorithm = ctx->global_compress_alg; - struct extent_info *extent; + uint8_t compression_algorithm = ctx->config.global_compress_alg; struct rrdengine_datafile *datafile; /* persistent structures */ struct rrdeng_df_extent_header *header; struct rrdeng_df_extent_trailer *trailer; uLong crc; - if (force) { - debug(D_RRDENGINE, "Asynchronous flushing of extent has been forced by page pressure."); - } - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - for (Index = 0, count = 0, uncompressed_payload_length = 0, - PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue ; - - descr != NULL && count != rrdeng_pages_per_extent; - - PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? 
NULL : *PValue) { - uint8_t page_write_pending; - - fatal_assert(0 != descr->page_length); - page_write_pending = 0; - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING)) { - page_write_pending = 1; - /* care, no reference being held */ - pg_cache_descr->flags |= RRD_PAGE_WRITE_PENDING; - uncompressed_payload_length += descr->page_length; - descr_commit_idx_array[count] = Index; - eligible_pages[count++] = descr; - } - rrdeng_page_descr_mutex_unlock(ctx, descr); + for(descr = base, Index = 0, count = 0, uncompressed_payload_length = 0; + descr && count != rrdeng_pages_per_extent; + descr = descr->link.next, Index++) { + + uncompressed_payload_length += descr->page_length; + eligible_pages[count++] = descr; - if (page_write_pending) { - ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0); - fatal_assert(1 == ret); - } } - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); if (!count) { - debug(D_RRDENGINE, "%s: no pages eligible for flushing.", __func__); if (completion) completion_mark_complete(completion); - return 0; + + __atomic_sub_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED); + return NULL; } - wc->inflight_dirty_pages += count; - xt_io_descr = mallocz(sizeof(*xt_io_descr)); + xt_io_descr = extent_io_descriptor_get(); + xt_io_descr->ctx = ctx; payload_offset = sizeof(*header) + count * sizeof(header->descr[0]); switch (compression_algorithm) { - case RRD_NO_COMPRESSION: - size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer); - break; - default: /* Compress */ - fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE); - max_compressed_size = LZ4_compressBound(uncompressed_payload_length); - compressed_buf = mallocz(max_compressed_size); - size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer); - break; + case RRD_NO_COMPRESSION: + size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer); + break; + + default: /* Compress */ + fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE); + max_compressed_size = LZ4_compressBound(uncompressed_payload_length); + eb = extent_buffer_get(max_compressed_size); + compressed_buf = eb->data; + size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer); + break; } + ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); /* freez(xt_io_descr);*/ } memset(xt_io_descr->buf, 0, ALIGN_BYTES_CEILING(size_bytes)); - (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_descr *) * count); + (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct page_descr_with_data *) * count); xt_io_descr->descr_count = count; pos = 0; @@ -869,17 +829,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct header->number_of_pages = count; pos += sizeof(*header); - extent = mallocz(sizeof(*extent) + count * sizeof(extent->pages[0])); - datafile = ctx->datafiles.last; /* TODO: check for exceeded size quota */ - extent->offset = datafile->pos; - extent->number_of_pages = count; - extent->datafile = datafile; - extent->next = NULL; - for (i = 0 ; i < count ; ++i) { - /* This is here for performance reasons */ - 
xt_io_descr->descr_commit_idx_array[i] = descr_commit_idx_array[i]; - descr = xt_io_descr->descr_array[i]; header->descr[i].type = descr->type; uuid_copy(*(uuid_t *)header->descr[i].uuid, *descr->id); @@ -890,35 +840,40 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct } for (i = 0 ; i < count ; ++i) { descr = xt_io_descr->descr_array[i]; - /* care, we don't hold the descriptor mutex */ - (void) memcpy(xt_io_descr->buf + pos, descr->pg_cache_descr->page, descr->page_length); - descr->extent = extent; - extent->pages[i] = descr; - + (void) memcpy(xt_io_descr->buf + pos, descr->page, descr->page_length); pos += descr->page_length; } - df_extent_insert(extent); - switch (compression_algorithm) { - case RRD_NO_COMPRESSION: - header->payload_length = uncompressed_payload_length; - break; - default: /* Compress */ - compressed_size = LZ4_compress_default(xt_io_descr->buf + payload_offset, compressed_buf, - uncompressed_payload_length, max_compressed_size); - ctx->stats.before_compress_bytes += uncompressed_payload_length; - ctx->stats.after_compress_bytes += compressed_size; - debug(D_RRDENGINE, "LZ4 compressed %"PRIu32" bytes to %d bytes.", uncompressed_payload_length, compressed_size); + if(likely(compression_algorithm == RRD_LZ4)) { + compressed_size = LZ4_compress_default( + xt_io_descr->buf + payload_offset, + compressed_buf, + (int)uncompressed_payload_length, + max_compressed_size); + + __atomic_add_fetch(&ctx->stats.before_compress_bytes, uncompressed_payload_length, __ATOMIC_RELAXED); + __atomic_add_fetch(&ctx->stats.after_compress_bytes, compressed_size, __ATOMIC_RELAXED); + (void) memcpy(xt_io_descr->buf + payload_offset, compressed_buf, compressed_size); - freez(compressed_buf); + extent_buffer_release(eb); size_bytes = payload_offset + compressed_size + sizeof(*trailer); header->payload_length = compressed_size; - break; } - extent->size = size_bytes; - xt_io_descr->bytes = size_bytes; + else { // RRD_NO_COMPRESSION + header->payload_length = uncompressed_payload_length; + } + + real_io_size = ALIGN_BYTES_CEILING(size_bytes); + + datafile = get_datafile_to_write_extent(ctx); + netdata_spinlock_lock(&datafile->writers.spinlock); + xt_io_descr->datafile = datafile; xt_io_descr->pos = datafile->pos; - xt_io_descr->req.data = xt_io_descr; + datafile->pos += real_io_size; + netdata_spinlock_unlock(&datafile->writers.spinlock); + + xt_io_descr->bytes = size_bytes; + xt_io_descr->uv_fs_request.data = xt_io_descr; xt_io_descr->completion = completion; trailer = xt_io_descr->buf + size_bytes - sizeof(*trailer); @@ -926,324 +881,508 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct crc = crc32(crc, xt_io_descr->buf, size_bytes - sizeof(*trailer)); crc32set(trailer->checksum, crc); - real_io_size = ALIGN_BYTES_CEILING(size_bytes); xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size); - ret = uv_fs_write(wc->loop, &xt_io_descr->req, datafile->file, &xt_io_descr->iov, 1, datafile->pos, flush_pages_cb); - fatal_assert(-1 != ret); - ctx->stats.io_write_bytes += real_io_size; - ++ctx->stats.io_write_requests; - ctx->stats.io_write_extent_bytes += real_io_size; - ++ctx->stats.io_write_extents; - do_commit_transaction(wc, STORE_DATA, xt_io_descr); - datafile->pos += ALIGN_BYTES_CEILING(size_bytes); - ctx->disk_space += ALIGN_BYTES_CEILING(size_bytes); - rrdeng_test_quota(wc); + journalfile_extent_build(ctx, xt_io_descr); + + ctx_last_flush_fileno_set(ctx, datafile->fileno); + ctx_current_disk_space_increase(ctx, 
real_io_size); + ctx_io_write_op_bytes(ctx, real_io_size); - return ALIGN_BYTES_CEILING(size_bytes); + return xt_io_descr; } -static void after_delete_old_data(struct rrdengine_worker_config* wc) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct rrdengine_datafile *datafile; - struct rrdengine_journalfile *journalfile; - unsigned deleted_bytes, journalfile_bytes, datafile_bytes; - int ret, error; - char path[RRDENG_PATH_MAX]; +static void after_extent_write(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* uv_work_req __maybe_unused, int status __maybe_unused) { + struct extent_io_descriptor *xt_io_descr = data; - datafile = ctx->datafiles.first; - journalfile = datafile->journalfile; - datafile_bytes = datafile->pos; - journalfile_bytes = journalfile->pos; - deleted_bytes = 0; + if(xt_io_descr) { + int ret = uv_fs_write(&rrdeng_main.loop, + &xt_io_descr->uv_fs_request, + xt_io_descr->datafile->file, + &xt_io_descr->iov, + 1, + (int64_t) xt_io_descr->pos, + after_extent_write_datafile_io); - info("Deleting data and journal file pair."); - datafile_list_delete(ctx, datafile); - ret = destroy_journal_file(journalfile, datafile); - if (!ret) { - generate_journalfilepath(datafile, path, sizeof(path)); - info("Deleted journal file \"%s\".", path); - deleted_bytes += journalfile_bytes; - } - ret = destroy_data_file(datafile); - if (!ret) { - generate_datafilepath(datafile, path, sizeof(path)); - info("Deleted data file \"%s\".", path); - deleted_bytes += datafile_bytes; + fatal_assert(-1 != ret); } - freez(journalfile); - freez(datafile); +} - ctx->disk_space -= deleted_bytes; - info("Reclaimed %u bytes of disk space.", deleted_bytes); +static void *extent_write_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_WRITE); + struct page_descr_with_data *base = data; + struct extent_io_descriptor *xt_io_descr = datafile_extent_build(ctx, base, completion); + return xt_io_descr; +} - error = uv_thread_join(wc->now_deleting_files); - if (error) { - error("uv_thread_join(): %s", uv_strerror(error)); - } - freez(wc->now_deleting_files); - /* unfreeze command processing */ - wc->now_deleting_files = NULL; +static void after_database_rotate(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + __atomic_store_n(&ctx->atomic.now_deleting_files, false, __ATOMIC_RELAXED); +} - wc->cleanup_thread_deleting_files = 0; - rrdcontext_db_rotation(); +struct uuid_first_time_s { + uuid_t *uuid; + time_t first_time_s; + METRIC *metric; + size_t pages_found; + size_t df_matched; + size_t df_index_oldest; +}; - /* interrupt event loop */ - uv_stop(wc->loop); +static int journal_metric_compare(const void *key, const void *metric) +{ + return uuid_compare(*(uuid_t *) key, ((struct journal_metric_list *) metric)->uuid); } -static void delete_old_data(void *arg) -{ - struct rrdengine_instance *ctx = arg; - struct rrdengine_worker_config* wc = &ctx->worker_config; - struct rrdengine_datafile *datafile; - struct extent_info *extent, *next; - struct rrdeng_page_descr *descr; - unsigned count, i; - uint8_t can_delete_metric; - uuid_t metric_id; - - /* Safe to use since it will be deleted after we are done */ - datafile = ctx->datafiles.first; - - for (extent 
= datafile->extents.first ; extent != NULL ; extent = next) { - count = extent->number_of_pages; - for (i = 0 ; i < count ; ++i) { - descr = extent->pages[i]; - can_delete_metric = pg_cache_punch_hole(ctx, descr, 0, 0, &metric_id); - if (unlikely(can_delete_metric)) { - /* - * If the metric is empty, has no active writers and if the metadata log has been initialized then - * attempt to delete the corresponding netdata dimension. - */ - metaqueue_delete_dimension_uuid(&metric_id); - } - } - next = extent->next; - freez(extent); - } - wc->cleanup_thread_deleting_files = 1; - /* wake up event loop */ - fatal_assert(0 == uv_async_send(&wc->async)); +struct rrdengine_datafile *datafile_release_and_acquire_next_for_retention(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { + + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + + struct rrdengine_datafile *next_datafile = datafile->next; + + while(next_datafile && !datafile_acquire(next_datafile, DATAFILE_ACQUIRE_RETENTION)) + next_datafile = next_datafile->next; + + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + datafile_release(datafile, DATAFILE_ACQUIRE_RETENTION); + + return next_datafile; } -void rrdeng_test_quota(struct rrdengine_worker_config* wc) +void find_uuid_first_time( + struct rrdengine_instance *ctx, + struct rrdengine_datafile *datafile, + struct uuid_first_time_s *uuid_first_entry_list, + size_t count) { - struct rrdengine_instance *ctx = wc->ctx; - struct rrdengine_datafile *datafile; - unsigned current_size, target_size; - uint8_t out_of_space, only_one_datafile; - int ret, error; - - out_of_space = 0; - /* Do not allow the pinned pages to exceed the disk space quota to avoid deadlocks */ - if (unlikely(ctx->disk_space > MAX(ctx->max_disk_space, 2 * ctx->metric_API_max_producers * RRDENG_BLOCK_SIZE))) { - out_of_space = 1; - } - datafile = ctx->datafiles.last; - current_size = datafile->pos; - target_size = ctx->max_disk_space / TARGET_DATAFILES; - target_size = MIN(target_size, MAX_DATAFILE_SIZE); - target_size = MAX(target_size, MIN_DATAFILE_SIZE); - only_one_datafile = (datafile == ctx->datafiles.first) ? 
1 : 0; - if (unlikely(current_size >= target_size || (out_of_space && only_one_datafile))) { - /* Finalize data and journal file and create a new pair */ - wal_flush_transaction_buffer(wc); - ret = create_new_datafile_pair(ctx, 1, ctx->last_fileno + 1); - if (likely(!ret)) { - ++ctx->last_fileno; + // acquire the datafile to work with it + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + while(datafile && !datafile_acquire(datafile, DATAFILE_ACQUIRE_RETENTION)) + datafile = datafile->next; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if (unlikely(!datafile)) + return; + + unsigned journalfile_count = 0; + size_t binary_match = 0; + size_t not_matching_bsearches = 0; + + while (datafile) { + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0); + if (!j2_header) { + datafile = datafile_release_and_acquire_next_for_retention(ctx, datafile); + continue; + } + + time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); + struct uuid_first_time_s *uuid_original_entry; + + size_t journal_metric_count = j2_header->metric_count; + + for (size_t index = 0; index < count; ++index) { + uuid_original_entry = &uuid_first_entry_list[index]; + + // Check here if we should skip this + if (uuid_original_entry->df_matched > 3 || uuid_original_entry->pages_found > 5) + continue; + + struct journal_metric_list *live_entry = bsearch(uuid_original_entry->uuid,uuid_list,journal_metric_count,sizeof(*uuid_list), journal_metric_compare); + if (!live_entry) { + // Not found in this journal + not_matching_bsearches++; + continue; + } + + uuid_original_entry->pages_found += live_entry->entries; + uuid_original_entry->df_matched++; + + time_t old_first_time_s = uuid_original_entry->first_time_s; + + // Calculate first / last for this match + time_t first_time_s = live_entry->delta_start_s + journal_start_time_s; + uuid_original_entry->first_time_s = MIN(uuid_original_entry->first_time_s, first_time_s); + + if (uuid_original_entry->first_time_s != old_first_time_s) + uuid_original_entry->df_index_oldest = uuid_original_entry->df_matched; + + binary_match++; } + + journalfile_count++; + journalfile_v2_data_release(datafile->journalfile); + datafile = datafile_release_and_acquire_next_for_retention(ctx, datafile); } - if (unlikely(out_of_space && NO_QUIESCE == ctx->quiesce)) { - /* delete old data */ - if (wc->now_deleting_files) { - /* already deleting data */ - return; + + // Let's scan the open cache for almost exact match + size_t open_cache_count = 0; + + size_t df_index[10] = { 0 }; + size_t without_metric = 0; + size_t open_cache_gave_first_time_s = 0; + size_t metric_count = 0; + size_t without_retention = 0; + size_t not_needed_bsearches = 0; + + for (size_t index = 0; index < count; ++index) { + struct uuid_first_time_s *uuid_first_t_entry = &uuid_first_entry_list[index]; + + metric_count++; + + size_t idx = uuid_first_t_entry->df_index_oldest; + if(idx >= 10) + idx = 9; + + df_index[idx]++; + + not_needed_bsearches += uuid_first_t_entry->df_matched - uuid_first_t_entry->df_index_oldest; + + if (unlikely(!uuid_first_t_entry->metric)) { + without_metric++; + continue; } - if (NULL == ctx->datafiles.first->next) { - error("Cannot delete data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\"" - " to reclaim space, there are no other file pairs left.", - ctx->dbfiles_path, 
ctx->datafiles.first->tier, ctx->datafiles.first->fileno); - return; + + PGC_PAGE *page = pgc_page_get_and_acquire( + open_cache, (Word_t)ctx, + (Word_t)uuid_first_t_entry->metric, 0, + PGC_SEARCH_FIRST); + + if (page) { + time_t old_first_time_s = uuid_first_t_entry->first_time_s; + + time_t first_time_s = pgc_page_start_time_s(page); + uuid_first_t_entry->first_time_s = MIN(uuid_first_t_entry->first_time_s, first_time_s); + pgc_page_release(open_cache, page); + open_cache_count++; + + if(uuid_first_t_entry->first_time_s != old_first_time_s) { + open_cache_gave_first_time_s++; + } } - info("Deleting data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".", - ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); - wc->now_deleting_files = mallocz(sizeof(*wc->now_deleting_files)); - wc->cleanup_thread_deleting_files = 0; - - error = uv_thread_create(wc->now_deleting_files, delete_old_data, ctx); - if (error) { - error("uv_thread_create(): %s", uv_strerror(error)); - freez(wc->now_deleting_files); - wc->now_deleting_files = NULL; + else { + if(!uuid_first_t_entry->df_index_oldest) + without_retention++; } } + internal_error(true, + "DBENGINE: analyzed the retention of %zu rotated metrics of tier %d, " + "did %zu jv2 matching binary searches (%zu not matching, %zu overflown) in %u journal files, " + "%zu metrics with entries in open cache, " + "metrics first time found per datafile index ([not in jv2]:%zu, [1]:%zu, [2]:%zu, [3]:%zu, [4]:%zu, [5]:%zu, [6]:%zu, [7]:%zu, [8]:%zu, [bigger]: %zu), " + "open cache found first time %zu, " + "metrics without any remaining retention %zu, " + "metrics not in MRG %zu", + metric_count, + ctx->config.tier, + binary_match, + not_matching_bsearches, + not_needed_bsearches, + journalfile_count, + open_cache_count, + df_index[0], df_index[1], df_index[2], df_index[3], df_index[4], df_index[5], df_index[6], df_index[7], df_index[8], df_index[9], + open_cache_gave_first_time_s, + without_retention, + without_metric + ); } -static inline int rrdeng_threads_alive(struct rrdengine_worker_config* wc) -{ - if (wc->now_invalidating_dirty_pages || wc->now_deleting_files) { - return 1; +static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile_to_delete, struct rrdengine_datafile *first_datafile_remaining, bool worker) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.metrics_retention_started, 1, __ATOMIC_RELAXED); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_FIND_ROTATED_METRICS); + + struct rrdengine_journalfile *journalfile = datafile_to_delete->journalfile; + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, NULL, 0, 0); + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); + + size_t count = j2_header->metric_count; + struct uuid_first_time_s *uuid_first_t_entry; + struct uuid_first_time_s *uuid_first_entry_list = callocz(count, sizeof(struct uuid_first_time_s)); + + size_t added = 0; + for (size_t index = 0; index < count; ++index) { + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, &uuid_list[index].uuid, (Word_t) ctx); + if (!metric) + continue; + + uuid_first_entry_list[added].metric = metric; + uuid_first_entry_list[added].first_time_s = LONG_MAX; + uuid_first_entry_list[added].df_matched = 0; + uuid_first_entry_list[added].df_index_oldest = 0; + uuid_first_entry_list[added].uuid = mrg_metric_uuid(main_mrg, metric); + added++; + } + + 
info("DBENGINE: recalculating tier %d retention for %zu metrics starting with datafile %u", + ctx->config.tier, count, first_datafile_remaining->fileno); + + journalfile_v2_data_release(journalfile); + + // Update the first time / last time for all metrics we plan to delete + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_FIND_REMAINING_RETENTION); + + find_uuid_first_time(ctx, first_datafile_remaining, uuid_first_entry_list, added); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG); + + info("DBENGINE: updating tier %d metrics registry retention for %zu metrics", + ctx->config.tier, added); + + size_t deleted_metrics = 0, zero_retention_referenced = 0, zero_disk_retention = 0, zero_disk_but_live = 0; + for (size_t index = 0; index < added; ++index) { + uuid_first_t_entry = &uuid_first_entry_list[index]; + if (likely(uuid_first_t_entry->first_time_s != LONG_MAX)) { + mrg_metric_set_first_time_s_if_bigger(main_mrg, uuid_first_t_entry->metric, uuid_first_t_entry->first_time_s); + mrg_metric_release(main_mrg, uuid_first_t_entry->metric); + } + else { + zero_disk_retention++; + + // there is no retention for this metric + bool has_retention = mrg_metric_zero_disk_retention(main_mrg, uuid_first_t_entry->metric); + if (!has_retention) { + bool deleted = mrg_metric_release_and_delete(main_mrg, uuid_first_t_entry->metric); + if(deleted) + deleted_metrics++; + else + zero_retention_referenced++; + } + else { + zero_disk_but_live++; + mrg_metric_release(main_mrg, uuid_first_t_entry->metric); + } + } } - return 0; + freez(uuid_first_entry_list); + + internal_error(zero_disk_retention, + "DBENGINE: deleted %zu metrics, zero retention but referenced %zu (out of %zu total, of which %zu have main cache retention) zero on-disk retention tier %d metrics from metrics registry", + deleted_metrics, zero_retention_referenced, zero_disk_retention, zero_disk_but_live, ctx->config.tier); + + if(worker) + worker_is_idle(); } -static void rrdeng_cleanup_finished_threads(struct rrdengine_worker_config* wc) -{ - struct rrdengine_instance *ctx = wc->ctx; +void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker) { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT); + + bool datafile_got_for_deletion = datafile_acquire_for_deletion(datafile); + + if (update_retention) + update_metrics_first_time_s(ctx, datafile, datafile->next, worker); - if (unlikely(wc->cleanup_thread_invalidating_dirty_pages)) { - after_invalidate_oldest_committed(wc); + while (!datafile_got_for_deletion) { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT); + + datafile_got_for_deletion = datafile_acquire_for_deletion(datafile); + + if (!datafile_got_for_deletion) { + info("DBENGINE: waiting for data file '%s/" + DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION + "' to be available for deletion, " + "it is in use currently by %u users.", + ctx->config.dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno, datafile->users.lockers); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_spin, 1, __ATOMIC_RELAXED); + sleep_usec(1 * USEC_PER_SEC); + } } - if (unlikely(wc->cleanup_thread_deleting_files)) { - after_delete_old_data(wc); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_started, 1, __ATOMIC_RELAXED); + info("DBENGINE: deleting data file '%s/" + DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION + "'.", + ctx->config.dbfiles_path, 
ctx->datafiles.first->tier, ctx->datafiles.first->fileno); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE); + + struct rrdengine_journalfile *journal_file; + unsigned deleted_bytes, journal_file_bytes, datafile_bytes; + int ret; + char path[RRDENG_PATH_MAX]; + + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + datafile_list_delete_unsafe(ctx, datafile); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + + journal_file = datafile->journalfile; + datafile_bytes = datafile->pos; + journal_file_bytes = journalfile_current_size(journal_file); + deleted_bytes = journalfile_v2_data_size_get(journal_file); + + info("DBENGINE: deleting data and journal files to maintain disk quota"); + ret = journalfile_destroy_unsafe(journal_file, datafile); + if (!ret) { + journalfile_v1_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: deleted journal file \"%s\".", path); + journalfile_v2_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: deleted journal file \"%s\".", path); + deleted_bytes += journal_file_bytes; } - if (unlikely(SET_QUIESCE == ctx->quiesce && !rrdeng_threads_alive(wc))) { - ctx->quiesce = QUIESCED; - completion_mark_complete(&ctx->rrdengine_completion); + ret = destroy_data_file_unsafe(datafile); + if (!ret) { + generate_datafilepath(datafile, path, sizeof(path)); + info("DBENGINE: deleted data file \"%s\".", path); + deleted_bytes += datafile_bytes; } + freez(journal_file); + freez(datafile); + + ctx_current_disk_space_decrease(ctx, deleted_bytes); + info("DBENGINE: reclaimed %u bytes of disk space.", deleted_bytes); } -/* return 0 on success */ -int init_rrd_files(struct rrdengine_instance *ctx) -{ - int ret = init_data_files(ctx); - - BUFFER *wb = buffer_create(1000); - size_t all_errors = 0; - usec_t now = now_realtime_usec(); - - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter) { - buffer_sprintf(wb, "%s%zu pages had start time > end time (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter; - } +static void *database_rotate_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + datafile_delete(ctx, ctx->datafiles.first, ctx_is_available_for_queries(ctx), true); - if(ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter) { - buffer_sprintf(wb, "%s%zu pages had start time = end time with more than 1 entries (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter; - } + if (rrdeng_ctx_exceeded_disk_quota(ctx)) + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); - if(ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter) { - buffer_sprintf(wb, "%s%zu pages had zero points (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter; - } + rrdcontext_db_rotation(); - 
if(ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter) { - buffer_sprintf(wb, "%s%zu pages had update every == 0 with entries > 1 (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter; - } + return data; +} - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter) { - buffer_sprintf(wb, "%s%zu pages had a different number of points compared to their timestamps (latest: %llu secs ago; these page have been loaded)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter; - } +static void after_flush_all_hot_and_dirty_pages_of_section(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void *flush_all_hot_and_dirty_pages_of_section_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_QUIESCE); + pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx); + completion_mark_complete(&ctx->quiesce.completion); + return data; +} + +static void after_populate_mrg(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void *populate_mrg_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG); + + do { + struct rrdengine_datafile *datafile = NULL; + + // find a datafile to work + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) { + if(!netdata_spinlock_trylock(&datafile->populate_mrg.spinlock)) + continue; + + if(datafile->populate_mrg.populated) { + netdata_spinlock_unlock(&datafile->populate_mrg.spinlock); + continue; + } + + // we have the spinlock and it is not populated + break; + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if(!datafile) + break; + + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + netdata_spinlock_unlock(&datafile->populate_mrg.spinlock); - if(ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter) { - buffer_sprintf(wb, "%s%zu extents have been dropped because they didn't have any valid pages" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter - ); - all_errors += ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter; + } while(1); + + completion_mark_complete(completion); + + return data; +} + +static void after_ctx_shutdown(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void *ctx_shutdown_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req 
__maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_SHUTDOWN); + + completion_wait_for(&ctx->quiesce.completion); + completion_destroy(&ctx->quiesce.completion); + + bool logged = false; + while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED) || + __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED)) { + if(!logged) { + logged = true; + info("DBENGINE: waiting for %zu inflight queries to finish to shutdown tier %d...", + __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED), + (ctx->config.legacy) ? -1 : ctx->config.tier); + } + sleep_usec(1 * USEC_PER_MS); } - if(all_errors) - info("DBENGINE: tier %d: %s", ctx->tier, buffer_tostring(wb)); + completion_mark_complete(completion); - buffer_free(wb); - return ret; + return data; } -void finalize_rrd_files(struct rrdengine_instance *ctx) -{ - return finalize_data_files(ctx); +static void *cache_flush_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + if (!main_cache) + return data; + + worker_is_busy(UV_EVENT_DBENGINE_FLUSH_MAIN_CACHE); + pgc_flush_pages(main_cache, 0); + + return data; } -void rrdeng_init_cmd_queue(struct rrdengine_worker_config* wc) -{ - wc->cmd_queue.head = wc->cmd_queue.tail = 0; - wc->queue_size = 0; - fatal_assert(0 == uv_cond_init(&wc->cmd_cond)); - fatal_assert(0 == uv_mutex_init(&wc->cmd_mutex)); +static void *cache_evict_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) { + if (!main_cache) + return data; + + worker_is_busy(UV_EVENT_DBENGINE_EVICT_MAIN_CACHE); + pgc_evict_pages(main_cache, 0, 0); + + return data; } -void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd) -{ - unsigned queue_size; +static void after_prep_query(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} - /* wait for free space in queue */ - uv_mutex_lock(&wc->cmd_mutex); - while ((queue_size = wc->queue_size) == RRDENG_CMD_Q_MAX_SIZE) { - uv_cond_wait(&wc->cmd_cond, &wc->cmd_mutex); - } - fatal_assert(queue_size < RRDENG_CMD_Q_MAX_SIZE); - /* enqueue command */ - wc->cmd_queue.cmd_array[wc->cmd_queue.tail] = *cmd; - wc->cmd_queue.tail = wc->cmd_queue.tail != RRDENG_CMD_Q_MAX_SIZE - 1 ? 
- wc->cmd_queue.tail + 1 : 0; - wc->queue_size = queue_size + 1; - uv_mutex_unlock(&wc->cmd_mutex); +static void *query_prep_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_QUERY); + PDC *pdc = data; + rrdeng_prep_query(pdc); + return data; +} - /* wake up event loop */ - fatal_assert(0 == uv_async_send(&wc->async)); +unsigned rrdeng_target_data_file_size(struct rrdengine_instance *ctx) { + unsigned target_size = ctx->config.max_disk_space / TARGET_DATAFILES; + target_size = MIN(target_size, MAX_DATAFILE_SIZE); + target_size = MAX(target_size, MIN_DATAFILE_SIZE); + return target_size; } -struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc) +bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx) { - struct rrdeng_cmd ret; - unsigned queue_size; - - uv_mutex_lock(&wc->cmd_mutex); - queue_size = wc->queue_size; - if (queue_size == 0) { - ret.opcode = RRDENG_NOOP; - } else { - /* dequeue command */ - ret = wc->cmd_queue.cmd_array[wc->cmd_queue.head]; - if (queue_size == 1) { - wc->cmd_queue.head = wc->cmd_queue.tail = 0; - } else { - wc->cmd_queue.head = wc->cmd_queue.head != RRDENG_CMD_Q_MAX_SIZE - 1 ? - wc->cmd_queue.head + 1 : 0; - } - wc->queue_size = queue_size - 1; + uint64_t estimated_disk_space = ctx_current_disk_space_get(ctx) + rrdeng_target_data_file_size(ctx) - + (ctx->datafiles.first->prev ? ctx->datafiles.first->prev->pos : 0); - /* wake up producers */ - uv_cond_signal(&wc->cmd_cond); - } - uv_mutex_unlock(&wc->cmd_mutex); + return estimated_disk_space > ctx->config.max_disk_space; +} - return ret; +/* return 0 on success */ +int init_rrd_files(struct rrdengine_instance *ctx) +{ + return init_data_files(ctx); } -static void load_configuration_dynamic(void) +void finalize_rrd_files(struct rrdengine_instance *ctx) { - unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", MAX_PAGES_PER_EXTENT); - if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) - rrdeng_pages_per_extent = read_num; - else { - error("Invalid dbengine pages per extent %u given. 
Using %u.", read_num, rrdeng_pages_per_extent); - config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent); - } + return finalize_data_files(ctx); } void async_cb(uv_async_t *handle) @@ -1253,256 +1392,413 @@ void async_cb(uv_async_t *handle) debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle)); } -/* Flushes dirty pages when timer expires */ #define TIMER_PERIOD_MS (1000) -void timer_cb(uv_timer_t* handle) -{ - worker_is_busy(RRDENG_MAX_OPCODE + 1); - struct rrdengine_worker_config* wc = handle->data; - struct rrdengine_instance *ctx = wc->ctx; +static void *extent_read_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + EPDL *epdl = data; + epdl_find_extent_and_populate_pages(ctx, epdl, true); + return data; +} - uv_stop(handle->loop); - uv_update_time(handle->loop); - rrdeng_test_quota(wc); - debug(D_RRDENGINE, "%s: timeout reached.", __func__); - if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) { - /* There is free space so we can write to disk and we are not actively deleting dirty buffers */ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned long total_bytes, bytes_written, nr_committed_pages, bytes_to_write = 0, producers, low_watermark, - high_watermark; - - uv_rwlock_rdlock(&pg_cache->committed_page_index.lock); - nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock); - producers = ctx->metric_API_max_producers; - /* are flushable pages more than 25% of the maximum page cache size */ - high_watermark = (ctx->max_cache_pages * 25LLU) / 100; - low_watermark = (ctx->max_cache_pages * 5LLU) / 100; /* 5%, must be smaller than high_watermark */ - - /* Flush more pages only if disk can keep up */ - if (wc->inflight_dirty_pages < high_watermark + producers) { - if (nr_committed_pages > producers && - /* committed to be written pages are more than the produced number */ - nr_committed_pages - producers > high_watermark) { - /* Flushing speed must increase to stop page cache from filling with dirty pages */ - bytes_to_write = (nr_committed_pages - producers - low_watermark) * RRDENG_BLOCK_SIZE; - } - bytes_to_write = MAX(DATAFILE_IDEAL_IO_SIZE, bytes_to_write); +static void epdl_populate_pages_asynchronously(struct rrdengine_instance *ctx, EPDL *epdl, STORAGE_PRIORITY priority) { + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_EXTENT_READ, epdl, NULL, priority, + rrdeng_enqueue_epdl_cmd, rrdeng_dequeue_epdl_cmd); +} - debug(D_RRDENGINE, "Flushing pages to disk."); - for (total_bytes = bytes_written = do_flush_pages(wc, 0, NULL); - bytes_written && (total_bytes < bytes_to_write); - total_bytes += bytes_written) { - bytes_written = do_flush_pages(wc, 0, NULL); - } +void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc) { + pdc_to_epdl_router(ctx, pdc, epdl_populate_pages_asynchronously, epdl_populate_pages_asynchronously); +} + +void epdl_populate_pages_synchronously(struct rrdengine_instance *ctx, EPDL *epdl, enum storage_priority priority __maybe_unused) { + epdl_find_extent_and_populate_pages(ctx, epdl, false); +} + +void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc) { + pdc_to_epdl_router(ctx, pdc, epdl_populate_pages_synchronously, epdl_populate_pages_synchronously); +} + +#define MAX_RETRIES_TO_START_INDEX (100) 
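+// Descriptive note on the constant above: journal v2 indexing is not started while a datafile
+// rotation (deletion) is in progress; the worker below polls ctx->atomic.now_deleting_files
+// every 100ms and gives up after MAX_RETRIES_TO_START_INDEX attempts, i.e. after roughly 10 seconds.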
+static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + unsigned count = 0; + worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX_WAIT); + + while (__atomic_load_n(&ctx->atomic.now_deleting_files, __ATOMIC_RELAXED) && count++ < MAX_RETRIES_TO_START_INDEX) + sleep_usec(100 * USEC_PER_MS); + + if (count == MAX_RETRIES_TO_START_INDEX) { + worker_is_idle(); + return data; + } + + struct rrdengine_datafile *datafile = ctx->datafiles.first; + worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX); + count = 0; + while (datafile && datafile->fileno != ctx_last_fileno_get(ctx) && datafile->fileno != ctx_last_flush_fileno_get(ctx)) { + + netdata_spinlock_lock(&datafile->writers.spinlock); + bool available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? false : true; + netdata_spinlock_unlock(&datafile->writers.spinlock); + + if(!available) + continue; + + if (unlikely(!journalfile_v2_data_available(datafile->journalfile))) { + info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); + pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, + journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); + count++; } + + datafile = datafile->next; + + if (unlikely(!ctx_is_available_for_queries(ctx))) + break; } - load_configuration_dynamic(); -#ifdef NETDATA_INTERNAL_CHECKS + + errno = 0; + internal_error(count, "DBENGINE: journal indexing done; %u files processed", count); + + worker_is_idle(); + + return data; +} + +static void after_do_cache_flush(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + rrdeng_main.flushes_running--; +} + +static void after_do_cache_evict(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + rrdeng_main.evictions_running--; +} + +static void after_extent_read(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void after_journal_v2_indexing(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + __atomic_store_n(&ctx->atomic.migration_to_v2_running, false, __ATOMIC_RELAXED); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); +} + +struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void) { + return (struct rrdeng_buffer_sizes) { + .pgc = pgc_aral_overhead() + pgc_aral_structures(), + .mrg = mrg_aral_overhead() + mrg_aral_structures(), + .opcodes = aral_overhead(rrdeng_main.cmd_queue.ar) + aral_structures(rrdeng_main.cmd_queue.ar), + .handles = aral_overhead(rrdeng_main.handles.ar) + aral_structures(rrdeng_main.handles.ar), + .descriptors = aral_overhead(rrdeng_main.descriptors.ar) + aral_structures(rrdeng_main.descriptors.ar), + .wal = __atomic_load_n(&wal_globals.atomics.allocated, __ATOMIC_RELAXED) * (sizeof(WAL) + RRDENG_BLOCK_SIZE), + .workers = aral_overhead(rrdeng_main.work_cmd.ar), + .pdc = pdc_cache_size(), + .xt_io = 
aral_overhead(rrdeng_main.xt_io_descr.ar) + aral_structures(rrdeng_main.xt_io_descr.ar), + .xt_buf = extent_buffer_cache_size(), + .epdl = epdl_cache_size(), + .deol = deol_cache_size(), + .pd = pd_cache_size(), + +#ifdef PDC_USE_JULYL + .julyl = julyl_cache_size(), +#endif + }; +} + +static void after_cleanup(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + rrdeng_main.cleanup_running--; +} + +static void *cleanup_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_BUFFERS_CLEANUP); + + wal_cleanup1(); + extent_buffer_cleanup1(); + { - char buf[4096]; - debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf))); + static time_t last_run_s = 0; + time_t now_s = now_monotonic_sec(); + if(now_s - last_run_s >= 10) { + last_run_s = now_s; + journalfile_v2_data_unmount_cleanup(now_s); + } } + +#ifdef PDC_USE_JULYL + julyl_cleanup1(); #endif + return data; +} + +void timer_cb(uv_timer_t* handle) { + worker_is_busy(RRDENG_TIMER_CB); + uv_stop(handle->loop); + uv_update_time(handle->loop); + + worker_set_metric(RRDENG_OPCODES_WAITING, (NETDATA_DOUBLE)rrdeng_main.cmd_queue.unsafe.waiting); + worker_set_metric(RRDENG_WORKS_DISPATCHED, (NETDATA_DOUBLE)__atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED)); + worker_set_metric(RRDENG_WORKS_EXECUTING, (NETDATA_DOUBLE)__atomic_load_n(&rrdeng_main.work_cmd.atomics.executing, __ATOMIC_RELAXED)); + + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_FLUSH_INIT, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_EVICT_INIT, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_CLEANUP, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + worker_is_idle(); } -#define MAX_CMD_BATCH_SIZE (256) +static void dbengine_initialize_structures(void) { + pgc_and_mrg_initialize(); + + pdc_init(); + page_details_init(); + epdl_init(); + deol_init(); + rrdeng_cmd_queue_init(); + work_request_init(); + rrdeng_query_handle_init(); + page_descriptors_init(); + extent_buffer_init(); + dbengine_page_alloc_init(); + extent_io_descriptor_init(); +} -void rrdeng_worker(void* arg) -{ - worker_register("DBENGINE"); - worker_register_job_name(RRDENG_NOOP, "noop"); - worker_register_job_name(RRDENG_READ_PAGE, "page read"); - worker_register_job_name(RRDENG_READ_EXTENT, "extent read"); - worker_register_job_name(RRDENG_COMMIT_PAGE, "commit"); - worker_register_job_name(RRDENG_FLUSH_PAGES, "flush"); - worker_register_job_name(RRDENG_SHUTDOWN, "shutdown"); - worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru"); - worker_register_job_name(RRDENG_QUIESCE, "quiesce"); - worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup"); - worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer"); - - struct rrdengine_worker_config* wc = arg; - struct rrdengine_instance *ctx = wc->ctx; - uv_loop_t* loop; - int shutdown, ret; - enum rrdeng_opcode opcode; - uv_timer_t timer_req; - struct rrdeng_cmd cmd; - unsigned cmd_batch_size; +bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { + static bool spawned = false; + static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; - rrdeng_init_cmd_queue(wc); + netdata_spinlock_lock(&spinlock); - loop = wc->loop = 
mallocz(sizeof(uv_loop_t)); - ret = uv_loop_init(loop); - if (ret) { - error("uv_loop_init(): %s", uv_strerror(ret)); - goto error_after_loop_init; - } - loop->data = wc; + if(!spawned) { + int ret; - ret = uv_async_init(wc->loop, &wc->async, async_cb); - if (ret) { - error("uv_async_init(): %s", uv_strerror(ret)); - goto error_after_async_init; - } - wc->async.data = wc; + ret = uv_loop_init(&rrdeng_main.loop); + if (ret) { + error("DBENGINE: uv_loop_init(): %s", uv_strerror(ret)); + return false; + } + rrdeng_main.loop.data = &rrdeng_main; - wc->now_deleting_files = NULL; - wc->cleanup_thread_deleting_files = 0; + ret = uv_async_init(&rrdeng_main.loop, &rrdeng_main.async, async_cb); + if (ret) { + error("DBENGINE: uv_async_init(): %s", uv_strerror(ret)); + fatal_assert(0 == uv_loop_close(&rrdeng_main.loop)); + return false; + } + rrdeng_main.async.data = &rrdeng_main; + + ret = uv_timer_init(&rrdeng_main.loop, &rrdeng_main.timer); + if (ret) { + error("DBENGINE: uv_timer_init(): %s", uv_strerror(ret)); + uv_close((uv_handle_t *)&rrdeng_main.async, NULL); + fatal_assert(0 == uv_loop_close(&rrdeng_main.loop)); + return false; + } + rrdeng_main.timer.data = &rrdeng_main; - wc->now_invalidating_dirty_pages = NULL; - wc->cleanup_thread_invalidating_dirty_pages = 0; - wc->inflight_dirty_pages = 0; + dbengine_initialize_structures(); - /* dirty page flushing timer */ - ret = uv_timer_init(loop, &timer_req); - if (ret) { - error("uv_timer_init(): %s", uv_strerror(ret)); - goto error_after_timer_init; + fatal_assert(0 == uv_thread_create(&rrdeng_main.thread, dbengine_event_loop, &rrdeng_main)); + spawned = true; } - timer_req.data = wc; - wc->error = 0; - /* wake up initialization thread */ - completion_mark_complete(&ctx->rrdengine_completion); + netdata_spinlock_unlock(&spinlock); + return true; +} - fatal_assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); - shutdown = 0; - int set_name = 0; - while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) { +void dbengine_event_loop(void* arg) { + sanity_check(); + uv_thread_set_name_np(pthread_self(), "DBENGINE"); + service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); + + worker_register("DBENGINE"); + + // opcode jobs + worker_register_job_name(RRDENG_OPCODE_NOOP, "noop"); + + worker_register_job_name(RRDENG_OPCODE_QUERY, "query"); + worker_register_job_name(RRDENG_OPCODE_EXTENT_WRITE, "extent write"); + worker_register_job_name(RRDENG_OPCODE_EXTENT_READ, "extent read"); + worker_register_job_name(RRDENG_OPCODE_FLUSHED_TO_OPEN, "flushed to open"); + worker_register_job_name(RRDENG_OPCODE_DATABASE_ROTATE, "db rotate"); + worker_register_job_name(RRDENG_OPCODE_JOURNAL_INDEX, "journal index"); + worker_register_job_name(RRDENG_OPCODE_FLUSH_INIT, "flush init"); + worker_register_job_name(RRDENG_OPCODE_EVICT_INIT, "evict init"); + worker_register_job_name(RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown"); + worker_register_job_name(RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce"); + + worker_register_job_name(RRDENG_OPCODE_MAX, "get opcode"); + + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_QUERY, "query cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_WRITE, "extent write cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_READ, "extent read cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_FLUSHED_TO_OPEN, "flushed to open cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_DATABASE_ROTATE, "db rotate cb"); + 
worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_JOURNAL_INDEX, "journal index cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_FLUSH_INIT, "flush init cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EVICT_INIT, "evict init cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce cb"); + + // special jobs + worker_register_job_name(RRDENG_TIMER_CB, "timer"); + worker_register_job_name(RRDENG_FLUSH_TRANSACTION_BUFFER_CB, "transaction buffer flush cb"); + + worker_register_job_custom_metric(RRDENG_OPCODES_WAITING, "opcodes waiting", "opcodes", WORKER_METRIC_ABSOLUTE); + worker_register_job_custom_metric(RRDENG_WORKS_DISPATCHED, "works dispatched", "works", WORKER_METRIC_ABSOLUTE); + worker_register_job_custom_metric(RRDENG_WORKS_EXECUTING, "works executing", "works", WORKER_METRIC_ABSOLUTE); + + struct rrdeng_main *main = arg; + enum rrdeng_opcode opcode; + struct rrdeng_cmd cmd; + main->tid = gettid(); + + fatal_assert(0 == uv_timer_start(&main->timer, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); + + bool shutdown = false; + while (likely(!shutdown)) { worker_is_idle(); - uv_run(loop, UV_RUN_DEFAULT); - worker_is_busy(RRDENG_MAX_OPCODE); - rrdeng_cleanup_finished_threads(wc); + uv_run(&main->loop, UV_RUN_DEFAULT); /* wait for commands */ - cmd_batch_size = 0; do { - /* - * Avoid starving the loop when there are too many commands coming in. - * timer_cb will interrupt the loop again to allow serving more commands. - */ - if (unlikely(cmd_batch_size >= MAX_CMD_BATCH_SIZE)) - break; - - cmd = rrdeng_deq_cmd(wc); + worker_is_busy(RRDENG_OPCODE_MAX); + cmd = rrdeng_deq_cmd(); opcode = cmd.opcode; - ++cmd_batch_size; - if(likely(opcode != RRDENG_NOOP)) - worker_is_busy(opcode); + worker_is_busy(opcode); switch (opcode) { - case RRDENG_NOOP: - /* the command queue was empty, do nothing */ - break; - case RRDENG_SHUTDOWN: - shutdown = 1; - break; - case RRDENG_QUIESCE: - ctx->drop_metrics_under_page_cache_pressure = 0; - ctx->quiesce = SET_QUIESCE; - fatal_assert(0 == uv_timer_stop(&timer_req)); - uv_close((uv_handle_t *)&timer_req, NULL); - while (do_flush_pages(wc, 1, NULL)) { - ; /* Force flushing of all committed pages. 
*/ + case RRDENG_OPCODE_EXTENT_READ: { + struct rrdengine_instance *ctx = cmd.ctx; + EPDL *epdl = cmd.data; + work_dispatch(ctx, epdl, NULL, opcode, extent_read_tp_worker, after_extent_read); + break; } - wal_flush_transaction_buffer(wc); - if (!rrdeng_threads_alive(wc)) { - ctx->quiesce = QUIESCED; - completion_mark_complete(&ctx->rrdengine_completion); + + case RRDENG_OPCODE_QUERY: { + struct rrdengine_instance *ctx = cmd.ctx; + PDC *pdc = cmd.data; + work_dispatch(ctx, pdc, NULL, opcode, query_prep_tp_worker, after_prep_query); + break; } - break; - case RRDENG_READ_PAGE: - do_read_extent(wc, &cmd.read_page.page_cache_descr, 1, 0); - break; - case RRDENG_READ_EXTENT: - do_read_extent(wc, cmd.read_extent.page_cache_descr, cmd.read_extent.page_count, 1); - if (unlikely(!set_name)) { - set_name = 1; - uv_thread_set_name_np(ctx->worker_config.thread, "DBENGINE"); + + case RRDENG_OPCODE_EXTENT_WRITE: { + struct rrdengine_instance *ctx = cmd.ctx; + struct page_descr_with_data *base = cmd.data; + struct completion *completion = cmd.completion; // optional + work_dispatch(ctx, base, completion, opcode, extent_write_tp_worker, after_extent_write); + break; } - break; - case RRDENG_COMMIT_PAGE: - do_commit_transaction(wc, STORE_DATA, NULL); - break; - case RRDENG_FLUSH_PAGES: { - if (wc->now_invalidating_dirty_pages) { - /* Do not flush if the disk cannot keep up */ - completion_mark_complete(cmd.completion); - } else { - (void)do_flush_pages(wc, 1, cmd.completion); + + case RRDENG_OPCODE_FLUSHED_TO_OPEN: { + struct rrdengine_instance *ctx = cmd.ctx; + uv_fs_t *uv_fs_request = cmd.data; + struct extent_io_descriptor *xt_io_descr = uv_fs_request->data; + struct completion *completion = xt_io_descr->completion; + work_dispatch(ctx, uv_fs_request, completion, opcode, extent_flushed_to_open_tp_worker, after_extent_flushed_to_open); + break; + } + + case RRDENG_OPCODE_FLUSH_INIT: { + if(rrdeng_main.flushes_running < (size_t)(libuv_worker_threads / 4)) { + rrdeng_main.flushes_running++; + work_dispatch(NULL, NULL, NULL, opcode, cache_flush_tp_worker, after_do_cache_flush); + } + break; + } + + case RRDENG_OPCODE_EVICT_INIT: { + if(!rrdeng_main.evictions_running) { + rrdeng_main.evictions_running++; + work_dispatch(NULL, NULL, NULL, opcode, cache_evict_tp_worker, after_do_cache_evict); + } + break; + } + + case RRDENG_OPCODE_CLEANUP: { + if(!rrdeng_main.cleanup_running) { + rrdeng_main.cleanup_running++; + work_dispatch(NULL, NULL, NULL, opcode, cleanup_tp_worker, after_cleanup); + } + break; + } + + case RRDENG_OPCODE_JOURNAL_INDEX: { + struct rrdengine_instance *ctx = cmd.ctx; + struct rrdengine_datafile *datafile = cmd.data; + if(!__atomic_load_n(&ctx->atomic.migration_to_v2_running, __ATOMIC_RELAXED)) { + + __atomic_store_n(&ctx->atomic.migration_to_v2_running, true, __ATOMIC_RELAXED); + work_dispatch(ctx, datafile, NULL, opcode, journal_v2_indexing_tp_worker, after_journal_v2_indexing); + } + break; + } + + case RRDENG_OPCODE_DATABASE_ROTATE: { + struct rrdengine_instance *ctx = cmd.ctx; + if (!__atomic_load_n(&ctx->atomic.now_deleting_files, __ATOMIC_RELAXED) && + ctx->datafiles.first->next != NULL && + ctx->datafiles.first->next->next != NULL && + rrdeng_ctx_exceeded_disk_quota(ctx)) { + + __atomic_store_n(&ctx->atomic.now_deleting_files, true, __ATOMIC_RELAXED); + work_dispatch(ctx, NULL, NULL, opcode, database_rotate_tp_worker, after_database_rotate); + } + break; + } + + case RRDENG_OPCODE_CTX_POPULATE_MRG: { + struct rrdengine_instance *ctx = cmd.ctx; + struct completion *completion = 
cmd.completion; + work_dispatch(ctx, NULL, completion, opcode, populate_mrg_tp_worker, after_populate_mrg); + break; + } + + case RRDENG_OPCODE_CTX_QUIESCE: { + // a ctx will shutdown shortly + struct rrdengine_instance *ctx = cmd.ctx; + __atomic_store_n(&ctx->quiesce.enabled, true, __ATOMIC_RELEASE); + work_dispatch(ctx, NULL, NULL, opcode, + flush_all_hot_and_dirty_pages_of_section_tp_worker, + after_flush_all_hot_and_dirty_pages_of_section); + break; + } + + case RRDENG_OPCODE_CTX_SHUTDOWN: { + // a ctx is shutting down + struct rrdengine_instance *ctx = cmd.ctx; + struct completion *completion = cmd.completion; + work_dispatch(ctx, NULL, completion, opcode, ctx_shutdown_tp_worker, after_ctx_shutdown); + break; + } + + case RRDENG_OPCODE_NOOP: { + /* the command queue was empty, do nothing */ + break; + } + + // not opcodes + case RRDENG_OPCODE_MAX: + default: { + internal_fatal(true, "DBENGINE: unknown opcode"); + break; } - break; - case RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE: - rrdeng_invalidate_oldest_committed(wc); - break; - } - default: - debug(D_RRDENGINE, "%s: default.", __func__); - break; } - } while (opcode != RRDENG_NOOP); + + } while (opcode != RRDENG_OPCODE_NOOP); } /* cleanup operations of the event loop */ - info("Shutting down RRD engine event loop for tier %d", ctx->tier); + info("DBENGINE: shutting down dbengine thread"); /* * uv_async_send after uv_close does not seem to crash in linux at the moment, * it is however undocumented behaviour and we need to be aware if this becomes * an issue in the future. */ - uv_close((uv_handle_t *)&wc->async, NULL); - - while (do_flush_pages(wc, 1, NULL)) { - ; /* Force flushing of all committed pages. */ - } - wal_flush_transaction_buffer(wc); - uv_run(loop, UV_RUN_DEFAULT); - - info("Shutting down RRD engine event loop for tier %d complete", ctx->tier); - /* TODO: don't let the API block by waiting to enqueue commands */ - uv_cond_destroy(&wc->cmd_cond); -/* uv_mutex_destroy(&wc->cmd_mutex); */ - fatal_assert(0 == uv_loop_close(loop)); - freez(loop); - + uv_close((uv_handle_t *)&main->async, NULL); + uv_timer_stop(&main->timer); + uv_close((uv_handle_t *)&main->timer, NULL); + uv_run(&main->loop, UV_RUN_DEFAULT); + uv_loop_close(&main->loop); worker_unregister(); - return; - -error_after_timer_init: - uv_close((uv_handle_t *)&wc->async, NULL); -error_after_async_init: - fatal_assert(0 == uv_loop_close(loop)); -error_after_loop_init: - freez(loop); - - wc->error = UV_EAGAIN; - /* wake up initialization thread */ - completion_mark_complete(&ctx->rrdengine_completion); - worker_unregister(); -} - -/* C entry point for development purposes - * make "LDFLAGS=-errdengine_main" - */ -void rrdengine_main(void) -{ - int ret; - struct rrdengine_instance *ctx; - - sanity_check(); - ret = rrdeng_init(NULL, &ctx, "/tmp", RRDENG_MIN_PAGE_CACHE_SIZE_MB, RRDENG_MIN_DISK_SPACE_MB, 0); - if (ret) { - exit(ret); - } - rrdeng_exit(ctx); - fprintf(stderr, "Hello world!"); - exit(0); } diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 521d2521a..492666815 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -19,202 +19,315 @@ #include "journalfile.h" #include "rrdengineapi.h" #include "pagecache.h" -#include "rrdenglocking.h" - -#ifdef NETDATA_RRD_INTERNALS - -#endif /* NETDATA_RRD_INTERNALS */ +#include "metric.h" +#include "cache.h" +#include "pdc.h" extern unsigned rrdeng_pages_per_extent; /* Forward declarations */ struct rrdengine_instance; +struct rrdeng_cmd; #define MAX_PAGES_PER_EXTENT (64) /* 
TODO: can go higher only when journal supports bigger than 4KiB transactions */ #define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u" #define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u" +typedef struct page_details_control { + struct rrdengine_instance *ctx; + struct metric *metric; + + struct completion prep_completion; + struct completion page_completion; // sync between the query thread and the workers + + Pvoid_t page_list_JudyL; // the list of page details + unsigned completed_jobs; // the number of jobs completed last time the query thread checked + bool workers_should_stop; // true when the query thread left and the workers should stop + bool prep_done; + + SPINLOCK refcount_spinlock; // spinlock to protect refcount + int32_t refcount; // the number of workers currently working on this request + 1 for the query thread + size_t executed_with_gaps; + + time_t start_time_s; + time_t end_time_s; + STORAGE_PRIORITY priority; + + time_t optimal_end_time_s; +} PDC; + +PDC *pdc_get(void); + +typedef enum __attribute__ ((__packed__)) { + // final status for all pages + // if a page does not have one of these, it is considered unroutable + PDC_PAGE_READY = (1 << 0), // ready to be processed (pd->page is not null) + PDC_PAGE_FAILED = (1 << 1), // failed to be loaded (pd->page is null) + PDC_PAGE_SKIP = (1 << 2), // don't use this page, it is not good for us + PDC_PAGE_INVALID = (1 << 3), // don't use this page, it is invalid + PDC_PAGE_EMPTY = (1 << 4), // the page is empty, does not have any data + + // other statuses for tracking issues + PDC_PAGE_PREPROCESSED = (1 << 5), // used during preprocessing + PDC_PAGE_PROCESSED = (1 << 6), // processed by the query caller + PDC_PAGE_RELEASED = (1 << 7), // already released + + // data found in cache (preloaded) or on disk? 
+    PDC_PAGE_PRELOADED = (1 << 8), // data found in memory
+    PDC_PAGE_DISK_PENDING = (1 << 9), // data needs to be loaded from disk
+
+    // worker related statuses
+    PDC_PAGE_FAILED_INVALID_EXTENT = (1 << 10),
+    PDC_PAGE_FAILED_NOT_IN_EXTENT = (1 << 11),
+    PDC_PAGE_FAILED_TO_MAP_EXTENT = (1 << 12),
+    PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE = (1 << 13),
+
+    PDC_PAGE_EXTENT_FROM_CACHE = (1 << 14),
+    PDC_PAGE_EXTENT_FROM_DISK = (1 << 15),
+
+    PDC_PAGE_CANCELLED = (1 << 16), // the query thread had already left when we tried to load the page
+
+    PDC_PAGE_SOURCE_MAIN_CACHE = (1 << 17),
+    PDC_PAGE_SOURCE_OPEN_CACHE = (1 << 18),
+    PDC_PAGE_SOURCE_JOURNAL_V2 = (1 << 19),
+    PDC_PAGE_PRELOADED_PASS4 = (1 << 20),
+
+    // datafile acquired
+    PDC_PAGE_DATAFILE_ACQUIRED = (1 << 30),
+} PDC_PAGE_STATUS;
+
+#define PDC_PAGE_QUERY_GLOBAL_SKIP_LIST (PDC_PAGE_FAILED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_RELEASED)
+
+struct page_details {
+    struct {
+        struct rrdengine_datafile *ptr;
+        uv_file file;
+        unsigned fileno;
+
+        struct {
+            uint64_t pos;
+            uint32_t bytes;
+        } extent;
+    } datafile;
+
+    struct pgc_page *page;
+    Word_t metric_id;
+    time_t first_time_s;
+    time_t last_time_s;
+    uint32_t update_every_s;
+    uint16_t page_length;
+    PDC_PAGE_STATUS status;
+
+    struct {
+        struct page_details *prev;
+        struct page_details *next;
+    } load;
+};
+
+struct page_details *page_details_get(void);
+
+#define pdc_page_status_check(pd, flag) (__atomic_load_n(&((pd)->status), __ATOMIC_ACQUIRE) & (flag))
+#define pdc_page_status_set(pd, flag) __atomic_or_fetch(&((pd)->status), flag, __ATOMIC_RELEASE)
+#define pdc_page_status_clear(pd, flag) __atomic_and_fetch(&((pd)->status), ~(flag), __ATOMIC_RELEASE)
+
+struct jv2_extents_info {
+    size_t index;
+    uint64_t pos;
+    unsigned bytes;
+    size_t number_of_pages;
+};
+
+struct jv2_metrics_info {
+    uuid_t *uuid;
+    uint32_t page_list_header;
+    time_t first_time_s;
+    time_t last_time_s;
+    size_t number_of_pages;
+    Pvoid_t JudyL_pages_by_start_time;
+};
+
+struct jv2_page_info {
+    time_t start_time_s;
+    time_t end_time_s;
+    time_t update_every_s;
+    size_t page_length;
+    uint32_t extent_index;
+    void *custom_data;
+
+    // private
+    struct pgc_page *page;
+};
+
+typedef enum __attribute__ ((__packed__)) {
+    RRDENG_CHO_UNALIGNED = (1 << 0), // set when this metric is not page aligned according to page alignment
+    RRDENG_FIRST_PAGE_ALLOCATED = (1 << 1), // set when this metric has allocated its first page
+    RRDENG_1ST_METRIC_WRITER = (1 << 2),
+} RRDENG_COLLECT_HANDLE_OPTIONS;
+
+typedef enum __attribute__ ((__packed__)) {
+    RRDENG_PAGE_PAST_COLLECTION = (1 << 0),
+    RRDENG_PAGE_REPEATED_COLLECTION = (1 << 1),
+    RRDENG_PAGE_BIG_GAP = (1 << 2),
+    RRDENG_PAGE_GAP = (1 << 3),
+    RRDENG_PAGE_FUTURE_POINT = (1 << 4),
+    RRDENG_PAGE_CREATED_IN_FUTURE = (1 << 5),
+    RRDENG_PAGE_COMPLETED_IN_FUTURE = (1 << 6),
+    RRDENG_PAGE_UNALIGNED = (1 << 7),
+    RRDENG_PAGE_CONFLICT = (1 << 8),
+    RRDENG_PAGE_FULL = (1 << 9),
+    RRDENG_PAGE_COLLECT_FINALIZE = (1 << 10),
+    RRDENG_PAGE_UPDATE_EVERY_CHANGE = (1 << 11),
+    RRDENG_PAGE_STEP_TOO_SMALL = (1 << 12),
+    RRDENG_PAGE_STEP_UNALIGNED = (1 << 13),
+} RRDENG_COLLECT_PAGE_FLAGS;
+
 struct rrdeng_collect_handle {
-    struct pg_cache_page_index *page_index;
-    struct rrdeng_page_descr *descr;
-    unsigned long page_correlation_id;
-    // set to 1 when this dimension is not page aligned with the other dimensions in the chart
-    uint8_t unaligned_page;
+    struct metric *metric;
+    struct pgc_page *page;
     struct pg_alignment *alignment;
+    RRDENG_COLLECT_HANDLE_OPTIONS options;
+    uint8_t type;
+    
RRDENG_COLLECT_PAGE_FLAGS page_flags; + uint32_t page_entries_max; + uint32_t page_position; // keep track of the current page size, to make sure we don't exceed it + usec_t page_start_time_ut; + usec_t page_end_time_ut; + usec_t update_every_ut; }; struct rrdeng_query_handle { - struct rrdeng_page_descr *descr; + struct metric *metric; + struct pgc_page *page; struct rrdengine_instance *ctx; - struct pg_cache_page_index *page_index; - time_t wanted_start_time_s; + storage_number *metric_data; + struct page_details_control *pdc; + + // the request + time_t start_time_s; + time_t end_time_s; + STORAGE_PRIORITY priority; + + // internal data time_t now_s; + time_t dt_s; + unsigned position; unsigned entries; - storage_number *page; - usec_t page_end_time_ut; - uint32_t page_length; - time_t dt_s; + +#ifdef NETDATA_INTERNAL_CHECKS + usec_t started_time_s; + pid_t query_pid; + struct rrdeng_query_handle *prev, *next; +#endif }; -typedef enum { - RRDENGINE_STATUS_UNINITIALIZED = 0, - RRDENGINE_STATUS_INITIALIZING, - RRDENGINE_STATUS_INITIALIZED -} rrdengine_state_t; +struct rrdeng_query_handle *rrdeng_query_handle_get(void); +void rrdeng_query_handle_release(struct rrdeng_query_handle *handle); enum rrdeng_opcode { /* can be used to return empty status or flush the command queue */ - RRDENG_NOOP = 0, - - RRDENG_READ_PAGE, - RRDENG_READ_EXTENT, - RRDENG_COMMIT_PAGE, - RRDENG_FLUSH_PAGES, - RRDENG_SHUTDOWN, - RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, - RRDENG_QUIESCE, - - RRDENG_MAX_OPCODE -}; - -struct rrdeng_read_page { - struct rrdeng_page_descr *page_cache_descr; + RRDENG_OPCODE_NOOP = 0, + + RRDENG_OPCODE_QUERY, + RRDENG_OPCODE_EXTENT_WRITE, + RRDENG_OPCODE_EXTENT_READ, + RRDENG_OPCODE_FLUSHED_TO_OPEN, + RRDENG_OPCODE_DATABASE_ROTATE, + RRDENG_OPCODE_JOURNAL_INDEX, + RRDENG_OPCODE_FLUSH_INIT, + RRDENG_OPCODE_EVICT_INIT, + RRDENG_OPCODE_CTX_SHUTDOWN, + RRDENG_OPCODE_CTX_QUIESCE, + RRDENG_OPCODE_CTX_POPULATE_MRG, + RRDENG_OPCODE_CLEANUP, + + RRDENG_OPCODE_MAX }; -struct rrdeng_read_extent { - struct rrdeng_page_descr *page_cache_descr[MAX_PAGES_PER_EXTENT]; - int page_count; -}; - -struct rrdeng_cmd { - enum rrdeng_opcode opcode; - union { - struct rrdeng_read_page read_page; - struct rrdeng_read_extent read_extent; - struct completion *completion; - }; -}; - -#define RRDENG_CMD_Q_MAX_SIZE (2048) - -struct rrdeng_cmdqueue { - unsigned head, tail; - struct rrdeng_cmd cmd_array[RRDENG_CMD_Q_MAX_SIZE]; +// WORKERS IDS: +// RRDENG_MAX_OPCODE : reserved for the cleanup +// RRDENG_MAX_OPCODE + opcode : reserved for the callbacks of each opcode +// RRDENG_MAX_OPCODE + RRDENG_MAX_OPCODE : reserved for the timer +#define RRDENG_TIMER_CB (RRDENG_OPCODE_MAX + RRDENG_OPCODE_MAX) +#define RRDENG_FLUSH_TRANSACTION_BUFFER_CB (RRDENG_TIMER_CB + 1) +#define RRDENG_OPCODES_WAITING (RRDENG_TIMER_CB + 2) +#define RRDENG_WORKS_DISPATCHED (RRDENG_TIMER_CB + 3) +#define RRDENG_WORKS_EXECUTING (RRDENG_TIMER_CB + 4) + +struct extent_io_data { + unsigned fileno; + uv_file file; + uint64_t pos; + unsigned bytes; + uint16_t page_length; }; struct extent_io_descriptor { - uv_fs_t req; - uv_work_t req_worker; + struct rrdengine_instance *ctx; + uv_fs_t uv_fs_request; uv_buf_t iov; uv_file file; void *buf; - void *map_base; - size_t map_length; + struct wal *wal; uint64_t pos; unsigned bytes; struct completion *completion; unsigned descr_count; - int release_descr; - struct rrdeng_page_descr *descr_array[MAX_PAGES_PER_EXTENT]; - struct rrdeng_page_descr descr_read_array[MAX_PAGES_PER_EXTENT]; - Word_t 
descr_commit_idx_array[MAX_PAGES_PER_EXTENT]; + struct page_descr_with_data *descr_array[MAX_PAGES_PER_EXTENT]; + struct rrdengine_datafile *datafile; struct extent_io_descriptor *next; /* multiple requests to be served by the same cached extent */ }; struct generic_io_descriptor { + struct rrdengine_instance *ctx; uv_fs_t req; uv_buf_t iov; void *buf; + void *data; uint64_t pos; unsigned bytes; struct completion *completion; }; -struct extent_cache_element { - struct extent_info *extent; /* The ABA problem is avoided with the help of fileno below */ - unsigned fileno; - struct extent_cache_element *prev; /* LRU */ - struct extent_cache_element *next; /* LRU */ - struct extent_io_descriptor *inflight_io_descr; /* I/O descriptor for in-flight extent */ - uint8_t pages[MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE]; -}; - -#define MAX_CACHED_EXTENTS 16 /* cannot be over 32 to fit in 32-bit architectures */ - -/* Initialize by setting the structure to zero */ -struct extent_cache { - struct extent_cache_element extent_array[MAX_CACHED_EXTENTS]; - unsigned allocation_bitmap; /* 1 if the corresponding position in the extent_array is allocated */ - unsigned inflight_bitmap; /* 1 if the corresponding position in the extent_array is waiting for I/O */ - - struct extent_cache_element *replaceQ_head; /* LRU */ - struct extent_cache_element *replaceQ_tail; /* MRU */ -}; - -struct rrdengine_worker_config { - struct rrdengine_instance *ctx; - - uv_thread_t thread; - uv_loop_t* loop; - uv_async_t async; - - /* file deletion thread */ - uv_thread_t *now_deleting_files; - unsigned long cleanup_thread_deleting_files; /* set to 0 when now_deleting_files is still running */ - - /* dirty page deletion thread */ - uv_thread_t *now_invalidating_dirty_pages; - /* set to 0 when now_invalidating_dirty_pages is still running */ - unsigned long cleanup_thread_invalidating_dirty_pages; - unsigned inflight_dirty_pages; - - /* FIFO command queue */ - uv_mutex_t cmd_mutex; - uv_cond_t cmd_cond; - volatile unsigned queue_size; - struct rrdeng_cmdqueue cmd_queue; +typedef struct wal { + uint64_t transaction_id; + void *buf; + size_t size; + size_t buf_size; + struct generic_io_descriptor io_descr; - struct extent_cache xt_cache; + struct { + struct wal *prev; + struct wal *next; + } cache; +} WAL; - int error; -}; +WAL *wal_get(struct rrdengine_instance *ctx, unsigned size); +void wal_release(WAL *wal); /* * Debug statistics not used by code logic. * They only describe operations since DB engine instance load time. 
 */
struct rrdengine_statistics {
-    rrdeng_stats_t metric_API_producers;
-    rrdeng_stats_t metric_API_consumers;
-    rrdeng_stats_t pg_cache_insertions;
-    rrdeng_stats_t pg_cache_deletions;
-    rrdeng_stats_t pg_cache_hits;
-    rrdeng_stats_t pg_cache_misses;
-    rrdeng_stats_t pg_cache_backfills;
-    rrdeng_stats_t pg_cache_evictions;
    rrdeng_stats_t before_decompress_bytes;
    rrdeng_stats_t after_decompress_bytes;
    rrdeng_stats_t before_compress_bytes;
    rrdeng_stats_t after_compress_bytes;
+
    rrdeng_stats_t io_write_bytes;
    rrdeng_stats_t io_write_requests;
    rrdeng_stats_t io_read_bytes;
    rrdeng_stats_t io_read_requests;
-    rrdeng_stats_t io_write_extent_bytes;
-    rrdeng_stats_t io_write_extents;
-    rrdeng_stats_t io_read_extent_bytes;
-    rrdeng_stats_t io_read_extents;
+
    rrdeng_stats_t datafile_creations;
    rrdeng_stats_t datafile_deletions;
    rrdeng_stats_t journalfile_creations;
    rrdeng_stats_t journalfile_deletions;
-    rrdeng_stats_t page_cache_descriptors;
+
    rrdeng_stats_t io_errors;
    rrdeng_stats_t fs_errors;
-    rrdeng_stats_t pg_cache_over_half_dirty_events;
-    rrdeng_stats_t flushing_pressure_page_deletions;
};

/* I/O errors global counter */
@@ -227,57 +340,179 @@ extern rrdeng_stats_t rrdeng_reserved_file_descriptors;
extern rrdeng_stats_t global_pg_cache_over_half_dirty_events;
extern rrdeng_stats_t global_flushing_pressure_page_deletions; /* number of deleted pages */

-#define NO_QUIESCE  (0) /* initial state when all operations function normally */
-#define SET_QUIESCE (1) /* set it before shutting down the instance, quiesce long running operations */
-#define QUIESCED    (2) /* is set after all threads have finished running */
+struct rrdengine_instance {
+    struct {
+        bool legacy;                              // true when the db is autonomous for a single host
-typedef enum {
-    LOAD_ERRORS_PAGE_FLIPPED_TIME = 0,
-    LOAD_ERRORS_PAGE_EQUAL_TIME = 1,
-    LOAD_ERRORS_PAGE_ZERO_ENTRIES = 2,
-    LOAD_ERRORS_PAGE_UPDATE_ZERO = 3,
-    LOAD_ERRORS_PAGE_FLEXY_TIME = 4,
-    LOAD_ERRORS_DROPPED_EXTENT = 5,
-} INVALID_PAGE_ID;
+        int tier;                                 // the tier of this ctx
+        uint8_t page_type;                        // default page type for this context
-struct rrdengine_instance {
-    struct rrdengine_worker_config worker_config;
-    struct completion rrdengine_completion;
-    struct page_cache pg_cache;
-    uint8_t drop_metrics_under_page_cache_pressure; /* boolean */
-    uint8_t global_compress_alg;
-    struct transaction_commit_log commit_log;
-    struct rrdengine_datafile_list datafiles;
-    RRDHOST *host; /* the legacy host, or NULL for multi-host DB */
-    char dbfiles_path[FILENAME_MAX + 1];
-    char machine_guid[GUID_LEN + 1]; /* the unique ID of the corresponding host, or localhost for multihost DB */
-    uint64_t disk_space;
-    uint64_t max_disk_space;
-    int tier;
-    unsigned last_fileno; /* newest index of datafile and journalfile */
-    unsigned long max_cache_pages;
-    unsigned long cache_pages_low_watermark;
-    unsigned long metric_API_max_producers;
-
-    uint8_t quiesce; /* set to SET_QUIESCE before shutdown of the engine */
-    uint8_t page_type; /* Default page type for this context */
+        uint64_t max_disk_space;                  // the max disk space this ctx is allowed to use
+        uint8_t global_compress_alg;              // the wanted compression algorithm
-    struct rrdengine_statistics stats;
+        char dbfiles_path[FILENAME_MAX + 1];
+    } config;

    struct {
-        size_t counter;
-        usec_t latest_end_time_ut;
-    } load_errors[6];
+        uv_rwlock_t rwlock;                       // the linked list of datafiles is protected by this lock
+        struct rrdengine_datafile *first;         // oldest - the newest with ->first->prev
+    } datafiles;
+
+    struct {
+        unsigned last_fileno;                     // newest index of datafile and journalfile
+        unsigned last_flush_fileno;               // newest index of datafile received data
+
+        size_t collectors_running;
+        size_t collectors_running_duplicate;
+        size_t inflight_queries;                  // the number of queries currently running
+        uint64_t current_disk_space;              // the current disk space size used
+
+        uint64_t transaction_id;                  // the transaction id of the next extent flushing
+
+        bool migration_to_v2_running;
+        bool now_deleting_files;
+        unsigned extents_currently_being_flushed; // non-zero until we commit data to disk (both datafile and journal file)
+    } atomic;
+
+    struct {
+        bool exit_mode;
+        bool enabled;                             // when set (before shutdown), queries are prohibited
+        struct completion completion;
+    } quiesce;
+
+    struct {
+        struct {
+            size_t size;
+            struct completion *array;
+        } populate_mrg;
+
+        bool create_new_datafile_pair;
+    } loading;
+
+    struct rrdengine_statistics stats;
};

-void *dbengine_page_alloc(void);
-void dbengine_page_free(void *page);
+#define ctx_current_disk_space_get(ctx) __atomic_load_n(&(ctx)->atomic.current_disk_space, __ATOMIC_RELAXED)
+#define ctx_current_disk_space_increase(ctx, size) __atomic_add_fetch(&(ctx)->atomic.current_disk_space, size, __ATOMIC_RELAXED)
+#define ctx_current_disk_space_decrease(ctx, size) __atomic_sub_fetch(&(ctx)->atomic.current_disk_space, size, __ATOMIC_RELAXED)
+
+static inline void ctx_io_read_op_bytes(struct rrdengine_instance *ctx, size_t bytes) {
+    __atomic_add_fetch(&ctx->stats.io_read_bytes, bytes, __ATOMIC_RELAXED);
+    __atomic_add_fetch(&ctx->stats.io_read_requests, 1, __ATOMIC_RELAXED);
+}
+
+static inline void ctx_io_write_op_bytes(struct rrdengine_instance *ctx, size_t bytes) {
+    __atomic_add_fetch(&ctx->stats.io_write_bytes, bytes, __ATOMIC_RELAXED);
+    __atomic_add_fetch(&ctx->stats.io_write_requests, 1, __ATOMIC_RELAXED);
+}
+static inline void ctx_io_error(struct rrdengine_instance *ctx) {
+    __atomic_add_fetch(&ctx->stats.io_errors, 1, __ATOMIC_RELAXED);
+    rrd_stat_atomic_add(&global_io_errors, 1);
+}
+
+static inline void ctx_fs_error(struct rrdengine_instance *ctx) {
+    __atomic_add_fetch(&ctx->stats.fs_errors, 1, __ATOMIC_RELAXED);
+    rrd_stat_atomic_add(&global_fs_errors, 1);
+}
+
+#define ctx_last_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_fileno, __ATOMIC_RELAXED)
+#define ctx_last_fileno_increment(ctx) __atomic_add_fetch(&(ctx)->atomic.last_fileno, 1, __ATOMIC_RELAXED)
+
+#define ctx_last_flush_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_flush_fileno, __ATOMIC_RELAXED)
+static inline void ctx_last_flush_fileno_set(struct rrdengine_instance *ctx, unsigned fileno) {
+    unsigned old_fileno = ctx_last_flush_fileno_get(ctx);
+
+    do {
+        if(old_fileno >= fileno)
+            return;
+
+    } while(!__atomic_compare_exchange_n(&ctx->atomic.last_flush_fileno, &old_fileno, fileno, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
+}
+
+#define ctx_is_available_for_queries(ctx) (__atomic_load_n(&(ctx)->quiesce.enabled, __ATOMIC_RELAXED) == false && __atomic_load_n(&(ctx)->quiesce.exit_mode, __ATOMIC_RELAXED) == false)
+
+void *dbengine_page_alloc(size_t size);
+void dbengine_page_free(void *page, size_t size);
+
+void *dbengine_extent_alloc(size_t size);
+void dbengine_extent_free(void *extent, size_t size);
+
+bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx);
int init_rrd_files(struct rrdengine_instance *ctx);
void finalize_rrd_files(struct rrdengine_instance *ctx);
-void rrdeng_test_quota(struct rrdengine_worker_config* wc);
-void rrdeng_worker(void* arg);
-void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd);
-struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc);
+bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx);
+void dbengine_event_loop(void *arg);
+
+typedef void (*enqueue_callback_t)(struct rrdeng_cmd *cmd);
+typedef void (*dequeue_callback_t)(struct rrdeng_cmd *cmd);
+
+void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd);
+void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd);
+
+typedef struct rrdeng_cmd *(*requeue_callback_t)(void *data);
+void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority);
+
+void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data,
+                    struct completion *completion, enum storage_priority priority,
+                    enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb);
+
+void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc);
+void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc);
+
+void pdc_acquire(PDC *pdc);
+bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router);
+
+unsigned rrdeng_target_data_file_size(struct rrdengine_instance *ctx);
+
+struct page_descr_with_data *page_descriptor_get(void);
+
+typedef struct validated_page_descriptor {
+    time_t start_time_s;
+    time_t end_time_s;
+    time_t update_every_s;
+    size_t page_length;
+    size_t point_size;
+    size_t entries;
+    uint8_t type;
+    bool is_valid;
+} VALIDATED_PAGE_DESCRIPTOR;
+
+#define DBENGINE_EMPTY_PAGE (void *)(-1)
+
+#define page_entries_by_time(start_time_s, end_time_s, update_every_s) \
+    ((update_every_s) ? (((end_time_s) - ((start_time_s) - (update_every_s))) / (update_every_s)) : 1)
+
+#define page_entries_by_size(page_length_in_bytes, point_size_in_bytes) \
+    ((page_length_in_bytes) / (point_size_in_bytes))
+
+VALIDATED_PAGE_DESCRIPTOR validate_page(uuid_t *uuid,
+                                        time_t start_time_s,
+                                        time_t end_time_s,
+                                        time_t update_every_s,
+                                        size_t page_length,
+                                        uint8_t page_type,
+                                        size_t entries,
+                                        time_t now_s,
+                                        time_t overwrite_zero_update_every_s,
+                                        bool have_read_error,
+                                        const char *msg,
+                                        RRDENG_COLLECT_PAGE_FLAGS flags);
+VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error);
+void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags);
+
+typedef enum {
+    PAGE_IS_IN_THE_PAST = -1,
+    PAGE_IS_IN_RANGE = 0,
+    PAGE_IS_IN_THE_FUTURE = 1,
+} TIME_RANGE_COMPARE;
+
+TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s);
+
+static inline time_t max_acceptable_collected_time(void) {
+    return now_realtime_sec() + 1;
+}
+
+void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker);

#endif /* NETDATA_RRDENGINE_H */
diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c
index 4525b041f..27497bbb8 100755
--- a/database/engine/rrdengineapi.c
+++ b/database/engine/rrdengineapi.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-3.0-or-later
#include "rrdengine.h"
-#include "../storage_engine.h"

/* Default global database instance */
struct rrdengine_instance multidb_ctx_storage_tier0;
@@ -8,12 +7,21 @@ struct rrdengine_instance multidb_ctx_storage_tier1;
struct rrdengine_instance multidb_ctx_storage_tier2;
struct rrdengine_instance multidb_ctx_storage_tier3;
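/*
 * Editor's illustrative aside (not part of the upstream change): how the
 * page_entries_by_time() and page_entries_by_size() helpers declared in
 * rrdengine.h above count the points of a page, using made-up numbers.
 *
 *   page_entries_by_time(1000, 1060, 10)
 *     = (1060 - (1000 - 10)) / 10
 *     = 70 / 10
 *     = 7 points (at 1000, 1010, ..., 1060 inclusive);
 *   when update_every_s is 0 the macro falls back to a single entry.
 *
 *   page_entries_by_size(4096, 4)
 *     = 4096 / 4
 *     = 1024 points, e.g. a full 4 KiB tier-0 page of 4-byte storage_number
 *       points (tier_page_size[0] below is 4096 on 64-bit builds).
 */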
struct rrdengine_instance multidb_ctx_storage_tier4; + +#define mrg_metric_ctx(metric) (struct rrdengine_instance *)mrg_metric_section(main_mrg, metric) + #if RRD_STORAGE_TIERS != 5 #error RRD_STORAGE_TIERS is not 5 - you need to add allocations here #endif struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS]; uint8_t tier_page_type[RRD_STORAGE_TIERS] = {PAGE_METRICS, PAGE_TIER, PAGE_TIER, PAGE_TIER, PAGE_TIER}; +#if defined(ENV32BIT) +size_t tier_page_size[RRD_STORAGE_TIERS] = {2048, 1024, 192, 192, 192}; +#else +size_t tier_page_size[RRD_STORAGE_TIERS] = {4096, 2048, 384, 384, 384}; +#endif + #if PAGE_TYPE_MAX != 1 #error PAGE_TYPE_MAX is not 1 - you need to add allocations here #endif @@ -27,14 +35,17 @@ __attribute__((constructor)) void initialize_multidb_ctx(void) { multidb_ctx[4] = &multidb_ctx_storage_tier4; } -int db_engine_use_malloc = 0; int default_rrdeng_page_fetch_timeout = 3; int default_rrdeng_page_fetch_retries = 3; -int default_rrdeng_page_cache_mb = 32; +int db_engine_journal_check = 0; int default_rrdeng_disk_quota_mb = 256; int default_multidb_disk_quota_mb = 256; -/* Default behaviour is to unblock data collection if the page cache is full of dirty pages by dropping metrics */ -uint8_t rrdeng_drop_metrics_under_page_cache_pressure = 1; + +#if defined(ENV32BIT) +int default_rrdeng_page_cache_mb = 16; +#else +int default_rrdeng_page_cache_mb = 32; +#endif // ---------------------------------------------------------------------------- // metrics groups @@ -90,161 +101,207 @@ void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_ memcpy(ret_uuid, hash_value, sizeof(uuid_t)); } -/* Transform legacy UUID to be unique across hosts deterministically */ -void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uuid_t *legacy_uuid, uuid_t *ret_uuid) -{ - EVP_MD_CTX *evpctx; - unsigned char hash_value[EVP_MAX_MD_SIZE]; - unsigned int hash_len; - - evpctx = EVP_MD_CTX_create(); - EVP_DigestInit_ex(evpctx, EVP_sha256(), NULL); - EVP_DigestUpdate(evpctx, machine_guid, GUID_LEN); - EVP_DigestUpdate(evpctx, *legacy_uuid, sizeof(uuid_t)); - EVP_DigestFinal_ex(evpctx, hash_value, &hash_len); - EVP_MD_CTX_destroy(evpctx); - fatal_assert(hash_len > sizeof(uuid_t)); - memcpy(ret_uuid, hash_value, sizeof(uuid_t)); -} - -STORAGE_METRIC_HANDLE *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id) { +static METRIC *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; uuid_t legacy_uuid; rrdeng_generate_legacy_uuid(rd_id, st_id, &legacy_uuid); - return rrdeng_metric_get(db_instance, &legacy_uuid); + return mrg_metric_get_and_acquire(main_mrg, &legacy_uuid, (Word_t) ctx); } // ---------------------------------------------------------------------------- // metric handle void rrdeng_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - - __atomic_sub_fetch(&page_index->refcount, 1, __ATOMIC_SEQ_CST); + METRIC *metric = (METRIC *)db_metric_handle; + mrg_metric_release(main_mrg, metric); } STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - __atomic_add_fetch(&page_index->refcount, 1, __ATOMIC_SEQ_CST); - return db_metric_handle; + METRIC *metric = (METRIC 
*)db_metric_handle; + return (STORAGE_METRIC_HANDLE *) mrg_metric_dup(main_mrg, metric); } STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid) { struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; - struct page_cache *pg_cache = &ctx->pg_cache; - struct pg_cache_page_index *page_index = NULL; - - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - Pvoid_t *PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, uuid, sizeof(uuid_t)); - if (likely(NULL != PValue)) - page_index = *PValue; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - - if (likely(page_index)) - __atomic_add_fetch(&page_index->refcount, 1, __ATOMIC_SEQ_CST); - - return (STORAGE_METRIC_HANDLE *)page_index; + return (STORAGE_METRIC_HANDLE *) mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx); } -STORAGE_METRIC_HANDLE *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid) { +static METRIC *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid) { internal_fatal(!db_instance, "DBENGINE: db_instance is NULL"); struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; - struct pg_cache_page_index *page_index; - struct page_cache *pg_cache = &ctx->pg_cache; - - uv_rwlock_wrlock(&pg_cache->metrics_index.lock); - Pvoid_t *PValue = JudyHSIns(&pg_cache->metrics_index.JudyHS_array, uuid, sizeof(uuid_t), PJE0); - fatal_assert(NULL == *PValue); /* TODO: figure out concurrency model */ - *PValue = page_index = create_page_index(uuid, ctx); - page_index->prev = pg_cache->metrics_index.last_page_index; - pg_cache->metrics_index.last_page_index = page_index; - page_index->refcount = 1; - uv_rwlock_wrunlock(&pg_cache->metrics_index.lock); - - return (STORAGE_METRIC_HANDLE *)page_index; + MRG_ENTRY entry = { + .section = (Word_t)ctx, + .first_time_s = 0, + .last_time_s = 0, + .latest_update_every_s = 0, + }; + uuid_copy(entry.uuid, *uuid); + + METRIC *metric = mrg_metric_add_and_acquire(main_mrg, entry, NULL); + return metric; } STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance) { - STORAGE_METRIC_HANDLE *db_metric_handle; - - db_metric_handle = rrdeng_metric_get(db_instance, &rd->metric_uuid); - if(!db_metric_handle) { - db_metric_handle = rrdeng_metric_get_legacy(db_instance, rrddim_id(rd), rrdset_id(rd->rrdset)); - if(db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - uuid_copy(rd->metric_uuid, page_index->id); + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + METRIC *metric; + + metric = mrg_metric_get_and_acquire(main_mrg, &rd->metric_uuid, (Word_t) ctx); + + if(unlikely(!metric)) { + if(unlikely(ctx->config.legacy)) { + // this is a single host database + // generate uuid from the chart and dimensions ids + // and overwrite the one supplied by rrddim + metric = rrdeng_metric_get_legacy(db_instance, rrddim_id(rd), rrdset_id(rd->rrdset)); + if (metric) + uuid_copy(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric)); } + + if(likely(!metric)) + metric = rrdeng_metric_create(db_instance, &rd->metric_uuid); } - if(!db_metric_handle) - db_metric_handle = rrdeng_metric_create(db_instance, &rd->metric_uuid); #ifdef NETDATA_INTERNAL_CHECKS - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - if(uuid_compare(rd->metric_uuid, page_index->id) != 0) { + if(uuid_compare(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric)) != 0) { char uuid1[UUID_STR_LEN + 1]; char 
uuid2[UUID_STR_LEN + 1]; uuid_unparse(rd->metric_uuid, uuid1); - uuid_unparse(page_index->id, uuid2); - fatal("DBENGINE: uuids do not match, asked for metric '%s', but got page_index of metric '%s'", uuid1, uuid2); + uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid2); + fatal("DBENGINE: uuids do not match, asked for metric '%s', but got metric '%s'", uuid1, uuid2); } - struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; - if(page_index->ctx != ctx) - fatal("DBENGINE: mixed up rrdengine instances, asked for metric from %p, got from %p", ctx, page_index->ctx); + if(mrg_metric_ctx(metric) != ctx) + fatal("DBENGINE: mixed up db instances, asked for metric from %p, got from %p", + ctx, mrg_metric_ctx(metric)); #endif - return db_metric_handle; + return (STORAGE_METRIC_HANDLE *)metric; } // ---------------------------------------------------------------------------- // collect ops +static inline void check_and_fix_mrg_update_every(struct rrdeng_collect_handle *handle) { + if(unlikely((time_t)(handle->update_every_ut / USEC_PER_SEC) != mrg_metric_get_update_every_s(main_mrg, handle->metric))) { + internal_error(true, "DBENGINE: collection handle has update every %ld, but the metric registry has %ld. Fixing it.", + (time_t)(handle->update_every_ut / USEC_PER_SEC), mrg_metric_get_update_every_s(main_mrg, handle->metric)); + + if(unlikely(!handle->update_every_ut)) + handle->update_every_ut = (usec_t)mrg_metric_get_update_every_s(main_mrg, handle->metric) * USEC_PER_SEC; + else + mrg_metric_set_update_every(main_mrg, handle->metric, (time_t)(handle->update_every_ut / USEC_PER_SEC)); + } +} + +static inline bool check_completed_page_consistency(struct rrdeng_collect_handle *handle __maybe_unused) { +#ifdef NETDATA_INTERNAL_CHECKS + if (unlikely(!handle->page || !handle->page_entries_max || !handle->page_position || !handle->page_end_time_ut)) + return false; + + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + uuid_t *uuid = mrg_metric_uuid(main_mrg, handle->metric); + time_t start_time_s = pgc_page_start_time_s(handle->page); + time_t end_time_s = pgc_page_end_time_s(handle->page); + time_t update_every_s = pgc_page_update_every_s(handle->page); + size_t page_length = handle->page_position * CTX_POINT_SIZE_BYTES(ctx); + size_t entries = handle->page_position; + time_t overwrite_zero_update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); + + if(end_time_s > max_acceptable_collected_time()) + handle->page_flags |= RRDENG_PAGE_COMPLETED_IN_FUTURE; + + VALIDATED_PAGE_DESCRIPTOR vd = validate_page( + uuid, + start_time_s, + end_time_s, + update_every_s, + page_length, + ctx->config.page_type, + entries, + 0, // do not check for future timestamps - we inherit the timestamps of the children + overwrite_zero_update_every_s, + false, + "collected", + handle->page_flags); + + return vd.is_valid; +#else + return true; +#endif +} + /* * Gets a handle for storing metrics to the database. * The handle must be released with rrdeng_store_metric_final(). 
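 *
 * Editor's illustrative aside (not part of the upstream change): a minimal
 * caller sequence for this collection API, assuming a metric handle already
 * acquired with rrdeng_metric_get_or_create() and placeholder values for the
 * collected sample:
 *
 *     STORAGE_COLLECT_HANDLE *ch = rrdeng_store_metric_init(metric_handle, update_every, smg);
 *     rrdeng_store_metric_next(ch, point_in_time_ut, value, value, value, 1, 0, flags);
 *     // ... one call per collected point ...
 *     rrdeng_store_metric_finalize(ch);  // flushes the last page; returns 1 only if the metric has no retention left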
*/ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; + METRIC *metric = (METRIC *)db_metric_handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(metric); + + bool is_1st_metric_writer = true; + if(!mrg_metric_set_writer(main_mrg, metric)) { + is_1st_metric_writer = false; + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid); + error("DBENGINE: metric '%s' is already collected and should not be collected twice - expect gaps on the charts", uuid); + } + + metric = mrg_metric_dup(main_mrg, metric); + struct rrdeng_collect_handle *handle; handle = callocz(1, sizeof(struct rrdeng_collect_handle)); - handle->page_index = page_index; - handle->descr = NULL; - handle->unaligned_page = 0; - page_index->latest_update_every_s = update_every; + handle->metric = metric; + handle->page = NULL; + handle->page_position = 0; + handle->page_entries_max = 0; + handle->update_every_ut = (usec_t)update_every * USEC_PER_SEC; + handle->options = is_1st_metric_writer ? RRDENG_1ST_METRIC_WRITER : 0; + + __atomic_add_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED); + if(!is_1st_metric_writer) + __atomic_add_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED); + + mrg_metric_set_update_every(main_mrg, metric, update_every); handle->alignment = (struct pg_alignment *)smg; rrdeng_page_alignment_acquire(handle->alignment); - uv_rwlock_wrlock(&page_index->lock); - ++page_index->writers; - uv_rwlock_wrunlock(&page_index->lock); + // this is important! + // if we don't set the page_end_time_ut during the first collection + // data collection may be able to go back in time and during the addition of new pages + // clean pages may be found matching ours! 
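    // Editor's illustrative aside (not part of the upstream change), with
    // made-up numbers: if this metric already has data stored up to T=1000s
    // with an update every of 10s, seeding handle->page_end_time_ut from the
    // retention below means an incoming sample stamped 1000s (or earlier) is
    // rejected later in rrdeng_store_metric_next() as a repeated/past
    // collection, while a sample stamped 1010s simply continues the page.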
+ + time_t db_first_time_s, db_last_time_s, db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + handle->page_end_time_ut = (usec_t)db_last_time_s * USEC_PER_SEC; return (STORAGE_COLLECT_HANDLE *)handle; } /* The page must be populated and referenced */ -static int page_has_only_empty_metrics(struct rrdeng_page_descr *descr) -{ - switch(descr->type) { +static bool page_has_only_empty_metrics(struct rrdeng_collect_handle *handle) { + switch(handle->type) { case PAGE_METRICS: { - size_t slots = descr->page_length / PAGE_POINT_SIZE_BYTES(descr); - storage_number *array = (storage_number *)descr->pg_cache_descr->page; + size_t slots = handle->page_position; + storage_number *array = (storage_number *)pgc_page_data(handle->page); for (size_t i = 0 ; i < slots; ++i) { if(does_storage_number_exist(array[i])) - return 0; + return false; } } break; case PAGE_TIER: { - size_t slots = descr->page_length / PAGE_POINT_SIZE_BYTES(descr); - storage_number_tier1_t *array = (storage_number_tier1_t *)descr->pg_cache_descr->page; + size_t slots = handle->page_position; + storage_number_tier1_t *array = (storage_number_tier1_t *)pgc_page_data(handle->page); for (size_t i = 0 ; i < slots; ++i) { if(fpclassify(array[i].sum_value) != FP_NAN) - return 0; + return false; } } break; @@ -252,422 +309,585 @@ static int page_has_only_empty_metrics(struct rrdeng_page_descr *descr) default: { static bool logged = false; if(!logged) { - error("DBENGINE: cannot check page for nulls on unknown page type id %d", descr->type); + error("DBENGINE: cannot check page for nulls on unknown page type id %d", (mrg_metric_ctx(handle->metric))->config.page_type); logged = true; } - return 0; + return false; } } - return 1; + return true; } void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - // struct rrdeng_metric_handle *metric_handle = (struct rrdeng_metric_handle *)handle->metric_handle; - struct rrdengine_instance *ctx = handle->page_index->ctx; - struct rrdeng_page_descr *descr = handle->descr; - - if (unlikely(!ctx)) return; - if (unlikely(!descr)) return; - - if (likely(descr->page_length)) { - int page_is_empty; - - rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); - - page_is_empty = page_has_only_empty_metrics(descr); - if (page_is_empty) { - print_page_cache_descr(descr, "Page has empty metrics only, deleting", true); - pg_cache_put(ctx, descr); - pg_cache_punch_hole(ctx, descr, 1, 0, NULL); - } else - rrdeng_commit_page(ctx, descr, handle->page_correlation_id); - } else { - dbengine_page_free(descr->pg_cache_descr->page); - rrdeng_destroy_pg_cache_descr(ctx, descr->pg_cache_descr); - rrdeng_page_descr_freez(descr); + + if (unlikely(!handle->page)) + return; + + if(!handle->page_position || page_has_only_empty_metrics(handle)) + pgc_page_to_clean_evict_or_release(main_cache, handle->page); + + else { + check_completed_page_consistency(handle); + mrg_metric_set_clean_latest_time_s(main_mrg, handle->metric, pgc_page_end_time_s(handle->page)); + pgc_page_hot_to_dirty_and_release(main_cache, handle->page); } - handle->descr = NULL; -} -static void rrdeng_store_metric_next_internal(STORAGE_COLLECT_HANDLE *collection_handle, - usec_t point_in_time_ut, - NETDATA_DOUBLE n, - NETDATA_DOUBLE min_value, - NETDATA_DOUBLE max_value, - uint16_t count, - uint16_t anomaly_count, - SN_FLAGS flags) -{ - struct rrdeng_collect_handle 
*handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; - struct rrdengine_instance *ctx = handle->page_index->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = handle->descr; + mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, 0); - void *page; - uint8_t must_flush_unaligned_page = 0, perfect_page_alignment = 0; + handle->page = NULL; + handle->page_flags = 0; + handle->page_position = 0; + handle->page_entries_max = 0; - if (descr) { - /* Make alignment decisions */ + // important! + // we should never zero page end time ut, because this will allow + // collection to go back in time + // handle->page_end_time_ut = 0; + // handle->page_start_time_ut; + + check_and_fix_mrg_update_every(handle); +} + +static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *handle, + struct rrdengine_instance *ctx, + usec_t point_in_time_ut, + void *data, + size_t data_size) { + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + const time_t update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); + + PGC_ENTRY page_entry = { + .section = (Word_t) ctx, + .metric_id = mrg_metric_id(main_mrg, handle->metric), + .start_time_s = point_in_time_s, + .end_time_s = point_in_time_s, + .size = data_size, + .data = data, + .update_every_s = update_every_s, + .hot = true + }; + + size_t conflicts = 0; + bool added = true; + PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + while (unlikely(!added)) { + conflicts++; + + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); #ifdef NETDATA_INTERNAL_CHECKS - if(descr->end_time_ut + page_index->latest_update_every_s * USEC_PER_SEC != point_in_time_ut) { - char buffer[200 + 1]; - snprintfz(buffer, 200, - "metrics collected are %s, end_time_ut = %llu, point_in_time_ut = %llu, update_every = %u, delta = %llu", - (point_in_time_ut / USEC_PER_SEC - descr->end_time_ut / USEC_PER_SEC > page_index->latest_update_every_s)?"far apart":"not aligned", - descr->end_time_ut / USEC_PER_SEC, - point_in_time_ut / USEC_PER_SEC, - page_index->latest_update_every_s, - point_in_time_ut / USEC_PER_SEC - descr->end_time_ut / USEC_PER_SEC); - print_page_cache_descr(descr, buffer, false); - } + internal_error(true, +#else + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, #endif + "DBENGINE: metric '%s' new page from %ld to %ld, update every %ld, has a conflict in main cache " + "with existing %s%s page from %ld to %ld, update every %ld - " + "is it collected more than once?", + uuid, + page_entry.start_time_s, page_entry.end_time_s, (time_t)page_entry.update_every_s, + pgc_is_page_hot(page) ? "hot" : "not-hot", + pgc_page_data(page) == DBENGINE_EMPTY_PAGE ? " gap" : "", + pgc_page_start_time_s(page), pgc_page_end_time_s(page), pgc_page_update_every_s(page) + ); + + pgc_page_release(main_cache, page); + + point_in_time_ut -= handle->update_every_ut; + point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + page_entry.start_time_s = point_in_time_s; + page_entry.end_time_s = point_in_time_s; + page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + } - if (descr->page_length == handle->alignment->page_length) { - /* this is the leading dimension that defines chart alignment */ - perfect_page_alignment = 1; - } - /* is the metric far enough out of alignment with the others? 
*/ - if (unlikely(descr->page_length + PAGE_POINT_SIZE_BYTES(descr) < handle->alignment->page_length)) { - handle->unaligned_page = 1; - print_page_cache_descr(descr, "Metric page is not aligned with chart", true); - } - if (unlikely(handle->unaligned_page && - /* did the other metrics change page? */ - handle->alignment->page_length <= PAGE_POINT_SIZE_BYTES(descr))) { - print_page_cache_descr(descr, "must_flush_unaligned_page = 1", true); - must_flush_unaligned_page = 1; - handle->unaligned_page = 0; - } + handle->page_entries_max = data_size / CTX_POINT_SIZE_BYTES(ctx); + handle->page_start_time_ut = point_in_time_ut; + handle->page_end_time_ut = point_in_time_ut; + handle->page_position = 1; // zero is already in our data + handle->page = page; + handle->page_flags = conflicts? RRDENG_PAGE_CONFLICT : 0; + + if(point_in_time_s > max_acceptable_collected_time()) + handle->page_flags |= RRDENG_PAGE_CREATED_IN_FUTURE; + + check_and_fix_mrg_update_every(handle); +} + +static void *rrdeng_alloc_new_metric_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + size_t size; + + if(handle->options & RRDENG_FIRST_PAGE_ALLOCATED) { + // any page except the first + size = tier_page_size[ctx->config.tier]; } - if (unlikely(NULL == descr || - descr->page_length + PAGE_POINT_SIZE_BYTES(descr) > RRDENG_BLOCK_SIZE || - must_flush_unaligned_page)) { + else { + size_t final_slots = 0; - if(descr) { - print_page_cache_descr(descr, "flushing metric", true); - rrdeng_store_metric_flush_current_page(collection_handle); + // the first page + handle->options |= RRDENG_FIRST_PAGE_ALLOCATED; + size_t max_size = tier_page_size[ctx->config.tier]; + size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx); + + if(handle->alignment->initial_slots) { + final_slots = handle->alignment->initial_slots; } + else { + max_slots -= 3; - page = rrdeng_create_page(ctx, &page_index->id, &descr); - fatal_assert(page); + size_t smaller_slot = indexing_partition((Word_t)handle->alignment, max_slots); + final_slots = smaller_slot; - descr->update_every_s = page_index->latest_update_every_s; - handle->descr = descr; + time_t now_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + size_t current_pos = (now_s % max_slots); - handle->page_correlation_id = rrd_atomic_fetch_add(&pg_cache->committed_page_index.latest_corr_id, 1); + if(current_pos > final_slots) + final_slots += max_slots - current_pos; - if (0 == handle->alignment->page_length) { - /* this is the leading dimension that defines chart alignment */ - perfect_page_alignment = 1; + else if(current_pos < final_slots) + final_slots -= current_pos; + + if(final_slots < 3) { + final_slots += 3; + smaller_slot += 3; + + if(smaller_slot >= max_slots) + smaller_slot -= max_slots; + } + + max_slots += 3; + handle->alignment->initial_slots = smaller_slot + 3; + + internal_fatal(handle->alignment->initial_slots < 3 || handle->alignment->initial_slots >= max_slots, "ooops! wrong distribution of metrics across time"); + internal_fatal(final_slots < 3 || final_slots >= max_slots, "ooops! 
wrong distribution of metrics across time"); } + + size = final_slots * CTX_POINT_SIZE_BYTES(ctx); } - page = descr->pg_cache_descr->page; + *data_size = size; + return dbengine_page_alloc(size); +} + +static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_handle, + const usec_t point_in_time_ut, + const NETDATA_DOUBLE n, + const NETDATA_DOUBLE min_value, + const NETDATA_DOUBLE max_value, + const uint16_t count, + const uint16_t anomaly_count, + const SN_FLAGS flags) +{ + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); - switch (descr->type) { + bool perfect_page_alignment = false; + void *data; + size_t data_size; + + if(likely(handle->page)) { + /* Make alignment decisions */ + if (handle->page_position == handle->alignment->page_position) { + /* this is the leading dimension that defines chart alignment */ + perfect_page_alignment = true; + } + + /* is the metric far enough out of alignment with the others? */ + if (unlikely(handle->page_position + 1 < handle->alignment->page_position)) + handle->options |= RRDENG_CHO_UNALIGNED; + + if (unlikely((handle->options & RRDENG_CHO_UNALIGNED) && + /* did the other metrics change page? */ + handle->alignment->page_position <= 1)) { + handle->options &= ~RRDENG_CHO_UNALIGNED; + handle->page_flags |= RRDENG_PAGE_UNALIGNED; + rrdeng_store_metric_flush_current_page(collection_handle); + + data = rrdeng_alloc_new_metric_data(handle, &data_size, point_in_time_ut); + } + else { + data = pgc_page_data(handle->page); + data_size = pgc_page_data_size(main_cache, handle->page); + } + } + else + data = rrdeng_alloc_new_metric_data(handle, &data_size, point_in_time_ut); + + switch (ctx->config.page_type) { case PAGE_METRICS: { - ((storage_number *)page)[descr->page_length / PAGE_POINT_SIZE_BYTES(descr)] = pack_storage_number(n, flags); + storage_number *tier0_metric_data = data; + tier0_metric_data[handle->page_position] = pack_storage_number(n, flags); } break; case PAGE_TIER: { + storage_number_tier1_t *tier12_metric_data = data; storage_number_tier1_t number_tier1; number_tier1.sum_value = (float)n; number_tier1.min_value = (float)min_value; number_tier1.max_value = (float)max_value; number_tier1.anomaly_count = anomaly_count; number_tier1.count = count; - ((storage_number_tier1_t *)page)[descr->page_length / PAGE_POINT_SIZE_BYTES(descr)] = number_tier1; + tier12_metric_data[handle->page_position] = number_tier1; } break; default: { static bool logged = false; if(!logged) { - error("DBENGINE: cannot store metric on unknown page type id %d", descr->type); + error("DBENGINE: cannot store metric on unknown page type id %d", ctx->config.page_type); logged = true; } } break; } - pg_cache_atomic_set_pg_info(descr, point_in_time_ut, descr->page_length + PAGE_POINT_SIZE_BYTES(descr)); + if(unlikely(!handle->page)){ + rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, data, data_size); + // handle->position is set to 1 already - if (perfect_page_alignment) - handle->alignment->page_length = descr->page_length; - if (unlikely(INVALID_TIME == descr->start_time_ut)) { - unsigned long new_metric_API_producers, old_metric_API_max_producers, ret_metric_API_max_producers; - descr->start_time_ut = point_in_time_ut; - - new_metric_API_producers = rrd_atomic_add_fetch(&ctx->stats.metric_API_producers, 1); - while (unlikely(new_metric_API_producers > (old_metric_API_max_producers = ctx->metric_API_max_producers))) { - /* 
Increase ctx->metric_API_max_producers */ - ret_metric_API_max_producers = ulong_compare_and_swap(&ctx->metric_API_max_producers, - old_metric_API_max_producers, - new_metric_API_producers); - if (old_metric_API_max_producers == ret_metric_API_max_producers) { - /* success */ - break; - } + if (0 == handle->alignment->page_position) { + /* this is the leading dimension that defines chart alignment */ + perfect_page_alignment = true; + } + } + else { + // update an existing page + pgc_page_hot_set_end_time_s(main_cache, handle->page, (time_t) (point_in_time_ut / USEC_PER_SEC)); + handle->page_end_time_ut = point_in_time_ut; + + if(unlikely(++handle->page_position >= handle->page_entries_max)) { + internal_fatal(handle->page_position > handle->page_entries_max, "DBENGINE: exceeded page max number of points"); + handle->page_flags |= RRDENG_PAGE_FULL; + rrdeng_store_metric_flush_current_page(collection_handle); } + } + + if (perfect_page_alignment) + handle->alignment->page_position = handle->page_position; + + // update the metric information + mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, (time_t) (point_in_time_ut / USEC_PER_SEC)); +} + +static void store_metric_next_error_log(struct rrdeng_collect_handle *handle, usec_t point_in_time_ut, const char *msg) { + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); - pg_cache_insert(ctx, page_index, descr); - } else { - pg_cache_add_new_metric_time(page_index, descr); + BUFFER *wb = NULL; + if(handle->page && handle->page_flags) { + wb = buffer_create(0, NULL); + collect_page_flags_to_buffer(wb, handle->page_flags); } -// { -// unsigned char u[16] = { 0x0C, 0x0A, 0x40, 0xD6, 0x2A, 0x43, 0x4A, 0x7C, 0x95, 0xF7, 0xD1, 0x1E, 0x0C, 0x9E, 0x8A, 0xE7 }; -// if(uuid_compare(u, page_index->id) == 0) { -// char buffer[100]; -// snprintfz(buffer, 100, "store system.cpu, collect:%u, page_index first:%u, last:%u", -// (uint32_t)(point_in_time / USEC_PER_SEC), -// (uint32_t)(page_index->oldest_time / USEC_PER_SEC), -// (uint32_t)(page_index->latest_time / USEC_PER_SEC)); -// -// print_page_cache_descr(descr, buffer, false); -// } -// } + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, + "DBENGINE: metric '%s' collected point at %ld, %s last collection at %ld, " + "update every %ld, %s page from %ld to %ld, position %u (of %u), flags: %s", + uuid, + point_in_time_s, + msg, + (time_t)(handle->page_end_time_ut / USEC_PER_SEC), + (time_t)(handle->update_every_ut / USEC_PER_SEC), + handle->page ? "current" : "*LAST*", + (time_t)(handle->page_start_time_ut / USEC_PER_SEC), + (time_t)(handle->page_end_time_ut / USEC_PER_SEC), + handle->page_position, handle->page_entries_max, + wb ? 
buffer_tostring(wb) : "" + ); + + buffer_free(wb); } void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, - usec_t point_in_time_ut, - NETDATA_DOUBLE n, - NETDATA_DOUBLE min_value, - NETDATA_DOUBLE max_value, - uint16_t count, - uint16_t anomaly_count, - SN_FLAGS flags) + const usec_t point_in_time_ut, + const NETDATA_DOUBLE n, + const NETDATA_DOUBLE min_value, + const NETDATA_DOUBLE max_value, + const uint16_t count, + const uint16_t anomaly_count, + const SN_FLAGS flags) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; - struct rrdeng_page_descr *descr = handle->descr; - - if(likely(descr)) { - usec_t last_point_in_time_ut = descr->end_time_ut; - usec_t update_every_ut = page_index->latest_update_every_s * USEC_PER_SEC; - size_t points_gap = (point_in_time_ut <= last_point_in_time_ut) ? - (size_t)0 : - (size_t)((point_in_time_ut - last_point_in_time_ut) / update_every_ut); - - if(unlikely(points_gap != 1)) { - if (unlikely(points_gap <= 0)) { - time_t now = now_realtime_sec(); - static __thread size_t counter = 0; - static __thread time_t last_time_logged = 0; - counter++; - - if(now - last_time_logged > 600) { - error("DBENGINE: collected point is in the past (repeated %zu times in the last %zu secs). Ignoring these data collection points.", - counter, (size_t)(last_time_logged?(now - last_time_logged):0)); - - last_time_logged = now; - counter = 0; - } - return; - } - size_t point_size = PAGE_POINT_SIZE_BYTES(descr); - size_t page_size_in_points = RRDENG_BLOCK_SIZE / point_size; - size_t used_points = descr->page_length / point_size; - size_t remaining_points_in_page = page_size_in_points - used_points; +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(point_in_time_ut > (usec_t)max_acceptable_collected_time() * USEC_PER_SEC)) + handle->page_flags |= RRDENG_PAGE_FUTURE_POINT; +#endif + + if(likely(handle->page_end_time_ut + handle->update_every_ut == point_in_time_ut)) { + // happy path + ; + } + else if(unlikely(point_in_time_ut < handle->page_end_time_ut)) { + handle->page_flags |= RRDENG_PAGE_PAST_COLLECTION; + store_metric_next_error_log(handle, point_in_time_ut, "is older than the"); + return; + } - bool new_point_is_aligned = true; - if(unlikely((point_in_time_ut - last_point_in_time_ut) / points_gap != update_every_ut)) - new_point_is_aligned = false; + else if(unlikely(point_in_time_ut == handle->page_end_time_ut)) { + handle->page_flags |= RRDENG_PAGE_REPEATED_COLLECTION; + store_metric_next_error_log(handle, point_in_time_ut, "is at the same time as the"); + return; + } - if(unlikely(points_gap > remaining_points_in_page || !new_point_is_aligned)) { -// char buffer[200]; -// snprintfz(buffer, 200, "data collection skipped %zu points, last stored point %llu, new point %llu, update every %d. 
Cutting page.", -// points_gap, last_point_in_time_ut / USEC_PER_SEC, point_in_time_ut / USEC_PER_SEC, page_index->latest_update_every_s); -// print_page_cache_descr(descr, buffer, false); + else if(handle->page) { + usec_t delta_ut = point_in_time_ut - handle->page_end_time_ut; + if(unlikely(delta_ut < handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else if(unlikely(delta_ut % handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_UNALIGNED; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else { + size_t points_gap = delta_ut / handle->update_every_ut; + size_t page_remaining_points = handle->page_entries_max - handle->page_position; + + if(points_gap >= page_remaining_points) { + handle->page_flags |= RRDENG_PAGE_BIG_GAP; rrdeng_store_metric_flush_current_page(collection_handle); } else { -// char buffer[200]; -// snprintfz(buffer, 200, "data collection skipped %zu points, last stored point %llu, new point %llu, update every %d. Filling the gap.", -// points_gap, last_point_in_time_ut / USEC_PER_SEC, point_in_time_ut / USEC_PER_SEC, page_index->latest_update_every_s); -// print_page_cache_descr(descr, buffer, false); - // loop to fill the gap - usec_t step_ut = page_index->latest_update_every_s * USEC_PER_SEC; - usec_t last_point_filled_ut = last_point_in_time_ut + step_ut; - - while (last_point_filled_ut < point_in_time_ut) { - rrdeng_store_metric_next_internal( - collection_handle, last_point_filled_ut, NAN, NAN, NAN, - 1, 0, SN_EMPTY_SLOT); - - last_point_filled_ut += step_ut; + handle->page_flags |= RRDENG_PAGE_GAP; + + usec_t stop_ut = point_in_time_ut - handle->update_every_ut; + for(usec_t this_ut = handle->page_end_time_ut + handle->update_every_ut; + this_ut <= stop_ut ; + this_ut = handle->page_end_time_ut + handle->update_every_ut) { + rrdeng_store_metric_append_point( + collection_handle, + this_ut, + NAN, NAN, NAN, + 1, 0, + SN_EMPTY_SLOT); } } } } - rrdeng_store_metric_next_internal(collection_handle, point_in_time_ut, n, min_value, max_value, count, anomaly_count, flags); + rrdeng_store_metric_append_point(collection_handle, + point_in_time_ut, + n, min_value, max_value, + count, anomaly_count, + flags); } - /* * Releases the database reference from the handle for storing metrics. * Returns 1 if it's safe to delete the dimension. 
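 *
 * Editor's note (not part of the upstream change): in the rewritten
 * implementation below, "safe to delete" means the metric registry reports no
 * retention at all, i.e. mrg_metric_get_retention() returns zero for both
 * first_time_s and last_time_s after the final page flush.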
*/ int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; - - uint8_t can_delete_metric = 0; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + handle->page_flags |= RRDENG_PAGE_COLLECT_FINALIZE; rrdeng_store_metric_flush_current_page(collection_handle); - uv_rwlock_wrlock(&page_index->lock); + rrdeng_page_alignment_release(handle->alignment); - if (!--page_index->writers && !page_index->page_count) - can_delete_metric = 1; + __atomic_sub_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED); + if(!(handle->options & RRDENG_1ST_METRIC_WRITER)) + __atomic_sub_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED); - uv_rwlock_wrunlock(&page_index->lock); + if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_clear_writer(main_mrg, handle->metric)) + internal_fatal(true, "DBENGINE: metric is already released"); - rrdeng_page_alignment_release(handle->alignment); + time_t first_time_s, last_time_s, update_every_s; + mrg_metric_get_retention(main_mrg, handle->metric, &first_time_s, &last_time_s, &update_every_s); + + mrg_metric_release(main_mrg, handle->metric); freez(handle); - return can_delete_metric; + if(!first_time_s && !last_time_s) + return 1; + + return 0; } void rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; + check_and_fix_mrg_update_every(handle); + + METRIC *metric = handle->metric; + usec_t update_every_ut = (usec_t)update_every * USEC_PER_SEC; + + if(update_every_ut == handle->update_every_ut) + return; + + handle->page_flags |= RRDENG_PAGE_UPDATE_EVERY_CHANGE; rrdeng_store_metric_flush_current_page(collection_handle); - uv_rwlock_rdlock(&page_index->lock); - page_index->latest_update_every_s = update_every; - uv_rwlock_rdunlock(&page_index->lock); + mrg_metric_set_update_every(main_mrg, metric, update_every); + handle->update_every_ut = update_every_ut; } // ---------------------------------------------------------------------------- // query ops -//static inline uint32_t *pginfo_to_dt(struct rrdeng_page_info *page_info) -//{ -// return (uint32_t *)&page_info->scratch[0]; -//} -// -//static inline uint32_t *pginfo_to_points(struct rrdeng_page_info *page_info) -//{ -// return (uint32_t *)&page_info->scratch[sizeof(uint32_t)]; -//} -// +#ifdef NETDATA_INTERNAL_CHECKS +SPINLOCK global_query_handle_spinlock = NETDATA_SPINLOCK_INITIALIZER; +static struct rrdeng_query_handle *global_query_handle_ll = NULL; +static void register_query_handle(struct rrdeng_query_handle *handle) { + handle->query_pid = gettid(); + handle->started_time_s = now_realtime_sec(); + + netdata_spinlock_lock(&global_query_handle_spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); + netdata_spinlock_unlock(&global_query_handle_spinlock); +} +static void unregister_query_handle(struct rrdeng_query_handle *handle) { + netdata_spinlock_lock(&global_query_handle_spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); + netdata_spinlock_unlock(&global_query_handle_spinlock); +} +#else +static void register_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { + ; +} +static void 
unregister_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { + ; +} +#endif + /* * Gets a handle for loading metrics from the database. * The handle must be released with rrdeng_load_metric_final(). */ -void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrdimm_handle, time_t start_time_s, time_t end_time_s) +void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, + struct storage_engine_query_handle *rrddim_handle, + time_t start_time_s, + time_t end_time_s, + STORAGE_PRIORITY priority) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - struct rrdengine_instance *ctx = page_index->ctx; + usec_t started_ut = now_monotonic_usec(); - // fprintf(stderr, "%s: %s/%s start time %ld, end time %ld\n", __FUNCTION__ , rd->rrdset->name, rd->name, start_time, end_time); + netdata_thread_disable_cancelability(); + METRIC *metric = (METRIC *)db_metric_handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(metric); struct rrdeng_query_handle *handle; - unsigned pages_nr; - if(!page_index->latest_update_every_s) - page_index->latest_update_every_s = default_rrd_update_every; + handle = rrdeng_query_handle_get(); + register_query_handle(handle); - rrdimm_handle->start_time_s = start_time_s; - rrdimm_handle->end_time_s = end_time_s; + if(unlikely(priority < STORAGE_PRIORITY_HIGH)) + priority = STORAGE_PRIORITY_HIGH; + else if(unlikely(priority > STORAGE_PRIORITY_BEST_EFFORT)) + priority = STORAGE_PRIORITY_BEST_EFFORT; - handle = callocz(1, sizeof(struct rrdeng_query_handle)); - handle->wanted_start_time_s = start_time_s; - handle->now_s = start_time_s; - handle->position = 0; handle->ctx = ctx; - handle->descr = NULL; - handle->dt_s = page_index->latest_update_every_s; - rrdimm_handle->handle = (STORAGE_QUERY_HANDLE *)handle; - pages_nr = pg_cache_preload(ctx, &page_index->id, start_time_s * USEC_PER_SEC, end_time_s * USEC_PER_SEC, - NULL, &handle->page_index); - if (unlikely(NULL == handle->page_index || 0 == pages_nr)) - // there are no metrics to load - handle->wanted_start_time_s = INVALID_TIME; -} + handle->metric = metric; + handle->priority = priority; + + // IMPORTANT! + // It is crucial not to exceed the db boundaries, because dbengine + // now has gap caching, so when a gap is detected a negative page + // is inserted into the main cache, to avoid scanning the journals + // again for pages matching the gap. 
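    // Editor's illustrative aside (not part of the upstream change), with
    // made-up numbers: the code below clamps the requested window to the
    // metric's retention. With retention [1000 .. 2000], a query for
    // [500 .. 1500] gets an effective window of [1000 .. 1500] (MAX of the
    // start times, MIN of the end times), while a query for [2500 .. 3000]
    // falls outside retention and gets a handle whose end_time_s is 0, so
    // rrdeng_load_metric_next() only returns empty points for it.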
+ + time_t db_first_time_s, db_last_time_s, db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + + if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) == PAGE_IS_IN_RANGE) { + handle->start_time_s = MAX(start_time_s, db_first_time_s); + handle->end_time_s = MIN(end_time_s, db_last_time_s); + handle->now_s = handle->start_time_s; + + handle->dt_s = db_update_every_s; + if (!handle->dt_s) { + handle->dt_s = default_rrd_update_every; + mrg_metric_set_update_every_s_if_zero(main_mrg, metric, default_rrd_update_every); + } -static int rrdeng_load_page_next(struct storage_engine_query_handle *rrdimm_handle, bool debug_this __maybe_unused) { - struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; + rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle; + rrddim_handle->start_time_s = handle->start_time_s; + rrddim_handle->end_time_s = handle->end_time_s; + rrddim_handle->priority = priority; - struct rrdengine_instance *ctx = handle->ctx; - struct rrdeng_page_descr *descr = handle->descr; + pg_cache_preload(handle); - uint32_t page_length; - usec_t page_end_time_ut; - unsigned position; + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_init, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED); + } + else { + handle->start_time_s = start_time_s; + handle->end_time_s = end_time_s; + handle->now_s = start_time_s; + handle->dt_s = db_update_every_s; + + rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle; + rrddim_handle->start_time_s = handle->start_time_s; + rrddim_handle->end_time_s = 0; + rrddim_handle->priority = priority; + } +} - if (likely(descr)) { - // Drop old page's reference +static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_handle, bool debug_this __maybe_unused) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; + struct rrdengine_instance *ctx = handle->ctx; -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); -#endif + if (likely(handle->page)) { + // we have a page to release + pgc_page_release(main_cache, handle->page); + handle->page = NULL; + } - pg_cache_put(ctx, descr); - handle->descr = NULL; - handle->wanted_start_time_s = (time_t)((handle->page_end_time_ut / USEC_PER_SEC) + handle->dt_s); + if (unlikely(handle->now_s > rrddim_handle->end_time_s)) + return false; - if (unlikely(handle->wanted_start_time_s > rrdimm_handle->end_time_s)) - return 1; - } + size_t entries; + handle->page = pg_cache_lookup_next(ctx, handle->pdc, handle->now_s, handle->dt_s, &entries); + if (unlikely(!handle->page)) + return false; - usec_t wanted_start_time_ut = handle->wanted_start_time_s * USEC_PER_SEC; - descr = pg_cache_lookup_next(ctx, handle->page_index, &handle->page_index->id, - wanted_start_time_ut, rrdimm_handle->end_time_s * USEC_PER_SEC); - if (NULL == descr) - return 1; + internal_fatal(pgc_page_data(handle->page) == DBENGINE_EMPTY_PAGE, "Empty page returned"); -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, 1); -#endif + time_t page_start_time_s = pgc_page_start_time_s(handle->page); + time_t page_end_time_s = pgc_page_end_time_s(handle->page); + time_t page_update_every_s = pgc_page_update_every_s(handle->page); - handle->descr = descr; - pg_cache_atomic_get_pg_info(descr, &page_end_time_ut, &page_length); - if (unlikely(INVALID_TIME == descr->start_time_ut || INVALID_TIME == 
page_end_time_ut || 0 == descr->update_every_s)) { - error("DBENGINE: discarding invalid page descriptor (start_time = %llu, end_time = %llu, update_every_s = %d)", - descr->start_time_ut, page_end_time_ut, descr->update_every_s); - return 1; - } + unsigned position; + if(likely(handle->now_s >= page_start_time_s && handle->now_s <= page_end_time_s)) { + + if(unlikely(entries == 1 || page_start_time_s == page_end_time_s || !page_update_every_s)) { + position = 0; + handle->now_s = page_start_time_s; + } + else { + position = (handle->now_s - page_start_time_s) * (entries - 1) / (page_end_time_s - page_start_time_s); + time_t point_end_time_s = page_start_time_s + position * page_update_every_s; + while(point_end_time_s < handle->now_s && position + 1 < entries) { + // https://github.com/netdata/netdata/issues/14411 + // we really need a while() here, because the delta may be + // 2 points at higher tiers + position++; + point_end_time_s = page_start_time_s + position * page_update_every_s; + } + handle->now_s = point_end_time_s; + } - if (unlikely(descr->start_time_ut != page_end_time_ut && wanted_start_time_ut > descr->start_time_ut)) { - // we're in the middle of the page somewhere - unsigned entries = page_length / PAGE_POINT_SIZE_BYTES(descr); - position = ((uint64_t)(wanted_start_time_ut - descr->start_time_ut)) * (entries - 1) / - (page_end_time_ut - descr->start_time_ut); + internal_fatal(position >= entries, "DBENGINE: wrong page position calculation"); } - else + else if(handle->now_s < page_start_time_s) { + handle->now_s = page_start_time_s; position = 0; + } + else { + internal_fatal(true, "DBENGINE: this page is entirely in our past and should not be accepted for this query in the first place"); + handle->now_s = page_end_time_s; + position = entries - 1; + } - handle->page_end_time_ut = page_end_time_ut; - handle->page_length = page_length; - handle->entries = page_length / PAGE_POINT_SIZE_BYTES(descr); - handle->page = descr->pg_cache_descr->page; - handle->dt_s = descr->update_every_s; + handle->entries = entries; handle->position = position; - -// if(debug_this) -// info("DBENGINE: rrdeng_load_page_next(), " -// "position:%d, " -// "start_time_ut:%llu, " -// "page_end_time_ut:%llu, " -// "next_page_time_ut:%llu, " -// "in_out:%s" -// , position -// , descr->start_time_ut -// , page_end_time_ut -// , -// wanted_start_time_ut, in_out?"true":"false" -// ); - - return 0; + handle->metric_data = pgc_page_data((PGC_PAGE *)handle->page); + handle->dt_s = page_update_every_s; + return true; } // Returns the metric and sets its timestamp into current_time @@ -675,75 +895,28 @@ static int rrdeng_load_page_next(struct storage_engine_query_handle *rrdimm_hand // IT IS REQUIRED TO **ALWAYS** KEEP TRACK OF TIME, EVEN OUTSIDE THE DATABASE BOUNDARIES STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim_handle) { struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - // struct rrdeng_metric_handle *metric_handle = handle->metric_handle; - - struct rrdeng_page_descr *descr = handle->descr; - time_t now = handle->now_s + handle->dt_s; - -// bool debug_this = false; -// { -// unsigned char u[16] = { 0x0C, 0x0A, 0x40, 0xD6, 0x2A, 0x43, 0x4A, 0x7C, 0x95, 0xF7, 0xD1, 0x1E, 0x0C, 0x9E, 0x8A, 0xE7 }; -// if(uuid_compare(u, handle->page_index->id) == 0) { -// char buffer[100]; -// snprintfz(buffer, 100, "load system.cpu, now:%u, dt:%u, position:%u page_index first:%u, last:%u", -// (uint32_t)(now), -// (uint32_t)(handle->dt_s), -// 
(uint32_t)(handle->position), -// (uint32_t)(handle->page_index->oldest_time / USEC_PER_SEC), -// (uint32_t)(handle->page_index->latest_time / USEC_PER_SEC)); -// -// print_page_cache_descr(descr, buffer, false); -// debug_this = true; -// } -// } - STORAGE_POINT sp; - unsigned position = handle->position + 1; - storage_number_tier1_t tier1_value; - - if (unlikely(INVALID_TIME == handle->wanted_start_time_s)) { - handle->wanted_start_time_s = INVALID_TIME; - handle->now_s = now; - storage_point_empty(sp, now - handle->dt_s, now); - return sp; + + if (unlikely(handle->now_s > rrddim_handle->end_time_s)) { + storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); + goto prepare_for_next_iteration; } - if (unlikely(!descr || position >= handle->entries)) { + if (unlikely(!handle->page || handle->position >= handle->entries)) { // We need to get a new page - if(rrdeng_load_page_next(rrddim_handle, false)) { - // next calls will not load any more metrics - handle->wanted_start_time_s = INVALID_TIME; - handle->now_s = now; - storage_point_empty(sp, now - handle->dt_s, now); - return sp; - } - descr = handle->descr; - position = handle->position; - now = (time_t)((descr->start_time_ut / USEC_PER_SEC) + position * descr->update_every_s); - -// if(debug_this) { -// char buffer[100]; -// snprintfz(buffer, 100, "NEW PAGE system.cpu, now:%u, dt:%u, position:%u page_index first:%u, last:%u", -// (uint32_t)(now), -// (uint32_t)(handle->dt_s), -// (uint32_t)(handle->position), -// (uint32_t)(handle->page_index->oldest_time / USEC_PER_SEC), -// (uint32_t)(handle->page_index->latest_time / USEC_PER_SEC)); -// -// print_page_cache_descr(descr, buffer, false); -// } + if (!rrdeng_load_page_next(rrddim_handle, false)) { + storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); + goto prepare_for_next_iteration; + } } - sp.start_time = now - handle->dt_s; - sp.end_time = now; - - handle->position = position; - handle->now_s = now; + sp.start_time_s = handle->now_s - handle->dt_s; + sp.end_time_s = handle->now_s; - switch(descr->type) { + switch(handle->ctx->config.page_type) { case PAGE_METRICS: { - storage_number n = handle->page[position]; + storage_number n = handle->metric_data[handle->position]; sp.min = sp.max = sp.sum = unpack_storage_number(n); sp.flags = n & SN_USER_FLAGS; sp.count = 1; @@ -752,7 +925,7 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim break; case PAGE_TIER: { - tier1_value = ((storage_number_tier1_t *)handle->page)[position]; + storage_number_tier1_t tier1_value = ((storage_number_tier1_t *)handle->metric_data)[handle->position]; sp.flags = tier1_value.anomaly_count ? SN_FLAG_NONE : SN_FLAG_NOT_ANOMALOUS; sp.count = tier1_value.count; sp.anomaly_count = tier1_value.anomaly_count; @@ -766,204 +939,98 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim default: { static bool logged = false; if(!logged) { - error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", descr->type); + error("DBENGINE: unknown page type %d found. Cannot decode it. 
Ignoring its metrics.", handle->ctx->config.page_type); logged = true; } - storage_point_empty(sp, sp.start_time, sp.end_time); + storage_point_empty(sp, sp.start_time_s, sp.end_time_s); } break; } - if (unlikely(now >= rrddim_handle->end_time_s)) { - // next calls will not load any more metrics - handle->wanted_start_time_s = INVALID_TIME; - } +prepare_for_next_iteration: + internal_fatal(sp.end_time_s < rrddim_handle->start_time_s, "DBENGINE: this point is too old for this query"); + internal_fatal(sp.end_time_s < handle->now_s, "DBENGINE: this point is too old for this point in time"); -// if(debug_this) -// info("DBENGINE: returning point: " -// "time from %ld to %ld // query from %ld to %ld // wanted_start_time_s %ld" -// , sp.start_time, sp.end_time -// , rrddim_handle->start_time_s, rrddim_handle->end_time_s -// , handle->wanted_start_time_s -// ); + handle->now_s += handle->dt_s; + handle->position++; return sp; } -int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrdimm_handle) -{ - struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; - return (INVALID_TIME == handle->wanted_start_time_s); +int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; + return (handle->now_s > rrddim_handle->end_time_s); } /* * Releases the database reference from the handle for loading metrics. */ -void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrdimm_handle) +void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle) { - struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; - struct rrdengine_instance *ctx = handle->ctx; - struct rrdeng_page_descr *descr = handle->descr; + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - if (descr) { -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); -#endif - pg_cache_put(ctx, descr); - } + if (handle->page) + pgc_page_release(main_cache, handle->page); - // whatever is allocated at rrdeng_load_metric_init() should be freed here - freez(handle); - rrdimm_handle->handle = NULL; -} + if(!pdc_release_and_destroy_if_unreferenced(handle->pdc, false, false)) + __atomic_store_n(&handle->pdc->workers_should_stop, true, __ATOMIC_RELAXED); -time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - return (time_t)(page_index->latest_time_ut / USEC_PER_SEC); -} -time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - return (time_t)(page_index->oldest_time_ut / USEC_PER_SEC); + unregister_query_handle(handle); + rrdeng_query_handle_release(handle); + rrddim_handle->handle = NULL; + netdata_thread_enable_cancelability(); } -int rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *si, uuid_t *dim_uuid, time_t *first_entry_t, time_t *last_entry_t) -{ - struct page_cache *pg_cache; - struct rrdengine_instance *ctx; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; +time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - ctx = (struct rrdengine_instance *)si; - if 
(unlikely(!ctx)) { - error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); - return 1; + if(handle->pdc) { + rrdeng_prep_wait(handle->pdc); + if (handle->pdc->optimal_end_time_s > rrddim_handle->end_time_s) + rrddim_handle->end_time_s = handle->pdc->optimal_end_time_s; } - pg_cache = &ctx->pg_cache; - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, dim_uuid, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); + return rrddim_handle->end_time_s; +} - if (likely(page_index)) { - *first_entry_t = page_index->oldest_time_ut / USEC_PER_SEC; - *last_entry_t = page_index->latest_time_ut / USEC_PER_SEC; - return 0; - } +time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { + METRIC *metric = (METRIC *)db_metric_handle; + time_t latest_time_s = 0; - return 1; -} + if (metric) + latest_time_s = mrg_metric_get_latest_time_s(main_mrg, metric); -/* Also gets a reference for the page */ -void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr) -{ - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - void *page; - /* TODO: check maximum number of pages in page cache limit */ - - descr = pg_cache_create_descr(); - descr->id = id; /* TODO: add page type: metric, log, something? */ - descr->type = ctx->page_type; - page = dbengine_page_alloc(); /*TODO: add page size */ - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->page = page; - pg_cache_descr->flags = RRD_PAGE_DIRTY /*| RRD_PAGE_LOCKED */ | RRD_PAGE_POPULATED /* | BEING_COLLECTED */; - pg_cache_descr->refcnt = 1; - - debug(D_RRDENGINE, "Created new page:"); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - rrdeng_page_descr_mutex_unlock(ctx, descr); - *ret_descr = descr; - return page; + return latest_time_s; } -/* The page must not be empty */ -void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, - Word_t page_correlation_id) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - Pvoid_t *PValue; - unsigned nr_committed_pages; +time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { + METRIC *metric = (METRIC *)db_metric_handle; - if (unlikely(NULL == descr)) { - debug(D_RRDENGINE, "%s: page descriptor is NULL, page has already been force-committed.", __func__); - return; - } - fatal_assert(descr->page_length); - - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - PValue = JudyLIns(&pg_cache->committed_page_index.JudyL_array, page_correlation_id, PJE0); - *PValue = descr; - nr_committed_pages = ++pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); - - if (nr_committed_pages >= pg_cache_hard_limit(ctx) / 2) { - /* over 50% of pages have not been committed yet */ - - if (ctx->drop_metrics_under_page_cache_pressure && - nr_committed_pages >= pg_cache_committed_hard_limit(ctx)) { - /* 100% of pages are dirty */ - struct rrdeng_cmd cmd; - - cmd.opcode = RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); - } else { - if (0 == (unsigned long) ctx->stats.pg_cache_over_half_dirty_events) { - /* only print the first time */ - errno = 0; - error("Failed to flush dirty buffers quickly enough in dbengine instance \"%s\". 
" - "Metric data at risk of not being stored in the database, " - "please reduce disk load or use a faster disk.", ctx->dbfiles_path); - } - rrd_stat_atomic_add(&ctx->stats.pg_cache_over_half_dirty_events, 1); - rrd_stat_atomic_add(&global_pg_cache_over_half_dirty_events, 1); - } - } + time_t oldest_time_s = 0; + if (metric) + oldest_time_s = mrg_metric_get_first_time_s(main_mrg, metric); - pg_cache_put(ctx, descr); + return oldest_time_s; } -/* Gets a reference for the page */ -void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void **handle) +bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s) { - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - - debug(D_RRDENGINE, "Reading existing page:"); - descr = pg_cache_lookup(ctx, NULL, id, INVALID_TIME); - if (NULL == descr) { - *handle = NULL; - - return NULL; + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + if (unlikely(!ctx)) { + error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); + return false; } - *handle = descr; - pg_cache_descr = descr->pg_cache_descr; - return pg_cache_descr->page; -} + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, dim_uuid, (Word_t) ctx); + if (unlikely(!metric)) + return false; -/* Gets a reference for the page */ -void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_in_time_ut, void **handle) -{ - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; + time_t update_every_s; + mrg_metric_get_retention(main_mrg, metric, first_entry_s, last_entry_s, &update_every_s); - debug(D_RRDENGINE, "Reading existing page:"); - descr = pg_cache_lookup(ctx, NULL, id, point_in_time_ut); - if (NULL == descr) { - *handle = NULL; - - return NULL; - } - *handle = descr; - pg_cache_descr = descr->pg_cache_descr; + mrg_metric_release(main_mrg, metric); - return pg_cache_descr->page; + return true; } /* @@ -977,62 +1044,126 @@ void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long if (ctx == NULL) return; - struct page_cache *pg_cache = &ctx->pg_cache; - - array[0] = (uint64_t)ctx->stats.metric_API_producers; - array[1] = (uint64_t)ctx->stats.metric_API_consumers; - array[2] = (uint64_t)pg_cache->page_descriptors; - array[3] = (uint64_t)pg_cache->populated_pages; - array[4] = (uint64_t)pg_cache->committed_page_index.nr_committed_pages; - array[5] = (uint64_t)ctx->stats.pg_cache_insertions; - array[6] = (uint64_t)ctx->stats.pg_cache_deletions; - array[7] = (uint64_t)ctx->stats.pg_cache_hits; - array[8] = (uint64_t)ctx->stats.pg_cache_misses; - array[9] = (uint64_t)ctx->stats.pg_cache_backfills; - array[10] = (uint64_t)ctx->stats.pg_cache_evictions; - array[11] = (uint64_t)ctx->stats.before_compress_bytes; - array[12] = (uint64_t)ctx->stats.after_compress_bytes; - array[13] = (uint64_t)ctx->stats.before_decompress_bytes; - array[14] = (uint64_t)ctx->stats.after_decompress_bytes; - array[15] = (uint64_t)ctx->stats.io_write_bytes; - array[16] = (uint64_t)ctx->stats.io_write_requests; - array[17] = (uint64_t)ctx->stats.io_read_bytes; - array[18] = (uint64_t)ctx->stats.io_read_requests; - array[19] = (uint64_t)ctx->stats.io_write_extent_bytes; - array[20] = (uint64_t)ctx->stats.io_write_extents; - array[21] = (uint64_t)ctx->stats.io_read_extent_bytes; - array[22] = (uint64_t)ctx->stats.io_read_extents; - array[23] = (uint64_t)ctx->stats.datafile_creations; - array[24] = 
(uint64_t)ctx->stats.datafile_deletions; - array[25] = (uint64_t)ctx->stats.journalfile_creations; - array[26] = (uint64_t)ctx->stats.journalfile_deletions; - array[27] = (uint64_t)ctx->stats.page_cache_descriptors; - array[28] = (uint64_t)ctx->stats.io_errors; - array[29] = (uint64_t)ctx->stats.fs_errors; - array[30] = (uint64_t)global_io_errors; - array[31] = (uint64_t)global_fs_errors; - array[32] = (uint64_t)rrdeng_reserved_file_descriptors; - array[33] = (uint64_t)ctx->stats.pg_cache_over_half_dirty_events; - array[34] = (uint64_t)global_pg_cache_over_half_dirty_events; - array[35] = (uint64_t)ctx->stats.flushing_pressure_page_deletions; - array[36] = (uint64_t)global_flushing_pressure_page_deletions; - fatal_assert(RRDENG_NR_STATS == 37); + array[0] = (uint64_t)__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); // API producers + array[1] = (uint64_t)__atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED); // API consumers + array[2] = 0; + array[3] = 0; + array[4] = 0; + array[5] = 0; // (uint64_t)ctx->stats.pg_cache_insertions; + array[6] = 0; // (uint64_t)ctx->stats.pg_cache_deletions; + array[7] = 0; // (uint64_t)ctx->stats.pg_cache_hits; + array[8] = 0; // (uint64_t)ctx->stats.pg_cache_misses; + array[9] = 0; // (uint64_t)ctx->stats.pg_cache_backfills; + array[10] = 0; // (uint64_t)ctx->stats.pg_cache_evictions; + array[11] = (uint64_t)__atomic_load_n(&ctx->stats.before_compress_bytes, __ATOMIC_RELAXED); // used + array[12] = (uint64_t)__atomic_load_n(&ctx->stats.after_compress_bytes, __ATOMIC_RELAXED); // used + array[13] = (uint64_t)__atomic_load_n(&ctx->stats.before_decompress_bytes, __ATOMIC_RELAXED); + array[14] = (uint64_t)__atomic_load_n(&ctx->stats.after_decompress_bytes, __ATOMIC_RELAXED); + array[15] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_bytes, __ATOMIC_RELAXED); // used + array[16] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_requests, __ATOMIC_RELAXED); // used + array[17] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_bytes, __ATOMIC_RELAXED); + array[18] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_requests, __ATOMIC_RELAXED); // used + array[19] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extent_bytes, __ATOMIC_RELAXED); + array[20] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extents, __ATOMIC_RELAXED); + array[21] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extent_bytes, __ATOMIC_RELAXED); + array[22] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extents, __ATOMIC_RELAXED); + array[23] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_creations, __ATOMIC_RELAXED); + array[24] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_deletions, __ATOMIC_RELAXED); + array[25] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_creations, __ATOMIC_RELAXED); + array[26] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_deletions, __ATOMIC_RELAXED); + array[27] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.page_cache_descriptors, __ATOMIC_RELAXED); + array[28] = (uint64_t)__atomic_load_n(&ctx->stats.io_errors, __ATOMIC_RELAXED); + array[29] = (uint64_t)__atomic_load_n(&ctx->stats.fs_errors, __ATOMIC_RELAXED); + array[30] = (uint64_t)__atomic_load_n(&global_io_errors, __ATOMIC_RELAXED); // used + array[31] = (uint64_t)__atomic_load_n(&global_fs_errors, __ATOMIC_RELAXED); // used + array[32] = (uint64_t)__atomic_load_n(&rrdeng_reserved_file_descriptors, __ATOMIC_RELAXED); // used + array[33] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); + 
array[34] = (uint64_t)__atomic_load_n(&global_pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); // used + array[35] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.flushing_pressure_page_deletions, __ATOMIC_RELAXED); + array[36] = (uint64_t)__atomic_load_n(&global_flushing_pressure_page_deletions, __ATOMIC_RELAXED); // used + array[37] = 0; //(uint64_t)pg_cache->active_descriptors; + + fatal_assert(RRDENG_NR_STATS == 38); } -/* Releases reference to page */ -void rrdeng_put_page(struct rrdengine_instance *ctx, void *handle) -{ - (void)ctx; - pg_cache_put(ctx, (struct rrdeng_page_descr *)handle); +static void rrdeng_populate_mrg(struct rrdengine_instance *ctx) { + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + size_t datafiles = 0; + for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) + datafiles++; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + size_t cpus = get_netdata_cpus() / storage_tiers; + if(cpus > datafiles) + cpus = datafiles; + + if(cpus < 1) + cpus = 1; + + if(cpus > (size_t)libuv_worker_threads) + cpus = (size_t)libuv_worker_threads; + + if(cpus > MRG_PARTITIONS) + cpus = MRG_PARTITIONS; + + info("DBENGINE: populating retention to MRG from %zu journal files of tier %d, using %zu threads...", datafiles, ctx->config.tier, cpus); + + if(datafiles > 2) { + struct rrdengine_datafile *datafile; + + datafile = ctx->datafiles.first->prev; + if(!(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE)) + datafile = datafile->prev; + + if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) { + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + } + + datafile = ctx->datafiles.first; + if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) { + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + } + } + + ctx->loading.populate_mrg.size = cpus; + ctx->loading.populate_mrg.array = callocz(ctx->loading.populate_mrg.size, sizeof(struct completion)); + + for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) { + completion_init(&ctx->loading.populate_mrg.array[i]); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_POPULATE_MRG, NULL, &ctx->loading.populate_mrg.array[i], + STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + } +} + +void rrdeng_readiness_wait(struct rrdengine_instance *ctx) { + for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) { + completion_wait_for(&ctx->loading.populate_mrg.array[i]); + completion_destroy(&ctx->loading.populate_mrg.array[i]); + } + + freez(ctx->loading.populate_mrg.array); + ctx->loading.populate_mrg.array = NULL; + ctx->loading.populate_mrg.size = 0; + + info("DBENGINE: tier %d is ready for data collection and queries", ctx->config.tier); +} + +bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + return ctx->config.legacy; } +void rrdeng_exit_mode(struct rrdengine_instance *ctx) { + __atomic_store_n(&ctx->quiesce.exit_mode, true, __ATOMIC_RELAXED); +} /* * Returns 0 on success, negative on error */ -int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, +int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, unsigned disk_space_mb, size_t tier) { struct rrdengine_instance *ctx; - int error; uint32_t max_open_files; max_open_files = rlimit_nofile.rlim_cur / 4; @@ -1053,182 +1184,185 @@ int rrdeng_init(RRDHOST *host, 
struct rrdengine_instance **ctxp, char *dbfiles_p if(NULL == ctxp) { ctx = multidb_ctx[tier]; memset(ctx, 0, sizeof(*ctx)); + ctx->config.legacy = false; } else { *ctxp = ctx = callocz(1, sizeof(*ctx)); + ctx->config.legacy = true; } - ctx->tier = tier; - ctx->page_type = tier_page_type[tier]; - ctx->global_compress_alg = RRD_LZ4; - if (page_cache_mb < RRDENG_MIN_PAGE_CACHE_SIZE_MB) - page_cache_mb = RRDENG_MIN_PAGE_CACHE_SIZE_MB; - ctx->max_cache_pages = page_cache_mb * (1048576LU / RRDENG_BLOCK_SIZE); - /* try to keep 5% of the page cache free */ - ctx->cache_pages_low_watermark = (ctx->max_cache_pages * 95LLU) / 100; + + ctx->config.tier = (int)tier; + ctx->config.page_type = tier_page_type[tier]; + ctx->config.global_compress_alg = RRD_LZ4; if (disk_space_mb < RRDENG_MIN_DISK_SPACE_MB) disk_space_mb = RRDENG_MIN_DISK_SPACE_MB; - ctx->max_disk_space = disk_space_mb * 1048576LLU; - strncpyz(ctx->dbfiles_path, dbfiles_path, sizeof(ctx->dbfiles_path) - 1); - ctx->dbfiles_path[sizeof(ctx->dbfiles_path) - 1] = '\0'; - if (NULL == host) - strncpyz(ctx->machine_guid, registry_get_this_machine_guid(), GUID_LEN); - else - strncpyz(ctx->machine_guid, host->machine_guid, GUID_LEN); - - ctx->drop_metrics_under_page_cache_pressure = rrdeng_drop_metrics_under_page_cache_pressure; - ctx->metric_API_max_producers = 0; - ctx->quiesce = NO_QUIESCE; - ctx->host = host; - - memset(&ctx->worker_config, 0, sizeof(ctx->worker_config)); - ctx->worker_config.ctx = ctx; - init_page_cache(ctx); - init_commit_log(ctx); - error = init_rrd_files(ctx); - if (error) { - goto error_after_init_rrd_files; - } + ctx->config.max_disk_space = disk_space_mb * 1048576LLU; + strncpyz(ctx->config.dbfiles_path, dbfiles_path, sizeof(ctx->config.dbfiles_path) - 1); + ctx->config.dbfiles_path[sizeof(ctx->config.dbfiles_path) - 1] = '\0'; - completion_init(&ctx->rrdengine_completion); - fatal_assert(0 == uv_thread_create(&ctx->worker_config.thread, rrdeng_worker, &ctx->worker_config)); - /* wait for worker thread to initialize */ - completion_wait_for(&ctx->rrdengine_completion); - completion_destroy(&ctx->rrdengine_completion); - uv_thread_set_name_np(ctx->worker_config.thread, "LIBUV_WORKER"); - if (ctx->worker_config.error) { - goto error_after_rrdeng_worker; - } -// error = metalog_init(ctx); -// if (error) { -// error("Failed to initialize metadata log file event loop."); -// goto error_after_rrdeng_worker; -// } + ctx->atomic.transaction_id = 1; + ctx->quiesce.enabled = false; - return 0; + if (rrdeng_dbengine_spawn(ctx) && !init_rrd_files(ctx)) { + // success - we run this ctx too + rrdeng_populate_mrg(ctx); + return 0; + } -error_after_rrdeng_worker: - finalize_rrd_files(ctx); -error_after_init_rrd_files: - free_page_cache(ctx); - if (!is_storage_engine_shared((STORAGE_INSTANCE *)ctx)) { + if (ctx->config.legacy) { freez(ctx); if (ctxp) *ctxp = NULL; } + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); return UV_EIO; } +size_t rrdeng_collectors_running(struct rrdengine_instance *ctx) { + return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); +} + /* * Returns 0 on success, 1 on error */ -int rrdeng_exit(struct rrdengine_instance *ctx) -{ - struct rrdeng_cmd cmd; - - if (NULL == ctx) { +int rrdeng_exit(struct rrdengine_instance *ctx) { + if (NULL == ctx) return 1; + + // FIXME - ktsaou - properly cleanup ctx + // 1. make sure all collectors are stopped + // 2. make new queries will not be accepted (this is quiesce that has already run) + // 3. 
flush this section of the main cache + // 4. then wait for completion + + bool logged = false; + while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && !unittest_running) { + if(!logged) { + info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); } - /* TODO: add page to page cache */ - cmd.opcode = RRDENG_SHUTDOWN; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); + info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx); - fatal_assert(0 == uv_thread_join(&ctx->worker_config.thread)); + info("DBENGINE: shutting down tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + struct completion completion = {}; + completion_init(&completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_SHUTDOWN, NULL, &completion, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL); + completion_wait_for(&completion); + completion_destroy(&completion); finalize_rrd_files(ctx); - //metalog_exit(ctx->metalog_ctx); - free_page_cache(ctx); - if(!is_storage_engine_shared((STORAGE_INSTANCE *)ctx)) + if(ctx->config.legacy) freez(ctx); rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); return 0; } -void rrdeng_prepare_exit(struct rrdengine_instance *ctx) -{ - struct rrdeng_cmd cmd; - - if (NULL == ctx) { +void rrdeng_prepare_exit(struct rrdengine_instance *ctx) { + if (NULL == ctx) return; - } - - completion_init(&ctx->rrdengine_completion); - cmd.opcode = RRDENG_QUIESCE; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); - /* wait for dbengine to quiesce */ - completion_wait_for(&ctx->rrdengine_completion); - completion_destroy(&ctx->rrdengine_completion); + // FIXME - ktsaou - properly cleanup ctx + // 1. 
make sure all collectors are stopped - //metalog_prepare_exit(ctx->metalog_ctx); + completion_init(&ctx->quiesce.completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_QUIESCE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); } -RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { - RRDENG_SIZE_STATS stats = { 0 }; +static void populate_v2_statistics(struct rrdengine_datafile *datafile, RRDENG_SIZE_STATS *stats) +{ + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0); + void *data_start = (void *)j2_header; - for(struct pg_cache_page_index *page_index = ctx->pg_cache.metrics_index.last_page_index; - page_index != NULL ;page_index = page_index->prev) { - stats.metrics++; - stats.metrics_pages += page_index->page_count; + if(unlikely(!j2_header)) + return; + + stats->extents += j2_header->extent_count; + + unsigned entries; + struct journal_extent_list *extent_list = (void *) (data_start + j2_header->extent_offset); + for (entries = 0; entries < j2_header->extent_count; entries++) { + stats->extents_compressed_bytes += extent_list->datafile_size; + stats->extents_pages += extent_list->pages; + extent_list++; } - for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) { - stats.datafiles++; + struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset); + time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); - for(struct extent_info *ei = df->extents.first; ei ; ei = ei->next) { - stats.extents++; - stats.extents_compressed_bytes += ei->size; + stats->metrics += j2_header->metric_count; + for (entries = 0; entries < j2_header->metric_count; entries++) { - for(int p = 0; p < ei->number_of_pages ;p++) { - struct rrdeng_page_descr *descr = ei->pages[p]; + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + stats->metrics_pages += metric_list_header->entries; + struct journal_page_list *descr = (void *) (data_start + metric->page_offset + sizeof(struct journal_page_header)); + for (uint32_t idx=0; idx < metric_list_header->entries; idx++) { - usec_t update_every_usec; + time_t update_every_s; - size_t points = descr->page_length / PAGE_POINT_SIZE_BYTES(descr); + size_t points = descr->page_length / CTX_POINT_SIZE_BYTES(datafile->ctx); - if(likely(points > 1)) - update_every_usec = (descr->end_time_ut - descr->start_time_ut) / (points - 1); - else { - update_every_usec = default_rrd_update_every * get_tier_grouping(ctx->tier) * USEC_PER_SEC; - stats.single_point_pages++; - } + time_t start_time_s = journal_start_time_s + descr->delta_start_s; + time_t end_time_s = journal_start_time_s + descr->delta_end_s; - time_t duration_secs = (time_t)((descr->end_time_ut - descr->start_time_ut + update_every_usec)/USEC_PER_SEC); + if(likely(points > 1)) + update_every_s = (time_t) ((end_time_s - start_time_s) / (points - 1)); + else { + update_every_s = (time_t) (default_rrd_update_every * get_tier_grouping(datafile->ctx->config.tier)); + stats->single_point_pages++; + } - stats.extents_pages++; - stats.pages_uncompressed_bytes += descr->page_length; - stats.pages_duration_secs += duration_secs; - stats.points += points; + time_t duration_s = (time_t)((end_time_s - start_time_s + update_every_s)); - stats.page_types[descr->type].pages++; - stats.page_types[descr->type].pages_uncompressed_bytes += descr->page_length; - stats.page_types[descr->type].pages_duration_secs += duration_secs; - 
stats.page_types[descr->type].points += points; + stats->pages_uncompressed_bytes += descr->page_length; + stats->pages_duration_secs += duration_s; + stats->points += points; - if(!stats.first_t || (descr->start_time_ut - update_every_usec) < stats.first_t) - stats.first_t = (descr->start_time_ut - update_every_usec) / USEC_PER_SEC; + stats->page_types[descr->type].pages++; + stats->page_types[descr->type].pages_uncompressed_bytes += descr->page_length; + stats->page_types[descr->type].pages_duration_secs += duration_s; + stats->page_types[descr->type].points += points; - if(!stats.last_t || descr->end_time_ut > stats.last_t) - stats.last_t = descr->end_time_ut / USEC_PER_SEC; - } + if(!stats->first_time_s || (start_time_s - update_every_s) < stats->first_time_s) + stats->first_time_s = (start_time_s - update_every_s); + + if(!stats->last_time_s || end_time_s > stats->last_time_s) + stats->last_time_s = end_time_s; + + descr++; } + metric++; } + journalfile_v2_data_release(datafile->journalfile); +} + +RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { + RRDENG_SIZE_STATS stats = { 0 }; - stats.currently_collected_metrics = ctx->stats.metric_API_producers; - stats.max_concurrently_collected_metrics = ctx->metric_API_max_producers; + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) { + stats.datafiles++; + populate_v2_statistics(df, &stats); + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + stats.currently_collected_metrics = __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); internal_error(stats.metrics_pages != stats.extents_pages + stats.currently_collected_metrics, "DBENGINE: metrics pages is %zu, but extents pages is %zu and API consumers is %zu", stats.metrics_pages, stats.extents_pages, stats.currently_collected_metrics); - stats.disk_space = ctx->disk_space; - stats.max_disk_space = ctx->max_disk_space; + stats.disk_space = ctx_current_disk_space_get(ctx); + stats.max_disk_space = ctx->config.max_disk_space; - stats.database_retention_secs = (time_t)(stats.last_t - stats.first_t); + stats.database_retention_secs = (time_t)(stats.last_time_s - stats.first_time_s); if(stats.extents_pages) stats.average_page_size_bytes = (double)stats.pages_uncompressed_bytes / (double)stats.extents_pages; @@ -1252,21 +1386,22 @@ RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { } } - stats.sizeof_metric = struct_natural_alignment(sizeof(struct pg_cache_page_index) + sizeof(struct pg_alignment)); - stats.sizeof_page = struct_natural_alignment(sizeof(struct rrdeng_page_descr)); +// stats.sizeof_metric = 0; stats.sizeof_datafile = struct_natural_alignment(sizeof(struct rrdengine_datafile)) + struct_natural_alignment(sizeof(struct rrdengine_journalfile)); - stats.sizeof_page_in_cache = struct_natural_alignment(sizeof(struct page_cache_descr)); - stats.sizeof_point_data = page_type_size[ctx->page_type]; - stats.sizeof_page_data = RRDENG_BLOCK_SIZE; + stats.sizeof_page_in_cache = 0; // struct_natural_alignment(sizeof(struct page_cache_descr)); + stats.sizeof_point_data = page_type_size[ctx->config.page_type]; + stats.sizeof_page_data = tier_page_size[ctx->config.tier]; stats.pages_per_extent = rrdeng_pages_per_extent; - stats.sizeof_extent = sizeof(struct extent_info); - stats.sizeof_page_in_extent = sizeof(struct rrdeng_page_descr *); - - stats.sizeof_metric_in_index = 40; - stats.sizeof_page_in_index = 24; +// stats.sizeof_metric_in_index = 40; +// 
stats.sizeof_page_in_index = 24; - stats.default_granularity_secs = (size_t)default_rrd_update_every * get_tier_grouping(ctx->tier); + stats.default_granularity_secs = (size_t)default_rrd_update_every * get_tier_grouping(ctx->config.tier); return stats; } + +struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void) { + // FIXME - make cache efficiency stats atomic + return rrdeng_cache_efficiency_stats; +} diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 3acee4ec6..feb79b977 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -8,7 +8,7 @@ #define RRDENG_MIN_PAGE_CACHE_SIZE_MB (8) #define RRDENG_MIN_DISK_SPACE_MB (64) -#define RRDENG_NR_STATS (37) +#define RRDENG_NR_STATS (38) #define RRDENG_FD_BUDGET_PER_INSTANCE (50) @@ -16,26 +16,15 @@ extern int db_engine_use_malloc; extern int default_rrdeng_page_fetch_timeout; extern int default_rrdeng_page_fetch_retries; extern int default_rrdeng_page_cache_mb; +extern int db_engine_journal_indexing; +extern int db_engine_journal_check; extern int default_rrdeng_disk_quota_mb; extern int default_multidb_disk_quota_mb; -extern uint8_t rrdeng_drop_metrics_under_page_cache_pressure; extern struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS]; extern size_t page_type_size[]; +extern size_t tier_page_size[]; -#define PAGE_POINT_SIZE_BYTES(x) page_type_size[(x)->type] - -struct rrdeng_region_info { - time_t start_time_s; - int update_every; - unsigned points; -}; - -void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr); -void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, - Word_t page_correlation_id); -void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void **handle); -void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_in_time_ut, void **handle); -void rrdeng_put_page(struct rrdengine_instance *ctx, void *handle); +#define CTX_POINT_SIZE_BYTES(ctx) page_type_size[(ctx)->config.page_type] void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_t *ret_uuid); void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uuid_t *legacy_uuid, @@ -44,8 +33,6 @@ void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uu STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance); STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); -STORAGE_METRIC_HANDLE *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid); -STORAGE_METRIC_HANDLE *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id); void rrdeng_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle); STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle); @@ -60,25 +47,29 @@ void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, usec_t SN_FLAGS flags); int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle); -void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrdimm_handle, - time_t start_time_s, time_t end_time_s); +void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrddim_handle, + time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle 
*rrddim_handle); -int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrdimm_handle); -void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrdimm_handle); +int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle); +void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle); time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle); time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle); void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long *array); /* must call once before using anything */ -int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, +int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, unsigned disk_space_mb, size_t tier); +void rrdeng_readiness_wait(struct rrdengine_instance *ctx); +void rrdeng_exit_mode(struct rrdengine_instance *ctx); + int rrdeng_exit(struct rrdengine_instance *ctx); void rrdeng_prepare_exit(struct rrdengine_instance *ctx); -int rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *si, uuid_t *dim_uuid, time_t *first_entry_t, time_t *last_entry_t); +bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s); extern STORAGE_METRICS_GROUP *rrdeng_metrics_group_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); extern void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg); @@ -86,12 +77,6 @@ extern void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_ typedef struct rrdengine_size_statistics { size_t default_granularity_secs; - size_t sizeof_metric; - size_t sizeof_metric_in_index; - size_t sizeof_page; - size_t sizeof_page_in_index; - size_t sizeof_extent; - size_t sizeof_page_in_extent; size_t sizeof_datafile; size_t sizeof_page_in_cache; size_t sizeof_point_data; @@ -119,11 +104,10 @@ typedef struct rrdengine_size_statistics { size_t single_point_pages; - usec_t first_t; - usec_t last_t; + time_t first_time_s; + time_t last_time_s; size_t currently_collected_metrics; - size_t max_concurrently_collected_metrics; size_t estimated_concurrently_collected_metrics; size_t disk_space; @@ -139,6 +123,109 @@ typedef struct rrdengine_size_statistics { double average_page_size_bytes; } RRDENG_SIZE_STATS; +struct rrdeng_cache_efficiency_stats { + size_t queries; + size_t queries_planned_with_gaps; + size_t queries_executed_with_gaps; + size_t queries_open; + size_t queries_journal_v2; + + size_t currently_running_queries; + + // query planner output of the queries + size_t pages_total; + size_t pages_to_load_from_disk; + size_t extents_loaded_from_disk; + + // pages metadata sources + size_t pages_meta_source_main_cache; + size_t pages_meta_source_open_cache; + size_t pages_meta_source_journal_v2; + + // preloading + size_t page_next_wait_failed; + size_t page_next_wait_loaded; + size_t page_next_nowait_failed; + size_t page_next_nowait_loaded; + + // pages data sources + size_t pages_data_source_main_cache; + size_t pages_data_source_main_cache_at_pass4; + size_t pages_data_source_disk; + size_t pages_data_source_extent_cache; // loaded by a cached extent + + // cache hits at different points + size_t pages_load_ok_loaded_but_cache_hit_while_inserting; // found in cache while inserting it (conflict) + + // loading + size_t 
pages_load_extent_merged; + size_t pages_load_ok_uncompressed; + size_t pages_load_ok_compressed; + size_t pages_load_fail_invalid_page_in_extent; + size_t pages_load_fail_cant_mmap_extent; + size_t pages_load_fail_datafile_not_available; + size_t pages_load_fail_unroutable; + size_t pages_load_fail_not_found; + size_t pages_load_fail_invalid_extent; + size_t pages_load_fail_cancelled; + + // timings for query preparation + size_t prep_time_to_route; + size_t prep_time_in_main_cache_lookup; + size_t prep_time_in_open_cache_lookup; + size_t prep_time_in_journal_v2_lookup; + size_t prep_time_in_pass4_lookup; + + // timings the query thread experiences + size_t query_time_init; + size_t query_time_wait_for_prep; + size_t query_time_to_slow_disk_next_page; + size_t query_time_to_fast_disk_next_page; + size_t query_time_to_slow_preload_next_page; + size_t query_time_to_fast_preload_next_page; + + // query issues + size_t pages_zero_time_skipped; + size_t pages_past_time_skipped; + size_t pages_overlapping_skipped; + size_t pages_invalid_size_skipped; + size_t pages_invalid_update_every_fixed; + size_t pages_invalid_entries_fixed; + + // database events + size_t journal_v2_mapped; + size_t journal_v2_unmapped; + size_t datafile_creation_started; + size_t datafile_deletion_started; + size_t datafile_deletion_spin; + size_t journal_v2_indexing_started; + size_t metrics_retention_started; +}; + +struct rrdeng_buffer_sizes { + size_t workers; + size_t pdc; + size_t wal; + size_t descriptors; + size_t xt_io; + size_t xt_buf; + size_t handles; + size_t opcodes; + size_t epdl; + size_t deol; + size_t pd; + size_t pgc; + size_t mrg; +#ifdef PDC_USE_JULYL + size_t julyl; +#endif +}; + +struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void); +struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void); + RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx); +size_t rrdeng_collectors_running(struct rrdengine_instance *ctx); +bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance); #endif /* NETDATA_RRDENGINEAPI_H */ diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c index 58bd9c437..7ec626c59 100644 --- a/database/engine/rrdenginelib.c +++ b/database/engine/rrdenginelib.c @@ -4,68 +4,68 @@ #define BUFSIZE (512) /* Caller must hold descriptor lock */ -void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug) -{ - if(log_debug && !(debug_flags & D_RRDENGINE)) - return; - - BUFFER *wb = buffer_create(512); - - if(!descr) { - buffer_sprintf(wb, "DBENGINE: %s : descr is NULL", msg); - } - else { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - char uuid_str[UUID_STR_LEN]; - - uuid_unparse_lower(*descr->id, uuid_str); - buffer_sprintf(wb, "DBENGINE: %s : page(%p) metric:%s, len:%"PRIu32", time:%"PRIu64"->%"PRIu64", update_every:%u, type:%u, xt_offset:", - msg, - pg_cache_descr->page, uuid_str, - descr->page_length, - (uint64_t)descr->start_time_ut, - (uint64_t)descr->end_time_ut, - (uint32_t)descr->update_every_s, - (uint32_t)descr->type - ); - if (!descr->extent) { - buffer_strcat(wb, "N/A"); - } else { - buffer_sprintf(wb, "%"PRIu64, descr->extent->offset); - } - - buffer_sprintf(wb, ", flags:0x%2.2lX refcnt:%u", pg_cache_descr->flags, pg_cache_descr->refcnt); - } - - if(log_debug) - debug(D_RRDENGINE, "%s", buffer_tostring(wb)); - else - internal_error(true, "%s", buffer_tostring(wb)); - - buffer_free(wb); -} - -void print_page_descr(struct rrdeng_page_descr *descr) -{ - char 
uuid_str[UUID_STR_LEN]; - char str[BUFSIZE + 1]; - int pos = 0; - - uuid_unparse_lower(*descr->id, uuid_str); - pos += snprintfz(str, BUFSIZE - pos, "id=%s\n" - "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", - uuid_str, - descr->page_length, - (uint64_t)descr->start_time_ut, - (uint64_t)descr->end_time_ut); - if (!descr->extent) { - pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); - } else { - pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); - } - snprintfz(str + pos, BUFSIZE - pos, "\n\n"); - fputs(str, stderr); -} +//void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug) +//{ +// if(log_debug && !(debug_flags & D_RRDENGINE)) +// return; +// +// BUFFER *wb = buffer_create(512); +// +// if(!descr) { +// buffer_sprintf(wb, "DBENGINE: %s : descr is NULL", msg); +// } +// else { +// struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; +// char uuid_str[UUID_STR_LEN]; +// +// uuid_unparse_lower(*descr->id, uuid_str); +// buffer_sprintf(wb, "DBENGINE: %s : page(%p) metric:%s, len:%"PRIu32", time:%"PRIu64"->%"PRIu64", update_every:%u, type:%u, xt_offset:", +// msg, +// pg_cache_descr->page, uuid_str, +// descr->page_length, +// (uint64_t)descr->start_time_ut, +// (uint64_t)descr->end_time_ut, +// (uint32_t)descr->update_every_s, +// (uint32_t)descr->type +// ); +// if (!descr->extent) { +// buffer_strcat(wb, "N/A"); +// } else { +// buffer_sprintf(wb, "%"PRIu64, descr->extent->offset); +// } +// +// buffer_sprintf(wb, ", flags:0x%2.2lX refcnt:%u", pg_cache_descr->flags, pg_cache_descr->refcnt); +// } +// +// if(log_debug) +// debug(D_RRDENGINE, "%s", buffer_tostring(wb)); +// else +// internal_error(true, "%s", buffer_tostring(wb)); +// +// buffer_free(wb); +//} +// +//void print_page_descr(struct rrdeng_page_descr *descr) +//{ +// char uuid_str[UUID_STR_LEN]; +// char str[BUFSIZE + 1]; +// int pos = 0; +// +// uuid_unparse_lower(*descr->id, uuid_str); +// pos += snprintfz(str, BUFSIZE - pos, "id=%s\n" +// "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", +// uuid_str, +// descr->page_length, +// (uint64_t)descr->start_time_ut, +// (uint64_t)descr->end_time_ut); +// if (!descr->extent) { +// pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); +// } else { +// pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); +// } +// snprintfz(str + pos, BUFSIZE - pos, "\n\n"); +// fputs(str, stderr); +//} int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size) { @@ -142,90 +142,6 @@ int open_file_for_io(char *path, int flags, uv_file *file, int direct) return fd; } -char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t size) -{ - struct page_cache *pg_cache; - - pg_cache = &ctx->pg_cache; - snprintfz(str, size, - "metric_API_producers: %ld\n" - "metric_API_consumers: %ld\n" - "page_cache_total_pages: %ld\n" - "page_cache_descriptors: %ld\n" - "page_cache_populated_pages: %ld\n" - "page_cache_committed_pages: %ld\n" - "page_cache_insertions: %ld\n" - "page_cache_deletions: %ld\n" - "page_cache_hits: %ld\n" - "page_cache_misses: %ld\n" - "page_cache_backfills: %ld\n" - "page_cache_evictions: %ld\n" - "compress_before_bytes: %ld\n" - "compress_after_bytes: %ld\n" - "decompress_before_bytes: %ld\n" - "decompress_after_bytes: %ld\n" - "io_write_bytes: %ld\n" - "io_write_requests: %ld\n" - "io_read_bytes: %ld\n" - "io_read_requests: %ld\n" - "io_write_extent_bytes: %ld\n" - "io_write_extents: %ld\n" - "io_read_extent_bytes: %ld\n" - 
"io_read_extents: %ld\n" - "datafile_creations: %ld\n" - "datafile_deletions: %ld\n" - "journalfile_creations: %ld\n" - "journalfile_deletions: %ld\n" - "io_errors: %ld\n" - "fs_errors: %ld\n" - "global_io_errors: %ld\n" - "global_fs_errors: %ld\n" - "rrdeng_reserved_file_descriptors: %ld\n" - "pg_cache_over_half_dirty_events: %ld\n" - "global_pg_cache_over_half_dirty_events: %ld\n" - "flushing_pressure_page_deletions: %ld\n" - "global_flushing_pressure_page_deletions: %ld\n", - (long)ctx->stats.metric_API_producers, - (long)ctx->stats.metric_API_consumers, - (long)pg_cache->page_descriptors, - (long)ctx->stats.page_cache_descriptors, - (long)pg_cache->populated_pages, - (long)pg_cache->committed_page_index.nr_committed_pages, - (long)ctx->stats.pg_cache_insertions, - (long)ctx->stats.pg_cache_deletions, - (long)ctx->stats.pg_cache_hits, - (long)ctx->stats.pg_cache_misses, - (long)ctx->stats.pg_cache_backfills, - (long)ctx->stats.pg_cache_evictions, - (long)ctx->stats.before_compress_bytes, - (long)ctx->stats.after_compress_bytes, - (long)ctx->stats.before_decompress_bytes, - (long)ctx->stats.after_decompress_bytes, - (long)ctx->stats.io_write_bytes, - (long)ctx->stats.io_write_requests, - (long)ctx->stats.io_read_bytes, - (long)ctx->stats.io_read_requests, - (long)ctx->stats.io_write_extent_bytes, - (long)ctx->stats.io_write_extents, - (long)ctx->stats.io_read_extent_bytes, - (long)ctx->stats.io_read_extents, - (long)ctx->stats.datafile_creations, - (long)ctx->stats.datafile_deletions, - (long)ctx->stats.journalfile_creations, - (long)ctx->stats.journalfile_deletions, - (long)ctx->stats.io_errors, - (long)ctx->stats.fs_errors, - (long)global_io_errors, - (long)global_fs_errors, - (long)rrdeng_reserved_file_descriptors, - (long)ctx->stats.pg_cache_over_half_dirty_events, - (long)global_pg_cache_over_half_dirty_events, - (long)ctx->stats.flushing_pressure_page_deletions, - (long)global_flushing_pressure_page_deletions - ); - return str; -} - int is_legacy_child(const char *machine_guid) { uuid_t uuid; diff --git a/database/engine/rrdenginelib.h b/database/engine/rrdenginelib.h index 6b1a15fb1..ca8eacae4 100644 --- a/database/engine/rrdenginelib.h +++ b/database/engine/rrdenginelib.h @@ -6,7 +6,6 @@ #include "libnetdata/libnetdata.h" /* Forward declarations */ -struct rrdeng_page_descr; struct rrdengine_instance; #define STR_HELPER(x) #x @@ -83,8 +82,6 @@ static inline void crc32set(void *crcp, uLong crc) *(uint32_t *)crcp = crc; } -void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug); -void print_page_descr(struct rrdeng_page_descr *descr); int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size); int open_file_for_io(char *path, int flags, uv_file *file, int direct); static inline int open_file_direct_io(char *path, int flags, uv_file *file) @@ -95,7 +92,6 @@ static inline int open_file_buffered_io(char *path, int flags, uv_file *file) { return open_file_for_io(path, flags, file, 0); } -char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t size); int compute_multidb_diskspace(); int is_legacy_child(const char *machine_guid); diff --git a/database/engine/rrdenglocking.c b/database/engine/rrdenglocking.c deleted file mode 100644 index a23abf307..000000000 --- a/database/engine/rrdenglocking.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later -#include "rrdengine.h" - -struct page_cache_descr *rrdeng_create_pg_cache_descr(struct rrdengine_instance *ctx) -{ - struct 
page_cache_descr *pg_cache_descr; - - pg_cache_descr = mallocz(sizeof(*pg_cache_descr)); - rrd_stat_atomic_add(&ctx->stats.page_cache_descriptors, 1); - pg_cache_descr->page = NULL; - pg_cache_descr->flags = 0; - pg_cache_descr->prev = pg_cache_descr->next = NULL; - pg_cache_descr->refcnt = 0; - pg_cache_descr->waiters = 0; - fatal_assert(0 == uv_cond_init(&pg_cache_descr->cond)); - fatal_assert(0 == uv_mutex_init(&pg_cache_descr->mutex)); - - return pg_cache_descr; -} - -void rrdeng_destroy_pg_cache_descr(struct rrdengine_instance *ctx, struct page_cache_descr *pg_cache_descr) -{ - uv_cond_destroy(&pg_cache_descr->cond); - uv_mutex_destroy(&pg_cache_descr->mutex); - freez(pg_cache_descr); - rrd_stat_atomic_add(&ctx->stats.page_cache_descriptors, -1); -} - -/* also allocates page cache descriptor if missing */ -void rrdeng_page_descr_mutex_lock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - unsigned long old_state, old_users, new_state, ret_state; - struct page_cache_descr *pg_cache_descr = NULL; - uint8_t we_locked; - - we_locked = 0; - while (1) { /* spin */ - old_state = descr->pg_cache_descr_state; - old_users = old_state >> PG_CACHE_DESCR_SHIFT; - - if (unlikely(we_locked)) { - fatal_assert(old_state & PG_CACHE_DESCR_LOCKED); - new_state = (1 << PG_CACHE_DESCR_SHIFT) | PG_CACHE_DESCR_ALLOCATED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - break; - } - continue; /* spin */ - } - if (old_state & PG_CACHE_DESCR_LOCKED) { - fatal_assert(0 == old_users); - continue; /* spin */ - } - if (0 == old_state) { - /* no page cache descriptor has been allocated */ - - if (NULL == pg_cache_descr) { - pg_cache_descr = rrdeng_create_pg_cache_descr(ctx); - } - new_state = PG_CACHE_DESCR_LOCKED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, 0, new_state); - if (0 == ret_state) { - we_locked = 1; - descr->pg_cache_descr = pg_cache_descr; - pg_cache_descr->descr = descr; - pg_cache_descr = NULL; /* make sure we don't free pg_cache_descr */ - /* retry */ - continue; - } - continue; /* spin */ - } - /* page cache descriptor is already allocated */ - if (unlikely(!(old_state & PG_CACHE_DESCR_ALLOCATED))) { - fatal("Invalid page cache descriptor locking state:%#lX", old_state); - } - new_state = (old_users + 1) << PG_CACHE_DESCR_SHIFT; - new_state |= old_state & PG_CACHE_DESCR_FLAGS_MASK; - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - break; - } - /* spin */ - } - - if (pg_cache_descr) { - rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); - } - pg_cache_descr = descr->pg_cache_descr; - uv_mutex_lock(&pg_cache_descr->mutex); -} - -void rrdeng_page_descr_mutex_unlock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - unsigned long old_state, new_state, ret_state, old_users; - struct page_cache_descr *pg_cache_descr, *delete_pg_cache_descr = NULL; - uint8_t we_locked; - - uv_mutex_unlock(&descr->pg_cache_descr->mutex); - - we_locked = 0; - while (1) { /* spin */ - old_state = descr->pg_cache_descr_state; - old_users = old_state >> PG_CACHE_DESCR_SHIFT; - - if (unlikely(we_locked)) { - fatal_assert(0 == old_users); - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, 0); - if (old_state == ret_state) { - /* success */ - rrdeng_destroy_pg_cache_descr(ctx, delete_pg_cache_descr); - return; - } - continue; /* spin */ - } - if (old_state & 
PG_CACHE_DESCR_LOCKED) { - fatal_assert(0 == old_users); - continue; /* spin */ - } - fatal_assert(old_state & PG_CACHE_DESCR_ALLOCATED); - pg_cache_descr = descr->pg_cache_descr; - /* caller is the only page cache descriptor user and there are no pending references on the page */ - if ((old_state & PG_CACHE_DESCR_DESTROY) && (1 == old_users) && - !pg_cache_descr->flags && !pg_cache_descr->refcnt) { - fatal_assert(!pg_cache_descr->waiters); - - new_state = PG_CACHE_DESCR_LOCKED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - we_locked = 1; - delete_pg_cache_descr = pg_cache_descr; - descr->pg_cache_descr = NULL; - /* retry */ - continue; - } - continue; /* spin */ - } - fatal_assert(old_users > 0); - new_state = (old_users - 1) << PG_CACHE_DESCR_SHIFT; - new_state |= old_state & PG_CACHE_DESCR_FLAGS_MASK; - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - break; - } - /* spin */ - } -} - -/* - * Tries to deallocate page cache descriptor. If it fails, it postpones deallocation by setting the - * PG_CACHE_DESCR_DESTROY flag which will be eventually cleared by a different context after doing - * the deallocation. - */ -void rrdeng_try_deallocate_pg_cache_descr(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - unsigned long old_state, new_state, ret_state, old_users; - struct page_cache_descr *pg_cache_descr = NULL; - uint8_t just_locked, can_free, must_unlock; - - just_locked = 0; - can_free = 0; - must_unlock = 0; - while (1) { /* spin */ - old_state = descr->pg_cache_descr_state; - old_users = old_state >> PG_CACHE_DESCR_SHIFT; - - if (unlikely(just_locked)) { - fatal_assert(0 == old_users); - - must_unlock = 1; - just_locked = 0; - /* Try deallocate if there are no pending references on the page */ - if (!pg_cache_descr->flags && !pg_cache_descr->refcnt) { - fatal_assert(!pg_cache_descr->waiters); - - descr->pg_cache_descr = NULL; - can_free = 1; - /* success */ - continue; - } - continue; /* spin */ - } - if (unlikely(must_unlock)) { - fatal_assert(0 == old_users); - - if (can_free) { - /* success */ - new_state = 0; - } else { - new_state = old_state | PG_CACHE_DESCR_DESTROY; - new_state &= ~PG_CACHE_DESCR_LOCKED; - } - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* unlocked */ - if (can_free) - rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); - return; - } - continue; /* spin */ - } - if (!(old_state & PG_CACHE_DESCR_ALLOCATED)) { - /* don't do anything */ - return; - } - if (old_state & PG_CACHE_DESCR_LOCKED) { - fatal_assert(0 == old_users); - continue; /* spin */ - } - /* caller is the only page cache descriptor user */ - if (0 == old_users) { - new_state = old_state | PG_CACHE_DESCR_LOCKED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - just_locked = 1; - pg_cache_descr = descr->pg_cache_descr; - /* retry */ - continue; - } - continue; /* spin */ - } - if (old_state & PG_CACHE_DESCR_DESTROY) { - /* don't do anything */ - return; - } - /* plant PG_CACHE_DESCR_DESTROY so that other contexts eventually free the page cache descriptor */ - new_state = old_state | PG_CACHE_DESCR_DESTROY; - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - return; - } - /* spin 
*/ - } -}
\ No newline at end of file
diff --git a/database/engine/rrdenglocking.h b/database/engine/rrdenglocking.h
deleted file mode 100644
index 078eab38b..000000000
--- a/database/engine/rrdenglocking.h
+++ /dev/null
@@ -1,17 +0,0 @@
-// SPDX-License-Identifier: GPL-3.0-or-later
-
-#ifndef NETDATA_RRDENGLOCKING_H
-#define NETDATA_RRDENGLOCKING_H
-
-#include "rrdengine.h"
-
-/* Forward declarations */
-struct page_cache_descr;
-
-struct page_cache_descr *rrdeng_create_pg_cache_descr(struct rrdengine_instance *ctx);
-void rrdeng_destroy_pg_cache_descr(struct rrdengine_instance *ctx, struct page_cache_descr *pg_cache_descr);
-void rrdeng_page_descr_mutex_lock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr);
-void rrdeng_page_descr_mutex_unlock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr);
-void rrdeng_try_deallocate_pg_cache_descr(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr);
-
-#endif /* NETDATA_RRDENGLOCKING_H */
\ No newline at end of file
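
Note on the query API declared in the updated rrdengineapi.h above: callers drive a query through rrdeng_load_metric_init(), then repeatedly call rrdeng_load_metric_next() until rrdeng_load_metric_is_finished() reports the end of the requested window, and close with rrdeng_load_metric_finalize(). The sketch below is illustrative only and is not part of this commit; it assumes a valid STORAGE_INSTANCE and metric UUID, picks STORAGE_PRIORITY_BEST_EFFORT arbitrarily, and compiles only inside the Netdata source tree.

```c
// Illustrative sketch (not from this commit): iterating one metric's points
// through the dbengine query API declared in rrdengineapi.h.
#include <stdio.h>
#include "rrdengineapi.h"

static void dump_metric_range(STORAGE_INSTANCE *db_instance, uuid_t *uuid,
                              time_t start_time_s, time_t end_time_s) {
    STORAGE_METRIC_HANDLE *metric = rrdeng_metric_get(db_instance, uuid);
    if (!metric)
        return;                                   // metric unknown to this tier

    struct storage_engine_query_handle handle;
    rrdeng_load_metric_init(metric, &handle, start_time_s, end_time_s,
                            STORAGE_PRIORITY_BEST_EFFORT);

    while (!rrdeng_load_metric_is_finished(&handle)) {
        // each returned point carries its own time window and aggregates
        STORAGE_POINT sp = rrdeng_load_metric_next(&handle);
        fprintf(stderr, "from %ld to %ld: sum %f, count %u, anomalies %u\n",
                (long)sp.start_time_s, (long)sp.end_time_s,
                (double)sp.sum, (unsigned)sp.count, (unsigned)sp.anomaly_count);
    }

    rrdeng_load_metric_finalize(&handle);         // releases the page and the handle
    rrdeng_metric_release(metric);
}
```

Retention can also be checked without opening a query, through the reworked rrdeng_metric_retention_by_uuid(), which now returns a bool and fills the first and last entry times in seconds.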
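
Note on the deleted rrdenglocking.c above: it serialized access to page-cache descriptors with a packed state word, keeping a user count in the bits above PG_CACHE_DESCR_SHIFT and the LOCKED/ALLOCATED/DESTROY flags in the low bits, all updated with compare-and-swap spin loops. The self-contained sketch below only illustrates that general pattern with C11 atomics; it is not the removed code, and the names, bit layout and flag set are invented for the example.

```c
// Simplified illustration of a packed state word updated with compare-and-swap,
// the pattern used by the deleted rrdenglocking.c (user count in the upper bits,
// flag bits in the lower bits). Names and bit layout are for this example only.
#include <stdatomic.h>
#include <stdio.h>

#define STATE_SHIFT   3UL                 /* low 3 bits reserved for flags */
#define STATE_LOCKED  (1UL << 0)
#define STATE_ALLOC   (1UL << 1)
#define STATE_DESTROY (1UL << 2)
#define STATE_FLAGS   (STATE_LOCKED | STATE_ALLOC | STATE_DESTROY)

static _Atomic unsigned long state = 0;

/* add one user, spinning while the word is exclusively locked */
static void state_get(void) {
    for (;;) {
        unsigned long old = atomic_load(&state);
        if (old & STATE_LOCKED)
            continue;                     /* spin: exclusive lock held elsewhere */

        unsigned long new = (((old >> STATE_SHIFT) + 1) << STATE_SHIFT)
                            | (old & STATE_FLAGS) | STATE_ALLOC;
        if (atomic_compare_exchange_weak(&state, &old, new))
            return;                       /* CAS succeeded */
        /* CAS failed: the word changed under us, retry */
    }
}

/* drop one user, preserving the flag bits */
static void state_put(void) {
    for (;;) {
        unsigned long old = atomic_load(&state);
        unsigned long new = (((old >> STATE_SHIFT) - 1) << STATE_SHIFT)
                            | (old & STATE_FLAGS);
        if (atomic_compare_exchange_weak(&state, &old, new))
            return;
    }
}

int main(void) {
    state_get();
    state_get();
    state_put();
    printf("users=%lu flags=0x%lx\n",
           atomic_load(&state) >> STATE_SHIFT,
           atomic_load(&state) & STATE_FLAGS);
    return 0;
}
```

In the removed code, this pattern allowed the descriptor's mutex and condition variable to be allocated lazily under the LOCKED flag and deallocated once the last user dropped its reference, without a global lock.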