From aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 6 Feb 2023 17:11:30 +0100 Subject: Adding upstream version 1.38.0. Signed-off-by: Daniel Baumann --- database/README.md | 18 +- database/engine/README.md | 303 +- database/engine/cache.c | 2737 +++++++ database/engine/cache.h | 249 + database/engine/datafile.c | 415 +- database/engine/datafile.h | 76 +- database/engine/datafile.ksy | 74 + database/engine/dbengine-diagram.xml | 1 + database/engine/journalfile.c | 1445 +++- database/engine/journalfile.h | 150 +- database/engine/journalfile.ksy | 144 + database/engine/metric.c | 875 +++ database/engine/metric.h | 79 + database/engine/pagecache.c | 2054 +++-- database/engine/pagecache.h | 242 +- database/engine/pdc.c | 1282 +++ database/engine/pdc.h | 67 + database/engine/rrdengine.c | 2634 ++++--- database/engine/rrdengine.h | 569 +- database/engine/rrdengineapi.c | 1683 ++-- database/engine/rrdengineapi.h | 153 +- database/engine/rrdenginelib.c | 208 +- database/engine/rrdenginelib.h | 4 - database/engine/rrdenglocking.c | 241 - database/engine/rrdenglocking.h | 17 - database/ram/rrddim_mem.c | 352 +- database/ram/rrddim_mem.h | 15 +- database/rrd.c | 2 +- database/rrd.h | 417 +- database/rrdcalc.c | 25 +- database/rrdcalc.h | 4 +- database/rrdcalctemplate.c | 3 +- database/rrdcontext.c | 502 +- database/rrdcontext.h | 35 +- database/rrddim.c | 157 +- database/rrddimvar.c | 7 +- database/rrdfamily.c | 3 +- database/rrdfunctions.c | 11 +- database/rrdhost.c | 577 +- database/rrdlabels.c | 7 +- database/rrdset.c | 417 +- database/rrdsetvar.c | 7 +- database/rrdvar.c | 3 +- database/sqlite/sqlite3.c | 13954 +++++++++++++++++++++------------ database/sqlite/sqlite3.h | 160 +- database/sqlite/sqlite_aclk.c | 336 +- database/sqlite/sqlite_aclk.h | 5 +- database/sqlite/sqlite_aclk_alert.c | 102 +- database/sqlite/sqlite_context.c | 24 +- database/sqlite/sqlite_context.h | 4 +- database/sqlite/sqlite_functions.c | 432 +- database/sqlite/sqlite_functions.h | 7 - database/sqlite/sqlite_health.c | 36 +- database/sqlite/sqlite_metadata.c | 428 +- database/sqlite/sqlite_metadata.h | 9 +- database/storage_engine.c | 50 +- 56 files changed, 22329 insertions(+), 11482 deletions(-) create mode 100644 database/engine/cache.c create mode 100644 database/engine/cache.h create mode 100644 database/engine/datafile.ksy create mode 100644 database/engine/dbengine-diagram.xml create mode 100644 database/engine/journalfile.ksy create mode 100644 database/engine/metric.c create mode 100644 database/engine/metric.h create mode 100644 database/engine/pdc.c create mode 100644 database/engine/pdc.h delete mode 100644 database/engine/rrdenglocking.c delete mode 100644 database/engine/rrdenglocking.h (limited to 'database') diff --git a/database/README.md b/database/README.md index 1453f9b39..becd4165f 100644 --- a/database/README.md +++ b/database/README.md @@ -1,7 +1,11 @@ # Database @@ -9,12 +13,12 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/database/README. Netdata is fully capable of long-term metrics storage, at per-second granularity, via its default database engine (`dbengine`). But to remain as flexible as possible, Netdata supports several storage options: -1. `dbengine`, (the default) data are in database files. The [Database Engine](/database/engine/README.md) works like a +1. `dbengine`, (the default) data are in database files. The [Database Engine](https://github.com/netdata/netdata/blob/master/database/engine/README.md) works like a traditional database. There is some amount of RAM dedicated to data caching and indexing and the rest of the data reside compressed on disk. The number of history entries is not fixed in this case, but depends on the configured disk space and the effective compression ratio of the data stored. This is the **only mode** that supports changing the data collection update frequency (`update every`) **without losing** the previously stored metrics. For more - details see [here](/database/engine/README.md). + details see [here](https://github.com/netdata/netdata/blob/master/database/engine/README.md). 2. `ram`, data are purely in memory. Data are never saved on disk. This mode uses `mmap()` and supports [KSM](#ksm). @@ -38,13 +42,13 @@ The default mode `[db].mode = dbengine` has been designed to scale for longer re for parent Agents in the _Parent - Child_ setups The other available database modes are designed to minimize resource utilization and should only be considered on -[Parent - Child](/docs/metrics-storage-management/how-streaming-works.mdx) setups at the children side and only when the +[Parent - Child](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/how-streaming-works.mdx) setups at the children side and only when the resource constraints are very strict. So, - On a single node setup, use `[db].mode = dbengine`. -- On a [Parent - Child](/docs/metrics-storage-management/how-streaming-works.mdx) setup, use `[db].mode = dbengine` on the +- On a [Parent - Child](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/how-streaming-works.mdx) setup, use `[db].mode = dbengine` on the parent to increase retention, a more resource efficient mode like, `dbengine` with light retention settings, and `save`, `ram` or `none` modes for the children to minimize resource utilization. @@ -64,7 +68,7 @@ Metrics retention is controlled only by the disk space allocated to storing metr CPU required by the agent to query longer timeframes. Since Netdata Agents usually run on the edge, on production systems, Netdata Agent **parents** should be considered. -When having a [**parent - child**](/docs/metrics-storage-management/how-streaming-works.mdx) setup, the child (the +When having a [**parent - child**](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/how-streaming-works.mdx) setup, the child (the Netdata Agent running on a production system) delegates all of its functions, including longer metrics retention and querying, to the parent node that can dedicate more resources to this task. A single Netdata Agent parent can centralize multiple children Netdata Agents (dozens, hundreds, or even thousands depending on its available resources). @@ -85,7 +89,7 @@ every 2 seconds. This will **cut in half** both CPU and RAM resources consumed b On very weak devices you might have to use `[db].update every = 5` and `[db].retention = 720` (still 1 hour of data, but 1/5 of the CPU and RAM resources). -You can also disable [data collection plugins](/collectors/README.md) that you don't need. Disabling such plugins will also +You can also disable [data collection plugins](https://github.com/netdata/netdata/blob/master/collectors/README.md) that you don't need. Disabling such plugins will also free both CPU and RAM resources. ## Memory optimizations diff --git a/database/engine/README.md b/database/engine/README.md index c67e400f4..664d40506 100644 --- a/database/engine/README.md +++ b/database/engine/README.md @@ -1,48 +1,126 @@ -# Database engine +# DBENGINE -The Database Engine works like a traditional time series database. Unlike other [database modes](/database/README.md), -the amount of historical metrics stored is based on the amount of disk space you allocate and the effective compression -ratio, not a fixed number of metrics collected. +DBENGINE is the time-series database of Netdata. -## Tiering +## Design -Tiering is a mechanism of providing multiple tiers of data with -different [granularity on metrics](/docs/store/distributed-data-architecture.md#granularity-of-metrics). +### Data Points -For Netdata Agents with version `netdata-1.35.0.138.nightly` and greater, `dbengine` supports Tiering, allowing almost -unlimited retention of data. +**Data points** represent the collected values of metrics. +A **data point** has: -### Metric size +1. A **value**, the data collected for a metric. There is a special **value** to indicate that the collector failed to collect a valid value, and thus the data point is a **gap**. +2. A **timestamp**, the time it has been collected. +3. A **duration**, the time between this and the previous data collection. +4. A flag which is set when machine-learning categorized the collected value as **anomalous** (an outlier based on the trained models). -Every Tier down samples the exact lower tier (lower tiers have greater resolution). You can have up to 5 -Tiers **[0. . 4]** of data (including the Tier 0, which has the highest resolution) +Using the **timestamp** and **duration**, Netdata calculates for each point its **start time**, **end time** and **update every**. -Tier 0 is the default that was always available in `dbengine` mode. Tier 1 is the first level of aggregation, Tier 2 is -the second, and so on. +For incremental metrics (counters), Netdata interpolates the collected values to align them to the expected **end time** at the microsecond level, absorbing data collection micro-latencies. -Metrics on all tiers except of the _Tier 0_ also store the following five additional values for every point for accurate -representation: +When data points are stored in higher tiers (time aggregations - see [Tiers](#Tiers) below), each data point has: -1. The `sum` of the points aggregated -2. The `min` of the points aggregated -3. The `max` of the points aggregated -4. The `count` of the points aggregated (could be constant, but it may not be due to gaps in data collection) -5. The `anomaly_count` of the points aggregated (how many of the aggregated points found anomalous) +1. The **sum** of the original values that have been aggregated, +2. The **count** of all the original values aggregated, +3. The **minimum** value among them, +4. The **maximum** value among them, +5. Their **anomaly rate**, i.e. the count of values that were detected as outliers based on the currently trained models for the metric, +6. A **timestamp**, which is the equal to the **end time** of the last point aggregated, +7. A **duration**, which is the duration between the **first time** of the first point aggregated to the **end time** of the last point aggregated. -Among `min`, `max` and `sum`, the correct value is chosen based on the user query. `average` is calculated on the fly at -query time. +This design allows Netdata to accurately know the **average**, **minimum**, **maximum** and **anomaly rate** values even when using higher tiers to satisfy a query. -### Tiering in a nutshell +### Pages +Data points are organized into **pages**, i.e. segments of contiguous data collections of the same metric. -The `dbengine` is capable of retaining metrics for years. To further understand the `dbengine` tiering mechanism let's -explore the following configuration. +Each page: + +1. Contains contiguous **data points** of a single metric. +2. Contains **data points** having the same **update every**. If a metric changes **update every** on the fly, the page is flushed and a new one with the new **update every** is created. If a data collection is missed, a **gap point** is inserted into the page, so that the data points in a page remain contiguous. +3. Has a **start time**, which is equivalent to the **end time** of the first data point stored into it, +4. Has an **end time**, which is equal to the **end time** of the last data point stored into it, +5. Has an **update every**, common for all points in the page. + +A **page** is a simple array of values. Each slot in the array has a **timestamp** implied by its position in the array, and each value stored represents the **data point** for that time, for the metric the page belongs to. + +This simple fixed step page design allows Netdata to collect several millions of points per second and pack all the values in a compact form with minimal metadata overhead. + +#### Hot Pages + +While a metric is collected, there is one **hot page** in memory for each of the configured tiers. Values collected for a metric are appended to its **hot page** until that page becomes full. + +#### Dirty Pages + +Once a **hot page** is full, it becomes a **dirty page**, and it is scheduled for immediate **flushing** (saving) to disk. + +#### Clean Pages + +Flushed (saved) pages are **clean pages**, i.e. read-only pages that reside primarily on disk, and are loaded on demand to satisfy data queries. + +#### Pages Configuration + +Pages are configured like this: + +| Attribute | Tier0 | Tier1 | Tier2 | +|---------------------------------------------------------------------------------------|:-------------------------------------:|:---------------------------------------------------------------:|:---------------------------------------------------------------:| +| Point Size in Memory, in Bytes | 4 | 16 | 16 | +| Point Size on Disk, in Bytes
after LZ4 compression, on the average | 1 | 4 | 4 | +| Page Size in Bytes | 4096
2048 in 32bit | 2048
1024 in 32bit | 384
192 in 32bit | +| Collections per Point | 1 | 60x Tier0
configurable in
`netdata.conf`
| 60x Tier1
configurable in
`netdata.conf`
| +| Points per Page | 1024
512 in 32bit | 128
64 in 32bit | 24
12 in 32bit | + +### Files + +To minimize the amount of data written to disk and the amount of storage required for storing metrics, Netdata aggregates up to 64 **dirty pages** of independent metrics, packs them all together into one bigger buffer, compresses this buffer with LZ4 (about 75% savings on the average) and commits a transaction to the disk files. + +#### Extents + +This collection of 64 pages that is packed and compressed together is called an **extent**. Netdata tries to store together, in the same **extent**, metrics that are meant to be "close". Dimensions of the same chart are such. They are usually queried together, so it is beneficial to have them in the same **extent** to read all of them at once at query time. + +#### Datafiles + +Multiple **extents** are appended to **datafiles** (filename suffix `.ndf`), until these **datafiles** become full. The size of each **datafile** is determined automatically by Netdata. The minimum for each **datafile** is 4MB and the maximum 512MB. Depending on the amount of disk space configured for each tier, Netdata will decide a **datafile** size trying to maintain about 50 datafiles for the whole database, within the limits mentioned (4MB min, 512MB max per file). The maximum number of datafiles supported is 65536, and therefore the maximum database size (per tier) that Netdata can support is 32TB. + +#### Journal Files + +Each **datafile** has two **journal files** with metadata related to the stored data in the **datafile**. + +- **journal file v1**, with filename suffix `.njf`, holds information about the transactions in its **datafile** and provides the ability to recover as much data as possible, in case either the datafile or the journal files get corrupted. This journal file has a maximum transaction size of 4KB, so in case data are corrupted on disk transactions of 4KB are lost. Each transaction holds the metadata of one **extent** (this is why DBENGINE supports up to 64 pages per extent). + +- **journal file v2**, with filename suffix `.njfv2`, which is a disk-based index for all the **pages** and **extents**. This file is memory mapped at runtime and is consulted to find where the data of a metric are in the datafile. This journal file is automatically re-created from **journal file v1** if it is missing. It is safe to delete these files (when Netdata does not run). Netdata will re-create them on the next run. Journal files v2 are supported in Netdata Agents with version `netdata-1.37.0-115-nightly`. Older versions maintain the journal index in memory. + +#### Database Rotation + +Database rotation is achieved by deleting the oldest **datafile** (and its journals) and creating a new one (with its journals). + +Data on disk are append-only. There is no way to delete, add, or update data in the middle of the database. If data are not useful for whatever reason, Netdata can be instructed to ignore these data. They will eventually be deleted from disk when the database is rotated. New data are always appended. + +#### Tiers + +Tiers are supported in Netdata Agents with version `netdata-1.35.0.138.nightly` and greater. + +**datafiles** and **journal files** are organized in **tiers**. All tiers share the same metrics and same collected values. + +- **tier 0** is the high resolution tier that stores the collected data at the frequency they are collected. +- **tier 1** by default aggregates 60 values of **tier 0**. +- **tier 2** by default aggregates 60 values of **tier 1**, or 3600 values of **tier 0**. + +Updating the higher **tiers** is automated, and it happens in real-time while data are being collected for **tier 0**. + +When the Netdata Agent starts, during the first data collection of each metric, higher tiers are automatically **backfilled** with data from lower tiers, so that the aggregation they provide will be accurate. + +3 tiers are enabled by default in Netdata, with the following configuration: ``` [db] @@ -51,46 +129,151 @@ explore the following configuration. # per second data collection update every = 1 - # enables Tier 1 and Tier 2, Tier 0 is always enabled in dbengine mode + # number of tiers used (1 to 5, 3 being default) storage tiers = 3 - # Tier 0, per second data for a week - dbengine multihost disk space MB = 1100 + # Tier 0, per second data + dbengine multihost disk space MB = 256 - # Tier 1, per minute data for a month - dbengine tier 1 multihost disk space MB = 330 + # Tier 1, per minute data + dbengine tier 1 multihost disk space MB = 128 + + # Tier 2, per hour data + dbengine tier 2 multihost disk space MB = 64 +``` + +The exact retention that can be achieved by each tier depends on the number of metrics collected. The more the metrics, the smaller the retention that will fit in a given size. The general rule is that Netdata needs about **1 byte per data point on disk for tier 0**, and **4 bytes per data point on disk for tier 1 and above**. + +So, for 1000 metrics collected per second and 256 MB for tier 0, Netdata will store about: + +``` +256MB on disk / 1 byte per point / 1000 metrics => 256k points per metric / 86400 seconds per day = about 3 days +``` + +At tier 1 (per minute): + +``` +128MB on disk / 4 bytes per point / 1000 metrics => 32k points per metric / (24 hours * 60 minutes) = about 22 days +``` + +At tier 2 (per hour): + +``` +64MB on disk / 4 bytes per point / 1000 metrics => 16k points per metric / 24 hours per day = about 2 years +``` + +Of course double the metrics, half the retention. There are more factors that affect retention. The number of ephemeral metrics (i.e. metrics that are collected for part of the time). The number of metrics that are usually constant over time (affecting compression efficiency). The number of restarts a Netdata Agents gets through time (because it has to break pages prematurely, increasing the metadata overhead). But the actual numbers should not deviate significantly from the above. + +### Data Loss + +Until **hot pages** and **dirty pages** are **flushed** to disk they are at risk (e.g. due to a crash, or +power failure), as they are stored only in memory. + +The supported way of ensuring high data availability is the use of Netdata Parents to stream the data in real-time to +multiple other Netdata agents. + +## Memory Requirements + +DBENGINE memory is related to the number of metrics concurrently being collected, the retention of the metrics on disk in relation with the queries running, and the number of metrics for which retention is maintained. + +### Memory for concurrently collected metrics + +DBENGINE is automatically sized to use memory according to this equation: + +``` +memory in KiB = METRICS x (TIERS - 1) x 4KiB x 2 + 32768 KiB +``` + +Where: +- `METRICS`: the maximum number of concurrently collected metrics (dimensions) from the time the agent started. +- `TIERS`: the number of storage tiers configured, by default 3 ( `-1` when using 3+ tiers) +- `x 2`, to accommodate room for flushing data to disk +- `x 4KiB`, the data segment size of each metric +- `+ 32768 KiB`, 32 MB for operational caches + +So, for 2000 metrics (dimensions) in 3 storage tiers: + +``` +memory for 2k metrics = 2000 x (3 - 1) x 4 KiB x 2 + 32768 KiB = 64 MiB +``` + +For 100k concurrently collected metrics in 3 storage tiers: + +``` +memory for 100k metrics = 100000 x (3 - 1) x 4 KiB x 2 + 32768 KiB = 1.6 GiB +``` + +#### Exceptions + +Netdata has several protection mechanisms to prevent the use of more memory (than the above), by incrementally fetching data from disk and aggressively evicting old data to make room for new data, but still memory may grow beyond the above limit under the following conditions: + +1. The number of pages concurrently used in queries do not fit the in the above size. This can happen when multiple queries of unreasonably long time-frames run on lower, higher resolution, tiers. The Netdata query planner attempts to avoid such situations by gradually loading pages, but still under extreme conditions the system may use more memory to satisfy these queries. + +2. The disks that host Netdata files are extremely slow for the workload required by the database so that data cannot be flushed to disk quickly to free memory. Netdata will automatically spawn more flushing workers in an attempt to parallelize and speed up flushing, but still if the disks cannot write the data quickly enough, they will remain in memory until they are written to disk. + +### Caches + +DBENGINE stores metric data to disk. To achieve high performance even under severe stress, it uses several layers of caches. + +#### Main Cache + +Stores page data. It is the primary storage of hot and dirty pages (before they are saved to disk), and its clean queue is the LRU cache for speeding up queries. + +The entire DBENGINE is designed to use the hot queue size (the currently collected metrics) as the key for sizing all its memory consumption. We call this feature **memory ballooning**. More collected metrics, bigger main cache and vice versa. + +In the equation: - # Tier 2, per hour data for a year - dbengine tier 2 multihost disk space MB = 67 ``` +memory in KiB = METRICS x (TIERS - 1) x 4KiB x 2 + 32768 KiB +``` + +the part `METRICS x (TIERS - 1) x 4KiB` is an estimate for the max hot size of the main cache. Tier 0 pages are 4KiB, but tier 1 pages are 2 KiB and tier 2 pages are 384 bytes. So a single metric in 3 tiers uses 4096 + 2048 + 384 = 6528 bytes. The equation estimates 8192 per metric, which includes cache internal structures and leaves some spare. + +Then `x 2` is the worst case estimate for the dirty queue. If all collected metrics (hot) become available for saving at once, to avoid stopping data collection all their pages will become dirty and new hot pages will be created instantly. To save memory, when Netdata starts, DBENGINE allocates randomly smaller pages for metrics, to spread their completion evenly across time. + +The memory we saved with the above is used to improve the LRU cache. So, although we reserved 32MiB for the LRU, in bigger setups (Netdata Parents) the LRU grows a lot more, within the limits of the equation. + +In practice, the main cache sizes itself with `hot x 1.5` instead of `host x 2`. The reason is that 5% of main cache is reserved for expanding open cache, 5% for expanding extent cache and we need room for the extensive buffers that are allocated in these setups. When the main cache exceeds `hot x 1.5` it enters a mode of critical evictions, and aggresively frees pages from the LRU to maintain a healthy memory footprint within its design limits. + +#### Open Cache -For 2000 metrics, collected every second and retained for a week, Tier 0 needs: 1 byte x 2000 metrics x 3600 secs per -hour x 24 hours per day x 7 days per week = 1100MB. +Stores metadata about on disk pages. Not the data itself. Only metadata about the location of the data on disk. -By setting `dbengine multihost disk space MB` to `1100`, this node will start maintaining about a week of data. But pay -attention to the number of metrics. If you have more than 2000 metrics on a node, or you need more that a week of high -resolution metrics, you may need to adjust this setting accordingly. +Its primary use is to index information about the open datafile, the one that still accepts new pages. Once that datafile becomes full, all the hot pages of the open cache are indexed in journal v2 files. -Tier 1 is by default sampling the data every **60 points of Tier 0**. In our case, Tier 0 is per second, if we want to -transform this information in terms of time then the Tier 1 "resolution" is per minute. +The clean queue is an LRU for reducing the journal v2 scans during quering. -Tier 1 needs four times more storage per point compared to Tier 0. So, for 2000 metrics, with per minute resolution, -retained for a month, Tier 1 needs: 4 bytes x 2000 metrics x 60 minutes per hour x 24 hours per day x 30 days per month -= 330MB. +Open cache uses memory ballooning too, like the main cache, based on its own hot pages. Open cache hot size is mainly controlled by the size of the open datafile. This is why on netdata versions with journal files v2, we decreased the maximum datafile size from 1GB to 512MB and we increased the target number of datafiles from 20 to 50. -Tier 2 is by default sampling data every 3600 points of Tier 0 (60 of Tier 1, which is the previous exact Tier). Again -in term of "time" (Tier 0 is per second), then Tier 2 is per hour. +On bigger setups open cache will get a bigger LRU by automatically sizing it (the whole open cache) to 5% to the size of (the whole) main cache. -The storage requirements are the same to Tier 1. +#### Extent Cache + +Caches compressed **extent** data, to avoid reading too repeatedly the same data from disks. + + +### Shared Memory + +Journal v2 indexes are mapped into memory. Netdata attempts to minimize shared memory use by instructing the kernel about the use of these files, or even unmounting them when they are not needed. + +The time-ranges of the queries running control the amount of shared memory required. + +## Metrics Registry + +DBENGINE uses 150 bytes of memory for every metric for which retention is maintained but is not currently being collected. + +--- + +--- OLD DOCS BELOW THIS POINT --- + +--- -For 2000 metrics, with per hour resolution, retained for a year, Tier 2 needs: 4 bytes x 2000 metrics x 24 hours per day -x 365 days per year = 67MB. ## Legacy configuration ### v1.35.1 and prior -These versions of the Agent do not support [Tiering](#Tiering). You could change the metric retention for the parent and +These versions of the Agent do not support [Tiers](#Tiers). You could change the metric retention for the parent and all of its children only with the `dbengine multihost disk space MB` setting. This setting accounts the space allocation for the parent node and all of its children. @@ -105,15 +288,9 @@ the `[db]` section of your `netdata.conf`. ### v1.23.2 and prior -_For Netdata Agents earlier than v1.23.2_, the Agent on the parent node uses one dbengine instance for itself, and -another instance for every child node it receives metrics from. If you had four streaming nodes, you would have five -instances in total (`1 parent + 4 child nodes = 5 instances`). +_For Netdata Agents earlier than v1.23.2_, the Agent on the parent node uses one dbengine instance for itself, and another instance for every child node it receives metrics from. If you had four streaming nodes, you would have five instances in total (`1 parent + 4 child nodes = 5 instances`). -The Agent allocates resources for each instance separately using the `dbengine disk space MB` (**deprecated**) setting. -If -`dbengine disk space MB`(**deprecated**) is set to the default `256`, each instance is given 256 MiB in disk space, -which means the total disk space required to store all instances is, -roughly, `256 MiB * 1 parent * 4 child nodes = 1280 MiB`. +The Agent allocates resources for each instance separately using the `dbengine disk space MB` (**deprecated**) setting. If `dbengine disk space MB`(**deprecated**) is set to the default `256`, each instance is given 256 MiB in disk space, which means the total disk space required to store all instances is, roughly, `256 MiB * 1 parent * 4 child nodes = 1280 MiB`. #### Backward compatibility @@ -128,7 +305,7 @@ Agent. ##### Information For more information about setting `[db].mode` on your nodes, in addition to other streaming configurations, see -[streaming](/streaming/README.md). +[streaming](https://github.com/netdata/netdata/blob/master/streaming/README.md). ## Requirements & limitations @@ -154,7 +331,7 @@ An important observation is that RAM usage depends on both the `page cache size` options. You can use -our [database engine calculator](/docs/store/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) +our [database engine calculator](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) to validate the memory requirements for your particular system(s) and configuration (**out-of-date**). ### Disk space @@ -208,7 +385,7 @@ You can apply the settings by running `sysctl -p` or by rebooting. ## Files -With the DB engine mode the metric data are stored in database files. These files are organized in pairs, the datafiles +With the DB engine mode the metric data are stored in database files. These files are organized in pairs, the datafiles and their corresponding journalfiles, e.g.: ```sh @@ -226,7 +403,7 @@ location is `/var/cache/netdata/dbengine/*`). The higher numbered filenames cont can safely delete some pairs of files when Netdata is stopped to manually free up some space. _Users should_ **back up** _their `./dbengine` folders if they consider this data to be important._ You can also set up -one or more [exporting connectors](/exporting/README.md) to send your Netdata metrics to other databases for long-term +one or more [exporting connectors](https://github.com/netdata/netdata/blob/master/exporting/README.md) to send your Netdata metrics to other databases for long-term storage at lower granularity. ## Operation @@ -298,5 +475,3 @@ An interesting observation to make is that the CPU-bound run (16 GiB page cache) and generate a read load of 1.7M/sec, whereas in the CPU-bound scenario the read load is 70 times higher at 118M/sec. Consequently, there is a significant degree of interference by the reader threads, that slow down the writer threads. This is also possible because the interference effects are greater than the SSD impact on data generation throughput. - - diff --git a/database/engine/cache.c b/database/engine/cache.c new file mode 100644 index 000000000..4091684b2 --- /dev/null +++ b/database/engine/cache.c @@ -0,0 +1,2737 @@ +#include "cache.h" + +/* STATES AND TRANSITIONS + * + * entry | entry + * v v + * HOT -> DIRTY --> CLEAN --> EVICT + * v | v + * flush | evict + * v | v + * save | free + * callback | callback + * + */ + +typedef int32_t REFCOUNT; +#define REFCOUNT_DELETING (-100) + +// to use ARAL uncomment the following line: +#define PGC_WITH_ARAL 1 + +typedef enum __attribute__ ((__packed__)) { + // mutually exclusive flags + PGC_PAGE_CLEAN = (1 << 0), // none of the following + PGC_PAGE_DIRTY = (1 << 1), // contains unsaved data + PGC_PAGE_HOT = (1 << 2), // currently being collected + + // flags related to various actions on each page + PGC_PAGE_IS_BEING_DELETED = (1 << 3), + PGC_PAGE_IS_BEING_MIGRATED_TO_V2 = (1 << 4), + PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES = (1 << 5), + PGC_PAGE_HAS_BEEN_ACCESSED = (1 << 6), +} PGC_PAGE_FLAGS; + +#define page_flag_check(page, flag) (__atomic_load_n(&((page)->flags), __ATOMIC_ACQUIRE) & (flag)) +#define page_flag_set(page, flag) __atomic_or_fetch(&((page)->flags), flag, __ATOMIC_RELEASE) +#define page_flag_clear(page, flag) __atomic_and_fetch(&((page)->flags), ~(flag), __ATOMIC_RELEASE) + +#define page_get_status_flags(page) page_flag_check(page, PGC_PAGE_HOT | PGC_PAGE_DIRTY | PGC_PAGE_CLEAN) +#define is_page_hot(page) (page_get_status_flags(page) == PGC_PAGE_HOT) +#define is_page_dirty(page) (page_get_status_flags(page) == PGC_PAGE_DIRTY) +#define is_page_clean(page) (page_get_status_flags(page) == PGC_PAGE_CLEAN) + +struct pgc_page { + // indexing data + Word_t section; + Word_t metric_id; + time_t start_time_s; + time_t end_time_s; + uint32_t update_every_s; + uint32_t assumed_size; + + REFCOUNT refcount; + uint16_t accesses; // counts the number of accesses on this page + PGC_PAGE_FLAGS flags; + SPINLOCK transition_spinlock; // when the page changes between HOT, DIRTY, CLEAN, we have to get this lock + + struct { + struct pgc_page *next; + struct pgc_page *prev; + } link; + + void *data; + uint8_t custom_data[]; + + // IMPORTANT! + // THIS STRUCTURE NEEDS TO BE INITIALIZED BY HAND! +}; + +struct pgc_linked_list { + SPINLOCK spinlock; + union { + PGC_PAGE *base; + Pvoid_t sections_judy; + }; + PGC_PAGE_FLAGS flags; + size_t version; + size_t last_version_checked; + bool linked_list_in_sections_judy; // when true, we use 'sections_judy', otherwise we use 'base' + struct pgc_queue_statistics *stats; +}; + +struct pgc { + struct { + char name[PGC_NAME_MAX + 1]; + + size_t partitions; + size_t clean_size; + size_t max_dirty_pages_per_call; + size_t max_pages_per_inline_eviction; + size_t max_skip_pages_per_inline_eviction; + size_t max_flushes_inline; + size_t max_workers_evict_inline; + size_t additional_bytes_per_page; + free_clean_page_callback pgc_free_clean_cb; + save_dirty_page_callback pgc_save_dirty_cb; + save_dirty_init_callback pgc_save_init_cb; + PGC_OPTIONS options; + + size_t severe_pressure_per1000; + size_t aggressive_evict_per1000; + size_t healthy_size_per1000; + size_t evict_low_threshold_per1000; + + dynamic_target_cache_size_callback dynamic_target_size_cb; + } config; + +#ifdef PGC_WITH_ARAL + ARAL **aral; +#endif + + PGC_CACHE_LINE_PADDING(0); + + struct pgc_index { + netdata_rwlock_t rwlock; + Pvoid_t sections_judy; + } *index; + + PGC_CACHE_LINE_PADDING(1); + + struct { + SPINLOCK spinlock; + size_t per1000; + } usage; + + PGC_CACHE_LINE_PADDING(2); + + struct pgc_linked_list clean; // LRU is applied here to free memory from the cache + + PGC_CACHE_LINE_PADDING(3); + + struct pgc_linked_list dirty; // in the dirty list, pages are ordered the way they were marked dirty + + PGC_CACHE_LINE_PADDING(4); + + struct pgc_linked_list hot; // in the hot list, pages are order the way they were marked hot + + PGC_CACHE_LINE_PADDING(5); + + struct pgc_statistics stats; // statistics + +#ifdef NETDATA_PGC_POINTER_CHECK + PGC_CACHE_LINE_PADDING(6); + netdata_mutex_t global_pointer_registry_mutex; + Pvoid_t global_pointer_registry; +#endif +}; + + + +// ---------------------------------------------------------------------------- +// validate each pointer is indexed once - internal checks only + +static inline void pointer_index_init(PGC *cache __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_init(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +static inline void pointer_destroy_index(PGC *cache __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + JudyHSFreeArray(&cache->global_pointer_registry, PJE0); + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} +static inline void pointer_add(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + Pvoid_t *PValue = JudyHSIns(&cache->global_pointer_registry, &page, sizeof(void *), PJE0); + if(*PValue != NULL) + fatal("pointer already exists in registry"); + *PValue = page; + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +static inline void pointer_check(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + Pvoid_t *PValue = JudyHSGet(cache->global_pointer_registry, &page, sizeof(void *)); + if(PValue == NULL) + fatal("pointer is not found in registry"); + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +static inline void pointer_del(PGC *cache __maybe_unused, PGC_PAGE *page __maybe_unused) { +#ifdef NETDATA_PGC_POINTER_CHECK + netdata_mutex_lock(&cache->global_pointer_registry_mutex); + int ret = JudyHSDel(&cache->global_pointer_registry, &page, sizeof(void *), PJE0); + if(!ret) + fatal("pointer to be deleted does not exist in registry"); + netdata_mutex_unlock(&cache->global_pointer_registry_mutex); +#else + ; +#endif +} + +// ---------------------------------------------------------------------------- +// locking + +static inline size_t pgc_indexing_partition(PGC *cache, Word_t metric_id) { + static __thread Word_t last_metric_id = 0; + static __thread size_t last_partition = 0; + + if(metric_id == last_metric_id || cache->config.partitions == 1) + return last_partition; + + last_metric_id = metric_id; + last_partition = indexing_partition(metric_id, cache->config.partitions); + + return last_partition; +} + +static inline void pgc_index_read_lock(PGC *cache, size_t partition) { + netdata_rwlock_rdlock(&cache->index[partition].rwlock); +} +static inline void pgc_index_read_unlock(PGC *cache, size_t partition) { + netdata_rwlock_unlock(&cache->index[partition].rwlock); +} +//static inline bool pgc_index_write_trylock(PGC *cache, size_t partition) { +// return !netdata_rwlock_trywrlock(&cache->index[partition].rwlock); +//} +static inline void pgc_index_write_lock(PGC *cache, size_t partition) { + netdata_rwlock_wrlock(&cache->index[partition].rwlock); +} +static inline void pgc_index_write_unlock(PGC *cache, size_t partition) { + netdata_rwlock_unlock(&cache->index[partition].rwlock); +} + +static inline bool pgc_ll_trylock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { + return netdata_spinlock_trylock(&ll->spinlock); +} + +static inline void pgc_ll_lock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { + netdata_spinlock_lock(&ll->spinlock); +} + +static inline void pgc_ll_unlock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { + netdata_spinlock_unlock(&ll->spinlock); +} + +static inline bool page_transition_trylock(PGC *cache __maybe_unused, PGC_PAGE *page) { + return netdata_spinlock_trylock(&page->transition_spinlock); +} + +static inline void page_transition_lock(PGC *cache __maybe_unused, PGC_PAGE *page) { + netdata_spinlock_lock(&page->transition_spinlock); +} + +static inline void page_transition_unlock(PGC *cache __maybe_unused, PGC_PAGE *page) { + netdata_spinlock_unlock(&page->transition_spinlock); +} + +// ---------------------------------------------------------------------------- +// evictions control + +static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { + + if(size_to_evict) + netdata_spinlock_lock(&cache->usage.spinlock); + + else if(!netdata_spinlock_trylock(&cache->usage.spinlock)) + return __atomic_load_n(&cache->usage.per1000, __ATOMIC_RELAXED); + + size_t current_cache_size; + size_t wanted_cache_size; + size_t per1000; + + size_t dirty = __atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED); + size_t hot = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED); + + if(cache->config.options & PGC_OPTIONS_AUTOSCALE) { + size_t dirty_max = __atomic_load_n(&cache->dirty.stats->max_size, __ATOMIC_RELAXED); + size_t hot_max = __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED); + + // our promise to users + size_t max_size1 = MAX(hot_max, hot) * 2; + + // protection against slow flushing + size_t max_size2 = hot_max + ((dirty_max < hot_max / 2) ? hot_max / 2 : dirty_max * 2); + + // the final wanted cache size + wanted_cache_size = MIN(max_size1, max_size2); + + if(cache->config.dynamic_target_size_cb) { + size_t wanted_cache_size_cb = cache->config.dynamic_target_size_cb(); + if(wanted_cache_size_cb > wanted_cache_size) + wanted_cache_size = wanted_cache_size_cb; + } + + if (wanted_cache_size < hot + dirty + cache->config.clean_size) + wanted_cache_size = hot + dirty + cache->config.clean_size; + } + else + wanted_cache_size = hot + dirty + cache->config.clean_size; + + // protection again huge queries + // if huge queries are running, or huge amounts need to be saved + // allow the cache to grow more (hot pages in main cache are also referenced) + size_t referenced_size = __atomic_load_n(&cache->stats.referenced_size, __ATOMIC_RELAXED); + if(unlikely(wanted_cache_size < referenced_size * 2 / 3)) + wanted_cache_size = referenced_size * 2 / 3; + + current_cache_size = __atomic_load_n(&cache->stats.size, __ATOMIC_RELAXED); // + pgc_aral_overhead(); + + per1000 = (size_t)((unsigned long long)current_cache_size * 1000ULL / (unsigned long long)wanted_cache_size); + + __atomic_store_n(&cache->usage.per1000, per1000, __ATOMIC_RELAXED); + __atomic_store_n(&cache->stats.wanted_cache_size, wanted_cache_size, __ATOMIC_RELAXED); + __atomic_store_n(&cache->stats.current_cache_size, current_cache_size, __ATOMIC_RELAXED); + + netdata_spinlock_unlock(&cache->usage.spinlock); + + if(size_to_evict) { + size_t target = (size_t)((unsigned long long)wanted_cache_size * (unsigned long long)cache->config.evict_low_threshold_per1000 / 1000ULL); + if(current_cache_size > target) + *size_to_evict = current_cache_size - target; + else + *size_to_evict = 0; + } + + if(per1000 >= cache->config.severe_pressure_per1000) + __atomic_add_fetch(&cache->stats.events_cache_under_severe_pressure, 1, __ATOMIC_RELAXED); + + else if(per1000 >= cache->config.aggressive_evict_per1000) + __atomic_add_fetch(&cache->stats.events_cache_needs_space_aggressively, 1, __ATOMIC_RELAXED); + + return per1000; +} + +static inline bool cache_pressure(PGC *cache, size_t limit) { + return (cache_usage_per1000(cache, NULL) >= limit); +} + +#define cache_under_severe_pressure(cache) cache_pressure(cache, (cache)->config.severe_pressure_per1000) +#define cache_needs_space_aggressively(cache) cache_pressure(cache, (cache)->config.aggressive_evict_per1000) +#define cache_above_healthy_limit(cache) cache_pressure(cache, (cache)->config.healthy_size_per1000) + +typedef bool (*evict_filter)(PGC_PAGE *page, void *data); +static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evict, bool wait, bool all_of_them, evict_filter filter, void *data); +#define evict_pages(cache, max_skip, max_evict, wait, all_of_them) evict_pages_with_filter(cache, max_skip, max_evict, wait, all_of_them, NULL, NULL) + +static inline void evict_on_clean_page_added(PGC *cache __maybe_unused) { + if((cache->config.options & PGC_OPTIONS_EVICT_PAGES_INLINE) || cache_needs_space_aggressively(cache)) { + evict_pages(cache, + cache->config.max_skip_pages_per_inline_eviction, + cache->config.max_pages_per_inline_eviction, + false, false); + } +} + +static inline void evict_on_page_release_when_permitted(PGC *cache __maybe_unused) { + if ((cache->config.options & PGC_OPTIONS_EVICT_PAGES_INLINE) || cache_under_severe_pressure(cache)) { + evict_pages(cache, + cache->config.max_skip_pages_per_inline_eviction, + cache->config.max_pages_per_inline_eviction, + false, false); + } +} + +// ---------------------------------------------------------------------------- +// flushing control + +static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, bool wait, bool all_of_them); + +static inline bool flushing_critical(PGC *cache) { + if(unlikely(__atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED) > __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED))) { + __atomic_add_fetch(&cache->stats.events_flush_critical, 1, __ATOMIC_RELAXED); + return true; + } + + return false; +} + +// ---------------------------------------------------------------------------- +// helpers + +static inline size_t page_assumed_size(PGC *cache, size_t size) { + return size + (sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page + sizeof(Word_t) * 3); +} + +static inline size_t page_size_from_assumed_size(PGC *cache, size_t assumed_size) { + return assumed_size - (sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page + sizeof(Word_t) * 3); +} + +// ---------------------------------------------------------------------------- +// Linked list management + +static inline void atomic_set_max(size_t *max, size_t desired) { + size_t expected; + + expected = __atomic_load_n(max, __ATOMIC_RELAXED); + + do { + + if(expected >= desired) + return; + + } while(!__atomic_compare_exchange_n(max, &expected, desired, + false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +} + +struct section_pages { + SPINLOCK migration_to_v2_spinlock; + size_t entries; + size_t size; + PGC_PAGE *base; +}; + +static ARAL *pgc_section_pages_aral = NULL; +static void pgc_section_pages_static_aral_init(void) { + static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; + + if(unlikely(!pgc_section_pages_aral)) { + netdata_spinlock_lock(&spinlock); + + // we have to check again + if(!pgc_section_pages_aral) + pgc_section_pages_aral = aral_create( + "pgc_section", + sizeof(struct section_pages), + 0, + 65536, NULL, + NULL, NULL, false, false); + + netdata_spinlock_unlock(&spinlock); + } +} + +static inline void pgc_stats_ll_judy_change(PGC *cache, struct pgc_linked_list *ll, size_t mem_before_judyl, size_t mem_after_judyl) { + if(mem_after_judyl > mem_before_judyl) { + __atomic_add_fetch(&ll->stats->size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + } + else if(mem_after_judyl < mem_before_judyl) { + __atomic_sub_fetch(&ll->stats->size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + } +} + +static inline void pgc_stats_index_judy_change(PGC *cache, size_t mem_before_judyl, size_t mem_after_judyl) { + if(mem_after_judyl > mem_before_judyl) { + __atomic_add_fetch(&cache->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + } + else if(mem_after_judyl < mem_before_judyl) { + __atomic_sub_fetch(&cache->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + } +} + +static void pgc_ll_add(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PGC_PAGE *page, bool having_lock) { + if(!having_lock) + pgc_ll_lock(cache, ll); + + internal_fatal(page_get_status_flags(page) != 0, + "DBENGINE CACHE: invalid page flags, the page has %d, but it is should be %d", + page_get_status_flags(page), + 0); + + if(ll->linked_list_in_sections_judy) { + size_t mem_before_judyl, mem_after_judyl; + + mem_before_judyl = JudyLMemUsed(ll->sections_judy); + Pvoid_t *section_pages_pptr = JudyLIns(&ll->sections_judy, page->section, PJE0); + mem_after_judyl = JudyLMemUsed(ll->sections_judy); + + struct section_pages *sp = *section_pages_pptr; + if(!sp) { + // sp = callocz(1, sizeof(struct section_pages)); + sp = aral_mallocz(pgc_section_pages_aral); + memset(sp, 0, sizeof(struct section_pages)); + + *section_pages_pptr = sp; + + mem_after_judyl += sizeof(struct section_pages); + } + pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl); + + sp->entries++; + sp->size += page->assumed_size; + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(sp->base, page, link.prev, link.next); + + if((sp->entries % cache->config.max_dirty_pages_per_call) == 0) + ll->version++; + } + else { + // CLEAN pages end up here. + // - New pages created as CLEAN, always have 1 access. + // - DIRTY pages made CLEAN, depending on their accesses may be appended (accesses > 0) or prepended (accesses = 0). + + if(page->accesses || page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED) { + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next); + page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); + } + else + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ll->base, page, link.prev, link.next); + + ll->version++; + } + + page_flag_set(page, ll->flags); + + if(!having_lock) + pgc_ll_unlock(cache, ll); + + size_t entries = __atomic_add_fetch(&ll->stats->entries, 1, __ATOMIC_RELAXED); + size_t size = __atomic_add_fetch(&ll->stats->size, page->assumed_size, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->added_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->added_size, page->assumed_size, __ATOMIC_RELAXED); + + atomic_set_max(&ll->stats->max_entries, entries); + atomic_set_max(&ll->stats->max_size, size); +} + +static void pgc_ll_del(PGC *cache __maybe_unused, struct pgc_linked_list *ll, PGC_PAGE *page, bool having_lock) { + __atomic_sub_fetch(&ll->stats->entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ll->stats->size, page->assumed_size, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->removed_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ll->stats->removed_size, page->assumed_size, __ATOMIC_RELAXED); + + if(!having_lock) + pgc_ll_lock(cache, ll); + + internal_fatal(page_get_status_flags(page) != ll->flags, + "DBENGINE CACHE: invalid page flags, the page has %d, but it is should be %d", + page_get_status_flags(page), + ll->flags); + + page_flag_clear(page, ll->flags); + + if(ll->linked_list_in_sections_judy) { + Pvoid_t *section_pages_pptr = JudyLGet(ll->sections_judy, page->section, PJE0); + internal_fatal(!section_pages_pptr, "DBENGINE CACHE: page should be in Judy LL, but it is not"); + + struct section_pages *sp = *section_pages_pptr; + sp->entries--; + sp->size -= page->assumed_size; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(sp->base, page, link.prev, link.next); + + if(!sp->base) { + size_t mem_before_judyl, mem_after_judyl; + + mem_before_judyl = JudyLMemUsed(ll->sections_judy); + int rc = JudyLDel(&ll->sections_judy, page->section, PJE0); + mem_after_judyl = JudyLMemUsed(ll->sections_judy); + + if(!rc) + fatal("DBENGINE CACHE: cannot delete section from Judy LL"); + + // freez(sp); + aral_freez(pgc_section_pages_aral, sp); + mem_after_judyl -= sizeof(struct section_pages); + pgc_stats_ll_judy_change(cache, ll, mem_before_judyl, mem_after_judyl); + } + } + else { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ll->base, page, link.prev, link.next); + ll->version++; + } + + if(!having_lock) + pgc_ll_unlock(cache, ll); +} + +static inline void page_has_been_accessed(PGC *cache, PGC_PAGE *page) { + PGC_PAGE_FLAGS flags = page_flag_check(page, PGC_PAGE_CLEAN | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES); + + if (!(flags & PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES)) { + __atomic_add_fetch(&page->accesses, 1, __ATOMIC_RELAXED); + + if (flags & PGC_PAGE_CLEAN) { + if(pgc_ll_trylock(cache, &cache->clean)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + pgc_ll_unlock(cache, &cache->clean); + page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); + } + else + page_flag_set(page, PGC_PAGE_HAS_BEEN_ACCESSED); + } + } +} + + +// ---------------------------------------------------------------------------- +// state transitions + +static inline void page_set_clean(PGC *cache, PGC_PAGE *page, bool having_transition_lock, bool having_clean_lock) { + if(!having_transition_lock) + page_transition_lock(cache, page); + + PGC_PAGE_FLAGS flags = page_get_status_flags(page); + + if(flags & PGC_PAGE_CLEAN) { + if(!having_transition_lock) + page_transition_unlock(cache, page); + return; + } + + if(flags & PGC_PAGE_HOT) + pgc_ll_del(cache, &cache->hot, page, false); + + if(flags & PGC_PAGE_DIRTY) + pgc_ll_del(cache, &cache->dirty, page, false); + + // first add to linked list, the set the flag (required for move_page_last()) + pgc_ll_add(cache, &cache->clean, page, having_clean_lock); + + if(!having_transition_lock) + page_transition_unlock(cache, page); +} + +static inline void page_set_dirty(PGC *cache, PGC_PAGE *page, bool having_hot_lock) { + if(!having_hot_lock) + // to avoid deadlocks, we have to get the hot lock before the page transition + // since this is what all_hot_to_dirty() does + pgc_ll_lock(cache, &cache->hot); + + page_transition_lock(cache, page); + + PGC_PAGE_FLAGS flags = page_get_status_flags(page); + + if(flags & PGC_PAGE_DIRTY) { + page_transition_unlock(cache, page); + + if(!having_hot_lock) + // we don't need the hot lock anymore + pgc_ll_unlock(cache, &cache->hot); + + return; + } + + __atomic_add_fetch(&cache->stats.hot2dirty_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.hot2dirty_size, page->assumed_size, __ATOMIC_RELAXED); + + if(likely(flags & PGC_PAGE_HOT)) + pgc_ll_del(cache, &cache->hot, page, true); + + if(!having_hot_lock) + // we don't need the hot lock anymore + pgc_ll_unlock(cache, &cache->hot); + + if(unlikely(flags & PGC_PAGE_CLEAN)) + pgc_ll_del(cache, &cache->clean, page, false); + + // first add to linked list, the set the flag (required for move_page_last()) + pgc_ll_add(cache, &cache->dirty, page, false); + + __atomic_sub_fetch(&cache->stats.hot2dirty_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.hot2dirty_size, page->assumed_size, __ATOMIC_RELAXED); + + page_transition_unlock(cache, page); +} + +static inline void page_set_hot(PGC *cache, PGC_PAGE *page) { + page_transition_lock(cache, page); + + PGC_PAGE_FLAGS flags = page_get_status_flags(page); + + if(flags & PGC_PAGE_HOT) { + page_transition_unlock(cache, page); + return; + } + + if(flags & PGC_PAGE_DIRTY) + pgc_ll_del(cache, &cache->dirty, page, false); + + if(flags & PGC_PAGE_CLEAN) + pgc_ll_del(cache, &cache->clean, page, false); + + // first add to linked list, the set the flag (required for move_page_last()) + pgc_ll_add(cache, &cache->hot, page, false); + + page_transition_unlock(cache, page); +} + + +// ---------------------------------------------------------------------------- +// Referencing + +static inline size_t PGC_REFERENCED_PAGES(PGC *cache) { + return __atomic_load_n(&cache->stats.referenced_entries, __ATOMIC_RELAXED); +} + +static inline void PGC_REFERENCED_PAGES_PLUS1(PGC *cache, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.referenced_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.referenced_size, page->assumed_size, __ATOMIC_RELAXED); +} + +static inline void PGC_REFERENCED_PAGES_MINUS1(PGC *cache, size_t assumed_size) { + __atomic_sub_fetch(&cache->stats.referenced_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.referenced_size, assumed_size, __ATOMIC_RELAXED); +} + +// If the page is not already acquired, +// YOU HAVE TO HAVE THE QUEUE (hot, dirty, clean) THE PAGE IS IN, L O C K E D ! +// If you don't have it locked, NOTHING PREVENTS THIS PAGE FOR VANISHING WHILE THIS IS CALLED! +static inline bool page_acquire(PGC *cache, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.acquires, 1, __ATOMIC_RELAXED); + + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + size_t spins = 0; + + do { + spins++; + + if(unlikely(expected < 0)) + return false; + + desired = expected + 1; + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)); + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.acquire_spins, spins - 1, __ATOMIC_RELAXED); + + if(desired == 1) + PGC_REFERENCED_PAGES_PLUS1(cache, page); + + return true; +} + +static inline void page_release(PGC *cache, PGC_PAGE *page, bool evict_if_necessary) { + __atomic_add_fetch(&cache->stats.releases, 1, __ATOMIC_RELAXED); + + size_t assumed_size = page->assumed_size; // take the size before we release it + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + + size_t spins = 0; + do { + spins++; + + internal_fatal(expected <= 0, + "DBENGINE CACHE: trying to release a page with reference counter %d", expected); + + desired = expected - 1; + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)); + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.release_spins, spins - 1, __ATOMIC_RELAXED); + + if(desired == 0) { + PGC_REFERENCED_PAGES_MINUS1(cache, assumed_size); + + if(evict_if_necessary) + evict_on_page_release_when_permitted(cache); + } +} + +static inline bool non_acquired_page_get_for_deletion___while_having_clean_locked(PGC *cache __maybe_unused, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.acquires_for_deletion, 1, __ATOMIC_RELAXED); + + internal_fatal(!is_page_clean(page), + "DBENGINE CACHE: only clean pages can be deleted"); + + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + size_t spins = 0; + bool delete_it; + + do { + spins++; + + if (expected == 0) { + desired = REFCOUNT_DELETING; + delete_it = true; + } + else { + delete_it = false; + break; + } + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)); + + if(delete_it) { + // we can delete this page + internal_fatal(page_flag_check(page, PGC_PAGE_IS_BEING_DELETED), + "DBENGINE CACHE: page is already being deleted"); + + page_flag_set(page, PGC_PAGE_IS_BEING_DELETED); + } + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.delete_spins, spins - 1, __ATOMIC_RELAXED); + + return delete_it; +} + +static inline bool acquired_page_get_for_deletion_or_release_it(PGC *cache __maybe_unused, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.acquires_for_deletion, 1, __ATOMIC_RELAXED); + + size_t assumed_size = page->assumed_size; // take the size before we release it + + REFCOUNT expected, desired; + + expected = __atomic_load_n(&page->refcount, __ATOMIC_RELAXED); + size_t spins = 0; + bool delete_it; + + do { + spins++; + + internal_fatal(expected < 1, + "DBENGINE CACHE: page to be deleted should be acquired by the caller."); + + if (expected == 1) { + // we are the only one having this page referenced + desired = REFCOUNT_DELETING; + delete_it = true; + } + else { + // this page cannot be deleted + desired = expected - 1; + delete_it = false; + } + + } while(!__atomic_compare_exchange_n(&page->refcount, &expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)); + + if(delete_it) { + PGC_REFERENCED_PAGES_MINUS1(cache, assumed_size); + + // we can delete this page + internal_fatal(page_flag_check(page, PGC_PAGE_IS_BEING_DELETED), + "DBENGINE CACHE: page is already being deleted"); + + page_flag_set(page, PGC_PAGE_IS_BEING_DELETED); + } + + if(unlikely(spins > 1)) + __atomic_add_fetch(&cache->stats.delete_spins, spins - 1, __ATOMIC_RELAXED); + + return delete_it; +} + + +// ---------------------------------------------------------------------------- +// Indexing + +static inline void free_this_page(PGC *cache, PGC_PAGE *page, size_t partition __maybe_unused) { + // call the callback to free the user supplied memory + cache->config.pgc_free_clean_cb(cache, (PGC_ENTRY){ + .section = page->section, + .metric_id = page->metric_id, + .start_time_s = page->start_time_s, + .end_time_s = __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), + .update_every_s = page->update_every_s, + .size = page_size_from_assumed_size(cache, page->assumed_size), + .hot = (is_page_hot(page)) ? true : false, + .data = page->data, + .custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL, + }); + + // update statistics + __atomic_add_fetch(&cache->stats.removed_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.removed_size, page->assumed_size, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&cache->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.size, page->assumed_size, __ATOMIC_RELAXED); + + // free our memory +#ifdef PGC_WITH_ARAL + aral_freez(cache->aral[partition], page); +#else + freez(page); +#endif +} + +static void remove_this_page_from_index_unsafe(PGC *cache, PGC_PAGE *page, size_t partition) { + // remove it from the Judy arrays + + pointer_check(cache, page); + + internal_fatal(page_flag_check(page, PGC_PAGE_HOT | PGC_PAGE_DIRTY | PGC_PAGE_CLEAN), + "DBENGINE CACHE: page to be removed from the cache is still in the linked-list"); + + internal_fatal(!page_flag_check(page, PGC_PAGE_IS_BEING_DELETED), + "DBENGINE CACHE: page to be removed from the index, is not marked for deletion"); + + internal_fatal(partition != pgc_indexing_partition(cache, page->metric_id), + "DBENGINE CACHE: attempted to remove this page from the wrong partition of the cache"); + + Pvoid_t *metrics_judy_pptr = JudyLGet(cache->index[partition].sections_judy, page->section, PJE0); + if(unlikely(!metrics_judy_pptr)) + fatal("DBENGINE CACHE: section '%lu' should exist, but it does not.", page->section); + + Pvoid_t *pages_judy_pptr = JudyLGet(*metrics_judy_pptr, page->metric_id, PJE0); + if(unlikely(!pages_judy_pptr)) + fatal("DBENGINE CACHE: metric '%lu' in section '%lu' should exist, but it does not.", + page->metric_id, page->section); + + Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, page->start_time_s, PJE0); + if(unlikely(!page_ptr)) + fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' should exist, but it does not.", + page->start_time_s, page->metric_id, page->section); + + PGC_PAGE *found_page = *page_ptr; + if(unlikely(found_page != page)) + fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' should exist, but the index returned a different address.", + page->start_time_s, page->metric_id, page->section); + + size_t mem_before_judyl = 0, mem_after_judyl = 0; + + mem_before_judyl += JudyLMemUsed(*pages_judy_pptr); + if(unlikely(!JudyLDel(pages_judy_pptr, page->start_time_s, PJE0))) + fatal("DBENGINE CACHE: page with start time '%ld' of metric '%lu' in section '%lu' exists, but cannot be deleted.", + page->start_time_s, page->metric_id, page->section); + mem_after_judyl += JudyLMemUsed(*pages_judy_pptr); + + mem_before_judyl += JudyLMemUsed(*metrics_judy_pptr); + if(!*pages_judy_pptr && !JudyLDel(metrics_judy_pptr, page->metric_id, PJE0)) + fatal("DBENGINE CACHE: metric '%lu' in section '%lu' exists and is empty, but cannot be deleted.", + page->metric_id, page->section); + mem_after_judyl += JudyLMemUsed(*metrics_judy_pptr); + + mem_before_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + if(!*metrics_judy_pptr && !JudyLDel(&cache->index[partition].sections_judy, page->section, PJE0)) + fatal("DBENGINE CACHE: section '%lu' exists and is empty, but cannot be deleted.", page->section); + mem_after_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + + pgc_stats_index_judy_change(cache, mem_before_judyl, mem_after_judyl); + + pointer_del(cache, page); +} + +static inline void remove_and_free_page_not_in_any_queue_and_acquired_for_deletion(PGC *cache, PGC_PAGE *page) { + size_t partition = pgc_indexing_partition(cache, page->metric_id); + pgc_index_write_lock(cache, partition); + remove_this_page_from_index_unsafe(cache, page, partition); + pgc_index_write_unlock(cache, partition); + free_this_page(cache, page, partition); +} + +static inline bool make_acquired_page_clean_and_evict_or_page_release(PGC *cache, PGC_PAGE *page) { + pointer_check(cache, page); + + page_transition_lock(cache, page); + pgc_ll_lock(cache, &cache->clean); + + // make it clean - it does not have any accesses, so it will be prepended + page_set_clean(cache, page, true, true); + + if(!acquired_page_get_for_deletion_or_release_it(cache, page)) { + pgc_ll_unlock(cache, &cache->clean); + page_transition_unlock(cache, page); + return false; + } + + // remove it from the linked list + pgc_ll_del(cache, &cache->clean, page, true); + pgc_ll_unlock(cache, &cache->clean); + page_transition_unlock(cache, page); + + remove_and_free_page_not_in_any_queue_and_acquired_for_deletion(cache, page); + + return true; +} + +// returns true, when there is more work to do +static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evict, bool wait, bool all_of_them, evict_filter filter, void *data) { + size_t per1000 = cache_usage_per1000(cache, NULL); + + if(!all_of_them && per1000 < cache->config.healthy_size_per1000) + // don't bother - not enough to do anything + return false; + + size_t workers_running = __atomic_add_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED); + if(!wait && !all_of_them && workers_running > cache->config.max_workers_evict_inline && per1000 < cache->config.severe_pressure_per1000) { + __atomic_sub_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED); + return false; + } + + internal_fatal(cache->clean.linked_list_in_sections_judy, + "wrong clean pages configuration - clean pages need to have a linked list, not a judy array"); + + if(unlikely(!max_skip)) + max_skip = SIZE_MAX; + else if(unlikely(max_skip < 2)) + max_skip = 2; + + if(unlikely(!max_evict)) + max_evict = SIZE_MAX; + else if(unlikely(max_evict < 2)) + max_evict = 2; + + size_t total_pages_evicted = 0; + size_t total_pages_skipped = 0; + bool stopped_before_finishing = false; + size_t spins = 0; + + do { + if(++spins > 1) + __atomic_add_fetch(&cache->stats.evict_spins, 1, __ATOMIC_RELAXED); + + bool batch; + size_t max_size_to_evict = 0; + if (unlikely(all_of_them)) { + max_size_to_evict = SIZE_MAX; + batch = true; + } + else if(unlikely(wait)) { + per1000 = cache_usage_per1000(cache, &max_size_to_evict); + batch = (wait && per1000 > cache->config.severe_pressure_per1000) ? true : false; + } + else { + batch = false; + max_size_to_evict = (cache_above_healthy_limit(cache)) ? 1 : 0; + } + + if (!max_size_to_evict) + break; + + // check if we have to stop + if(total_pages_evicted >= max_evict && !all_of_them) { + stopped_before_finishing = true; + break; + } + + if(!all_of_them && !wait) { + if(!pgc_ll_trylock(cache, &cache->clean)) { + stopped_before_finishing = true; + goto premature_exit; + } + + // at this point we have the clean lock + } + else + pgc_ll_lock(cache, &cache->clean); + + // find a page to evict + PGC_PAGE *pages_to_evict = NULL; + size_t pages_to_evict_size = 0; + for(PGC_PAGE *page = cache->clean.base, *next = NULL, *first_page_we_relocated = NULL; page ; page = next) { + next = page->link.next; + + if(unlikely(page == first_page_we_relocated)) + // we did a complete loop on all pages + break; + + if(unlikely(page_flag_check(page, PGC_PAGE_HAS_BEEN_ACCESSED | PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES) == PGC_PAGE_HAS_BEEN_ACCESSED)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + page_flag_clear(page, PGC_PAGE_HAS_BEEN_ACCESSED); + continue; + } + + if(unlikely(filter && !filter(page, data))) + continue; + + if(non_acquired_page_get_for_deletion___while_having_clean_locked(cache, page)) { + // we can delete this page + + // remove it from the clean list + pgc_ll_del(cache, &cache->clean, page, true); + + __atomic_add_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.evicting_size, page->assumed_size, __ATOMIC_RELAXED); + + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next); + + pages_to_evict_size += page->assumed_size; + + if(unlikely(all_of_them || (batch && pages_to_evict_size < max_size_to_evict))) + // get more pages + ; + else + // one page at a time + break; + } + else { + // we can't delete this page + + if(!first_page_we_relocated) + first_page_we_relocated = page; + + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(cache->clean.base, page, link.prev, link.next); + + // check if we have to stop + if(unlikely(++total_pages_skipped >= max_skip && !all_of_them)) { + stopped_before_finishing = true; + break; + } + } + } + pgc_ll_unlock(cache, &cache->clean); + + if(likely(pages_to_evict)) { + // remove them from the index + + if(unlikely(pages_to_evict->link.next)) { + // we have many pages, let's minimize the index locks we are going to get + + PGC_PAGE *pages_per_partition[cache->config.partitions]; + memset(pages_per_partition, 0, sizeof(PGC_PAGE *) * cache->config.partitions); + + // sort them by partition + for (PGC_PAGE *page = pages_to_evict, *next = NULL; page; page = next) { + next = page->link.next; + + size_t partition = pgc_indexing_partition(cache, page->metric_id); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(pages_to_evict, page, link.prev, link.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pages_per_partition[partition], page, link.prev, link.next); + } + + // remove them from the index + for (size_t partition = 0; partition < cache->config.partitions; partition++) { + if (!pages_per_partition[partition]) continue; + + pgc_index_write_lock(cache, partition); + + for (PGC_PAGE *page = pages_per_partition[partition]; page; page = page->link.next) + remove_this_page_from_index_unsafe(cache, page, partition); + + pgc_index_write_unlock(cache, partition); + } + + // free them + for (size_t partition = 0; partition < cache->config.partitions; partition++) { + if (!pages_per_partition[partition]) continue; + + for (PGC_PAGE *page = pages_per_partition[partition], *next = NULL; page; page = next) { + next = page->link.next; + + size_t page_size = page->assumed_size; + free_this_page(cache, page, partition); + + __atomic_sub_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.evicting_size, page_size, __ATOMIC_RELAXED); + + total_pages_evicted++; + } + } + } + else { + // just one page to be evicted + PGC_PAGE *page = pages_to_evict; + + size_t page_size = page->assumed_size; + + size_t partition = pgc_indexing_partition(cache, page->metric_id); + pgc_index_write_lock(cache, partition); + remove_this_page_from_index_unsafe(cache, page, partition); + pgc_index_write_unlock(cache, partition); + free_this_page(cache, page, partition); + + __atomic_sub_fetch(&cache->stats.evicting_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.evicting_size, page_size, __ATOMIC_RELAXED); + + total_pages_evicted++; + } + } + else + break; + + } while(all_of_them || (total_pages_evicted < max_evict && total_pages_skipped < max_skip)); + + if(all_of_them && !filter) { + pgc_ll_lock(cache, &cache->clean); + if(cache->clean.stats->entries) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE CACHE: cannot free all clean pages, %zu are still in the clean queue", + cache->clean.stats->entries); + } + pgc_ll_unlock(cache, &cache->clean); + } + +premature_exit: + if(unlikely(total_pages_skipped)) + __atomic_add_fetch(&cache->stats.evict_skipped, total_pages_skipped, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&cache->stats.workers_evict, 1, __ATOMIC_RELAXED); + + return stopped_before_finishing; +} + +static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { + __atomic_add_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED); + + size_t partition = pgc_indexing_partition(cache, entry->metric_id); + +#ifdef PGC_WITH_ARAL + PGC_PAGE *allocation = aral_mallocz(cache->aral[partition]); +#endif + PGC_PAGE *page; + size_t spins = 0; + + do { + if(++spins > 1) + __atomic_add_fetch(&cache->stats.insert_spins, 1, __ATOMIC_RELAXED); + + pgc_index_write_lock(cache, partition); + + size_t mem_before_judyl = 0, mem_after_judyl = 0; + + mem_before_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + Pvoid_t *metrics_judy_pptr = JudyLIns(&cache->index[partition].sections_judy, entry->section, PJE0); + if(unlikely(!metrics_judy_pptr || metrics_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted sections judy array"); + mem_after_judyl += JudyLMemUsed(cache->index[partition].sections_judy); + + mem_before_judyl += JudyLMemUsed(*metrics_judy_pptr); + Pvoid_t *pages_judy_pptr = JudyLIns(metrics_judy_pptr, entry->metric_id, PJE0); + if(unlikely(!pages_judy_pptr || pages_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted pages judy array"); + mem_after_judyl += JudyLMemUsed(*metrics_judy_pptr); + + mem_before_judyl += JudyLMemUsed(*pages_judy_pptr); + Pvoid_t *page_ptr = JudyLIns(pages_judy_pptr, entry->start_time_s, PJE0); + if(unlikely(!page_ptr || page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in judy array"); + mem_after_judyl += JudyLMemUsed(*pages_judy_pptr); + + pgc_stats_index_judy_change(cache, mem_before_judyl, mem_after_judyl); + + page = *page_ptr; + + if (likely(!page)) { +#ifdef PGC_WITH_ARAL + page = allocation; + allocation = NULL; +#else + page = mallocz(sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page); +#endif + page->refcount = 1; + page->accesses = (entry->hot) ? 0 : 1; + page->flags = 0; + page->section = entry->section; + page->metric_id = entry->metric_id; + page->start_time_s = entry->start_time_s; + page->end_time_s = entry->end_time_s, + page->update_every_s = entry->update_every_s, + page->data = entry->data; + page->assumed_size = page_assumed_size(cache, entry->size); + netdata_spinlock_init(&page->transition_spinlock); + page->link.prev = NULL; + page->link.next = NULL; + + if(cache->config.additional_bytes_per_page) { + if(entry->custom_data) + memcpy(page->custom_data, entry->custom_data, cache->config.additional_bytes_per_page); + else + memset(page->custom_data, 0, cache->config.additional_bytes_per_page); + } + + // put it in the index + *page_ptr = page; + pointer_add(cache, page); + pgc_index_write_unlock(cache, partition); + + if (entry->hot) + page_set_hot(cache, page); + else + page_set_clean(cache, page, false, false); + + PGC_REFERENCED_PAGES_PLUS1(cache, page); + + // update statistics + __atomic_add_fetch(&cache->stats.added_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.added_size, page->assumed_size, __ATOMIC_RELAXED); + + __atomic_add_fetch(&cache->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.size, page->assumed_size, __ATOMIC_RELAXED); + + if(added) + *added = true; + } + else { + if (!page_acquire(cache, page)) + page = NULL; + + else if(added) + *added = false; + + pgc_index_write_unlock(cache, partition); + + if(unlikely(!page)) { + // now that we don't have the lock, + // give it some time for the old page to go away + struct timespec ns = { .tv_sec = 0, .tv_nsec = 1 }; + nanosleep(&ns, NULL); + } + } + + } while(!page); + +#ifdef PGC_WITH_ARAL + if(allocation) + aral_freez(cache->aral[partition], allocation); +#endif + + __atomic_sub_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED); + + if(!entry->hot) + evict_on_clean_page_added(cache); + + if((cache->config.options & PGC_OPTIONS_FLUSH_PAGES_INLINE) || flushing_critical(cache)) { + flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL, + false, false); + } + + return page; +} + +static PGC_PAGE *page_find_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method) { + __atomic_add_fetch(&cache->stats.workers_search, 1, __ATOMIC_RELAXED); + + size_t *stats_hit_ptr, *stats_miss_ptr; + + if(method == PGC_SEARCH_CLOSEST) { + __atomic_add_fetch(&cache->stats.searches_closest, 1, __ATOMIC_RELAXED); + stats_hit_ptr = &cache->stats.searches_closest_hits; + stats_miss_ptr = &cache->stats.searches_closest_misses; + } + else { + __atomic_add_fetch(&cache->stats.searches_exact, 1, __ATOMIC_RELAXED); + stats_hit_ptr = &cache->stats.searches_exact_hits; + stats_miss_ptr = &cache->stats.searches_exact_misses; + } + + PGC_PAGE *page = NULL; + size_t partition = pgc_indexing_partition(cache, metric_id); + + pgc_index_read_lock(cache, partition); + + Pvoid_t *metrics_judy_pptr = JudyLGet(cache->index[partition].sections_judy, section, PJE0); + if(unlikely(metrics_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted sections judy array"); + + if(unlikely(!metrics_judy_pptr)) { + // section does not exist + goto cleanup; + } + + Pvoid_t *pages_judy_pptr = JudyLGet(*metrics_judy_pptr, metric_id, PJE0); + if(unlikely(pages_judy_pptr == PJERR)) + fatal("DBENGINE CACHE: corrupted pages judy array"); + + if(unlikely(!pages_judy_pptr)) { + // metric does not exist + goto cleanup; + } + + switch(method) { + default: + case PGC_SEARCH_CLOSEST: { + Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, start_time_s, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + + else { + Word_t time = start_time_s; + + // find the previous page + page_ptr = JudyLLast(*pages_judy_pptr, &time, PJE0); + if(unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array #2"); + + if(page_ptr) { + // found a page starting before our timestamp + // check if our timestamp is included + page = *page_ptr; + if(start_time_s > page->end_time_s) + // it is not good for us + page = NULL; + } + + if(!page) { + // find the next page then... + time = start_time_s; + page_ptr = JudyLNext(*pages_judy_pptr, &time, PJE0); + if(page_ptr) + page = *page_ptr; + } + } + } + break; + + case PGC_SEARCH_EXACT: { + Pvoid_t *page_ptr = JudyLGet(*pages_judy_pptr, start_time_s, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_FIRST: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLFirst(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_NEXT: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLNext(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_LAST: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLLast(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + + case PGC_SEARCH_PREV: { + Word_t time = start_time_s; + Pvoid_t *page_ptr = JudyLPrev(*pages_judy_pptr, &time, PJE0); + if (unlikely(page_ptr == PJERR)) + fatal("DBENGINE CACHE: corrupted page in pages judy array"); + + if (page_ptr) + page = *page_ptr; + } + break; + } + + if(page) { + pointer_check(cache, page); + + if(!page_acquire(cache, page)) { + // this page is not good to use + page = NULL; + } + } + +cleanup: + pgc_index_read_unlock(cache, partition); + + if(page) { + __atomic_add_fetch(stats_hit_ptr, 1, __ATOMIC_RELAXED); + page_has_been_accessed(cache, page); + } + else + __atomic_add_fetch(stats_miss_ptr, 1, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&cache->stats.workers_search, 1, __ATOMIC_RELAXED); + + return page; +} + +static void all_hot_pages_to_dirty(PGC *cache, Word_t section) { + pgc_ll_lock(cache, &cache->hot); + + bool first = true; + Word_t last_section = (section == PGC_SECTION_ALL) ? 0 : section; + Pvoid_t *section_pages_pptr; + while ((section_pages_pptr = JudyLFirstThenNext(cache->hot.sections_judy, &last_section, &first))) { + if(section != PGC_SECTION_ALL && last_section != section) + break; + + struct section_pages *sp = *section_pages_pptr; + + PGC_PAGE *page = sp->base; + while(page) { + PGC_PAGE *next = page->link.next; + + if(page_acquire(cache, page)) { + page_set_dirty(cache, page, true); + page_release(cache, page, false); + // page ptr may be invalid now + } + + page = next; + } + } + pgc_ll_unlock(cache, &cache->hot); +} + +// returns true when there is more work to do +static bool flush_pages(PGC *cache, size_t max_flushes, Word_t section, bool wait, bool all_of_them) { + internal_fatal(!cache->dirty.linked_list_in_sections_judy, + "wrong dirty pages configuration - dirty pages need to have a judy array, not a linked list"); + + if(!all_of_them && !wait) { + // we have been called from a data collection thread + // let's not waste its time... + + if(!pgc_ll_trylock(cache, &cache->dirty)) { + // we would block, so give up... + return true; + } + + // we got the lock at this point + } + else + pgc_ll_lock(cache, &cache->dirty); + + size_t optimal_flush_size = cache->config.max_dirty_pages_per_call; + size_t dirty_version_at_entry = cache->dirty.version; + if(!all_of_them && (cache->dirty.stats->entries < optimal_flush_size || cache->dirty.last_version_checked == dirty_version_at_entry)) { + pgc_ll_unlock(cache, &cache->dirty); + return false; + } + + __atomic_add_fetch(&cache->stats.workers_flush, 1, __ATOMIC_RELAXED); + + bool have_dirty_lock = true; + + if(all_of_them || !max_flushes) + max_flushes = SIZE_MAX; + + Word_t last_section = (section == PGC_SECTION_ALL) ? 0 : section; + size_t flushes_so_far = 0; + Pvoid_t *section_pages_pptr; + bool stopped_before_finishing = false; + size_t spins = 0; + bool first = true; + + while (have_dirty_lock && (section_pages_pptr = JudyLFirstThenNext(cache->dirty.sections_judy, &last_section, &first))) { + if(section != PGC_SECTION_ALL && last_section != section) + break; + + struct section_pages *sp = *section_pages_pptr; + if(!all_of_them && sp->entries < optimal_flush_size) + continue; + + if(!all_of_them && flushes_so_far > max_flushes) { + stopped_before_finishing = true; + break; + } + + if(++spins > 1) + __atomic_add_fetch(&cache->stats.flush_spins, 1, __ATOMIC_RELAXED); + + PGC_ENTRY array[optimal_flush_size]; + PGC_PAGE *pages[optimal_flush_size]; + size_t pages_added = 0, pages_added_size = 0; + size_t pages_removed_dirty = 0, pages_removed_dirty_size = 0; + size_t pages_cancelled = 0, pages_cancelled_size = 0; + size_t pages_made_clean = 0, pages_made_clean_size = 0; + + PGC_PAGE *page = sp->base; + while (page && pages_added < optimal_flush_size) { + PGC_PAGE *next = page->link.next; + + internal_fatal(page_get_status_flags(page) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + if (page_acquire(cache, page)) { + internal_fatal(page_get_status_flags(page) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + internal_fatal(page->section != last_section, + "DBENGINE CACHE: dirty page is not in the right section (tier)"); + + if(!page_transition_trylock(cache, page)) { + page_release(cache, page, false); + // page ptr may be invalid now + } + else { + pages[pages_added] = page; + array[pages_added] = (PGC_ENTRY) { + .section = page->section, + .metric_id = page->metric_id, + .start_time_s = page->start_time_s, + .end_time_s = __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), + .update_every_s = page->update_every_s, + .size = page_size_from_assumed_size(cache, page->assumed_size), + .data = page->data, + .custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL, + .hot = false, + }; + + pages_added_size += page->assumed_size; + pages_added++; + } + } + + page = next; + } + + // do we have enough to save? + if(all_of_them || pages_added == optimal_flush_size) { + // we should do it + + for (size_t i = 0; i < pages_added; i++) { + PGC_PAGE *tpg = pages[i]; + + internal_fatal(page_get_status_flags(tpg) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + __atomic_add_fetch(&cache->stats.flushing_entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.flushing_size, tpg->assumed_size, __ATOMIC_RELAXED); + + // remove it from the dirty list + pgc_ll_del(cache, &cache->dirty, tpg, true); + + pages_removed_dirty_size += tpg->assumed_size; + pages_removed_dirty++; + } + + // next time, repeat the same section (tier) + first = true; + } + else { + // we can't do it + + for (size_t i = 0; i < pages_added; i++) { + PGC_PAGE *tpg = pages[i]; + + internal_fatal(page_get_status_flags(tpg) != PGC_PAGE_DIRTY, + "DBENGINE CACHE: page should be in the dirty list before saved"); + + pages_cancelled_size += tpg->assumed_size; + pages_cancelled++; + + page_transition_unlock(cache, tpg); + page_release(cache, tpg, false); + // page ptr may be invalid now + } + + __atomic_add_fetch(&cache->stats.flushes_cancelled, pages_cancelled, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.flushes_cancelled_size, pages_cancelled_size, __ATOMIC_RELAXED); + + internal_fatal(pages_added != pages_cancelled || pages_added_size != pages_cancelled_size, + "DBENGINE CACHE: flushing cancel pages mismatch"); + + // next time, continue to the next section (tier) + first = false; + continue; + } + + if(cache->config.pgc_save_init_cb) + cache->config.pgc_save_init_cb(cache, last_section); + + pgc_ll_unlock(cache, &cache->dirty); + have_dirty_lock = false; + + // call the callback to save them + // it may take some time, so let's release the lock + cache->config.pgc_save_dirty_cb(cache, array, pages, pages_added); + flushes_so_far++; + + __atomic_add_fetch(&cache->stats.flushes_completed, pages_added, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.flushes_completed_size, pages_added_size, __ATOMIC_RELAXED); + + size_t pages_to_evict = 0; (void)pages_to_evict; + for (size_t i = 0; i < pages_added; i++) { + PGC_PAGE *tpg = pages[i]; + + internal_fatal(page_get_status_flags(tpg) != 0, + "DBENGINE CACHE: page should not be in any list while it is being saved"); + + __atomic_sub_fetch(&cache->stats.flushing_entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&cache->stats.flushing_size, tpg->assumed_size, __ATOMIC_RELAXED); + + pages_made_clean_size += tpg->assumed_size; + pages_made_clean++; + + if(!tpg->accesses) + pages_to_evict++; + + page_set_clean(cache, tpg, true, false); + page_transition_unlock(cache, tpg); + page_release(cache, tpg, false); + // tpg ptr may be invalid now + } + + internal_fatal(pages_added != pages_made_clean || pages_added != pages_removed_dirty || + pages_added_size != pages_made_clean_size || pages_added_size != pages_removed_dirty_size + , "DBENGINE CACHE: flushing pages mismatch"); + + if(!all_of_them && !wait) { + if(pgc_ll_trylock(cache, &cache->dirty)) + have_dirty_lock = true; + + else { + stopped_before_finishing = true; + have_dirty_lock = false; + } + } + else { + pgc_ll_lock(cache, &cache->dirty); + have_dirty_lock = true; + } + } + + if(have_dirty_lock) { + if(!stopped_before_finishing && dirty_version_at_entry > cache->dirty.last_version_checked) + cache->dirty.last_version_checked = dirty_version_at_entry; + + pgc_ll_unlock(cache, &cache->dirty); + } + + __atomic_sub_fetch(&cache->stats.workers_flush, 1, __ATOMIC_RELAXED); + + return stopped_before_finishing; +} + +void free_all_unreferenced_clean_pages(PGC *cache) { + evict_pages(cache, 0, 0, true, true); +} + +// ---------------------------------------------------------------------------- +// public API + +PGC *pgc_create(const char *name, + size_t clean_size_bytes, free_clean_page_callback pgc_free_cb, + size_t max_dirty_pages_per_flush, + save_dirty_init_callback pgc_save_init_cb, + save_dirty_page_callback pgc_save_dirty_cb, + size_t max_pages_per_inline_eviction, size_t max_inline_evictors, + size_t max_skip_pages_per_inline_eviction, + size_t max_flushes_inline, + PGC_OPTIONS options, size_t partitions, size_t additional_bytes_per_page) { + + if(max_pages_per_inline_eviction < 2) + max_pages_per_inline_eviction = 2; + + if(max_dirty_pages_per_flush < 1) + max_dirty_pages_per_flush = 1; + + if(max_flushes_inline * max_dirty_pages_per_flush < 2) + max_flushes_inline = 2; + + PGC *cache = callocz(1, sizeof(PGC)); + strncpyz(cache->config.name, name, PGC_NAME_MAX); + cache->config.options = options; + cache->config.clean_size = (clean_size_bytes < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : clean_size_bytes; + cache->config.pgc_free_clean_cb = pgc_free_cb; + cache->config.max_dirty_pages_per_call = max_dirty_pages_per_flush; + cache->config.pgc_save_init_cb = pgc_save_init_cb; + cache->config.pgc_save_dirty_cb = pgc_save_dirty_cb; + cache->config.max_pages_per_inline_eviction = (max_pages_per_inline_eviction < 2) ? 2 : max_pages_per_inline_eviction; + cache->config.max_skip_pages_per_inline_eviction = (max_skip_pages_per_inline_eviction < 2) ? 2 : max_skip_pages_per_inline_eviction; + cache->config.max_flushes_inline = (max_flushes_inline < 1) ? 1 : max_flushes_inline; + cache->config.partitions = partitions < 1 ? (size_t)get_netdata_cpus() : partitions; + cache->config.additional_bytes_per_page = additional_bytes_per_page; + + cache->config.max_workers_evict_inline = max_inline_evictors; + cache->config.severe_pressure_per1000 = 1010; + cache->config.aggressive_evict_per1000 = 990; + cache->config.healthy_size_per1000 = 980; + cache->config.evict_low_threshold_per1000 = 970; + + cache->index = callocz(cache->config.partitions, sizeof(struct pgc_index)); + + for(size_t part = 0; part < cache->config.partitions ; part++) + netdata_rwlock_init(&cache->index[part].rwlock); + + netdata_spinlock_init(&cache->hot.spinlock); + netdata_spinlock_init(&cache->dirty.spinlock); + netdata_spinlock_init(&cache->clean.spinlock); + + cache->hot.flags = PGC_PAGE_HOT; + cache->hot.linked_list_in_sections_judy = true; + cache->hot.stats = &cache->stats.queues.hot; + + cache->dirty.flags = PGC_PAGE_DIRTY; + cache->dirty.linked_list_in_sections_judy = true; + cache->dirty.stats = &cache->stats.queues.dirty; + + cache->clean.flags = PGC_PAGE_CLEAN; + cache->clean.linked_list_in_sections_judy = false; + cache->clean.stats = &cache->stats.queues.clean; + + pgc_section_pages_static_aral_init(); + +#ifdef PGC_WITH_ARAL + cache->aral = callocz(cache->config.partitions, sizeof(ARAL *)); + for(size_t part = 0; part < cache->config.partitions ; part++) { + char buf[100 +1]; + snprintfz(buf, 100, "%s[%zu]", name, part); + cache->aral[part] = aral_create( + buf, + sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page, + 0, + 16384, + aral_statistics(pgc_section_pages_aral), + NULL, NULL, false, false); + } +#endif + + pointer_index_init(cache); + + return cache; +} + +struct aral_statistics *pgc_aral_statistics(void) { + return aral_statistics(pgc_section_pages_aral); +} + +size_t pgc_aral_structures(void) { + return aral_structures(pgc_section_pages_aral); +} + +size_t pgc_aral_overhead(void) { + return aral_overhead(pgc_section_pages_aral); +} + +void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section) { + all_hot_pages_to_dirty(cache, section); + + // save all dirty pages to make them clean + flush_pages(cache, 0, section, true, true); +} + +void pgc_destroy(PGC *cache) { + // convert all hot pages to dirty + all_hot_pages_to_dirty(cache, PGC_SECTION_ALL); + + // save all dirty pages to make them clean + flush_pages(cache, 0, PGC_SECTION_ALL, true, true); + + // free all unreferenced clean pages + free_all_unreferenced_clean_pages(cache); + + if(PGC_REFERENCED_PAGES(cache)) + error("DBENGINE CACHE: there are %zu referenced cache pages - leaving the cache allocated", PGC_REFERENCED_PAGES(cache)); + else { + pointer_destroy_index(cache); + + for(size_t part = 0; part < cache->config.partitions ; part++) + netdata_rwlock_destroy(&cache->index[part].rwlock); + +#ifdef PGC_WITH_ARAL + for(size_t part = 0; part < cache->config.partitions ; part++) + aral_destroy(cache->aral[part]); + + freez(cache->aral); +#endif + + freez(cache); + } +} + +PGC_PAGE *pgc_page_add_and_acquire(PGC *cache, PGC_ENTRY entry, bool *added) { + return page_add(cache, &entry, added); +} + +PGC_PAGE *pgc_page_dup(PGC *cache, PGC_PAGE *page) { + if(!page_acquire(cache, page)) + fatal("DBENGINE CACHE: tried to dup a page that is not acquired!"); + + return page; +} + +void pgc_page_release(PGC *cache, PGC_PAGE *page) { + page_release(cache, page, is_page_clean(page)); +} + +void pgc_page_hot_to_dirty_and_release(PGC *cache, PGC_PAGE *page) { + __atomic_add_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + +//#ifdef NETDATA_INTERNAL_CHECKS +// page_transition_lock(cache, page); +// internal_fatal(!is_page_hot(page), "DBENGINE CACHE: called %s() but page is not hot", __FUNCTION__ ); +// page_transition_unlock(cache, page); +//#endif + + // make page dirty + page_set_dirty(cache, page, false); + + // release the page + page_release(cache, page, true); + // page ptr may be invalid now + + __atomic_sub_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + + // flush, if we have to + if((cache->config.options & PGC_OPTIONS_FLUSH_PAGES_INLINE) || flushing_critical(cache)) { + flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL, + false, false); + } +} + +bool pgc_page_to_clean_evict_or_release(PGC *cache, PGC_PAGE *page) { + bool ret; + + __atomic_add_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + + // prevent accesses from increasing the accesses counter + page_flag_set(page, PGC_PAGE_HAS_NO_DATA_IGNORE_ACCESSES); + + // zero the accesses counter + __atomic_store_n(&page->accesses, 0, __ATOMIC_RELEASE); + + // if there are no other references to it, evict it immediately + if(make_acquired_page_clean_and_evict_or_page_release(cache, page)) { + __atomic_add_fetch(&cache->stats.hot_empty_pages_evicted_immediately, 1, __ATOMIC_RELAXED); + ret = true; + } + else { + __atomic_add_fetch(&cache->stats.hot_empty_pages_evicted_later, 1, __ATOMIC_RELAXED); + ret = false; + } + + __atomic_sub_fetch(&cache->stats.workers_hot2dirty, 1, __ATOMIC_RELAXED); + + return ret; +} + +Word_t pgc_page_section(PGC_PAGE *page) { + return page->section; +} + +Word_t pgc_page_metric(PGC_PAGE *page) { + return page->metric_id; +} + +time_t pgc_page_start_time_s(PGC_PAGE *page) { + return page->start_time_s; +} + +time_t pgc_page_end_time_s(PGC_PAGE *page) { + return page->end_time_s; +} + +time_t pgc_page_update_every_s(PGC_PAGE *page) { + return page->update_every_s; +} + +time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s) { + if(page->update_every_s == 0) + page->update_every_s = update_every_s; + + return page->update_every_s; +} + +time_t pgc_page_fix_end_time_s(PGC_PAGE *page, time_t end_time_s) { + page->end_time_s = end_time_s; + return page->end_time_s; +} + +void *pgc_page_data(PGC_PAGE *page) { + return page->data; +} + +void *pgc_page_custom_data(PGC *cache, PGC_PAGE *page) { + if(cache->config.additional_bytes_per_page) + return page->custom_data; + + return NULL; +} + +size_t pgc_page_data_size(PGC *cache, PGC_PAGE *page) { + return page_size_from_assumed_size(cache, page->assumed_size); +} + +bool pgc_is_page_hot(PGC_PAGE *page) { + return is_page_hot(page); +} + +bool pgc_is_page_dirty(PGC_PAGE *page) { + return is_page_dirty(page); +} + +bool pgc_is_page_clean(PGC_PAGE *page) { + return is_page_clean(page); +} + +void pgc_reset_hot_max(PGC *cache) { + size_t entries = __atomic_load_n(&cache->hot.stats->entries, __ATOMIC_RELAXED); + size_t size = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED); + + __atomic_store_n(&cache->hot.stats->max_entries, entries, __ATOMIC_RELAXED); + __atomic_store_n(&cache->hot.stats->max_size, size, __ATOMIC_RELAXED); + + size_t size_to_evict = 0; + cache_usage_per1000(cache, &size_to_evict); + evict_pages(cache, 0, 0, true, false); +} + +void pgc_set_dynamic_target_cache_size_callback(PGC *cache, dynamic_target_cache_size_callback callback) { + cache->config.dynamic_target_size_cb = callback; + + size_t size_to_evict = 0; + cache_usage_per1000(cache, &size_to_evict); + evict_pages(cache, 0, 0, true, false); +} + +size_t pgc_get_current_cache_size(PGC *cache) { + cache_usage_per1000(cache, NULL); + return __atomic_load_n(&cache->stats.current_cache_size, __ATOMIC_RELAXED); +} + +size_t pgc_get_wanted_cache_size(PGC *cache) { + cache_usage_per1000(cache, NULL); + return __atomic_load_n(&cache->stats.wanted_cache_size, __ATOMIC_RELAXED); +} + +bool pgc_evict_pages(PGC *cache, size_t max_skip, size_t max_evict) { + bool under_pressure = cache_needs_space_aggressively(cache); + return evict_pages(cache, + under_pressure ? 0 : max_skip, + under_pressure ? 0 : max_evict, + true, false); +} + +bool pgc_flush_pages(PGC *cache, size_t max_flushes) { + bool under_pressure = flushing_critical(cache); + return flush_pages(cache, under_pressure ? 0 : max_flushes, PGC_SECTION_ALL, true, false); +} + +void pgc_page_hot_set_end_time_s(PGC *cache __maybe_unused, PGC_PAGE *page, time_t end_time_s) { + internal_fatal(!is_page_hot(page), + "DBENGINE CACHE: end_time_s update on non-hot page"); + + internal_fatal(end_time_s < __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), + "DBENGINE CACHE: end_time_s is not bigger than existing"); + + __atomic_store_n(&page->end_time_s, end_time_s, __ATOMIC_RELAXED); + +#ifdef PGC_COUNT_POINTS_COLLECTED + __atomic_add_fetch(&cache->stats.points_collected, 1, __ATOMIC_RELAXED); +#endif +} + +PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method) { + return page_find_and_acquire(cache, section, metric_id, start_time_s, method); +} + +struct pgc_statistics pgc_get_statistics(PGC *cache) { + // FIXME - get the statistics atomically + return cache->stats; +} + +size_t pgc_hot_and_dirty_entries(PGC *cache) { + size_t entries = 0; + + entries += __atomic_load_n(&cache->hot.stats->entries, __ATOMIC_RELAXED); + entries += __atomic_load_n(&cache->dirty.stats->entries, __ATOMIC_RELAXED); + entries += __atomic_load_n(&cache->stats.flushing_entries, __ATOMIC_RELAXED); + entries += __atomic_load_n(&cache->stats.hot2dirty_entries, __ATOMIC_RELAXED); + + return entries; +} + +void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_fileno, uint8_t type, migrate_to_v2_callback cb, void *data) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_indexing_started, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED); + + pgc_ll_lock(cache, &cache->hot); + + Pvoid_t JudyL_metrics = NULL; + Pvoid_t JudyL_extents_pos = NULL; + + size_t count_of_unique_extents = 0; + size_t count_of_unique_metrics = 0; + size_t count_of_unique_pages = 0; + + size_t master_extent_index_id = 0; + + Pvoid_t *section_pages_pptr = JudyLGet(cache->hot.sections_judy, section, PJE0); + if(!section_pages_pptr) { + pgc_ll_unlock(cache, &cache->hot); + return; + } + + struct section_pages *sp = *section_pages_pptr; + if(!netdata_spinlock_trylock(&sp->migration_to_v2_spinlock)) { + internal_fatal(true, "DBENGINE: migration to journal v2 is already running for this section"); + pgc_ll_unlock(cache, &cache->hot); + return; + } + + ARAL *ar_mi = aral_by_size_acquire(sizeof(struct jv2_metrics_info)); + ARAL *ar_pi = aral_by_size_acquire(sizeof(struct jv2_page_info)); + ARAL *ar_ei = aral_by_size_acquire(sizeof(struct jv2_extents_info)); + + for(PGC_PAGE *page = sp->base; page ; page = page->link.next) { + struct extent_io_data *xio = (struct extent_io_data *)page->custom_data; + if(xio->fileno != datafile_fileno) continue; + + if(page_flag_check(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2)) { + internal_fatal(true, "Migration to journal v2: page has already been migrated to v2"); + continue; + } + + if(!page_transition_trylock(cache, page)) { + internal_fatal(true, "Migration to journal v2: cannot get page transition lock"); + continue; + } + + if(!page_acquire(cache, page)) { + internal_fatal(true, "Migration to journal v2: cannot acquire page for migration to v2"); + continue; + } + + page_flag_set(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2); + + pgc_ll_unlock(cache, &cache->hot); + + // update the extents JudyL + + size_t current_extent_index_id; + Pvoid_t *PValue = JudyLIns(&JudyL_extents_pos, xio->pos, PJE0); + if(!PValue || *PValue == PJERR) + fatal("Corrupted JudyL extents pos"); + + struct jv2_extents_info *ei; + if(!*PValue) { + ei = aral_mallocz(ar_ei); // callocz(1, sizeof(struct jv2_extents_info)); + ei->pos = xio->pos; + ei->bytes = xio->bytes; + ei->number_of_pages = 1; + ei->index = master_extent_index_id++; + *PValue = ei; + + count_of_unique_extents++; + } + else { + ei = *PValue; + ei->number_of_pages++; + } + + current_extent_index_id = ei->index; + + // update the metrics JudyL + + PValue = JudyLIns(&JudyL_metrics, page->metric_id, PJE0); + if(!PValue || *PValue == PJERR) + fatal("Corrupted JudyL metrics"); + + struct jv2_metrics_info *mi; + if(!*PValue) { + mi = aral_mallocz(ar_mi); // callocz(1, sizeof(struct jv2_metrics_info)); + mi->uuid = mrg_metric_uuid(main_mrg, (METRIC *)page->metric_id); + mi->first_time_s = page->start_time_s; + mi->last_time_s = page->end_time_s; + mi->number_of_pages = 1; + mi->page_list_header = 0; + mi->JudyL_pages_by_start_time = NULL; + *PValue = mi; + + count_of_unique_metrics++; + } + else { + mi = *PValue; + mi->number_of_pages++; + if(page->start_time_s < mi->first_time_s) + mi->first_time_s = page->start_time_s; + if(page->end_time_s > mi->last_time_s) + mi->last_time_s = page->end_time_s; + } + + PValue = JudyLIns(&mi->JudyL_pages_by_start_time, page->start_time_s, PJE0); + if(!PValue || *PValue == PJERR) + fatal("Corrupted JudyL metric pages"); + + if(!*PValue) { + struct jv2_page_info *pi = aral_mallocz(ar_pi); // callocz(1, (sizeof(struct jv2_page_info))); + pi->start_time_s = page->start_time_s; + pi->end_time_s = page->end_time_s; + pi->update_every_s = page->update_every_s; + pi->page_length = page_size_from_assumed_size(cache, page->assumed_size); + pi->page = page; + pi->extent_index = current_extent_index_id; + pi->custom_data = (cache->config.additional_bytes_per_page) ? page->custom_data : NULL; + *PValue = pi; + + count_of_unique_pages++; + } + else { + // impossible situation + internal_fatal(true, "Page is already in JudyL metric pages"); + page_flag_clear(page, PGC_PAGE_IS_BEING_MIGRATED_TO_V2); + page_transition_unlock(cache, page); + page_release(cache, page, false); + } + + pgc_ll_lock(cache, &cache->hot); + } + + netdata_spinlock_unlock(&sp->migration_to_v2_spinlock); + pgc_ll_unlock(cache, &cache->hot); + + // callback + cb(section, datafile_fileno, type, JudyL_metrics, JudyL_extents_pos, count_of_unique_extents, count_of_unique_metrics, count_of_unique_pages, data); + + { + Pvoid_t *PValue1; + bool metric_id_first = true; + Word_t metric_id = 0; + while ((PValue1 = JudyLFirstThenNext(JudyL_metrics, &metric_id, &metric_id_first))) { + struct jv2_metrics_info *mi = *PValue1; + + Pvoid_t *PValue2; + bool start_time_first = true; + Word_t start_time = 0; + while ((PValue2 = JudyLFirstThenNext(mi->JudyL_pages_by_start_time, &start_time, &start_time_first))) { + struct jv2_page_info *pi = *PValue2; + page_transition_unlock(cache, pi->page); + pgc_page_hot_to_dirty_and_release(cache, pi->page); + // make_acquired_page_clean_and_evict_or_page_release(cache, pi->page); + aral_freez(ar_pi, pi); + } + + JudyLFreeArray(&mi->JudyL_pages_by_start_time, PJE0); + aral_freez(ar_mi, mi); + } + JudyLFreeArray(&JudyL_metrics, PJE0); + } + + { + Pvoid_t *PValue; + bool extent_pos_first = true; + Word_t extent_pos = 0; + while ((PValue = JudyLFirstThenNext(JudyL_extents_pos, &extent_pos, &extent_pos_first))) { + struct jv2_extents_info *ei = *PValue; + aral_freez(ar_ei, ei); + } + JudyLFreeArray(&JudyL_extents_pos, PJE0); + } + + aral_by_size_release(ar_ei); + aral_by_size_release(ar_pi); + aral_by_size_release(ar_mi); + + __atomic_sub_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED); +} + +static bool match_page_data(PGC_PAGE *page, void *data) { + return (page->data == data); +} + +void pgc_open_evict_clean_pages_of_datafile(PGC *cache, struct rrdengine_datafile *datafile) { + evict_pages_with_filter(cache, 0, 0, true, true, match_page_data, datafile); +} + +size_t pgc_count_clean_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr) { + size_t found = 0; + + pgc_ll_lock(cache, &cache->clean); + for(PGC_PAGE *page = cache->clean.base; page ;page = page->link.next) + found += (page->data == ptr && page->section == section) ? 1 : 0; + pgc_ll_unlock(cache, &cache->clean); + + return found; +} + +size_t pgc_count_hot_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr) { + size_t found = 0; + + pgc_ll_lock(cache, &cache->hot); + Pvoid_t *section_pages_pptr = JudyLGet(cache->hot.sections_judy, section, PJE0); + if(section_pages_pptr) { + struct section_pages *sp = *section_pages_pptr; + for(PGC_PAGE *page = sp->base; page ;page = page->link.next) + found += (page->data == ptr) ? 1 : 0; + } + pgc_ll_unlock(cache, &cache->hot); + + return found; +} + +// ---------------------------------------------------------------------------- +// unittest + +static void unittest_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { + ; +} + +static void unittest_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { + ; +} + +#ifdef PGC_STRESS_TEST + +struct { + bool stop; + PGC *cache; + PGC_PAGE **metrics; + size_t clean_metrics; + size_t hot_metrics; + time_t first_time_t; + time_t last_time_t; + size_t cache_size; + size_t query_threads; + size_t collect_threads; + size_t partitions; + size_t points_per_page; + time_t time_per_collection_ut; + time_t time_per_query_ut; + time_t time_per_flush_ut; + PGC_OPTIONS options; + char rand_statebufs[1024]; + struct random_data *random_data; +} pgc_uts = { + .stop = false, + .metrics = NULL, + .clean_metrics = 100000, + .hot_metrics = 1000000, + .first_time_t = 100000000, + .last_time_t = 0, + .cache_size = 0, // get the default (8MB) + .collect_threads = 16, + .query_threads = 16, + .partitions = 0, // get the default (system cpus) + .options = PGC_OPTIONS_AUTOSCALE,/* PGC_OPTIONS_FLUSH_PAGES_INLINE | PGC_OPTIONS_EVICT_PAGES_INLINE,*/ + .points_per_page = 10, + .time_per_collection_ut = 1000000, + .time_per_query_ut = 250, + .time_per_flush_ut = 100, + .rand_statebufs = {}, + .random_data = NULL, +}; + +void *unittest_stress_test_collector(void *ptr) { + size_t id = *((size_t *)ptr); + + size_t metric_start = pgc_uts.clean_metrics; + size_t metric_end = pgc_uts.clean_metrics + pgc_uts.hot_metrics; + size_t number_of_metrics = metric_end - metric_start; + size_t per_collector_metrics = number_of_metrics / pgc_uts.collect_threads; + metric_start = metric_start + per_collector_metrics * id + 1; + metric_end = metric_start + per_collector_metrics - 1; + + time_t start_time_t = pgc_uts.first_time_t + 1; + + heartbeat_t hb; + heartbeat_init(&hb); + + while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + // info("COLLECTOR %zu: collecting metrics %zu to %zu, from %ld to %lu", id, metric_start, metric_end, start_time_t, start_time_t + pgc_uts.points_per_page); + + netdata_thread_disable_cancelability(); + + for (size_t i = metric_start; i < metric_end; i++) { + bool added; + + pgc_uts.metrics[i] = pgc_page_add_and_acquire(pgc_uts.cache, (PGC_ENTRY) { + .section = 1, + .metric_id = i, + .start_time_t = start_time_t, + .end_time_t = start_time_t, + .update_every = 1, + .size = 4096, + .data = NULL, + .hot = true, + }, &added); + + if(!pgc_is_page_hot(pgc_uts.metrics[i]) || !added) { + pgc_page_release(pgc_uts.cache, pgc_uts.metrics[i]); + pgc_uts.metrics[i] = NULL; + } + } + + time_t end_time_t = start_time_t + (time_t)pgc_uts.points_per_page; + while(++start_time_t <= end_time_t && !__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + heartbeat_next(&hb, pgc_uts.time_per_collection_ut); + + for (size_t i = metric_start; i < metric_end; i++) { + if(pgc_uts.metrics[i]) + pgc_page_hot_set_end_time_t(pgc_uts.cache, pgc_uts.metrics[i], start_time_t); + } + + __atomic_store_n(&pgc_uts.last_time_t, start_time_t, __ATOMIC_RELAXED); + } + + for (size_t i = metric_start; i < metric_end; i++) { + if (pgc_uts.metrics[i]) { + if(i % 10 == 0) + pgc_page_to_clean_evict_or_release(pgc_uts.cache, pgc_uts.metrics[i]); + else + pgc_page_hot_to_dirty_and_release(pgc_uts.cache, pgc_uts.metrics[i]); + } + } + + netdata_thread_enable_cancelability(); + } + + return ptr; +} + +void *unittest_stress_test_queries(void *ptr) { + size_t id = *((size_t *)ptr); + struct random_data *random_data = &pgc_uts.random_data[id]; + + size_t start = 0; + size_t end = pgc_uts.clean_metrics + pgc_uts.hot_metrics; + + while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + netdata_thread_disable_cancelability(); + + int32_t random_number; + random_r(random_data, &random_number); + + size_t metric_id = random_number % (end - start); + time_t start_time_t = pgc_uts.first_time_t; + time_t end_time_t = __atomic_load_n(&pgc_uts.last_time_t, __ATOMIC_RELAXED); + if(end_time_t <= start_time_t) + end_time_t = start_time_t + 1; + size_t pages = (end_time_t - start_time_t) / pgc_uts.points_per_page + 1; + + PGC_PAGE *array[pages]; + for(size_t i = 0; i < pages ;i++) + array[i] = NULL; + + // find the pages the cache has + for(size_t i = 0; i < pages ;i++) { + time_t page_start_time = start_time_t + (time_t)(i * pgc_uts.points_per_page); + array[i] = pgc_page_get_and_acquire(pgc_uts.cache, 1, metric_id, + page_start_time, (i < pages - 1)?PGC_SEARCH_EXACT:PGC_SEARCH_CLOSEST); + } + + // load the rest of the pages + for(size_t i = 0; i < pages ;i++) { + if(array[i]) continue; + + time_t page_start_time = start_time_t + (time_t)(i * pgc_uts.points_per_page); + array[i] = pgc_page_add_and_acquire(pgc_uts.cache, (PGC_ENTRY) { + .section = 1, + .metric_id = metric_id, + .start_time_t = page_start_time, + .end_time_t = page_start_time + (time_t)pgc_uts.points_per_page, + .update_every = 1, + .size = 4096, + .data = NULL, + .hot = false, + }, NULL); + } + + // do the query + // ... + struct timespec work_duration = {.tv_sec = 0, .tv_nsec = pgc_uts.time_per_query_ut * NSEC_PER_USEC }; + nanosleep(&work_duration, NULL); + + // release the pages + for(size_t i = 0; i < pages ;i++) { + if(!array[i]) continue; + pgc_page_release(pgc_uts.cache, array[i]); + array[i] = NULL; + } + + netdata_thread_enable_cancelability(); + } + + return ptr; +} + +void *unittest_stress_test_service(void *ptr) { + heartbeat_t hb; + heartbeat_init(&hb); + while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { + heartbeat_next(&hb, 1 * USEC_PER_SEC); + + pgc_flush_pages(pgc_uts.cache, 1000); + pgc_evict_pages(pgc_uts.cache, 0, 0); + } + return ptr; +} + +static void unittest_stress_test_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { + // info("SAVE %zu pages", entries); + if(!pgc_uts.stop) { + usec_t t = pgc_uts.time_per_flush_ut; + + if(t > 0) { + struct timespec work_duration = { + .tv_sec = t / USEC_PER_SEC, + .tv_nsec = (long) ((t % USEC_PER_SEC) * NSEC_PER_USEC) + }; + + nanosleep(&work_duration, NULL); + } + } +} + +void unittest_stress_test(void) { + pgc_uts.cache = pgc_create(pgc_uts.cache_size * 1024 * 1024, + unittest_free_clean_page_callback, + 64, unittest_stress_test_save_dirty_page_callback, + 1000, 10000, 1, + pgc_uts.options, pgc_uts.partitions, 0); + + pgc_uts.metrics = callocz(pgc_uts.clean_metrics + pgc_uts.hot_metrics, sizeof(PGC_PAGE *)); + + pthread_t service_thread; + netdata_thread_create(&service_thread, "SERVICE", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + unittest_stress_test_service, NULL); + + pthread_t collect_threads[pgc_uts.collect_threads]; + size_t collect_thread_ids[pgc_uts.collect_threads]; + for(size_t i = 0; i < pgc_uts.collect_threads ;i++) { + collect_thread_ids[i] = i; + char buffer[100 + 1]; + snprintfz(buffer, 100, "COLLECT_%zu", i); + netdata_thread_create(&collect_threads[i], buffer, + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + unittest_stress_test_collector, &collect_thread_ids[i]); + } + + pthread_t queries_threads[pgc_uts.query_threads]; + size_t query_thread_ids[pgc_uts.query_threads]; + pgc_uts.random_data = callocz(pgc_uts.query_threads, sizeof(struct random_data)); + for(size_t i = 0; i < pgc_uts.query_threads ;i++) { + query_thread_ids[i] = i; + char buffer[100 + 1]; + snprintfz(buffer, 100, "QUERY_%zu", i); + initstate_r(1, pgc_uts.rand_statebufs, 1024, &pgc_uts.random_data[i]); + netdata_thread_create(&queries_threads[i], buffer, + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + unittest_stress_test_queries, &query_thread_ids[i]); + } + + heartbeat_t hb; + heartbeat_init(&hb); + + struct { + size_t entries; + size_t added; + size_t deleted; + size_t referenced; + + size_t hot_entries; + size_t hot_added; + size_t hot_deleted; + + size_t dirty_entries; + size_t dirty_added; + size_t dirty_deleted; + + size_t clean_entries; + size_t clean_added; + size_t clean_deleted; + + size_t searches_exact; + size_t searches_exact_hits; + size_t searches_closest; + size_t searches_closest_hits; + + size_t collections; + + size_t events_cache_under_severe_pressure; + size_t events_cache_needs_space_90; + size_t events_flush_critical; + } stats = {}, old_stats = {}; + + for(int i = 0; i < 86400 ;i++) { + heartbeat_next(&hb, 1 * USEC_PER_SEC); + + old_stats = stats; + stats.entries = __atomic_load_n(&pgc_uts.cache->stats.entries, __ATOMIC_RELAXED); + stats.added = __atomic_load_n(&pgc_uts.cache->stats.added_entries, __ATOMIC_RELAXED); + stats.deleted = __atomic_load_n(&pgc_uts.cache->stats.removed_entries, __ATOMIC_RELAXED); + stats.referenced = __atomic_load_n(&pgc_uts.cache->stats.referenced_entries, __ATOMIC_RELAXED); + + stats.hot_entries = __atomic_load_n(&pgc_uts.cache->hot.stats->entries, __ATOMIC_RELAXED); + stats.hot_added = __atomic_load_n(&pgc_uts.cache->hot.stats->added_entries, __ATOMIC_RELAXED); + stats.hot_deleted = __atomic_load_n(&pgc_uts.cache->hot.stats->removed_entries, __ATOMIC_RELAXED); + + stats.dirty_entries = __atomic_load_n(&pgc_uts.cache->dirty.stats->entries, __ATOMIC_RELAXED); + stats.dirty_added = __atomic_load_n(&pgc_uts.cache->dirty.stats->added_entries, __ATOMIC_RELAXED); + stats.dirty_deleted = __atomic_load_n(&pgc_uts.cache->dirty.stats->removed_entries, __ATOMIC_RELAXED); + + stats.clean_entries = __atomic_load_n(&pgc_uts.cache->clean.stats->entries, __ATOMIC_RELAXED); + stats.clean_added = __atomic_load_n(&pgc_uts.cache->clean.stats->added_entries, __ATOMIC_RELAXED); + stats.clean_deleted = __atomic_load_n(&pgc_uts.cache->clean.stats->removed_entries, __ATOMIC_RELAXED); + + stats.searches_exact = __atomic_load_n(&pgc_uts.cache->stats.searches_exact, __ATOMIC_RELAXED); + stats.searches_exact_hits = __atomic_load_n(&pgc_uts.cache->stats.searches_exact_hits, __ATOMIC_RELAXED); + + stats.searches_closest = __atomic_load_n(&pgc_uts.cache->stats.searches_closest, __ATOMIC_RELAXED); + stats.searches_closest_hits = __atomic_load_n(&pgc_uts.cache->stats.searches_closest_hits, __ATOMIC_RELAXED); + + stats.events_cache_under_severe_pressure = __atomic_load_n(&pgc_uts.cache->stats.events_cache_under_severe_pressure, __ATOMIC_RELAXED); + stats.events_cache_needs_space_90 = __atomic_load_n(&pgc_uts.cache->stats.events_cache_needs_space_aggressively, __ATOMIC_RELAXED); + stats.events_flush_critical = __atomic_load_n(&pgc_uts.cache->stats.events_flush_critical, __ATOMIC_RELAXED); + + size_t searches_exact = stats.searches_exact - old_stats.searches_exact; + size_t searches_closest = stats.searches_closest - old_stats.searches_closest; + + size_t hit_exact = stats.searches_exact_hits - old_stats.searches_exact_hits; + size_t hit_closest = stats.searches_closest_hits - old_stats.searches_closest_hits; + + double hit_exact_pc = (searches_exact > 0) ? (double)hit_exact * 100.0 / (double)searches_exact : 0.0; + double hit_closest_pc = (searches_closest > 0) ? (double)hit_closest * 100.0 / (double)searches_closest : 0.0; + +#ifdef PGC_COUNT_POINTS_COLLECTED + stats.collections = __atomic_load_n(&pgc_uts.cache->stats.points_collected, __ATOMIC_RELAXED); +#endif + + char *cache_status = "N"; + if(stats.events_cache_under_severe_pressure > old_stats.events_cache_under_severe_pressure) + cache_status = "F"; + else if(stats.events_cache_needs_space_90 > old_stats.events_cache_needs_space_90) + cache_status = "f"; + + char *flushing_status = "N"; + if(stats.events_flush_critical > old_stats.events_flush_critical) + flushing_status = "F"; + + info("PGS %5zuk +%4zuk/-%4zuk " + "| RF %5zuk " + "| HOT %5zuk +%4zuk -%4zuk " + "| DRT %s %5zuk +%4zuk -%4zuk " + "| CLN %s %5zuk +%4zuk -%4zuk " + "| SRCH %4zuk %4zuk, HIT %4.1f%% %4.1f%% " +#ifdef PGC_COUNT_POINTS_COLLECTED + "| CLCT %8.4f Mps" +#endif + , stats.entries / 1000 + , (stats.added - old_stats.added) / 1000, (stats.deleted - old_stats.deleted) / 1000 + , stats.referenced / 1000 + , stats.hot_entries / 1000, (stats.hot_added - old_stats.hot_added) / 1000, (stats.hot_deleted - old_stats.hot_deleted) / 1000 + , flushing_status + , stats.dirty_entries / 1000 + , (stats.dirty_added - old_stats.dirty_added) / 1000, (stats.dirty_deleted - old_stats.dirty_deleted) / 1000 + , cache_status + , stats.clean_entries / 1000 + , (stats.clean_added - old_stats.clean_added) / 1000, (stats.clean_deleted - old_stats.clean_deleted) / 1000 + , searches_exact / 1000, searches_closest / 1000 + , hit_exact_pc, hit_closest_pc +#ifdef PGC_COUNT_POINTS_COLLECTED + , (double)(stats.collections - old_stats.collections) / 1000.0 / 1000.0 +#endif + ); + } + info("Waiting for threads to stop..."); + __atomic_store_n(&pgc_uts.stop, true, __ATOMIC_RELAXED); + + netdata_thread_join(service_thread, NULL); + + for(size_t i = 0; i < pgc_uts.collect_threads ;i++) + netdata_thread_join(collect_threads[i],NULL); + + for(size_t i = 0; i < pgc_uts.query_threads ;i++) + netdata_thread_join(queries_threads[i],NULL); + + pgc_destroy(pgc_uts.cache); + + freez(pgc_uts.metrics); + freez(pgc_uts.random_data); +} +#endif + +int pgc_unittest(void) { + PGC *cache = pgc_create("test", + 32 * 1024 * 1024, unittest_free_clean_page_callback, + 64, NULL, unittest_save_dirty_page_callback, + 10, 10, 1000, 10, + PGC_OPTIONS_DEFAULT, 1, 11); + + // FIXME - unit tests + // - add clean page + // - add clean page again (should not add it) + // - release page (should decrement counters) + // - add hot page + // - add hot page again (should not add it) + // - turn hot page to dirty, with and without a reference counter to it + // - dirty pages are saved once there are enough of them + // - find page exact + // - find page (should return last) + // - find page (should return next) + // - page cache full (should evict) + // - on destroy, turn hot pages to dirty and save them + + PGC_PAGE *page1 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){ + .section = 1, + .metric_id = 10, + .start_time_s = 100, + .end_time_s = 1000, + .size = 4096, + .data = NULL, + .hot = false, + .custom_data = (uint8_t *)"0123456789", + }, NULL); + + if(strcmp(pgc_page_custom_data(cache, page1), "0123456789") != 0) + fatal("custom data do not work"); + + memcpy(pgc_page_custom_data(cache, page1), "ABCDEFGHIJ", 11); + if(strcmp(pgc_page_custom_data(cache, page1), "ABCDEFGHIJ") != 0) + fatal("custom data do not work"); + + pgc_page_release(cache, page1); + + PGC_PAGE *page2 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){ + .section = 2, + .metric_id = 10, + .start_time_s = 1001, + .end_time_s = 2000, + .size = 4096, + .data = NULL, + .hot = true, + }, NULL); + + pgc_page_hot_set_end_time_s(cache, page2, 2001); + pgc_page_hot_to_dirty_and_release(cache, page2); + + PGC_PAGE *page3 = pgc_page_add_and_acquire(cache, (PGC_ENTRY){ + .section = 3, + .metric_id = 10, + .start_time_s = 1001, + .end_time_s = 2000, + .size = 4096, + .data = NULL, + .hot = true, + }, NULL); + + pgc_page_hot_set_end_time_s(cache, page3, 2001); + pgc_page_hot_to_dirty_and_release(cache, page3); + + pgc_destroy(cache); + +#ifdef PGC_STRESS_TEST + unittest_stress_test(); +#endif + + return 0; +} diff --git a/database/engine/cache.h b/database/engine/cache.h new file mode 100644 index 000000000..65e6a6137 --- /dev/null +++ b/database/engine/cache.h @@ -0,0 +1,249 @@ +#ifndef DBENGINE_CACHE_H +#define DBENGINE_CACHE_H + +#include "../rrd.h" + +// CACHE COMPILE TIME CONFIGURATION +// #define PGC_COUNT_POINTS_COLLECTED 1 + +typedef struct pgc PGC; +typedef struct pgc_page PGC_PAGE; +#define PGC_NAME_MAX 23 + +typedef enum __attribute__ ((__packed__)) { + PGC_OPTIONS_NONE = 0, + PGC_OPTIONS_EVICT_PAGES_INLINE = (1 << 0), + PGC_OPTIONS_FLUSH_PAGES_INLINE = (1 << 1), + PGC_OPTIONS_AUTOSCALE = (1 << 2), +} PGC_OPTIONS; + +#define PGC_OPTIONS_DEFAULT (PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE | PGC_OPTIONS_AUTOSCALE) + +typedef struct pgc_entry { + Word_t section; // the section this belongs to + Word_t metric_id; // the metric this belongs to + time_t start_time_s; // the start time of the page + time_t end_time_s; // the end time of the page + size_t size; // the size in bytes of the allocation, outside the cache + void *data; // a pointer to data outside the cache + uint32_t update_every_s; // the update every of the page + bool hot; // true if this entry is currently being collected + uint8_t *custom_data; +} PGC_ENTRY; + +#define PGC_CACHE_LINE_PADDING(x) uint8_t padding##x[128] + +struct pgc_queue_statistics { + size_t entries; + size_t size; + + PGC_CACHE_LINE_PADDING(1); + + size_t max_entries; + size_t max_size; + + PGC_CACHE_LINE_PADDING(2); + + size_t added_entries; + size_t added_size; + + PGC_CACHE_LINE_PADDING(3); + + size_t removed_entries; + size_t removed_size; + + PGC_CACHE_LINE_PADDING(4); +}; + +struct pgc_statistics { + size_t wanted_cache_size; + size_t current_cache_size; + + PGC_CACHE_LINE_PADDING(1); + + size_t added_entries; + size_t added_size; + + PGC_CACHE_LINE_PADDING(2); + + size_t removed_entries; + size_t removed_size; + + PGC_CACHE_LINE_PADDING(3); + + size_t entries; // all the entries (includes clean, dirty, host) + size_t size; // all the entries (includes clean, dirty, host) + + size_t evicting_entries; + size_t evicting_size; + + size_t flushing_entries; + size_t flushing_size; + + size_t hot2dirty_entries; + size_t hot2dirty_size; + + PGC_CACHE_LINE_PADDING(4); + + size_t acquires; + PGC_CACHE_LINE_PADDING(4a); + size_t releases; + PGC_CACHE_LINE_PADDING(4b); + size_t acquires_for_deletion; + PGC_CACHE_LINE_PADDING(4c); + + size_t referenced_entries; // all the entries currently referenced + size_t referenced_size; // all the entries currently referenced + + PGC_CACHE_LINE_PADDING(5); + + size_t searches_exact; + size_t searches_exact_hits; + size_t searches_exact_misses; + + PGC_CACHE_LINE_PADDING(6); + + size_t searches_closest; + size_t searches_closest_hits; + size_t searches_closest_misses; + + PGC_CACHE_LINE_PADDING(7); + + size_t flushes_completed; + size_t flushes_completed_size; + size_t flushes_cancelled; + size_t flushes_cancelled_size; + +#ifdef PGC_COUNT_POINTS_COLLECTED + PGC_CACHE_LINE_PADDING(8); + size_t points_collected; +#endif + + PGC_CACHE_LINE_PADDING(9); + + size_t insert_spins; + size_t evict_spins; + size_t release_spins; + size_t acquire_spins; + size_t delete_spins; + size_t flush_spins; + + PGC_CACHE_LINE_PADDING(10); + + size_t workers_search; + size_t workers_add; + size_t workers_evict; + size_t workers_flush; + size_t workers_jv2_flush; + size_t workers_hot2dirty; + + size_t evict_skipped; + size_t hot_empty_pages_evicted_immediately; + size_t hot_empty_pages_evicted_later; + + PGC_CACHE_LINE_PADDING(11); + + // events + size_t events_cache_under_severe_pressure; + size_t events_cache_needs_space_aggressively; + size_t events_flush_critical; + + PGC_CACHE_LINE_PADDING(12); + + struct { + PGC_CACHE_LINE_PADDING(0); + struct pgc_queue_statistics hot; + PGC_CACHE_LINE_PADDING(1); + struct pgc_queue_statistics dirty; + PGC_CACHE_LINE_PADDING(2); + struct pgc_queue_statistics clean; + PGC_CACHE_LINE_PADDING(3); + } queues; +}; + + +typedef void (*free_clean_page_callback)(PGC *cache, PGC_ENTRY entry); +typedef void (*save_dirty_page_callback)(PGC *cache, PGC_ENTRY *entries_array, PGC_PAGE **pages_array, size_t entries); +typedef void (*save_dirty_init_callback)(PGC *cache, Word_t section); +// create a cache +PGC *pgc_create(const char *name, + size_t clean_size_bytes, free_clean_page_callback pgc_free_clean_cb, + size_t max_dirty_pages_per_flush, save_dirty_init_callback pgc_save_init_cb, save_dirty_page_callback pgc_save_dirty_cb, + size_t max_pages_per_inline_eviction, size_t max_inline_evictors, + size_t max_skip_pages_per_inline_eviction, + size_t max_flushes_inline, + PGC_OPTIONS options, size_t partitions, size_t additional_bytes_per_page); + +// destroy the cache +void pgc_destroy(PGC *cache); + +#define PGC_SECTION_ALL ((Word_t)0) +void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section); + +// add a page to the cache and return a pointer to it +PGC_PAGE *pgc_page_add_and_acquire(PGC *cache, PGC_ENTRY entry, bool *added); + +// get another reference counter on an already referenced page +PGC_PAGE *pgc_page_dup(PGC *cache, PGC_PAGE *page); + +// release a page (all pointers to it are now invalid) +void pgc_page_release(PGC *cache, PGC_PAGE *page); + +// mark a hot page dirty, and release it +void pgc_page_hot_to_dirty_and_release(PGC *cache, PGC_PAGE *page); + +// find a page from the cache +typedef enum { + PGC_SEARCH_EXACT, + PGC_SEARCH_CLOSEST, + PGC_SEARCH_FIRST, + PGC_SEARCH_NEXT, + PGC_SEARCH_LAST, + PGC_SEARCH_PREV, +} PGC_SEARCH; + +PGC_PAGE *pgc_page_get_and_acquire(PGC *cache, Word_t section, Word_t metric_id, time_t start_time_s, PGC_SEARCH method); + +// get information from an acquired page +Word_t pgc_page_section(PGC_PAGE *page); +Word_t pgc_page_metric(PGC_PAGE *page); +time_t pgc_page_start_time_s(PGC_PAGE *page); +time_t pgc_page_end_time_s(PGC_PAGE *page); +time_t pgc_page_update_every_s(PGC_PAGE *page); +time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s); +time_t pgc_page_fix_end_time_s(PGC_PAGE *page, time_t end_time_s); +void *pgc_page_data(PGC_PAGE *page); +void *pgc_page_custom_data(PGC *cache, PGC_PAGE *page); +size_t pgc_page_data_size(PGC *cache, PGC_PAGE *page); +bool pgc_is_page_hot(PGC_PAGE *page); +bool pgc_is_page_dirty(PGC_PAGE *page); +bool pgc_is_page_clean(PGC_PAGE *page); +void pgc_reset_hot_max(PGC *cache); +size_t pgc_get_current_cache_size(PGC *cache); +size_t pgc_get_wanted_cache_size(PGC *cache); + +// resetting the end time of a hot page +void pgc_page_hot_set_end_time_s(PGC *cache, PGC_PAGE *page, time_t end_time_s); +bool pgc_page_to_clean_evict_or_release(PGC *cache, PGC_PAGE *page); + +typedef void (*migrate_to_v2_callback)(Word_t section, unsigned datafile_fileno, uint8_t type, Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, size_t count_of_unique_extents, size_t count_of_unique_metrics, size_t count_of_unique_pages, void *data); +void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_fileno, uint8_t type, migrate_to_v2_callback cb, void *data); +void pgc_open_evict_clean_pages_of_datafile(PGC *cache, struct rrdengine_datafile *datafile); +size_t pgc_count_clean_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr); +size_t pgc_count_hot_pages_having_data_ptr(PGC *cache, Word_t section, void *ptr); + +typedef size_t (*dynamic_target_cache_size_callback)(void); +void pgc_set_dynamic_target_cache_size_callback(PGC *cache, dynamic_target_cache_size_callback callback); + +// return true when there is more work to do +bool pgc_evict_pages(PGC *cache, size_t max_skip, size_t max_evict); +bool pgc_flush_pages(PGC *cache, size_t max_flushes); + +struct pgc_statistics pgc_get_statistics(PGC *cache); +size_t pgc_hot_and_dirty_entries(PGC *cache); + +struct aral_statistics *pgc_aral_statistics(void); +size_t pgc_aral_structures(void); +size_t pgc_aral_overhead(void); + +#endif // DBENGINE_CACHE_H diff --git a/database/engine/datafile.c b/database/engine/datafile.c index 9c70068d9..286ae1e30 100644 --- a/database/engine/datafile.c +++ b/database/engine/datafile.c @@ -1,58 +1,174 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -void df_extent_insert(struct extent_info *extent) -{ - struct rrdengine_datafile *datafile = extent->datafile; - - if (likely(NULL != datafile->extents.last)) { - datafile->extents.last->next = extent; - } - if (unlikely(NULL == datafile->extents.first)) { - datafile->extents.first = extent; - } - datafile->extents.last = extent; -} - void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { - if (likely(NULL != ctx->datafiles.last)) { - ctx->datafiles.last->next = datafile; - } - if (unlikely(NULL == ctx->datafiles.first)) { - ctx->datafiles.first = datafile; - } - ctx->datafiles.last = datafile; + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); } -void datafile_list_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) +void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { - struct rrdengine_datafile *next; - - next = datafile->next; - fatal_assert((NULL != next) && (ctx->datafiles.first == datafile) && (ctx->datafiles.last != datafile)); - ctx->datafiles.first = next; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next); } -static void datafile_init(struct rrdengine_datafile *datafile, struct rrdengine_instance *ctx, - unsigned tier, unsigned fileno) +static struct rrdengine_datafile *datafile_alloc_and_init(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno) { fatal_assert(tier == 1); + + struct rrdengine_datafile *datafile = callocz(1, sizeof(struct rrdengine_datafile)); + datafile->tier = tier; datafile->fileno = fileno; - datafile->file = (uv_file)0; - datafile->pos = 0; - datafile->extents.first = datafile->extents.last = NULL; /* will be populated by journalfile */ - datafile->journalfile = NULL; - datafile->next = NULL; + fatal_assert(0 == uv_rwlock_init(&datafile->extent_rwlock)); datafile->ctx = ctx; + + datafile->users.available = true; + + netdata_spinlock_init(&datafile->users.spinlock); + netdata_spinlock_init(&datafile->writers.spinlock); + netdata_spinlock_init(&datafile->extent_queries.spinlock); + + return datafile; +} + +void datafile_acquire_dup(struct rrdengine_datafile *df) { + netdata_spinlock_lock(&df->users.spinlock); + + if(!df->users.lockers) + fatal("DBENGINE: datafile is not acquired to duplicate"); + + df->users.lockers++; + + netdata_spinlock_unlock(&df->users.spinlock); +} + +bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { + bool ret; + + netdata_spinlock_lock(&df->users.spinlock); + + if(df->users.available) { + ret = true; + df->users.lockers++; + df->users.lockers_by_reason[reason]++; + } + else + ret = false; + + netdata_spinlock_unlock(&df->users.spinlock); + + return ret; +} + +void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { + netdata_spinlock_lock(&df->users.spinlock); + if(!df->users.lockers) + fatal("DBENGINE DATAFILE: cannot release a datafile that is not acquired"); + + df->users.lockers--; + df->users.lockers_by_reason[reason]--; + netdata_spinlock_unlock(&df->users.spinlock); +} + +bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { + bool can_be_deleted = false; + + netdata_spinlock_lock(&df->users.spinlock); + df->users.available = false; + + if(!df->users.lockers) + can_be_deleted = true; + + else { + // there are lockers + + // evict any pages referencing this in the open cache + netdata_spinlock_unlock(&df->users.spinlock); + pgc_open_evict_clean_pages_of_datafile(open_cache, df); + netdata_spinlock_lock(&df->users.spinlock); + + if(!df->users.lockers) + can_be_deleted = true; + + else { + // there are lockers still + + // count the number of pages referencing this in the open cache + netdata_spinlock_unlock(&df->users.spinlock); + usec_t time_to_scan_ut = now_monotonic_usec(); + size_t clean_pages_in_open_cache = pgc_count_clean_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df); + size_t hot_pages_in_open_cache = pgc_count_hot_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df); + time_to_scan_ut = now_monotonic_usec() - time_to_scan_ut; + netdata_spinlock_lock(&df->users.spinlock); + + if(!df->users.lockers) + can_be_deleted = true; + + else if(!clean_pages_in_open_cache && !hot_pages_in_open_cache) { + // no pages in the open cache related to this datafile + + time_t now_s = now_monotonic_sec(); + + if(!df->users.time_to_evict) { + // first time we did the above + df->users.time_to_evict = now_s + 120; + internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, " + "but it has %u lockers (oc:%u, pd:%u), " + "%zu clean and %zu hot open cache pages " + "- will be deleted shortly " + "(scanned open cache in %llu usecs)", + df->fileno, df->ctx->config.tier, + df->users.lockers, + df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE], + df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS], + clean_pages_in_open_cache, + hot_pages_in_open_cache, + time_to_scan_ut); + } + + else if(now_s > df->users.time_to_evict) { + // time expired, lets remove it + can_be_deleted = true; + internal_error(true, "DBENGINE: datafile %u of tier %d is not used by any open cache pages, " + "but it has %u lockers (oc:%u, pd:%u), " + "%zu clean and %zu hot open cache pages " + "- will be deleted now " + "(scanned open cache in %llu usecs)", + df->fileno, df->ctx->config.tier, + df->users.lockers, + df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE], + df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS], + clean_pages_in_open_cache, + hot_pages_in_open_cache, + time_to_scan_ut); + } + } + else + internal_error(true, "DBENGINE: datafile %u of tier %d " + "has %u lockers (oc:%u, pd:%u), " + "%zu clean and %zu hot open cache pages " + "(scanned open cache in %llu usecs)", + df->fileno, df->ctx->config.tier, + df->users.lockers, + df->users.lockers_by_reason[DATAFILE_ACQUIRE_OPEN_CACHE], + df->users.lockers_by_reason[DATAFILE_ACQUIRE_PAGE_DETAILS], + clean_pages_in_open_cache, + hot_pages_in_open_cache, + time_to_scan_ut); + } + } + netdata_spinlock_unlock(&df->users.spinlock); + + return can_be_deleted; } void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { (void) snprintfz(str, maxlen, "%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION, - datafile->ctx->dbfiles_path, datafile->tier, datafile->fileno); + datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); } int close_data_file(struct rrdengine_datafile *datafile) @@ -66,9 +182,8 @@ int close_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_close(NULL, &req, datafile->file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -86,18 +201,17 @@ int unlink_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.datafile_deletions; + __atomic_add_fetch(&ctx->stats.datafile_deletions, 1, __ATOMIC_RELAXED); return ret; } -int destroy_data_file(struct rrdengine_datafile *datafile) +int destroy_data_file_unsafe(struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; @@ -108,29 +222,26 @@ int destroy_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_ftruncate(NULL, &req, datafile->file, 0, NULL); if (ret < 0) { - error("uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_close(NULL, &req, datafile->file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.datafile_deletions; + __atomic_add_fetch(&ctx->stats.datafile_deletions, 1, __ATOMIC_RELAXED); return ret; } @@ -146,18 +257,17 @@ int create_data_file(struct rrdengine_datafile *datafile) char path[RRDENG_PATH_MAX]; generate_datafilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_CREAT | O_RDWR | O_TRUNC, &file); + fd = open_file_for_io(path, O_CREAT | O_RDWR | O_TRUNC, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); return fd; } datafile->file = file; - ++ctx->stats.datafile_creations; + __atomic_add_fetch(&ctx->stats.datafile_creations, 1, __ATOMIC_RELAXED); ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } memset(superblock, 0, sizeof(*superblock)); (void) strncpy(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ); @@ -169,20 +279,18 @@ int create_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); - error("uv_fs_write: %s", uv_strerror(ret)); - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); + error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); + ctx_io_error(ctx); } uv_fs_req_cleanup(&req); posix_memfree(superblock); if (ret < 0) { - destroy_data_file(datafile); + destroy_data_file_unsafe(datafile); return ret; } datafile->pos = sizeof(*superblock); - ctx->stats.io_write_bytes += sizeof(*superblock); - ++ctx->stats.io_write_requests; + ctx_io_write_op_bytes(ctx, sizeof(*superblock)); return 0; } @@ -196,13 +304,13 @@ static int check_data_file_superblock(uv_file file) ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } iov = uv_buf_init((void *)superblock, sizeof(*superblock)); ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - error("uv_fs_read: %s", uv_strerror(ret)); + error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); uv_fs_req_cleanup(&req); goto error; } @@ -212,7 +320,7 @@ static int check_data_file_superblock(uv_file file) if (strncmp(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ) || strncmp(superblock->version, RRDENG_DF_VER, RRDENG_VER_SZ) || superblock->tier != 1) { - error("File has invalid superblock."); + error("DBENGINE: file has invalid superblock."); ret = UV_EINVAL; } else { ret = 0; @@ -232,13 +340,12 @@ static int load_data_file(struct rrdengine_datafile *datafile) char path[RRDENG_PATH_MAX]; generate_datafilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_RDWR, &file); + fd = open_file_for_io(path, O_RDWR, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); return fd; } - info("Initializing data file \"%s\".", path); + info("DBENGINE: initializing data file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); if (ret) @@ -248,22 +355,21 @@ static int load_data_file(struct rrdengine_datafile *datafile) ret = check_data_file_superblock(file); if (ret) goto error; - ctx->stats.io_read_bytes += sizeof(struct rrdeng_df_sb); - ++ctx->stats.io_read_requests; + + ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_df_sb)); datafile->file = file; datafile->pos = file_size; - info("Data file \"%s\" initialized (size:%"PRIu64").", path, file_size); + info("DBENGINE: data file \"%s\" initialized (size:%"PRIu64").", path, file_size); return 0; error: error = ret; ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); return error; @@ -286,30 +392,26 @@ static int scan_data_files(struct rrdengine_instance *ctx) { int ret; unsigned tier, no, matched_files, i,failed_to_load; - static uv_fs_t req; + uv_fs_t req; uv_dirent_t dent; struct rrdengine_datafile **datafiles, *datafile; struct rrdengine_journalfile *journalfile; - ret = uv_fs_scandir(NULL, &req, ctx->dbfiles_path, 0, NULL); + ret = uv_fs_scandir(NULL, &req, ctx->config.dbfiles_path, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); uv_fs_req_cleanup(&req); - error("uv_fs_scandir(%s): %s", ctx->dbfiles_path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_scandir(%s): %s", ctx->config.dbfiles_path, uv_strerror(ret)); + ctx_fs_error(ctx); return ret; } - info("Found %d files in path %s", ret, ctx->dbfiles_path); + info("DBENGINE: found %d files in path %s", ret, ctx->config.dbfiles_path); datafiles = callocz(MIN(ret, MAX_DATAFILES), sizeof(*datafiles)); for (matched_files = 0 ; UV_EOF != uv_fs_scandir_next(&req, &dent) && matched_files < MAX_DATAFILES ; ) { - info("Scanning file \"%s/%s\"", ctx->dbfiles_path, dent.name); ret = sscanf(dent.name, DATAFILE_PREFIX RRDENG_FILE_NUMBER_SCAN_TMPL DATAFILE_EXTENSION, &tier, &no); if (2 == ret) { - info("Matched file \"%s/%s\"", ctx->dbfiles_path, dent.name); - datafile = mallocz(sizeof(*datafile)); - datafile_init(datafile, ctx, tier, no); + datafile = datafile_alloc_and_init(ctx, tier, no); datafiles[matched_files++] = datafile; } } @@ -320,11 +422,11 @@ static int scan_data_files(struct rrdengine_instance *ctx) return 0; } if (matched_files == MAX_DATAFILES) { - error("Warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); + error("DBENGINE: warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); } qsort(datafiles, matched_files, sizeof(*datafiles), scan_data_files_cmp); /* TODO: change this when tiering is implemented */ - ctx->last_fileno = datafiles[matched_files - 1]->fileno; + ctx->atomic.last_fileno = datafiles[matched_files - 1]->fileno; for (failed_to_load = 0, i = 0 ; i < matched_files ; ++i) { uint8_t must_delete_pair = 0; @@ -334,10 +436,8 @@ static int scan_data_files(struct rrdengine_instance *ctx) if (0 != ret) { must_delete_pair = 1; } - journalfile = mallocz(sizeof(*journalfile)); - datafile->journalfile = journalfile; - journalfile_init(journalfile, datafile); - ret = load_journal_file(ctx, journalfile, datafile); + journalfile = journalfile_alloc_and_init(datafile); + ret = journalfile_load(ctx, journalfile, datafile); if (0 != ret) { if (!must_delete_pair) /* If datafile is still open close it */ close_data_file(datafile); @@ -346,16 +446,16 @@ static int scan_data_files(struct rrdengine_instance *ctx) if (must_delete_pair) { char path[RRDENG_PATH_MAX]; - error("Deleting invalid data and journal file pair."); - ret = unlink_journal_file(journalfile); + error("DBENGINE: deleting invalid data and journal file pair."); + ret = journalfile_unlink(journalfile); if (!ret) { - generate_journalfilepath(datafile, path, sizeof(path)); - info("Deleted journal file \"%s\".", path); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: deleted journal file \"%s\".", path); } ret = unlink_data_file(datafile); if (!ret) { generate_datafilepath(datafile, path, sizeof(path)); - info("Deleted data file \"%s\".", path); + info("DBENGINE: deleted data file \"%s\".", path); } freez(journalfile); freez(datafile); @@ -363,8 +463,8 @@ static int scan_data_files(struct rrdengine_instance *ctx) continue; } + ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); datafile_list_insert(ctx, datafile); - ctx->disk_space += datafile->pos + journalfile->pos; } matched_files -= failed_to_load; freez(datafiles); @@ -373,42 +473,43 @@ static int scan_data_files(struct rrdengine_instance *ctx) } /* Creates a datafile and a journalfile pair */ -int create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno) +int create_new_datafile_pair(struct rrdengine_instance *ctx) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_creation_started, 1, __ATOMIC_RELAXED); + struct rrdengine_datafile *datafile; struct rrdengine_journalfile *journalfile; + unsigned fileno = ctx_last_fileno_get(ctx) + 1; int ret; char path[RRDENG_PATH_MAX]; - info("Creating new data and journal files in path %s", ctx->dbfiles_path); - datafile = mallocz(sizeof(*datafile)); - datafile_init(datafile, ctx, tier, fileno); + info("DBENGINE: creating new data and journal files in path %s", ctx->config.dbfiles_path); + datafile = datafile_alloc_and_init(ctx, 1, fileno); ret = create_data_file(datafile); - if (!ret) { - generate_datafilepath(datafile, path, sizeof(path)); - info("Created data file \"%s\".", path); - } else { + if(ret) goto error_after_datafile; - } - journalfile = mallocz(sizeof(*journalfile)); - datafile->journalfile = journalfile; - journalfile_init(journalfile, datafile); - ret = create_journal_file(journalfile, datafile); - if (!ret) { - generate_journalfilepath(datafile, path, sizeof(path)); - info("Created journal file \"%s\".", path); - } else { + generate_datafilepath(datafile, path, sizeof(path)); + info("DBENGINE: created data file \"%s\".", path); + + journalfile = journalfile_alloc_and_init(datafile); + ret = journalfile_create(journalfile, datafile); + if (ret) goto error_after_journalfile; - } + + journalfile_v1_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: created journal file \"%s\".", path); + + ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); datafile_list_insert(ctx, datafile); - ctx->disk_space += datafile->pos + journalfile->pos; + ctx_last_fileno_increment(ctx); return 0; error_after_journalfile: - destroy_data_file(datafile); + destroy_data_file_unsafe(datafile); freez(journalfile); + error_after_datafile: freez(datafile); return ret; @@ -421,40 +522,86 @@ int init_data_files(struct rrdengine_instance *ctx) { int ret; + fatal_assert(0 == uv_rwlock_init(&ctx->datafiles.rwlock)); ret = scan_data_files(ctx); if (ret < 0) { - error("Failed to scan path \"%s\".", ctx->dbfiles_path); + error("DBENGINE: failed to scan path \"%s\".", ctx->config.dbfiles_path); return ret; } else if (0 == ret) { - info("Data files not found, creating in path \"%s\".", ctx->dbfiles_path); - ret = create_new_datafile_pair(ctx, 1, 1); + info("DBENGINE: data files not found, creating in path \"%s\".", ctx->config.dbfiles_path); + ctx->atomic.last_fileno = 0; + ret = create_new_datafile_pair(ctx); if (ret) { - error("Failed to create data and journal files in path \"%s\".", ctx->dbfiles_path); + error("DBENGINE: failed to create data and journal files in path \"%s\".", ctx->config.dbfiles_path); return ret; } - ctx->last_fileno = 1; + } + else { + if (ctx->loading.create_new_datafile_pair) + create_new_datafile_pair(ctx); + + while(rrdeng_ctx_exceeded_disk_quota(ctx)) + datafile_delete(ctx, ctx->datafiles.first, false, false); } + pgc_reset_hot_max(open_cache); + ctx->loading.create_new_datafile_pair = false; return 0; } void finalize_data_files(struct rrdengine_instance *ctx) { - struct rrdengine_datafile *datafile, *next_datafile; - struct rrdengine_journalfile *journalfile; - struct extent_info *extent, *next_extent; + bool logged = false; + + logged = false; + while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED)) { + if(!logged) { + info("Waiting for inflight flush to finish on tier %d...", ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); + } - for (datafile = ctx->datafiles.first ; datafile != NULL ; datafile = next_datafile) { - journalfile = datafile->journalfile; - next_datafile = datafile->next; + do { + struct rrdengine_datafile *datafile = ctx->datafiles.first; + struct rrdengine_journalfile *journalfile = datafile->journalfile; - for (extent = datafile->extents.first ; extent != NULL ; extent = next_extent) { - next_extent = extent->next; - freez(extent); + logged = false; + size_t iterations = 100; + while(!datafile_acquire_for_deletion(datafile) && datafile != ctx->datafiles.first->prev && --iterations > 0) { + if(!logged) { + info("Waiting to acquire data file %u of tier %d to close it...", datafile->fileno, ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); } - close_journal_file(journalfile, datafile); + + logged = false; + bool available = false; + do { + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + netdata_spinlock_lock(&datafile->writers.spinlock); + available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? false : true; + + if(!available) { + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + if(!logged) { + info("Waiting for writers to data file %u of tier %d to finish...", datafile->fileno, ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); + } + } while(!available); + + journalfile_close(journalfile, datafile); close_data_file(datafile); + datafile_list_delete_unsafe(ctx, datafile); + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + freez(journalfile); freez(datafile); - } + + } while(ctx->datafiles.first); } diff --git a/database/engine/datafile.h b/database/engine/datafile.h index 1cf256aff..274add91e 100644 --- a/database/engine/datafile.h +++ b/database/engine/datafile.h @@ -13,27 +13,25 @@ struct rrdengine_instance; #define DATAFILE_PREFIX "datafile-" #define DATAFILE_EXTENSION ".ndf" -#define MAX_DATAFILE_SIZE (1073741824LU) -#define MIN_DATAFILE_SIZE (4194304LU) +#ifndef MAX_DATAFILE_SIZE +#define MAX_DATAFILE_SIZE (512LU * 1024LU * 1024LU) +#endif +#if MIN_DATAFILE_SIZE > MAX_DATAFILE_SIZE +#error MIN_DATAFILE_SIZE > MAX_DATAFILE_SIZE +#endif + +#define MIN_DATAFILE_SIZE (4LU * 1024LU * 1024LU) #define MAX_DATAFILES (65536) /* Supports up to 64TiB for now */ -#define TARGET_DATAFILES (20) +#define TARGET_DATAFILES (50) -#define DATAFILE_IDEAL_IO_SIZE (1048576U) +typedef enum __attribute__ ((__packed__)) { + DATAFILE_ACQUIRE_OPEN_CACHE = 0, + DATAFILE_ACQUIRE_PAGE_DETAILS, + DATAFILE_ACQUIRE_RETENTION, -struct extent_info { - uint64_t offset; - uint32_t size; - uint8_t number_of_pages; - struct rrdengine_datafile *datafile; - struct extent_info *next; - struct rrdeng_page_descr *pages[]; -}; - -struct rrdengine_df_extents { - /* the extent list is sorted based on disk offset */ - struct extent_info *first; - struct extent_info *last; -}; + // terminator + DATAFILE_ACQUIRE_MAX, +} DATAFILE_ACQUIRE_REASONS; /* only one event loop is supported for now */ struct rrdengine_datafile { @@ -41,26 +39,50 @@ struct rrdengine_datafile { unsigned fileno; uv_file file; uint64_t pos; + uv_rwlock_t extent_rwlock; struct rrdengine_instance *ctx; - struct rrdengine_df_extents extents; struct rrdengine_journalfile *journalfile; + struct rrdengine_datafile *prev; struct rrdengine_datafile *next; -}; -struct rrdengine_datafile_list { - struct rrdengine_datafile *first; /* oldest */ - struct rrdengine_datafile *last; /* newest */ + struct { + SPINLOCK spinlock; + bool populated; + } populate_mrg; + + struct { + SPINLOCK spinlock; + size_t running; + size_t flushed_to_open_running; + } writers; + + struct { + SPINLOCK spinlock; + unsigned lockers; + unsigned lockers_by_reason[DATAFILE_ACQUIRE_MAX]; + bool available; + time_t time_to_evict; + } users; + + struct { + SPINLOCK spinlock; + Pvoid_t pending_epdl_by_extent_offset_judyL; + } extent_queries; }; -void df_extent_insert(struct extent_info *extent); +void datafile_acquire_dup(struct rrdengine_datafile *df); +bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); +void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); +bool datafile_acquire_for_deletion(struct rrdengine_datafile *df); + void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); -void datafile_list_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); +void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); int close_data_file(struct rrdengine_datafile *datafile); int unlink_data_file(struct rrdengine_datafile *datafile); -int destroy_data_file(struct rrdengine_datafile *datafile); +int destroy_data_file_unsafe(struct rrdengine_datafile *datafile); int create_data_file(struct rrdengine_datafile *datafile); -int create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno); +int create_new_datafile_pair(struct rrdengine_instance *ctx); int init_data_files(struct rrdengine_instance *ctx); void finalize_data_files(struct rrdengine_instance *ctx); diff --git a/database/engine/datafile.ksy b/database/engine/datafile.ksy new file mode 100644 index 000000000..28d4b3935 --- /dev/null +++ b/database/engine/datafile.ksy @@ -0,0 +1,74 @@ +meta: + id: netdata_datafile + endian: le + +seq: + - id: hdr + type: header + size: 4096 + - id: extents + type: extent + repeat: eos + +types: + header: + seq: + - id: magic + contents: "netdata-data-file" + - id: reserved + size: 15 + - id: version + contents: "1.0" + - id: reserved1 + size: 13 + - id: tier + type: u1 + extent_page_descr: + seq: + - id: type + type: u1 + enum: page_type + - id: uuid + size: 16 + - id: page_len + type: u4 + - id: start_time_ut + type: u8 + - id: end_time_ut + type: u8 + enums: + page_type: + 0: metrics + 1: tier + extent_header: + seq: + - id: payload_length + type: u4 + - id: compression_algorithm + type: u1 + enum: compression_algos + - id: number_of_pages + type: u1 + - id: page_descriptors + type: extent_page_descr + repeat: expr + repeat-expr: number_of_pages + enums: + compression_algos: + 0: rrd_no_compression + 1: rrd_lz4 + extent_trailer: + seq: + - id: crc32_checksum + type: u4 + extent: + seq: + - id: header + type: extent_header + - id: payload + size: header.payload_length + - id: trailer + type: extent_trailer + - id: padding + size: (((_io.pos + 4095) / 4096) * 4096) - _io.pos + # the extent size is made to always be a multiple of 4096 diff --git a/database/engine/dbengine-diagram.xml b/database/engine/dbengine-diagram.xml new file mode 100644 index 000000000..793e8a355 --- /dev/null +++ b/database/engine/dbengine-diagram.xml @@ -0,0 +1 @@ +7V1rc5tI1v41rsp8MEVz52PsOJvsxkkmydTu7Je3EGpZTJDQIOTY++vf7oYG+oJoIUDIlqcmthA00H3O0+d+rszb1dM/0mCzvE/mML4y9PnTlfnuyjBM3zc1G/2Bjz3nx4DuG/mRhzSaF8eqA9+j/0F6YnF0F83hljkxS5I4izbswTBZr2GYMceCNE1+sactkpi96yZ4gMKB72EQ06P0DfDxf0fzbFkcB45fffEBRg/L4uae4eRfzILw50Oa7NbFHdfJGubfrAI6TPGW22UwT37VDpl3V+ZtmiRZ/tfq6RbGeG7pnNHrsmf6oFfmzTJbxegDQH+Sr983XAxULkbvlcJ1Vr9d03jL53+l9tvPH39EN3fr2R/2l88f/rq2i1l4DOIdvQt/WzhHc118JLNj3pD5gnhgHX3a/oRZuCw+JGm2TB6SdRB/SpJN8ax/wSx7Lsgm2GUJ+yblvJLBsjT5Wa4hfudFss6Ka4FHzgjS7C2mmuqByLH3URwXo8D1nJ4RxsF2G4X5weIUQD5l6fN/8AdNLz//ia/XHMOlB949lSPiT8/Fp3mwXZL3J1c+Rdl/yIV28elPehH6uxoBf3hmXvM2iZOUzLJ5S37ojb7CNFrBDKbF6fmi4JXg2KBafUovyS4N4b4lL1YYzdgDzPacaLkljSPsgAl6nvQZXZjCOMiiR/ZJgoKDH8rzyku/JhF6RkMvAAcYfsEXBdr4ns6Okb9CcVmdovmRfEfX9OoHUUd9XGB5Njtw/srCwOiP2gtWhwgDHcJMYGrMxJDpfs5yhuAshq1aWKrkojoPlRwl5yL8fl+DDPHJmtzO0MFe3urGRu/v9J/Jp//+988/7ufO82fvK7iPrm3/xGwE3Drt65bNEL/j6kq0L2FPljstW407+2Ii07sw0R4m0ly7Ex8Zk2UkxxyGP/Q2Qj6AI9iRbMPV7HGZwheY4n2EJFZD/4rk2u2V4cRo1m9mKcMnzt87LJoSCr3eEhJ9i1fW2jyRJaTfo78eit/BCjHIzXq2xb+iNTr7PiC/boNwCaWM+CmYIYWCYZ8gjh4wHYWIGLAIc/MI0yxC4vrb4otVNJ/HhG0heq5gRsbDdLjBM0jm1L65st9x7GURRs8QWSRrRjDC48MnhlIKVaMYuxLG6yS6B4BEeiyGv9Y1RBAWQxDXxT2PJNlrC7DDOhyVJYvFFh5LYFImRIrS1GB3fEWgxFIVcUWqAdC/VWSXNnDtgqVUqa0LJdITPUOK4ANIKR6LnUDvKu17jqHpnut6hqubvuca7LjAsjQX3c12TNOxHGANJfrLJ15B9Gd55dcyyuD3TUDW81cabFhW4GGPIbdGAhFwsBHGDINfl+Lzr8p64tFjy5rhxOS30ToRMNN6MAYZh85hF4hhUEUEnTqCKWDHEayqyKnUxNYfpx63SKawSB+SrGdBZAZD9LjYELjDT3R6oUNc5v0UrCqONLInEjIs3+1HqgCssGJw/DucUCHj54I+pORRmWKvw3xbxFQSraMsCmIpoUhIDz3prCbQluTIHplHj+0nRfwBTBZHEXbxpHq4SzEpxc/SmzNC+AxG6wfMCgh0EPC1nv8rQsiG6Q2dHmb4Cn0eZEFtevK3YN8MHRbeFh1jpoljw5YNrIRVfjfDkht+mZrks1hAJwxlMtHc9Wd6Jecft+XZrHgNLHHLA65kyzN4kaU3NDVkLDIhqVsmiQy6K5qKu2Lv4utRyyjuiTWgy9nnNufHJD0QqMQjR4NQ8SglMOiLNFmhX6sddtjFsPoiX65tI3YMiAg2/k+GCA756QcRWH9BuTPW8YCaVet4APgttD/panLGzxPjAZV+28Vkd1KIAESD3bsozZ5lgnLOTdvgkWzWaDUQ/0Xbn+ck91Ky7UHuNSzT6Unu5dh7PLnXnTYX87Y0RS6Wu7npN3Ij1wicT21eU+F8fXCtR4okZ6H3ZMsAD4TIoVTu+YcIiFsjP+NXgCYBaz9tSg8BzRkZNN4RQxkDpJNWfRZGg+rjzBy7J0HHNjlr3wRUn4mLOi0gWTxOhZClh6B74MMBuMq4PirArns/jhS83DOFX9ne2y/83sYwWA8Iv9sNGf4oPy6L4xz8Ikra4t/JmkNIOQqH2PWrYHkifuIVXCXpcwu25m94Ksid29CbWzLI9YyZ2ZtuCaYHuaJSIszzpJ1UtnNyJ5UpE+/2zuE5O6lMQ3EXMO1J7QKm6I7FJi0EBFDEsdzbJFHJz8z7VJLmlL1P5mhaOKXdUdxPx4sB9MA9zALOYdPJTDyMEoe+rx5QL2xW/CPkmlfBb1juINJEm/jwFwIjBJL0mkdEarfiVaVKqC/QCmGRBtZupqgphnGyhVRFJGpjhAD7SfYuRKipPRp+MuNq2hrlKZxpjiR+ZGTxxpS5YSakUY7vTLNVd+9p6XCmJSwk1WCU+LTg0fPZuCnhXszndbIdyYLKbGm9K/M87o9z6Qibf+tGWzf17ol3YbdfXcLHR++1jRPX/zTtXc0pygqnsD5PQVaQhbsPZyacnnowltlxL2Zw9shG3t/3+H9sK29PCue7EAqD6NswWNcttReb5HQY8czcQIPknWhlTl/nzJNDsma7axTK+SXU+DJ0fomts2mvNhgs5XvvhOwL/TtYuPAaAPZdbnV5T1zcaYtgIZVA9mwV34NHjJxcbEGOq++k4UhxHG22mPZbcJChV15F4nAQkh8ZDpqO6ZvzfnDQsVkctCS+mbK2CRP5N5hfwVLwK0wJBxWNF1182zVc1Flg1AynBRv3FM8Y1oLij4R3XO0Mr2s2nWObmuci+PQt17A9n1X2fcPUTMt0fd20HUvXjVFB1RK9N+fECy89NKR/a6GcRH1g42oHvZC7Z/ma7zs4dNuwbM81OXIHmu84vg8sAHTdB+OSu9EuQ/yTVWg+YntJVxWQbu01Rw7RvIhHBwZpuCTmRhoaos+CbWGaQTfVmu9wtPGEe8nSe0RMSKtgsykeg8gkNNBEf4M4NSXf5Id+0xolIw5DOsc6yBS4AHoLqSXFCT04W/D6YA9SjJjCK0lgkIZHAGcwdU4WYjKUPCxjit7l4oLVmkzFnNeTe6Qov3gQybk5JGQSMnUJ1SeUqSdnW3gJpYT2Ch/yOkLdZQ3LUhTA6YjDF7TgywrxIsnQ0sKlFBBFQGkpoFZvq9VbstLLLAU0vSqG4wPlgRbZEfDN6j3AsgHfTKPBRzASvtHypPskyN93kLzrMsD6wUOwUQzDf7PNy4qQyLGAPEuyuCqCyPiTU/j3LsoVi02hMRFNhD9vTYIAF6SCcK5J8WesctwkcfS/TQE4FZHSbhICuyCl4Zis0k1z+I9FSoer+uqMBZSUdyWUqqzaOI3hFtstEZ6a9fnSI8zsyoeYBNT1Y9Cuq3TK7sfn1fUR8iNVonvM+hfqOUrN/5ZEVRnMC2o1hzz1RUqGAil92cALKR1FSs7JSQkAGSxNWH6z+pPf1PKP2oUyVS+3MZLXR1A6BTv4wEIZACJAfU02O/RWYriNIP5UdmZiI0tylAmnokYqSkMVY/UjDllcfG0/0hAnZNFSFCNIQ82REH1tYabCFsYZYL+Hwfqyk3XZybyT72T2xPM5zskQMbzZAPga66t1+YwB5bYe4lglMY5lhGhW7QQjxDaLyIATNUbohfUB34Y/u74Tn5VhoreEmSENE1yQAR+ZOKAFV6ZNnhQ4p+X36tZBo0sDjSN0ENXIs9F0EN1lBQRbsTGMGIvjOppT+2Gh3nKF5hpDg70oubIeNanTv2/nGmt7OT0Ud3OuldAzaeca4JxrVBUbAZonVxpwWtBsdMHm/fGQvSOzqnWIxvUNbx3i0nr8rpK3zyXq+cM10JNzhxiwcwIk/ufEssg7IjFFmkkjMdcdpkTmEZC4OVLx6MBEQVcsMncKlfEoc9Rg0bdf0+QxmsNSGSUTsy20UolNN0WwjhiSVGmhJcSziKi0GUZZYuvNDGkVl/J0HK6p/8jHeJPuMPsVDQw2aZSkUZ6L9Nu4xjm8zx1ej7zJbNe40xxgnOO7KBmSxM3SEDdOqXLHmJokc6aJm5pum4yi6ZK8+z35UehDjylOkj6mJxVnLF6a4YPVlaUZQ9PR3LqG7fiuoXOpJEA3LM3WfcfyfdvRDZqxOpKo4x1oo0HrisDwG5naZH1XffPCOOrqIEtNd7J3VDuN+GMF3nm6zxCoM7aPVxcDP+8jtFyyAtyFPH564VhRGvb6tEvogGu2fKTBuOw/MpbsC/Qzc631GCMyAmSUvRQnUk8N6KI7KwipQ4ln7c2ZsXZFzL14gwxa0oHao4/jbSrUcOrzaEGqjozT+1dzJaOI51OP6bdkl1V5eocm5NEjyvWKWqJKrNptZUWAhujAtYThz1LRpg7dQvFd0T2XduEKHoJoTQok1QPJr4qyqvochrnajtTdEA8AnzKIWcvQ17CqjUTqrMVJMCeHCgW8yPAlJ6DL0oC0BdvkqY4KbQMv4TP027I0+Z5S5WXaLVNaiQ9t6E9DP7NA0EHiZ1h9AuitBUQknbwVypX0p48rZwRRZB9cMbH4KvxdHb/CSI47rorjiEo33ZMijJe44Dvts3o28k/J5pMOhnHYYa3R5B9ZWOrrBkFVF2sdAocrKacc6DIe3nHGdqcz3jlcTK0LbM3iSH9oyBMDDl4A5FGmnjLkAYcbdTTzjjO5uhenCJzWdY/FPd/wuyDfqMKfamzJaGBo82kBfAMXdTDkc5j4kYZGQtHh/wKQsM9iFoMJfzZrLh8xyoMqGKNEeahmYrfZSNBKZhyYMraLAh7rBpDikDpFyiwv3AbQXF+qlVZ7qGUmMZ/YEvMJDyK9WU9cWRmUoShHNfH6QjmdegSOTDlGu/g1rTaLZXovnUPqUqmHF0nncLDabe7eQoK9Nmi4o/by3pu/9N9SoXzWA3q5zna1RixFL1dd0qyVa6YUEBlitUGyS17/c5GkK/FctvFC4VBIYTC/NFzAnMWJPrYlctbIHRe8MUWinF4vW9twuDzu3ubJ9jZ5sPP7vHxvNydvUW21GIqA2zspuL0pnKRMq8laQYqqnaSY4Hrm9X8B31dpCgWAfRFdwjrrVytHfdH8cVhAxoQNVvm+MaCRvvxGJZK4kZRazU2eakQTFWlbzU0FaeoaMB3O/knjh480Jhi672l+9WMwd/FpEGWLUUkIvrsezSThNUfkKMuLh++/nzC3pYfBcWtADB60kBS3GxhGC5ISQhm4IeljPlwzmYmVxAYesLhCFj4QERrt6xKEHiwsxDsz70CPYMsQz4F5of1h5NEACBgA5GKRhbDxBgN9X3Dmiw5GGt4WNZYzueLyLilk5JZ4qqSejRXe680Kr2uWK42UODrlnUsgG60YCd2g92kMn779Id+HGncJhB8bSDg92c3bdw11tY3bKJy55zrWlRBVuCA/0g3kLdLxe5Lwfd4qLtk9ZFl/w0n3zbLLZTEPXEzbPfViKhS5uixmk9eBK1tjn5w1RbGOjegmPm4igvO7cWkUfl2qdyMZtOrNvmrMmq/aKq1WakG3OcWZKtJdRYAR9vjm8tWNKq3cX5M+zN4YuBwPLghg4F4n5A9b/40smN69AgOc7zYxEgtJwdq7BhX1G/x7B7fZgWbLPXpyXy/e9H79Ti9/l3uYEjPsahdn0Yb0hNsgHsqzU1g1/8XOATZT5i+ccsQhvDIHm3QnxBbq71mSYpN2u5k5eoJzCnZ1aARt+2NrmZjD7Reux6Uky8pOyPwPw9mXAe1FemQu4wvcz2qxj75fhjoW0duGddUS+9hvJKOvGsmonizLqa7HRja6nEpsKbYOH3NjBUCkdjydNUf+65LZGnIaWmj7GLtff5TuqRI6a0EqyNPsxxBk8MXiDif7sigRM85oJRR9MaPnrlR0GL9xgwNCYJ3GovLlkVqvg0IKqJdilobEsLm6TaJBEu5WhOLa5IJZzryfZtJgl6EjEYDHhYGbEk9zGXXASAL+YArvAa1m962teOQDJLFSxBb9jNj/SsiFbjaENIUjlRCEV0jdZE0JZfX0gEhiqQUYH7eG9tdutfm4LtDyZo5XBeAMrTleMBPHJ5AzP8AA39O222hkEUebD6UIKk+sRk8wjyBjbJmR/64kxpZek6strvSwIaE+aRRVCXYDyKEdwqgU9NSDXcHN1Q1q2KRc2uD426ZJCLfbJD383mqBl0cXX3iTV10QK+q3OanyCMpK2SiLHi4hh/fN9QovZRP2l00wTQlny8omDKlhih7N2v5fltCgNlWy28u3+YnFLI4iKTg+57OS2QxkfWQGi1kE4MCi25U29GLr7HFlt3WvrS4GW7tSZiuoClrSVPH29MkRtK7ROuA4fPHs7u1vDLQgrusZrm76Ho0rKduIWJqL5HHbMU3HcoDJlZIavBOcLHwo38SWgO5fn2FWxOViTWwWbInRff0Qresx4NX59GAt+P/Hksjf5B880BIhBYZf+AjxQ87hFgNhvWcObiyX5ibQ4vC7m7vP//j4+U6r3VSSuKGUy7HnUokwgpH/ugDxt+SxF3IB8G3e+Ic++Ta3RkfrMN4VZue/iyTVEH2bpbswf0F8FkKkjJxzW+xL+QVMnB0pzhzvtsvyROLS1eFjFBY3zMOryQ5W3B5B2xXNGAm2z+twmSbrZLdVmcdjJmP02acJwEWx6QhNAL5glJfcxfyRWEiTKeXhDx//8UEqJ5N7IZ5In7N8iWli0CxOwp/bUnVF5BPHsCAe/Q2uvI2/xC4SLH7kl+bB8ynMHWX5qUVVMrLZ/B8iQBis8MnmO0SLjKArPv2e9/n85dv920/Nb7TbksrgefpWbmzBfBAxKs1Bd/z05d/Nt2NfOZ+GJQxi/Ordbndz9/0Hxrz37798+9F84xXiOgKKiF6CdF2rtSq9ITooks3JmbE3qRfpiWFOXHb16QeRqK6NphyNBBH/IiZizxIJxXB9hABMxe40F0t7EYh9LlJEouFYshwNczDbhSMzXTRYKHEpwhhSQ1OL4klsTlSM1TXLQIqiZRi24egWcEi0DjmlmHJdMy3TRoIScDwTmBbwuSXptxA/q0MzRFHznfp9OU8N29O4sp9AtzRahqKu4BqmBkyRAvpQcZfP/0rtt58//ohu7tazP+wvnz/8dS2Ga3z/8RbDlP7h7tvdccYHPtLeBpi6DwjTujF0vbyCqmSyFg0NKyJZtz2L5OByzswiyWyLhoQ/+wjQl66OKGC/1tXxNN+sRbr73rQWSuaAuMCoAKNH0YCvmbZe+2F5FQCgUUtAnQxMXwP+iIAKxB31A9FXOXJoW3icRYWflUQBSeSTpiR2iWvGd+e668pWmEtwPxJAga9RTxwVcGyRMWVBQn3UjpCvhli65XOSrnA+8MDrQZOAJevhwsCBupTjaIJwPxvaBNfDENbjE8KMgRcDbWBG2LAYi8XcdxzZYsydmWP3xRwmF/J/+pUQ8zdu4FYSAnG3WCSpGFXU9xJ5IWxaIsQuM/kSzTzbsnvaXoDpTm2JxKyMr2UrNH6Zft/BnSTAcRp+J7l36bjlApYSvMliAYZbscm12z5Fuc2DOqFVviLDrDc/A7hj11W35mecg6ebW4l2hay7lfZIfP17kfgtw+K1F+XwPGHzUewK28FTJJ+hybU6PiO2sADLFUSnOwOuGMi3aiItw+6LL4SxRucMBcPBS+WMKkjA81kS162DSbwrn52ENYyBWMNuEGDPbsMwRCPKq2GLozcM13E5djqTHcMchi0s29Ysr8l0bPGWwPNhEllZ4lfHJMDjavsXaoMSq9TD0zq3djqUS/aajdt7L1sDcYm1l0sUK/RLxuWsKqWDeywuES2Or5BLTN99EVyiWoFrMC5x9nJJZ4HL4kKlrbH3kjPrAjwMlzi54nG4yDUhDjFOzSH7pS3FNn/trbgErX9oDrm00ESE7gHz7DlEtdLtifaQzvqIsIco9mfqjUNE58criXzwKDjQBHhJBbBRPYeGmBj/asIePHdqiyF6P15HzINvWHxE5ekXQwypfOVhD747wVUS3SKXyIeqXJHdjnCjhj2YF3P9MbIx0vaPk44HtuDnNR37F4Q9y9N8p0kQ9vikF2VBGI/rsX5dl9c7B5aFzYtt/hiOsJigB1zEzn09XOEPxBX+6bniYos/hiucV8wVPt4r3P65Ao/rOafliovt/Riu8EiNs3q4g/l6uMIfiCv803PFxd5+tjrFMEKRazYRun+EUORr1D5HuYY3wA9N6JecgQuhM/KIa/VP6L6FCN07LaGLTolXSOjANhhC14DZ1uJzIsSuUtd6oBBObNEZBP6t08P/JTcGw7/BlSEExoUrVKT/QfYK/+R7hXWg/RStSZQ9fyMTmqynWauzYoM6rwCWVwpuqhhFZxiFZZK2EM6TiTGGaVn1xqQ+l2XVNcalqvXbNNLQdClaMN/HEmd+S4NSWlSs9/6kym1HWSrYw4ONbshrXC6J6zhbzM7xHUqBbNheG0rIX/nMTHFWfxij3BjlFHAisH3XiroImGytVkPItvlx0bcA+9iRxmS7SijVG/GJFq/bZL3dxfKWPmcPLw1SUTH8cO2PLUc27AjgcmaGnqHAZTowopplcSiMoHEZGKEhx2MBiWhoqboWvEQkafCuvFwkkdkMuKrD9+geUYgLO3+DD9GWvMkhzVWUKqEr94hp6sBCByrLG98HaI7R/6TWOx47Wl+v4CohCxohIHyqvYasKLKswcyesshDv88V1zmGFKPP286s6Aod/D7RQc+oy5uf5gW146bXYJ56gS4y9F/LKFy2PO6p5jmFmGpIdfCiynmBRGXx+scgigm2cO82xfkn/YPyTgbF2yyiGLaRitrcj048eQ31/G3y1gLaRIlob53z8wCbEZ//oNYML+vVL88/9vMf1EKv/9f7iOW+TZo8RnPS6StaL3A+Fe0SMUt2GdsupPclk/cI6aOrQ/+zlWN9Fq0gUhTRVsC25onnJOGG2ae5DbrDLMpbcpzh/OT/YmWtZZZur4r+JC99uiIixpUzstvM8/57C9KAdx0+t8kTynNwpdJUhdOg++zASHP2mvPy9mXzNWrW6rlF11znsrIpY71lH61yytRT500pvSUXWbIuYxM2mwl+v14c5O5BxcDYFn0dkvBVbXQKsa/FarWW/8qNGr0b/a4NYNVNdbrPEXjXnudcG+yyPsVYnsjJVY5kiO6M6uWpc8VxbjEFXpGUphiVVwQDOTC0jvzBFy9W9Nj1xR626Kg/WEDxWgSUGT3wjvRX1H9Hp5Wt4StFiJo0cRu7PUrSTFFwqS7+hnusUBsRYyCqzgmw6LSGuBF3QEikJjFWZ6UwDOKQ9jVOYYbjabBqwwNKHEebLebUFqmHwQLBqcRspkwTNq4nEfmRyUOmY/rmvB+ZBwCuRB3QJY3ZDEMi9YDBGgrZl0iEPZEIJehzUYPKNe1qkF9tAIOLQp6qLJSjV/8Rg4Zpc5SuWI1RUm2I+DibPKB4YNYFqrSL9Ib+F1e7CvccGE5IuQechnusk3MPt0/4nft6A5MJHwDe/nGH5pZmdzIm0qOMObk1a71bzRrEH9E7WrQnJr680keKowMQC8X4z1WA/WNtYxFXE352/JD7BK/8HRusO5XEQ/v3hXGym7dLQC2mnrmn6655dUC7xbe2rlsVSx3Z68jTfLaIjCkpIlOqA4zMM1gZGbu5n/zRdEhF5EaCqOhL7h6uzkw2sOySfqEvKX2ZFouUsgpFcuIaTqCW2Ut6Iq6iRXnZllxPg/VDM42JX6C1Twu8K0JdLlQlK1XG7b/uyalK2j+7J6paBU91ikBEJhJEHpgzn8H1A+kdj31CkcQMwKOa8VTuoyWQLRNJGbwr48Y07ve7d09Cio7lOp57CCmawDJtaxiAk1GirAQb4KsoKFAi+pgmmDIqmQ9N1vI+meOozbv/Bw== \ No newline at end of file diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index 500dd7880..de2b909c0 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -1,132 +1,424 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -static void flush_transaction_buffer_cb(uv_fs_t* req) + +// DBENGINE2: Helper + +static void update_metric_retention_and_granularity_by_uuid( + struct rrdengine_instance *ctx, uuid_t *uuid, + time_t first_time_s, time_t last_time_s, + time_t update_every_s, time_t now_s) +{ + if(unlikely(last_time_s > now_s)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), " + "fixing last time to now", + first_time_s, last_time_s, now_s); + last_time_s = now_s; + } + + if (unlikely(first_time_s > last_time_s)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), " + "fixing first time to last time", + first_time_s, last_time_s, now_s); + + first_time_s = last_time_s; + } + + if (unlikely(first_time_s == 0 || last_time_s == 0)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), " + "using them as-is", + first_time_s, last_time_s, now_s); + } + + bool added = false; + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx); + if (!metric) { + MRG_ENTRY entry = { + .section = (Word_t) ctx, + .first_time_s = first_time_s, + .last_time_s = last_time_s, + .latest_update_every_s = update_every_s + }; + uuid_copy(entry.uuid, *uuid); + metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); + } + + if (likely(!added)) + mrg_metric_expand_retention(main_mrg, metric, first_time_s, last_time_s, update_every_s); + + mrg_metric_release(main_mrg, metric); +} + +static void after_extent_write_journalfile_v1_io(uv_fs_t* req) { - struct generic_io_descriptor *io_descr = req->data; - struct rrdengine_worker_config* wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; + worker_is_busy(RRDENG_FLUSH_TRANSACTION_BUFFER_CB); + + WAL *wal = req->data; + struct generic_io_descriptor *io_descr = &wal->io_descr; + struct rrdengine_instance *ctx = io_descr->ctx; debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); if (req->result < 0) { - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); + ctx_io_error(ctx); + error("DBENGINE: %s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); } else { debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); } uv_fs_req_cleanup(req); - posix_memfree(io_descr->buf); - freez(io_descr); + wal_release(wal); + + __atomic_sub_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED); + + worker_is_idle(); } /* Careful to always call this before creating a new journal file */ -void wal_flush_transaction_buffer(struct rrdengine_worker_config* wc) +void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, WAL *wal, uv_loop_t *loop) { - struct rrdengine_instance *ctx = wc->ctx; int ret; struct generic_io_descriptor *io_descr; - unsigned pos, size; - struct rrdengine_journalfile *journalfile; + struct rrdengine_journalfile *journalfile = datafile->journalfile; - if (unlikely(NULL == ctx->commit_log.buf || 0 == ctx->commit_log.buf_pos)) { - return; - } - /* care with outstanding transactions when switching journal files */ - journalfile = ctx->datafiles.last->journalfile; - - io_descr = mallocz(sizeof(*io_descr)); - pos = ctx->commit_log.buf_pos; - size = ctx->commit_log.buf_size; - if (pos < size) { + io_descr = &wal->io_descr; + io_descr->ctx = ctx; + if (wal->size < wal->buf_size) { /* simulate an empty transaction to skip the rest of the block */ - *(uint8_t *) (ctx->commit_log.buf + pos) = STORE_PADDING; + *(uint8_t *) (wal->buf + wal->size) = STORE_PADDING; } - io_descr->buf = ctx->commit_log.buf; - io_descr->bytes = size; - io_descr->pos = journalfile->pos; - io_descr->req.data = io_descr; + io_descr->buf = wal->buf; + io_descr->bytes = wal->buf_size; + + netdata_spinlock_lock(&journalfile->unsafe.spinlock); + io_descr->pos = journalfile->unsafe.pos; + journalfile->unsafe.pos += wal->buf_size; + netdata_spinlock_unlock(&journalfile->unsafe.spinlock); + + io_descr->req.data = wal; + io_descr->data = journalfile; io_descr->completion = NULL; - io_descr->iov = uv_buf_init((void *)io_descr->buf, size); - ret = uv_fs_write(wc->loop, &io_descr->req, journalfile->file, &io_descr->iov, 1, - journalfile->pos, flush_transaction_buffer_cb); + io_descr->iov = uv_buf_init((void *)io_descr->buf, wal->buf_size); + ret = uv_fs_write(loop, &io_descr->req, journalfile->file, &io_descr->iov, 1, + (int64_t)io_descr->pos, after_extent_write_journalfile_v1_io); fatal_assert(-1 != ret); - journalfile->pos += RRDENG_BLOCK_SIZE; - ctx->disk_space += RRDENG_BLOCK_SIZE; - ctx->commit_log.buf = NULL; - ctx->stats.io_write_bytes += RRDENG_BLOCK_SIZE; - ++ctx->stats.io_write_requests; + + ctx_current_disk_space_increase(ctx, wal->buf_size); + ctx_io_write_op_bytes(ctx, wal->buf_size); } -void * wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned size) +void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { - struct rrdengine_instance *ctx = wc->ctx; - int ret; - unsigned buf_pos = 0, buf_size; - - fatal_assert(size); - if (ctx->commit_log.buf) { - unsigned remaining; - - buf_pos = ctx->commit_log.buf_pos; - buf_size = ctx->commit_log.buf_size; - remaining = buf_size - buf_pos; - if (size > remaining) { - /* we need a new buffer */ - wal_flush_transaction_buffer(wc); + (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION_V2, + datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); +} + +void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen) +{ + (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, + datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); +} + +static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengine_journalfile *journalfile, size_t *data_size) { + struct journal_v2_header *j2_header = NULL; + + netdata_spinlock_lock(&journalfile->mmap.spinlock); + + if(!journalfile->mmap.data) { + journalfile->mmap.data = mmap(NULL, journalfile->mmap.size, PROT_READ, MAP_SHARED, journalfile->mmap.fd, 0); + if (journalfile->mmap.data == MAP_FAILED) { + internal_fatal(true, "DBENGINE: failed to re-mmap() journal file v2"); + close(journalfile->mmap.fd); + journalfile->mmap.fd = -1; + journalfile->mmap.data = NULL; + journalfile->mmap.size = 0; + + netdata_spinlock_lock(&journalfile->v2.spinlock); + journalfile->v2.flags &= ~(JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED); + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + ctx_fs_error(journalfile->datafile->ctx); + } + else { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_mapped, 1, __ATOMIC_RELAXED); + + madvise_dontfork(journalfile->mmap.data, journalfile->mmap.size); + madvise_dontdump(journalfile->mmap.data, journalfile->mmap.size); + madvise_random(journalfile->mmap.data, journalfile->mmap.size); + madvise_dontneed(journalfile->mmap.data, journalfile->mmap.size); + + netdata_spinlock_lock(&journalfile->v2.spinlock); + journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED; + netdata_spinlock_unlock(&journalfile->v2.spinlock); + } + } + + if(journalfile->mmap.data) { + j2_header = journalfile->mmap.data; + + if (data_size) + *data_size = journalfile->mmap.size; + } + + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + + return j2_header; +} + +static bool journalfile_v2_mounted_data_unmount(struct rrdengine_journalfile *journalfile, bool have_locks, bool wait) { + bool unmounted = false; + + if(!have_locks) { + if(!wait) { + if (!netdata_spinlock_trylock(&journalfile->mmap.spinlock)) + return false; } + else + netdata_spinlock_lock(&journalfile->mmap.spinlock); + + if(!wait) { + if(!netdata_spinlock_trylock(&journalfile->v2.spinlock)) { + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + return false; + } + } + else + netdata_spinlock_lock(&journalfile->v2.spinlock); } - if (NULL == ctx->commit_log.buf) { - buf_size = ALIGN_BYTES_CEILING(size); - ret = posix_memalign((void *)&ctx->commit_log.buf, RRDFILE_ALIGNMENT, buf_size); - if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + + if(!journalfile->v2.refcount) { + if(journalfile->mmap.data) { + if (munmap(journalfile->mmap.data, journalfile->mmap.size)) { + char path[RRDENG_PATH_MAX]; + journalfile_v2_generate_path(journalfile->datafile, path, sizeof(path)); + error("DBENGINE: failed to unmap index file '%s'", path); + internal_fatal(true, "DBENGINE: failed to unmap file '%s'", path); + ctx_fs_error(journalfile->datafile->ctx); + } + else { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.journal_v2_unmapped, 1, __ATOMIC_RELAXED); + journalfile->mmap.data = NULL; + journalfile->v2.flags &= ~JOURNALFILE_FLAG_IS_MOUNTED; + } } - memset(ctx->commit_log.buf, 0, buf_size); - buf_pos = ctx->commit_log.buf_pos = 0; - ctx->commit_log.buf_size = buf_size; + + unmounted = true; } - ctx->commit_log.buf_pos += size; - return ctx->commit_log.buf + buf_pos; + if(!have_locks) { + netdata_spinlock_unlock(&journalfile->v2.spinlock); + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + } + + return unmounted; } -void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) -{ - (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, - datafile->ctx->dbfiles_path, datafile->tier, datafile->fileno); +void journalfile_v2_data_unmount_cleanup(time_t now_s) { + // DO NOT WAIT ON ANY LOCK!!! + + for(size_t tier = 0; tier < (size_t)storage_tiers ;tier++) { + struct rrdengine_instance *ctx = multidb_ctx[tier]; + if(!ctx) continue; + + struct rrdengine_datafile *datafile; + if(uv_rwlock_tryrdlock(&ctx->datafiles.rwlock) != 0) + continue; + + for (datafile = ctx->datafiles.first; datafile; datafile = datafile->next) { + struct rrdengine_journalfile *journalfile = datafile->journalfile; + + if(!netdata_spinlock_trylock(&journalfile->v2.spinlock)) + continue; + + bool unmount = false; + if (!journalfile->v2.refcount && (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED)) { + // this journal has no references and it is mounted + + if (!journalfile->v2.not_needed_since_s) + journalfile->v2.not_needed_since_s = now_s; + + else if (now_s - journalfile->v2.not_needed_since_s >= 120) + // 2 minutes have passed since last use + unmount = true; + } + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + if (unmount) + journalfile_v2_mounted_data_unmount(journalfile, false, false); + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + } +} + +struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s) { + netdata_spinlock_lock(&journalfile->v2.spinlock); + + bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE); + bool is_mounted = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED); + bool do_we_need_it = false; + + if(has_data) { + if (!wanted_first_time_s || !wanted_last_time_s || + is_page_in_time_range(journalfile->v2.first_time_s, journalfile->v2.last_time_s, + wanted_first_time_s, wanted_last_time_s) == PAGE_IS_IN_RANGE) { + + journalfile->v2.refcount++; + + do_we_need_it = true; + + if (!wanted_first_time_s && !wanted_last_time_s && !is_mounted) + journalfile->v2.flags |= JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION; + else + journalfile->v2.flags &= ~JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION; + + } + } + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + if(do_we_need_it) + return journalfile_v2_mounted_data_get(journalfile, data_size); + + return NULL; +} + +void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile) { + netdata_spinlock_lock(&journalfile->v2.spinlock); + + internal_fatal(!journalfile->mmap.data, "trying to release a journalfile without data"); + internal_fatal(journalfile->v2.refcount < 1, "trying to release a non-acquired journalfile"); + + bool unmount = false; + + journalfile->v2.refcount--; + + if(journalfile->v2.refcount == 0) { + journalfile->v2.not_needed_since_s = 0; + + if(journalfile->v2.flags & JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION) + unmount = true; + } + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + if(unmount) + journalfile_v2_mounted_data_unmount(journalfile, false, true); +} + +bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile) { + + netdata_spinlock_lock(&journalfile->v2.spinlock); + bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE); + netdata_spinlock_unlock(&journalfile->v2.spinlock); + + return has_data; +} + +size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile) { + + netdata_spinlock_lock(&journalfile->mmap.spinlock); + size_t data_size = journalfile->mmap.size; + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + + return data_size; +} + +void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size) { + netdata_spinlock_lock(&journalfile->mmap.spinlock); + netdata_spinlock_lock(&journalfile->v2.spinlock); + + internal_fatal(journalfile->mmap.fd != -1, "DBENGINE JOURNALFILE: trying to re-set journal fd"); + internal_fatal(journalfile->mmap.data, "DBENGINE JOURNALFILE: trying to re-set journal_data"); + internal_fatal(journalfile->v2.refcount, "DBENGINE JOURNALFILE: trying to re-set journal_data of referenced journalfile"); + + journalfile->mmap.fd = fd; + journalfile->mmap.data = journal_data; + journalfile->mmap.size = journal_data_size; + journalfile->v2.not_needed_since_s = now_monotonic_sec(); + journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED; + + struct journal_v2_header *j2_header = journalfile->mmap.data; + journalfile->v2.first_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC); + journalfile->v2.last_time_s = (time_t)(j2_header->end_time_ut / USEC_PER_SEC); + + journalfile_v2_mounted_data_unmount(journalfile, true, true); + + netdata_spinlock_unlock(&journalfile->v2.spinlock); + netdata_spinlock_unlock(&journalfile->mmap.spinlock); +} + +static void journalfile_v2_data_unmap_permanently(struct rrdengine_journalfile *journalfile) { + bool has_references = false; + + do { + if (has_references) + sleep_usec(10 * USEC_PER_MS); + + netdata_spinlock_lock(&journalfile->mmap.spinlock); + netdata_spinlock_lock(&journalfile->v2.spinlock); + + if(journalfile_v2_mounted_data_unmount(journalfile, true, true)) { + if(journalfile->mmap.fd != -1) + close(journalfile->mmap.fd); + + journalfile->mmap.fd = -1; + journalfile->mmap.data = NULL; + journalfile->mmap.size = 0; + journalfile->v2.first_time_s = 0; + journalfile->v2.last_time_s = 0; + journalfile->v2.flags = 0; + } + else { + has_references = true; + internal_error(true, "DBENGINE JOURNALFILE: waiting for journalfile to be available to unmap..."); + } + + netdata_spinlock_unlock(&journalfile->v2.spinlock); + netdata_spinlock_unlock(&journalfile->mmap.spinlock); + + } while(has_references); } -void journalfile_init(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafile *datafile) { - journalfile->file = (uv_file)0; - journalfile->pos = 0; + struct rrdengine_journalfile *journalfile = callocz(1, sizeof(struct rrdengine_journalfile)); journalfile->datafile = datafile; + netdata_spinlock_init(&journalfile->mmap.spinlock); + netdata_spinlock_init(&journalfile->v2.spinlock); + netdata_spinlock_init(&journalfile->unsafe.spinlock); + journalfile->mmap.fd = -1; + datafile->journalfile = journalfile; + return journalfile; } -int close_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +static int close_uv_file(struct rrdengine_datafile *datafile, uv_file file) { - struct rrdengine_instance *ctx = datafile->ctx; - uv_fs_t req; int ret; char path[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); - - ret = uv_fs_close(NULL, &req, journalfile->file, NULL); + uv_fs_t req; + ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(datafile->ctx); } uv_fs_req_cleanup(&req); - return ret; } -int unlink_journal_file(struct rrdengine_journalfile *journalfile) +int journalfile_close(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +{ + if(journalfile_v2_data_available(journalfile)) { + journalfile_v2_data_unmap_permanently(journalfile); + return 0; + } + + return close_uv_file(datafile, journalfile->file); +} + +int journalfile_unlink(struct rrdengine_journalfile *journalfile) { struct rrdengine_datafile *datafile = journalfile->datafile; struct rrdengine_instance *ctx = datafile->ctx; @@ -134,60 +426,65 @@ int unlink_journal_file(struct rrdengine_journalfile *journalfile) int ret; char path[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); + journalfile_v1_generate_path(datafile, path, sizeof(path)); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.journalfile_deletions; + __atomic_add_fetch(&ctx->stats.journalfile_deletions, 1, __ATOMIC_RELAXED); return ret; } -int destroy_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; int ret; char path[RRDENG_PATH_MAX]; + char path_v2[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + journalfile_v2_generate_path(datafile, path_v2, sizeof(path)); + if (journalfile->file) { ret = uv_fs_ftruncate(NULL, &req, journalfile->file, 0, NULL); if (ret < 0) { - error("uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); + (void) close_uv_file(datafile, journalfile->file); + } - ret = uv_fs_close(NULL, &req, journalfile->file, NULL); + // This is the new journal v2 index file + ret = uv_fs_unlink(NULL, &req, path_v2, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); - ++ctx->stats.journalfile_deletions; + __atomic_add_fetch(&ctx->stats.journalfile_deletions, 2, __ATOMIC_RELAXED); + + if(journalfile_v2_data_available(journalfile)) + journalfile_v2_data_unmap_permanently(journalfile); return ret; } -int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; @@ -197,19 +494,18 @@ int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdeng uv_buf_t iov; char path[RRDENG_PATH_MAX]; - generate_journalfilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_CREAT | O_RDWR | O_TRUNC, &file); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + fd = open_file_for_io(path, O_CREAT | O_RDWR | O_TRUNC, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); return fd; } journalfile->file = file; - ++ctx->stats.journalfile_creations; + __atomic_add_fetch(&ctx->stats.journalfile_creations, 1, __ATOMIC_RELAXED); ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } memset(superblock, 0, sizeof(*superblock)); (void) strncpy(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ); @@ -220,25 +516,24 @@ int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdeng ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); - error("uv_fs_write: %s", uv_strerror(ret)); - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); + error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); + ctx_io_error(ctx); } uv_fs_req_cleanup(&req); posix_memfree(superblock); if (ret < 0) { - destroy_journal_file(journalfile, datafile); + journalfile_destroy_unsafe(journalfile, datafile); return ret; } - journalfile->pos = sizeof(*superblock); - ctx->stats.io_write_bytes += sizeof(*superblock); - ++ctx->stats.io_write_requests; + journalfile->unsafe.pos = sizeof(*superblock); + + ctx_io_write_op_bytes(ctx, sizeof(*superblock)); return 0; } -static int check_journal_file_superblock(uv_file file) +static int journalfile_check_superblock(uv_file file) { int ret; struct rrdeng_jf_sb *superblock; @@ -247,13 +542,13 @@ static int check_journal_file_superblock(uv_file file) ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); } iov = uv_buf_init((void *)superblock, sizeof(*superblock)); ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - error("uv_fs_read: %s", uv_strerror(ret)); + error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); uv_fs_req_cleanup(&req); goto error; } @@ -262,7 +557,7 @@ static int check_journal_file_superblock(uv_file file) if (strncmp(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ) || strncmp(superblock->version, RRDENG_JF_VER, RRDENG_VER_SZ)) { - error("File has invalid superblock."); + error("DBENGINE: File has invalid superblock."); ret = UV_EINVAL; } else { ret = 0; @@ -272,15 +567,10 @@ static int check_journal_file_superblock(uv_file file) return ret; } -static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - void *buf, unsigned max_size) +static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, void *buf, unsigned max_size) { static BITMAP256 page_error_map; - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned i, count, payload_length, descr_size, valid_pages; - struct rrdeng_page_descr *descr; - struct extent_info *extent; - /* persistent structures */ + unsigned i, count, payload_length, descr_size; struct rrdeng_jf_store_data *jf_metric_data; jf_metric_data = buf; @@ -288,117 +578,65 @@ static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrden descr_size = sizeof(*jf_metric_data->descr) * count; payload_length = sizeof(*jf_metric_data) + descr_size; if (payload_length > max_size) { - error("Corrupted transaction payload."); + error("DBENGINE: corrupted transaction payload."); return; } - extent = mallocz(sizeof(*extent) + count * sizeof(extent->pages[0])); - extent->offset = jf_metric_data->extent_offset; - extent->size = jf_metric_data->extent_size; - extent->datafile = journalfile->datafile; - extent->next = NULL; - - for (i = 0, valid_pages = 0 ; i < count ; ++i) { + time_t now_s = max_acceptable_collected_time(); + for (i = 0; i < count ; ++i) { uuid_t *temp_id; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; uint8_t page_type = jf_metric_data->descr[i].type; if (page_type > PAGE_TYPE_MAX) { if (!bitmap256_get_bit(&page_error_map, page_type)) { - error("Unknown page type %d encountered.", page_type); + error("DBENGINE: unknown page type %d encountered.", page_type); bitmap256_set_bit(&page_error_map, page_type, 1); } continue; } - uint64_t start_time_ut = jf_metric_data->descr[i].start_time_ut; - uint64_t end_time_ut = jf_metric_data->descr[i].end_time_ut; - size_t entries = jf_metric_data->descr[i].page_length / page_type_size[page_type]; - time_t update_every_s = (entries > 1) ? ((end_time_ut - start_time_ut) / USEC_PER_SEC / (entries - 1)) : 0; - - if (unlikely(start_time_ut > end_time_ut)) { - ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].latest_end_time_ut = end_time_ut; - continue; - } - if (unlikely(start_time_ut == end_time_ut && entries != 1)) { - ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].latest_end_time_ut = end_time_ut; - continue; - } - - if (unlikely(!entries)) { - ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].latest_end_time_ut = end_time_ut; - continue; - } + temp_id = (uuid_t *)jf_metric_data->descr[i].uuid; + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, temp_id, (Word_t) ctx); - if(entries > 1 && update_every_s == 0) { - ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].latest_end_time_ut = end_time_ut; - continue; - } + struct rrdeng_extent_page_descr *descr = &jf_metric_data->descr[i]; + VALIDATED_PAGE_DESCRIPTOR vd = validate_extent_page_descr( + descr, now_s, + (metric) ? mrg_metric_get_update_every_s(main_mrg, metric) : 0, + false); - if(start_time_ut + update_every_s * USEC_PER_SEC * (entries - 1) != end_time_ut) { - ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter++; - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].latest_end_time_ut < end_time_ut) - ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].latest_end_time_ut = end_time_ut; + if(!vd.is_valid) { + if(metric) + mrg_metric_release(main_mrg, metric); - // let this be - // end_time_ut = start_time_ut + update_every_s * USEC_PER_SEC * (entries - 1); + continue; } - temp_id = (uuid_t *)jf_metric_data->descr[i].uuid; - - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, temp_id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - /* First time we see the UUID */ - uv_rwlock_wrlock(&pg_cache->metrics_index.lock); - PValue = JudyHSIns(&pg_cache->metrics_index.JudyHS_array, temp_id, sizeof(uuid_t), PJE0); - fatal_assert(NULL == *PValue); /* TODO: figure out concurrency model */ - *PValue = page_index = create_page_index(temp_id, ctx); - page_index->prev = pg_cache->metrics_index.last_page_index; - pg_cache->metrics_index.last_page_index = page_index; - uv_rwlock_wrunlock(&pg_cache->metrics_index.lock); + bool update_metric_time = true; + if (!metric) { + MRG_ENTRY entry = { + .section = (Word_t)ctx, + .first_time_s = vd.start_time_s, + .last_time_s = vd.end_time_s, + .latest_update_every_s = vd.update_every_s, + }; + uuid_copy(entry.uuid, *temp_id); + + bool added; + metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); + if(added) + update_metric_time = false; } + Word_t metric_id = mrg_metric_id(main_mrg, metric); - descr = pg_cache_create_descr(); - descr->page_length = jf_metric_data->descr[i].page_length; - descr->start_time_ut = start_time_ut; - descr->end_time_ut = end_time_ut; - descr->update_every_s = (update_every_s > 0) ? (uint32_t)update_every_s : (page_index->latest_update_every_s); - descr->id = &page_index->id; - descr->extent = extent; - descr->type = page_type; - extent->pages[valid_pages++] = descr; - pg_cache_insert(ctx, page_index, descr); + if (update_metric_time) + mrg_metric_expand_retention(main_mrg, metric, vd.start_time_s, vd.end_time_s, vd.update_every_s); - if(page_index->latest_time_ut == descr->end_time_ut) - page_index->latest_update_every_s = descr->update_every_s; + pgc_open_add_hot_page( + (Word_t)ctx, metric_id, vd.start_time_s, vd.end_time_s, vd.update_every_s, + journalfile->datafile, + jf_metric_data->extent_offset, jf_metric_data->extent_size, jf_metric_data->descr[i].page_length); - if(descr->update_every_s == 0) - fatal( - "DBENGINE: page descriptor update every is zero, end_time_ut = %llu, start_time_ut = %llu, entries = %zu", - (unsigned long long)end_time_ut, (unsigned long long)start_time_ut, entries); - } - - extent->number_of_pages = valid_pages; - - if (likely(valid_pages)) - df_extent_insert(extent); - else { - freez(extent); - ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter++; + mrg_metric_release(main_mrg, metric); } } @@ -407,8 +645,8 @@ static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrden * Sets id to the current transaction id or to 0 if unknown. * Returns size of transaction record or 0 for unknown size. */ -static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - void *buf, uint64_t *id, unsigned max_size) +static unsigned journalfile_replay_transaction(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, + void *buf, uint64_t *id, unsigned max_size) { unsigned payload_length, size_bytes; int ret; @@ -424,14 +662,14 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng return 0; } if (sizeof(*jf_header) > max_size) { - error("Corrupted transaction record, skipping."); + error("DBENGINE: corrupted transaction record, skipping."); return 0; } *id = jf_header->id; payload_length = jf_header->payload_length; size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer); if (size_bytes > max_size) { - error("Corrupted transaction record, skipping."); + error("DBENGINE: corrupted transaction record, skipping."); return 0; } jf_trailer = buf + sizeof(*jf_header) + payload_length; @@ -440,16 +678,16 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng ret = crc32cmp(jf_trailer->checksum, crc); debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED"); if (unlikely(ret)) { - error("Transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id); + error("DBENGINE: transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id); return size_bytes; } switch (jf_header->type) { case STORE_DATA: debug(D_RRDENGINE, "Replaying transaction %"PRIu64"", jf_header->id); - restore_extent_metadata(ctx, journalfile, buf + sizeof(*jf_header), payload_length); + journalfile_restore_extent_metadata(ctx, journalfile, buf + sizeof(*jf_header), payload_length); break; default: - error("Unknown transaction type. Skipping record."); + error("DBENGINE: unknown transaction type, skipping record."); break; } @@ -463,10 +701,10 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng * Page cache must already be initialized. * Returns the maximum transaction id it discovered. */ -static uint64_t iterate_transactions(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) +static uint64_t journalfile_iterate_transactions(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) { uv_file file; - uint64_t file_size;//, data_file_size; + uint64_t file_size; int ret; uint64_t pos, pos_i, max_id, id; unsigned size_bytes; @@ -475,39 +713,31 @@ static uint64_t iterate_transactions(struct rrdengine_instance *ctx, struct rrde uv_fs_t req; file = journalfile->file; - file_size = journalfile->pos; - //data_file_size = journalfile->datafile->pos; TODO: utilize this? + file_size = journalfile->unsafe.pos; max_id = 1; - bool journal_is_mmapped = (journalfile->data != NULL); - if (unlikely(!journal_is_mmapped)) { - ret = posix_memalign((void *)&buf, RRDFILE_ALIGNMENT, READAHEAD_BYTES); - if (unlikely(ret)) - fatal("posix_memalign:%s", strerror(ret)); - } - else - buf = journalfile->data + sizeof(struct rrdeng_jf_sb); - for (pos = sizeof(struct rrdeng_jf_sb) ; pos < file_size ; pos += READAHEAD_BYTES) { + ret = posix_memalign((void *)&buf, RRDFILE_ALIGNMENT, READAHEAD_BYTES); + if (unlikely(ret)) + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); + + for (pos = sizeof(struct rrdeng_jf_sb); pos < file_size; pos += READAHEAD_BYTES) { size_bytes = MIN(READAHEAD_BYTES, file_size - pos); - if (unlikely(!journal_is_mmapped)) { - iov = uv_buf_init(buf, size_bytes); - ret = uv_fs_read(NULL, &req, file, &iov, 1, pos, NULL); - if (ret < 0) { - error("uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret)); - uv_fs_req_cleanup(&req); - goto skip_file; - } - fatal_assert(req.result >= 0); + iov = uv_buf_init(buf, size_bytes); + ret = uv_fs_read(NULL, &req, file, &iov, 1, pos, NULL); + if (ret < 0) { + error("DBENGINE: uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret)); uv_fs_req_cleanup(&req); - ++ctx->stats.io_read_requests; - ctx->stats.io_read_bytes += size_bytes; + goto skip_file; } + fatal_assert(req.result >= 0); + uv_fs_req_cleanup(&req); + ctx_io_read_op_bytes(ctx, size_bytes); - for (pos_i = 0 ; pos_i < size_bytes ; ) { + for (pos_i = 0; pos_i < size_bytes;) { unsigned max_size; max_size = pos + size_bytes - pos_i; - ret = replay_transaction(ctx, journalfile, buf + pos_i, &id, max_size); + ret = journalfile_replay_transaction(ctx, journalfile, buf + pos_i, &id, max_size); if (!ret) /* TODO: support transactions bigger than 4K */ /* unknown transaction size, move on to the next block */ pos_i = ALIGN_BYTES_FLOOR(pos_i + RRDENG_BLOCK_SIZE); @@ -515,73 +745,722 @@ static uint64_t iterate_transactions(struct rrdengine_instance *ctx, struct rrde pos_i += ret; max_id = MAX(max_id, id); } - if (likely(journal_is_mmapped)) - buf += size_bytes; } skip_file: - if (unlikely(!journal_is_mmapped)) - posix_memfree(buf); + posix_memfree(buf); return max_id; } -int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - struct rrdengine_datafile *datafile) +// Checks that the extent list checksum is valid +static int journalfile_check_v2_extent_list (void *data_start, size_t file_size) +{ + UNUSED(file_size); + uLong crc; + + struct journal_v2_header *j2_header = (void *) data_start; + struct journal_v2_block_trailer *journal_v2_trailer; + + journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + j2_header->extent_trailer_offset); + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) data_start + j2_header->extent_offset, j2_header->extent_count * sizeof(struct journal_extent_list)); + if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) { + error("DBENGINE: extent list CRC32 check: FAILED"); + return 1; + } + + return 0; +} + +// Checks that the metric list (UUIDs) checksum is valid +static int journalfile_check_v2_metric_list(void *data_start, size_t file_size) +{ + UNUSED(file_size); + uLong crc; + + struct journal_v2_header *j2_header = (void *) data_start; + struct journal_v2_block_trailer *journal_v2_trailer; + + journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + j2_header->metric_trailer_offset); + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) data_start + j2_header->metric_offset, j2_header->metric_count * sizeof(struct journal_metric_list)); + if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) { + error("DBENGINE: metric list CRC32 check: FAILED"); + return 1; + } + return 0; +} + +// +// Return +// 0 Ok +// 1 Invalid +// 2 Force rebuild +// 3 skip + +static int journalfile_v2_validate(void *data_start, size_t journal_v2_file_size, size_t journal_v1_file_size) +{ + int rc; + uLong crc; + + struct journal_v2_header *j2_header = (void *) data_start; + struct journal_v2_block_trailer *journal_v2_trailer; + + if (j2_header->magic == JOURVAL_V2_REBUILD_MAGIC) + return 2; + + if (j2_header->magic == JOURVAL_V2_SKIP_MAGIC) + return 3; + + // Magic failure + if (j2_header->magic != JOURVAL_V2_MAGIC) + return 1; + + if (j2_header->journal_v2_file_size != journal_v2_file_size) + return 1; + + if (journal_v1_file_size && j2_header->journal_v1_file_size != journal_v1_file_size) + return 1; + + journal_v2_trailer = (struct journal_v2_block_trailer *) ((uint8_t *) data_start + journal_v2_file_size - sizeof(*journal_v2_trailer)); + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *) j2_header, sizeof(*j2_header)); + + rc = crc32cmp(journal_v2_trailer->checksum, crc); + if (unlikely(rc)) { + error("DBENGINE: file CRC32 check: FAILED"); + return 1; + } + + rc = journalfile_check_v2_extent_list(data_start, journal_v2_file_size); + if (rc) return 1; + + rc = journalfile_check_v2_metric_list(data_start, journal_v2_file_size); + if (rc) return 1; + + if (!db_engine_journal_check) + return 0; + + // Verify complete UUID chain + + struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset); + + unsigned verified = 0; + unsigned entries; + unsigned total_pages = 0; + + info("DBENGINE: checking %u metrics that exist in the journal", j2_header->metric_count); + for (entries = 0; entries < j2_header->metric_count; entries++) { + + char uuid_str[UUID_STR_LEN]; + uuid_unparse_lower(metric->uuid, uuid_str); + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + struct journal_page_header local_metric_list_header = *metric_list_header; + + local_metric_list_header.crc = JOURVAL_V2_MAGIC; + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *) &local_metric_list_header, sizeof(local_metric_list_header)); + rc = crc32cmp(metric_list_header->checksum, crc); + + if (!rc) { + struct journal_v2_block_trailer *journal_trailer = + (void *) data_start + metric->page_offset + sizeof(struct journal_page_header) + (metric_list_header->entries * sizeof(struct journal_page_list)); + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) metric_list_header + sizeof(struct journal_page_header), metric_list_header->entries * sizeof(struct journal_page_list)); + rc = crc32cmp(journal_trailer->checksum, crc); + internal_error(rc, "DBENGINE: index %u : %s entries %u at offset %u verified, DATA CRC computed %lu, stored %u", entries, uuid_str, metric->entries, metric->page_offset, + crc, metric_list_header->crc); + if (!rc) { + total_pages += metric_list_header->entries; + verified++; + } + } + + metric++; + if ((uint32_t)((uint8_t *) metric - (uint8_t *) data_start) > (uint32_t) journal_v2_file_size) { + info("DBENGINE: verification failed EOF reached -- total entries %u, verified %u", entries, verified); + return 1; + } + } + + if (entries != verified) { + info("DBENGINE: verification failed -- total entries %u, verified %u", entries, verified); + return 1; + } + info("DBENGINE: verification succeeded -- total entries %u, verified %u (%u total pages)", entries, verified, total_pages); + + return 0; +} + +void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile) { + usec_t started_ut = now_monotonic_usec(); + + size_t data_size = 0; + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, &data_size, 0, 0); + if(!j2_header) + return; + + uint8_t *data_start = (uint8_t *)j2_header; + uint32_t entries = j2_header->metric_count; + + struct journal_metric_list *metric = (struct journal_metric_list *) (data_start + j2_header->metric_offset); + time_t header_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + time_t now_s = max_acceptable_collected_time(); + for (size_t i=0; i < entries; i++) { + time_t start_time_s = header_start_time_s + metric->delta_start_s; + time_t end_time_s = header_start_time_s + metric->delta_end_s; + time_t update_every_s = (metric->entries > 1) ? ((end_time_s - start_time_s) / (entries - 1)) : 0; + update_metric_retention_and_granularity_by_uuid( + ctx, &metric->uuid, start_time_s, end_time_s, update_every_s, now_s); + +#ifdef NETDATA_INTERNAL_CHECKS + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + fatal_assert(uuid_compare(metric_list_header->uuid, metric->uuid) == 0); + fatal_assert(metric->entries == metric_list_header->entries); +#endif + metric++; + } + + journalfile_v2_data_release(journalfile); + usec_t ended_ut = now_monotonic_usec(); + + info("DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms" + , ctx->config.tier, journalfile->datafile->fileno + , (double)data_size / 1024 / 1024 + , (double)entries / 1000 + , ((double)(ended_ut - started_ut) / USEC_PER_MS) + ); +} + +int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +{ + int ret, fd; + char path_v1[RRDENG_PATH_MAX]; + char path_v2[RRDENG_PATH_MAX]; + struct stat statbuf; + size_t journal_v1_file_size = 0; + size_t journal_v2_file_size; + + journalfile_v1_generate_path(datafile, path_v1, sizeof(path_v1)); + ret = stat(path_v1, &statbuf); + if (!ret) + journal_v1_file_size = (uint32_t)statbuf.st_size; + + journalfile_v2_generate_path(datafile, path_v2, sizeof(path_v2)); + fd = open(path_v2, O_RDONLY); + if (fd < 0) { + if (errno == ENOENT) + return 1; + ctx_fs_error(ctx); + error("DBENGINE: failed to open '%s'", path_v2); + return 1; + } + + ret = fstat(fd, &statbuf); + if (ret) { + error("DBENGINE: failed to get file information for '%s'", path_v2); + close(fd); + return 1; + } + + journal_v2_file_size = (size_t)statbuf.st_size; + + if (journal_v2_file_size < sizeof(struct journal_v2_header)) { + error_report("Invalid file %s. Not the expected size", path_v2); + close(fd); + return 1; + } + + usec_t mmap_start_ut = now_monotonic_usec(); + uint8_t *data_start = mmap(NULL, journal_v2_file_size, PROT_READ, MAP_SHARED, fd, 0); + if (data_start == MAP_FAILED) { + close(fd); + return 1; + } + + info("DBENGINE: checking integrity of '%s'", path_v2); + usec_t validation_start_ut = now_monotonic_usec(); + int rc = journalfile_v2_validate(data_start, journal_v2_file_size, journal_v1_file_size); + if (unlikely(rc)) { + if (rc == 2) + error_report("File %s needs to be rebuilt", path_v2); + else if (rc == 3) + error_report("File %s will be skipped", path_v2); + else + error_report("File %s is invalid and it will be rebuilt", path_v2); + + if (unlikely(munmap(data_start, journal_v2_file_size))) + error("DBENGINE: failed to unmap '%s'", path_v2); + + close(fd); + return rc; + } + + struct journal_v2_header *j2_header = (void *) data_start; + uint32_t entries = j2_header->metric_count; + + if (unlikely(!entries)) { + if (unlikely(munmap(data_start, journal_v2_file_size))) + error("DBENGINE: failed to unmap '%s'", path_v2); + + close(fd); + return 1; + } + + usec_t finished_ut = now_monotonic_usec(); + + info("DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, " + "mmap: %0.2f ms, validate: %0.2f ms" + , path_v2 + , (double)journal_v2_file_size / 1024 / 1024 + , (double)entries / 1000 + , ((double)(validation_start_ut - mmap_start_ut) / USEC_PER_MS) + , ((double)(finished_ut - validation_start_ut) / USEC_PER_MS) + ); + + // Initialize the journal file to be able to access the data + journalfile_v2_data_set(journalfile, fd, data_start, journal_v2_file_size); + + ctx_current_disk_space_increase(ctx, journal_v2_file_size); + + // File is OK load it + return 0; +} + +struct journal_metric_list_to_sort { + struct jv2_metrics_info *metric_info; +}; + +static int journalfile_metric_compare (const void *item1, const void *item2) +{ + const struct jv2_metrics_info *metric1 = ((struct journal_metric_list_to_sort *) item1)->metric_info; + const struct jv2_metrics_info *metric2 = ((struct journal_metric_list_to_sort *) item2)->metric_info; + + return uuid_compare(*(metric1->uuid), *(metric2->uuid)); +} + + +// Write list of extents for the journalfile +void *journalfile_v2_write_extent_list(Pvoid_t JudyL_extents_pos, void *data) +{ + Pvoid_t *PValue; + struct journal_extent_list *j2_extent_base = (void *) data; + struct jv2_extents_info *ext_info; + + bool first = true; + Word_t pos = 0; + size_t count = 0; + while ((PValue = JudyLFirstThenNext(JudyL_extents_pos, &pos, &first))) { + ext_info = *PValue; + size_t index = ext_info->index; + j2_extent_base[index].file_index = 0; + j2_extent_base[index].datafile_offset = ext_info->pos; + j2_extent_base[index].datafile_size = ext_info->bytes; + j2_extent_base[index].pages = ext_info->number_of_pages; + count++; + } + return j2_extent_base + count; +} + +static int journalfile_verify_space(struct journal_v2_header *j2_header, void *data, uint32_t bytes) +{ + if ((unsigned long)(((uint8_t *) data - (uint8_t *) j2_header->data) + bytes) > (j2_header->journal_v2_file_size - sizeof(struct journal_v2_block_trailer))) + return 1; + + return 0; +} + +void *journalfile_v2_write_metric_page(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info, uint32_t pages_offset) +{ + struct journal_metric_list *metric = (void *) data; + + if (journalfile_verify_space(j2_header, data, sizeof(*metric))) + return NULL; + + uuid_copy(metric->uuid, *metric_info->uuid); + metric->entries = metric_info->number_of_pages; + metric->page_offset = pages_offset; + metric->delta_start_s = (uint32_t)(metric_info->first_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC)); + metric->delta_end_s = (uint32_t)(metric_info->last_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC)); + + return ++metric; +} + +void *journalfile_v2_write_data_page_header(struct journal_v2_header *j2_header __maybe_unused, void *data, struct jv2_metrics_info *metric_info, uint32_t uuid_offset) +{ + struct journal_page_header *data_page_header = (void *) data; + uLong crc; + + uuid_copy(data_page_header->uuid, *metric_info->uuid); + data_page_header->entries = metric_info->number_of_pages; + data_page_header->uuid_offset = uuid_offset; // data header OFFSET poings to METRIC in the directory + data_page_header->crc = JOURVAL_V2_MAGIC; + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *) data_page_header, sizeof(*data_page_header)); + crc32set(data_page_header->checksum, crc); + return ++data_page_header; +} + +void *journalfile_v2_write_data_page_trailer(struct journal_v2_header *j2_header __maybe_unused, void *data, void *page_header) +{ + struct journal_page_header *data_page_header = (void *) page_header; + struct journal_v2_block_trailer *journal_trailer = (void *) data; + uLong crc; + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) page_header + sizeof(struct journal_page_header), data_page_header->entries * sizeof(struct journal_page_list)); + crc32set(journal_trailer->checksum, crc); + return ++journal_trailer; +} + +void *journalfile_v2_write_data_page(struct journal_v2_header *j2_header, void *data, struct jv2_page_info *page_info) +{ + struct journal_page_list *data_page = data; + + if (journalfile_verify_space(j2_header, data, sizeof(*data_page))) + return NULL; + + struct extent_io_data *ei = page_info->custom_data; + + data_page->delta_start_s = (uint32_t) (page_info->start_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC); + data_page->delta_end_s = (uint32_t) (page_info->end_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC); + data_page->extent_index = page_info->extent_index; + + data_page->update_every_s = page_info->update_every_s; + data_page->page_length = (uint16_t) (ei ? ei->page_length : page_info->page_length); + data_page->type = 0; + + return ++data_page; +} + +// Must be recorded in metric_info->entries +void *journalfile_v2_write_descriptors(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info) +{ + Pvoid_t *PValue; + + struct journal_page_list *data_page = (void *)data; + // We need to write all descriptors with index metric_info->min_index_time_s, metric_info->max_index_time_s + // that belong to this journal file + Pvoid_t JudyL_array = metric_info->JudyL_pages_by_start_time; + + Word_t index_time = 0; + bool first = true; + struct jv2_page_info *page_info; + while ((PValue = JudyLFirstThenNext(JudyL_array, &index_time, &first))) { + page_info = *PValue; + // Write one descriptor and return the next data page location + data_page = journalfile_v2_write_data_page(j2_header, (void *) data_page, page_info); + if (NULL == data_page) + break; + } + return data_page; +} + +// Migrate the journalfile pointed by datafile +// activate : make the new file active immediately +// journafile data will be set and descriptors (if deleted) will be repopulated as needed +// startup : if the migration is done during agent startup +// this will allow us to optimize certain things + +void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno __maybe_unused, uint8_t type __maybe_unused, + Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, + size_t number_of_extents, size_t number_of_metrics, size_t number_of_pages, void *user_data) +{ + char path[RRDENG_PATH_MAX]; + Pvoid_t *PValue; + struct rrdengine_instance *ctx = (struct rrdengine_instance *) section; + struct rrdengine_journalfile *journalfile = (struct rrdengine_journalfile *) user_data; + struct rrdengine_datafile *datafile = journalfile->datafile; + time_t min_time_s = LONG_MAX; + time_t max_time_s = 0; + struct jv2_metrics_info *metric_info; + + journalfile_v2_generate_path(datafile, path, sizeof(path)); + + info("DBENGINE: indexing file '%s': extents %zu, metrics %zu, pages %zu", + path, + number_of_extents, + number_of_metrics, + number_of_pages); + +#ifdef NETDATA_INTERNAL_CHECKS + usec_t start_loading = now_monotonic_usec(); +#endif + + size_t total_file_size = 0; + total_file_size += (sizeof(struct journal_v2_header) + JOURNAL_V2_HEADER_PADDING_SZ); + + // Extents will start here + uint32_t extent_offset = total_file_size; + total_file_size += (number_of_extents * sizeof(struct journal_extent_list)); + + uint32_t extent_offset_trailer = total_file_size; + total_file_size += sizeof(struct journal_v2_block_trailer); + + // UUID list will start here + uint32_t metrics_offset = total_file_size; + total_file_size += (number_of_metrics * sizeof(struct journal_metric_list)); + + // UUID list trailer + uint32_t metric_offset_trailer = total_file_size; + total_file_size += sizeof(struct journal_v2_block_trailer); + + // descr @ time will start here + uint32_t pages_offset = total_file_size; + total_file_size += (number_of_pages * (sizeof(struct journal_page_list) + sizeof(struct journal_page_header) + sizeof(struct journal_v2_block_trailer))); + + // File trailer + uint32_t trailer_offset = total_file_size; + total_file_size += sizeof(struct journal_v2_block_trailer); + + int fd_v2; + uint8_t *data_start = netdata_mmap(path, total_file_size, MAP_SHARED, 0, false, &fd_v2); + uint8_t *data = data_start; + + memset(data_start, 0, extent_offset); + + // Write header + struct journal_v2_header j2_header; + memset(&j2_header, 0, sizeof(j2_header)); + + j2_header.magic = JOURVAL_V2_MAGIC; + j2_header.start_time_ut = 0; + j2_header.end_time_ut = 0; + j2_header.extent_count = number_of_extents; + j2_header.extent_offset = extent_offset; + j2_header.metric_count = number_of_metrics; + j2_header.metric_offset = metrics_offset; + j2_header.page_count = number_of_pages; + j2_header.page_offset = pages_offset; + j2_header.extent_trailer_offset = extent_offset_trailer; + j2_header.metric_trailer_offset = metric_offset_trailer; + j2_header.journal_v2_file_size = total_file_size; + j2_header.journal_v1_file_size = (uint32_t)journalfile_current_size(journalfile); + j2_header.data = data_start; // Used during migration + + struct journal_v2_block_trailer *journal_v2_trailer; + + data = journalfile_v2_write_extent_list(JudyL_extents_pos, data_start + extent_offset); + internal_error(true, "DBENGINE: write extent list so far %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + fatal_assert(data == data_start + extent_offset_trailer); + + // Calculate CRC for extents + journal_v2_trailer = (struct journal_v2_block_trailer *) (data_start + extent_offset_trailer); + uLong crc; + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (uint8_t *) data_start + extent_offset, number_of_extents * sizeof(struct journal_extent_list)); + crc32set(journal_v2_trailer->checksum, crc); + + internal_error(true, "DBENGINE: CALCULATE CRC FOR EXTENT %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + // Skip the trailer, point to the metrics off + data += sizeof(struct journal_v2_block_trailer); + + // Sanity check -- we must be at the metrics_offset + fatal_assert(data == data_start + metrics_offset); + + // Allocate array to sort UUIDs and keep them sorted in the journal because we want to do binary search when we do lookups + struct journal_metric_list_to_sort *uuid_list = mallocz(number_of_metrics * sizeof(struct journal_metric_list_to_sort)); + + Word_t Index = 0; + size_t count = 0; + bool first_then_next = true; + while ((PValue = JudyLFirstThenNext(JudyL_metrics, &Index, &first_then_next))) { + metric_info = *PValue; + + fatal_assert(count < number_of_metrics); + uuid_list[count++].metric_info = metric_info; + min_time_s = MIN(min_time_s, metric_info->first_time_s); + max_time_s = MAX(max_time_s, metric_info->last_time_s); + } + + // Store in the header + j2_header.start_time_ut = min_time_s * USEC_PER_SEC; + j2_header.end_time_ut = max_time_s * USEC_PER_SEC; + + qsort(&uuid_list[0], number_of_metrics, sizeof(struct journal_metric_list_to_sort), journalfile_metric_compare); + internal_error(true, "DBENGINE: traverse and qsort UUID %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + uint32_t resize_file_to = total_file_size; + + for (Index = 0; Index < number_of_metrics; Index++) { + metric_info = uuid_list[Index].metric_info; + + // Calculate current UUID offset from start of file. We will store this in the data page header + uint32_t uuid_offset = data - data_start; + + // Write the UUID we are processing + data = (void *) journalfile_v2_write_metric_page(&j2_header, data, metric_info, pages_offset); + if (unlikely(!data)) + break; + + // Next we will write + // Header + // Detailed entries (descr @ time) + // Trailer (checksum) + + // Keep the page_list_header, to be used for migration when where agent is running + metric_info->page_list_header = pages_offset; + // Write page header + void *metric_page = journalfile_v2_write_data_page_header(&j2_header, data_start + pages_offset, metric_info, + uuid_offset); + + // Start writing descr @ time + void *page_trailer = journalfile_v2_write_descriptors(&j2_header, metric_page, metric_info); + if (unlikely(!page_trailer)) + break; + + // Trailer (checksum) + uint8_t *next_page_address = journalfile_v2_write_data_page_trailer(&j2_header, page_trailer, + data_start + pages_offset); + + // Calculate start of the pages start for next descriptor + pages_offset += (metric_info->number_of_pages * (sizeof(struct journal_page_list)) + sizeof(struct journal_page_header) + sizeof(struct journal_v2_block_trailer)); + // Verify we are at the right location + if (pages_offset != (uint32_t)(next_page_address - data_start)) { + // make sure checks fail so that we abort + data = data_start; + break; + } + } + + if (data == data_start + metric_offset_trailer) { + internal_error(true, "DBENGINE: WRITE METRICS AND PAGES %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + // Calculate CRC for metrics + journal_v2_trailer = (struct journal_v2_block_trailer *)(data_start + metric_offset_trailer); + crc = crc32(0L, Z_NULL, 0); + crc = + crc32(crc, (uint8_t *)data_start + metrics_offset, number_of_metrics * sizeof(struct journal_metric_list)); + crc32set(journal_v2_trailer->checksum, crc); + internal_error(true, "DBENGINE: CALCULATE CRC FOR UUIDs %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + // Prepare to write checksum for the file + j2_header.data = NULL; + journal_v2_trailer = (struct journal_v2_block_trailer *)(data_start + trailer_offset); + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (void *)&j2_header, sizeof(j2_header)); + crc32set(journal_v2_trailer->checksum, crc); + + // Write header to the file + memcpy(data_start, &j2_header, sizeof(j2_header)); + + internal_error(true, "DBENGINE: FILE COMPLETED --------> %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + + info("DBENGINE: migrated journal file '%s', file size %zu", path, total_file_size); + + // msync(data_start, total_file_size, MS_SYNC); + journalfile_v2_data_set(journalfile, fd_v2, data_start, total_file_size); + + internal_error(true, "DBENGINE: ACTIVATING NEW INDEX JNL %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); + ctx_current_disk_space_increase(ctx, total_file_size); + freez(uuid_list); + return; + } + else { + info("DBENGINE: failed to build index '%s', file will be skipped", path); + j2_header.data = NULL; + j2_header.magic = JOURVAL_V2_SKIP_MAGIC; + memcpy(data_start, &j2_header, sizeof(j2_header)); + resize_file_to = sizeof(j2_header); + } + + netdata_munmap(data_start, total_file_size); + freez(uuid_list); + + if (likely(resize_file_to == total_file_size)) + return; + + int ret = truncate(path, (long) resize_file_to); + if (ret < 0) { + ctx_current_disk_space_increase(ctx, total_file_size); + ctx_fs_error(ctx); + error("DBENGINE: failed to resize file '%s'", path); + } + else + ctx_current_disk_space_increase(ctx, resize_file_to); +} + +int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, + struct rrdengine_datafile *datafile) { uv_fs_t req; uv_file file; int ret, fd, error; uint64_t file_size, max_id; char path[RRDENG_PATH_MAX]; + bool loaded_v2 = false; + + // Do not try to load jv2 of the latest file + if (datafile->fileno != ctx_last_fileno_get(ctx)) + loaded_v2 = journalfile_v2_load(ctx, journalfile, datafile) == 0; - generate_journalfilepath(datafile, path, sizeof(path)); - fd = open_file_direct_io(path, O_RDWR, &file); + journalfile_v1_generate_path(datafile, path, sizeof(path)); + + fd = open_file_for_io(path, O_RDWR, &file, use_direct_io); if (fd < 0) { - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + ctx_fs_error(ctx); + + if(loaded_v2) + return 0; + return fd; } - info("Loading journal file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); - if (ret) - goto error; - file_size = ALIGN_BYTES_FLOOR(file_size); + if (ret) { + error = ret; + goto cleanup; + } - ret = check_journal_file_superblock(file); - if (ret) - goto error; - ctx->stats.io_read_bytes += sizeof(struct rrdeng_jf_sb); - ++ctx->stats.io_read_requests; + if(loaded_v2) { + journalfile->unsafe.pos = file_size; + error = 0; + goto cleanup; + } + file_size = ALIGN_BYTES_FLOOR(file_size); + journalfile->unsafe.pos = file_size; journalfile->file = file; - journalfile->pos = file_size; - journalfile->data = netdata_mmap(path, file_size, MAP_SHARED, 0); - info("Loading journal file \"%s\" using %s.", path, journalfile->data?"MMAP":"uv_fs_read"); - max_id = iterate_transactions(ctx, journalfile); + ret = journalfile_check_superblock(file); + if (ret) { + info("DBENGINE: invalid journal file '%s' ; superblock check failed.", path); + error = ret; + goto cleanup; + } + ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_jf_sb)); + + info("DBENGINE: loading journal file '%s'", path); - ctx->commit_log.transaction_id = MAX(ctx->commit_log.transaction_id, max_id + 1); + max_id = journalfile_iterate_transactions(ctx, journalfile); + + __atomic_store_n(&ctx->atomic.transaction_id, MAX(__atomic_load_n(&ctx->atomic.transaction_id, __ATOMIC_RELAXED), max_id + 1), __ATOMIC_RELAXED); + + info("DBENGINE: journal file '%s' loaded (size:%"PRIu64").", path, file_size); + + bool is_last_file = (ctx_last_fileno_get(ctx) == journalfile->datafile->fileno); + if (is_last_file && journalfile->datafile->pos <= rrdeng_target_data_file_size(ctx) / 3) { + ctx->loading.create_new_datafile_pair = false; + return 0; + } + + pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, + journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); + + if (is_last_file) + ctx->loading.create_new_datafile_pair = true; - info("Journal file \"%s\" loaded (size:%"PRIu64").", path, file_size); - if (likely(journalfile->data)) - netdata_munmap(journalfile->data, file_size); return 0; - error: - error = ret; +cleanup: ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("uv_fs_close(%s): %s", path, uv_strerror(ret)); - ++ctx->stats.fs_errors; - rrd_stat_atomic_add(&global_fs_errors, 1); + error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); return error; } - -void init_commit_log(struct rrdengine_instance *ctx) -{ - ctx->commit_log.buf = NULL; - ctx->commit_log.buf_pos = 0; - ctx->commit_log.transaction_id = 1; -} diff --git a/database/engine/journalfile.h b/database/engine/journalfile.h index 011c5065f..5fbcc90fa 100644 --- a/database/engine/journalfile.h +++ b/database/engine/journalfile.h @@ -13,37 +13,147 @@ struct rrdengine_journalfile; #define WALFILE_PREFIX "journalfile-" #define WALFILE_EXTENSION ".njf" +#define WALFILE_EXTENSION_V2 ".njfv2" +#define is_descr_journal_v2(descr) ((descr)->extent_entry != NULL) + +typedef enum __attribute__ ((__packed__)) { + JOURNALFILE_FLAG_IS_AVAILABLE = (1 << 0), + JOURNALFILE_FLAG_IS_MOUNTED = (1 << 1), + JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION = (1 << 2), +} JOURNALFILE_FLAGS; /* only one event loop is supported for now */ struct rrdengine_journalfile { + struct { + SPINLOCK spinlock; + void *data; // MMAPed file of journal v2 + uint32_t size; // Total file size mapped + int fd; + } mmap; + + struct { + SPINLOCK spinlock; + JOURNALFILE_FLAGS flags; + int32_t refcount; + time_t first_time_s; + time_t last_time_s; + time_t not_needed_since_s; + } v2; + + struct { + SPINLOCK spinlock; + uint64_t pos; + } unsafe; + uv_file file; - uint64_t pos; - void *data; struct rrdengine_datafile *datafile; }; -/* only one event loop is supported for now */ -struct transaction_commit_log { - uint64_t transaction_id; +static inline uint64_t journalfile_current_size(struct rrdengine_journalfile *journalfile) { + netdata_spinlock_lock(&journalfile->unsafe.spinlock); + uint64_t size = journalfile->unsafe.pos; + netdata_spinlock_unlock(&journalfile->unsafe.spinlock); + return size; +} + +// Journal v2 structures + +#define JOURVAL_V2_MAGIC (0x01221019) +#define JOURVAL_V2_REBUILD_MAGIC (0x00221019) +#define JOURVAL_V2_SKIP_MAGIC (0x02221019) + +struct journal_v2_block_trailer { + union { + uint8_t checksum[CHECKSUM_SZ]; /* CRC32 */ + uint32_t crc; + }; +}; + +// Journal V2 +// 28 bytes +struct journal_page_header { + union { + uint8_t checksum[CHECKSUM_SZ]; // CRC check + uint32_t crc; + }; + uint32_t uuid_offset; // Points back to the UUID list which should point here (UUIDs should much) + uint32_t entries; // Entries + uuid_t uuid; // Which UUID this is +}; - /* outstanding transaction buffer */ - void *buf; - unsigned buf_pos; - unsigned buf_size; +// 20 bytes +struct journal_page_list { + uint32_t delta_start_s; // relative to the start time of journal + uint32_t delta_end_s; // relative to delta_start + uint32_t extent_index; // Index to the extent (extent list) (bytes from BASE) + uint32_t update_every_s; + uint16_t page_length; + uint8_t type; }; -void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); -void journalfile_init(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -void *wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned size); -void wal_flush_transaction_buffer(struct rrdengine_worker_config* wc); -int close_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -int unlink_journal_file(struct rrdengine_journalfile *journalfile); -int destroy_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); -int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, - struct rrdengine_datafile *datafile); -void init_commit_log(struct rrdengine_instance *ctx); +// UUID_LIST +// 32 bytes +struct journal_metric_list { + uuid_t uuid; + uint32_t entries; // Number of entries + uint32_t page_offset; // OFFSET that contains entries * struct( journal_page_list ) + uint32_t delta_start_s; // Min time of metric + uint32_t delta_end_s; // Max time of metric (to be used to populate page_index) +}; + +// 16 bytes +struct journal_extent_list { + uint64_t datafile_offset; // Datafile offset to find the extent + uint32_t datafile_size; // Size of the extent + uint16_t file_index; // which file index is this datafile[index] + uint8_t pages; // number of pages (not all are necesssarily valid) +}; + +// 72 bytes +struct journal_v2_header { + uint32_t magic; + usec_t start_time_ut; // Min start time of journal + usec_t end_time_ut; // Maximum end time of journal + uint32_t extent_count; // Count of extents + uint32_t extent_offset; + uint32_t metric_count; // Count of metrics (unique UUIDS) + uint32_t metric_offset; + uint32_t page_count; // Total count of pages (descriptors @ time) + uint32_t page_offset; + uint32_t extent_trailer_offset; // CRC for entent list + uint32_t metric_trailer_offset; // CRC for metric list + uint32_t journal_v1_file_size; // This is the original journal file + uint32_t journal_v2_file_size; // This is the total file size + void *data; // Used when building the index +}; + +#define JOURNAL_V2_HEADER_PADDING_SZ (RRDENG_BLOCK_SIZE - (sizeof(struct journal_v2_header))) + +struct wal; + +void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen); +void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen); +struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafile *datafile); +void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, struct wal *wal, uv_loop_t *loop); +int journalfile_close(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); +int journalfile_unlink(struct rrdengine_journalfile *journalfile); +int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); +int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); +int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, + struct rrdengine_datafile *datafile); +void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile); + +void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno __maybe_unused, uint8_t type __maybe_unused, + Pvoid_t JudyL_metrics, Pvoid_t JudyL_extents_pos, + size_t number_of_extents, size_t number_of_metrics, size_t number_of_pages, void *user_data); + +bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile); +size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile); +void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size); +struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s); +void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile); +void journalfile_v2_data_unmount_cleanup(time_t now_s); #endif /* NETDATA_JOURNALFILE_H */ \ No newline at end of file diff --git a/database/engine/journalfile.ksy b/database/engine/journalfile.ksy new file mode 100644 index 000000000..858db83d4 --- /dev/null +++ b/database/engine/journalfile.ksy @@ -0,0 +1,144 @@ +meta: + id: netdata_journalfile_v2 + endian: le + +seq: + - id: journal_v2_header + type: journal_v2_header + size: 4096 + - id: extent_list + type: journal_v2_extent_list + repeat: expr + repeat-expr: journal_v2_header.extent_count + - id: extent_trailer + type: journal_v2_block_trailer + - id: metric_list + type: journal_v2_metric_list + repeat: expr + repeat-expr: journal_v2_header.metric_count + - id: metric_trailer + type: journal_v2_block_trailer + - id: page_blocs + type: jounral_v2_page_blocs + size: _root._io.size - _root._io.pos - 4 + - id: journal_file_trailer + type: journal_v2_block_trailer + + +types: + journal_v2_metric_list: + seq: + - id: uuid + size: 16 + - id: entries + type: u4 + - id: page_offset + type: u4 + - id: delta_start_s + type: u4 + - id: delta_end_s + type: u4 + instances: + page_block: + type: journal_v2_page_block + io: _root._io + pos: page_offset + journal_v2_page_hdr: + seq: + - id: crc + type: u4 + - id: uuid_offset + type: u4 + - id: entries + type: u4 + - id: uuid + size: 16 + journal_v2_page_list: + seq: + - id: delta_start_s + type: u4 + - id: delta_end_s + type: u4 + - id: extent_idx + type: u4 + - id: update_every_s + type: u4 + - id: page_len + type: u2 + - id: type + type: u1 + - id: reserved + type: u1 + instances: + extent: + io: _root._io + type: journal_v2_extent_list + pos: _root.journal_v2_header.extent_offset + (extent_idx * 16) + journal_v2_header: + seq: + - id: magic + contents: [ 0x19, 0x10, 0x22, 0x01 ] #0x01221019 + - id: reserved + type: u4 + - id: start_time_ut + type: u8 + - id: end_time_ut + type: u8 + - id: extent_count + type: u4 + - id: extent_offset + type: u4 + - id: metric_count + type: u4 + - id: metric_offset + type: u4 + - id: page_count + type: u4 + - id: page_offset + type: u4 + - id: extent_trailer_offset + type: u4 + - id: metric_trailer_offset + type: u4 + - id: original_file_size + type: u4 + - id: total_file_size + type: u4 + - id: data + type: u8 + instances: + trailer: + io: _root._io + type: journal_v2_block_trailer + pos: _root._io.size - 4 + journal_v2_block_trailer: + seq: + - id: checksum + type: u4 + journal_v2_extent_list: + seq: + - id: datafile_offset + type: u8 + - id: datafile_size + type: u4 + - id: file_idx + type: u2 + - id: page_cnt + type: u1 + - id: padding + type: u1 + journal_v2_page_block: + seq: + - id: hdr + type: journal_v2_page_hdr + - id: page_list + type: journal_v2_page_list + repeat: expr + repeat-expr: hdr.entries + - id: block_trailer + type: journal_v2_block_trailer + jounral_v2_page_blocs: + seq: + - id: blocs + type: journal_v2_page_block + repeat: eos diff --git a/database/engine/metric.c b/database/engine/metric.c new file mode 100644 index 000000000..9dc9d9ebc --- /dev/null +++ b/database/engine/metric.c @@ -0,0 +1,875 @@ +#include "metric.h" + +typedef int32_t REFCOUNT; +#define REFCOUNT_DELETING (-100) + +typedef enum __attribute__ ((__packed__)) { + METRIC_FLAG_HAS_RETENTION = (1 << 0), +} METRIC_FLAGS; + +struct metric { + uuid_t uuid; // never changes + Word_t section; // never changes + + time_t first_time_s; // + time_t latest_time_s_clean; // archived pages latest time + time_t latest_time_s_hot; // latest time of the currently collected page + uint32_t latest_update_every_s; // + pid_t writer; + METRIC_FLAGS flags; + REFCOUNT refcount; + SPINLOCK spinlock; // protects all variable members + + // THIS IS allocated with malloc() + // YOU HAVE TO INITIALIZE IT YOURSELF ! +}; + +static struct aral_statistics mrg_aral_statistics; + +struct mrg { + ARAL *aral[MRG_PARTITIONS]; + + struct pgc_index { + netdata_rwlock_t rwlock; + Pvoid_t uuid_judy; // each UUID has a JudyL of sections (tiers) + } index[MRG_PARTITIONS]; + + struct mrg_statistics stats; + + size_t entries_per_partition[MRG_PARTITIONS]; +}; + +static inline void MRG_STATS_DUPLICATE_ADD(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.additions_duplicate, 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_ADDED_METRIC(MRG *mrg, size_t partition) { + __atomic_add_fetch(&mrg->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->stats.additions, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->stats.size, sizeof(METRIC), __ATOMIC_RELAXED); + + __atomic_add_fetch(&mrg->entries_per_partition[partition], 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_DELETED_METRIC(MRG *mrg, size_t partition) { + __atomic_sub_fetch(&mrg->stats.entries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->stats.size, sizeof(METRIC), __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->stats.deletions, 1, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&mrg->entries_per_partition[partition], 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_SEARCH_HIT(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.search_hits, 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_SEARCH_MISS(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.search_misses, 1, __ATOMIC_RELAXED); +} + +static inline void MRG_STATS_DELETE_MISS(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.delete_misses, 1, __ATOMIC_RELAXED); +} + +static inline void mrg_index_read_lock(MRG *mrg, size_t partition) { + netdata_rwlock_rdlock(&mrg->index[partition].rwlock); +} +static inline void mrg_index_read_unlock(MRG *mrg, size_t partition) { + netdata_rwlock_unlock(&mrg->index[partition].rwlock); +} +static inline void mrg_index_write_lock(MRG *mrg, size_t partition) { + netdata_rwlock_wrlock(&mrg->index[partition].rwlock); +} +static inline void mrg_index_write_unlock(MRG *mrg, size_t partition) { + netdata_rwlock_unlock(&mrg->index[partition].rwlock); +} + +static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl) { + if(mem_after_judyl > mem_before_judyl) + __atomic_add_fetch(&mrg->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + else if(mem_after_judyl < mem_before_judyl) + __atomic_sub_fetch(&mrg->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); +} + +static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg) { + __atomic_add_fetch(&mrg->stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); +} + +static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg) { + __atomic_sub_fetch(&mrg->stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); +} + +static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) { + uint8_t *u = (uint8_t *)uuid; + return u[UUID_SZ - 1] % MRG_PARTITIONS; +} + +static inline bool metric_has_retention_unsafe(MRG *mrg __maybe_unused, METRIC *metric) { + bool has_retention = (metric->first_time_s || metric->latest_time_s_clean || metric->latest_time_s_hot); + + if(has_retention && !(metric->flags & METRIC_FLAG_HAS_RETENTION)) { + metric->flags |= METRIC_FLAG_HAS_RETENTION; + __atomic_add_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED); + } + else if(!has_retention && (metric->flags & METRIC_FLAG_HAS_RETENTION)) { + metric->flags &= ~METRIC_FLAG_HAS_RETENTION; + __atomic_sub_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED); + } + + return has_retention; +} + +static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, bool having_spinlock) { + REFCOUNT refcount; + + if(!having_spinlock) + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(metric->refcount < 0)) + fatal("METRIC: refcount is %d (negative) during acquire", metric->refcount); + + refcount = ++metric->refcount; + + // update its retention flags + metric_has_retention_unsafe(mrg, metric); + + if(!having_spinlock) + netdata_spinlock_unlock(&metric->spinlock); + + if(refcount == 1) + __atomic_add_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED); + + __atomic_add_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED); + + return refcount; +} + +static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, METRIC *metric) { + bool ret = true; + REFCOUNT refcount; + + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(metric->refcount <= 0)) + fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount); + + refcount = --metric->refcount; + + if(likely(metric_has_retention_unsafe(mrg, metric) || refcount != 0)) + ret = false; + + netdata_spinlock_unlock(&metric->spinlock); + + if(unlikely(!refcount)) + __atomic_sub_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED); + + __atomic_sub_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED); + + return ret; +} + +static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { + size_t partition = uuid_partition(mrg, &entry->uuid); + + METRIC *allocation = aral_mallocz(mrg->aral[partition]); + + mrg_index_write_lock(mrg, partition); + + size_t mem_before_judyl, mem_after_judyl; + + Pvoid_t *sections_judy_pptr = JudyHSIns(&mrg->index[partition].uuid_judy, &entry->uuid, sizeof(uuid_t), PJE0); + if(unlikely(!sections_judy_pptr || sections_judy_pptr == PJERR)) + fatal("DBENGINE METRIC: corrupted UUIDs JudyHS array"); + + if(unlikely(!*sections_judy_pptr)) + mrg_stats_size_judyhs_added_uuid(mrg); + + mem_before_judyl = JudyLMemUsed(*sections_judy_pptr); + Pvoid_t *PValue = JudyLIns(sections_judy_pptr, entry->section, PJE0); + mem_after_judyl = JudyLMemUsed(*sections_judy_pptr); + mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl); + + if(unlikely(!PValue || PValue == PJERR)) + fatal("DBENGINE METRIC: corrupted section JudyL array"); + + if(unlikely(*PValue != NULL)) { + METRIC *metric = *PValue; + + metric_acquire(mrg, metric, false); + mrg_index_write_unlock(mrg, partition); + + if(ret) + *ret = false; + + aral_freez(mrg->aral[partition], allocation); + + MRG_STATS_DUPLICATE_ADD(mrg); + return metric; + } + + METRIC *metric = allocation; + uuid_copy(metric->uuid, entry->uuid); + metric->section = entry->section; + metric->first_time_s = entry->first_time_s; + metric->latest_time_s_clean = entry->last_time_s; + metric->latest_time_s_hot = 0; + metric->latest_update_every_s = entry->latest_update_every_s; + metric->writer = 0; + metric->refcount = 0; + metric->flags = 0; + netdata_spinlock_init(&metric->spinlock); + metric_acquire(mrg, metric, true); // no spinlock use required here + *PValue = metric; + + mrg_index_write_unlock(mrg, partition); + + if(ret) + *ret = true; + + MRG_STATS_ADDED_METRIC(mrg, partition); + + return metric; +} + +static METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { + size_t partition = uuid_partition(mrg, uuid); + + mrg_index_read_lock(mrg, partition); + + Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, uuid, sizeof(uuid_t)); + if(unlikely(!sections_judy_pptr)) { + mrg_index_read_unlock(mrg, partition); + MRG_STATS_SEARCH_MISS(mrg); + return NULL; + } + + Pvoid_t *PValue = JudyLGet(*sections_judy_pptr, section, PJE0); + if(unlikely(!PValue)) { + mrg_index_read_unlock(mrg, partition); + MRG_STATS_SEARCH_MISS(mrg); + return NULL; + } + + METRIC *metric = *PValue; + + metric_acquire(mrg, metric, false); + + mrg_index_read_unlock(mrg, partition); + + MRG_STATS_SEARCH_HIT(mrg); + return metric; +} + +static bool acquired_metric_del(MRG *mrg, METRIC *metric) { + size_t partition = uuid_partition(mrg, &metric->uuid); + + size_t mem_before_judyl, mem_after_judyl; + + mrg_index_write_lock(mrg, partition); + + if(!metric_release_and_can_be_deleted(mrg, metric)) { + mrg_index_write_unlock(mrg, partition); + __atomic_add_fetch(&mrg->stats.delete_having_retention_or_referenced, 1, __ATOMIC_RELAXED); + return false; + } + + Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t)); + if(unlikely(!sections_judy_pptr || !*sections_judy_pptr)) { + mrg_index_write_unlock(mrg, partition); + MRG_STATS_DELETE_MISS(mrg); + return false; + } + + mem_before_judyl = JudyLMemUsed(*sections_judy_pptr); + int rc = JudyLDel(sections_judy_pptr, metric->section, PJE0); + mem_after_judyl = JudyLMemUsed(*sections_judy_pptr); + mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl); + + if(unlikely(!rc)) { + mrg_index_write_unlock(mrg, partition); + MRG_STATS_DELETE_MISS(mrg); + return false; + } + + if(!*sections_judy_pptr) { + rc = JudyHSDel(&mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t), PJE0); + if(unlikely(!rc)) + fatal("DBENGINE METRIC: cannot delete UUID from JudyHS"); + mrg_stats_size_judyhs_removed_uuid(mrg); + } + + mrg_index_write_unlock(mrg, partition); + + aral_freez(mrg->aral[partition], metric); + + MRG_STATS_DELETED_METRIC(mrg, partition); + + return true; +} + +// ---------------------------------------------------------------------------- +// public API + +MRG *mrg_create(void) { + MRG *mrg = callocz(1, sizeof(MRG)); + + for(size_t i = 0; i < MRG_PARTITIONS ; i++) { + netdata_rwlock_init(&mrg->index[i].rwlock); + + char buf[ARAL_MAX_NAME + 1]; + snprintfz(buf, ARAL_MAX_NAME, "mrg[%zu]", i); + + mrg->aral[i] = aral_create(buf, + sizeof(METRIC), + 0, + 16384, + &mrg_aral_statistics, + NULL, NULL, false, + false); + } + + mrg->stats.size = sizeof(MRG); + + return mrg; +} + +size_t mrg_aral_structures(void) { + return aral_structures_from_stats(&mrg_aral_statistics); +} + +size_t mrg_aral_overhead(void) { + return aral_overhead_from_stats(&mrg_aral_statistics); +} + +void mrg_destroy(MRG *mrg __maybe_unused) { + // no destruction possible + // we can't traverse the metrics list + + // to delete entries, the caller needs to keep pointers to them + // and delete them one by one + + ; +} + +METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) { +// internal_fatal(entry.latest_time_s > max_acceptable_collected_time(), +// "DBENGINE METRIC: metric latest time is in the future"); + + return metric_add_and_acquire(mrg, &entry, ret); +} + +METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { + return metric_get_and_acquire(mrg, uuid, section); +} + +bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) { + return acquired_metric_del(mrg, metric); +} + +METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) { + metric_acquire(mrg, metric, false); + return metric; +} + +bool mrg_metric_release(MRG *mrg, METRIC *metric) { + return metric_release_and_can_be_deleted(mrg, metric); +} + +Word_t mrg_metric_id(MRG *mrg __maybe_unused, METRIC *metric) { + return (Word_t)metric; +} + +uuid_t *mrg_metric_uuid(MRG *mrg __maybe_unused, METRIC *metric) { + return &metric->uuid; +} + +Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) { + return metric->section; +} + +bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { + netdata_spinlock_lock(&metric->spinlock); + metric->first_time_s = first_time_s; + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + + return true; +} + +void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s) { + + internal_fatal(first_time_s > max_acceptable_collected_time(), + "DBENGINE METRIC: metric first time is in the future"); + internal_fatal(last_time_s > max_acceptable_collected_time(), + "DBENGINE METRIC: metric last time is in the future"); + + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(first_time_s && (!metric->first_time_s || first_time_s < metric->first_time_s))) + metric->first_time_s = first_time_s; + + if(likely(last_time_s && (!metric->latest_time_s_clean || last_time_s > metric->latest_time_s_clean))) { + metric->latest_time_s_clean = last_time_s; + + if(likely(update_every_s)) + metric->latest_update_every_s = update_every_s; + } + else if(unlikely(!metric->latest_update_every_s && update_every_s)) + metric->latest_update_every_s = update_every_s; + + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); +} + +bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { + bool ret = false; + + netdata_spinlock_lock(&metric->spinlock); + if(first_time_s > metric->first_time_s) { + metric->first_time_s = first_time_s; + ret = true; + } + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + + return ret; +} + +time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) { + time_t first_time_s; + + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(!metric->first_time_s)) { + if(metric->latest_time_s_clean) + metric->first_time_s = metric->latest_time_s_clean; + + else if(metric->latest_time_s_hot) + metric->first_time_s = metric->latest_time_s_hot; + } + + first_time_s = metric->first_time_s; + + netdata_spinlock_unlock(&metric->spinlock); + + return first_time_s; +} + +void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) { + netdata_spinlock_lock(&metric->spinlock); + + if(unlikely(!metric->first_time_s)) { + if(metric->latest_time_s_clean) + metric->first_time_s = metric->latest_time_s_clean; + + else if(metric->latest_time_s_hot) + metric->first_time_s = metric->latest_time_s_hot; + } + + *first_time_s = metric->first_time_s; + *last_time_s = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); + *update_every_s = metric->latest_update_every_s; + + netdata_spinlock_unlock(&metric->spinlock); +} + +bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { + netdata_spinlock_lock(&metric->spinlock); + +// internal_fatal(latest_time_s > max_acceptable_collected_time(), +// "DBENGINE METRIC: metric latest time is in the future"); + +// internal_fatal(metric->latest_time_s_clean > latest_time_s, +// "DBENGINE METRIC: metric new clean latest time is older than the previous one"); + + metric->latest_time_s_clean = latest_time_s; + + if(unlikely(!metric->first_time_s)) + metric->first_time_s = latest_time_s; + +// if(unlikely(metric->first_time_s > latest_time_s)) +// metric->first_time_s = latest_time_s; + + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + return true; +} + +// returns true when metric still has retention +bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { + Word_t section = mrg_metric_section(mrg, metric); + bool do_again = false; + size_t countdown = 5; + bool ret = true; + + do { + time_t min_first_time_s = LONG_MAX; + time_t max_end_time_s = 0; + PGC_PAGE *page; + PGC_SEARCH method = PGC_SEARCH_FIRST; + time_t page_first_time_s = 0; + time_t page_end_time_s = 0; + while ((page = pgc_page_get_and_acquire(main_cache, section, (Word_t)metric, page_first_time_s, method))) { + method = PGC_SEARCH_NEXT; + + bool is_hot = pgc_is_page_hot(page); + bool is_dirty = pgc_is_page_dirty(page); + page_first_time_s = pgc_page_start_time_s(page); + page_end_time_s = pgc_page_end_time_s(page); + + if ((is_hot || is_dirty) && page_first_time_s < min_first_time_s) + min_first_time_s = page_first_time_s; + + if (is_dirty && page_end_time_s > max_end_time_s) + max_end_time_s = page_end_time_s; + + pgc_page_release(main_cache, page); + } + + if (min_first_time_s == LONG_MAX) + min_first_time_s = 0; + + netdata_spinlock_lock(&metric->spinlock); + if (--countdown && !min_first_time_s && metric->latest_time_s_hot) + do_again = true; + else { + internal_error(!countdown, "METRIC: giving up on updating the retention of metric without disk retention"); + + do_again = false; + metric->first_time_s = min_first_time_s; + metric->latest_time_s_clean = max_end_time_s; + + ret = metric_has_retention_unsafe(mrg, metric); + } + netdata_spinlock_unlock(&metric->spinlock); + } while(do_again); + + return ret; +} + +bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { +// internal_fatal(latest_time_s > max_acceptable_collected_time(), +// "DBENGINE METRIC: metric latest time is in the future"); + + netdata_spinlock_lock(&metric->spinlock); + metric->latest_time_s_hot = latest_time_s; + + if(unlikely(!metric->first_time_s)) + metric->first_time_s = latest_time_s; + +// if(unlikely(metric->first_time_s > latest_time_s)) +// metric->first_time_s = latest_time_s; + + metric_has_retention_unsafe(mrg, metric); + netdata_spinlock_unlock(&metric->spinlock); + return true; +} + +time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) { + time_t max; + netdata_spinlock_lock(&metric->spinlock); + max = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); + netdata_spinlock_unlock(&metric->spinlock); + return max; +} + +bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { + if(!update_every_s) + return false; + + netdata_spinlock_lock(&metric->spinlock); + metric->latest_update_every_s = update_every_s; + netdata_spinlock_unlock(&metric->spinlock); + + return true; +} + +bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { + if(!update_every_s) + return false; + + netdata_spinlock_lock(&metric->spinlock); + if(!metric->latest_update_every_s) + metric->latest_update_every_s = update_every_s; + netdata_spinlock_unlock(&metric->spinlock); + + return true; +} + +time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, METRIC *metric) { + time_t update_every_s; + + netdata_spinlock_lock(&metric->spinlock); + update_every_s = metric->latest_update_every_s; + netdata_spinlock_unlock(&metric->spinlock); + + return update_every_s; +} + +bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) { + bool done = false; + netdata_spinlock_lock(&metric->spinlock); + if(!metric->writer) { + metric->writer = gettid(); + __atomic_add_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED); + done = true; + } + else + __atomic_add_fetch(&mrg->stats.writers_conflicts, 1, __ATOMIC_RELAXED); + netdata_spinlock_unlock(&metric->spinlock); + return done; +} + +bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) { + bool done = false; + netdata_spinlock_lock(&metric->spinlock); + if(metric->writer) { + metric->writer = 0; + __atomic_sub_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED); + done = true; + } + netdata_spinlock_unlock(&metric->spinlock); + return done; +} + +struct mrg_statistics mrg_get_statistics(MRG *mrg) { + // FIXME - use atomics + return mrg->stats; +} + +// ---------------------------------------------------------------------------- +// unit test + +#ifdef MRG_STRESS_TEST + +static void mrg_stress(MRG *mrg, size_t entries, size_t sections) { + bool ret; + + info("DBENGINE METRIC: stress testing %zu entries on %zu sections...", entries, sections); + + METRIC *array[entries][sections]; + for(size_t i = 0; i < entries ; i++) { + MRG_ENTRY e = { + .first_time_s = (time_t)(i + 1), + .latest_time_s = (time_t)(i + 2), + .latest_update_every_s = (time_t)(i + 3), + }; + uuid_generate_random(e.uuid); + + for(size_t section = 0; section < sections ;section++) { + e.section = section; + array[i][section] = mrg_metric_add_and_acquire(mrg, e, &ret); + if(!ret) + fatal("DBENGINE METRIC: failed to add metric %zu, section %zu", i, section); + + if(mrg_metric_add_and_acquire(mrg, e, &ret) != array[i][section]) + fatal("DBENGINE METRIC: adding the same metric twice, returns a different metric"); + + if(ret) + fatal("DBENGINE METRIC: adding the same metric twice, returns success"); + + if(mrg_metric_get_and_acquire(mrg, &e.uuid, e.section) != array[i][section]) + fatal("DBENGINE METRIC: cannot get back the same metric"); + + if(uuid_compare(*mrg_metric_uuid(mrg, array[i][section]), e.uuid) != 0) + fatal("DBENGINE METRIC: uuids do not match"); + } + } + + for(size_t i = 0; i < entries ; i++) { + for (size_t section = 0; section < sections; section++) { + uuid_t uuid; + uuid_generate_random(uuid); + + if(mrg_metric_get_and_acquire(mrg, &uuid, section)) + fatal("DBENGINE METRIC: found non-existing uuid"); + + if(mrg_metric_id(mrg, array[i][section]) != (Word_t)array[i][section]) + fatal("DBENGINE METRIC: metric id does not match"); + + if(mrg_metric_get_first_time_s(mrg, array[i][section]) != (time_t)(i + 1)) + fatal("DBENGINE METRIC: wrong first time returned"); + if(mrg_metric_get_latest_time_s(mrg, array[i][section]) != (time_t)(i + 2)) + fatal("DBENGINE METRIC: wrong latest time returned"); + if(mrg_metric_get_update_every_s(mrg, array[i][section]) != (time_t)(i + 3)) + fatal("DBENGINE METRIC: wrong latest time returned"); + + if(!mrg_metric_set_first_time_s(mrg, array[i][section], (time_t)((i + 1) * 2))) + fatal("DBENGINE METRIC: cannot set first time"); + if(!mrg_metric_set_clean_latest_time_s(mrg, array[i][section], (time_t) ((i + 1) * 3))) + fatal("DBENGINE METRIC: cannot set latest time"); + if(!mrg_metric_set_update_every(mrg, array[i][section], (time_t)((i + 1) * 4))) + fatal("DBENGINE METRIC: cannot set update every"); + + if(mrg_metric_get_first_time_s(mrg, array[i][section]) != (time_t)((i + 1) * 2)) + fatal("DBENGINE METRIC: wrong first time returned"); + if(mrg_metric_get_latest_time_s(mrg, array[i][section]) != (time_t)((i + 1) * 3)) + fatal("DBENGINE METRIC: wrong latest time returned"); + if(mrg_metric_get_update_every_s(mrg, array[i][section]) != (time_t)((i + 1) * 4)) + fatal("DBENGINE METRIC: wrong latest time returned"); + } + } + + for(size_t i = 0; i < entries ; i++) { + for (size_t section = 0; section < sections; section++) { + if(!mrg_metric_release_and_delete(mrg, array[i][section])) + fatal("DBENGINE METRIC: failed to delete metric"); + } + } +} + +static void *mrg_stress_test_thread1(void *ptr) { + MRG *mrg = ptr; + + for(int i = 0; i < 5 ; i++) + mrg_stress(mrg, 10000, 5); + + return ptr; +} + +static void *mrg_stress_test_thread2(void *ptr) { + MRG *mrg = ptr; + + for(int i = 0; i < 10 ; i++) + mrg_stress(mrg, 500, 50); + + return ptr; +} + +static void *mrg_stress_test_thread3(void *ptr) { + MRG *mrg = ptr; + + for(int i = 0; i < 50 ; i++) + mrg_stress(mrg, 5000, 1); + + return ptr; +} +#endif + +int mrg_unittest(void) { + MRG *mrg = mrg_create(); + METRIC *m1_t0, *m2_t0, *m3_t0, *m4_t0; + METRIC *m1_t1, *m2_t1, *m3_t1, *m4_t1; + bool ret; + + MRG_ENTRY entry = { + .section = 0, + .first_time_s = 2, + .last_time_s = 3, + .latest_update_every_s = 4, + }; + uuid_generate(entry.uuid); + m1_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(!ret) + fatal("DBENGINE METRIC: failed to add metric"); + + // add the same metric again + m2_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(m2_t0 != m1_t0) + fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer"); + if(ret) + fatal("DBENGINE METRIC: managed to add the same metric twice"); + + m3_t0 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section); + if(m3_t0 != m1_t0) + fatal("DBENGINE METRIC: cannot find the metric added"); + + // add the same metric again + m4_t0 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(m4_t0 != m1_t0) + fatal("DBENGINE METRIC: adding the same metric twice, does not return the same pointer"); + if(ret) + fatal("DBENGINE METRIC: managed to add the same metric twice"); + + // add the same metric in another section + entry.section = 1; + m1_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(!ret) + fatal("DBENGINE METRIC: failed to add metric in section %zu", (size_t)entry.section); + + // add the same metric again + m2_t1 = mrg_metric_add_and_acquire(mrg, entry, &ret); + if(m2_t1 != m1_t1) + fatal("DBENGINE METRIC: adding the same metric twice (section %zu), does not return the same pointer", (size_t)entry.section); + if(ret) + fatal("DBENGINE METRIC: managed to add the same metric twice in (section 0)"); + + m3_t1 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section); + if(m3_t1 != m1_t1) + fatal("DBENGINE METRIC: cannot find the metric added (section %zu)", (size_t)entry.section); + + // delete the first metric + mrg_metric_release(mrg, m2_t0); + mrg_metric_release(mrg, m3_t0); + mrg_metric_release(mrg, m4_t0); + mrg_metric_set_first_time_s(mrg, m1_t0, 0); + mrg_metric_set_clean_latest_time_s(mrg, m1_t0, 0); + mrg_metric_set_hot_latest_time_s(mrg, m1_t0, 0); + if(!mrg_metric_release_and_delete(mrg, m1_t0)) + fatal("DBENGINE METRIC: cannot delete the first metric"); + + m4_t1 = mrg_metric_get_and_acquire(mrg, &entry.uuid, entry.section); + if(m4_t1 != m1_t1) + fatal("DBENGINE METRIC: cannot find the metric added (section %zu), after deleting the first one", (size_t)entry.section); + + // delete the second metric + mrg_metric_release(mrg, m2_t1); + mrg_metric_release(mrg, m3_t1); + mrg_metric_release(mrg, m4_t1); + mrg_metric_set_first_time_s(mrg, m1_t1, 0); + mrg_metric_set_clean_latest_time_s(mrg, m1_t1, 0); + mrg_metric_set_hot_latest_time_s(mrg, m1_t1, 0); + if(!mrg_metric_release_and_delete(mrg, m1_t1)) + fatal("DBENGINE METRIC: cannot delete the second metric"); + + if(mrg->stats.entries != 0) + fatal("DBENGINE METRIC: invalid entries counter"); + +#ifdef MRG_STRESS_TEST + usec_t started_ut = now_monotonic_usec(); + pthread_t thread1; + netdata_thread_create(&thread1, "TH1", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress_test_thread1, mrg); + + pthread_t thread2; + netdata_thread_create(&thread2, "TH2", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress_test_thread2, mrg); + + pthread_t thread3; + netdata_thread_create(&thread3, "TH3", + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress_test_thread3, mrg); + + + sleep_usec(5 * USEC_PER_SEC); + + netdata_thread_cancel(thread1); + netdata_thread_cancel(thread2); + netdata_thread_cancel(thread3); + + netdata_thread_join(thread1, NULL); + netdata_thread_join(thread2, NULL); + netdata_thread_join(thread3, NULL); + usec_t ended_ut = now_monotonic_usec(); + + info("DBENGINE METRIC: did %zu additions, %zu duplicate additions, " + "%zu deletions, %zu wrong deletions, " + "%zu successful searches, %zu wrong searches, " + "%zu successful pointer validations, %zu wrong pointer validations " + "in %llu usecs", + mrg->stats.additions, mrg->stats.additions_duplicate, + mrg->stats.deletions, mrg->stats.delete_misses, + mrg->stats.search_hits, mrg->stats.search_misses, + mrg->stats.pointer_validation_hits, mrg->stats.pointer_validation_misses, + ended_ut - started_ut); + +#endif + + mrg_destroy(mrg); + + info("DBENGINE METRIC: all tests passed!"); + + return 0; +} diff --git a/database/engine/metric.h b/database/engine/metric.h new file mode 100644 index 000000000..82aff903a --- /dev/null +++ b/database/engine/metric.h @@ -0,0 +1,79 @@ +#ifndef DBENGINE_METRIC_H +#define DBENGINE_METRIC_H + +#include "../rrd.h" + +#define MRG_PARTITIONS 10 + +typedef struct metric METRIC; +typedef struct mrg MRG; + +typedef struct mrg_entry { + uuid_t uuid; + Word_t section; + time_t first_time_s; + time_t last_time_s; + uint32_t latest_update_every_s; +} MRG_ENTRY; + +struct mrg_statistics { + size_t entries; + size_t entries_referenced; + size_t entries_with_retention; + + size_t size; // total memory used, with indexing + + size_t current_references; + + size_t additions; + size_t additions_duplicate; + + size_t deletions; + size_t delete_having_retention_or_referenced; + size_t delete_misses; + + size_t search_hits; + size_t search_misses; + + size_t writers; + size_t writers_conflicts; +}; + +MRG *mrg_create(void); +void mrg_destroy(MRG *mrg); + +METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric); +bool mrg_metric_release(MRG *mrg, METRIC *metric); + +METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret); +METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section); +bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric); + +Word_t mrg_metric_id(MRG *mrg, METRIC *metric); +uuid_t *mrg_metric_uuid(MRG *mrg, METRIC *metric); +Word_t mrg_metric_section(MRG *mrg, METRIC *metric); + +bool mrg_metric_set_first_time_s(MRG *mrg, METRIC *metric, time_t first_time_s); +bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg, METRIC *metric, time_t first_time_s); +time_t mrg_metric_get_first_time_s(MRG *mrg, METRIC *metric); + +bool mrg_metric_set_clean_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s); +bool mrg_metric_set_hot_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s); +time_t mrg_metric_get_latest_time_s(MRG *mrg, METRIC *metric); + +bool mrg_metric_set_update_every(MRG *mrg, METRIC *metric, time_t update_every_s); +bool mrg_metric_set_update_every_s_if_zero(MRG *mrg, METRIC *metric, time_t update_every_s); +time_t mrg_metric_get_update_every_s(MRG *mrg, METRIC *metric); + +void mrg_metric_expand_retention(MRG *mrg, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s); +void mrg_metric_get_retention(MRG *mrg, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s); +bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric); + +bool mrg_metric_set_writer(MRG *mrg, METRIC *metric); +bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric); + +struct mrg_statistics mrg_get_statistics(MRG *mrg); +size_t mrg_aral_structures(void); +size_t mrg_aral_overhead(void); + +#endif // DBENGINE_METRIC_H diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index 4f5da7084..b4902d784 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -3,1084 +3,836 @@ #include "rrdengine.h" -ARAL page_descr_aral = { - .requested_element_size = sizeof(struct rrdeng_page_descr), - .initial_elements = 20000, - .filename = "page_descriptors", - .cache_dir = &netdata_configured_cache_dir, - .use_mmap = false, - .internal.initialized = false -}; - -void rrdeng_page_descr_aral_go_singlethreaded(void) { - page_descr_aral.internal.lockless = true; -} -void rrdeng_page_descr_aral_go_multithreaded(void) { - page_descr_aral.internal.lockless = false; -} +MRG *main_mrg = NULL; +PGC *main_cache = NULL; +PGC *open_cache = NULL; +PGC *extent_cache = NULL; +struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats = {}; -struct rrdeng_page_descr *rrdeng_page_descr_mallocz(void) { - struct rrdeng_page_descr *descr; - descr = arrayalloc_mallocz(&page_descr_aral); - return descr; +static void main_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) +{ + // Release storage associated with the page + dbengine_page_free(entry.data, entry.size); } +static void main_cache_flush_dirty_page_init_callback(PGC *cache __maybe_unused, Word_t section) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *) section; -void rrdeng_page_descr_freez(struct rrdeng_page_descr *descr) { - arrayalloc_freez(&page_descr_aral, descr); + // mark ctx as having flushing in progress + __atomic_add_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED); } -void rrdeng_page_descr_use_malloc(void) { - if(page_descr_aral.internal.initialized) - error("DBENGINE: cannot change ARAL allocation policy after it has been initialized."); - else - page_descr_aral.use_mmap = false; -} +static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) +{ + if(!entries) + return; -void rrdeng_page_descr_use_mmap(void) { - if(page_descr_aral.internal.initialized) - error("DBENGINE: cannot change ARAL allocation policy after it has been initialized."); - else - page_descr_aral.use_mmap = true; -} + struct rrdengine_instance *ctx = (struct rrdengine_instance *) entries_array[0].section; -bool rrdeng_page_descr_is_mmap(void) { - return page_descr_aral.use_mmap; -} + size_t bytes_per_point = CTX_POINT_SIZE_BYTES(ctx); -/* Forward declarations */ -static int pg_cache_try_evict_one_page_unsafe(struct rrdengine_instance *ctx); + struct page_descr_with_data *base = NULL; -/* always inserts into tail */ -static inline void pg_cache_replaceQ_insert_unsafe(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + for (size_t Index = 0 ; Index < entries; Index++) { + time_t start_time_s = entries_array[Index].start_time_s; + time_t end_time_s = entries_array[Index].end_time_s; + struct page_descr_with_data *descr = page_descriptor_get(); - if (likely(NULL != pg_cache->replaceQ.tail)) { - pg_cache_descr->prev = pg_cache->replaceQ.tail; - pg_cache->replaceQ.tail->next = pg_cache_descr; - } - if (unlikely(NULL == pg_cache->replaceQ.head)) { - pg_cache->replaceQ.head = pg_cache_descr; - } - pg_cache->replaceQ.tail = pg_cache_descr; -} - -static inline void pg_cache_replaceQ_delete_unsafe(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr, *prev, *next; + descr->id = mrg_metric_uuid(main_mrg, (METRIC *) entries_array[Index].metric_id); + descr->metric_id = entries_array[Index].metric_id; + descr->start_time_ut = start_time_s * USEC_PER_SEC; + descr->end_time_ut = end_time_s * USEC_PER_SEC; + descr->update_every_s = entries_array[Index].update_every_s; + descr->type = ctx->config.page_type; - prev = pg_cache_descr->prev; - next = pg_cache_descr->next; + descr->page_length = (end_time_s - (start_time_s - descr->update_every_s)) / descr->update_every_s * bytes_per_point; - if (likely(NULL != prev)) { - prev->next = next; - } - if (likely(NULL != next)) { - next->prev = prev; - } - if (unlikely(pg_cache_descr == pg_cache->replaceQ.head)) { - pg_cache->replaceQ.head = next; - } - if (unlikely(pg_cache_descr == pg_cache->replaceQ.tail)) { - pg_cache->replaceQ.tail = prev; - } - pg_cache_descr->prev = pg_cache_descr->next = NULL; -} + if(descr->page_length > entries_array[Index].size) { + descr->page_length = entries_array[Index].size; -void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - pg_cache_replaceQ_insert_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); -} + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE: page exceeds the maximum size, adjusting it to max."); + } -void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + descr->page = pgc_page_data(pages_array[Index]); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, descr, link.prev, link.next); - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - pg_cache_replaceQ_delete_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); -} -void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + internal_fatal(descr->page_length > RRDENG_BLOCK_SIZE, "DBENGINE: faulty page length calculation"); + } - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - pg_cache_replaceQ_delete_unsafe(ctx, descr); - pg_cache_replaceQ_insert_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + struct completion completion; + completion_init(&completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_EXTENT_WRITE, base, &completion, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + completion_wait_for(&completion); + completion_destroy(&completion); } -struct rrdeng_page_descr *pg_cache_create_descr(void) +static void open_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { - struct rrdeng_page_descr *descr; - - descr = rrdeng_page_descr_mallocz(); - descr->page_length = 0; - descr->start_time_ut = INVALID_TIME; - descr->end_time_ut = INVALID_TIME; - descr->id = NULL; - descr->extent = NULL; - descr->pg_cache_descr_state = 0; - descr->pg_cache_descr = NULL; - descr->update_every_s = 0; - - return descr; + struct rrdengine_datafile *datafile = entry.data; + datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE); } -/* The caller must hold page descriptor lock. */ -void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_descr *descr) +static void open_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - if (pg_cache_descr->waiters) - uv_cond_broadcast(&pg_cache_descr->cond); + ; } -void pg_cache_wake_up_waiters(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +static void extent_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); + dbengine_extent_free(entry.data, entry.size); } -/* - * The caller must hold page descriptor lock. - * The lock will be released and re-acquired. The descriptor is not guaranteed - * to exist after this function returns. - */ -void pg_cache_wait_event_unsafe(struct rrdeng_page_descr *descr) +static void extent_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - - ++pg_cache_descr->waiters; - uv_cond_wait(&pg_cache_descr->cond, &pg_cache_descr->mutex); - --pg_cache_descr->waiters; + ; } -/* - * The caller must hold page descriptor lock. - * The lock will be released and re-acquired. The descriptor is not guaranteed - * to exist after this function returns. - * Returns UV_ETIMEDOUT if timeout_sec seconds pass. - */ -int pg_cache_timedwait_event_unsafe(struct rrdeng_page_descr *descr, uint64_t timeout_sec) -{ - int ret; - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; +inline TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s) { + // page_first_time_s <= wanted_end_time_s && page_last_time_s >= wanted_start_time_s + + if(page_last_time_s < wanted_start_time_s) + return PAGE_IS_IN_THE_PAST; - ++pg_cache_descr->waiters; - ret = uv_cond_timedwait(&pg_cache_descr->cond, &pg_cache_descr->mutex, timeout_sec * NSEC_PER_SEC); - --pg_cache_descr->waiters; + if(page_first_time_s > wanted_end_time_s) + return PAGE_IS_IN_THE_FUTURE; - return ret; + return PAGE_IS_IN_RANGE; } -/* - * Returns page flags. - * The lock will be released and re-acquired. The descriptor is not guaranteed - * to exist after this function returns. - */ -unsigned long pg_cache_wait_event(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +static int journal_metric_uuid_compare(const void *key, const void *metric) { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - unsigned long flags; - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_wait_event_unsafe(descr); - flags = pg_cache_descr->flags; - rrdeng_page_descr_mutex_unlock(ctx, descr); - - return flags; + return uuid_compare(*(uuid_t *) key, ((struct journal_metric_list *) metric)->uuid); } -/* - * The caller must hold page descriptor lock. - */ -int pg_cache_can_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; +static inline struct page_details *pdc_find_page_for_time( + Pcvoid_t PArray, + time_t wanted_time_s, + size_t *gaps, + PDC_PAGE_STATUS mode, + PDC_PAGE_STATUS skip_list +) { + Word_t PIndexF = wanted_time_s, PIndexL = wanted_time_s; + Pvoid_t *PValueF, *PValueL; + struct page_details *pdF = NULL, *pdL = NULL; + bool firstF = true, firstL = true; + + PDC_PAGE_STATUS ignore_list = PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | skip_list; + + while ((PValueF = PDCJudyLFirstThenNext(PArray, &PIndexF, &firstF))) { + pdF = *PValueF; + + PDC_PAGE_STATUS status = __atomic_load_n(&pdF->status, __ATOMIC_ACQUIRE); + if (!(status & (ignore_list | mode))) + break; - if ((pg_cache_descr->flags & (RRD_PAGE_LOCKED | RRD_PAGE_READ_PENDING)) || - (exclusive_access && pg_cache_descr->refcnt)) { - return 0; + pdF = NULL; } - return 1; -} + while ((PValueL = PDCJudyLLastThenPrev(PArray, &PIndexL, &firstL))) { + pdL = *PValueL; -/* - * The caller must hold page descriptor lock. - * Gets a reference to the page descriptor. - * Returns 1 on success and 0 on failure. - */ -int pg_cache_try_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + PDC_PAGE_STATUS status = __atomic_load_n(&pdL->status, __ATOMIC_ACQUIRE); + if(status & mode) { + // don't go all the way back to the beginning + // stop at the last processed + pdL = NULL; + break; + } - if (!pg_cache_can_get_unsafe(descr, exclusive_access)) - return 0; + if (!(status & ignore_list)) + break; - if (exclusive_access) - pg_cache_descr->flags |= RRD_PAGE_LOCKED; - ++pg_cache_descr->refcnt; + pdL = NULL; + } - return 1; -} + TIME_RANGE_COMPARE rcF = (pdF) ? is_page_in_time_range(pdF->first_time_s, pdF->last_time_s, wanted_time_s, wanted_time_s) : PAGE_IS_IN_THE_FUTURE; + TIME_RANGE_COMPARE rcL = (pdL) ? is_page_in_time_range(pdL->first_time_s, pdL->last_time_s, wanted_time_s, wanted_time_s) : PAGE_IS_IN_THE_PAST; -/* - * The caller must hold the page descriptor lock. - * This function may block doing cleanup. - */ -void pg_cache_put_unsafe(struct rrdeng_page_descr *descr) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + if (!pdF || pdF == pdL) { + // F is missing, or they are the same + // return L + (*gaps) += (rcL == PAGE_IS_IN_RANGE) ? 0 : 1; + return pdL; + } - pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; - if (0 == --pg_cache_descr->refcnt) { - pg_cache_wake_up_waiters_unsafe(descr); + if (!pdL) { + // L is missing + // return F + (*gaps) += (rcF == PAGE_IS_IN_RANGE) ? 0 : 1; + return pdF; } -} -/* - * This function may block doing cleanup. - */ -void pg_cache_put(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_put_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); -} + if (rcF == rcL) { + // both are on the same side, + // but they are different pages -/* The caller must hold the page cache lock */ -static void pg_cache_release_pages_unsafe(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + switch (rcF) { + case PAGE_IS_IN_RANGE: + // pick the higher resolution + if (pdF->update_every_s && pdF->update_every_s < pdL->update_every_s) + return pdF; - pg_cache->populated_pages -= number; -} + if (pdL->update_every_s && pdL->update_every_s < pdF->update_every_s) + return pdL; -static void pg_cache_release_pages(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + // same resolution - pick the one that starts earlier + if (pdL->first_time_s < pdF->first_time_s) + return pdL; - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - pg_cache_release_pages_unsafe(ctx, number); - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); -} + return pdF; + break; -/* - * This function returns the maximum number of pages allowed in the page cache. - */ -unsigned long pg_cache_hard_limit(struct rrdengine_instance *ctx) -{ - return ctx->max_cache_pages + (unsigned long)ctx->metric_API_max_producers; -} + case PAGE_IS_IN_THE_FUTURE: + (*gaps)++; -/* - * This function returns the low watermark number of pages in the page cache. The page cache should strive to keep the - * number of pages below that number. - */ -unsigned long pg_cache_soft_limit(struct rrdengine_instance *ctx) -{ - return ctx->cache_pages_low_watermark + (unsigned long)ctx->metric_API_max_producers; -} + // pick the one that starts earlier + if (pdL->first_time_s < pdF->first_time_s) + return pdL; -/* - * This function returns the maximum number of dirty pages that are committed to be written to disk allowed in the page - * cache. - */ -unsigned long pg_cache_committed_hard_limit(struct rrdengine_instance *ctx) -{ - /* We remove the active pages of the producers from the calculation and only allow the extra pinned pages */ - return ctx->cache_pages_low_watermark + (unsigned long)ctx->metric_API_max_producers; -} + return pdF; + break; -/* - * This function will block until it reserves #number populated pages. - * It will trigger evictions or dirty page flushing if the pg_cache_hard_limit() limit is hit. - */ -static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned failures = 0; - const unsigned FAILURES_CEILING = 10; /* truncates exponential backoff to (2^FAILURES_CEILING x slot) */ - unsigned long exp_backoff_slot_usec = USEC_PER_MS * 10; - - assert(number < ctx->max_cache_pages); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - if (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) - debug(D_RRDENGINE, "==Page cache full. Reserving %u pages.==", - number); - while (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) { - - if (!pg_cache_try_evict_one_page_unsafe(ctx)) { - /* failed to evict */ - struct completion compl; - struct rrdeng_cmd cmd; - - ++failures; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - - completion_init(&compl); - cmd.opcode = RRDENG_FLUSH_PAGES; - cmd.completion = &compl; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); - /* wait for some pages to be flushed */ - debug(D_RRDENGINE, "%s: waiting for pages to be written to disk before evicting.", __func__); - completion_wait_for(&compl); - completion_destroy(&compl); - - if (unlikely(failures > 1)) { - unsigned long slots, usecs_to_sleep; - /* exponential backoff */ - slots = random() % (2LU << MIN(failures, FAILURES_CEILING)); - usecs_to_sleep = slots * exp_backoff_slot_usec; - - if (usecs_to_sleep >= USEC_PER_SEC) - error("Page cache is full. Sleeping for %llu second(s).", usecs_to_sleep / USEC_PER_SEC); - - (void)sleep_usec(usecs_to_sleep); - } - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); + default: + case PAGE_IS_IN_THE_PAST: + (*gaps)++; + return NULL; + break; } } - pg_cache->populated_pages += number; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); -} -/* - * This function will attempt to reserve #number populated pages. - * It may trigger evictions if the pg_cache_soft_limit() limit is hit. - * Returns 0 on failure and 1 on success. - */ -static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned number) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned count = 0; - int ret = 0; - - assert(number < ctx->max_cache_pages); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - if (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1) { - debug(D_RRDENGINE, - "==Page cache full. Trying to reserve %u pages.==", - number); - do { - if (!pg_cache_try_evict_one_page_unsafe(ctx)) - break; - ++count; - } while (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1); - debug(D_RRDENGINE, "Evicted %u pages.", count); + if(rcF == PAGE_IS_IN_RANGE) { + // (*gaps) += 0; + return pdF; } - if (pg_cache->populated_pages + number < pg_cache_hard_limit(ctx) + 1) { - pg_cache->populated_pages += number; - ret = 1; /* success */ + if(rcL == PAGE_IS_IN_RANGE) { + // (*gaps) += 0; + return pdL; } - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - return ret; -} + if(rcF == PAGE_IS_IN_THE_FUTURE) { + (*gaps)++; + return pdF; + } -/* The caller must hold the page cache and the page descriptor locks in that order */ -static void pg_cache_evict_unsafe(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + if(rcL == PAGE_IS_IN_THE_FUTURE) { + (*gaps)++; + return pdL; + } - dbengine_page_free(pg_cache_descr->page); - pg_cache_descr->page = NULL; - pg_cache_descr->flags &= ~RRD_PAGE_POPULATED; - pg_cache_release_pages_unsafe(ctx, 1); - ++ctx->stats.pg_cache_evictions; + // impossible case + (*gaps)++; + return NULL; } -/* - * The caller must hold the page cache lock. - * Lock order: page cache -> replaceQ -> page descriptor - * This function iterates all pages and tries to evict one. - * If it fails it sets in_flight_descr to the oldest descriptor that has write-back in progress, - * or it sets it to NULL if no write-back is in progress. - * - * Returns 1 on success and 0 on failure. - */ -static int pg_cache_try_evict_one_page_unsafe(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned long old_flags; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr = NULL; +static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengine_instance *ctx, + time_t wanted_start_time_s, time_t wanted_end_time_s, + Pvoid_t *JudyL_page_array, size_t *cache_gaps, + bool open_cache_mode, PDC_PAGE_STATUS tags) { - uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - for (pg_cache_descr = pg_cache->replaceQ.head ; NULL != pg_cache_descr ; pg_cache_descr = pg_cache_descr->next) { - descr = pg_cache_descr->descr; + size_t pages_found_in_cache = 0; + Word_t metric_id = mrg_metric_id(main_mrg, metric); - rrdeng_page_descr_mutex_lock(ctx, descr); - old_flags = pg_cache_descr->flags; - if ((old_flags & RRD_PAGE_POPULATED) && !(old_flags & RRD_PAGE_DIRTY) && pg_cache_try_get_unsafe(descr, 1)) { - /* must evict */ - pg_cache_evict_unsafe(ctx, descr); - pg_cache_put_unsafe(descr); - pg_cache_replaceQ_delete_unsafe(ctx, descr); + time_t now_s = wanted_start_time_s; + time_t dt_s = mrg_metric_get_update_every_s(main_mrg, metric); - rrdeng_page_descr_mutex_unlock(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + if(!dt_s) + dt_s = default_rrd_update_every; - rrdeng_try_deallocate_pg_cache_descr(ctx, descr); + time_t previous_page_end_time_s = now_s - dt_s; + bool first = true; - return 1; - } - rrdeng_page_descr_mutex_unlock(ctx, descr); - } - uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + do { + PGC_PAGE *page = pgc_page_get_and_acquire( + cache, (Word_t)ctx, (Word_t)metric_id, now_s, + (first) ? PGC_SEARCH_CLOSEST : PGC_SEARCH_NEXT); - /* failed to evict */ - return 0; -} + first = false; -/** - * Deletes a page from the database. - * Callers of this function need to make sure they're not deleting the same descriptor concurrently. - * @param ctx is the database instance. - * @param descr is the page descriptor. - * @param remove_dirty must be non-zero if the page to be deleted is dirty. - * @param is_exclusive_holder must be non-zero if the caller holds an exclusive page reference. - * @param metric_id is set to the metric the page belongs to, if it's safe to delete the metric and metric_id is not - * NULL. Otherwise, metric_id is not set. - * @return 1 if it's safe to delete the metric, 0 otherwise. - */ -uint8_t pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, uint8_t remove_dirty, - uint8_t is_exclusive_holder, uuid_t *metric_id) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct page_cache_descr *pg_cache_descr = NULL; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - int ret; - uint8_t can_delete_metric = 0; - - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, descr->id, sizeof(uuid_t)); - fatal_assert(NULL != PValue); - page_index = *PValue; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - - uv_rwlock_wrlock(&page_index->lock); - ret = JudyLDel(&page_index->JudyL_array, (Word_t)(descr->start_time_ut / USEC_PER_SEC), PJE0); - if (unlikely(0 == ret)) { - uv_rwlock_wrunlock(&page_index->lock); - if (unlikely(debug_flags & D_RRDENGINE)) { - print_page_descr(descr); - } - goto destroy; - } - --page_index->page_count; - if (!page_index->writers && !page_index->page_count) { - can_delete_metric = 1; - if (metric_id) { - memcpy(metric_id, page_index->id, sizeof(uuid_t)); - } - } - uv_rwlock_wrunlock(&page_index->lock); - fatal_assert(1 == ret); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - ++ctx->stats.pg_cache_deletions; - --pg_cache->page_descriptors; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - if (!is_exclusive_holder) { - /* If we don't hold an exclusive page reference get one */ - while (!pg_cache_try_get_unsafe(descr, 1)) { - debug(D_RRDENGINE, "%s: Waiting for locked page:", __func__); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - pg_cache_wait_event_unsafe(descr); + if(!page) { + if(previous_page_end_time_s < wanted_end_time_s) + (*cache_gaps)++; + + break; } - } - if (remove_dirty) { - pg_cache_descr->flags &= ~RRD_PAGE_DIRTY; - } else { - /* even a locked page could be dirty */ - while (unlikely(pg_cache_descr->flags & RRD_PAGE_DIRTY)) { - debug(D_RRDENGINE, "%s: Found dirty page, waiting for it to be flushed:", __func__); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - pg_cache_wait_event_unsafe(descr); + + time_t page_start_time_s = pgc_page_start_time_s(page); + time_t page_end_time_s = pgc_page_end_time_s(page); + time_t page_update_every_s = pgc_page_update_every_s(page); + size_t page_length = pgc_page_data_size(cache, page); + + if(!page_update_every_s) + page_update_every_s = dt_s; + + if(is_page_in_time_range(page_start_time_s, page_end_time_s, wanted_start_time_s, wanted_end_time_s) != PAGE_IS_IN_RANGE) { + // not a useful page for this query + pgc_page_release(cache, page); + page = NULL; + + if(previous_page_end_time_s < wanted_end_time_s) + (*cache_gaps)++; + + break; } - } - rrdeng_page_descr_mutex_unlock(ctx, descr); - - while (unlikely(pg_cache_descr->flags & RRD_PAGE_READ_PENDING)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "%s: Found page with READ PENDING, waiting for read to complete", __func__); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - pg_cache_wait_event_unsafe(descr); - } - if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - /* only after locking can it be safely deleted from LRU */ - pg_cache_replaceQ_delete(ctx, descr); + if (page_start_time_s - previous_page_end_time_s > dt_s) + (*cache_gaps)++; + + Pvoid_t *PValue = PDCJudyLIns(JudyL_page_array, (Word_t) page_start_time_s, PJE0); + if (!PValue || PValue == PJERR) + fatal("DBENGINE: corrupted judy array in %s()", __FUNCTION__ ); + + if (unlikely(*PValue)) { + struct page_details *pd = *PValue; + UNUSED(pd); + +// internal_error( +// pd->first_time_s != page_first_time_s || +// pd->last_time_s != page_last_time_s || +// pd->update_every_s != page_update_every_s, +// "DBENGINE: duplicate page with different retention in %s cache " +// "1st: %ld to %ld, ue %u, size %u " +// "2nd: %ld to %ld, ue %ld size %zu " +// "- ignoring the second", +// cache == open_cache ? "open" : "main", +// pd->first_time_s, pd->last_time_s, pd->update_every_s, pd->page_length, +// page_first_time_s, page_last_time_s, page_update_every_s, page_length); + + pgc_page_release(cache, page); + } + else { - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - pg_cache_evict_unsafe(ctx, descr); - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - } - pg_cache_put(ctx, descr); - rrdeng_try_deallocate_pg_cache_descr(ctx, descr); - while (descr->pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED) { - rrdeng_try_deallocate_pg_cache_descr(ctx, descr); /* spin */ - (void)sleep_usec(1000); /* 1 msec */ - } -destroy: - rrdeng_page_descr_freez(descr); - pg_cache_update_metric_times(page_index); + internal_fatal(pgc_page_metric(page) != metric_id, "Wrong metric id in page found in cache"); + internal_fatal(pgc_page_section(page) != (Word_t)ctx, "Wrong section in page found in cache"); - return can_delete_metric; -} + struct page_details *pd = page_details_get(); + pd->metric_id = metric_id; + pd->first_time_s = page_start_time_s; + pd->last_time_s = page_end_time_s; + pd->page_length = page_length; + pd->update_every_s = page_update_every_s; + pd->page = (open_cache_mode) ? NULL : page; + pd->status |= tags; -static inline int is_page_in_time_range(struct rrdeng_page_descr *descr, usec_t start_time, usec_t end_time) -{ - usec_t pg_start, pg_end; + if((pd->page)) { + pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED; - pg_start = descr->start_time_ut; - pg_end = descr->end_time_ut; + if(pgc_page_data(page) == DBENGINE_EMPTY_PAGE) + pd->status |= PDC_PAGE_EMPTY; + } - return (pg_start < start_time && pg_end >= start_time) || - (pg_start >= start_time && pg_start <= end_time); -} + if(open_cache_mode) { + struct rrdengine_datafile *datafile = pgc_page_data(page); + if(datafile_acquire(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS)) { // for pd + struct extent_io_data *xio = (struct extent_io_data *) pgc_page_custom_data(cache, page); + pd->datafile.ptr = pgc_page_data(page); + pd->datafile.file = xio->file; + pd->datafile.extent.pos = xio->pos; + pd->datafile.extent.bytes = xio->bytes; + pd->datafile.fileno = pd->datafile.ptr->fileno; + pd->status |= PDC_PAGE_DATAFILE_ACQUIRED | PDC_PAGE_DISK_PENDING; + } + else { + pd->status |= PDC_PAGE_FAILED | PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE; + } + pgc_page_release(cache, page); + } -static inline int is_point_in_time_in_page(struct rrdeng_page_descr *descr, usec_t point_in_time) -{ - return (point_in_time >= descr->start_time_ut && point_in_time <= descr->end_time_ut); -} + *PValue = pd; -/* The caller must hold the page index lock */ -static inline struct rrdeng_page_descr * - find_first_page_in_time_range(struct pg_cache_page_index *page_index, usec_t start_time, usec_t end_time) -{ - struct rrdeng_page_descr *descr = NULL; - Pvoid_t *PValue; - Word_t Index; - - Index = (Word_t)(start_time / USEC_PER_SEC); - PValue = JudyLLast(page_index->JudyL_array, &Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - if (is_page_in_time_range(descr, start_time, end_time)) { - return descr; + pages_found_in_cache++; } - } - Index = (Word_t)(start_time / USEC_PER_SEC); - PValue = JudyLFirst(page_index->JudyL_array, &Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - if (is_page_in_time_range(descr, start_time, end_time)) { - return descr; - } - } + // prepare for the next iteration + previous_page_end_time_s = page_end_time_s; - return NULL; -} + if(page_update_every_s > 0) + dt_s = page_update_every_s; -/* Update metric oldest and latest timestamps efficiently when adding new values */ -void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr) -{ - usec_t oldest_time = page_index->oldest_time_ut; - usec_t latest_time = page_index->latest_time_ut; + // we are going to as for the NEXT page + // so, set this to our first time + now_s = page_start_time_s; - if (unlikely(oldest_time == INVALID_TIME || descr->start_time_ut < oldest_time)) { - page_index->oldest_time_ut = descr->start_time_ut; - } - if (likely(descr->end_time_ut > latest_time || latest_time == INVALID_TIME)) { - page_index->latest_time_ut = descr->end_time_ut; - } + } while(now_s <= wanted_end_time_s); + + return pages_found_in_cache; } -/* Update metric oldest and latest timestamps when removing old values */ -void pg_cache_update_metric_times(struct pg_cache_page_index *page_index) -{ - Pvoid_t *firstPValue, *lastPValue; - Word_t firstIndex, lastIndex; - struct rrdeng_page_descr *descr; - usec_t oldest_time = INVALID_TIME; - usec_t latest_time = INVALID_TIME; - - uv_rwlock_rdlock(&page_index->lock); - /* Find first page in range */ - firstIndex = (Word_t)0; - firstPValue = JudyLFirst(page_index->JudyL_array, &firstIndex, PJE0); - if (likely(NULL != firstPValue)) { - descr = *firstPValue; - oldest_time = descr->start_time_ut; - } - lastIndex = (Word_t)-1; - lastPValue = JudyLLast(page_index->JudyL_array, &lastIndex, PJE0); - if (likely(NULL != lastPValue)) { - descr = *lastPValue; - latest_time = descr->end_time_ut; - } - uv_rwlock_rdunlock(&page_index->lock); +static void pgc_inject_gap(struct rrdengine_instance *ctx, METRIC *metric, time_t start_time_s, time_t end_time_s) { - if (unlikely(NULL == firstPValue)) { - fatal_assert(NULL == lastPValue); - page_index->oldest_time_ut = page_index->latest_time_ut = INVALID_TIME; + time_t db_first_time_s, db_last_time_s, db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + + if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) != PAGE_IS_IN_RANGE) return; - } - page_index->oldest_time_ut = oldest_time; - page_index->latest_time_ut = latest_time; + + PGC_ENTRY page_entry = { + .hot = false, + .section = (Word_t)ctx, + .metric_id = (Word_t)metric, + .start_time_s = MAX(start_time_s, db_first_time_s), + .end_time_s = MIN(end_time_s, db_last_time_s), + .update_every_s = 0, + .size = 0, + .data = DBENGINE_EMPTY_PAGE, + }; + + if(page_entry.start_time_s >= page_entry.end_time_s) + return; + + PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, NULL); + pgc_page_release(main_cache, page); } -/* If index is NULL lookup by UUID (descr->id) */ -void pg_cache_insert(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, - struct rrdeng_page_descr *descr) -{ - struct page_cache *pg_cache = &ctx->pg_cache; +static size_t list_has_time_gaps( + struct rrdengine_instance *ctx, + METRIC *metric, + Pvoid_t JudyL_page_array, + time_t wanted_start_time_s, + time_t wanted_end_time_s, + size_t *pages_total, + size_t *pages_found_pass4, + size_t *pages_pending, + size_t *pages_overlapping, + time_t *optimal_end_time_s, + bool populate_gaps +) { + // we will recalculate these, so zero them + *pages_pending = 0; + *pages_overlapping = 0; + *optimal_end_time_s = 0; + + bool first; Pvoid_t *PValue; - struct pg_cache_page_index *page_index; - unsigned long pg_cache_descr_state = descr->pg_cache_descr_state; - - if (0 != pg_cache_descr_state) { - /* there is page cache descriptor pre-allocated state */ - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - - fatal_assert(pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED); - if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - pg_cache_reserve_pages(ctx, 1); - if (!(pg_cache_descr->flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_insert(ctx, descr); - } - } + Word_t this_page_start_time; + struct page_details *pd; + + size_t gaps = 0; + Word_t metric_id = mrg_metric_id(main_mrg, metric); + + // ------------------------------------------------------------------------ + // PASS 1: remove the preprocessing flags from the pages in PDC - if (unlikely(NULL == index)) { - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, descr->id, sizeof(uuid_t)); - fatal_assert(NULL != PValue); - page_index = *PValue; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - } else { - page_index = index; + first = true; + this_page_start_time = 0; + while((PValue = PDCJudyLFirstThenNext(JudyL_page_array, &this_page_start_time, &first))) { + pd = *PValue; + pd->status &= ~(PDC_PAGE_SKIP|PDC_PAGE_PREPROCESSED); } - uv_rwlock_wrlock(&page_index->lock); - PValue = JudyLIns(&page_index->JudyL_array, (Word_t)(descr->start_time_ut / USEC_PER_SEC), PJE0); - *PValue = descr; - ++page_index->page_count; - pg_cache_add_new_metric_time(page_index, descr); - uv_rwlock_wrunlock(&page_index->lock); - - uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - ++ctx->stats.pg_cache_insertions; - ++pg_cache->page_descriptors; - uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); -} + // ------------------------------------------------------------------------ + // PASS 2: emulate processing to find the useful pages -usec_t pg_cache_oldest_time_in_range(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time_ut, usec_t end_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; + time_t now_s = wanted_start_time_s; + time_t dt_s = mrg_metric_get_update_every_s(main_mrg, metric); + if(!dt_s) + dt_s = default_rrd_update_every; - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - return INVALID_TIME; - } + size_t pages_pass2 = 0, pages_pass3 = 0; + while((pd = pdc_find_page_for_time( + JudyL_page_array, now_s, &gaps, + PDC_PAGE_PREPROCESSED, 0))) { - uv_rwlock_rdlock(&page_index->lock); - descr = find_first_page_in_time_range(page_index, start_time_ut, end_time_ut); - if (NULL == descr) { - uv_rwlock_rdunlock(&page_index->lock); - return INVALID_TIME; - } - uv_rwlock_rdunlock(&page_index->lock); - return descr->start_time_ut; -} + pd->status |= PDC_PAGE_PREPROCESSED; + pages_pass2++; -/** - * Return page information for the first page before point_in_time that satisfies the filter. - * @param ctx DB context - * @param page_index page index of a metric - * @param point_in_time_ut the pages that are searched must be older than this timestamp - * @param filter decides if the page satisfies the caller's criteria - * @param page_info the result of the search is set in this pointer - */ -void pg_cache_get_filtered_info_prev(struct rrdengine_instance *ctx, struct pg_cache_page_index *page_index, - usec_t point_in_time_ut, pg_cache_page_info_filter_t *filter, - struct rrdeng_page_info *page_info) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - Pvoid_t *PValue; - Word_t Index; + if(pd->update_every_s) + dt_s = pd->update_every_s; - (void)pg_cache; - fatal_assert(NULL != page_index); + if(populate_gaps && pd->first_time_s > now_s) + pgc_inject_gap(ctx, metric, now_s, pd->first_time_s); - Index = (Word_t)(point_in_time_ut / USEC_PER_SEC); - uv_rwlock_rdlock(&page_index->lock); - do { - PValue = JudyLPrev(page_index->JudyL_array, &Index, PJE0); - descr = unlikely(NULL == PValue) ? NULL : *PValue; - } while (descr != NULL && !filter(descr)); - if (unlikely(NULL == descr)) { - page_info->page_length = 0; - page_info->start_time_ut = INVALID_TIME; - page_info->end_time_ut = INVALID_TIME; - } else { - page_info->page_length = descr->page_length; - page_info->start_time_ut = descr->start_time_ut; - page_info->end_time_ut = descr->end_time_ut; + now_s = pd->last_time_s + dt_s; + if(now_s > wanted_end_time_s) { + *optimal_end_time_s = pd->last_time_s; + break; + } } - uv_rwlock_rdunlock(&page_index->lock); -} -/** - * Searches for an unallocated page without triggering disk I/O. Attempts to reserve the page and get a reference. - * @param ctx DB context - * @param id lookup by UUID - * @param start_time_ut exact starting time in usec - * @param ret_page_indexp Sets the page index pointer (*ret_page_indexp) for the given UUID. - * @return the page descriptor or NULL on failure. It can fail if: - * 1. The page is already allocated to the page cache. - * 2. It did not succeed to get a reference. - * 3. It did not succeed to reserve a spot in the page cache. - */ -struct rrdeng_page_descr *pg_cache_lookup_unpopulated_and_lock(struct rrdengine_instance *ctx, uuid_t *id, - usec_t start_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - Word_t Index; + if(populate_gaps && now_s < wanted_end_time_s) + pgc_inject_gap(ctx, metric, now_s, wanted_end_time_s); - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); + // ------------------------------------------------------------------------ + // PASS 3: mark as skipped all the pages not useful - if ((NULL == PValue) || !pg_cache_try_reserve_pages(ctx, 1)) { - /* Failed to find page or failed to reserve a spot in the cache */ - return NULL; - } + first = true; + this_page_start_time = 0; + while((PValue = PDCJudyLFirstThenNext(JudyL_page_array, &this_page_start_time, &first))) { + pd = *PValue; - uv_rwlock_rdlock(&page_index->lock); - Index = (Word_t)(start_time_ut / USEC_PER_SEC); - PValue = JudyLGet(page_index->JudyL_array, Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - } - if (NULL == PValue || 0 == descr->page_length) { - /* Failed to find non-empty page */ - uv_rwlock_rdunlock(&page_index->lock); + internal_fatal(pd->metric_id != metric_id, "pd has wrong metric_id"); - pg_cache_release_pages(ctx, 1); - return NULL; - } + if(!(pd->status & PDC_PAGE_PREPROCESSED)) { + (*pages_overlapping)++; + pd->status |= PDC_PAGE_SKIP; + pd->status &= ~(PDC_PAGE_READY | PDC_PAGE_DISK_PENDING); + continue; + } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - uv_rwlock_rdunlock(&page_index->lock); + pages_pass3++; - if ((flags & RRD_PAGE_POPULATED) || !pg_cache_try_get_unsafe(descr, 1)) { - /* Failed to get reference or page is already populated */ - rrdeng_page_descr_mutex_unlock(ctx, descr); + if(!pd->page) { + pd->page = pgc_page_get_and_acquire(main_cache, (Word_t) ctx, (Word_t) metric_id, pd->first_time_s, PGC_SEARCH_EXACT); - pg_cache_release_pages(ctx, 1); - return NULL; + if(pd->page) { + (*pages_found_pass4)++; + + pd->status &= ~PDC_PAGE_DISK_PENDING; + pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED | PDC_PAGE_PRELOADED_PASS4; + + if(pgc_page_data(pd->page) == DBENGINE_EMPTY_PAGE) + pd->status |= PDC_PAGE_EMPTY; + + } + else if(!(pd->status & PDC_PAGE_FAILED) && (pd->status & PDC_PAGE_DATAFILE_ACQUIRED)) { + (*pages_pending)++; + + pd->status |= PDC_PAGE_DISK_PENDING; + + internal_fatal(pd->status & PDC_PAGE_SKIP, "page is disk pending and skipped"); + internal_fatal(!pd->datafile.ptr, "datafile is NULL"); + internal_fatal(!pd->datafile.extent.bytes, "datafile.extent.bytes zero"); + internal_fatal(!pd->datafile.extent.pos, "datafile.extent.pos is zero"); + internal_fatal(!pd->datafile.fileno, "datafile.fileno is zero"); + } + } + else { + pd->status &= ~PDC_PAGE_DISK_PENDING; + pd->status |= (PDC_PAGE_READY | PDC_PAGE_PRELOADED); + } } - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - return descr; + internal_fatal(pages_pass2 != pages_pass3, + "DBENGINE: page count does not match"); + + *pages_total = pages_pass2; + + return gaps; } -/** - * Searches for pages in a time range and triggers disk I/O if necessary and possible. - * Does not get a reference. - * @param ctx DB context - * @param id UUID - * @param start_time_ut inclusive starting time in usec - * @param end_time_ut inclusive ending time in usec - * @param page_info_arrayp It allocates (*page_arrayp) and populates it with information of pages that overlap - * with the time range [start_time,end_time]. The caller must free (*page_info_arrayp) with freez(). - * If page_info_arrayp is set to NULL nothing was allocated. - * @param ret_page_indexp Sets the page index pointer (*ret_page_indexp) for the given UUID. - * @return the number of pages that overlap with the time range [start_time,end_time]. - */ -unsigned pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time_ut, usec_t end_time_ut, - struct rrdeng_page_info **page_info_arrayp, struct pg_cache_page_index **ret_page_indexp) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL, *preload_array[PAGE_CACHE_MAX_PRELOAD_PAGES]; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned i, j, k, preload_count, count, page_info_array_max_size; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - Word_t Index; - uint8_t failed_to_reserve; +typedef void (*page_found_callback_t)(PGC_PAGE *page, void *data); +static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METRIC *metric, usec_t start_time_ut, usec_t end_time_ut, page_found_callback_t callback, void *callback_data) { + uuid_t *uuid = mrg_metric_uuid(main_mrg, metric); + Word_t metric_id = mrg_metric_id(main_mrg, metric); - fatal_assert(NULL != ret_page_indexp); + time_t wanted_start_time_s = (time_t)(start_time_ut / USEC_PER_SEC); + time_t wanted_end_time_s = (time_t)(end_time_ut / USEC_PER_SEC); - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - *ret_page_indexp = page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - debug(D_RRDENGINE, "%s: No page was found to attempt preload.", __func__); - *ret_page_indexp = NULL; - return 0; - } + size_t pages_found = 0; - uv_rwlock_rdlock(&page_index->lock); - descr = find_first_page_in_time_range(page_index, start_time_ut, end_time_ut); - if (NULL == descr) { - uv_rwlock_rdunlock(&page_index->lock); - debug(D_RRDENGINE, "%s: No page was found to attempt preload.", __func__); - *ret_page_indexp = NULL; - return 0; - } else { - Index = (Word_t)(descr->start_time_ut / USEC_PER_SEC); - } - if (page_info_arrayp) { - page_info_array_max_size = PAGE_CACHE_MAX_PRELOAD_PAGES * sizeof(struct rrdeng_page_info); - *page_info_arrayp = mallocz(page_info_array_max_size); - } + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + struct rrdengine_datafile *datafile; + for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) { + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, + wanted_start_time_s, + wanted_end_time_s); + if (unlikely(!j2_header)) + continue; - for (count = 0, preload_count = 0 ; - descr != NULL && is_page_in_time_range(descr, start_time_ut, end_time_ut) ; - PValue = JudyLNext(page_index->JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue) { - /* Iterate all pages in range */ + time_t journal_start_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC); - if (unlikely(0 == descr->page_length)) + // the datafile possibly contains useful data for this query + + size_t journal_metric_count = (size_t)j2_header->metric_count; + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); + struct journal_metric_list *uuid_entry = bsearch(uuid,uuid_list,journal_metric_count,sizeof(*uuid_list), journal_metric_uuid_compare); + + if (unlikely(!uuid_entry)) { + // our UUID is not in this datafile + journalfile_v2_data_release(datafile->journalfile); continue; - if (page_info_arrayp) { - if (unlikely(count >= page_info_array_max_size / sizeof(struct rrdeng_page_info))) { - page_info_array_max_size += PAGE_CACHE_MAX_PRELOAD_PAGES * sizeof(struct rrdeng_page_info); - *page_info_arrayp = reallocz(*page_info_arrayp, page_info_array_max_size); - } - (*page_info_arrayp)[count].start_time_ut = descr->start_time_ut; - (*page_info_arrayp)[count].end_time_ut = descr->end_time_ut; - (*page_info_arrayp)[count].page_length = descr->page_length; } - ++count; - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - if (pg_cache_can_get_unsafe(descr, 0)) { - if (flags & RRD_PAGE_POPULATED) { - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); + + struct journal_page_header *page_list_header = (struct journal_page_header *) ((uint8_t *) j2_header + uuid_entry->page_offset); + struct journal_page_list *page_list = (struct journal_page_list *)((uint8_t *) page_list_header + sizeof(*page_list_header)); + struct journal_extent_list *extent_list = (void *)((uint8_t *)j2_header + j2_header->extent_offset); + uint32_t uuid_page_entries = page_list_header->entries; + + for (uint32_t index = 0; index < uuid_page_entries; index++) { + struct journal_page_list *page_entry_in_journal = &page_list[index]; + + time_t page_first_time_s = page_entry_in_journal->delta_start_s + journal_start_time_s; + time_t page_last_time_s = page_entry_in_journal->delta_end_s + journal_start_time_s; + + TIME_RANGE_COMPARE prc = is_page_in_time_range(page_first_time_s, page_last_time_s, wanted_start_time_s, wanted_end_time_s); + if(prc == PAGE_IS_IN_THE_PAST) continue; - } - } - if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { - preload_array[preload_count++] = descr; - if (PAGE_CACHE_MAX_PRELOAD_PAGES == preload_count) { - rrdeng_page_descr_mutex_unlock(ctx, descr); + + if(prc == PAGE_IS_IN_THE_FUTURE) break; + + time_t page_update_every_s = page_entry_in_journal->update_every_s; + size_t page_length = page_entry_in_journal->page_length; + + if(datafile_acquire(datafile, DATAFILE_ACQUIRE_OPEN_CACHE)) { //for open cache item + // add this page to open cache + bool added = false; + struct extent_io_data ei = { + .pos = extent_list[page_entry_in_journal->extent_index].datafile_offset, + .bytes = extent_list[page_entry_in_journal->extent_index].datafile_size, + .page_length = page_length, + .file = datafile->file, + .fileno = datafile->fileno, + }; + + PGC_PAGE *page = pgc_page_add_and_acquire(open_cache, (PGC_ENTRY) { + .hot = false, + .section = (Word_t) ctx, + .metric_id = metric_id, + .start_time_s = page_first_time_s, + .end_time_s = page_last_time_s, + .update_every_s = page_update_every_s, + .data = datafile, + .size = 0, + .custom_data = (uint8_t *) &ei, + }, &added); + + if(!added) + datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE); + + callback(page, callback_data); + + pgc_page_release(open_cache, page); + + pages_found++; } } - rrdeng_page_descr_mutex_unlock(ctx, descr); + journalfile_v2_data_release(datafile->journalfile); } - uv_rwlock_rdunlock(&page_index->lock); + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); - failed_to_reserve = 0; - for (i = 0 ; i < preload_count && !failed_to_reserve ; ++i) { - struct rrdeng_cmd cmd; - struct rrdeng_page_descr *next; + return pages_found; +} - descr = preload_array[i]; - if (NULL == descr) { - continue; - } - if (!pg_cache_try_reserve_pages(ctx, 1)) { - failed_to_reserve = 1; - break; - } - cmd.opcode = RRDENG_READ_EXTENT; - cmd.read_extent.page_cache_descr[0] = descr; - /* don't use this page again */ - preload_array[i] = NULL; - for (j = 0, k = 1 ; j < preload_count ; ++j) { - next = preload_array[j]; - if (NULL == next) { - continue; - } - if (descr->extent == next->extent) { - /* same extent, consolidate */ - if (!pg_cache_try_reserve_pages(ctx, 1)) { - failed_to_reserve = 1; - break; - } - cmd.read_extent.page_cache_descr[k++] = next; - /* don't use this page again */ - preload_array[j] = NULL; - } - } - cmd.read_extent.page_count = k; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); +void add_page_details_from_journal_v2(PGC_PAGE *page, void *JudyL_pptr) { + struct rrdengine_datafile *datafile = pgc_page_data(page); + + if(!datafile_acquire(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS)) // for pd + return; + + Pvoid_t *PValue = PDCJudyLIns(JudyL_pptr, pgc_page_start_time_s(page), PJE0); + if (!PValue || PValue == PJERR) + fatal("DBENGINE: corrupted judy array"); + + if (unlikely(*PValue)) { + datafile_release(datafile, DATAFILE_ACQUIRE_PAGE_DETAILS); + return; } - if (failed_to_reserve) { - debug(D_RRDENGINE, "%s: Failed to reserve enough memory, canceling I/O.", __func__); - for (i = 0 ; i < preload_count ; ++i) { - descr = preload_array[i]; - if (NULL == descr) { - continue; - } - pg_cache_put(ctx, descr); - } + + Word_t metric_id = pgc_page_metric(page); + + // let's add it to the judy + struct extent_io_data *ei = pgc_page_custom_data(open_cache, page); + struct page_details *pd = page_details_get(); + *PValue = pd; + + pd->datafile.extent.pos = ei->pos; + pd->datafile.extent.bytes = ei->bytes; + pd->datafile.file = ei->file; + pd->datafile.fileno = ei->fileno; + pd->first_time_s = pgc_page_start_time_s(page); + pd->last_time_s = pgc_page_end_time_s(page); + pd->datafile.ptr = datafile; + pd->page_length = ei->page_length; + pd->update_every_s = pgc_page_update_every_s(page); + pd->metric_id = metric_id; + pd->status |= PDC_PAGE_DISK_PENDING | PDC_PAGE_SOURCE_JOURNAL_V2 | PDC_PAGE_DATAFILE_ACQUIRED; +} + +// Return a judyL will all pages that have start_time_ut and end_time_ut +// Pvalue of the judy will be the end time for that page +// DBENGINE2: +#define time_delta(finish, pass) do { if(pass) { usec_t t = pass; (pass) = (finish) - (pass); (finish) = t; } } while(0) +static Pvoid_t get_page_list( + struct rrdengine_instance *ctx, + METRIC *metric, + usec_t start_time_ut, + usec_t end_time_ut, + size_t *pages_to_load, + time_t *optimal_end_time_s +) { + *optimal_end_time_s = 0; + + Pvoid_t JudyL_page_array = (Pvoid_t) NULL; + + time_t wanted_start_time_s = (time_t)(start_time_ut / USEC_PER_SEC); + time_t wanted_end_time_s = (time_t)(end_time_ut / USEC_PER_SEC); + + size_t pages_found_in_main_cache = 0, + pages_found_in_open_cache = 0, + pages_found_in_journals_v2 = 0, + pages_found_pass4 = 0, + pages_pending = 0, + pages_overlapping = 0, + pages_total = 0; + + size_t cache_gaps = 0, query_gaps = 0; + bool done_v2 = false, done_open = false; + + usec_t pass1_ut = 0, pass2_ut = 0, pass3_ut = 0, pass4_ut = 0; + + // -------------------------------------------------------------- + // PASS 1: Check what the main page cache has available + + pass1_ut = now_monotonic_usec(); + size_t pages_pass1 = get_page_list_from_pgc(main_cache, metric, ctx, wanted_start_time_s, wanted_end_time_s, + &JudyL_page_array, &cache_gaps, + false, PDC_PAGE_SOURCE_MAIN_CACHE); + query_gaps += cache_gaps; + pages_found_in_main_cache += pages_pass1; + pages_total += pages_pass1; + + if(pages_found_in_main_cache && !cache_gaps) { + query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, + &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, + optimal_end_time_s, false); + + if (pages_total && !query_gaps) + goto we_are_done; } - if (!preload_count) { - /* no such page */ - debug(D_RRDENGINE, "%s: No page was eligible to attempt preload.", __func__); + + // -------------------------------------------------------------- + // PASS 2: Check what the open journal page cache has available + // these will be loaded from disk + + pass2_ut = now_monotonic_usec(); + size_t pages_pass2 = get_page_list_from_pgc(open_cache, metric, ctx, wanted_start_time_s, wanted_end_time_s, + &JudyL_page_array, &cache_gaps, + true, PDC_PAGE_SOURCE_OPEN_CACHE); + query_gaps += cache_gaps; + pages_found_in_open_cache += pages_pass2; + pages_total += pages_pass2; + done_open = true; + + if(pages_found_in_open_cache) { + query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, + &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, + optimal_end_time_s, false); + + if (pages_total && !query_gaps) + goto we_are_done; } - if (unlikely(0 == count && page_info_arrayp)) { - freez(*page_info_arrayp); - *page_info_arrayp = NULL; + + // -------------------------------------------------------------- + // PASS 3: Check Journal v2 to fill the gaps + + pass3_ut = now_monotonic_usec(); + size_t pages_pass3 = get_page_list_from_journal_v2(ctx, metric, start_time_ut, end_time_ut, + add_page_details_from_journal_v2, &JudyL_page_array); + pages_found_in_journals_v2 += pages_pass3; + pages_total += pages_pass3; + done_v2 = true; + + // -------------------------------------------------------------- + // PASS 4: Check the cache again + // and calculate the time gaps in the query + // THIS IS REQUIRED AFTER JOURNAL V2 LOOKUP + + pass4_ut = now_monotonic_usec(); + query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, + &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, + optimal_end_time_s, true); + +we_are_done: + + if(pages_to_load) + *pages_to_load = pages_pending; + + usec_t finish_ut = now_monotonic_usec(); + time_delta(finish_ut, pass4_ut); + time_delta(finish_ut, pass3_ut); + time_delta(finish_ut, pass2_ut); + time_delta(finish_ut, pass1_ut); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_main_cache_lookup, pass1_ut, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_open_cache_lookup, pass2_ut, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_journal_v2_lookup, pass3_ut, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_in_pass4_lookup, pass4_ut, __ATOMIC_RELAXED); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_planned_with_gaps, (query_gaps) ? 1 : 0, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_open, done_open ? 1 : 0, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_journal_v2, done_v2 ? 1 : 0, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_total, pages_total, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_open_cache, pages_found_in_open_cache, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_journal_v2, pages_found_in_journals_v2, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache_at_pass4, pages_found_pass4, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_to_load_from_disk, pages_pending, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_overlapping_skipped, pages_overlapping, __ATOMIC_RELAXED); + + return JudyL_page_array; +} + +inline void rrdeng_prep_wait(PDC *pdc) { + if (unlikely(pdc && !pdc->prep_done)) { + usec_t started_ut = now_monotonic_usec(); + completion_wait_for(&pdc->prep_completion); + pdc->prep_done = true; + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_wait_for_prep, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED); } - return count; } -/* - * Searches for a page and gets a reference. - * When point_in_time is INVALID_TIME get any page. - * If index is NULL lookup by UUID (id). - */ -struct rrdeng_page_descr * - pg_cache_lookup(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t point_in_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - Word_t Index; - uint8_t page_not_in_cache; - - if (unlikely(NULL == index)) { - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - return NULL; - } - } else { - page_index = index; +void rrdeng_prep_query(PDC *pdc) { + size_t pages_to_load = 0; + pdc->page_list_JudyL = get_page_list(pdc->ctx, pdc->metric, + pdc->start_time_s * USEC_PER_SEC, + pdc->end_time_s * USEC_PER_SEC, + &pages_to_load, + &pdc->optimal_end_time_s); + + if (pages_to_load && pdc->page_list_JudyL) { + pdc_acquire(pdc); // we get 1 for the 1st worker in the chain: do_read_page_list_work() + usec_t start_ut = now_monotonic_usec(); +// if(likely(priority == STORAGE_PRIORITY_BEST_EFFORT)) +// dbengine_load_page_list_directly(ctx, handle->pdc); +// else + pdc_route_asynchronously(pdc->ctx, pdc); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_to_route, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); } - pg_cache_reserve_pages(ctx, 1); - - page_not_in_cache = 0; - uv_rwlock_rdlock(&page_index->lock); - while (1) { - Index = (Word_t)(point_in_time_ut / USEC_PER_SEC); - PValue = JudyLLast(page_index->JudyL_array, &Index, PJE0); - if (likely(NULL != PValue)) { - descr = *PValue; - } - if (NULL == PValue || - 0 == descr->page_length || - (INVALID_TIME != point_in_time_ut && - !is_point_in_time_in_page(descr, point_in_time_ut))) { - /* non-empty page not found */ - uv_rwlock_rdunlock(&page_index->lock); - - pg_cache_release_pages(ctx, 1); - return NULL; - } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - if ((flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 0)) { - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); - break; - } - if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { - struct rrdeng_cmd cmd; + else + completion_mark_complete(&pdc->page_completion); - uv_rwlock_rdunlock(&page_index->lock); + completion_mark_complete(&pdc->prep_completion); - cmd.opcode = RRDENG_READ_PAGE; - cmd.read_page.page_cache_descr = descr; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); + pdc_release_and_destroy_if_unreferenced(pdc, true, true); +} - debug(D_RRDENGINE, "%s: Waiting for page to be asynchronously read from disk:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - while (!(pg_cache_descr->flags & RRD_PAGE_POPULATED)) { - pg_cache_wait_event_unsafe(descr); - } - /* success */ - /* Downgrade exclusive reference to allow other readers */ - pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - return descr; - } - uv_rwlock_rdunlock(&page_index->lock); - debug(D_RRDENGINE, "%s: Waiting for page to be unlocked:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - if (!(flags & RRD_PAGE_POPULATED)) - page_not_in_cache = 1; - pg_cache_wait_event_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - - /* reset scan to find again */ - uv_rwlock_rdlock(&page_index->lock); - } - uv_rwlock_rdunlock(&page_index->lock); +/** + * Searches for pages in a time range and triggers disk I/O if necessary and possible. + * @param ctx DB context + * @param handle query handle as initialized + * @param start_time_ut inclusive starting time in usec + * @param end_time_ut inclusive ending time in usec + * @return 1 / 0 (pages found or not found) + */ +void pg_cache_preload(struct rrdeng_query_handle *handle) { + if (unlikely(!handle || !handle->metric)) + return; - if (!(flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_set_hot(ctx, descr); - pg_cache_release_pages(ctx, 1); - if (page_not_in_cache) - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - else - rrd_stat_atomic_add(&ctx->stats.pg_cache_hits, 1); - return descr; + __atomic_add_fetch(&handle->ctx->atomic.inflight_queries, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.currently_running_queries, 1, __ATOMIC_RELAXED); + handle->pdc = pdc_get(); + handle->pdc->metric = mrg_metric_dup(main_mrg, handle->metric); + handle->pdc->start_time_s = handle->start_time_s; + handle->pdc->end_time_s = handle->end_time_s; + handle->pdc->priority = handle->priority; + handle->pdc->optimal_end_time_s = handle->end_time_s; + handle->pdc->ctx = handle->ctx; + handle->pdc->refcount = 1; + netdata_spinlock_init(&handle->pdc->refcount_spinlock); + completion_init(&handle->pdc->prep_completion); + completion_init(&handle->pdc->page_completion); + + if(ctx_is_available_for_queries(handle->ctx)) { + handle->pdc->refcount++; // we get 1 for the query thread and 1 for the prep thread + rrdeng_enq_cmd(handle->ctx, RRDENG_OPCODE_QUERY, handle->pdc, NULL, handle->priority, NULL, NULL); + } + else { + completion_mark_complete(&handle->pdc->prep_completion); + completion_mark_complete(&handle->pdc->page_completion); + } } /* @@ -1088,226 +840,282 @@ struct rrdeng_page_descr * * start_time and end_time are inclusive. * If index is NULL lookup by UUID (id). */ -struct rrdeng_page_descr * -pg_cache_lookup_next(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t start_time_ut, usec_t end_time_ut) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = NULL; - struct page_cache_descr *pg_cache_descr = NULL; - unsigned long flags; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; - uint8_t page_not_in_cache; - - if (unlikely(NULL == index)) { - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, id, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; +struct pgc_page *pg_cache_lookup_next( + struct rrdengine_instance *ctx, + PDC *pdc, + time_t now_s, + time_t last_update_every_s, + size_t *entries +) { + if (unlikely(!pdc)) + return NULL; + + rrdeng_prep_wait(pdc); + + if (unlikely(!pdc->page_list_JudyL)) + return NULL; + + usec_t start_ut = now_monotonic_usec(); + size_t gaps = 0; + bool waited = false, preloaded; + PGC_PAGE *page = NULL; + + while(!page) { + bool page_from_pd = false; + preloaded = false; + struct page_details *pd = pdc_find_page_for_time( + pdc->page_list_JudyL, now_s, &gaps, + PDC_PAGE_PROCESSED, PDC_PAGE_EMPTY); + + if (!pd) + break; + + page = pd->page; + page_from_pd = true; + preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED); + if(!page) { + if(!completion_is_done(&pdc->page_completion)) { + page = pgc_page_get_and_acquire(main_cache, (Word_t)ctx, + pd->metric_id, pd->first_time_s, PGC_SEARCH_EXACT); + page_from_pd = false; + preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED); + } + + if(!page) { + pdc->completed_jobs = + completion_wait_for_a_job(&pdc->page_completion, pdc->completed_jobs); + + page = pd->page; + page_from_pd = true; + preloaded = pdc_page_status_check(pd, PDC_PAGE_PRELOADED); + waited = true; + } } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - if (NULL == PValue) { - return NULL; + + if(page && pgc_page_data(page) == DBENGINE_EMPTY_PAGE) + pdc_page_status_set(pd, PDC_PAGE_EMPTY); + + if(!page || pdc_page_status_check(pd, PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | PDC_PAGE_EMPTY)) { + page = NULL; + continue; } - } else { - page_index = index; - } - pg_cache_reserve_pages(ctx, 1); - - page_not_in_cache = 0; - uv_rwlock_rdlock(&page_index->lock); - int retry_count = 0; - while (1) { - descr = find_first_page_in_time_range(page_index, start_time_ut, end_time_ut); - if (NULL == descr || 0 == descr->page_length || retry_count == default_rrdeng_page_fetch_retries) { - /* non-empty page not found */ - if (retry_count == default_rrdeng_page_fetch_retries) - error_report("Page cache timeout while waiting for page %p : returning FAIL", descr); - uv_rwlock_rdunlock(&page_index->lock); - - pg_cache_release_pages(ctx, 1); - return NULL; + + // we now have page and is not empty + + time_t page_start_time_s = pgc_page_start_time_s(page); + time_t page_end_time_s = pgc_page_end_time_s(page); + time_t page_update_every_s = pgc_page_update_every_s(page); + size_t page_length = pgc_page_data_size(main_cache, page); + + if(unlikely(page_start_time_s == INVALID_TIME || page_end_time_s == INVALID_TIME)) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_zero_time_skipped, 1, __ATOMIC_RELAXED); + pgc_page_to_clean_evict_or_release(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_INVALID | PDC_PAGE_RELEASED); + pd->page = page = NULL; + continue; } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - flags = pg_cache_descr->flags; - if ((flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 0)) { - /* success */ - rrdeng_page_descr_mutex_unlock(ctx, descr); - debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); - break; + else if(page_length > RRDENG_BLOCK_SIZE) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_size_skipped, 1, __ATOMIC_RELAXED); + pgc_page_to_clean_evict_or_release(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_INVALID | PDC_PAGE_RELEASED); + pd->page = page = NULL; + continue; } - if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { - struct rrdeng_cmd cmd; + else { + if (unlikely(page_update_every_s <= 0 || page_update_every_s > 86400)) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_update_every_fixed, 1, __ATOMIC_RELAXED); + pd->update_every_s = page_update_every_s = pgc_page_fix_update_every(page, last_update_every_s); + } - uv_rwlock_rdunlock(&page_index->lock); + size_t entries_by_size = page_entries_by_size(page_length, CTX_POINT_SIZE_BYTES(ctx)); + size_t entries_by_time = page_entries_by_time(page_start_time_s, page_end_time_s, page_update_every_s); + if(unlikely(entries_by_size < entries_by_time)) { + time_t fixed_page_end_time_s = (time_t)(page_start_time_s + (entries_by_size - 1) * page_update_every_s); + pd->last_time_s = page_end_time_s = pgc_page_fix_end_time_s(page, fixed_page_end_time_s); + entries_by_time = (page_end_time_s - (page_start_time_s - page_update_every_s)) / page_update_every_s; - cmd.opcode = RRDENG_READ_PAGE; - cmd.read_page.page_cache_descr = descr; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); + internal_fatal(entries_by_size != entries_by_time, "DBENGINE: wrong entries by time again!"); - debug(D_RRDENGINE, "%s: Waiting for page to be asynchronously read from disk:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - while (!(pg_cache_descr->flags & RRD_PAGE_POPULATED)) { - pg_cache_wait_event_unsafe(descr); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_entries_fixed, 1, __ATOMIC_RELAXED); } - /* success */ - /* Downgrade exclusive reference to allow other readers */ - pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - return descr; + *entries = entries_by_time; } - uv_rwlock_rdunlock(&page_index->lock); - debug(D_RRDENGINE, "%s: Waiting for page to be unlocked:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - if (!(flags & RRD_PAGE_POPULATED)) - page_not_in_cache = 1; - - if (pg_cache_timedwait_event_unsafe(descr, default_rrdeng_page_fetch_timeout) == UV_ETIMEDOUT) { - error_report("Page cache timeout while waiting for page %p : retry count = %d", descr, retry_count); - ++retry_count; + + if(unlikely(page_end_time_s < now_s)) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_past_time_skipped, 1, __ATOMIC_RELAXED); + pgc_page_release(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_SKIP | PDC_PAGE_RELEASED); + pd->page = page = NULL; + continue; } - rrdeng_page_descr_mutex_unlock(ctx, descr); - /* reset scan to find again */ - uv_rwlock_rdlock(&page_index->lock); + if(page_from_pd) + // PDC_PAGE_RELEASED is for pdc_destroy() to not release the page twice - the caller will release it + pdc_page_status_set(pd, PDC_PAGE_RELEASED | PDC_PAGE_PROCESSED); + else + pdc_page_status_set(pd, PDC_PAGE_PROCESSED); } - uv_rwlock_rdunlock(&page_index->lock); - if (!(flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_set_hot(ctx, descr); - pg_cache_release_pages(ctx, 1); - if (page_not_in_cache) - rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); - else - rrd_stat_atomic_add(&ctx->stats.pg_cache_hits, 1); - return descr; -} + if(gaps && !pdc->executed_with_gaps) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.queries_executed_with_gaps, 1, __ATOMIC_RELAXED); + pdc->executed_with_gaps = +gaps; -struct pg_cache_page_index *create_page_index(uuid_t *id, struct rrdengine_instance *ctx) -{ - struct pg_cache_page_index *page_index; - - page_index = mallocz(sizeof(*page_index)); - page_index->JudyL_array = (Pvoid_t) NULL; - uuid_copy(page_index->id, *id); - fatal_assert(0 == uv_rwlock_init(&page_index->lock)); - page_index->oldest_time_ut = INVALID_TIME; - page_index->latest_time_ut = INVALID_TIME; - page_index->prev = NULL; - page_index->page_count = 0; - page_index->refcount = 0; - page_index->writers = 0; - page_index->ctx = ctx; - page_index->latest_update_every_s = default_rrd_update_every; - - return page_index; -} + if(page) { + if(waited) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_wait_loaded, 1, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_nowait_loaded, 1, __ATOMIC_RELAXED); + } + else { + if(waited) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_wait_failed, 1, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.page_next_nowait_failed, 1, __ATOMIC_RELAXED); + } -static void init_metrics_index(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + if(waited) { + if(preloaded) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_slow_preload_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_slow_disk_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + } + else { + if(preloaded) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_fast_preload_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_to_fast_disk_next_page, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); + } - pg_cache->metrics_index.JudyHS_array = (Pvoid_t) NULL; - pg_cache->metrics_index.last_page_index = NULL; - fatal_assert(0 == uv_rwlock_init(&pg_cache->metrics_index.lock)); + return page; } -static void init_replaceQ(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; +void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s, time_t end_time_s, time_t update_every_s, + struct rrdengine_datafile *datafile, uint64_t extent_offset, unsigned extent_size, uint32_t page_length) { + + if(!datafile_acquire(datafile, DATAFILE_ACQUIRE_OPEN_CACHE)) // for open cache item + fatal("DBENGINE: cannot acquire datafile to put page in open cache"); + + struct extent_io_data ext_io_data = { + .file = datafile->file, + .fileno = datafile->fileno, + .pos = extent_offset, + .bytes = extent_size, + .page_length = page_length + }; + + PGC_ENTRY page_entry = { + .hot = true, + .section = section, + .metric_id = metric_id, + .start_time_s = start_time_s, + .end_time_s = end_time_s, + .update_every_s = update_every_s, + .size = 0, + .data = datafile, + .custom_data = (uint8_t *) &ext_io_data, + }; + + internal_fatal(!datafile->fileno, "DBENGINE: datafile supplied does not have a number"); + + bool added = true; + PGC_PAGE *page = pgc_page_add_and_acquire(open_cache, page_entry, &added); + int tries = 100; + while(!added && page_entry.end_time_s > pgc_page_end_time_s(page) && tries--) { + pgc_page_to_clean_evict_or_release(open_cache, page); + page = pgc_page_add_and_acquire(open_cache, page_entry, &added); + } - pg_cache->replaceQ.head = NULL; - pg_cache->replaceQ.tail = NULL; - fatal_assert(0 == uv_rwlock_init(&pg_cache->replaceQ.lock)); -} + if(!added) { + datafile_release(datafile, DATAFILE_ACQUIRE_OPEN_CACHE); -static void init_committed_page_index(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; + internal_fatal(page_entry.end_time_s > pgc_page_end_time_s(page), + "DBENGINE: cannot add longer page to open cache"); + } - pg_cache->committed_page_index.JudyL_array = (Pvoid_t) NULL; - fatal_assert(0 == uv_rwlock_init(&pg_cache->committed_page_index.lock)); - pg_cache->committed_page_index.latest_corr_id = 0; - pg_cache->committed_page_index.nr_committed_pages = 0; + pgc_page_release(open_cache, (PGC_PAGE *)page); } -void init_page_cache(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; +size_t dynamic_open_cache_size(void) { + size_t main_cache_size = pgc_get_wanted_cache_size(main_cache); + size_t target_size = main_cache_size / 100 * 5; - pg_cache->page_descriptors = 0; - pg_cache->populated_pages = 0; - fatal_assert(0 == uv_rwlock_init(&pg_cache->pg_cache_rwlock)); + if(target_size < 2 * 1024 * 1024) + target_size = 2 * 1024 * 1024; - init_metrics_index(ctx); - init_replaceQ(ctx); - init_committed_page_index(ctx); + return target_size; } -void free_page_cache(struct rrdengine_instance *ctx) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index, *prev_page_index; - Word_t Index; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - - // if we are exiting, the OS will recover all memory so do not slow down the shutdown process - // Do the cleanup if we are compiling with NETDATA_INTERNAL_CHECKS - // This affects the reporting of dbengine statistics which are available in real time - // via the /api/v1/dbengine_stats endpoint -#ifndef NETDATA_DBENGINE_FREE - if (netdata_exit) - return; -#endif - Word_t metrics_index_bytes = 0, pages_index_bytes = 0, pages_dirty_index_bytes = 0; - - /* Free committed page index */ - pages_dirty_index_bytes = JudyLFreeArray(&pg_cache->committed_page_index.JudyL_array, PJE0); - fatal_assert(NULL == pg_cache->committed_page_index.JudyL_array); - - for (page_index = pg_cache->metrics_index.last_page_index ; - page_index != NULL ; - page_index = prev_page_index) { +size_t dynamic_extent_cache_size(void) { + size_t main_cache_size = pgc_get_wanted_cache_size(main_cache); + size_t target_size = main_cache_size / 100 * 5; - prev_page_index = page_index->prev; + if(target_size < 3 * 1024 * 1024) + target_size = 3 * 1024 * 1024; - /* Find first page in range */ - Index = (Word_t) 0; - PValue = JudyLFirst(page_index->JudyL_array, &Index, PJE0); - descr = unlikely(NULL == PValue) ? NULL : *PValue; - - while (descr != NULL) { - /* Iterate all page descriptors of this metric */ + return target_size; +} - if (descr->pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED) { - /* Check rrdenglocking.c */ - pg_cache_descr = descr->pg_cache_descr; - if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - dbengine_page_free(pg_cache_descr->page); - } - rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); - } - rrdeng_page_descr_freez(descr); +void pgc_and_mrg_initialize(void) +{ + main_mrg = mrg_create(); - PValue = JudyLNext(page_index->JudyL_array, &Index, PJE0); - descr = unlikely(NULL == PValue) ? NULL : *PValue; - } + size_t target_cache_size = (size_t)default_rrdeng_page_cache_mb * 1024ULL * 1024ULL; + size_t main_cache_size = (target_cache_size / 100) * 95; + size_t open_cache_size = 0; + size_t extent_cache_size = (target_cache_size / 100) * 5; - /* Free page index */ - pages_index_bytes += JudyLFreeArray(&page_index->JudyL_array, PJE0); - fatal_assert(NULL == page_index->JudyL_array); - freez(page_index); + if(extent_cache_size < 3 * 1024 * 1024) { + extent_cache_size = 3 * 1024 * 1024; + main_cache_size = target_cache_size - extent_cache_size; } - /* Free metrics index */ - metrics_index_bytes = JudyHSFreeArray(&pg_cache->metrics_index.JudyHS_array, PJE0); - fatal_assert(NULL == pg_cache->metrics_index.JudyHS_array); - info("Freed %lu bytes of memory from page cache.", pages_dirty_index_bytes + pages_index_bytes + metrics_index_bytes); + + main_cache = pgc_create( + "main_cache", + main_cache_size, + main_cache_free_clean_page_callback, + (size_t) rrdeng_pages_per_extent, + main_cache_flush_dirty_page_init_callback, + main_cache_flush_dirty_page_callback, + 10, + 10240, // if there are that many threads, evict so many at once! + 1000, // + 5, // don't delay too much other threads + PGC_OPTIONS_AUTOSCALE, // AUTOSCALE = 2x max hot pages + 0, // 0 = as many as the system cpus + 0 + ); + + open_cache = pgc_create( + "open_cache", + open_cache_size, // the default is 1MB + open_cache_free_clean_page_callback, + 1, + NULL, + open_cache_flush_dirty_page_callback, + 10, + 10240, // if there are that many threads, evict that many at once! + 1000, // + 3, // don't delay too much other threads + PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE, + 0, // 0 = as many as the system cpus + sizeof(struct extent_io_data) + ); + pgc_set_dynamic_target_cache_size_callback(open_cache, dynamic_open_cache_size); + + extent_cache = pgc_create( + "extent_cache", + extent_cache_size, + extent_cache_free_clean_page_callback, + 1, + NULL, + extent_cache_flush_dirty_page_callback, + 5, + 10, // it will lose up to that extents at once! + 100, // + 2, // don't delay too much other threads + PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_INLINE | PGC_OPTIONS_FLUSH_PAGES_INLINE, + 0, // 0 = as many as the system cpus + 0 + ); + pgc_set_dynamic_target_cache_size_callback(extent_cache, dynamic_extent_cache_size); } diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index 635b02123..9ab7db078 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -5,66 +5,34 @@ #include "rrdengine.h" +extern struct mrg *main_mrg; +extern struct pgc *main_cache; +extern struct pgc *open_cache; +extern struct pgc *extent_cache; + /* Forward declarations */ struct rrdengine_instance; -struct extent_info; -struct rrdeng_page_descr; #define INVALID_TIME (0) #define MAX_PAGE_CACHE_FETCH_RETRIES (3) #define PAGE_CACHE_FETCH_WAIT_TIMEOUT (3) -/* Page flags */ -#define RRD_PAGE_DIRTY (1LU << 0) -#define RRD_PAGE_LOCKED (1LU << 1) -#define RRD_PAGE_READ_PENDING (1LU << 2) -#define RRD_PAGE_WRITE_PENDING (1LU << 3) -#define RRD_PAGE_POPULATED (1LU << 4) - -struct page_cache_descr { - struct rrdeng_page_descr *descr; /* parent descriptor */ - void *page; - unsigned long flags; - struct page_cache_descr *prev; /* LRU */ - struct page_cache_descr *next; /* LRU */ - - unsigned refcnt; - uv_mutex_t mutex; /* always take it after the page cache lock or after the commit lock */ - uv_cond_t cond; - unsigned waiters; -}; - -/* Page cache descriptor flags, state = 0 means no descriptor */ -#define PG_CACHE_DESCR_ALLOCATED (1LU << 0) -#define PG_CACHE_DESCR_DESTROY (1LU << 1) -#define PG_CACHE_DESCR_LOCKED (1LU << 2) -#define PG_CACHE_DESCR_SHIFT (3) -#define PG_CACHE_DESCR_USERS_MASK (((unsigned long)-1) << PG_CACHE_DESCR_SHIFT) -#define PG_CACHE_DESCR_FLAGS_MASK (((unsigned long)-1) >> (BITS_PER_ULONG - PG_CACHE_DESCR_SHIFT)) +extern struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats; -/* - * Page cache descriptor state bits (works for both 32-bit and 64-bit architectures): - * - * 63 ... 31 ... 3 | 2 | 1 | 0| - * -----------------------------+------------+------------+-----------| - * number of descriptor users | DESTROY | LOCKED | ALLOCATED | - */ -struct rrdeng_page_descr { - uuid_t *id; /* never changes */ - struct extent_info *extent; - - /* points to ephemeral page cache descriptor if the page resides in the cache */ - struct page_cache_descr *pg_cache_descr; - - /* Compare-And-Swap target for page cache descriptor allocation algorithm */ - volatile unsigned long pg_cache_descr_state; - - /* page information */ +struct page_descr_with_data { + uuid_t *id; + Word_t metric_id; usec_t start_time_ut; usec_t end_time_ut; - uint32_t update_every_s:24; uint8_t type; + uint32_t update_every_s; uint32_t page_length; + uint8_t *page; + + struct { + struct page_descr_with_data *prev; + struct page_descr_with_data *next; + } link; }; #define PAGE_INFO_SCRATCH_SZ (8) @@ -76,179 +44,21 @@ struct rrdeng_page_info { uint32_t page_length; }; -/* returns 1 for success, 0 for failure */ -typedef int pg_cache_page_info_filter_t(struct rrdeng_page_descr *); - -#define PAGE_CACHE_MAX_PRELOAD_PAGES (256) - struct pg_alignment { - uint32_t page_length; + uint32_t page_position; uint32_t refcount; + uint16_t initial_slots; }; -/* maps time ranges to pages */ -struct pg_cache_page_index { - uuid_t id; - /* - * care: JudyL_array indices are converted from useconds to seconds to fit in one word in 32-bit architectures - * TODO: examine if we want to support better granularity than seconds - */ - Pvoid_t JudyL_array; - Word_t page_count; - unsigned short refcount; - unsigned short writers; - uv_rwlock_t lock; - - /* - * Only one effective writer, data deletion workqueue. - * It's also written during the DB loading phase. - */ - usec_t oldest_time_ut; - - /* - * Only one effective writer, data collection thread. - * It's also written by the data deletion workqueue when data collection is disabled for this metric. - */ - usec_t latest_time_ut; - - struct rrdengine_instance *ctx; - uint32_t latest_update_every_s; - - struct pg_cache_page_index *prev; -}; - -/* maps UUIDs to page indices */ -struct pg_cache_metrics_index { - uv_rwlock_t lock; - Pvoid_t JudyHS_array; - struct pg_cache_page_index *last_page_index; -}; - -/* gathers dirty pages to be written on disk */ -struct pg_cache_committed_page_index { - uv_rwlock_t lock; - - Pvoid_t JudyL_array; - - /* - * Dirty page correlation ID is a hint. Dirty pages that are correlated should have - * a small correlation ID difference. Dirty pages in memory should never have the - * same ID at the same time for correctness. - */ - Word_t latest_corr_id; - - unsigned nr_committed_pages; -}; - -/* - * Gathers populated pages to be evicted. - * Relies on page cache descriptors being there as it uses their memory. - */ -struct pg_cache_replaceQ { - uv_rwlock_t lock; /* LRU lock */ - - struct page_cache_descr *head; /* LRU */ - struct page_cache_descr *tail; /* MRU */ -}; - -struct page_cache { /* TODO: add statistics */ - uv_rwlock_t pg_cache_rwlock; /* page cache lock */ - - struct pg_cache_metrics_index metrics_index; - struct pg_cache_committed_page_index committed_page_index; - struct pg_cache_replaceQ replaceQ; - - unsigned page_descriptors; - unsigned populated_pages; -}; - -void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_descr *descr); -void pg_cache_wake_up_waiters(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void pg_cache_wait_event_unsafe(struct rrdeng_page_descr *descr); -unsigned long pg_cache_wait_event(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr); -void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr); -void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, - struct rrdeng_page_descr *descr); -struct rrdeng_page_descr *pg_cache_create_descr(void); -int pg_cache_try_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access); -void pg_cache_put_unsafe(struct rrdeng_page_descr *descr); -void pg_cache_put(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void pg_cache_insert(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, - struct rrdeng_page_descr *descr); -uint8_t pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, - uint8_t remove_dirty, uint8_t is_exclusive_holder, uuid_t *metric_id); -usec_t pg_cache_oldest_time_in_range(struct rrdengine_instance *ctx, uuid_t *id, - usec_t start_time_ut, usec_t end_time_ut); -void pg_cache_get_filtered_info_prev(struct rrdengine_instance *ctx, struct pg_cache_page_index *page_index, - usec_t point_in_time_ut, pg_cache_page_info_filter_t *filter, - struct rrdeng_page_info *page_info); -struct rrdeng_page_descr *pg_cache_lookup_unpopulated_and_lock(struct rrdengine_instance *ctx, uuid_t *id, - usec_t start_time_ut); -unsigned - pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time_ut, usec_t end_time_ut, - struct rrdeng_page_info **page_info_arrayp, struct pg_cache_page_index **ret_page_indexp); -struct rrdeng_page_descr * - pg_cache_lookup(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t point_in_time_ut); -struct rrdeng_page_descr * - pg_cache_lookup_next(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, - usec_t start_time_ut, usec_t end_time_ut); -struct pg_cache_page_index *create_page_index(uuid_t *id, struct rrdengine_instance *ctx); -void init_page_cache(struct rrdengine_instance *ctx); -void free_page_cache(struct rrdengine_instance *ctx); -void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr); -void pg_cache_update_metric_times(struct pg_cache_page_index *page_index); -unsigned long pg_cache_hard_limit(struct rrdengine_instance *ctx); -unsigned long pg_cache_soft_limit(struct rrdengine_instance *ctx); -unsigned long pg_cache_committed_hard_limit(struct rrdengine_instance *ctx); - -void rrdeng_page_descr_aral_go_singlethreaded(void); -void rrdeng_page_descr_aral_go_multithreaded(void); -void rrdeng_page_descr_use_malloc(void); -void rrdeng_page_descr_use_mmap(void); -bool rrdeng_page_descr_is_mmap(void); -struct rrdeng_page_descr *rrdeng_page_descr_mallocz(void); -void rrdeng_page_descr_freez(struct rrdeng_page_descr *descr); - -static inline void - pg_cache_atomic_get_pg_info(struct rrdeng_page_descr *descr, usec_t *end_time_ut_p, uint32_t *page_lengthp) -{ - usec_t end_time_ut, old_end_time_ut; - uint32_t page_length; - - if (NULL == descr->extent) { - /* this page is currently being modified, get consistent info locklessly */ - do { - end_time_ut = descr->end_time_ut; - __sync_synchronize(); - old_end_time_ut = end_time_ut; - page_length = descr->page_length; - __sync_synchronize(); - end_time_ut = descr->end_time_ut; - __sync_synchronize(); - } while ((end_time_ut != old_end_time_ut || (end_time_ut & 1) != 0)); +struct rrdeng_query_handle; +struct page_details_control; - *end_time_ut_p = end_time_ut; - *page_lengthp = page_length; - } else { - *end_time_ut_p = descr->end_time_ut; - *page_lengthp = descr->page_length; - } -} +void rrdeng_prep_wait(struct page_details_control *pdc); +void rrdeng_prep_query(struct page_details_control *pdc); +void pg_cache_preload(struct rrdeng_query_handle *handle); +struct pgc_page *pg_cache_lookup_next(struct rrdengine_instance *ctx, struct page_details_control *pdc, time_t now_s, time_t last_update_every_s, size_t *entries); +void pgc_and_mrg_initialize(void); -/* The caller must hold a reference to the page and must have already set the new data */ -static inline void pg_cache_atomic_set_pg_info(struct rrdeng_page_descr *descr, usec_t end_time_ut, uint32_t page_length) -{ - fatal_assert(!(end_time_ut & 1)); - __sync_synchronize(); - descr->end_time_ut |= 1; /* mark start of uncertainty period by adding 1 microsecond */ - __sync_synchronize(); - descr->page_length = page_length; - __sync_synchronize(); - descr->end_time_ut = end_time_ut; /* mark end of uncertainty period */ -} +void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s, time_t end_time_s, time_t update_every_s, struct rrdengine_datafile *datafile, uint64_t extent_offset, unsigned extent_size, uint32_t page_length); #endif /* NETDATA_PAGECACHE_H */ diff --git a/database/engine/pdc.c b/database/engine/pdc.c new file mode 100644 index 000000000..8b8e71958 --- /dev/null +++ b/database/engine/pdc.c @@ -0,0 +1,1282 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#define NETDATA_RRD_INTERNALS +#include "pdc.h" + +struct extent_page_details_list { + uv_file file; + uint64_t extent_offset; + uint32_t extent_size; + unsigned number_of_pages_in_JudyL; + Pvoid_t page_details_by_metric_id_JudyL; + struct page_details_control *pdc; + struct rrdengine_datafile *datafile; + + struct rrdeng_cmd *cmd; + bool head_to_datafile_extent_queries_pending_for_extent; + + struct { + struct extent_page_details_list *prev; + struct extent_page_details_list *next; + } query; +}; + +typedef struct datafile_extent_offset_list { + uv_file file; + unsigned fileno; + Pvoid_t extent_pd_list_by_extent_offset_JudyL; +} DEOL; + +// ---------------------------------------------------------------------------- +// PDC cache + +static struct { + struct { + ARAL *ar; + } pdc; + + struct { + ARAL *ar; + } pd; + + struct { + ARAL *ar; + } epdl; + + struct { + ARAL *ar; + } deol; +} pdc_globals = {}; + +void pdc_init(void) { + pdc_globals.pdc.ar = aral_create( + "dbengine-pdc", + sizeof(PDC), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +PDC *pdc_get(void) { + PDC *pdc = aral_mallocz(pdc_globals.pdc.ar); + memset(pdc, 0, sizeof(PDC)); + return pdc; +} + +static void pdc_release(PDC *pdc) { + aral_freez(pdc_globals.pdc.ar, pdc); +} + +size_t pdc_cache_size(void) { + return aral_overhead(pdc_globals.pdc.ar) + aral_structures(pdc_globals.pdc.ar); +} + +// ---------------------------------------------------------------------------- +// PD cache + +void page_details_init(void) { + pdc_globals.pd.ar = aral_create( + "dbengine-pd", + sizeof(struct page_details), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +struct page_details *page_details_get(void) { + struct page_details *pd = aral_mallocz(pdc_globals.pd.ar); + memset(pd, 0, sizeof(struct page_details)); + return pd; +} + +static void page_details_release(struct page_details *pd) { + aral_freez(pdc_globals.pd.ar, pd); +} + +size_t pd_cache_size(void) { + return aral_overhead(pdc_globals.pd.ar) + aral_structures(pdc_globals.pd.ar); +} + +// ---------------------------------------------------------------------------- +// epdl cache + +void epdl_init(void) { + pdc_globals.epdl.ar = aral_create( + "dbengine-epdl", + sizeof(EPDL), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +static EPDL *epdl_get(void) { + EPDL *epdl = aral_mallocz(pdc_globals.epdl.ar); + memset(epdl, 0, sizeof(EPDL)); + return epdl; +} + +static void epdl_release(EPDL *epdl) { + aral_freez(pdc_globals.epdl.ar, epdl); +} + +size_t epdl_cache_size(void) { + return aral_overhead(pdc_globals.epdl.ar) + aral_structures(pdc_globals.epdl.ar); +} + +// ---------------------------------------------------------------------------- +// deol cache + +void deol_init(void) { + pdc_globals.deol.ar = aral_create( + "dbengine-deol", + sizeof(DEOL), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); +} + +static DEOL *deol_get(void) { + DEOL *deol = aral_mallocz(pdc_globals.deol.ar); + memset(deol, 0, sizeof(DEOL)); + return deol; +} + +static void deol_release(DEOL *deol) { + aral_freez(pdc_globals.deol.ar, deol); +} + +size_t deol_cache_size(void) { + return aral_overhead(pdc_globals.deol.ar) + aral_structures(pdc_globals.deol.ar); +} + +// ---------------------------------------------------------------------------- +// extent with buffer cache + +static struct { + struct { + SPINLOCK spinlock; + struct extent_buffer *available_items; + size_t available; + } protected; + + struct { + size_t allocated; + size_t allocated_bytes; + } atomics; + + size_t max_size; + +} extent_buffer_globals = { + .protected = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .available_items = NULL, + .available = 0, + }, + .atomics = { + .allocated = 0, + .allocated_bytes = 0, + }, + .max_size = MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE, +}; + +void extent_buffer_init(void) { + size_t max_extent_uncompressed = MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE; + size_t max_size = (size_t)LZ4_compressBound(MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE); + if(max_size < max_extent_uncompressed) + max_size = max_extent_uncompressed; + + extent_buffer_globals.max_size = max_size; +} + +void extent_buffer_cleanup1(void) { + struct extent_buffer *item = NULL; + + if(!netdata_spinlock_trylock(&extent_buffer_globals.protected.spinlock)) + return; + + if(extent_buffer_globals.protected.available_items && extent_buffer_globals.protected.available > 1) { + item = extent_buffer_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, item, cache.prev, cache.next); + extent_buffer_globals.protected.available--; + } + + netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + + if(item) { + size_t bytes = sizeof(struct extent_buffer) + item->bytes; + freez(item); + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED); + } +} + +struct extent_buffer *extent_buffer_get(size_t size) { + internal_fatal(size > extent_buffer_globals.max_size, "DBENGINE: extent size is too big"); + + struct extent_buffer *eb = NULL; + + if(size < extent_buffer_globals.max_size) + size = extent_buffer_globals.max_size; + + netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock); + if(likely(extent_buffer_globals.protected.available_items)) { + eb = extent_buffer_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next); + extent_buffer_globals.protected.available--; + } + netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + + if(unlikely(eb && eb->bytes < size)) { + size_t bytes = sizeof(struct extent_buffer) + eb->bytes; + freez(eb); + eb = NULL; + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED); + } + + if(unlikely(!eb)) { + size_t bytes = sizeof(struct extent_buffer) + size; + eb = mallocz(bytes); + eb->bytes = size; + __atomic_add_fetch(&extent_buffer_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&extent_buffer_globals.atomics.allocated_bytes, bytes, __ATOMIC_RELAXED); + } + + return eb; +} + +void extent_buffer_release(struct extent_buffer *eb) { + if(unlikely(!eb)) return; + + netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next); + extent_buffer_globals.protected.available++; + netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); +} + +size_t extent_buffer_cache_size(void) { + return __atomic_load_n(&extent_buffer_globals.atomics.allocated_bytes, __ATOMIC_RELAXED); +} + +// ---------------------------------------------------------------------------- +// epdl logic + +static void epdl_destroy(EPDL *epdl) +{ + Pvoid_t *pd_by_start_time_s_JudyL; + Word_t metric_id_index = 0; + bool metric_id_first = true; + while ((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext( + epdl->page_details_by_metric_id_JudyL, + &metric_id_index, &metric_id_first))) + PDCJudyLFreeArray(pd_by_start_time_s_JudyL, PJE0); + + PDCJudyLFreeArray(&epdl->page_details_by_metric_id_JudyL, PJE0); + epdl_release(epdl); +} + +static void epdl_mark_all_not_loaded_pages_as_failed(EPDL *epdl, PDC_PAGE_STATUS tags, size_t *statistics_counter) +{ + size_t pages_matched = 0; + + Word_t metric_id_index = 0; + bool metric_id_first = true; + Pvoid_t *pd_by_start_time_s_JudyL; + while((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(epdl->page_details_by_metric_id_JudyL, &metric_id_index, &metric_id_first))) { + + Word_t start_time_index = 0; + bool start_time_first = true; + Pvoid_t *PValue; + while ((PValue = PDCJudyLFirstThenNext(*pd_by_start_time_s_JudyL, &start_time_index, &start_time_first))) { + struct page_details *pd = *PValue; + + if(!pd->page && !pdc_page_status_check(pd, PDC_PAGE_FAILED|PDC_PAGE_READY)) { + pdc_page_status_set(pd, PDC_PAGE_FAILED | tags); + pages_matched++; + } + } + } + + if(pages_matched && statistics_counter) + __atomic_add_fetch(statistics_counter, pages_matched, __ATOMIC_RELAXED); +} +/* +static bool epdl_check_if_pages_are_already_in_cache(struct rrdengine_instance *ctx, EPDL *epdl, PDC_PAGE_STATUS tags) +{ + size_t count_remaining = 0; + size_t found = 0; + + Word_t metric_id_index = 0; + bool metric_id_first = true; + Pvoid_t *pd_by_start_time_s_JudyL; + while((pd_by_start_time_s_JudyL = PDCJudyLFirstThenNext(epdl->page_details_by_metric_id_JudyL, &metric_id_index, &metric_id_first))) { + + Word_t start_time_index = 0; + bool start_time_first = true; + Pvoid_t *PValue; + while ((PValue = PDCJudyLFirstThenNext(*pd_by_start_time_s_JudyL, &start_time_index, &start_time_first))) { + struct page_details *pd = *PValue; + if (pd->page) + continue; + + pd->page = pgc_page_get_and_acquire(main_cache, (Word_t) ctx, pd->metric_id, pd->first_time_s, PGC_SEARCH_EXACT); + if (pd->page) { + found++; + pdc_page_status_set(pd, PDC_PAGE_READY | tags); + } + else + count_remaining++; + } + } + + if(found) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_preloaded, found, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, found, __ATOMIC_RELAXED); + } + + return count_remaining == 0; +} +*/ + +// ---------------------------------------------------------------------------- +// PDC logic + +static void pdc_destroy(PDC *pdc) { + mrg_metric_release(main_mrg, pdc->metric); + completion_destroy(&pdc->prep_completion); + completion_destroy(&pdc->page_completion); + + Pvoid_t *PValue; + struct page_details *pd; + Word_t time_index = 0; + bool first_then_next = true; + size_t unroutable = 0, cancelled = 0; + while((PValue = PDCJudyLFirstThenNext(pdc->page_list_JudyL, &time_index, &first_then_next))) { + pd = *PValue; + + // no need for atomics here - we are done... + PDC_PAGE_STATUS status = pd->status; + + if(status & PDC_PAGE_DATAFILE_ACQUIRED) { + datafile_release(pd->datafile.ptr, DATAFILE_ACQUIRE_PAGE_DETAILS); + pd->datafile.ptr = NULL; + } + + internal_fatal(pd->datafile.ptr, "DBENGINE: page details has a datafile.ptr that is not released."); + + if(!pd->page && !(status & (PDC_PAGE_READY | PDC_PAGE_FAILED | PDC_PAGE_RELEASED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_CANCELLED))) { + // pdc_page_status_set(pd, PDC_PAGE_FAILED); + unroutable++; + } + else if(!pd->page && (status & PDC_PAGE_CANCELLED)) + cancelled++; + + if(pd->page && !(status & PDC_PAGE_RELEASED)) { + pgc_page_release(main_cache, pd->page); + // pdc_page_status_set(pd, PDC_PAGE_RELEASED); + } + + page_details_release(pd); + } + + PDCJudyLFreeArray(&pdc->page_list_JudyL, PJE0); + + __atomic_sub_fetch(&rrdeng_cache_efficiency_stats.currently_running_queries, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&pdc->ctx->atomic.inflight_queries, 1, __ATOMIC_RELAXED); + pdc_release(pdc); + + if(unroutable) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_unroutable, unroutable, __ATOMIC_RELAXED); + + if(cancelled) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_cancelled, cancelled, __ATOMIC_RELAXED); +} + +void pdc_acquire(PDC *pdc) { + netdata_spinlock_lock(&pdc->refcount_spinlock); + + if(pdc->refcount < 1) + fatal("DBENGINE: pdc is not referenced and cannot be acquired"); + + pdc->refcount++; + netdata_spinlock_unlock(&pdc->refcount_spinlock); +} + +bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router __maybe_unused) { + if(unlikely(!pdc)) + return true; + + netdata_spinlock_lock(&pdc->refcount_spinlock); + + if(pdc->refcount <= 0) + fatal("DBENGINE: pdc is not referenced and cannot be released"); + + pdc->refcount--; + + if (pdc->refcount <= 1 && worker) { + // when 1 refcount is remaining, and we are a worker, + // we can mark the job completed: + // - if the remaining refcount is from the query caller, we will wake it up + // - if the remaining refcount is from another worker, the query thread is already away + completion_mark_complete(&pdc->page_completion); + } + + if (pdc->refcount == 0) { + netdata_spinlock_unlock(&pdc->refcount_spinlock); + pdc_destroy(pdc); + return true; + } + + netdata_spinlock_unlock(&pdc->refcount_spinlock); + return false; +} + +void epdl_cmd_queued(void *epdl_ptr, struct rrdeng_cmd *cmd) { + EPDL *epdl = epdl_ptr; + epdl->cmd = cmd; +} + +void epdl_cmd_dequeued(void *epdl_ptr) { + EPDL *epdl = epdl_ptr; + epdl->cmd = NULL; +} + +static struct rrdeng_cmd *epdl_get_cmd(void *epdl_ptr) { + EPDL *epdl = epdl_ptr; + return epdl->cmd; +} + +static bool epdl_pending_add(EPDL *epdl) { + bool added_new; + + netdata_spinlock_lock(&epdl->datafile->extent_queries.spinlock); + Pvoid_t *PValue = JudyLIns(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0); + internal_fatal(!PValue || PValue == PJERR, "DBENGINE: corrupted pending extent judy"); + + EPDL *base = *PValue; + + if(!base) { + added_new = true; + epdl->head_to_datafile_extent_queries_pending_for_extent = true; + } + else { + added_new = false; + epdl->head_to_datafile_extent_queries_pending_for_extent = false; + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_extent_merged, 1, __ATOMIC_RELAXED); + + if(base->pdc->priority > epdl->pdc->priority) + rrdeng_req_cmd(epdl_get_cmd, base, epdl->pdc->priority); + } + + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, epdl, query.prev, query.next); + *PValue = base; + + netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock); + + return added_new; +} + +static void epdl_pending_del(EPDL *epdl) { + netdata_spinlock_lock(&epdl->datafile->extent_queries.spinlock); + if(epdl->head_to_datafile_extent_queries_pending_for_extent) { + epdl->head_to_datafile_extent_queries_pending_for_extent = false; + int rc = JudyLDel(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0); + (void) rc; + internal_fatal(!rc, "DBENGINE: epdl not found in pending list"); + } + netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock); +} + +void pdc_to_epdl_router(struct rrdengine_instance *ctx, PDC *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list) +{ + Pvoid_t *PValue; + Pvoid_t *PValue1; + Pvoid_t *PValue2; + Word_t time_index = 0; + struct page_details *pd = NULL; + + // this is the entire page list + // Lets do some deduplication + // 1. Per datafile + // 2. Per extent + // 3. Pages per extent will be added to the cache either as acquired or not + + Pvoid_t JudyL_datafile_list = NULL; + + DEOL *deol; + EPDL *epdl; + + if (pdc->page_list_JudyL) { + bool first_then_next = true; + while((PValue = PDCJudyLFirstThenNext(pdc->page_list_JudyL, &time_index, &first_then_next))) { + pd = *PValue; + + internal_fatal(!pd, + "DBENGINE: pdc page list has an empty page details entry"); + + if (!(pd->status & PDC_PAGE_DISK_PENDING)) + continue; + + internal_fatal(!(pd->status & PDC_PAGE_DATAFILE_ACQUIRED), + "DBENGINE: page details has not acquired the datafile"); + + internal_fatal((pd->status & (PDC_PAGE_READY | PDC_PAGE_FAILED)), + "DBENGINE: page details has disk pending flag but it is ready/failed"); + + internal_fatal(pd->page, + "DBENGINE: page details has a page linked to it, but it is marked for loading"); + + PValue1 = PDCJudyLIns(&JudyL_datafile_list, pd->datafile.fileno, PJE0); + if (PValue1 && !*PValue1) { + *PValue1 = deol = deol_get(); + deol->extent_pd_list_by_extent_offset_JudyL = NULL; + deol->fileno = pd->datafile.fileno; + } + else + deol = *PValue1; + + PValue2 = PDCJudyLIns(&deol->extent_pd_list_by_extent_offset_JudyL, pd->datafile.extent.pos, PJE0); + if (PValue2 && !*PValue2) { + *PValue2 = epdl = epdl_get(); + epdl->page_details_by_metric_id_JudyL = NULL; + epdl->number_of_pages_in_JudyL = 0; + epdl->file = pd->datafile.file; + epdl->extent_offset = pd->datafile.extent.pos; + epdl->extent_size = pd->datafile.extent.bytes; + epdl->datafile = pd->datafile.ptr; + } + else + epdl = *PValue2; + + epdl->number_of_pages_in_JudyL++; + + Pvoid_t *pd_by_first_time_s_judyL = PDCJudyLIns(&epdl->page_details_by_metric_id_JudyL, pd->metric_id, PJE0); + Pvoid_t *pd_pptr = PDCJudyLIns(pd_by_first_time_s_judyL, pd->first_time_s, PJE0); + *pd_pptr = pd; + } + + size_t extent_list_no = 0; + Word_t datafile_no = 0; + first_then_next = true; + while((PValue = PDCJudyLFirstThenNext(JudyL_datafile_list, &datafile_no, &first_then_next))) { + deol = *PValue; + + bool first_then_next_extent = true; + Word_t pos = 0; + while ((PValue = PDCJudyLFirstThenNext(deol->extent_pd_list_by_extent_offset_JudyL, &pos, &first_then_next_extent))) { + epdl = *PValue; + internal_fatal(!epdl, "DBENGINE: extent_list is not populated properly"); + + // The extent page list can be dispatched to a worker + // It will need to populate the cache with "acquired" pages that are in the list (pd) only + // the rest of the extent pages will be added to the cache butnot acquired + + pdc_acquire(pdc); // we do this for the next worker: do_read_extent_work() + epdl->pdc = pdc; + + if(epdl_pending_add(epdl)) { + if (extent_list_no++ == 0) + exec_first_extent_list(ctx, epdl, pdc->priority); + else + exec_rest_extent_list(ctx, epdl, pdc->priority); + } + } + PDCJudyLFreeArray(&deol->extent_pd_list_by_extent_offset_JudyL, PJE0); + deol_release(deol); + } + PDCJudyLFreeArray(&JudyL_datafile_list, PJE0); + } + + pdc_release_and_destroy_if_unreferenced(pdc, true, true); +} + +void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags) { + if(flags & RRDENG_PAGE_PAST_COLLECTION) + buffer_strcat(wb, "PAST_COLLECTION "); + if(flags & RRDENG_PAGE_REPEATED_COLLECTION) + buffer_strcat(wb, "REPEATED_COLLECTION "); + if(flags & RRDENG_PAGE_BIG_GAP) + buffer_strcat(wb, "BIG_GAP "); + if(flags & RRDENG_PAGE_GAP) + buffer_strcat(wb, "GAP "); + if(flags & RRDENG_PAGE_FUTURE_POINT) + buffer_strcat(wb, "FUTURE_POINT "); + if(flags & RRDENG_PAGE_CREATED_IN_FUTURE) + buffer_strcat(wb, "CREATED_IN_FUTURE "); + if(flags & RRDENG_PAGE_COMPLETED_IN_FUTURE) + buffer_strcat(wb, "COMPLETED_IN_FUTURE "); + if(flags & RRDENG_PAGE_UNALIGNED) + buffer_strcat(wb, "UNALIGNED "); + if(flags & RRDENG_PAGE_CONFLICT) + buffer_strcat(wb, "CONFLICT "); + if(flags & RRDENG_PAGE_FULL) + buffer_strcat(wb, "PAGE_FULL"); + if(flags & RRDENG_PAGE_COLLECT_FINALIZE) + buffer_strcat(wb, "COLLECT_FINALIZE"); + if(flags & RRDENG_PAGE_UPDATE_EVERY_CHANGE) + buffer_strcat(wb, "UPDATE_EVERY_CHANGE"); + if(flags & RRDENG_PAGE_STEP_TOO_SMALL) + buffer_strcat(wb, "STEP_TOO_SMALL"); + if(flags & RRDENG_PAGE_STEP_UNALIGNED) + buffer_strcat(wb, "STEP_UNALIGNED"); +} + +inline VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error) { + return validate_page( + (uuid_t *)descr->uuid, + (time_t) (descr->start_time_ut / USEC_PER_SEC), + (time_t) (descr->end_time_ut / USEC_PER_SEC), + 0, + descr->page_length, + descr->type, + 0, + now_s, + overwrite_zero_update_every_s, + have_read_error, + "loaded", 0); +} + +VALIDATED_PAGE_DESCRIPTOR validate_page( + uuid_t *uuid, + time_t start_time_s, + time_t end_time_s, + time_t update_every_s, // can be zero, if unknown + size_t page_length, + uint8_t page_type, + size_t entries, // can be zero, if unknown + time_t now_s, // can be zero, to disable future timestamp check + time_t overwrite_zero_update_every_s, // can be zero, if unknown + bool have_read_error, + const char *msg, + RRDENG_COLLECT_PAGE_FLAGS flags) { + + VALIDATED_PAGE_DESCRIPTOR vd = { + .start_time_s = start_time_s, + .end_time_s = end_time_s, + .update_every_s = update_every_s, + .page_length = page_length, + .type = page_type, + .is_valid = true, + }; + + // always calculate entries by size + vd.point_size = page_type_size[vd.type]; + vd.entries = page_entries_by_size(vd.page_length, vd.point_size); + + // allow to be called without entries (when loading pages from disk) + if(!entries) + entries = vd.entries; + + // allow to be called without update every (when loading pages from disk) + if(!update_every_s) { + vd.update_every_s = (vd.entries > 1) ? ((vd.end_time_s - vd.start_time_s) / (time_t) (vd.entries - 1)) + : overwrite_zero_update_every_s; + + update_every_s = vd.update_every_s; + } + + // another such set of checks exists in + // update_metric_retention_and_granularity_by_uuid() + + bool updated = false; + + if( have_read_error || + vd.page_length == 0 || + vd.page_length > RRDENG_BLOCK_SIZE || + vd.start_time_s > vd.end_time_s || + (now_s && vd.end_time_s > now_s) || + vd.start_time_s == 0 || + vd.end_time_s == 0 || + (vd.start_time_s == vd.end_time_s && vd.entries > 1) || + (vd.update_every_s == 0 && vd.entries > 1) + ) + vd.is_valid = false; + + else { + if(unlikely(vd.entries != entries || vd.update_every_s != update_every_s)) + updated = true; + + if (likely(vd.update_every_s)) { + size_t entries_by_time = page_entries_by_time(vd.start_time_s, vd.end_time_s, vd.update_every_s); + + if (vd.entries != entries_by_time) { + if (overwrite_zero_update_every_s < vd.update_every_s) + vd.update_every_s = overwrite_zero_update_every_s; + + time_t new_end_time_s = (time_t)(vd.start_time_s + (vd.entries - 1) * vd.update_every_s); + + if(new_end_time_s <= vd.end_time_s) { + // end time is wrong + vd.end_time_s = new_end_time_s; + } + else { + // update every is wrong + vd.update_every_s = overwrite_zero_update_every_s; + vd.end_time_s = (time_t)(vd.start_time_s + (vd.entries - 1) * vd.update_every_s); + } + + updated = true; + } + } + else if(overwrite_zero_update_every_s) { + vd.update_every_s = overwrite_zero_update_every_s; + updated = true; + } + } + + if(unlikely(!vd.is_valid || updated)) { +#ifndef NETDATA_INTERNAL_CHECKS + error_limit_static_global_var(erl, 1, 0); +#endif + char uuid_str[UUID_STR_LEN + 1]; + uuid_unparse(*uuid, uuid_str); + + BUFFER *wb = NULL; + + if(flags) { + wb = buffer_create(0, NULL); + collect_page_flags_to_buffer(wb, flags); + } + + if(!vd.is_valid) { +#ifdef NETDATA_INTERNAL_CHECKS + internal_error(true, +#else + error_limit(&erl, +#endif + "DBENGINE: metric '%s' %s invalid page of type %u " + "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s)", + uuid_str, msg, vd.type, + vd.start_time_s, vd.end_time_s, now_s, vd.update_every_s, vd.page_length, vd.entries, wb?buffer_tostring(wb):"" + ); + } + else { + const char *err_valid = (vd.is_valid) ? "" : "found invalid, "; + const char *err_start = (vd.start_time_s == start_time_s) ? "" : "start time updated, "; + const char *err_end = (vd.end_time_s == end_time_s) ? "" : "end time updated, "; + const char *err_update = (vd.update_every_s == update_every_s) ? "" : "update every updated, "; + const char *err_length = (vd.page_length == page_length) ? "" : "page length updated, "; + const char *err_entries = (vd.entries == entries) ? "" : "entries updated, "; + const char *err_future = (now_s && vd.end_time_s <= now_s) ? "" : "future end time, "; + +#ifdef NETDATA_INTERNAL_CHECKS + internal_error(true, +#else + error_limit(&erl, +#endif + "DBENGINE: metric '%s' %s page of type %u " + "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s), " + "found inconsistent - the right is " + "from %ld to %ld, update every %ld, page length %zu, entries %zu: " + "%s%s%s%s%s%s%s", + uuid_str, msg, vd.type, + start_time_s, end_time_s, now_s, update_every_s, page_length, entries, wb?buffer_tostring(wb):"", + vd.start_time_s, vd.end_time_s, vd.update_every_s, vd.page_length, vd.entries, + err_valid, err_start, err_end, err_update, err_length, err_entries, err_future + ); + } + + buffer_free(wb); + } + + return vd; +} + +static inline struct page_details *epdl_get_pd_load_link_list_from_metric_start_time(EPDL *epdl, Word_t metric_id, time_t start_time_s) { + + if(unlikely(epdl->head_to_datafile_extent_queries_pending_for_extent)) + // stop appending more pages to this epdl + epdl_pending_del(epdl); + + struct page_details *pd_list = NULL; + + for(EPDL *ep = epdl; ep ;ep = ep->query.next) { + Pvoid_t *pd_by_start_time_s_judyL = PDCJudyLGet(ep->page_details_by_metric_id_JudyL, metric_id, PJE0); + internal_fatal(pd_by_start_time_s_judyL == PJERR, "DBENGINE: corrupted extent metrics JudyL"); + + if (unlikely(pd_by_start_time_s_judyL && *pd_by_start_time_s_judyL)) { + Pvoid_t *pd_pptr = PDCJudyLGet(*pd_by_start_time_s_judyL, start_time_s, PJE0); + internal_fatal(pd_pptr == PJERR, "DBENGINE: corrupted metric page details JudyHS"); + + if(likely(pd_pptr && *pd_pptr)) { + struct page_details *pd = *pd_pptr; + internal_fatal(metric_id != pd->metric_id, "DBENGINE: metric ids do not match"); + + if(likely(!pd->page)) { + if (unlikely(__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED))) + pdc_page_status_set(pd, PDC_PAGE_FAILED | PDC_PAGE_CANCELLED); + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(pd_list, pd, load.prev, load.next); + } + } + } + } + + return pd_list; +} + +static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL *epdl, struct rrdeng_extent_page_descr *descr, const char *msg) { + char uuid[UUID_STR_LEN] = ""; + time_t start_time_s = 0; + time_t end_time_s = 0; + bool used_epdl = false; + bool used_descr = false; + + if (descr) { + start_time_s = (time_t)(descr->start_time_ut / USEC_PER_SEC); + end_time_s = (time_t)(descr->end_time_ut / USEC_PER_SEC); + uuid_unparse_lower(descr->uuid, uuid); + used_descr = true; + } + else if (epdl) { + struct page_details *pd = NULL; + + Word_t start = 0; + Pvoid_t *pd_by_start_time_s_judyL = PDCJudyLFirst(epdl->page_details_by_metric_id_JudyL, &start, PJE0); + if(pd_by_start_time_s_judyL) { + start = 0; + Pvoid_t *pd_pptr = PDCJudyLFirst(*pd_by_start_time_s_judyL, &start, PJE0); + if(pd_pptr) { + pd = *pd_pptr; + start_time_s = pd->first_time_s; + end_time_s = pd->last_time_s; + METRIC *metric = (METRIC *)pd->metric_id; + uuid_t *u = mrg_metric_uuid(main_mrg, metric); + uuid_unparse_lower(*u, uuid); + used_epdl = true; + } + } + } + + if(!used_epdl && !used_descr && epdl && epdl->pdc) { + start_time_s = epdl->pdc->start_time_s; + end_time_s = epdl->pdc->end_time_s; + } + + char start_time_str[LOG_DATE_LENGTH + 1] = ""; + if(start_time_s) + log_date(start_time_str, LOG_DATE_LENGTH, start_time_s); + + char end_time_str[LOG_DATE_LENGTH + 1] = ""; + if(end_time_s) + log_date(end_time_str, LOG_DATE_LENGTH, end_time_s); + + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, + "DBENGINE: error while reading extent from datafile %u of tier %d, at offset %" PRIu64 " (%u bytes) " + "%s from %ld (%s) to %ld (%s) %s%s: " + "%s", + epdl->datafile->fileno, ctx->config.tier, + epdl->extent_offset, epdl->extent_size, + used_epdl ? "to extract page (PD)" : used_descr ? "expected page (DESCR)" : "part of a query (PDC)", + start_time_s, start_time_str, end_time_s, end_time_str, + used_epdl || used_descr ? " of metric " : "", + used_epdl || used_descr ? uuid : "", + msg); +} + +static bool epdl_populate_pages_from_extent_data( + struct rrdengine_instance *ctx, + void *data, + size_t data_length, + EPDL *epdl, + bool worker, + PDC_PAGE_STATUS tags, + bool cached_extent) +{ + int ret; + unsigned i, count; + void *uncompressed_buf = NULL; + uint32_t payload_length, payload_offset, trailer_offset, uncompressed_payload_length = 0; + bool have_read_error = false; + /* persistent structures */ + struct rrdeng_df_extent_header *header; + struct rrdeng_df_extent_trailer *trailer; + struct extent_buffer *eb = NULL; + uLong crc; + + bool can_use_data = true; + if(data_length < sizeof(*header) + sizeof(header->descr[0]) + sizeof(*trailer)) { + can_use_data = false; + + // added to satisfy the requirements of older compilers (prevent warnings) + payload_length = 0; + payload_offset = 0; + trailer_offset = 0; + count = 0; + header = NULL; + trailer = NULL; + } + else { + header = data; + payload_length = header->payload_length; + count = header->number_of_pages; + payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count; + trailer_offset = data_length - sizeof(*trailer); + trailer = data + trailer_offset; + } + + if( !can_use_data || + count < 1 || + count > MAX_PAGES_PER_EXTENT || + (header->compression_algorithm != RRD_NO_COMPRESSION && header->compression_algorithm != RRD_LZ4) || + (payload_length != trailer_offset - payload_offset) || + (data_length != payload_offset + payload_length + sizeof(*trailer)) + ) { + epdl_extent_loading_error_log(ctx, epdl, NULL, "header is INVALID"); + return false; + } + + crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, data, epdl->extent_size - sizeof(*trailer)); + ret = crc32cmp(trailer->checksum, crc); + if (unlikely(ret)) { + ctx_io_error(ctx); + have_read_error = true; + epdl_extent_loading_error_log(ctx, epdl, NULL, "CRC32 checksum FAILED"); + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_DECOMPRESSION); + + if (likely(!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm)) { + // find the uncompressed extent size + uncompressed_payload_length = 0; + for (i = 0; i < count; ++i) { + size_t page_length = header->descr[i].page_length; + if(page_length > RRDENG_BLOCK_SIZE) { + have_read_error = true; + break; + } + + uncompressed_payload_length += header->descr[i].page_length; + } + + if(unlikely(uncompressed_payload_length > MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE)) + have_read_error = true; + + if(likely(!have_read_error)) { + eb = extent_buffer_get(uncompressed_payload_length); + uncompressed_buf = eb->data; + + ret = LZ4_decompress_safe(data + payload_offset, uncompressed_buf, + (int) payload_length, (int) uncompressed_payload_length); + + __atomic_add_fetch(&ctx->stats.before_decompress_bytes, payload_length, __ATOMIC_RELAXED); + __atomic_add_fetch(&ctx->stats.after_decompress_bytes, ret, __ATOMIC_RELAXED); + } + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_LOOKUP); + + size_t stats_data_from_main_cache = 0; + size_t stats_data_from_extent = 0; + size_t stats_load_compressed = 0; + size_t stats_load_uncompressed = 0; + size_t stats_load_invalid_page = 0; + size_t stats_cache_hit_while_inserting = 0; + + uint32_t page_offset = 0, page_length; + time_t now_s = max_acceptable_collected_time(); + for (i = 0; i < count; i++, page_offset += page_length) { + page_length = header->descr[i].page_length; + time_t start_time_s = (time_t) (header->descr[i].start_time_ut / USEC_PER_SEC); + + if(!page_length || !start_time_s) { + char log[200 + 1]; + snprintfz(log, 200, "page %u (out of %u) is EMPTY", i, count); + epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); + continue; + } + + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, &header->descr[i].uuid, (Word_t)ctx); + Word_t metric_id = (Word_t)metric; + if(!metric) { + char log[200 + 1]; + snprintfz(log, 200, "page %u (out of %u) has unknown UUID", i, count); + epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); + continue; + } + mrg_metric_release(main_mrg, metric); + + struct page_details *pd_list = epdl_get_pd_load_link_list_from_metric_start_time(epdl, metric_id, start_time_s); + if(likely(!pd_list)) + continue; + + VALIDATED_PAGE_DESCRIPTOR vd = validate_extent_page_descr( + &header->descr[i], now_s, + (pd_list) ? pd_list->update_every_s : 0, + have_read_error); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_ALLOCATION); + + void *page_data; + + if (unlikely(!vd.is_valid)) { + page_data = DBENGINE_EMPTY_PAGE; + stats_load_invalid_page++; + } + else { + if (RRD_NO_COMPRESSION == header->compression_algorithm) { + page_data = dbengine_page_alloc(vd.page_length); + memcpy(page_data, data + payload_offset + page_offset, (size_t) vd.page_length); + stats_load_uncompressed++; + } + else { + if (unlikely(page_offset + vd.page_length > uncompressed_payload_length)) { + char log[200 + 1]; + snprintfz(log, 200, "page %u (out of %u) offset %u + page length %zu, " + "exceeds the uncompressed buffer size %u", + i, count, page_offset, vd.page_length, uncompressed_payload_length); + epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); + + page_data = DBENGINE_EMPTY_PAGE; + stats_load_invalid_page++; + } + else { + page_data = dbengine_page_alloc(vd.page_length); + memcpy(page_data, uncompressed_buf + page_offset, vd.page_length); + stats_load_compressed++; + } + } + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_POPULATION); + + PGC_ENTRY page_entry = { + .hot = false, + .section = (Word_t)ctx, + .metric_id = metric_id, + .start_time_s = vd.start_time_s, + .end_time_s = vd.end_time_s, + .update_every_s = vd.update_every_s, + .size = (size_t) ((page_data == DBENGINE_EMPTY_PAGE) ? 0 : vd.page_length), + .data = page_data + }; + + bool added = true; + PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + if (false == added) { + dbengine_page_free(page_data, vd.page_length); + stats_cache_hit_while_inserting++; + stats_data_from_main_cache++; + } + else + stats_data_from_extent++; + + struct page_details *pd = pd_list; + do { + if(pd != pd_list) + pgc_page_dup(main_cache, page); + + pd->page = page; + pd->page_length = pgc_page_data_size(main_cache, page); + pdc_page_status_set(pd, PDC_PAGE_READY | tags | ((page_data == DBENGINE_EMPTY_PAGE) ? PDC_PAGE_EMPTY : 0)); + + pd = pd->load.next; + } while(pd); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_LOOKUP); + } + + if(stats_data_from_main_cache) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, stats_data_from_main_cache, __ATOMIC_RELAXED); + + if(cached_extent) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_extent_cache, stats_data_from_extent, __ATOMIC_RELAXED); + else { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_disk, stats_data_from_extent, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.extents_loaded_from_disk, 1, __ATOMIC_RELAXED); + } + + if(stats_cache_hit_while_inserting) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_loaded_but_cache_hit_while_inserting, stats_cache_hit_while_inserting, __ATOMIC_RELAXED); + + if(stats_load_compressed) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_compressed, stats_load_compressed, __ATOMIC_RELAXED); + + if(stats_load_uncompressed) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_ok_uncompressed, stats_load_uncompressed, __ATOMIC_RELAXED); + + if(stats_load_invalid_page) + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_load_fail_invalid_page_in_extent, stats_load_invalid_page, __ATOMIC_RELAXED); + + if(worker) + worker_is_idle(); + + extent_buffer_release(eb); + + return true; +} + +static inline void *datafile_extent_read(struct rrdengine_instance *ctx, uv_file file, unsigned pos, unsigned size_bytes) +{ + void *buffer; + uv_fs_t request; + + unsigned real_io_size = ALIGN_BYTES_CEILING(size_bytes); + int ret = posix_memalign(&buffer, RRDFILE_ALIGNMENT, real_io_size); + if (unlikely(ret)) + fatal("DBENGINE: posix_memalign(): %s", strerror(ret)); + + uv_buf_t iov = uv_buf_init(buffer, real_io_size); + ret = uv_fs_read(NULL, &request, file, &iov, 1, pos, NULL); + if (unlikely(-1 == ret)) { + ctx_io_error(ctx); + posix_memfree(buffer); + buffer = NULL; + } + else + ctx_io_read_op_bytes(ctx, real_io_size); + + uv_fs_req_cleanup(&request); + + return buffer; +} + +static inline void datafile_extent_read_free(void *buffer) { + posix_memfree(buffer); +} + +void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker) { + size_t *statistics_counter = NULL; + PDC_PAGE_STATUS not_loaded_pages_tag = 0, loaded_pages_tag = 0; + + bool should_stop = __atomic_load_n(&epdl->pdc->workers_should_stop, __ATOMIC_RELAXED); + for(EPDL *ep = epdl->query.next; ep ;ep = ep->query.next) { + internal_fatal(ep->datafile != epdl->datafile, "DBENGINE: datafiles do not match"); + internal_fatal(ep->extent_offset != epdl->extent_offset, "DBENGINE: extent offsets do not match"); + internal_fatal(ep->extent_size != epdl->extent_size, "DBENGINE: extent sizes do not match"); + internal_fatal(ep->file != epdl->file, "DBENGINE: files do not match"); + + if(!__atomic_load_n(&ep->pdc->workers_should_stop, __ATOMIC_RELAXED)) { + should_stop = false; + break; + } + } + + if(unlikely(should_stop)) { + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_cancelled; + not_loaded_pages_tag = PDC_PAGE_CANCELLED; + goto cleanup; + } + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP); + + bool extent_found_in_cache = false; + + void *extent_compressed_data = NULL; + PGC_PAGE *extent_cache_page = pgc_page_get_and_acquire( + extent_cache, (Word_t)ctx, + (Word_t)epdl->datafile->fileno, (time_t)epdl->extent_offset, + PGC_SEARCH_EXACT); + + if(extent_cache_page) { + extent_compressed_data = pgc_page_data(extent_cache_page); + internal_fatal(epdl->extent_size != pgc_page_data_size(extent_cache, extent_cache_page), + "DBENGINE: cache size does not match the expected size"); + + loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_CACHE; + not_loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_CACHE; + extent_found_in_cache = true; + } + else { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_MMAP); + + void *extent_data = datafile_extent_read(ctx, epdl->file, epdl->extent_offset, epdl->extent_size); + if(extent_data != NULL) { + + void *copied_extent_compressed_data = dbengine_extent_alloc(epdl->extent_size); + memcpy(copied_extent_compressed_data, extent_data, epdl->extent_size); + datafile_extent_read_free(extent_data); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP); + + bool added = false; + extent_cache_page = pgc_page_add_and_acquire(extent_cache, (PGC_ENTRY) { + .hot = false, + .section = (Word_t) ctx, + .metric_id = (Word_t) epdl->datafile->fileno, + .start_time_s = (time_t) epdl->extent_offset, + .size = epdl->extent_size, + .end_time_s = 0, + .update_every_s = 0, + .data = copied_extent_compressed_data, + }, &added); + + if (!added) { + dbengine_extent_free(copied_extent_compressed_data, epdl->extent_size); + internal_fatal(epdl->extent_size != pgc_page_data_size(extent_cache, extent_cache_page), + "DBENGINE: cache size does not match the expected size"); + } + + extent_compressed_data = pgc_page_data(extent_cache_page); + + loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_DISK; + not_loaded_pages_tag |= PDC_PAGE_EXTENT_FROM_DISK; + } + } + + if(extent_compressed_data) { + // Need to decompress and then process the pagelist + bool extent_used = epdl_populate_pages_from_extent_data( + ctx, extent_compressed_data, epdl->extent_size, + epdl, worker, loaded_pages_tag, extent_found_in_cache); + + if(extent_used) { + // since the extent was used, all the pages that are not + // loaded from this extent, were not found in the extent + not_loaded_pages_tag |= PDC_PAGE_FAILED_NOT_IN_EXTENT; + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_not_found; + } + else { + not_loaded_pages_tag |= PDC_PAGE_FAILED_INVALID_EXTENT; + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_invalid_extent; + } + } + else { + not_loaded_pages_tag |= PDC_PAGE_FAILED_TO_MAP_EXTENT; + statistics_counter = &rrdeng_cache_efficiency_stats.pages_load_fail_cant_mmap_extent; + } + + if(extent_cache_page) + pgc_page_release(extent_cache, extent_cache_page); + +cleanup: + // remove it from the datafile extent_queries + // this can be called multiple times safely + epdl_pending_del(epdl); + + // mark all pending pages as failed + for(EPDL *ep = epdl; ep ;ep = ep->query.next) { + epdl_mark_all_not_loaded_pages_as_failed( + ep, not_loaded_pages_tag, statistics_counter); + } + + for(EPDL *ep = epdl, *next = NULL; ep ; ep = next) { + next = ep->query.next; + + completion_mark_complete_a_job(&ep->pdc->page_completion); + pdc_release_and_destroy_if_unreferenced(ep->pdc, true, false); + + // Free the Judy that holds the requested pagelist and the extents + epdl_destroy(ep); + } + + if(worker) + worker_is_idle(); +} diff --git a/database/engine/pdc.h b/database/engine/pdc.h new file mode 100644 index 000000000..9bae39ade --- /dev/null +++ b/database/engine/pdc.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef DBENGINE_PDC_H +#define DBENGINE_PDC_H + +#include "../engine/rrdengine.h" + +struct rrdeng_cmd; + +#ifdef PDC_USE_JULYL +#define PDCJudyLIns JulyLIns +#define PDCJudyLGet JulyLGet +#define PDCJudyLFirst JulyLFirst +#define PDCJudyLNext JulyLNext +#define PDCJudyLLast JulyLLast +#define PDCJudyLPrev JulyLPrev +#define PDCJudyLFirstThenNext JulyLFirstThenNext +#define PDCJudyLLastThenPrev JulyLLastThenPrev +#define PDCJudyLFreeArray JulyLFreeArray +#else +#define PDCJudyLIns JudyLIns +#define PDCJudyLGet JudyLGet +#define PDCJudyLFirst JudyLFirst +#define PDCJudyLNext JudyLNext +#define PDCJudyLLast JudyLLast +#define PDCJudyLPrev JudyLPrev +#define PDCJudyLFirstThenNext JudyLFirstThenNext +#define PDCJudyLLastThenPrev JudyLLastThenPrev +#define PDCJudyLFreeArray JudyLFreeArray +#endif + +typedef struct extent_page_details_list EPDL; +typedef void (*execute_extent_page_details_list_t)(struct rrdengine_instance *ctx, EPDL *epdl, enum storage_priority priority); +void pdc_to_epdl_router(struct rrdengine_instance *ctx, struct page_details_control *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list); +void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker); + +size_t pdc_cache_size(void); +size_t pd_cache_size(void); +size_t epdl_cache_size(void); +size_t deol_cache_size(void); +size_t extent_buffer_cache_size(void); + +void pdc_init(void); +void page_details_init(void); +void epdl_init(void); +void deol_init(void); +void extent_buffer_cleanup1(void); + +void epdl_cmd_dequeued(void *epdl_ptr); +void epdl_cmd_queued(void *epdl_ptr, struct rrdeng_cmd *cmd); + +struct extent_buffer { + size_t bytes; + + struct { + struct extent_buffer *prev; + struct extent_buffer *next; + } cache; + + uint8_t data[]; +}; + +void extent_buffer_init(void); +struct extent_buffer *extent_buffer_get(size_t size); +void extent_buffer_release(struct extent_buffer *eb); + +#endif // DBENGINE_PDC_H diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index a6840f38c..d64868f03 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -2,6 +2,7 @@ #define NETDATA_RRD_INTERNALS #include "rrdengine.h" +#include "pdc.h" rrdeng_stats_t global_io_errors = 0; rrdeng_stats_t global_fs_errors = 0; @@ -11,31 +12,74 @@ rrdeng_stats_t global_flushing_pressure_page_deletions = 0; unsigned rrdeng_pages_per_extent = MAX_PAGES_PER_EXTENT; -#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2) +#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2) #error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2) #endif -void *dbengine_page_alloc() { - void *page = NULL; - if (unlikely(db_engine_use_malloc)) - page = mallocz(RRDENG_BLOCK_SIZE); - else { - page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm); - if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()"); - } - return page; -} - -void dbengine_page_free(void *page) { - if (unlikely(db_engine_use_malloc)) - freez(page); - else - netdata_munmap(page, RRDENG_BLOCK_SIZE); -} +struct rrdeng_main { + uv_thread_t thread; + uv_loop_t loop; + uv_async_t async; + uv_timer_t timer; + pid_t tid; + + size_t flushes_running; + size_t evictions_running; + size_t cleanup_running; + + struct { + ARAL *ar; + + struct { + SPINLOCK spinlock; + + size_t waiting; + struct rrdeng_cmd *waiting_items_by_priority[STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE]; + size_t executed_by_priority[STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE]; + } unsafe; + } cmd_queue; + + struct { + ARAL *ar; + + struct { + size_t dispatched; + size_t executing; + size_t pending_cb; + } atomics; + } work_cmd; + + struct { + ARAL *ar; + } handles; + + struct { + ARAL *ar; + } descriptors; + + struct { + ARAL *ar; + } xt_io_descr; + +} rrdeng_main = { + .thread = 0, + .loop = {}, + .async = {}, + .timer = {}, + .flushes_running = 0, + .evictions_running = 0, + .cleanup_running = 0, + + .cmd_queue = { + .unsafe = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + }, + } +}; static void sanity_check(void) { - BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)); + BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2)); /* Magic numbers must fit in the super-blocks */ BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ); @@ -54,519 +98,489 @@ static void sanity_check(void) BUILD_BUG_ON(MAX_PAGES_PER_EXTENT > 255); /* extent cache count must fit in 32 bits */ - BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32); +// BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32); /* page info scratch space must be able to hold 2 32-bit integers */ BUILD_BUG_ON(sizeof(((struct rrdeng_page_info *)0)->scratch) < 2 * sizeof(uint32_t)); } -/* always inserts into tail */ -static inline void xt_cache_replaceQ_insert(struct rrdengine_worker_config* wc, - struct extent_cache_element *xt_cache_elem) -{ - struct extent_cache *xt_cache = &wc->xt_cache; +// ---------------------------------------------------------------------------- +// work request cache - xt_cache_elem->prev = NULL; - xt_cache_elem->next = NULL; +typedef void *(*work_cb)(struct rrdengine_instance *ctx, void *data, struct completion *completion, uv_work_t* req); +typedef void (*after_work_cb)(struct rrdengine_instance *ctx, void *data, struct completion *completion, uv_work_t* req, int status); - if (likely(NULL != xt_cache->replaceQ_tail)) { - xt_cache_elem->prev = xt_cache->replaceQ_tail; - xt_cache->replaceQ_tail->next = xt_cache_elem; - } - if (unlikely(NULL == xt_cache->replaceQ_head)) { - xt_cache->replaceQ_head = xt_cache_elem; - } - xt_cache->replaceQ_tail = xt_cache_elem; +struct rrdeng_work { + uv_work_t req; + + struct rrdengine_instance *ctx; + void *data; + struct completion *completion; + + work_cb work_cb; + after_work_cb after_work_cb; + enum rrdeng_opcode opcode; +}; + +static void work_request_init(void) { + rrdeng_main.work_cmd.ar = aral_create( + "dbengine-work-cmd", + sizeof(struct rrdeng_work), + 0, + 65536, NULL, + NULL, NULL, false, false + ); } -static inline void xt_cache_replaceQ_delete(struct rrdengine_worker_config* wc, - struct extent_cache_element *xt_cache_elem) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *prev, *next; +static inline bool work_request_full(void) { + return __atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED) >= (size_t)(libuv_worker_threads - RESERVED_LIBUV_WORKER_THREADS); +} - prev = xt_cache_elem->prev; - next = xt_cache_elem->next; +static inline void work_done(struct rrdeng_work *work_request) { + aral_freez(rrdeng_main.work_cmd.ar, work_request); +} - if (likely(NULL != prev)) { - prev->next = next; - } - if (likely(NULL != next)) { - next->prev = prev; - } - if (unlikely(xt_cache_elem == xt_cache->replaceQ_head)) { - xt_cache->replaceQ_head = next; - } - if (unlikely(xt_cache_elem == xt_cache->replaceQ_tail)) { - xt_cache->replaceQ_tail = prev; - } - xt_cache_elem->prev = xt_cache_elem->next = NULL; +static void work_standard_worker(uv_work_t *req) { + __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED); + + register_libuv_worker_jobs(); + worker_is_busy(UV_EVENT_WORKER_INIT); + + struct rrdeng_work *work_request = req->data; + work_request->data = work_request->work_cb(work_request->ctx, work_request->data, work_request->completion, req); + worker_is_idle(); + + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.pending_cb, 1, __ATOMIC_RELAXED); + + // signal the event loop a worker is available + fatal_assert(0 == uv_async_send(&rrdeng_main.async)); } -static inline void xt_cache_replaceQ_set_hot(struct rrdengine_worker_config* wc, - struct extent_cache_element *xt_cache_elem) -{ - xt_cache_replaceQ_delete(wc, xt_cache_elem); - xt_cache_replaceQ_insert(wc, xt_cache_elem); +static void after_work_standard_callback(uv_work_t* req, int status) { + struct rrdeng_work *work_request = req->data; + + worker_is_busy(RRDENG_OPCODE_MAX + work_request->opcode); + + if(work_request->after_work_cb) + work_request->after_work_cb(work_request->ctx, work_request->data, work_request->completion, req, status); + + work_done(work_request); + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.pending_cb, 1, __ATOMIC_RELAXED); + + worker_is_idle(); } -/* Returns the index of the cached extent if it was successfully inserted in the extent cache, otherwise -1 */ -static int try_insert_into_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; - unsigned idx; - int ret; +static bool work_dispatch(struct rrdengine_instance *ctx, void *data, struct completion *completion, enum rrdeng_opcode opcode, work_cb work_cb, after_work_cb after_work_cb) { + struct rrdeng_work *work_request = NULL; - ret = find_first_zero(xt_cache->allocation_bitmap); - if (-1 == ret || ret >= MAX_CACHED_EXTENTS) { - for (xt_cache_elem = xt_cache->replaceQ_head ; NULL != xt_cache_elem ; xt_cache_elem = xt_cache_elem->next) { - idx = xt_cache_elem - xt_cache->extent_array; - if (!check_bit(xt_cache->inflight_bitmap, idx)) { - xt_cache_replaceQ_delete(wc, xt_cache_elem); - break; - } - } - if (NULL == xt_cache_elem) - return -1; - } else { - idx = (unsigned)ret; - xt_cache_elem = &xt_cache->extent_array[idx]; + internal_fatal(rrdeng_main.tid != gettid(), "work_dispatch() can only be run from the event loop thread"); + + work_request = aral_mallocz(rrdeng_main.work_cmd.ar); + memset(work_request, 0, sizeof(struct rrdeng_work)); + work_request->req.data = work_request; + work_request->ctx = ctx; + work_request->data = data; + work_request->completion = completion; + work_request->work_cb = work_cb; + work_request->after_work_cb = after_work_cb; + work_request->opcode = opcode; + + if(uv_queue_work(&rrdeng_main.loop, &work_request->req, work_standard_worker, after_work_standard_callback)) { + internal_fatal(true, "DBENGINE: cannot queue work"); + work_done(work_request); + return false; } - xt_cache_elem->extent = extent; - xt_cache_elem->fileno = extent->datafile->fileno; - xt_cache_elem->inflight_io_descr = NULL; - xt_cache_replaceQ_insert(wc, xt_cache_elem); - modify_bit(&xt_cache->allocation_bitmap, idx, 1); - return (int)idx; + __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED); + + return true; } -/** - * Returns 0 if the cached extent was found in the extent cache, 1 otherwise. - * Sets *idx to point to the position of the extent inside the cache. - **/ -static uint8_t lookup_in_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent, unsigned *idx) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; - unsigned i; +// ---------------------------------------------------------------------------- +// page descriptor cache + +void page_descriptors_init(void) { + rrdeng_main.descriptors.ar = aral_create( + "dbengine-descriptors", + sizeof(struct page_descr_with_data), + 0, + 65536 * 4, + NULL, + NULL, NULL, false, false); +} - for (i = 0 ; i < MAX_CACHED_EXTENTS ; ++i) { - xt_cache_elem = &xt_cache->extent_array[i]; - if (check_bit(xt_cache->allocation_bitmap, i) && xt_cache_elem->extent == extent && - xt_cache_elem->fileno == extent->datafile->fileno) { - *idx = i; - return 0; - } - } - return 1; +struct page_descr_with_data *page_descriptor_get(void) { + struct page_descr_with_data *descr = aral_mallocz(rrdeng_main.descriptors.ar); + memset(descr, 0, sizeof(struct page_descr_with_data)); + return descr; } -#if 0 /* disabled code */ -static void delete_from_xt_cache(struct rrdengine_worker_config* wc, unsigned idx) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; +static inline void page_descriptor_release(struct page_descr_with_data *descr) { + aral_freez(rrdeng_main.descriptors.ar, descr); +} - xt_cache_elem = &xt_cache->extent_array[idx]; - xt_cache_replaceQ_delete(wc, xt_cache_elem); - xt_cache_elem->extent = NULL; - modify_bit(&wc->xt_cache.allocation_bitmap, idx, 0); /* invalidate it */ - modify_bit(&wc->xt_cache.inflight_bitmap, idx, 0); /* not in-flight anymore */ +// ---------------------------------------------------------------------------- +// extent io descriptor cache + +static void extent_io_descriptor_init(void) { + rrdeng_main.xt_io_descr.ar = aral_create( + "dbengine-extent-io", + sizeof(struct extent_io_descriptor), + 0, + 65536, + NULL, + NULL, NULL, false, false + ); } -#endif -void enqueue_inflight_read_to_xt_cache(struct rrdengine_worker_config* wc, unsigned idx, - struct extent_io_descriptor *xt_io_descr) -{ - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem; - struct extent_io_descriptor *old_next; +static struct extent_io_descriptor *extent_io_descriptor_get(void) { + struct extent_io_descriptor *xt_io_descr = aral_mallocz(rrdeng_main.xt_io_descr.ar); + memset(xt_io_descr, 0, sizeof(struct extent_io_descriptor)); + return xt_io_descr; +} - xt_cache_elem = &xt_cache->extent_array[idx]; - old_next = xt_cache_elem->inflight_io_descr->next; - xt_cache_elem->inflight_io_descr->next = xt_io_descr; - xt_io_descr->next = old_next; +static inline void extent_io_descriptor_release(struct extent_io_descriptor *xt_io_descr) { + aral_freez(rrdeng_main.xt_io_descr.ar, xt_io_descr); } -void read_cached_extent_cb(struct rrdengine_worker_config* wc, unsigned idx, struct extent_io_descriptor *xt_io_descr) -{ - unsigned i, j, page_offset; - struct rrdengine_instance *ctx = wc->ctx; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - void *page; - struct extent_info *extent = xt_io_descr->descr_array[0]->extent; - - for (i = 0 ; i < xt_io_descr->descr_count; ++i) { - page = dbengine_page_alloc(); - descr = xt_io_descr->descr_array[i]; - for (j = 0, page_offset = 0 ; j < extent->number_of_pages ; ++j) { - /* care, we don't hold the descriptor mutex */ - if (!uuid_compare(*extent->pages[j]->id, *descr->id) && - extent->pages[j]->page_length == descr->page_length && - extent->pages[j]->start_time_ut == descr->start_time_ut && - extent->pages[j]->end_time_ut == descr->end_time_ut) { - break; - } - page_offset += extent->pages[j]->page_length; +// ---------------------------------------------------------------------------- +// query handle cache + +void rrdeng_query_handle_init(void) { + rrdeng_main.handles.ar = aral_create( + "dbengine-query-handles", + sizeof(struct rrdeng_query_handle), + 0, + 65536, + NULL, + NULL, NULL, false, false); +} - } - /* care, we don't hold the descriptor mutex */ - (void) memcpy(page, wc->xt_cache.extent_array[idx].pages + page_offset, descr->page_length); - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->page = page; - pg_cache_descr->flags |= RRD_PAGE_POPULATED; - pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING; - rrdeng_page_descr_mutex_unlock(ctx, descr); - pg_cache_replaceQ_insert(ctx, descr); - if (xt_io_descr->release_descr) { - pg_cache_put(ctx, descr); - } else { - debug(D_RRDENGINE, "%s: Waking up waiters.", __func__); - pg_cache_wake_up_waiters(ctx, descr); - } +struct rrdeng_query_handle *rrdeng_query_handle_get(void) { + struct rrdeng_query_handle *handle = aral_mallocz(rrdeng_main.handles.ar); + memset(handle, 0, sizeof(struct rrdeng_query_handle)); + return handle; +} + +void rrdeng_query_handle_release(struct rrdeng_query_handle *handle) { + aral_freez(rrdeng_main.handles.ar, handle); +} + +// ---------------------------------------------------------------------------- +// WAL cache + +static struct { + struct { + SPINLOCK spinlock; + WAL *available_items; + size_t available; + } protected; + + struct { + size_t allocated; + } atomics; +} wal_globals = { + .protected = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .available_items = NULL, + .available = 0, + }, + .atomics = { + .allocated = 0, + }, +}; + +static void wal_cleanup1(void) { + WAL *wal = NULL; + + if(!netdata_spinlock_trylock(&wal_globals.protected.spinlock)) + return; + + if(wal_globals.protected.available_items && wal_globals.protected.available > storage_tiers) { + wal = wal_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); + wal_globals.protected.available--; } - if (xt_io_descr->completion) - completion_mark_complete(xt_io_descr->completion); - freez(xt_io_descr); -} - -static void fill_page_with_nulls(void *page, uint32_t page_length, uint8_t type) { - switch(type) { - case PAGE_METRICS: { - storage_number n = pack_storage_number(NAN, SN_FLAG_NONE); - storage_number *array = (storage_number *)page; - size_t slots = page_length / sizeof(n); - for(size_t i = 0; i < slots ; i++) - array[i] = n; - } - break; - - case PAGE_TIER: { - storage_number_tier1_t n = { - .min_value = NAN, - .max_value = NAN, - .sum_value = NAN, - .count = 1, - .anomaly_count = 0, - }; - storage_number_tier1_t *array = (storage_number_tier1_t *)page; - size_t slots = page_length / sizeof(n); - for(size_t i = 0; i < slots ; i++) - array[i] = n; - } - break; - default: { - static bool logged = false; - if(!logged) { - error("DBENGINE: cannot fill page with nulls on unknown page type id %d", type); - logged = true; - } - memset(page, 0, page_length); - } + netdata_spinlock_unlock(&wal_globals.protected.spinlock); + + if(wal) { + posix_memfree(wal->buf); + freez(wal); + __atomic_sub_fetch(&wal_globals.atomics.allocated, 1, __ATOMIC_RELAXED); } } -struct rrdeng_page_descr *get_descriptor(struct pg_cache_page_index *page_index, time_t start_time_s) -{ - uv_rwlock_rdlock(&page_index->lock); - Pvoid_t *PValue = JudyLGet(page_index->JudyL_array, start_time_s, PJE0); - struct rrdeng_page_descr *descr = unlikely(NULL == PValue) ? NULL : *PValue; - uv_rwlock_rdunlock(&page_index->lock); - return descr; -}; +WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) { + if(!size || size > RRDENG_BLOCK_SIZE) + fatal("DBENGINE: invalid WAL size requested"); -static void do_extent_processing (struct rrdengine_worker_config *wc, struct extent_io_descriptor *xt_io_descr, bool read_failed) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - int ret; - unsigned i, j, count; - void *page, *uncompressed_buf = NULL; - uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length = 0; - uint8_t have_read_error = 0; - /* persistent structures */ - struct rrdeng_df_extent_header *header; - struct rrdeng_df_extent_trailer *trailer; - uLong crc; + WAL *wal = NULL; - header = xt_io_descr->buf; - payload_length = header->payload_length; - count = header->number_of_pages; - payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count; - trailer = xt_io_descr->buf + xt_io_descr->bytes - sizeof(*trailer); - - if (unlikely(read_failed)) { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - have_read_error = 1; - error("%s: uv_fs_read - extent at offset %"PRIu64"(%u) in datafile %u-%u.", __func__, xt_io_descr->pos, - xt_io_descr->bytes, datafile->tier, datafile->fileno); - goto after_crc_check; + netdata_spinlock_lock(&wal_globals.protected.spinlock); + + if(likely(wal_globals.protected.available_items)) { + wal = wal_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); + wal_globals.protected.available--; } - crc = crc32(0L, Z_NULL, 0); - crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer)); - ret = crc32cmp(trailer->checksum, crc); -#ifdef NETDATA_INTERNAL_CHECKS - { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: %s", __func__, - xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno, ret ? "FAILED" : "SUCCEEDED"); + + uint64_t transaction_id = __atomic_fetch_add(&ctx->atomic.transaction_id, 1, __ATOMIC_RELAXED); + netdata_spinlock_unlock(&wal_globals.protected.spinlock); + + if(unlikely(!wal)) { + wal = mallocz(sizeof(WAL)); + wal->buf_size = RRDENG_BLOCK_SIZE; + int ret = posix_memalign((void *)&wal->buf, RRDFILE_ALIGNMENT, wal->buf_size); + if (unlikely(ret)) + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); + __atomic_add_fetch(&wal_globals.atomics.allocated, 1, __ATOMIC_RELAXED); } -#endif - if (unlikely(ret)) { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - have_read_error = 1; - error("%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: FAILED", __func__, - xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno); + // these need to survive + unsigned buf_size = wal->buf_size; + void *buf = wal->buf; + + memset(wal, 0, sizeof(WAL)); + + // put them back + wal->buf_size = buf_size; + wal->buf = buf; + + memset(wal->buf, 0, wal->buf_size); + + wal->transaction_id = transaction_id; + wal->size = size; + + return wal; +} + +void wal_release(WAL *wal) { + if(unlikely(!wal)) return; + + netdata_spinlock_lock(&wal_globals.protected.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); + wal_globals.protected.available++; + netdata_spinlock_unlock(&wal_globals.protected.spinlock); +} + +// ---------------------------------------------------------------------------- +// command queue cache + +struct rrdeng_cmd { + struct rrdengine_instance *ctx; + enum rrdeng_opcode opcode; + void *data; + struct completion *completion; + enum storage_priority priority; + dequeue_callback_t dequeue_cb; + + struct { + struct rrdeng_cmd *prev; + struct rrdeng_cmd *next; + } queue; +}; + +static void rrdeng_cmd_queue_init(void) { + rrdeng_main.cmd_queue.ar = aral_create("dbengine-opcodes", + sizeof(struct rrdeng_cmd), + 0, + 65536, + NULL, + NULL, NULL, false, false); +} + +static inline STORAGE_PRIORITY rrdeng_enq_cmd_map_opcode_to_priority(enum rrdeng_opcode opcode, STORAGE_PRIORITY priority) { + if(unlikely(priority >= STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE)) + priority = STORAGE_PRIORITY_BEST_EFFORT; + + switch(opcode) { + case RRDENG_OPCODE_QUERY: + priority = STORAGE_PRIORITY_INTERNAL_QUERY_PREP; + break; + + default: + break; } -after_crc_check: - if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) { - uncompressed_payload_length = 0; - for (i = 0 ; i < count ; ++i) { - uncompressed_payload_length += header->descr[i].page_length; + return priority; +} + +void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd) { + epdl_cmd_queued(cmd->data, cmd); +} + +void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd) { + epdl_cmd_dequeued(cmd->data); +} + +void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority) { + netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + + struct rrdeng_cmd *cmd = get_cmd_cb(data); + if(cmd) { + priority = rrdeng_enq_cmd_map_opcode_to_priority(cmd->opcode, priority); + + if (cmd->priority > priority) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[cmd->priority], cmd, queue.prev, queue.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); + cmd->priority = priority; } - uncompressed_buf = mallocz(uncompressed_payload_length); - ret = LZ4_decompress_safe(xt_io_descr->buf + payload_offset, uncompressed_buf, - payload_length, uncompressed_payload_length); - ctx->stats.before_decompress_bytes += payload_length; - ctx->stats.after_decompress_bytes += ret; - debug(D_RRDENGINE, "LZ4 decompressed %u bytes to %d bytes.", payload_length, ret); - /* care, we don't hold the descriptor mutex */ } - { - uint8_t xt_is_cached = 0; - unsigned xt_idx; - struct extent_info *extent = xt_io_descr->descr_array[0]->extent; - - xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx); - if (xt_is_cached && check_bit(wc->xt_cache.inflight_bitmap, xt_idx)) { - struct extent_cache *xt_cache = &wc->xt_cache; - struct extent_cache_element *xt_cache_elem = &xt_cache->extent_array[xt_idx]; - struct extent_io_descriptor *curr, *next; - - if (have_read_error) { - memset(xt_cache_elem->pages, 0, sizeof(xt_cache_elem->pages)); - } else if (RRD_NO_COMPRESSION == header->compression_algorithm) { - (void)memcpy(xt_cache_elem->pages, xt_io_descr->buf + payload_offset, payload_length); - } else { - (void)memcpy(xt_cache_elem->pages, uncompressed_buf, uncompressed_payload_length); - } - /* complete all connected in-flight read requests */ - for (curr = xt_cache_elem->inflight_io_descr->next ; curr ; curr = next) { - next = curr->next; - read_cached_extent_cb(wc, xt_idx, curr); + + netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); +} + +void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data, struct completion *completion, + enum storage_priority priority, enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb) { + + priority = rrdeng_enq_cmd_map_opcode_to_priority(opcode, priority); + + struct rrdeng_cmd *cmd = aral_mallocz(rrdeng_main.cmd_queue.ar); + memset(cmd, 0, sizeof(struct rrdeng_cmd)); + cmd->ctx = ctx; + cmd->opcode = opcode; + cmd->data = data; + cmd->completion = completion; + cmd->priority = priority; + cmd->dequeue_cb = dequeue_cb; + + netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); + rrdeng_main.cmd_queue.unsafe.waiting++; + if(enqueue_cb) + enqueue_cb(cmd); + netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + + fatal_assert(0 == uv_async_send(&rrdeng_main.async)); +} + +static inline bool rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(STORAGE_PRIORITY priority, STORAGE_PRIORITY max_priority) { + for(; priority <= max_priority ; priority++) + if(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority]) + return true; + + return false; +} + +static inline struct rrdeng_cmd rrdeng_deq_cmd(void) { + struct rrdeng_cmd *cmd = NULL; + + STORAGE_PRIORITY max_priority = work_request_full() ? STORAGE_PRIORITY_INTERNAL_DBENGINE : STORAGE_PRIORITY_BEST_EFFORT; + + // find an opcode to execute from the queue + netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + for(STORAGE_PRIORITY priority = STORAGE_PRIORITY_INTERNAL_DBENGINE; priority <= max_priority ; priority++) { + cmd = rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority]; + if(cmd) { + + // avoid starvation of lower priorities + if(unlikely(priority >= STORAGE_PRIORITY_HIGH && + priority < STORAGE_PRIORITY_BEST_EFFORT && + ++rrdeng_main.cmd_queue.unsafe.executed_by_priority[priority] % 50 == 0 && + rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(priority + 1, max_priority))) { + // let the others run 2% of the requests + cmd = NULL; + continue; } - xt_cache_elem->inflight_io_descr = NULL; - modify_bit(&xt_cache->inflight_bitmap, xt_idx, 0); /* not in-flight anymore */ + + // remove it from the queue + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); + rrdeng_main.cmd_queue.unsafe.waiting--; + break; } } - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - Pvoid_t *PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, xt_io_descr->descr_array[0]->id, sizeof(uuid_t)); - struct pg_cache_page_index *page_index = likely( NULL != PValue) ? *PValue : NULL; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - - - for (i = 0, page_offset = 0; i < count; page_offset += header->descr[i++].page_length) { - uint8_t is_prefetched_page; - descr = NULL; - for (j = 0 ; j < xt_io_descr->descr_count; ++j) { - struct rrdeng_page_descr descrj; - - descrj = xt_io_descr->descr_read_array[j]; - /* care, we don't hold the descriptor mutex */ - if (!uuid_compare(*(uuid_t *) header->descr[i].uuid, *descrj.id) && - header->descr[i].page_length == descrj.page_length && - header->descr[i].start_time_ut == descrj.start_time_ut && - header->descr[i].end_time_ut == descrj.end_time_ut) { - //descr = descrj; - descr = get_descriptor(page_index, (time_t) (descrj.start_time_ut / USEC_PER_SEC)); - if (unlikely(!descr)) { - error_limit_static_thread_var(erl, 1, 0); - error_limit(&erl, "%s: Required descriptor is not in the page index anymore", __FUNCTION__); - } - break; - } - } - is_prefetched_page = 0; - if (!descr) { /* This extent page has not been requested. Try populating it for locality (best effort). */ - descr = pg_cache_lookup_unpopulated_and_lock(ctx, (uuid_t *)header->descr[i].uuid, - header->descr[i].start_time_ut); - if (!descr) - continue; /* Failed to reserve a suitable page */ - is_prefetched_page = 1; - } - page = dbengine_page_alloc(); - - /* care, we don't hold the descriptor mutex */ - if (have_read_error) { - fill_page_with_nulls(page, descr->page_length, descr->type); - } else if (RRD_NO_COMPRESSION == header->compression_algorithm) { - (void) memcpy(page, xt_io_descr->buf + payload_offset + page_offset, descr->page_length); - } else { - (void) memcpy(page, uncompressed_buf + page_offset, descr->page_length); - } - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->page = page; - pg_cache_descr->flags |= RRD_PAGE_POPULATED; - pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING; - rrdeng_page_descr_mutex_unlock(ctx, descr); - pg_cache_replaceQ_insert(ctx, descr); - if (xt_io_descr->release_descr || is_prefetched_page) { - pg_cache_put(ctx, descr); - } else { - debug(D_RRDENGINE, "%s: Waking up waiters.", __func__); - pg_cache_wake_up_waiters(ctx, descr); - } + if(cmd && cmd->dequeue_cb) { + cmd->dequeue_cb(cmd); + cmd->dequeue_cb = NULL; } - if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) { - freez(uncompressed_buf); + + netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + + struct rrdeng_cmd ret; + if(cmd) { + // copy it, to return it + ret = *cmd; + + aral_freez(rrdeng_main.cmd_queue.ar, cmd); } - if (xt_io_descr->completion) - completion_mark_complete(xt_io_descr->completion); + else + ret = (struct rrdeng_cmd) { + .ctx = NULL, + .opcode = RRDENG_OPCODE_NOOP, + .priority = STORAGE_PRIORITY_BEST_EFFORT, + .completion = NULL, + .data = NULL, + }; + + return ret; } -static void read_extent_cb(uv_fs_t *req) -{ - struct rrdengine_worker_config *wc = req->loop->data; - struct extent_io_descriptor *xt_io_descr; - xt_io_descr = req->data; - do_extent_processing(wc, xt_io_descr, req->result < 0); - uv_fs_req_cleanup(req); - posix_memfree(xt_io_descr->buf); - freez(xt_io_descr); +// ---------------------------------------------------------------------------- + +struct { + ARAL *aral[RRD_STORAGE_TIERS]; +} dbengine_page_alloc_globals = {}; + +static inline ARAL *page_size_lookup(size_t size) { + for(size_t tier = 0; tier < storage_tiers ;tier++) + if(size == tier_page_size[tier]) + return dbengine_page_alloc_globals.aral[tier]; + + return NULL; } -static void read_mmap_extent_cb(uv_work_t *req, int status __maybe_unused) -{ - struct rrdengine_worker_config *wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; - struct extent_io_descriptor *xt_io_descr; - xt_io_descr = req->data; +static void dbengine_page_alloc_init(void) { + for(size_t i = storage_tiers; i > 0 ;i--) { + size_t tier = storage_tiers - i; - if (likely(xt_io_descr->map_base)) { - do_extent_processing(wc, xt_io_descr, false); - munmap(xt_io_descr->map_base, xt_io_descr->map_length); - freez(xt_io_descr); - return; - } + char buf[20 + 1]; + snprintfz(buf, 20, "tier%zu-pages", tier); - // MMAP failed, so do uv_fs_read - int ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(xt_io_descr->bytes)); - if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + dbengine_page_alloc_globals.aral[tier] = aral_create( + buf, + tier_page_size[tier], + 64, + 512 * tier_page_size[tier], + pgc_aral_statistics(), + NULL, NULL, false, false); } - unsigned real_io_size = ALIGN_BYTES_CEILING( xt_io_descr->bytes); - xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size); - xt_io_descr->req.data = xt_io_descr; - ret = uv_fs_read(req->loop, &xt_io_descr->req, xt_io_descr->file, &xt_io_descr->iov, 1, (unsigned) xt_io_descr->pos, read_extent_cb); - fatal_assert(-1 != ret); - ctx->stats.io_read_bytes += real_io_size; - ctx->stats.io_read_extent_bytes += real_io_size; } -static void do_mmap_read_extent(uv_work_t *req) -{ - struct extent_io_descriptor *xt_io_descr = (struct extent_io_descriptor * )req->data; - struct rrdengine_worker_config *wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; - - off_t map_start = ALIGN_BYTES_FLOOR(xt_io_descr->pos); - size_t length = ALIGN_BYTES_CEILING(xt_io_descr->pos + xt_io_descr->bytes) - map_start; - unsigned real_io_size = xt_io_descr->bytes; - - void *data = mmap(NULL, length, PROT_READ, MAP_SHARED, xt_io_descr->file, map_start); - if (likely(data != MAP_FAILED)) { - xt_io_descr->map_base = data; - xt_io_descr->map_length = length; - xt_io_descr->buf = data + (xt_io_descr->pos - map_start); - ctx->stats.io_read_bytes += real_io_size; - ctx->stats.io_read_extent_bytes += real_io_size; - } +void *dbengine_page_alloc(size_t size) { + ARAL *ar = page_size_lookup(size); + if(ar) return aral_mallocz(ar); + + return mallocz(size); } -static void do_read_extent(struct rrdengine_worker_config* wc, - struct rrdeng_page_descr **descr, - unsigned count, - uint8_t release_descr) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache_descr *pg_cache_descr; - int ret; - unsigned i, size_bytes, pos; - struct extent_io_descriptor *xt_io_descr; - struct rrdengine_datafile *datafile; - struct extent_info *extent = descr[0]->extent; - uint8_t xt_is_cached = 0, xt_is_inflight = 0; - unsigned xt_idx; - - datafile = extent->datafile; - pos = extent->offset; - size_bytes = extent->size; - - xt_io_descr = callocz(1, sizeof(*xt_io_descr)); - for (i = 0 ; i < count; ++i) { - rrdeng_page_descr_mutex_lock(ctx, descr[i]); - pg_cache_descr = descr[i]->pg_cache_descr; - pg_cache_descr->flags |= RRD_PAGE_READ_PENDING; - rrdeng_page_descr_mutex_unlock(ctx, descr[i]); - xt_io_descr->descr_array[i] = descr[i]; - xt_io_descr->descr_read_array[i] = *(descr[i]); - } - xt_io_descr->descr_count = count; - xt_io_descr->file = datafile->file; - xt_io_descr->bytes = size_bytes; - xt_io_descr->pos = pos; - xt_io_descr->req_worker.data = xt_io_descr; - xt_io_descr->completion = NULL; - xt_io_descr->release_descr = release_descr; - xt_io_descr->buf = NULL; - - xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx); - if (xt_is_cached) { - xt_cache_replaceQ_set_hot(wc, &wc->xt_cache.extent_array[xt_idx]); - xt_is_inflight = check_bit(wc->xt_cache.inflight_bitmap, xt_idx); - if (xt_is_inflight) { - enqueue_inflight_read_to_xt_cache(wc, xt_idx, xt_io_descr); - return; - } - return read_cached_extent_cb(wc, xt_idx, xt_io_descr); - } else { - ret = try_insert_into_xt_cache(wc, extent); - if (-1 != ret) { - xt_idx = (unsigned)ret; - modify_bit(&wc->xt_cache.inflight_bitmap, xt_idx, 1); - wc->xt_cache.extent_array[xt_idx].inflight_io_descr = xt_io_descr; - } - } +void dbengine_page_free(void *page, size_t size __maybe_unused) { + if(unlikely(!page || page == DBENGINE_EMPTY_PAGE)) + return; + + ARAL *ar = page_size_lookup(size); + if(ar) + aral_freez(ar, page); + else + freez(page); +} - ret = uv_queue_work(wc->loop, &xt_io_descr->req_worker, do_mmap_read_extent, read_mmap_extent_cb); - fatal_assert(-1 != ret); +// ---------------------------------------------------------------------------- - ++ctx->stats.io_read_requests; - ++ctx->stats.io_read_extents; - ctx->stats.pg_cache_backfills += count; +void *dbengine_extent_alloc(size_t size) { + void *extent = mallocz(size); + return extent; } -static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent_io_descriptor *xt_io_descr) -{ - struct rrdengine_instance *ctx = wc->ctx; +void dbengine_extent_free(void *extent, size_t size __maybe_unused) { + freez(extent); +} + +static void journalfile_extent_build(struct rrdengine_instance *ctx, struct extent_io_descriptor *xt_io_descr) { unsigned count, payload_length, descr_size, size_bytes; void *buf; /* persistent structures */ @@ -582,12 +596,13 @@ static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent payload_length = sizeof(*jf_metric_data) + descr_size; size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer); - buf = wal_get_transaction_buffer(wc, size_bytes); + xt_io_descr->wal = wal_get(ctx, size_bytes); + buf = xt_io_descr->wal->buf; jf_header = buf; jf_header->type = STORE_DATA; jf_header->reserved = 0; - jf_header->id = ctx->commit_log.transaction_id++; + jf_header->id = xt_io_descr->wal->transaction_id; jf_header->payload_length = payload_length; jf_metric_data = buf + sizeof(*jf_header); @@ -602,265 +617,210 @@ static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent crc32set(jf_trailer->checksum, crc); } -static void do_commit_transaction(struct rrdengine_worker_config* wc, uint8_t type, void *data) -{ - switch (type) { - case STORE_DATA: - commit_data_extent(wc, (struct extent_io_descriptor *)data); - break; - default: - fatal_assert(type == STORE_DATA); - break; - } -} - -static void after_invalidate_oldest_committed(struct rrdengine_worker_config* wc) -{ - int error; +static void after_extent_flushed_to_open(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + if(completion) + completion_mark_complete(completion); - error = uv_thread_join(wc->now_invalidating_dirty_pages); - if (error) { - error("uv_thread_join(): %s", uv_strerror(error)); - } - freez(wc->now_invalidating_dirty_pages); - wc->now_invalidating_dirty_pages = NULL; - wc->cleanup_thread_invalidating_dirty_pages = 0; + if(ctx_is_available_for_queries(ctx)) + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); } -static void invalidate_oldest_committed(void *arg) -{ - struct rrdengine_instance *ctx = arg; - struct rrdengine_worker_config *wc = &ctx->worker_config; - struct page_cache *pg_cache = &ctx->pg_cache; - int ret; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - Pvoid_t *PValue; - Word_t Index; - unsigned nr_committed_pages; +static void *extent_flushed_to_open_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_FLUSHED_TO_OPEN); - do { - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - for (Index = 0, - PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue; + uv_fs_t *uv_fs_request = data; + struct extent_io_descriptor *xt_io_descr = uv_fs_request->data; + struct page_descr_with_data *descr; + struct rrdengine_datafile *datafile; + unsigned i; - descr != NULL; + datafile = xt_io_descr->datafile; - PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue) { - fatal_assert(0 != descr->page_length); + bool still_running = ctx_is_available_for_queries(ctx); - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING) && pg_cache_try_get_unsafe(descr, 1)) { - rrdeng_page_descr_mutex_unlock(ctx, descr); + for (i = 0 ; i < xt_io_descr->descr_count ; ++i) { + descr = xt_io_descr->descr_array[i]; - ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0); - fatal_assert(1 == ret); - break; - } - rrdeng_page_descr_mutex_unlock(ctx, descr); - } - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); + if (likely(still_running)) + pgc_open_add_hot_page( + (Word_t)ctx, descr->metric_id, + (time_t) (descr->start_time_ut / USEC_PER_SEC), + (time_t) (descr->end_time_ut / USEC_PER_SEC), + descr->update_every_s, + datafile, + xt_io_descr->pos, xt_io_descr->bytes, descr->page_length); - if (!descr) { - info("Failed to invalidate any dirty pages to relieve page cache pressure."); + page_descriptor_release(descr); + } - goto out; - } - pg_cache_punch_hole(ctx, descr, 1, 1, NULL); + uv_fs_req_cleanup(uv_fs_request); + posix_memfree(xt_io_descr->buf); + extent_io_descriptor_release(xt_io_descr); - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - nr_committed_pages = --pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); - rrd_stat_atomic_add(&ctx->stats.flushing_pressure_page_deletions, 1); - rrd_stat_atomic_add(&global_flushing_pressure_page_deletions, 1); + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.flushed_to_open_running--; + netdata_spinlock_unlock(&datafile->writers.spinlock); - } while (nr_committed_pages >= pg_cache_committed_hard_limit(ctx)); -out: - wc->cleanup_thread_invalidating_dirty_pages = 1; - /* wake up event loop */ - fatal_assert(0 == uv_async_send(&wc->async)); -} + if(datafile->fileno != ctx_last_fileno_get(ctx) && still_running) + // we just finished a flushing on a datafile that is not the active one + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); -void rrdeng_invalidate_oldest_committed(struct rrdengine_worker_config* wc) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned nr_committed_pages; - int error; + return data; +} - if (unlikely(ctx->quiesce != NO_QUIESCE)) /* Shutting down */ - return; +// Main event loop callback +static void after_extent_write_datafile_io(uv_fs_t *uv_fs_request) { + worker_is_busy(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_WRITE); - uv_rwlock_rdlock(&pg_cache->committed_page_index.lock); - nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock); + struct extent_io_descriptor *xt_io_descr = uv_fs_request->data; + struct rrdengine_datafile *datafile = xt_io_descr->datafile; + struct rrdengine_instance *ctx = datafile->ctx; - if (nr_committed_pages >= pg_cache_committed_hard_limit(ctx)) { - /* delete the oldest page in memory */ - if (wc->now_invalidating_dirty_pages) { - /* already deleting a page */ - return; - } - errno = 0; - error("Failed to flush dirty buffers quickly enough in dbengine instance \"%s\". " - "Metric data are being deleted, please reduce disk load or use a faster disk.", ctx->dbfiles_path); - - wc->now_invalidating_dirty_pages = mallocz(sizeof(*wc->now_invalidating_dirty_pages)); - wc->cleanup_thread_invalidating_dirty_pages = 0; - - error = uv_thread_create(wc->now_invalidating_dirty_pages, invalidate_oldest_committed, ctx); - if (error) { - error("uv_thread_create(): %s", uv_strerror(error)); - freez(wc->now_invalidating_dirty_pages); - wc->now_invalidating_dirty_pages = NULL; - } + if (uv_fs_request->result < 0) { + ctx_io_error(ctx); + error("DBENGINE: %s: uv_fs_write(): %s", __func__, uv_strerror((int)uv_fs_request->result)); } + + journalfile_v1_extent_write(ctx, xt_io_descr->datafile, xt_io_descr->wal, &rrdeng_main.loop); + + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.running--; + datafile->writers.flushed_to_open_running++; + netdata_spinlock_unlock(&datafile->writers.spinlock); + + rrdeng_enq_cmd(xt_io_descr->ctx, + RRDENG_OPCODE_FLUSHED_TO_OPEN, + uv_fs_request, + xt_io_descr->completion, + STORAGE_PRIORITY_INTERNAL_DBENGINE, + NULL, + NULL); + + worker_is_idle(); } -void flush_pages_cb(uv_fs_t* req) -{ - struct rrdengine_worker_config* wc = req->loop->data; - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - struct extent_io_descriptor *xt_io_descr; - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - unsigned i, count; - - xt_io_descr = req->data; - if (req->result < 0) { - ++ctx->stats.io_errors; - rrd_stat_atomic_add(&global_io_errors, 1); - error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); - } -#ifdef NETDATA_INTERNAL_CHECKS - { - struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; - debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was written to datafile %u-%u. Waking up waiters.", - __func__, xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno); - } -#endif - count = xt_io_descr->descr_count; - for (i = 0 ; i < count ; ++i) { - /* care, we don't hold the descriptor mutex */ - descr = xt_io_descr->descr_array[i]; +static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { + bool ret = false; + netdata_spinlock_lock(&datafile->writers.spinlock); - pg_cache_replaceQ_insert(ctx, descr); + if(ctx_is_available_for_queries(ctx) && datafile->pos > rrdeng_target_data_file_size(ctx)) + ret = true; - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->flags &= ~(RRD_PAGE_DIRTY | RRD_PAGE_WRITE_PENDING); - /* wake up waiters, care no reference being held */ - pg_cache_wake_up_waiters_unsafe(descr); - rrdeng_page_descr_mutex_unlock(ctx, descr); - } - if (xt_io_descr->completion) - completion_mark_complete(xt_io_descr->completion); - uv_fs_req_cleanup(req); - posix_memfree(xt_io_descr->buf); - freez(xt_io_descr); + netdata_spinlock_unlock(&datafile->writers.spinlock); + + return ret; +} - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - pg_cache->committed_page_index.nr_committed_pages -= count; - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); - wc->inflight_dirty_pages -= count; +static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_instance *ctx) { + struct rrdengine_datafile *datafile; + + // get the latest datafile + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + datafile = ctx->datafiles.first->prev; + // become a writer on this datafile, to prevent it from vanishing + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.running++; + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if(datafile_is_full(ctx, datafile)) { + // remember the datafile we have become writers to + struct rrdengine_datafile *old_datafile = datafile; + + // only 1 datafile creation at a time + static netdata_mutex_t mutex = NETDATA_MUTEX_INITIALIZER; + netdata_mutex_lock(&mutex); + + // take the latest datafile again - without this, multiple threads may create multiple files + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + datafile = ctx->datafiles.first->prev; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if(datafile_is_full(ctx, datafile) && create_new_datafile_pair(ctx) == 0) + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, + NULL); + + netdata_mutex_unlock(&mutex); + + // get the new latest datafile again, like above + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + datafile = ctx->datafiles.first->prev; + // become a writer on this datafile, to prevent it from vanishing + netdata_spinlock_lock(&datafile->writers.spinlock); + datafile->writers.running++; + netdata_spinlock_unlock(&datafile->writers.spinlock); + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + // release the writers on the old datafile + netdata_spinlock_lock(&old_datafile->writers.spinlock); + old_datafile->writers.running--; + netdata_spinlock_unlock(&old_datafile->writers.spinlock); + } + + return datafile; } /* - * completion must be NULL or valid. - * Returns 0 when no flushing can take place. - * Returns datafile bytes to be written on successful flushing initiation. + * Take a page list in a judy array and write them */ -static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct completion *completion) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; +static struct extent_io_descriptor *datafile_extent_build(struct rrdengine_instance *ctx, struct page_descr_with_data *base, struct completion *completion) { int ret; int compressed_size, max_compressed_size = 0; unsigned i, count, size_bytes, pos, real_io_size; uint32_t uncompressed_payload_length, payload_offset; - struct rrdeng_page_descr *descr, *eligible_pages[MAX_PAGES_PER_EXTENT]; - struct page_cache_descr *pg_cache_descr; + struct page_descr_with_data *descr, *eligible_pages[MAX_PAGES_PER_EXTENT]; struct extent_io_descriptor *xt_io_descr; + struct extent_buffer *eb = NULL; void *compressed_buf = NULL; - Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT]; - Pvoid_t *PValue; Word_t Index; - uint8_t compression_algorithm = ctx->global_compress_alg; - struct extent_info *extent; + uint8_t compression_algorithm = ctx->config.global_compress_alg; struct rrdengine_datafile *datafile; /* persistent structures */ struct rrdeng_df_extent_header *header; struct rrdeng_df_extent_trailer *trailer; uLong crc; - if (force) { - debug(D_RRDENGINE, "Asynchronous flushing of extent has been forced by page pressure."); - } - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - for (Index = 0, count = 0, uncompressed_payload_length = 0, - PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue ; - - descr != NULL && count != rrdeng_pages_per_extent; - - PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0), - descr = unlikely(NULL == PValue) ? NULL : *PValue) { - uint8_t page_write_pending; - - fatal_assert(0 != descr->page_length); - page_write_pending = 0; - - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING)) { - page_write_pending = 1; - /* care, no reference being held */ - pg_cache_descr->flags |= RRD_PAGE_WRITE_PENDING; - uncompressed_payload_length += descr->page_length; - descr_commit_idx_array[count] = Index; - eligible_pages[count++] = descr; - } - rrdeng_page_descr_mutex_unlock(ctx, descr); + for(descr = base, Index = 0, count = 0, uncompressed_payload_length = 0; + descr && count != rrdeng_pages_per_extent; + descr = descr->link.next, Index++) { + + uncompressed_payload_length += descr->page_length; + eligible_pages[count++] = descr; - if (page_write_pending) { - ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0); - fatal_assert(1 == ret); - } } - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); if (!count) { - debug(D_RRDENGINE, "%s: no pages eligible for flushing.", __func__); if (completion) completion_mark_complete(completion); - return 0; + + __atomic_sub_fetch(&ctx->atomic.extents_currently_being_flushed, 1, __ATOMIC_RELAXED); + return NULL; } - wc->inflight_dirty_pages += count; - xt_io_descr = mallocz(sizeof(*xt_io_descr)); + xt_io_descr = extent_io_descriptor_get(); + xt_io_descr->ctx = ctx; payload_offset = sizeof(*header) + count * sizeof(header->descr[0]); switch (compression_algorithm) { - case RRD_NO_COMPRESSION: - size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer); - break; - default: /* Compress */ - fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE); - max_compressed_size = LZ4_compressBound(uncompressed_payload_length); - compressed_buf = mallocz(max_compressed_size); - size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer); - break; + case RRD_NO_COMPRESSION: + size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer); + break; + + default: /* Compress */ + fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE); + max_compressed_size = LZ4_compressBound(uncompressed_payload_length); + eb = extent_buffer_get(max_compressed_size); + compressed_buf = eb->data; + size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer); + break; } + ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes)); if (unlikely(ret)) { - fatal("posix_memalign:%s", strerror(ret)); + fatal("DBENGINE: posix_memalign:%s", strerror(ret)); /* freez(xt_io_descr);*/ } memset(xt_io_descr->buf, 0, ALIGN_BYTES_CEILING(size_bytes)); - (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_descr *) * count); + (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct page_descr_with_data *) * count); xt_io_descr->descr_count = count; pos = 0; @@ -869,17 +829,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct header->number_of_pages = count; pos += sizeof(*header); - extent = mallocz(sizeof(*extent) + count * sizeof(extent->pages[0])); - datafile = ctx->datafiles.last; /* TODO: check for exceeded size quota */ - extent->offset = datafile->pos; - extent->number_of_pages = count; - extent->datafile = datafile; - extent->next = NULL; - for (i = 0 ; i < count ; ++i) { - /* This is here for performance reasons */ - xt_io_descr->descr_commit_idx_array[i] = descr_commit_idx_array[i]; - descr = xt_io_descr->descr_array[i]; header->descr[i].type = descr->type; uuid_copy(*(uuid_t *)header->descr[i].uuid, *descr->id); @@ -890,35 +840,40 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct } for (i = 0 ; i < count ; ++i) { descr = xt_io_descr->descr_array[i]; - /* care, we don't hold the descriptor mutex */ - (void) memcpy(xt_io_descr->buf + pos, descr->pg_cache_descr->page, descr->page_length); - descr->extent = extent; - extent->pages[i] = descr; - + (void) memcpy(xt_io_descr->buf + pos, descr->page, descr->page_length); pos += descr->page_length; } - df_extent_insert(extent); - switch (compression_algorithm) { - case RRD_NO_COMPRESSION: - header->payload_length = uncompressed_payload_length; - break; - default: /* Compress */ - compressed_size = LZ4_compress_default(xt_io_descr->buf + payload_offset, compressed_buf, - uncompressed_payload_length, max_compressed_size); - ctx->stats.before_compress_bytes += uncompressed_payload_length; - ctx->stats.after_compress_bytes += compressed_size; - debug(D_RRDENGINE, "LZ4 compressed %"PRIu32" bytes to %d bytes.", uncompressed_payload_length, compressed_size); + if(likely(compression_algorithm == RRD_LZ4)) { + compressed_size = LZ4_compress_default( + xt_io_descr->buf + payload_offset, + compressed_buf, + (int)uncompressed_payload_length, + max_compressed_size); + + __atomic_add_fetch(&ctx->stats.before_compress_bytes, uncompressed_payload_length, __ATOMIC_RELAXED); + __atomic_add_fetch(&ctx->stats.after_compress_bytes, compressed_size, __ATOMIC_RELAXED); + (void) memcpy(xt_io_descr->buf + payload_offset, compressed_buf, compressed_size); - freez(compressed_buf); + extent_buffer_release(eb); size_bytes = payload_offset + compressed_size + sizeof(*trailer); header->payload_length = compressed_size; - break; } - extent->size = size_bytes; - xt_io_descr->bytes = size_bytes; + else { // RRD_NO_COMPRESSION + header->payload_length = uncompressed_payload_length; + } + + real_io_size = ALIGN_BYTES_CEILING(size_bytes); + + datafile = get_datafile_to_write_extent(ctx); + netdata_spinlock_lock(&datafile->writers.spinlock); + xt_io_descr->datafile = datafile; xt_io_descr->pos = datafile->pos; - xt_io_descr->req.data = xt_io_descr; + datafile->pos += real_io_size; + netdata_spinlock_unlock(&datafile->writers.spinlock); + + xt_io_descr->bytes = size_bytes; + xt_io_descr->uv_fs_request.data = xt_io_descr; xt_io_descr->completion = completion; trailer = xt_io_descr->buf + size_bytes - sizeof(*trailer); @@ -926,324 +881,508 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct crc = crc32(crc, xt_io_descr->buf, size_bytes - sizeof(*trailer)); crc32set(trailer->checksum, crc); - real_io_size = ALIGN_BYTES_CEILING(size_bytes); xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size); - ret = uv_fs_write(wc->loop, &xt_io_descr->req, datafile->file, &xt_io_descr->iov, 1, datafile->pos, flush_pages_cb); - fatal_assert(-1 != ret); - ctx->stats.io_write_bytes += real_io_size; - ++ctx->stats.io_write_requests; - ctx->stats.io_write_extent_bytes += real_io_size; - ++ctx->stats.io_write_extents; - do_commit_transaction(wc, STORE_DATA, xt_io_descr); - datafile->pos += ALIGN_BYTES_CEILING(size_bytes); - ctx->disk_space += ALIGN_BYTES_CEILING(size_bytes); - rrdeng_test_quota(wc); + journalfile_extent_build(ctx, xt_io_descr); - return ALIGN_BYTES_CEILING(size_bytes); + ctx_last_flush_fileno_set(ctx, datafile->fileno); + ctx_current_disk_space_increase(ctx, real_io_size); + ctx_io_write_op_bytes(ctx, real_io_size); + + return xt_io_descr; } -static void after_delete_old_data(struct rrdengine_worker_config* wc) -{ - struct rrdengine_instance *ctx = wc->ctx; - struct rrdengine_datafile *datafile; - struct rrdengine_journalfile *journalfile; - unsigned deleted_bytes, journalfile_bytes, datafile_bytes; - int ret, error; - char path[RRDENG_PATH_MAX]; +static void after_extent_write(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* uv_work_req __maybe_unused, int status __maybe_unused) { + struct extent_io_descriptor *xt_io_descr = data; - datafile = ctx->datafiles.first; - journalfile = datafile->journalfile; - datafile_bytes = datafile->pos; - journalfile_bytes = journalfile->pos; - deleted_bytes = 0; + if(xt_io_descr) { + int ret = uv_fs_write(&rrdeng_main.loop, + &xt_io_descr->uv_fs_request, + xt_io_descr->datafile->file, + &xt_io_descr->iov, + 1, + (int64_t) xt_io_descr->pos, + after_extent_write_datafile_io); - info("Deleting data and journal file pair."); - datafile_list_delete(ctx, datafile); - ret = destroy_journal_file(journalfile, datafile); - if (!ret) { - generate_journalfilepath(datafile, path, sizeof(path)); - info("Deleted journal file \"%s\".", path); - deleted_bytes += journalfile_bytes; - } - ret = destroy_data_file(datafile); - if (!ret) { - generate_datafilepath(datafile, path, sizeof(path)); - info("Deleted data file \"%s\".", path); - deleted_bytes += datafile_bytes; + fatal_assert(-1 != ret); } - freez(journalfile); - freez(datafile); +} - ctx->disk_space -= deleted_bytes; - info("Reclaimed %u bytes of disk space.", deleted_bytes); +static void *extent_write_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_WRITE); + struct page_descr_with_data *base = data; + struct extent_io_descriptor *xt_io_descr = datafile_extent_build(ctx, base, completion); + return xt_io_descr; +} - error = uv_thread_join(wc->now_deleting_files); - if (error) { - error("uv_thread_join(): %s", uv_strerror(error)); - } - freez(wc->now_deleting_files); - /* unfreeze command processing */ - wc->now_deleting_files = NULL; +static void after_database_rotate(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + __atomic_store_n(&ctx->atomic.now_deleting_files, false, __ATOMIC_RELAXED); +} - wc->cleanup_thread_deleting_files = 0; - rrdcontext_db_rotation(); +struct uuid_first_time_s { + uuid_t *uuid; + time_t first_time_s; + METRIC *metric; + size_t pages_found; + size_t df_matched; + size_t df_index_oldest; +}; - /* interrupt event loop */ - uv_stop(wc->loop); +static int journal_metric_compare(const void *key, const void *metric) +{ + return uuid_compare(*(uuid_t *) key, ((struct journal_metric_list *) metric)->uuid); } -static void delete_old_data(void *arg) -{ - struct rrdengine_instance *ctx = arg; - struct rrdengine_worker_config* wc = &ctx->worker_config; - struct rrdengine_datafile *datafile; - struct extent_info *extent, *next; - struct rrdeng_page_descr *descr; - unsigned count, i; - uint8_t can_delete_metric; - uuid_t metric_id; - - /* Safe to use since it will be deleted after we are done */ - datafile = ctx->datafiles.first; - - for (extent = datafile->extents.first ; extent != NULL ; extent = next) { - count = extent->number_of_pages; - for (i = 0 ; i < count ; ++i) { - descr = extent->pages[i]; - can_delete_metric = pg_cache_punch_hole(ctx, descr, 0, 0, &metric_id); - if (unlikely(can_delete_metric)) { - /* - * If the metric is empty, has no active writers and if the metadata log has been initialized then - * attempt to delete the corresponding netdata dimension. - */ - metaqueue_delete_dimension_uuid(&metric_id); - } - } - next = extent->next; - freez(extent); - } - wc->cleanup_thread_deleting_files = 1; - /* wake up event loop */ - fatal_assert(0 == uv_async_send(&wc->async)); +struct rrdengine_datafile *datafile_release_and_acquire_next_for_retention(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { + + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + + struct rrdengine_datafile *next_datafile = datafile->next; + + while(next_datafile && !datafile_acquire(next_datafile, DATAFILE_ACQUIRE_RETENTION)) + next_datafile = next_datafile->next; + + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + datafile_release(datafile, DATAFILE_ACQUIRE_RETENTION); + + return next_datafile; } -void rrdeng_test_quota(struct rrdengine_worker_config* wc) +void find_uuid_first_time( + struct rrdengine_instance *ctx, + struct rrdengine_datafile *datafile, + struct uuid_first_time_s *uuid_first_entry_list, + size_t count) { - struct rrdengine_instance *ctx = wc->ctx; - struct rrdengine_datafile *datafile; - unsigned current_size, target_size; - uint8_t out_of_space, only_one_datafile; - int ret, error; - - out_of_space = 0; - /* Do not allow the pinned pages to exceed the disk space quota to avoid deadlocks */ - if (unlikely(ctx->disk_space > MAX(ctx->max_disk_space, 2 * ctx->metric_API_max_producers * RRDENG_BLOCK_SIZE))) { - out_of_space = 1; - } - datafile = ctx->datafiles.last; - current_size = datafile->pos; - target_size = ctx->max_disk_space / TARGET_DATAFILES; - target_size = MIN(target_size, MAX_DATAFILE_SIZE); - target_size = MAX(target_size, MIN_DATAFILE_SIZE); - only_one_datafile = (datafile == ctx->datafiles.first) ? 1 : 0; - if (unlikely(current_size >= target_size || (out_of_space && only_one_datafile))) { - /* Finalize data and journal file and create a new pair */ - wal_flush_transaction_buffer(wc); - ret = create_new_datafile_pair(ctx, 1, ctx->last_fileno + 1); - if (likely(!ret)) { - ++ctx->last_fileno; + // acquire the datafile to work with it + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + while(datafile && !datafile_acquire(datafile, DATAFILE_ACQUIRE_RETENTION)) + datafile = datafile->next; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if (unlikely(!datafile)) + return; + + unsigned journalfile_count = 0; + size_t binary_match = 0; + size_t not_matching_bsearches = 0; + + while (datafile) { + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0); + if (!j2_header) { + datafile = datafile_release_and_acquire_next_for_retention(ctx, datafile); + continue; + } + + time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); + struct uuid_first_time_s *uuid_original_entry; + + size_t journal_metric_count = j2_header->metric_count; + + for (size_t index = 0; index < count; ++index) { + uuid_original_entry = &uuid_first_entry_list[index]; + + // Check here if we should skip this + if (uuid_original_entry->df_matched > 3 || uuid_original_entry->pages_found > 5) + continue; + + struct journal_metric_list *live_entry = bsearch(uuid_original_entry->uuid,uuid_list,journal_metric_count,sizeof(*uuid_list), journal_metric_compare); + if (!live_entry) { + // Not found in this journal + not_matching_bsearches++; + continue; + } + + uuid_original_entry->pages_found += live_entry->entries; + uuid_original_entry->df_matched++; + + time_t old_first_time_s = uuid_original_entry->first_time_s; + + // Calculate first / last for this match + time_t first_time_s = live_entry->delta_start_s + journal_start_time_s; + uuid_original_entry->first_time_s = MIN(uuid_original_entry->first_time_s, first_time_s); + + if (uuid_original_entry->first_time_s != old_first_time_s) + uuid_original_entry->df_index_oldest = uuid_original_entry->df_matched; + + binary_match++; } + + journalfile_count++; + journalfile_v2_data_release(datafile->journalfile); + datafile = datafile_release_and_acquire_next_for_retention(ctx, datafile); } - if (unlikely(out_of_space && NO_QUIESCE == ctx->quiesce)) { - /* delete old data */ - if (wc->now_deleting_files) { - /* already deleting data */ - return; + + // Let's scan the open cache for almost exact match + size_t open_cache_count = 0; + + size_t df_index[10] = { 0 }; + size_t without_metric = 0; + size_t open_cache_gave_first_time_s = 0; + size_t metric_count = 0; + size_t without_retention = 0; + size_t not_needed_bsearches = 0; + + for (size_t index = 0; index < count; ++index) { + struct uuid_first_time_s *uuid_first_t_entry = &uuid_first_entry_list[index]; + + metric_count++; + + size_t idx = uuid_first_t_entry->df_index_oldest; + if(idx >= 10) + idx = 9; + + df_index[idx]++; + + not_needed_bsearches += uuid_first_t_entry->df_matched - uuid_first_t_entry->df_index_oldest; + + if (unlikely(!uuid_first_t_entry->metric)) { + without_metric++; + continue; } - if (NULL == ctx->datafiles.first->next) { - error("Cannot delete data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\"" - " to reclaim space, there are no other file pairs left.", - ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); - return; + + PGC_PAGE *page = pgc_page_get_and_acquire( + open_cache, (Word_t)ctx, + (Word_t)uuid_first_t_entry->metric, 0, + PGC_SEARCH_FIRST); + + if (page) { + time_t old_first_time_s = uuid_first_t_entry->first_time_s; + + time_t first_time_s = pgc_page_start_time_s(page); + uuid_first_t_entry->first_time_s = MIN(uuid_first_t_entry->first_time_s, first_time_s); + pgc_page_release(open_cache, page); + open_cache_count++; + + if(uuid_first_t_entry->first_time_s != old_first_time_s) { + open_cache_gave_first_time_s++; + } } - info("Deleting data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".", - ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); - wc->now_deleting_files = mallocz(sizeof(*wc->now_deleting_files)); - wc->cleanup_thread_deleting_files = 0; - - error = uv_thread_create(wc->now_deleting_files, delete_old_data, ctx); - if (error) { - error("uv_thread_create(): %s", uv_strerror(error)); - freez(wc->now_deleting_files); - wc->now_deleting_files = NULL; + else { + if(!uuid_first_t_entry->df_index_oldest) + without_retention++; } } + internal_error(true, + "DBENGINE: analyzed the retention of %zu rotated metrics of tier %d, " + "did %zu jv2 matching binary searches (%zu not matching, %zu overflown) in %u journal files, " + "%zu metrics with entries in open cache, " + "metrics first time found per datafile index ([not in jv2]:%zu, [1]:%zu, [2]:%zu, [3]:%zu, [4]:%zu, [5]:%zu, [6]:%zu, [7]:%zu, [8]:%zu, [bigger]: %zu), " + "open cache found first time %zu, " + "metrics without any remaining retention %zu, " + "metrics not in MRG %zu", + metric_count, + ctx->config.tier, + binary_match, + not_matching_bsearches, + not_needed_bsearches, + journalfile_count, + open_cache_count, + df_index[0], df_index[1], df_index[2], df_index[3], df_index[4], df_index[5], df_index[6], df_index[7], df_index[8], df_index[9], + open_cache_gave_first_time_s, + without_retention, + without_metric + ); } -static inline int rrdeng_threads_alive(struct rrdengine_worker_config* wc) -{ - if (wc->now_invalidating_dirty_pages || wc->now_deleting_files) { - return 1; +static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile_to_delete, struct rrdengine_datafile *first_datafile_remaining, bool worker) { + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.metrics_retention_started, 1, __ATOMIC_RELAXED); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_FIND_ROTATED_METRICS); + + struct rrdengine_journalfile *journalfile = datafile_to_delete->journalfile; + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, NULL, 0, 0); + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); + + size_t count = j2_header->metric_count; + struct uuid_first_time_s *uuid_first_t_entry; + struct uuid_first_time_s *uuid_first_entry_list = callocz(count, sizeof(struct uuid_first_time_s)); + + size_t added = 0; + for (size_t index = 0; index < count; ++index) { + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, &uuid_list[index].uuid, (Word_t) ctx); + if (!metric) + continue; + + uuid_first_entry_list[added].metric = metric; + uuid_first_entry_list[added].first_time_s = LONG_MAX; + uuid_first_entry_list[added].df_matched = 0; + uuid_first_entry_list[added].df_index_oldest = 0; + uuid_first_entry_list[added].uuid = mrg_metric_uuid(main_mrg, metric); + added++; } - return 0; + + info("DBENGINE: recalculating tier %d retention for %zu metrics starting with datafile %u", + ctx->config.tier, count, first_datafile_remaining->fileno); + + journalfile_v2_data_release(journalfile); + + // Update the first time / last time for all metrics we plan to delete + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_FIND_REMAINING_RETENTION); + + find_uuid_first_time(ctx, first_datafile_remaining, uuid_first_entry_list, added); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG); + + info("DBENGINE: updating tier %d metrics registry retention for %zu metrics", + ctx->config.tier, added); + + size_t deleted_metrics = 0, zero_retention_referenced = 0, zero_disk_retention = 0, zero_disk_but_live = 0; + for (size_t index = 0; index < added; ++index) { + uuid_first_t_entry = &uuid_first_entry_list[index]; + if (likely(uuid_first_t_entry->first_time_s != LONG_MAX)) { + mrg_metric_set_first_time_s_if_bigger(main_mrg, uuid_first_t_entry->metric, uuid_first_t_entry->first_time_s); + mrg_metric_release(main_mrg, uuid_first_t_entry->metric); + } + else { + zero_disk_retention++; + + // there is no retention for this metric + bool has_retention = mrg_metric_zero_disk_retention(main_mrg, uuid_first_t_entry->metric); + if (!has_retention) { + bool deleted = mrg_metric_release_and_delete(main_mrg, uuid_first_t_entry->metric); + if(deleted) + deleted_metrics++; + else + zero_retention_referenced++; + } + else { + zero_disk_but_live++; + mrg_metric_release(main_mrg, uuid_first_t_entry->metric); + } + } + } + freez(uuid_first_entry_list); + + internal_error(zero_disk_retention, + "DBENGINE: deleted %zu metrics, zero retention but referenced %zu (out of %zu total, of which %zu have main cache retention) zero on-disk retention tier %d metrics from metrics registry", + deleted_metrics, zero_retention_referenced, zero_disk_retention, zero_disk_but_live, ctx->config.tier); + + if(worker) + worker_is_idle(); } -static void rrdeng_cleanup_finished_threads(struct rrdengine_worker_config* wc) -{ - struct rrdengine_instance *ctx = wc->ctx; +void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker) { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT); + + bool datafile_got_for_deletion = datafile_acquire_for_deletion(datafile); - if (unlikely(wc->cleanup_thread_invalidating_dirty_pages)) { - after_invalidate_oldest_committed(wc); + if (update_retention) + update_metrics_first_time_s(ctx, datafile, datafile->next, worker); + + while (!datafile_got_for_deletion) { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE_WAIT); + + datafile_got_for_deletion = datafile_acquire_for_deletion(datafile); + + if (!datafile_got_for_deletion) { + info("DBENGINE: waiting for data file '%s/" + DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION + "' to be available for deletion, " + "it is in use currently by %u users.", + ctx->config.dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno, datafile->users.lockers); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_spin, 1, __ATOMIC_RELAXED); + sleep_usec(1 * USEC_PER_SEC); + } } - if (unlikely(wc->cleanup_thread_deleting_files)) { - after_delete_old_data(wc); + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_started, 1, __ATOMIC_RELAXED); + info("DBENGINE: deleting data file '%s/" + DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION + "'.", + ctx->config.dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); + + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_DATAFILE_DELETE); + + struct rrdengine_journalfile *journal_file; + unsigned deleted_bytes, journal_file_bytes, datafile_bytes; + int ret; + char path[RRDENG_PATH_MAX]; + + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + datafile_list_delete_unsafe(ctx, datafile); + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + + journal_file = datafile->journalfile; + datafile_bytes = datafile->pos; + journal_file_bytes = journalfile_current_size(journal_file); + deleted_bytes = journalfile_v2_data_size_get(journal_file); + + info("DBENGINE: deleting data and journal files to maintain disk quota"); + ret = journalfile_destroy_unsafe(journal_file, datafile); + if (!ret) { + journalfile_v1_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: deleted journal file \"%s\".", path); + journalfile_v2_generate_path(datafile, path, sizeof(path)); + info("DBENGINE: deleted journal file \"%s\".", path); + deleted_bytes += journal_file_bytes; } - if (unlikely(SET_QUIESCE == ctx->quiesce && !rrdeng_threads_alive(wc))) { - ctx->quiesce = QUIESCED; - completion_mark_complete(&ctx->rrdengine_completion); + ret = destroy_data_file_unsafe(datafile); + if (!ret) { + generate_datafilepath(datafile, path, sizeof(path)); + info("DBENGINE: deleted data file \"%s\".", path); + deleted_bytes += datafile_bytes; } + freez(journal_file); + freez(datafile); + + ctx_current_disk_space_decrease(ctx, deleted_bytes); + info("DBENGINE: reclaimed %u bytes of disk space.", deleted_bytes); } -/* return 0 on success */ -int init_rrd_files(struct rrdengine_instance *ctx) -{ - int ret = init_data_files(ctx); - - BUFFER *wb = buffer_create(1000); - size_t all_errors = 0; - usec_t now = now_realtime_usec(); - - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter) { - buffer_sprintf(wb, "%s%zu pages had start time > end time (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_FLIPPED_TIME].counter; - } +static void *database_rotate_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + datafile_delete(ctx, ctx->datafiles.first, ctx_is_available_for_queries(ctx), true); - if(ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter) { - buffer_sprintf(wb, "%s%zu pages had start time = end time with more than 1 entries (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_EQUAL_TIME].counter; - } + if (rrdeng_ctx_exceeded_disk_quota(ctx)) + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); - if(ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter) { - buffer_sprintf(wb, "%s%zu pages had zero points (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_ZERO_ENTRIES].counter; - } + rrdcontext_db_rotation(); - if(ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter) { - buffer_sprintf(wb, "%s%zu pages had update every == 0 with entries > 1 (latest: %llu secs ago)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_UPDATE_ZERO].counter; - } + return data; +} - if(ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter) { - buffer_sprintf(wb, "%s%zu pages had a different number of points compared to their timestamps (latest: %llu secs ago; these page have been loaded)" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter - , (now - ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].latest_end_time_ut) / USEC_PER_SEC - ); - all_errors += ctx->load_errors[LOAD_ERRORS_PAGE_FLEXY_TIME].counter; - } +static void after_flush_all_hot_and_dirty_pages_of_section(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void *flush_all_hot_and_dirty_pages_of_section_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_QUIESCE); + pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx); + completion_mark_complete(&ctx->quiesce.completion); + return data; +} - if(ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter) { - buffer_sprintf(wb, "%s%zu extents have been dropped because they didn't have any valid pages" - , (all_errors)?", ":"" - , ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter - ); - all_errors += ctx->load_errors[LOAD_ERRORS_DROPPED_EXTENT].counter; +static void after_populate_mrg(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void *populate_mrg_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG); + + do { + struct rrdengine_datafile *datafile = NULL; + + // find a datafile to work + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) { + if(!netdata_spinlock_trylock(&datafile->populate_mrg.spinlock)) + continue; + + if(datafile->populate_mrg.populated) { + netdata_spinlock_unlock(&datafile->populate_mrg.spinlock); + continue; + } + + // we have the spinlock and it is not populated + break; + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + if(!datafile) + break; + + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + netdata_spinlock_unlock(&datafile->populate_mrg.spinlock); + + } while(1); + + completion_mark_complete(completion); + + return data; +} + +static void after_ctx_shutdown(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void *ctx_shutdown_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_SHUTDOWN); + + completion_wait_for(&ctx->quiesce.completion); + completion_destroy(&ctx->quiesce.completion); + + bool logged = false; + while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED) || + __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED)) { + if(!logged) { + logged = true; + info("DBENGINE: waiting for %zu inflight queries to finish to shutdown tier %d...", + __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED), + (ctx->config.legacy) ? -1 : ctx->config.tier); + } + sleep_usec(1 * USEC_PER_MS); } - if(all_errors) - info("DBENGINE: tier %d: %s", ctx->tier, buffer_tostring(wb)); + completion_mark_complete(completion); - buffer_free(wb); - return ret; + return data; } -void finalize_rrd_files(struct rrdengine_instance *ctx) -{ - return finalize_data_files(ctx); +static void *cache_flush_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + if (!main_cache) + return data; + + worker_is_busy(UV_EVENT_DBENGINE_FLUSH_MAIN_CACHE); + pgc_flush_pages(main_cache, 0); + + return data; } -void rrdeng_init_cmd_queue(struct rrdengine_worker_config* wc) -{ - wc->cmd_queue.head = wc->cmd_queue.tail = 0; - wc->queue_size = 0; - fatal_assert(0 == uv_cond_init(&wc->cmd_cond)); - fatal_assert(0 == uv_mutex_init(&wc->cmd_mutex)); +static void *cache_evict_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) { + if (!main_cache) + return data; + + worker_is_busy(UV_EVENT_DBENGINE_EVICT_MAIN_CACHE); + pgc_evict_pages(main_cache, 0, 0); + + return data; } -void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd) -{ - unsigned queue_size; +static void after_prep_query(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} - /* wait for free space in queue */ - uv_mutex_lock(&wc->cmd_mutex); - while ((queue_size = wc->queue_size) == RRDENG_CMD_Q_MAX_SIZE) { - uv_cond_wait(&wc->cmd_cond, &wc->cmd_mutex); - } - fatal_assert(queue_size < RRDENG_CMD_Q_MAX_SIZE); - /* enqueue command */ - wc->cmd_queue.cmd_array[wc->cmd_queue.tail] = *cmd; - wc->cmd_queue.tail = wc->cmd_queue.tail != RRDENG_CMD_Q_MAX_SIZE - 1 ? - wc->cmd_queue.tail + 1 : 0; - wc->queue_size = queue_size + 1; - uv_mutex_unlock(&wc->cmd_mutex); +static void *query_prep_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_QUERY); + PDC *pdc = data; + rrdeng_prep_query(pdc); + return data; +} - /* wake up event loop */ - fatal_assert(0 == uv_async_send(&wc->async)); +unsigned rrdeng_target_data_file_size(struct rrdengine_instance *ctx) { + unsigned target_size = ctx->config.max_disk_space / TARGET_DATAFILES; + target_size = MIN(target_size, MAX_DATAFILE_SIZE); + target_size = MAX(target_size, MIN_DATAFILE_SIZE); + return target_size; } -struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc) +bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx) { - struct rrdeng_cmd ret; - unsigned queue_size; - - uv_mutex_lock(&wc->cmd_mutex); - queue_size = wc->queue_size; - if (queue_size == 0) { - ret.opcode = RRDENG_NOOP; - } else { - /* dequeue command */ - ret = wc->cmd_queue.cmd_array[wc->cmd_queue.head]; - if (queue_size == 1) { - wc->cmd_queue.head = wc->cmd_queue.tail = 0; - } else { - wc->cmd_queue.head = wc->cmd_queue.head != RRDENG_CMD_Q_MAX_SIZE - 1 ? - wc->cmd_queue.head + 1 : 0; - } - wc->queue_size = queue_size - 1; + uint64_t estimated_disk_space = ctx_current_disk_space_get(ctx) + rrdeng_target_data_file_size(ctx) - + (ctx->datafiles.first->prev ? ctx->datafiles.first->prev->pos : 0); - /* wake up producers */ - uv_cond_signal(&wc->cmd_cond); - } - uv_mutex_unlock(&wc->cmd_mutex); + return estimated_disk_space > ctx->config.max_disk_space; +} - return ret; +/* return 0 on success */ +int init_rrd_files(struct rrdengine_instance *ctx) +{ + return init_data_files(ctx); } -static void load_configuration_dynamic(void) +void finalize_rrd_files(struct rrdengine_instance *ctx) { - unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", MAX_PAGES_PER_EXTENT); - if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) - rrdeng_pages_per_extent = read_num; - else { - error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent); - config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent); - } + return finalize_data_files(ctx); } void async_cb(uv_async_t *handle) @@ -1253,256 +1392,413 @@ void async_cb(uv_async_t *handle) debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle)); } -/* Flushes dirty pages when timer expires */ #define TIMER_PERIOD_MS (1000) -void timer_cb(uv_timer_t* handle) -{ - worker_is_busy(RRDENG_MAX_OPCODE + 1); - struct rrdengine_worker_config* wc = handle->data; - struct rrdengine_instance *ctx = wc->ctx; +static void *extent_read_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + EPDL *epdl = data; + epdl_find_extent_and_populate_pages(ctx, epdl, true); + return data; +} - uv_stop(handle->loop); - uv_update_time(handle->loop); - rrdeng_test_quota(wc); - debug(D_RRDENGINE, "%s: timeout reached.", __func__); - if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) { - /* There is free space so we can write to disk and we are not actively deleting dirty buffers */ - struct page_cache *pg_cache = &ctx->pg_cache; - unsigned long total_bytes, bytes_written, nr_committed_pages, bytes_to_write = 0, producers, low_watermark, - high_watermark; - - uv_rwlock_rdlock(&pg_cache->committed_page_index.lock); - nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock); - producers = ctx->metric_API_max_producers; - /* are flushable pages more than 25% of the maximum page cache size */ - high_watermark = (ctx->max_cache_pages * 25LLU) / 100; - low_watermark = (ctx->max_cache_pages * 5LLU) / 100; /* 5%, must be smaller than high_watermark */ - - /* Flush more pages only if disk can keep up */ - if (wc->inflight_dirty_pages < high_watermark + producers) { - if (nr_committed_pages > producers && - /* committed to be written pages are more than the produced number */ - nr_committed_pages - producers > high_watermark) { - /* Flushing speed must increase to stop page cache from filling with dirty pages */ - bytes_to_write = (nr_committed_pages - producers - low_watermark) * RRDENG_BLOCK_SIZE; - } - bytes_to_write = MAX(DATAFILE_IDEAL_IO_SIZE, bytes_to_write); +static void epdl_populate_pages_asynchronously(struct rrdengine_instance *ctx, EPDL *epdl, STORAGE_PRIORITY priority) { + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_EXTENT_READ, epdl, NULL, priority, + rrdeng_enqueue_epdl_cmd, rrdeng_dequeue_epdl_cmd); +} - debug(D_RRDENGINE, "Flushing pages to disk."); - for (total_bytes = bytes_written = do_flush_pages(wc, 0, NULL); - bytes_written && (total_bytes < bytes_to_write); - total_bytes += bytes_written) { - bytes_written = do_flush_pages(wc, 0, NULL); - } +void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc) { + pdc_to_epdl_router(ctx, pdc, epdl_populate_pages_asynchronously, epdl_populate_pages_asynchronously); +} + +void epdl_populate_pages_synchronously(struct rrdengine_instance *ctx, EPDL *epdl, enum storage_priority priority __maybe_unused) { + epdl_find_extent_and_populate_pages(ctx, epdl, false); +} + +void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc) { + pdc_to_epdl_router(ctx, pdc, epdl_populate_pages_synchronously, epdl_populate_pages_synchronously); +} + +#define MAX_RETRIES_TO_START_INDEX (100) +static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + unsigned count = 0; + worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX_WAIT); + + while (__atomic_load_n(&ctx->atomic.now_deleting_files, __ATOMIC_RELAXED) && count++ < MAX_RETRIES_TO_START_INDEX) + sleep_usec(100 * USEC_PER_MS); + + if (count == MAX_RETRIES_TO_START_INDEX) { + worker_is_idle(); + return data; + } + + struct rrdengine_datafile *datafile = ctx->datafiles.first; + worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX); + count = 0; + while (datafile && datafile->fileno != ctx_last_fileno_get(ctx) && datafile->fileno != ctx_last_flush_fileno_get(ctx)) { + + netdata_spinlock_lock(&datafile->writers.spinlock); + bool available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? false : true; + netdata_spinlock_unlock(&datafile->writers.spinlock); + + if(!available) + continue; + + if (unlikely(!journalfile_v2_data_available(datafile->journalfile))) { + info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); + pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, + journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); + count++; } + + datafile = datafile->next; + + if (unlikely(!ctx_is_available_for_queries(ctx))) + break; } - load_configuration_dynamic(); -#ifdef NETDATA_INTERNAL_CHECKS + + errno = 0; + internal_error(count, "DBENGINE: journal indexing done; %u files processed", count); + + worker_is_idle(); + + return data; +} + +static void after_do_cache_flush(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + rrdeng_main.flushes_running--; +} + +static void after_do_cache_evict(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + rrdeng_main.evictions_running--; +} + +static void after_extent_read(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + ; +} + +static void after_journal_v2_indexing(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + __atomic_store_n(&ctx->atomic.migration_to_v2_running, false, __ATOMIC_RELAXED); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); +} + +struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void) { + return (struct rrdeng_buffer_sizes) { + .pgc = pgc_aral_overhead() + pgc_aral_structures(), + .mrg = mrg_aral_overhead() + mrg_aral_structures(), + .opcodes = aral_overhead(rrdeng_main.cmd_queue.ar) + aral_structures(rrdeng_main.cmd_queue.ar), + .handles = aral_overhead(rrdeng_main.handles.ar) + aral_structures(rrdeng_main.handles.ar), + .descriptors = aral_overhead(rrdeng_main.descriptors.ar) + aral_structures(rrdeng_main.descriptors.ar), + .wal = __atomic_load_n(&wal_globals.atomics.allocated, __ATOMIC_RELAXED) * (sizeof(WAL) + RRDENG_BLOCK_SIZE), + .workers = aral_overhead(rrdeng_main.work_cmd.ar), + .pdc = pdc_cache_size(), + .xt_io = aral_overhead(rrdeng_main.xt_io_descr.ar) + aral_structures(rrdeng_main.xt_io_descr.ar), + .xt_buf = extent_buffer_cache_size(), + .epdl = epdl_cache_size(), + .deol = deol_cache_size(), + .pd = pd_cache_size(), + +#ifdef PDC_USE_JULYL + .julyl = julyl_cache_size(), +#endif + }; +} + +static void after_cleanup(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { + rrdeng_main.cleanup_running--; +} + +static void *cleanup_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { + worker_is_busy(UV_EVENT_DBENGINE_BUFFERS_CLEANUP); + + wal_cleanup1(); + extent_buffer_cleanup1(); + { - char buf[4096]; - debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf))); + static time_t last_run_s = 0; + time_t now_s = now_monotonic_sec(); + if(now_s - last_run_s >= 10) { + last_run_s = now_s; + journalfile_v2_data_unmount_cleanup(now_s); + } } + +#ifdef PDC_USE_JULYL + julyl_cleanup1(); #endif + return data; +} + +void timer_cb(uv_timer_t* handle) { + worker_is_busy(RRDENG_TIMER_CB); + uv_stop(handle->loop); + uv_update_time(handle->loop); + + worker_set_metric(RRDENG_OPCODES_WAITING, (NETDATA_DOUBLE)rrdeng_main.cmd_queue.unsafe.waiting); + worker_set_metric(RRDENG_WORKS_DISPATCHED, (NETDATA_DOUBLE)__atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED)); + worker_set_metric(RRDENG_WORKS_EXECUTING, (NETDATA_DOUBLE)__atomic_load_n(&rrdeng_main.work_cmd.atomics.executing, __ATOMIC_RELAXED)); + + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_FLUSH_INIT, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_EVICT_INIT, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_CLEANUP, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + worker_is_idle(); } -#define MAX_CMD_BATCH_SIZE (256) +static void dbengine_initialize_structures(void) { + pgc_and_mrg_initialize(); + + pdc_init(); + page_details_init(); + epdl_init(); + deol_init(); + rrdeng_cmd_queue_init(); + work_request_init(); + rrdeng_query_handle_init(); + page_descriptors_init(); + extent_buffer_init(); + dbengine_page_alloc_init(); + extent_io_descriptor_init(); +} -void rrdeng_worker(void* arg) -{ - worker_register("DBENGINE"); - worker_register_job_name(RRDENG_NOOP, "noop"); - worker_register_job_name(RRDENG_READ_PAGE, "page read"); - worker_register_job_name(RRDENG_READ_EXTENT, "extent read"); - worker_register_job_name(RRDENG_COMMIT_PAGE, "commit"); - worker_register_job_name(RRDENG_FLUSH_PAGES, "flush"); - worker_register_job_name(RRDENG_SHUTDOWN, "shutdown"); - worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru"); - worker_register_job_name(RRDENG_QUIESCE, "quiesce"); - worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup"); - worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer"); - - struct rrdengine_worker_config* wc = arg; - struct rrdengine_instance *ctx = wc->ctx; - uv_loop_t* loop; - int shutdown, ret; - enum rrdeng_opcode opcode; - uv_timer_t timer_req; - struct rrdeng_cmd cmd; - unsigned cmd_batch_size; +bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { + static bool spawned = false; + static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; - rrdeng_init_cmd_queue(wc); + netdata_spinlock_lock(&spinlock); - loop = wc->loop = mallocz(sizeof(uv_loop_t)); - ret = uv_loop_init(loop); - if (ret) { - error("uv_loop_init(): %s", uv_strerror(ret)); - goto error_after_loop_init; - } - loop->data = wc; + if(!spawned) { + int ret; - ret = uv_async_init(wc->loop, &wc->async, async_cb); - if (ret) { - error("uv_async_init(): %s", uv_strerror(ret)); - goto error_after_async_init; - } - wc->async.data = wc; + ret = uv_loop_init(&rrdeng_main.loop); + if (ret) { + error("DBENGINE: uv_loop_init(): %s", uv_strerror(ret)); + return false; + } + rrdeng_main.loop.data = &rrdeng_main; - wc->now_deleting_files = NULL; - wc->cleanup_thread_deleting_files = 0; + ret = uv_async_init(&rrdeng_main.loop, &rrdeng_main.async, async_cb); + if (ret) { + error("DBENGINE: uv_async_init(): %s", uv_strerror(ret)); + fatal_assert(0 == uv_loop_close(&rrdeng_main.loop)); + return false; + } + rrdeng_main.async.data = &rrdeng_main; + + ret = uv_timer_init(&rrdeng_main.loop, &rrdeng_main.timer); + if (ret) { + error("DBENGINE: uv_timer_init(): %s", uv_strerror(ret)); + uv_close((uv_handle_t *)&rrdeng_main.async, NULL); + fatal_assert(0 == uv_loop_close(&rrdeng_main.loop)); + return false; + } + rrdeng_main.timer.data = &rrdeng_main; - wc->now_invalidating_dirty_pages = NULL; - wc->cleanup_thread_invalidating_dirty_pages = 0; - wc->inflight_dirty_pages = 0; + dbengine_initialize_structures(); - /* dirty page flushing timer */ - ret = uv_timer_init(loop, &timer_req); - if (ret) { - error("uv_timer_init(): %s", uv_strerror(ret)); - goto error_after_timer_init; + fatal_assert(0 == uv_thread_create(&rrdeng_main.thread, dbengine_event_loop, &rrdeng_main)); + spawned = true; } - timer_req.data = wc; - wc->error = 0; - /* wake up initialization thread */ - completion_mark_complete(&ctx->rrdengine_completion); + netdata_spinlock_unlock(&spinlock); + return true; +} + +void dbengine_event_loop(void* arg) { + sanity_check(); + uv_thread_set_name_np(pthread_self(), "DBENGINE"); + service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); + + worker_register("DBENGINE"); + + // opcode jobs + worker_register_job_name(RRDENG_OPCODE_NOOP, "noop"); + + worker_register_job_name(RRDENG_OPCODE_QUERY, "query"); + worker_register_job_name(RRDENG_OPCODE_EXTENT_WRITE, "extent write"); + worker_register_job_name(RRDENG_OPCODE_EXTENT_READ, "extent read"); + worker_register_job_name(RRDENG_OPCODE_FLUSHED_TO_OPEN, "flushed to open"); + worker_register_job_name(RRDENG_OPCODE_DATABASE_ROTATE, "db rotate"); + worker_register_job_name(RRDENG_OPCODE_JOURNAL_INDEX, "journal index"); + worker_register_job_name(RRDENG_OPCODE_FLUSH_INIT, "flush init"); + worker_register_job_name(RRDENG_OPCODE_EVICT_INIT, "evict init"); + worker_register_job_name(RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown"); + worker_register_job_name(RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce"); + + worker_register_job_name(RRDENG_OPCODE_MAX, "get opcode"); + + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_QUERY, "query cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_WRITE, "extent write cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EXTENT_READ, "extent read cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_FLUSHED_TO_OPEN, "flushed to open cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_DATABASE_ROTATE, "db rotate cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_JOURNAL_INDEX, "journal index cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_FLUSH_INIT, "flush init cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_EVICT_INIT, "evict init cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown cb"); + worker_register_job_name(RRDENG_OPCODE_MAX + RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce cb"); + + // special jobs + worker_register_job_name(RRDENG_TIMER_CB, "timer"); + worker_register_job_name(RRDENG_FLUSH_TRANSACTION_BUFFER_CB, "transaction buffer flush cb"); + + worker_register_job_custom_metric(RRDENG_OPCODES_WAITING, "opcodes waiting", "opcodes", WORKER_METRIC_ABSOLUTE); + worker_register_job_custom_metric(RRDENG_WORKS_DISPATCHED, "works dispatched", "works", WORKER_METRIC_ABSOLUTE); + worker_register_job_custom_metric(RRDENG_WORKS_EXECUTING, "works executing", "works", WORKER_METRIC_ABSOLUTE); + + struct rrdeng_main *main = arg; + enum rrdeng_opcode opcode; + struct rrdeng_cmd cmd; + main->tid = gettid(); - fatal_assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); - shutdown = 0; - int set_name = 0; - while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) { + fatal_assert(0 == uv_timer_start(&main->timer, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); + + bool shutdown = false; + while (likely(!shutdown)) { worker_is_idle(); - uv_run(loop, UV_RUN_DEFAULT); - worker_is_busy(RRDENG_MAX_OPCODE); - rrdeng_cleanup_finished_threads(wc); + uv_run(&main->loop, UV_RUN_DEFAULT); /* wait for commands */ - cmd_batch_size = 0; do { - /* - * Avoid starving the loop when there are too many commands coming in. - * timer_cb will interrupt the loop again to allow serving more commands. - */ - if (unlikely(cmd_batch_size >= MAX_CMD_BATCH_SIZE)) - break; - - cmd = rrdeng_deq_cmd(wc); + worker_is_busy(RRDENG_OPCODE_MAX); + cmd = rrdeng_deq_cmd(); opcode = cmd.opcode; - ++cmd_batch_size; - if(likely(opcode != RRDENG_NOOP)) - worker_is_busy(opcode); + worker_is_busy(opcode); switch (opcode) { - case RRDENG_NOOP: - /* the command queue was empty, do nothing */ - break; - case RRDENG_SHUTDOWN: - shutdown = 1; - break; - case RRDENG_QUIESCE: - ctx->drop_metrics_under_page_cache_pressure = 0; - ctx->quiesce = SET_QUIESCE; - fatal_assert(0 == uv_timer_stop(&timer_req)); - uv_close((uv_handle_t *)&timer_req, NULL); - while (do_flush_pages(wc, 1, NULL)) { - ; /* Force flushing of all committed pages. */ + case RRDENG_OPCODE_EXTENT_READ: { + struct rrdengine_instance *ctx = cmd.ctx; + EPDL *epdl = cmd.data; + work_dispatch(ctx, epdl, NULL, opcode, extent_read_tp_worker, after_extent_read); + break; + } + + case RRDENG_OPCODE_QUERY: { + struct rrdengine_instance *ctx = cmd.ctx; + PDC *pdc = cmd.data; + work_dispatch(ctx, pdc, NULL, opcode, query_prep_tp_worker, after_prep_query); + break; } - wal_flush_transaction_buffer(wc); - if (!rrdeng_threads_alive(wc)) { - ctx->quiesce = QUIESCED; - completion_mark_complete(&ctx->rrdengine_completion); + + case RRDENG_OPCODE_EXTENT_WRITE: { + struct rrdengine_instance *ctx = cmd.ctx; + struct page_descr_with_data *base = cmd.data; + struct completion *completion = cmd.completion; // optional + work_dispatch(ctx, base, completion, opcode, extent_write_tp_worker, after_extent_write); + break; } - break; - case RRDENG_READ_PAGE: - do_read_extent(wc, &cmd.read_page.page_cache_descr, 1, 0); - break; - case RRDENG_READ_EXTENT: - do_read_extent(wc, cmd.read_extent.page_cache_descr, cmd.read_extent.page_count, 1); - if (unlikely(!set_name)) { - set_name = 1; - uv_thread_set_name_np(ctx->worker_config.thread, "DBENGINE"); + + case RRDENG_OPCODE_FLUSHED_TO_OPEN: { + struct rrdengine_instance *ctx = cmd.ctx; + uv_fs_t *uv_fs_request = cmd.data; + struct extent_io_descriptor *xt_io_descr = uv_fs_request->data; + struct completion *completion = xt_io_descr->completion; + work_dispatch(ctx, uv_fs_request, completion, opcode, extent_flushed_to_open_tp_worker, after_extent_flushed_to_open); + break; } - break; - case RRDENG_COMMIT_PAGE: - do_commit_transaction(wc, STORE_DATA, NULL); - break; - case RRDENG_FLUSH_PAGES: { - if (wc->now_invalidating_dirty_pages) { - /* Do not flush if the disk cannot keep up */ - completion_mark_complete(cmd.completion); - } else { - (void)do_flush_pages(wc, 1, cmd.completion); + + case RRDENG_OPCODE_FLUSH_INIT: { + if(rrdeng_main.flushes_running < (size_t)(libuv_worker_threads / 4)) { + rrdeng_main.flushes_running++; + work_dispatch(NULL, NULL, NULL, opcode, cache_flush_tp_worker, after_do_cache_flush); + } + break; + } + + case RRDENG_OPCODE_EVICT_INIT: { + if(!rrdeng_main.evictions_running) { + rrdeng_main.evictions_running++; + work_dispatch(NULL, NULL, NULL, opcode, cache_evict_tp_worker, after_do_cache_evict); + } + break; + } + + case RRDENG_OPCODE_CLEANUP: { + if(!rrdeng_main.cleanup_running) { + rrdeng_main.cleanup_running++; + work_dispatch(NULL, NULL, NULL, opcode, cleanup_tp_worker, after_cleanup); + } + break; + } + + case RRDENG_OPCODE_JOURNAL_INDEX: { + struct rrdengine_instance *ctx = cmd.ctx; + struct rrdengine_datafile *datafile = cmd.data; + if(!__atomic_load_n(&ctx->atomic.migration_to_v2_running, __ATOMIC_RELAXED)) { + + __atomic_store_n(&ctx->atomic.migration_to_v2_running, true, __ATOMIC_RELAXED); + work_dispatch(ctx, datafile, NULL, opcode, journal_v2_indexing_tp_worker, after_journal_v2_indexing); + } + break; + } + + case RRDENG_OPCODE_DATABASE_ROTATE: { + struct rrdengine_instance *ctx = cmd.ctx; + if (!__atomic_load_n(&ctx->atomic.now_deleting_files, __ATOMIC_RELAXED) && + ctx->datafiles.first->next != NULL && + ctx->datafiles.first->next->next != NULL && + rrdeng_ctx_exceeded_disk_quota(ctx)) { + + __atomic_store_n(&ctx->atomic.now_deleting_files, true, __ATOMIC_RELAXED); + work_dispatch(ctx, NULL, NULL, opcode, database_rotate_tp_worker, after_database_rotate); + } + break; + } + + case RRDENG_OPCODE_CTX_POPULATE_MRG: { + struct rrdengine_instance *ctx = cmd.ctx; + struct completion *completion = cmd.completion; + work_dispatch(ctx, NULL, completion, opcode, populate_mrg_tp_worker, after_populate_mrg); + break; + } + + case RRDENG_OPCODE_CTX_QUIESCE: { + // a ctx will shutdown shortly + struct rrdengine_instance *ctx = cmd.ctx; + __atomic_store_n(&ctx->quiesce.enabled, true, __ATOMIC_RELEASE); + work_dispatch(ctx, NULL, NULL, opcode, + flush_all_hot_and_dirty_pages_of_section_tp_worker, + after_flush_all_hot_and_dirty_pages_of_section); + break; + } + + case RRDENG_OPCODE_CTX_SHUTDOWN: { + // a ctx is shutting down + struct rrdengine_instance *ctx = cmd.ctx; + struct completion *completion = cmd.completion; + work_dispatch(ctx, NULL, completion, opcode, ctx_shutdown_tp_worker, after_ctx_shutdown); + break; + } + + case RRDENG_OPCODE_NOOP: { + /* the command queue was empty, do nothing */ + break; + } + + // not opcodes + case RRDENG_OPCODE_MAX: + default: { + internal_fatal(true, "DBENGINE: unknown opcode"); + break; } - break; - case RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE: - rrdeng_invalidate_oldest_committed(wc); - break; - } - default: - debug(D_RRDENGINE, "%s: default.", __func__); - break; } - } while (opcode != RRDENG_NOOP); + + } while (opcode != RRDENG_OPCODE_NOOP); } /* cleanup operations of the event loop */ - info("Shutting down RRD engine event loop for tier %d", ctx->tier); + info("DBENGINE: shutting down dbengine thread"); /* * uv_async_send after uv_close does not seem to crash in linux at the moment, * it is however undocumented behaviour and we need to be aware if this becomes * an issue in the future. */ - uv_close((uv_handle_t *)&wc->async, NULL); - - while (do_flush_pages(wc, 1, NULL)) { - ; /* Force flushing of all committed pages. */ - } - wal_flush_transaction_buffer(wc); - uv_run(loop, UV_RUN_DEFAULT); - - info("Shutting down RRD engine event loop for tier %d complete", ctx->tier); - /* TODO: don't let the API block by waiting to enqueue commands */ - uv_cond_destroy(&wc->cmd_cond); -/* uv_mutex_destroy(&wc->cmd_mutex); */ - fatal_assert(0 == uv_loop_close(loop)); - freez(loop); - + uv_close((uv_handle_t *)&main->async, NULL); + uv_timer_stop(&main->timer); + uv_close((uv_handle_t *)&main->timer, NULL); + uv_run(&main->loop, UV_RUN_DEFAULT); + uv_loop_close(&main->loop); worker_unregister(); - return; - -error_after_timer_init: - uv_close((uv_handle_t *)&wc->async, NULL); -error_after_async_init: - fatal_assert(0 == uv_loop_close(loop)); -error_after_loop_init: - freez(loop); - - wc->error = UV_EAGAIN; - /* wake up initialization thread */ - completion_mark_complete(&ctx->rrdengine_completion); - worker_unregister(); -} - -/* C entry point for development purposes - * make "LDFLAGS=-errdengine_main" - */ -void rrdengine_main(void) -{ - int ret; - struct rrdengine_instance *ctx; - - sanity_check(); - ret = rrdeng_init(NULL, &ctx, "/tmp", RRDENG_MIN_PAGE_CACHE_SIZE_MB, RRDENG_MIN_DISK_SPACE_MB, 0); - if (ret) { - exit(ret); - } - rrdeng_exit(ctx); - fprintf(stderr, "Hello world!"); - exit(0); } diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 521d2521a..492666815 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -19,202 +19,315 @@ #include "journalfile.h" #include "rrdengineapi.h" #include "pagecache.h" -#include "rrdenglocking.h" - -#ifdef NETDATA_RRD_INTERNALS - -#endif /* NETDATA_RRD_INTERNALS */ +#include "metric.h" +#include "cache.h" +#include "pdc.h" extern unsigned rrdeng_pages_per_extent; /* Forward declarations */ struct rrdengine_instance; +struct rrdeng_cmd; #define MAX_PAGES_PER_EXTENT (64) /* TODO: can go higher only when journal supports bigger than 4KiB transactions */ #define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u" #define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u" +typedef struct page_details_control { + struct rrdengine_instance *ctx; + struct metric *metric; + + struct completion prep_completion; + struct completion page_completion; // sync between the query thread and the workers + + Pvoid_t page_list_JudyL; // the list of page details + unsigned completed_jobs; // the number of jobs completed last time the query thread checked + bool workers_should_stop; // true when the query thread left and the workers should stop + bool prep_done; + + SPINLOCK refcount_spinlock; // spinlock to protect refcount + int32_t refcount; // the number of workers currently working on this request + 1 for the query thread + size_t executed_with_gaps; + + time_t start_time_s; + time_t end_time_s; + STORAGE_PRIORITY priority; + + time_t optimal_end_time_s; +} PDC; + +PDC *pdc_get(void); + +typedef enum __attribute__ ((__packed__)) { + // final status for all pages + // if a page does not have one of these, it is considered unroutable + PDC_PAGE_READY = (1 << 0), // ready to be processed (pd->page is not null) + PDC_PAGE_FAILED = (1 << 1), // failed to be loaded (pd->page is null) + PDC_PAGE_SKIP = (1 << 2), // don't use this page, it is not good for us + PDC_PAGE_INVALID = (1 << 3), // don't use this page, it is invalid + PDC_PAGE_EMPTY = (1 << 4), // the page is empty, does not have any data + + // other statuses for tracking issues + PDC_PAGE_PREPROCESSED = (1 << 5), // used during preprocessing + PDC_PAGE_PROCESSED = (1 << 6), // processed by the query caller + PDC_PAGE_RELEASED = (1 << 7), // already released + + // data found in cache (preloaded) or on disk? + PDC_PAGE_PRELOADED = (1 << 8), // data found in memory + PDC_PAGE_DISK_PENDING = (1 << 9), // data need to be loaded from disk + + // worker related statuses + PDC_PAGE_FAILED_INVALID_EXTENT = (1 << 10), + PDC_PAGE_FAILED_NOT_IN_EXTENT = (1 << 11), + PDC_PAGE_FAILED_TO_MAP_EXTENT = (1 << 12), + PDC_PAGE_FAILED_TO_ACQUIRE_DATAFILE= (1 << 13), + + PDC_PAGE_EXTENT_FROM_CACHE = (1 << 14), + PDC_PAGE_EXTENT_FROM_DISK = (1 << 15), + + PDC_PAGE_CANCELLED = (1 << 16), // the query thread had left when we try to load the page + + PDC_PAGE_SOURCE_MAIN_CACHE = (1 << 17), + PDC_PAGE_SOURCE_OPEN_CACHE = (1 << 18), + PDC_PAGE_SOURCE_JOURNAL_V2 = (1 << 19), + PDC_PAGE_PRELOADED_PASS4 = (1 << 20), + + // datafile acquired + PDC_PAGE_DATAFILE_ACQUIRED = (1 << 30), +} PDC_PAGE_STATUS; + +#define PDC_PAGE_QUERY_GLOBAL_SKIP_LIST (PDC_PAGE_FAILED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_RELEASED) + +struct page_details { + struct { + struct rrdengine_datafile *ptr; + uv_file file; + unsigned fileno; + + struct { + uint64_t pos; + uint32_t bytes; + } extent; + } datafile; + + struct pgc_page *page; + Word_t metric_id; + time_t first_time_s; + time_t last_time_s; + uint32_t update_every_s; + uint16_t page_length; + PDC_PAGE_STATUS status; + + struct { + struct page_details *prev; + struct page_details *next; + } load; +}; + +struct page_details *page_details_get(void); + +#define pdc_page_status_check(pd, flag) (__atomic_load_n(&((pd)->status), __ATOMIC_ACQUIRE) & (flag)) +#define pdc_page_status_set(pd, flag) __atomic_or_fetch(&((pd)->status), flag, __ATOMIC_RELEASE) +#define pdc_page_status_clear(pd, flag) __atomic_and_fetch(&((od)->status), ~(flag), __ATOMIC_RELEASE) + +struct jv2_extents_info { + size_t index; + uint64_t pos; + unsigned bytes; + size_t number_of_pages; +}; + +struct jv2_metrics_info { + uuid_t *uuid; + uint32_t page_list_header; + time_t first_time_s; + time_t last_time_s; + size_t number_of_pages; + Pvoid_t JudyL_pages_by_start_time; +}; + +struct jv2_page_info { + time_t start_time_s; + time_t end_time_s; + time_t update_every_s; + size_t page_length; + uint32_t extent_index; + void *custom_data; + + // private + struct pgc_page *page; +}; + +typedef enum __attribute__ ((__packed__)) { + RRDENG_CHO_UNALIGNED = (1 << 0), // set when this metric is not page aligned according to page alignment + RRDENG_FIRST_PAGE_ALLOCATED = (1 << 1), // set when this metric has allocated its first page + RRDENG_1ST_METRIC_WRITER = (1 << 2), +} RRDENG_COLLECT_HANDLE_OPTIONS; + +typedef enum __attribute__ ((__packed__)) { + RRDENG_PAGE_PAST_COLLECTION = (1 << 0), + RRDENG_PAGE_REPEATED_COLLECTION = (1 << 1), + RRDENG_PAGE_BIG_GAP = (1 << 2), + RRDENG_PAGE_GAP = (1 << 3), + RRDENG_PAGE_FUTURE_POINT = (1 << 4), + RRDENG_PAGE_CREATED_IN_FUTURE = (1 << 5), + RRDENG_PAGE_COMPLETED_IN_FUTURE = (1 << 6), + RRDENG_PAGE_UNALIGNED = (1 << 7), + RRDENG_PAGE_CONFLICT = (1 << 8), + RRDENG_PAGE_FULL = (1 << 9), + RRDENG_PAGE_COLLECT_FINALIZE = (1 << 10), + RRDENG_PAGE_UPDATE_EVERY_CHANGE = (1 << 11), + RRDENG_PAGE_STEP_TOO_SMALL = (1 << 12), + RRDENG_PAGE_STEP_UNALIGNED = (1 << 13), +} RRDENG_COLLECT_PAGE_FLAGS; + struct rrdeng_collect_handle { - struct pg_cache_page_index *page_index; - struct rrdeng_page_descr *descr; - unsigned long page_correlation_id; - // set to 1 when this dimension is not page aligned with the other dimensions in the chart - uint8_t unaligned_page; + struct metric *metric; + struct pgc_page *page; struct pg_alignment *alignment; + RRDENG_COLLECT_HANDLE_OPTIONS options; + uint8_t type; + RRDENG_COLLECT_PAGE_FLAGS page_flags; + uint32_t page_entries_max; + uint32_t page_position; // keep track of the current page size, to make sure we don't exceed it + usec_t page_start_time_ut; + usec_t page_end_time_ut; + usec_t update_every_ut; }; struct rrdeng_query_handle { - struct rrdeng_page_descr *descr; + struct metric *metric; + struct pgc_page *page; struct rrdengine_instance *ctx; - struct pg_cache_page_index *page_index; - time_t wanted_start_time_s; + storage_number *metric_data; + struct page_details_control *pdc; + + // the request + time_t start_time_s; + time_t end_time_s; + STORAGE_PRIORITY priority; + + // internal data time_t now_s; + time_t dt_s; + unsigned position; unsigned entries; - storage_number *page; - usec_t page_end_time_ut; - uint32_t page_length; - time_t dt_s; + +#ifdef NETDATA_INTERNAL_CHECKS + usec_t started_time_s; + pid_t query_pid; + struct rrdeng_query_handle *prev, *next; +#endif }; -typedef enum { - RRDENGINE_STATUS_UNINITIALIZED = 0, - RRDENGINE_STATUS_INITIALIZING, - RRDENGINE_STATUS_INITIALIZED -} rrdengine_state_t; +struct rrdeng_query_handle *rrdeng_query_handle_get(void); +void rrdeng_query_handle_release(struct rrdeng_query_handle *handle); enum rrdeng_opcode { /* can be used to return empty status or flush the command queue */ - RRDENG_NOOP = 0, - - RRDENG_READ_PAGE, - RRDENG_READ_EXTENT, - RRDENG_COMMIT_PAGE, - RRDENG_FLUSH_PAGES, - RRDENG_SHUTDOWN, - RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, - RRDENG_QUIESCE, - - RRDENG_MAX_OPCODE -}; - -struct rrdeng_read_page { - struct rrdeng_page_descr *page_cache_descr; + RRDENG_OPCODE_NOOP = 0, + + RRDENG_OPCODE_QUERY, + RRDENG_OPCODE_EXTENT_WRITE, + RRDENG_OPCODE_EXTENT_READ, + RRDENG_OPCODE_FLUSHED_TO_OPEN, + RRDENG_OPCODE_DATABASE_ROTATE, + RRDENG_OPCODE_JOURNAL_INDEX, + RRDENG_OPCODE_FLUSH_INIT, + RRDENG_OPCODE_EVICT_INIT, + RRDENG_OPCODE_CTX_SHUTDOWN, + RRDENG_OPCODE_CTX_QUIESCE, + RRDENG_OPCODE_CTX_POPULATE_MRG, + RRDENG_OPCODE_CLEANUP, + + RRDENG_OPCODE_MAX }; -struct rrdeng_read_extent { - struct rrdeng_page_descr *page_cache_descr[MAX_PAGES_PER_EXTENT]; - int page_count; -}; - -struct rrdeng_cmd { - enum rrdeng_opcode opcode; - union { - struct rrdeng_read_page read_page; - struct rrdeng_read_extent read_extent; - struct completion *completion; - }; -}; - -#define RRDENG_CMD_Q_MAX_SIZE (2048) - -struct rrdeng_cmdqueue { - unsigned head, tail; - struct rrdeng_cmd cmd_array[RRDENG_CMD_Q_MAX_SIZE]; +// WORKERS IDS: +// RRDENG_MAX_OPCODE : reserved for the cleanup +// RRDENG_MAX_OPCODE + opcode : reserved for the callbacks of each opcode +// RRDENG_MAX_OPCODE + RRDENG_MAX_OPCODE : reserved for the timer +#define RRDENG_TIMER_CB (RRDENG_OPCODE_MAX + RRDENG_OPCODE_MAX) +#define RRDENG_FLUSH_TRANSACTION_BUFFER_CB (RRDENG_TIMER_CB + 1) +#define RRDENG_OPCODES_WAITING (RRDENG_TIMER_CB + 2) +#define RRDENG_WORKS_DISPATCHED (RRDENG_TIMER_CB + 3) +#define RRDENG_WORKS_EXECUTING (RRDENG_TIMER_CB + 4) + +struct extent_io_data { + unsigned fileno; + uv_file file; + uint64_t pos; + unsigned bytes; + uint16_t page_length; }; struct extent_io_descriptor { - uv_fs_t req; - uv_work_t req_worker; + struct rrdengine_instance *ctx; + uv_fs_t uv_fs_request; uv_buf_t iov; uv_file file; void *buf; - void *map_base; - size_t map_length; + struct wal *wal; uint64_t pos; unsigned bytes; struct completion *completion; unsigned descr_count; - int release_descr; - struct rrdeng_page_descr *descr_array[MAX_PAGES_PER_EXTENT]; - struct rrdeng_page_descr descr_read_array[MAX_PAGES_PER_EXTENT]; - Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT]; + struct page_descr_with_data *descr_array[MAX_PAGES_PER_EXTENT]; + struct rrdengine_datafile *datafile; struct extent_io_descriptor *next; /* multiple requests to be served by the same cached extent */ }; struct generic_io_descriptor { + struct rrdengine_instance *ctx; uv_fs_t req; uv_buf_t iov; void *buf; + void *data; uint64_t pos; unsigned bytes; struct completion *completion; }; -struct extent_cache_element { - struct extent_info *extent; /* The ABA problem is avoided with the help of fileno below */ - unsigned fileno; - struct extent_cache_element *prev; /* LRU */ - struct extent_cache_element *next; /* LRU */ - struct extent_io_descriptor *inflight_io_descr; /* I/O descriptor for in-flight extent */ - uint8_t pages[MAX_PAGES_PER_EXTENT * RRDENG_BLOCK_SIZE]; -}; - -#define MAX_CACHED_EXTENTS 16 /* cannot be over 32 to fit in 32-bit architectures */ - -/* Initialize by setting the structure to zero */ -struct extent_cache { - struct extent_cache_element extent_array[MAX_CACHED_EXTENTS]; - unsigned allocation_bitmap; /* 1 if the corresponding position in the extent_array is allocated */ - unsigned inflight_bitmap; /* 1 if the corresponding position in the extent_array is waiting for I/O */ - - struct extent_cache_element *replaceQ_head; /* LRU */ - struct extent_cache_element *replaceQ_tail; /* MRU */ -}; - -struct rrdengine_worker_config { - struct rrdengine_instance *ctx; - - uv_thread_t thread; - uv_loop_t* loop; - uv_async_t async; - - /* file deletion thread */ - uv_thread_t *now_deleting_files; - unsigned long cleanup_thread_deleting_files; /* set to 0 when now_deleting_files is still running */ - - /* dirty page deletion thread */ - uv_thread_t *now_invalidating_dirty_pages; - /* set to 0 when now_invalidating_dirty_pages is still running */ - unsigned long cleanup_thread_invalidating_dirty_pages; - unsigned inflight_dirty_pages; - - /* FIFO command queue */ - uv_mutex_t cmd_mutex; - uv_cond_t cmd_cond; - volatile unsigned queue_size; - struct rrdeng_cmdqueue cmd_queue; +typedef struct wal { + uint64_t transaction_id; + void *buf; + size_t size; + size_t buf_size; + struct generic_io_descriptor io_descr; - struct extent_cache xt_cache; + struct { + struct wal *prev; + struct wal *next; + } cache; +} WAL; - int error; -}; +WAL *wal_get(struct rrdengine_instance *ctx, unsigned size); +void wal_release(WAL *wal); /* * Debug statistics not used by code logic. * They only describe operations since DB engine instance load time. */ struct rrdengine_statistics { - rrdeng_stats_t metric_API_producers; - rrdeng_stats_t metric_API_consumers; - rrdeng_stats_t pg_cache_insertions; - rrdeng_stats_t pg_cache_deletions; - rrdeng_stats_t pg_cache_hits; - rrdeng_stats_t pg_cache_misses; - rrdeng_stats_t pg_cache_backfills; - rrdeng_stats_t pg_cache_evictions; rrdeng_stats_t before_decompress_bytes; rrdeng_stats_t after_decompress_bytes; rrdeng_stats_t before_compress_bytes; rrdeng_stats_t after_compress_bytes; + rrdeng_stats_t io_write_bytes; rrdeng_stats_t io_write_requests; rrdeng_stats_t io_read_bytes; rrdeng_stats_t io_read_requests; - rrdeng_stats_t io_write_extent_bytes; - rrdeng_stats_t io_write_extents; - rrdeng_stats_t io_read_extent_bytes; - rrdeng_stats_t io_read_extents; + rrdeng_stats_t datafile_creations; rrdeng_stats_t datafile_deletions; rrdeng_stats_t journalfile_creations; rrdeng_stats_t journalfile_deletions; - rrdeng_stats_t page_cache_descriptors; + rrdeng_stats_t io_errors; rrdeng_stats_t fs_errors; - rrdeng_stats_t pg_cache_over_half_dirty_events; - rrdeng_stats_t flushing_pressure_page_deletions; }; /* I/O errors global counter */ @@ -227,57 +340,179 @@ extern rrdeng_stats_t rrdeng_reserved_file_descriptors; extern rrdeng_stats_t global_pg_cache_over_half_dirty_events; extern rrdeng_stats_t global_flushing_pressure_page_deletions; /* number of deleted pages */ -#define NO_QUIESCE (0) /* initial state when all operations function normally */ -#define SET_QUIESCE (1) /* set it before shutting down the instance, quiesce long running operations */ -#define QUIESCED (2) /* is set after all threads have finished running */ +struct rrdengine_instance { + struct { + bool legacy; // true when the db is autonomous for a single host -typedef enum { - LOAD_ERRORS_PAGE_FLIPPED_TIME = 0, - LOAD_ERRORS_PAGE_EQUAL_TIME = 1, - LOAD_ERRORS_PAGE_ZERO_ENTRIES = 2, - LOAD_ERRORS_PAGE_UPDATE_ZERO = 3, - LOAD_ERRORS_PAGE_FLEXY_TIME = 4, - LOAD_ERRORS_DROPPED_EXTENT = 5, -} INVALID_PAGE_ID; + int tier; // the tier of this ctx + uint8_t page_type; // default page type for this context -struct rrdengine_instance { - struct rrdengine_worker_config worker_config; - struct completion rrdengine_completion; - struct page_cache pg_cache; - uint8_t drop_metrics_under_page_cache_pressure; /* boolean */ - uint8_t global_compress_alg; - struct transaction_commit_log commit_log; - struct rrdengine_datafile_list datafiles; - RRDHOST *host; /* the legacy host, or NULL for multi-host DB */ - char dbfiles_path[FILENAME_MAX + 1]; - char machine_guid[GUID_LEN + 1]; /* the unique ID of the corresponding host, or localhost for multihost DB */ - uint64_t disk_space; - uint64_t max_disk_space; - int tier; - unsigned last_fileno; /* newest index of datafile and journalfile */ - unsigned long max_cache_pages; - unsigned long cache_pages_low_watermark; - unsigned long metric_API_max_producers; - - uint8_t quiesce; /* set to SET_QUIESCE before shutdown of the engine */ - uint8_t page_type; /* Default page type for this context */ + uint64_t max_disk_space; // the max disk space this ctx is allowed to use + uint8_t global_compress_alg; // the wanted compression algorithm - struct rrdengine_statistics stats; + char dbfiles_path[FILENAME_MAX + 1]; + } config; struct { - size_t counter; - usec_t latest_end_time_ut; - } load_errors[6]; + uv_rwlock_t rwlock; // the linked list of datafiles is protected by this lock + struct rrdengine_datafile *first; // oldest - the newest with ->first->prev + } datafiles; + + struct { + unsigned last_fileno; // newest index of datafile and journalfile + unsigned last_flush_fileno; // newest index of datafile received data + + size_t collectors_running; + size_t collectors_running_duplicate; + size_t inflight_queries; // the number of queries currently running + uint64_t current_disk_space; // the current disk space size used + + uint64_t transaction_id; // the transaction id of the next extent flushing + + bool migration_to_v2_running; + bool now_deleting_files; + unsigned extents_currently_being_flushed; // non-zero until we commit data to disk (both datafile and journal file) + } atomic; + + struct { + bool exit_mode; + bool enabled; // when set (before shutdown), queries are prohibited + struct completion completion; + } quiesce; + + struct { + struct { + size_t size; + struct completion *array; + } populate_mrg; + + bool create_new_datafile_pair; + } loading; + + struct rrdengine_statistics stats; }; -void *dbengine_page_alloc(void); -void dbengine_page_free(void *page); +#define ctx_current_disk_space_get(ctx) __atomic_load_n(&(ctx)->atomic.current_disk_space, __ATOMIC_RELAXED) +#define ctx_current_disk_space_increase(ctx, size) __atomic_add_fetch(&(ctx)->atomic.current_disk_space, size, __ATOMIC_RELAXED) +#define ctx_current_disk_space_decrease(ctx, size) __atomic_sub_fetch(&(ctx)->atomic.current_disk_space, size, __ATOMIC_RELAXED) + +static inline void ctx_io_read_op_bytes(struct rrdengine_instance *ctx, size_t bytes) { + __atomic_add_fetch(&ctx->stats.io_read_bytes, bytes, __ATOMIC_RELAXED); + __atomic_add_fetch(&ctx->stats.io_read_requests, 1, __ATOMIC_RELAXED); +} + +static inline void ctx_io_write_op_bytes(struct rrdengine_instance *ctx, size_t bytes) { + __atomic_add_fetch(&ctx->stats.io_write_bytes, bytes, __ATOMIC_RELAXED); + __atomic_add_fetch(&ctx->stats.io_write_requests, 1, __ATOMIC_RELAXED); +} +static inline void ctx_io_error(struct rrdengine_instance *ctx) { + __atomic_add_fetch(&ctx->stats.io_errors, 1, __ATOMIC_RELAXED); + rrd_stat_atomic_add(&global_io_errors, 1); +} + +static inline void ctx_fs_error(struct rrdengine_instance *ctx) { + __atomic_add_fetch(&ctx->stats.fs_errors, 1, __ATOMIC_RELAXED); + rrd_stat_atomic_add(&global_fs_errors, 1); +} + +#define ctx_last_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_fileno, __ATOMIC_RELAXED) +#define ctx_last_fileno_increment(ctx) __atomic_add_fetch(&(ctx)->atomic.last_fileno, 1, __ATOMIC_RELAXED) + +#define ctx_last_flush_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_flush_fileno, __ATOMIC_RELAXED) +static inline void ctx_last_flush_fileno_set(struct rrdengine_instance *ctx, unsigned fileno) { + unsigned old_fileno = ctx_last_flush_fileno_get(ctx); + + do { + if(old_fileno >= fileno) + return; + + } while(!__atomic_compare_exchange_n(&ctx->atomic.last_flush_fileno, &old_fileno, fileno, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); +} + +#define ctx_is_available_for_queries(ctx) (__atomic_load_n(&(ctx)->quiesce.enabled, __ATOMIC_RELAXED) == false && __atomic_load_n(&(ctx)->quiesce.exit_mode, __ATOMIC_RELAXED) == false) + +void *dbengine_page_alloc(size_t size); +void dbengine_page_free(void *page, size_t size); + +void *dbengine_extent_alloc(size_t size); +void dbengine_extent_free(void *extent, size_t size); + +bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx); int init_rrd_files(struct rrdengine_instance *ctx); void finalize_rrd_files(struct rrdengine_instance *ctx); -void rrdeng_test_quota(struct rrdengine_worker_config* wc); -void rrdeng_worker(void* arg); -void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd); -struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc); +bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx); +void dbengine_event_loop(void *arg); + +typedef void (*enqueue_callback_t)(struct rrdeng_cmd *cmd); +typedef void (*dequeue_callback_t)(struct rrdeng_cmd *cmd); + +void rrdeng_enqueue_epdl_cmd(struct rrdeng_cmd *cmd); +void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd); + +typedef struct rrdeng_cmd *(*requeue_callback_t)(void *data); +void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority); + +void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data, + struct completion *completion, enum storage_priority priority, + enqueue_callback_t enqueue_cb, dequeue_callback_t dequeue_cb); + +void pdc_route_asynchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc); +void pdc_route_synchronously(struct rrdengine_instance *ctx, struct page_details_control *pdc); + +void pdc_acquire(PDC *pdc); +bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router); + +unsigned rrdeng_target_data_file_size(struct rrdengine_instance *ctx); + +struct page_descr_with_data *page_descriptor_get(void); + +typedef struct validated_page_descriptor { + time_t start_time_s; + time_t end_time_s; + time_t update_every_s; + size_t page_length; + size_t point_size; + size_t entries; + uint8_t type; + bool is_valid; +} VALIDATED_PAGE_DESCRIPTOR; + +#define DBENGINE_EMPTY_PAGE (void *)(-1) + +#define page_entries_by_time(start_time_s, end_time_s, update_every_s) \ + ((update_every_s) ? (((end_time_s) - ((start_time_s) - (update_every_s))) / (update_every_s)) : 1) + +#define page_entries_by_size(page_length_in_bytes, point_size_in_bytes) \ + ((page_length_in_bytes) / (point_size_in_bytes)) + +VALIDATED_PAGE_DESCRIPTOR validate_page(uuid_t *uuid, + time_t start_time_s, + time_t end_time_s, + time_t update_every_s, + size_t page_length, + uint8_t page_type, + size_t entries, + time_t now_s, + time_t overwrite_zero_update_every_s, + bool have_read_error, + const char *msg, + RRDENG_COLLECT_PAGE_FLAGS flags); +VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error); +void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags); + +typedef enum { + PAGE_IS_IN_THE_PAST = -1, + PAGE_IS_IN_RANGE = 0, + PAGE_IS_IN_THE_FUTURE = 1, +} TIME_RANGE_COMPARE; + +TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t page_last_time_s, time_t wanted_start_time_s, time_t wanted_end_time_s); + +static inline time_t max_acceptable_collected_time(void) { + return now_realtime_sec() + 1; +} + +void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker); #endif /* NETDATA_RRDENGINE_H */ diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index 4525b041f..27497bbb8 100755 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -#include "../storage_engine.h" /* Default global database instance */ struct rrdengine_instance multidb_ctx_storage_tier0; @@ -8,12 +7,21 @@ struct rrdengine_instance multidb_ctx_storage_tier1; struct rrdengine_instance multidb_ctx_storage_tier2; struct rrdengine_instance multidb_ctx_storage_tier3; struct rrdengine_instance multidb_ctx_storage_tier4; + +#define mrg_metric_ctx(metric) (struct rrdengine_instance *)mrg_metric_section(main_mrg, metric) + #if RRD_STORAGE_TIERS != 5 #error RRD_STORAGE_TIERS is not 5 - you need to add allocations here #endif struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS]; uint8_t tier_page_type[RRD_STORAGE_TIERS] = {PAGE_METRICS, PAGE_TIER, PAGE_TIER, PAGE_TIER, PAGE_TIER}; +#if defined(ENV32BIT) +size_t tier_page_size[RRD_STORAGE_TIERS] = {2048, 1024, 192, 192, 192}; +#else +size_t tier_page_size[RRD_STORAGE_TIERS] = {4096, 2048, 384, 384, 384}; +#endif + #if PAGE_TYPE_MAX != 1 #error PAGE_TYPE_MAX is not 1 - you need to add allocations here #endif @@ -27,14 +35,17 @@ __attribute__((constructor)) void initialize_multidb_ctx(void) { multidb_ctx[4] = &multidb_ctx_storage_tier4; } -int db_engine_use_malloc = 0; int default_rrdeng_page_fetch_timeout = 3; int default_rrdeng_page_fetch_retries = 3; -int default_rrdeng_page_cache_mb = 32; +int db_engine_journal_check = 0; int default_rrdeng_disk_quota_mb = 256; int default_multidb_disk_quota_mb = 256; -/* Default behaviour is to unblock data collection if the page cache is full of dirty pages by dropping metrics */ -uint8_t rrdeng_drop_metrics_under_page_cache_pressure = 1; + +#if defined(ENV32BIT) +int default_rrdeng_page_cache_mb = 16; +#else +int default_rrdeng_page_cache_mb = 32; +#endif // ---------------------------------------------------------------------------- // metrics groups @@ -90,161 +101,207 @@ void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_ memcpy(ret_uuid, hash_value, sizeof(uuid_t)); } -/* Transform legacy UUID to be unique across hosts deterministically */ -void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uuid_t *legacy_uuid, uuid_t *ret_uuid) -{ - EVP_MD_CTX *evpctx; - unsigned char hash_value[EVP_MAX_MD_SIZE]; - unsigned int hash_len; - - evpctx = EVP_MD_CTX_create(); - EVP_DigestInit_ex(evpctx, EVP_sha256(), NULL); - EVP_DigestUpdate(evpctx, machine_guid, GUID_LEN); - EVP_DigestUpdate(evpctx, *legacy_uuid, sizeof(uuid_t)); - EVP_DigestFinal_ex(evpctx, hash_value, &hash_len); - EVP_MD_CTX_destroy(evpctx); - fatal_assert(hash_len > sizeof(uuid_t)); - memcpy(ret_uuid, hash_value, sizeof(uuid_t)); -} - -STORAGE_METRIC_HANDLE *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id) { +static METRIC *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; uuid_t legacy_uuid; rrdeng_generate_legacy_uuid(rd_id, st_id, &legacy_uuid); - return rrdeng_metric_get(db_instance, &legacy_uuid); + return mrg_metric_get_and_acquire(main_mrg, &legacy_uuid, (Word_t) ctx); } // ---------------------------------------------------------------------------- // metric handle void rrdeng_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - - __atomic_sub_fetch(&page_index->refcount, 1, __ATOMIC_SEQ_CST); + METRIC *metric = (METRIC *)db_metric_handle; + mrg_metric_release(main_mrg, metric); } STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - __atomic_add_fetch(&page_index->refcount, 1, __ATOMIC_SEQ_CST); - return db_metric_handle; + METRIC *metric = (METRIC *)db_metric_handle; + return (STORAGE_METRIC_HANDLE *) mrg_metric_dup(main_mrg, metric); } STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid) { struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; - struct page_cache *pg_cache = &ctx->pg_cache; - struct pg_cache_page_index *page_index = NULL; - - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - Pvoid_t *PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, uuid, sizeof(uuid_t)); - if (likely(NULL != PValue)) - page_index = *PValue; - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); - - if (likely(page_index)) - __atomic_add_fetch(&page_index->refcount, 1, __ATOMIC_SEQ_CST); - - return (STORAGE_METRIC_HANDLE *)page_index; + return (STORAGE_METRIC_HANDLE *) mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx); } -STORAGE_METRIC_HANDLE *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid) { +static METRIC *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid) { internal_fatal(!db_instance, "DBENGINE: db_instance is NULL"); struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; - struct pg_cache_page_index *page_index; - struct page_cache *pg_cache = &ctx->pg_cache; - - uv_rwlock_wrlock(&pg_cache->metrics_index.lock); - Pvoid_t *PValue = JudyHSIns(&pg_cache->metrics_index.JudyHS_array, uuid, sizeof(uuid_t), PJE0); - fatal_assert(NULL == *PValue); /* TODO: figure out concurrency model */ - *PValue = page_index = create_page_index(uuid, ctx); - page_index->prev = pg_cache->metrics_index.last_page_index; - pg_cache->metrics_index.last_page_index = page_index; - page_index->refcount = 1; - uv_rwlock_wrunlock(&pg_cache->metrics_index.lock); - - return (STORAGE_METRIC_HANDLE *)page_index; + MRG_ENTRY entry = { + .section = (Word_t)ctx, + .first_time_s = 0, + .last_time_s = 0, + .latest_update_every_s = 0, + }; + uuid_copy(entry.uuid, *uuid); + + METRIC *metric = mrg_metric_add_and_acquire(main_mrg, entry, NULL); + return metric; } STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance) { - STORAGE_METRIC_HANDLE *db_metric_handle; - - db_metric_handle = rrdeng_metric_get(db_instance, &rd->metric_uuid); - if(!db_metric_handle) { - db_metric_handle = rrdeng_metric_get_legacy(db_instance, rrddim_id(rd), rrdset_id(rd->rrdset)); - if(db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - uuid_copy(rd->metric_uuid, page_index->id); + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + METRIC *metric; + + metric = mrg_metric_get_and_acquire(main_mrg, &rd->metric_uuid, (Word_t) ctx); + + if(unlikely(!metric)) { + if(unlikely(ctx->config.legacy)) { + // this is a single host database + // generate uuid from the chart and dimensions ids + // and overwrite the one supplied by rrddim + metric = rrdeng_metric_get_legacy(db_instance, rrddim_id(rd), rrdset_id(rd->rrdset)); + if (metric) + uuid_copy(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric)); } + + if(likely(!metric)) + metric = rrdeng_metric_create(db_instance, &rd->metric_uuid); } - if(!db_metric_handle) - db_metric_handle = rrdeng_metric_create(db_instance, &rd->metric_uuid); #ifdef NETDATA_INTERNAL_CHECKS - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - if(uuid_compare(rd->metric_uuid, page_index->id) != 0) { + if(uuid_compare(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric)) != 0) { char uuid1[UUID_STR_LEN + 1]; char uuid2[UUID_STR_LEN + 1]; uuid_unparse(rd->metric_uuid, uuid1); - uuid_unparse(page_index->id, uuid2); - fatal("DBENGINE: uuids do not match, asked for metric '%s', but got page_index of metric '%s'", uuid1, uuid2); + uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid2); + fatal("DBENGINE: uuids do not match, asked for metric '%s', but got metric '%s'", uuid1, uuid2); } - struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; - if(page_index->ctx != ctx) - fatal("DBENGINE: mixed up rrdengine instances, asked for metric from %p, got from %p", ctx, page_index->ctx); + if(mrg_metric_ctx(metric) != ctx) + fatal("DBENGINE: mixed up db instances, asked for metric from %p, got from %p", + ctx, mrg_metric_ctx(metric)); #endif - return db_metric_handle; + return (STORAGE_METRIC_HANDLE *)metric; } // ---------------------------------------------------------------------------- // collect ops +static inline void check_and_fix_mrg_update_every(struct rrdeng_collect_handle *handle) { + if(unlikely((time_t)(handle->update_every_ut / USEC_PER_SEC) != mrg_metric_get_update_every_s(main_mrg, handle->metric))) { + internal_error(true, "DBENGINE: collection handle has update every %ld, but the metric registry has %ld. Fixing it.", + (time_t)(handle->update_every_ut / USEC_PER_SEC), mrg_metric_get_update_every_s(main_mrg, handle->metric)); + + if(unlikely(!handle->update_every_ut)) + handle->update_every_ut = (usec_t)mrg_metric_get_update_every_s(main_mrg, handle->metric) * USEC_PER_SEC; + else + mrg_metric_set_update_every(main_mrg, handle->metric, (time_t)(handle->update_every_ut / USEC_PER_SEC)); + } +} + +static inline bool check_completed_page_consistency(struct rrdeng_collect_handle *handle __maybe_unused) { +#ifdef NETDATA_INTERNAL_CHECKS + if (unlikely(!handle->page || !handle->page_entries_max || !handle->page_position || !handle->page_end_time_ut)) + return false; + + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + + uuid_t *uuid = mrg_metric_uuid(main_mrg, handle->metric); + time_t start_time_s = pgc_page_start_time_s(handle->page); + time_t end_time_s = pgc_page_end_time_s(handle->page); + time_t update_every_s = pgc_page_update_every_s(handle->page); + size_t page_length = handle->page_position * CTX_POINT_SIZE_BYTES(ctx); + size_t entries = handle->page_position; + time_t overwrite_zero_update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); + + if(end_time_s > max_acceptable_collected_time()) + handle->page_flags |= RRDENG_PAGE_COMPLETED_IN_FUTURE; + + VALIDATED_PAGE_DESCRIPTOR vd = validate_page( + uuid, + start_time_s, + end_time_s, + update_every_s, + page_length, + ctx->config.page_type, + entries, + 0, // do not check for future timestamps - we inherit the timestamps of the children + overwrite_zero_update_every_s, + false, + "collected", + handle->page_flags); + + return vd.is_valid; +#else + return true; +#endif +} + /* * Gets a handle for storing metrics to the database. * The handle must be released with rrdeng_store_metric_final(). */ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; + METRIC *metric = (METRIC *)db_metric_handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(metric); + + bool is_1st_metric_writer = true; + if(!mrg_metric_set_writer(main_mrg, metric)) { + is_1st_metric_writer = false; + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid); + error("DBENGINE: metric '%s' is already collected and should not be collected twice - expect gaps on the charts", uuid); + } + + metric = mrg_metric_dup(main_mrg, metric); + struct rrdeng_collect_handle *handle; handle = callocz(1, sizeof(struct rrdeng_collect_handle)); - handle->page_index = page_index; - handle->descr = NULL; - handle->unaligned_page = 0; - page_index->latest_update_every_s = update_every; + handle->metric = metric; + handle->page = NULL; + handle->page_position = 0; + handle->page_entries_max = 0; + handle->update_every_ut = (usec_t)update_every * USEC_PER_SEC; + handle->options = is_1st_metric_writer ? RRDENG_1ST_METRIC_WRITER : 0; + + __atomic_add_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED); + if(!is_1st_metric_writer) + __atomic_add_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED); + + mrg_metric_set_update_every(main_mrg, metric, update_every); handle->alignment = (struct pg_alignment *)smg; rrdeng_page_alignment_acquire(handle->alignment); - uv_rwlock_wrlock(&page_index->lock); - ++page_index->writers; - uv_rwlock_wrunlock(&page_index->lock); + // this is important! + // if we don't set the page_end_time_ut during the first collection + // data collection may be able to go back in time and during the addition of new pages + // clean pages may be found matching ours! + + time_t db_first_time_s, db_last_time_s, db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + handle->page_end_time_ut = (usec_t)db_last_time_s * USEC_PER_SEC; return (STORAGE_COLLECT_HANDLE *)handle; } /* The page must be populated and referenced */ -static int page_has_only_empty_metrics(struct rrdeng_page_descr *descr) -{ - switch(descr->type) { +static bool page_has_only_empty_metrics(struct rrdeng_collect_handle *handle) { + switch(handle->type) { case PAGE_METRICS: { - size_t slots = descr->page_length / PAGE_POINT_SIZE_BYTES(descr); - storage_number *array = (storage_number *)descr->pg_cache_descr->page; + size_t slots = handle->page_position; + storage_number *array = (storage_number *)pgc_page_data(handle->page); for (size_t i = 0 ; i < slots; ++i) { if(does_storage_number_exist(array[i])) - return 0; + return false; } } break; case PAGE_TIER: { - size_t slots = descr->page_length / PAGE_POINT_SIZE_BYTES(descr); - storage_number_tier1_t *array = (storage_number_tier1_t *)descr->pg_cache_descr->page; + size_t slots = handle->page_position; + storage_number_tier1_t *array = (storage_number_tier1_t *)pgc_page_data(handle->page); for (size_t i = 0 ; i < slots; ++i) { if(fpclassify(array[i].sum_value) != FP_NAN) - return 0; + return false; } } break; @@ -252,422 +309,585 @@ static int page_has_only_empty_metrics(struct rrdeng_page_descr *descr) default: { static bool logged = false; if(!logged) { - error("DBENGINE: cannot check page for nulls on unknown page type id %d", descr->type); + error("DBENGINE: cannot check page for nulls on unknown page type id %d", (mrg_metric_ctx(handle->metric))->config.page_type); logged = true; } - return 0; + return false; } } - return 1; + return true; } void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - // struct rrdeng_metric_handle *metric_handle = (struct rrdeng_metric_handle *)handle->metric_handle; - struct rrdengine_instance *ctx = handle->page_index->ctx; - struct rrdeng_page_descr *descr = handle->descr; - - if (unlikely(!ctx)) return; - if (unlikely(!descr)) return; - - if (likely(descr->page_length)) { - int page_is_empty; - - rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); - - page_is_empty = page_has_only_empty_metrics(descr); - if (page_is_empty) { - print_page_cache_descr(descr, "Page has empty metrics only, deleting", true); - pg_cache_put(ctx, descr); - pg_cache_punch_hole(ctx, descr, 1, 0, NULL); - } else - rrdeng_commit_page(ctx, descr, handle->page_correlation_id); - } else { - dbengine_page_free(descr->pg_cache_descr->page); - rrdeng_destroy_pg_cache_descr(ctx, descr->pg_cache_descr); - rrdeng_page_descr_freez(descr); + + if (unlikely(!handle->page)) + return; + + if(!handle->page_position || page_has_only_empty_metrics(handle)) + pgc_page_to_clean_evict_or_release(main_cache, handle->page); + + else { + check_completed_page_consistency(handle); + mrg_metric_set_clean_latest_time_s(main_mrg, handle->metric, pgc_page_end_time_s(handle->page)); + pgc_page_hot_to_dirty_and_release(main_cache, handle->page); } - handle->descr = NULL; -} -static void rrdeng_store_metric_next_internal(STORAGE_COLLECT_HANDLE *collection_handle, - usec_t point_in_time_ut, - NETDATA_DOUBLE n, - NETDATA_DOUBLE min_value, - NETDATA_DOUBLE max_value, - uint16_t count, - uint16_t anomaly_count, - SN_FLAGS flags) -{ - struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; - struct rrdengine_instance *ctx = handle->page_index->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_descr *descr = handle->descr; + mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, 0); - void *page; - uint8_t must_flush_unaligned_page = 0, perfect_page_alignment = 0; + handle->page = NULL; + handle->page_flags = 0; + handle->page_position = 0; + handle->page_entries_max = 0; - if (descr) { - /* Make alignment decisions */ + // important! + // we should never zero page end time ut, because this will allow + // collection to go back in time + // handle->page_end_time_ut = 0; + // handle->page_start_time_ut; + + check_and_fix_mrg_update_every(handle); +} + +static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *handle, + struct rrdengine_instance *ctx, + usec_t point_in_time_ut, + void *data, + size_t data_size) { + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + const time_t update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); + + PGC_ENTRY page_entry = { + .section = (Word_t) ctx, + .metric_id = mrg_metric_id(main_mrg, handle->metric), + .start_time_s = point_in_time_s, + .end_time_s = point_in_time_s, + .size = data_size, + .data = data, + .update_every_s = update_every_s, + .hot = true + }; + + size_t conflicts = 0; + bool added = true; + PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + while (unlikely(!added)) { + conflicts++; + + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); #ifdef NETDATA_INTERNAL_CHECKS - if(descr->end_time_ut + page_index->latest_update_every_s * USEC_PER_SEC != point_in_time_ut) { - char buffer[200 + 1]; - snprintfz(buffer, 200, - "metrics collected are %s, end_time_ut = %llu, point_in_time_ut = %llu, update_every = %u, delta = %llu", - (point_in_time_ut / USEC_PER_SEC - descr->end_time_ut / USEC_PER_SEC > page_index->latest_update_every_s)?"far apart":"not aligned", - descr->end_time_ut / USEC_PER_SEC, - point_in_time_ut / USEC_PER_SEC, - page_index->latest_update_every_s, - point_in_time_ut / USEC_PER_SEC - descr->end_time_ut / USEC_PER_SEC); - print_page_cache_descr(descr, buffer, false); - } + internal_error(true, +#else + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, #endif + "DBENGINE: metric '%s' new page from %ld to %ld, update every %ld, has a conflict in main cache " + "with existing %s%s page from %ld to %ld, update every %ld - " + "is it collected more than once?", + uuid, + page_entry.start_time_s, page_entry.end_time_s, (time_t)page_entry.update_every_s, + pgc_is_page_hot(page) ? "hot" : "not-hot", + pgc_page_data(page) == DBENGINE_EMPTY_PAGE ? " gap" : "", + pgc_page_start_time_s(page), pgc_page_end_time_s(page), pgc_page_update_every_s(page) + ); + + pgc_page_release(main_cache, page); + + point_in_time_ut -= handle->update_every_ut; + point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + page_entry.start_time_s = point_in_time_s; + page_entry.end_time_s = point_in_time_s; + page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + } - if (descr->page_length == handle->alignment->page_length) { - /* this is the leading dimension that defines chart alignment */ - perfect_page_alignment = 1; - } - /* is the metric far enough out of alignment with the others? */ - if (unlikely(descr->page_length + PAGE_POINT_SIZE_BYTES(descr) < handle->alignment->page_length)) { - handle->unaligned_page = 1; - print_page_cache_descr(descr, "Metric page is not aligned with chart", true); - } - if (unlikely(handle->unaligned_page && - /* did the other metrics change page? */ - handle->alignment->page_length <= PAGE_POINT_SIZE_BYTES(descr))) { - print_page_cache_descr(descr, "must_flush_unaligned_page = 1", true); - must_flush_unaligned_page = 1; - handle->unaligned_page = 0; - } + handle->page_entries_max = data_size / CTX_POINT_SIZE_BYTES(ctx); + handle->page_start_time_ut = point_in_time_ut; + handle->page_end_time_ut = point_in_time_ut; + handle->page_position = 1; // zero is already in our data + handle->page = page; + handle->page_flags = conflicts? RRDENG_PAGE_CONFLICT : 0; + + if(point_in_time_s > max_acceptable_collected_time()) + handle->page_flags |= RRDENG_PAGE_CREATED_IN_FUTURE; + + check_and_fix_mrg_update_every(handle); +} + +static void *rrdeng_alloc_new_metric_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + size_t size; + + if(handle->options & RRDENG_FIRST_PAGE_ALLOCATED) { + // any page except the first + size = tier_page_size[ctx->config.tier]; } - if (unlikely(NULL == descr || - descr->page_length + PAGE_POINT_SIZE_BYTES(descr) > RRDENG_BLOCK_SIZE || - must_flush_unaligned_page)) { + else { + size_t final_slots = 0; - if(descr) { - print_page_cache_descr(descr, "flushing metric", true); - rrdeng_store_metric_flush_current_page(collection_handle); + // the first page + handle->options |= RRDENG_FIRST_PAGE_ALLOCATED; + size_t max_size = tier_page_size[ctx->config.tier]; + size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx); + + if(handle->alignment->initial_slots) { + final_slots = handle->alignment->initial_slots; } + else { + max_slots -= 3; - page = rrdeng_create_page(ctx, &page_index->id, &descr); - fatal_assert(page); + size_t smaller_slot = indexing_partition((Word_t)handle->alignment, max_slots); + final_slots = smaller_slot; - descr->update_every_s = page_index->latest_update_every_s; - handle->descr = descr; + time_t now_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + size_t current_pos = (now_s % max_slots); - handle->page_correlation_id = rrd_atomic_fetch_add(&pg_cache->committed_page_index.latest_corr_id, 1); + if(current_pos > final_slots) + final_slots += max_slots - current_pos; - if (0 == handle->alignment->page_length) { - /* this is the leading dimension that defines chart alignment */ - perfect_page_alignment = 1; + else if(current_pos < final_slots) + final_slots -= current_pos; + + if(final_slots < 3) { + final_slots += 3; + smaller_slot += 3; + + if(smaller_slot >= max_slots) + smaller_slot -= max_slots; + } + + max_slots += 3; + handle->alignment->initial_slots = smaller_slot + 3; + + internal_fatal(handle->alignment->initial_slots < 3 || handle->alignment->initial_slots >= max_slots, "ooops! wrong distribution of metrics across time"); + internal_fatal(final_slots < 3 || final_slots >= max_slots, "ooops! wrong distribution of metrics across time"); } + + size = final_slots * CTX_POINT_SIZE_BYTES(ctx); } - page = descr->pg_cache_descr->page; + *data_size = size; + return dbengine_page_alloc(size); +} + +static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_handle, + const usec_t point_in_time_ut, + const NETDATA_DOUBLE n, + const NETDATA_DOUBLE min_value, + const NETDATA_DOUBLE max_value, + const uint16_t count, + const uint16_t anomaly_count, + const SN_FLAGS flags) +{ + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); - switch (descr->type) { + bool perfect_page_alignment = false; + void *data; + size_t data_size; + + if(likely(handle->page)) { + /* Make alignment decisions */ + if (handle->page_position == handle->alignment->page_position) { + /* this is the leading dimension that defines chart alignment */ + perfect_page_alignment = true; + } + + /* is the metric far enough out of alignment with the others? */ + if (unlikely(handle->page_position + 1 < handle->alignment->page_position)) + handle->options |= RRDENG_CHO_UNALIGNED; + + if (unlikely((handle->options & RRDENG_CHO_UNALIGNED) && + /* did the other metrics change page? */ + handle->alignment->page_position <= 1)) { + handle->options &= ~RRDENG_CHO_UNALIGNED; + handle->page_flags |= RRDENG_PAGE_UNALIGNED; + rrdeng_store_metric_flush_current_page(collection_handle); + + data = rrdeng_alloc_new_metric_data(handle, &data_size, point_in_time_ut); + } + else { + data = pgc_page_data(handle->page); + data_size = pgc_page_data_size(main_cache, handle->page); + } + } + else + data = rrdeng_alloc_new_metric_data(handle, &data_size, point_in_time_ut); + + switch (ctx->config.page_type) { case PAGE_METRICS: { - ((storage_number *)page)[descr->page_length / PAGE_POINT_SIZE_BYTES(descr)] = pack_storage_number(n, flags); + storage_number *tier0_metric_data = data; + tier0_metric_data[handle->page_position] = pack_storage_number(n, flags); } break; case PAGE_TIER: { + storage_number_tier1_t *tier12_metric_data = data; storage_number_tier1_t number_tier1; number_tier1.sum_value = (float)n; number_tier1.min_value = (float)min_value; number_tier1.max_value = (float)max_value; number_tier1.anomaly_count = anomaly_count; number_tier1.count = count; - ((storage_number_tier1_t *)page)[descr->page_length / PAGE_POINT_SIZE_BYTES(descr)] = number_tier1; + tier12_metric_data[handle->page_position] = number_tier1; } break; default: { static bool logged = false; if(!logged) { - error("DBENGINE: cannot store metric on unknown page type id %d", descr->type); + error("DBENGINE: cannot store metric on unknown page type id %d", ctx->config.page_type); logged = true; } } break; } - pg_cache_atomic_set_pg_info(descr, point_in_time_ut, descr->page_length + PAGE_POINT_SIZE_BYTES(descr)); + if(unlikely(!handle->page)){ + rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, data, data_size); + // handle->position is set to 1 already - if (perfect_page_alignment) - handle->alignment->page_length = descr->page_length; - if (unlikely(INVALID_TIME == descr->start_time_ut)) { - unsigned long new_metric_API_producers, old_metric_API_max_producers, ret_metric_API_max_producers; - descr->start_time_ut = point_in_time_ut; - - new_metric_API_producers = rrd_atomic_add_fetch(&ctx->stats.metric_API_producers, 1); - while (unlikely(new_metric_API_producers > (old_metric_API_max_producers = ctx->metric_API_max_producers))) { - /* Increase ctx->metric_API_max_producers */ - ret_metric_API_max_producers = ulong_compare_and_swap(&ctx->metric_API_max_producers, - old_metric_API_max_producers, - new_metric_API_producers); - if (old_metric_API_max_producers == ret_metric_API_max_producers) { - /* success */ - break; - } + if (0 == handle->alignment->page_position) { + /* this is the leading dimension that defines chart alignment */ + perfect_page_alignment = true; + } + } + else { + // update an existing page + pgc_page_hot_set_end_time_s(main_cache, handle->page, (time_t) (point_in_time_ut / USEC_PER_SEC)); + handle->page_end_time_ut = point_in_time_ut; + + if(unlikely(++handle->page_position >= handle->page_entries_max)) { + internal_fatal(handle->page_position > handle->page_entries_max, "DBENGINE: exceeded page max number of points"); + handle->page_flags |= RRDENG_PAGE_FULL; + rrdeng_store_metric_flush_current_page(collection_handle); } + } + + if (perfect_page_alignment) + handle->alignment->page_position = handle->page_position; + + // update the metric information + mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, (time_t) (point_in_time_ut / USEC_PER_SEC)); +} + +static void store_metric_next_error_log(struct rrdeng_collect_handle *handle, usec_t point_in_time_ut, const char *msg) { + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + char uuid[UUID_STR_LEN + 1]; + uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); - pg_cache_insert(ctx, page_index, descr); - } else { - pg_cache_add_new_metric_time(page_index, descr); + BUFFER *wb = NULL; + if(handle->page && handle->page_flags) { + wb = buffer_create(0, NULL); + collect_page_flags_to_buffer(wb, handle->page_flags); } -// { -// unsigned char u[16] = { 0x0C, 0x0A, 0x40, 0xD6, 0x2A, 0x43, 0x4A, 0x7C, 0x95, 0xF7, 0xD1, 0x1E, 0x0C, 0x9E, 0x8A, 0xE7 }; -// if(uuid_compare(u, page_index->id) == 0) { -// char buffer[100]; -// snprintfz(buffer, 100, "store system.cpu, collect:%u, page_index first:%u, last:%u", -// (uint32_t)(point_in_time / USEC_PER_SEC), -// (uint32_t)(page_index->oldest_time / USEC_PER_SEC), -// (uint32_t)(page_index->latest_time / USEC_PER_SEC)); -// -// print_page_cache_descr(descr, buffer, false); -// } -// } + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, + "DBENGINE: metric '%s' collected point at %ld, %s last collection at %ld, " + "update every %ld, %s page from %ld to %ld, position %u (of %u), flags: %s", + uuid, + point_in_time_s, + msg, + (time_t)(handle->page_end_time_ut / USEC_PER_SEC), + (time_t)(handle->update_every_ut / USEC_PER_SEC), + handle->page ? "current" : "*LAST*", + (time_t)(handle->page_start_time_ut / USEC_PER_SEC), + (time_t)(handle->page_end_time_ut / USEC_PER_SEC), + handle->page_position, handle->page_entries_max, + wb ? buffer_tostring(wb) : "" + ); + + buffer_free(wb); } void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, - usec_t point_in_time_ut, - NETDATA_DOUBLE n, - NETDATA_DOUBLE min_value, - NETDATA_DOUBLE max_value, - uint16_t count, - uint16_t anomaly_count, - SN_FLAGS flags) + const usec_t point_in_time_ut, + const NETDATA_DOUBLE n, + const NETDATA_DOUBLE min_value, + const NETDATA_DOUBLE max_value, + const uint16_t count, + const uint16_t anomaly_count, + const SN_FLAGS flags) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; - struct rrdeng_page_descr *descr = handle->descr; - - if(likely(descr)) { - usec_t last_point_in_time_ut = descr->end_time_ut; - usec_t update_every_ut = page_index->latest_update_every_s * USEC_PER_SEC; - size_t points_gap = (point_in_time_ut <= last_point_in_time_ut) ? - (size_t)0 : - (size_t)((point_in_time_ut - last_point_in_time_ut) / update_every_ut); - - if(unlikely(points_gap != 1)) { - if (unlikely(points_gap <= 0)) { - time_t now = now_realtime_sec(); - static __thread size_t counter = 0; - static __thread time_t last_time_logged = 0; - counter++; - - if(now - last_time_logged > 600) { - error("DBENGINE: collected point is in the past (repeated %zu times in the last %zu secs). Ignoring these data collection points.", - counter, (size_t)(last_time_logged?(now - last_time_logged):0)); - - last_time_logged = now; - counter = 0; - } - return; - } - size_t point_size = PAGE_POINT_SIZE_BYTES(descr); - size_t page_size_in_points = RRDENG_BLOCK_SIZE / point_size; - size_t used_points = descr->page_length / point_size; - size_t remaining_points_in_page = page_size_in_points - used_points; +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(point_in_time_ut > (usec_t)max_acceptable_collected_time() * USEC_PER_SEC)) + handle->page_flags |= RRDENG_PAGE_FUTURE_POINT; +#endif + + if(likely(handle->page_end_time_ut + handle->update_every_ut == point_in_time_ut)) { + // happy path + ; + } + else if(unlikely(point_in_time_ut < handle->page_end_time_ut)) { + handle->page_flags |= RRDENG_PAGE_PAST_COLLECTION; + store_metric_next_error_log(handle, point_in_time_ut, "is older than the"); + return; + } - bool new_point_is_aligned = true; - if(unlikely((point_in_time_ut - last_point_in_time_ut) / points_gap != update_every_ut)) - new_point_is_aligned = false; + else if(unlikely(point_in_time_ut == handle->page_end_time_ut)) { + handle->page_flags |= RRDENG_PAGE_REPEATED_COLLECTION; + store_metric_next_error_log(handle, point_in_time_ut, "is at the same time as the"); + return; + } - if(unlikely(points_gap > remaining_points_in_page || !new_point_is_aligned)) { -// char buffer[200]; -// snprintfz(buffer, 200, "data collection skipped %zu points, last stored point %llu, new point %llu, update every %d. Cutting page.", -// points_gap, last_point_in_time_ut / USEC_PER_SEC, point_in_time_ut / USEC_PER_SEC, page_index->latest_update_every_s); -// print_page_cache_descr(descr, buffer, false); + else if(handle->page) { + usec_t delta_ut = point_in_time_ut - handle->page_end_time_ut; + if(unlikely(delta_ut < handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else if(unlikely(delta_ut % handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_UNALIGNED; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else { + size_t points_gap = delta_ut / handle->update_every_ut; + size_t page_remaining_points = handle->page_entries_max - handle->page_position; + + if(points_gap >= page_remaining_points) { + handle->page_flags |= RRDENG_PAGE_BIG_GAP; rrdeng_store_metric_flush_current_page(collection_handle); } else { -// char buffer[200]; -// snprintfz(buffer, 200, "data collection skipped %zu points, last stored point %llu, new point %llu, update every %d. Filling the gap.", -// points_gap, last_point_in_time_ut / USEC_PER_SEC, point_in_time_ut / USEC_PER_SEC, page_index->latest_update_every_s); -// print_page_cache_descr(descr, buffer, false); - // loop to fill the gap - usec_t step_ut = page_index->latest_update_every_s * USEC_PER_SEC; - usec_t last_point_filled_ut = last_point_in_time_ut + step_ut; - - while (last_point_filled_ut < point_in_time_ut) { - rrdeng_store_metric_next_internal( - collection_handle, last_point_filled_ut, NAN, NAN, NAN, - 1, 0, SN_EMPTY_SLOT); - - last_point_filled_ut += step_ut; + handle->page_flags |= RRDENG_PAGE_GAP; + + usec_t stop_ut = point_in_time_ut - handle->update_every_ut; + for(usec_t this_ut = handle->page_end_time_ut + handle->update_every_ut; + this_ut <= stop_ut ; + this_ut = handle->page_end_time_ut + handle->update_every_ut) { + rrdeng_store_metric_append_point( + collection_handle, + this_ut, + NAN, NAN, NAN, + 1, 0, + SN_EMPTY_SLOT); } } } } - rrdeng_store_metric_next_internal(collection_handle, point_in_time_ut, n, min_value, max_value, count, anomaly_count, flags); + rrdeng_store_metric_append_point(collection_handle, + point_in_time_ut, + n, min_value, max_value, + count, anomaly_count, + flags); } - /* * Releases the database reference from the handle for storing metrics. * Returns 1 if it's safe to delete the dimension. */ int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; - - uint8_t can_delete_metric = 0; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + handle->page_flags |= RRDENG_PAGE_COLLECT_FINALIZE; rrdeng_store_metric_flush_current_page(collection_handle); - uv_rwlock_wrlock(&page_index->lock); + rrdeng_page_alignment_release(handle->alignment); - if (!--page_index->writers && !page_index->page_count) - can_delete_metric = 1; + __atomic_sub_fetch(&ctx->atomic.collectors_running, 1, __ATOMIC_RELAXED); + if(!(handle->options & RRDENG_1ST_METRIC_WRITER)) + __atomic_sub_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED); - uv_rwlock_wrunlock(&page_index->lock); + if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_clear_writer(main_mrg, handle->metric)) + internal_fatal(true, "DBENGINE: metric is already released"); - rrdeng_page_alignment_release(handle->alignment); + time_t first_time_s, last_time_s, update_every_s; + mrg_metric_get_retention(main_mrg, handle->metric, &first_time_s, &last_time_s, &update_every_s); + + mrg_metric_release(main_mrg, handle->metric); freez(handle); - return can_delete_metric; + if(!first_time_s && !last_time_s) + return 1; + + return 0; } void rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - struct pg_cache_page_index *page_index = handle->page_index; + check_and_fix_mrg_update_every(handle); + + METRIC *metric = handle->metric; + usec_t update_every_ut = (usec_t)update_every * USEC_PER_SEC; + + if(update_every_ut == handle->update_every_ut) + return; + + handle->page_flags |= RRDENG_PAGE_UPDATE_EVERY_CHANGE; rrdeng_store_metric_flush_current_page(collection_handle); - uv_rwlock_rdlock(&page_index->lock); - page_index->latest_update_every_s = update_every; - uv_rwlock_rdunlock(&page_index->lock); + mrg_metric_set_update_every(main_mrg, metric, update_every); + handle->update_every_ut = update_every_ut; } // ---------------------------------------------------------------------------- // query ops -//static inline uint32_t *pginfo_to_dt(struct rrdeng_page_info *page_info) -//{ -// return (uint32_t *)&page_info->scratch[0]; -//} -// -//static inline uint32_t *pginfo_to_points(struct rrdeng_page_info *page_info) -//{ -// return (uint32_t *)&page_info->scratch[sizeof(uint32_t)]; -//} -// +#ifdef NETDATA_INTERNAL_CHECKS +SPINLOCK global_query_handle_spinlock = NETDATA_SPINLOCK_INITIALIZER; +static struct rrdeng_query_handle *global_query_handle_ll = NULL; +static void register_query_handle(struct rrdeng_query_handle *handle) { + handle->query_pid = gettid(); + handle->started_time_s = now_realtime_sec(); + + netdata_spinlock_lock(&global_query_handle_spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); + netdata_spinlock_unlock(&global_query_handle_spinlock); +} +static void unregister_query_handle(struct rrdeng_query_handle *handle) { + netdata_spinlock_lock(&global_query_handle_spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); + netdata_spinlock_unlock(&global_query_handle_spinlock); +} +#else +static void register_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { + ; +} +static void unregister_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { + ; +} +#endif + /* * Gets a handle for loading metrics from the database. * The handle must be released with rrdeng_load_metric_final(). */ -void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrdimm_handle, time_t start_time_s, time_t end_time_s) +void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, + struct storage_engine_query_handle *rrddim_handle, + time_t start_time_s, + time_t end_time_s, + STORAGE_PRIORITY priority) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - struct rrdengine_instance *ctx = page_index->ctx; + usec_t started_ut = now_monotonic_usec(); - // fprintf(stderr, "%s: %s/%s start time %ld, end time %ld\n", __FUNCTION__ , rd->rrdset->name, rd->name, start_time, end_time); + netdata_thread_disable_cancelability(); + METRIC *metric = (METRIC *)db_metric_handle; + struct rrdengine_instance *ctx = mrg_metric_ctx(metric); struct rrdeng_query_handle *handle; - unsigned pages_nr; - if(!page_index->latest_update_every_s) - page_index->latest_update_every_s = default_rrd_update_every; + handle = rrdeng_query_handle_get(); + register_query_handle(handle); - rrdimm_handle->start_time_s = start_time_s; - rrdimm_handle->end_time_s = end_time_s; + if(unlikely(priority < STORAGE_PRIORITY_HIGH)) + priority = STORAGE_PRIORITY_HIGH; + else if(unlikely(priority > STORAGE_PRIORITY_BEST_EFFORT)) + priority = STORAGE_PRIORITY_BEST_EFFORT; - handle = callocz(1, sizeof(struct rrdeng_query_handle)); - handle->wanted_start_time_s = start_time_s; - handle->now_s = start_time_s; - handle->position = 0; handle->ctx = ctx; - handle->descr = NULL; - handle->dt_s = page_index->latest_update_every_s; - rrdimm_handle->handle = (STORAGE_QUERY_HANDLE *)handle; - pages_nr = pg_cache_preload(ctx, &page_index->id, start_time_s * USEC_PER_SEC, end_time_s * USEC_PER_SEC, - NULL, &handle->page_index); - if (unlikely(NULL == handle->page_index || 0 == pages_nr)) - // there are no metrics to load - handle->wanted_start_time_s = INVALID_TIME; -} + handle->metric = metric; + handle->priority = priority; + + // IMPORTANT! + // It is crucial not to exceed the db boundaries, because dbengine + // now has gap caching, so when a gap is detected a negative page + // is inserted into the main cache, to avoid scanning the journals + // again for pages matching the gap. + + time_t db_first_time_s, db_last_time_s, db_update_every_s; + mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s); + + if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) == PAGE_IS_IN_RANGE) { + handle->start_time_s = MAX(start_time_s, db_first_time_s); + handle->end_time_s = MIN(end_time_s, db_last_time_s); + handle->now_s = handle->start_time_s; + + handle->dt_s = db_update_every_s; + if (!handle->dt_s) { + handle->dt_s = default_rrd_update_every; + mrg_metric_set_update_every_s_if_zero(main_mrg, metric, default_rrd_update_every); + } -static int rrdeng_load_page_next(struct storage_engine_query_handle *rrdimm_handle, bool debug_this __maybe_unused) { - struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; + rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle; + rrddim_handle->start_time_s = handle->start_time_s; + rrddim_handle->end_time_s = handle->end_time_s; + rrddim_handle->priority = priority; - struct rrdengine_instance *ctx = handle->ctx; - struct rrdeng_page_descr *descr = handle->descr; + pg_cache_preload(handle); - uint32_t page_length; - usec_t page_end_time_ut; - unsigned position; + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_init, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED); + } + else { + handle->start_time_s = start_time_s; + handle->end_time_s = end_time_s; + handle->now_s = start_time_s; + handle->dt_s = db_update_every_s; + + rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle; + rrddim_handle->start_time_s = handle->start_time_s; + rrddim_handle->end_time_s = 0; + rrddim_handle->priority = priority; + } +} - if (likely(descr)) { - // Drop old page's reference +static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_handle, bool debug_this __maybe_unused) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; + struct rrdengine_instance *ctx = handle->ctx; -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); -#endif + if (likely(handle->page)) { + // we have a page to release + pgc_page_release(main_cache, handle->page); + handle->page = NULL; + } - pg_cache_put(ctx, descr); - handle->descr = NULL; - handle->wanted_start_time_s = (time_t)((handle->page_end_time_ut / USEC_PER_SEC) + handle->dt_s); + if (unlikely(handle->now_s > rrddim_handle->end_time_s)) + return false; - if (unlikely(handle->wanted_start_time_s > rrdimm_handle->end_time_s)) - return 1; - } + size_t entries; + handle->page = pg_cache_lookup_next(ctx, handle->pdc, handle->now_s, handle->dt_s, &entries); + if (unlikely(!handle->page)) + return false; - usec_t wanted_start_time_ut = handle->wanted_start_time_s * USEC_PER_SEC; - descr = pg_cache_lookup_next(ctx, handle->page_index, &handle->page_index->id, - wanted_start_time_ut, rrdimm_handle->end_time_s * USEC_PER_SEC); - if (NULL == descr) - return 1; + internal_fatal(pgc_page_data(handle->page) == DBENGINE_EMPTY_PAGE, "Empty page returned"); -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, 1); -#endif + time_t page_start_time_s = pgc_page_start_time_s(handle->page); + time_t page_end_time_s = pgc_page_end_time_s(handle->page); + time_t page_update_every_s = pgc_page_update_every_s(handle->page); - handle->descr = descr; - pg_cache_atomic_get_pg_info(descr, &page_end_time_ut, &page_length); - if (unlikely(INVALID_TIME == descr->start_time_ut || INVALID_TIME == page_end_time_ut || 0 == descr->update_every_s)) { - error("DBENGINE: discarding invalid page descriptor (start_time = %llu, end_time = %llu, update_every_s = %d)", - descr->start_time_ut, page_end_time_ut, descr->update_every_s); - return 1; - } + unsigned position; + if(likely(handle->now_s >= page_start_time_s && handle->now_s <= page_end_time_s)) { + + if(unlikely(entries == 1 || page_start_time_s == page_end_time_s || !page_update_every_s)) { + position = 0; + handle->now_s = page_start_time_s; + } + else { + position = (handle->now_s - page_start_time_s) * (entries - 1) / (page_end_time_s - page_start_time_s); + time_t point_end_time_s = page_start_time_s + position * page_update_every_s; + while(point_end_time_s < handle->now_s && position + 1 < entries) { + // https://github.com/netdata/netdata/issues/14411 + // we really need a while() here, because the delta may be + // 2 points at higher tiers + position++; + point_end_time_s = page_start_time_s + position * page_update_every_s; + } + handle->now_s = point_end_time_s; + } - if (unlikely(descr->start_time_ut != page_end_time_ut && wanted_start_time_ut > descr->start_time_ut)) { - // we're in the middle of the page somewhere - unsigned entries = page_length / PAGE_POINT_SIZE_BYTES(descr); - position = ((uint64_t)(wanted_start_time_ut - descr->start_time_ut)) * (entries - 1) / - (page_end_time_ut - descr->start_time_ut); + internal_fatal(position >= entries, "DBENGINE: wrong page position calculation"); } - else + else if(handle->now_s < page_start_time_s) { + handle->now_s = page_start_time_s; position = 0; + } + else { + internal_fatal(true, "DBENGINE: this page is entirely in our past and should not be accepted for this query in the first place"); + handle->now_s = page_end_time_s; + position = entries - 1; + } - handle->page_end_time_ut = page_end_time_ut; - handle->page_length = page_length; - handle->entries = page_length / PAGE_POINT_SIZE_BYTES(descr); - handle->page = descr->pg_cache_descr->page; - handle->dt_s = descr->update_every_s; + handle->entries = entries; handle->position = position; - -// if(debug_this) -// info("DBENGINE: rrdeng_load_page_next(), " -// "position:%d, " -// "start_time_ut:%llu, " -// "page_end_time_ut:%llu, " -// "next_page_time_ut:%llu, " -// "in_out:%s" -// , position -// , descr->start_time_ut -// , page_end_time_ut -// , -// wanted_start_time_ut, in_out?"true":"false" -// ); - - return 0; + handle->metric_data = pgc_page_data((PGC_PAGE *)handle->page); + handle->dt_s = page_update_every_s; + return true; } // Returns the metric and sets its timestamp into current_time @@ -675,75 +895,28 @@ static int rrdeng_load_page_next(struct storage_engine_query_handle *rrdimm_hand // IT IS REQUIRED TO **ALWAYS** KEEP TRACK OF TIME, EVEN OUTSIDE THE DATABASE BOUNDARIES STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim_handle) { struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - // struct rrdeng_metric_handle *metric_handle = handle->metric_handle; - - struct rrdeng_page_descr *descr = handle->descr; - time_t now = handle->now_s + handle->dt_s; - -// bool debug_this = false; -// { -// unsigned char u[16] = { 0x0C, 0x0A, 0x40, 0xD6, 0x2A, 0x43, 0x4A, 0x7C, 0x95, 0xF7, 0xD1, 0x1E, 0x0C, 0x9E, 0x8A, 0xE7 }; -// if(uuid_compare(u, handle->page_index->id) == 0) { -// char buffer[100]; -// snprintfz(buffer, 100, "load system.cpu, now:%u, dt:%u, position:%u page_index first:%u, last:%u", -// (uint32_t)(now), -// (uint32_t)(handle->dt_s), -// (uint32_t)(handle->position), -// (uint32_t)(handle->page_index->oldest_time / USEC_PER_SEC), -// (uint32_t)(handle->page_index->latest_time / USEC_PER_SEC)); -// -// print_page_cache_descr(descr, buffer, false); -// debug_this = true; -// } -// } - STORAGE_POINT sp; - unsigned position = handle->position + 1; - storage_number_tier1_t tier1_value; - - if (unlikely(INVALID_TIME == handle->wanted_start_time_s)) { - handle->wanted_start_time_s = INVALID_TIME; - handle->now_s = now; - storage_point_empty(sp, now - handle->dt_s, now); - return sp; + + if (unlikely(handle->now_s > rrddim_handle->end_time_s)) { + storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); + goto prepare_for_next_iteration; } - if (unlikely(!descr || position >= handle->entries)) { + if (unlikely(!handle->page || handle->position >= handle->entries)) { // We need to get a new page - if(rrdeng_load_page_next(rrddim_handle, false)) { - // next calls will not load any more metrics - handle->wanted_start_time_s = INVALID_TIME; - handle->now_s = now; - storage_point_empty(sp, now - handle->dt_s, now); - return sp; - } - descr = handle->descr; - position = handle->position; - now = (time_t)((descr->start_time_ut / USEC_PER_SEC) + position * descr->update_every_s); - -// if(debug_this) { -// char buffer[100]; -// snprintfz(buffer, 100, "NEW PAGE system.cpu, now:%u, dt:%u, position:%u page_index first:%u, last:%u", -// (uint32_t)(now), -// (uint32_t)(handle->dt_s), -// (uint32_t)(handle->position), -// (uint32_t)(handle->page_index->oldest_time / USEC_PER_SEC), -// (uint32_t)(handle->page_index->latest_time / USEC_PER_SEC)); -// -// print_page_cache_descr(descr, buffer, false); -// } + if (!rrdeng_load_page_next(rrddim_handle, false)) { + storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); + goto prepare_for_next_iteration; + } } - sp.start_time = now - handle->dt_s; - sp.end_time = now; - - handle->position = position; - handle->now_s = now; + sp.start_time_s = handle->now_s - handle->dt_s; + sp.end_time_s = handle->now_s; - switch(descr->type) { + switch(handle->ctx->config.page_type) { case PAGE_METRICS: { - storage_number n = handle->page[position]; + storage_number n = handle->metric_data[handle->position]; sp.min = sp.max = sp.sum = unpack_storage_number(n); sp.flags = n & SN_USER_FLAGS; sp.count = 1; @@ -752,7 +925,7 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim break; case PAGE_TIER: { - tier1_value = ((storage_number_tier1_t *)handle->page)[position]; + storage_number_tier1_t tier1_value = ((storage_number_tier1_t *)handle->metric_data)[handle->position]; sp.flags = tier1_value.anomaly_count ? SN_FLAG_NONE : SN_FLAG_NOT_ANOMALOUS; sp.count = tier1_value.count; sp.anomaly_count = tier1_value.anomaly_count; @@ -766,204 +939,98 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim default: { static bool logged = false; if(!logged) { - error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", descr->type); + error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", handle->ctx->config.page_type); logged = true; } - storage_point_empty(sp, sp.start_time, sp.end_time); + storage_point_empty(sp, sp.start_time_s, sp.end_time_s); } break; } - if (unlikely(now >= rrddim_handle->end_time_s)) { - // next calls will not load any more metrics - handle->wanted_start_time_s = INVALID_TIME; - } +prepare_for_next_iteration: + internal_fatal(sp.end_time_s < rrddim_handle->start_time_s, "DBENGINE: this point is too old for this query"); + internal_fatal(sp.end_time_s < handle->now_s, "DBENGINE: this point is too old for this point in time"); -// if(debug_this) -// info("DBENGINE: returning point: " -// "time from %ld to %ld // query from %ld to %ld // wanted_start_time_s %ld" -// , sp.start_time, sp.end_time -// , rrddim_handle->start_time_s, rrddim_handle->end_time_s -// , handle->wanted_start_time_s -// ); + handle->now_s += handle->dt_s; + handle->position++; return sp; } -int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrdimm_handle) -{ - struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; - return (INVALID_TIME == handle->wanted_start_time_s); +int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; + return (handle->now_s > rrddim_handle->end_time_s); } /* * Releases the database reference from the handle for loading metrics. */ -void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrdimm_handle) +void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle) { - struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; - struct rrdengine_instance *ctx = handle->ctx; - struct rrdeng_page_descr *descr = handle->descr; + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - if (descr) { -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); -#endif - pg_cache_put(ctx, descr); - } + if (handle->page) + pgc_page_release(main_cache, handle->page); - // whatever is allocated at rrdeng_load_metric_init() should be freed here - freez(handle); - rrdimm_handle->handle = NULL; -} + if(!pdc_release_and_destroy_if_unreferenced(handle->pdc, false, false)) + __atomic_store_n(&handle->pdc->workers_should_stop, true, __ATOMIC_RELAXED); -time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - return (time_t)(page_index->latest_time_ut / USEC_PER_SEC); -} -time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { - struct pg_cache_page_index *page_index = (struct pg_cache_page_index *)db_metric_handle; - return (time_t)(page_index->oldest_time_ut / USEC_PER_SEC); + unregister_query_handle(handle); + rrdeng_query_handle_release(handle); + rrddim_handle->handle = NULL; + netdata_thread_enable_cancelability(); } -int rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *si, uuid_t *dim_uuid, time_t *first_entry_t, time_t *last_entry_t) -{ - struct page_cache *pg_cache; - struct rrdengine_instance *ctx; - Pvoid_t *PValue; - struct pg_cache_page_index *page_index = NULL; +time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - ctx = (struct rrdengine_instance *)si; - if (unlikely(!ctx)) { - error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); - return 1; + if(handle->pdc) { + rrdeng_prep_wait(handle->pdc); + if (handle->pdc->optimal_end_time_s > rrddim_handle->end_time_s) + rrddim_handle->end_time_s = handle->pdc->optimal_end_time_s; } - pg_cache = &ctx->pg_cache; - uv_rwlock_rdlock(&pg_cache->metrics_index.lock); - PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, dim_uuid, sizeof(uuid_t)); - if (likely(NULL != PValue)) { - page_index = *PValue; - } - uv_rwlock_rdunlock(&pg_cache->metrics_index.lock); + return rrddim_handle->end_time_s; +} - if (likely(page_index)) { - *first_entry_t = page_index->oldest_time_ut / USEC_PER_SEC; - *last_entry_t = page_index->latest_time_ut / USEC_PER_SEC; - return 0; - } +time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { + METRIC *metric = (METRIC *)db_metric_handle; + time_t latest_time_s = 0; - return 1; -} + if (metric) + latest_time_s = mrg_metric_get_latest_time_s(main_mrg, metric); -/* Also gets a reference for the page */ -void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr) -{ - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - void *page; - /* TODO: check maximum number of pages in page cache limit */ - - descr = pg_cache_create_descr(); - descr->id = id; /* TODO: add page type: metric, log, something? */ - descr->type = ctx->page_type; - page = dbengine_page_alloc(); /*TODO: add page size */ - rrdeng_page_descr_mutex_lock(ctx, descr); - pg_cache_descr = descr->pg_cache_descr; - pg_cache_descr->page = page; - pg_cache_descr->flags = RRD_PAGE_DIRTY /*| RRD_PAGE_LOCKED */ | RRD_PAGE_POPULATED /* | BEING_COLLECTED */; - pg_cache_descr->refcnt = 1; - - debug(D_RRDENGINE, "Created new page:"); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr, "", true); - rrdeng_page_descr_mutex_unlock(ctx, descr); - *ret_descr = descr; - return page; + return latest_time_s; } -/* The page must not be empty */ -void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, - Word_t page_correlation_id) -{ - struct page_cache *pg_cache = &ctx->pg_cache; - Pvoid_t *PValue; - unsigned nr_committed_pages; +time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { + METRIC *metric = (METRIC *)db_metric_handle; - if (unlikely(NULL == descr)) { - debug(D_RRDENGINE, "%s: page descriptor is NULL, page has already been force-committed.", __func__); - return; - } - fatal_assert(descr->page_length); - - uv_rwlock_wrlock(&pg_cache->committed_page_index.lock); - PValue = JudyLIns(&pg_cache->committed_page_index.JudyL_array, page_correlation_id, PJE0); - *PValue = descr; - nr_committed_pages = ++pg_cache->committed_page_index.nr_committed_pages; - uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock); - - if (nr_committed_pages >= pg_cache_hard_limit(ctx) / 2) { - /* over 50% of pages have not been committed yet */ - - if (ctx->drop_metrics_under_page_cache_pressure && - nr_committed_pages >= pg_cache_committed_hard_limit(ctx)) { - /* 100% of pages are dirty */ - struct rrdeng_cmd cmd; - - cmd.opcode = RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); - } else { - if (0 == (unsigned long) ctx->stats.pg_cache_over_half_dirty_events) { - /* only print the first time */ - errno = 0; - error("Failed to flush dirty buffers quickly enough in dbengine instance \"%s\". " - "Metric data at risk of not being stored in the database, " - "please reduce disk load or use a faster disk.", ctx->dbfiles_path); - } - rrd_stat_atomic_add(&ctx->stats.pg_cache_over_half_dirty_events, 1); - rrd_stat_atomic_add(&global_pg_cache_over_half_dirty_events, 1); - } - } + time_t oldest_time_s = 0; + if (metric) + oldest_time_s = mrg_metric_get_first_time_s(main_mrg, metric); - pg_cache_put(ctx, descr); + return oldest_time_s; } -/* Gets a reference for the page */ -void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void **handle) +bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s) { - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; - - debug(D_RRDENGINE, "Reading existing page:"); - descr = pg_cache_lookup(ctx, NULL, id, INVALID_TIME); - if (NULL == descr) { - *handle = NULL; - - return NULL; + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + if (unlikely(!ctx)) { + error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); + return false; } - *handle = descr; - pg_cache_descr = descr->pg_cache_descr; - return pg_cache_descr->page; -} + METRIC *metric = mrg_metric_get_and_acquire(main_mrg, dim_uuid, (Word_t) ctx); + if (unlikely(!metric)) + return false; -/* Gets a reference for the page */ -void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_in_time_ut, void **handle) -{ - struct rrdeng_page_descr *descr; - struct page_cache_descr *pg_cache_descr; + time_t update_every_s; + mrg_metric_get_retention(main_mrg, metric, first_entry_s, last_entry_s, &update_every_s); - debug(D_RRDENGINE, "Reading existing page:"); - descr = pg_cache_lookup(ctx, NULL, id, point_in_time_ut); - if (NULL == descr) { - *handle = NULL; - - return NULL; - } - *handle = descr; - pg_cache_descr = descr->pg_cache_descr; + mrg_metric_release(main_mrg, metric); - return pg_cache_descr->page; + return true; } /* @@ -977,62 +1044,126 @@ void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long if (ctx == NULL) return; - struct page_cache *pg_cache = &ctx->pg_cache; - - array[0] = (uint64_t)ctx->stats.metric_API_producers; - array[1] = (uint64_t)ctx->stats.metric_API_consumers; - array[2] = (uint64_t)pg_cache->page_descriptors; - array[3] = (uint64_t)pg_cache->populated_pages; - array[4] = (uint64_t)pg_cache->committed_page_index.nr_committed_pages; - array[5] = (uint64_t)ctx->stats.pg_cache_insertions; - array[6] = (uint64_t)ctx->stats.pg_cache_deletions; - array[7] = (uint64_t)ctx->stats.pg_cache_hits; - array[8] = (uint64_t)ctx->stats.pg_cache_misses; - array[9] = (uint64_t)ctx->stats.pg_cache_backfills; - array[10] = (uint64_t)ctx->stats.pg_cache_evictions; - array[11] = (uint64_t)ctx->stats.before_compress_bytes; - array[12] = (uint64_t)ctx->stats.after_compress_bytes; - array[13] = (uint64_t)ctx->stats.before_decompress_bytes; - array[14] = (uint64_t)ctx->stats.after_decompress_bytes; - array[15] = (uint64_t)ctx->stats.io_write_bytes; - array[16] = (uint64_t)ctx->stats.io_write_requests; - array[17] = (uint64_t)ctx->stats.io_read_bytes; - array[18] = (uint64_t)ctx->stats.io_read_requests; - array[19] = (uint64_t)ctx->stats.io_write_extent_bytes; - array[20] = (uint64_t)ctx->stats.io_write_extents; - array[21] = (uint64_t)ctx->stats.io_read_extent_bytes; - array[22] = (uint64_t)ctx->stats.io_read_extents; - array[23] = (uint64_t)ctx->stats.datafile_creations; - array[24] = (uint64_t)ctx->stats.datafile_deletions; - array[25] = (uint64_t)ctx->stats.journalfile_creations; - array[26] = (uint64_t)ctx->stats.journalfile_deletions; - array[27] = (uint64_t)ctx->stats.page_cache_descriptors; - array[28] = (uint64_t)ctx->stats.io_errors; - array[29] = (uint64_t)ctx->stats.fs_errors; - array[30] = (uint64_t)global_io_errors; - array[31] = (uint64_t)global_fs_errors; - array[32] = (uint64_t)rrdeng_reserved_file_descriptors; - array[33] = (uint64_t)ctx->stats.pg_cache_over_half_dirty_events; - array[34] = (uint64_t)global_pg_cache_over_half_dirty_events; - array[35] = (uint64_t)ctx->stats.flushing_pressure_page_deletions; - array[36] = (uint64_t)global_flushing_pressure_page_deletions; - fatal_assert(RRDENG_NR_STATS == 37); + array[0] = (uint64_t)__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); // API producers + array[1] = (uint64_t)__atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED); // API consumers + array[2] = 0; + array[3] = 0; + array[4] = 0; + array[5] = 0; // (uint64_t)ctx->stats.pg_cache_insertions; + array[6] = 0; // (uint64_t)ctx->stats.pg_cache_deletions; + array[7] = 0; // (uint64_t)ctx->stats.pg_cache_hits; + array[8] = 0; // (uint64_t)ctx->stats.pg_cache_misses; + array[9] = 0; // (uint64_t)ctx->stats.pg_cache_backfills; + array[10] = 0; // (uint64_t)ctx->stats.pg_cache_evictions; + array[11] = (uint64_t)__atomic_load_n(&ctx->stats.before_compress_bytes, __ATOMIC_RELAXED); // used + array[12] = (uint64_t)__atomic_load_n(&ctx->stats.after_compress_bytes, __ATOMIC_RELAXED); // used + array[13] = (uint64_t)__atomic_load_n(&ctx->stats.before_decompress_bytes, __ATOMIC_RELAXED); + array[14] = (uint64_t)__atomic_load_n(&ctx->stats.after_decompress_bytes, __ATOMIC_RELAXED); + array[15] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_bytes, __ATOMIC_RELAXED); // used + array[16] = (uint64_t)__atomic_load_n(&ctx->stats.io_write_requests, __ATOMIC_RELAXED); // used + array[17] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_bytes, __ATOMIC_RELAXED); + array[18] = (uint64_t)__atomic_load_n(&ctx->stats.io_read_requests, __ATOMIC_RELAXED); // used + array[19] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extent_bytes, __ATOMIC_RELAXED); + array[20] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_write_extents, __ATOMIC_RELAXED); + array[21] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extent_bytes, __ATOMIC_RELAXED); + array[22] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.io_read_extents, __ATOMIC_RELAXED); + array[23] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_creations, __ATOMIC_RELAXED); + array[24] = (uint64_t)__atomic_load_n(&ctx->stats.datafile_deletions, __ATOMIC_RELAXED); + array[25] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_creations, __ATOMIC_RELAXED); + array[26] = (uint64_t)__atomic_load_n(&ctx->stats.journalfile_deletions, __ATOMIC_RELAXED); + array[27] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.page_cache_descriptors, __ATOMIC_RELAXED); + array[28] = (uint64_t)__atomic_load_n(&ctx->stats.io_errors, __ATOMIC_RELAXED); + array[29] = (uint64_t)__atomic_load_n(&ctx->stats.fs_errors, __ATOMIC_RELAXED); + array[30] = (uint64_t)__atomic_load_n(&global_io_errors, __ATOMIC_RELAXED); // used + array[31] = (uint64_t)__atomic_load_n(&global_fs_errors, __ATOMIC_RELAXED); // used + array[32] = (uint64_t)__atomic_load_n(&rrdeng_reserved_file_descriptors, __ATOMIC_RELAXED); // used + array[33] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); + array[34] = (uint64_t)__atomic_load_n(&global_pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); // used + array[35] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.flushing_pressure_page_deletions, __ATOMIC_RELAXED); + array[36] = (uint64_t)__atomic_load_n(&global_flushing_pressure_page_deletions, __ATOMIC_RELAXED); // used + array[37] = 0; //(uint64_t)pg_cache->active_descriptors; + + fatal_assert(RRDENG_NR_STATS == 38); } -/* Releases reference to page */ -void rrdeng_put_page(struct rrdengine_instance *ctx, void *handle) -{ - (void)ctx; - pg_cache_put(ctx, (struct rrdeng_page_descr *)handle); +static void rrdeng_populate_mrg(struct rrdengine_instance *ctx) { + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + size_t datafiles = 0; + for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) + datafiles++; + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + size_t cpus = get_netdata_cpus() / storage_tiers; + if(cpus > datafiles) + cpus = datafiles; + + if(cpus < 1) + cpus = 1; + + if(cpus > (size_t)libuv_worker_threads) + cpus = (size_t)libuv_worker_threads; + + if(cpus > MRG_PARTITIONS) + cpus = MRG_PARTITIONS; + + info("DBENGINE: populating retention to MRG from %zu journal files of tier %d, using %zu threads...", datafiles, ctx->config.tier, cpus); + + if(datafiles > 2) { + struct rrdengine_datafile *datafile; + + datafile = ctx->datafiles.first->prev; + if(!(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE)) + datafile = datafile->prev; + + if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) { + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + } + + datafile = ctx->datafiles.first; + if(datafile->journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE) { + journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); + datafile->populate_mrg.populated = true; + } + } + + ctx->loading.populate_mrg.size = cpus; + ctx->loading.populate_mrg.array = callocz(ctx->loading.populate_mrg.size, sizeof(struct completion)); + + for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) { + completion_init(&ctx->loading.populate_mrg.array[i]); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_POPULATE_MRG, NULL, &ctx->loading.populate_mrg.array[i], + STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); + } +} + +void rrdeng_readiness_wait(struct rrdengine_instance *ctx) { + for (size_t i = 0; i < ctx->loading.populate_mrg.size; i++) { + completion_wait_for(&ctx->loading.populate_mrg.array[i]); + completion_destroy(&ctx->loading.populate_mrg.array[i]); + } + + freez(ctx->loading.populate_mrg.array); + ctx->loading.populate_mrg.array = NULL; + ctx->loading.populate_mrg.size = 0; + + info("DBENGINE: tier %d is ready for data collection and queries", ctx->config.tier); +} + +bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + return ctx->config.legacy; } +void rrdeng_exit_mode(struct rrdengine_instance *ctx) { + __atomic_store_n(&ctx->quiesce.exit_mode, true, __ATOMIC_RELAXED); +} /* * Returns 0 on success, negative on error */ -int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, +int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, unsigned disk_space_mb, size_t tier) { struct rrdengine_instance *ctx; - int error; uint32_t max_open_files; max_open_files = rlimit_nofile.rlim_cur / 4; @@ -1053,182 +1184,185 @@ int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *dbfiles_p if(NULL == ctxp) { ctx = multidb_ctx[tier]; memset(ctx, 0, sizeof(*ctx)); + ctx->config.legacy = false; } else { *ctxp = ctx = callocz(1, sizeof(*ctx)); + ctx->config.legacy = true; } - ctx->tier = tier; - ctx->page_type = tier_page_type[tier]; - ctx->global_compress_alg = RRD_LZ4; - if (page_cache_mb < RRDENG_MIN_PAGE_CACHE_SIZE_MB) - page_cache_mb = RRDENG_MIN_PAGE_CACHE_SIZE_MB; - ctx->max_cache_pages = page_cache_mb * (1048576LU / RRDENG_BLOCK_SIZE); - /* try to keep 5% of the page cache free */ - ctx->cache_pages_low_watermark = (ctx->max_cache_pages * 95LLU) / 100; + + ctx->config.tier = (int)tier; + ctx->config.page_type = tier_page_type[tier]; + ctx->config.global_compress_alg = RRD_LZ4; if (disk_space_mb < RRDENG_MIN_DISK_SPACE_MB) disk_space_mb = RRDENG_MIN_DISK_SPACE_MB; - ctx->max_disk_space = disk_space_mb * 1048576LLU; - strncpyz(ctx->dbfiles_path, dbfiles_path, sizeof(ctx->dbfiles_path) - 1); - ctx->dbfiles_path[sizeof(ctx->dbfiles_path) - 1] = '\0'; - if (NULL == host) - strncpyz(ctx->machine_guid, registry_get_this_machine_guid(), GUID_LEN); - else - strncpyz(ctx->machine_guid, host->machine_guid, GUID_LEN); - - ctx->drop_metrics_under_page_cache_pressure = rrdeng_drop_metrics_under_page_cache_pressure; - ctx->metric_API_max_producers = 0; - ctx->quiesce = NO_QUIESCE; - ctx->host = host; - - memset(&ctx->worker_config, 0, sizeof(ctx->worker_config)); - ctx->worker_config.ctx = ctx; - init_page_cache(ctx); - init_commit_log(ctx); - error = init_rrd_files(ctx); - if (error) { - goto error_after_init_rrd_files; - } + ctx->config.max_disk_space = disk_space_mb * 1048576LLU; + strncpyz(ctx->config.dbfiles_path, dbfiles_path, sizeof(ctx->config.dbfiles_path) - 1); + ctx->config.dbfiles_path[sizeof(ctx->config.dbfiles_path) - 1] = '\0'; - completion_init(&ctx->rrdengine_completion); - fatal_assert(0 == uv_thread_create(&ctx->worker_config.thread, rrdeng_worker, &ctx->worker_config)); - /* wait for worker thread to initialize */ - completion_wait_for(&ctx->rrdengine_completion); - completion_destroy(&ctx->rrdengine_completion); - uv_thread_set_name_np(ctx->worker_config.thread, "LIBUV_WORKER"); - if (ctx->worker_config.error) { - goto error_after_rrdeng_worker; - } -// error = metalog_init(ctx); -// if (error) { -// error("Failed to initialize metadata log file event loop."); -// goto error_after_rrdeng_worker; -// } + ctx->atomic.transaction_id = 1; + ctx->quiesce.enabled = false; - return 0; + if (rrdeng_dbengine_spawn(ctx) && !init_rrd_files(ctx)) { + // success - we run this ctx too + rrdeng_populate_mrg(ctx); + return 0; + } -error_after_rrdeng_worker: - finalize_rrd_files(ctx); -error_after_init_rrd_files: - free_page_cache(ctx); - if (!is_storage_engine_shared((STORAGE_INSTANCE *)ctx)) { + if (ctx->config.legacy) { freez(ctx); if (ctxp) *ctxp = NULL; } + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); return UV_EIO; } +size_t rrdeng_collectors_running(struct rrdengine_instance *ctx) { + return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); +} + /* * Returns 0 on success, 1 on error */ -int rrdeng_exit(struct rrdengine_instance *ctx) -{ - struct rrdeng_cmd cmd; - - if (NULL == ctx) { +int rrdeng_exit(struct rrdengine_instance *ctx) { + if (NULL == ctx) return 1; + + // FIXME - ktsaou - properly cleanup ctx + // 1. make sure all collectors are stopped + // 2. make new queries will not be accepted (this is quiesce that has already run) + // 3. flush this section of the main cache + // 4. then wait for completion + + bool logged = false; + while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && !unittest_running) { + if(!logged) { + info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier); + logged = true; + } + sleep_usec(100 * USEC_PER_MS); } - /* TODO: add page to page cache */ - cmd.opcode = RRDENG_SHUTDOWN; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); + info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx); - fatal_assert(0 == uv_thread_join(&ctx->worker_config.thread)); + info("DBENGINE: shutting down tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + struct completion completion = {}; + completion_init(&completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_SHUTDOWN, NULL, &completion, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL); + completion_wait_for(&completion); + completion_destroy(&completion); finalize_rrd_files(ctx); - //metalog_exit(ctx->metalog_ctx); - free_page_cache(ctx); - if(!is_storage_engine_shared((STORAGE_INSTANCE *)ctx)) + if(ctx->config.legacy) freez(ctx); rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); return 0; } -void rrdeng_prepare_exit(struct rrdengine_instance *ctx) -{ - struct rrdeng_cmd cmd; - - if (NULL == ctx) { +void rrdeng_prepare_exit(struct rrdengine_instance *ctx) { + if (NULL == ctx) return; - } - - completion_init(&ctx->rrdengine_completion); - cmd.opcode = RRDENG_QUIESCE; - rrdeng_enq_cmd(&ctx->worker_config, &cmd); - /* wait for dbengine to quiesce */ - completion_wait_for(&ctx->rrdengine_completion); - completion_destroy(&ctx->rrdengine_completion); + // FIXME - ktsaou - properly cleanup ctx + // 1. make sure all collectors are stopped - //metalog_prepare_exit(ctx->metalog_ctx); + completion_init(&ctx->quiesce.completion); + rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_QUIESCE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); } -RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { - RRDENG_SIZE_STATS stats = { 0 }; +static void populate_v2_statistics(struct rrdengine_datafile *datafile, RRDENG_SIZE_STATS *stats) +{ + struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, 0, 0); + void *data_start = (void *)j2_header; - for(struct pg_cache_page_index *page_index = ctx->pg_cache.metrics_index.last_page_index; - page_index != NULL ;page_index = page_index->prev) { - stats.metrics++; - stats.metrics_pages += page_index->page_count; + if(unlikely(!j2_header)) + return; + + stats->extents += j2_header->extent_count; + + unsigned entries; + struct journal_extent_list *extent_list = (void *) (data_start + j2_header->extent_offset); + for (entries = 0; entries < j2_header->extent_count; entries++) { + stats->extents_compressed_bytes += extent_list->datafile_size; + stats->extents_pages += extent_list->pages; + extent_list++; } - for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) { - stats.datafiles++; + struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset); + time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); - for(struct extent_info *ei = df->extents.first; ei ; ei = ei->next) { - stats.extents++; - stats.extents_compressed_bytes += ei->size; + stats->metrics += j2_header->metric_count; + for (entries = 0; entries < j2_header->metric_count; entries++) { - for(int p = 0; p < ei->number_of_pages ;p++) { - struct rrdeng_page_descr *descr = ei->pages[p]; + struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); + stats->metrics_pages += metric_list_header->entries; + struct journal_page_list *descr = (void *) (data_start + metric->page_offset + sizeof(struct journal_page_header)); + for (uint32_t idx=0; idx < metric_list_header->entries; idx++) { - usec_t update_every_usec; + time_t update_every_s; - size_t points = descr->page_length / PAGE_POINT_SIZE_BYTES(descr); + size_t points = descr->page_length / CTX_POINT_SIZE_BYTES(datafile->ctx); - if(likely(points > 1)) - update_every_usec = (descr->end_time_ut - descr->start_time_ut) / (points - 1); - else { - update_every_usec = default_rrd_update_every * get_tier_grouping(ctx->tier) * USEC_PER_SEC; - stats.single_point_pages++; - } + time_t start_time_s = journal_start_time_s + descr->delta_start_s; + time_t end_time_s = journal_start_time_s + descr->delta_end_s; - time_t duration_secs = (time_t)((descr->end_time_ut - descr->start_time_ut + update_every_usec)/USEC_PER_SEC); + if(likely(points > 1)) + update_every_s = (time_t) ((end_time_s - start_time_s) / (points - 1)); + else { + update_every_s = (time_t) (default_rrd_update_every * get_tier_grouping(datafile->ctx->config.tier)); + stats->single_point_pages++; + } - stats.extents_pages++; - stats.pages_uncompressed_bytes += descr->page_length; - stats.pages_duration_secs += duration_secs; - stats.points += points; + time_t duration_s = (time_t)((end_time_s - start_time_s + update_every_s)); - stats.page_types[descr->type].pages++; - stats.page_types[descr->type].pages_uncompressed_bytes += descr->page_length; - stats.page_types[descr->type].pages_duration_secs += duration_secs; - stats.page_types[descr->type].points += points; + stats->pages_uncompressed_bytes += descr->page_length; + stats->pages_duration_secs += duration_s; + stats->points += points; - if(!stats.first_t || (descr->start_time_ut - update_every_usec) < stats.first_t) - stats.first_t = (descr->start_time_ut - update_every_usec) / USEC_PER_SEC; + stats->page_types[descr->type].pages++; + stats->page_types[descr->type].pages_uncompressed_bytes += descr->page_length; + stats->page_types[descr->type].pages_duration_secs += duration_s; + stats->page_types[descr->type].points += points; - if(!stats.last_t || descr->end_time_ut > stats.last_t) - stats.last_t = descr->end_time_ut / USEC_PER_SEC; - } + if(!stats->first_time_s || (start_time_s - update_every_s) < stats->first_time_s) + stats->first_time_s = (start_time_s - update_every_s); + + if(!stats->last_time_s || end_time_s > stats->last_time_s) + stats->last_time_s = end_time_s; + + descr++; } + metric++; } + journalfile_v2_data_release(datafile->journalfile); +} + +RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { + RRDENG_SIZE_STATS stats = { 0 }; - stats.currently_collected_metrics = ctx->stats.metric_API_producers; - stats.max_concurrently_collected_metrics = ctx->metric_API_max_producers; + uv_rwlock_rdlock(&ctx->datafiles.rwlock); + for(struct rrdengine_datafile *df = ctx->datafiles.first; df ;df = df->next) { + stats.datafiles++; + populate_v2_statistics(df, &stats); + } + uv_rwlock_rdunlock(&ctx->datafiles.rwlock); + + stats.currently_collected_metrics = __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); internal_error(stats.metrics_pages != stats.extents_pages + stats.currently_collected_metrics, "DBENGINE: metrics pages is %zu, but extents pages is %zu and API consumers is %zu", stats.metrics_pages, stats.extents_pages, stats.currently_collected_metrics); - stats.disk_space = ctx->disk_space; - stats.max_disk_space = ctx->max_disk_space; + stats.disk_space = ctx_current_disk_space_get(ctx); + stats.max_disk_space = ctx->config.max_disk_space; - stats.database_retention_secs = (time_t)(stats.last_t - stats.first_t); + stats.database_retention_secs = (time_t)(stats.last_time_s - stats.first_time_s); if(stats.extents_pages) stats.average_page_size_bytes = (double)stats.pages_uncompressed_bytes / (double)stats.extents_pages; @@ -1252,21 +1386,22 @@ RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx) { } } - stats.sizeof_metric = struct_natural_alignment(sizeof(struct pg_cache_page_index) + sizeof(struct pg_alignment)); - stats.sizeof_page = struct_natural_alignment(sizeof(struct rrdeng_page_descr)); +// stats.sizeof_metric = 0; stats.sizeof_datafile = struct_natural_alignment(sizeof(struct rrdengine_datafile)) + struct_natural_alignment(sizeof(struct rrdengine_journalfile)); - stats.sizeof_page_in_cache = struct_natural_alignment(sizeof(struct page_cache_descr)); - stats.sizeof_point_data = page_type_size[ctx->page_type]; - stats.sizeof_page_data = RRDENG_BLOCK_SIZE; + stats.sizeof_page_in_cache = 0; // struct_natural_alignment(sizeof(struct page_cache_descr)); + stats.sizeof_point_data = page_type_size[ctx->config.page_type]; + stats.sizeof_page_data = tier_page_size[ctx->config.tier]; stats.pages_per_extent = rrdeng_pages_per_extent; - stats.sizeof_extent = sizeof(struct extent_info); - stats.sizeof_page_in_extent = sizeof(struct rrdeng_page_descr *); - - stats.sizeof_metric_in_index = 40; - stats.sizeof_page_in_index = 24; +// stats.sizeof_metric_in_index = 40; +// stats.sizeof_page_in_index = 24; - stats.default_granularity_secs = (size_t)default_rrd_update_every * get_tier_grouping(ctx->tier); + stats.default_granularity_secs = (size_t)default_rrd_update_every * get_tier_grouping(ctx->config.tier); return stats; } + +struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void) { + // FIXME - make cache efficiency stats atomic + return rrdeng_cache_efficiency_stats; +} diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 3acee4ec6..feb79b977 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -8,7 +8,7 @@ #define RRDENG_MIN_PAGE_CACHE_SIZE_MB (8) #define RRDENG_MIN_DISK_SPACE_MB (64) -#define RRDENG_NR_STATS (37) +#define RRDENG_NR_STATS (38) #define RRDENG_FD_BUDGET_PER_INSTANCE (50) @@ -16,26 +16,15 @@ extern int db_engine_use_malloc; extern int default_rrdeng_page_fetch_timeout; extern int default_rrdeng_page_fetch_retries; extern int default_rrdeng_page_cache_mb; +extern int db_engine_journal_indexing; +extern int db_engine_journal_check; extern int default_rrdeng_disk_quota_mb; extern int default_multidb_disk_quota_mb; -extern uint8_t rrdeng_drop_metrics_under_page_cache_pressure; extern struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS]; extern size_t page_type_size[]; +extern size_t tier_page_size[]; -#define PAGE_POINT_SIZE_BYTES(x) page_type_size[(x)->type] - -struct rrdeng_region_info { - time_t start_time_s; - int update_every; - unsigned points; -}; - -void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr); -void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, - Word_t page_correlation_id); -void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void **handle); -void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_in_time_ut, void **handle); -void rrdeng_put_page(struct rrdengine_instance *ctx, void *handle); +#define CTX_POINT_SIZE_BYTES(ctx) page_type_size[(ctx)->config.page_type] void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_t *ret_uuid); void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uuid_t *legacy_uuid, @@ -44,8 +33,6 @@ void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uu STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance); STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); -STORAGE_METRIC_HANDLE *rrdeng_metric_create(STORAGE_INSTANCE *db_instance, uuid_t *uuid); -STORAGE_METRIC_HANDLE *rrdeng_metric_get_legacy(STORAGE_INSTANCE *db_instance, const char *rd_id, const char *st_id); void rrdeng_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle); STORAGE_METRIC_HANDLE *rrdeng_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle); @@ -60,25 +47,29 @@ void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, usec_t SN_FLAGS flags); int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle); -void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrdimm_handle, - time_t start_time_s, time_t end_time_s); +void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrddim_handle, + time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim_handle); -int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrdimm_handle); -void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrdimm_handle); +int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle); +void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle); time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle); time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle); void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long *array); /* must call once before using anything */ -int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, +int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, unsigned disk_space_mb, size_t tier); +void rrdeng_readiness_wait(struct rrdengine_instance *ctx); +void rrdeng_exit_mode(struct rrdengine_instance *ctx); + int rrdeng_exit(struct rrdengine_instance *ctx); void rrdeng_prepare_exit(struct rrdengine_instance *ctx); -int rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *si, uuid_t *dim_uuid, time_t *first_entry_t, time_t *last_entry_t); +bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_uuid, time_t *first_entry_s, time_t *last_entry_s); extern STORAGE_METRICS_GROUP *rrdeng_metrics_group_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); extern void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg); @@ -86,12 +77,6 @@ extern void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_ typedef struct rrdengine_size_statistics { size_t default_granularity_secs; - size_t sizeof_metric; - size_t sizeof_metric_in_index; - size_t sizeof_page; - size_t sizeof_page_in_index; - size_t sizeof_extent; - size_t sizeof_page_in_extent; size_t sizeof_datafile; size_t sizeof_page_in_cache; size_t sizeof_point_data; @@ -119,11 +104,10 @@ typedef struct rrdengine_size_statistics { size_t single_point_pages; - usec_t first_t; - usec_t last_t; + time_t first_time_s; + time_t last_time_s; size_t currently_collected_metrics; - size_t max_concurrently_collected_metrics; size_t estimated_concurrently_collected_metrics; size_t disk_space; @@ -139,6 +123,109 @@ typedef struct rrdengine_size_statistics { double average_page_size_bytes; } RRDENG_SIZE_STATS; +struct rrdeng_cache_efficiency_stats { + size_t queries; + size_t queries_planned_with_gaps; + size_t queries_executed_with_gaps; + size_t queries_open; + size_t queries_journal_v2; + + size_t currently_running_queries; + + // query planner output of the queries + size_t pages_total; + size_t pages_to_load_from_disk; + size_t extents_loaded_from_disk; + + // pages metadata sources + size_t pages_meta_source_main_cache; + size_t pages_meta_source_open_cache; + size_t pages_meta_source_journal_v2; + + // preloading + size_t page_next_wait_failed; + size_t page_next_wait_loaded; + size_t page_next_nowait_failed; + size_t page_next_nowait_loaded; + + // pages data sources + size_t pages_data_source_main_cache; + size_t pages_data_source_main_cache_at_pass4; + size_t pages_data_source_disk; + size_t pages_data_source_extent_cache; // loaded by a cached extent + + // cache hits at different points + size_t pages_load_ok_loaded_but_cache_hit_while_inserting; // found in cache while inserting it (conflict) + + // loading + size_t pages_load_extent_merged; + size_t pages_load_ok_uncompressed; + size_t pages_load_ok_compressed; + size_t pages_load_fail_invalid_page_in_extent; + size_t pages_load_fail_cant_mmap_extent; + size_t pages_load_fail_datafile_not_available; + size_t pages_load_fail_unroutable; + size_t pages_load_fail_not_found; + size_t pages_load_fail_invalid_extent; + size_t pages_load_fail_cancelled; + + // timings for query preparation + size_t prep_time_to_route; + size_t prep_time_in_main_cache_lookup; + size_t prep_time_in_open_cache_lookup; + size_t prep_time_in_journal_v2_lookup; + size_t prep_time_in_pass4_lookup; + + // timings the query thread experiences + size_t query_time_init; + size_t query_time_wait_for_prep; + size_t query_time_to_slow_disk_next_page; + size_t query_time_to_fast_disk_next_page; + size_t query_time_to_slow_preload_next_page; + size_t query_time_to_fast_preload_next_page; + + // query issues + size_t pages_zero_time_skipped; + size_t pages_past_time_skipped; + size_t pages_overlapping_skipped; + size_t pages_invalid_size_skipped; + size_t pages_invalid_update_every_fixed; + size_t pages_invalid_entries_fixed; + + // database events + size_t journal_v2_mapped; + size_t journal_v2_unmapped; + size_t datafile_creation_started; + size_t datafile_deletion_started; + size_t datafile_deletion_spin; + size_t journal_v2_indexing_started; + size_t metrics_retention_started; +}; + +struct rrdeng_buffer_sizes { + size_t workers; + size_t pdc; + size_t wal; + size_t descriptors; + size_t xt_io; + size_t xt_buf; + size_t handles; + size_t opcodes; + size_t epdl; + size_t deol; + size_t pd; + size_t pgc; + size_t mrg; +#ifdef PDC_USE_JULYL + size_t julyl; +#endif +}; + +struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void); +struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void); + RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx); +size_t rrdeng_collectors_running(struct rrdengine_instance *ctx); +bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance); #endif /* NETDATA_RRDENGINEAPI_H */ diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c index 58bd9c437..7ec626c59 100644 --- a/database/engine/rrdenginelib.c +++ b/database/engine/rrdenginelib.c @@ -4,68 +4,68 @@ #define BUFSIZE (512) /* Caller must hold descriptor lock */ -void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug) -{ - if(log_debug && !(debug_flags & D_RRDENGINE)) - return; - - BUFFER *wb = buffer_create(512); - - if(!descr) { - buffer_sprintf(wb, "DBENGINE: %s : descr is NULL", msg); - } - else { - struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - char uuid_str[UUID_STR_LEN]; - - uuid_unparse_lower(*descr->id, uuid_str); - buffer_sprintf(wb, "DBENGINE: %s : page(%p) metric:%s, len:%"PRIu32", time:%"PRIu64"->%"PRIu64", update_every:%u, type:%u, xt_offset:", - msg, - pg_cache_descr->page, uuid_str, - descr->page_length, - (uint64_t)descr->start_time_ut, - (uint64_t)descr->end_time_ut, - (uint32_t)descr->update_every_s, - (uint32_t)descr->type - ); - if (!descr->extent) { - buffer_strcat(wb, "N/A"); - } else { - buffer_sprintf(wb, "%"PRIu64, descr->extent->offset); - } - - buffer_sprintf(wb, ", flags:0x%2.2lX refcnt:%u", pg_cache_descr->flags, pg_cache_descr->refcnt); - } - - if(log_debug) - debug(D_RRDENGINE, "%s", buffer_tostring(wb)); - else - internal_error(true, "%s", buffer_tostring(wb)); - - buffer_free(wb); -} - -void print_page_descr(struct rrdeng_page_descr *descr) -{ - char uuid_str[UUID_STR_LEN]; - char str[BUFSIZE + 1]; - int pos = 0; - - uuid_unparse_lower(*descr->id, uuid_str); - pos += snprintfz(str, BUFSIZE - pos, "id=%s\n" - "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", - uuid_str, - descr->page_length, - (uint64_t)descr->start_time_ut, - (uint64_t)descr->end_time_ut); - if (!descr->extent) { - pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); - } else { - pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); - } - snprintfz(str + pos, BUFSIZE - pos, "\n\n"); - fputs(str, stderr); -} +//void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug) +//{ +// if(log_debug && !(debug_flags & D_RRDENGINE)) +// return; +// +// BUFFER *wb = buffer_create(512); +// +// if(!descr) { +// buffer_sprintf(wb, "DBENGINE: %s : descr is NULL", msg); +// } +// else { +// struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; +// char uuid_str[UUID_STR_LEN]; +// +// uuid_unparse_lower(*descr->id, uuid_str); +// buffer_sprintf(wb, "DBENGINE: %s : page(%p) metric:%s, len:%"PRIu32", time:%"PRIu64"->%"PRIu64", update_every:%u, type:%u, xt_offset:", +// msg, +// pg_cache_descr->page, uuid_str, +// descr->page_length, +// (uint64_t)descr->start_time_ut, +// (uint64_t)descr->end_time_ut, +// (uint32_t)descr->update_every_s, +// (uint32_t)descr->type +// ); +// if (!descr->extent) { +// buffer_strcat(wb, "N/A"); +// } else { +// buffer_sprintf(wb, "%"PRIu64, descr->extent->offset); +// } +// +// buffer_sprintf(wb, ", flags:0x%2.2lX refcnt:%u", pg_cache_descr->flags, pg_cache_descr->refcnt); +// } +// +// if(log_debug) +// debug(D_RRDENGINE, "%s", buffer_tostring(wb)); +// else +// internal_error(true, "%s", buffer_tostring(wb)); +// +// buffer_free(wb); +//} +// +//void print_page_descr(struct rrdeng_page_descr *descr) +//{ +// char uuid_str[UUID_STR_LEN]; +// char str[BUFSIZE + 1]; +// int pos = 0; +// +// uuid_unparse_lower(*descr->id, uuid_str); +// pos += snprintfz(str, BUFSIZE - pos, "id=%s\n" +// "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", +// uuid_str, +// descr->page_length, +// (uint64_t)descr->start_time_ut, +// (uint64_t)descr->end_time_ut); +// if (!descr->extent) { +// pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); +// } else { +// pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); +// } +// snprintfz(str + pos, BUFSIZE - pos, "\n\n"); +// fputs(str, stderr); +//} int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size) { @@ -142,90 +142,6 @@ int open_file_for_io(char *path, int flags, uv_file *file, int direct) return fd; } -char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t size) -{ - struct page_cache *pg_cache; - - pg_cache = &ctx->pg_cache; - snprintfz(str, size, - "metric_API_producers: %ld\n" - "metric_API_consumers: %ld\n" - "page_cache_total_pages: %ld\n" - "page_cache_descriptors: %ld\n" - "page_cache_populated_pages: %ld\n" - "page_cache_committed_pages: %ld\n" - "page_cache_insertions: %ld\n" - "page_cache_deletions: %ld\n" - "page_cache_hits: %ld\n" - "page_cache_misses: %ld\n" - "page_cache_backfills: %ld\n" - "page_cache_evictions: %ld\n" - "compress_before_bytes: %ld\n" - "compress_after_bytes: %ld\n" - "decompress_before_bytes: %ld\n" - "decompress_after_bytes: %ld\n" - "io_write_bytes: %ld\n" - "io_write_requests: %ld\n" - "io_read_bytes: %ld\n" - "io_read_requests: %ld\n" - "io_write_extent_bytes: %ld\n" - "io_write_extents: %ld\n" - "io_read_extent_bytes: %ld\n" - "io_read_extents: %ld\n" - "datafile_creations: %ld\n" - "datafile_deletions: %ld\n" - "journalfile_creations: %ld\n" - "journalfile_deletions: %ld\n" - "io_errors: %ld\n" - "fs_errors: %ld\n" - "global_io_errors: %ld\n" - "global_fs_errors: %ld\n" - "rrdeng_reserved_file_descriptors: %ld\n" - "pg_cache_over_half_dirty_events: %ld\n" - "global_pg_cache_over_half_dirty_events: %ld\n" - "flushing_pressure_page_deletions: %ld\n" - "global_flushing_pressure_page_deletions: %ld\n", - (long)ctx->stats.metric_API_producers, - (long)ctx->stats.metric_API_consumers, - (long)pg_cache->page_descriptors, - (long)ctx->stats.page_cache_descriptors, - (long)pg_cache->populated_pages, - (long)pg_cache->committed_page_index.nr_committed_pages, - (long)ctx->stats.pg_cache_insertions, - (long)ctx->stats.pg_cache_deletions, - (long)ctx->stats.pg_cache_hits, - (long)ctx->stats.pg_cache_misses, - (long)ctx->stats.pg_cache_backfills, - (long)ctx->stats.pg_cache_evictions, - (long)ctx->stats.before_compress_bytes, - (long)ctx->stats.after_compress_bytes, - (long)ctx->stats.before_decompress_bytes, - (long)ctx->stats.after_decompress_bytes, - (long)ctx->stats.io_write_bytes, - (long)ctx->stats.io_write_requests, - (long)ctx->stats.io_read_bytes, - (long)ctx->stats.io_read_requests, - (long)ctx->stats.io_write_extent_bytes, - (long)ctx->stats.io_write_extents, - (long)ctx->stats.io_read_extent_bytes, - (long)ctx->stats.io_read_extents, - (long)ctx->stats.datafile_creations, - (long)ctx->stats.datafile_deletions, - (long)ctx->stats.journalfile_creations, - (long)ctx->stats.journalfile_deletions, - (long)ctx->stats.io_errors, - (long)ctx->stats.fs_errors, - (long)global_io_errors, - (long)global_fs_errors, - (long)rrdeng_reserved_file_descriptors, - (long)ctx->stats.pg_cache_over_half_dirty_events, - (long)global_pg_cache_over_half_dirty_events, - (long)ctx->stats.flushing_pressure_page_deletions, - (long)global_flushing_pressure_page_deletions - ); - return str; -} - int is_legacy_child(const char *machine_guid) { uuid_t uuid; diff --git a/database/engine/rrdenginelib.h b/database/engine/rrdenginelib.h index 6b1a15fb1..ca8eacae4 100644 --- a/database/engine/rrdenginelib.h +++ b/database/engine/rrdenginelib.h @@ -6,7 +6,6 @@ #include "libnetdata/libnetdata.h" /* Forward declarations */ -struct rrdeng_page_descr; struct rrdengine_instance; #define STR_HELPER(x) #x @@ -83,8 +82,6 @@ static inline void crc32set(void *crcp, uLong crc) *(uint32_t *)crcp = crc; } -void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug); -void print_page_descr(struct rrdeng_page_descr *descr); int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size); int open_file_for_io(char *path, int flags, uv_file *file, int direct); static inline int open_file_direct_io(char *path, int flags, uv_file *file) @@ -95,7 +92,6 @@ static inline int open_file_buffered_io(char *path, int flags, uv_file *file) { return open_file_for_io(path, flags, file, 0); } -char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t size); int compute_multidb_diskspace(); int is_legacy_child(const char *machine_guid); diff --git a/database/engine/rrdenglocking.c b/database/engine/rrdenglocking.c deleted file mode 100644 index a23abf307..000000000 --- a/database/engine/rrdenglocking.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later -#include "rrdengine.h" - -struct page_cache_descr *rrdeng_create_pg_cache_descr(struct rrdengine_instance *ctx) -{ - struct page_cache_descr *pg_cache_descr; - - pg_cache_descr = mallocz(sizeof(*pg_cache_descr)); - rrd_stat_atomic_add(&ctx->stats.page_cache_descriptors, 1); - pg_cache_descr->page = NULL; - pg_cache_descr->flags = 0; - pg_cache_descr->prev = pg_cache_descr->next = NULL; - pg_cache_descr->refcnt = 0; - pg_cache_descr->waiters = 0; - fatal_assert(0 == uv_cond_init(&pg_cache_descr->cond)); - fatal_assert(0 == uv_mutex_init(&pg_cache_descr->mutex)); - - return pg_cache_descr; -} - -void rrdeng_destroy_pg_cache_descr(struct rrdengine_instance *ctx, struct page_cache_descr *pg_cache_descr) -{ - uv_cond_destroy(&pg_cache_descr->cond); - uv_mutex_destroy(&pg_cache_descr->mutex); - freez(pg_cache_descr); - rrd_stat_atomic_add(&ctx->stats.page_cache_descriptors, -1); -} - -/* also allocates page cache descriptor if missing */ -void rrdeng_page_descr_mutex_lock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - unsigned long old_state, old_users, new_state, ret_state; - struct page_cache_descr *pg_cache_descr = NULL; - uint8_t we_locked; - - we_locked = 0; - while (1) { /* spin */ - old_state = descr->pg_cache_descr_state; - old_users = old_state >> PG_CACHE_DESCR_SHIFT; - - if (unlikely(we_locked)) { - fatal_assert(old_state & PG_CACHE_DESCR_LOCKED); - new_state = (1 << PG_CACHE_DESCR_SHIFT) | PG_CACHE_DESCR_ALLOCATED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - break; - } - continue; /* spin */ - } - if (old_state & PG_CACHE_DESCR_LOCKED) { - fatal_assert(0 == old_users); - continue; /* spin */ - } - if (0 == old_state) { - /* no page cache descriptor has been allocated */ - - if (NULL == pg_cache_descr) { - pg_cache_descr = rrdeng_create_pg_cache_descr(ctx); - } - new_state = PG_CACHE_DESCR_LOCKED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, 0, new_state); - if (0 == ret_state) { - we_locked = 1; - descr->pg_cache_descr = pg_cache_descr; - pg_cache_descr->descr = descr; - pg_cache_descr = NULL; /* make sure we don't free pg_cache_descr */ - /* retry */ - continue; - } - continue; /* spin */ - } - /* page cache descriptor is already allocated */ - if (unlikely(!(old_state & PG_CACHE_DESCR_ALLOCATED))) { - fatal("Invalid page cache descriptor locking state:%#lX", old_state); - } - new_state = (old_users + 1) << PG_CACHE_DESCR_SHIFT; - new_state |= old_state & PG_CACHE_DESCR_FLAGS_MASK; - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - break; - } - /* spin */ - } - - if (pg_cache_descr) { - rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); - } - pg_cache_descr = descr->pg_cache_descr; - uv_mutex_lock(&pg_cache_descr->mutex); -} - -void rrdeng_page_descr_mutex_unlock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - unsigned long old_state, new_state, ret_state, old_users; - struct page_cache_descr *pg_cache_descr, *delete_pg_cache_descr = NULL; - uint8_t we_locked; - - uv_mutex_unlock(&descr->pg_cache_descr->mutex); - - we_locked = 0; - while (1) { /* spin */ - old_state = descr->pg_cache_descr_state; - old_users = old_state >> PG_CACHE_DESCR_SHIFT; - - if (unlikely(we_locked)) { - fatal_assert(0 == old_users); - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, 0); - if (old_state == ret_state) { - /* success */ - rrdeng_destroy_pg_cache_descr(ctx, delete_pg_cache_descr); - return; - } - continue; /* spin */ - } - if (old_state & PG_CACHE_DESCR_LOCKED) { - fatal_assert(0 == old_users); - continue; /* spin */ - } - fatal_assert(old_state & PG_CACHE_DESCR_ALLOCATED); - pg_cache_descr = descr->pg_cache_descr; - /* caller is the only page cache descriptor user and there are no pending references on the page */ - if ((old_state & PG_CACHE_DESCR_DESTROY) && (1 == old_users) && - !pg_cache_descr->flags && !pg_cache_descr->refcnt) { - fatal_assert(!pg_cache_descr->waiters); - - new_state = PG_CACHE_DESCR_LOCKED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - we_locked = 1; - delete_pg_cache_descr = pg_cache_descr; - descr->pg_cache_descr = NULL; - /* retry */ - continue; - } - continue; /* spin */ - } - fatal_assert(old_users > 0); - new_state = (old_users - 1) << PG_CACHE_DESCR_SHIFT; - new_state |= old_state & PG_CACHE_DESCR_FLAGS_MASK; - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - break; - } - /* spin */ - } -} - -/* - * Tries to deallocate page cache descriptor. If it fails, it postpones deallocation by setting the - * PG_CACHE_DESCR_DESTROY flag which will be eventually cleared by a different context after doing - * the deallocation. - */ -void rrdeng_try_deallocate_pg_cache_descr(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) -{ - unsigned long old_state, new_state, ret_state, old_users; - struct page_cache_descr *pg_cache_descr = NULL; - uint8_t just_locked, can_free, must_unlock; - - just_locked = 0; - can_free = 0; - must_unlock = 0; - while (1) { /* spin */ - old_state = descr->pg_cache_descr_state; - old_users = old_state >> PG_CACHE_DESCR_SHIFT; - - if (unlikely(just_locked)) { - fatal_assert(0 == old_users); - - must_unlock = 1; - just_locked = 0; - /* Try deallocate if there are no pending references on the page */ - if (!pg_cache_descr->flags && !pg_cache_descr->refcnt) { - fatal_assert(!pg_cache_descr->waiters); - - descr->pg_cache_descr = NULL; - can_free = 1; - /* success */ - continue; - } - continue; /* spin */ - } - if (unlikely(must_unlock)) { - fatal_assert(0 == old_users); - - if (can_free) { - /* success */ - new_state = 0; - } else { - new_state = old_state | PG_CACHE_DESCR_DESTROY; - new_state &= ~PG_CACHE_DESCR_LOCKED; - } - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* unlocked */ - if (can_free) - rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); - return; - } - continue; /* spin */ - } - if (!(old_state & PG_CACHE_DESCR_ALLOCATED)) { - /* don't do anything */ - return; - } - if (old_state & PG_CACHE_DESCR_LOCKED) { - fatal_assert(0 == old_users); - continue; /* spin */ - } - /* caller is the only page cache descriptor user */ - if (0 == old_users) { - new_state = old_state | PG_CACHE_DESCR_LOCKED; - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - just_locked = 1; - pg_cache_descr = descr->pg_cache_descr; - /* retry */ - continue; - } - continue; /* spin */ - } - if (old_state & PG_CACHE_DESCR_DESTROY) { - /* don't do anything */ - return; - } - /* plant PG_CACHE_DESCR_DESTROY so that other contexts eventually free the page cache descriptor */ - new_state = old_state | PG_CACHE_DESCR_DESTROY; - - ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); - if (old_state == ret_state) { - /* success */ - return; - } - /* spin */ - } -} \ No newline at end of file diff --git a/database/engine/rrdenglocking.h b/database/engine/rrdenglocking.h deleted file mode 100644 index 078eab38b..000000000 --- a/database/engine/rrdenglocking.h +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_RRDENGLOCKING_H -#define NETDATA_RRDENGLOCKING_H - -#include "rrdengine.h" - -/* Forward declarations */ -struct page_cache_descr; - -struct page_cache_descr *rrdeng_create_pg_cache_descr(struct rrdengine_instance *ctx); -void rrdeng_destroy_pg_cache_descr(struct rrdengine_instance *ctx, struct page_cache_descr *pg_cache_descr); -void rrdeng_page_descr_mutex_lock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void rrdeng_page_descr_mutex_unlock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); -void rrdeng_try_deallocate_pg_cache_descr(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); - -#endif /* NETDATA_RRDENGLOCKING_H */ \ No newline at end of file diff --git a/database/ram/rrddim_mem.c b/database/ram/rrddim_mem.c index 299b6557a..0f17d6cb9 100644 --- a/database/ram/rrddim_mem.c +++ b/database/ram/rrddim_mem.c @@ -21,129 +21,263 @@ void rrddim_metrics_group_release(STORAGE_INSTANCE *db_instance __maybe_unused, // ---------------------------------------------------------------------------- // RRDDIM legacy data collection functions +struct mem_metric_handle { + RRDDIM *rd; + + size_t counter; + size_t entries; + size_t current_entry; + time_t last_updated_s; + time_t update_every_s; + + int32_t refcount; +}; + +static void update_metric_handle_from_rrddim(struct mem_metric_handle *mh, RRDDIM *rd) { + mh->counter = rd->rrdset->counter; + mh->entries = rd->rrdset->entries; + mh->current_entry = rd->rrdset->current_entry; + mh->last_updated_s = rd->rrdset->last_updated.tv_sec; + mh->update_every_s = rd->rrdset->update_every; +} + +static void check_metric_handle_from_rrddim(struct mem_metric_handle *mh) { + RRDDIM *rd = mh->rd; (void)rd; + internal_fatal(mh->entries != (size_t)rd->rrdset->entries, "RRDDIM: entries do not match"); + internal_fatal(mh->update_every_s != rd->rrdset->update_every, "RRDDIM: update every does not match"); +} + STORAGE_METRIC_HANDLE * rrddim_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance __maybe_unused) { - STORAGE_METRIC_HANDLE *t = rrddim_metric_get(db_instance, &rd->metric_uuid); - if(!t) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)rrddim_metric_get(db_instance, &rd->metric_uuid); + while(!mh) { netdata_rwlock_wrlock(&rrddim_JudyHS_rwlock); Pvoid_t *PValue = JudyHSIns(&rrddim_JudyHS_array, &rd->metric_uuid, sizeof(uuid_t), PJE0); - fatal_assert(NULL == *PValue); - *PValue = rd; - t = (STORAGE_METRIC_HANDLE *)rd; + mh = *PValue; + if(!mh) { + mh = callocz(1, sizeof(struct mem_metric_handle)); + mh->rd = rd; + mh->refcount = 1; + update_metric_handle_from_rrddim(mh, rd); + *PValue = mh; + __atomic_add_fetch(&rrddim_db_memory_size, sizeof(struct mem_metric_handle) + JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); + } + else { + if(__atomic_add_fetch(&mh->refcount, 1, __ATOMIC_RELAXED) <= 0) + mh = NULL; + } netdata_rwlock_unlock(&rrddim_JudyHS_rwlock); } - if((RRDDIM *)t != rd) - fatal("RRDDIM_MEM: incorrect pointer returned from index."); + internal_fatal(mh->rd != rd, "RRDDIM_MEM: incorrect pointer returned from index."); - return (STORAGE_METRIC_HANDLE *)rd; + return (STORAGE_METRIC_HANDLE *)mh; } STORAGE_METRIC_HANDLE * rrddim_metric_get(STORAGE_INSTANCE *db_instance __maybe_unused, uuid_t *uuid) { - RRDDIM *rd = NULL; + struct mem_metric_handle *mh = NULL; netdata_rwlock_rdlock(&rrddim_JudyHS_rwlock); Pvoid_t *PValue = JudyHSGet(rrddim_JudyHS_array, uuid, sizeof(uuid_t)); - if (likely(NULL != PValue)) - rd = *PValue; + if (likely(NULL != PValue)) { + mh = *PValue; + if(__atomic_add_fetch(&mh->refcount, 1, __ATOMIC_RELAXED) <= 0) + mh = NULL; + } netdata_rwlock_unlock(&rrddim_JudyHS_rwlock); - return (STORAGE_METRIC_HANDLE *)rd; + return (STORAGE_METRIC_HANDLE *)mh; } STORAGE_METRIC_HANDLE *rrddim_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + __atomic_add_fetch(&mh->refcount, 1, __ATOMIC_RELAXED); return db_metric_handle; } void rrddim_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle __maybe_unused) { - RRDDIM *rd = (RRDDIM *)db_metric_handle; + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; - netdata_rwlock_wrlock(&rrddim_JudyHS_rwlock); - JudyHSDel(&rrddim_JudyHS_array, &rd->metric_uuid, sizeof(uuid_t), PJE0); - netdata_rwlock_unlock(&rrddim_JudyHS_rwlock); + if(__atomic_sub_fetch(&mh->refcount, 1, __ATOMIC_RELAXED) == 0) { + // we are the last one holding this + + int32_t expected = 0; + if(__atomic_compare_exchange_n(&mh->refcount, &expected, -99999, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + // we can delete it + + RRDDIM *rd = mh->rd; + netdata_rwlock_wrlock(&rrddim_JudyHS_rwlock); + JudyHSDel(&rrddim_JudyHS_array, &rd->metric_uuid, sizeof(uuid_t), PJE0); + netdata_rwlock_unlock(&rrddim_JudyHS_rwlock); + + freez(mh); + __atomic_sub_fetch(&rrddim_db_memory_size, sizeof(struct mem_metric_handle) + JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); + } + } } -void rrddim_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every __maybe_unused) { +bool rrddim_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance __maybe_unused, uuid_t *uuid, time_t *first_entry_s, time_t *last_entry_s) { + STORAGE_METRIC_HANDLE *db_metric_handle = rrddim_metric_get(db_instance, uuid); + if(!db_metric_handle) + return false; + + *first_entry_s = rrddim_query_oldest_time_s(db_metric_handle); + *last_entry_s = rrddim_query_latest_time_s(db_metric_handle); + + return true; +} + +void rrddim_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every) { + struct mem_collect_handle *ch = (struct mem_collect_handle *)collection_handle; + struct mem_metric_handle *mh = (struct mem_metric_handle *)ch->db_metric_handle; + rrddim_store_metric_flush(collection_handle); + mh->update_every_s = update_every; } STORAGE_COLLECT_HANDLE *rrddim_collect_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every __maybe_unused, STORAGE_METRICS_GROUP *smg __maybe_unused) { - RRDDIM *rd = (RRDDIM *)db_metric_handle; - rd->db[rd->rrdset->current_entry] = pack_storage_number(NAN, SN_FLAG_NONE); + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + RRDDIM *rd = mh->rd; + + update_metric_handle_from_rrddim(mh, rd); + internal_fatal((uint32_t)mh->update_every_s != update_every, "RRDDIM: update requested does not match the dimension"); + struct mem_collect_handle *ch = callocz(1, sizeof(struct mem_collect_handle)); ch->rd = rd; + ch->db_metric_handle = db_metric_handle; + + __atomic_add_fetch(&rrddim_db_memory_size, sizeof(struct mem_collect_handle), __ATOMIC_RELAXED); + return (STORAGE_COLLECT_HANDLE *)ch; } -void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time, NETDATA_DOUBLE number, - NETDATA_DOUBLE min_value, - NETDATA_DOUBLE max_value, - uint16_t count, - uint16_t anomaly_count, - SN_FLAGS flags) -{ - UNUSED(point_in_time); - UNUSED(min_value); - UNUSED(max_value); - UNUSED(count); - UNUSED(anomaly_count); +void rrddim_store_metric_flush(STORAGE_COLLECT_HANDLE *collection_handle) { + struct mem_collect_handle *ch = (struct mem_collect_handle *)collection_handle; + struct mem_metric_handle *mh = (struct mem_metric_handle *)ch->db_metric_handle; + + RRDDIM *rd = mh->rd; + size_t entries = mh->entries; + storage_number empty = pack_storage_number(NAN, SN_FLAG_NONE); + + for(size_t i = 0; i < entries ;i++) + rd->db[i] = empty; + mh->counter = 0; + mh->last_updated_s = 0; + mh->current_entry = 0; +} + +static inline void rrddim_fill_the_gap(STORAGE_COLLECT_HANDLE *collection_handle, time_t now_collect_s) { struct mem_collect_handle *ch = (struct mem_collect_handle *)collection_handle; - RRDDIM *rd = ch->rd; - rd->db[rd->rrdset->current_entry] = pack_storage_number(number, flags); + struct mem_metric_handle *mh = (struct mem_metric_handle *)ch->db_metric_handle; + + RRDDIM *rd = mh->rd; + + internal_fatal(ch->rd != mh->rd, "RRDDIM: dimensions do not match"); + check_metric_handle_from_rrddim(mh); + + size_t entries = mh->entries; + time_t update_every_s = mh->update_every_s; + time_t last_stored_s = mh->last_updated_s; + size_t gap_entries = (now_collect_s - last_stored_s) / update_every_s; + if(gap_entries >= entries) + rrddim_store_metric_flush(collection_handle); + + else { + storage_number empty = pack_storage_number(NAN, SN_FLAG_NONE); + size_t current_entry = mh->current_entry; + time_t now_store_s = last_stored_s + update_every_s; + + // fill the dimension + size_t c; + for(c = 0; c < entries && now_store_s <= now_collect_s ; now_store_s += update_every_s, c++) { + rd->db[current_entry++] = empty; + + if(unlikely(current_entry >= entries)) + current_entry = 0; + } + mh->counter += c; + mh->current_entry = current_entry; + mh->last_updated_s = now_store_s; + } } -void rrddim_store_metric_flush(STORAGE_COLLECT_HANDLE *collection_handle) { +void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, + usec_t point_in_time_ut, + NETDATA_DOUBLE number, + NETDATA_DOUBLE min_value __maybe_unused, + NETDATA_DOUBLE max_value __maybe_unused, + uint16_t count __maybe_unused, + uint16_t anomaly_count __maybe_unused, + SN_FLAGS flags) +{ struct mem_collect_handle *ch = (struct mem_collect_handle *)collection_handle; + struct mem_metric_handle *mh = (struct mem_metric_handle *)ch->db_metric_handle; RRDDIM *rd = ch->rd; - for(int i = 0; i < rd->rrdset->entries ;i++) - rd->db[i] = SN_EMPTY_SLOT; + time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); + + internal_fatal(ch->rd != mh->rd, "RRDDIM: dimensions do not match"); + check_metric_handle_from_rrddim(mh); + if(unlikely(point_in_time_s <= mh->last_updated_s)) + return; + + if(unlikely(mh->last_updated_s && point_in_time_s - mh->update_every_s > mh->last_updated_s)) + rrddim_fill_the_gap(collection_handle, point_in_time_s); + + rd->db[mh->current_entry] = pack_storage_number(number, flags); + mh->counter++; + mh->current_entry = (mh->current_entry + 1) >= mh->entries ? 0 : mh->current_entry + 1; + mh->last_updated_s = point_in_time_s; } int rrddim_collect_finalize(STORAGE_COLLECT_HANDLE *collection_handle) { freez(collection_handle); + __atomic_sub_fetch(&rrddim_db_memory_size, sizeof(struct mem_collect_handle), __ATOMIC_RELAXED); return 0; } // ---------------------------------------------------------------------------- -// get the total duration in seconds of the round robin database -#define rrddim_duration(st) (( (time_t)(rd)->rrdset->counter >= (time_t)(rd)->rrdset->entries ? (time_t)(rd)->rrdset->entries : (time_t)(rd)->rrdset->counter ) * (time_t)(rd)->rrdset->update_every) +// get the total duration in seconds of the round-robin database +#define metric_duration(mh) (( (time_t)(mh)->counter >= (time_t)(mh)->entries ? (time_t)(mh)->entries : (time_t)(mh)->counter ) * (time_t)(mh)->update_every_s) -// get the last slot updated in the round robin database -#define rrddim_last_slot(rd) ((size_t)(((rd)->rrdset->current_entry == 0) ? (rd)->rrdset->entries - 1 : (rd)->rrdset->current_entry - 1)) +// get the last slot updated in the round-robin database +#define rrddim_last_slot(mh) ((size_t)(((mh)->current_entry == 0) ? (mh)->entries - 1 : (mh)->current_entry - 1)) // return the slot that has the oldest value -#define rrddim_first_slot(rd) ((size_t)((rd)->rrdset->counter >= (size_t)(rd)->rrdset->entries ? (rd)->rrdset->current_entry : 0)) +#define rrddim_first_slot(mh) ((size_t)((mh)->counter >= (size_t)(mh)->entries ? (mh)->current_entry : 0)) -// get the slot of the round robin database, for the given timestamp (t) -// it always returns a valid slot, although may not be for the time requested if the time is outside the round robin database +// get the slot of the round-robin database, for the given timestamp (t) +// it always returns a valid slot, although it may not be for the time requested if the time is outside the round-robin database // only valid when not using dbengine -static inline size_t rrddim_time2slot(RRDDIM *rd, time_t t) { +static inline size_t rrddim_time2slot(STORAGE_METRIC_HANDLE *db_metric_handle, time_t t) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + RRDDIM *rd = mh->rd; + size_t ret = 0; - time_t last_entry_t = rrddim_query_latest_time((STORAGE_METRIC_HANDLE *)rd); - time_t first_entry_t = rrddim_query_oldest_time((STORAGE_METRIC_HANDLE *)rd); - size_t entries = rd->rrdset->entries; - size_t first_slot = rrddim_first_slot(rd); - size_t last_slot = rrddim_last_slot(rd); - size_t update_every = rd->rrdset->update_every; - - if(t >= last_entry_t) { + time_t last_entry_s = rrddim_query_latest_time_s(db_metric_handle); + time_t first_entry_s = rrddim_query_oldest_time_s(db_metric_handle); + size_t entries = mh->entries; + size_t first_slot = rrddim_first_slot(mh); + size_t last_slot = rrddim_last_slot(mh); + size_t update_every = mh->update_every_s; + + if(t >= last_entry_s) { // the requested time is after the last entry we have ret = last_slot; } else { - if(t <= first_entry_t) { + if(t <= first_entry_s) { // the requested time is before the first entry we have ret = first_slot; } else { - if(last_slot >= (size_t)((last_entry_t - t) / update_every)) - ret = last_slot - ((last_entry_t - t) / update_every); + if(last_slot >= (size_t)((last_entry_s - t) / update_every)) + ret = last_slot - ((last_entry_s - t) / update_every); else - ret = last_slot - ((last_entry_t - t) / update_every) + entries; + ret = last_slot - ((last_entry_s - t) / update_every) + entries; } } @@ -155,15 +289,18 @@ static inline size_t rrddim_time2slot(RRDDIM *rd, time_t t) { return ret; } -// get the timestamp of a specific slot in the round robin database +// get the timestamp of a specific slot in the round-robin database // only valid when not using dbengine -static inline time_t rrddim_slot2time(RRDDIM *rd, size_t slot) { +static inline time_t rrddim_slot2time(STORAGE_METRIC_HANDLE *db_metric_handle, size_t slot) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + RRDDIM *rd = mh->rd; + time_t ret; - time_t last_entry_t = rrddim_query_latest_time((STORAGE_METRIC_HANDLE *)rd); - time_t first_entry_t = rrddim_query_oldest_time((STORAGE_METRIC_HANDLE *)rd); - size_t entries = rd->rrdset->entries; - size_t last_slot = rrddim_last_slot(rd); - size_t update_every = rd->rrdset->update_every; + time_t last_entry_s = rrddim_query_latest_time_s(db_metric_handle); + time_t first_entry_s = rrddim_query_oldest_time_s(db_metric_handle); + size_t entries = mh->entries; + size_t last_slot = rrddim_last_slot(mh); + size_t update_every = mh->update_every_s; if(slot >= entries) { error("INTERNAL ERROR: caller of rrddim_slot2time() gives invalid slot %zu", slot); @@ -171,18 +308,22 @@ static inline time_t rrddim_slot2time(RRDDIM *rd, size_t slot) { } if(slot > last_slot) - ret = last_entry_t - (time_t)(update_every * (last_slot - slot + entries)); + ret = last_entry_s - (time_t)(update_every * (last_slot - slot + entries)); else - ret = last_entry_t - (time_t)(update_every * (last_slot - slot)); + ret = last_entry_s - (time_t)(update_every * (last_slot - slot)); - if(unlikely(ret < first_entry_t)) { - error("INTERNAL ERROR: rrddim_slot2time() on %s returns time too far in the past", rrddim_name(rd)); - ret = first_entry_t; + if(unlikely(ret < first_entry_s)) { + error("INTERNAL ERROR: rrddim_slot2time() on dimension '%s' of chart '%s' returned time (%ld) too far in the past (before first_entry_s %ld) for slot %zu", + rrddim_name(rd), rrdset_id(rd->rrdset), ret, first_entry_s, slot); + + ret = first_entry_s; } - if(unlikely(ret > last_entry_t)) { - error("INTERNAL ERROR: rrddim_slot2time() on %s returns time into the future", rrddim_name(rd)); - ret = last_entry_t; + if(unlikely(ret > last_entry_s)) { + error("INTERNAL ERROR: rrddim_slot2time() on dimension '%s' of chart '%s' returned time (%ld) too far into the future (after last_entry_s %ld) for slot %zu", + rrddim_name(rd), rrdset_id(rd->rrdset), ret, last_entry_s, slot); + + ret = last_entry_s; } return ret; @@ -191,23 +332,28 @@ static inline time_t rrddim_slot2time(RRDDIM *rd, size_t slot) { // ---------------------------------------------------------------------------- // RRDDIM legacy database query functions -void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time, time_t end_time) { - RRDDIM *rd = (RRDDIM *)db_metric_handle; +void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority __maybe_unused) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + + check_metric_handle_from_rrddim(mh); - handle->rd = rd; - handle->start_time_s = start_time; - handle->end_time_s = end_time; + handle->start_time_s = start_time_s; + handle->end_time_s = end_time_s; + handle->priority = priority; struct mem_query_handle* h = mallocz(sizeof(struct mem_query_handle)); - h->slot = rrddim_time2slot(rd, start_time); - h->last_slot = rrddim_time2slot(rd, end_time); - h->dt = rd->rrdset->update_every; + h->db_metric_handle = db_metric_handle; - h->next_timestamp = start_time; - h->slot_timestamp = rrddim_slot2time(rd, h->slot); - h->last_timestamp = rrddim_slot2time(rd, h->last_slot); + h->slot = rrddim_time2slot(db_metric_handle, start_time_s); + h->last_slot = rrddim_time2slot(db_metric_handle, end_time_s); + h->dt = mh->update_every_s; + + h->next_timestamp = start_time_s; + h->slot_timestamp = rrddim_slot2time(db_metric_handle, h->slot); + h->last_timestamp = rrddim_slot2time(db_metric_handle, h->last_slot); // info("RRDDIM QUERY INIT: start %ld, end %ld, next %ld, first %ld, last %ld, dt %ld", start_time, end_time, h->next_timestamp, h->slot_timestamp, h->last_timestamp, h->dt); + __atomic_add_fetch(&rrddim_db_memory_size, sizeof(struct mem_query_handle), __ATOMIC_RELAXED); handle->handle = (STORAGE_QUERY_HANDLE *)h; } @@ -215,9 +361,11 @@ void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_e // IT IS REQUIRED TO **ALWAYS** SET ALL RETURN VALUES (current_time, end_time, flags) // IT IS REQUIRED TO **ALWAYS** KEEP TRACK OF TIME, EVEN OUTSIDE THE DATABASE BOUNDARIES STORAGE_POINT rrddim_query_next_metric(struct storage_engine_query_handle *handle) { - RRDDIM *rd = handle->rd; struct mem_query_handle* h = (struct mem_query_handle*)handle->handle; - size_t entries = rd->rrdset->entries; + struct mem_metric_handle *mh = (struct mem_metric_handle *)h->db_metric_handle; + RRDDIM *rd = mh->rd; + + size_t entries = mh->entries; size_t slot = h->slot; STORAGE_POINT sp; @@ -227,16 +375,16 @@ STORAGE_POINT rrddim_query_next_metric(struct storage_engine_query_handle *handl h->next_timestamp += h->dt; // set this timestamp for our caller - sp.start_time = this_timestamp - h->dt; - sp.end_time = this_timestamp; + sp.start_time_s = this_timestamp - h->dt; + sp.end_time_s = this_timestamp; if(unlikely(this_timestamp < h->slot_timestamp)) { - storage_point_empty(sp, sp.start_time, sp.end_time); + storage_point_empty(sp, sp.start_time_s, sp.end_time_s); return sp; } if(unlikely(this_timestamp > h->last_timestamp)) { - storage_point_empty(sp, sp.start_time, sp.end_time); + storage_point_empty(sp, sp.start_time_s, sp.end_time_s); return sp; } @@ -254,24 +402,34 @@ STORAGE_POINT rrddim_query_next_metric(struct storage_engine_query_handle *handl } int rrddim_query_is_finished(struct storage_engine_query_handle *handle) { - struct mem_query_handle* h = (struct mem_query_handle*)handle->handle; + struct mem_query_handle *h = (struct mem_query_handle*)handle->handle; return (h->next_timestamp > handle->end_time_s); } void rrddim_query_finalize(struct storage_engine_query_handle *handle) { #ifdef NETDATA_INTERNAL_CHECKS - if(!rrddim_query_is_finished(handle)) - error("QUERY: query for chart '%s' dimension '%s' has been stopped unfinished", rrdset_id(handle->rd->rrdset), rrddim_name(handle->rd)); + struct mem_query_handle *h = (struct mem_query_handle*)handle->handle; + struct mem_metric_handle *mh = (struct mem_metric_handle *)h->db_metric_handle; + + internal_error(!rrddim_query_is_finished(handle), + "QUERY: query for chart '%s' dimension '%s' has been stopped unfinished", + rrdset_id(mh->rd->rrdset), rrddim_name(mh->rd)); + #endif freez(handle->handle); + __atomic_sub_fetch(&rrddim_db_memory_size, sizeof(struct mem_query_handle), __ATOMIC_RELAXED); +} + +time_t rrddim_query_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle) { + return rrddim_handle->end_time_s; } -time_t rrddim_query_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { - RRDDIM *rd = (RRDDIM *)db_metric_handle; - return rd->rrdset->last_updated.tv_sec; +time_t rrddim_query_latest_time_s(STORAGE_METRIC_HANDLE *db_metric_handle) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + return mh->last_updated_s; } -time_t rrddim_query_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle) { - RRDDIM *rd = (RRDDIM *)db_metric_handle; - return (time_t)(rd->rrdset->last_updated.tv_sec - rrddim_duration(rd)); +time_t rrddim_query_oldest_time_s(STORAGE_METRIC_HANDLE *db_metric_handle) { + struct mem_metric_handle *mh = (struct mem_metric_handle *)db_metric_handle; + return (time_t)(mh->last_updated_s - metric_duration(mh)); } diff --git a/database/ram/rrddim_mem.h b/database/ram/rrddim_mem.h index 79c59f110..373a2bd7b 100644 --- a/database/ram/rrddim_mem.h +++ b/database/ram/rrddim_mem.h @@ -6,12 +6,12 @@ #include "database/rrd.h" struct mem_collect_handle { + STORAGE_METRIC_HANDLE *db_metric_handle; RRDDIM *rd; - long slot; - long entries; }; struct mem_query_handle { + STORAGE_METRIC_HANDLE *db_metric_handle; time_t dt; time_t next_timestamp; time_t last_timestamp; @@ -25,12 +25,14 @@ STORAGE_METRIC_HANDLE *rrddim_metric_get(STORAGE_INSTANCE *db_instance, uuid_t * STORAGE_METRIC_HANDLE *rrddim_metric_dup(STORAGE_METRIC_HANDLE *db_metric_handle); void rrddim_metric_release(STORAGE_METRIC_HANDLE *db_metric_handle); +bool rrddim_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *uuid, time_t *first_entry_s, time_t *last_entry_s); + STORAGE_METRICS_GROUP *rrddim_metrics_group_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); void rrddim_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg); STORAGE_COLLECT_HANDLE *rrddim_collect_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg); void rrddim_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every); -void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time, NETDATA_DOUBLE number, +void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, NETDATA_DOUBLE number, NETDATA_DOUBLE min_value, NETDATA_DOUBLE max_value, uint16_t count, @@ -39,11 +41,12 @@ void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec void rrddim_store_metric_flush(STORAGE_COLLECT_HANDLE *collection_handle); int rrddim_collect_finalize(STORAGE_COLLECT_HANDLE *collection_handle); -void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time, time_t end_time); +void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); STORAGE_POINT rrddim_query_next_metric(struct storage_engine_query_handle *handle); int rrddim_query_is_finished(struct storage_engine_query_handle *handle); void rrddim_query_finalize(struct storage_engine_query_handle *handle); -time_t rrddim_query_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle); -time_t rrddim_query_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrddim_query_latest_time_s(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrddim_query_oldest_time_s(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrddim_query_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle); #endif diff --git a/database/rrd.c b/database/rrd.c index df364419e..d489ddb8b 100644 --- a/database/rrd.c +++ b/database/rrd.c @@ -135,7 +135,7 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) { // ---------------------------------------------------------------------------- // RRD - cache directory -char *rrdset_cache_dir(RRDHOST *host, const char *id) { +char *rrdhost_cache_dir_for_rrdset_alloc(RRDHOST *host, const char *id) { char *ret = NULL; char b[FILENAME_MAX + 1]; diff --git a/database/rrd.h b/database/rrd.h index 0796ff901..42eeb1655 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -30,11 +30,12 @@ typedef struct rrdhost_acquired RRDHOST_ACQUIRED; typedef struct rrdset_acquired RRDSET_ACQUIRED; typedef struct rrddim_acquired RRDDIM_ACQUIRED; -typedef void *ml_host_t; -typedef void *ml_dimension_t; +typedef struct ml_host ml_host_t; +typedef struct ml_chart ml_chart_t; +typedef struct ml_dimension ml_dimension_t; -typedef enum { - QUERY_SOURCE_UNKNOWN, +typedef enum __attribute__ ((__packed__)) { + QUERY_SOURCE_UNKNOWN = 0, QUERY_SOURCE_API_DATA, QUERY_SOURCE_API_BADGE, QUERY_SOURCE_API_WEIGHTS, @@ -43,6 +44,19 @@ typedef enum { QUERY_SOURCE_UNITTEST, } QUERY_SOURCE; +typedef enum __attribute__ ((__packed__)) storage_priority { + STORAGE_PRIORITY_INTERNAL_DBENGINE = 0, + STORAGE_PRIORITY_INTERNAL_QUERY_PREP, + + // query priorities + STORAGE_PRIORITY_HIGH, + STORAGE_PRIORITY_NORMAL, + STORAGE_PRIORITY_LOW, + STORAGE_PRIORITY_BEST_EFFORT, + + STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE, +} STORAGE_PRIORITY; + // forward declarations struct rrddim_tier; @@ -52,6 +66,32 @@ struct rrdengine_instance; struct pg_cache_page_index; #endif +// ---------------------------------------------------------------------------- +// memory mode + +typedef enum __attribute__ ((__packed__)) rrd_memory_mode { + RRD_MEMORY_MODE_NONE = 0, + RRD_MEMORY_MODE_RAM = 1, + RRD_MEMORY_MODE_MAP = 2, + RRD_MEMORY_MODE_SAVE = 3, + RRD_MEMORY_MODE_ALLOC = 4, + RRD_MEMORY_MODE_DBENGINE = 5, + + // this is 8-bit +} RRD_MEMORY_MODE; + +#define RRD_MEMORY_MODE_NONE_NAME "none" +#define RRD_MEMORY_MODE_RAM_NAME "ram" +#define RRD_MEMORY_MODE_MAP_NAME "map" +#define RRD_MEMORY_MODE_SAVE_NAME "save" +#define RRD_MEMORY_MODE_ALLOC_NAME "alloc" +#define RRD_MEMORY_MODE_DBENGINE_NAME "dbengine" + +extern RRD_MEMORY_MODE default_rrd_memory_mode; + +const char *rrd_memory_mode_name(RRD_MEMORY_MODE id); +RRD_MEMORY_MODE rrd_memory_mode_id(const char *name); + #include "daemon/common.h" #include "web/api/queries/query.h" #include "web/api/queries/rrdr.h" @@ -63,34 +103,48 @@ struct pg_cache_page_index; #include "streaming/rrdpush.h" #include "aclk/aclk_rrdhost_state.h" #include "sqlite/sqlite_health.h" + +typedef struct storage_query_handle STORAGE_QUERY_HANDLE; + +// iterator state for RRD dimension data queries +struct storage_engine_query_handle { + time_t start_time_s; + time_t end_time_s; + STORAGE_PRIORITY priority; + STORAGE_QUERY_HANDLE* handle; +}; + +typedef struct storage_point { + NETDATA_DOUBLE min; // when count > 1, this is the minimum among them + NETDATA_DOUBLE max; // when count > 1, this is the maximum among them + NETDATA_DOUBLE sum; // the point sum - divided by count gives the average + + // end_time - start_time = point duration + time_t start_time_s; // the time the point starts + time_t end_time_s; // the time the point ends + + size_t count; // the number of original points aggregated + size_t anomaly_count; // the number of original points found anomalous + + SN_FLAGS flags; // flags stored with the point +} STORAGE_POINT; + #include "rrdcontext.h" extern bool unittest_running; extern bool dbengine_enabled; extern size_t storage_tiers; +extern bool use_direct_io; extern size_t storage_tiers_grouping_iterations[RRD_STORAGE_TIERS]; -typedef enum { - RRD_BACKFILL_NONE, +typedef enum __attribute__ ((__packed__)) { + RRD_BACKFILL_NONE = 0, RRD_BACKFILL_FULL, RRD_BACKFILL_NEW } RRD_BACKFILL; extern RRD_BACKFILL storage_tiers_backfill[RRD_STORAGE_TIERS]; -enum { - CONTEXT_FLAGS_ARCHIVE = 0x01, - CONTEXT_FLAGS_CHART = 0x02, - CONTEXT_FLAGS_CONTEXT = 0x04 -}; - -struct context_param { - RRDDIM *rd; - time_t first_entry_t; - time_t last_entry_t; - uint8_t flags; -}; - #define UPDATE_EVERY 1 #define UPDATE_EVERY_MAX 3600 @@ -100,7 +154,19 @@ struct context_param { extern int default_rrd_update_every; extern int default_rrd_history_entries; extern int gap_when_lost_iterations_above; -extern time_t rrdset_free_obsolete_time; +extern time_t rrdset_free_obsolete_time_s; + +#if defined(ENV32BIT) +#define MIN_LIBUV_WORKER_THREADS 8 +#define MAX_LIBUV_WORKER_THREADS 64 +#define RESERVED_LIBUV_WORKER_THREADS 3 +#else +#define MIN_LIBUV_WORKER_THREADS 16 +#define MAX_LIBUV_WORKER_THREADS 128 +#define RESERVED_LIBUV_WORKER_THREADS 6 +#endif + +extern int libuv_worker_threads; #define RRD_ID_LENGTH_MAX 200 @@ -110,10 +176,10 @@ typedef long long total_number; // ---------------------------------------------------------------------------- // chart types -typedef enum rrdset_type { +typedef enum __attribute__ ((__packed__)) rrdset_type { RRDSET_TYPE_LINE = 0, RRDSET_TYPE_AREA = 1, - RRDSET_TYPE_STACKED = 2 + RRDSET_TYPE_STACKED = 2, } RRDSET_TYPE; #define RRDSET_TYPE_LINE_NAME "line" @@ -124,37 +190,10 @@ RRDSET_TYPE rrdset_type_id(const char *name); const char *rrdset_type_name(RRDSET_TYPE chart_type); -// ---------------------------------------------------------------------------- -// memory mode - -typedef enum rrd_memory_mode { - RRD_MEMORY_MODE_NONE = 0, - RRD_MEMORY_MODE_RAM = 1, - RRD_MEMORY_MODE_MAP = 2, - RRD_MEMORY_MODE_SAVE = 3, - RRD_MEMORY_MODE_ALLOC = 4, - RRD_MEMORY_MODE_DBENGINE = 5, - - // this is 8-bit -} RRD_MEMORY_MODE; - -#define RRD_MEMORY_MODE_NONE_NAME "none" -#define RRD_MEMORY_MODE_RAM_NAME "ram" -#define RRD_MEMORY_MODE_MAP_NAME "map" -#define RRD_MEMORY_MODE_SAVE_NAME "save" -#define RRD_MEMORY_MODE_ALLOC_NAME "alloc" -#define RRD_MEMORY_MODE_DBENGINE_NAME "dbengine" - -extern RRD_MEMORY_MODE default_rrd_memory_mode; - -const char *rrd_memory_mode_name(RRD_MEMORY_MODE id); -RRD_MEMORY_MODE rrd_memory_mode_id(const char *name); - - // ---------------------------------------------------------------------------- // algorithms types -typedef enum rrd_algorithm { +typedef enum __attribute__ ((__packed__)) rrd_algorithm { RRD_ALGORITHM_ABSOLUTE = 0, RRD_ALGORITHM_INCREMENTAL = 1, RRD_ALGORITHM_PCENT_OVER_DIFF_TOTAL = 2, @@ -185,7 +224,7 @@ DICTIONARY *rrdfamily_rrdvars_dict(const RRDFAMILY_ACQUIRED *rf); // flags & options // options are permanent configuration options (no atomics to alter/access them) -typedef enum rrddim_options { +typedef enum __attribute__ ((__packed__)) rrddim_options { RRDDIM_OPTION_NONE = 0, RRDDIM_OPTION_HIDDEN = (1 << 0), // this dimension will not be offered to callers RRDDIM_OPTION_DONT_DETECT_RESETS_OR_OVERFLOWS = (1 << 1), // do not offer RESET or OVERFLOW info to callers @@ -199,7 +238,7 @@ typedef enum rrddim_options { #define rrddim_option_clear(rd, option) (rd)->options &= ~(option) // flags are runtime changing status flags (atomics are required to alter/access them) -typedef enum rrddim_flags { +typedef enum __attribute__ ((__packed__)) rrddim_flags { RRDDIM_FLAG_NONE = 0, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 0), @@ -218,7 +257,7 @@ typedef enum rrddim_flags { #define rrddim_flag_set(rd, flag) __atomic_or_fetch(&((rd)->flags), (flag), __ATOMIC_SEQ_CST) #define rrddim_flag_clear(rd, flag) __atomic_and_fetch(&((rd)->flags), ~(flag), __ATOMIC_SEQ_CST) -typedef enum rrdlabel_source { +typedef enum __attribute__ ((__packed__)) rrdlabel_source { RRDLABEL_SRC_AUTO = (1 << 0), // set when Netdata found the label by some automation RRDLABEL_SRC_CONFIG = (1 << 1), // set when the user configured the label RRDLABEL_SRC_K8S = (1 << 2), // set when this label is found from k8s (RRDLABEL_SRC_AUTO should also be set) @@ -266,6 +305,25 @@ int rrdlabels_unittest(void); // unfortunately this break when defined in exporting_engine.h bool exporting_labels_filter_callback(const char *name, const char *value, RRDLABEL_SRC ls, void *data); +// ---------------------------------------------------------------------------- +// engine-specific iterator state for dimension data collection +typedef struct storage_collect_handle STORAGE_COLLECT_HANDLE; + +// ---------------------------------------------------------------------------- +// Storage tier data for every dimension + +struct rrddim_tier { + STORAGE_POINT virtual_point; + size_t tier_grouping; + time_t next_point_end_time_s; + STORAGE_METRIC_HANDLE *db_metric_handle; // the metric handle inside the database + STORAGE_COLLECT_HANDLE *db_collection_handle; // the data collection handle + struct storage_engine_collect_ops *collect_ops; + struct storage_engine_query_ops *query_ops; +}; + +void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s); + // ---------------------------------------------------------------------------- // RRD DIMENSION - this is a metric @@ -278,10 +336,10 @@ struct rrddim { STRING *id; // the id of this dimension (for internal identification) STRING *name; // the name of this dimension (as presented to user) - RRD_ALGORITHM algorithm:8; // the algorithm that is applied to add new collected values - RRDDIM_OPTIONS options:8; // permanent configuration options - RRD_MEMORY_MODE rrd_memory_mode:8; // the memory mode for this dimension - /*RRDDIM_FLAGS*/ uint8_t flags; // run time changing status flags + RRD_ALGORITHM algorithm; // the algorithm that is applied to add new collected values + RRDDIM_OPTIONS options; // permanent configuration options + RRD_MEMORY_MODE rrd_memory_mode; // the memory mode for this dimension + RRDDIM_FLAGS flags; // run time changing status flags bool updated; // 1 when the dimension has been updated since the last processing bool exposed; // 1 when set what have sent this dimension to the central netdata @@ -296,7 +354,7 @@ struct rrddim { // ------------------------------------------------------------------------ // operational state members - ml_dimension_t ml_dimension; // machine learning data about this dimension + ml_dimension_t *ml_dimension; // machine learning data about this dimension // ------------------------------------------------------------------------ // linking to siblings and parents @@ -308,7 +366,7 @@ struct rrddim { // ------------------------------------------------------------------------ // data collection members - struct rrddim_tier *tiers[RRD_STORAGE_TIERS]; // our tiers of databases + struct rrddim_tier tiers[RRD_STORAGE_TIERS]; // our tiers of databases struct timeval last_collected_time; // when was this dimension last updated // this is actual date time we updated the last_collected_value @@ -361,49 +419,28 @@ void rrddim_memory_file_save(RRDDIM *rd); // ---------------------------------------------------------------------------- -typedef struct storage_point { - NETDATA_DOUBLE min; // when count > 1, this is the minimum among them - NETDATA_DOUBLE max; // when count > 1, this is the maximum among them - NETDATA_DOUBLE sum; // the point sum - divided by count gives the average - - // end_time - start_time = point duration - time_t start_time; // the time the point starts - time_t end_time; // the time the point ends - - unsigned count; // the number of original points aggregated - unsigned anomaly_count; // the number of original points found anomalous - - SN_FLAGS flags; // flags stored with the point -} STORAGE_POINT; - #define storage_point_unset(x) do { \ (x).min = (x).max = (x).sum = NAN; \ (x).count = 0; \ (x).anomaly_count = 0; \ (x).flags = SN_FLAG_NONE; \ - (x).start_time = 0; \ - (x).end_time = 0; \ + (x).start_time_s = 0; \ + (x).end_time_s = 0; \ } while(0) -#define storage_point_empty(x, start_t, end_t) do { \ +#define storage_point_empty(x, start_s, end_s) do { \ (x).min = (x).max = (x).sum = NAN; \ (x).count = 1; \ (x).anomaly_count = 0; \ (x).flags = SN_FLAG_NONE; \ - (x).start_time = start_t; \ - (x).end_time = end_t; \ + (x).start_time_s = start_s; \ + (x).end_time_s = end_s; \ } while(0) -#define storage_point_is_unset(x) (!(x).count) -#define storage_point_is_empty(x) (!netdata_double_isnumber((x).sum)) - -// ---------------------------------------------------------------------------- -// engine-specific iterator state for dimension data collection -typedef struct storage_collect_handle STORAGE_COLLECT_HANDLE; +#define STORAGE_POINT_UNSET { .min = NAN, .max = NAN, .sum = NAN, .count = 0, .anomaly_count = 0, .flags = SN_FLAG_NONE, .start_time_s = 0, .end_time_s = 0 } -// ---------------------------------------------------------------------------- -// engine-specific iterator state for dimension data queries -typedef struct storage_query_handle STORAGE_QUERY_HANDLE; +#define storage_point_is_unset(x) (!(x).count) +#define storage_point_is_gap(x) (!netdata_double_isnumber((x).sum)) // ------------------------------------------------------------------------ // function pointers that handle data collection @@ -429,18 +466,11 @@ struct storage_engine_collect_ops { }; // ---------------------------------------------------------------------------- -// iterator state for RRD dimension data queries -struct storage_engine_query_handle { - RRDDIM *rd; - time_t start_time_s; - time_t end_time_s; - STORAGE_QUERY_HANDLE* handle; -}; // function pointers that handle database queries struct storage_engine_query_ops { // run this before starting a series of next_metric() database queries - void (*init)(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time, time_t end_time); + void (*init)(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); // run this to load each metric number from the database STORAGE_POINT (*next_metric)(struct storage_engine_query_handle *handle); @@ -452,10 +482,14 @@ struct storage_engine_query_ops { void (*finalize)(struct storage_engine_query_handle *handle); // get the timestamp of the last entry of this metric - time_t (*latest_time)(STORAGE_METRIC_HANDLE *db_metric_handle); + time_t (*latest_time_s)(STORAGE_METRIC_HANDLE *db_metric_handle); // get the timestamp of the first entry of this metric - time_t (*oldest_time)(STORAGE_METRIC_HANDLE *db_metric_handle); + time_t (*oldest_time_s)(STORAGE_METRIC_HANDLE *db_metric_handle); + + // adapt 'before' timestamp to the optimal for the query + // can only move 'before' ahead (to the future) + time_t (*align_to_optimal_before)(struct storage_engine_query_handle *handle); }; typedef struct storage_engine STORAGE_ENGINE; @@ -468,6 +502,7 @@ typedef struct storage_engine_api { STORAGE_METRIC_HANDLE *(*metric_get_or_create)(RRDDIM *rd, STORAGE_INSTANCE *instance); void (*metric_release)(STORAGE_METRIC_HANDLE *); STORAGE_METRIC_HANDLE *(*metric_dup)(STORAGE_METRIC_HANDLE *); + bool (*metric_retention_by_uuid)(STORAGE_INSTANCE *db_instance, uuid_t *uuid, time_t *first_entry_s, time_t *last_entry_s); // operations struct storage_engine_collect_ops collect_ops; @@ -483,21 +518,6 @@ struct storage_engine { STORAGE_ENGINE* storage_engine_get(RRD_MEMORY_MODE mmode); STORAGE_ENGINE* storage_engine_find(const char* name); -// ---------------------------------------------------------------------------- -// Storage tier data for every dimension - -struct rrddim_tier { - size_t tier_grouping; - STORAGE_METRIC_HANDLE *db_metric_handle; // the metric handle inside the database - STORAGE_COLLECT_HANDLE *db_collection_handle; // the data collection handle - STORAGE_POINT virtual_point; - time_t next_point_time; - struct storage_engine_collect_ops *collect_ops; - struct storage_engine_query_ops *query_ops; -}; - -void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now); - // ---------------------------------------------------------------------------- // these loop macros make sure the linked list is accessed with the right lock @@ -520,7 +540,7 @@ void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now); // flags are set/unset in a manner that is not thread safe // and may lead to missing information. -typedef enum rrdset_flags { +typedef enum __attribute__ ((__packed__)) rrdset_flags { RRDSET_FLAG_DETAIL = (1 << 1), // if set, the data set should be considered as a detail of another // (the master data set should be the one that has the same family and is not detail) RRDSET_FLAG_DEBUG = (1 << 2), // enables or disables debugging for a chart @@ -554,6 +574,8 @@ typedef enum rrdset_flags { RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED = (1 << 25), // the receiving side has completed replication RRDSET_FLAG_UPSTREAM_SEND_VARIABLES = (1 << 26), // a custom variable has been updated and needs to be exposed to parent + + RRDSET_FLAG_COLLECTION_FINISHED = (1 << 27), // when set, data collection is not available for this chart } RRDSET_FLAGS; #define rrdset_flag_check(st, flag) (__atomic_load_n(&((st)->flags), __ATOMIC_SEQ_CST) & (flag)) @@ -595,6 +617,8 @@ struct rrdset { DICTIONARY *rrddimvar_root_index; // dimension variables // we use this dictionary to manage their allocation + ml_chart_t *ml_chart; + // ------------------------------------------------------------------------ // operational state members @@ -603,10 +627,6 @@ struct rrdset { DICTIONARY *rrddim_root_index; // dimensions index - int gap_when_lost_iterations_above; // after how many lost iterations a gap should be stored - // netdata will interpolate values for gaps lower than this - // TODO - use the global - all charts have the same value - STORAGE_METRICS_GROUP *storage_metrics_groups[RRD_STORAGE_TIERS]; // ------------------------------------------------------------------------ @@ -620,10 +640,12 @@ struct rrdset { // ------------------------------------------------------------------------ // data collection members + SPINLOCK data_collection_lock; + size_t counter; // the number of times we added values to this database size_t counter_done; // the number of times rrdset_done() has been called - time_t last_accessed_time; // the last time this RRDSET has been accessed + time_t last_accessed_time_s; // the last time this RRDSET has been accessed usec_t usec_since_last_update; // the time in microseconds since the last collection of data @@ -637,7 +659,7 @@ struct rrdset { // ------------------------------------------------------------------------ // data collection - streaming to parents, temp variables - time_t upstream_resync_time; // the timestamp up to which we should resync clock upstream + time_t upstream_resync_time_s; // the timestamp up to which we should resync clock upstream // ------------------------------------------------------------------------ // db mode SAVE, MAP specifics @@ -645,7 +667,6 @@ struct rrdset { // (RRDSET_DB_STATE ptr to an undefined structure, and a call to clean this up during destruction) char *cache_dir; // the directory to store dimensions - unsigned long memsize; // how much mem we have allocated for this (without dimensions) void *st_on_file; // compatibility with V019 RRDSET files // ------------------------------------------------------------------------ @@ -735,7 +756,7 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st_on_file, RRD_MEMORY_MODE m // flags are set/unset in a manner that is not thread safe // and may lead to missing information. -typedef enum rrdhost_flags { +typedef enum __attribute__ ((__packed__)) rrdhost_flags { // Orphan, Archived and Obsolete flags RRDHOST_FLAG_ORPHAN = (1 << 10), // this host is orphan (not receiving data) RRDHOST_FLAG_ARCHIVED = (1 << 11), // The host is archived, no collected charts yet @@ -748,7 +769,6 @@ typedef enum rrdhost_flags { RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED = (1 << 16), // When set, the host is connected to a parent RRDHOST_FLAG_RRDPUSH_SENDER_READY_4_METRICS = (1 << 17), // when set, rrdset_done() should push metrics to parent RRDHOST_FLAG_RRDPUSH_SENDER_LOGGED_STATUS = (1 << 18), // when set, we have logged the status of metrics streaming - RRDHOST_FLAG_RRDPUSH_SENDER_JOIN = (1 << 19), // When set, we want to join the sender thread // Health RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 20), // contains charts and dims with uninitialized variables @@ -762,8 +782,11 @@ typedef enum rrdhost_flags { RRDHOST_FLAG_ACLK_STREAM_CONTEXTS = (1 << 24), // when set, we should send ACLK stream context updates // Metadata RRDHOST_FLAG_METADATA_UPDATE = (1 << 25), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_LABELS = (1 << 26), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_INFO = (1 << 27), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_CLAIMID = (1 << 28), // metadata needs to be stored in the database - RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED = ( 1 << 26), // set when the receiver part is disconnected + RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED = (1 << 29), // set when the receiver part is disconnected } RRDHOST_FLAGS; #define rrdhost_flag_check(host, flag) (__atomic_load_n(&((host)->flags), __ATOMIC_SEQ_CST) & (flag)) @@ -777,7 +800,7 @@ typedef enum rrdhost_flags { #define rrdset_debug(st, fmt, args...) debug_dummy() #endif -typedef enum { +typedef enum __attribute__ ((__packed__)) { // Indexing RRDHOST_OPTION_INDEXED_MACHINE_GUID = (1 << 0), // when set, we have indexed its machine guid RRDHOST_OPTION_INDEXED_HOSTNAME = (1 << 1), // when set, we have indexed its hostname @@ -880,6 +903,15 @@ typedef struct alarm_log { netdata_rwlock_t alarm_log_rwlock; } ALARM_LOG; +typedef struct health { + unsigned int health_enabled; // 1 when this host has health enabled + time_t health_delay_up_to; // a timestamp to delay alarms processing up to + STRING *health_default_exec; // the full path of the alarms notifications program + STRING *health_default_recipient; // the default recipient for all alarms + size_t health_log_entries_written; // the number of alarm events written to the alarms event log + uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications + uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications +} HEALTH; // ---------------------------------------------------------------------------- // RRD HOST @@ -984,10 +1016,10 @@ struct rrdhost { // ------------------------------------------------------------------------ // streaming of data from remote hosts - rrdpush receiver - time_t senders_connect_time; // the time the last sender was connected - time_t senders_last_chart_command; // the time of the last CHART streaming command - time_t senders_disconnected_time; // the time the last sender was disconnected - int senders_count; // number of senders currently streaming + time_t child_connect_time; // the time the last sender was connected + time_t child_last_chart_command; // the time of the last CHART streaming command + time_t child_disconnected_time; // the time the last sender was disconnected + int connected_children_count; // number of senders currently streaming struct receiver_state *receiver; netdata_mutex_t receiver_lock; @@ -997,18 +1029,8 @@ struct rrdhost { // ------------------------------------------------------------------------ // health monitoring options - unsigned int health_enabled; // 1 when this host has health enabled - bool health_spawn; // true when health thread is running - netdata_thread_t health_thread; // the health thread - unsigned int aclk_alert_reloaded; // 1 on thread start and health reload, 0 after removed are sent - time_t health_delay_up_to; // a timestamp to delay alarms processing up to - STRING *health_default_exec; // the full path of the alarms notifications program - STRING *health_default_recipient; // the default recipient for all alarms - char *health_log_filename; // the alarms event log filename - size_t health_log_entries_written; // the number of alarm events written to the alarms event log - FILE *health_log_fp; // the FILE pointer to the open alarms event log file - uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications - uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications + // health variables + HEALTH health; // all RRDCALCs are primarily allocated and linked here DICTIONARY *rrdcalc_root_index; @@ -1024,11 +1046,11 @@ struct rrdhost { // ------------------------------------------------------------------------ // locks - netdata_rwlock_t rrdhost_rwlock; // lock for this RRDHOST (protects rrdset_root linked list) + SPINLOCK rrdhost_update_lock; // ------------------------------------------------------------------------ // ML handle - ml_host_t ml_host; + ml_host_t *ml_host; // ------------------------------------------------------------------------ // Support for host-level labels @@ -1072,10 +1094,6 @@ extern RRDHOST *localhost; #define rrdhost_program_name(host) string2str((host)->program_name) #define rrdhost_program_version(host) string2str((host)->program_version) -#define rrdhost_rdlock(host) netdata_rwlock_rdlock(&((host)->rrdhost_rwlock)) -#define rrdhost_wrlock(host) netdata_rwlock_wrlock(&((host)->rrdhost_rwlock)) -#define rrdhost_unlock(host) netdata_rwlock_unlock(&((host)->rrdhost_rwlock)) - #define rrdhost_aclk_state_lock(host) netdata_mutex_lock(&((host)->aclk_state_lock)) #define rrdhost_aclk_state_unlock(host) netdata_mutex_unlock(&((host)->aclk_state_lock)) @@ -1090,16 +1108,15 @@ extern RRDHOST *localhost; #define rrdhost_sender_replicating_charts_zero(host) (__atomic_store_n(&((host)->rrdpush_sender_replicating_charts), 0, __ATOMIC_RELAXED)) extern DICTIONARY *rrdhost_root_index; -long rrdhost_hosts_available(void); +size_t rrdhost_hosts_available(void); // ---------------------------------------------------------------------------- -// these loop macros make sure the linked list is accessed with the right lock #define rrdhost_foreach_read(var) \ - for((var) = localhost, rrd_check_rdlock(); var ; (var) = (var)->next) + for((var) = localhost; var ; (var) = (var)->next) #define rrdhost_foreach_write(var) \ - for((var) = localhost, rrd_check_wrlock(); var ; (var) = (var)->next) + for((var) = localhost; var ; (var) = (var)->next) // ---------------------------------------------------------------------------- @@ -1122,10 +1139,9 @@ void rrddim_index_destroy(RRDSET *st); // ---------------------------------------------------------------------------- -extern size_t rrd_hosts_available; -extern time_t rrdhost_free_orphan_time; +extern time_t rrdhost_free_orphan_time_s; -int rrd_init(char *hostname, struct rrdhost_system_info *system_info); +int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unittest); RRDHOST *rrdhost_find_by_hostname(const char *hostname); RRDHOST *rrdhost_find_by_guid(const char *guid); @@ -1156,57 +1172,8 @@ RRDHOST *rrdhost_find_or_create( , bool is_archived ); -void rrdhost_update(RRDHOST *host - , const char *hostname - , const char *registry_hostname - , const char *guid - , const char *os - , const char *timezone - , const char *abbrev_timezone - , int32_t utc_offset - , const char *tags - , const char *program_name - , const char *program_version - , int update_every - , long history - , RRD_MEMORY_MODE mode - , unsigned int health_enabled - , unsigned int rrdpush_enabled - , char *rrdpush_destination - , char *rrdpush_api_key - , char *rrdpush_send_charts_matching - , bool rrdpush_enable_replication - , time_t rrdpush_seconds_to_replicate - , time_t rrdpush_replication_step - , struct rrdhost_system_info *system_info -); - int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, char *name, char *value); -#if defined(NETDATA_INTERNAL_CHECKS) && defined(NETDATA_VERIFY_LOCKS) -void __rrdhost_check_wrlock(RRDHOST *host, const char *file, const char *function, const unsigned long line); -void __rrdhost_check_rdlock(RRDHOST *host, const char *file, const char *function, const unsigned long line); -void __rrdset_check_rdlock(RRDSET *st, const char *file, const char *function, const unsigned long line); -void __rrdset_check_wrlock(RRDSET *st, const char *file, const char *function, const unsigned long line); -void __rrd_check_rdlock(const char *file, const char *function, const unsigned long line); -void __rrd_check_wrlock(const char *file, const char *function, const unsigned long line); - -#define rrdhost_check_rdlock(host) __rrdhost_check_rdlock(host, __FILE__, __FUNCTION__, __LINE__) -#define rrdhost_check_wrlock(host) __rrdhost_check_wrlock(host, __FILE__, __FUNCTION__, __LINE__) -#define rrdset_check_rdlock(st) __rrdset_check_rdlock(st, __FILE__, __FUNCTION__, __LINE__) -#define rrdset_check_wrlock(st) __rrdset_check_wrlock(st, __FILE__, __FUNCTION__, __LINE__) -#define rrd_check_rdlock() __rrd_check_rdlock(__FILE__, __FUNCTION__, __LINE__) -#define rrd_check_wrlock() __rrd_check_wrlock(__FILE__, __FUNCTION__, __LINE__) - -#else -#define rrdhost_check_rdlock(host) (void)0 -#define rrdhost_check_wrlock(host) (void)0 -#define rrdset_check_rdlock(st) (void)0 -#define rrdset_check_wrlock(st) (void)0 -#define rrd_check_rdlock() (void)0 -#define rrd_check_wrlock() (void)0 -#endif - // ---------------------------------------------------------------------------- // RRDSET functions @@ -1239,15 +1206,15 @@ void rrdhost_save_all(void); void rrdhost_cleanup_all(void); void rrdhost_system_info_free(struct rrdhost_system_info *system_info); -void rrdhost_free(RRDHOST *host, bool force); +void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force); void rrdhost_save_charts(RRDHOST *host); void rrdhost_delete_charts(RRDHOST *host); -int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now); +int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now_s); void rrdset_update_heterogeneous_flag(RRDSET *st); -time_t rrdset_set_update_every(RRDSET *st, time_t update_every); +time_t rrdset_set_update_every_s(RRDSET *st, time_t update_every_s); RRDSET *rrdset_find(RRDHOST *host, const char *id); #define rrdset_find_localhost(id) rrdset_find(localhost, id) @@ -1298,13 +1265,17 @@ void rrdset_isnot_obsolete(RRDSET *st); #define rrdset_is_available_for_exporting_and_alarms(st) (!rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && rrdset_number_of_dimensions(st)) #define rrdset_is_archived(st) (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && rrdset_number_of_dimensions(st)) -time_t rrddim_first_entry_t(RRDDIM *rd); -time_t rrddim_first_entry_t_of_tier(RRDDIM *rd, size_t tier); -time_t rrddim_last_entry_t(RRDDIM *rd); -time_t rrdset_last_entry_t(RRDSET *st); -time_t rrdset_first_entry_t_of_tier(RRDSET *st, size_t tier); -time_t rrdset_first_entry_t(RRDSET *st); -time_t rrdhost_last_entry_t(RRDHOST *h); +time_t rrddim_first_entry_s(RRDDIM *rd); +time_t rrddim_first_entry_s_of_tier(RRDDIM *rd, size_t tier); +time_t rrddim_last_entry_s(RRDDIM *rd); +time_t rrddim_last_entry_s_of_tier(RRDDIM *rd, size_t tier); + +time_t rrdset_first_entry_s(RRDSET *st); +time_t rrdset_first_entry_s_of_tier(RRDSET *st, size_t tier); +time_t rrdset_last_entry_s(RRDSET *st); +time_t rrdset_last_entry_s_of_tier(RRDSET *st, size_t tier); + +void rrdset_get_retention_of_tier_for_collected_chart(RRDSET *st, time_t *first_time_s, time_t *last_time_s, time_t now_s, size_t tier); // ---------------------------------------------------------------------------- // RRD DIMENSION functions @@ -1342,9 +1313,11 @@ collected_number rrddim_timed_set_by_pointer(RRDSET *st, RRDDIM *rd, struct time collected_number rrddim_set_by_pointer(RRDSET *st, RRDDIM *rd, collected_number value); collected_number rrddim_set(RRDSET *st, const char *id, collected_number value); -#ifdef ENABLE_ACLK -time_t calc_dimension_liveness(RRDDIM *rd, time_t now); -#endif +bool rrddim_finalize_collection_and_check_retention(RRDDIM *rd); +void rrdset_finalize_collection(RRDSET *st, bool dimensions_too); +void rrdhost_finalize_collection(RRDHOST *host); +void rrd_finalize_collection_for_all_hosts(void); + long align_entries_to_pagesize(RRD_MEMORY_MODE mode, long entries); #ifdef NETDATA_LOG_COLLECTION_ERRORS @@ -1368,21 +1341,14 @@ void rrdset_free(RRDSET *st); #ifdef NETDATA_RRD_INTERNALS -char *rrdset_cache_dir(RRDHOST *host, const char *id); +char *rrdhost_cache_dir_for_rrdset_alloc(RRDHOST *host, const char *id); +const char *rrdset_cache_dir(RRDSET *st); void rrddim_free(RRDSET *st, RRDDIM *rd); void rrdset_reset(RRDSET *st); void rrdset_delete_obsolete_dimensions(RRDSET *st); -RRDHOST *rrdhost_create( - const char *hostname, const char *registry_hostname, const char *guid, const char *os, const char *timezone, - const char *abbrev_timezone, int32_t utc_offset,const char *tags, const char *program_name, const char *program_version, - int update_every, long entries, RRD_MEMORY_MODE memory_mode, unsigned int health_enabled, unsigned int rrdpush_enabled, - char *rrdpush_destination, char *rrdpush_api_key, char *rrdpush_send_charts_matching, - bool rrdpush_enable_replication, time_t rrdpush_seconds_to_replicate, time_t rrdpush_replication_step, - struct rrdhost_system_info *system_info, int is_localhost, bool is_archived); - #endif /* NETDATA_RRD_INTERNALS */ void set_host_properties( @@ -1391,6 +1357,7 @@ void set_host_properties( const char *program_name, const char *program_version); size_t get_tier_grouping(size_t tier); +void store_metric_collection_completed(void); // ---------------------------------------------------------------------------- // RRD DB engine declarations diff --git a/database/rrdcalc.c b/database/rrdcalc.c index aad945a90..762635824 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -74,18 +74,16 @@ static STRING *rrdcalc_replace_variables_with_rrdset_labels(const char *line, RR char var[RRDCALC_VAR_MAX]; char *m, *lbl_value = NULL; - while ((m = strchr(temp + pos, '$'))) { + while ((m = strchr(temp + pos, '$')) && *(m+1) == '{') { int i = 0; char *e = m; while (*e) { + var[i++] = *e; - if (*e == ' ' || i == RRDCALC_VAR_MAX - 1) + if (*e == '}' || i == RRDCALC_VAR_MAX - 1) break; - else - var[i] = *e; e++; - i++; } var[i] = '\0'; @@ -97,8 +95,12 @@ static STRING *rrdcalc_replace_variables_with_rrdset_labels(const char *line, RR temp = buf; } else if (!strncmp(var, RRDCALC_VAR_LABEL, RRDCALC_VAR_LABEL_LEN)) { + char label_val[RRDCALC_VAR_MAX + 1] = { 0 }; + strcpy(label_val, var+RRDCALC_VAR_LABEL_LEN); + label_val[i - RRDCALC_VAR_LABEL_LEN - 1] = '\0'; + if(likely(rc->rrdset && rc->rrdset->rrdlabels)) { - rrdlabels_get_value_to_char_or_null(rc->rrdset->rrdlabels, &lbl_value, var+RRDCALC_VAR_LABEL_LEN); + rrdlabels_get_value_to_char_or_null(rc->rrdset->rrdlabels, &lbl_value, label_val); if (lbl_value) { char *buf = find_and_replace(temp, var, lbl_value, m); freez(temp); @@ -179,7 +181,7 @@ static void rrdcalc_link_to_rrdset(RRDSET *st, RRDCALC *rc) { rc->rrdset = st; netdata_rwlock_wrlock(&st->alerts.rwlock); - DOUBLE_LINKED_LIST_APPEND_UNSAFE(st->alerts.base, rc, prev, next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(st->alerts.base, rc, prev, next); netdata_rwlock_unlock(&st->alerts.rwlock); if(rc->update_every < rc->rrdset->update_every) { @@ -326,7 +328,7 @@ static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) { if(!having_ll_wrlock) netdata_rwlock_wrlock(&st->alerts.rwlock); - DOUBLE_LINKED_LIST_REMOVE_UNSAFE(st->alerts.base, rc, prev, next); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(st->alerts.base, rc, prev, next); if(!having_ll_wrlock) netdata_rwlock_unlock(&st->alerts.rwlock); @@ -625,7 +627,8 @@ static void rrdcalc_rrdhost_delete_callback(const DICTIONARY_ITEM *item __maybe_ void rrdcalc_rrdhost_index_init(RRDHOST *host) { if(!host->rrdcalc_root_index) { - host->rrdcalc_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + host->rrdcalc_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDCALC)); dictionary_register_insert_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_insert_callback, NULL); dictionary_register_conflict_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_conflict_callback, NULL); @@ -737,7 +740,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host continue; if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=')) { - info("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", + log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", rrdcalc_name(rc), rrdhost_hostname(host), rrdcalc_host_labels(rc)); @@ -753,7 +756,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts() { RRDHOST *host; rrdhost_foreach_read(host) { - if (unlikely(!host->health_enabled)) + if (unlikely(!host->health.health_enabled)) continue; if (host->rrdlabels) diff --git a/database/rrdcalc.h b/database/rrdcalc.h index a25c05cc6..08d8beee2 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -251,8 +251,8 @@ void rrdcalc_rrdhost_index_init(RRDHOST *host); void rrdcalc_rrdhost_index_destroy(RRDHOST *host); #define RRDCALC_VAR_MAX 100 -#define RRDCALC_VAR_FAMILY "$family" -#define RRDCALC_VAR_LABEL "$label:" +#define RRDCALC_VAR_FAMILY "${family}" +#define RRDCALC_VAR_LABEL "${label:" #define RRDCALC_VAR_LABEL_LEN (sizeof(RRDCALC_VAR_LABEL)-1) #endif //NETDATA_RRDCALC_H diff --git a/database/rrdcalctemplate.c b/database/rrdcalctemplate.c index 87e085c93..4d7352b28 100644 --- a/database/rrdcalctemplate.c +++ b/database/rrdcalctemplate.c @@ -189,7 +189,8 @@ static void rrdcalctemplate_delete_callback(const DICTIONARY_ITEM *item __maybe_ void rrdcalctemplate_index_init(RRDHOST *host) { if(!host->rrdcalctemplate_root_index) { - host->rrdcalctemplate_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + host->rrdcalctemplate_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDCALCTEMPLATE)); dictionary_register_insert_callback(host->rrdcalctemplate_root_index, rrdcalctemplate_insert_callback, NULL); dictionary_register_delete_callback(host->rrdcalctemplate_root_index, rrdcalctemplate_delete_callback, host); diff --git a/database/rrdcontext.c b/database/rrdcontext.c index 3413d1ea8..9fc605f32 100644 --- a/database/rrdcontext.c +++ b/database/rrdcontext.c @@ -221,8 +221,8 @@ typedef struct rrdmetric { RRDDIM *rrddim; - time_t first_time_t; - time_t last_time_t; + time_t first_time_s; + time_t last_time_s; RRD_FLAGS flags; struct rrdinstance *ri; @@ -240,10 +240,10 @@ typedef struct rrdinstance { RRDSET_TYPE chart_type; RRD_FLAGS flags; // flags related to this instance - time_t first_time_t; - time_t last_time_t; + time_t first_time_s; + time_t last_time_s; - int update_every; // data collection frequency + time_t update_every_s; // data collection frequency RRDSET *rrdset; // pointer to RRDSET when collected, or NULL DICTIONARY *rrdlabels; // linked to RRDSET->chart_labels or own version @@ -269,8 +269,8 @@ typedef struct rrdcontext { RRDSET_TYPE chart_type; RRD_FLAGS flags; - time_t first_time_t; - time_t last_time_t; + time_t first_time_s; + time_t last_time_s; VERSIONED_CONTEXT_DATA hub; @@ -522,20 +522,20 @@ static bool rrdmetric_conflict_callback(const DICTIONARY_ITEM *item __maybe_unus uuid_unparse(rm->uuid, uuid1); uuid_unparse(rm_new->uuid, uuid2); - time_t old_first_time_t = 0; - time_t old_last_time_t = 0; + time_t old_first_time_s = 0; + time_t old_last_time_s = 0; if(rrdmetric_update_retention(rm)) { - old_first_time_t = rm->first_time_t; - old_last_time_t = rm->last_time_t; + old_first_time_s = rm->first_time_s; + old_last_time_s = rm->last_time_s; } uuid_copy(rm->uuid, rm_new->uuid); - time_t new_first_time_t = 0; - time_t new_last_time_t = 0; + time_t new_first_time_s = 0; + time_t new_last_time_s = 0; if(rrdmetric_update_retention(rm)) { - new_first_time_t = rm->first_time_t; - new_last_time_t = rm->last_time_t; + new_first_time_s = rm->first_time_s; + new_last_time_s = rm->last_time_s; } internal_error(true, @@ -543,8 +543,8 @@ static bool rrdmetric_conflict_callback(const DICTIONARY_ITEM *item __maybe_unus , string2str(rm->id) , string2str(rm->ri->id) , rrdhost_hostname(rm->ri->rc->rrdhost) - , uuid1, old_first_time_t, old_last_time_t, old_last_time_t - old_first_time_t - , uuid2, new_first_time_t, new_last_time_t, new_last_time_t - new_first_time_t + , uuid1, old_first_time_s, old_last_time_s, old_last_time_s - old_first_time_s + , uuid2, new_first_time_s, new_last_time_s, new_last_time_s - new_first_time_s ); #else uuid_copy(rm->uuid, rm_new->uuid); @@ -576,13 +576,13 @@ static bool rrdmetric_conflict_callback(const DICTIONARY_ITEM *item __maybe_unus rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); } - if(!rm->first_time_t || (rm_new->first_time_t && rm_new->first_time_t < rm->first_time_t)) { - rm->first_time_t = rm_new->first_time_t; + if(!rm->first_time_s || (rm_new->first_time_s && rm_new->first_time_s < rm->first_time_s)) { + rm->first_time_s = rm_new->first_time_s; rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if(!rm->last_time_t || (rm_new->last_time_t && rm_new->last_time_t > rm->last_time_t)) { - rm->last_time_t = rm_new->last_time_t; + if(!rm->last_time_s || (rm_new->last_time_s && rm_new->last_time_s > rm->last_time_s)) { + rm->last_time_s = rm_new->last_time_s; rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -611,7 +611,9 @@ static void rrdmetrics_create_in_rrdinstance(RRDINSTANCE *ri) { if(unlikely(!ri)) return; if(likely(ri->rrdmetrics)) return; - ri->rrdmetrics = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + ri->rrdmetrics = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdcontext, sizeof(RRDMETRIC)); + dictionary_register_insert_callback(ri->rrdmetrics, rrdmetric_insert_callback, ri); dictionary_register_delete_callback(ri->rrdmetrics, rrdmetric_delete_callback, ri); dictionary_register_conflict_callback(ri->rrdmetrics, rrdmetric_conflict_callback, ri); @@ -750,11 +752,6 @@ static void rrdinstance_free(RRDINSTANCE *ri) { } static void rrdinstance_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext) { - static STRING *ml_anomaly_rates_id = NULL; - - if(unlikely(!ml_anomaly_rates_id)) - ml_anomaly_rates_id = string_strdupz(ML_ANOMALY_RATES_CHART_ID); - RRDINSTANCE *ri = value; // link it to its parent @@ -781,10 +778,6 @@ static void rrdinstance_insert_callback(const DICTIONARY_ITEM *item __maybe_unus ri->flags &= ~RRD_FLAG_HIDDEN; // no need of atomics at the constructor } - // we need this when loading from SQL - if(unlikely(ri->id == ml_anomaly_rates_id)) - ri->flags |= RRD_FLAG_HIDDEN; // no need of atomics at the constructor - rrdmetrics_create_in_rrdinstance(ri); // signal the react callback to do the job @@ -872,8 +865,8 @@ static bool rrdinstance_conflict_callback(const DICTIONARY_ITEM *item __maybe_un rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); } - if(ri->update_every != ri_new->update_every) { - ri->update_every = ri_new->update_every; + if(ri->update_every_s != ri_new->update_every_s) { + ri->update_every_s = ri_new->update_every_s; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); } @@ -923,7 +916,9 @@ static void rrdinstance_react_callback(const DICTIONARY_ITEM *item __maybe_unuse void rrdinstances_create_in_rrdcontext(RRDCONTEXT *rc) { if(unlikely(!rc || rc->rrdinstances)) return; - rc->rrdinstances = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + rc->rrdinstances = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdcontext, sizeof(RRDINSTANCE)); + dictionary_register_insert_callback(rc->rrdinstances, rrdinstance_insert_callback, rc); dictionary_register_delete_callback(rc->rrdinstances, rrdinstance_delete_callback, rc); dictionary_register_conflict_callback(rc->rrdinstances, rrdinstance_conflict_callback, rc); @@ -945,8 +940,8 @@ static void rrdinstance_trigger_updates(RRDINSTANCE *ri, const char *function) { ri->priority = st->priority; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); } - if(unlikely(st->update_every != ri->update_every)) { - ri->update_every = st->update_every; + if(unlikely(st->update_every != ri->update_every_s)) { + ri->update_every_s = st->update_every; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); } } @@ -989,7 +984,7 @@ static inline void rrdinstance_from_rrdset(RRDSET *st) { .title = string_dup(st->title), .chart_type = st->chart_type, .priority = st->priority, - .update_every = st->update_every, + .update_every_s = st->update_every, .flags = RRD_FLAG_NONE, // no need for atomics .rrdset = st, }; @@ -1027,8 +1022,8 @@ static inline void rrdinstance_from_rrdset(RRDSET *st) { RRDMETRIC *rm_old = rrdmetric_acquired_value(rd->rrdmetric); rrd_flags_replace(rm_old, RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); rm_old->rrddim = NULL; - rm_old->first_time_t = 0; - rm_old->last_time_t = 0; + rm_old->first_time_s = 0; + rm_old->last_time_s = 0; rrdmetric_release(rd->rrdmetric); rd->rrdmetric = NULL; @@ -1043,8 +1038,8 @@ static inline void rrdinstance_from_rrdset(RRDSET *st) { rrd_flags_replace(ri_old, RRD_FLAG_OWN_LABELS|RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); ri_old->rrdset = NULL; - ri_old->first_time_t = 0; - ri_old->last_time_t = 0; + ri_old->first_time_s = 0; + ri_old->last_time_s = 0; rrdinstance_trigger_updates(ri_old, __FUNCTION__ ); rrdinstance_release(ria_old); @@ -1054,8 +1049,8 @@ static inline void rrdinstance_from_rrdset(RRDSET *st) { if(!dictionary_entries(rc_old->rrdinstances) && !dictionary_stats_referenced_items(rc_old->rrdinstances)) { rrdcontext_lock(rc_old); rc_old->flags = ((rc_old->flags & RRD_FLAG_QUEUED)?RRD_FLAG_QUEUED:RRD_FLAG_NONE)|RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION; - rc_old->first_time_t = 0; - rc_old->last_time_t = 0; + rc_old->first_time_s = 0; + rc_old->last_time_s = 0; rrdcontext_unlock(rc_old); rrdcontext_trigger_updates(rc_old, __FUNCTION__ ); } @@ -1233,13 +1228,13 @@ static void rrdcontext_insert_callback(const DICTIONARY_ITEM *item __maybe_unuse rc->version = rc->hub.version; rc->priority = rc->hub.priority; - rc->first_time_t = (time_t)rc->hub.first_time_t; - rc->last_time_t = (time_t)rc->hub.last_time_t; + rc->first_time_s = (time_t)rc->hub.first_time_s; + rc->last_time_s = (time_t)rc->hub.last_time_s; - if(rc->hub.deleted || !rc->hub.first_time_t) + if(rc->hub.deleted || !rc->hub.first_time_s) rrd_flag_set_deleted(rc, RRD_FLAG_NONE); else { - if (rc->last_time_t == 0) + if (rc->last_time_s == 0) rrd_flag_set_collected(rc); else rrd_flag_set_archived(rc); @@ -1401,18 +1396,20 @@ void rrdhost_create_rrdcontexts(RRDHOST *host) { if(unlikely(!host)) return; if(likely(host->rrdctx)) return; - host->rrdctx = (RRDCONTEXTS *)dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + host->rrdctx = (RRDCONTEXTS *)dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdcontext, sizeof(RRDCONTEXT)); + dictionary_register_insert_callback((DICTIONARY *)host->rrdctx, rrdcontext_insert_callback, host); dictionary_register_delete_callback((DICTIONARY *)host->rrdctx, rrdcontext_delete_callback, host); dictionary_register_conflict_callback((DICTIONARY *)host->rrdctx, rrdcontext_conflict_callback, host); dictionary_register_react_callback((DICTIONARY *)host->rrdctx, rrdcontext_react_callback, host); - host->rrdctx_hub_queue = (RRDCONTEXTS *)dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE); + host->rrdctx_hub_queue = (RRDCONTEXTS *)dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE, &dictionary_stats_category_rrdcontext, 0); dictionary_register_insert_callback((DICTIONARY *)host->rrdctx_hub_queue, rrdcontext_hub_queue_insert_callback, NULL); dictionary_register_delete_callback((DICTIONARY *)host->rrdctx_hub_queue, rrdcontext_hub_queue_delete_callback, NULL); dictionary_register_conflict_callback((DICTIONARY *)host->rrdctx_hub_queue, rrdcontext_hub_queue_conflict_callback, NULL); - host->rrdctx_post_processing_queue = (RRDCONTEXTS *)dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE); + host->rrdctx_post_processing_queue = (RRDCONTEXTS *)dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE, &dictionary_stats_category_rrdcontext, 0); dictionary_register_insert_callback((DICTIONARY *)host->rrdctx_post_processing_queue, rrdcontext_post_processing_queue_insert_callback, NULL); dictionary_register_delete_callback((DICTIONARY *)host->rrdctx_post_processing_queue, rrdcontext_post_processing_queue_delete_callback, NULL); dictionary_register_conflict_callback((DICTIONARY *)host->rrdctx_post_processing_queue, rrdcontext_post_processing_queue_conflict_callback, NULL); @@ -1743,8 +1740,8 @@ struct rrdcontext_to_json { SIMPLE_PATTERN *chart_dimensions; size_t written; time_t now; - time_t combined_first_time_t; - time_t combined_last_time_t; + time_t combined_first_time_s; + time_t combined_last_time_s; RRD_FLAGS combined_flags; }; @@ -1760,10 +1757,10 @@ static inline int rrdmetric_to_json_callback(const DICTIONARY_ITEM *item, void * if(unlikely(rrd_flag_is_deleted(rm) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) return 0; - if(after && (!rm->last_time_t || after > rm->last_time_t)) + if(after && (!rm->last_time_s || after > rm->last_time_s)) return 0; - if(before && (!rm->first_time_t || before < rm->first_time_t)) + if(before && (!rm->first_time_s || before < rm->first_time_s)) return 0; if(t->chart_dimensions @@ -1773,14 +1770,14 @@ static inline int rrdmetric_to_json_callback(const DICTIONARY_ITEM *item, void * if(t->written) { buffer_strcat(wb, ",\n"); - t->combined_first_time_t = MIN(t->combined_first_time_t, rm->first_time_t); - t->combined_last_time_t = MAX(t->combined_last_time_t, rm->last_time_t); + t->combined_first_time_s = MIN(t->combined_first_time_s, rm->first_time_s); + t->combined_last_time_s = MAX(t->combined_last_time_s, rm->last_time_s); t->combined_flags |= rrd_flags_get(rm); } else { buffer_strcat(wb, "\n"); - t->combined_first_time_t = rm->first_time_t; - t->combined_last_time_t = rm->last_time_t; + t->combined_first_time_s = rm->first_time_s; + t->combined_last_time_s = rm->last_time_s; t->combined_flags = rrd_flags_get(rm); } @@ -1798,8 +1795,8 @@ static inline int rrdmetric_to_json_callback(const DICTIONARY_ITEM *item, void * ",\n\t\t\t\t\t\t\t\"last_time_t\":%lld" ",\n\t\t\t\t\t\t\t\"collected\":%s" , string2str(rm->name) - , (long long)rm->first_time_t - , rrd_flag_is_collected(rm) ? (long long)t->now : (long long)rm->last_time_t + , (long long)rm->first_time_s + , rrd_flag_is_collected(rm) ? (long long)t->now : (long long)rm->last_time_s , rrd_flag_is_collected(rm) ? "true" : "false" ); @@ -1835,10 +1832,10 @@ static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void if(unlikely(rrd_flag_is_deleted(ri) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) return 0; - if(after && (!ri->last_time_t || after > ri->last_time_t)) + if(after && (!ri->last_time_s || after > ri->last_time_s)) return 0; - if(before && (!ri->first_time_t || before < ri->first_time_t)) + if(before && (!ri->first_time_s || before < ri->first_time_s)) return 0; if(t_parent->chart_label_key && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, t_parent->chart_label_key, '\0')) @@ -1847,14 +1844,14 @@ static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void if(t_parent->chart_labels_filter && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, t_parent->chart_labels_filter, ':')) return 0; - time_t first_time_t = ri->first_time_t; - time_t last_time_t = ri->last_time_t; + time_t first_time_s = ri->first_time_s; + time_t last_time_s = ri->last_time_s; RRD_FLAGS flags = rrd_flags_get(ri); BUFFER *wb_metrics = NULL; if(options & RRDCONTEXT_OPTION_SHOW_METRICS || t_parent->chart_dimensions) { - wb_metrics = buffer_create(4096); + wb_metrics = buffer_create(4096, &netdata_buffers_statistics.buffers_api); struct rrdcontext_to_json t_metrics = { .wb = wb_metrics, @@ -1874,21 +1871,21 @@ static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void return 0; } - first_time_t = t_metrics.combined_first_time_t; - last_time_t = t_metrics.combined_last_time_t; + first_time_s = t_metrics.combined_first_time_s; + last_time_s = t_metrics.combined_last_time_s; flags = t_metrics.combined_flags; } if(t_parent->written) { buffer_strcat(wb, ",\n"); - t_parent->combined_first_time_t = MIN(t_parent->combined_first_time_t, first_time_t); - t_parent->combined_last_time_t = MAX(t_parent->combined_last_time_t, last_time_t); + t_parent->combined_first_time_s = MIN(t_parent->combined_first_time_s, first_time_s); + t_parent->combined_last_time_s = MAX(t_parent->combined_last_time_s, last_time_s); t_parent->combined_flags |= flags; } else { buffer_strcat(wb, "\n"); - t_parent->combined_first_time_t = first_time_t; - t_parent->combined_last_time_t = last_time_t; + t_parent->combined_first_time_s = first_time_s; + t_parent->combined_last_time_s = last_time_s; t_parent->combined_flags = flags; } @@ -1908,7 +1905,7 @@ static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void ",\n\t\t\t\t\t\"family\":\"%s\"" ",\n\t\t\t\t\t\"chart_type\":\"%s\"" ",\n\t\t\t\t\t\"priority\":%u" - ",\n\t\t\t\t\t\"update_every\":%d" + ",\n\t\t\t\t\t\"update_every\":%ld" ",\n\t\t\t\t\t\"first_time_t\":%lld" ",\n\t\t\t\t\t\"last_time_t\":%lld" ",\n\t\t\t\t\t\"collected\":%s" @@ -1919,9 +1916,9 @@ static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void , string2str(ri->family) , rrdset_type_name(ri->chart_type) , ri->priority - , ri->update_every - , (long long)first_time_t - , (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_t + , ri->update_every_s + , (long long)first_time_s + , (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_s , (flags & RRD_FLAG_COLLECTED) ? "true" : "false" ); @@ -1976,14 +1973,14 @@ static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void if(options & RRDCONTEXT_OPTION_DEEPSCAN) rrdcontext_recalculate_context_retention(rc, RRD_FLAG_NONE, false); - if(after && (!rc->last_time_t || after > rc->last_time_t)) + if(after && (!rc->last_time_s || after > rc->last_time_s)) return 0; - if(before && (!rc->first_time_t || before < rc->first_time_t)) + if(before && (!rc->first_time_s || before < rc->first_time_s)) return 0; - time_t first_time_t = rc->first_time_t; - time_t last_time_t = rc->last_time_t; + time_t first_time_s = rc->first_time_s; + time_t last_time_s = rc->last_time_s; RRD_FLAGS flags = rrd_flags_get(rc); BUFFER *wb_instances = NULL; @@ -1992,7 +1989,7 @@ static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void || t_parent->chart_labels_filter || t_parent->chart_dimensions) { - wb_instances = buffer_create(4096); + wb_instances = buffer_create(4096, &netdata_buffers_statistics.buffers_api); struct rrdcontext_to_json t_instances = { .wb = wb_instances, @@ -2012,8 +2009,8 @@ static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void return 0; } - first_time_t = t_instances.combined_first_time_t; - last_time_t = t_instances.combined_last_time_t; + first_time_s = t_instances.combined_first_time_s; + last_time_s = t_instances.combined_last_time_s; flags = t_instances.combined_flags; } @@ -2043,8 +2040,8 @@ static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void , string2str(rc->family) , rrdset_type_name(rc->chart_type) , rc->priority - , (long long)first_time_t - , (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_t + , (long long)first_time_s + , (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_s , (flags & RRD_FLAG_COLLECTED) ? "true" : "false" ); @@ -2220,7 +2217,7 @@ DICTIONARY *rrdcontext_all_metrics_to_dict(RRDHOST *host, SIMPLE_PATTERN *contex if(!host || !host->rrdctx) return NULL; - DICTIONARY *dict = dictionary_create(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_DONT_OVERWRITE_VALUE); + DICTIONARY *dict = dictionary_create_advanced(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_DONT_OVERWRITE_VALUE, &dictionary_stats_category_rrdcontext, 0); dictionary_register_insert_callback(dict, metric_entry_insert_callback, NULL); dictionary_register_delete_callback(dict, metric_entry_delete_callback, NULL); dictionary_register_conflict_callback(dict, metric_entry_conflict_callback, NULL); @@ -2328,11 +2325,25 @@ void query_target_release(QUERY_TARGET *qt) { string_freez(qt->query.array[i].chart.name); qt->query.array[i].chart.name = NULL; + // reset the plans + for(size_t p = 0; p < qt->query.array[i].plan.used; p++) { + internal_fatal(qt->query.array[i].plan.array[p].initialized && + !qt->query.array[i].plan.array[p].finalized, + "QUERY: left-over initialized plan"); + + qt->query.array[i].plan.array[p].initialized = false; + qt->query.array[i].plan.array[p].finalized = false; + } + qt->query.array[i].plan.used = 0; + + // reset the tiers for(size_t tier = 0; tier < storage_tiers ;tier++) { if(qt->query.array[i].tiers[tier].db_metric_handle) { STORAGE_ENGINE *eng = qt->query.array[i].tiers[tier].eng; eng->api.metric_release(qt->query.array[i].tiers[tier].db_metric_handle); qt->query.array[i].tiers[tier].db_metric_handle = NULL; + qt->query.array[i].tiers[tier].weight = 0; + qt->query.array[i].tiers[tier].eng = NULL; } } } @@ -2366,37 +2377,44 @@ void query_target_release(QUERY_TARGET *qt) { qt->contexts.used = 0; qt->hosts.used = 0; - qt->db.minimum_latest_update_every = 0; - qt->db.first_time_t = 0; - qt->db.last_time_t = 0; + qt->db.minimum_latest_update_every_s = 0; + qt->db.first_time_s = 0; + qt->db.last_time_s = 0; qt->id[0] = '\0'; qt->used = false; } void query_target_free(void) { - if(thread_query_target.used) - query_target_release(&thread_query_target); + QUERY_TARGET *qt = &thread_query_target; - freez(thread_query_target.query.array); - thread_query_target.query.array = NULL; - thread_query_target.query.size = 0; + if(qt->used) + query_target_release(qt); - freez(thread_query_target.metrics.array); - thread_query_target.metrics.array = NULL; - thread_query_target.metrics.size = 0; + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->query.size * sizeof(QUERY_METRIC), __ATOMIC_RELAXED); + freez(qt->query.array); + qt->query.array = NULL; + qt->query.size = 0; - freez(thread_query_target.instances.array); - thread_query_target.instances.array = NULL; - thread_query_target.instances.size = 0; + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *), __ATOMIC_RELAXED); + freez(qt->metrics.array); + qt->metrics.array = NULL; + qt->metrics.size = 0; - freez(thread_query_target.contexts.array); - thread_query_target.contexts.array = NULL; - thread_query_target.contexts.size = 0; + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *), __ATOMIC_RELAXED); + freez(qt->instances.array); + qt->instances.array = NULL; + qt->instances.size = 0; - freez(thread_query_target.hosts.array); - thread_query_target.hosts.array = NULL; - thread_query_target.hosts.size = 0; + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *), __ATOMIC_RELAXED); + freez(qt->contexts.array); + qt->contexts.array = NULL; + qt->contexts.size = 0; + + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->hosts.size * sizeof(RRDHOST *), __ATOMIC_RELAXED); + freez(qt->hosts.array); + qt->hosts.array = NULL; + qt->hosts.size = 0; } static void query_target_add_metric(QUERY_TARGET_LOCALS *qtl, RRDMETRIC_ACQUIRED *rma, RRDINSTANCE *ri, @@ -2408,69 +2426,73 @@ static void query_target_add_metric(QUERY_TARGET_LOCALS *qtl, RRDMETRIC_ACQUIRED return; if(qt->metrics.used == qt->metrics.size) { + size_t old_mem = qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *); qt->metrics.size = (qt->metrics.size) ? qt->metrics.size * 2 : 1; - qt->metrics.array = reallocz(qt->metrics.array, qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *)); + size_t new_mem = qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *); + qt->metrics.array = reallocz(qt->metrics.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); } qt->metrics.array[qt->metrics.used++] = rrdmetric_acquired_dup(rma); if(!queryable_instance) return; - time_t common_first_time_t = 0; - time_t common_last_time_t = 0; - time_t common_update_every = 0; + time_t common_first_time_s = 0; + time_t common_last_time_s = 0; + time_t common_update_every_s = 0; size_t tiers_added = 0; struct { STORAGE_ENGINE *eng; STORAGE_METRIC_HANDLE *db_metric_handle; - time_t db_first_time_t; - time_t db_last_time_t; - time_t db_update_every; + time_t db_first_time_s; + time_t db_last_time_s; + time_t db_update_every_s; } tier_retention[storage_tiers]; for (size_t tier = 0; tier < storage_tiers; tier++) { STORAGE_ENGINE *eng = qtl->host->db[tier].eng; tier_retention[tier].eng = eng; - tier_retention[tier].db_update_every = (time_t) (qtl->host->db[tier].tier_grouping * ri->update_every); + tier_retention[tier].db_update_every_s = (time_t) (qtl->host->db[tier].tier_grouping * ri->update_every_s); - if(rm->rrddim && rm->rrddim->tiers[tier] && rm->rrddim->tiers[tier]->db_metric_handle) - tier_retention[tier].db_metric_handle = eng->api.metric_dup(rm->rrddim->tiers[tier]->db_metric_handle); + if(rm->rrddim && rm->rrddim->tiers[tier].db_metric_handle) + tier_retention[tier].db_metric_handle = eng->api.metric_dup(rm->rrddim->tiers[tier].db_metric_handle); else tier_retention[tier].db_metric_handle = eng->api.metric_get(qtl->host->db[tier].instance, &rm->uuid); if(tier_retention[tier].db_metric_handle) { - tier_retention[tier].db_first_time_t = tier_retention[tier].eng->api.query_ops.oldest_time(tier_retention[tier].db_metric_handle); - tier_retention[tier].db_last_time_t = tier_retention[tier].eng->api.query_ops.latest_time(tier_retention[tier].db_metric_handle); + tier_retention[tier].db_first_time_s = tier_retention[tier].eng->api.query_ops.oldest_time_s(tier_retention[tier].db_metric_handle); + tier_retention[tier].db_last_time_s = tier_retention[tier].eng->api.query_ops.latest_time_s(tier_retention[tier].db_metric_handle); - if(!common_first_time_t) - common_first_time_t = tier_retention[tier].db_first_time_t; - else if(tier_retention[tier].db_first_time_t) - common_first_time_t = MIN(common_first_time_t, tier_retention[tier].db_first_time_t); + if(!common_first_time_s) + common_first_time_s = tier_retention[tier].db_first_time_s; + else if(tier_retention[tier].db_first_time_s) + common_first_time_s = MIN(common_first_time_s, tier_retention[tier].db_first_time_s); - if(!common_last_time_t) - common_last_time_t = tier_retention[tier].db_last_time_t; + if(!common_last_time_s) + common_last_time_s = tier_retention[tier].db_last_time_s; else - common_last_time_t = MAX(common_last_time_t, tier_retention[tier].db_last_time_t); + common_last_time_s = MAX(common_last_time_s, tier_retention[tier].db_last_time_s); - if(!common_update_every) - common_update_every = tier_retention[tier].db_update_every; - else if(tier_retention[tier].db_update_every) - common_update_every = MIN(common_update_every, tier_retention[tier].db_update_every); + if(!common_update_every_s) + common_update_every_s = tier_retention[tier].db_update_every_s; + else if(tier_retention[tier].db_update_every_s) + common_update_every_s = MIN(common_update_every_s, tier_retention[tier].db_update_every_s); tiers_added++; } else { - tier_retention[tier].db_first_time_t = 0; - tier_retention[tier].db_last_time_t = 0; - tier_retention[tier].db_update_every = 0; + tier_retention[tier].db_first_time_s = 0; + tier_retention[tier].db_last_time_s = 0; + tier_retention[tier].db_update_every_s = 0; } } bool release_retention = true; bool timeframe_matches = (tiers_added - && (common_first_time_t - common_update_every * 2) <= qt->window.before - && (common_last_time_t + common_update_every * 2) >= qt->window.after + && (common_first_time_s - common_update_every_s * 2) <= qt->window.before + && (common_last_time_s + common_update_every_s * 2) >= qt->window.after ) ? true : false; if(timeframe_matches) { @@ -2515,14 +2537,19 @@ static void query_target_add_metric(QUERY_TARGET_LOCALS *qtl, RRDMETRIC_ACQUIRED // let's add it to the query metrics if(ri->rrdset) - ri->rrdset->last_accessed_time = qtl->start_s; + ri->rrdset->last_accessed_time_s = qtl->start_s; if (qt->query.used == qt->query.size) { + size_t old_mem = qt->query.size * sizeof(QUERY_METRIC); qt->query.size = (qt->query.size) ? qt->query.size * 2 : 1; - qt->query.array = reallocz(qt->query.array, qt->query.size * sizeof(QUERY_METRIC)); + size_t new_mem = qt->query.size * sizeof(QUERY_METRIC); + qt->query.array = reallocz(qt->query.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); } QUERY_METRIC *qm = &qt->query.array[qt->query.used++]; + qm->plan.used = 0; qm->dimension.options = options; qm->link.host = qtl->host; @@ -2536,18 +2563,18 @@ static void query_target_add_metric(QUERY_TARGET_LOCALS *qtl, RRDMETRIC_ACQUIRED qm->dimension.id = string_dup(rm->id); qm->dimension.name = string_dup(rm->name); - if (!qt->db.first_time_t || common_first_time_t < qt->db.first_time_t) - qt->db.first_time_t = common_first_time_t; + if (!qt->db.first_time_s || common_first_time_s < qt->db.first_time_s) + qt->db.first_time_s = common_first_time_s; - if (!qt->db.last_time_t || common_last_time_t > qt->db.last_time_t) - qt->db.last_time_t = common_last_time_t; + if (!qt->db.last_time_s || common_last_time_s > qt->db.last_time_s) + qt->db.last_time_s = common_last_time_s; for (size_t tier = 0; tier < storage_tiers; tier++) { qm->tiers[tier].eng = tier_retention[tier].eng; qm->tiers[tier].db_metric_handle = tier_retention[tier].db_metric_handle; - qm->tiers[tier].db_first_time_t = tier_retention[tier].db_first_time_t; - qm->tiers[tier].db_last_time_t = tier_retention[tier].db_last_time_t; - qm->tiers[tier].db_update_every = tier_retention[tier].db_update_every; + qm->tiers[tier].db_first_time_s = tier_retention[tier].db_first_time_s; + qm->tiers[tier].db_last_time_s = tier_retention[tier].db_last_time_s; + qm->tiers[tier].db_update_every_s = tier_retention[tier].db_update_every_s; } release_retention = false; } @@ -2572,14 +2599,18 @@ static void query_target_add_instance(QUERY_TARGET_LOCALS *qtl, RRDINSTANCE_ACQU return; if(qt->instances.used == qt->instances.size) { + size_t old_mem = qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *); qt->instances.size = (qt->instances.size) ? qt->instances.size * 2 : 1; - qt->instances.array = reallocz(qt->instances.array, qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *)); + size_t new_mem = qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *); + qt->instances.array = reallocz(qt->instances.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); } qtl->ria = qt->instances.array[qt->instances.used++] = rrdinstance_acquired_dup(ria); - if(qt->db.minimum_latest_update_every == 0 || ri->update_every < qt->db.minimum_latest_update_every) - qt->db.minimum_latest_update_every = ri->update_every; + if(qt->db.minimum_latest_update_every_s == 0 || ri->update_every_s < qt->db.minimum_latest_update_every_s) + qt->db.minimum_latest_update_every_s = ri->update_every_s; if(queryable_instance) { if ((qt->instances.chart_label_key_pattern && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, qt->instances.chart_label_key_pattern, ':')) || @@ -2616,8 +2647,12 @@ static void query_target_add_context(QUERY_TARGET_LOCALS *qtl, RRDCONTEXT_ACQUIR return; if(qt->contexts.used == qt->contexts.size) { + size_t old_mem = qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *); qt->contexts.size = (qt->contexts.size) ? qt->contexts.size * 2 : 1; - qt->contexts.array = reallocz(qt->contexts.array, qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *)); + size_t new_mem = qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *); + qt->contexts.array = reallocz(qt->contexts.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); } qtl->rca = qt->contexts.array[qt->contexts.used++] = rrdcontext_acquired_dup(rca); @@ -2656,8 +2691,12 @@ static void query_target_add_host(QUERY_TARGET_LOCALS *qtl, RRDHOST *host) { QUERY_TARGET *qt = qtl->qt; if(qt->hosts.used == qt->hosts.size) { + size_t old_mem = qt->hosts.size * sizeof(RRDHOST *); qt->hosts.size = (qt->hosts.size) ? qt->hosts.size * 2 : 1; - qt->hosts.array = reallocz(qt->hosts.array, qt->hosts.size * sizeof(RRDHOST *)); + size_t new_mem = qt->hosts.size * sizeof(RRDHOST *); + qt->hosts.array = reallocz(qt->hosts.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); } qtl->host = qt->hosts.array[qt->hosts.used++] = host; @@ -2770,6 +2809,9 @@ void query_target_generate_name(QUERY_TARGET *qt) { } QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr) { + if(!service_running(ABILITY_DATA_QUERIES)) + return NULL; + QUERY_TARGET *qt = &thread_query_target; if(qt->used) @@ -2800,7 +2842,7 @@ QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr) { .charts_labels_filter = qt->request.charts_labels_filter, }; - qt->db.minimum_latest_update_every = 0; // it will be updated by query_target_add_query() + qt->db.minimum_latest_update_every_s = 0; // it will be updated by query_target_add_query() // prepare all the patterns qt->hosts.pattern = is_valid_sp(qtl.hosts) ? simple_pattern_create(qtl.hosts, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; @@ -2922,7 +2964,7 @@ static void rrdinstance_load_chart_callback(SQL_CHART_DATA *sc, void *data) { .family = string_strdupz(sc->family), .chart_type = sc->chart_type, .priority = sc->priority, - .update_every = sc->update_every, + .update_every_s = sc->update_every, .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics }; uuid_copy(tri.uuid, sc->chart_id); @@ -3012,7 +3054,7 @@ static uint64_t rrdcontext_version_hash_with_callback( // when the context is being collected, // rc->hub.last_time_t is already zero - hash += rc->hub.version + rc->hub.last_time_t - rc->hub.first_time_t; + hash += rc->hub.version + rc->hub.last_time_s - rc->hub.first_time_s; rrdcontext_unlock(rc); @@ -3057,17 +3099,16 @@ static bool rrdmetric_update_retention(RRDMETRIC *rm) { time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; if(rm->rrddim) { - min_first_time_t = rrddim_first_entry_t(rm->rrddim); - max_last_time_t = rrddim_last_entry_t(rm->rrddim); + min_first_time_t = rrddim_first_entry_s(rm->rrddim); + max_last_time_t = rrddim_last_entry_s(rm->rrddim); } -#ifdef ENABLE_DBENGINE - else if (dbengine_enabled) { + else { RRDHOST *rrdhost = rm->ri->rc->rrdhost; for (size_t tier = 0; tier < storage_tiers; tier++) { - if(!rrdhost->db[tier].instance) continue; + STORAGE_ENGINE *eng = rrdhost->db[tier].eng; time_t first_time_t, last_time_t; - if (rrdeng_metric_retention_by_uuid(rrdhost->db[tier].instance, &rm->uuid, &first_time_t, &last_time_t) == 0) { + if (eng->api.metric_retention_by_uuid(rrdhost->db[tier].instance, &rm->uuid, &first_time_t, &last_time_t)) { if (first_time_t < min_first_time_t) min_first_time_t = first_time_t; @@ -3076,17 +3117,15 @@ static bool rrdmetric_update_retention(RRDMETRIC *rm) { } } } - else { - // cannot get retention + + if((min_first_time_t == LONG_MAX || min_first_time_t == 0) && max_last_time_t == 0) return false; - } -#endif if(min_first_time_t == LONG_MAX) min_first_time_t = 0; if(min_first_time_t > max_last_time_t) { - internal_error(true, "RRDMETRIC: retention of '%s' is flipped", string2str(rm->id)); + internal_error(true, "RRDMETRIC: retention of '%s' is flipped, first_time_t = %ld, last_time_t = %ld", string2str(rm->id), min_first_time_t, max_last_time_t); time_t tmp = min_first_time_t; min_first_time_t = max_last_time_t; max_last_time_t = tmp; @@ -3094,17 +3133,17 @@ static bool rrdmetric_update_retention(RRDMETRIC *rm) { // check if retention changed - if (min_first_time_t != rm->first_time_t) { - rm->first_time_t = min_first_time_t; + if (min_first_time_t != rm->first_time_s) { + rm->first_time_s = min_first_time_t; rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if (max_last_time_t != rm->last_time_t) { - rm->last_time_t = max_last_time_t; + if (max_last_time_t != rm->last_time_s) { + rm->last_time_s = max_last_time_t; rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } - if(unlikely(!rm->first_time_t && !rm->last_time_t)) + if(unlikely(!rm->first_time_s && !rm->last_time_s)) rrd_flag_set_deleted(rm, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); rrd_flag_set(rm, RRD_FLAG_LIVE_RETENTION); @@ -3123,7 +3162,7 @@ static inline bool rrdmetric_should_be_deleted(RRDMETRIC *rm) { return false; rrdmetric_update_retention(rm); - if(rm->first_time_t || rm->last_time_t) + if(rm->first_time_s || rm->last_time_s) return false; return true; @@ -3145,7 +3184,7 @@ static inline bool rrdinstance_should_be_deleted(RRDINSTANCE *ri) { if(unlikely(dictionary_entries(ri->rrdmetrics) != 0)) return false; - if(ri->first_time_t || ri->last_time_t) + if(ri->first_time_s || ri->last_time_s) return false; return true; @@ -3164,7 +3203,7 @@ static inline bool rrdcontext_should_be_deleted(RRDCONTEXT *rc) { if(unlikely(dictionary_entries(rc->rrdinstances) != 0)) return false; - if(unlikely(rc->first_time_t || rc->last_time_t)) + if(unlikely(rc->first_time_s || rc->last_time_s)) return false; return true; @@ -3189,7 +3228,7 @@ static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jo RRDCONTEXT *rc; dfe_start_reentrant((DICTIONARY *)host->rrdctx, rc) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP); @@ -3197,7 +3236,7 @@ static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jo RRDINSTANCE *ri; dfe_start_reentrant(rc->rrdinstances, ri) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; RRDMETRIC *rm; dfe_start_write(ri->rrdmetrics, rm) { @@ -3313,7 +3352,7 @@ static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FL if(dictionary_entries(ri->rrdmetrics) > 0) { RRDMETRIC *rm; dfe_start_read((DICTIONARY *)ri->rrdmetrics, rm) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; RRD_FLAGS reason_to_pass = reason; if(rrd_flag_check(ri, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) @@ -3329,16 +3368,16 @@ static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FL continue; } - if(!currently_collected && rrd_flag_check(rm, RRD_FLAG_COLLECTED) && rm->first_time_t) + if(!currently_collected && rrd_flag_check(rm, RRD_FLAG_COLLECTED) && rm->first_time_s) currently_collected = true; metrics_active++; - if (rm->first_time_t && rm->first_time_t < min_first_time_t) - min_first_time_t = rm->first_time_t; + if (rm->first_time_s && rm->first_time_s < min_first_time_t) + min_first_time_t = rm->first_time_s; - if (rm->last_time_t && rm->last_time_t > max_last_time_t) - max_last_time_t = rm->last_time_t; + if (rm->last_time_s && rm->last_time_s > max_last_time_t) + max_last_time_t = rm->last_time_s; } dfe_done(rm); } @@ -3351,13 +3390,13 @@ static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FL if(unlikely(!metrics_active)) { // no metrics available - if(ri->first_time_t) { - ri->first_time_t = 0; + if(ri->first_time_s) { + ri->first_time_s = 0; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if(ri->last_time_t) { - ri->last_time_t = 0; + if(ri->last_time_s) { + ri->last_time_s = 0; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -3370,13 +3409,13 @@ static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FL min_first_time_t = 0; if (unlikely(min_first_time_t == 0 || max_last_time_t == 0)) { - if(ri->first_time_t) { - ri->first_time_t = 0; + if(ri->first_time_s) { + ri->first_time_s = 0; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if(ri->last_time_t) { - ri->last_time_t = 0; + if(ri->last_time_s) { + ri->last_time_s = 0; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -3386,13 +3425,13 @@ static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FL else { rrd_flag_clear(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - if (unlikely(ri->first_time_t != min_first_time_t)) { - ri->first_time_t = min_first_time_t; + if (unlikely(ri->first_time_s != min_first_time_t)) { + ri->first_time_s = min_first_time_t; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if (unlikely(ri->last_time_t != max_last_time_t)) { - ri->last_time_t = max_last_time_t; + if (unlikely(ri->last_time_s != max_last_time_t)) { + ri->last_time_s = max_last_time_t; rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -3413,6 +3452,8 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG if(worker_jobs) worker_is_busy(WORKER_JOB_PP_CONTEXT); + size_t min_priority_collected = LONG_MAX; + size_t min_priority_not_collected = LONG_MAX; size_t min_priority = LONG_MAX; time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; size_t instances_active = 0, instances_deleted = 0; @@ -3420,7 +3461,7 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG if(dictionary_entries(rc->rrdinstances) > 0) { RRDINSTANCE *ri; dfe_start_reentrant(rc->rrdinstances, ri) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; RRD_FLAGS reason_to_pass = reason; if(rrd_flag_check(rc, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) @@ -3439,7 +3480,7 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG continue; } - if(unlikely(!currently_collected && rrd_flag_is_collected(ri) && ri->first_time_t)) + if(unlikely(!currently_collected && rrd_flag_is_collected(ri) && ri->first_time_s)) currently_collected = true; internal_error(rc->units != ri->units, @@ -3449,16 +3490,31 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG instances_active++; - if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY && ri->priority < min_priority) - min_priority = ri->priority; + if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY) { + if(rrd_flag_check(ri, RRD_FLAG_COLLECTED)) { + if(ri->priority < min_priority_collected) + min_priority_collected = ri->priority; + } + else { + if(ri->priority < min_priority_not_collected) + min_priority_not_collected = ri->priority; + } + } - if (ri->first_time_t && ri->first_time_t < min_first_time_t) - min_first_time_t = ri->first_time_t; + if (ri->first_time_s && ri->first_time_s < min_first_time_t) + min_first_time_t = ri->first_time_s; - if (ri->last_time_t && ri->last_time_t > max_last_time_t) - max_last_time_t = ri->last_time_t; + if (ri->last_time_s && ri->last_time_s > max_last_time_t) + max_last_time_t = ri->last_time_s; } dfe_done(ri); + + if(min_priority_collected != LONG_MAX) + // use the collected priority + min_priority = min_priority_collected; + else + // use the non-collected priority + min_priority = min_priority_not_collected; } { @@ -3485,13 +3541,13 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG if(unlikely(!instances_active)) { // we had some instances, but they are gone now... - if(rc->first_time_t) { - rc->first_time_t = 0; + if(rc->first_time_s) { + rc->first_time_s = 0; rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if(rc->last_time_t) { - rc->last_time_t = 0; + if(rc->last_time_s) { + rc->last_time_s = 0; rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -3504,13 +3560,13 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG min_first_time_t = 0; if (unlikely(min_first_time_t == 0 && max_last_time_t == 0)) { - if(rc->first_time_t) { - rc->first_time_t = 0; + if(rc->first_time_s) { + rc->first_time_s = 0; rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if(rc->last_time_t) { - rc->last_time_t = 0; + if(rc->last_time_s) { + rc->last_time_s = 0; rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -3519,13 +3575,13 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG else { rrd_flag_clear(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - if (unlikely(rc->first_time_t != min_first_time_t)) { - rc->first_time_t = min_first_time_t; + if (unlikely(rc->first_time_s != min_first_time_t)) { + rc->first_time_s = min_first_time_t; rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); } - if (rc->last_time_t != max_last_time_t) { - rc->last_time_t = max_last_time_t; + if (rc->last_time_s != max_last_time_t) { + rc->last_time_s = max_last_time_t; rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); } @@ -3592,7 +3648,7 @@ static void rrdcontext_post_process_queued_contexts(RRDHOST *host) { RRDCONTEXT *rc; dfe_start_reentrant((DICTIONARY *)host->rrdctx_post_processing_queue, rc) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; rrdcontext_dequeue_from_post_processing(rc); rrdcontext_post_process_updates(rc, false, RRD_FLAG_NONE, true); @@ -3621,8 +3677,8 @@ static void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe rc->hub.family = string2str(rc->family); rc->hub.chart_type = rrdset_type_name(rc->chart_type); rc->hub.priority = rc->priority; - rc->hub.first_time_t = rc->first_time_t; - rc->hub.last_time_t = rrd_flag_is_collected(rc) ? 0 : rc->last_time_t; + rc->hub.first_time_s = rc->first_time_s; + rc->hub.last_time_s = rrd_flag_is_collected(rc) ? 0 : rc->last_time_s; rc->hub.deleted = rrd_flag_is_deleted(rc) ? true : false; #ifdef ENABLE_ACLK @@ -3634,8 +3690,8 @@ static void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe .family = rc->hub.family, .chart_type = rc->hub.chart_type, .priority = rc->hub.priority, - .first_entry = rc->hub.first_time_t, - .last_entry = rc->hub.last_time_t, + .first_entry = rc->hub.first_time_s, + .last_entry = rc->hub.last_time_s, .deleted = rc->hub.deleted, }; @@ -3689,10 +3745,10 @@ static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending _ if(unlikely(rc->priority != rc->hub.priority)) priority_changed = true; - if(unlikely((uint64_t)rc->first_time_t != rc->hub.first_time_t)) + if(unlikely((uint64_t)rc->first_time_s != rc->hub.first_time_s)) first_time_changed = true; - if(unlikely((uint64_t)((flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_t) != rc->hub.last_time_t)) + if(unlikely((uint64_t)((flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s) != rc->hub.last_time_s)) last_time_changed = true; if(unlikely(((flags & RRD_FLAG_DELETED) ? true : false) != rc->hub.deleted)) @@ -3711,8 +3767,8 @@ static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending _ string2str(rc->family), family_changed ? " (CHANGED)" : "", rrdset_type_name(rc->chart_type), chart_type_changed ? " (CHANGED)" : "", rc->priority, priority_changed ? " (CHANGED)" : "", - rc->first_time_t, first_time_changed ? " (CHANGED)" : "", - (flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_t, last_time_changed ? " (CHANGED)" : "", + rc->first_time_s, first_time_changed ? " (CHANGED)" : "", + (flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s, last_time_changed ? " (CHANGED)" : "", (flags & RRD_FLAG_DELETED) ? "true" : "false", deleted_changed ? " (CHANGED)" : "", sending ? (now_realtime_usec() - rc->queue.queued_ut) / USEC_PER_MS : 0, sending ? (rc->queue.scheduled_dispatch_ut - rc->queue.queued_ut) / USEC_PER_MS : 0 @@ -3773,7 +3829,7 @@ static void rrdcontext_dispatch_queued_contexts_to_hub(RRDHOST *host, usec_t now RRDCONTEXT *rc; dfe_start_reentrant((DICTIONARY *)host->rrdctx_hub_queue, rc) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; if(unlikely(messages_added >= MESSAGES_PER_BUNDLE_TO_SEND_TO_HUB_PER_HOST)) break; @@ -3840,7 +3896,7 @@ static void rrdcontext_dispatch_queued_contexts_to_hub(RRDHOST *host, usec_t now dfe_done(rc); #ifdef ENABLE_ACLK - if(!netdata_exit && bundle) { + if(service_running(SERVICE_CONTEXT) && bundle) { // we have a bundle to send messages // update the version hash @@ -3891,11 +3947,11 @@ void *rrdcontext_main(void *ptr) { heartbeat_init(&hb); usec_t step = RRDCONTEXT_WORKER_THREAD_HEARTBEAT_USEC; - while (!netdata_exit) { + while (service_running(SERVICE_CONTEXT)) { worker_is_idle(); heartbeat_next(&hb, step); - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; usec_t now_ut = now_realtime_usec(); @@ -3911,7 +3967,7 @@ void *rrdcontext_main(void *ptr) { rrd_rdlock(); RRDHOST *host; rrdhost_foreach_read(host) { - if(unlikely(netdata_exit)) break; + if(unlikely(!service_running(SERVICE_CONTEXT))) break; worker_is_busy(WORKER_JOB_HOSTS); diff --git a/database/rrdcontext.h b/database/rrdcontext.h index 67e6cf394..eae37036c 100644 --- a/database/rrdcontext.h +++ b/database/rrdcontext.h @@ -118,15 +118,37 @@ DICTIONARY *rrdcontext_all_metrics_to_dict(RRDHOST *host, SIMPLE_PATTERN *contex // ---------------------------------------------------------------------------- // public API for queries +typedef struct query_plan_entry { + size_t tier; + time_t after; + time_t before; + time_t expanded_after; + time_t expanded_before; + struct storage_engine_query_handle handle; + STORAGE_POINT (*next_metric)(struct storage_engine_query_handle *handle); + int (*is_finished)(struct storage_engine_query_handle *handle); + void (*finalize)(struct storage_engine_query_handle *handle); + bool initialized; + bool finalized; +} QUERY_PLAN_ENTRY; + +#define QUERY_PLANS_MAX (RRD_STORAGE_TIERS * 2) + typedef struct query_metric { struct query_metric_tier { struct storage_engine *eng; STORAGE_METRIC_HANDLE *db_metric_handle; - time_t db_first_time_t; // the oldest timestamp available for this tier - time_t db_last_time_t; // the latest timestamp available for this tier - time_t db_update_every; // latest update every for this tier + time_t db_first_time_s; // the oldest timestamp available for this tier + time_t db_last_time_s; // the latest timestamp available for this tier + time_t db_update_every_s; // latest update every for this tier + long weight; } tiers[RRD_STORAGE_TIERS]; + struct { + size_t used; + QUERY_PLAN_ENTRY array[QUERY_PLANS_MAX]; + } plan; + struct { RRDHOST *host; RRDCONTEXT_ACQUIRED *rca; @@ -172,6 +194,7 @@ typedef struct query_target_request { time_t resampling_time; size_t tier; QUERY_SOURCE query_source; + STORAGE_PRIORITY priority; } QUERY_TARGET_REQUEST; typedef struct query_target { @@ -198,9 +221,9 @@ typedef struct query_target { } window; struct { - time_t first_time_t; // the combined first_time_t of all metrics in the query, across all tiers - time_t last_time_t; // the combined last_time_T of all metrics in the query, across all tiers - time_t minimum_latest_update_every; // the min update every of the metrics in the query + time_t first_time_s; // the combined first_time_t of all metrics in the query, across all tiers + time_t last_time_s; // the combined last_time_T of all metrics in the query, across all tiers + time_t minimum_latest_update_every_s; // the min update every of the metrics in the query } db; struct { diff --git a/database/rrddim.c b/database/rrddim.c index 2d909a701..b8059b3c4 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -64,12 +64,15 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v size_t entries = st->entries; if(!entries) entries = 5; - rd->db = netdata_mmap(NULL, entries * sizeof(storage_number), MAP_PRIVATE, 1); + rd->db = netdata_mmap(NULL, entries * sizeof(storage_number), MAP_PRIVATE, 1, false, NULL); if(!rd->db) { info("Failed to use memory mode ram for chart '%s', dimension '%s', falling back to alloc", rrdset_name(st), rrddim_name(rd)); ctr->memory_mode = RRD_MEMORY_MODE_ALLOC; } - else rd->memsize = entries * sizeof(storage_number); + else { + rd->memsize = entries * sizeof(storage_number); + __atomic_add_fetch(&rrddim_db_memory_size, rd->memsize, __ATOMIC_RELAXED); + } } if(ctr->memory_mode == RRD_MEMORY_MODE_ALLOC || ctr->memory_mode == RRD_MEMORY_MODE_NONE) { @@ -78,42 +81,24 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v rd->db = rrddim_alloc_db(entries); rd->memsize = entries * sizeof(storage_number); + __atomic_add_fetch(&rrddim_db_memory_size, rd->memsize, __ATOMIC_RELAXED); } rd->rrd_memory_mode = ctr->memory_mode; - if (unlikely(rrdcontext_find_dimension_uuid(st, rrddim_id(rd), &(rd->metric_uuid)))) { + if (unlikely(rrdcontext_find_dimension_uuid(st, rrddim_id(rd), &(rd->metric_uuid)))) uuid_generate(rd->metric_uuid); - bool found_in_sql = false; (void)found_in_sql; - -// bool found_in_sql = true; -// if(unlikely(sql_find_dimension_uuid(st, rd, &rd->metric_uuid))) { -// found_in_sql = false; -// uuid_generate(rd->metric_uuid); -// } - -#ifdef NETDATA_INTERNAL_CHECKS - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower(rd->metric_uuid, uuid_str); - error_report("Dimension UUID for host %s chart [%s] dimension [%s] not found in context. It is now set to %s (%s)", - string2str(host->hostname), - string2str(st->name), - string2str(rd->name), - uuid_str, found_in_sql ? "found in sqlite" : "newly generated"); -#endif - } // initialize the db tiers { size_t initialized = 0; for(size_t tier = 0; tier < storage_tiers ; tier++) { STORAGE_ENGINE *eng = host->db[tier].eng; - rd->tiers[tier] = callocz(1, sizeof(struct rrddim_tier)); - rd->tiers[tier]->tier_grouping = host->db[tier].tier_grouping; - rd->tiers[tier]->collect_ops = &eng->api.collect_ops; - rd->tiers[tier]->query_ops = &eng->api.query_ops; - rd->tiers[tier]->db_metric_handle = eng->api.metric_get_or_create(rd, host->db[tier].instance); - storage_point_unset(rd->tiers[tier]->virtual_point); + rd->tiers[tier].tier_grouping = host->db[tier].tier_grouping; + rd->tiers[tier].collect_ops = &eng->api.collect_ops; + rd->tiers[tier].query_ops = &eng->api.query_ops; + rd->tiers[tier].db_metric_handle = eng->api.metric_get_or_create(rd, host->db[tier].instance); + storage_point_unset(rd->tiers[tier].virtual_point); initialized++; // internal_error(true, "TIER GROUPING of chart '%s', dimension '%s' for tier %d is set to %d", rd->rrdset->name, rd->name, tier, rd->tiers[tier]->tier_grouping); @@ -122,7 +107,7 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v if(!initialized) error("Failed to initialize all db tiers for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); - if(!rd->tiers[0]) + if(!rd->tiers[0].db_metric_handle) error("Failed to initialize the first db tier for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); } @@ -130,8 +115,8 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v { size_t initialized = 0; for (size_t tier = 0; tier < storage_tiers; tier++) { - if (rd->tiers[tier]) { - rd->tiers[tier]->db_collection_handle = rd->tiers[tier]->collect_ops->init(rd->tiers[tier]->db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); + if (rd->tiers[tier].db_metric_handle) { + rd->tiers[tier].db_collection_handle = rd->tiers[tier].collect_ops->init(rd->tiers[tier].db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); initialized++; } } @@ -172,7 +157,7 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); - ml_new_dimension(rd); + ml_dimension_new(rd); ctr->react_action = RRDDIM_REACT_NEW; @@ -181,6 +166,25 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v } +bool rrddim_finalize_collection_and_check_retention(RRDDIM *rd) { + size_t tiers_available = 0, tiers_said_no_retention = 0; + + for(size_t tier = 0; tier < storage_tiers ;tier++) { + if(!rd->tiers[tier].db_collection_handle) + continue; + + tiers_available++; + + if(rd->tiers[tier].collect_ops->finalize(rd->tiers[tier].db_collection_handle)) + tiers_said_no_retention++; + + rd->tiers[tier].db_collection_handle = NULL; + } + + // return true if the dimension has retention in the db + return (!tiers_said_no_retention || tiers_available > tiers_said_no_retention); +} + static void rrddim_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrddim, void *rrdset) { RRDDIM *rd = rrddim; RRDSET *st = rrdset; @@ -191,23 +195,11 @@ static void rrddim_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v rrdcontext_removed_rrddim(rd); - ml_delete_dimension(rd); + ml_dimension_delete(rd); debug(D_RRD_CALLS, "rrddim_free() %s.%s", rrdset_name(st), rrddim_name(rd)); - size_t tiers_available = 0, tiers_said_no_retention = 0; - for(size_t tier = 0; tier < storage_tiers ;tier++) { - if(rd->tiers[tier] && rd->tiers[tier]->db_collection_handle) { - tiers_available++; - - if(rd->tiers[tier]->collect_ops->finalize(rd->tiers[tier]->db_collection_handle)) - tiers_said_no_retention++; - - rd->tiers[tier]->db_collection_handle = NULL; - } - } - - if (tiers_available == tiers_said_no_retention && tiers_said_no_retention && rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { + if (!rrddim_finalize_collection_and_check_retention(rd) && rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { /* This metric has no data and no references */ metaqueue_delete_dimension_uuid(&rd->metric_uuid); } @@ -224,16 +216,16 @@ static void rrddim_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v rrddim_memory_file_free(rd); for(size_t tier = 0; tier < storage_tiers ;tier++) { - if(!rd->tiers[tier]) continue; + if(!rd->tiers[tier].db_metric_handle) continue; STORAGE_ENGINE* eng = host->db[tier].eng; - eng->api.metric_release(rd->tiers[tier]->db_metric_handle); - - freez(rd->tiers[tier]); - rd->tiers[tier] = NULL; + eng->api.metric_release(rd->tiers[tier].db_metric_handle); + rd->tiers[tier].db_metric_handle = NULL; } if(rd->db) { + __atomic_sub_fetch(&rrddim_db_memory_size, rd->memsize, __ATOMIC_RELAXED); + if(rd->rrd_memory_mode == RRD_MEMORY_MODE_RAM) netdata_munmap(rd->db, rd->memsize); else @@ -259,9 +251,9 @@ static bool rrddim_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, rc += rrddim_set_divisor(st, rd, ctr->divisor); for(size_t tier = 0; tier < storage_tiers ;tier++) { - if (rd->tiers[tier] && !rd->tiers[tier]->db_collection_handle) - rd->tiers[tier]->db_collection_handle = - rd->tiers[tier]->collect_ops->init(rd->tiers[tier]->db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); + if (!rd->tiers[tier].db_collection_handle) + rd->tiers[tier].db_collection_handle = + rd->tiers[tier].collect_ops->init(rd->tiers[tier].db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); } if(rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { @@ -285,7 +277,6 @@ static void rrddim_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo if(ctr->react_action & (RRDDIM_REACT_UPDATED | RRDDIM_REACT_NEW)) { rrddim_flag_set(rd, RRDDIM_FLAG_METADATA_UPDATE); - rrdset_flag_set(rd->rrdset, RRDSET_FLAG_METADATA_UPDATE); rrdhost_flag_set(rd->rrdset->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); } @@ -300,7 +291,8 @@ static void rrddim_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo void rrddim_index_init(RRDSET *st) { if(!st->rrddim_root_index) { - st->rrddim_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + st->rrddim_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdset_rrddim, sizeof(RRDDIM)); dictionary_register_insert_callback(st->rrddim_root_index, rrddim_insert_callback, NULL); dictionary_register_conflict_callback(st->rrddim_root_index, rrddim_conflict_callback, NULL); @@ -420,38 +412,45 @@ inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, collected_number divisor) // ---------------------------------------------------------------------------- +time_t rrddim_last_entry_s_of_tier(RRDDIM *rd, size_t tier) { + if(unlikely(tier > storage_tiers || !rd->tiers[tier].db_metric_handle)) + return 0; + + return rd->tiers[tier].query_ops->latest_time_s(rd->tiers[tier].db_metric_handle); +} + // get the timestamp of the last entry in the round-robin database -time_t rrddim_last_entry_t(RRDDIM *rd) { - time_t latest = rd->tiers[0]->query_ops->latest_time(rd->tiers[0]->db_metric_handle); +time_t rrddim_last_entry_s(RRDDIM *rd) { + time_t latest_time_s = rrddim_last_entry_s_of_tier(rd, 0); for(size_t tier = 1; tier < storage_tiers ;tier++) { - if(unlikely(!rd->tiers[tier])) continue; + if(unlikely(!rd->tiers[tier].db_metric_handle)) continue; - time_t t = rd->tiers[tier]->query_ops->latest_time(rd->tiers[tier]->db_metric_handle); - if(t > latest) - latest = t; + time_t t = rrddim_last_entry_s_of_tier(rd, tier); + if(t > latest_time_s) + latest_time_s = t; } - return latest; + return latest_time_s; } -time_t rrddim_first_entry_t_of_tier(RRDDIM *rd, size_t tier) { - if(unlikely(tier > storage_tiers || !rd->tiers[tier])) +time_t rrddim_first_entry_s_of_tier(RRDDIM *rd, size_t tier) { + if(unlikely(tier > storage_tiers || !rd->tiers[tier].db_metric_handle)) return 0; - return rd->tiers[tier]->query_ops->oldest_time(rd->tiers[tier]->db_metric_handle); + return rd->tiers[tier].query_ops->oldest_time_s(rd->tiers[tier].db_metric_handle); } -time_t rrddim_first_entry_t(RRDDIM *rd) { - time_t oldest = 0; +time_t rrddim_first_entry_s(RRDDIM *rd) { + time_t oldest_time_s = 0; for(size_t tier = 0; tier < storage_tiers ;tier++) { - time_t t = rrddim_first_entry_t_of_tier(rd, tier); - if(t != 0 && (oldest == 0 || t < oldest)) - oldest = t; + time_t t = rrddim_first_entry_s_of_tier(rd, tier); + if(t != 0 && (oldest_time_s == 0 || t < oldest_time_s)) + oldest_time_s = t; } - return oldest; + return oldest_time_s; } RRDDIM *rrddim_add_custom(RRDSET *st @@ -498,8 +497,8 @@ int rrddim_hide(RRDSET *st, const char *id) { return 1; } if (!rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN)) { - rrddim_flag_set(rd, RRDDIM_FLAG_META_HIDDEN); - metaqueue_dimension_update_flags(rd); + rrddim_flag_set(rd, RRDDIM_FLAG_META_HIDDEN | RRDDIM_FLAG_METADATA_UPDATE); + rrdhost_flag_set(rd->rrdset->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); } rrddim_option_set(rd, RRDDIM_OPTION_HIDDEN); @@ -518,7 +517,8 @@ int rrddim_unhide(RRDSET *st, const char *id) { } if (rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN)) { rrddim_flag_clear(rd, RRDDIM_FLAG_META_HIDDEN); - metaqueue_dimension_update_flags(rd); + rrddim_flag_set(rd, RRDDIM_FLAG_METADATA_UPDATE); + rrdhost_flag_set(rd->rrdset->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); } rrddim_option_clear(rd, RRDDIM_OPTION_HIDDEN); @@ -650,6 +650,7 @@ void rrddim_memory_file_free(RRDDIM *rd) { rrddim_memory_file_update(rd); struct rrddim_map_save_v019 *rd_on_file = rd->rd_on_file; + __atomic_sub_fetch(&rrddim_db_memory_size, rd_on_file->memsize + strlen(rd_on_file->cache_filename), __ATOMIC_RELAXED); freez(rd_on_file->cache_filename); netdata_munmap(rd_on_file, rd_on_file->memsize); @@ -686,10 +687,10 @@ bool rrddim_memory_load_or_create_map_save(RRDSET *st, RRDDIM *rd, RRD_MEMORY_MO char filename[FILENAME_MAX + 1]; char fullfilename[FILENAME_MAX + 1]; rrdset_strncpyz_name(filename, rrddim_id(rd), FILENAME_MAX); - snprintfz(fullfilename, FILENAME_MAX, "%s/%s.db", st->cache_dir, filename); + snprintfz(fullfilename, FILENAME_MAX, "%s/%s.db", rrdset_cache_dir(st), filename); - rd_on_file = (struct rrddim_map_save_v019 *)netdata_mmap(fullfilename, size, - ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), 1); + rd_on_file = (struct rrddim_map_save_v019 *)netdata_mmap( + fullfilename, size, ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), 1, false, NULL); if(unlikely(!rd_on_file)) return false; @@ -747,6 +748,8 @@ bool rrddim_memory_load_or_create_map_save(RRDSET *st, RRDDIM *rd, RRD_MEMORY_MO rd_on_file->rrd_memory_mode = memory_mode; rd_on_file->cache_filename = strdupz(fullfilename); + __atomic_add_fetch(&rrddim_db_memory_size, rd_on_file->memsize + strlen(rd_on_file->cache_filename), __ATOMIC_RELAXED); + rd->db = &rd_on_file->values[0]; rd->rd_on_file = rd_on_file; rd->memsize = size; diff --git a/database/rrddimvar.c b/database/rrddimvar.c index 449ceeb93..da8b939ce 100644 --- a/database/rrddimvar.c +++ b/database/rrddimvar.c @@ -65,7 +65,7 @@ static inline void rrddimvar_free_variables_unsafe(RRDDIMVAR *rs) { // HOST VARIABLES FOR THIS DIMENSION - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rrdvar_release_and_del(host->rrdvars, rs->rrdvar_host_chart_id_dim_id); rs->rrdvar_host_chart_id_dim_id = NULL; @@ -152,7 +152,7 @@ static inline void rrddimvar_update_variables_unsafe(RRDDIMVAR *rs) { // - $chart-name.id // - $chart-name.name - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rs->rrdvar_host_chart_id_dim_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id_dim_id, rs->type, RRDVAR_FLAG_NONE, rs->value); rs->rrdvar_host_chart_id_dim_name = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id_dim_name, rs->type, RRDVAR_FLAG_NONE, rs->value); rs->rrdvar_host_chart_name_dim_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_name_dim_id, rs->type, RRDVAR_FLAG_NONE, rs->value); @@ -214,7 +214,8 @@ static void rrddimvar_delete_callback(const DICTIONARY_ITEM *item __maybe_unused void rrddimvar_index_init(RRDSET *st) { if(!st->rrddimvar_root_index) { - st->rrddimvar_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + st->rrddimvar_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDDIMVAR)); dictionary_register_insert_callback(st->rrddimvar_root_index, rrddimvar_insert_callback, NULL); dictionary_register_conflict_callback(st->rrddimvar_root_index, rrddimvar_conflict_callback, NULL); diff --git a/database/rrdfamily.c b/database/rrdfamily.c index e7d1536c8..011cb28b4 100644 --- a/database/rrdfamily.c +++ b/database/rrdfamily.c @@ -33,7 +33,8 @@ static void rrdfamily_delete_callback(const DICTIONARY_ITEM *item __maybe_unused void rrdfamily_index_init(RRDHOST *host) { if(!host->rrdfamily_root_index) { - host->rrdfamily_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + host->rrdfamily_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDFAMILY)); dictionary_register_insert_callback(host->rrdfamily_root_index, rrdfamily_insert_callback, NULL); dictionary_register_delete_callback(host->rrdfamily_root_index, rrdfamily_delete_callback, host); diff --git a/database/rrdfunctions.c b/database/rrdfunctions.c index fb847a356..a8341f87e 100644 --- a/database/rrdfunctions.c +++ b/database/rrdfunctions.c @@ -424,7 +424,9 @@ static bool rrd_functions_conflict_callback(const DICTIONARY_ITEM *item __maybe_ void rrdfunctions_init(RRDHOST *host) { if(host->functions) return; - host->functions = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + host->functions = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_functions, sizeof(struct rrd_collector_function)); + dictionary_register_insert_callback(host->functions, rrd_functions_insert_callback, host); dictionary_register_delete_callback(host->functions, rrd_functions_delete_callback, host); dictionary_register_conflict_callback(host->functions, rrd_functions_conflict_callback, host); @@ -629,7 +631,7 @@ int rrd_call_function_and_wait(RRDHOST *host, BUFFER *wb, int timeout, const cha pthread_cond_init(&tmp->cond, NULL); bool we_should_free = true; - BUFFER *temp_wb = buffer_create(PLUGINSD_LINE_MAX + 1); // we need it because we may give up on it + BUFFER *temp_wb = buffer_create(PLUGINSD_LINE_MAX + 1, &netdata_buffers_statistics.buffers_functions); // we need it because we may give up on it temp_wb->contenttype = wb->contenttype; code = rdcf->function(temp_wb, timeout, key, rdcf->collector_data, rrd_call_function_signal_when_ready, tmp); if (code == HTTP_RESP_OK) { @@ -668,13 +670,14 @@ int rrd_call_function_and_wait(RRDHOST *host, BUFFER *wb, int timeout, const cha netdata_mutex_unlock(&tmp->mutex); } else { - buffer_free(temp_wb); if(!buffer_strlen(wb)) rrd_call_function_error(wb, "Failed to send request to the collector.", code); } - if (we_should_free) + if (we_should_free) { rrd_function_call_wait_free(tmp); + buffer_free(temp_wb); + } } return code; diff --git a/database/rrdhost.c b/database/rrdhost.c index 5ba13d47b..60b14c13c 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -3,8 +3,11 @@ #define NETDATA_RRD_INTERNALS #include "rrd.h" +static void rrdhost_streaming_sender_structures_init(RRDHOST *host); + bool dbengine_enabled = false; // will become true if and when dbengine is initialized size_t storage_tiers = 3; +bool use_direct_io = true; size_t storage_tiers_grouping_iterations[RRD_STORAGE_TIERS] = { 1, 60, 60, 60, 60 }; RRD_BACKFILL storage_tiers_backfill[RRD_STORAGE_TIERS] = { RRD_BACKFILL_NEW, RRD_BACKFILL_NEW, RRD_BACKFILL_NEW, RRD_BACKFILL_NEW, RRD_BACKFILL_NEW }; @@ -24,18 +27,15 @@ size_t get_tier_grouping(size_t tier) { } RRDHOST *localhost = NULL; -size_t rrd_hosts_available = 0; netdata_rwlock_t rrd_rwlock = NETDATA_RWLOCK_INITIALIZER; -time_t rrdset_free_obsolete_time = 3600; -time_t rrdhost_free_orphan_time = 3600; +time_t rrdset_free_obsolete_time_s = 3600; +time_t rrdhost_free_orphan_time_s = 3600; -bool is_storage_engine_shared(STORAGE_INSTANCE *engine) { +bool is_storage_engine_shared(STORAGE_INSTANCE *engine __maybe_unused) { #ifdef ENABLE_DBENGINE - for(size_t tier = 0; tier < storage_tiers ;tier++) { - if (engine == (STORAGE_INSTANCE *)multidb_ctx[tier]) - return true; - } + if(!rrdeng_is_legacy(engine)) + return true; #endif return false; @@ -50,20 +50,22 @@ static DICTIONARY *rrdhost_root_index_hostname = NULL; static inline void rrdhost_init() { if(unlikely(!rrdhost_root_index)) { - rrdhost_root_index = dictionary_create( - DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); + rrdhost_root_index = dictionary_create_advanced( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE, + &dictionary_stats_category_rrdhost, 0); } if(unlikely(!rrdhost_root_index_hostname)) { - rrdhost_root_index_hostname = dictionary_create( - DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); + rrdhost_root_index_hostname = dictionary_create_advanced( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE, + &dictionary_stats_category_rrdhost, 0); } } // ---------------------------------------------------------------------------- // RRDHOST index by UUID -inline long rrdhost_hosts_available(void) { +inline size_t rrdhost_hosts_available(void) { return dictionary_entries(rrdhost_root_index); } @@ -139,7 +141,7 @@ static inline void rrdhost_init_tags(RRDHOST *host, const char *tags) { string_freez(old); } -static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname) { +static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname, bool add_to_index) { if(unlikely(hostname && !*hostname)) hostname = NULL; if(host->hostname && hostname && !strcmp(rrdhost_hostname(host), hostname)) @@ -151,7 +153,8 @@ static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname) { host->hostname = string_strdupz(hostname?hostname:"localhost"); string_freez(old); - rrdhost_index_add_hostname(host); + if(add_to_index) + rrdhost_index_add_hostname(host); } static inline void rrdhost_init_os(RRDHOST *host, const char *os) { @@ -211,7 +214,7 @@ static void rrdhost_initialize_rrdpush_sender(RRDHOST *host, if(rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key) { rrdhost_flag_set(host, RRDHOST_FLAG_RRDPUSH_SENDER_INITIALIZED); - sender_init(host); + rrdhost_streaming_sender_structures_init(host); #ifdef ENABLE_HTTPS host->sender->ssl.conn = NULL; @@ -230,35 +233,34 @@ static void rrdhost_initialize_rrdpush_sender(RRDHOST *host, rrdhost_option_clear(host, RRDHOST_OPTION_SENDER_ENABLED); } -RRDHOST *rrdhost_create(const char *hostname, - const char *registry_hostname, - const char *guid, - const char *os, - const char *timezone, - const char *abbrev_timezone, - int32_t utc_offset, - const char *tags, - const char *program_name, - const char *program_version, - int update_every, - long entries, - RRD_MEMORY_MODE memory_mode, - unsigned int health_enabled, - unsigned int rrdpush_enabled, - char *rrdpush_destination, - char *rrdpush_api_key, - char *rrdpush_send_charts_matching, - bool rrdpush_enable_replication, - time_t rrdpush_seconds_to_replicate, - time_t rrdpush_replication_step, - struct rrdhost_system_info *system_info, - int is_localhost, - bool archived +static RRDHOST *rrdhost_create( + const char *hostname, + const char *registry_hostname, + const char *guid, + const char *os, + const char *timezone, + const char *abbrev_timezone, + int32_t utc_offset, + const char *tags, + const char *program_name, + const char *program_version, + int update_every, + long entries, + RRD_MEMORY_MODE memory_mode, + unsigned int health_enabled, + unsigned int rrdpush_enabled, + char *rrdpush_destination, + char *rrdpush_api_key, + char *rrdpush_send_charts_matching, + bool rrdpush_enable_replication, + time_t rrdpush_seconds_to_replicate, + time_t rrdpush_replication_step, + struct rrdhost_system_info *system_info, + int is_localhost, + bool archived ) { debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid); - rrd_check_wrlock(); - if(memory_mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled) { error("memory mode 'dbengine' is not enabled, but host '%s' is configured for it. Falling back to 'alloc'", hostname); memory_mode = RRD_MEMORY_MODE_ALLOC; @@ -272,16 +274,17 @@ int is_legacy = 1; int is_in_multihost = (memory_mode == RRD_MEMORY_MODE_DBENGINE && !is_legacy); RRDHOST *host = callocz(1, sizeof(RRDHOST)); + __atomic_add_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED); strncpyz(host->machine_guid, guid, GUID_LEN + 1); set_host_properties(host, (update_every > 0)?update_every:1, memory_mode, registry_hostname, os, tags, timezone, abbrev_timezone, utc_offset, program_name, program_version); - rrdhost_init_hostname(host, hostname); + rrdhost_init_hostname(host, hostname, false); - host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries); - host->health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled; + host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries); + host->health.health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled; if (likely(!archived)) { rrdfunctions_init(host); @@ -312,7 +315,6 @@ int is_legacy = 1; break; } - netdata_rwlock_init(&host->rrdhost_rwlock); netdata_mutex_init(&host->aclk_state_lock); netdata_mutex_init(&host->receiver_lock); @@ -356,27 +358,15 @@ int is_legacy = 1; if(!host->rrdvars) host->rrdvars = rrdvariables_create(); - RRDHOST *t = rrdhost_index_add_by_guid(host); - if(t != host) { - error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid); - rrdhost_free(host, 1); - return NULL; - } - - if (likely(!uuid_parse(host->machine_guid, host->host_uuid))) { - if(!archived) - metaqueue_host_update_info(host->machine_guid); + if (likely(!uuid_parse(host->machine_guid, host->host_uuid))) sql_load_node_id(host); - } else error_report("Host machine GUID %s is not valid", host->machine_guid); rrdfamily_index_init(host); rrdcalctemplate_index_init(host); rrdcalc_rrdhost_index_init(host); - - if (health_enabled) - health_thread_spawn(host); + metaqueue_host_update_info(host); if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { #ifdef ENABLE_DBENGINE @@ -385,9 +375,12 @@ int is_legacy = 1; snprintfz(dbenginepath, FILENAME_MAX, "%s/dbengine", host->cache_dir); ret = mkdir(dbenginepath, 0775); + if (ret != 0 && errno != EEXIST) error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), dbenginepath); - else ret = 0; // succeed + else + ret = 0; // succeed + if (is_legacy) { // initialize legacy dbengine instance as needed @@ -396,16 +389,17 @@ int is_legacy = 1; host->db[0].tier_grouping = get_tier_grouping(0); ret = rrdeng_init( - host, (struct rrdengine_instance **)&host->db[0].instance, dbenginepath, - default_rrdeng_page_cache_mb, default_rrdeng_disk_quota_mb, 0); // may fail here for legacy dbengine initialization if(ret == 0) { + rrdeng_readiness_wait((struct rrdengine_instance *)host->db[0].instance); + // assign the rest of the shared storage instances to it // to allow them collect its metrics too + for(size_t tier = 1; tier < storage_tiers ; tier++) { host->db[tier].mode = RRD_MEMORY_MODE_DBENGINE; host->db[tier].eng = storage_engine_get(host->db[tier].mode); @@ -422,15 +416,17 @@ int is_legacy = 1; host->db[tier].tier_grouping = get_tier_grouping(tier); } } + if (ret) { // check legacy or multihost initialization success error( "Host '%s': cannot initialize host with machine guid '%s'. Failed to initialize DB engine at '%s'.", rrdhost_hostname(host), host->machine_guid, host->cache_dir); - rrdhost_free(host, 1); - host = NULL; - //rrd_hosts_available++; //TODO: maybe we want this? - return host; + rrd_wrlock(); + rrdhost_free___while_having_rrd_wrlock(host, true); + rrd_unlock(); + + return NULL; } #else @@ -454,14 +450,6 @@ int is_legacy = 1; #endif } - // ------------------------------------------------------------------------ - // link it and add it to the index - - if(is_localhost) - DOUBLE_LINKED_LIST_PREPEND_UNSAFE(localhost, host, prev, next); - else - DOUBLE_LINKED_LIST_APPEND_UNSAFE(localhost, host, prev, next); - // ------------------------------------------------------------------------ // init new ML host and update system_info to let upstreams know // about ML functionality @@ -473,6 +461,30 @@ int is_legacy = 1; host->system_info->mc_version = enable_metric_correlations ? metric_correlations_version : 0; } + // ------------------------------------------------------------------------ + // link it and add it to the index + + rrd_wrlock(); + + RRDHOST *t = rrdhost_index_add_by_guid(host); + if(t != host) { + error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid); + rrdhost_free___while_having_rrd_wrlock(host, true); + rrd_unlock(); + return NULL; + } + + rrdhost_index_add_hostname(host); + + if(is_localhost) + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(localhost, host, prev, next); + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(localhost, host, prev, next); + + rrd_unlock(); + + // ------------------------------------------------------------------------ + info("Host '%s' (at registry as '%s') with guid '%s' initialized" ", os '%s'" ", timezone '%s'" @@ -487,7 +499,6 @@ int is_legacy = 1; ", health %s" ", cache_dir '%s'" ", varlib_dir '%s'" - ", health_log '%s'" ", alarms default handler '%s'" ", alarms default recipient '%s'" , rrdhost_hostname(host) @@ -504,29 +515,27 @@ int is_legacy = 1; , rrdhost_has_rrdpush_sender_enabled(host)?"enabled":"disabled" , host->rrdpush_send_destination?host->rrdpush_send_destination:"" , host->rrdpush_send_api_key?host->rrdpush_send_api_key:"" - , host->health_enabled?"enabled":"disabled" + , host->health.health_enabled?"enabled":"disabled" , host->cache_dir , host->varlib_dir - , host->health_log_filename - , string2str(host->health_default_exec) - , string2str(host->health_default_recipient) + , string2str(host->health.health_default_exec) + , string2str(host->health.health_default_recipient) ); - if(!archived) - metaqueue_host_update_system_info(host); - rrd_hosts_available++; + if(!archived) + rrdhost_flag_set(host,RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); rrdhost_load_rrdcontext_data(host); - if (!archived) - ml_new_host(host); - else - rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED); - + if (!archived) { + ml_host_new(host); + ml_start_anomaly_detection_threads(host); + } else + rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN); return host; } -void rrdhost_update(RRDHOST *host +static void rrdhost_update(RRDHOST *host , const char *hostname , const char *registry_hostname , const char *guid @@ -553,11 +562,16 @@ void rrdhost_update(RRDHOST *host { UNUSED(guid); - host->health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled; + netdata_spinlock_lock(&host->rrdhost_update_lock); - rrdhost_system_info_free(host->system_info); - host->system_info = system_info; - metaqueue_host_update_system_info(host); + host->health.health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled; + + { + struct rrdhost_system_info *old = host->system_info; + host->system_info = system_info; + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_CLAIMID | RRDHOST_FLAG_METADATA_UPDATE); + rrdhost_system_info_free(old); + } rrdhost_init_os(host, os); rrdhost_init_timezone(host, timezone, abbrev_timezone, utc_offset); @@ -567,7 +581,7 @@ void rrdhost_update(RRDHOST *host if(strcmp(rrdhost_hostname(host), hostname) != 0) { info("Host '%s' has been renamed to '%s'. If this is not intentional it may mean multiple hosts are using the same machine_guid.", rrdhost_hostname(host), hostname); - rrdhost_init_hostname(host, hostname); + rrdhost_init_hostname(host, hostname, true); } if(strcmp(rrdhost_program_name(host), program_name) != 0) { @@ -628,14 +642,14 @@ void rrdhost_update(RRDHOST *host host->rrdpush_seconds_to_replicate = rrdpush_seconds_to_replicate; host->rrdpush_replication_step = rrdpush_replication_step; - rrd_hosts_available++; - ml_new_host(host); + ml_host_new(host); + ml_start_anomaly_detection_threads(host); + rrdhost_load_rrdcontext_data(host); info("Host %s is not in archived mode anymore", rrdhost_hostname(host)); } - if (health_enabled) - health_thread_spawn(host); + netdata_spinlock_unlock(&host->rrdhost_update_lock); } RRDHOST *rrdhost_find_or_create( @@ -665,15 +679,18 @@ RRDHOST *rrdhost_find_or_create( ) { debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid); - rrd_wrlock(); RRDHOST *host = rrdhost_find_by_guid(guid); if (unlikely(host && host->rrd_memory_mode != mode && rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED))) { /* If a legacy memory mode instantiates all dbengine state must be discarded to avoid inconsistencies */ error("Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.", rrdhost_hostname(host), rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode)); - rrdhost_free(host, 1); + + rrd_wrlock(); + rrdhost_free___while_having_rrd_wrlock(host, true); host = NULL; + rrd_unlock(); } + if(!host) { host = rrdhost_create( hostname @@ -703,6 +720,7 @@ RRDHOST *rrdhost_find_or_create( ); } else { + rrdhost_update(host , hostname , registry_hostname @@ -726,19 +744,13 @@ RRDHOST *rrdhost_find_or_create( , rrdpush_seconds_to_replicate , rrdpush_replication_step , system_info); - } - if (host) { - rrdhost_wrlock(host); - rrdhost_flag_clear(host, RRDHOST_FLAG_ORPHAN); - host->senders_disconnected_time = 0; - rrdhost_unlock(host); - } - rrd_unlock(); + } return host; } -inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now) { + +inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now_s) { if(host != protected_host && host != localhost && rrdhost_receiver_replicating_charts(host) == 0 @@ -746,8 +758,8 @@ inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, tim && rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN) && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED) && !host->receiver - && host->senders_disconnected_time - && host->senders_disconnected_time + rrdhost_free_orphan_time < now) + && host->child_disconnected_time + && host->child_disconnected_time + rrdhost_free_orphan_time_s < now_s) return 1; return 0; @@ -756,8 +768,34 @@ inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, tim // ---------------------------------------------------------------------------- // RRDHOST global / startup initialization +#ifdef ENABLE_DBENGINE +struct dbengine_initialization { + netdata_thread_t thread; + char path[FILENAME_MAX + 1]; + int disk_space_mb; + size_t tier; + int ret; +}; + +void *dbengine_tier_init(void *ptr) { + struct dbengine_initialization *dbi = ptr; + dbi->ret = rrdeng_init(NULL, dbi->path, dbi->disk_space_mb, dbi->tier); + return ptr; +} +#endif + void dbengine_init(char *hostname) { #ifdef ENABLE_DBENGINE + use_direct_io = config_get_boolean(CONFIG_SECTION_DB, "dbengine use direct io", use_direct_io); + + unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", MAX_PAGES_PER_EXTENT); + if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) + rrdeng_pages_per_extent = read_num; + else { + error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent); + config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent); + } + storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); if(storage_tiers < 1) { error("At least 1 storage tier is required. Assuming 1."); @@ -770,6 +808,9 @@ void dbengine_init(char *hostname) { config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } + bool parallel_initialization = (storage_tiers <= (size_t)get_netdata_cpus()) ? true : false; + parallel_initialization = config_get_boolean(CONFIG_SECTION_DB, "dbengine parallel initialization", parallel_initialization); + default_rrdeng_page_fetch_timeout = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page fetch timeout secs", PAGE_CACHE_FETCH_WAIT_TIMEOUT); if (default_rrdeng_page_fetch_timeout < 1) { info("'dbengine page fetch timeout secs' cannot be %d, using 1", default_rrdeng_page_fetch_timeout); @@ -784,10 +825,7 @@ void dbengine_init(char *hostname) { config_set_number(CONFIG_SECTION_DB, "dbengine page fetch retries", default_rrdeng_page_fetch_retries); } - if(config_get_boolean(CONFIG_SECTION_DB, "dbengine page descriptors in file mapped memory", rrdeng_page_descr_is_mmap()) == CONFIG_BOOLEAN_YES) - rrdeng_page_descr_use_mmap(); - else - rrdeng_page_descr_use_malloc(); + struct dbengine_initialization tiers_init[RRD_STORAGE_TIERS] = {}; size_t created_tiers = 0; char dbenginepath[FILENAME_MAX + 1]; @@ -808,15 +846,11 @@ void dbengine_init(char *hostname) { if(tier > 0) divisor *= 2; - int page_cache_mb = default_rrdeng_page_cache_mb / divisor; int disk_space_mb = default_multidb_disk_quota_mb / divisor; size_t grouping_iterations = storage_tiers_grouping_iterations[tier]; RRD_BACKFILL backfill = storage_tiers_backfill[tier]; if(tier > 0) { - snprintfz(dbengineconfig, 200, "dbengine tier %zu page cache size MB", tier); - page_cache_mb = config_get_number(CONFIG_SECTION_DB, dbengineconfig, page_cache_mb); - snprintfz(dbengineconfig, 200, "dbengine tier %zu multihost disk space MB", tier); disk_space_mb = config_get_number(CONFIG_SECTION_DB, dbengineconfig, disk_space_mb); @@ -850,13 +884,30 @@ void dbengine_init(char *hostname) { } internal_error(true, "DBENGINE tier %zu grouping iterations is set to %zu", tier, storage_tiers_grouping_iterations[tier]); - ret = rrdeng_init(NULL, NULL, dbenginepath, page_cache_mb, disk_space_mb, tier); - if(ret != 0) { + + tiers_init[tier].disk_space_mb = disk_space_mb; + tiers_init[tier].tier = tier; + strncpyz(tiers_init[tier].path, dbenginepath, FILENAME_MAX); + tiers_init[tier].ret = 0; + + if(parallel_initialization) + netdata_thread_create(&tiers_init[tier].thread, "DBENGINE_INIT", NETDATA_THREAD_OPTION_JOINABLE, + dbengine_tier_init, &tiers_init[tier]); + else + dbengine_tier_init(&tiers_init[tier]); + } + + for(size_t tier = 0; tier < storage_tiers ;tier++) { + void *ptr; + + if(parallel_initialization) + netdata_thread_join(tiers_init[tier].thread, &ptr); + + if(tiers_init[tier].ret != 0) { error("DBENGINE on '%s': Failed to initialize multi-host database tier %zu on path '%s'", - hostname, tier, dbenginepath); - break; + hostname, tiers_init[tier].tier, tiers_init[tier].path); } - else + else if(created_tiers == tier) created_tiers++; } @@ -868,6 +919,9 @@ void dbengine_init(char *hostname) { else if(!created_tiers) fatal("DBENGINE on '%s', failed to initialize databases at '%s'.", hostname, netdata_configured_cache_dir); + for(size_t tier = 0; tier < storage_tiers ;tier++) + rrdeng_readiness_wait(multidb_ctx[tier]); + dbengine_enabled = true; #else storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", 1); @@ -880,7 +934,7 @@ void dbengine_init(char *hostname) { #endif } -int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { +int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unittest) { rrdhost_init(); if (unlikely(sql_init_database(DB_CHECK_NONE, system_info ? 0 : 1))) { @@ -893,7 +947,7 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { error_report("Failed to initialize context metadata database"); } - if (unlikely(strcmp(hostname, "unittest") == 0)) { + if (unlikely(unittest)) { dbengine_enabled = true; } else { @@ -901,11 +955,11 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { rrdpush_init(); if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE || rrdpush_receiver_needs_dbengine()) { - info("Initializing dbengine..."); + info("DBENGINE: Initializing ..."); dbengine_init(hostname); } else { - info("Not initializing dbengine..."); + info("DBENGINE: Not initializing ..."); storage_tiers = 1; } @@ -923,42 +977,41 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { } } - metadata_sync_init(); + if(!unittest) + metadata_sync_init(); + debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname); - rrd_wrlock(); localhost = rrdhost_create( - hostname - , registry_get_this_machine_hostname() + hostname + , registry_get_this_machine_hostname() , registry_get_this_machine_guid() , os_type - , netdata_configured_timezone - , netdata_configured_abbrev_timezone - , netdata_configured_utc_offset - , "" - , program_name - , program_version - , default_rrd_update_every - , default_rrd_history_entries - , default_rrd_memory_mode - , default_health_enabled - , default_rrdpush_enabled - , default_rrdpush_destination - , default_rrdpush_api_key - , default_rrdpush_send_charts_matching - , default_rrdpush_enable_replication - , default_rrdpush_seconds_to_replicate - , default_rrdpush_replication_step - , system_info - , 1 - , 0 + , netdata_configured_timezone + , netdata_configured_abbrev_timezone + , netdata_configured_utc_offset + , "" + , program_name + , program_version + , default_rrd_update_every + , default_rrd_history_entries + , default_rrd_memory_mode + , default_health_enabled + , default_rrdpush_enabled + , default_rrdpush_destination + , default_rrdpush_api_key + , default_rrdpush_send_charts_matching + , default_rrdpush_enable_replication + , default_rrdpush_seconds_to_replicate + , default_rrdpush_replication_step + , system_info + , 1 + , 0 ); + if (unlikely(!localhost)) { - rrd_unlock(); return 1; } - rrd_unlock(); - if (likely(system_info)) { migrate_localhost(&localhost->host_uuid); sql_aclk_sync_init(); @@ -967,47 +1020,13 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { return localhost==NULL; } -// ---------------------------------------------------------------------------- -// RRDHOST - lock validations -// there are only used when NETDATA_INTERNAL_CHECKS is set - -void __rrdhost_check_rdlock(RRDHOST *host, const char *file, const char *function, const unsigned long line) { - debug(D_RRDHOST, "Checking read lock on host '%s'", rrdhost_hostname(host)); - - int ret = netdata_rwlock_trywrlock(&host->rrdhost_rwlock); - if(ret == 0) - fatal("RRDHOST '%s' should be read-locked, but it is not, at function %s() at line %lu of file '%s'", rrdhost_hostname(host), function, line, file); -} - -void __rrdhost_check_wrlock(RRDHOST *host, const char *file, const char *function, const unsigned long line) { - debug(D_RRDHOST, "Checking write lock on host '%s'", rrdhost_hostname(host)); - - int ret = netdata_rwlock_tryrdlock(&host->rrdhost_rwlock); - if(ret == 0) - fatal("RRDHOST '%s' should be write-locked, but it is not, at function %s() at line %lu of file '%s'", rrdhost_hostname(host), function, line, file); -} - -void __rrd_check_rdlock(const char *file, const char *function, const unsigned long line) { - debug(D_RRDHOST, "Checking read lock on all RRDs"); - - int ret = netdata_rwlock_trywrlock(&rrd_rwlock); - if(ret == 0) - fatal("RRDs should be read-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file); -} - -void __rrd_check_wrlock(const char *file, const char *function, const unsigned long line) { - debug(D_RRDHOST, "Checking write lock on all RRDs"); - - int ret = netdata_rwlock_tryrdlock(&rrd_rwlock); - if(ret == 0) - fatal("RRDs should be write-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file); -} - // ---------------------------------------------------------------------------- // RRDHOST - free void rrdhost_system_info_free(struct rrdhost_system_info *system_info) { if(likely(system_info)) { + __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED); + freez(system_info->cloud_provider_type); freez(system_info->cloud_instance_type); freez(system_info->cloud_instance_region); @@ -1042,63 +1061,80 @@ void rrdhost_system_info_free(struct rrdhost_system_info *system_info) { } } -void destroy_receiver_state(struct receiver_state *rpt); +static void rrdhost_streaming_sender_structures_init(RRDHOST *host) +{ + if (host->sender) + return; + + host->sender = callocz(1, sizeof(*host->sender)); + __atomic_add_fetch(&netdata_buffers_statistics.rrdhost_senders, sizeof(*host->sender), __ATOMIC_RELAXED); + + host->sender->host = host; + host->sender->buffer = cbuffer_new(CBUFFER_INITIAL_SIZE, 1024 * 1024, &netdata_buffers_statistics.cbuffers_streaming); + host->sender->capabilities = STREAM_OUR_CAPABILITIES; -void stop_streaming_sender(RRDHOST *host) + host->sender->rrdpush_sender_pipe[PIPE_READ] = -1; + host->sender->rrdpush_sender_pipe[PIPE_WRITE] = -1; + host->sender->rrdpush_sender_socket = -1; + +#ifdef ENABLE_COMPRESSION + if(default_compression_enabled) { + host->sender->flags |= SENDER_FLAG_COMPRESSION; + host->sender->compressor = create_compressor(); + } + else + host->sender->flags &= ~SENDER_FLAG_COMPRESSION; +#endif + + netdata_mutex_init(&host->sender->mutex); + replication_init_sender(host->sender); +} + +static void rrdhost_streaming_sender_structures_free(RRDHOST *host) { rrdhost_option_clear(host, RRDHOST_OPTION_SENDER_ENABLED); if (unlikely(!host->sender)) return; - rrdpush_sender_thread_stop(host); // stop a possibly running thread + rrdpush_sender_thread_stop(host, "HOST CLEANUP", true); // stop a possibly running thread cbuffer_free(host->sender->buffer); #ifdef ENABLE_COMPRESSION if (host->sender->compressor) host->sender->compressor->destroy(&host->sender->compressor); #endif replication_cleanup_sender(host->sender); + + __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_senders, sizeof(*host->sender), __ATOMIC_RELAXED); + freez(host->sender); host->sender = NULL; rrdhost_flag_clear(host, RRDHOST_FLAG_RRDPUSH_SENDER_INITIALIZED); } -void stop_streaming_receiver(RRDHOST *host) -{ - netdata_mutex_lock(&host->receiver_lock); - if (host->receiver) { - if (!host->receiver->exited) - netdata_thread_cancel(host->receiver->thread); - netdata_mutex_unlock(&host->receiver_lock); - struct receiver_state *rpt = host->receiver; - while (host->receiver && !rpt->exited) - sleep_usec(50 * USEC_PER_MS); - // If the receiver detached from the host then its thread will destroy the state - if (host->receiver == rpt) - destroy_receiver_state(host->receiver); - } else - netdata_mutex_unlock(&host->receiver_lock); -} - -void rrdhost_free(RRDHOST *host, bool force) { +void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { if(!host) return; - if (netdata_exit || force) - info("Freeing all memory for host '%s'...", rrdhost_hostname(host)); + if (netdata_exit || force) { + info("RRD: 'host:%s' freeing memory...", rrdhost_hostname(host)); - rrd_check_wrlock(); // make sure the RRDs are write locked + // ------------------------------------------------------------------------ + // first remove it from the indexes, so that it will not be discoverable - rrdhost_wrlock(host); - ml_delete_host(host); - rrdhost_unlock(host); + rrdhost_index_del_hostname(host); + rrdhost_index_del_by_guid(host); + + if (host->prev) + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(localhost, host, prev, next); + } // ------------------------------------------------------------------------ // clean up streaming - stop_streaming_sender(host); + rrdhost_streaming_sender_structures_free(host); if (netdata_exit || force) - stop_streaming_receiver(host); + stop_streaming_receiver(host, "HOST CLEANUP"); // ------------------------------------------------------------------------ @@ -1106,9 +1142,6 @@ void rrdhost_free(RRDHOST *host, bool force) { rrdcalc_delete_all(host); - - rrdhost_wrlock(host); // lock this RRDHOST - // ------------------------------------------------------------------------ // release its children resources @@ -1126,6 +1159,10 @@ void rrdhost_free(RRDHOST *host, bool force) { rrdcalc_rrdhost_index_destroy(host); rrdcalctemplate_index_destroy(host); + // cleanup ML resources + ml_stop_anomaly_detection_threads(host); + ml_host_delete(host); + freez(host->exporting_flags); health_alarm_log_free(host); @@ -1140,9 +1177,8 @@ void rrdhost_free(RRDHOST *host, bool force) { #endif if (!netdata_exit && !force) { - info("Setting archive mode for host '%s'...", rrdhost_hostname(host)); - rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED); - rrdhost_unlock(host); + info("RRD: 'host:%s' is now in archive mode...", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN); return; } @@ -1161,17 +1197,6 @@ void rrdhost_free(RRDHOST *host, bool force) { } #endif - // ------------------------------------------------------------------------ - // remove it from the indexes - - rrdhost_index_del_hostname(host); - rrdhost_index_del_by_guid(host); - - // ------------------------------------------------------------------------ - // unlink it from the host - - DOUBLE_LINKED_LIST_REMOVE_UNSAFE(localhost, host, prev, next); - // ------------------------------------------------------------------------ // free it @@ -1191,14 +1216,11 @@ void rrdhost_free(RRDHOST *host, bool force) { freez(host->rrdpush_send_api_key); freez(host->rrdpush_send_destination); rrdpush_destinations_free(host); - string_freez(host->health_default_exec); - string_freez(host->health_default_recipient); - freez(host->health_log_filename); + string_freez(host->health.health_default_exec); + string_freez(host->health.health_default_recipient); string_freez(host->registry_hostname); simple_pattern_free(host->rrdpush_send_charts_matching); - rrdhost_unlock(host); netdata_rwlock_destroy(&host->health_log.alarm_log_rwlock); - netdata_rwlock_destroy(&host->rrdhost_rwlock); freez(host->node_id); rrdfamily_index_destroy(host); @@ -1208,12 +1230,12 @@ void rrdhost_free(RRDHOST *host, bool force) { rrdhost_destroy_rrdcontexts(host); string_freez(host->hostname); + __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED); freez(host); #ifdef ENABLE_ACLK if (wc) wc->is_orphan = 0; #endif - rrd_hosts_available--; } void rrdhost_free_all(void) { @@ -1221,21 +1243,30 @@ void rrdhost_free_all(void) { /* Make sure child-hosts are released before the localhost. */ while(localhost && localhost->next) - rrdhost_free(localhost->next, 1); + rrdhost_free___while_having_rrd_wrlock(localhost->next, true); if(localhost) - rrdhost_free(localhost, 1); + rrdhost_free___while_having_rrd_wrlock(localhost, true); rrd_unlock(); } +void rrd_finalize_collection_for_all_hosts(void) { + RRDHOST *host; + rrd_wrlock(); + rrdhost_foreach_read(host) { + rrdhost_finalize_collection(host); + } + rrd_unlock(); +} + // ---------------------------------------------------------------------------- // RRDHOST - save host files void rrdhost_save_charts(RRDHOST *host) { if(!host) return; - info("Saving/Closing database of host '%s'...", rrdhost_hostname(host)); + info("RRD: 'host:%s' saving / closing database...", rrdhost_hostname(host)); RRDSET *st; @@ -1312,8 +1343,7 @@ static void rrdhost_load_auto_labels(void) { health_add_host_labels(); - rrdlabels_add( - labels, "_is_parent", (localhost->senders_count > 0) ? "true" : "false", RRDLABEL_SRC_AUTO); + rrdlabels_add(labels, "_is_parent", (localhost->connected_children_count > 0) ? "true" : "false", RRDLABEL_SRC_AUTO); if (localhost->rrdpush_send_destination) rrdlabels_add(labels, "_streams_to", localhost->rrdpush_send_destination, RRDLABEL_SRC_AUTO); @@ -1391,13 +1421,18 @@ void reload_host_labels(void) { rrdhost_load_kubernetes_labels(); rrdhost_load_auto_labels(); - rrdlabels_remove_all_unmarked(localhost->rrdlabels); - metaqueue_store_host_labels(localhost->machine_guid); - - health_label_log_save(localhost); + rrdhost_flag_set(localhost,RRDHOST_FLAG_METADATA_LABELS | RRDHOST_FLAG_METADATA_UPDATE); rrdpush_send_host_labels(localhost); - health_reload(); +} + +void rrdhost_finalize_collection(RRDHOST *host) { + info("RRD: 'host:%s' stopping data collection...", rrdhost_hostname(host)); + + RRDSET *st; + rrdset_foreach_write(st, host) + rrdset_finalize_collection(st, true); + rrdset_foreach_done(st); } // ---------------------------------------------------------------------------- @@ -1406,16 +1441,18 @@ void reload_host_labels(void) { void rrdhost_delete_charts(RRDHOST *host) { if(!host) return; - info("Deleting database of host '%s'...", rrdhost_hostname(host)); + info("RRD: 'host:%s' deleting disk files...", rrdhost_hostname(host)); RRDSET *st; - // we get a write lock - // to ensure only one thread is saving the database - rrdset_foreach_write(st, host) { - rrdset_delete_files(st); + if(host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE || host->rrd_memory_mode == RRD_MEMORY_MODE_MAP) { + // we get a write lock + // to ensure only one thread is saving the database + rrdset_foreach_write(st, host){ + rrdset_delete_files(st); + } + rrdset_foreach_done(st); } - rrdset_foreach_done(st); recursively_delete_dir(host->cache_dir, "left over host"); } @@ -1426,7 +1463,7 @@ void rrdhost_delete_charts(RRDHOST *host) { void rrdhost_cleanup_charts(RRDHOST *host) { if(!host) return; - info("Cleaning up database of host '%s'...", rrdhost_hostname(host)); + info("RRD: 'host:%s' cleaning up disk files...", rrdhost_hostname(host)); RRDSET *st; uint32_t rrdhost_delete_obsolete_charts = rrdhost_option_check(host, RRDHOST_OPTION_DELETE_OBSOLETE_CHARTS); @@ -1453,7 +1490,7 @@ void rrdhost_cleanup_charts(RRDHOST *host) { // RRDHOST - save all hosts to disk void rrdhost_save_all(void) { - info("Saving database [%zu hosts(s)]...", rrd_hosts_available); + info("RRD: saving databases [%zu hosts(s)]...", rrdhost_hosts_available()); rrd_rdlock(); @@ -1468,7 +1505,7 @@ void rrdhost_save_all(void) { // RRDHOST - save or delete all hosts from disk void rrdhost_cleanup_all(void) { - info("Cleaning up database [%zu hosts(s)]...", rrd_hosts_available); + info("RRD: cleaning up database [%zu hosts(s)]...", rrdhost_hosts_available()); rrd_rdlock(); @@ -1622,19 +1659,3 @@ int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, ch return res; } - -// Added for gap-filling, if this proves to be a bottleneck in large-scale systems then we will need to cache -// the last entry times as the metric updates, but let's see if it is a problem first. -time_t rrdhost_last_entry_t(RRDHOST *h) { - RRDSET *st; - time_t result = 0; - - rrdset_foreach_read(st, h) { - time_t st_last = rrdset_last_entry_t(st); - - if (st_last > result) - result = st_last; - } - rrdset_foreach_done(st); - return result; -} diff --git a/database/rrdlabels.c b/database/rrdlabels.c index 743499ab5..4a9a6dae6 100644 --- a/database/rrdlabels.c +++ b/database/rrdlabels.c @@ -533,7 +533,9 @@ static bool rrdlabel_conflict_callback(const DICTIONARY_ITEM *item __maybe_unuse } DICTIONARY *rrdlabels_create(void) { - DICTIONARY *dict = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + DICTIONARY *dict = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdlabels, sizeof(RRDLABEL)); + dictionary_register_insert_callback(dict, rrdlabel_insert_callback, dict); dictionary_register_delete_callback(dict, rrdlabel_delete_callback, dict); dictionary_register_conflict_callback(dict, rrdlabel_conflict_callback, dict); @@ -964,7 +966,8 @@ void rrdset_update_rrdlabels(RRDSET *st, DICTIONARY *new_rrdlabels) { if (new_rrdlabels) rrdlabels_migrate_to_these(st->rrdlabels, new_rrdlabels); - metaqueue_chart_labels(st); + rrdset_flag_set(st, RRDSET_FLAG_METADATA_UPDATE); + rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); } diff --git a/database/rrdset.c b/database/rrdset.c index 6eb3c7105..57f962cd6 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -28,6 +28,8 @@ static inline void rrdset_index_del_name(RRDHOST *host, RRDSET *st) { } static inline RRDSET *rrdset_index_find_name(RRDHOST *host, const char *name) { + if (unlikely(!host->rrdset_root_index_name)) + return NULL; return dictionary_get(host->rrdset_root_index_name, name); } @@ -126,15 +128,15 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v st->module_name = rrd_string_strdupz(ctr->module); st->priority = ctr->priority; - st->cache_dir = rrdset_cache_dir(host, chart_full_id); st->entries = (ctr->memory_mode != RRD_MEMORY_MODE_DBENGINE) ? align_entries_to_pagesize(ctr->memory_mode, ctr->history_entries) : 5; st->update_every = ctr->update_every; st->rrd_memory_mode = ctr->memory_mode; st->chart_type = ctr->chart_type; - st->gap_when_lost_iterations_above = (int) (gap_when_lost_iterations_above + 2); st->rrdhost = host; + netdata_spinlock_init(&st->data_collection_lock); + st->flags = RRDSET_FLAG_SYNC_CLOCK | RRDSET_FLAG_INDEXED_ID | RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED @@ -165,7 +167,7 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v // chart variables - we need this for data collection to work (collector given chart variables) - not only health rrdsetvar_index_init(st); - if (host->health_enabled) { + if (host->health.health_enabled) { st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st)); st->rrdvars = rrdvariables_create(); rrddimvar_index_init(st); @@ -178,6 +180,31 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v st->red = NAN; ctr->react_action = RRDSET_REACT_NEW; + + ml_chart_new(st); +} + +void rrdset_finalize_collection(RRDSET *st, bool dimensions_too) { + RRDHOST *host = st->rrdhost; + + rrdset_flag_set(st, RRDSET_FLAG_COLLECTION_FINISHED); + + if(dimensions_too) { + RRDDIM *rd; + rrddim_foreach_read(rd, st) + rrddim_finalize_collection_and_check_retention(rd); + rrddim_foreach_done(rd); + } + + for(size_t tier = 0; tier < storage_tiers ; tier++) { + STORAGE_ENGINE *eng = st->rrdhost->db[tier].eng; + if(!eng) continue; + + if(st->storage_metrics_groups[tier]) { + eng->api.collect_ops.metrics_group_release(host->db[tier].instance, st->storage_metrics_groups[tier]); + st->storage_metrics_groups[tier] = NULL; + } + } } // the destructor - the dictionary is write locked while this runs @@ -187,15 +214,7 @@ static void rrdset_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v rrdset_flag_clear(st, RRDSET_FLAG_INDEXED_ID); - // cleanup storage engines - { - for(size_t tier = 0; tier < storage_tiers ; tier++) { - STORAGE_ENGINE *eng = st->rrdhost->db[tier].eng; - if(!eng) continue; - - eng->api.collect_ops.metrics_group_release(host->db[tier].instance, st->storage_metrics_groups[tier]); - } - } + rrdset_finalize_collection(st, false); // remove it from the name index rrdset_index_del_name(host, st); @@ -232,6 +251,9 @@ static void rrdset_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v // 7. destroy the chart labels rrdlabels_destroy(st->rrdlabels); // destroy the labels, after letting the contexts know + // 8. destroy the ml handle + ml_chart_delete(st); + rrdset_memory_file_free(st); // remove files of db mode save and map // ------------------------------------------------------------------------ @@ -282,7 +304,7 @@ static bool rrdset_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, } if (unlikely(st->update_every != ctr->update_every)) { - rrdset_set_update_every(st, ctr->update_every); + rrdset_set_update_every_s(st, ctr->update_every); ctr->react_action |= RRDSET_REACT_UPDATED; } @@ -356,33 +378,17 @@ static void rrdset_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo RRDSET *st = rrdset; RRDHOST *host = st->rrdhost; - st->last_accessed_time = now_realtime_sec(); + st->last_accessed_time_s = now_realtime_sec(); - if(host->health_enabled && (ctr->react_action & (RRDSET_REACT_NEW | RRDSET_REACT_CHART_ACTIVATED))) { + if(host->health.health_enabled && (ctr->react_action & (RRDSET_REACT_NEW | RRDSET_REACT_CHART_ACTIVATED))) { rrdset_flag_set(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); } if(ctr->react_action & (RRDSET_REACT_NEW | RRDSET_REACT_PLUGIN_UPDATED | RRDSET_REACT_MODULE_UPDATED)) { if (ctr->react_action & RRDSET_REACT_NEW) { - if(unlikely(rrdcontext_find_chart_uuid(st, &st->chart_uuid))) { + if(unlikely(rrdcontext_find_chart_uuid(st, &st->chart_uuid))) uuid_generate(st->chart_uuid); - bool found_in_sql = false; (void)found_in_sql; - -// bool found_in_sql = true; -// if(unlikely(sql_find_chart_uuid(host, st, &st->chart_uuid))) { -// uuid_generate(st->chart_uuid); -// found_in_sql = false; -// } - -#ifdef NETDATA_INTERNAL_CHECKS - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower(st->chart_uuid, uuid_str); - error_report("Chart UUID for host %s chart [%s] not found in context. It is now set to %s (%s)", - string2str(host->hostname), - string2str(st->name), uuid_str, found_in_sql ? "found in sqlite" : "newly generated"); -#endif - } } rrdset_flag_set(st, RRDSET_FLAG_METADATA_UPDATE); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); @@ -393,7 +399,8 @@ static void rrdset_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo void rrdset_index_init(RRDHOST *host) { if(!host->rrdset_root_index) { - host->rrdset_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + host->rrdset_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdset_rrddim, sizeof(RRDSET)); dictionary_register_insert_callback(host->rrdset_root_index, rrdset_insert_callback, NULL); dictionary_register_conflict_callback(host->rrdset_root_index, rrdset_conflict_callback, NULL); @@ -402,8 +409,9 @@ void rrdset_index_init(RRDHOST *host) { } if(!host->rrdset_root_index_name) { - host->rrdset_root_index_name = dictionary_create( - DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); + host->rrdset_root_index_name = dictionary_create_advanced( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE, + &dictionary_stats_category_rrdset_rrddim, 0); dictionary_register_insert_callback(host->rrdset_root_index_name, rrdset_name_insert_callback, host); dictionary_register_delete_callback(host->rrdset_root_index_name, rrdset_name_delete_callback, host); @@ -431,6 +439,8 @@ static inline void rrdset_index_del(RRDHOST *host, RRDSET *st) { static RRDSET *rrdset_index_find(RRDHOST *host, const char *id) { // TODO - the name index should have an acquired dictionary item, not just a pointer to RRDSET + if (unlikely(!host->rrdset_root_index)) + return NULL; return dictionary_get(host->rrdset_root_index, id); } @@ -442,7 +452,7 @@ inline RRDSET *rrdset_find(RRDHOST *host, const char *id) { RRDSET *st = rrdset_index_find(host, id); if(st) - st->last_accessed_time = now_realtime_sec(); + st->last_accessed_time_s = now_realtime_sec(); return(st); } @@ -521,51 +531,106 @@ int rrdset_reset_name(RRDSET *st, const char *name) { } // get the timestamp of the last entry in the round-robin database -time_t rrdset_last_entry_t(RRDSET *st) { +time_t rrdset_last_entry_s(RRDSET *st) { RRDDIM *rd; - time_t last_entry_t = 0; + time_t last_entry_s = 0; rrddim_foreach_read(rd, st) { - time_t t = rrddim_last_entry_t(rd); - if(t > last_entry_t) last_entry_t = t; + time_t t = rrddim_last_entry_s(rd); + if(t > last_entry_s) last_entry_s = t; } rrddim_foreach_done(rd); - return last_entry_t; + return last_entry_s; +} + +time_t rrdset_last_entry_s_of_tier(RRDSET *st, size_t tier) { + RRDDIM *rd; + time_t last_entry_s = 0; + + rrddim_foreach_read(rd, st) { + time_t t = rrddim_last_entry_s_of_tier(rd, tier); + if(t > last_entry_s) last_entry_s = t; + } + rrddim_foreach_done(rd); + + return last_entry_s; } // get the timestamp of first entry in the round-robin database -time_t rrdset_first_entry_t(RRDSET *st) { +time_t rrdset_first_entry_s(RRDSET *st) { RRDDIM *rd; - time_t first_entry_t = LONG_MAX; + time_t first_entry_s = LONG_MAX; rrddim_foreach_read(rd, st) { - time_t t = rrddim_first_entry_t(rd); - if(t < first_entry_t) - first_entry_t = t; + time_t t = rrddim_first_entry_s(rd); + if(t < first_entry_s) + first_entry_s = t; } rrddim_foreach_done(rd); - if (unlikely(LONG_MAX == first_entry_t)) return 0; - return first_entry_t; + if (unlikely(LONG_MAX == first_entry_s)) return 0; + return first_entry_s; } -time_t rrdset_first_entry_t_of_tier(RRDSET *st, size_t tier) { +time_t rrdset_first_entry_s_of_tier(RRDSET *st, size_t tier) { if(unlikely(tier > storage_tiers)) return 0; RRDDIM *rd; - time_t first_entry_t = LONG_MAX; + time_t first_entry_s = LONG_MAX; rrddim_foreach_read(rd, st) { - time_t t = rrddim_first_entry_t_of_tier(rd, tier); - if(t && t < first_entry_t) - first_entry_t = t; + time_t t = rrddim_first_entry_s_of_tier(rd, tier); + if(t && t < first_entry_s) + first_entry_s = t; } rrddim_foreach_done(rd); - if (unlikely(LONG_MAX == first_entry_t)) return 0; - return first_entry_t; + if (unlikely(LONG_MAX == first_entry_s)) return 0; + return first_entry_s; +} + +void rrdset_get_retention_of_tier_for_collected_chart(RRDSET *st, time_t *first_time_s, time_t *last_time_s, time_t now_s, size_t tier) { + if(!now_s) + now_s = now_realtime_sec(); + + time_t db_first_entry_s = rrdset_first_entry_s_of_tier(st, tier); + time_t db_last_entry_s = st->last_updated.tv_sec; // we assume this is a collected RRDSET + + if(unlikely(!db_last_entry_s)) { + db_last_entry_s = rrdset_last_entry_s_of_tier(st, tier); + + if (unlikely(!db_last_entry_s)) { + // we assume this is a collected RRDSET + db_first_entry_s = 0; + db_last_entry_s = 0; + } + } + + if(unlikely(db_last_entry_s > now_s)) { + internal_error(db_last_entry_s > now_s + 1, + "RRDSET: 'host:%s/chart:%s' latest db time %ld is in the future, adjusting it to now %ld", + rrdhost_hostname(st->rrdhost), rrdset_id(st), + db_last_entry_s, now_s); + db_last_entry_s = now_s; + } + + if(unlikely(db_first_entry_s && db_last_entry_s && db_first_entry_s >= db_last_entry_s)) { + internal_error(db_first_entry_s > db_last_entry_s, + "RRDSET: 'host:%s/chart:%s' oldest db time %ld is bigger than latest db time %ld, adjusting it to (latest time %ld - update every %ld)", + rrdhost_hostname(st->rrdhost), rrdset_id(st), + db_first_entry_s, db_last_entry_s, + db_last_entry_s, (time_t)st->update_every); + db_first_entry_s = db_last_entry_s - st->update_every; + } + + if(unlikely(!db_first_entry_s && db_last_entry_s)) + // this can be the case on the first data collection of a chart + db_first_entry_s = db_last_entry_s - st->update_every; + + *first_time_s = db_first_entry_s; + *last_time_s = db_last_entry_s; } inline void rrdset_is_obsolete(RRDSET *st) { @@ -578,7 +643,7 @@ inline void rrdset_is_obsolete(RRDSET *st) { rrdset_flag_set(st, RRDSET_FLAG_OBSOLETE); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_PENDING_OBSOLETE_CHARTS); - st->last_accessed_time = now_realtime_sec(); + st->last_accessed_time_s = now_realtime_sec(); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); @@ -592,7 +657,7 @@ inline void rrdset_is_obsolete(RRDSET *st) { inline void rrdset_isnot_obsolete(RRDSET *st) { if(unlikely((rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE)))) { rrdset_flag_clear(st, RRDSET_FLAG_OBSOLETE); - st->last_accessed_time = now_realtime_sec(); + st->last_accessed_time_s = now_realtime_sec(); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); @@ -673,8 +738,8 @@ void rrdset_reset(RRDSET *st) { if(!rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { for(size_t tier = 0; tier < storage_tiers ;tier++) { - if(rd->tiers[tier]) - rd->tiers[tier]->collect_ops->flush(rd->tiers[tier]->db_collection_handle); + if(rd->tiers[tier].db_collection_handle) + rd->tiers[tier].collect_ops->flush(rd->tiers[tier].db_collection_handle); } } } @@ -768,7 +833,8 @@ void rrdset_delete_files(RRDSET *st) { } rrddim_foreach_done(rd); - recursively_delete_dir(st->cache_dir, "left-over chart"); + if(st->cache_dir) + recursively_delete_dir(st->cache_dir, "left-over chart"); } void rrdset_delete_obsolete_dimensions(RRDSET *st) { @@ -809,7 +875,7 @@ RRDSET *rrdset_create_custom( , long history_entries ) { if (host != localhost) - host->senders_last_chart_command = now_realtime_sec(); + host->child_last_chart_command = now_realtime_sec(); if(!type || !type[0]) fatal("Cannot create rrd stats without a type: id '%s', name '%s', family '%s', context '%s', title '%s', units '%s', plugin '%s', module '%s'." @@ -920,15 +986,8 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las ); #endif - st->last_collected_time.tv_sec = now.tv_sec - st->update_every; - st->last_collected_time.tv_usec = now.tv_usec; - last_collected_time_align(st); + duration_since_last_update = 0; - st->last_updated.tv_sec = now.tv_sec - st->update_every; - st->last_updated.tv_usec = now.tv_usec; - last_updated_time_align(st); - - duration_since_last_update = st->update_every * USEC_PER_SEC; #ifdef NETDATA_INTERNAL_CHECKS if(!discard_reason) discard_reason = "COLLECTION TIME IN FUTURE"; #endif @@ -941,6 +1000,7 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las #endif duration_since_last_update = (usec_t)since_last_usec; + #ifdef NETDATA_INTERNAL_CHECKS if(!discard_reason) discard_reason = "COLLECTION TIME TOO FAR IN THE PAST"; #endif @@ -949,16 +1009,16 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las #ifdef NETDATA_INTERNAL_CHECKS if(since_last_usec > 0 && (susec_t) duration_since_last_update < since_last_usec) { static __thread susec_t min_delta = USEC_PER_SEC * 3600, permanent_min_delta = 0; - static __thread time_t last_t = 0; + static __thread time_t last_time_s = 0; // the first time initialize it so that it will make the check later - if(last_t == 0) last_t = now.tv_sec + 60; + if(last_time_s == 0) last_time_s = now.tv_sec + 60; susec_t delta = since_last_usec - (susec_t) duration_since_last_update; if(delta < min_delta) min_delta = delta; - if(now.tv_sec >= last_t + 60) { - last_t = now.tv_sec; + if(now.tv_sec >= last_time_s + 60) { + last_time_s = now.tv_sec; if(min_delta > permanent_min_delta) { info("MINIMUM MICROSECONDS DELTA of thread %d increased from %lld to %lld (+%lld)", gettid(), permanent_min_delta, min_delta, min_delta - permanent_min_delta); @@ -1029,7 +1089,7 @@ static inline usec_t rrdset_update_last_collected_time(RRDSET *st) { return last_collect_ut; } -static inline usec_t rrdset_init_last_updated_time(RRDSET *st) { +static inline void rrdset_init_last_updated_time(RRDSET *st) { // copy the last collected time to last updated time st->last_updated.tv_sec = st->last_collected_time.tv_sec; st->last_updated.tv_usec = st->last_collected_time.tv_usec; @@ -1038,31 +1098,27 @@ static inline usec_t rrdset_init_last_updated_time(RRDSET *st) { st->last_updated.tv_sec -= st->update_every; last_updated_time_align(st); - - usec_t last_updated_ut = st->last_updated.tv_sec * USEC_PER_SEC + st->last_updated.tv_usec; - - rrdset_debug(st, "initialized last updated time to %0.3" NETDATA_DOUBLE_MODIFIER, (NETDATA_DOUBLE)last_updated_ut / USEC_PER_SEC); - - return last_updated_ut; } static __thread size_t rrdset_done_statistics_points_stored_per_tier[RRD_STORAGE_TIERS]; -static inline time_t tier_next_point_time(RRDDIM *rd, struct rrddim_tier *t, time_t now) { +static inline time_t tier_next_point_time_s(RRDDIM *rd, struct rrddim_tier *t, time_t now_s) { time_t loop = (time_t)rd->update_every * (time_t)t->tier_grouping; - return now + loop - ((now + loop) % loop); + return now_s + loop - ((now_s + loop) % loop); } void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAGE_POINT sp, usec_t now_ut __maybe_unused) { - if (unlikely(!t->next_point_time)) - t->next_point_time = tier_next_point_time(rd, t, sp.end_time); + if (unlikely(!t->next_point_end_time_s)) + t->next_point_end_time_s = tier_next_point_time_s(rd, t, sp.end_time_s); + + if(unlikely(sp.start_time_s >= t->next_point_end_time_s)) { + // flush the virtual point, it is done - if(unlikely(sp.start_time > t->next_point_time)) { if (likely(!storage_point_is_unset(t->virtual_point))) { t->collect_ops->store_metric( t->db_collection_handle, - t->next_point_time * USEC_PER_SEC, + t->next_point_end_time_s * USEC_PER_SEC, t->virtual_point.sum, t->virtual_point.min, t->virtual_point.max, @@ -1073,7 +1129,7 @@ void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAG else { t->collect_ops->store_metric( t->db_collection_handle, - t->next_point_time * USEC_PER_SEC, + t->next_point_end_time_s * USEC_PER_SEC, NAN, NAN, NAN, @@ -1083,18 +1139,18 @@ void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAG rrdset_done_statistics_points_stored_per_tier[tier]++; t->virtual_point.count = 0; // make the point unset - t->next_point_time = tier_next_point_time(rd, t, sp.end_time); + t->next_point_end_time_s = tier_next_point_time_s(rd, t, sp.end_time_s); } // merge the dates into our virtual point - if (unlikely(sp.start_time < t->virtual_point.start_time)) - t->virtual_point.start_time = sp.start_time; + if (unlikely(sp.start_time_s < t->virtual_point.start_time_s)) + t->virtual_point.start_time_s = sp.start_time_s; - if (likely(sp.end_time > t->virtual_point.end_time)) - t->virtual_point.end_time = sp.end_time; + if (likely(sp.end_time_s > t->virtual_point.end_time_s)) + t->virtual_point.end_time_s = sp.end_time_s; // merge the values into our virtual point - if (likely(!storage_point_is_empty(sp))) { + if (likely(!storage_point_is_gap(sp))) { // we aggregate only non NULLs into higher tiers if (likely(!storage_point_is_unset(t->virtual_point))) { @@ -1143,14 +1199,14 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, #endif // NETDATA_LOG_COLLECTION_ERRORS // store the metric on tier 0 - rd->tiers[0]->collect_ops->store_metric(rd->tiers[0]->db_collection_handle, point_end_time_ut, n, 0, 0, 1, 0, flags); + rd->tiers[0].collect_ops->store_metric(rd->tiers[0].db_collection_handle, point_end_time_ut, n, 0, 0, 1, 0, flags); rrdset_done_statistics_points_stored_per_tier[0]++; - time_t now = (time_t)(point_end_time_ut / USEC_PER_SEC); + time_t now_s = (time_t)(point_end_time_ut / USEC_PER_SEC); STORAGE_POINT sp = { - .start_time = now - rd->update_every, - .end_time = now, + .start_time_s = now_s - rd->update_every, + .end_time_s = now_s, .min = n, .max = n, .sum = n, @@ -1160,14 +1216,14 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, }; for(size_t tier = 1; tier < storage_tiers ;tier++) { - if(unlikely(!rd->tiers[tier])) continue; + if(unlikely(!rd->tiers[tier].db_metric_handle)) continue; - struct rrddim_tier *t = rd->tiers[tier]; + struct rrddim_tier *t = &rd->tiers[tier]; if(!rrddim_option_check(rd, RRDDIM_OPTION_BACKFILLED_HIGH_TIERS)) { // we have not collected this tier before // let's fill any gap that may exist - rrdr_fill_tier_gap_from_smaller_tiers(rd, tier, now); + rrdr_fill_tier_gap_from_smaller_tiers(rd, tier, now_s); rrddim_option_set(rd, RRDDIM_OPTION_BACKFILLED_HIGH_TIERS); } @@ -1188,12 +1244,16 @@ struct rda_item { static __thread struct rda_item *thread_rda = NULL; static __thread size_t thread_rda_entries = 0; -struct rda_item *rrdset_thread_rda(size_t *dimensions) { +struct rda_item *rrdset_thread_rda_get(size_t *dimensions) { if(unlikely(!thread_rda || (*dimensions) > thread_rda_entries)) { + size_t old_mem = thread_rda_entries * sizeof(struct rda_item); freez(thread_rda); - thread_rda = mallocz((*dimensions) * sizeof(struct rda_item)); thread_rda_entries = *dimensions; + size_t new_mem = thread_rda_entries * sizeof(struct rda_item); + thread_rda = mallocz(new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.rrdset_done_rda_size, new_mem - old_mem, __ATOMIC_RELAXED); } *dimensions = thread_rda_entries; @@ -1201,6 +1261,8 @@ struct rda_item *rrdset_thread_rda(size_t *dimensions) { } void rrdset_thread_rda_free(void) { + __atomic_sub_fetch(&netdata_buffers_statistics.rrdset_done_rda_size, thread_rda_entries * sizeof(struct rda_item), __ATOMIC_RELAXED); + freez(thread_rda); thread_rda = NULL; thread_rda_entries = 0; @@ -1253,6 +1315,8 @@ static inline size_t rrdset_done_interpolate( last_ut = next_store_ut; + ml_chart_update_begin(st); + struct rda_item *rda; size_t dim_id; for(dim_id = 0, rda = rda_base ; dim_id < rda_slots ; ++dim_id, ++rda) { @@ -1332,17 +1396,20 @@ static inline size_t rrdset_done_interpolate( break; } + time_t current_time_s = (time_t) (next_store_ut / USEC_PER_SEC); + if(unlikely(!store_this_entry)) { - (void) ml_is_anomalous(rd, 0, false); + (void) ml_is_anomalous(rd, current_time_s, 0, false); + rrddim_store_metric(rd, next_store_ut, NAN, SN_FLAG_NONE); rrdcontext_collected_rrddim(rd); continue; } - if(likely(rd->updated && rd->collections_counter > 1 && iterations < st->gap_when_lost_iterations_above)) { + if(likely(rd->updated && rd->collections_counter > 1 && iterations < gap_when_lost_iterations_above)) { uint32_t dim_storage_flags = storage_flags; - if (ml_is_anomalous(rd, new_value, true)) { + if (ml_is_anomalous(rd, current_time_s, new_value, true)) { // clear anomaly bit: 0 -> is anomalous, 1 -> not anomalous dim_storage_flags &= ~((storage_number)SN_FLAG_NOT_ANOMALOUS); } @@ -1352,7 +1419,7 @@ static inline size_t rrdset_done_interpolate( rd->last_stored_value = new_value; } else { - (void) ml_is_anomalous(rd, 0, false); + (void) ml_is_anomalous(rd, current_time_s, 0, false); rrdset_debug(st, "%s: STORE[%ld] = NON EXISTING ", rrddim_name(rd), current_entry); @@ -1364,6 +1431,8 @@ static inline size_t rrdset_done_interpolate( stored_entries++; } + ml_chart_update_end(st); + // reset the storage flags for the next point, if any; storage_flags = SN_DEFAULT_FLAGS; @@ -1389,36 +1458,6 @@ static inline size_t rrdset_done_interpolate( return stored_entries; } -static inline void rrdset_done_fill_the_gap(RRDSET *st) { - usec_t update_every_ut = st->update_every * USEC_PER_SEC; - usec_t now_collect_ut = st->last_collected_time.tv_sec * USEC_PER_SEC + st->last_collected_time.tv_usec; - - long c = 0, entries = st->entries; - RRDDIM *rd; - rrddim_foreach_read(rd, st) { - usec_t next_store_ut = (st->last_updated.tv_sec + st->update_every) * USEC_PER_SEC; - long current_entry = st->current_entry; - - for(c = 0; c < entries && next_store_ut <= now_collect_ut ; next_store_ut += update_every_ut, c++) { - rd->db[current_entry] = pack_storage_number(NAN, SN_FLAG_NONE); - current_entry = ((current_entry + 1) >= entries) ? 0 : current_entry + 1; - - rrdset_debug(st, "%s: STORE[%ld] = NON EXISTING (FILLED THE GAP)", rrddim_name(rd), current_entry); - } - } - rrddim_foreach_done(rd); - - if(c > 0) { - c--; - st->last_updated.tv_sec += c * st->update_every; - - st->current_entry += c; - st->counter += c; - if(st->current_entry >= st->entries) - st->current_entry -= st->entries; - } -} - void rrdset_done(RRDSET *st) { struct timeval now; @@ -1427,10 +1466,12 @@ void rrdset_done(RRDSET *st) { } void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) { - if(unlikely(netdata_exit)) return; + if(unlikely(!service_running(SERVICE_COLLECTORS))) return; + + netdata_spinlock_lock(&st->data_collection_lock); if (pending_rrdset_next) - rrdset_next(st); + rrdset_timed_next(st, now, 0ULL); debug(D_RRD_CALLS, "rrdset_done() for chart '%s'", rrdset_name(st)); @@ -1447,9 +1488,13 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) next_store_ut = 0, // the timestamp in microseconds, of the next entry to store in the db update_every_ut = st->update_every * USEC_PER_SEC; // st->update_every in microseconds + RRDSET_FLAGS rrdset_flags = rrdset_flag_check(st, ~0); + if(unlikely(rrdset_flags & RRDSET_FLAG_COLLECTION_FINISHED)) + return; + netdata_thread_disable_cancelability(); - if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE))) { + if (unlikely(rrdset_flags & RRDSET_FLAG_OBSOLETE)) { error("Chart '%s' has the OBSOLETE flag set, but it is collected.", rrdset_id(st)); rrdset_isnot_obsolete(st); } @@ -1519,29 +1564,6 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) first_entry = 1; } -#ifdef ENABLE_DBENGINE - // check if we will re-write the entire page - if(unlikely(st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && - dt_usec(&st->last_collected_time, &st->last_updated) > (RRDENG_BLOCK_SIZE / sizeof(storage_number)) * update_every_ut)) { - info( - "'%s': too old data (last updated at %" PRId64 ".%" PRId64 ", last collected at %" PRId64 ".%" PRId64 "). " - "Resetting it. Will not store the next entry.", - rrdset_id(st), - (int64_t)st->last_updated.tv_sec, - (int64_t)st->last_updated.tv_usec, - (int64_t)st->last_collected_time.tv_sec, - (int64_t)st->last_collected_time.tv_usec); - rrdset_reset(st); - rrdset_init_last_updated_time(st); - - st->usec_since_last_update = update_every_ut; - - // the first entry should not be stored - store_this_entry = 0; - first_entry = 1; - } -#endif - // these are the 3 variables that will help us in interpolation // last_stored_ut = the last time we added a value to the storage // now_collect_ut = the time the current value has been collected @@ -1551,23 +1573,13 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) next_store_ut = (st->last_updated.tv_sec + st->update_every) * USEC_PER_SEC; if(unlikely(!st->counter_done)) { - // if we have not collected metrics this session (st->counter_done == 0) - // and we have collected metrics for this chart in the past (st->counter != 0) - // fill the gap (the chart has been just loaded from disk) - if(unlikely(st->counter) && st->rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) { - // TODO this should be inside the storage engine - rrdset_done_fill_the_gap(st); - last_stored_ut = st->last_updated.tv_sec * USEC_PER_SEC + st->last_updated.tv_usec; - next_store_ut = (st->last_updated.tv_sec + st->update_every) * USEC_PER_SEC; - } - if (st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - // set a fake last_updated to jump to current time - rrdset_init_last_updated_time(st); - last_stored_ut = st->last_updated.tv_sec * USEC_PER_SEC + st->last_updated.tv_usec; - next_store_ut = (st->last_updated.tv_sec + st->update_every) * USEC_PER_SEC; - } + // set a fake last_updated to jump to current time + rrdset_init_last_updated_time(st); + + last_stored_ut = st->last_updated.tv_sec * USEC_PER_SEC + st->last_updated.tv_usec; + next_store_ut = (st->last_updated.tv_sec + st->update_every) * USEC_PER_SEC; - if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_STORE_FIRST))) { + if(unlikely(rrdset_flags & RRDSET_FLAG_STORE_FIRST)) { store_this_entry = 1; last_collect_ut = next_store_ut - update_every_ut; @@ -1589,7 +1601,7 @@ after_first_database_work: uint32_t has_reset_value = 0; size_t rda_slots = dictionary_entries(st->rrddim_root_index); - struct rda_item *rda_base = rrdset_thread_rda(&rda_slots); + struct rda_item *rda_base = rrdset_thread_rda_get(&rda_slots); size_t dim_id; size_t dimensions = 0; @@ -1915,6 +1927,8 @@ after_second_database_work: ); } + netdata_spinlock_unlock(&st->data_collection_lock); + // ALL DONE ABOUT THE DATA UPDATE // -------------------------------------------------------------------- @@ -1946,29 +1960,29 @@ after_second_database_work: store_metric_collection_completed(); } -time_t rrdset_set_update_every(RRDSET *st, time_t update_every) { +time_t rrdset_set_update_every_s(RRDSET *st, time_t update_every_s) { internal_error(true, "RRDSET '%s' switching update every from %d to %d", - rrdset_id(st), (int)st->update_every, (int)update_every); + rrdset_id(st), (int)st->update_every, (int)update_every_s); - time_t prev_update_every = st->update_every; - st->update_every = update_every; + time_t prev_update_every_s = st->update_every; + st->update_every = update_every_s; // switch update every to the storage engine RRDDIM *rd; rrddim_foreach_read(rd, st) { for (size_t tier = 0; tier < storage_tiers; tier++) { - if (rd->tiers[tier] && rd->tiers[tier]->db_collection_handle) - rd->tiers[tier]->collect_ops->change_collection_frequency(rd->tiers[tier]->db_collection_handle, (int)(st->rrdhost->db[tier].tier_grouping * st->update_every)); + if (rd->tiers[tier].db_collection_handle) + rd->tiers[tier].collect_ops->change_collection_frequency(rd->tiers[tier].db_collection_handle, (int)(st->rrdhost->db[tier].tier_grouping * st->update_every)); } - assert(rd->update_every == prev_update_every && + assert(rd->update_every == prev_update_every_s && "chart's update every differs from the update every of its dimensions"); rd->update_every = st->update_every; } rrddim_foreach_done(rd); - return prev_update_every; + return prev_update_every_s; } // ---------------------------------------------------------------------------- @@ -2016,8 +2030,8 @@ struct rrdset_map_save_v019 { size_t counter; // NEEDS TO BE UPDATED - maintained on load size_t counter_done; // ignored union { // - time_t last_accessed_time; // ignored - time_t last_entry_t; // ignored + time_t last_accessed_time_s; // ignored + time_t last_entry_s; // ignored }; // time_t upstream_resync_time; // ignored void *plugin_name; // ignored @@ -2064,6 +2078,13 @@ const char *rrdset_cache_filename(RRDSET *st) { return st_on_file->cache_filename; } +const char *rrdset_cache_dir(RRDSET *st) { + if(!st->cache_dir) + st->cache_dir = rrdhost_cache_dir_for_rrdset_alloc(st->rrdhost, rrdset_id(st)); + + return st->cache_dir; +} + void rrdset_memory_file_free(RRDSET *st) { if(!st->st_on_file) return; @@ -2071,6 +2092,7 @@ void rrdset_memory_file_free(RRDSET *st) { rrdset_memory_file_update(st); struct rrdset_map_save_v019 *st_on_file = st->st_on_file; + __atomic_sub_fetch(&rrddim_db_memory_size, st_on_file->memsize, __ATOMIC_RELAXED); netdata_munmap(st_on_file, st_on_file->memsize); // remove the pointers from the RRDDIM @@ -2093,17 +2115,15 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mo return false; char fullfilename[FILENAME_MAX + 1]; - snprintfz(fullfilename, FILENAME_MAX, "%s/main.db", st->cache_dir); + snprintfz(fullfilename, FILENAME_MAX, "%s/main.db", rrdset_cache_dir(st)); unsigned long size = sizeof(struct rrdset_map_save_v019); struct rrdset_map_save_v019 *st_on_file = (struct rrdset_map_save_v019 *)netdata_mmap( - fullfilename, size, - ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), - 0); + fullfilename, size, ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), 0, false, NULL); if(!st_on_file) return false; - time_t now = now_realtime_sec(); + time_t now_s = now_realtime_sec(); st_on_file->magic[sizeof(RRDSET_MAGIC_V019)] = '\0'; if(strcmp(st_on_file->magic, RRDSET_MAGIC_V019) != 0) { @@ -2122,13 +2142,13 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mo error("File '%s' does not have the desired granularity. Clearing it.", fullfilename); memset(st_on_file, 0, size); } - else if((now - st_on_file->last_updated.tv_sec) > st->update_every * st->entries) { + else if((now_s - st_on_file->last_updated.tv_sec) > st->update_every * st->entries) { info("File '%s' is too old. Clearing it.", fullfilename); memset(st_on_file, 0, size); } - else if(st_on_file->last_updated.tv_sec > now + st->update_every) { - error("File '%s' refers to the future by %zd secs. Resetting it to now.", fullfilename, (ssize_t)(st_on_file->last_updated.tv_sec - now)); - st_on_file->last_updated.tv_sec = now; + else if(st_on_file->last_updated.tv_sec > now_s + st->update_every) { + error("File '%s' refers to the future by %zd secs. Resetting it to now.", fullfilename, (ssize_t)(st_on_file->last_updated.tv_sec - now_s)); + st_on_file->last_updated.tv_sec = now_s; } if(st_on_file->current_entry >= st_on_file->entries) @@ -2169,5 +2189,6 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mo // copy the useful values back to st_on_file rrdset_memory_file_update(st); + __atomic_add_fetch(&rrddim_db_memory_size, st_on_file->memsize, __ATOMIC_RELAXED); return true; } diff --git a/database/rrdsetvar.c b/database/rrdsetvar.c index 22cf8a1f0..15377ddb2 100644 --- a/database/rrdsetvar.c +++ b/database/rrdsetvar.c @@ -43,7 +43,7 @@ static inline void rrdsetvar_free_rrdvars_unsafe(RRDSET *st, RRDSETVAR *rs) { // ------------------------------------------------------------------------ // HOST - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rrdvar_release_and_del(host->rrdvars, rs->rrdvar_host_chart_id); rs->rrdvar_host_chart_id = NULL; @@ -93,7 +93,7 @@ static inline void rrdsetvar_update_rrdvars_unsafe(RRDSET *st, RRDSETVAR *rs) { // ------------------------------------------------------------------------ // HOST - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rs->rrdvar_host_chart_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id, rs->type, options, rs->value); rs->rrdvar_host_chart_name = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_name, rs->type, options, rs->value); } @@ -189,7 +189,8 @@ static void rrdsetvar_delete_callback(const DICTIONARY_ITEM *item __maybe_unused void rrdsetvar_index_init(RRDSET *st) { if(!st->rrdsetvar_root_index) { - st->rrdsetvar_root_index = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + st->rrdsetvar_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDSETVAR)); dictionary_register_insert_callback(st->rrdsetvar_root_index, rrdsetvar_insert_callback, NULL); dictionary_register_conflict_callback(st->rrdsetvar_root_index, rrdsetvar_conflict_callback, NULL); diff --git a/database/rrdvar.c b/database/rrdvar.c index 28be4f6a1..72decbd46 100644 --- a/database/rrdvar.c +++ b/database/rrdvar.c @@ -84,7 +84,8 @@ static void rrdvar_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v } DICTIONARY *rrdvariables_create(void) { - DICTIONARY *dict = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + DICTIONARY *dict = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdhealth, sizeof(RRDVAR)); dictionary_register_insert_callback(dict, rrdvar_insert_callback, NULL); dictionary_register_delete_callback(dict, rrdvar_delete_callback, NULL); diff --git a/database/sqlite/sqlite3.c b/database/sqlite/sqlite3.c index 296de3e74..d5fb13d0f 100644 --- a/database/sqlite/sqlite3.c +++ b/database/sqlite/sqlite3.c @@ -1,6 +1,6 @@ /****************************************************************************** ** This file is an amalgamation of many separate C source files from SQLite -** version 3.38.5. By combining all the individual C code files into this +** version 3.40.1. By combining all the individual C code files into this ** single large file, the entire code can be compiled as a single translation ** unit. This allows many compilers to do optimizations that would not be ** possible if the files were compiled separately. Performance improvements @@ -27,7 +27,6 @@ #define SQLITE_ENABLE_UPDATE_DELETE_LIMIT 1 #define SQLITE_OMIT_LOAD_EXTENSION 1 #define SQLITE_ENABLE_DBSTAT_VTAB 1 - /************** Begin file sqliteInt.h ***************************************/ /* ** 2001 September 15 @@ -458,9 +457,9 @@ extern "C" { ** [sqlite3_libversion_number()], [sqlite3_sourceid()], ** [sqlite_version()] and [sqlite_source_id()]. */ -#define SQLITE_VERSION "3.38.5" -#define SQLITE_VERSION_NUMBER 3038005 -#define SQLITE_SOURCE_ID "2022-05-06 15:25:27 78d9c993d404cdfaa7fdd2973fa1052e3da9f66215cff9c5540ebe55c407d9fe" +#define SQLITE_VERSION "3.40.1" +#define SQLITE_VERSION_NUMBER 3040001 +#define SQLITE_SOURCE_ID "2022-12-28 14:03:47 df5c253c0b3dd24916e4ec7cf77d3db5294cc9fd45ae7b9c5e82ad8197f38a24" /* ** CAPI3REF: Run-Time Library Version Numbers @@ -982,13 +981,17 @@ SQLITE_API int sqlite3_exec( ** ** SQLite uses one of these integer values as the second ** argument to calls it makes to the xLock() and xUnlock() methods -** of an [sqlite3_io_methods] object. +** of an [sqlite3_io_methods] object. These values are ordered from +** lest restrictive to most restrictive. +** +** The argument to xLock() is always SHARED or higher. The argument to +** xUnlock is either SHARED or NONE. */ -#define SQLITE_LOCK_NONE 0 -#define SQLITE_LOCK_SHARED 1 -#define SQLITE_LOCK_RESERVED 2 -#define SQLITE_LOCK_PENDING 3 -#define SQLITE_LOCK_EXCLUSIVE 4 +#define SQLITE_LOCK_NONE 0 /* xUnlock() only */ +#define SQLITE_LOCK_SHARED 1 /* xLock() or xUnlock() */ +#define SQLITE_LOCK_RESERVED 2 /* xLock() only */ +#define SQLITE_LOCK_PENDING 3 /* xLock() only */ +#define SQLITE_LOCK_EXCLUSIVE 4 /* xLock() only */ /* ** CAPI3REF: Synchronization Type Flags @@ -1066,7 +1069,14 @@ struct sqlite3_file { **
  • [SQLITE_LOCK_PENDING], or **
  • [SQLITE_LOCK_EXCLUSIVE]. ** -** xLock() increases the lock. xUnlock() decreases the lock. +** xLock() upgrades the database file lock. In other words, xLock() moves the +** database file lock in the direction NONE toward EXCLUSIVE. The argument to +** xLock() is always on of SHARED, RESERVED, PENDING, or EXCLUSIVE, never +** SQLITE_LOCK_NONE. If the database file lock is already at or above the +** requested lock, then the call to xLock() is a no-op. +** xUnlock() downgrades the database file lock to either SHARED or NONE. +* If the lock is already at or below the requested lock state, then the call +** to xUnlock() is a no-op. ** The xCheckReservedLock() method checks whether any database connection, ** either in this process or in some other process, is holding a RESERVED, ** PENDING, or EXCLUSIVE lock on the file. It returns true @@ -1171,9 +1181,8 @@ struct sqlite3_io_methods { ** opcode causes the xFileControl method to write the current state of ** the lock (one of [SQLITE_LOCK_NONE], [SQLITE_LOCK_SHARED], ** [SQLITE_LOCK_RESERVED], [SQLITE_LOCK_PENDING], or [SQLITE_LOCK_EXCLUSIVE]) -** into an integer that the pArg argument points to. This capability -** is used during testing and is only available when the SQLITE_TEST -** compile-time option is used. +** into an integer that the pArg argument points to. +** This capability is only available if SQLite is compiled with [SQLITE_DEBUG]. ** **
  • [[SQLITE_FCNTL_SIZE_HINT]] ** The [SQLITE_FCNTL_SIZE_HINT] opcode is used by SQLite to give the VFS @@ -1494,6 +1503,12 @@ struct sqlite3_io_methods { ** **
  • [[SQLITE_FCNTL_CKSM_FILE]] ** Used by the cksmvfs VFS module only. +** +**
  • [[SQLITE_FCNTL_RESET_CACHE]] +** If there is currently no transaction open on the database, and the +** database is not a temp db, then this file-control purges the contents +** of the in-memory page cache. If there is an open transaction, or if +** the db is a temp-db, it is a no-op, not an error. ** */ #define SQLITE_FCNTL_LOCKSTATE 1 @@ -1536,6 +1551,7 @@ struct sqlite3_io_methods { #define SQLITE_FCNTL_CKPT_START 39 #define SQLITE_FCNTL_EXTERNAL_READER 40 #define SQLITE_FCNTL_CKSM_FILE 41 +#define SQLITE_FCNTL_RESET_CACHE 42 /* deprecated names */ #define SQLITE_GET_LOCKPROXYFILE SQLITE_FCNTL_GET_LOCKPROXYFILE @@ -1565,6 +1581,26 @@ typedef struct sqlite3_mutex sqlite3_mutex; */ typedef struct sqlite3_api_routines sqlite3_api_routines; +/* +** CAPI3REF: File Name +** +** Type [sqlite3_filename] is used by SQLite to pass filenames to the +** xOpen method of a [VFS]. It may be cast to (const char*) and treated +** as a normal, nul-terminated, UTF-8 buffer containing the filename, but +** may also be passed to special APIs such as: +** +**
      +**
    • sqlite3_filename_database() +**
    • sqlite3_filename_journal() +**
    • sqlite3_filename_wal() +**
    • sqlite3_uri_parameter() +**
    • sqlite3_uri_boolean() +**
    • sqlite3_uri_int64() +**
    • sqlite3_uri_key() +**
    +*/ +typedef const char *sqlite3_filename; + /* ** CAPI3REF: OS Interface Object ** @@ -1743,7 +1779,7 @@ struct sqlite3_vfs { sqlite3_vfs *pNext; /* Next registered VFS */ const char *zName; /* Name of this virtual file system */ void *pAppData; /* Pointer to application-specific data */ - int (*xOpen)(sqlite3_vfs*, const char *zName, sqlite3_file*, + int (*xOpen)(sqlite3_vfs*, sqlite3_filename zName, sqlite3_file*, int flags, int *pOutFlags); int (*xDelete)(sqlite3_vfs*, const char *zName, int syncDir); int (*xAccess)(sqlite3_vfs*, const char *zName, int flags, int *pResOut); @@ -2621,6 +2657,7 @@ struct sqlite3_mem_methods { **
      **
    • The [PRAGMA writable_schema=ON] statement. **
    • The [PRAGMA journal_mode=OFF] statement. +**
    • The [PRAGMA schema_version=N] statement. **
    • Writes to the [sqlite_dbpage] virtual table. **
    • Direct writes to [shadow tables]. **
    @@ -3736,6 +3773,9 @@ SQLITE_API void sqlite3_progress_handler(sqlite3*, int, int(*)(void*), void*); **
    The database is opened [shared cache] enabled, overriding ** the default shared cache setting provided by ** [sqlite3_enable_shared_cache()].)^ +** The [use of shared cache mode is discouraged] and hence shared cache +** capabilities may be omitted from many builds of SQLite. In such cases, +** this option is a no-op. ** ** ^(
    [SQLITE_OPEN_PRIVATECACHE]
    **
    The database is opened [shared cache] disabled, overriding @@ -3751,7 +3791,7 @@ SQLITE_API void sqlite3_progress_handler(sqlite3*, int, int(*)(void*), void*); ** to return an extended result code.
    ** ** [[OPEN_NOFOLLOW]] ^(
    [SQLITE_OPEN_NOFOLLOW]
    -**
    The database filename is not allowed to be a symbolic link
    +**
    The database filename is not allowed to contain a symbolic link
    ** )^ ** ** If the 3rd parameter to sqlite3_open_v2() is not one of the @@ -4010,10 +4050,10 @@ SQLITE_API int sqlite3_open_v2( ** ** See the [URI filename] documentation for additional information. */ -SQLITE_API const char *sqlite3_uri_parameter(const char *zFilename, const char *zParam); -SQLITE_API int sqlite3_uri_boolean(const char *zFile, const char *zParam, int bDefault); -SQLITE_API sqlite3_int64 sqlite3_uri_int64(const char*, const char*, sqlite3_int64); -SQLITE_API const char *sqlite3_uri_key(const char *zFilename, int N); +SQLITE_API const char *sqlite3_uri_parameter(sqlite3_filename z, const char *zParam); +SQLITE_API int sqlite3_uri_boolean(sqlite3_filename z, const char *zParam, int bDefault); +SQLITE_API sqlite3_int64 sqlite3_uri_int64(sqlite3_filename, const char*, sqlite3_int64); +SQLITE_API const char *sqlite3_uri_key(sqlite3_filename z, int N); /* ** CAPI3REF: Translate filenames @@ -4042,9 +4082,9 @@ SQLITE_API const char *sqlite3_uri_key(const char *zFilename, int N); ** return value from [sqlite3_db_filename()], then the result is ** undefined and is likely a memory access violation. */ -SQLITE_API const char *sqlite3_filename_database(const char*); -SQLITE_API const char *sqlite3_filename_journal(const char*); -SQLITE_API const char *sqlite3_filename_wal(const char*); +SQLITE_API const char *sqlite3_filename_database(sqlite3_filename); +SQLITE_API const char *sqlite3_filename_journal(sqlite3_filename); +SQLITE_API const char *sqlite3_filename_wal(sqlite3_filename); /* ** CAPI3REF: Database File Corresponding To A Journal @@ -4110,14 +4150,14 @@ SQLITE_API sqlite3_file *sqlite3_database_file_object(const char*); ** then the corresponding [sqlite3_module.xClose() method should also be ** invoked prior to calling sqlite3_free_filename(Y). */ -SQLITE_API char *sqlite3_create_filename( +SQLITE_API sqlite3_filename sqlite3_create_filename( const char *zDatabase, const char *zJournal, const char *zWal, int nParam, const char **azParam ); -SQLITE_API void sqlite3_free_filename(char*); +SQLITE_API void sqlite3_free_filename(sqlite3_filename); /* ** CAPI3REF: Error Codes And Messages @@ -5820,6 +5860,16 @@ SQLITE_API SQLITE_DEPRECATED int sqlite3_memory_alarm(void(*)(void*,sqlite3_int6 ** then the conversion is performed. Otherwise no conversion occurs. ** The [SQLITE_INTEGER | datatype] after conversion is returned.)^ ** +** ^(The sqlite3_value_encoding(X) interface returns one of [SQLITE_UTF8], +** [SQLITE_UTF16BE], or [SQLITE_UTF16LE] according to the current encoding +** of the value X, assuming that X has type TEXT.)^ If sqlite3_value_type(X) +** returns something other than SQLITE_TEXT, then the return value from +** sqlite3_value_encoding(X) is meaningless. ^Calls to +** sqlite3_value_text(X), sqlite3_value_text16(X), sqlite3_value_text16be(X), +** sqlite3_value_text16le(X), sqlite3_value_bytes(X), or +** sqlite3_value_bytes16(X) might change the encoding of the value X and +** thus change the return from subsequent calls to sqlite3_value_encoding(X). +** ** ^Within the [xUpdate] method of a [virtual table], the ** sqlite3_value_nochange(X) interface returns true if and only if ** the column corresponding to X is unchanged by the UPDATE operation @@ -5884,6 +5934,7 @@ SQLITE_API int sqlite3_value_type(sqlite3_value*); SQLITE_API int sqlite3_value_numeric_type(sqlite3_value*); SQLITE_API int sqlite3_value_nochange(sqlite3_value*); SQLITE_API int sqlite3_value_frombind(sqlite3_value*); +SQLITE_API int sqlite3_value_encoding(sqlite3_value*); /* ** CAPI3REF: Finding The Subtype Of SQL Values @@ -5905,7 +5956,8 @@ SQLITE_API unsigned int sqlite3_value_subtype(sqlite3_value*); ** object D and returns a pointer to that copy. ^The [sqlite3_value] returned ** is a [protected sqlite3_value] object even if the input is not. ** ^The sqlite3_value_dup(V) interface returns NULL if V is NULL or if a -** memory allocation fails. +** memory allocation fails. ^If V is a [pointer value], then the result +** of sqlite3_value_dup(V) is a NULL value. ** ** ^The sqlite3_value_free(V) interface frees an [sqlite3_value] object ** previously obtained from [sqlite3_value_dup()]. ^If V is a NULL pointer @@ -5936,7 +5988,7 @@ SQLITE_API void sqlite3_value_free(sqlite3_value*); ** ** ^The sqlite3_aggregate_context(C,N) routine returns a NULL pointer ** when first called if N is less than or equal to zero or if a memory -** allocate error occurs. +** allocation error occurs. ** ** ^(The amount of space allocated by sqlite3_aggregate_context(C,N) is ** determined by the N parameter on first successful call. Changing the @@ -6141,9 +6193,10 @@ typedef void (*sqlite3_destructor_type)(void*); ** of [SQLITE_UTF8], [SQLITE_UTF16], [SQLITE_UTF16BE], or [SQLITE_UTF16LE]. ** ^SQLite takes the text result from the application from ** the 2nd parameter of the sqlite3_result_text* interfaces. -** ^If the 3rd parameter to the sqlite3_result_text* interfaces -** is negative, then SQLite takes result text from the 2nd parameter -** through the first zero character. +** ^If the 3rd parameter to any of the sqlite3_result_text* interfaces +** other than sqlite3_result_text64() is negative, then SQLite computes +** the string length itself by searching the 2nd parameter for the first +** zero character. ** ^If the 3rd parameter to the sqlite3_result_text* interfaces ** is non-negative, then as many bytes (not characters) of the text ** pointed to by the 2nd parameter are taken as the application-defined @@ -6587,6 +6640,28 @@ SQLITE_API int sqlite3_get_autocommit(sqlite3*); */ SQLITE_API sqlite3 *sqlite3_db_handle(sqlite3_stmt*); +/* +** CAPI3REF: Return The Schema Name For A Database Connection +** METHOD: sqlite3 +** +** ^The sqlite3_db_name(D,N) interface returns a pointer to the schema name +** for the N-th database on database connection D, or a NULL pointer of N is +** out of range. An N value of 0 means the main database file. An N of 1 is +** the "temp" schema. Larger values of N correspond to various ATTACH-ed +** databases. +** +** Space to hold the string that is returned by sqlite3_db_name() is managed +** by SQLite itself. The string might be deallocated by any operation that +** changes the schema, including [ATTACH] or [DETACH] or calls to +** [sqlite3_serialize()] or [sqlite3_deserialize()], even operations that +** occur on a different thread. Applications that need to +** remember the string long-term should make their own copy. Applications that +** are accessing the same database connection simultaneously on multiple +** threads should mutex-protect calls to this API and should make their own +** private copy of the result prior to releasing the mutex. +*/ +SQLITE_API const char *sqlite3_db_name(sqlite3 *db, int N); + /* ** CAPI3REF: Return The Filename For A Database Connection ** METHOD: sqlite3 @@ -6617,7 +6692,7 @@ SQLITE_API sqlite3 *sqlite3_db_handle(sqlite3_stmt*); **
  • [sqlite3_filename_wal()] ** */ -SQLITE_API const char *sqlite3_db_filename(sqlite3 *db, const char *zDbName); +SQLITE_API sqlite3_filename sqlite3_db_filename(sqlite3 *db, const char *zDbName); /* ** CAPI3REF: Determine if a database is read-only @@ -6754,7 +6829,7 @@ SQLITE_API void *sqlite3_rollback_hook(sqlite3*, void(*)(void *), void*); ** function C that is invoked prior to each autovacuum of the database ** file. ^The callback is passed a copy of the generic data pointer (P), ** the schema-name of the attached database that is being autovacuumed, -** the the size of the database file in pages, the number of free pages, +** the size of the database file in pages, the number of free pages, ** and the number of bytes per page, respectively. The callback should ** return the number of free pages that should be removed by the ** autovacuum. ^If the callback returns zero, then no autovacuum happens. @@ -6875,6 +6950,11 @@ SQLITE_API void *sqlite3_update_hook( ** to the same database. Sharing is enabled if the argument is true ** and disabled if the argument is false.)^ ** +** This interface is omitted if SQLite is compiled with +** [-DSQLITE_OMIT_SHARED_CACHE]. The [-DSQLITE_OMIT_SHARED_CACHE] +** compile-time option is recommended because the +** [use of shared cache mode is discouraged]. +** ** ^Cache sharing is enabled and disabled for an entire process. ** This is a change as of SQLite [version 3.5.0] ([dateof:3.5.0]). ** In prior versions of SQLite, @@ -6973,7 +7053,7 @@ SQLITE_API int sqlite3_db_release_memory(sqlite3*); ** ^The soft heap limit may not be greater than the hard heap limit. ** ^If the hard heap limit is enabled and if sqlite3_soft_heap_limit(N) ** is invoked with a value of N that is greater than the hard heap limit, -** the the soft heap limit is set to the value of the hard heap limit. +** the soft heap limit is set to the value of the hard heap limit. ** ^The soft heap limit is automatically enabled whenever the hard heap ** limit is enabled. ^When sqlite3_hard_heap_limit64(N) is invoked and ** the soft heap limit is outside the range of 1..N, then the soft heap @@ -9268,7 +9348,7 @@ typedef struct sqlite3_backup sqlite3_backup; ** if the application incorrectly accesses the destination [database connection] ** and so no error code is reported, but the operations may malfunction ** nevertheless. Use of the destination database connection while a -** backup is in progress might also also cause a mutex deadlock. +** backup is in progress might also cause a mutex deadlock. ** ** If running in [shared cache mode], the application must ** guarantee that the shared cache used by the destination database @@ -9696,7 +9776,7 @@ SQLITE_API int sqlite3_wal_checkpoint_v2( */ #define SQLITE_CHECKPOINT_PASSIVE 0 /* Do as much as possible w/o blocking */ #define SQLITE_CHECKPOINT_FULL 1 /* Wait for writers, then checkpoint */ -#define SQLITE_CHECKPOINT_RESTART 2 /* Like FULL but wait for for readers */ +#define SQLITE_CHECKPOINT_RESTART 2 /* Like FULL but wait for readers */ #define SQLITE_CHECKPOINT_TRUNCATE 3 /* Like RESTART but also truncate WAL */ /* @@ -9866,8 +9946,8 @@ SQLITE_API SQLITE_EXPERIMENTAL const char *sqlite3_vtab_collation(sqlite3_index_ ** of a [virtual table] implementation. The result of calling this ** interface from outside of xBestIndex() is undefined and probably harmful. ** -** ^The sqlite3_vtab_distinct() interface returns an integer that is -** either 0, 1, or 2. The integer returned by sqlite3_vtab_distinct() +** ^The sqlite3_vtab_distinct() interface returns an integer between 0 and +** 3. The integer returned by sqlite3_vtab_distinct() ** gives the virtual table additional information about how the query ** planner wants the output to be ordered. As long as the virtual table ** can meet the ordering requirements of the query planner, it may set @@ -9899,6 +9979,13 @@ SQLITE_API SQLITE_EXPERIMENTAL const char *sqlite3_vtab_collation(sqlite3_index_ ** that have the same value for all columns identified by "aOrderBy". ** ^However omitting the extra rows is optional. ** This mode is used for a DISTINCT query. +**
  • +** ^(If the sqlite3_vtab_distinct() interface returns 3, that means +** that the query planner needs only distinct rows but it does need the +** rows to be sorted.)^ ^The virtual table implementation is free to omit +** rows that are identical in all aOrderBy columns, if it wants to, but +** it is not required to omit any rows. This mode is used for queries +** that have both DISTINCT and ORDER BY clauses. ** ** ** ^For the purposes of comparing virtual table output values to see if the @@ -13120,12 +13207,17 @@ struct fts5_api { /************** End of sqlite3.h *********************************************/ /************** Continuing where we left off in sqliteInt.h ******************/ +/* +** Reuse the STATIC_LRU for mutex access to sqlite3_temp_directory. +*/ +#define SQLITE_MUTEX_STATIC_TEMPDIR SQLITE_MUTEX_STATIC_VFS1 + /* ** Include the configuration header output by 'configure' if we're using the ** autoconf-based build */ #if defined(_HAVE_SQLITE_CONFIG_H) && !defined(SQLITECONFIG_H) -#include "config.h" +#include "sqlite_cfg.h" #define SQLITECONFIG_H 1 #endif @@ -14363,8 +14455,19 @@ typedef INT16_TYPE LogEst; /* ** Round up a number to the next larger multiple of 8. This is used ** to force 8-byte alignment on 64-bit architectures. +** +** ROUND8() always does the rounding, for any argument. +** +** ROUND8P() assumes that the argument is already an integer number of +** pointers in size, and so it is a no-op on systems where the pointer +** size is 8. */ #define ROUND8(x) (((x)+7)&~7) +#if SQLITE_PTRSIZE==8 +# define ROUND8P(x) (x) +#else +# define ROUND8P(x) (((x)+7)&~7) +#endif /* ** Round down to the nearest multiple of 8 @@ -14427,22 +14530,23 @@ typedef INT16_TYPE LogEst; #endif /* -** SELECTTRACE_ENABLED will be either 1 or 0 depending on whether or not -** the Select query generator tracing logic is turned on. +** TREETRACE_ENABLED will be either 1 or 0 depending on whether or not +** the Abstract Syntax Tree tracing logic is turned on. */ #if !defined(SQLITE_AMALGAMATION) -SQLITE_PRIVATE u32 sqlite3SelectTrace; +SQLITE_PRIVATE u32 sqlite3TreeTrace; #endif #if defined(SQLITE_DEBUG) \ - && (defined(SQLITE_TEST) || defined(SQLITE_ENABLE_SELECTTRACE)) -# define SELECTTRACE_ENABLED 1 + && (defined(SQLITE_TEST) || defined(SQLITE_ENABLE_SELECTTRACE) \ + || defined(SQLITE_ENABLE_TREETRACE)) +# define TREETRACE_ENABLED 1 # define SELECTTRACE(K,P,S,X) \ - if(sqlite3SelectTrace&(K)) \ + if(sqlite3TreeTrace&(K)) \ sqlite3DebugPrintf("%u/%d/%p: ",(S)->selId,(P)->addrExplain,(S)),\ sqlite3DebugPrintf X #else # define SELECTTRACE(K,P,S,X) -# define SELECTTRACE_ENABLED 0 +# define TREETRACE_ENABLED 0 #endif /* @@ -14527,7 +14631,7 @@ struct BusyHandler { ** pointer will work here as long as it is distinct from SQLITE_STATIC ** and SQLITE_TRANSIENT. */ -#define SQLITE_DYNAMIC ((sqlite3_destructor_type)sqlite3OomFault) +#define SQLITE_DYNAMIC ((sqlite3_destructor_type)sqlite3OomClear) /* ** When SQLITE_OMIT_WSD is defined, it means that the target platform does @@ -14596,6 +14700,7 @@ typedef struct FuncDef FuncDef; typedef struct FuncDefHash FuncDefHash; typedef struct IdList IdList; typedef struct Index Index; +typedef struct IndexedExpr IndexedExpr; typedef struct IndexSample IndexSample; typedef struct KeyClass KeyClass; typedef struct KeyInfo KeyInfo; @@ -14603,6 +14708,7 @@ typedef struct Lookaside Lookaside; typedef struct LookasideSlot LookasideSlot; typedef struct Module Module; typedef struct NameContext NameContext; +typedef struct OnOrUsing OnOrUsing; typedef struct Parse Parse; typedef struct ParseCleanup ParseCleanup; typedef struct PreUpdate PreUpdate; @@ -14660,6 +14766,7 @@ typedef struct With With; #define MASKBIT32(n) (((unsigned int)1)<<(n)) #define SMASKBIT32(n) ((n)<=31?((unsigned int)1)<<(n):0) #define ALLBITS ((Bitmask)-1) +#define TOPBIT (((Bitmask)1)<<(BMS-1)) /* A VList object records a mapping between parameters/variables/wildcards ** in the SQL statement (such as $abc, @pqr, or :xyz) and the integer @@ -14674,6 +14781,331 @@ typedef int VList; ** "BusyHandler" typedefs. vdbe.h also requires a few of the opaque ** pointer types (i.e. FuncDef) defined above. */ +/************** Include os.h in the middle of sqliteInt.h ********************/ +/************** Begin file os.h **********************************************/ +/* +** 2001 September 16 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This header file (together with is companion C source-code file +** "os.c") attempt to abstract the underlying operating system so that +** the SQLite library will work on both POSIX and windows systems. +** +** This header file is #include-ed by sqliteInt.h and thus ends up +** being included by every source file. +*/ +#ifndef _SQLITE_OS_H_ +#define _SQLITE_OS_H_ + +/* +** Attempt to automatically detect the operating system and setup the +** necessary pre-processor macros for it. +*/ +/************** Include os_setup.h in the middle of os.h *********************/ +/************** Begin file os_setup.h ****************************************/ +/* +** 2013 November 25 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This file contains pre-processor directives related to operating system +** detection and/or setup. +*/ +#ifndef SQLITE_OS_SETUP_H +#define SQLITE_OS_SETUP_H + +/* +** Figure out if we are dealing with Unix, Windows, or some other operating +** system. +** +** After the following block of preprocess macros, all of +** +** SQLITE_OS_KV +** SQLITE_OS_OTHER +** SQLITE_OS_UNIX +** SQLITE_OS_WIN +** +** will defined to either 1 or 0. One of them will be 1. The others will be 0. +** If none of the macros are initially defined, then select either +** SQLITE_OS_UNIX or SQLITE_OS_WIN depending on the target platform. +** +** If SQLITE_OS_OTHER=1 is specified at compile-time, then the application +** must provide its own VFS implementation together with sqlite3_os_init() +** and sqlite3_os_end() routines. +*/ +#if !defined(SQLITE_OS_KV) && !defined(SQLITE_OS_OTHER) && \ + !defined(SQLITE_OS_UNIX) && !defined(SQLITE_OS_WIN) +# if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \ + defined(__MINGW32__) || defined(__BORLANDC__) +# define SQLITE_OS_WIN 1 +# define SQLITE_OS_UNIX 0 +# else +# define SQLITE_OS_WIN 0 +# define SQLITE_OS_UNIX 1 +# endif +#endif +#if SQLITE_OS_OTHER+1>1 +# undef SQLITE_OS_KV +# define SQLITE_OS_KV 0 +# undef SQLITE_OS_UNIX +# define SQLITE_OS_UNIX 0 +# undef SQLITE_OS_WIN +# define SQLITE_OS_WIN 0 +#endif +#if SQLITE_OS_KV+1>1 +# undef SQLITE_OS_OTHER +# define SQLITE_OS_OTHER 0 +# undef SQLITE_OS_UNIX +# define SQLITE_OS_UNIX 0 +# undef SQLITE_OS_WIN +# define SQLITE_OS_WIN 0 +# define SQLITE_OMIT_LOAD_EXTENSION 1 +# define SQLITE_OMIT_WAL 1 +# define SQLITE_OMIT_DEPRECATED 1 +# undef SQLITE_TEMP_STORE +# define SQLITE_TEMP_STORE 3 /* Always use memory for temporary storage */ +# define SQLITE_DQS 0 +# define SQLITE_OMIT_SHARED_CACHE 1 +# define SQLITE_OMIT_AUTOINIT 1 +#endif +#if SQLITE_OS_UNIX+1>1 +# undef SQLITE_OS_KV +# define SQLITE_OS_KV 0 +# undef SQLITE_OS_OTHER +# define SQLITE_OS_OTHER 0 +# undef SQLITE_OS_WIN +# define SQLITE_OS_WIN 0 +#endif +#if SQLITE_OS_WIN+1>1 +# undef SQLITE_OS_KV +# define SQLITE_OS_KV 0 +# undef SQLITE_OS_OTHER +# define SQLITE_OS_OTHER 0 +# undef SQLITE_OS_UNIX +# define SQLITE_OS_UNIX 0 +#endif + + +#endif /* SQLITE_OS_SETUP_H */ + +/************** End of os_setup.h ********************************************/ +/************** Continuing where we left off in os.h *************************/ + +/* If the SET_FULLSYNC macro is not defined above, then make it +** a no-op +*/ +#ifndef SET_FULLSYNC +# define SET_FULLSYNC(x,y) +#endif + +/* Maximum pathname length. Note: FILENAME_MAX defined by stdio.h +*/ +#ifndef SQLITE_MAX_PATHLEN +# define SQLITE_MAX_PATHLEN FILENAME_MAX +#endif + +/* Maximum number of symlinks that will be resolved while trying to +** expand a filename in xFullPathname() in the VFS. +*/ +#ifndef SQLITE_MAX_SYMLINK +# define SQLITE_MAX_SYMLINK 200 +#endif + +/* +** The default size of a disk sector +*/ +#ifndef SQLITE_DEFAULT_SECTOR_SIZE +# define SQLITE_DEFAULT_SECTOR_SIZE 4096 +#endif + +/* +** Temporary files are named starting with this prefix followed by 16 random +** alphanumeric characters, and no file extension. They are stored in the +** OS's standard temporary file directory, and are deleted prior to exit. +** If sqlite is being embedded in another program, you may wish to change the +** prefix to reflect your program's name, so that if your program exits +** prematurely, old temporary files can be easily identified. This can be done +** using -DSQLITE_TEMP_FILE_PREFIX=myprefix_ on the compiler command line. +** +** 2006-10-31: The default prefix used to be "sqlite_". But then +** Mcafee started using SQLite in their anti-virus product and it +** started putting files with the "sqlite" name in the c:/temp folder. +** This annoyed many windows users. Those users would then do a +** Google search for "sqlite", find the telephone numbers of the +** developers and call to wake them up at night and complain. +** For this reason, the default name prefix is changed to be "sqlite" +** spelled backwards. So the temp files are still identified, but +** anybody smart enough to figure out the code is also likely smart +** enough to know that calling the developer will not help get rid +** of the file. +*/ +#ifndef SQLITE_TEMP_FILE_PREFIX +# define SQLITE_TEMP_FILE_PREFIX "etilqs_" +#endif + +/* +** The following values may be passed as the second argument to +** sqlite3OsLock(). The various locks exhibit the following semantics: +** +** SHARED: Any number of processes may hold a SHARED lock simultaneously. +** RESERVED: A single process may hold a RESERVED lock on a file at +** any time. Other processes may hold and obtain new SHARED locks. +** PENDING: A single process may hold a PENDING lock on a file at +** any one time. Existing SHARED locks may persist, but no new +** SHARED locks may be obtained by other processes. +** EXCLUSIVE: An EXCLUSIVE lock precludes all other locks. +** +** PENDING_LOCK may not be passed directly to sqlite3OsLock(). Instead, a +** process that requests an EXCLUSIVE lock may actually obtain a PENDING +** lock. This can be upgraded to an EXCLUSIVE lock by a subsequent call to +** sqlite3OsLock(). +*/ +#define NO_LOCK 0 +#define SHARED_LOCK 1 +#define RESERVED_LOCK 2 +#define PENDING_LOCK 3 +#define EXCLUSIVE_LOCK 4 + +/* +** File Locking Notes: (Mostly about windows but also some info for Unix) +** +** We cannot use LockFileEx() or UnlockFileEx() on Win95/98/ME because +** those functions are not available. So we use only LockFile() and +** UnlockFile(). +** +** LockFile() prevents not just writing but also reading by other processes. +** A SHARED_LOCK is obtained by locking a single randomly-chosen +** byte out of a specific range of bytes. The lock byte is obtained at +** random so two separate readers can probably access the file at the +** same time, unless they are unlucky and choose the same lock byte. +** An EXCLUSIVE_LOCK is obtained by locking all bytes in the range. +** There can only be one writer. A RESERVED_LOCK is obtained by locking +** a single byte of the file that is designated as the reserved lock byte. +** A PENDING_LOCK is obtained by locking a designated byte different from +** the RESERVED_LOCK byte. +** +** On WinNT/2K/XP systems, LockFileEx() and UnlockFileEx() are available, +** which means we can use reader/writer locks. When reader/writer locks +** are used, the lock is placed on the same range of bytes that is used +** for probabilistic locking in Win95/98/ME. Hence, the locking scheme +** will support two or more Win95 readers or two or more WinNT readers. +** But a single Win95 reader will lock out all WinNT readers and a single +** WinNT reader will lock out all other Win95 readers. +** +** The following #defines specify the range of bytes used for locking. +** SHARED_SIZE is the number of bytes available in the pool from which +** a random byte is selected for a shared lock. The pool of bytes for +** shared locks begins at SHARED_FIRST. +** +** The same locking strategy and +** byte ranges are used for Unix. This leaves open the possibility of having +** clients on win95, winNT, and unix all talking to the same shared file +** and all locking correctly. To do so would require that samba (or whatever +** tool is being used for file sharing) implements locks correctly between +** windows and unix. I'm guessing that isn't likely to happen, but by +** using the same locking range we are at least open to the possibility. +** +** Locking in windows is manditory. For this reason, we cannot store +** actual data in the bytes used for locking. The pager never allocates +** the pages involved in locking therefore. SHARED_SIZE is selected so +** that all locks will fit on a single page even at the minimum page size. +** PENDING_BYTE defines the beginning of the locks. By default PENDING_BYTE +** is set high so that we don't have to allocate an unused page except +** for very large databases. But one should test the page skipping logic +** by setting PENDING_BYTE low and running the entire regression suite. +** +** Changing the value of PENDING_BYTE results in a subtly incompatible +** file format. Depending on how it is changed, you might not notice +** the incompatibility right away, even running a full regression test. +** The default location of PENDING_BYTE is the first byte past the +** 1GB boundary. +** +*/ +#ifdef SQLITE_OMIT_WSD +# define PENDING_BYTE (0x40000000) +#else +# define PENDING_BYTE sqlite3PendingByte +#endif +#define RESERVED_BYTE (PENDING_BYTE+1) +#define SHARED_FIRST (PENDING_BYTE+2) +#define SHARED_SIZE 510 + +/* +** Wrapper around OS specific sqlite3_os_init() function. +*/ +SQLITE_PRIVATE int sqlite3OsInit(void); + +/* +** Functions for accessing sqlite3_file methods +*/ +SQLITE_PRIVATE void sqlite3OsClose(sqlite3_file*); +SQLITE_PRIVATE int sqlite3OsRead(sqlite3_file*, void*, int amt, i64 offset); +SQLITE_PRIVATE int sqlite3OsWrite(sqlite3_file*, const void*, int amt, i64 offset); +SQLITE_PRIVATE int sqlite3OsTruncate(sqlite3_file*, i64 size); +SQLITE_PRIVATE int sqlite3OsSync(sqlite3_file*, int); +SQLITE_PRIVATE int sqlite3OsFileSize(sqlite3_file*, i64 *pSize); +SQLITE_PRIVATE int sqlite3OsLock(sqlite3_file*, int); +SQLITE_PRIVATE int sqlite3OsUnlock(sqlite3_file*, int); +SQLITE_PRIVATE int sqlite3OsCheckReservedLock(sqlite3_file *id, int *pResOut); +SQLITE_PRIVATE int sqlite3OsFileControl(sqlite3_file*,int,void*); +SQLITE_PRIVATE void sqlite3OsFileControlHint(sqlite3_file*,int,void*); +#define SQLITE_FCNTL_DB_UNCHANGED 0xca093fa0 +SQLITE_PRIVATE int sqlite3OsSectorSize(sqlite3_file *id); +SQLITE_PRIVATE int sqlite3OsDeviceCharacteristics(sqlite3_file *id); +#ifndef SQLITE_OMIT_WAL +SQLITE_PRIVATE int sqlite3OsShmMap(sqlite3_file *,int,int,int,void volatile **); +SQLITE_PRIVATE int sqlite3OsShmLock(sqlite3_file *id, int, int, int); +SQLITE_PRIVATE void sqlite3OsShmBarrier(sqlite3_file *id); +SQLITE_PRIVATE int sqlite3OsShmUnmap(sqlite3_file *id, int); +#endif /* SQLITE_OMIT_WAL */ +SQLITE_PRIVATE int sqlite3OsFetch(sqlite3_file *id, i64, int, void **); +SQLITE_PRIVATE int sqlite3OsUnfetch(sqlite3_file *, i64, void *); + + +/* +** Functions for accessing sqlite3_vfs methods +*/ +SQLITE_PRIVATE int sqlite3OsOpen(sqlite3_vfs *, const char *, sqlite3_file*, int, int *); +SQLITE_PRIVATE int sqlite3OsDelete(sqlite3_vfs *, const char *, int); +SQLITE_PRIVATE int sqlite3OsAccess(sqlite3_vfs *, const char *, int, int *pResOut); +SQLITE_PRIVATE int sqlite3OsFullPathname(sqlite3_vfs *, const char *, int, char *); +#ifndef SQLITE_OMIT_LOAD_EXTENSION +SQLITE_PRIVATE void *sqlite3OsDlOpen(sqlite3_vfs *, const char *); +SQLITE_PRIVATE void sqlite3OsDlError(sqlite3_vfs *, int, char *); +SQLITE_PRIVATE void (*sqlite3OsDlSym(sqlite3_vfs *, void *, const char *))(void); +SQLITE_PRIVATE void sqlite3OsDlClose(sqlite3_vfs *, void *); +#endif /* SQLITE_OMIT_LOAD_EXTENSION */ +SQLITE_PRIVATE int sqlite3OsRandomness(sqlite3_vfs *, int, char *); +SQLITE_PRIVATE int sqlite3OsSleep(sqlite3_vfs *, int); +SQLITE_PRIVATE int sqlite3OsGetLastError(sqlite3_vfs*); +SQLITE_PRIVATE int sqlite3OsCurrentTimeInt64(sqlite3_vfs *, sqlite3_int64*); + +/* +** Convenience functions for opening and closing files using +** sqlite3_malloc() to obtain space for the file-handle structure. +*/ +SQLITE_PRIVATE int sqlite3OsOpenMalloc(sqlite3_vfs *, const char *, sqlite3_file **, int,int*); +SQLITE_PRIVATE void sqlite3OsCloseFree(sqlite3_file *); + +#endif /* _SQLITE_OS_H_ */ + +/************** End of os.h **************************************************/ +/************** Continuing where we left off in sqliteInt.h ******************/ /************** Include pager.h in the middle of sqliteInt.h *****************/ /************** Begin file pager.h *******************************************/ /* @@ -14721,14 +15153,15 @@ typedef struct Pager Pager; typedef struct PgHdr DbPage; /* -** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is +** Page number PAGER_SJ_PGNO is never used in an SQLite database (it is ** reserved for working around a windows/posix incompatibility). It is ** used in the journal to signify that the remainder of the journal file ** is devoted to storing a super-journal name - there are no more pages to ** roll back. See comments for function writeSuperJournal() in pager.c ** for details. */ -#define PAGER_MJ_PGNO(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1)) +#define PAGER_SJ_PGNO_COMPUTED(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1)) +#define PAGER_SJ_PGNO(x) ((x)->lckPgno) /* ** Allowed values for the flags parameter to sqlite3PagerOpen(). @@ -15293,6 +15726,8 @@ SQLITE_PRIVATE int sqlite3BtreeCheckpoint(Btree*, int, int *, int *); SQLITE_PRIVATE int sqlite3BtreeTransferRow(BtCursor*, BtCursor*, i64); +SQLITE_PRIVATE void sqlite3BtreeClearCache(Btree*); + /* ** If we are not using shared cache, then there is no need to ** use mutexes to access the BtShared structures. So make the @@ -15405,7 +15840,6 @@ struct VdbeOp { #ifdef SQLITE_ENABLE_CURSOR_HINTS Expr *pExpr; /* Used when p4type is P4_EXPR */ #endif - int (*xAdvance)(BtCursor *, int); } p4; #ifdef SQLITE_ENABLE_EXPLAIN_COMMENTS char *zComment; /* Comment to improve readability */ @@ -15456,21 +15890,19 @@ typedef struct VdbeOpList VdbeOpList; #define P4_COLLSEQ (-2) /* P4 is a pointer to a CollSeq structure */ #define P4_INT32 (-3) /* P4 is a 32-bit signed integer */ #define P4_SUBPROGRAM (-4) /* P4 is a pointer to a SubProgram structure */ -#define P4_ADVANCE (-5) /* P4 is a pointer to BtreeNext() or BtreePrev() */ -#define P4_TABLE (-6) /* P4 is a pointer to a Table structure */ +#define P4_TABLE (-5) /* P4 is a pointer to a Table structure */ /* Above do not own any resources. Must free those below */ -#define P4_FREE_IF_LE (-7) -#define P4_DYNAMIC (-7) /* Pointer to memory from sqliteMalloc() */ -#define P4_FUNCDEF (-8) /* P4 is a pointer to a FuncDef structure */ -#define P4_KEYINFO (-9) /* P4 is a pointer to a KeyInfo structure */ -#define P4_EXPR (-10) /* P4 is a pointer to an Expr tree */ -#define P4_MEM (-11) /* P4 is a pointer to a Mem* structure */ -#define P4_VTAB (-12) /* P4 is a pointer to an sqlite3_vtab structure */ -#define P4_REAL (-13) /* P4 is a 64-bit floating point value */ -#define P4_INT64 (-14) /* P4 is a 64-bit signed integer */ -#define P4_INTARRAY (-15) /* P4 is a vector of 32-bit integers */ -#define P4_FUNCCTX (-16) /* P4 is a pointer to an sqlite3_context object */ -#define P4_DYNBLOB (-17) /* Pointer to memory from sqliteMalloc() */ +#define P4_FREE_IF_LE (-6) +#define P4_DYNAMIC (-6) /* Pointer to memory from sqliteMalloc() */ +#define P4_FUNCDEF (-7) /* P4 is a pointer to a FuncDef structure */ +#define P4_KEYINFO (-8) /* P4 is a pointer to a KeyInfo structure */ +#define P4_EXPR (-9) /* P4 is a pointer to an Expr tree */ +#define P4_MEM (-10) /* P4 is a pointer to a Mem* structure */ +#define P4_VTAB (-11) /* P4 is a pointer to an sqlite3_vtab structure */ +#define P4_REAL (-12) /* P4 is a 64-bit floating point value */ +#define P4_INT64 (-13) /* P4 is a 64-bit signed integer */ +#define P4_INTARRAY (-14) /* P4 is a vector of 32-bit integers */ +#define P4_FUNCCTX (-15) /* P4 is a pointer to an sqlite3_context object */ /* Error message codes for OP_Halt */ #define P5_ConstraintNotNull 1 @@ -15515,53 +15947,53 @@ typedef struct VdbeOpList VdbeOpList; #define OP_Savepoint 0 #define OP_AutoCommit 1 #define OP_Transaction 2 -#define OP_SorterNext 3 /* jump */ -#define OP_Prev 4 /* jump */ -#define OP_Next 5 /* jump */ -#define OP_Checkpoint 6 -#define OP_JournalMode 7 -#define OP_Vacuum 8 -#define OP_VFilter 9 /* jump, synopsis: iplan=r[P3] zplan='P4' */ -#define OP_VUpdate 10 /* synopsis: data=r[P3@P2] */ -#define OP_Goto 11 /* jump */ -#define OP_Gosub 12 /* jump */ -#define OP_InitCoroutine 13 /* jump */ -#define OP_Yield 14 /* jump */ -#define OP_MustBeInt 15 /* jump */ -#define OP_Jump 16 /* jump */ -#define OP_Once 17 /* jump */ -#define OP_If 18 /* jump */ +#define OP_Checkpoint 3 +#define OP_JournalMode 4 +#define OP_Vacuum 5 +#define OP_VFilter 6 /* jump, synopsis: iplan=r[P3] zplan='P4' */ +#define OP_VUpdate 7 /* synopsis: data=r[P3@P2] */ +#define OP_Init 8 /* jump, synopsis: Start at P2 */ +#define OP_Goto 9 /* jump */ +#define OP_Gosub 10 /* jump */ +#define OP_InitCoroutine 11 /* jump */ +#define OP_Yield 12 /* jump */ +#define OP_MustBeInt 13 /* jump */ +#define OP_Jump 14 /* jump */ +#define OP_Once 15 /* jump */ +#define OP_If 16 /* jump */ +#define OP_IfNot 17 /* jump */ +#define OP_IsType 18 /* jump, synopsis: if typeof(P1.P3) in P5 goto P2 */ #define OP_Not 19 /* same as TK_NOT, synopsis: r[P2]= !r[P1] */ -#define OP_IfNot 20 /* jump */ -#define OP_IsNullOrType 21 /* jump, synopsis: if typeof(r[P1]) IN (P3,5) goto P2 */ -#define OP_IfNullRow 22 /* jump, synopsis: if P1.nullRow then r[P3]=NULL, goto P2 */ -#define OP_SeekLT 23 /* jump, synopsis: key=r[P3@P4] */ -#define OP_SeekLE 24 /* jump, synopsis: key=r[P3@P4] */ -#define OP_SeekGE 25 /* jump, synopsis: key=r[P3@P4] */ -#define OP_SeekGT 26 /* jump, synopsis: key=r[P3@P4] */ -#define OP_IfNotOpen 27 /* jump, synopsis: if( !csr[P1] ) goto P2 */ -#define OP_IfNoHope 28 /* jump, synopsis: key=r[P3@P4] */ -#define OP_NoConflict 29 /* jump, synopsis: key=r[P3@P4] */ -#define OP_NotFound 30 /* jump, synopsis: key=r[P3@P4] */ -#define OP_Found 31 /* jump, synopsis: key=r[P3@P4] */ -#define OP_SeekRowid 32 /* jump, synopsis: intkey=r[P3] */ -#define OP_NotExists 33 /* jump, synopsis: intkey=r[P3] */ -#define OP_Last 34 /* jump */ -#define OP_IfSmaller 35 /* jump */ -#define OP_SorterSort 36 /* jump */ -#define OP_Sort 37 /* jump */ -#define OP_Rewind 38 /* jump */ -#define OP_IdxLE 39 /* jump, synopsis: key=r[P3@P4] */ -#define OP_IdxGT 40 /* jump, synopsis: key=r[P3@P4] */ -#define OP_IdxLT 41 /* jump, synopsis: key=r[P3@P4] */ -#define OP_IdxGE 42 /* jump, synopsis: key=r[P3@P4] */ +#define OP_IfNullRow 20 /* jump, synopsis: if P1.nullRow then r[P3]=NULL, goto P2 */ +#define OP_SeekLT 21 /* jump, synopsis: key=r[P3@P4] */ +#define OP_SeekLE 22 /* jump, synopsis: key=r[P3@P4] */ +#define OP_SeekGE 23 /* jump, synopsis: key=r[P3@P4] */ +#define OP_SeekGT 24 /* jump, synopsis: key=r[P3@P4] */ +#define OP_IfNotOpen 25 /* jump, synopsis: if( !csr[P1] ) goto P2 */ +#define OP_IfNoHope 26 /* jump, synopsis: key=r[P3@P4] */ +#define OP_NoConflict 27 /* jump, synopsis: key=r[P3@P4] */ +#define OP_NotFound 28 /* jump, synopsis: key=r[P3@P4] */ +#define OP_Found 29 /* jump, synopsis: key=r[P3@P4] */ +#define OP_SeekRowid 30 /* jump, synopsis: intkey=r[P3] */ +#define OP_NotExists 31 /* jump, synopsis: intkey=r[P3] */ +#define OP_Last 32 /* jump */ +#define OP_IfSmaller 33 /* jump */ +#define OP_SorterSort 34 /* jump */ +#define OP_Sort 35 /* jump */ +#define OP_Rewind 36 /* jump */ +#define OP_SorterNext 37 /* jump */ +#define OP_Prev 38 /* jump */ +#define OP_Next 39 /* jump */ +#define OP_IdxLE 40 /* jump, synopsis: key=r[P3@P4] */ +#define OP_IdxGT 41 /* jump, synopsis: key=r[P3@P4] */ +#define OP_IdxLT 42 /* jump, synopsis: key=r[P3@P4] */ #define OP_Or 43 /* same as TK_OR, synopsis: r[P3]=(r[P1] || r[P2]) */ #define OP_And 44 /* same as TK_AND, synopsis: r[P3]=(r[P1] && r[P2]) */ -#define OP_RowSetRead 45 /* jump, synopsis: r[P3]=rowset(P1) */ -#define OP_RowSetTest 46 /* jump, synopsis: if r[P3] in rowset(P1) goto P2 */ -#define OP_Program 47 /* jump */ -#define OP_FkIfZero 48 /* jump, synopsis: if fkctr[P1]==0 goto P2 */ -#define OP_IfPos 49 /* jump, synopsis: if r[P1]>0 then r[P1]-=P3, goto P2 */ +#define OP_IdxGE 45 /* jump, synopsis: key=r[P3@P4] */ +#define OP_RowSetRead 46 /* jump, synopsis: r[P3]=rowset(P1) */ +#define OP_RowSetTest 47 /* jump, synopsis: if r[P3] in rowset(P1) goto P2 */ +#define OP_Program 48 /* jump */ +#define OP_FkIfZero 49 /* jump, synopsis: if fkctr[P1]==0 goto P2 */ #define OP_IsNull 50 /* jump, same as TK_ISNULL, synopsis: if r[P1]==NULL goto P2 */ #define OP_NotNull 51 /* jump, same as TK_NOTNULL, synopsis: if r[P1]!=NULL goto P2 */ #define OP_Ne 52 /* jump, same as TK_NE, synopsis: IF r[P3]!=r[P1] */ @@ -15571,12 +16003,12 @@ typedef struct VdbeOpList VdbeOpList; #define OP_Lt 56 /* jump, same as TK_LT, synopsis: IF r[P3]=r[P1] */ #define OP_ElseEq 58 /* jump, same as TK_ESCAPE */ -#define OP_IfNotZero 59 /* jump, synopsis: if r[P1]!=0 then r[P1]--, goto P2 */ -#define OP_DecrJumpZero 60 /* jump, synopsis: if (--r[P1])==0 goto P2 */ -#define OP_IncrVacuum 61 /* jump */ -#define OP_VNext 62 /* jump */ -#define OP_Filter 63 /* jump, synopsis: if key(P3@P4) not in filter(P1) goto P2 */ -#define OP_Init 64 /* jump, synopsis: Start at P2 */ +#define OP_IfPos 59 /* jump, synopsis: if r[P1]>0 then r[P1]-=P3, goto P2 */ +#define OP_IfNotZero 60 /* jump, synopsis: if r[P1]!=0 then r[P1]--, goto P2 */ +#define OP_DecrJumpZero 61 /* jump, synopsis: if (--r[P1])==0 goto P2 */ +#define OP_IncrVacuum 62 /* jump */ +#define OP_VNext 63 /* jump */ +#define OP_Filter 64 /* jump, synopsis: if key(P3@P4) not in filter(P1) goto P2 */ #define OP_PureFunc 65 /* synopsis: r[P3]=func(r[P2@NP]) */ #define OP_Function 66 /* synopsis: r[P3]=func(r[P2@NP]) */ #define OP_Return 67 @@ -15586,34 +16018,34 @@ typedef struct VdbeOpList VdbeOpList; #define OP_Integer 71 /* synopsis: r[P2]=P1 */ #define OP_Int64 72 /* synopsis: r[P2]=P4 */ #define OP_String 73 /* synopsis: r[P2]='P4' (len=P1) */ -#define OP_Null 74 /* synopsis: r[P2..P3]=NULL */ -#define OP_SoftNull 75 /* synopsis: r[P1]=NULL */ -#define OP_Blob 76 /* synopsis: r[P2]=P4 (len=P1) */ -#define OP_Variable 77 /* synopsis: r[P2]=parameter(P1,P4) */ -#define OP_Move 78 /* synopsis: r[P2@P3]=r[P1@P3] */ -#define OP_Copy 79 /* synopsis: r[P2@P3+1]=r[P1@P3+1] */ -#define OP_SCopy 80 /* synopsis: r[P2]=r[P1] */ -#define OP_IntCopy 81 /* synopsis: r[P2]=r[P1] */ -#define OP_FkCheck 82 -#define OP_ResultRow 83 /* synopsis: output=r[P1@P2] */ -#define OP_CollSeq 84 -#define OP_AddImm 85 /* synopsis: r[P1]=r[P1]+P2 */ -#define OP_RealAffinity 86 -#define OP_Cast 87 /* synopsis: affinity(r[P1]) */ -#define OP_Permutation 88 -#define OP_Compare 89 /* synopsis: r[P1@P3] <-> r[P2@P3] */ -#define OP_IsTrue 90 /* synopsis: r[P2] = coalesce(r[P1]==TRUE,P3) ^ P4 */ -#define OP_ZeroOrNull 91 /* synopsis: r[P2] = 0 OR NULL */ -#define OP_Offset 92 /* synopsis: r[P3] = sqlite_offset(P1) */ -#define OP_Column 93 /* synopsis: r[P3]=PX */ -#define OP_TypeCheck 94 /* synopsis: typecheck(r[P1@P2]) */ -#define OP_Affinity 95 /* synopsis: affinity(r[P1@P2]) */ -#define OP_MakeRecord 96 /* synopsis: r[P3]=mkrec(r[P1@P2]) */ -#define OP_Count 97 /* synopsis: r[P2]=count() */ -#define OP_ReadCookie 98 -#define OP_SetCookie 99 -#define OP_ReopenIdx 100 /* synopsis: root=P2 iDb=P3 */ -#define OP_OpenRead 101 /* synopsis: root=P2 iDb=P3 */ +#define OP_BeginSubrtn 74 /* synopsis: r[P2]=NULL */ +#define OP_Null 75 /* synopsis: r[P2..P3]=NULL */ +#define OP_SoftNull 76 /* synopsis: r[P1]=NULL */ +#define OP_Blob 77 /* synopsis: r[P2]=P4 (len=P1) */ +#define OP_Variable 78 /* synopsis: r[P2]=parameter(P1,P4) */ +#define OP_Move 79 /* synopsis: r[P2@P3]=r[P1@P3] */ +#define OP_Copy 80 /* synopsis: r[P2@P3+1]=r[P1@P3+1] */ +#define OP_SCopy 81 /* synopsis: r[P2]=r[P1] */ +#define OP_IntCopy 82 /* synopsis: r[P2]=r[P1] */ +#define OP_FkCheck 83 +#define OP_ResultRow 84 /* synopsis: output=r[P1@P2] */ +#define OP_CollSeq 85 +#define OP_AddImm 86 /* synopsis: r[P1]=r[P1]+P2 */ +#define OP_RealAffinity 87 +#define OP_Cast 88 /* synopsis: affinity(r[P1]) */ +#define OP_Permutation 89 +#define OP_Compare 90 /* synopsis: r[P1@P3] <-> r[P2@P3] */ +#define OP_IsTrue 91 /* synopsis: r[P2] = coalesce(r[P1]==TRUE,P3) ^ P4 */ +#define OP_ZeroOrNull 92 /* synopsis: r[P2] = 0 OR NULL */ +#define OP_Offset 93 /* synopsis: r[P3] = sqlite_offset(P1) */ +#define OP_Column 94 /* synopsis: r[P3]=PX cursor P1 column P2 */ +#define OP_TypeCheck 95 /* synopsis: typecheck(r[P1@P2]) */ +#define OP_Affinity 96 /* synopsis: affinity(r[P1@P2]) */ +#define OP_MakeRecord 97 /* synopsis: r[P3]=mkrec(r[P1@P2]) */ +#define OP_Count 98 /* synopsis: r[P2]=count() */ +#define OP_ReadCookie 99 +#define OP_SetCookie 100 +#define OP_ReopenIdx 101 /* synopsis: root=P2 iDb=P3 */ #define OP_BitAnd 102 /* same as TK_BITAND, synopsis: r[P3]=r[P1]&r[P2] */ #define OP_BitOr 103 /* same as TK_BITOR, synopsis: r[P3]=r[P1]|r[P2] */ #define OP_ShiftLeft 104 /* same as TK_LSHIFT, synopsis: r[P3]=r[P2]<0 then r[P2]=r[P1]+max(0,r[P3]) else r[P2]=(-1) */ -#define OP_AggInverse 160 /* synopsis: accum=r[P3] inverse(r[P2@P5]) */ -#define OP_AggStep 161 /* synopsis: accum=r[P3] step(r[P2@P5]) */ -#define OP_AggStep1 162 /* synopsis: accum=r[P3] step(r[P2@P5]) */ -#define OP_AggValue 163 /* synopsis: r[P3]=value N=P2 */ -#define OP_AggFinal 164 /* synopsis: accum=r[P1] N=P2 */ -#define OP_Expire 165 -#define OP_CursorLock 166 -#define OP_CursorUnlock 167 -#define OP_TableLock 168 /* synopsis: iDb=P1 root=P2 write=P3 */ -#define OP_VBegin 169 -#define OP_VCreate 170 -#define OP_VDestroy 171 -#define OP_VOpen 172 -#define OP_VInitIn 173 /* synopsis: r[P2]=ValueList(P1,P3) */ -#define OP_VColumn 174 /* synopsis: r[P3]=vcolumn(P2) */ -#define OP_VRename 175 -#define OP_Pagecount 176 -#define OP_MaxPgcnt 177 -#define OP_FilterAdd 178 /* synopsis: filter(P1) += key(P3@P4) */ -#define OP_Trace 179 -#define OP_CursorHint 180 -#define OP_ReleaseReg 181 /* synopsis: release r[P1@P2] mask P3 */ -#define OP_Noop 182 -#define OP_Explain 183 -#define OP_Abortable 184 +#define OP_DropTrigger 154 +#define OP_IntegrityCk 155 +#define OP_RowSetAdd 156 /* synopsis: rowset(P1)=r[P2] */ +#define OP_Param 157 +#define OP_FkCounter 158 /* synopsis: fkctr[P1]+=P2 */ +#define OP_MemMax 159 /* synopsis: r[P1]=max(r[P1],r[P2]) */ +#define OP_OffsetLimit 160 /* synopsis: if r[P1]>0 then r[P2]=r[P1]+max(0,r[P3]) else r[P2]=(-1) */ +#define OP_AggInverse 161 /* synopsis: accum=r[P3] inverse(r[P2@P5]) */ +#define OP_AggStep 162 /* synopsis: accum=r[P3] step(r[P2@P5]) */ +#define OP_AggStep1 163 /* synopsis: accum=r[P3] step(r[P2@P5]) */ +#define OP_AggValue 164 /* synopsis: r[P3]=value N=P2 */ +#define OP_AggFinal 165 /* synopsis: accum=r[P1] N=P2 */ +#define OP_Expire 166 +#define OP_CursorLock 167 +#define OP_CursorUnlock 168 +#define OP_TableLock 169 /* synopsis: iDb=P1 root=P2 write=P3 */ +#define OP_VBegin 170 +#define OP_VCreate 171 +#define OP_VDestroy 172 +#define OP_VOpen 173 +#define OP_VInitIn 174 /* synopsis: r[P2]=ValueList(P1,P3) */ +#define OP_VColumn 175 /* synopsis: r[P3]=vcolumn(P2) */ +#define OP_VRename 176 +#define OP_Pagecount 177 +#define OP_MaxPgcnt 178 +#define OP_ClrSubtype 179 /* synopsis: r[P1].subtype = 0 */ +#define OP_FilterAdd 180 /* synopsis: filter(P1) += key(P3@P4) */ +#define OP_Trace 181 +#define OP_CursorHint 182 +#define OP_ReleaseReg 183 /* synopsis: release r[P1@P2] mask P3 */ +#define OP_Noop 184 +#define OP_Explain 185 +#define OP_Abortable 186 /* Properties such as "out2" or "jump" that are specified in ** comments following the "case" for each opcode in the vdbe.c @@ -15709,30 +16143,30 @@ typedef struct VdbeOpList VdbeOpList; #define OPFLG_OUT2 0x10 /* out2: P2 is an output */ #define OPFLG_OUT3 0x20 /* out3: P3 is an output */ #define OPFLG_INITIALIZER {\ -/* 0 */ 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x00, 0x10,\ -/* 8 */ 0x00, 0x01, 0x00, 0x01, 0x01, 0x01, 0x03, 0x03,\ -/* 16 */ 0x01, 0x01, 0x03, 0x12, 0x03, 0x03, 0x01, 0x09,\ -/* 24 */ 0x09, 0x09, 0x09, 0x01, 0x09, 0x09, 0x09, 0x09,\ -/* 32 */ 0x09, 0x09, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\ -/* 40 */ 0x01, 0x01, 0x01, 0x26, 0x26, 0x23, 0x0b, 0x01,\ -/* 48 */ 0x01, 0x03, 0x03, 0x03, 0x0b, 0x0b, 0x0b, 0x0b,\ -/* 56 */ 0x0b, 0x0b, 0x01, 0x03, 0x03, 0x01, 0x01, 0x01,\ +/* 0 */ 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x01, 0x00,\ +/* 8 */ 0x01, 0x01, 0x01, 0x01, 0x03, 0x03, 0x01, 0x01,\ +/* 16 */ 0x03, 0x03, 0x01, 0x12, 0x01, 0x09, 0x09, 0x09,\ +/* 24 */ 0x09, 0x01, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,\ +/* 32 */ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\ +/* 40 */ 0x01, 0x01, 0x01, 0x26, 0x26, 0x01, 0x23, 0x0b,\ +/* 48 */ 0x01, 0x01, 0x03, 0x03, 0x0b, 0x0b, 0x0b, 0x0b,\ +/* 56 */ 0x0b, 0x0b, 0x01, 0x03, 0x03, 0x03, 0x01, 0x01,\ /* 64 */ 0x01, 0x00, 0x00, 0x02, 0x02, 0x08, 0x00, 0x10,\ -/* 72 */ 0x10, 0x10, 0x10, 0x00, 0x10, 0x10, 0x00, 0x00,\ -/* 80 */ 0x10, 0x10, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02,\ -/* 88 */ 0x00, 0x00, 0x12, 0x1e, 0x20, 0x00, 0x00, 0x00,\ -/* 96 */ 0x00, 0x10, 0x10, 0x00, 0x00, 0x00, 0x26, 0x26,\ +/* 72 */ 0x10, 0x10, 0x00, 0x10, 0x00, 0x10, 0x10, 0x00,\ +/* 80 */ 0x00, 0x10, 0x10, 0x00, 0x00, 0x00, 0x02, 0x02,\ +/* 88 */ 0x02, 0x00, 0x00, 0x12, 0x1e, 0x20, 0x00, 0x00,\ +/* 96 */ 0x00, 0x00, 0x10, 0x10, 0x00, 0x00, 0x26, 0x26,\ /* 104 */ 0x26, 0x26, 0x26, 0x26, 0x26, 0x26, 0x26, 0x26,\ /* 112 */ 0x00, 0x00, 0x12, 0x00, 0x00, 0x10, 0x00, 0x00,\ -/* 120 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,\ -/* 128 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00,\ -/* 136 */ 0x00, 0x04, 0x04, 0x00, 0x00, 0x10, 0x00, 0x10,\ -/* 144 */ 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,\ -/* 152 */ 0x00, 0x10, 0x00, 0x06, 0x10, 0x00, 0x04, 0x1a,\ -/* 160 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\ -/* 168 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,\ -/* 176 */ 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\ -/* 184 */ 0x00,} +/* 120 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10,\ +/* 128 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,\ +/* 136 */ 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x10, 0x00,\ +/* 144 */ 0x10, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00,\ +/* 152 */ 0x00, 0x10, 0x00, 0x00, 0x06, 0x10, 0x00, 0x04,\ +/* 160 */ 0x1a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\ +/* 168 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00,\ +/* 176 */ 0x00, 0x10, 0x10, 0x02, 0x00, 0x00, 0x00, 0x00,\ +/* 184 */ 0x00, 0x00, 0x00,} /* The resolve3P2Values() routine is able to run faster if it knows ** the value of the largest JUMP opcode. The smaller the maximum @@ -15778,8 +16212,10 @@ SQLITE_PRIVATE void sqlite3VdbeVerifyNoResultRow(Vdbe *p); #endif #if defined(SQLITE_DEBUG) SQLITE_PRIVATE void sqlite3VdbeVerifyAbortable(Vdbe *p, int); +SQLITE_PRIVATE void sqlite3VdbeNoJumpsOutsideSubrtn(Vdbe*,int,int,int); #else # define sqlite3VdbeVerifyAbortable(A,B) +# define sqlite3VdbeNoJumpsOutsideSubrtn(A,B,C,D) #endif SQLITE_PRIVATE VdbeOp *sqlite3VdbeAddOpList(Vdbe*, int nOp, VdbeOpList const *aOp,int iLineno); #ifndef SQLITE_OMIT_EXPLAIN @@ -15806,6 +16242,7 @@ SQLITE_PRIVATE void sqlite3VdbeChangeP1(Vdbe*, int addr, int P1); SQLITE_PRIVATE void sqlite3VdbeChangeP2(Vdbe*, int addr, int P2); SQLITE_PRIVATE void sqlite3VdbeChangeP3(Vdbe*, int addr, int P3); SQLITE_PRIVATE void sqlite3VdbeChangeP5(Vdbe*, u16 P5); +SQLITE_PRIVATE void sqlite3VdbeTypeofColumn(Vdbe*, int); SQLITE_PRIVATE void sqlite3VdbeJumpHere(Vdbe*, int addr); SQLITE_PRIVATE void sqlite3VdbeJumpHereOrPopInst(Vdbe*, int addr); SQLITE_PRIVATE int sqlite3VdbeChangeToNoop(Vdbe*, int addr); @@ -15820,11 +16257,11 @@ SQLITE_PRIVATE void sqlite3VdbeAppendP4(Vdbe*, void *pP4, int p4type); SQLITE_PRIVATE void sqlite3VdbeSetP4KeyInfo(Parse*, Index*); SQLITE_PRIVATE void sqlite3VdbeUsesBtree(Vdbe*, int); SQLITE_PRIVATE VdbeOp *sqlite3VdbeGetOp(Vdbe*, int); +SQLITE_PRIVATE VdbeOp *sqlite3VdbeGetLastOp(Vdbe*); SQLITE_PRIVATE int sqlite3VdbeMakeLabel(Parse*); SQLITE_PRIVATE void sqlite3VdbeRunOnlyOnce(Vdbe*); SQLITE_PRIVATE void sqlite3VdbeReusable(Vdbe*); SQLITE_PRIVATE void sqlite3VdbeDelete(Vdbe*); -SQLITE_PRIVATE void sqlite3VdbeClearObject(sqlite3*,Vdbe*); SQLITE_PRIVATE void sqlite3VdbeMakeReady(Vdbe*,Parse*); SQLITE_PRIVATE int sqlite3VdbeFinalize(Vdbe*); SQLITE_PRIVATE void sqlite3VdbeResolveLabel(Vdbe*, int); @@ -16169,290 +16606,6 @@ SQLITE_PRIVATE int sqlite3PCacheIsDirty(PCache *pCache); /************** End of pcache.h **********************************************/ /************** Continuing where we left off in sqliteInt.h ******************/ -/************** Include os.h in the middle of sqliteInt.h ********************/ -/************** Begin file os.h **********************************************/ -/* -** 2001 September 16 -** -** The author disclaims copyright to this source code. In place of -** a legal notice, here is a blessing: -** -** May you do good and not evil. -** May you find forgiveness for yourself and forgive others. -** May you share freely, never taking more than you give. -** -****************************************************************************** -** -** This header file (together with is companion C source-code file -** "os.c") attempt to abstract the underlying operating system so that -** the SQLite library will work on both POSIX and windows systems. -** -** This header file is #include-ed by sqliteInt.h and thus ends up -** being included by every source file. -*/ -#ifndef _SQLITE_OS_H_ -#define _SQLITE_OS_H_ - -/* -** Attempt to automatically detect the operating system and setup the -** necessary pre-processor macros for it. -*/ -/************** Include os_setup.h in the middle of os.h *********************/ -/************** Begin file os_setup.h ****************************************/ -/* -** 2013 November 25 -** -** The author disclaims copyright to this source code. In place of -** a legal notice, here is a blessing: -** -** May you do good and not evil. -** May you find forgiveness for yourself and forgive others. -** May you share freely, never taking more than you give. -** -****************************************************************************** -** -** This file contains pre-processor directives related to operating system -** detection and/or setup. -*/ -#ifndef SQLITE_OS_SETUP_H -#define SQLITE_OS_SETUP_H - -/* -** Figure out if we are dealing with Unix, Windows, or some other operating -** system. -** -** After the following block of preprocess macros, all of SQLITE_OS_UNIX, -** SQLITE_OS_WIN, and SQLITE_OS_OTHER will defined to either 1 or 0. One of -** the three will be 1. The other two will be 0. -*/ -#if defined(SQLITE_OS_OTHER) -# if SQLITE_OS_OTHER==1 -# undef SQLITE_OS_UNIX -# define SQLITE_OS_UNIX 0 -# undef SQLITE_OS_WIN -# define SQLITE_OS_WIN 0 -# else -# undef SQLITE_OS_OTHER -# endif -#endif -#if !defined(SQLITE_OS_UNIX) && !defined(SQLITE_OS_OTHER) -# define SQLITE_OS_OTHER 0 -# ifndef SQLITE_OS_WIN -# if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \ - defined(__MINGW32__) || defined(__BORLANDC__) -# define SQLITE_OS_WIN 1 -# define SQLITE_OS_UNIX 0 -# else -# define SQLITE_OS_WIN 0 -# define SQLITE_OS_UNIX 1 -# endif -# else -# define SQLITE_OS_UNIX 0 -# endif -#else -# ifndef SQLITE_OS_WIN -# define SQLITE_OS_WIN 0 -# endif -#endif - -#endif /* SQLITE_OS_SETUP_H */ - -/************** End of os_setup.h ********************************************/ -/************** Continuing where we left off in os.h *************************/ - -/* If the SET_FULLSYNC macro is not defined above, then make it -** a no-op -*/ -#ifndef SET_FULLSYNC -# define SET_FULLSYNC(x,y) -#endif - -/* Maximum pathname length. Note: FILENAME_MAX defined by stdio.h -*/ -#ifndef SQLITE_MAX_PATHLEN -# define SQLITE_MAX_PATHLEN FILENAME_MAX -#endif - -/* -** The default size of a disk sector -*/ -#ifndef SQLITE_DEFAULT_SECTOR_SIZE -# define SQLITE_DEFAULT_SECTOR_SIZE 4096 -#endif - -/* -** Temporary files are named starting with this prefix followed by 16 random -** alphanumeric characters, and no file extension. They are stored in the -** OS's standard temporary file directory, and are deleted prior to exit. -** If sqlite is being embedded in another program, you may wish to change the -** prefix to reflect your program's name, so that if your program exits -** prematurely, old temporary files can be easily identified. This can be done -** using -DSQLITE_TEMP_FILE_PREFIX=myprefix_ on the compiler command line. -** -** 2006-10-31: The default prefix used to be "sqlite_". But then -** Mcafee started using SQLite in their anti-virus product and it -** started putting files with the "sqlite" name in the c:/temp folder. -** This annoyed many windows users. Those users would then do a -** Google search for "sqlite", find the telephone numbers of the -** developers and call to wake them up at night and complain. -** For this reason, the default name prefix is changed to be "sqlite" -** spelled backwards. So the temp files are still identified, but -** anybody smart enough to figure out the code is also likely smart -** enough to know that calling the developer will not help get rid -** of the file. -*/ -#ifndef SQLITE_TEMP_FILE_PREFIX -# define SQLITE_TEMP_FILE_PREFIX "etilqs_" -#endif - -/* -** The following values may be passed as the second argument to -** sqlite3OsLock(). The various locks exhibit the following semantics: -** -** SHARED: Any number of processes may hold a SHARED lock simultaneously. -** RESERVED: A single process may hold a RESERVED lock on a file at -** any time. Other processes may hold and obtain new SHARED locks. -** PENDING: A single process may hold a PENDING lock on a file at -** any one time. Existing SHARED locks may persist, but no new -** SHARED locks may be obtained by other processes. -** EXCLUSIVE: An EXCLUSIVE lock precludes all other locks. -** -** PENDING_LOCK may not be passed directly to sqlite3OsLock(). Instead, a -** process that requests an EXCLUSIVE lock may actually obtain a PENDING -** lock. This can be upgraded to an EXCLUSIVE lock by a subsequent call to -** sqlite3OsLock(). -*/ -#define NO_LOCK 0 -#define SHARED_LOCK 1 -#define RESERVED_LOCK 2 -#define PENDING_LOCK 3 -#define EXCLUSIVE_LOCK 4 - -/* -** File Locking Notes: (Mostly about windows but also some info for Unix) -** -** We cannot use LockFileEx() or UnlockFileEx() on Win95/98/ME because -** those functions are not available. So we use only LockFile() and -** UnlockFile(). -** -** LockFile() prevents not just writing but also reading by other processes. -** A SHARED_LOCK is obtained by locking a single randomly-chosen -** byte out of a specific range of bytes. The lock byte is obtained at -** random so two separate readers can probably access the file at the -** same time, unless they are unlucky and choose the same lock byte. -** An EXCLUSIVE_LOCK is obtained by locking all bytes in the range. -** There can only be one writer. A RESERVED_LOCK is obtained by locking -** a single byte of the file that is designated as the reserved lock byte. -** A PENDING_LOCK is obtained by locking a designated byte different from -** the RESERVED_LOCK byte. -** -** On WinNT/2K/XP systems, LockFileEx() and UnlockFileEx() are available, -** which means we can use reader/writer locks. When reader/writer locks -** are used, the lock is placed on the same range of bytes that is used -** for probabilistic locking in Win95/98/ME. Hence, the locking scheme -** will support two or more Win95 readers or two or more WinNT readers. -** But a single Win95 reader will lock out all WinNT readers and a single -** WinNT reader will lock out all other Win95 readers. -** -** The following #defines specify the range of bytes used for locking. -** SHARED_SIZE is the number of bytes available in the pool from which -** a random byte is selected for a shared lock. The pool of bytes for -** shared locks begins at SHARED_FIRST. -** -** The same locking strategy and -** byte ranges are used for Unix. This leaves open the possibility of having -** clients on win95, winNT, and unix all talking to the same shared file -** and all locking correctly. To do so would require that samba (or whatever -** tool is being used for file sharing) implements locks correctly between -** windows and unix. I'm guessing that isn't likely to happen, but by -** using the same locking range we are at least open to the possibility. -** -** Locking in windows is manditory. For this reason, we cannot store -** actual data in the bytes used for locking. The pager never allocates -** the pages involved in locking therefore. SHARED_SIZE is selected so -** that all locks will fit on a single page even at the minimum page size. -** PENDING_BYTE defines the beginning of the locks. By default PENDING_BYTE -** is set high so that we don't have to allocate an unused page except -** for very large databases. But one should test the page skipping logic -** by setting PENDING_BYTE low and running the entire regression suite. -** -** Changing the value of PENDING_BYTE results in a subtly incompatible -** file format. Depending on how it is changed, you might not notice -** the incompatibility right away, even running a full regression test. -** The default location of PENDING_BYTE is the first byte past the -** 1GB boundary. -** -*/ -#ifdef SQLITE_OMIT_WSD -# define PENDING_BYTE (0x40000000) -#else -# define PENDING_BYTE sqlite3PendingByte -#endif -#define RESERVED_BYTE (PENDING_BYTE+1) -#define SHARED_FIRST (PENDING_BYTE+2) -#define SHARED_SIZE 510 - -/* -** Wrapper around OS specific sqlite3_os_init() function. -*/ -SQLITE_PRIVATE int sqlite3OsInit(void); - -/* -** Functions for accessing sqlite3_file methods -*/ -SQLITE_PRIVATE void sqlite3OsClose(sqlite3_file*); -SQLITE_PRIVATE int sqlite3OsRead(sqlite3_file*, void*, int amt, i64 offset); -SQLITE_PRIVATE int sqlite3OsWrite(sqlite3_file*, const void*, int amt, i64 offset); -SQLITE_PRIVATE int sqlite3OsTruncate(sqlite3_file*, i64 size); -SQLITE_PRIVATE int sqlite3OsSync(sqlite3_file*, int); -SQLITE_PRIVATE int sqlite3OsFileSize(sqlite3_file*, i64 *pSize); -SQLITE_PRIVATE int sqlite3OsLock(sqlite3_file*, int); -SQLITE_PRIVATE int sqlite3OsUnlock(sqlite3_file*, int); -SQLITE_PRIVATE int sqlite3OsCheckReservedLock(sqlite3_file *id, int *pResOut); -SQLITE_PRIVATE int sqlite3OsFileControl(sqlite3_file*,int,void*); -SQLITE_PRIVATE void sqlite3OsFileControlHint(sqlite3_file*,int,void*); -#define SQLITE_FCNTL_DB_UNCHANGED 0xca093fa0 -SQLITE_PRIVATE int sqlite3OsSectorSize(sqlite3_file *id); -SQLITE_PRIVATE int sqlite3OsDeviceCharacteristics(sqlite3_file *id); -#ifndef SQLITE_OMIT_WAL -SQLITE_PRIVATE int sqlite3OsShmMap(sqlite3_file *,int,int,int,void volatile **); -SQLITE_PRIVATE int sqlite3OsShmLock(sqlite3_file *id, int, int, int); -SQLITE_PRIVATE void sqlite3OsShmBarrier(sqlite3_file *id); -SQLITE_PRIVATE int sqlite3OsShmUnmap(sqlite3_file *id, int); -#endif /* SQLITE_OMIT_WAL */ -SQLITE_PRIVATE int sqlite3OsFetch(sqlite3_file *id, i64, int, void **); -SQLITE_PRIVATE int sqlite3OsUnfetch(sqlite3_file *, i64, void *); - - -/* -** Functions for accessing sqlite3_vfs methods -*/ -SQLITE_PRIVATE int sqlite3OsOpen(sqlite3_vfs *, const char *, sqlite3_file*, int, int *); -SQLITE_PRIVATE int sqlite3OsDelete(sqlite3_vfs *, const char *, int); -SQLITE_PRIVATE int sqlite3OsAccess(sqlite3_vfs *, const char *, int, int *pResOut); -SQLITE_PRIVATE int sqlite3OsFullPathname(sqlite3_vfs *, const char *, int, char *); -#ifndef SQLITE_OMIT_LOAD_EXTENSION -SQLITE_PRIVATE void *sqlite3OsDlOpen(sqlite3_vfs *, const char *); -SQLITE_PRIVATE void sqlite3OsDlError(sqlite3_vfs *, int, char *); -SQLITE_PRIVATE void (*sqlite3OsDlSym(sqlite3_vfs *, void *, const char *))(void); -SQLITE_PRIVATE void sqlite3OsDlClose(sqlite3_vfs *, void *); -#endif /* SQLITE_OMIT_LOAD_EXTENSION */ -SQLITE_PRIVATE int sqlite3OsRandomness(sqlite3_vfs *, int, char *); -SQLITE_PRIVATE int sqlite3OsSleep(sqlite3_vfs *, int); -SQLITE_PRIVATE int sqlite3OsGetLastError(sqlite3_vfs*); -SQLITE_PRIVATE int sqlite3OsCurrentTimeInt64(sqlite3_vfs *, sqlite3_int64*); - -/* -** Convenience functions for opening and closing files using -** sqlite3_malloc() to obtain space for the file-handle structure. -*/ -SQLITE_PRIVATE int sqlite3OsOpenMalloc(sqlite3_vfs *, const char *, sqlite3_file **, int,int*); -SQLITE_PRIVATE void sqlite3OsCloseFree(sqlite3_file *); - -#endif /* _SQLITE_OS_H_ */ - -/************** End of os.h **************************************************/ -/************** Continuing where we left off in sqliteInt.h ******************/ /************** Include mutex.h in the middle of sqliteInt.h *****************/ /************** Begin file mutex.h *******************************************/ /* @@ -16698,6 +16851,7 @@ struct Lookaside { #endif /* SQLITE_OMIT_TWOSIZE_LOOKASIDE */ void *pStart; /* First byte of available memory space */ void *pEnd; /* First byte past end of available space */ + void *pTrueEnd; /* True value of pEnd, when db->pnBytesFreed!=0 */ }; struct LookasideSlot { LookasideSlot *pNext; /* Next buffer in the list of free buffers */ @@ -17039,6 +17193,10 @@ struct sqlite3 { #define SQLITE_BloomFilter 0x00080000 /* Use a Bloom filter on searches */ #define SQLITE_BloomPulldown 0x00100000 /* Run Bloom filters early */ #define SQLITE_BalancedMerge 0x00200000 /* Balance multi-way merges */ +#define SQLITE_ReleaseReg 0x00400000 /* Use OP_ReleaseReg for testing */ +#define SQLITE_FlttnUnionAll 0x00800000 /* Disable the UNION ALL flattener */ + /* TH3 expects this value ^^^^^^^^^^ See flatten04.test */ +#define SQLITE_IndexedExpr 0x01000000 /* Pull exprs from index when able */ #define SQLITE_AllOpts 0xffffffff /* All optimizations */ /* @@ -17141,7 +17299,7 @@ struct FuncDestructor { #define SQLITE_FUNC_SLOCHNG 0x2000 /* "Slow Change". Value constant during a ** single query - might change over time */ #define SQLITE_FUNC_TEST 0x4000 /* Built-in testing functions */ -#define SQLITE_FUNC_OFFSET 0x8000 /* Built-in sqlite_offset() function */ +/* 0x8000 -- available for reuse */ #define SQLITE_FUNC_WINDOW 0x00010000 /* Built-in window-only function */ #define SQLITE_FUNC_INTERNAL 0x00040000 /* For use by NestedParse() only */ #define SQLITE_FUNC_DIRECT 0x00080000 /* Not for use in TRIGGERs or VIEWs */ @@ -17158,6 +17316,7 @@ struct FuncDestructor { #define INLINEFUNC_expr_compare 3 #define INLINEFUNC_affinity 4 #define INLINEFUNC_iif 5 +#define INLINEFUNC_sqlite_offset 6 #define INLINEFUNC_unlikely 99 /* Default case */ /* @@ -17384,6 +17543,7 @@ struct Column { #define COLFLAG_NOTAVAIL 0x0080 /* STORED column not yet calculated */ #define COLFLAG_BUSY 0x0100 /* Blocks recursion on GENERATED columns */ #define COLFLAG_HASCOLL 0x0200 /* Has collating sequence name in zCnName */ +#define COLFLAG_NOEXPAND 0x0400 /* Omit this column when expanding "*" */ #define COLFLAG_GENERATED 0x0060 /* Combo: _STORED, _VIRTUAL */ #define COLFLAG_NOINSERT 0x0062 /* Combo: _HIDDEN, _STORED, _VIRTUAL */ @@ -17609,7 +17769,7 @@ struct Table { #ifndef SQLITE_OMIT_VIRTUALTABLE # define IsVirtual(X) ((X)->eTabType==TABTYP_VTAB) # define ExprIsVtab(X) \ - ((X)->op==TK_COLUMN && (X)->y.pTab!=0 && (X)->y.pTab->eTabType==TABTYP_VTAB) + ((X)->op==TK_COLUMN && (X)->y.pTab->eTabType==TABTYP_VTAB) #else # define IsVirtual(X) 0 # define ExprIsVtab(X) 0 @@ -17790,6 +17950,11 @@ struct KeyInfo { struct UnpackedRecord { KeyInfo *pKeyInfo; /* Collation and sort-order information */ Mem *aMem; /* Values */ + union { + char *z; /* Cache of aMem[0].z for vdbeRecordCompareString() */ + i64 i; /* Cache of aMem[0].u.i for vdbeRecordCompareInt() */ + } u; + int n; /* Cache of aMem[0].n used by vdbeRecordCompareString() */ u16 nField; /* Number of entries in apMem[] */ i8 default_rc; /* Comparison result if keys are equal */ u8 errCode; /* Error detected by xRecordCompare (CORRUPT or NOMEM) */ @@ -17821,10 +17986,22 @@ struct UnpackedRecord { ** The Index.onError field determines whether or not the indexed columns ** must be unique and what to do if they are not. When Index.onError=OE_None, ** it means this is not a unique index. Otherwise it is a unique index -** and the value of Index.onError indicate the which conflict resolution -** algorithm to employ whenever an attempt is made to insert a non-unique +** and the value of Index.onError indicates which conflict resolution +** algorithm to employ when an attempt is made to insert a non-unique ** element. ** +** The colNotIdxed bitmask is used in combination with SrcItem.colUsed +** for a fast test to see if an index can serve as a covering index. +** colNotIdxed has a 1 bit for every column of the original table that +** is *not* available in the index. Thus the expression +** "colUsed & colNotIdxed" will be non-zero if the index is not a +** covering index. The most significant bit of of colNotIdxed will always +** be true (note-20221022-a). If a column beyond the 63rd column of the +** table is used, the "colUsed & colNotIdxed" test will always be non-zero +** and we have to assume either that the index is not covering, or use +** an alternative (slower) algorithm to determine whether or not +** the index is covering. +** ** While parsing a CREATE TABLE or CREATE INDEX statement in order to ** generate VDBE code (as opposed to parsing one read from an sqlite_schema ** table as part of parsing an existing database schema), transient instances @@ -17860,6 +18037,8 @@ struct Index { unsigned bNoQuery:1; /* Do not use this index to optimize queries */ unsigned bAscKeyBug:1; /* True if the bba7b69f9849b5bf bug applies */ unsigned bHasVCol:1; /* Index references one or more VIRTUAL columns */ + unsigned bHasExpr:1; /* Index contains an expression, either a literal + ** expression, or a reference to a VIRTUAL column */ #ifdef SQLITE_ENABLE_STAT4 int nSample; /* Number of elements in aSample[] */ int nSampleCol; /* Size of IndexSample.anEq[] and so on */ @@ -17868,7 +18047,7 @@ struct Index { tRowcnt *aiRowEst; /* Non-logarithmic stat1 data for this index */ tRowcnt nRowEst0; /* Non-logarithmic number of rows in the index */ #endif - Bitmask colNotIdxed; /* 0 for unindexed columns in pTab */ + Bitmask colNotIdxed; /* Unindexed columns in pTab */ }; /* @@ -18098,7 +18277,7 @@ struct Expr { ** TK_SELECT_COLUMN: column of the result vector */ i16 iAgg; /* Which entry in pAggInfo->aCol[] or ->aFunc[] */ union { - int iRightJoinTable; /* If EP_FromJoin, the right table of the join */ + int iJoin; /* If EP_OuterON or EP_InnerON, the right table */ int iOfst; /* else: start of token from start of statement */ } w; AggInfo *pAggInfo; /* Used by TK_AGG_COLUMN and TK_AGG_FUNCTION */ @@ -18119,29 +18298,29 @@ struct Expr { ** EP_Agg == NC_HasAgg == SF_HasAgg ** EP_Win == NC_HasWin */ -#define EP_FromJoin 0x000001 /* Originates in ON/USING clause of outer join */ -#define EP_Distinct 0x000002 /* Aggregate function with DISTINCT keyword */ -#define EP_HasFunc 0x000004 /* Contains one or more functions of any kind */ -#define EP_FixedCol 0x000008 /* TK_Column with a known fixed value */ +#define EP_OuterON 0x000001 /* Originates in ON/USING clause of outer join */ +#define EP_InnerON 0x000002 /* Originates in ON/USING of an inner join */ +#define EP_Distinct 0x000004 /* Aggregate function with DISTINCT keyword */ +#define EP_HasFunc 0x000008 /* Contains one or more functions of any kind */ #define EP_Agg 0x000010 /* Contains one or more aggregate functions */ -#define EP_VarSelect 0x000020 /* pSelect is correlated, not constant */ -#define EP_DblQuoted 0x000040 /* token.z was originally in "..." */ -#define EP_InfixFunc 0x000080 /* True for an infix function: LIKE, GLOB, etc */ -#define EP_Collate 0x000100 /* Tree contains a TK_COLLATE operator */ -#define EP_Commuted 0x000200 /* Comparison operator has been commuted */ -#define EP_IntValue 0x000400 /* Integer value contained in u.iValue */ -#define EP_xIsSelect 0x000800 /* x.pSelect is valid (otherwise x.pList is) */ -#define EP_Skip 0x001000 /* Operator does not contribute to affinity */ -#define EP_Reduced 0x002000 /* Expr struct EXPR_REDUCEDSIZE bytes only */ -#define EP_TokenOnly 0x004000 /* Expr struct EXPR_TOKENONLYSIZE bytes only */ +#define EP_FixedCol 0x000020 /* TK_Column with a known fixed value */ +#define EP_VarSelect 0x000040 /* pSelect is correlated, not constant */ +#define EP_DblQuoted 0x000080 /* token.z was originally in "..." */ +#define EP_InfixFunc 0x000100 /* True for an infix function: LIKE, GLOB, etc */ +#define EP_Collate 0x000200 /* Tree contains a TK_COLLATE operator */ +#define EP_Commuted 0x000400 /* Comparison operator has been commuted */ +#define EP_IntValue 0x000800 /* Integer value contained in u.iValue */ +#define EP_xIsSelect 0x001000 /* x.pSelect is valid (otherwise x.pList is) */ +#define EP_Skip 0x002000 /* Operator does not contribute to affinity */ +#define EP_Reduced 0x004000 /* Expr struct EXPR_REDUCEDSIZE bytes only */ #define EP_Win 0x008000 /* Contains window functions */ -#define EP_MemToken 0x010000 /* Need to sqlite3DbFree() Expr.zToken */ -#define EP_IfNullRow 0x020000 /* The TK_IF_NULL_ROW opcode */ -#define EP_Unlikely 0x040000 /* unlikely() or likelihood() function */ -#define EP_ConstFunc 0x080000 /* A SQLITE_FUNC_CONSTANT or _SLOCHNG function */ -#define EP_CanBeNull 0x100000 /* Can be null despite NOT NULL constraint */ -#define EP_Subquery 0x200000 /* Tree contains a TK_SELECT operator */ - /* 0x400000 // Available */ +#define EP_TokenOnly 0x010000 /* Expr struct EXPR_TOKENONLYSIZE bytes only */ + /* 0x020000 // Available for reuse */ +#define EP_IfNullRow 0x040000 /* The TK_IF_NULL_ROW opcode */ +#define EP_Unlikely 0x080000 /* unlikely() or likelihood() function */ +#define EP_ConstFunc 0x100000 /* A SQLITE_FUNC_CONSTANT or _SLOCHNG function */ +#define EP_CanBeNull 0x200000 /* Can be null despite NOT NULL constraint */ +#define EP_Subquery 0x400000 /* Tree contains a TK_SELECT operator */ #define EP_Leaf 0x800000 /* Expr.pLeft, .pRight, .u.pSelect all NULL */ #define EP_WinFunc 0x1000000 /* TK_FUNCTION with Expr.y.pWin set */ #define EP_Subrtn 0x2000000 /* Uses Expr.y.sub. TK_IN, _SELECT, or _EXISTS */ @@ -18164,8 +18343,8 @@ struct Expr { #define ExprHasAllProperty(E,P) (((E)->flags&(P))==(P)) #define ExprSetProperty(E,P) (E)->flags|=(P) #define ExprClearProperty(E,P) (E)->flags&=~(P) -#define ExprAlwaysTrue(E) (((E)->flags&(EP_FromJoin|EP_IsTrue))==EP_IsTrue) -#define ExprAlwaysFalse(E) (((E)->flags&(EP_FromJoin|EP_IsFalse))==EP_IsFalse) +#define ExprAlwaysTrue(E) (((E)->flags&(EP_OuterON|EP_IsTrue))==EP_IsTrue) +#define ExprAlwaysFalse(E) (((E)->flags&(EP_OuterON|EP_IsFalse))==EP_IsFalse) /* Macros used to ensure that the correct members of unions are accessed ** in Expr. @@ -18252,12 +18431,18 @@ struct ExprList { struct ExprList_item { /* For each expression in the list */ Expr *pExpr; /* The parse tree for this expression */ char *zEName; /* Token associated with this expression */ - u8 sortFlags; /* Mask of KEYINFO_ORDER_* flags */ - unsigned eEName :2; /* Meaning of zEName */ - unsigned done :1; /* A flag to indicate when processing is finished */ - unsigned reusable :1; /* Constant expression is reusable */ - unsigned bSorterRef :1; /* Defer evaluation until after sorting */ - unsigned bNulls: 1; /* True if explicit "NULLS FIRST/LAST" */ + struct { + u8 sortFlags; /* Mask of KEYINFO_ORDER_* flags */ + unsigned eEName :2; /* Meaning of zEName */ + unsigned done :1; /* Indicates when processing is finished */ + unsigned reusable :1; /* Constant expression is reusable */ + unsigned bSorterRef :1; /* Defer evaluation until after sorting */ + unsigned bNulls :1; /* True if explicit "NULLS FIRST/LAST" */ + unsigned bUsed :1; /* This column used in a SF_NestedFrom subquery */ + unsigned bUsingTerm:1; /* Term from the USING clause of a NestedFrom */ + unsigned bNoExpand: 1; /* Term is an auxiliary in NestedFrom and should + ** not be expanded by "*" in parent queries */ + } fg; union { struct { /* Used by any ExprList other than Parse.pConsExpr */ u16 iOrderByCol; /* For ORDER BY, column number in result set */ @@ -18292,17 +18477,37 @@ struct ExprList { ** If "a" is the k-th column of table "t", then IdList.a[0].idx==k. */ struct IdList { + int nId; /* Number of identifiers on the list */ + u8 eU4; /* Which element of a.u4 is valid */ struct IdList_item { char *zName; /* Name of the identifier */ - int idx; /* Index in some Table.aCol[] of a column named zName */ - } *a; - int nId; /* Number of identifiers on the list */ + union { + int idx; /* Index in some Table.aCol[] of a column named zName */ + Expr *pExpr; /* Expr to implement a USING variable -- NOT USED */ + } u4; + } a[1]; }; +/* +** Allowed values for IdList.eType, which determines which value of the a.u4 +** is valid. +*/ +#define EU4_NONE 0 /* Does not use IdList.a.u4 */ +#define EU4_IDX 1 /* Uses IdList.a.u4.idx */ +#define EU4_EXPR 2 /* Uses IdList.a.u4.pExpr -- NOT CURRENTLY USED */ + /* ** The SrcItem object represents a single term in the FROM clause of a query. ** The SrcList object is mostly an array of SrcItems. ** +** The jointype starts out showing the join type between the current table +** and the next table on the list. The parser builds the list this way. +** But sqlite3SrcListShiftJoinType() later shifts the jointypes so that each +** jointype expresses the join between the table and the previous table. +** +** In the colUsed field, the high-order bit (bit 63) is set if the table +** contains more than 63 columns and the 64-th or later column is used. +** ** Union member validity: ** ** u1.zIndexedBy fg.isIndexedBy && !fg.isTabFunc @@ -18326,44 +18531,48 @@ struct SrcItem { unsigned isIndexedBy :1; /* True if there is an INDEXED BY clause */ unsigned isTabFunc :1; /* True if table-valued-function syntax */ unsigned isCorrelated :1; /* True if sub-query is correlated */ + unsigned isMaterialized:1; /* This is a materialized view */ unsigned viaCoroutine :1; /* Implemented as a co-routine */ unsigned isRecursive :1; /* True for recursive reference in WITH */ unsigned fromDDL :1; /* Comes from sqlite_schema */ unsigned isCte :1; /* This is a CTE */ unsigned notCte :1; /* This item may not match a CTE */ + unsigned isUsing :1; /* u3.pUsing is valid */ + unsigned isOn :1; /* u3.pOn was once valid and non-NULL */ + unsigned isSynthUsing :1; /* u3.pUsing is synthensized from NATURAL */ + unsigned isNestedFrom :1; /* pSelect is a SF_NestedFrom subquery */ } fg; int iCursor; /* The VDBE cursor number used to access this table */ - Expr *pOn; /* The ON clause of a join */ - IdList *pUsing; /* The USING clause of a join */ - Bitmask colUsed; /* Bit N (1< The ON clause of a join */ + IdList *pUsing; /* fg.isUsing==1 => The USING clause of a join */ + } u3; + Bitmask colUsed; /* Bit N set if column N used. Details above for N>62 */ union { char *zIndexedBy; /* Identifier from "INDEXED BY " clause */ ExprList *pFuncArg; /* Arguments to table-valued-function */ } u1; union { Index *pIBIndex; /* Index structure corresponding to u1.zIndexedBy */ - CteUse *pCteUse; /* CTE Usage info info fg.isCte is true */ + CteUse *pCteUse; /* CTE Usage info when fg.isCte is true */ } u2; }; /* -** The following structure describes the FROM clause of a SELECT statement. -** Each table or subquery in the FROM clause is a separate element of -** the SrcList.a[] array. -** -** With the addition of multiple database support, the following structure -** can also be used to describe a particular table such as the table that -** is modified by an INSERT, DELETE, or UPDATE statement. In standard SQL, -** such a table must be a simple name: ID. But in SQLite, the table can -** now be identified by a database name, a dot, then the table name: ID.ID. -** -** The jointype starts out showing the join type between the current table -** and the next table on the list. The parser builds the list this way. -** But sqlite3SrcListShiftJoinType() later shifts the jointypes so that each -** jointype expresses the join between the table and the previous table. +** The OnOrUsing object represents either an ON clause or a USING clause. +** It can never be both at the same time, but it can be neither. +*/ +struct OnOrUsing { + Expr *pOn; /* The ON clause of a join */ + IdList *pUsing; /* The USING clause of a join */ +}; + +/* +** This object represents one or more tables that are the source of +** content for an SQL statement. For example, a single SrcList object +** is used to hold the FROM clause of a SELECT statement. SrcList also +** represents the target tables for DELETE, INSERT, and UPDATE statements. ** -** In the colUsed field, the high-order bit (bit 63) is set if the table -** contains more than 63 columns and the 64-th or later column is used. */ struct SrcList { int nSrc; /* Number of tables or subqueries in the FROM clause */ @@ -18374,14 +18583,15 @@ struct SrcList { /* ** Permitted values of the SrcList.a.jointype field */ -#define JT_INNER 0x0001 /* Any kind of inner or cross join */ -#define JT_CROSS 0x0002 /* Explicit use of the CROSS keyword */ -#define JT_NATURAL 0x0004 /* True for a "natural" join */ -#define JT_LEFT 0x0008 /* Left outer join */ -#define JT_RIGHT 0x0010 /* Right outer join */ -#define JT_OUTER 0x0020 /* The "OUTER" keyword is present */ -#define JT_ERROR 0x0040 /* unknown or unsupported join type */ - +#define JT_INNER 0x01 /* Any kind of inner or cross join */ +#define JT_CROSS 0x02 /* Explicit use of the CROSS keyword */ +#define JT_NATURAL 0x04 /* True for a "natural" join */ +#define JT_LEFT 0x08 /* Left outer join */ +#define JT_RIGHT 0x10 /* Right outer join */ +#define JT_OUTER 0x20 /* The "OUTER" keyword is present */ +#define JT_LTORJ 0x40 /* One of the LEFT operands of a RIGHT JOIN + ** Mnemonic: Left Table Of Right Join */ +#define JT_ERROR 0x80 /* unknown or unsupported join type */ /* ** Flags appropriate for the wctrlFlags parameter of sqlite3WhereBegin() @@ -18404,7 +18614,7 @@ struct SrcList { #define WHERE_SORTBYGROUP 0x0200 /* Support sqlite3WhereIsSorted() */ #define WHERE_AGG_DISTINCT 0x0400 /* Query is "SELECT agg(DISTINCT ...)" */ #define WHERE_ORDERBY_LIMIT 0x0800 /* ORDERBY+LIMIT on the inner loop */ - /* 0x1000 not currently used */ +#define WHERE_RIGHT_JOIN 0x1000 /* Processing a RIGHT JOIN */ /* 0x2000 not currently used */ #define WHERE_USE_LIMIT 0x4000 /* Use the LIMIT in cost estimates */ /* 0x8000 not currently used */ @@ -18600,6 +18810,9 @@ struct Select { #define SF_CopyCte 0x4000000 /* SELECT statement is a copy of a CTE */ #define SF_OrderByReqd 0x8000000 /* The ORDER BY clause may not be omitted */ +/* True if S exists and has SF_NestedFrom */ +#define IsNestedFrom(S) ((S)!=0 && ((S)->selFlags&SF_NestedFrom)!=0) + /* ** The results of a SELECT can be distributed in several ways, as defined ** by one of the following macros. The "SRT" prefix means "SELECT Result @@ -18704,7 +18917,7 @@ struct SelectDest { int iSDParm2; /* A second parameter for the eDest disposal method */ int iSdst; /* Base register where results are written */ int nSdst; /* Number of registers allocated */ - char *zAffSdst; /* Affinity used when eDest==SRT_Set */ + char *zAffSdst; /* Affinity used for SRT_Set, SRT_Table, and similar */ ExprList *pOrderBy; /* Key columns for SRT_Queue and SRT_DistQueue */ }; @@ -18769,6 +18982,28 @@ struct TriggerPrg { # define DbMaskNonZero(M) (M)!=0 #endif +/* +** For each index X that has as one of its arguments either an expression +** or the name of a virtual generated column, and if X is in scope such that +** the value of the expression can simply be read from the index, then +** there is an instance of this object on the Parse.pIdxExpr list. +** +** During code generation, while generating code to evaluate expressions, +** this list is consulted and if a matching expression is found, the value +** is read from the index rather than being recomputed. +*/ +struct IndexedExpr { + Expr *pExpr; /* The expression contained in the index */ + int iDataCur; /* The data cursor associated with the index */ + int iIdxCur; /* The index cursor */ + int iIdxCol; /* The index column that contains value of pExpr */ + u8 bMaybeNullRow; /* True if we need an OP_IfNullRow check */ + IndexedExpr *pIENext; /* Next in a list of all indexed expressions */ +#ifdef SQLITE_ENABLE_EXPLAIN_COMMENTS + const char *zIdxName; /* Name of index, used only for bytecode comments */ +#endif +}; + /* ** An instance of the ParseCleanup object specifies an operation that ** should be performed after parsing to deallocation resources obtained @@ -18810,7 +19045,8 @@ struct Parse { u8 hasCompound; /* Need to invoke convertCompoundSelectToSubquery() */ u8 okConstFactor; /* OK to factor out constants */ u8 disableLookaside; /* Number of times lookaside has been disabled */ - u8 disableVtab; /* Disable all virtual tables for this parse */ + u8 prepFlags; /* SQLITE_PREPARE_* flags */ + u8 withinRJSubrtn; /* Nesting level for RIGHT JOIN body subroutines */ #if defined(SQLITE_DEBUG) || defined(SQLITE_COVERAGE_TEST) u8 earlyCleanup; /* OOM inside sqlite3ParserAddCleanup() */ #endif @@ -18826,6 +19062,7 @@ struct Parse { int nLabelAlloc; /* Number of slots in aLabel */ int *aLabel; /* Space to hold the labels */ ExprList *pConstExpr;/* Constant expressions */ + IndexedExpr *pIdxExpr;/* List of expressions used by active indexes */ Token constraintName;/* Name of the constraint currently being parsed */ yDbMask writeMask; /* Start a write transaction on these databases */ yDbMask cookieMask; /* Bitmask of schema verified databases */ @@ -18983,20 +19220,20 @@ struct AuthContext { #define OPFLAG_PREFORMAT 0x80 /* OP_Insert uses preformatted cell */ /* - * Each trigger present in the database schema is stored as an instance of - * struct Trigger. - * - * Pointers to instances of struct Trigger are stored in two ways. - * 1. In the "trigHash" hash table (part of the sqlite3* that represents the - * database). This allows Trigger structures to be retrieved by name. - * 2. All triggers associated with a single table form a linked list, using the - * pNext member of struct Trigger. A pointer to the first element of the - * linked list is stored as the "pTrigger" member of the associated - * struct Table. - * - * The "step_list" member points to the first element of a linked list - * containing the SQL statements specified as the trigger program. - */ +** Each trigger present in the database schema is stored as an instance of +** struct Trigger. +** +** Pointers to instances of struct Trigger are stored in two ways. +** 1. In the "trigHash" hash table (part of the sqlite3* that represents the +** database). This allows Trigger structures to be retrieved by name. +** 2. All triggers associated with a single table form a linked list, using the +** pNext member of struct Trigger. A pointer to the first element of the +** linked list is stored as the "pTrigger" member of the associated +** struct Table. +** +** The "step_list" member points to the first element of a linked list +** containing the SQL statements specified as the trigger program. +*/ struct Trigger { char *zName; /* The name of the trigger */ char *table; /* The table or view to which the trigger applies */ @@ -19023,43 +19260,48 @@ struct Trigger { #define TRIGGER_AFTER 2 /* - * An instance of struct TriggerStep is used to store a single SQL statement - * that is a part of a trigger-program. - * - * Instances of struct TriggerStep are stored in a singly linked list (linked - * using the "pNext" member) referenced by the "step_list" member of the - * associated struct Trigger instance. The first element of the linked list is - * the first step of the trigger-program. - * - * The "op" member indicates whether this is a "DELETE", "INSERT", "UPDATE" or - * "SELECT" statement. The meanings of the other members is determined by the - * value of "op" as follows: - * - * (op == TK_INSERT) - * orconf -> stores the ON CONFLICT algorithm - * pSelect -> If this is an INSERT INTO ... SELECT ... statement, then - * this stores a pointer to the SELECT statement. Otherwise NULL. - * zTarget -> Dequoted name of the table to insert into. - * pExprList -> If this is an INSERT INTO ... VALUES ... statement, then - * this stores values to be inserted. Otherwise NULL. - * pIdList -> If this is an INSERT INTO ... () VALUES ... - * statement, then this stores the column-names to be - * inserted into. - * - * (op == TK_DELETE) - * zTarget -> Dequoted name of the table to delete from. - * pWhere -> The WHERE clause of the DELETE statement if one is specified. - * Otherwise NULL. - * - * (op == TK_UPDATE) - * zTarget -> Dequoted name of the table to update. - * pWhere -> The WHERE clause of the UPDATE statement if one is specified. - * Otherwise NULL. - * pExprList -> A list of the columns to update and the expressions to update - * them to. See sqlite3Update() documentation of "pChanges" - * argument. - * - */ +** An instance of struct TriggerStep is used to store a single SQL statement +** that is a part of a trigger-program. +** +** Instances of struct TriggerStep are stored in a singly linked list (linked +** using the "pNext" member) referenced by the "step_list" member of the +** associated struct Trigger instance. The first element of the linked list is +** the first step of the trigger-program. +** +** The "op" member indicates whether this is a "DELETE", "INSERT", "UPDATE" or +** "SELECT" statement. The meanings of the other members is determined by the +** value of "op" as follows: +** +** (op == TK_INSERT) +** orconf -> stores the ON CONFLICT algorithm +** pSelect -> The content to be inserted - either a SELECT statement or +** a VALUES clause. +** zTarget -> Dequoted name of the table to insert into. +** pIdList -> If this is an INSERT INTO ... () VALUES ... +** statement, then this stores the column-names to be +** inserted into. +** pUpsert -> The ON CONFLICT clauses for an Upsert +** +** (op == TK_DELETE) +** zTarget -> Dequoted name of the table to delete from. +** pWhere -> The WHERE clause of the DELETE statement if one is specified. +** Otherwise NULL. +** +** (op == TK_UPDATE) +** zTarget -> Dequoted name of the table to update. +** pWhere -> The WHERE clause of the UPDATE statement if one is specified. +** Otherwise NULL. +** pExprList -> A list of the columns to update and the expressions to update +** them to. See sqlite3Update() documentation of "pChanges" +** argument. +** +** (op == TK_SELECT) +** pSelect -> The SELECT statement +** +** (op == TK_RETURNING) +** pExprList -> The list of expressions that follow the RETURNING keyword. +** +*/ struct TriggerStep { u8 op; /* One of TK_DELETE, TK_UPDATE, TK_INSERT, TK_SELECT, ** or TK_RETURNING */ @@ -19256,15 +19498,15 @@ struct Walker { struct RefSrcList *pRefSrcList; /* sqlite3ReferencesSrcList() */ int *aiCol; /* array of column indexes */ struct IdxCover *pIdxCover; /* Check for index coverage */ - struct IdxExprTrans *pIdxTrans; /* Convert idxed expr to column */ ExprList *pGroupBy; /* GROUP BY clause */ Select *pSelect; /* HAVING to WHERE clause ctx */ struct WindowRewrite *pRewrite; /* Window rewrite context */ struct WhereConst *pConst; /* WHERE clause constants */ struct RenameCtx *pRename; /* RENAME COLUMN context */ struct Table *pTab; /* Table of generated column */ + struct CoveringIndexCheck *pCovIdxCk; /* Check for covering index */ SrcItem *pSrcItem; /* A single FROM clause item */ - DbFixer *pFix; + DbFixer *pFix; /* See sqlite3FixSelect() */ } u; }; @@ -19414,7 +19656,7 @@ struct Window { Window **ppThis; /* Pointer to this object in Select.pWin list */ Window *pNextWin; /* Next window function belonging to this SELECT */ Expr *pFilter; /* The FILTER expression */ - FuncDef *pFunc; /* The function */ + FuncDef *pWFunc; /* The function */ int iEphCsr; /* Partition buffer or Peer buffer */ int regAccum; /* Accumulator */ int regResult; /* Interim result */ @@ -19570,6 +19812,7 @@ SQLITE_PRIVATE void *sqlite3DbReallocOrFree(sqlite3 *, void *, u64); SQLITE_PRIVATE void *sqlite3DbRealloc(sqlite3 *, void *, u64); SQLITE_PRIVATE void sqlite3DbFree(sqlite3*, void*); SQLITE_PRIVATE void sqlite3DbFreeNN(sqlite3*, void*); +SQLITE_PRIVATE void sqlite3DbNNFreeNN(sqlite3*, void*); SQLITE_PRIVATE int sqlite3MallocSize(const void*); SQLITE_PRIVATE int sqlite3DbMallocSize(sqlite3*, const void*); SQLITE_PRIVATE void *sqlite3PageMalloc(int); @@ -19590,12 +19833,16 @@ SQLITE_PRIVATE int sqlite3HeapNearlyFull(void); */ #ifdef SQLITE_USE_ALLOCA # define sqlite3StackAllocRaw(D,N) alloca(N) +# define sqlite3StackAllocRawNN(D,N) alloca(N) # define sqlite3StackAllocZero(D,N) memset(alloca(N), 0, N) # define sqlite3StackFree(D,P) +# define sqlite3StackFreeNN(D,P) #else # define sqlite3StackAllocRaw(D,N) sqlite3DbMallocRaw(D,N) +# define sqlite3StackAllocRawNN(D,N) sqlite3DbMallocRawNN(D,N) # define sqlite3StackAllocZero(D,N) sqlite3DbMallocZero(D,N) # define sqlite3StackFree(D,P) sqlite3DbFree(D,P) +# define sqlite3StackFreeNN(D,P) sqlite3DbFreeNN(D,P) #endif /* Do not allow both MEMSYS5 and MEMSYS3 to be defined together. If they @@ -19669,18 +19916,53 @@ SQLITE_PRIVATE void *sqlite3TestTextToPtr(const char*); #endif #if defined(SQLITE_DEBUG) +SQLITE_PRIVATE void sqlite3TreeViewLine(TreeView*, const char *zFormat, ...); SQLITE_PRIVATE void sqlite3TreeViewExpr(TreeView*, const Expr*, u8); SQLITE_PRIVATE void sqlite3TreeViewBareExprList(TreeView*, const ExprList*, const char*); SQLITE_PRIVATE void sqlite3TreeViewExprList(TreeView*, const ExprList*, u8, const char*); +SQLITE_PRIVATE void sqlite3TreeViewBareIdList(TreeView*, const IdList*, const char*); +SQLITE_PRIVATE void sqlite3TreeViewIdList(TreeView*, const IdList*, u8, const char*); +SQLITE_PRIVATE void sqlite3TreeViewColumnList(TreeView*, const Column*, int, u8); SQLITE_PRIVATE void sqlite3TreeViewSrcList(TreeView*, const SrcList*); SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView*, const Select*, u8); SQLITE_PRIVATE void sqlite3TreeViewWith(TreeView*, const With*, u8); +SQLITE_PRIVATE void sqlite3TreeViewUpsert(TreeView*, const Upsert*, u8); +#if TREETRACE_ENABLED +SQLITE_PRIVATE void sqlite3TreeViewDelete(const With*, const SrcList*, const Expr*, + const ExprList*,const Expr*, const Trigger*); +SQLITE_PRIVATE void sqlite3TreeViewInsert(const With*, const SrcList*, + const IdList*, const Select*, const ExprList*, + int, const Upsert*, const Trigger*); +SQLITE_PRIVATE void sqlite3TreeViewUpdate(const With*, const SrcList*, const ExprList*, + const Expr*, int, const ExprList*, const Expr*, + const Upsert*, const Trigger*); +#endif +#ifndef SQLITE_OMIT_TRIGGER +SQLITE_PRIVATE void sqlite3TreeViewTriggerStep(TreeView*, const TriggerStep*, u8, u8); +SQLITE_PRIVATE void sqlite3TreeViewTrigger(TreeView*, const Trigger*, u8, u8); +#endif #ifndef SQLITE_OMIT_WINDOWFUNC SQLITE_PRIVATE void sqlite3TreeViewWindow(TreeView*, const Window*, u8); SQLITE_PRIVATE void sqlite3TreeViewWinFunc(TreeView*, const Window*, u8); #endif +SQLITE_PRIVATE void sqlite3ShowExpr(const Expr*); +SQLITE_PRIVATE void sqlite3ShowExprList(const ExprList*); +SQLITE_PRIVATE void sqlite3ShowIdList(const IdList*); +SQLITE_PRIVATE void sqlite3ShowSrcList(const SrcList*); +SQLITE_PRIVATE void sqlite3ShowSelect(const Select*); +SQLITE_PRIVATE void sqlite3ShowWith(const With*); +SQLITE_PRIVATE void sqlite3ShowUpsert(const Upsert*); +#ifndef SQLITE_OMIT_TRIGGER +SQLITE_PRIVATE void sqlite3ShowTriggerStep(const TriggerStep*); +SQLITE_PRIVATE void sqlite3ShowTriggerStepList(const TriggerStep*); +SQLITE_PRIVATE void sqlite3ShowTrigger(const Trigger*); +SQLITE_PRIVATE void sqlite3ShowTriggerList(const Trigger*); +#endif +#ifndef SQLITE_OMIT_WINDOWFUNC +SQLITE_PRIVATE void sqlite3ShowWindow(const Window*); +SQLITE_PRIVATE void sqlite3ShowWinFunc(const Window*); +#endif #endif - SQLITE_PRIVATE void sqlite3SetString(char **, sqlite3*, const char*); SQLITE_PRIVATE void sqlite3ErrorMsg(Parse*, const char*, ...); @@ -19829,13 +20111,14 @@ SQLITE_PRIVATE SrcList *sqlite3SrcListEnlarge(Parse*, SrcList*, int, int); SQLITE_PRIVATE SrcList *sqlite3SrcListAppendList(Parse *pParse, SrcList *p1, SrcList *p2); SQLITE_PRIVATE SrcList *sqlite3SrcListAppend(Parse*, SrcList*, Token*, Token*); SQLITE_PRIVATE SrcList *sqlite3SrcListAppendFromTerm(Parse*, SrcList*, Token*, Token*, - Token*, Select*, Expr*, IdList*); + Token*, Select*, OnOrUsing*); SQLITE_PRIVATE void sqlite3SrcListIndexedBy(Parse *, SrcList *, Token *); SQLITE_PRIVATE void sqlite3SrcListFuncArgs(Parse*, SrcList*, ExprList*); SQLITE_PRIVATE int sqlite3IndexedByLookup(Parse *, SrcItem *); -SQLITE_PRIVATE void sqlite3SrcListShiftJoinType(SrcList*); +SQLITE_PRIVATE void sqlite3SrcListShiftJoinType(Parse*,SrcList*); SQLITE_PRIVATE void sqlite3SrcListAssignCursors(Parse*, SrcList*); SQLITE_PRIVATE void sqlite3IdListDelete(sqlite3*, IdList*); +SQLITE_PRIVATE void sqlite3ClearOnOrUsing(sqlite3*, OnOrUsing*); SQLITE_PRIVATE void sqlite3SrcListDelete(sqlite3*, SrcList*); SQLITE_PRIVATE Index *sqlite3AllocateIndexObject(sqlite3*,i16,int,char**); SQLITE_PRIVATE void sqlite3CreateIndex(Parse*,Token*,Token*,SrcList*,ExprList*,int,Token*, @@ -20033,7 +20316,8 @@ SQLITE_PRIVATE SrcList *sqlite3TriggerStepSrc(Parse*, TriggerStep*); SQLITE_PRIVATE int sqlite3JoinType(Parse*, Token*, Token*, Token*); SQLITE_PRIVATE int sqlite3ColumnIndex(Table *pTab, const char *zCol); -SQLITE_PRIVATE void sqlite3SetJoinExpr(Expr*,int); +SQLITE_PRIVATE void sqlite3SrcItemColumnUsed(SrcItem*,int); +SQLITE_PRIVATE void sqlite3SetJoinExpr(Expr*,int,u32); SQLITE_PRIVATE void sqlite3CreateForeignKey(Parse*, ExprList*, Token*, ExprList*, int); SQLITE_PRIVATE void sqlite3DeferForeignKey(Parse*, int); #ifndef SQLITE_OMIT_AUTHORIZATION @@ -20057,6 +20341,7 @@ SQLITE_PRIVATE int sqlite3FixSelect(DbFixer*, Select*); SQLITE_PRIVATE int sqlite3FixExpr(DbFixer*, Expr*); SQLITE_PRIVATE int sqlite3FixTriggerStep(DbFixer*, TriggerStep*); SQLITE_PRIVATE int sqlite3RealSameAsInt(double,sqlite3_int64); +SQLITE_PRIVATE i64 sqlite3RealToI64(double); SQLITE_PRIVATE void sqlite3Int64ToText(i64,char*); SQLITE_PRIVATE int sqlite3AtoF(const char *z, double*, int, u8); SQLITE_PRIVATE int sqlite3GetInt32(const char *, int*); @@ -20102,6 +20387,7 @@ SQLITE_PRIVATE int sqlite3VarintLen(u64 v); SQLITE_PRIVATE const char *sqlite3IndexAffinityStr(sqlite3*, Index*); +SQLITE_PRIVATE char *sqlite3TableAffinityStr(sqlite3*,const Table*); SQLITE_PRIVATE void sqlite3TableAffinity(Vdbe*, Table*, int); SQLITE_PRIVATE char sqlite3CompareAffinity(const Expr *pExpr, char aff2); SQLITE_PRIVATE int sqlite3IndexAffinityOk(const Expr *pExpr, char idx_affinity); @@ -20173,7 +20459,6 @@ SQLITE_PRIVATE const unsigned char sqlite3OpcodeProperty[]; SQLITE_PRIVATE const char sqlite3StrBINARY[]; SQLITE_PRIVATE const unsigned char sqlite3StdTypeLen[]; SQLITE_PRIVATE const char sqlite3StdTypeAffinity[]; -SQLITE_PRIVATE const char sqlite3StdTypeMap[]; SQLITE_PRIVATE const char *sqlite3StdType[]; SQLITE_PRIVATE const unsigned char sqlite3UpperToLower[]; SQLITE_PRIVATE const unsigned char *sqlite3aLTb; @@ -20379,7 +20664,7 @@ SQLITE_PRIVATE int sqlite3VtabBegin(sqlite3 *, VTable *); SQLITE_PRIVATE FuncDef *sqlite3VtabOverloadFunction(sqlite3 *,FuncDef*, int nArg, Expr*); #if (defined(SQLITE_ENABLE_DBPAGE_VTAB) || defined(SQLITE_TEST)) \ && !defined(SQLITE_OMIT_VIRTUALTABLE) -SQLITE_PRIVATE void sqlite3VtabWriteAll(sqlite3_index_info*); +SQLITE_PRIVATE void sqlite3VtabUsesAllSchemas(sqlite3_index_info*); #endif SQLITE_PRIVATE sqlite3_int64 sqlite3StmtCurrentTime(sqlite3_context*); SQLITE_PRIVATE int sqlite3VdbeParameterIndex(Vdbe*, const char*, int); @@ -20617,6 +20902,10 @@ SQLITE_PRIVATE void sqlite3VectorErrorMsg(Parse*, Expr*); SQLITE_PRIVATE const char **sqlite3CompileOptions(int *pnOpt); #endif +#if SQLITE_OS_UNIX && defined(SQLITE_OS_KV_OPTIONAL) +SQLITE_PRIVATE int sqlite3KvvfsInit(void); +#endif + #endif /* SQLITEINT_H */ /************** End of sqliteInt.h *******************************************/ @@ -20848,7 +21137,7 @@ SQLITE_API extern int sqlite3_open_file_count; ** autoconf-based build */ #if defined(_HAVE_SQLITE_CONFIG_H) && !defined(SQLITECONFIG_H) -/* #include "config.h" */ +/* #include "sqlite_cfg.h" */ #define SQLITECONFIG_H 1 #endif @@ -21013,6 +21302,9 @@ static const char * const sqlite3azCompileOpt[] = { #ifdef SQLITE_DISABLE_SKIPAHEAD_DISTINCT "DISABLE_SKIPAHEAD_DISTINCT", #endif +#ifdef SQLITE_DQS + "DQS=" CTIMEOPT_VAL(SQLITE_DQS), +#endif #ifdef SQLITE_ENABLE_8_3_NAMES "ENABLE_8_3_NAMES=" CTIMEOPT_VAL(SQLITE_ENABLE_8_3_NAMES), #endif @@ -21127,9 +21419,6 @@ static const char * const sqlite3azCompileOpt[] = { #ifdef SQLITE_ENABLE_RTREE "ENABLE_RTREE", #endif -#ifdef SQLITE_ENABLE_SELECTTRACE - "ENABLE_SELECTTRACE", -#endif #ifdef SQLITE_ENABLE_SESSION "ENABLE_SESSION", #endif @@ -21151,6 +21440,9 @@ static const char * const sqlite3azCompileOpt[] = { #ifdef SQLITE_ENABLE_STMT_SCANSTATUS "ENABLE_STMT_SCANSTATUS", #endif +#ifdef SQLITE_ENABLE_TREETRACE + "ENABLE_TREETRACE", +#endif #ifdef SQLITE_ENABLE_UNKNOWN_SQL_FUNCTION "ENABLE_UNKNOWN_SQL_FUNCTION", #endif @@ -21503,9 +21795,6 @@ static const char * const sqlite3azCompileOpt[] = { #ifdef SQLITE_OMIT_XFER_OPT "OMIT_XFER_OPT", #endif -#ifdef SQLITE_PCACHE_SEPARATE_HEADER - "PCACHE_SEPARATE_HEADER", -#endif #ifdef SQLITE_PERFORMANCE_TRACE "PERFORMANCE_TRACE", #endif @@ -21900,6 +22189,9 @@ SQLITE_PRIVATE SQLITE_WSD struct Sqlite3Config sqlite3Config = { 0x7ffffffe, /* iOnceResetThreshold */ SQLITE_DEFAULT_SORTERREF_SIZE, /* szSorterRef */ 0, /* iPrngSeed */ +#ifdef SQLITE_DEBUG + {0,0,0,0,0,0} /* aTune */ +#endif }; /* @@ -21954,7 +22246,7 @@ SQLITE_PRIVATE int sqlite3PendingByte = 0x40000000; /* ** Tracing flags set by SQLITE_TESTCTRL_TRACEFLAGS. */ -SQLITE_PRIVATE u32 sqlite3SelectTrace = 0; +SQLITE_PRIVATE u32 sqlite3TreeTrace = 0; SQLITE_PRIVATE u32 sqlite3WhereTrace = 0; /* #include "opcodes.h" */ @@ -21982,10 +22274,6 @@ SQLITE_PRIVATE const char sqlite3StrBINARY[] = "BINARY"; ** ** sqlite3StdTypeAffinity[] The affinity associated with each entry ** in sqlite3StdType[]. -** -** sqlite3StdTypeMap[] The type value (as returned from -** sqlite3_column_type() or sqlite3_value_type()) -** for each entry in sqlite3StdType[]. */ SQLITE_PRIVATE const unsigned char sqlite3StdTypeLen[] = { 3, 4, 3, 7, 4, 4 }; SQLITE_PRIVATE const char sqlite3StdTypeAffinity[] = { @@ -21996,14 +22284,6 @@ SQLITE_PRIVATE const char sqlite3StdTypeAffinity[] = { SQLITE_AFF_REAL, SQLITE_AFF_TEXT }; -SQLITE_PRIVATE const char sqlite3StdTypeMap[] = { - 0, - SQLITE_BLOB, - SQLITE_INTEGER, - SQLITE_INTEGER, - SQLITE_FLOAT, - SQLITE_TEXT -}; SQLITE_PRIVATE const char *sqlite3StdType[] = { "ANY", "BLOB", @@ -22121,7 +22401,7 @@ struct VdbeCursor { Bool isEphemeral:1; /* True for an ephemeral table */ Bool useRandomRowid:1; /* Generate new record numbers semi-randomly */ Bool isOrdered:1; /* True if the table is not BTREE_UNORDERED */ - Bool hasBeenDuped:1; /* This cursor was source or target of OP_OpenDup */ + Bool noReuse:1; /* OpenEphemeral may not reuse this cursor */ u16 seekHit; /* See the OP_SeekHit and OP_IfNoHope opcodes */ union { /* pBtx for isEphermeral. pAltMap otherwise */ Btree *pBtx; /* Separate file holding temporary table */ @@ -22169,6 +22449,11 @@ struct VdbeCursor { u32 aType[1]; /* Type values record decode. MUST BE LAST */ }; +/* Return true if P is a null-only cursor +*/ +#define IsNullCursor(P) \ + ((P)->eCurType==CURTYPE_PSEUDO && (P)->nullRow && (P)->seekResult==0) + /* ** A value for VdbeCursor.cacheStatus that means the cache is always invalid. @@ -22243,16 +22528,16 @@ struct sqlite3_value { const char *zPType; /* Pointer type when MEM_Term|MEM_Subtype|MEM_Null */ FuncDef *pDef; /* Used only when flags==MEM_Agg */ } u; + char *z; /* String or BLOB value */ + int n; /* Number of characters in string value, excluding '\0' */ u16 flags; /* Some combination of MEM_Null, MEM_Str, MEM_Dyn, etc. */ u8 enc; /* SQLITE_UTF8, SQLITE_UTF16BE, SQLITE_UTF16LE */ u8 eSubtype; /* Subtype for this value */ - int n; /* Number of characters in string value, excluding '\0' */ - char *z; /* String or BLOB value */ /* ShallowCopy only needs to copy the information above */ - char *zMalloc; /* Space to hold MEM_Str or MEM_Blob if szMalloc>0 */ + sqlite3 *db; /* The associated database connection */ int szMalloc; /* Size of the zMalloc allocation */ u32 uTemp; /* Transient storage for serial_type in OP_MakeRecord */ - sqlite3 *db; /* The associated database connection */ + char *zMalloc; /* Space to hold MEM_Str or MEM_Blob if szMalloc>0 */ void (*xDel)(void*);/* Destructor for Mem.z - only valid if MEM_Dyn */ #ifdef SQLITE_DEBUG Mem *pScopyFrom; /* This Mem is a shallow copy of pScopyFrom */ @@ -22264,11 +22549,43 @@ struct sqlite3_value { ** Size of struct Mem not including the Mem.zMalloc member or anything that ** follows. */ -#define MEMCELLSIZE offsetof(Mem,zMalloc) +#define MEMCELLSIZE offsetof(Mem,db) -/* One or more of the following flags are set to indicate the validOK +/* One or more of the following flags are set to indicate the ** representations of the value stored in the Mem struct. ** +** * MEM_Null An SQL NULL value +** +** * MEM_Null|MEM_Zero An SQL NULL with the virtual table +** UPDATE no-change flag set +** +** * MEM_Null|MEM_Term| An SQL NULL, but also contains a +** MEM_Subtype pointer accessible using +** sqlite3_value_pointer(). +** +** * MEM_Null|MEM_Cleared Special SQL NULL that compares non-equal +** to other NULLs even using the IS operator. +** +** * MEM_Str A string, stored in Mem.z with +** length Mem.n. Zero-terminated if +** MEM_Term is set. This flag is +** incompatible with MEM_Blob and +** MEM_Null, but can appear with MEM_Int, +** MEM_Real, and MEM_IntReal. +** +** * MEM_Blob A blob, stored in Mem.z length Mem.n. +** Incompatible with MEM_Str, MEM_Null, +** MEM_Int, MEM_Real, and MEM_IntReal. +** +** * MEM_Blob|MEM_Zero A blob in Mem.z of length Mem.n plus +** MEM.u.i extra 0x00 bytes at the end. +** +** * MEM_Int Integer stored in Mem.u.i. +** +** * MEM_Real Real stored in Mem.u.r. +** +** * MEM_IntReal Real stored as an integer in Mem.u.i. +** ** If the MEM_Null flag is set, then the value is an SQL NULL value. ** For a pointer type created using sqlite3_bind_pointer() or ** sqlite3_result_pointer() the MEM_Term and MEM_Subtype flags are also set. @@ -22279,6 +22596,7 @@ struct sqlite3_value { ** set, then the string is nul terminated. The MEM_Int and MEM_Real ** flags may coexist with the MEM_Str flag. */ +#define MEM_Undefined 0x0000 /* Value is undefined */ #define MEM_Null 0x0001 /* Value is NULL (or a pointer) */ #define MEM_Str 0x0002 /* Value is a string */ #define MEM_Int 0x0004 /* Value is an integer */ @@ -22286,28 +22604,24 @@ struct sqlite3_value { #define MEM_Blob 0x0010 /* Value is a BLOB */ #define MEM_IntReal 0x0020 /* MEM_Int that stringifies like MEM_Real */ #define MEM_AffMask 0x003f /* Mask of affinity bits */ + +/* Extra bits that modify the meanings of the core datatypes above +*/ #define MEM_FromBind 0x0040 /* Value originates from sqlite3_bind() */ -#define MEM_Undefined 0x0080 /* Value is undefined */ + /* 0x0080 // Available */ #define MEM_Cleared 0x0100 /* NULL set by OP_Null, not from data */ -#define MEM_TypeMask 0xc1bf /* Mask of type bits */ - +#define MEM_Term 0x0200 /* String in Mem.z is zero terminated */ +#define MEM_Zero 0x0400 /* Mem.i contains count of 0s appended to blob */ +#define MEM_Subtype 0x0800 /* Mem.eSubtype is valid */ +#define MEM_TypeMask 0x0dbf /* Mask of type bits */ -/* Whenever Mem contains a valid string or blob representation, one of -** the following flags must be set to determine the memory management -** policy for Mem.z. The MEM_Term flag tells us whether or not the -** string is \000 or \u0000 terminated +/* Bits that determine the storage for Mem.z for a string or blob or +** aggregate accumulator. */ -#define MEM_Term 0x0200 /* String in Mem.z is zero terminated */ -#define MEM_Dyn 0x0400 /* Need to call Mem.xDel() on Mem.z */ -#define MEM_Static 0x0800 /* Mem.z points to a static string */ -#define MEM_Ephem 0x1000 /* Mem.z points to an ephemeral string */ -#define MEM_Agg 0x2000 /* Mem.z points to an agg function context */ -#define MEM_Zero 0x4000 /* Mem.i contains count of 0s appended to blob */ -#define MEM_Subtype 0x8000 /* Mem.eSubtype is valid */ -#ifdef SQLITE_OMIT_INCRBLOB - #undef MEM_Zero - #define MEM_Zero 0x0000 -#endif +#define MEM_Dyn 0x1000 /* Need to call Mem.xDel() on Mem.z */ +#define MEM_Static 0x2000 /* Mem.z points to a static string */ +#define MEM_Ephem 0x4000 /* Mem.z points to an ephemeral string */ +#define MEM_Agg 0x8000 /* Mem.z points to an agg function context */ /* Return TRUE if Mem X contains dynamically allocated content - anything ** that needs to be deallocated to avoid a leak. @@ -22329,11 +22643,15 @@ struct sqlite3_value { && (X)->n==0 && (X)->u.nZero==0) /* -** Return true if a memory cell is not marked as invalid. This macro +** Return true if a memory cell has been initialized and is valid. ** is for use inside assert() statements only. +** +** A Memory cell is initialized if at least one of the +** MEM_Null, MEM_Str, MEM_Int, MEM_Real, MEM_Blob, or MEM_IntReal bits +** is set. It is "undefined" if all those bits are zero. */ #ifdef SQLITE_DEBUG -#define memIsValid(M) ((M)->flags & MEM_Undefined)==0 +#define memIsValid(M) ((M)->flags & MEM_AffMask)!=0 #endif /* @@ -22371,6 +22689,7 @@ struct sqlite3_context { Vdbe *pVdbe; /* The VM that owns this context */ int iOp; /* Instruction number of OP_Function */ int isError; /* Error code returned by the function. */ + u8 enc; /* Encoding to use for results */ u8 skipFlag; /* Skip accumulator loading if true */ u8 argc; /* Number of arguments */ sqlite3_value *argv[1]; /* Argument set */ @@ -22416,10 +22735,9 @@ struct DblquoteStr { */ struct Vdbe { sqlite3 *db; /* The database connection that owns this statement */ - Vdbe *pPrev,*pNext; /* Linked list of VDBEs with the same Vdbe.db */ + Vdbe **ppVPrev,*pVNext; /* Linked list of VDBEs with the same Vdbe.db */ Parse *pParse; /* Parsing context used to create this Vdbe */ ynVar nVar; /* Number of entries in aVar[] */ - u32 iVdbeMagic; /* Magic number defining state of the SQL statement */ int nMem; /* Number of memory locations currently allocated */ int nCursor; /* Number of slots in apCsr[] */ u32 cacheCtr; /* VdbeCursor row cache generation counter */ @@ -22457,11 +22775,10 @@ struct Vdbe { u8 errorAction; /* Recovery action to do in case of an error */ u8 minWriteFileFormat; /* Minimum file format for writable database files */ u8 prepFlags; /* SQLITE_PREPARE_* flags */ - u8 doingRerun; /* True if rerunning after an auto-reprepare */ + u8 eVdbeState; /* On of the VDBE_*_STATE values */ bft expired:2; /* 1: recompile VM immediately 2: when convenient */ bft explain:2; /* True if EXPLAIN present on SQL command */ bft changeCntOn:1; /* True to update the change-counter */ - bft runOnlyOnce:1; /* Automatically expire on reset */ bft usesStmtJournal:1; /* True if uses a statement journal */ bft readOnly:1; /* True for statements that do not write */ bft bIsReader:1; /* True for statements that read */ @@ -22488,13 +22805,12 @@ struct Vdbe { }; /* -** The following are allowed values for Vdbe.magic +** The following are allowed values for Vdbe.eVdbeState */ -#define VDBE_MAGIC_INIT 0x16bceaa5 /* Building a VDBE program */ -#define VDBE_MAGIC_RUN 0x2df20da3 /* VDBE is ready to execute */ -#define VDBE_MAGIC_HALT 0x319c2973 /* VDBE has completed execution */ -#define VDBE_MAGIC_RESET 0x48fa9f76 /* Reset and ready to run again */ -#define VDBE_MAGIC_DEAD 0x5606c3c8 /* The VDBE has been deallocated */ +#define VDBE_INIT_STATE 0 /* Prepared statement under construction */ +#define VDBE_READY_STATE 1 /* Ready to run but not yet started */ +#define VDBE_RUN_STATE 2 /* Run in progress */ +#define VDBE_HALT_STATE 3 /* Finished. Need reset() or finalize() */ /* ** Structure used to store the context required by the @@ -22535,18 +22851,31 @@ struct ValueList { sqlite3_value *pOut; /* Register to hold each decoded output value */ }; +/* Size of content associated with serial types that fit into a +** single-byte varint. +*/ +#ifndef SQLITE_AMALGAMATION +SQLITE_PRIVATE const u8 sqlite3SmallTypeSizes[]; +#endif + /* ** Function prototypes */ SQLITE_PRIVATE void sqlite3VdbeError(Vdbe*, const char *, ...); SQLITE_PRIVATE void sqlite3VdbeFreeCursor(Vdbe *, VdbeCursor*); +SQLITE_PRIVATE void sqlite3VdbeFreeCursorNN(Vdbe*,VdbeCursor*); void sqliteVdbePopStack(Vdbe*,int); +SQLITE_PRIVATE int SQLITE_NOINLINE sqlite3VdbeHandleMovedCursor(VdbeCursor *p); SQLITE_PRIVATE int SQLITE_NOINLINE sqlite3VdbeFinishMoveto(VdbeCursor*); -SQLITE_PRIVATE int sqlite3VdbeCursorMoveto(VdbeCursor**, u32*); SQLITE_PRIVATE int sqlite3VdbeCursorRestore(VdbeCursor*); SQLITE_PRIVATE u32 sqlite3VdbeSerialTypeLen(u32); SQLITE_PRIVATE u8 sqlite3VdbeOneByteSerialTypeLen(u8); -SQLITE_PRIVATE u32 sqlite3VdbeSerialPut(unsigned char*, Mem*, u32); +#ifdef SQLITE_MIXED_ENDIAN_64BIT_FLOAT +SQLITE_PRIVATE u64 sqlite3FloatSwap(u64 in); +# define swapMixedEndianFloat(X) X = sqlite3FloatSwap(X) +#else +# define swapMixedEndianFloat(X) +#endif SQLITE_PRIVATE void sqlite3VdbeSerialGet(const unsigned char*, u32, Mem*); SQLITE_PRIVATE void sqlite3VdbeDeleteAuxData(sqlite3*, AuxData**, int, int); @@ -22604,6 +22933,7 @@ SQLITE_PRIVATE int sqlite3VdbeMemCast(Mem*,u8,u8); SQLITE_PRIVATE int sqlite3VdbeMemFromBtree(BtCursor*,u32,u32,Mem*); SQLITE_PRIVATE int sqlite3VdbeMemFromBtreeZeroOffset(BtCursor*,u32,Mem*); SQLITE_PRIVATE void sqlite3VdbeMemRelease(Mem *p); +SQLITE_PRIVATE void sqlite3VdbeMemReleaseMalloc(Mem*p); SQLITE_PRIVATE int sqlite3VdbeMemFinalize(Mem*, FuncDef*); #ifndef SQLITE_OMIT_WINDOWFUNC SQLITE_PRIVATE int sqlite3VdbeMemAggValue(Mem*, Mem*, FuncDef*); @@ -22963,6 +23293,8 @@ SQLITE_API int sqlite3_db_status( sqlite3BtreeEnterAll(db); db->pnBytesFreed = &nByte; + assert( db->lookaside.pEnd==db->lookaside.pTrueEnd ); + db->lookaside.pEnd = db->lookaside.pStart; for(i=0; inDb; i++){ Schema *pSchema = db->aDb[i].pSchema; if( ALWAYS(pSchema!=0) ){ @@ -22988,6 +23320,7 @@ SQLITE_API int sqlite3_db_status( } } db->pnBytesFreed = 0; + db->lookaside.pEnd = db->lookaside.pTrueEnd; sqlite3BtreeLeaveAll(db); *pHighwater = 0; @@ -23005,10 +23338,12 @@ SQLITE_API int sqlite3_db_status( int nByte = 0; /* Used to accumulate return value */ db->pnBytesFreed = &nByte; - for(pVdbe=db->pVdbe; pVdbe; pVdbe=pVdbe->pNext){ - sqlite3VdbeClearObject(db, pVdbe); - sqlite3DbFree(db, pVdbe); + assert( db->lookaside.pEnd==db->lookaside.pTrueEnd ); + db->lookaside.pEnd = db->lookaside.pStart; + for(pVdbe=db->pVdbe; pVdbe; pVdbe=pVdbe->pVNext){ + sqlite3VdbeDelete(pVdbe); } + db->lookaside.pEnd = db->lookaside.pTrueEnd; db->pnBytesFreed = 0; *pHighwater = 0; /* IMP: R-64479-57858 */ @@ -23344,7 +23679,7 @@ static void computeJD(DateTime *p){ p->iJD = (sqlite3_int64)((X1 + X2 + D + B - 1524.5 ) * 86400000); p->validJD = 1; if( p->validHMS ){ - p->iJD += p->h*3600000 + p->m*60000 + (sqlite3_int64)(p->s*1000); + p->iJD += p->h*3600000 + p->m*60000 + (sqlite3_int64)(p->s*1000 + 0.5); if( p->validTZ ){ p->iJD -= p->tz*60000; p->validYMD = 0; @@ -23853,7 +24188,7 @@ static int parseModifier( */ if( sqlite3_strnicmp(z, "weekday ", 8)==0 && sqlite3AtoF(&z[8], &r, sqlite3Strlen30(&z[8]), SQLITE_UTF8)>0 - && (n=(int)r)==r && n>=0 && r<7 ){ + && r>=0.0 && r<7.0 && (n=(int)r)==r ){ sqlite3_int64 Z; computeYMD_HMS(p); p->validTZ = 0; @@ -24534,9 +24869,11 @@ SQLITE_PRIVATE int sqlite3OsFileSize(sqlite3_file *id, i64 *pSize){ } SQLITE_PRIVATE int sqlite3OsLock(sqlite3_file *id, int lockType){ DO_OS_MALLOC_TEST(id); + assert( lockType>=SQLITE_LOCK_SHARED && lockType<=SQLITE_LOCK_EXCLUSIVE ); return id->pMethods->xLock(id, lockType); } SQLITE_PRIVATE int sqlite3OsUnlock(sqlite3_file *id, int lockType){ + assert( lockType==SQLITE_LOCK_NONE || lockType==SQLITE_LOCK_SHARED ); return id->pMethods->xUnlock(id, lockType); } SQLITE_PRIVATE int sqlite3OsCheckReservedLock(sqlite3_file *id, int *pResOut){ @@ -24651,6 +24988,7 @@ SQLITE_PRIVATE int sqlite3OsOpen( ** down into the VFS layer. Some SQLITE_OPEN_ flags (for example, ** SQLITE_OPEN_FULLMUTEX or SQLITE_OPEN_SHAREDCACHE) are blocked before ** reaching the VFS. */ + assert( zPath || (flags & SQLITE_OPEN_EXCLUSIVE) ); rc = pVfs->xOpen(pVfs, zPath, pFile, flags & 0x1087f7f, pFlagsOut); assert( rc==SQLITE_OK || pFile->pMethods==0 ); return rc; @@ -26962,8 +27300,17 @@ static void *memsys5Realloc(void *pPrior, int nBytes){ */ static int memsys5Roundup(int n){ int iFullSz; - if( n > 0x40000000 ) return 0; - for(iFullSz=mem5.szAtom; iFullSz0x10000000 ){ + if( n>0x40000000 ) return 0; + if( n>0x20000000 ) return 0x40000000; + return 0x20000000; + } + for(iFullSz=mem5.szAtom*8; iFullSz=(i64)n ) return iFullSz/2; return iFullSz; } @@ -28864,18 +29211,34 @@ static void mallocWithAlarm(int n, void **pp){ *pp = p; } +/* +** Maximum size of any single memory allocation. +** +** This is not a limit on the total amount of memory used. This is +** a limit on the size parameter to sqlite3_malloc() and sqlite3_realloc(). +** +** The upper bound is slightly less than 2GiB: 0x7ffffeff == 2,147,483,391 +** This provides a 256-byte safety margin for defense against 32-bit +** signed integer overflow bugs when computing memory allocation sizes. +** Parnoid applications might want to reduce the maximum allocation size +** further for an even larger safety margin. 0x3fffffff or 0x0fffffff +** or even smaller would be reasonable upper bounds on the size of a memory +** allocations for most applications. +*/ +#ifndef SQLITE_MAX_ALLOCATION_SIZE +# define SQLITE_MAX_ALLOCATION_SIZE 2147483391 +#endif +#if SQLITE_MAX_ALLOCATION_SIZE>2147483391 +# error Maximum size for SQLITE_MAX_ALLOCATION_SIZE is 2147483391 +#endif + /* ** Allocate memory. This routine is like sqlite3_malloc() except that it ** assumes the memory subsystem has already been initialized. */ SQLITE_PRIVATE void *sqlite3Malloc(u64 n){ void *p; - if( n==0 || n>=0x7fffff00 ){ - /* A memory allocation of a number of bytes which is near the maximum - ** signed integer value might cause an integer overflow inside of the - ** xMalloc(). Hence we limit the maximum size to 0x7fffff00, giving - ** 255 bytes of overhead. SQLite itself will never use anything near - ** this amount. The only way to reach the limit is with sqlite3_malloc() */ + if( n==0 || n>SQLITE_MAX_ALLOCATION_SIZE ){ p = 0; }else if( sqlite3GlobalConfig.bMemstat ){ sqlite3_mutex_enter(mem0.mutex); @@ -28911,7 +29274,7 @@ SQLITE_API void *sqlite3_malloc64(sqlite3_uint64 n){ */ #ifndef SQLITE_OMIT_LOOKASIDE static int isLookaside(sqlite3 *db, const void *p){ - return SQLITE_WITHIN(p, db->lookaside.pStart, db->lookaside.pEnd); + return SQLITE_WITHIN(p, db->lookaside.pStart, db->lookaside.pTrueEnd); } #else #define isLookaside(A,B) 0 @@ -28935,18 +29298,16 @@ static int lookasideMallocSize(sqlite3 *db, const void *p){ SQLITE_PRIVATE int sqlite3DbMallocSize(sqlite3 *db, const void *p){ assert( p!=0 ); #ifdef SQLITE_DEBUG - if( db==0 || !isLookaside(db,p) ){ - if( db==0 ){ - assert( sqlite3MemdebugNoType(p, (u8)~MEMTYPE_HEAP) ); - assert( sqlite3MemdebugHasType(p, MEMTYPE_HEAP) ); - }else{ - assert( sqlite3MemdebugHasType(p, (MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); - assert( sqlite3MemdebugNoType(p, (u8)~(MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); - } + if( db==0 ){ + assert( sqlite3MemdebugNoType(p, (u8)~MEMTYPE_HEAP) ); + assert( sqlite3MemdebugHasType(p, MEMTYPE_HEAP) ); + }else if( !isLookaside(db,p) ){ + assert( sqlite3MemdebugHasType(p, (MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); + assert( sqlite3MemdebugNoType(p, (u8)~(MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); } #endif if( db ){ - if( ((uptr)p)<(uptr)(db->lookaside.pEnd) ){ + if( ((uptr)p)<(uptr)(db->lookaside.pTrueEnd) ){ #ifndef SQLITE_OMIT_TWOSIZE_LOOKASIDE if( ((uptr)p)>=(uptr)(db->lookaside.pMiddle) ){ assert( sqlite3_mutex_held(db->mutex) ); @@ -29002,14 +29363,11 @@ SQLITE_PRIVATE void sqlite3DbFreeNN(sqlite3 *db, void *p){ assert( db==0 || sqlite3_mutex_held(db->mutex) ); assert( p!=0 ); if( db ){ - if( db->pnBytesFreed ){ - measureAllocationSize(db, p); - return; - } if( ((uptr)p)<(uptr)(db->lookaside.pEnd) ){ #ifndef SQLITE_OMIT_TWOSIZE_LOOKASIDE if( ((uptr)p)>=(uptr)(db->lookaside.pMiddle) ){ LookasideSlot *pBuf = (LookasideSlot*)p; + assert( db->pnBytesFreed==0 ); #ifdef SQLITE_DEBUG memset(p, 0xaa, LOOKASIDE_SMALL); /* Trash freed content */ #endif @@ -29020,6 +29378,7 @@ SQLITE_PRIVATE void sqlite3DbFreeNN(sqlite3 *db, void *p){ #endif /* SQLITE_OMIT_TWOSIZE_LOOKASIDE */ if( ((uptr)p)>=(uptr)(db->lookaside.pStart) ){ LookasideSlot *pBuf = (LookasideSlot*)p; + assert( db->pnBytesFreed==0 ); #ifdef SQLITE_DEBUG memset(p, 0xaa, db->lookaside.szTrue); /* Trash freed content */ #endif @@ -29028,6 +29387,10 @@ SQLITE_PRIVATE void sqlite3DbFreeNN(sqlite3 *db, void *p){ return; } } + if( db->pnBytesFreed ){ + measureAllocationSize(db, p); + return; + } } assert( sqlite3MemdebugHasType(p, (MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); assert( sqlite3MemdebugNoType(p, (u8)~(MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); @@ -29035,6 +29398,43 @@ SQLITE_PRIVATE void sqlite3DbFreeNN(sqlite3 *db, void *p){ sqlite3MemdebugSetType(p, MEMTYPE_HEAP); sqlite3_free(p); } +SQLITE_PRIVATE void sqlite3DbNNFreeNN(sqlite3 *db, void *p){ + assert( db!=0 ); + assert( sqlite3_mutex_held(db->mutex) ); + assert( p!=0 ); + if( ((uptr)p)<(uptr)(db->lookaside.pEnd) ){ +#ifndef SQLITE_OMIT_TWOSIZE_LOOKASIDE + if( ((uptr)p)>=(uptr)(db->lookaside.pMiddle) ){ + LookasideSlot *pBuf = (LookasideSlot*)p; + assert( db->pnBytesFreed==0 ); +#ifdef SQLITE_DEBUG + memset(p, 0xaa, LOOKASIDE_SMALL); /* Trash freed content */ +#endif + pBuf->pNext = db->lookaside.pSmallFree; + db->lookaside.pSmallFree = pBuf; + return; + } +#endif /* SQLITE_OMIT_TWOSIZE_LOOKASIDE */ + if( ((uptr)p)>=(uptr)(db->lookaside.pStart) ){ + LookasideSlot *pBuf = (LookasideSlot*)p; + assert( db->pnBytesFreed==0 ); +#ifdef SQLITE_DEBUG + memset(p, 0xaa, db->lookaside.szTrue); /* Trash freed content */ +#endif + pBuf->pNext = db->lookaside.pFree; + db->lookaside.pFree = pBuf; + return; + } + } + if( db->pnBytesFreed ){ + measureAllocationSize(db, p); + return; + } + assert( sqlite3MemdebugHasType(p, (MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); + assert( sqlite3MemdebugNoType(p, (u8)~(MEMTYPE_LOOKASIDE|MEMTYPE_HEAP)) ); + sqlite3MemdebugSetType(p, MEMTYPE_HEAP); + sqlite3_free(p); +} SQLITE_PRIVATE void sqlite3DbFree(sqlite3 *db, void *p){ assert( db==0 || sqlite3_mutex_held(db->mutex) ); if( p ) sqlite3DbFreeNN(db, p); @@ -29370,8 +29770,13 @@ SQLITE_PRIVATE void *sqlite3OomFault(sqlite3 *db){ } DisableLookaside; if( db->pParse ){ + Parse *pParse; sqlite3ErrorMsg(db->pParse, "out of memory"); db->pParse->rc = SQLITE_NOMEM_BKPT; + for(pParse=db->pParse->pOuterParse; pParse; pParse = pParse->pOuterParse){ + pParse->nErr++; + pParse->rc = SQLITE_NOMEM; + } } } return 0; @@ -30237,8 +30642,8 @@ SQLITE_API void sqlite3_str_vappendf( case etSQLESCAPE: /* %q: Escape ' characters */ case etSQLESCAPE2: /* %Q: Escape ' and enclose in '...' */ case etSQLESCAPE3: { /* %w: Escape " characters */ - int i, j, k, n, isnull; - int needQuote; + i64 i, j, k, n; + int needQuote, isnull; char ch; char q = ((xtype==etSQLESCAPE3)?'"':'\''); /* Quote character */ char *escarg; @@ -30318,8 +30723,14 @@ SQLITE_API void sqlite3_str_vappendf( sqlite3_str_appendall(pAccum, pItem->zName); }else if( pItem->zAlias ){ sqlite3_str_appendall(pAccum, pItem->zAlias); - }else if( ALWAYS(pItem->pSelect) ){ - sqlite3_str_appendf(pAccum, "SUBQUERY %u", pItem->pSelect->selId); + }else{ + Select *pSel = pItem->pSelect; + assert( pSel!=0 ); + if( pSel->selFlags & SF_NestedFrom ){ + sqlite3_str_appendf(pAccum, "(join-%u)", pSel->selId); + }else{ + sqlite3_str_appendf(pAccum, "(subquery-%u)", pSel->selId); + } } length = width = 0; break; @@ -30382,7 +30793,9 @@ SQLITE_PRIVATE void sqlite3RecordErrorByteOffset(sqlite3 *db, const char *z){ ** as the error offset. */ SQLITE_PRIVATE void sqlite3RecordErrorOffsetOfExpr(sqlite3 *db, const Expr *pExpr){ - while( pExpr && (ExprHasProperty(pExpr,EP_FromJoin) || pExpr->w.iOfst<=0) ){ + while( pExpr + && (ExprHasProperty(pExpr,EP_OuterON|EP_InnerON) || pExpr->w.iOfst<=0) + ){ pExpr = pExpr->pLeft; } if( pExpr==0 ) return; @@ -30842,40 +31255,44 @@ SQLITE_API void sqlite3_str_appendf(StrAccum *p, const char *zFormat, ...){ ** Add a new subitem to the tree. The moreToFollow flag indicates that this ** is not the last item in the tree. */ -static TreeView *sqlite3TreeViewPush(TreeView *p, u8 moreToFollow){ +static void sqlite3TreeViewPush(TreeView **pp, u8 moreToFollow){ + TreeView *p = *pp; if( p==0 ){ - p = sqlite3_malloc64( sizeof(*p) ); - if( p==0 ) return 0; + *pp = p = sqlite3_malloc64( sizeof(*p) ); + if( p==0 ) return; memset(p, 0, sizeof(*p)); }else{ p->iLevel++; } assert( moreToFollow==0 || moreToFollow==1 ); - if( p->iLevelbLine) ) p->bLine[p->iLevel] = moreToFollow; - return p; + if( p->iLevel<(int)sizeof(p->bLine) ) p->bLine[p->iLevel] = moreToFollow; } /* ** Finished with one layer of the tree */ -static void sqlite3TreeViewPop(TreeView *p){ +static void sqlite3TreeViewPop(TreeView **pp){ + TreeView *p = *pp; if( p==0 ) return; p->iLevel--; - if( p->iLevel<0 ) sqlite3_free(p); + if( p->iLevel<0 ){ + sqlite3_free(p); + *pp = 0; + } } /* ** Generate a single line of output for the tree, with a prefix that contains ** all the appropriate tree lines */ -static void sqlite3TreeViewLine(TreeView *p, const char *zFormat, ...){ +SQLITE_PRIVATE void sqlite3TreeViewLine(TreeView *p, const char *zFormat, ...){ va_list ap; int i; StrAccum acc; - char zBuf[500]; + char zBuf[1000]; sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0); if( p ){ - for(i=0; iiLevel && ibLine)-1; i++){ + for(i=0; iiLevel && i<(int)sizeof(p->bLine)-1; i++){ sqlite3_str_append(&acc, p->bLine[i] ? "| " : " ", 4); } sqlite3_str_append(&acc, p->bLine[i] ? "|-- " : "'-- ", 4); @@ -30896,10 +31313,57 @@ static void sqlite3TreeViewLine(TreeView *p, const char *zFormat, ...){ ** Shorthand for starting a new tree item that consists of a single label */ static void sqlite3TreeViewItem(TreeView *p, const char *zLabel,u8 moreFollows){ - p = sqlite3TreeViewPush(p, moreFollows); + sqlite3TreeViewPush(&p, moreFollows); sqlite3TreeViewLine(p, "%s", zLabel); } +/* +** Show a list of Column objects in tree format. +*/ +SQLITE_PRIVATE void sqlite3TreeViewColumnList( + TreeView *pView, + const Column *aCol, + int nCol, + u8 moreToFollow +){ + int i; + sqlite3TreeViewPush(&pView, moreToFollow); + sqlite3TreeViewLine(pView, "COLUMNS"); + for(i=0; inCte>0 ){ - pView = sqlite3TreeViewPush(pView, 1); + sqlite3TreeViewPush(&pView, moreToFollow); for(i=0; inCte; i++){ StrAccum x; char zLine[1000]; @@ -30929,6 +31393,10 @@ SQLITE_PRIVATE void sqlite3TreeViewWith(TreeView *pView, const With *pWith, u8 m } sqlite3_str_appendf(&x, ")"); } + if( pCte->eM10d!=M10d_Any ){ + sqlite3_str_appendf(&x, " %sMATERIALIZED", + pCte->eM10d==M10d_No ? "NOT " : ""); + } if( pCte->pUse ){ sqlite3_str_appendf(&x, " (pUse=0x%p, nUse=%d)", pCte->pUse, pCte->pUse->nUse); @@ -30936,9 +31404,9 @@ SQLITE_PRIVATE void sqlite3TreeViewWith(TreeView *pView, const With *pWith, u8 m sqlite3StrAccumFinish(&x); sqlite3TreeViewItem(pView, zLine, inCte-1); sqlite3TreeViewSelect(pView, pCte->pSelect, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } } @@ -30947,10 +31415,12 @@ SQLITE_PRIVATE void sqlite3TreeViewWith(TreeView *pView, const With *pWith, u8 m */ SQLITE_PRIVATE void sqlite3TreeViewSrcList(TreeView *pView, const SrcList *pSrc){ int i; + if( pSrc==0 ) return; for(i=0; inSrc; i++){ const SrcItem *pItem = &pSrc->a[i]; StrAccum x; - char zLine[100]; + int n = 0; + char zLine[1000]; sqlite3StrAccumInit(&x, 0, zLine, sizeof(zLine), 0); x.printfFlags |= SQLITE_PRINTF_INTERNAL; sqlite3_str_appendf(&x, "{%d:*} %!S", pItem->iCursor, pItem); @@ -30958,26 +31428,48 @@ SQLITE_PRIVATE void sqlite3TreeViewSrcList(TreeView *pView, const SrcList *pSrc) sqlite3_str_appendf(&x, " tab=%Q nCol=%d ptr=%p used=%llx", pItem->pTab->zName, pItem->pTab->nCol, pItem->pTab, pItem->colUsed); } - if( pItem->fg.jointype & JT_LEFT ){ + if( (pItem->fg.jointype & (JT_LEFT|JT_RIGHT))==(JT_LEFT|JT_RIGHT) ){ + sqlite3_str_appendf(&x, " FULL-OUTER-JOIN"); + }else if( pItem->fg.jointype & JT_LEFT ){ sqlite3_str_appendf(&x, " LEFT-JOIN"); + }else if( pItem->fg.jointype & JT_RIGHT ){ + sqlite3_str_appendf(&x, " RIGHT-JOIN"); }else if( pItem->fg.jointype & JT_CROSS ){ sqlite3_str_appendf(&x, " CROSS-JOIN"); } + if( pItem->fg.jointype & JT_LTORJ ){ + sqlite3_str_appendf(&x, " LTORJ"); + } if( pItem->fg.fromDDL ){ sqlite3_str_appendf(&x, " DDL"); } if( pItem->fg.isCte ){ sqlite3_str_appendf(&x, " CteUse=0x%p", pItem->u2.pCteUse); } + if( pItem->fg.isOn || (pItem->fg.isUsing==0 && pItem->u3.pOn!=0) ){ + sqlite3_str_appendf(&x, " ON"); + } sqlite3StrAccumFinish(&x); sqlite3TreeViewItem(pView, zLine, inSrc-1); + n = 0; + if( pItem->pSelect ) n++; + if( pItem->fg.isTabFunc ) n++; + if( pItem->fg.isUsing ) n++; + if( pItem->fg.isUsing ){ + sqlite3TreeViewIdList(pView, pItem->u3.pUsing, (--n)>0, "USING"); + } if( pItem->pSelect ){ - sqlite3TreeViewSelect(pView, pItem->pSelect, 0); + if( pItem->pTab ){ + Table *pTab = pItem->pTab; + sqlite3TreeViewColumnList(pView, pTab->aCol, pTab->nCol, 1); + } + assert( (int)pItem->fg.isNestedFrom == IsNestedFrom(pItem->pSelect) ); + sqlite3TreeViewSelect(pView, pItem->pSelect, (--n)>0); } if( pItem->fg.isTabFunc ){ sqlite3TreeViewExprList(pView, pItem->u1.pFuncArg, 0, "func-args:"); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } } @@ -30991,11 +31483,11 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m sqlite3TreeViewLine(pView, "nil-SELECT"); return; } - pView = sqlite3TreeViewPush(pView, moreToFollow); + sqlite3TreeViewPush(&pView, moreToFollow); if( p->pWith ){ sqlite3TreeViewWith(pView, p->pWith, 1); cnt = 1; - sqlite3TreeViewPush(pView, 1); + sqlite3TreeViewPush(&pView, 1); } do{ if( p->selFlags & SF_WhereBegin ){ @@ -31009,7 +31501,7 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m (int)p->nSelectRow ); } - if( cnt++ ) sqlite3TreeViewPop(pView); + if( cnt++ ) sqlite3TreeViewPop(&pView); if( p->pPrior ){ n = 1000; }else{ @@ -31032,24 +31524,24 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m #ifndef SQLITE_OMIT_WINDOWFUNC if( p->pWin ){ Window *pX; - pView = sqlite3TreeViewPush(pView, (n--)>0); + sqlite3TreeViewPush(&pView, (n--)>0); sqlite3TreeViewLine(pView, "window-functions"); for(pX=p->pWin; pX; pX=pX->pNextWin){ sqlite3TreeViewWinFunc(pView, pX, pX->pNextWin!=0); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } #endif if( p->pSrc && p->pSrc->nSrc ){ - pView = sqlite3TreeViewPush(pView, (n--)>0); + sqlite3TreeViewPush(&pView, (n--)>0); sqlite3TreeViewLine(pView, "FROM"); sqlite3TreeViewSrcList(pView, p->pSrc); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } if( p->pWhere ){ sqlite3TreeViewItem(pView, "WHERE", (n--)>0); sqlite3TreeViewExpr(pView, p->pWhere, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } if( p->pGroupBy ){ sqlite3TreeViewExprList(pView, p->pGroupBy, (n--)>0, "GROUPBY"); @@ -31057,7 +31549,7 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m if( p->pHaving ){ sqlite3TreeViewItem(pView, "HAVING", (n--)>0); sqlite3TreeViewExpr(pView, p->pHaving, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } #ifndef SQLITE_OMIT_WINDOWFUNC if( p->pWinDefn ){ @@ -31066,7 +31558,7 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m for(pX=p->pWinDefn; pX; pX=pX->pNextWin){ sqlite3TreeViewWindow(pView, pX, pX->pNextWin!=0); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } #endif if( p->pOrderBy ){ @@ -31078,9 +31570,9 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m if( p->pLimit->pRight ){ sqlite3TreeViewItem(pView, "OFFSET", (n--)>0); sqlite3TreeViewExpr(pView, p->pLimit->pRight, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } if( p->pPrior ){ const char *zOp = "UNION"; @@ -31093,7 +31585,7 @@ SQLITE_PRIVATE void sqlite3TreeViewSelect(TreeView *pView, const Select *p, u8 m } p = p->pPrior; }while( p!=0 ); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } #ifndef SQLITE_OMIT_WINDOWFUNC @@ -31109,24 +31601,24 @@ SQLITE_PRIVATE void sqlite3TreeViewBound( switch( eBound ){ case TK_UNBOUNDED: { sqlite3TreeViewItem(pView, "UNBOUNDED", moreToFollow); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); break; } case TK_CURRENT: { sqlite3TreeViewItem(pView, "CURRENT", moreToFollow); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); break; } case TK_PRECEDING: { sqlite3TreeViewItem(pView, "PRECEDING", moreToFollow); sqlite3TreeViewExpr(pView, pExpr, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); break; } case TK_FOLLOWING: { sqlite3TreeViewItem(pView, "FOLLOWING", moreToFollow); sqlite3TreeViewExpr(pView, pExpr, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); break; } } @@ -31139,12 +31631,13 @@ SQLITE_PRIVATE void sqlite3TreeViewBound( */ SQLITE_PRIVATE void sqlite3TreeViewWindow(TreeView *pView, const Window *pWin, u8 more){ int nElement = 0; + if( pWin==0 ) return; if( pWin->pFilter ){ sqlite3TreeViewItem(pView, "FILTER", 1); sqlite3TreeViewExpr(pView, pWin->pFilter, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } - pView = sqlite3TreeViewPush(pView, more); + sqlite3TreeViewPush(&pView, more); if( pWin->zName ){ sqlite3TreeViewLine(pView, "OVER %s (%p)", pWin->zName, pWin); }else{ @@ -31155,9 +31648,9 @@ SQLITE_PRIVATE void sqlite3TreeViewWindow(TreeView *pView, const Window *pWin, u if( pWin->eFrmType ) nElement++; if( pWin->eExclude ) nElement++; if( pWin->zBase ){ - sqlite3TreeViewPush(pView, (--nElement)>0); + sqlite3TreeViewPush(&pView, (--nElement)>0); sqlite3TreeViewLine(pView, "window: %s", pWin->zBase); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } if( pWin->pPartition ){ sqlite3TreeViewExprList(pView, pWin->pPartition, nElement>0,"PARTITION-BY"); @@ -31175,7 +31668,7 @@ SQLITE_PRIVATE void sqlite3TreeViewWindow(TreeView *pView, const Window *pWin, u sqlite3TreeViewItem(pView, zBuf, (--nElement)>0); sqlite3TreeViewBound(pView, pWin->eStart, pWin->pStart, 1); sqlite3TreeViewBound(pView, pWin->eEnd, pWin->pEnd, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } if( pWin->eExclude ){ char zBuf[30]; @@ -31190,11 +31683,11 @@ SQLITE_PRIVATE void sqlite3TreeViewWindow(TreeView *pView, const Window *pWin, u zExclude = zBuf; break; } - sqlite3TreeViewPush(pView, 0); + sqlite3TreeViewPush(&pView, 0); sqlite3TreeViewLine(pView, "EXCLUDE %s", zExclude); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } #endif /* SQLITE_OMIT_WINDOWFUNC */ @@ -31203,11 +31696,12 @@ SQLITE_PRIVATE void sqlite3TreeViewWindow(TreeView *pView, const Window *pWin, u ** Generate a human-readable explanation for a Window Function object */ SQLITE_PRIVATE void sqlite3TreeViewWinFunc(TreeView *pView, const Window *pWin, u8 more){ - pView = sqlite3TreeViewPush(pView, more); + if( pWin==0 ) return; + sqlite3TreeViewPush(&pView, more); sqlite3TreeViewLine(pView, "WINFUNC %s(%d)", - pWin->pFunc->zName, pWin->pFunc->nArg); + pWin->pWFunc->zName, pWin->pWFunc->nArg); sqlite3TreeViewWindow(pView, pWin, 0); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } #endif /* SQLITE_OMIT_WINDOWFUNC */ @@ -31218,10 +31712,10 @@ SQLITE_PRIVATE void sqlite3TreeViewExpr(TreeView *pView, const Expr *pExpr, u8 m const char *zBinOp = 0; /* Binary operator */ const char *zUniOp = 0; /* Unary operator */ char zFlgs[200]; - pView = sqlite3TreeViewPush(pView, moreToFollow); + sqlite3TreeViewPush(&pView, moreToFollow); if( pExpr==0 ){ sqlite3TreeViewLine(pView, "nil"); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); return; } if( pExpr->flags || pExpr->affExpr || pExpr->vvaFlags ){ @@ -31229,8 +31723,11 @@ SQLITE_PRIVATE void sqlite3TreeViewExpr(TreeView *pView, const Expr *pExpr, u8 m sqlite3StrAccumInit(&x, 0, zFlgs, sizeof(zFlgs), 0); sqlite3_str_appendf(&x, " fg.af=%x.%c", pExpr->flags, pExpr->affExpr ? pExpr->affExpr : 'n'); - if( ExprHasProperty(pExpr, EP_FromJoin) ){ - sqlite3_str_appendf(&x, " iRJT=%d", pExpr->w.iRightJoinTable); + if( ExprHasProperty(pExpr, EP_OuterON) ){ + sqlite3_str_appendf(&x, " outer.iJoin=%d", pExpr->w.iJoin); + } + if( ExprHasProperty(pExpr, EP_InnerON) ){ + sqlite3_str_appendf(&x, " inner.iJoin=%d", pExpr->w.iJoin); } if( ExprHasProperty(pExpr, EP_FromDDL) ){ sqlite3_str_appendf(&x, " DDL"); @@ -31454,7 +31951,17 @@ SQLITE_PRIVATE void sqlite3TreeViewExpr(TreeView *pView, const Expr *pExpr, u8 m break; } case TK_IN: { - sqlite3TreeViewLine(pView, "IN flags=0x%x", pExpr->flags); + sqlite3_str *pStr = sqlite3_str_new(0); + char *z; + sqlite3_str_appendf(pStr, "IN flags=0x%x", pExpr->flags); + if( pExpr->iTable ) sqlite3_str_appendf(pStr, " iTable=%d",pExpr->iTable); + if( ExprHasProperty(pExpr, EP_Subrtn) ){ + sqlite3_str_appendf(pStr, " subrtn(%d,%d)", + pExpr->y.sub.regReturn, pExpr->y.sub.iAddr); + } + z = sqlite3_str_finish(pStr); + sqlite3TreeViewLine(pView, z); + sqlite3_free(z); sqlite3TreeViewExpr(pView, pExpr->pLeft, 1); if( ExprUseXSelect(pExpr) ){ sqlite3TreeViewSelect(pView, pExpr->x.pSelect, 0); @@ -31578,7 +32085,7 @@ SQLITE_PRIVATE void sqlite3TreeViewExpr(TreeView *pView, const Expr *pExpr, u8 m sqlite3TreeViewLine(pView, "%s%s", zUniOp, zFlgs); sqlite3TreeViewExpr(pView, pExpr->pLeft, 0); } - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } @@ -31600,13 +32107,25 @@ SQLITE_PRIVATE void sqlite3TreeViewBareExprList( int j = pList->a[i].u.x.iOrderByCol; char *zName = pList->a[i].zEName; int moreToFollow = inExpr - 1; - if( pList->a[i].eEName!=ENAME_NAME ) zName = 0; if( j || zName ){ - sqlite3TreeViewPush(pView, moreToFollow); + sqlite3TreeViewPush(&pView, moreToFollow); moreToFollow = 0; sqlite3TreeViewLine(pView, 0); if( zName ){ - fprintf(stdout, "AS %s ", zName); + switch( pList->a[i].fg.eEName ){ + default: + fprintf(stdout, "AS %s ", zName); + break; + case ENAME_TAB: + fprintf(stdout, "TABLE-ALIAS-NAME(\"%s\") ", zName); + if( pList->a[i].fg.bUsed ) fprintf(stdout, "(used) "); + if( pList->a[i].fg.bUsingTerm ) fprintf(stdout, "(USING-term) "); + if( pList->a[i].fg.bNoExpand ) fprintf(stdout, "(NoExpand) "); + break; + case ENAME_SPAN: + fprintf(stdout, "SPAN(\"%s\") ", zName); + break; + } } if( j ){ fprintf(stdout, "iOrderByCol=%d", j); @@ -31616,7 +32135,7 @@ SQLITE_PRIVATE void sqlite3TreeViewBareExprList( } sqlite3TreeViewExpr(pView, pList->a[i].pExpr, moreToFollow); if( j || zName ){ - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); } } } @@ -31627,10 +32146,377 @@ SQLITE_PRIVATE void sqlite3TreeViewExprList( u8 moreToFollow, const char *zLabel ){ - pView = sqlite3TreeViewPush(pView, moreToFollow); + sqlite3TreeViewPush(&pView, moreToFollow); sqlite3TreeViewBareExprList(pView, pList, zLabel); - sqlite3TreeViewPop(pView); + sqlite3TreeViewPop(&pView); +} + +/* +** Generate a human-readable explanation of an id-list. +*/ +SQLITE_PRIVATE void sqlite3TreeViewBareIdList( + TreeView *pView, + const IdList *pList, + const char *zLabel +){ + if( zLabel==0 || zLabel[0]==0 ) zLabel = "LIST"; + if( pList==0 ){ + sqlite3TreeViewLine(pView, "%s (empty)", zLabel); + }else{ + int i; + sqlite3TreeViewLine(pView, "%s", zLabel); + for(i=0; inId; i++){ + char *zName = pList->a[i].zName; + int moreToFollow = inId - 1; + if( zName==0 ) zName = "(null)"; + sqlite3TreeViewPush(&pView, moreToFollow); + sqlite3TreeViewLine(pView, 0); + if( pList->eU4==EU4_NONE ){ + fprintf(stdout, "%s\n", zName); + }else if( pList->eU4==EU4_IDX ){ + fprintf(stdout, "%s (%d)\n", zName, pList->a[i].u4.idx); + }else{ + assert( pList->eU4==EU4_EXPR ); + if( pList->a[i].u4.pExpr==0 ){ + fprintf(stdout, "%s (pExpr=NULL)\n", zName); + }else{ + fprintf(stdout, "%s\n", zName); + sqlite3TreeViewPush(&pView, inId-1); + sqlite3TreeViewExpr(pView, pList->a[i].u4.pExpr, 0); + sqlite3TreeViewPop(&pView); + } + } + sqlite3TreeViewPop(&pView); + } + } +} +SQLITE_PRIVATE void sqlite3TreeViewIdList( + TreeView *pView, + const IdList *pList, + u8 moreToFollow, + const char *zLabel +){ + sqlite3TreeViewPush(&pView, moreToFollow); + sqlite3TreeViewBareIdList(pView, pList, zLabel); + sqlite3TreeViewPop(&pView); +} + +/* +** Generate a human-readable explanation of a list of Upsert objects +*/ +SQLITE_PRIVATE void sqlite3TreeViewUpsert( + TreeView *pView, + const Upsert *pUpsert, + u8 moreToFollow +){ + if( pUpsert==0 ) return; + sqlite3TreeViewPush(&pView, moreToFollow); + while( pUpsert ){ + int n; + sqlite3TreeViewPush(&pView, pUpsert->pNextUpsert!=0 || moreToFollow); + sqlite3TreeViewLine(pView, "ON CONFLICT DO %s", + pUpsert->isDoUpdate ? "UPDATE" : "NOTHING"); + n = (pUpsert->pUpsertSet!=0) + (pUpsert->pUpsertWhere!=0); + sqlite3TreeViewExprList(pView, pUpsert->pUpsertTarget, (n--)>0, "TARGET"); + sqlite3TreeViewExprList(pView, pUpsert->pUpsertSet, (n--)>0, "SET"); + if( pUpsert->pUpsertWhere ){ + sqlite3TreeViewItem(pView, "WHERE", (n--)>0); + sqlite3TreeViewExpr(pView, pUpsert->pUpsertWhere, 0); + sqlite3TreeViewPop(&pView); + } + sqlite3TreeViewPop(&pView); + pUpsert = pUpsert->pNextUpsert; + } + sqlite3TreeViewPop(&pView); +} + +#if TREETRACE_ENABLED +/* +** Generate a human-readable diagram of the data structure that go +** into generating an DELETE statement. +*/ +SQLITE_PRIVATE void sqlite3TreeViewDelete( + const With *pWith, + const SrcList *pTabList, + const Expr *pWhere, + const ExprList *pOrderBy, + const Expr *pLimit, + const Trigger *pTrigger +){ + int n = 0; + TreeView *pView = 0; + sqlite3TreeViewPush(&pView, 0); + sqlite3TreeViewLine(pView, "DELETE"); + if( pWith ) n++; + if( pTabList ) n++; + if( pWhere ) n++; + if( pOrderBy ) n++; + if( pLimit ) n++; + if( pTrigger ) n++; + if( pWith ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewWith(pView, pWith, 0); + sqlite3TreeViewPop(&pView); + } + if( pTabList ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "FROM"); + sqlite3TreeViewSrcList(pView, pTabList); + sqlite3TreeViewPop(&pView); + } + if( pWhere ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "WHERE"); + sqlite3TreeViewExpr(pView, pWhere, 0); + sqlite3TreeViewPop(&pView); + } + if( pOrderBy ){ + sqlite3TreeViewExprList(pView, pOrderBy, (--n)>0, "ORDER-BY"); + } + if( pLimit ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "LIMIT"); + sqlite3TreeViewExpr(pView, pLimit, 0); + sqlite3TreeViewPop(&pView); + } + if( pTrigger ){ + sqlite3TreeViewTrigger(pView, pTrigger, (--n)>0, 1); + } + sqlite3TreeViewPop(&pView); } +#endif /* TREETRACE_ENABLED */ + +#if TREETRACE_ENABLED +/* +** Generate a human-readable diagram of the data structure that go +** into generating an INSERT statement. +*/ +SQLITE_PRIVATE void sqlite3TreeViewInsert( + const With *pWith, + const SrcList *pTabList, + const IdList *pColumnList, + const Select *pSelect, + const ExprList *pExprList, + int onError, + const Upsert *pUpsert, + const Trigger *pTrigger +){ + TreeView *pView = 0; + int n = 0; + const char *zLabel = "INSERT"; + switch( onError ){ + case OE_Replace: zLabel = "REPLACE"; break; + case OE_Ignore: zLabel = "INSERT OR IGNORE"; break; + case OE_Rollback: zLabel = "INSERT OR ROLLBACK"; break; + case OE_Abort: zLabel = "INSERT OR ABORT"; break; + case OE_Fail: zLabel = "INSERT OR FAIL"; break; + } + sqlite3TreeViewPush(&pView, 0); + sqlite3TreeViewLine(pView, zLabel); + if( pWith ) n++; + if( pTabList ) n++; + if( pColumnList ) n++; + if( pSelect ) n++; + if( pExprList ) n++; + if( pUpsert ) n++; + if( pTrigger ) n++; + if( pWith ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewWith(pView, pWith, 0); + sqlite3TreeViewPop(&pView); + } + if( pTabList ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "INTO"); + sqlite3TreeViewSrcList(pView, pTabList); + sqlite3TreeViewPop(&pView); + } + if( pColumnList ){ + sqlite3TreeViewIdList(pView, pColumnList, (--n)>0, "COLUMNS"); + } + if( pSelect ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "DATA-SOURCE"); + sqlite3TreeViewSelect(pView, pSelect, 0); + sqlite3TreeViewPop(&pView); + } + if( pExprList ){ + sqlite3TreeViewExprList(pView, pExprList, (--n)>0, "VALUES"); + } + if( pUpsert ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "UPSERT"); + sqlite3TreeViewUpsert(pView, pUpsert, 0); + sqlite3TreeViewPop(&pView); + } + if( pTrigger ){ + sqlite3TreeViewTrigger(pView, pTrigger, (--n)>0, 1); + } + sqlite3TreeViewPop(&pView); +} +#endif /* TREETRACE_ENABLED */ + +#if TREETRACE_ENABLED +/* +** Generate a human-readable diagram of the data structure that go +** into generating an UPDATE statement. +*/ +SQLITE_PRIVATE void sqlite3TreeViewUpdate( + const With *pWith, + const SrcList *pTabList, + const ExprList *pChanges, + const Expr *pWhere, + int onError, + const ExprList *pOrderBy, + const Expr *pLimit, + const Upsert *pUpsert, + const Trigger *pTrigger +){ + int n = 0; + TreeView *pView = 0; + const char *zLabel = "UPDATE"; + switch( onError ){ + case OE_Replace: zLabel = "UPDATE OR REPLACE"; break; + case OE_Ignore: zLabel = "UPDATE OR IGNORE"; break; + case OE_Rollback: zLabel = "UPDATE OR ROLLBACK"; break; + case OE_Abort: zLabel = "UPDATE OR ABORT"; break; + case OE_Fail: zLabel = "UPDATE OR FAIL"; break; + } + sqlite3TreeViewPush(&pView, 0); + sqlite3TreeViewLine(pView, zLabel); + if( pWith ) n++; + if( pTabList ) n++; + if( pChanges ) n++; + if( pWhere ) n++; + if( pOrderBy ) n++; + if( pLimit ) n++; + if( pUpsert ) n++; + if( pTrigger ) n++; + if( pWith ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewWith(pView, pWith, 0); + sqlite3TreeViewPop(&pView); + } + if( pTabList ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "FROM"); + sqlite3TreeViewSrcList(pView, pTabList); + sqlite3TreeViewPop(&pView); + } + if( pChanges ){ + sqlite3TreeViewExprList(pView, pChanges, (--n)>0, "SET"); + } + if( pWhere ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "WHERE"); + sqlite3TreeViewExpr(pView, pWhere, 0); + sqlite3TreeViewPop(&pView); + } + if( pOrderBy ){ + sqlite3TreeViewExprList(pView, pOrderBy, (--n)>0, "ORDER-BY"); + } + if( pLimit ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "LIMIT"); + sqlite3TreeViewExpr(pView, pLimit, 0); + sqlite3TreeViewPop(&pView); + } + if( pUpsert ){ + sqlite3TreeViewPush(&pView, (--n)>0); + sqlite3TreeViewLine(pView, "UPSERT"); + sqlite3TreeViewUpsert(pView, pUpsert, 0); + sqlite3TreeViewPop(&pView); + } + if( pTrigger ){ + sqlite3TreeViewTrigger(pView, pTrigger, (--n)>0, 1); + } + sqlite3TreeViewPop(&pView); +} +#endif /* TREETRACE_ENABLED */ + +#ifndef SQLITE_OMIT_TRIGGER +/* +** Show a human-readable graph of a TriggerStep +*/ +SQLITE_PRIVATE void sqlite3TreeViewTriggerStep( + TreeView *pView, + const TriggerStep *pStep, + u8 moreToFollow, + u8 showFullList +){ + int cnt = 0; + if( pStep==0 ) return; + sqlite3TreeViewPush(&pView, + moreToFollow || (showFullList && pStep->pNext!=0)); + do{ + if( cnt++ && pStep->pNext==0 ){ + sqlite3TreeViewPop(&pView); + sqlite3TreeViewPush(&pView, 0); + } + sqlite3TreeViewLine(pView, "%s", pStep->zSpan ? pStep->zSpan : "RETURNING"); + }while( showFullList && (pStep = pStep->pNext)!=0 ); + sqlite3TreeViewPop(&pView); +} + +/* +** Show a human-readable graph of a Trigger +*/ +SQLITE_PRIVATE void sqlite3TreeViewTrigger( + TreeView *pView, + const Trigger *pTrigger, + u8 moreToFollow, + u8 showFullList +){ + int cnt = 0; + if( pTrigger==0 ) return; + sqlite3TreeViewPush(&pView, + moreToFollow || (showFullList && pTrigger->pNext!=0)); + do{ + if( cnt++ && pTrigger->pNext==0 ){ + sqlite3TreeViewPop(&pView); + sqlite3TreeViewPush(&pView, 0); + } + sqlite3TreeViewLine(pView, "TRIGGER %s", pTrigger->zName); + sqlite3TreeViewPush(&pView, 0); + sqlite3TreeViewTriggerStep(pView, pTrigger->step_list, 0, 1); + sqlite3TreeViewPop(&pView); + }while( showFullList && (pTrigger = pTrigger->pNext)!=0 ); + sqlite3TreeViewPop(&pView); +} +#endif /* SQLITE_OMIT_TRIGGER */ + + +/* +** These simplified versions of the tree-view routines omit unnecessary +** parameters. These variants are intended to be used from a symbolic +** debugger, such as "gdb", during interactive debugging sessions. +** +** This routines are given external linkage so that they will always be +** accessible to the debugging, and to avoid warnings about unused +** functions. But these routines only exist in debugging builds, so they +** do not contaminate the interface. +*/ +SQLITE_PRIVATE void sqlite3ShowExpr(const Expr *p){ sqlite3TreeViewExpr(0,p,0); } +SQLITE_PRIVATE void sqlite3ShowExprList(const ExprList *p){ sqlite3TreeViewExprList(0,p,0,0);} +SQLITE_PRIVATE void sqlite3ShowIdList(const IdList *p){ sqlite3TreeViewIdList(0,p,0,0); } +SQLITE_PRIVATE void sqlite3ShowSrcList(const SrcList *p){ sqlite3TreeViewSrcList(0,p); } +SQLITE_PRIVATE void sqlite3ShowSelect(const Select *p){ sqlite3TreeViewSelect(0,p,0); } +SQLITE_PRIVATE void sqlite3ShowWith(const With *p){ sqlite3TreeViewWith(0,p,0); } +SQLITE_PRIVATE void sqlite3ShowUpsert(const Upsert *p){ sqlite3TreeViewUpsert(0,p,0); } +#ifndef SQLITE_OMIT_TRIGGER +SQLITE_PRIVATE void sqlite3ShowTriggerStep(const TriggerStep *p){ + sqlite3TreeViewTriggerStep(0,p,0,0); +} +SQLITE_PRIVATE void sqlite3ShowTriggerStepList(const TriggerStep *p){ + sqlite3TreeViewTriggerStep(0,p,0,1); +} +SQLITE_PRIVATE void sqlite3ShowTrigger(const Trigger *p){ sqlite3TreeViewTrigger(0,p,0,0); } +SQLITE_PRIVATE void sqlite3ShowTriggerList(const Trigger *p){ sqlite3TreeViewTrigger(0,p,0,1);} +#endif +#ifndef SQLITE_OMIT_WINDOWFUNC +SQLITE_PRIVATE void sqlite3ShowWindow(const Window *p){ sqlite3TreeViewWindow(0,p,0); } +SQLITE_PRIVATE void sqlite3ShowWinFunc(const Window *p){ sqlite3TreeViewWinFunc(0,p,0); } +#endif #endif /* SQLITE_DEBUG */ @@ -31660,16 +32546,41 @@ SQLITE_PRIVATE void sqlite3TreeViewExprList( ** This structure is the current state of the generator. */ static SQLITE_WSD struct sqlite3PrngType { - unsigned char isInit; /* True if initialized */ - unsigned char i, j; /* State variables */ - unsigned char s[256]; /* State variables */ + u32 s[16]; /* 64 bytes of chacha20 state */ + u8 out[64]; /* Output bytes */ + u8 n; /* Output bytes remaining */ } sqlite3Prng; + +/* The RFC-7539 ChaCha20 block function +*/ +#define ROTL(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) +#define QR(a, b, c, d) ( \ + a += b, d ^= a, d = ROTL(d,16), \ + c += d, b ^= c, b = ROTL(b,12), \ + a += b, d ^= a, d = ROTL(d, 8), \ + c += d, b ^= c, b = ROTL(b, 7)) +static void chacha_block(u32 *out, const u32 *in){ + int i; + u32 x[16]; + memcpy(x, in, 64); + for(i=0; i<10; i++){ + QR(x[0], x[4], x[ 8], x[12]); + QR(x[1], x[5], x[ 9], x[13]); + QR(x[2], x[6], x[10], x[14]); + QR(x[3], x[7], x[11], x[15]); + QR(x[0], x[5], x[10], x[15]); + QR(x[1], x[6], x[11], x[12]); + QR(x[2], x[7], x[ 8], x[13]); + QR(x[3], x[4], x[ 9], x[14]); + } + for(i=0; i<16; i++) out[i] = x[i]+in[i]; +} + /* ** Return N random bytes. */ SQLITE_API void sqlite3_randomness(int N, void *pBuf){ - unsigned char t; unsigned char *zBuf = pBuf; /* The "wsdPrng" macro will resolve to the pseudo-random number generator @@ -31699,53 +32610,46 @@ SQLITE_API void sqlite3_randomness(int N, void *pBuf){ sqlite3_mutex_enter(mutex); if( N<=0 || pBuf==0 ){ - wsdPrng.isInit = 0; + wsdPrng.s[0] = 0; sqlite3_mutex_leave(mutex); return; } /* Initialize the state of the random number generator once, - ** the first time this routine is called. The seed value does - ** not need to contain a lot of randomness since we are not - ** trying to do secure encryption or anything like that... - ** - ** Nothing in this file or anywhere else in SQLite does any kind of - ** encryption. The RC4 algorithm is being used as a PRNG (pseudo-random - ** number generator) not as an encryption device. + ** the first time this routine is called. */ - if( !wsdPrng.isInit ){ + if( wsdPrng.s[0]==0 ){ sqlite3_vfs *pVfs = sqlite3_vfs_find(0); - int i; - char k[256]; - wsdPrng.j = 0; - wsdPrng.i = 0; + static const u32 chacha20_init[] = { + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }; + memcpy(&wsdPrng.s[0], chacha20_init, 16); if( NEVER(pVfs==0) ){ - memset(k, 0, sizeof(k)); + memset(&wsdPrng.s[4], 0, 44); }else{ - sqlite3OsRandomness(pVfs, 256, k); - } - for(i=0; i<256; i++){ - wsdPrng.s[i] = (u8)i; - } - for(i=0; i<256; i++){ - wsdPrng.j += wsdPrng.s[i] + k[i]; - t = wsdPrng.s[wsdPrng.j]; - wsdPrng.s[wsdPrng.j] = wsdPrng.s[i]; - wsdPrng.s[i] = t; + sqlite3OsRandomness(pVfs, 44, (char*)&wsdPrng.s[4]); } - wsdPrng.isInit = 1; + wsdPrng.s[15] = wsdPrng.s[12]; + wsdPrng.s[12] = 0; + wsdPrng.n = 0; } assert( N>0 ); - do{ - wsdPrng.i++; - t = wsdPrng.s[wsdPrng.i]; - wsdPrng.j += t; - wsdPrng.s[wsdPrng.i] = wsdPrng.s[wsdPrng.j]; - wsdPrng.s[wsdPrng.j] = t; - t += wsdPrng.s[wsdPrng.i]; - *(zBuf++) = wsdPrng.s[t]; - }while( --N ); + while( 1 /* exit by break */ ){ + if( N<=wsdPrng.n ){ + memcpy(zBuf, &wsdPrng.out[wsdPrng.n-N], N); + wsdPrng.n -= N; + break; + } + if( wsdPrng.n>0 ){ + memcpy(zBuf, wsdPrng.out, wsdPrng.n); + N -= wsdPrng.n; + zBuf += wsdPrng.n; + } + wsdPrng.s[12]++; + chacha_block((u32*)wsdPrng.out, wsdPrng.s); + wsdPrng.n = 64; + } sqlite3_mutex_leave(mutex); } @@ -32785,7 +33689,7 @@ SQLITE_PRIVATE void sqlite3ErrorMsg(Parse *pParse, const char *zFormat, ...){ va_list ap; sqlite3 *db = pParse->db; assert( db!=0 ); - assert( db->pParse==pParse ); + assert( db->pParse==pParse || db->pParse->pToplevel==pParse ); db->errByteOffset = -2; va_start(ap, zFormat); zMsg = sqlite3VMPrintf(db, zFormat, ap); @@ -34598,53 +35502,53 @@ SQLITE_PRIVATE const char *sqlite3OpcodeName(int i){ /* 0 */ "Savepoint" OpHelp(""), /* 1 */ "AutoCommit" OpHelp(""), /* 2 */ "Transaction" OpHelp(""), - /* 3 */ "SorterNext" OpHelp(""), - /* 4 */ "Prev" OpHelp(""), - /* 5 */ "Next" OpHelp(""), - /* 6 */ "Checkpoint" OpHelp(""), - /* 7 */ "JournalMode" OpHelp(""), - /* 8 */ "Vacuum" OpHelp(""), - /* 9 */ "VFilter" OpHelp("iplan=r[P3] zplan='P4'"), - /* 10 */ "VUpdate" OpHelp("data=r[P3@P2]"), - /* 11 */ "Goto" OpHelp(""), - /* 12 */ "Gosub" OpHelp(""), - /* 13 */ "InitCoroutine" OpHelp(""), - /* 14 */ "Yield" OpHelp(""), - /* 15 */ "MustBeInt" OpHelp(""), - /* 16 */ "Jump" OpHelp(""), - /* 17 */ "Once" OpHelp(""), - /* 18 */ "If" OpHelp(""), + /* 3 */ "Checkpoint" OpHelp(""), + /* 4 */ "JournalMode" OpHelp(""), + /* 5 */ "Vacuum" OpHelp(""), + /* 6 */ "VFilter" OpHelp("iplan=r[P3] zplan='P4'"), + /* 7 */ "VUpdate" OpHelp("data=r[P3@P2]"), + /* 8 */ "Init" OpHelp("Start at P2"), + /* 9 */ "Goto" OpHelp(""), + /* 10 */ "Gosub" OpHelp(""), + /* 11 */ "InitCoroutine" OpHelp(""), + /* 12 */ "Yield" OpHelp(""), + /* 13 */ "MustBeInt" OpHelp(""), + /* 14 */ "Jump" OpHelp(""), + /* 15 */ "Once" OpHelp(""), + /* 16 */ "If" OpHelp(""), + /* 17 */ "IfNot" OpHelp(""), + /* 18 */ "IsType" OpHelp("if typeof(P1.P3) in P5 goto P2"), /* 19 */ "Not" OpHelp("r[P2]= !r[P1]"), - /* 20 */ "IfNot" OpHelp(""), - /* 21 */ "IsNullOrType" OpHelp("if typeof(r[P1]) IN (P3,5) goto P2"), - /* 22 */ "IfNullRow" OpHelp("if P1.nullRow then r[P3]=NULL, goto P2"), - /* 23 */ "SeekLT" OpHelp("key=r[P3@P4]"), - /* 24 */ "SeekLE" OpHelp("key=r[P3@P4]"), - /* 25 */ "SeekGE" OpHelp("key=r[P3@P4]"), - /* 26 */ "SeekGT" OpHelp("key=r[P3@P4]"), - /* 27 */ "IfNotOpen" OpHelp("if( !csr[P1] ) goto P2"), - /* 28 */ "IfNoHope" OpHelp("key=r[P3@P4]"), - /* 29 */ "NoConflict" OpHelp("key=r[P3@P4]"), - /* 30 */ "NotFound" OpHelp("key=r[P3@P4]"), - /* 31 */ "Found" OpHelp("key=r[P3@P4]"), - /* 32 */ "SeekRowid" OpHelp("intkey=r[P3]"), - /* 33 */ "NotExists" OpHelp("intkey=r[P3]"), - /* 34 */ "Last" OpHelp(""), - /* 35 */ "IfSmaller" OpHelp(""), - /* 36 */ "SorterSort" OpHelp(""), - /* 37 */ "Sort" OpHelp(""), - /* 38 */ "Rewind" OpHelp(""), - /* 39 */ "IdxLE" OpHelp("key=r[P3@P4]"), - /* 40 */ "IdxGT" OpHelp("key=r[P3@P4]"), - /* 41 */ "IdxLT" OpHelp("key=r[P3@P4]"), - /* 42 */ "IdxGE" OpHelp("key=r[P3@P4]"), + /* 20 */ "IfNullRow" OpHelp("if P1.nullRow then r[P3]=NULL, goto P2"), + /* 21 */ "SeekLT" OpHelp("key=r[P3@P4]"), + /* 22 */ "SeekLE" OpHelp("key=r[P3@P4]"), + /* 23 */ "SeekGE" OpHelp("key=r[P3@P4]"), + /* 24 */ "SeekGT" OpHelp("key=r[P3@P4]"), + /* 25 */ "IfNotOpen" OpHelp("if( !csr[P1] ) goto P2"), + /* 26 */ "IfNoHope" OpHelp("key=r[P3@P4]"), + /* 27 */ "NoConflict" OpHelp("key=r[P3@P4]"), + /* 28 */ "NotFound" OpHelp("key=r[P3@P4]"), + /* 29 */ "Found" OpHelp("key=r[P3@P4]"), + /* 30 */ "SeekRowid" OpHelp("intkey=r[P3]"), + /* 31 */ "NotExists" OpHelp("intkey=r[P3]"), + /* 32 */ "Last" OpHelp(""), + /* 33 */ "IfSmaller" OpHelp(""), + /* 34 */ "SorterSort" OpHelp(""), + /* 35 */ "Sort" OpHelp(""), + /* 36 */ "Rewind" OpHelp(""), + /* 37 */ "SorterNext" OpHelp(""), + /* 38 */ "Prev" OpHelp(""), + /* 39 */ "Next" OpHelp(""), + /* 40 */ "IdxLE" OpHelp("key=r[P3@P4]"), + /* 41 */ "IdxGT" OpHelp("key=r[P3@P4]"), + /* 42 */ "IdxLT" OpHelp("key=r[P3@P4]"), /* 43 */ "Or" OpHelp("r[P3]=(r[P1] || r[P2])"), /* 44 */ "And" OpHelp("r[P3]=(r[P1] && r[P2])"), - /* 45 */ "RowSetRead" OpHelp("r[P3]=rowset(P1)"), - /* 46 */ "RowSetTest" OpHelp("if r[P3] in rowset(P1) goto P2"), - /* 47 */ "Program" OpHelp(""), - /* 48 */ "FkIfZero" OpHelp("if fkctr[P1]==0 goto P2"), - /* 49 */ "IfPos" OpHelp("if r[P1]>0 then r[P1]-=P3, goto P2"), + /* 45 */ "IdxGE" OpHelp("key=r[P3@P4]"), + /* 46 */ "RowSetRead" OpHelp("r[P3]=rowset(P1)"), + /* 47 */ "RowSetTest" OpHelp("if r[P3] in rowset(P1) goto P2"), + /* 48 */ "Program" OpHelp(""), + /* 49 */ "FkIfZero" OpHelp("if fkctr[P1]==0 goto P2"), /* 50 */ "IsNull" OpHelp("if r[P1]==NULL goto P2"), /* 51 */ "NotNull" OpHelp("if r[P1]!=NULL goto P2"), /* 52 */ "Ne" OpHelp("IF r[P3]!=r[P1]"), @@ -34654,12 +35558,12 @@ SQLITE_PRIVATE const char *sqlite3OpcodeName(int i){ /* 56 */ "Lt" OpHelp("IF r[P3]=r[P1]"), /* 58 */ "ElseEq" OpHelp(""), - /* 59 */ "IfNotZero" OpHelp("if r[P1]!=0 then r[P1]--, goto P2"), - /* 60 */ "DecrJumpZero" OpHelp("if (--r[P1])==0 goto P2"), - /* 61 */ "IncrVacuum" OpHelp(""), - /* 62 */ "VNext" OpHelp(""), - /* 63 */ "Filter" OpHelp("if key(P3@P4) not in filter(P1) goto P2"), - /* 64 */ "Init" OpHelp("Start at P2"), + /* 59 */ "IfPos" OpHelp("if r[P1]>0 then r[P1]-=P3, goto P2"), + /* 60 */ "IfNotZero" OpHelp("if r[P1]!=0 then r[P1]--, goto P2"), + /* 61 */ "DecrJumpZero" OpHelp("if (--r[P1])==0 goto P2"), + /* 62 */ "IncrVacuum" OpHelp(""), + /* 63 */ "VNext" OpHelp(""), + /* 64 */ "Filter" OpHelp("if key(P3@P4) not in filter(P1) goto P2"), /* 65 */ "PureFunc" OpHelp("r[P3]=func(r[P2@NP])"), /* 66 */ "Function" OpHelp("r[P3]=func(r[P2@NP])"), /* 67 */ "Return" OpHelp(""), @@ -34669,34 +35573,34 @@ SQLITE_PRIVATE const char *sqlite3OpcodeName(int i){ /* 71 */ "Integer" OpHelp("r[P2]=P1"), /* 72 */ "Int64" OpHelp("r[P2]=P4"), /* 73 */ "String" OpHelp("r[P2]='P4' (len=P1)"), - /* 74 */ "Null" OpHelp("r[P2..P3]=NULL"), - /* 75 */ "SoftNull" OpHelp("r[P1]=NULL"), - /* 76 */ "Blob" OpHelp("r[P2]=P4 (len=P1)"), - /* 77 */ "Variable" OpHelp("r[P2]=parameter(P1,P4)"), - /* 78 */ "Move" OpHelp("r[P2@P3]=r[P1@P3]"), - /* 79 */ "Copy" OpHelp("r[P2@P3+1]=r[P1@P3+1]"), - /* 80 */ "SCopy" OpHelp("r[P2]=r[P1]"), - /* 81 */ "IntCopy" OpHelp("r[P2]=r[P1]"), - /* 82 */ "FkCheck" OpHelp(""), - /* 83 */ "ResultRow" OpHelp("output=r[P1@P2]"), - /* 84 */ "CollSeq" OpHelp(""), - /* 85 */ "AddImm" OpHelp("r[P1]=r[P1]+P2"), - /* 86 */ "RealAffinity" OpHelp(""), - /* 87 */ "Cast" OpHelp("affinity(r[P1])"), - /* 88 */ "Permutation" OpHelp(""), - /* 89 */ "Compare" OpHelp("r[P1@P3] <-> r[P2@P3]"), - /* 90 */ "IsTrue" OpHelp("r[P2] = coalesce(r[P1]==TRUE,P3) ^ P4"), - /* 91 */ "ZeroOrNull" OpHelp("r[P2] = 0 OR NULL"), - /* 92 */ "Offset" OpHelp("r[P3] = sqlite_offset(P1)"), - /* 93 */ "Column" OpHelp("r[P3]=PX"), - /* 94 */ "TypeCheck" OpHelp("typecheck(r[P1@P2])"), - /* 95 */ "Affinity" OpHelp("affinity(r[P1@P2])"), - /* 96 */ "MakeRecord" OpHelp("r[P3]=mkrec(r[P1@P2])"), - /* 97 */ "Count" OpHelp("r[P2]=count()"), - /* 98 */ "ReadCookie" OpHelp(""), - /* 99 */ "SetCookie" OpHelp(""), - /* 100 */ "ReopenIdx" OpHelp("root=P2 iDb=P3"), - /* 101 */ "OpenRead" OpHelp("root=P2 iDb=P3"), + /* 74 */ "BeginSubrtn" OpHelp("r[P2]=NULL"), + /* 75 */ "Null" OpHelp("r[P2..P3]=NULL"), + /* 76 */ "SoftNull" OpHelp("r[P1]=NULL"), + /* 77 */ "Blob" OpHelp("r[P2]=P4 (len=P1)"), + /* 78 */ "Variable" OpHelp("r[P2]=parameter(P1,P4)"), + /* 79 */ "Move" OpHelp("r[P2@P3]=r[P1@P3]"), + /* 80 */ "Copy" OpHelp("r[P2@P3+1]=r[P1@P3+1]"), + /* 81 */ "SCopy" OpHelp("r[P2]=r[P1]"), + /* 82 */ "IntCopy" OpHelp("r[P2]=r[P1]"), + /* 83 */ "FkCheck" OpHelp(""), + /* 84 */ "ResultRow" OpHelp("output=r[P1@P2]"), + /* 85 */ "CollSeq" OpHelp(""), + /* 86 */ "AddImm" OpHelp("r[P1]=r[P1]+P2"), + /* 87 */ "RealAffinity" OpHelp(""), + /* 88 */ "Cast" OpHelp("affinity(r[P1])"), + /* 89 */ "Permutation" OpHelp(""), + /* 90 */ "Compare" OpHelp("r[P1@P3] <-> r[P2@P3]"), + /* 91 */ "IsTrue" OpHelp("r[P2] = coalesce(r[P1]==TRUE,P3) ^ P4"), + /* 92 */ "ZeroOrNull" OpHelp("r[P2] = 0 OR NULL"), + /* 93 */ "Offset" OpHelp("r[P3] = sqlite_offset(P1)"), + /* 94 */ "Column" OpHelp("r[P3]=PX cursor P1 column P2"), + /* 95 */ "TypeCheck" OpHelp("typecheck(r[P1@P2])"), + /* 96 */ "Affinity" OpHelp("affinity(r[P1@P2])"), + /* 97 */ "MakeRecord" OpHelp("r[P3]=mkrec(r[P1@P2])"), + /* 98 */ "Count" OpHelp("r[P2]=count()"), + /* 99 */ "ReadCookie" OpHelp(""), + /* 100 */ "SetCookie" OpHelp(""), + /* 101 */ "ReopenIdx" OpHelp("root=P2 iDb=P3"), /* 102 */ "BitAnd" OpHelp("r[P3]=r[P1]&r[P2]"), /* 103 */ "BitOr" OpHelp("r[P3]=r[P1]|r[P2]"), /* 104 */ "ShiftLeft" OpHelp("r[P3]=r[P2]<0 then r[P2]=r[P1]+max(0,r[P3]) else r[P2]=(-1)"), - /* 160 */ "AggInverse" OpHelp("accum=r[P3] inverse(r[P2@P5])"), - /* 161 */ "AggStep" OpHelp("accum=r[P3] step(r[P2@P5])"), - /* 162 */ "AggStep1" OpHelp("accum=r[P3] step(r[P2@P5])"), - /* 163 */ "AggValue" OpHelp("r[P3]=value N=P2"), - /* 164 */ "AggFinal" OpHelp("accum=r[P1] N=P2"), - /* 165 */ "Expire" OpHelp(""), - /* 166 */ "CursorLock" OpHelp(""), - /* 167 */ "CursorUnlock" OpHelp(""), - /* 168 */ "TableLock" OpHelp("iDb=P1 root=P2 write=P3"), - /* 169 */ "VBegin" OpHelp(""), - /* 170 */ "VCreate" OpHelp(""), - /* 171 */ "VDestroy" OpHelp(""), - /* 172 */ "VOpen" OpHelp(""), - /* 173 */ "VInitIn" OpHelp("r[P2]=ValueList(P1,P3)"), - /* 174 */ "VColumn" OpHelp("r[P3]=vcolumn(P2)"), - /* 175 */ "VRename" OpHelp(""), - /* 176 */ "Pagecount" OpHelp(""), - /* 177 */ "MaxPgcnt" OpHelp(""), - /* 178 */ "FilterAdd" OpHelp("filter(P1) += key(P3@P4)"), - /* 179 */ "Trace" OpHelp(""), - /* 180 */ "CursorHint" OpHelp(""), - /* 181 */ "ReleaseReg" OpHelp("release r[P1@P2] mask P3"), - /* 182 */ "Noop" OpHelp(""), - /* 183 */ "Explain" OpHelp(""), - /* 184 */ "Abortable" OpHelp(""), + /* 154 */ "DropTrigger" OpHelp(""), + /* 155 */ "IntegrityCk" OpHelp(""), + /* 156 */ "RowSetAdd" OpHelp("rowset(P1)=r[P2]"), + /* 157 */ "Param" OpHelp(""), + /* 158 */ "FkCounter" OpHelp("fkctr[P1]+=P2"), + /* 159 */ "MemMax" OpHelp("r[P1]=max(r[P1],r[P2])"), + /* 160 */ "OffsetLimit" OpHelp("if r[P1]>0 then r[P2]=r[P1]+max(0,r[P3]) else r[P2]=(-1)"), + /* 161 */ "AggInverse" OpHelp("accum=r[P3] inverse(r[P2@P5])"), + /* 162 */ "AggStep" OpHelp("accum=r[P3] step(r[P2@P5])"), + /* 163 */ "AggStep1" OpHelp("accum=r[P3] step(r[P2@P5])"), + /* 164 */ "AggValue" OpHelp("r[P3]=value N=P2"), + /* 165 */ "AggFinal" OpHelp("accum=r[P1] N=P2"), + /* 166 */ "Expire" OpHelp(""), + /* 167 */ "CursorLock" OpHelp(""), + /* 168 */ "CursorUnlock" OpHelp(""), + /* 169 */ "TableLock" OpHelp("iDb=P1 root=P2 write=P3"), + /* 170 */ "VBegin" OpHelp(""), + /* 171 */ "VCreate" OpHelp(""), + /* 172 */ "VDestroy" OpHelp(""), + /* 173 */ "VOpen" OpHelp(""), + /* 174 */ "VInitIn" OpHelp("r[P2]=ValueList(P1,P3)"), + /* 175 */ "VColumn" OpHelp("r[P3]=vcolumn(P2)"), + /* 176 */ "VRename" OpHelp(""), + /* 177 */ "Pagecount" OpHelp(""), + /* 178 */ "MaxPgcnt" OpHelp(""), + /* 179 */ "ClrSubtype" OpHelp("r[P1].subtype = 0"), + /* 180 */ "FilterAdd" OpHelp("filter(P1) += key(P3@P4)"), + /* 181 */ "Trace" OpHelp(""), + /* 182 */ "CursorHint" OpHelp(""), + /* 183 */ "ReleaseReg" OpHelp("release r[P1@P2] mask P3"), + /* 184 */ "Noop" OpHelp(""), + /* 185 */ "Explain" OpHelp(""), + /* 186 */ "Abortable" OpHelp(""), }; return azName[i]; } #endif /************** End of opcodes.c *********************************************/ +/************** Begin file os_kv.c *******************************************/ +/* +** 2022-09-06 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** This file contains an experimental VFS layer that operates on a +** Key/Value storage engine where both keys and values must be pure +** text. +*/ +/* #include */ +#if SQLITE_OS_KV || (SQLITE_OS_UNIX && defined(SQLITE_OS_KV_OPTIONAL)) + +/***************************************************************************** +** Debugging logic +*/ + +/* SQLITE_KV_TRACE() is used for tracing calls to kvstorage routines. */ +#if 0 +#define SQLITE_KV_TRACE(X) printf X +#else +#define SQLITE_KV_TRACE(X) +#endif + +/* SQLITE_KV_LOG() is used for tracing calls to the VFS interface */ +#if 0 +#define SQLITE_KV_LOG(X) printf X +#else +#define SQLITE_KV_LOG(X) +#endif + + +/* +** Forward declaration of objects used by this VFS implementation +*/ +typedef struct KVVfsFile KVVfsFile; + +/* A single open file. There are only two files represented by this +** VFS - the database and the rollback journal. +*/ +struct KVVfsFile { + sqlite3_file base; /* IO methods */ + const char *zClass; /* Storage class */ + int isJournal; /* True if this is a journal file */ + unsigned int nJrnl; /* Space allocated for aJrnl[] */ + char *aJrnl; /* Journal content */ + int szPage; /* Last known page size */ + sqlite3_int64 szDb; /* Database file size. -1 means unknown */ +}; + +/* +** Methods for KVVfsFile +*/ +static int kvvfsClose(sqlite3_file*); +static int kvvfsReadDb(sqlite3_file*, void*, int iAmt, sqlite3_int64 iOfst); +static int kvvfsReadJrnl(sqlite3_file*, void*, int iAmt, sqlite3_int64 iOfst); +static int kvvfsWriteDb(sqlite3_file*,const void*,int iAmt, sqlite3_int64); +static int kvvfsWriteJrnl(sqlite3_file*,const void*,int iAmt, sqlite3_int64); +static int kvvfsTruncateDb(sqlite3_file*, sqlite3_int64 size); +static int kvvfsTruncateJrnl(sqlite3_file*, sqlite3_int64 size); +static int kvvfsSyncDb(sqlite3_file*, int flags); +static int kvvfsSyncJrnl(sqlite3_file*, int flags); +static int kvvfsFileSizeDb(sqlite3_file*, sqlite3_int64 *pSize); +static int kvvfsFileSizeJrnl(sqlite3_file*, sqlite3_int64 *pSize); +static int kvvfsLock(sqlite3_file*, int); +static int kvvfsUnlock(sqlite3_file*, int); +static int kvvfsCheckReservedLock(sqlite3_file*, int *pResOut); +static int kvvfsFileControlDb(sqlite3_file*, int op, void *pArg); +static int kvvfsFileControlJrnl(sqlite3_file*, int op, void *pArg); +static int kvvfsSectorSize(sqlite3_file*); +static int kvvfsDeviceCharacteristics(sqlite3_file*); + +/* +** Methods for sqlite3_vfs +*/ +static int kvvfsOpen(sqlite3_vfs*, const char *, sqlite3_file*, int , int *); +static int kvvfsDelete(sqlite3_vfs*, const char *zName, int syncDir); +static int kvvfsAccess(sqlite3_vfs*, const char *zName, int flags, int *); +static int kvvfsFullPathname(sqlite3_vfs*, const char *zName, int, char *zOut); +static void *kvvfsDlOpen(sqlite3_vfs*, const char *zFilename); +static int kvvfsRandomness(sqlite3_vfs*, int nByte, char *zOut); +static int kvvfsSleep(sqlite3_vfs*, int microseconds); +static int kvvfsCurrentTime(sqlite3_vfs*, double*); +static int kvvfsCurrentTimeInt64(sqlite3_vfs*, sqlite3_int64*); + +static sqlite3_vfs sqlite3OsKvvfsObject = { + 1, /* iVersion */ + sizeof(KVVfsFile), /* szOsFile */ + 1024, /* mxPathname */ + 0, /* pNext */ + "kvvfs", /* zName */ + 0, /* pAppData */ + kvvfsOpen, /* xOpen */ + kvvfsDelete, /* xDelete */ + kvvfsAccess, /* xAccess */ + kvvfsFullPathname, /* xFullPathname */ + kvvfsDlOpen, /* xDlOpen */ + 0, /* xDlError */ + 0, /* xDlSym */ + 0, /* xDlClose */ + kvvfsRandomness, /* xRandomness */ + kvvfsSleep, /* xSleep */ + kvvfsCurrentTime, /* xCurrentTime */ + 0, /* xGetLastError */ + kvvfsCurrentTimeInt64 /* xCurrentTimeInt64 */ +}; + +/* Methods for sqlite3_file objects referencing a database file +*/ +static sqlite3_io_methods kvvfs_db_io_methods = { + 1, /* iVersion */ + kvvfsClose, /* xClose */ + kvvfsReadDb, /* xRead */ + kvvfsWriteDb, /* xWrite */ + kvvfsTruncateDb, /* xTruncate */ + kvvfsSyncDb, /* xSync */ + kvvfsFileSizeDb, /* xFileSize */ + kvvfsLock, /* xLock */ + kvvfsUnlock, /* xUnlock */ + kvvfsCheckReservedLock, /* xCheckReservedLock */ + kvvfsFileControlDb, /* xFileControl */ + kvvfsSectorSize, /* xSectorSize */ + kvvfsDeviceCharacteristics, /* xDeviceCharacteristics */ + 0, /* xShmMap */ + 0, /* xShmLock */ + 0, /* xShmBarrier */ + 0, /* xShmUnmap */ + 0, /* xFetch */ + 0 /* xUnfetch */ +}; + +/* Methods for sqlite3_file objects referencing a rollback journal +*/ +static sqlite3_io_methods kvvfs_jrnl_io_methods = { + 1, /* iVersion */ + kvvfsClose, /* xClose */ + kvvfsReadJrnl, /* xRead */ + kvvfsWriteJrnl, /* xWrite */ + kvvfsTruncateJrnl, /* xTruncate */ + kvvfsSyncJrnl, /* xSync */ + kvvfsFileSizeJrnl, /* xFileSize */ + kvvfsLock, /* xLock */ + kvvfsUnlock, /* xUnlock */ + kvvfsCheckReservedLock, /* xCheckReservedLock */ + kvvfsFileControlJrnl, /* xFileControl */ + kvvfsSectorSize, /* xSectorSize */ + kvvfsDeviceCharacteristics, /* xDeviceCharacteristics */ + 0, /* xShmMap */ + 0, /* xShmLock */ + 0, /* xShmBarrier */ + 0, /* xShmUnmap */ + 0, /* xFetch */ + 0 /* xUnfetch */ +}; + +/****** Storage subsystem **************************************************/ +#include +#include +#include + +/* Forward declarations for the low-level storage engine +*/ +static int kvstorageWrite(const char*, const char *zKey, const char *zData); +static int kvstorageDelete(const char*, const char *zKey); +static int kvstorageRead(const char*, const char *zKey, char *zBuf, int nBuf); +#define KVSTORAGE_KEY_SZ 32 + +/* Expand the key name with an appropriate prefix and put the result +** zKeyOut[]. The zKeyOut[] buffer is assumed to hold at least +** KVSTORAGE_KEY_SZ bytes. +*/ +static void kvstorageMakeKey( + const char *zClass, + const char *zKeyIn, + char *zKeyOut +){ + sqlite3_snprintf(KVSTORAGE_KEY_SZ, zKeyOut, "kvvfs-%s-%s", zClass, zKeyIn); +} + +/* Write content into a key. zClass is the particular namespace of the +** underlying key/value store to use - either "local" or "session". +** +** Both zKey and zData are zero-terminated pure text strings. +** +** Return the number of errors. +*/ +static int kvstorageWrite( + const char *zClass, + const char *zKey, + const char *zData +){ + FILE *fd; + char zXKey[KVSTORAGE_KEY_SZ]; + kvstorageMakeKey(zClass, zKey, zXKey); + fd = fopen(zXKey, "wb"); + if( fd ){ + SQLITE_KV_TRACE(("KVVFS-WRITE %-15s (%d) %.50s%s\n", zXKey, + (int)strlen(zData), zData, + strlen(zData)>50 ? "..." : "")); + fputs(zData, fd); + fclose(fd); + return 0; + }else{ + return 1; + } +} + +/* Delete a key (with its corresponding data) from the key/value +** namespace given by zClass. If the key does not previously exist, +** this routine is a no-op. +*/ +static int kvstorageDelete(const char *zClass, const char *zKey){ + char zXKey[KVSTORAGE_KEY_SZ]; + kvstorageMakeKey(zClass, zKey, zXKey); + unlink(zXKey); + SQLITE_KV_TRACE(("KVVFS-DELETE %-15s\n", zXKey)); + return 0; +} + +/* Read the value associated with a zKey from the key/value namespace given +** by zClass and put the text data associated with that key in the first +** nBuf bytes of zBuf[]. The value might be truncated if zBuf is not large +** enough to hold it all. The value put into zBuf must always be zero +** terminated, even if it gets truncated because nBuf is not large enough. +** +** Return the total number of bytes in the data, without truncation, and +** not counting the final zero terminator. Return -1 if the key does +** not exist. +** +** If nBuf<=0 then this routine simply returns the size of the data without +** actually reading it. +*/ +static int kvstorageRead( + const char *zClass, + const char *zKey, + char *zBuf, + int nBuf +){ + FILE *fd; + struct stat buf; + char zXKey[KVSTORAGE_KEY_SZ]; + kvstorageMakeKey(zClass, zKey, zXKey); + if( access(zXKey, R_OK)!=0 + || stat(zXKey, &buf)!=0 + || !S_ISREG(buf.st_mode) + ){ + SQLITE_KV_TRACE(("KVVFS-READ %-15s (-1)\n", zXKey)); + return -1; + } + if( nBuf<=0 ){ + return (int)buf.st_size; + }else if( nBuf==1 ){ + zBuf[0] = 0; + SQLITE_KV_TRACE(("KVVFS-READ %-15s (%d)\n", zXKey, + (int)buf.st_size)); + return (int)buf.st_size; + } + if( nBuf > buf.st_size + 1 ){ + nBuf = buf.st_size + 1; + } + fd = fopen(zXKey, "rb"); + if( fd==0 ){ + SQLITE_KV_TRACE(("KVVFS-READ %-15s (-1)\n", zXKey)); + return -1; + }else{ + sqlite3_int64 n = fread(zBuf, 1, nBuf-1, fd); + fclose(fd); + zBuf[n] = 0; + SQLITE_KV_TRACE(("KVVFS-READ %-15s (%lld) %.50s%s\n", zXKey, + n, zBuf, n>50 ? "..." : "")); + return (int)n; + } +} + +/* +** An internal level of indirection which enables us to replace the +** kvvfs i/o methods with JavaScript implementations in WASM builds. +** Maintenance reminder: if this struct changes in any way, the JSON +** rendering of its structure must be updated in +** sqlite3_wasm_enum_json(). There are no binary compatibility +** concerns, so it does not need an iVersion member. This file is +** necessarily always compiled together with sqlite3_wasm_enum_json(), +** and JS code dynamically creates the mapping of members based on +** that JSON description. +*/ +typedef struct sqlite3_kvvfs_methods sqlite3_kvvfs_methods; +struct sqlite3_kvvfs_methods { + int (*xRead)(const char *zClass, const char *zKey, char *zBuf, int nBuf); + int (*xWrite)(const char *zClass, const char *zKey, const char *zData); + int (*xDelete)(const char *zClass, const char *zKey); + const int nKeySize; +}; + +/* +** This object holds the kvvfs I/O methods which may be swapped out +** for JavaScript-side implementations in WASM builds. In such builds +** it cannot be const, but in native builds it should be so that +** the compiler can hopefully optimize this level of indirection out. +** That said, kvvfs is intended primarily for use in WASM builds. +** +** Note that this is not explicitly flagged as static because the +** amalgamation build will tag it with SQLITE_PRIVATE. +*/ +#ifndef SQLITE_WASM +const +#endif +SQLITE_PRIVATE sqlite3_kvvfs_methods sqlite3KvvfsMethods = { +kvstorageRead, +kvstorageWrite, +kvstorageDelete, +KVSTORAGE_KEY_SZ +}; + +/****** Utility subroutines ************************************************/ + +/* +** Encode binary into the text encoded used to persist on disk. +** The output text is stored in aOut[], which must be at least +** nData+1 bytes in length. +** +** Return the actual length of the encoded text, not counting the +** zero terminator at the end. +** +** Encoding format +** --------------- +** +** * Non-zero bytes are encoded as upper-case hexadecimal +** +** * A sequence of one or more zero-bytes that are not at the +** beginning of the buffer are encoded as a little-endian +** base-26 number using a..z. "a" means 0. "b" means 1, +** "z" means 25. "ab" means 26. "ac" means 52. And so forth. +** +** * Because there is no overlap between the encoding characters +** of hexadecimal and base-26 numbers, it is always clear where +** one stops and the next begins. +*/ +static int kvvfsEncode(const char *aData, int nData, char *aOut){ + int i, j; + const unsigned char *a = (const unsigned char*)aData; + for(i=j=0; i>4]; + aOut[j++] = "0123456789ABCDEF"[c&0xf]; + }else{ + /* A sequence of 1 or more zeros is stored as a little-endian + ** base-26 number using a..z as the digits. So one zero is "b". + ** Two zeros is "c". 25 zeros is "z", 26 zeros is "ab", 27 is "bb", + ** and so forth. + */ + int k; + for(k=1; i+k0 ){ + aOut[j++] = 'a'+(k%26); + k /= 26; + } + } + } + aOut[j] = 0; + return j; +} + +static const signed char kvvfsHexValue[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +/* +** Decode the text encoding back to binary. The binary content is +** written into pOut, which must be at least nOut bytes in length. +** +** The return value is the number of bytes actually written into aOut[]. +*/ +static int kvvfsDecode(const char *a, char *aOut, int nOut){ + int i, j; + int c; + const unsigned char *aIn = (const unsigned char*)a; + i = 0; + j = 0; + while( 1 ){ + c = kvvfsHexValue[aIn[i]]; + if( c<0 ){ + int n = 0; + int mult = 1; + c = aIn[i]; + if( c==0 ) break; + while( c>='a' && c<='z' ){ + n += (c - 'a')*mult; + mult *= 26; + c = aIn[++i]; + } + if( j+n>nOut ) return -1; + memset(&aOut[j], 0, n); + j += n; + c = aIn[i]; + if( c==0 ) break; + }else{ + aOut[j] = c<<4; + c = kvvfsHexValue[aIn[++i]]; + if( c<0 ) break; + aOut[j++] += c; + i++; + } + } + return j; +} + +/* +** Decode a complete journal file. Allocate space in pFile->aJrnl +** and store the decoding there. Or leave pFile->aJrnl set to NULL +** if an error is encountered. +** +** The first few characters of the text encoding will be a little-endian +** base-26 number (digits a..z) that is the total number of bytes +** in the decoded journal file image. This base-26 number is followed +** by a single space, then the encoding of the journal. The space +** separator is required to act as a terminator for the base-26 number. +*/ +static void kvvfsDecodeJournal( + KVVfsFile *pFile, /* Store decoding in pFile->aJrnl */ + const char *zTxt, /* Text encoding. Zero-terminated */ + int nTxt /* Bytes in zTxt, excluding zero terminator */ +){ + unsigned int n = 0; + int c, i, mult; + i = 0; + mult = 1; + while( (c = zTxt[i++])>='a' && c<='z' ){ + n += (zTxt[i] - 'a')*mult; + mult *= 26; + } + sqlite3_free(pFile->aJrnl); + pFile->aJrnl = sqlite3_malloc64( n ); + if( pFile->aJrnl==0 ){ + pFile->nJrnl = 0; + return; + } + pFile->nJrnl = n; + n = kvvfsDecode(zTxt+i, pFile->aJrnl, pFile->nJrnl); + if( nnJrnl ){ + sqlite3_free(pFile->aJrnl); + pFile->aJrnl = 0; + pFile->nJrnl = 0; + } +} + +/* +** Read or write the "sz" element, containing the database file size. +*/ +static sqlite3_int64 kvvfsReadFileSize(KVVfsFile *pFile){ + char zData[50]; + zData[0] = 0; + sqlite3KvvfsMethods.xRead(pFile->zClass, "sz", zData, sizeof(zData)-1); + return strtoll(zData, 0, 0); +} +static int kvvfsWriteFileSize(KVVfsFile *pFile, sqlite3_int64 sz){ + char zData[50]; + sqlite3_snprintf(sizeof(zData), zData, "%lld", sz); + return sqlite3KvvfsMethods.xWrite(pFile->zClass, "sz", zData); +} + +/****** sqlite3_io_methods methods ******************************************/ + +/* +** Close an kvvfs-file. +*/ +static int kvvfsClose(sqlite3_file *pProtoFile){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + + SQLITE_KV_LOG(("xClose %s %s\n", pFile->zClass, + pFile->isJournal ? "journal" : "db")); + sqlite3_free(pFile->aJrnl); + return SQLITE_OK; +} + +/* +** Read from the -journal file. +*/ +static int kvvfsReadJrnl( + sqlite3_file *pProtoFile, + void *zBuf, + int iAmt, + sqlite_int64 iOfst +){ + KVVfsFile *pFile = (KVVfsFile*)pProtoFile; + assert( pFile->isJournal ); + SQLITE_KV_LOG(("xRead('%s-journal',%d,%lld)\n", pFile->zClass, iAmt, iOfst)); + if( pFile->aJrnl==0 ){ + int szTxt = kvstorageRead(pFile->zClass, "jrnl", 0, 0); + char *aTxt; + if( szTxt<=4 ){ + return SQLITE_IOERR; + } + aTxt = sqlite3_malloc64( szTxt+1 ); + if( aTxt==0 ) return SQLITE_NOMEM; + kvstorageRead(pFile->zClass, "jrnl", aTxt, szTxt+1); + kvvfsDecodeJournal(pFile, aTxt, szTxt); + sqlite3_free(aTxt); + if( pFile->aJrnl==0 ) return SQLITE_IOERR; + } + if( iOfst+iAmt>pFile->nJrnl ){ + return SQLITE_IOERR_SHORT_READ; + } + memcpy(zBuf, pFile->aJrnl+iOfst, iAmt); + return SQLITE_OK; +} + +/* +** Read from the database file. +*/ +static int kvvfsReadDb( + sqlite3_file *pProtoFile, + void *zBuf, + int iAmt, + sqlite_int64 iOfst +){ + KVVfsFile *pFile = (KVVfsFile*)pProtoFile; + unsigned int pgno; + int got, n; + char zKey[30]; + char aData[133073]; + assert( iOfst>=0 ); + assert( iAmt>=0 ); + SQLITE_KV_LOG(("xRead('%s-db',%d,%lld)\n", pFile->zClass, iAmt, iOfst)); + if( iOfst+iAmt>=512 ){ + if( (iOfst % iAmt)!=0 ){ + return SQLITE_IOERR_READ; + } + if( (iAmt & (iAmt-1))!=0 || iAmt<512 || iAmt>65536 ){ + return SQLITE_IOERR_READ; + } + pFile->szPage = iAmt; + pgno = 1 + iOfst/iAmt; + }else{ + pgno = 1; + } + sqlite3_snprintf(sizeof(zKey), zKey, "%u", pgno); + got = sqlite3KvvfsMethods.xRead(pFile->zClass, zKey, aData, sizeof(aData)-1); + if( got<0 ){ + n = 0; + }else{ + aData[got] = 0; + if( iOfst+iAmt<512 ){ + int k = iOfst+iAmt; + aData[k*2] = 0; + n = kvvfsDecode(aData, &aData[2000], sizeof(aData)-2000); + if( n>=iOfst+iAmt ){ + memcpy(zBuf, &aData[2000+iOfst], iAmt); + n = iAmt; + }else{ + n = 0; + } + }else{ + n = kvvfsDecode(aData, zBuf, iAmt); + } + } + if( nzClass, iAmt, iOfst)); + if( iEnd>=0x10000000 ) return SQLITE_FULL; + if( pFile->aJrnl==0 || pFile->nJrnlaJrnl, iEnd); + if( aNew==0 ){ + return SQLITE_IOERR_NOMEM; + } + pFile->aJrnl = aNew; + if( pFile->nJrnlaJrnl+pFile->nJrnl, 0, iOfst-pFile->nJrnl); + } + pFile->nJrnl = iEnd; + } + memcpy(pFile->aJrnl+iOfst, zBuf, iAmt); + return SQLITE_OK; +} + +/* +** Write into the database file. +*/ +static int kvvfsWriteDb( + sqlite3_file *pProtoFile, + const void *zBuf, + int iAmt, + sqlite_int64 iOfst +){ + KVVfsFile *pFile = (KVVfsFile*)pProtoFile; + unsigned int pgno; + char zKey[30]; + char aData[131073]; + SQLITE_KV_LOG(("xWrite('%s-db',%d,%lld)\n", pFile->zClass, iAmt, iOfst)); + assert( iAmt>=512 && iAmt<=65536 ); + assert( (iAmt & (iAmt-1))==0 ); + assert( pFile->szPage<0 || pFile->szPage==iAmt ); + pFile->szPage = iAmt; + pgno = 1 + iOfst/iAmt; + sqlite3_snprintf(sizeof(zKey), zKey, "%u", pgno); + kvvfsEncode(zBuf, iAmt, aData); + if( sqlite3KvvfsMethods.xWrite(pFile->zClass, zKey, aData) ){ + return SQLITE_IOERR; + } + if( iOfst+iAmt > pFile->szDb ){ + pFile->szDb = iOfst + iAmt; + } + return SQLITE_OK; +} + +/* +** Truncate an kvvfs-file. +*/ +static int kvvfsTruncateJrnl(sqlite3_file *pProtoFile, sqlite_int64 size){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + SQLITE_KV_LOG(("xTruncate('%s-journal',%lld)\n", pFile->zClass, size)); + assert( size==0 ); + sqlite3KvvfsMethods.xDelete(pFile->zClass, "jrnl"); + sqlite3_free(pFile->aJrnl); + pFile->aJrnl = 0; + pFile->nJrnl = 0; + return SQLITE_OK; +} +static int kvvfsTruncateDb(sqlite3_file *pProtoFile, sqlite_int64 size){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + if( pFile->szDb>size + && pFile->szPage>0 + && (size % pFile->szPage)==0 + ){ + char zKey[50]; + unsigned int pgno, pgnoMax; + SQLITE_KV_LOG(("xTruncate('%s-db',%lld)\n", pFile->zClass, size)); + pgno = 1 + size/pFile->szPage; + pgnoMax = 2 + pFile->szDb/pFile->szPage; + while( pgno<=pgnoMax ){ + sqlite3_snprintf(sizeof(zKey), zKey, "%u", pgno); + sqlite3KvvfsMethods.xDelete(pFile->zClass, zKey); + pgno++; + } + pFile->szDb = size; + return kvvfsWriteFileSize(pFile, size) ? SQLITE_IOERR : SQLITE_OK; + } + return SQLITE_IOERR; +} + +/* +** Sync an kvvfs-file. +*/ +static int kvvfsSyncJrnl(sqlite3_file *pProtoFile, int flags){ + int i, n; + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + char *zOut; + SQLITE_KV_LOG(("xSync('%s-journal')\n", pFile->zClass)); + if( pFile->nJrnl<=0 ){ + return kvvfsTruncateJrnl(pProtoFile, 0); + } + zOut = sqlite3_malloc64( pFile->nJrnl*2 + 50 ); + if( zOut==0 ){ + return SQLITE_IOERR_NOMEM; + } + n = pFile->nJrnl; + i = 0; + do{ + zOut[i++] = 'a' + (n%26); + n /= 26; + }while( n>0 ); + zOut[i++] = ' '; + kvvfsEncode(pFile->aJrnl, pFile->nJrnl, &zOut[i]); + i = sqlite3KvvfsMethods.xWrite(pFile->zClass, "jrnl", zOut); + sqlite3_free(zOut); + return i ? SQLITE_IOERR : SQLITE_OK; +} +static int kvvfsSyncDb(sqlite3_file *pProtoFile, int flags){ + return SQLITE_OK; +} + +/* +** Return the current file-size of an kvvfs-file. +*/ +static int kvvfsFileSizeJrnl(sqlite3_file *pProtoFile, sqlite_int64 *pSize){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + SQLITE_KV_LOG(("xFileSize('%s-journal')\n", pFile->zClass)); + *pSize = pFile->nJrnl; + return SQLITE_OK; +} +static int kvvfsFileSizeDb(sqlite3_file *pProtoFile, sqlite_int64 *pSize){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + SQLITE_KV_LOG(("xFileSize('%s-db')\n", pFile->zClass)); + if( pFile->szDb>=0 ){ + *pSize = pFile->szDb; + }else{ + *pSize = kvvfsReadFileSize(pFile); + } + return SQLITE_OK; +} + +/* +** Lock an kvvfs-file. +*/ +static int kvvfsLock(sqlite3_file *pProtoFile, int eLock){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + assert( !pFile->isJournal ); + SQLITE_KV_LOG(("xLock(%s,%d)\n", pFile->zClass, eLock)); + + if( eLock!=SQLITE_LOCK_NONE ){ + pFile->szDb = kvvfsReadFileSize(pFile); + } + return SQLITE_OK; +} + +/* +** Unlock an kvvfs-file. +*/ +static int kvvfsUnlock(sqlite3_file *pProtoFile, int eLock){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + assert( !pFile->isJournal ); + SQLITE_KV_LOG(("xUnlock(%s,%d)\n", pFile->zClass, eLock)); + if( eLock==SQLITE_LOCK_NONE ){ + pFile->szDb = -1; + } + return SQLITE_OK; +} + +/* +** Check if another file-handle holds a RESERVED lock on an kvvfs-file. +*/ +static int kvvfsCheckReservedLock(sqlite3_file *pProtoFile, int *pResOut){ + SQLITE_KV_LOG(("xCheckReservedLock\n")); + *pResOut = 0; + return SQLITE_OK; +} + +/* +** File control method. For custom operations on an kvvfs-file. +*/ +static int kvvfsFileControlJrnl(sqlite3_file *pProtoFile, int op, void *pArg){ + SQLITE_KV_LOG(("xFileControl(%d) on journal\n", op)); + return SQLITE_NOTFOUND; +} +static int kvvfsFileControlDb(sqlite3_file *pProtoFile, int op, void *pArg){ + SQLITE_KV_LOG(("xFileControl(%d) on database\n", op)); + if( op==SQLITE_FCNTL_SYNC ){ + KVVfsFile *pFile = (KVVfsFile *)pProtoFile; + int rc = SQLITE_OK; + SQLITE_KV_LOG(("xSync('%s-db')\n", pFile->zClass)); + if( pFile->szDb>0 && 0!=kvvfsWriteFileSize(pFile, pFile->szDb) ){ + rc = SQLITE_IOERR; + } + return rc; + } + return SQLITE_NOTFOUND; +} + +/* +** Return the sector-size in bytes for an kvvfs-file. +*/ +static int kvvfsSectorSize(sqlite3_file *pFile){ + return 512; +} + +/* +** Return the device characteristic flags supported by an kvvfs-file. +*/ +static int kvvfsDeviceCharacteristics(sqlite3_file *pProtoFile){ + return 0; +} + +/****** sqlite3_vfs methods *************************************************/ + +/* +** Open an kvvfs file handle. +*/ +static int kvvfsOpen( + sqlite3_vfs *pProtoVfs, + const char *zName, + sqlite3_file *pProtoFile, + int flags, + int *pOutFlags +){ + KVVfsFile *pFile = (KVVfsFile*)pProtoFile; + if( zName==0 ) zName = ""; + SQLITE_KV_LOG(("xOpen(\"%s\")\n", zName)); + if( strcmp(zName, "local")==0 + || strcmp(zName, "session")==0 + ){ + pFile->isJournal = 0; + pFile->base.pMethods = &kvvfs_db_io_methods; + }else + if( strcmp(zName, "local-journal")==0 + || strcmp(zName, "session-journal")==0 + ){ + pFile->isJournal = 1; + pFile->base.pMethods = &kvvfs_jrnl_io_methods; + }else{ + return SQLITE_CANTOPEN; + } + if( zName[0]=='s' ){ + pFile->zClass = "session"; + }else{ + pFile->zClass = "local"; + } + pFile->aJrnl = 0; + pFile->nJrnl = 0; + pFile->szPage = -1; + pFile->szDb = -1; + return SQLITE_OK; +} + +/* +** Delete the file located at zPath. If the dirSync argument is true, +** ensure the file-system modifications are synced to disk before +** returning. +*/ +static int kvvfsDelete(sqlite3_vfs *pVfs, const char *zPath, int dirSync){ + if( strcmp(zPath, "local-journal")==0 ){ + sqlite3KvvfsMethods.xDelete("local", "jrnl"); + }else + if( strcmp(zPath, "session-journal")==0 ){ + sqlite3KvvfsMethods.xDelete("session", "jrnl"); + } + return SQLITE_OK; +} + +/* +** Test for access permissions. Return true if the requested permission +** is available, or false otherwise. +*/ +static int kvvfsAccess( + sqlite3_vfs *pProtoVfs, + const char *zPath, + int flags, + int *pResOut +){ + SQLITE_KV_LOG(("xAccess(\"%s\")\n", zPath)); + if( strcmp(zPath, "local-journal")==0 ){ + *pResOut = sqlite3KvvfsMethods.xRead("local", "jrnl", 0, 0)>0; + }else + if( strcmp(zPath, "session-journal")==0 ){ + *pResOut = sqlite3KvvfsMethods.xRead("session", "jrnl", 0, 0)>0; + }else + if( strcmp(zPath, "local")==0 ){ + *pResOut = sqlite3KvvfsMethods.xRead("local", "sz", 0, 0)>0; + }else + if( strcmp(zPath, "session")==0 ){ + *pResOut = sqlite3KvvfsMethods.xRead("session", "sz", 0, 0)>0; + }else + { + *pResOut = 0; + } + SQLITE_KV_LOG(("xAccess returns %d\n",*pResOut)); + return SQLITE_OK; +} + +/* +** Populate buffer zOut with the full canonical pathname corresponding +** to the pathname in zPath. zOut is guaranteed to point to a buffer +** of at least (INST_MAX_PATHNAME+1) bytes. +*/ +static int kvvfsFullPathname( + sqlite3_vfs *pVfs, + const char *zPath, + int nOut, + char *zOut +){ + size_t nPath; +#ifdef SQLITE_OS_KV_ALWAYS_LOCAL + zPath = "local"; +#endif + nPath = strlen(zPath); + SQLITE_KV_LOG(("xFullPathname(\"%s\")\n", zPath)); + if( nOut +static int kvvfsCurrentTimeInt64(sqlite3_vfs *pVfs, sqlite3_int64 *pTimeOut){ + static const sqlite3_int64 unixEpoch = 24405875*(sqlite3_int64)8640000; + struct timeval sNow; + (void)gettimeofday(&sNow, 0); /* Cannot fail given valid arguments */ + *pTimeOut = unixEpoch + 1000*(sqlite3_int64)sNow.tv_sec + sNow.tv_usec/1000; + return SQLITE_OK; +} +#endif /* SQLITE_OS_KV || SQLITE_OS_UNIX */ + +#if SQLITE_OS_KV +/* +** This routine is called initialize the KV-vfs as the default VFS. +*/ +SQLITE_API int sqlite3_os_init(void){ + return sqlite3_vfs_register(&sqlite3OsKvvfsObject, 1); +} +SQLITE_API int sqlite3_os_end(void){ + return SQLITE_OK; +} +#endif /* SQLITE_OS_KV */ + +#if SQLITE_OS_UNIX && defined(SQLITE_OS_KV_OPTIONAL) +SQLITE_PRIVATE int sqlite3KvvfsInit(void){ + return sqlite3_vfs_register(&sqlite3OsKvvfsObject, 0); +} +#endif + +/************** End of os_kv.c ***********************************************/ /************** Begin file os_unix.c *****************************************/ /* ** 2004 May 22 @@ -34876,13 +36757,13 @@ SQLITE_PRIVATE const char *sqlite3OpcodeName(int i){ /* ** standard include files. */ -#include -#include +#include /* amalgamator: keep */ +#include /* amalgamator: keep */ #include #include -#include +#include /* amalgamator: keep */ /* #include */ -#include +#include /* amalgamator: keep */ #include #if !defined(SQLITE_OMIT_WAL) || SQLITE_MAX_MMAP_SIZE>0 # include @@ -35475,6 +37356,9 @@ static int robust_open(const char *z, int f, mode_t m){ break; } if( fd>=SQLITE_MINIMUM_FILE_DESCRIPTOR ) break; + if( (f & (O_EXCL|O_CREAT))==(O_EXCL|O_CREAT) ){ + (void)osUnlink(z); + } osClose(fd); sqlite3_log(SQLITE_WARNING, "attempt to open \"%s\" as file descriptor %d", z, fd); @@ -40644,6 +42528,7 @@ static const char *unixTempFileDir(void){ static int unixGetTempname(int nBuf, char *zBuf){ const char *zDir; int iLimit = 0; + int rc = SQLITE_OK; /* It's odd to simulate an io-error here, but really this is just ** using the io-error infrastructure to test that SQLite handles this @@ -40652,18 +42537,26 @@ static int unixGetTempname(int nBuf, char *zBuf){ zBuf[0] = 0; SimulateIOError( return SQLITE_IOERR ); + sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); zDir = unixTempFileDir(); - if( zDir==0 ) return SQLITE_IOERR_GETTEMPPATH; - do{ - u64 r; - sqlite3_randomness(sizeof(r), &r); - assert( nBuf>2 ); - zBuf[nBuf-2] = 0; - sqlite3_snprintf(nBuf, zBuf, "%s/"SQLITE_TEMP_FILE_PREFIX"%llx%c", - zDir, r, 0); - if( zBuf[nBuf-2]!=0 || (iLimit++)>10 ) return SQLITE_ERROR; - }while( osAccess(zBuf,0)==0 ); - return SQLITE_OK; + if( zDir==0 ){ + rc = SQLITE_IOERR_GETTEMPPATH; + }else{ + do{ + u64 r; + sqlite3_randomness(sizeof(r), &r); + assert( nBuf>2 ); + zBuf[nBuf-2] = 0; + sqlite3_snprintf(nBuf, zBuf, "%s/"SQLITE_TEMP_FILE_PREFIX"%llx%c", + zDir, r, 0); + if( zBuf[nBuf-2]!=0 || (iLimit++)>10 ){ + rc = SQLITE_ERROR; + break; + } + }while( osAccess(zBuf,0)==0 ); + } + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); + return rc; } #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__) @@ -40806,20 +42699,23 @@ static int findCreateFileMode( ** ** where NN is a decimal number. The NN naming schemes are ** used by the test_multiplex.c module. + ** + ** In normal operation, the journal file name will always contain + ** a '-' character. However in 8+3 filename mode, or if a corrupt + ** rollback journal specifies a super-journal with a goofy name, then + ** the '-' might be missing or the '-' might be the first character in + ** the filename. In that case, just return SQLITE_OK with *pMode==0. */ nDb = sqlite3Strlen30(zPath) - 1; - while( zPath[nDb]!='-' ){ - /* In normal operation, the journal file name will always contain - ** a '-' character. However in 8+3 filename mode, or if a corrupt - ** rollback journal specifies a super-journal with a goofy name, then - ** the '-' might be missing. */ - if( nDb==0 || zPath[nDb]=='.' ) return SQLITE_OK; + while( nDb>0 && zPath[nDb]!='.' ){ + if( zPath[nDb]=='-' ){ + memcpy(zDb, zPath, nDb); + zDb[nDb] = '\0'; + rc = getFileMode(zDb, pMode, pUid, pGid); + break; + } nDb--; } - memcpy(zDb, zPath, nDb); - zDb[nDb] = '\0'; - - rc = getFileMode(zDb, pMode, pUid, pGid); }else if( flags & SQLITE_OPEN_DELETEONCLOSE ){ *pMode = 0600; }else if( flags & SQLITE_OPEN_URI ){ @@ -41209,86 +43105,99 @@ static int unixAccess( } /* -** If the last component of the pathname in z[0]..z[j-1] is something -** other than ".." then back it out and return true. If the last -** component is empty or if it is ".." then return false. +** A pathname under construction */ -static int unixBackupDir(const char *z, int *pJ){ - int j = *pJ; - int i; - if( j<=0 ) return 0; - for(i=j-1; i>0 && z[i-1]!='/'; i--){} - if( i==0 ) return 0; - if( z[i]=='.' && i==j-2 && z[i+1]=='.' ) return 0; - *pJ = i-1; - return 1; -} +typedef struct DbPath DbPath; +struct DbPath { + int rc; /* Non-zero following any error */ + int nSymlink; /* Number of symlinks resolved */ + char *zOut; /* Write the pathname here */ + int nOut; /* Bytes of space available to zOut[] */ + int nUsed; /* Bytes of zOut[] currently being used */ +}; + +/* Forward reference */ +static void appendAllPathElements(DbPath*,const char*); /* -** Convert a relative pathname into a full pathname. Also -** simplify the pathname as follows: -** -** Remove all instances of /./ -** Remove all isntances of /X/../ for any X +** Append a single path element to the DbPath under construction */ -static int mkFullPathname( - const char *zPath, /* Input path */ - char *zOut, /* Output buffer */ - int nOut /* Allocated size of buffer zOut */ +static void appendOnePathElement( + DbPath *pPath, /* Path under construction, to which to append zName */ + const char *zName, /* Name to append to pPath. Not zero-terminated */ + int nName /* Number of significant bytes in zName */ ){ - int nPath = sqlite3Strlen30(zPath); - int iOff = 0; - int i, j; - if( zPath[0]!='/' ){ - if( osGetcwd(zOut, nOut-2)==0 ){ - return unixLogError(SQLITE_CANTOPEN_BKPT, "getcwd", zPath); + assert( nName>0 ); + assert( zName!=0 ); + if( zName[0]=='.' ){ + if( nName==1 ) return; + if( zName[1]=='.' && nName==2 ){ + if( pPath->nUsed<=1 ){ + pPath->rc = SQLITE_ERROR; + return; + } + assert( pPath->zOut[0]=='/' ); + while( pPath->zOut[--pPath->nUsed]!='/' ){} + return; } - iOff = sqlite3Strlen30(zOut); - zOut[iOff++] = '/'; } - if( (iOff+nPath+1)>nOut ){ - /* SQLite assumes that xFullPathname() nul-terminates the output buffer - ** even if it returns an error. */ - zOut[iOff] = '\0'; - return SQLITE_CANTOPEN_BKPT; - } - sqlite3_snprintf(nOut-iOff, &zOut[iOff], "%s", zPath); - - /* Remove duplicate '/' characters. Except, two // at the beginning - ** of a pathname is allowed since this is important on windows. */ - for(i=j=1; zOut[i]; i++){ - zOut[j++] = zOut[i]; - while( zOut[i]=='/' && zOut[i+1]=='/' ) i++; + if( pPath->nUsed + nName + 2 >= pPath->nOut ){ + pPath->rc = SQLITE_ERROR; + return; } - zOut[j] = 0; - - assert( zOut[0]=='/' ); - for(i=j=0; zOut[i]; i++){ - if( zOut[i]=='/' ){ - /* Skip over internal "/." directory components */ - if( zOut[i+1]=='.' && zOut[i+2]=='/' ){ - i += 1; - continue; + pPath->zOut[pPath->nUsed++] = '/'; + memcpy(&pPath->zOut[pPath->nUsed], zName, nName); + pPath->nUsed += nName; +#if defined(HAVE_READLINK) && defined(HAVE_LSTAT) + if( pPath->rc==SQLITE_OK ){ + const char *zIn; + struct stat buf; + pPath->zOut[pPath->nUsed] = 0; + zIn = pPath->zOut; + if( osLstat(zIn, &buf)!=0 ){ + if( errno!=ENOENT ){ + pPath->rc = unixLogError(SQLITE_CANTOPEN_BKPT, "lstat", zIn); } - - /* If this is a "/.." directory component then back out the - ** previous term of the directory if it is something other than "..". - */ - if( zOut[i+1]=='.' - && zOut[i+2]=='.' - && zOut[i+3]=='/' - && unixBackupDir(zOut, &j) - ){ - i += 2; - continue; + }else if( S_ISLNK(buf.st_mode) ){ + ssize_t got; + char zLnk[SQLITE_MAX_PATHLEN+2]; + if( pPath->nSymlink++ > SQLITE_MAX_SYMLINK ){ + pPath->rc = SQLITE_CANTOPEN_BKPT; + return; + } + got = osReadlink(zIn, zLnk, sizeof(zLnk)-2); + if( got<=0 || got>=(ssize_t)sizeof(zLnk)-2 ){ + pPath->rc = unixLogError(SQLITE_CANTOPEN_BKPT, "readlink", zIn); + return; } + zLnk[got] = 0; + if( zLnk[0]=='/' ){ + pPath->nUsed = 0; + }else{ + pPath->nUsed -= nName + 1; + } + appendAllPathElements(pPath, zLnk); } - if( ALWAYS(j>=0) ) zOut[j] = zOut[i]; - j++; } - if( NEVER(j==0) ) zOut[j++] = '/'; - zOut[j] = 0; - return SQLITE_OK; +#endif +} + +/* +** Append all path elements in zPath to the DbPath under construction. +*/ +static void appendAllPathElements( + DbPath *pPath, /* Path under construction, to which to append zName */ + const char *zPath /* Path to append to pPath. Is zero-terminated */ +){ + int i = 0; + int j = 0; + do{ + while( zPath[i] && zPath[i]!='/' ){ i++; } + if( i>j ){ + appendOnePathElement(pPath, &zPath[j], i-j); + } + j = i+1; + }while( zPath[i++] ); } /* @@ -41306,86 +43215,27 @@ static int unixFullPathname( int nOut, /* Size of output buffer in bytes */ char *zOut /* Output buffer */ ){ -#if !defined(HAVE_READLINK) || !defined(HAVE_LSTAT) - return mkFullPathname(zPath, zOut, nOut); -#else - int rc = SQLITE_OK; - int nByte; - int nLink = 0; /* Number of symbolic links followed so far */ - const char *zIn = zPath; /* Input path for each iteration of loop */ - char *zDel = 0; - - assert( pVfs->mxPathname==MAX_PATHNAME ); + DbPath path; UNUSED_PARAMETER(pVfs); - - /* It's odd to simulate an io-error here, but really this is just - ** using the io-error infrastructure to test that SQLite handles this - ** function failing. This function could fail if, for example, the - ** current working directory has been unlinked. - */ - SimulateIOError( return SQLITE_ERROR ); - - do { - - /* Call stat() on path zIn. Set bLink to true if the path is a symbolic - ** link, or false otherwise. */ - int bLink = 0; - struct stat buf; - if( osLstat(zIn, &buf)!=0 ){ - if( errno!=ENOENT ){ - rc = unixLogError(SQLITE_CANTOPEN_BKPT, "lstat", zIn); - } - }else{ - bLink = S_ISLNK(buf.st_mode); - } - - if( bLink ){ - nLink++; - if( zDel==0 ){ - zDel = sqlite3_malloc(nOut); - if( zDel==0 ) rc = SQLITE_NOMEM_BKPT; - }else if( nLink>=SQLITE_MAX_SYMLINKS ){ - rc = SQLITE_CANTOPEN_BKPT; - } - - if( rc==SQLITE_OK ){ - nByte = osReadlink(zIn, zDel, nOut-1); - if( nByte<0 ){ - rc = unixLogError(SQLITE_CANTOPEN_BKPT, "readlink", zIn); - }else{ - if( zDel[0]!='/' ){ - int n; - for(n = sqlite3Strlen30(zIn); n>0 && zIn[n-1]!='/'; n--); - if( nByte+n+1>nOut ){ - rc = SQLITE_CANTOPEN_BKPT; - }else{ - memmove(&zDel[n], zDel, nByte+1); - memcpy(zDel, zIn, n); - nByte += n; - } - } - zDel[nByte] = '\0'; - } - } - - zIn = zDel; - } - - assert( rc!=SQLITE_OK || zIn!=zOut || zIn[0]=='/' ); - if( rc==SQLITE_OK && zIn!=zOut ){ - rc = mkFullPathname(zIn, zOut, nOut); + path.rc = 0; + path.nUsed = 0; + path.nSymlink = 0; + path.nOut = nOut; + path.zOut = zOut; + if( zPath[0]!='/' ){ + char zPwd[SQLITE_MAX_PATHLEN+2]; + if( osGetcwd(zPwd, sizeof(zPwd)-2)==0 ){ + return unixLogError(SQLITE_CANTOPEN_BKPT, "getcwd", zPath); } - if( bLink==0 ) break; - zIn = zOut; - }while( rc==SQLITE_OK ); - - sqlite3_free(zDel); - if( rc==SQLITE_OK && nLink ) rc = SQLITE_OK_SYMLINK; - return rc; -#endif /* HAVE_READLINK && HAVE_LSTAT */ + appendAllPathElements(&path, zPwd); + } + appendAllPathElements(&path, zPath); + zOut[path.nUsed] = 0; + if( path.rc || path.nUsed<2 ) return SQLITE_CANTOPEN_BKPT; + if( path.nSymlink ) return SQLITE_OK_SYMLINK; + return SQLITE_OK; } - #ifndef SQLITE_OMIT_LOAD_EXTENSION /* ** Interfaces for opening a shared library, finding entry points @@ -42881,8 +44731,16 @@ SQLITE_API int sqlite3_os_init(void){ /* Register all VFSes defined in the aVfs[] array */ for(i=0; i<(sizeof(aVfs)/sizeof(sqlite3_vfs)); i++){ +#ifdef SQLITE_DEFAULT_UNIX_VFS + sqlite3_vfs_register(&aVfs[i], + 0==strcmp(aVfs[i].zName,SQLITE_DEFAULT_UNIX_VFS)); +#else sqlite3_vfs_register(&aVfs[i], i==0); +#endif } +#ifdef SQLITE_OS_KV_OPTIONAL + sqlite3KvvfsInit(); +#endif unixBigLock = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_VFS1); #ifndef SQLITE_OMIT_WAL @@ -44845,10 +46703,12 @@ SQLITE_API int sqlite3_win32_set_directory8( const char *zValue /* New value for directory being set or reset */ ){ char **ppDirectory = 0; + int rc; #ifndef SQLITE_OMIT_AUTOINIT - int rc = sqlite3_initialize(); + rc = sqlite3_initialize(); if( rc ) return rc; #endif + sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); if( type==SQLITE_WIN32_DATA_DIRECTORY_TYPE ){ ppDirectory = &sqlite3_data_directory; }else if( type==SQLITE_WIN32_TEMP_DIRECTORY_TYPE ){ @@ -44863,14 +46723,19 @@ SQLITE_API int sqlite3_win32_set_directory8( if( zValue && zValue[0] ){ zCopy = sqlite3_mprintf("%s", zValue); if ( zCopy==0 ){ - return SQLITE_NOMEM_BKPT; + rc = SQLITE_NOMEM_BKPT; + goto set_directory8_done; } } sqlite3_free(*ppDirectory); *ppDirectory = zCopy; - return SQLITE_OK; + rc = SQLITE_OK; + }else{ + rc = SQLITE_ERROR; } - return SQLITE_ERROR; +set_directory8_done: + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); + return rc; } /* @@ -47644,6 +49509,19 @@ static int winMakeEndInDirSep(int nBuf, char *zBuf){ return 0; } +/* +** If sqlite3_temp_directory is defined, take the mutex and return true. +** +** If sqlite3_temp_directory is NULL (undefined), omit the mutex and +** return false. +*/ +static int winTempDirDefined(void){ + sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); + if( sqlite3_temp_directory!=0 ) return 1; + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); + return 0; +} + /* ** Create a temporary file name and store the resulting pointer into pzBuf. ** The pointer returned in pzBuf must be freed via sqlite3_free(). @@ -47680,20 +49558,23 @@ static int winGetTempname(sqlite3_vfs *pVfs, char **pzBuf){ */ nDir = nMax - (nPre + 15); assert( nDir>0 ); - if( sqlite3_temp_directory ){ + if( winTempDirDefined() ){ int nDirLen = sqlite3Strlen30(sqlite3_temp_directory); if( nDirLen>0 ){ if( !winIsDirSep(sqlite3_temp_directory[nDirLen-1]) ){ nDirLen++; } if( nDirLen>nDir ){ + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); sqlite3_free(zBuf); OSTRACE(("TEMP-FILENAME rc=SQLITE_ERROR\n")); return winLogError(SQLITE_ERROR, 0, "winGetTempname1", 0); } sqlite3_snprintf(nMax, zBuf, "%s", sqlite3_temp_directory); } + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); } + #if defined(__CYGWIN__) else{ static const char *azDirs[] = { @@ -48482,7 +50363,7 @@ static BOOL winIsVerbatimPathname( ** pathname into zOut[]. zOut[] will be at least pVfs->mxPathname ** bytes in size. */ -static int winFullPathname( +static int winFullPathnameNoMutex( sqlite3_vfs *pVfs, /* Pointer to vfs object */ const char *zRelative, /* Possibly relative input path */ int nFull, /* Size of output buffer in bytes */ @@ -48661,6 +50542,20 @@ static int winFullPathname( } #endif } +static int winFullPathname( + sqlite3_vfs *pVfs, /* Pointer to vfs object */ + const char *zRelative, /* Possibly relative input path */ + int nFull, /* Size of output buffer in bytes */ + char *zFull /* Output buffer */ +){ + int rc; + MUTEX_LOGIC( sqlite3_mutex *pMutex; ) + MUTEX_LOGIC( pMutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR); ) + sqlite3_mutex_enter(pMutex); + rc = winFullPathnameNoMutex(pVfs, zRelative, nFull, zFull); + sqlite3_mutex_leave(pMutex); + return rc; +} #ifndef SQLITE_OMIT_LOAD_EXTENSION /* @@ -49197,6 +51092,7 @@ static int memdbTruncate(sqlite3_file*, sqlite3_int64 size); static int memdbSync(sqlite3_file*, int flags); static int memdbFileSize(sqlite3_file*, sqlite3_int64 *pSize); static int memdbLock(sqlite3_file*, int); +static int memdbUnlock(sqlite3_file*, int); /* static int memdbCheckReservedLock(sqlite3_file*, int *pResOut);// not used */ static int memdbFileControl(sqlite3_file*, int op, void *pArg); /* static int memdbSectorSize(sqlite3_file*); // not used */ @@ -49255,7 +51151,7 @@ static const sqlite3_io_methods memdb_io_methods = { memdbSync, /* xSync */ memdbFileSize, /* xFileSize */ memdbLock, /* xLock */ - memdbLock, /* xUnlock - same as xLock in this case */ + memdbUnlock, /* xUnlock */ 0, /* memdbCheckReservedLock, */ /* xCheckReservedLock */ memdbFileControl, /* xFileControl */ 0, /* memdbSectorSize,*/ /* xSectorSize */ @@ -49456,39 +51352,81 @@ static int memdbLock(sqlite3_file *pFile, int eLock){ MemFile *pThis = (MemFile*)pFile; MemStore *p = pThis->pStore; int rc = SQLITE_OK; - if( eLock==pThis->eLock ) return SQLITE_OK; + if( eLock<=pThis->eLock ) return SQLITE_OK; memdbEnter(p); - if( eLock>SQLITE_LOCK_SHARED ){ - if( p->mFlags & SQLITE_DESERIALIZE_READONLY ){ - rc = SQLITE_READONLY; - }else if( pThis->eLock<=SQLITE_LOCK_SHARED ){ - if( p->nWrLock ){ - rc = SQLITE_BUSY; - }else{ - p->nWrLock = 1; + + assert( p->nWrLock==0 || p->nWrLock==1 ); + assert( pThis->eLock<=SQLITE_LOCK_SHARED || p->nWrLock==1 ); + assert( pThis->eLock==SQLITE_LOCK_NONE || p->nRdLock>=1 ); + + if( eLock>SQLITE_LOCK_SHARED && (p->mFlags & SQLITE_DESERIALIZE_READONLY) ){ + rc = SQLITE_READONLY; + }else{ + switch( eLock ){ + case SQLITE_LOCK_SHARED: { + assert( pThis->eLock==SQLITE_LOCK_NONE ); + if( p->nWrLock>0 ){ + rc = SQLITE_BUSY; + }else{ + p->nRdLock++; + } + break; + }; + + case SQLITE_LOCK_RESERVED: + case SQLITE_LOCK_PENDING: { + assert( pThis->eLock>=SQLITE_LOCK_SHARED ); + if( ALWAYS(pThis->eLock==SQLITE_LOCK_SHARED) ){ + if( p->nWrLock>0 ){ + rc = SQLITE_BUSY; + }else{ + p->nWrLock = 1; + } + } + break; + } + + default: { + assert( eLock==SQLITE_LOCK_EXCLUSIVE ); + assert( pThis->eLock>=SQLITE_LOCK_SHARED ); + if( p->nRdLock>1 ){ + rc = SQLITE_BUSY; + }else if( pThis->eLock==SQLITE_LOCK_SHARED ){ + p->nWrLock = 1; + } + break; } } - }else if( eLock==SQLITE_LOCK_SHARED ){ - if( pThis->eLock > SQLITE_LOCK_SHARED ){ - assert( p->nWrLock==1 ); - p->nWrLock = 0; - }else if( p->nWrLock ){ - rc = SQLITE_BUSY; - }else{ - p->nRdLock++; + } + if( rc==SQLITE_OK ) pThis->eLock = eLock; + memdbLeave(p); + return rc; +} + +/* +** Unlock an memdb-file. +*/ +static int memdbUnlock(sqlite3_file *pFile, int eLock){ + MemFile *pThis = (MemFile*)pFile; + MemStore *p = pThis->pStore; + if( eLock>=pThis->eLock ) return SQLITE_OK; + memdbEnter(p); + + assert( eLock==SQLITE_LOCK_SHARED || eLock==SQLITE_LOCK_NONE ); + if( eLock==SQLITE_LOCK_SHARED ){ + if( ALWAYS(pThis->eLock>SQLITE_LOCK_SHARED) ){ + p->nWrLock--; } }else{ - assert( eLock==SQLITE_LOCK_NONE ); if( pThis->eLock>SQLITE_LOCK_SHARED ){ - assert( p->nWrLock==1 ); - p->nWrLock = 0; + p->nWrLock--; } - assert( p->nRdLock>0 ); p->nRdLock--; } - if( rc==SQLITE_OK ) pThis->eLock = eLock; + + pThis->eLock = eLock; memdbLeave(p); - return rc; + return SQLITE_OK; } #if 0 @@ -49598,7 +51536,7 @@ static int memdbOpen( memset(pFile, 0, sizeof(*pFile)); szName = sqlite3Strlen30(zName); - if( szName>1 && zName[0]=='/' ){ + if( szName>1 && (zName[0]=='/' || zName[0]=='\\') ){ int i; #ifndef SQLITE_MUTEX_OMIT sqlite3_mutex *pVfsMutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_VFS1); @@ -49945,6 +51883,13 @@ end_deserialize: return rc; } +/* +** Return true if the VFS is the memvfs. +*/ +//SQLITE_PRIVATE int sqlite3IsMemdb(const sqlite3_vfs *pVfs){ +// return pVfs==&memdb_vfs; +//} + /* ** This routine is called when the extension is loaded. ** Register the new VFS. @@ -50449,12 +52394,20 @@ struct PCache { int sqlite3PcacheTrace = 2; /* 0: off 1: simple 2: cache dumps */ int sqlite3PcacheMxDump = 9999; /* Max cache entries for pcacheDump() */ # define pcacheTrace(X) if(sqlite3PcacheTrace){sqlite3DebugPrintf X;} - void pcacheDump(PCache *pCache){ - int N; - int i, j; - sqlite3_pcache_page *pLower; + static void pcachePageTrace(int i, sqlite3_pcache_page *pLower){ PgHdr *pPg; unsigned char *a; + int j; + pPg = (PgHdr*)pLower->pExtra; + printf("%3d: nRef %2d flgs %02x data ", i, pPg->nRef, pPg->flags); + a = (unsigned char *)pLower->pBuf; + for(j=0; j<12; j++) printf("%02x", a[j]); + printf(" ptr %p\n", pPg); + } + static void pcacheDump(PCache *pCache){ + int N; + int i; + sqlite3_pcache_page *pLower; if( sqlite3PcacheTrace<2 ) return; if( pCache->pCache==0 ) return; @@ -50463,21 +52416,32 @@ struct PCache { for(i=1; i<=N; i++){ pLower = sqlite3GlobalConfig.pcache2.xFetch(pCache->pCache, i, 0); if( pLower==0 ) continue; - pPg = (PgHdr*)pLower->pExtra; - printf("%3d: nRef %2d flgs %02x data ", i, pPg->nRef, pPg->flags); - a = (unsigned char *)pLower->pBuf; - for(j=0; j<12; j++) printf("%02x", a[j]); - printf("\n"); - if( pPg->pPage==0 ){ + pcachePageTrace(i, pLower); + if( ((PgHdr*)pLower)->pPage==0 ){ sqlite3GlobalConfig.pcache2.xUnpin(pCache->pCache, pLower, 0); } } } - #else +#else # define pcacheTrace(X) +# define pcachePageTrace(PGNO, X) # define pcacheDump(X) #endif +/* +** Return 1 if pPg is on the dirty list for pCache. Return 0 if not. +** This routine runs inside of assert() statements only. +*/ +#ifdef SQLITE_DEBUG +static int pageOnDirtyList(PCache *pCache, PgHdr *pPg){ + PgHdr *p; + for(p=pCache->pDirty; p; p=p->pDirtyNext){ + if( p==pPg ) return 1; + } + return 0; +} +#endif + /* ** Check invariants on a PgHdr entry. Return true if everything is OK. ** Return false if any invariant is violated. @@ -50496,8 +52460,13 @@ SQLITE_PRIVATE int sqlite3PcachePageSanity(PgHdr *pPg){ assert( pCache!=0 ); /* Every page has an associated PCache */ if( pPg->flags & PGHDR_CLEAN ){ assert( (pPg->flags & PGHDR_DIRTY)==0 );/* Cannot be both CLEAN and DIRTY */ - assert( pCache->pDirty!=pPg ); /* CLEAN pages not on dirty list */ - assert( pCache->pDirtyTail!=pPg ); + assert( !pageOnDirtyList(pCache, pPg) );/* CLEAN pages not on dirty list */ + }else{ + assert( (pPg->flags & PGHDR_DIRTY)!=0 );/* If not CLEAN must be DIRTY */ + assert( pPg->pDirtyNext==0 || pPg->pDirtyNext->pDirtyPrev==pPg ); + assert( pPg->pDirtyPrev==0 || pPg->pDirtyPrev->pDirtyNext==pPg ); + assert( pPg->pDirtyPrev!=0 || pCache->pDirty==pPg ); + assert( pageOnDirtyList(pCache, pPg) ); } /* WRITEABLE pages must also be DIRTY */ if( pPg->flags & PGHDR_WRITEABLE ){ @@ -50771,8 +52740,9 @@ SQLITE_PRIVATE sqlite3_pcache_page *sqlite3PcacheFetch( assert( createFlag==0 || pCache->eCreate==eCreate ); assert( createFlag==0 || eCreate==1+(!pCache->bPurgeable||!pCache->pDirty) ); pRes = sqlite3GlobalConfig.pcache2.xFetch(pCache->pCache, pgno, eCreate); - pcacheTrace(("%p.FETCH %d%s (result: %p)\n",pCache,pgno, + pcacheTrace(("%p.FETCH %d%s (result: %p) ",pCache,pgno, createFlag?" create":"",pRes)); + pcachePageTrace(pgno, pRes); return pRes; } @@ -50900,6 +52870,7 @@ SQLITE_PRIVATE void SQLITE_NOINLINE sqlite3PcacheRelease(PgHdr *p){ pcacheUnpin(p); }else{ pcacheManageDirtyList(p, PCACHE_DIRTYLIST_FRONT); + assert( sqlite3PcachePageSanity(p) ); } } } @@ -50934,8 +52905,7 @@ SQLITE_PRIVATE void sqlite3PcacheDrop(PgHdr *p){ ** make it so. */ SQLITE_PRIVATE void sqlite3PcacheMakeDirty(PgHdr *p){ - assert( p->nRef>0 || p->pCache->bPurgeable==0 ); - testcase( p->nRef==0 ); + assert( p->nRef>0 ); assert( sqlite3PcachePageSanity(p) ); if( p->flags & (PGHDR_CLEAN|PGHDR_DONT_WRITE) ){ /*OPTIMIZATION-IF-FALSE*/ p->flags &= ~PGHDR_DONT_WRITE; @@ -50944,6 +52914,7 @@ SQLITE_PRIVATE void sqlite3PcacheMakeDirty(PgHdr *p){ pcacheTrace(("%p.DIRTY %d\n",p->pCache,p->pgno)); assert( (p->flags & (PGHDR_DIRTY|PGHDR_CLEAN))==PGHDR_DIRTY ); pcacheManageDirtyList(p, PCACHE_DIRTYLIST_ADD); + assert( sqlite3PcachePageSanity(p) ); } assert( sqlite3PcachePageSanity(p) ); } @@ -51006,14 +52977,24 @@ SQLITE_PRIVATE void sqlite3PcacheClearSyncFlags(PCache *pCache){ */ SQLITE_PRIVATE void sqlite3PcacheMove(PgHdr *p, Pgno newPgno){ PCache *pCache = p->pCache; + sqlite3_pcache_page *pOther; assert( p->nRef>0 ); assert( newPgno>0 ); assert( sqlite3PcachePageSanity(p) ); pcacheTrace(("%p.MOVE %d -> %d\n",pCache,p->pgno,newPgno)); + pOther = sqlite3GlobalConfig.pcache2.xFetch(pCache->pCache, newPgno, 0); + if( pOther ){ + PgHdr *pXPage = (PgHdr*)pOther->pExtra; + assert( pXPage->nRef==0 ); + pXPage->nRef++; + pCache->nRefSum++; + sqlite3PcacheDrop(pXPage); + } sqlite3GlobalConfig.pcache2.xRekey(pCache->pCache, p->pPage, p->pgno,newPgno); p->pgno = newPgno; if( (p->flags&PGHDR_DIRTY) && (p->flags&PGHDR_NEED_SYNC) ){ pcacheManageDirtyList(p, PCACHE_DIRTYLIST_FRONT); + assert( sqlite3PcachePageSanity(p) ); } } @@ -51311,12 +53292,13 @@ SQLITE_PRIVATE void sqlite3PcacheIterateDirty(PCache *pCache, void (*xIter)(PgHd ** size can vary according to architecture, compile-time options, and ** SQLite library version number. ** -** If SQLITE_PCACHE_SEPARATE_HEADER is defined, then the extension is obtained -** using a separate memory allocation from the database page content. This -** seeks to overcome the "clownshoe" problem (also called "internal -** fragmentation" in academic literature) of allocating a few bytes more -** than a power of two with the memory allocator rounding up to the next -** power of two, and leaving the rounded-up space unused. +** Historical note: It used to be that if the SQLITE_PCACHE_SEPARATE_HEADER +** was defined, then the page content would be held in a separate memory +** allocation from the PgHdr1. This was intended to avoid clownshoe memory +** allocations. However, the btree layer needs a small (16-byte) overrun +** area after the page content buffer. The header serves as that overrun +** area. Therefore SQLITE_PCACHE_SEPARATE_HEADER was discontinued to avoid +** any possibility of a memory error. ** ** This module tracks pointers to PgHdr1 objects. Only pcache.c communicates ** with this module. Information is passed back and forth as PgHdr1 pointers. @@ -51361,30 +53343,40 @@ typedef struct PGroup PGroup; /* ** Each cache entry is represented by an instance of the following -** structure. Unless SQLITE_PCACHE_SEPARATE_HEADER is defined, a buffer of -** PgHdr1.pCache->szPage bytes is allocated directly before this structure -** in memory. +** structure. A buffer of PgHdr1.pCache->szPage bytes is allocated +** directly before this structure and is used to cache the page content. ** -** Note: Variables isBulkLocal and isAnchor were once type "u8". That works, +** When reading a corrupt database file, it is possible that SQLite might +** read a few bytes (no more than 16 bytes) past the end of the page buffer. +** It will only read past the end of the page buffer, never write. This +** object is positioned immediately after the page buffer to serve as an +** overrun area, so that overreads are harmless. +** +** Variables isBulkLocal and isAnchor were once type "u8". That works, ** but causes a 2-byte gap in the structure for most architectures (since ** pointers must be either 4 or 8-byte aligned). As this structure is located ** in memory directly after the associated page data, if the database is ** corrupt, code at the b-tree layer may overread the page buffer and ** read part of this structure before the corruption is detected. This ** can cause a valgrind error if the unitialized gap is accessed. Using u16 -** ensures there is no such gap, and therefore no bytes of unitialized memory -** in the structure. +** ensures there is no such gap, and therefore no bytes of uninitialized +** memory in the structure. +** +** The pLruNext and pLruPrev pointers form a double-linked circular list +** of all pages that are unpinned. The PGroup.lru element (which should be +** the only element on the list with PgHdr1.isAnchor set to 1) forms the +** beginning and the end of the list. */ struct PgHdr1 { - sqlite3_pcache_page page; /* Base class. Must be first. pBuf & pExtra */ - unsigned int iKey; /* Key value (page number) */ - u16 isBulkLocal; /* This page from bulk local storage */ - u16 isAnchor; /* This is the PGroup.lru element */ - PgHdr1 *pNext; /* Next in hash table chain */ - PCache1 *pCache; /* Cache that currently owns this page */ - PgHdr1 *pLruNext; /* Next in LRU list of unpinned pages */ - PgHdr1 *pLruPrev; /* Previous in LRU list of unpinned pages */ - /* NB: pLruPrev is only valid if pLruNext!=0 */ + sqlite3_pcache_page page; /* Base class. Must be first. pBuf & pExtra */ + unsigned int iKey; /* Key value (page number) */ + u16 isBulkLocal; /* This page from bulk local storage */ + u16 isAnchor; /* This is the PGroup.lru element */ + PgHdr1 *pNext; /* Next in hash table chain */ + PCache1 *pCache; /* Cache that currently owns this page */ + PgHdr1 *pLruNext; /* Next in circular LRU list of unpinned pages */ + PgHdr1 *pLruPrev; /* Previous in LRU list of unpinned pages */ + /* NB: pLruPrev is only valid if pLruNext!=0 */ }; /* @@ -51710,25 +53702,13 @@ static PgHdr1 *pcache1AllocPage(PCache1 *pCache, int benignMalloc){ pcache1LeaveMutex(pCache->pGroup); #endif if( benignMalloc ){ sqlite3BeginBenignMalloc(); } -#ifdef SQLITE_PCACHE_SEPARATE_HEADER - pPg = pcache1Alloc(pCache->szPage); - p = sqlite3Malloc(sizeof(PgHdr1) + pCache->szExtra); - if( !pPg || !p ){ - pcache1Free(pPg); - sqlite3_free(p); - pPg = 0; - } -#else pPg = pcache1Alloc(pCache->szAlloc); -#endif if( benignMalloc ){ sqlite3EndBenignMalloc(); } #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT pcache1EnterMutex(pCache->pGroup); #endif if( pPg==0 ) return 0; -#ifndef SQLITE_PCACHE_SEPARATE_HEADER p = (PgHdr1 *)&((u8 *)pPg)[pCache->szPage]; -#endif p->page.pBuf = pPg; p->page.pExtra = &p[1]; p->isBulkLocal = 0; @@ -51752,9 +53732,6 @@ static void pcache1FreePage(PgHdr1 *p){ pCache->pFree = p; }else{ pcache1Free(p->page.pBuf); -#ifdef SQLITE_PCACHE_SEPARATE_HEADER - sqlite3_free(p); -#endif } (*pCache->pnPurgeable)--; } @@ -52395,23 +54372,26 @@ static void pcache1Rekey( PCache1 *pCache = (PCache1 *)p; PgHdr1 *pPage = (PgHdr1 *)pPg; PgHdr1 **pp; - unsigned int h; + unsigned int hOld, hNew; assert( pPage->iKey==iOld ); assert( pPage->pCache==pCache ); + assert( iOld!=iNew ); /* The page number really is changing */ pcache1EnterMutex(pCache->pGroup); - h = iOld%pCache->nHash; - pp = &pCache->apHash[h]; + assert( pcache1FetchNoMutex(p, iOld, 0)==pPage ); /* pPg really is iOld */ + hOld = iOld%pCache->nHash; + pp = &pCache->apHash[hOld]; while( (*pp)!=pPage ){ pp = &(*pp)->pNext; } *pp = pPage->pNext; - h = iNew%pCache->nHash; + assert( pcache1FetchNoMutex(p, iNew, 0)==0 ); /* iNew not in cache */ + hNew = iNew%pCache->nHash; pPage->iKey = iNew; - pPage->pNext = pCache->apHash[h]; - pCache->apHash[h] = pPage; + pPage->pNext = pCache->apHash[hNew]; + pCache->apHash[hNew] = pPage; if( iNew>pCache->iMaxKey ){ pCache->iMaxKey = iNew; } @@ -52518,9 +54498,6 @@ SQLITE_PRIVATE int sqlite3PcacheReleaseMemory(int nReq){ && p->isAnchor==0 ){ nFree += pcache1MemSize(p->page.pBuf); -#ifdef SQLITE_PCACHE_SEPARATE_HEADER - nFree += sqlite3MemSize(p); -#endif assert( PAGE_IS_UNPINNED(p) ); pcache1PinPage(p); pcache1RemoveFromHash(p, 1); @@ -53904,6 +55881,7 @@ struct Pager { u32 vfsFlags; /* Flags for sqlite3_vfs.xOpen() */ u32 sectorSize; /* Assumed sector size during rollback */ Pgno mxPgno; /* Maximum allowed size of the database */ + Pgno lckPgno; /* Page number for the locking page */ i64 pageSize; /* Number of bytes in a page */ i64 journalSizeLimit; /* Size limit for persistent journal files */ char *zFilename; /* Name of the database file */ @@ -54890,7 +56868,7 @@ static int readJournalHdr( ** journal file descriptor is advanced to the next sector boundary before ** anything is written. The format is: ** -** + 4 bytes: PAGER_MJ_PGNO. +** + 4 bytes: PAGER_SJ_PGNO. ** + N bytes: super-journal filename in utf-8. ** + 4 bytes: N (length of super-journal name in bytes, no nul-terminator). ** + 4 bytes: super-journal name checksum. @@ -54938,7 +56916,7 @@ static int writeSuperJournal(Pager *pPager, const char *zSuper){ /* Write the super-journal data to the end of the journal file. If ** an error occurs, return the error code to the caller. */ - if( (0 != (rc = write32bits(pPager->jfd, iHdrOff, PAGER_MJ_PGNO(pPager)))) + if( (0 != (rc = write32bits(pPager->jfd, iHdrOff, PAGER_SJ_PGNO(pPager)))) || (0 != (rc = sqlite3OsWrite(pPager->jfd, zSuper, nSuper, iHdrOff+4))) || (0 != (rc = write32bits(pPager->jfd, iHdrOff+4+nSuper, nSuper))) || (0 != (rc = write32bits(pPager->jfd, iHdrOff+4+nSuper+4, cksum))) @@ -55448,7 +57426,7 @@ static u32 pager_cksum(Pager *pPager, const u8 *aData){ ** corrupted, SQLITE_DONE is returned. Data is considered corrupted in ** two circumstances: ** -** * If the record page-number is illegal (0 or PAGER_MJ_PGNO), or +** * If the record page-number is illegal (0 or PAGER_SJ_PGNO), or ** * If the record is being rolled back from the main journal file ** and the checksum field does not match the record content. ** @@ -55508,7 +57486,7 @@ static int pager_playback_one_page( ** it could cause invalid data to be written into the journal. We need to ** detect this invalid data (with high probability) and ignore it. */ - if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){ + if( pgno==0 || pgno==PAGER_SJ_PGNO(pPager) ){ assert( !isSavepnt ); return SQLITE_DONE; } @@ -55845,6 +57823,7 @@ static int pager_truncate(Pager *pPager, Pgno nPage){ memset(pTmp, 0, szPage); testcase( (newSize-szPage) == currentSize ); testcase( (newSize-szPage) > currentSize ); + sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_SIZE_HINT, &newSize); rc = sqlite3OsWrite(pPager->fd, pTmp, szPage, newSize-szPage); } if( rc==SQLITE_OK ){ @@ -56966,6 +58945,7 @@ SQLITE_PRIVATE int sqlite3PagerSetPagesize(Pager *pPager, u32 *pPageSize, int nR pPager->pTmpSpace = pNew; pPager->dbSize = (Pgno)((nByte+pageSize-1)/pageSize); pPager->pageSize = pageSize; + pPager->lckPgno = (Pgno)(PENDING_BYTE/pageSize) + 1; }else{ sqlite3PageFree(pNew); } @@ -58735,7 +60715,7 @@ static int getPageNormal( if( pPg->pPager && !noContent ){ /* In this case the pcache already contains an initialized copy of ** the page. Return without further ado. */ - assert( pgno!=PAGER_MJ_PGNO(pPager) ); + assert( pgno!=PAGER_SJ_PGNO(pPager) ); pPager->aStat[PAGER_STAT_HIT]++; return SQLITE_OK; @@ -58746,7 +60726,7 @@ static int getPageNormal( ** (*) obsolete. Was: maximum page number is 2^31 ** (2) Never try to fetch the locking page */ - if( pgno==PAGER_MJ_PGNO(pPager) ){ + if( pgno==PAGER_SJ_PGNO(pPager) ){ rc = SQLITE_CORRUPT_BKPT; goto pager_acquire_err; } @@ -59006,6 +60986,7 @@ static int pager_open_journal(Pager *pPager){ if( pPager->tempFile ){ flags |= (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL); + flags |= SQLITE_OPEN_EXCLUSIVE; nSpill = sqlite3Config.nStmtSpill; }else{ flags |= SQLITE_OPEN_MAIN_JOURNAL; @@ -59041,6 +61022,7 @@ static int pager_open_journal(Pager *pPager){ if( rc!=SQLITE_OK ){ sqlite3BitvecDestroy(pPager->pInJournal); pPager->pInJournal = 0; + pPager->journalOff = 0; }else{ assert( pPager->eState==PAGER_WRITER_LOCKED ); pPager->eState = PAGER_WRITER_CACHEMOD; @@ -59145,7 +61127,7 @@ static SQLITE_NOINLINE int pagerAddPageToRollbackJournal(PgHdr *pPg){ /* We should never write to the journal file the page that ** contains the database locks. The following assert verifies ** that we do not. */ - assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) ); + assert( pPg->pgno!=PAGER_SJ_PGNO(pPager) ); assert( pPager->journalHdr<=pPager->journalOff ); pData2 = pPg->pData; @@ -59324,7 +61306,7 @@ static SQLITE_NOINLINE int pagerWriteLargeSector(PgHdr *pPg){ Pgno pg = pg1+ii; PgHdr *pPage; if( pg==pPg->pgno || !sqlite3BitvecTest(pPager->pInJournal, pg) ){ - if( pg!=PAGER_MJ_PGNO(pPager) ){ + if( pg!=PAGER_SJ_PGNO(pPager) ){ rc = sqlite3PagerGet(pPager, pg, &pPage, 0); if( rc==SQLITE_OK ){ rc = pager_write(pPage); @@ -59802,7 +61784,7 @@ SQLITE_PRIVATE int sqlite3PagerCommitPhaseOne( ** last page is never written out to disk, leaving the database file ** undersized. Fix this now if it is the case. */ if( pPager->dbSize>pPager->dbFileSize ){ - Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager)); + Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_SJ_PGNO(pPager)); assert( pPager->eState==PAGER_WRITER_DBMOD ); rc = pager_truncate(pPager, nNew); if( rc!=SQLITE_OK ) goto commit_phase_one_exit; @@ -65431,7 +67413,7 @@ struct MemPage { u8 *aData; /* Pointer to disk image of the page data */ u8 *aDataEnd; /* One byte past the end of the entire page - not just ** the usable space, the entire page. Used to prevent - ** corruption-induced of buffer overflow. */ + ** corruption-induced buffer overflow. */ u8 *aCellIdx; /* The cell index area */ u8 *aDataOfst; /* Same as aData for leaves. aData+4 for interior */ DbPage *pDbPage; /* Pager page handle */ @@ -65736,7 +67718,7 @@ struct BtCursor { /* ** The database page the PENDING_BYTE occupies. This page is never used. */ -# define PENDING_BYTE_PAGE(pBt) PAGER_MJ_PGNO(pBt) +#define PENDING_BYTE_PAGE(pBt) ((Pgno)((PENDING_BYTE/((pBt)->pageSize))+1)) /* ** These macros define the location of the pointer-map entry for a @@ -66104,6 +68086,7 @@ SQLITE_PRIVATE int sqlite3BtreeHoldsAllMutexes(sqlite3 *db){ SQLITE_PRIVATE int sqlite3SchemaMutexHeld(sqlite3 *db, int iDb, Schema *pSchema){ Btree *p; assert( db!=0 ); + if( db->pVfs==0 && db->nDb==0 ) return 1; if( pSchema ) iDb = sqlite3SchemaToIndex(db, pSchema); assert( iDb>=0 && iDbnDb ); if( !sqlite3_mutex_held(db->mutex) ) return 0; @@ -66377,7 +68360,7 @@ static int hasSharedCacheTableLock( int bSeen = 0; for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ Index *pIdx = (Index *)sqliteHashData(p); - if( pIdx->tnum==(int)iRoot ){ + if( pIdx->tnum==iRoot ){ if( bSeen ){ /* Two or more indexes share the same root page. There must ** be imposter tables. So just return true. The assert is not @@ -66970,7 +68953,7 @@ SQLITE_PRIVATE void sqlite3BtreeClearCursor(BtCursor *pCur){ /* ** In this version of BtreeMoveto, pKey is a packed index record ** such as is generated by the OP_MakeRecord opcode. Unpack the -** record and then call BtreeMovetoUnpacked() to do the work. +** record and then call sqlite3BtreeIndexMoveto() to do the work. */ static int btreeMoveto( BtCursor *pCur, /* Cursor open on the btree to be searched */ @@ -67490,6 +69473,7 @@ static void btreeParseCell( ** the space used by the cell pointer. ** ** cellSizePtrNoPayload() => table internal nodes +** cellSizePtrTableLeaf() => table leaf nodes ** cellSizePtr() => all index nodes & table leaf nodes */ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ @@ -67515,13 +69499,6 @@ static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ }while( *(pIter)>=0x80 && pIterintKey ){ - /* pIter now points at the 64-bit integer key value, a variable length - ** integer. The following block moves pIter to point at the first byte - ** past the end of the key value. */ - pEnd = &pIter[9]; - while( (*pIter++)&0x80 && pItermaxLocal ); testcase( nSize==(u32)pPage->maxLocal+1 ); if( nSize<=pPage->maxLocal ){ @@ -67561,6 +69538,58 @@ static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){ assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB ); return (u16)(pIter - pCell); } +static u16 cellSizePtrTableLeaf(MemPage *pPage, u8 *pCell){ + u8 *pIter = pCell; /* For looping over bytes of pCell */ + u8 *pEnd; /* End mark for a varint */ + u32 nSize; /* Size value to return */ + +#ifdef SQLITE_DEBUG + /* The value returned by this function should always be the same as + ** the (CellInfo.nSize) value found by doing a full parse of the + ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of + ** this function verifies that this invariant is not violated. */ + CellInfo debuginfo; + pPage->xParseCell(pPage, pCell, &debuginfo); +#endif + + nSize = *pIter; + if( nSize>=0x80 ){ + pEnd = &pIter[8]; + nSize &= 0x7f; + do{ + nSize = (nSize<<7) | (*++pIter & 0x7f); + }while( *(pIter)>=0x80 && pItermaxLocal ); + testcase( nSize==(u32)pPage->maxLocal+1 ); + if( nSize<=pPage->maxLocal ){ + nSize += (u32)(pIter - pCell); + if( nSize<4 ) nSize = 4; + }else{ + int minLocal = pPage->minLocal; + nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); + testcase( nSize==pPage->maxLocal ); + testcase( nSize==(u32)pPage->maxLocal+1 ); + if( nSize>pPage->maxLocal ){ + nSize = minLocal; + } + nSize += 4 + (u16)(pIter - pCell); + } + assert( nSize==debuginfo.nSize || CORRUPT_DB ); + return (u16)nSize; +} #ifdef SQLITE_DEBUG @@ -67574,7 +69603,7 @@ static u16 cellSize(MemPage *pPage, int iCell){ #ifndef SQLITE_OMIT_AUTOVACUUM /* ** The cell pCell is currently part of page pSrc but will ultimately be part -** of pPage. (pSrc and pPager are often the same.) If pCell contains a +** of pPage. (pSrc and pPage are often the same.) If pCell contains a ** pointer to an overflow page, insert an entry into the pointer-map for ** the overflow page that will be valid after pCell has been moved to pPage. */ @@ -67630,8 +69659,7 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); assert( pPage->nOverflow==0 ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); - temp = 0; - src = data = pPage->aData; + data = pPage->aData; hdr = pPage->hdrOffset; cellOffset = pPage->cellOffset; nCell = pPage->nCell; @@ -67665,7 +69693,7 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage); memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz)); sz += sz2; - }else if( NEVER(iFree+sz>usableSize) ){ + }else if( iFree+sz>usableSize ){ return SQLITE_CORRUPT_PAGE(pPage); } @@ -67685,39 +69713,38 @@ static int defragmentPage(MemPage *pPage, int nMaxFrag){ cbrk = usableSize; iCellLast = usableSize - 4; iCellStart = get2byte(&data[hdr+5]); - for(i=0; iiCellLast ){ - return SQLITE_CORRUPT_PAGE(pPage); - } - assert( pc>=iCellStart && pc<=iCellLast ); - size = pPage->xCellSize(pPage, &src[pc]); - cbrk -= size; - if( cbrkusableSize ){ - return SQLITE_CORRUPT_PAGE(pPage); - } - assert( cbrk+size<=usableSize && cbrk>=iCellStart ); - testcase( cbrk+size==usableSize ); - testcase( pc+size==usableSize ); - put2byte(pAddr, cbrk); - if( temp==0 ){ - if( cbrk==pc ) continue; - temp = sqlite3PagerTempSpace(pPage->pBt->pPager); - memcpy(&temp[iCellStart], &data[iCellStart], usableSize - iCellStart); - src = temp; + if( nCell>0 ){ + temp = sqlite3PagerTempSpace(pPage->pBt->pPager); + memcpy(&temp[iCellStart], &data[iCellStart], usableSize - iCellStart); + src = temp; + for(i=0; iiCellLast ){ + return SQLITE_CORRUPT_PAGE(pPage); + } + assert( pc>=iCellStart && pc<=iCellLast ); + size = pPage->xCellSize(pPage, &src[pc]); + cbrk -= size; + if( cbrkusableSize ){ + return SQLITE_CORRUPT_PAGE(pPage); + } + assert( cbrk+size<=usableSize && cbrk>=iCellStart ); + testcase( cbrk+size==usableSize ); + testcase( pc+size==usableSize ); + put2byte(pAddr, cbrk); + memcpy(&data[cbrk], &src[pc], size); } - memcpy(&data[cbrk], &src[pc], size); } data[hdr+7] = 0; - defragment_out: +defragment_out: assert( pPage->nFree>=0 ); if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){ return SQLITE_CORRUPT_PAGE(pPage); @@ -67749,7 +69776,8 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ const int hdr = pPg->hdrOffset; /* Offset to page header */ u8 * const aData = pPg->aData; /* Page data */ int iAddr = hdr + 1; /* Address of ptr to pc */ - int pc = get2byte(&aData[iAddr]); /* Address of a free slot */ + u8 *pTmp = &aData[iAddr]; /* Temporary ptr into aData[] */ + int pc = get2byte(pTmp); /* Address of a free slot */ int x; /* Excess size of the slot */ int maxPC = pPg->pBt->usableSize - nByte; /* Max address for a usable slot */ int size; /* Size of the free slot */ @@ -67759,7 +69787,8 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each ** freeblock form a big-endian integer which is the size of the freeblock ** in bytes, including the 4-byte header. */ - size = get2byte(&aData[pc+2]); + pTmp = &aData[pc+2]; + size = get2byte(pTmp); if( (x = size - nByte)>=0 ){ testcase( x==4 ); testcase( x==3 ); @@ -67772,7 +69801,6 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ ** fragmented bytes within the page. */ memcpy(&aData[iAddr], &aData[pc], 2); aData[hdr+7] += (u8)x; - testcase( pc+x>maxPC ); return &aData[pc]; }else if( x+pc > maxPC ){ /* This slot extends off the end of the usable part of the page */ @@ -67786,10 +69814,11 @@ static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){ return &aData[pc + x]; } iAddr = pc; - pc = get2byte(&aData[pc]); - if( pc<=iAddr+size ){ + pTmp = &aData[pc]; + pc = get2byte(pTmp); + if( pc<=iAddr ){ if( pc ){ - /* The next slot in the chain is not past the end of the current slot */ + /* The next slot in the chain comes before the current slot */ *pRc = SQLITE_CORRUPT_PAGE(pPg); } return 0; @@ -67820,6 +69849,7 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ u8 * const data = pPage->aData; /* Local cache of pPage->aData */ int top; /* First byte of cell content area */ int rc = SQLITE_OK; /* Integer return code */ + u8 *pTmp; /* Temp ptr into data[] */ int gap; /* First byte of gap between cell pointers and cell content */ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); @@ -67838,7 +69868,8 @@ static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ ** then the cell content offset of an empty page wants to be 65536. ** However, that integer is too large to be stored in a 2-byte unsigned ** integer, so a value of 0 is used in its place. */ - top = get2byte(&data[hdr+5]); + pTmp = &data[hdr+5]; + top = get2byte(pTmp); assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */ if( gap>top ){ if( top==0 && pPage->pBt->usableSize==65536 ){ @@ -67920,6 +69951,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ u16 x; /* Offset to cell content area */ u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */ unsigned char *data = pPage->aData; /* Page content */ + u8 *pTmp; /* Temporary ptr into data[] */ assert( pPage->pBt!=0 ); assert( sqlite3PagerIswriteable(pPage->pDbPage) ); @@ -67938,7 +69970,7 @@ static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){ iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */ }else{ while( (iFreeBlk = get2byte(&data[iPtr]))data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage); data[hdr+7] -= nFrag; } - x = get2byte(&data[hdr+5]); + pTmp = &data[hdr+5]; + x = get2byte(pTmp); if( iStart<=x ){ /* The new freeblock is at the beginning of the cell content area, ** so just extend the cell content area rather than create another @@ -68026,7 +70059,6 @@ static int decodeFlags(MemPage *pPage, int flagByte){ pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 ); flagByte &= ~PTF_LEAF; pPage->childPtrSize = 4-4*pPage->leaf; - pPage->xCellSize = cellSizePtr; pBt = pPage->pBt; if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an @@ -68038,6 +70070,7 @@ static int decodeFlags(MemPage *pPage, int flagByte){ pPage->intKey = 1; if( pPage->leaf ){ pPage->intKeyLeaf = 1; + pPage->xCellSize = cellSizePtrTableLeaf; pPage->xParseCell = btreeParseCellPtr; }else{ pPage->intKeyLeaf = 0; @@ -68055,12 +70088,17 @@ static int decodeFlags(MemPage *pPage, int flagByte){ assert( (PTF_ZERODATA|PTF_LEAF)==10 ); pPage->intKey = 0; pPage->intKeyLeaf = 0; + pPage->xCellSize = cellSizePtr; pPage->xParseCell = btreeParseCellPtrIndex; pPage->maxLocal = pBt->maxLocal; pPage->minLocal = pBt->minLocal; }else{ /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is ** an error. */ + pPage->intKey = 0; + pPage->intKeyLeaf = 0; + pPage->xCellSize = cellSizePtr; + pPage->xParseCell = btreeParseCellPtrIndex; return SQLITE_CORRUPT_PAGE(pPage); } pPage->max1bytePayload = pBt->max1bytePayload; @@ -68414,7 +70452,7 @@ getAndInitPage_error1: pCur->pPage = pCur->apPage[pCur->iPage]; } testcase( pgno==0 ); - assert( pgno!=0 || rc==SQLITE_CORRUPT ); + assert( pgno!=0 || rc!=SQLITE_OK ); return rc; } @@ -69850,6 +71888,9 @@ static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ } } }else{ + if( pCell+4 > pPage->aData+pPage->pBt->usableSize ){ + return SQLITE_CORRUPT_PAGE(pPage); + } if( get4byte(pCell)==iFrom ){ put4byte(pCell, iTo); break; @@ -70036,12 +72077,17 @@ static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){ } do { MemPage *pFreePg; + Pgno dbSize = btreePagecount(pBt); rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode); if( rc!=SQLITE_OK ){ releasePage(pLastPg); return rc; } releasePage(pFreePg); + if( iFreePg>dbSize ){ + releasePage(pLastPg); + return SQLITE_CORRUPT_BKPT; + } }while( bCommit && iFreePg>nFin ); assert( iFreePgpBt; - assert( cursorOwnsBtShared(pCur) ); assert( pCur->eState==CURSOR_VALID ); assert( pCur->iPageapPage[pCur->iPage] = pCur->pPage; pCur->ix = 0; pCur->iPage++; - return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags); + return getAndInitPage(pCur->pBt, newPgno, &pCur->pPage, pCur, + pCur->curPagerFlags); } #ifdef SQLITE_DEBUG @@ -71472,7 +73517,7 @@ static int moveToRoot(BtCursor *pCur){ } sqlite3BtreeClearCursor(pCur); } - rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage, + rc = getAndInitPage(pCur->pBt, pCur->pgnoRoot, &pCur->pPage, 0, pCur->curPagerFlags); if( rc!=SQLITE_OK ){ pCur->eState = CURSOR_INVALID; @@ -71803,6 +73848,69 @@ moveto_table_finish: return rc; } +/* +** Compare the "idx"-th cell on the page the cursor pCur is currently +** pointing to to pIdxKey using xRecordCompare. Return negative or +** zero if the cell is less than or equal pIdxKey. Return positive +** if unknown. +** +** Return value negative: Cell at pCur[idx] less than pIdxKey +** +** Return value is zero: Cell at pCur[idx] equals pIdxKey +** +** Return value positive: Nothing is known about the relationship +** of the cell at pCur[idx] and pIdxKey. +** +** This routine is part of an optimization. It is always safe to return +** a positive value as that will cause the optimization to be skipped. +*/ +static int indexCellCompare( + BtCursor *pCur, + int idx, + UnpackedRecord *pIdxKey, + RecordCompare xRecordCompare +){ + MemPage *pPage = pCur->pPage; + int c; + int nCell; /* Size of the pCell cell in bytes */ + u8 *pCell = findCellPastPtr(pPage, idx); + + nCell = pCell[0]; + if( nCell<=pPage->max1bytePayload ){ + /* This branch runs if the record-size field of the cell is a + ** single byte varint and the record fits entirely on the main + ** b-tree page. */ + testcase( pCell+nCell+1==pPage->aDataEnd ); + c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); + }else if( !(pCell[1] & 0x80) + && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal + ){ + /* The record-size field is a 2 byte varint and the record + ** fits entirely on the main b-tree page. */ + testcase( pCell+nCell+2==pPage->aDataEnd ); + c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); + }else{ + /* If the record extends into overflow pages, do not attempt + ** the optimization. */ + c = 99; + } + return c; +} + +/* +** Return true (non-zero) if pCur is current pointing to the last +** page of a table. +*/ +static int cursorOnLastPage(BtCursor *pCur){ + int i; + assert( pCur->eState==CURSOR_VALID ); + for(i=0; iiPage; i++){ + MemPage *pPage = pCur->apPage[i]; + if( pCur->aiIdx[i]nCell ) return 0; + } + return 1; +} + /* Move the cursor so that it points to an entry in an index table ** near the key pIdxKey. Return a success code. ** @@ -71853,6 +73961,43 @@ SQLITE_PRIVATE int sqlite3BtreeIndexMoveto( || pIdxKey->default_rc==-1 ); + + /* Check to see if we can skip a lot of work. Two cases: + ** + ** (1) If the cursor is already pointing to the very last cell + ** in the table and the pIdxKey search key is greater than or + ** equal to that last cell, then no movement is required. + ** + ** (2) If the cursor is on the last page of the table and the first + ** cell on that last page is less than or equal to the pIdxKey + ** search key, then we can start the search on the current page + ** without needing to go back to root. + */ + if( pCur->eState==CURSOR_VALID + && pCur->pPage->leaf + && cursorOnLastPage(pCur) + ){ + int c; + if( pCur->ix==pCur->pPage->nCell-1 + && (c = indexCellCompare(pCur, pCur->ix, pIdxKey, xRecordCompare))<=0 + && pIdxKey->errCode==SQLITE_OK + ){ + *pRes = c; + return SQLITE_OK; /* Cursor already pointing at the correct spot */ + } + if( pCur->iPage>0 + && indexCellCompare(pCur, 0, pIdxKey, xRecordCompare)<=0 + && pIdxKey->errCode==SQLITE_OK + ){ + pCur->curFlags &= ~BTCF_ValidOvfl; + if( !pCur->pPage->isInit ){ + return SQLITE_CORRUPT_BKPT; + } + goto bypass_moveto_root; /* Start search on the current page */ + } + pIdxKey->errCode = SQLITE_OK; + } + rc = moveToRoot(pCur); if( rc ){ if( rc==SQLITE_EMPTY ){ @@ -71862,12 +74007,14 @@ SQLITE_PRIVATE int sqlite3BtreeIndexMoveto( } return rc; } + +bypass_moveto_root: assert( pCur->pPage ); assert( pCur->pPage->isInit ); assert( pCur->eState==CURSOR_VALID ); assert( pCur->pPage->nCell > 0 ); - assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey ); - assert( pCur->curIntKey || pIdxKey ); + assert( pCur->curIntKey==0 ); + assert( pIdxKey!=0 ); for(;;){ int lwr, upr, idx, c; Pgno chldPg; @@ -71881,7 +74028,7 @@ SQLITE_PRIVATE int sqlite3BtreeIndexMoveto( ** be the right kind (index or table) of b-tree page. Otherwise ** a moveToChild() or moveToRoot() call would have detected corruption. */ assert( pPage->nCell>0 ); - assert( pPage->intKey==(pIdxKey==0) ); + assert( pPage->intKey==0 ); lwr = 0; upr = pPage->nCell-1; idx = upr>>1; /* idx = (lwr+upr)/2; */ @@ -72078,14 +74225,7 @@ static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){ pPage = pCur->pPage; idx = ++pCur->ix; - if( !pPage->isInit || sqlite3FaultSim(412) ){ - /* The only known way for this to happen is for there to be a - ** recursive SQL function that does a DELETE operation as part of a - ** SELECT which deletes content out from under an active cursor - ** in a corrupt database file where the table being DELETE-ed from - ** has pages in common with the table being queried. See TH3 - ** module cov1/btree78.test testcase 220 (2018-06-08) for an - ** example. */ + if( NEVER(!pPage->isInit) || sqlite3FaultSim(412) ){ return SQLITE_CORRUPT_BKPT; } @@ -72261,8 +74401,8 @@ static int allocateBtreePage( assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); pPage1 = pBt->pPage1; mxPage = btreePagecount(pBt); - /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36 - ** stores stores the total number of pages on the freelist. */ + /* EVIDENCE-OF: R-21003-45125 The 4-byte big-endian integer at offset 36 + ** stores the total number of pages on the freelist. */ n = get4byte(&pPage1->aData[36]); testcase( n==mxPage-1 ); if( n>=mxPage ){ @@ -73011,12 +75151,6 @@ static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ assert( pPage->pBt->usableSize > (u32)(ptr-data) ); pc = get2byte(ptr); hdr = pPage->hdrOffset; -#if 0 /* Not required. Omit for efficiency */ - if( pcnCell*2 ){ - *pRC = SQLITE_CORRUPT_BKPT; - return; - } -#endif testcase( pc==(u32)get2byte(&data[hdr+5]) ); testcase( pc+sz==pPage->pBt->usableSize ); if( pc+sz > pPage->pBt->usableSize ){ @@ -73900,8 +76034,6 @@ static int balance_nonroot( Pgno pgno; /* Temp var to store a page number in */ u8 abDone[NB+2]; /* True after i'th new page is populated */ Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */ - Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */ - u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */ CellArray b; /* Parsed information on cells being balanced */ memset(abDone, 0, sizeof(abDone)); @@ -74325,42 +76457,39 @@ static int balance_nonroot( ** of the table is closer to a linear scan through the file. That in turn ** helps the operating system to deliver pages from the disk more rapidly. ** - ** An O(n^2) insertion sort algorithm is used, but since n is never more - ** than (NB+2) (a small constant), that should not be a problem. + ** An O(N*N) sort algorithm is used, but since N is never more than NB+2 + ** (5), that is not a performance concern. ** ** When NB==3, this one optimization makes the database about 25% faster ** for large insertions and deletions. */ for(i=0; ipgno; - aPgFlags[i] = apNew[i]->pDbPage->flags; - for(j=0; jpgno; + assert( apNew[i]->pDbPage->flags & PGHDR_WRITEABLE ); + assert( apNew[i]->pDbPage->flags & PGHDR_DIRTY ); } - for(i=0; ipgno < apNew[iB]->pgno ) iB = j; } - pgno = aPgOrder[iBest]; - aPgOrder[iBest] = 0xffffffff; - if( iBest!=i ){ - if( iBest>i ){ - sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0); - } - sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]); - apNew[i]->pgno = pgno; + + /* If apNew[i] has a page number that is bigger than any of the + ** subsequence apNew[i] entries, then swap apNew[i] with the subsequent + ** entry that has the smallest page number (which we know to be + ** entry apNew[iB]). + */ + if( iB!=i ){ + Pgno pgnoA = apNew[i]->pgno; + Pgno pgnoB = apNew[iB]->pgno; + Pgno pgnoTemp = (PENDING_BYTE/pBt->pageSize)+1; + u16 fgA = apNew[i]->pDbPage->flags; + u16 fgB = apNew[iB]->pDbPage->flags; + sqlite3PagerRekey(apNew[i]->pDbPage, pgnoTemp, fgB); + sqlite3PagerRekey(apNew[iB]->pDbPage, pgnoA, fgA); + sqlite3PagerRekey(apNew[i]->pDbPage, pgnoB, fgB); + apNew[i]->pgno = pgnoB; + apNew[iB]->pgno = pgnoA; } } @@ -74748,7 +76877,6 @@ static int anotherValidCursor(BtCursor *pCur){ */ static int balance(BtCursor *pCur){ int rc = SQLITE_OK; - const int nMin = pCur->pBt->usableSize * 2 / 3; u8 aBalanceQuickSpace[13]; u8 *pFree = 0; @@ -74760,7 +76888,11 @@ static int balance(BtCursor *pCur){ MemPage *pPage = pCur->pPage; if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break; - if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ + if( pPage->nOverflow==0 && pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ + /* No rebalance required as long as: + ** (1) There are no overflow cells + ** (2) The amount of free space on the page is less than 2/3rds of + ** the total usable space on the page. */ break; }else if( (iPage = pCur->iPage)==0 ){ if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){ @@ -74783,6 +76915,11 @@ static int balance(BtCursor *pCur){ }else{ break; } + }else if( sqlite3PagerPageRefcount(pPage->pDbPage)>1 ){ + /* The page being written is not a root page, and there is currently + ** more than one reference to it. This only happens if the page is one + ** of its own ancestor pages. Corruption. */ + rc = SQLITE_CORRUPT_BKPT; }else{ MemPage * const pParent = pCur->apPage[iPage-1]; int const iIdx = pCur->aiIdx[iPage-1]; @@ -74980,7 +77117,7 @@ static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){ ** pX.pData,nData,nZero fields must be zero. ** ** If the seekResult parameter is non-zero, then a successful call to -** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already +** sqlite3BtreeIndexMoveto() to seek cursor pCur to (pKey,nKey) has already ** been performed. In other words, if seekResult!=0 then the cursor ** is currently pointing to a cell that will be adjacent to the cell ** to be inserted. If seekResult<0 then pCur points to a cell that is @@ -74998,7 +77135,7 @@ SQLITE_PRIVATE int sqlite3BtreeInsert( BtCursor *pCur, /* Insert data into the table of this cursor */ const BtreePayload *pX, /* Content of the row to be inserted */ int flags, /* True if this is likely an append */ - int seekResult /* Result of prior MovetoUnpacked() call */ + int seekResult /* Result of prior IndexMoveto() call */ ){ int rc; int loc = seekResult; /* -1: before desired location +1: after */ @@ -75037,7 +77174,12 @@ SQLITE_PRIVATE int sqlite3BtreeInsert( } } + /* Ensure that the cursor is not in the CURSOR_FAULT state and that it + ** points to a valid cell. + */ if( pCur->eState>=CURSOR_REQUIRESEEK ){ + testcase( pCur->eState==CURSOR_REQUIRESEEK ); + testcase( pCur->eState==CURSOR_FAULT ); rc = moveToRoot(pCur); if( rc && rc!=SQLITE_EMPTY ) return rc; } @@ -75149,7 +77291,8 @@ SQLITE_PRIVATE int sqlite3BtreeInsert( assert( pPage->intKey || pX->nKey>=0 || (flags & BTREE_PREFORMAT) ); assert( pPage->leaf || !pPage->intKey ); if( pPage->nFree<0 ){ - if( pCur->eState>CURSOR_INVALID ){ + if( NEVER(pCur->eState>CURSOR_INVALID) ){ + /* ^^^^^--- due to the moveToRoot() call above */ rc = SQLITE_CORRUPT_BKPT; }else{ rc = btreeComputeFreeSpace(pPage); @@ -75160,7 +77303,7 @@ SQLITE_PRIVATE int sqlite3BtreeInsert( TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); - assert( pPage->isInit ); + assert( pPage->isInit || CORRUPT_DB ); newCell = pBt->pTmpSpace; assert( newCell!=0 ); if( flags & BTREE_PREFORMAT ){ @@ -75311,7 +77454,11 @@ SQLITE_PRIVATE int sqlite3BtreeTransferRow(BtCursor *pDest, BtCursor *pSrc, i64 u32 nRem; /* Bytes of data still to copy */ getCellInfo(pSrc); - aOut += putVarint32(aOut, pSrc->info.nPayload); + if( pSrc->info.nPayload<0x80 ){ + *(aOut++) = pSrc->info.nPayload; + }else{ + aOut += sqlite3PutVarint(aOut, pSrc->info.nPayload); + } if( pDest->pKeyInfo==0 ) aOut += putVarint(aOut, iKey); nIn = pSrc->info.nLocal; aIn = pSrc->info.pPayload; @@ -75471,7 +77618,8 @@ SQLITE_PRIVATE int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ bPreserve = (flags & BTREE_SAVEPOSITION)!=0; if( bPreserve ){ if( !pPage->leaf - || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3) + || (pPage->nFree+pPage->xCellSize(pPage,pCell)+2) > + (int)(pBt->usableSize*2/3) || pPage->nCell==1 /* See dbfuzz001.test for a test case */ ){ /* A b-tree rebalance will be required after deleting this entry. @@ -75567,7 +77715,15 @@ SQLITE_PRIVATE int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){ ** been corrected, so be it. Otherwise, after balancing the leaf node, ** walk the cursor up the tree to the internal node and balance it as ** well. */ - rc = balance(pCur); + assert( pCur->pPage->nOverflow==0 ); + assert( pCur->pPage->nFree>=0 ); + if( pCur->pPage->nFree*3<=(int)pCur->pBt->usableSize*2 ){ + /* Optimization: If the free space is less than 2/3rds of the page, + ** then balance() will always be a no-op. No need to invoke it. */ + rc = SQLITE_OK; + }else{ + rc = balance(pCur); + } if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ releasePageNotNull(pCur->pPage); pCur->iPage--; @@ -77062,6 +79218,17 @@ SQLITE_PRIVATE int sqlite3BtreeIsReadonly(Btree *p){ */ SQLITE_PRIVATE int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); } +/* +** If no transaction is active and the database is not a temp-db, clear +** the in-memory pager cache. +*/ +SQLITE_PRIVATE void sqlite3BtreeClearCache(Btree *p){ + BtShared *pBt = p->pBt; + if( pBt->inTransaction==TRANS_NONE ){ + sqlite3PagerClearCache(pBt->pPager); + } +} + #if !defined(SQLITE_OMIT_SHARED_CACHE) /* ** Return true if the Btree passed as the only argument is sharable. @@ -78315,9 +80482,10 @@ SQLITE_PRIVATE int sqlite3VdbeMemFinalize(Mem *pMem, FuncDef *pFunc){ Mem t; assert( pFunc!=0 ); assert( pMem!=0 ); + assert( pMem->db!=0 ); assert( pFunc->xFinalize!=0 ); assert( (pMem->flags & MEM_Null)!=0 || pFunc==pMem->u.pDef ); - assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); + assert( sqlite3_mutex_held(pMem->db->mutex) ); memset(&ctx, 0, sizeof(ctx)); memset(&t, 0, sizeof(t)); t.flags = MEM_Null; @@ -78325,6 +80493,7 @@ SQLITE_PRIVATE int sqlite3VdbeMemFinalize(Mem *pMem, FuncDef *pFunc){ ctx.pOut = &t; ctx.pMem = pMem; ctx.pFunc = pFunc; + ctx.enc = ENC(t.db); pFunc->xFinalize(&ctx); /* IMP: R-24505-23230 */ assert( (pMem->flags & MEM_Dyn)==0 ); if( pMem->szMalloc>0 ) sqlite3DbFreeNN(pMem->db, pMem->zMalloc); @@ -78346,12 +80515,14 @@ SQLITE_PRIVATE int sqlite3VdbeMemAggValue(Mem *pAccum, Mem *pOut, FuncDef *pFunc assert( pFunc!=0 ); assert( pFunc->xValue!=0 ); assert( (pAccum->flags & MEM_Null)!=0 || pFunc==pAccum->u.pDef ); - assert( pAccum->db==0 || sqlite3_mutex_held(pAccum->db->mutex) ); + assert( pAccum->db!=0 ); + assert( sqlite3_mutex_held(pAccum->db->mutex) ); memset(&ctx, 0, sizeof(ctx)); sqlite3VdbeMemSetNull(pOut); ctx.pOut = pOut; ctx.pMem = pAccum; ctx.pFunc = pFunc; + ctx.enc = ENC(pAccum->db); pFunc->xValue(&ctx); return ctx.isError; } @@ -78417,6 +80588,14 @@ SQLITE_PRIVATE void sqlite3VdbeMemRelease(Mem *p){ } } +/* Like sqlite3VdbeMemRelease() but faster for cases where we +** know in advance that the Mem is not MEM_Dyn or MEM_Agg. +*/ +SQLITE_PRIVATE void sqlite3VdbeMemReleaseMalloc(Mem *p){ + assert( !VdbeMemDynamic(p) ); + if( p->szMalloc ) vdbeMemClear(p); +} + /* ** Convert a 64-bit IEEE double into a 64-bit signed integer. ** If the double is out of range of a 64-bit signed integer then @@ -78595,6 +80774,16 @@ SQLITE_PRIVATE int sqlite3RealSameAsInt(double r1, sqlite3_int64 i){ && i >= -2251799813685248LL && i < 2251799813685248LL); } +/* Convert a floating point value to its closest integer. Do so in +** a way that avoids 'outside the range of representable values' warnings +** from UBSAN. +*/ +SQLITE_PRIVATE i64 sqlite3RealToI64(double r){ + if( r<=(double)SMALLEST_INT64 ) return SMALLEST_INT64; + if( r>=(double)LARGEST_INT64) return LARGEST_INT64; + return (i64)r; +} + /* ** Convert pMem so that it has type MEM_Real or MEM_Int. ** Invalidate any prior representations. @@ -78616,7 +80805,7 @@ SQLITE_PRIVATE int sqlite3VdbeMemNumerify(Mem *pMem){ assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); rc = sqlite3AtoF(pMem->z, &pMem->u.r, pMem->n, pMem->enc); if( ((rc==0 || rc==1) && sqlite3Atoi64(pMem->z, &ix, pMem->n, pMem->enc)<=1) - || sqlite3RealSameAsInt(pMem->u.r, (ix = (i64)pMem->u.r)) + || sqlite3RealSameAsInt(pMem->u.r, (ix = sqlite3RealToI64(pMem->u.r))) ){ pMem->u.i = ix; MemSetTypeFlag(pMem, MEM_Int); @@ -78668,6 +80857,7 @@ SQLITE_PRIVATE int sqlite3VdbeMemCast(Mem *pMem, u8 aff, u8 encoding){ sqlite3ValueApplyAffinity(pMem, SQLITE_AFF_TEXT, encoding); assert( pMem->flags & MEM_Str || pMem->db->mallocFailed ); pMem->flags &= ~(MEM_Int|MEM_Real|MEM_IntReal|MEM_Blob|MEM_Zero); + if( encoding!=SQLITE_UTF8 ) pMem->n &= ~1; return sqlite3VdbeChangeEncoding(pMem, encoding); } } @@ -78961,6 +81151,13 @@ SQLITE_PRIVATE void sqlite3VdbeMemMove(Mem *pTo, Mem *pFrom){ ** stored without allocating memory, then it is. If a memory allocation ** is required to store the string, then value of pMem is unchanged. In ** either case, SQLITE_TOOBIG is returned. +** +** The "enc" parameter is the text encoding for the string, or zero +** to store a blob. +** +** If n is negative, then the string consists of all bytes up to but +** excluding the first zero character. The n parameter must be +** non-negative for blobs. */ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( Mem *pMem, /* Memory cell to set to string value */ @@ -78971,11 +81168,12 @@ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( ){ i64 nByte = n; /* New value for pMem->n */ int iLimit; /* Maximum allowed string or blob size */ - u16 flags = 0; /* New value for pMem->flags */ + u16 flags; /* New value for pMem->flags */ assert( pMem!=0 ); assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); assert( !sqlite3VdbeMemIsRowSet(pMem) ); + assert( enc!=0 || n>=0 ); /* If z is a NULL pointer, set pMem to contain an SQL NULL. */ if( !z ){ @@ -78988,7 +81186,6 @@ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( }else{ iLimit = SQLITE_MAX_LENGTH; } - flags = (enc==0?MEM_Blob:MEM_Str); if( nByte<0 ){ assert( enc!=0 ); if( enc==SQLITE_UTF8 ){ @@ -78996,7 +81193,23 @@ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( }else{ for(nByte=0; nByte<=iLimit && (z[nByte] | z[nByte+1]); nByte+=2){} } - flags |= MEM_Term; + flags= MEM_Str|MEM_Term; + }else if( enc==0 ){ + flags = MEM_Blob; + enc = SQLITE_UTF8; + }else{ + flags = MEM_Str; + } + if( nByte>iLimit ){ + if( xDel && xDel!=SQLITE_TRANSIENT ){ + if( xDel==SQLITE_DYNAMIC ){ + sqlite3DbFree(pMem->db, (void*)z); + }else{ + xDel((void*)z); + } + } + sqlite3VdbeMemSetNull(pMem); + return sqlite3ErrorToParser(pMem->db, SQLITE_TOOBIG); } /* The following block sets the new values of Mem.z and Mem.xDel. It @@ -79008,9 +81221,6 @@ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( if( flags&MEM_Term ){ nAlloc += (enc==SQLITE_UTF8?1:2); } - if( nByte>iLimit ){ - return sqlite3ErrorToParser(pMem->db, SQLITE_TOOBIG); - } testcase( nAlloc==0 ); testcase( nAlloc==31 ); testcase( nAlloc==32 ); @@ -79032,16 +81242,7 @@ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( pMem->n = (int)(nByte & 0x7fffffff); pMem->flags = flags; - if( enc ){ - pMem->enc = enc; -#ifdef SQLITE_ENABLE_SESSION - }else if( pMem->db==0 ){ - pMem->enc = SQLITE_UTF8; -#endif - }else{ - assert( pMem->db!=0 ); - pMem->enc = ENC(pMem->db); - } + pMem->enc = enc; #ifndef SQLITE_OMIT_UTF16 if( enc>SQLITE_UTF8 && sqlite3VdbeMemHandleBom(pMem) ){ @@ -79049,9 +81250,6 @@ SQLITE_PRIVATE int sqlite3VdbeMemSetStr( } #endif - if( nByte>iLimit ){ - return sqlite3ErrorToParser(pMem->db, SQLITE_TOOBIG); - } return SQLITE_OK; } @@ -79329,10 +81527,12 @@ static int valueFromFunction( goto value_from_function_out; } - assert( pCtx->pParse->rc==SQLITE_OK ); + testcase( pCtx->pParse->rc==SQLITE_ERROR ); + testcase( pCtx->pParse->rc==SQLITE_OK ); memset(&ctx, 0, sizeof(ctx)); ctx.pOut = pVal; ctx.pFunc = pFunc; + ctx.enc = ENC(db); pFunc->xSFunc(&ctx, nVal, apVal); if( ctx.isError ){ rc = ctx.isError; @@ -79408,8 +81608,8 @@ static int valueFromExpr( rc = valueFromExpr(db, pExpr->pLeft, enc, aff, ppVal, pCtx); testcase( rc!=SQLITE_OK ); if( *ppVal ){ - sqlite3VdbeMemCast(*ppVal, aff, SQLITE_UTF8); - sqlite3ValueApplyAffinity(*ppVal, affinity, SQLITE_UTF8); + sqlite3VdbeMemCast(*ppVal, aff, enc); + sqlite3ValueApplyAffinity(*ppVal, affinity, enc); } return rc; } @@ -79793,6 +81993,9 @@ SQLITE_PRIVATE int sqlite3ValueBytes(sqlite3_value *pVal, u8 enc){ if( (p->flags & MEM_Str)!=0 && pVal->enc==enc ){ return p->n; } + if( (p->flags & MEM_Str)!=0 && enc!=SQLITE_UTF8 && pVal->enc!=SQLITE_UTF8 ){ + return p->n; + } if( (p->flags & MEM_Blob)!=0 ){ if( p->flags & MEM_Zero ){ return p->n + p->u.nZero; @@ -79838,12 +82041,12 @@ SQLITE_PRIVATE Vdbe *sqlite3VdbeCreate(Parse *pParse){ memset(&p->aOp, 0, sizeof(Vdbe)-offsetof(Vdbe,aOp)); p->db = db; if( db->pVdbe ){ - db->pVdbe->pPrev = p; + db->pVdbe->ppVPrev = &p->pVNext; } - p->pNext = db->pVdbe; - p->pPrev = 0; + p->pVNext = db->pVdbe; + p->ppVPrev = &db->pVdbe; db->pVdbe = p; - p->iVdbeMagic = VDBE_MAGIC_INIT; + assert( p->eVdbeState==VDBE_INIT_STATE ); p->pParse = pParse; pParse->pVdbe = p; assert( pParse->aLabel==0 ); @@ -79923,21 +82126,28 @@ SQLITE_PRIVATE int sqlite3VdbeUsesDoubleQuotedString( #endif /* -** Swap all content between two VDBE structures. +** Swap byte-code between two VDBE structures. +** +** This happens after pB was previously run and returned +** SQLITE_SCHEMA. The statement was then reprepared in pA. +** This routine transfers the new bytecode in pA over to pB +** so that pB can be run again. The old pB byte code is +** moved back to pA so that it will be cleaned up when pA is +** finalized. */ SQLITE_PRIVATE void sqlite3VdbeSwap(Vdbe *pA, Vdbe *pB){ - Vdbe tmp, *pTmp; + Vdbe tmp, *pTmp, **ppTmp; char *zTmp; assert( pA->db==pB->db ); tmp = *pA; *pA = *pB; *pB = tmp; - pTmp = pA->pNext; - pA->pNext = pB->pNext; - pB->pNext = pTmp; - pTmp = pA->pPrev; - pA->pPrev = pB->pPrev; - pB->pPrev = pTmp; + pTmp = pA->pVNext; + pA->pVNext = pB->pVNext; + pB->pVNext = pTmp; + ppTmp = pA->ppVPrev; + pA->ppVPrev = pB->ppVPrev; + pB->ppVPrev = ppTmp; zTmp = pA->zSql; pA->zSql = pB->zSql; pB->zSql = zTmp; @@ -79988,7 +82198,7 @@ static int growOpArray(Vdbe *v, int nOp){ return SQLITE_NOMEM; } - assert( nOp<=(1024/sizeof(Op)) ); + assert( nOp<=(int)(1024/sizeof(Op)) ); assert( nNew>=(v->nOpAlloc+nOp) ); pNew = sqlite3DbRealloc(p->db, v->aOp, nNew*sizeof(Op)); if( pNew ){ @@ -80044,7 +82254,7 @@ SQLITE_PRIVATE int sqlite3VdbeAddOp3(Vdbe *p, int op, int p1, int p2, int p3){ VdbeOp *pOp; i = p->nOp; - assert( p->iVdbeMagic==VDBE_MAGIC_INIT ); + assert( p->eVdbeState==VDBE_INIT_STATE ); assert( op>=0 && op<0xff ); if( p->nOpAlloc<=i ){ return growOp3(p, op, p1, p2, p3); @@ -80189,6 +82399,7 @@ SQLITE_PRIVATE int sqlite3VdbeAddFunctionCall( addr = sqlite3VdbeAddOp4(v, eCallCtx ? OP_PureFunc : OP_Function, p1, p2, p3, (char*)pCtx, P4_FUNCCTX); sqlite3VdbeChangeP5(v, eCallCtx & NC_SelfRef); + sqlite3MayAbort(pParse); return addr; } @@ -80257,7 +82468,7 @@ SQLITE_PRIVATE void sqlite3VdbeExplain(Parse *pParse, u8 bPush, const char *zFmt iThis = v->nOp; sqlite3VdbeAddOp4(v, OP_Explain, iThis, pParse->addrExplain, 0, zMsg, P4_DYNAMIC); - sqlite3ExplainBreakpoint(bPush?"PUSH":"", sqlite3VdbeGetOp(v,-1)->p4.z); + sqlite3ExplainBreakpoint(bPush?"PUSH":"", sqlite3VdbeGetLastOp(v)->p4.z); if( bPush){ pParse->addrExplain = iThis; } @@ -80376,7 +82587,7 @@ static SQLITE_NOINLINE void resizeResolveLabel(Parse *p, Vdbe *v, int j){ SQLITE_PRIVATE void sqlite3VdbeResolveLabel(Vdbe *v, int x){ Parse *p = v->pParse; int j = ADDR(x); - assert( v->iVdbeMagic==VDBE_MAGIC_INIT ); + assert( v->eVdbeState==VDBE_INIT_STATE ); assert( j<-p->nLabel ); assert( j>=0 ); #ifdef SQLITE_DEBUG @@ -80396,14 +82607,20 @@ SQLITE_PRIVATE void sqlite3VdbeResolveLabel(Vdbe *v, int x){ ** Mark the VDBE as one that can only be run one time. */ SQLITE_PRIVATE void sqlite3VdbeRunOnlyOnce(Vdbe *p){ - p->runOnlyOnce = 1; + sqlite3VdbeAddOp2(p, OP_Expire, 1, 1); } /* -** Mark the VDBE as one that can only be run multiple times. +** Mark the VDBE as one that can be run multiple times. */ SQLITE_PRIVATE void sqlite3VdbeReusable(Vdbe *p){ - p->runOnlyOnce = 0; + int i; + for(i=1; ALWAYS(inOp); i++){ + if( ALWAYS(p->aOp[i].opcode==OP_Expire) ){ + p->aOp[1].opcode = OP_Noop; + break; + } + } } #ifdef SQLITE_DEBUG /* sqlite3AssertMayAbort() logic */ @@ -80507,6 +82724,8 @@ SQLITE_PRIVATE int sqlite3VdbeAssertMayAbort(Vdbe *v, int mayAbort){ int hasInitCoroutine = 0; Op *pOp; VdbeOpIter sIter; + + if( v==0 ) return 0; memset(&sIter, 0, sizeof(sIter)); sIter.v = v; @@ -80516,6 +82735,7 @@ SQLITE_PRIVATE int sqlite3VdbeAssertMayAbort(Vdbe *v, int mayAbort){ || opcode==OP_VDestroy || opcode==OP_VCreate || opcode==OP_ParseSchema + || opcode==OP_Function || opcode==OP_PureFunc || ((opcode==OP_Halt || opcode==OP_HaltIfNull) && ((pOp->p1)!=SQLITE_OK && pOp->p2==OE_Abort)) ){ @@ -80590,7 +82810,7 @@ SQLITE_PRIVATE void sqlite3VdbeAssertAbortable(Vdbe *p){ ** (3) Update the Vdbe.readOnly and Vdbe.bIsReader flags to accurately ** indicate what the prepared statement actually does. ** -** (4) Initialize the p4.xAdvance pointer on opcodes that use it. +** (4) (discontinued) ** ** (5) Reclaim the memory allocated for storing labels. ** @@ -80606,8 +82826,8 @@ static void resolveP2Values(Vdbe *p, int *pMaxFuncArgs){ p->readOnly = 1; p->bIsReader = 0; pOp = &p->aOp[p->nOp-1]; - while(1){ - + assert( p->aOp[0].opcode==OP_Init ); + while( 1 /* Loop termates when it reaches the OP_Init opcode */ ){ /* Only JUMP opcodes and the short list of special opcodes in the switch ** below need to be considered. The mkopcodeh.tcl generator script groups ** all these opcodes together near the front of the opcode list. Skip @@ -80636,24 +82856,9 @@ static void resolveP2Values(Vdbe *p, int *pMaxFuncArgs){ p->bIsReader = 1; break; } - case OP_Next: - case OP_SorterNext: { - pOp->p4.xAdvance = sqlite3BtreeNext; - pOp->p4type = P4_ADVANCE; - /* The code generator never codes any of these opcodes as a jump - ** to a label. They are always coded as a jump backwards to a - ** known address */ - assert( pOp->p2>=0 ); - break; - } - case OP_Prev: { - pOp->p4.xAdvance = sqlite3BtreePrevious; - pOp->p4type = P4_ADVANCE; - /* The code generator never codes any of these opcodes as a jump - ** to a label. They are always coded as a jump backwards to a - ** known address */ + case OP_Init: { assert( pOp->p2>=0 ); - break; + goto resolve_p2_values_loop_exit; } #ifndef SQLITE_OMIT_VIRTUALTABLE case OP_VUpdate: { @@ -80687,21 +82892,108 @@ static void resolveP2Values(Vdbe *p, int *pMaxFuncArgs){ ** have non-negative values for P2. */ assert( (sqlite3OpcodeProperty[pOp->opcode]&OPFLG_JUMP)==0 || pOp->p2>=0); } - if( pOp==p->aOp ) break; + assert( pOp>p->aOp ); pOp--; } - sqlite3DbFree(p->db, pParse->aLabel); - pParse->aLabel = 0; +resolve_p2_values_loop_exit: + if( aLabel ){ + sqlite3DbNNFreeNN(p->db, pParse->aLabel); + pParse->aLabel = 0; + } pParse->nLabel = 0; *pMaxFuncArgs = nMaxArgs; assert( p->bIsReader!=0 || DbMaskAllZero(p->btreeMask) ); } +#ifdef SQLITE_DEBUG +/* +** Check to see if a subroutine contains a jump to a location outside of +** the subroutine. If a jump outside the subroutine is detected, add code +** that will cause the program to halt with an error message. +** +** The subroutine consists of opcodes between iFirst and iLast. Jumps to +** locations within the subroutine are acceptable. iRetReg is a register +** that contains the return address. Jumps to outside the range of iFirst +** through iLast are also acceptable as long as the jump destination is +** an OP_Return to iReturnAddr. +** +** A jump to an unresolved label means that the jump destination will be +** beyond the current address. That is normally a jump to an early +** termination and is consider acceptable. +** +** This routine only runs during debug builds. The purpose is (of course) +** to detect invalid escapes out of a subroutine. The OP_Halt opcode +** is generated rather than an assert() or other error, so that ".eqp full" +** will still work to show the original bytecode, to aid in debugging. +*/ +SQLITE_PRIVATE void sqlite3VdbeNoJumpsOutsideSubrtn( + Vdbe *v, /* The byte-code program under construction */ + int iFirst, /* First opcode of the subroutine */ + int iLast, /* Last opcode of the subroutine */ + int iRetReg /* Subroutine return address register */ +){ + VdbeOp *pOp; + Parse *pParse; + int i; + sqlite3_str *pErr = 0; + assert( v!=0 ); + pParse = v->pParse; + assert( pParse!=0 ); + if( pParse->nErr ) return; + assert( iLast>=iFirst ); + assert( iLastnOp ); + pOp = &v->aOp[iFirst]; + for(i=iFirst; i<=iLast; i++, pOp++){ + if( (sqlite3OpcodeProperty[pOp->opcode] & OPFLG_JUMP)!=0 ){ + int iDest = pOp->p2; /* Jump destination */ + if( iDest==0 ) continue; + if( pOp->opcode==OP_Gosub ) continue; + if( iDest<0 ){ + int j = ADDR(iDest); + assert( j>=0 ); + if( j>=-pParse->nLabel || pParse->aLabel[j]<0 ){ + continue; + } + iDest = pParse->aLabel[j]; + } + if( iDestiLast ){ + int j = iDest; + for(; jnOp; j++){ + VdbeOp *pX = &v->aOp[j]; + if( pX->opcode==OP_Return ){ + if( pX->p1==iRetReg ) break; + continue; + } + if( pX->opcode==OP_Noop ) continue; + if( pX->opcode==OP_Explain ) continue; + if( pErr==0 ){ + pErr = sqlite3_str_new(0); + }else{ + sqlite3_str_appendchar(pErr, 1, '\n'); + } + sqlite3_str_appendf(pErr, + "Opcode at %d jumps to %d which is outside the " + "subroutine at %d..%d", + i, iDest, iFirst, iLast); + break; + } + } + } + } + if( pErr ){ + char *zErr = sqlite3_str_finish(pErr); + sqlite3VdbeAddOp4(v, OP_Halt, SQLITE_INTERNAL, OE_Abort, 0, zErr, 0); + sqlite3_free(zErr); + sqlite3MayAbort(pParse); + } +} +#endif /* SQLITE_DEBUG */ + /* ** Return the address of the next instruction to be inserted. */ SQLITE_PRIVATE int sqlite3VdbeCurrentAddr(Vdbe *p){ - assert( p->iVdbeMagic==VDBE_MAGIC_INIT ); + assert( p->eVdbeState==VDBE_INIT_STATE ); return p->nOp; } @@ -80786,7 +83078,7 @@ SQLITE_PRIVATE VdbeOp *sqlite3VdbeAddOpList( int i; VdbeOp *pOut, *pFirst; assert( nOp>0 ); - assert( p->iVdbeMagic==VDBE_MAGIC_INIT ); + assert( p->eVdbeState==VDBE_INIT_STATE ); if( p->nOp + nOp > p->nOpAlloc && growOpArray(p, nOp) ){ return 0; } @@ -80854,15 +83146,19 @@ SQLITE_PRIVATE void sqlite3VdbeScanStatus( ** for a specific instruction. */ SQLITE_PRIVATE void sqlite3VdbeChangeOpcode(Vdbe *p, int addr, u8 iNewOpcode){ + assert( addr>=0 ); sqlite3VdbeGetOp(p,addr)->opcode = iNewOpcode; } SQLITE_PRIVATE void sqlite3VdbeChangeP1(Vdbe *p, int addr, int val){ + assert( addr>=0 ); sqlite3VdbeGetOp(p,addr)->p1 = val; } SQLITE_PRIVATE void sqlite3VdbeChangeP2(Vdbe *p, int addr, int val){ + assert( addr>=0 || p->db->mallocFailed ); sqlite3VdbeGetOp(p,addr)->p2 = val; } SQLITE_PRIVATE void sqlite3VdbeChangeP3(Vdbe *p, int addr, int val){ + assert( addr>=0 ); sqlite3VdbeGetOp(p,addr)->p3 = val; } SQLITE_PRIVATE void sqlite3VdbeChangeP5(Vdbe *p, u16 p5){ @@ -80870,6 +83166,18 @@ SQLITE_PRIVATE void sqlite3VdbeChangeP5(Vdbe *p, u16 p5){ if( p->nOp>0 ) p->aOp[p->nOp-1].p5 = p5; } +/* +** If the previous opcode is an OP_Column that delivers results +** into register iDest, then add the OPFLAG_TYPEOFARG flag to that +** opcode. +*/ +SQLITE_PRIVATE void sqlite3VdbeTypeofColumn(Vdbe *p, int iDest){ + VdbeOp *pOp = sqlite3VdbeGetLastOp(p); + if( pOp->p3==iDest && pOp->opcode==OP_Column ){ + pOp->p5 |= OPFLAG_TYPEOFARG; + } +} + /* ** Change the P2 operand of instruction addr so that it points to ** the address of the next instruction to be coded. @@ -80898,7 +83206,7 @@ SQLITE_PRIVATE void sqlite3VdbeJumpHereOrPopInst(Vdbe *p, int addr){ || p->aOp[addr].opcode==OP_FkIfZero ); assert( p->aOp[addr].p4type==0 ); #ifdef SQLITE_VDBE_COVERAGE - sqlite3VdbeGetOp(p,-1)->iSrcLine = 0; /* Erase VdbeCoverage() macros */ + sqlite3VdbeGetLastOp(p)->iSrcLine = 0; /* Erase VdbeCoverage() macros */ #endif p->nOp--; }else{ @@ -80912,8 +83220,9 @@ SQLITE_PRIVATE void sqlite3VdbeJumpHereOrPopInst(Vdbe *p, int addr){ ** the FuncDef is not ephermal, then do nothing. */ static void freeEphemeralFunction(sqlite3 *db, FuncDef *pDef){ + assert( db!=0 ); if( (pDef->funcFlags & SQLITE_FUNC_EPHEM)!=0 ){ - sqlite3DbFreeNN(db, pDef); + sqlite3DbNNFreeNN(db, pDef); } } @@ -80922,11 +83231,12 @@ static void freeEphemeralFunction(sqlite3 *db, FuncDef *pDef){ */ static SQLITE_NOINLINE void freeP4Mem(sqlite3 *db, Mem *p){ if( p->szMalloc ) sqlite3DbFree(db, p->zMalloc); - sqlite3DbFreeNN(db, p); + sqlite3DbNNFreeNN(db, p); } static SQLITE_NOINLINE void freeP4FuncCtx(sqlite3 *db, sqlite3_context *p){ + assert( db!=0 ); freeEphemeralFunction(db, p->pFunc); - sqlite3DbFreeNN(db, p); + sqlite3DbNNFreeNN(db, p); } static void freeP4(sqlite3 *db, int p4type, void *p4){ assert( db ); @@ -80938,9 +83248,8 @@ static void freeP4(sqlite3 *db, int p4type, void *p4){ case P4_REAL: case P4_INT64: case P4_DYNAMIC: - case P4_DYNBLOB: case P4_INTARRAY: { - sqlite3DbFree(db, p4); + if( p4 ) sqlite3DbNNFreeNN(db, p4); break; } case P4_KEYINFO: { @@ -80978,15 +83287,19 @@ static void freeP4(sqlite3 *db, int p4type, void *p4){ ** nOp entries. */ static void vdbeFreeOpArray(sqlite3 *db, Op *aOp, int nOp){ + assert( nOp>=0 ); + assert( db!=0 ); if( aOp ){ - Op *pOp; - for(pOp=&aOp[nOp-1]; pOp>=aOp; pOp--){ + Op *pOp = &aOp[nOp-1]; + while(1){ /* Exit via break */ if( pOp->p4type <= P4_FREE_IF_LE ) freeP4(db, pOp->p4type, pOp->p4.p); #ifdef SQLITE_ENABLE_EXPLAIN_COMMENTS sqlite3DbFree(db, pOp->zComment); #endif + if( pOp==aOp ) break; + pOp--; } - sqlite3DbFreeNN(db, aOp); + sqlite3DbNNFreeNN(db, aOp); } } @@ -81046,7 +83359,7 @@ SQLITE_PRIVATE void sqlite3VdbeReleaseRegisters( u32 mask, /* Mask of registers to NOT release */ int bUndefine /* If true, mark registers as undefined */ ){ - if( N==0 ) return; + if( N==0 || OptimizationDisabled(pParse->db, SQLITE_ReleaseReg) ) return; assert( pParse->pVdbe ); assert( iFirst>=1 ); assert( iFirst+N-1<=pParse->nMem ); @@ -81110,7 +83423,7 @@ SQLITE_PRIVATE void sqlite3VdbeChangeP4(Vdbe *p, int addr, const char *zP4, int sqlite3 *db; assert( p!=0 ); db = p->db; - assert( p->iVdbeMagic==VDBE_MAGIC_INIT ); + assert( p->eVdbeState==VDBE_INIT_STATE ); assert( p->aOp!=0 || db->mallocFailed ); if( db->mallocFailed ){ if( n!=P4_VTAB ) freeP4(db, n, (void*)*(char**)&zP4); @@ -81155,7 +83468,7 @@ SQLITE_PRIVATE void sqlite3VdbeAppendP4(Vdbe *p, void *pP4, int n){ if( p->db->mallocFailed ){ freeP4(p->db, n, pP4); }else{ - assert( pP4!=0 ); + assert( pP4!=0 || n==P4_DYNAMIC ); assert( p->nOp>0 ); pOp = &p->aOp[p->nOp-1]; assert( pOp->p4type==P4_NOTUSED ); @@ -81217,13 +83530,13 @@ SQLITE_PRIVATE void sqlite3VdbeNoopComment(Vdbe *p, const char *zFormat, ...){ ** Set the value if the iSrcLine field for the previously coded instruction. */ SQLITE_PRIVATE void sqlite3VdbeSetLineNumber(Vdbe *v, int iLine){ - sqlite3VdbeGetOp(v,-1)->iSrcLine = iLine; + sqlite3VdbeGetLastOp(v)->iSrcLine = iLine; } #endif /* SQLITE_VDBE_COVERAGE */ /* -** Return the opcode for a given address. If the address is -1, then -** return the most recently inserted opcode. +** Return the opcode for a given address. The address must be non-negative. +** See sqlite3VdbeGetLastOp() to get the most recently added opcode. ** ** If a memory allocation error has occurred prior to the calling of this ** routine, then a pointer to a dummy VdbeOp will be returned. That opcode @@ -81238,10 +83551,7 @@ SQLITE_PRIVATE VdbeOp *sqlite3VdbeGetOp(Vdbe *p, int addr){ /* C89 specifies that the constant "dummy" will be initialized to all ** zeros, which is correct. MSVC generates a warning, nevertheless. */ static VdbeOp dummy; /* Ignore the MSVC warning about no initializer */ - assert( p->iVdbeMagic==VDBE_MAGIC_INIT ); - if( addr<0 ){ - addr = p->nOp - 1; - } + assert( p->eVdbeState==VDBE_INIT_STATE ); assert( (addr>=0 && addrnOp) || p->db->mallocFailed ); if( p->db->mallocFailed ){ return (VdbeOp*)&dummy; @@ -81250,6 +83560,12 @@ SQLITE_PRIVATE VdbeOp *sqlite3VdbeGetOp(Vdbe *p, int addr){ } } +/* Return the most recently added opcode +*/ +VdbeOp * sqlite3VdbeGetLastOp(Vdbe *p){ + return sqlite3VdbeGetOp(p, p->nOp - 1); +} + #if defined(SQLITE_ENABLE_EXPLAIN_COMMENTS) /* ** Return an integer value for one of the parameters to the opcode pOp @@ -81305,8 +83621,11 @@ SQLITE_PRIVATE char *sqlite3VdbeDisplayComment( if( c=='4' ){ sqlite3_str_appendall(&x, zP4); }else if( c=='X' ){ - sqlite3_str_appendall(&x, pOp->zComment); - seenCom = 1; + if( pOp->zComment && pOp->zComment[0] ){ + sqlite3_str_appendall(&x, pOp->zComment); + seenCom = 1; + break; + } }else{ int v1 = translateP(c, pOp); int v2; @@ -81535,10 +83854,6 @@ SQLITE_PRIVATE char *sqlite3VdbeDisplayP4(sqlite3 *db, Op *pOp){ zP4 = "program"; break; } - case P4_DYNBLOB: - case P4_ADVANCE: { - break; - } case P4_TABLE: { zP4 = pOp->p4.pTab->zName; break; @@ -81670,21 +83985,40 @@ SQLITE_PRIVATE void sqlite3VdbePrintOp(FILE *pOut, int pc, VdbeOp *pOp){ /* ** Initialize an array of N Mem element. +** +** This is a high-runner, so only those fields that really do need to +** be initialized are set. The Mem structure is organized so that +** the fields that get initialized are nearby and hopefully on the same +** cache line. +** +** Mem.flags = flags +** Mem.db = db +** Mem.szMalloc = 0 +** +** All other fields of Mem can safely remain uninitialized for now. They +** will be initialized before use. */ static void initMemArray(Mem *p, int N, sqlite3 *db, u16 flags){ - while( (N--)>0 ){ - p->db = db; - p->flags = flags; - p->szMalloc = 0; + if( N>0 ){ + do{ + p->flags = flags; + p->db = db; + p->szMalloc = 0; #ifdef SQLITE_DEBUG - p->pScopyFrom = 0; + p->pScopyFrom = 0; #endif - p++; + p++; + }while( (--N)>0 ); } } /* -** Release an array of N Mem elements +** Release auxiliary memory held in an array of N Mem elements. +** +** After this routine returns, all Mem elements in the array will still +** be valid. Those Mem elements that were not holding auxiliary resources +** will be unchanged. Mem elements which had something freed will be +** set to MEM_Undefined. */ static void releaseMemArray(Mem *p, int N){ if( p && N ){ @@ -81717,12 +84051,17 @@ static void releaseMemArray(Mem *p, int N){ if( p->flags&(MEM_Agg|MEM_Dyn) ){ testcase( (p->flags & MEM_Dyn)!=0 && p->xDel==sqlite3VdbeFrameMemDel ); sqlite3VdbeMemRelease(p); + p->flags = MEM_Undefined; }else if( p->szMalloc ){ - sqlite3DbFreeNN(db, p->zMalloc); + sqlite3DbNNFreeNN(db, p->zMalloc); p->szMalloc = 0; + p->flags = MEM_Undefined; } - - p->flags = MEM_Undefined; +#ifdef SQLITE_DEBUG + else{ + p->flags = MEM_Undefined; + } +#endif }while( (++p)nChildMem]; assert( sqlite3VdbeFrameIsValid(p) ); for(i=0; inChildCsr; i++){ - sqlite3VdbeFreeCursor(p->v, apCsr[i]); + if( apCsr[i] ) sqlite3VdbeFreeCursorNN(p->v, apCsr[i]); } releaseMemArray(aMem, p->nChildMem); sqlite3VdbeDeleteAuxData(p->v->db, &p->pAuxData, -1, 0); @@ -81920,7 +84259,7 @@ SQLITE_PRIVATE int sqlite3VdbeList( Op *pOp; /* Current opcode */ assert( p->explain ); - assert( p->iVdbeMagic==VDBE_MAGIC_RUN ); + assert( p->eVdbeState==VDBE_RUN_STATE ); assert( p->rc==SQLITE_OK || p->rc==SQLITE_BUSY || p->rc==SQLITE_NOMEM ); /* Even though this opcode does not use dynamic strings for @@ -82075,11 +84414,11 @@ struct ReusableSpace { static void *allocSpace( struct ReusableSpace *p, /* Bulk memory available for allocation */ void *pBuf, /* Pointer to a prior allocation */ - sqlite3_int64 nByte /* Bytes of memory needed */ + sqlite3_int64 nByte /* Bytes of memory needed. */ ){ assert( EIGHT_BYTE_ALIGNMENT(p->pSpace) ); if( pBuf==0 ){ - nByte = ROUND8(nByte); + nByte = ROUND8P(nByte); if( nByte <= p->nFree ){ p->nFree -= nByte; pBuf = &p->pSpace[p->nFree]; @@ -82100,14 +84439,15 @@ SQLITE_PRIVATE void sqlite3VdbeRewind(Vdbe *p){ int i; #endif assert( p!=0 ); - assert( p->iVdbeMagic==VDBE_MAGIC_INIT || p->iVdbeMagic==VDBE_MAGIC_RESET ); + assert( p->eVdbeState==VDBE_INIT_STATE + || p->eVdbeState==VDBE_READY_STATE + || p->eVdbeState==VDBE_HALT_STATE ); /* There should be at least one opcode. */ assert( p->nOp>0 ); - /* Set the magic to VDBE_MAGIC_RUN sooner rather than later. */ - p->iVdbeMagic = VDBE_MAGIC_RUN; + p->eVdbeState = VDBE_READY_STATE; #ifdef SQLITE_DEBUG for(i=0; inMem; i++){ @@ -82163,7 +84503,7 @@ SQLITE_PRIVATE void sqlite3VdbeMakeReady( assert( p!=0 ); assert( p->nOp>0 ); assert( pParse!=0 ); - assert( p->iVdbeMagic==VDBE_MAGIC_INIT ); + assert( p->eVdbeState==VDBE_INIT_STATE ); assert( pParse==p->pParse ); p->pVList = pParse->pVList; pParse->pVList = 0; @@ -82186,7 +84526,7 @@ SQLITE_PRIVATE void sqlite3VdbeMakeReady( ** opcode array. This extra memory will be reallocated for other elements ** of the prepared statement. */ - n = ROUND8(sizeof(Op)*p->nOp); /* Bytes of opcode memory used */ + n = ROUND8P(sizeof(Op)*p->nOp); /* Bytes of opcode memory used */ x.pSpace = &((u8*)p->aOp)[n]; /* Unused opcode memory */ assert( EIGHT_BYTE_ALIGNMENT(x.pSpace) ); x.nFree = ROUNDDOWN8(pParse->szOpAlloc - n); /* Bytes of unused memory */ @@ -82274,9 +84614,9 @@ SQLITE_PRIVATE void sqlite3VdbeMakeReady( ** happens to hold. */ SQLITE_PRIVATE void sqlite3VdbeFreeCursor(Vdbe *p, VdbeCursor *pCx){ - if( pCx==0 ){ - return; - } + if( pCx ) sqlite3VdbeFreeCursorNN(p,pCx); +} +SQLITE_PRIVATE void sqlite3VdbeFreeCursorNN(Vdbe *p, VdbeCursor *pCx){ switch( pCx->eCurType ){ case CURTYPE_SORTER: { sqlite3VdbeSorterClose(p->db, pCx); @@ -82304,14 +84644,12 @@ SQLITE_PRIVATE void sqlite3VdbeFreeCursor(Vdbe *p, VdbeCursor *pCx){ ** Close all cursors in the current frame. */ static void closeCursorsInFrame(Vdbe *p){ - if( p->apCsr ){ - int i; - for(i=0; inCursor; i++){ - VdbeCursor *pC = p->apCsr[i]; - if( pC ){ - sqlite3VdbeFreeCursor(p, pC); - p->apCsr[i] = 0; - } + int i; + for(i=0; inCursor; i++){ + VdbeCursor *pC = p->apCsr[i]; + if( pC ){ + sqlite3VdbeFreeCursorNN(p, pC); + p->apCsr[i] = 0; } } } @@ -82360,9 +84698,7 @@ static void closeAllCursors(Vdbe *p){ } assert( p->nFrame==0 ); closeCursorsInFrame(p); - if( p->aMem ){ - releaseMemArray(p->aMem, p->nMem); - } + releaseMemArray(p->aMem, p->nMem); while( p->pDelFrame ){ VdbeFrame *pDel = p->pDelFrame; p->pDelFrame = pDel->pParent; @@ -82709,7 +85045,7 @@ static void checkActiveVdbeCnt(sqlite3 *db){ if( p->readOnly==0 ) nWrite++; if( p->bIsReader ) nRead++; } - p = p->pNext; + p = p->pVNext; } assert( cnt==db->nVdbeActive ); assert( nWrite==db->nVdbeWrite ); @@ -82802,7 +85138,8 @@ SQLITE_PRIVATE int sqlite3VdbeCheckFk(Vdbe *p, int deferred){ p->rc = SQLITE_CONSTRAINT_FOREIGNKEY; p->errorAction = OE_Abort; sqlite3VdbeError(p, "FOREIGN KEY constraint failed"); - return SQLITE_ERROR; + if( (p->prepFlags & SQLITE_PREPARE_SAVESQL)==0 ) return SQLITE_ERROR; + return SQLITE_CONSTRAINT_FOREIGNKEY; } return SQLITE_OK; } @@ -82841,9 +85178,7 @@ SQLITE_PRIVATE int sqlite3VdbeHalt(Vdbe *p){ ** one, or the complete transaction if there is no statement transaction. */ - if( p->iVdbeMagic!=VDBE_MAGIC_RUN ){ - return SQLITE_OK; - } + assert( p->eVdbeState==VDBE_RUN_STATE ); if( db->mallocFailed ){ p->rc = SQLITE_NOMEM_BKPT; } @@ -82852,7 +85187,7 @@ SQLITE_PRIVATE int sqlite3VdbeHalt(Vdbe *p){ /* No commit or rollback needed if the program never started or if the ** SQL statement does not read or write a database file. */ - if( p->pc>=0 && p->bIsReader ){ + if( p->bIsReader ){ int mrc; /* Primary error code from p->rc */ int eStatementOp = 0; int isSpecialError; /* Set to true if a 'special' error */ @@ -83000,15 +85335,13 @@ SQLITE_PRIVATE int sqlite3VdbeHalt(Vdbe *p){ } /* We have successfully halted and closed the VM. Record this fact. */ - if( p->pc>=0 ){ - db->nVdbeActive--; - if( !p->readOnly ) db->nVdbeWrite--; - if( p->bIsReader ) db->nVdbeRead--; - assert( db->nVdbeActive>=db->nVdbeRead ); - assert( db->nVdbeRead>=db->nVdbeWrite ); - assert( db->nVdbeWrite>=0 ); - } - p->iVdbeMagic = VDBE_MAGIC_HALT; + db->nVdbeActive--; + if( !p->readOnly ) db->nVdbeWrite--; + if( p->bIsReader ) db->nVdbeRead--; + assert( db->nVdbeActive>=db->nVdbeRead ); + assert( db->nVdbeRead>=db->nVdbeWrite ); + assert( db->nVdbeWrite>=0 ); + p->eVdbeState = VDBE_HALT_STATE; checkActiveVdbeCnt(db); if( db->mallocFailed ){ p->rc = SQLITE_NOMEM_BKPT; @@ -83090,8 +85423,8 @@ static void vdbeInvokeSqllog(Vdbe *v){ ** again. ** ** To look at it another way, this routine resets the state of the -** virtual machine from VDBE_MAGIC_RUN or VDBE_MAGIC_HALT back to -** VDBE_MAGIC_INIT. +** virtual machine from VDBE_RUN_STATE or VDBE_HALT_STATE back to +** VDBE_READY_STATE. */ SQLITE_PRIVATE int sqlite3VdbeReset(Vdbe *p){ #if defined(SQLITE_DEBUG) || defined(VDBE_PROFILE) @@ -83105,7 +85438,7 @@ SQLITE_PRIVATE int sqlite3VdbeReset(Vdbe *p){ ** error, then it might not have been halted properly. So halt ** it now. */ - sqlite3VdbeHalt(p); + if( p->eVdbeState==VDBE_RUN_STATE ) sqlite3VdbeHalt(p); /* If the VDBE has been run even partially, then transfer the error code ** and error message from the VDBE into the main database structure. But @@ -83119,13 +85452,6 @@ SQLITE_PRIVATE int sqlite3VdbeReset(Vdbe *p){ }else{ db->errCode = p->rc; } - if( p->runOnlyOnce ) p->expired = 1; - }else if( p->rc && p->expired ){ - /* The expired flag was set on the VDBE before the first call - ** to sqlite3_step(). For consistency (since sqlite3_step() was - ** called), set the database error in this case as well. - */ - sqlite3ErrorWithMsg(db, p->rc, p->zErrMsg ? "%s" : 0, p->zErrMsg); } /* Reset register contents and reclaim error message memory. @@ -83182,7 +85508,6 @@ SQLITE_PRIVATE int sqlite3VdbeReset(Vdbe *p){ } } #endif - p->iVdbeMagic = VDBE_MAGIC_RESET; return p->rc & db->errMask; } @@ -83192,7 +85517,10 @@ SQLITE_PRIVATE int sqlite3VdbeReset(Vdbe *p){ */ SQLITE_PRIVATE int sqlite3VdbeFinalize(Vdbe *p){ int rc = SQLITE_OK; - if( p->iVdbeMagic==VDBE_MAGIC_RUN || p->iVdbeMagic==VDBE_MAGIC_HALT ){ + assert( VDBE_RUN_STATE>VDBE_READY_STATE ); + assert( VDBE_HALT_STATE>VDBE_READY_STATE ); + assert( VDBE_INIT_STATEeVdbeState>=VDBE_READY_STATE ){ rc = sqlite3VdbeReset(p); assert( (rc & p->db->errMask)==rc ); } @@ -83244,23 +85572,26 @@ SQLITE_PRIVATE void sqlite3VdbeDeleteAuxData(sqlite3 *db, AuxData **pp, int iOp, ** VdbeDelete() also unlinks the Vdbe from the list of VMs associated with ** the database connection and frees the object itself. */ -SQLITE_PRIVATE void sqlite3VdbeClearObject(sqlite3 *db, Vdbe *p){ +static void sqlite3VdbeClearObject(sqlite3 *db, Vdbe *p){ SubProgram *pSub, *pNext; + assert( db!=0 ); assert( p->db==0 || p->db==db ); - releaseMemArray(p->aColName, p->nResColumn*COLNAME_N); + if( p->aColName ){ + releaseMemArray(p->aColName, p->nResColumn*COLNAME_N); + sqlite3DbNNFreeNN(db, p->aColName); + } for(pSub=p->pProgram; pSub; pSub=pNext){ pNext = pSub->pNext; vdbeFreeOpArray(db, pSub->aOp, pSub->nOp); sqlite3DbFree(db, pSub); } - if( p->iVdbeMagic!=VDBE_MAGIC_INIT ){ + if( p->eVdbeState!=VDBE_INIT_STATE ){ releaseMemArray(p->aVar, p->nVar); - sqlite3DbFree(db, p->pVList); - sqlite3DbFree(db, p->pFree); + if( p->pVList ) sqlite3DbNNFreeNN(db, p->pVList); + if( p->pFree ) sqlite3DbNNFreeNN(db, p->pFree); } vdbeFreeOpArray(db, p->aOp, p->nOp); - sqlite3DbFree(db, p->aColName); - sqlite3DbFree(db, p->zSql); + if( p->zSql ) sqlite3DbNNFreeNN(db, p->zSql); #ifdef SQLITE_ENABLE_NORMALIZE sqlite3DbFree(db, p->zNormSql); { @@ -83290,20 +85621,17 @@ SQLITE_PRIVATE void sqlite3VdbeDelete(Vdbe *p){ assert( p!=0 ); db = p->db; + assert( db!=0 ); assert( sqlite3_mutex_held(db->mutex) ); sqlite3VdbeClearObject(db, p); - if( p->pPrev ){ - p->pPrev->pNext = p->pNext; - }else{ - assert( db->pVdbe==p ); - db->pVdbe = p->pNext; - } - if( p->pNext ){ - p->pNext->pPrev = p->pPrev; + if( db->pnBytesFreed==0 ){ + assert( p->ppVPrev!=0 ); + *p->ppVPrev = p->pVNext; + if( p->pVNext ){ + p->pVNext->ppVPrev = p->ppVPrev; + } } - p->iVdbeMagic = VDBE_MAGIC_DEAD; - p->db = 0; - sqlite3DbFreeNN(db, p); + sqlite3DbNNFreeNN(db, p); } /* @@ -83337,7 +85665,7 @@ SQLITE_PRIVATE int SQLITE_NOINLINE sqlite3VdbeFinishMoveto(VdbeCursor *p){ ** is supposed to be pointing. If the row was deleted out from under the ** cursor, set the cursor to point to a NULL row. */ -static int SQLITE_NOINLINE handleMovedCursor(VdbeCursor *p){ +SQLITE_PRIVATE int SQLITE_NOINLINE sqlite3VdbeHandleMovedCursor(VdbeCursor *p){ int isDifferentRow, rc; assert( p->eCurType==CURTYPE_BTREE ); assert( p->uc.pCursor!=0 ); @@ -83353,41 +85681,9 @@ static int SQLITE_NOINLINE handleMovedCursor(VdbeCursor *p){ ** if need be. Return any I/O error from the restore operation. */ SQLITE_PRIVATE int sqlite3VdbeCursorRestore(VdbeCursor *p){ - assert( p->eCurType==CURTYPE_BTREE ); - if( sqlite3BtreeCursorHasMoved(p->uc.pCursor) ){ - return handleMovedCursor(p); - } - return SQLITE_OK; -} - -/* -** Make sure the cursor p is ready to read or write the row to which it -** was last positioned. Return an error code if an OOM fault or I/O error -** prevents us from positioning the cursor to its correct position. -** -** If a MoveTo operation is pending on the given cursor, then do that -** MoveTo now. If no move is pending, check to see if the row has been -** deleted out from under the cursor and if it has, mark the row as -** a NULL row. -** -** If the cursor is already pointing to the correct row and that row has -** not been deleted out from under the cursor, then this routine is a no-op. -*/ -SQLITE_PRIVATE int sqlite3VdbeCursorMoveto(VdbeCursor **pp, u32 *piCol){ - VdbeCursor *p = *pp; - assert( p->eCurType==CURTYPE_BTREE || p->eCurType==CURTYPE_PSEUDO ); - if( p->deferredMoveto ){ - u32 iMap; - assert( !p->isEphemeral ); - if( p->ub.aAltMap && (iMap = p->ub.aAltMap[1+*piCol])>0 && !p->nullRow ){ - *pp = p->pAltCursor; - *piCol = iMap - 1; - return SQLITE_OK; - } - return sqlite3VdbeFinishMoveto(p); - } + assert( p->eCurType==CURTYPE_BTREE || IsNullCursor(p) ); if( sqlite3BtreeCursorHasMoved(p->uc.pCursor) ){ - return handleMovedCursor(p); + return sqlite3VdbeHandleMovedCursor(p); } return SQLITE_OK; } @@ -83398,7 +85694,7 @@ SQLITE_PRIVATE int sqlite3VdbeCursorMoveto(VdbeCursor **pp, u32 *piCol){ ** sqlite3VdbeSerialType() ** sqlite3VdbeSerialTypeLen() ** sqlite3VdbeSerialLen() -** sqlite3VdbeSerialPut() +** sqlite3VdbeSerialPut() <--- in-lined into OP_MakeRecord as of 2022-04-02 ** sqlite3VdbeSerialGet() ** ** encapsulate the code that serializes values for storage in SQLite @@ -83510,7 +85806,7 @@ SQLITE_PRIVATE u32 sqlite3VdbeSerialType(Mem *pMem, int file_format, u32 *pLen){ /* ** The sizes for serial types less than 128 */ -static const u8 sqlite3SmallTypeSizes[] = { +SQLITE_PRIVATE const u8 sqlite3SmallTypeSizes[128] = { /* 0 1 2 3 4 5 6 7 8 9 */ /* 0 */ 0, 1, 2, 3, 4, 6, 8, 8, 0, 0, /* 10 */ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, @@ -83579,7 +85875,7 @@ SQLITE_PRIVATE u8 sqlite3VdbeOneByteSerialTypeLen(u8 serial_type){ ** so we trust him. */ #ifdef SQLITE_MIXED_ENDIAN_64BIT_FLOAT -static u64 floatSwap(u64 in){ +SQLITE_PRIVATE u64 sqlite3FloatSwap(u64 in){ union { u64 r; u32 i[2]; @@ -83592,59 +85888,8 @@ static u64 floatSwap(u64 in){ u.i[1] = t; return u.r; } -# define swapMixedEndianFloat(X) X = floatSwap(X) -#else -# define swapMixedEndianFloat(X) -#endif - -/* -** Write the serialized data blob for the value stored in pMem into -** buf. It is assumed that the caller has allocated sufficient space. -** Return the number of bytes written. -** -** nBuf is the amount of space left in buf[]. The caller is responsible -** for allocating enough space to buf[] to hold the entire field, exclusive -** of the pMem->u.nZero bytes for a MEM_Zero value. -** -** Return the number of bytes actually written into buf[]. The number -** of bytes in the zero-filled tail is included in the return value only -** if those bytes were zeroed in buf[]. -*/ -SQLITE_PRIVATE u32 sqlite3VdbeSerialPut(u8 *buf, Mem *pMem, u32 serial_type){ - u32 len; - - /* Integer and Real */ - if( serial_type<=7 && serial_type>0 ){ - u64 v; - u32 i; - if( serial_type==7 ){ - assert( sizeof(v)==sizeof(pMem->u.r) ); - memcpy(&v, &pMem->u.r, sizeof(v)); - swapMixedEndianFloat(v); - }else{ - v = pMem->u.i; - } - len = i = sqlite3SmallTypeSizes[serial_type]; - assert( i>0 ); - do{ - buf[--i] = (u8)(v&0xFF); - v >>= 8; - }while( i ); - return len; - } - - /* String or blob */ - if( serial_type>=12 ){ - assert( pMem->n + ((pMem->flags & MEM_Zero)?pMem->u.nZero:0) - == (int)sqlite3VdbeSerialTypeLen(serial_type) ); - len = pMem->n; - if( len>0 ) memcpy(buf, pMem->z, len); - return len; - } +#endif /* SQLITE_MIXED_ENDIAN_64BIT_FLOAT */ - /* NULL or constants 0 or 1 */ - return 0; -} /* Input "x" is a sequence of unsigned characters that represent a ** big-endian integer. Return the equivalent native integer @@ -83810,10 +86055,10 @@ SQLITE_PRIVATE UnpackedRecord *sqlite3VdbeAllocUnpackedRecord( ){ UnpackedRecord *p; /* Unpacked record to return */ int nByte; /* Number of bytes required for *p */ - nByte = ROUND8(sizeof(UnpackedRecord)) + sizeof(Mem)*(pKeyInfo->nKeyField+1); + nByte = ROUND8P(sizeof(UnpackedRecord)) + sizeof(Mem)*(pKeyInfo->nKeyField+1); p = (UnpackedRecord *)sqlite3DbMallocRaw(pKeyInfo->db, nByte); if( !p ) return 0; - p->aMem = (Mem*)&((char*)p)[ROUND8(sizeof(UnpackedRecord))]; + p->aMem = (Mem*)&((char*)p)[ROUND8P(sizeof(UnpackedRecord))]; assert( pKeyInfo->aSortFlags!=0 ); p->pKeyInfo = pKeyInfo; p->nField = pKeyInfo->nKeyField + 1; @@ -84049,8 +86294,8 @@ static int vdbeCompareMemString( }else{ rc = pColl->xCmp(pColl->pUser, c1.n, v1, c2.n, v2); } - sqlite3VdbeMemRelease(&c1); - sqlite3VdbeMemRelease(&c2); + sqlite3VdbeMemReleaseMalloc(&c1); + sqlite3VdbeMemReleaseMalloc(&c2); return rc; } } @@ -84311,14 +86556,22 @@ SQLITE_PRIVATE int sqlite3VdbeRecordCompareWithSkip( ** two elements in the keys are equal. Fix the various stack variables so ** that this routine begins comparing at the second field. */ if( bSkip ){ - u32 s1; - idx1 = 1 + getVarint32(&aKey1[1], s1); + u32 s1 = aKey1[1]; + if( s1<0x80 ){ + idx1 = 2; + }else{ + idx1 = 1 + sqlite3GetVarint32(&aKey1[1], &s1); + } szHdr1 = aKey1[0]; d1 = szHdr1 + sqlite3VdbeSerialTypeLen(s1); i = 1; pRhs++; }else{ - idx1 = getVarint32(aKey1, szHdr1); + if( (szHdr1 = aKey1[0])<0x80 ){ + idx1 = 1; + }else{ + idx1 = sqlite3GetVarint32(aKey1, &szHdr1); + } d1 = szHdr1; i = 0; } @@ -84333,7 +86586,7 @@ SQLITE_PRIVATE int sqlite3VdbeRecordCompareWithSkip( assert( pPKey2->pKeyInfo->aSortFlags!=0 ); assert( pPKey2->pKeyInfo->nKeyField>0 ); assert( idx1<=szHdr1 || CORRUPT_DB ); - do{ + while( 1 /*exit-by-break*/ ){ u32 serial_type; /* RHS is an integer */ @@ -84343,7 +86596,7 @@ SQLITE_PRIVATE int sqlite3VdbeRecordCompareWithSkip( serial_type = aKey1[idx1]; testcase( serial_type==12 ); if( serial_type>=10 ){ - rc = +1; + rc = serial_type==10 ? -1 : +1; }else if( serial_type==0 ){ rc = -1; }else if( serial_type==7 ){ @@ -84368,7 +86621,7 @@ SQLITE_PRIVATE int sqlite3VdbeRecordCompareWithSkip( ** numbers). Types 10 and 11 are currently "reserved for future ** use", so it doesn't really matter what the results of comparing ** them to numberic values are. */ - rc = +1; + rc = serial_type==10 ? -1 : +1; }else if( serial_type==0 ){ rc = -1; }else{ @@ -84449,7 +86702,7 @@ SQLITE_PRIVATE int sqlite3VdbeRecordCompareWithSkip( /* RHS is null */ else{ serial_type = aKey1[idx1]; - rc = (serial_type!=0); + rc = (serial_type!=0 && serial_type!=10); } if( rc!=0 ){ @@ -84471,8 +86724,13 @@ SQLITE_PRIVATE int sqlite3VdbeRecordCompareWithSkip( if( i==pPKey2->nField ) break; pRhs++; d1 += sqlite3VdbeSerialTypeLen(serial_type); + if( d1>(unsigned)nKey1 ) break; idx1 += sqlite3VarintLen(serial_type); - }while( idx1<(unsigned)szHdr1 && d1<=(unsigned)nKey1 ); + if( idx1>=(unsigned)szHdr1 ){ + pPKey2->errCode = (u8)SQLITE_CORRUPT_BKPT; + return 0; /* Corrupt index */ + } + } /* No memory allocation is ever used on mem1. Prove this using ** the following assert(). If the assert() fails, it indicates a @@ -84574,7 +86832,8 @@ static int vdbeRecordCompareInt( return sqlite3VdbeRecordCompare(nKey1, pKey1, pPKey2); } - v = pPKey2->aMem[0].u.i; + assert( pPKey2->u.i == pPKey2->aMem[0].u.i ); + v = pPKey2->u.i; if( v>lhs ){ res = pPKey2->r1; }else if( vaMem[0].flags & MEM_Str ); + assert( pPKey2->aMem[0].n == pPKey2->n ); + assert( pPKey2->aMem[0].z == pPKey2->u.z ); vdbeAssertFieldCountWithinLimits(nKey1, pKey1, pPKey2->pKeyInfo); - serial_type = (u8)(aKey1[1]); - if( serial_type >= 0x80 ){ - sqlite3GetVarint32(&aKey1[1], (u32*)&serial_type); - } + serial_type = (signed char)(aKey1[1]); + +vrcs_restart: if( serial_type<12 ){ + if( serial_type<0 ){ + sqlite3GetVarint32(&aKey1[1], (u32*)&serial_type); + if( serial_type>=12 ) goto vrcs_restart; + assert( CORRUPT_DB ); + } res = pPKey2->r1; /* (pKey1/nKey1) is a number or a null */ }else if( !(serial_type & 0x01) ){ res = pPKey2->r2; /* (pKey1/nKey1) is a blob */ @@ -84628,15 +86893,15 @@ static int vdbeRecordCompareString( pPKey2->errCode = (u8)SQLITE_CORRUPT_BKPT; return 0; /* Corruption */ } - nCmp = MIN( pPKey2->aMem[0].n, nStr ); - res = memcmp(&aKey1[szHdr], pPKey2->aMem[0].z, nCmp); + nCmp = MIN( pPKey2->n, nStr ); + res = memcmp(&aKey1[szHdr], pPKey2->u.z, nCmp); if( res>0 ){ res = pPKey2->r2; }else if( res<0 ){ res = pPKey2->r1; }else{ - res = nStr - pPKey2->aMem[0].n; + res = nStr - pPKey2->n; if( res==0 ){ if( pPKey2->nField>1 ){ res = sqlite3VdbeRecordCompareWithSkip(nKey1, pKey1, pPKey2, 1); @@ -84691,6 +86956,7 @@ SQLITE_PRIVATE RecordCompare sqlite3VdbeFindCompare(UnpackedRecord *p){ p->r2 = 1; } if( (flags & MEM_Int) ){ + p->u.i = p->aMem[0].u.i; return vdbeRecordCompareInt; } testcase( flags & MEM_Real ); @@ -84700,6 +86966,8 @@ SQLITE_PRIVATE RecordCompare sqlite3VdbeFindCompare(UnpackedRecord *p){ && p->pKeyInfo->aColl[0]==0 ){ assert( flags & MEM_Str ); + p->u.z = p->aMem[0].z; + p->n = p->aMem[0].n; return vdbeRecordCompareString; } } @@ -84772,14 +87040,14 @@ SQLITE_PRIVATE int sqlite3VdbeIdxRowid(sqlite3 *db, BtCursor *pCur, i64 *rowid){ /* Fetch the integer off the end of the index record */ sqlite3VdbeSerialGet((u8*)&m.z[m.n-lenRowid], typeRowid, &v); *rowid = v.u.i; - sqlite3VdbeMemRelease(&m); + sqlite3VdbeMemReleaseMalloc(&m); return SQLITE_OK; /* Jump here if database corruption is detected after m has been ** allocated. Free the m object and return SQLITE_CORRUPT. */ idx_rowid_corruption: testcase( m.szMalloc!=0 ); - sqlite3VdbeMemRelease(&m); + sqlite3VdbeMemReleaseMalloc(&m); return SQLITE_CORRUPT_BKPT; } @@ -84821,7 +87089,7 @@ SQLITE_PRIVATE int sqlite3VdbeIdxKeyCompare( return rc; } *res = sqlite3VdbeRecordCompareWithSkip(m.n, m.z, pUnpacked, 0); - sqlite3VdbeMemRelease(&m); + sqlite3VdbeMemReleaseMalloc(&m); return SQLITE_OK; } @@ -84863,7 +87131,7 @@ SQLITE_PRIVATE void sqlite3VdbeCountChanges(Vdbe *v){ */ SQLITE_PRIVATE void sqlite3ExpirePreparedStatements(sqlite3 *db, int iCode){ Vdbe *p; - for(p = db->pVdbe; p; p=p->pNext){ + for(p = db->pVdbe; p; p=p->pVNext){ p->expired = iCode+1; } } @@ -84984,13 +87252,14 @@ SQLITE_PRIVATE void sqlite3VtabImportErrmsg(Vdbe *p, sqlite3_vtab *pVtab){ ** the vdbeUnpackRecord() function found in vdbeapi.c. */ static void vdbeFreeUnpacked(sqlite3 *db, int nField, UnpackedRecord *p){ + assert( db!=0 ); if( p ){ int i; for(i=0; iaMem[i]; - if( pMem->zMalloc ) sqlite3VdbeMemRelease(pMem); + if( pMem->zMalloc ) sqlite3VdbeMemReleaseMalloc(pMem); } - sqlite3DbFreeNN(db, p); + sqlite3DbNNFreeNN(db, p); } } #endif /* SQLITE_ENABLE_PREUPDATE_HOOK */ @@ -85061,7 +87330,7 @@ SQLITE_PRIVATE void sqlite3VdbePreUpdateHook( for(i=0; inField; i++){ sqlite3VdbeMemRelease(&preupdate.aNew[i]); } - sqlite3DbFreeNN(db, preupdate.aNew); + sqlite3DbNNFreeNN(db, preupdate.aNew); } } #endif /* SQLITE_ENABLE_PREUPDATE_HOOK */ @@ -85178,7 +87447,9 @@ SQLITE_API int sqlite3_finalize(sqlite3_stmt *pStmt){ if( vdbeSafety(v) ) return SQLITE_MISUSE_BKPT; sqlite3_mutex_enter(db->mutex); checkProfileCallback(db, v); - rc = sqlite3VdbeFinalize(v); + assert( v->eVdbeState>=VDBE_READY_STATE ); + rc = sqlite3VdbeReset(v); + sqlite3VdbeDelete(v); rc = sqlite3ApiExit(db, rc); sqlite3LeaveMutexAndCloseZombie(db); } @@ -85386,6 +87657,9 @@ SQLITE_API int sqlite3_value_type(sqlite3_value* pVal){ #endif return aType[pVal->flags&MEM_AffMask]; } +SQLITE_API int sqlite3_value_encoding(sqlite3_value *pVal){ + return pVal->enc; +} /* Return true if a parameter to xUpdate represents an unchanged column */ SQLITE_API int sqlite3_value_nochange(sqlite3_value *pVal){ @@ -85415,6 +87689,9 @@ SQLITE_API sqlite3_value *sqlite3_value_dup(const sqlite3_value *pOrig){ sqlite3ValueFree(pNew); pNew = 0; } + }else if( pNew->flags & MEM_Null ){ + /* Do not duplicate pointer values */ + pNew->flags &= ~(MEM_Term|MEM_Subtype); } return pNew; } @@ -85445,7 +87722,8 @@ static void setResultStrOrError( u8 enc, /* Encoding of z. 0 for BLOBs */ void (*xDel)(void*) /* Destructor function */ ){ - int rc = sqlite3VdbeMemSetStr(pCtx->pOut, z, n, enc, xDel); + Mem *pOut = pCtx->pOut; + int rc = sqlite3VdbeMemSetStr(pOut, z, n, enc, xDel); if( rc ){ if( rc==SQLITE_TOOBIG ){ sqlite3_result_error_toobig(pCtx); @@ -85455,6 +87733,11 @@ static void setResultStrOrError( assert( rc==SQLITE_NOMEM ); sqlite3_result_error_nomem(pCtx); } + return; + } + sqlite3VdbeChangeEncoding(pOut, pCtx->enc); + if( sqlite3VdbeMemTooBig(pOut) ){ + sqlite3_result_error_toobig(pCtx); } } static int invokeValueDestructor( @@ -85598,17 +87881,22 @@ SQLITE_API void sqlite3_result_text16le( } #endif /* SQLITE_OMIT_UTF16 */ SQLITE_API void sqlite3_result_value(sqlite3_context *pCtx, sqlite3_value *pValue){ + Mem *pOut = pCtx->pOut; assert( sqlite3_mutex_held(pCtx->pOut->db->mutex) ); - sqlite3VdbeMemCopy(pCtx->pOut, pValue); + sqlite3VdbeMemCopy(pOut, pValue); + sqlite3VdbeChangeEncoding(pOut, pCtx->enc); + if( sqlite3VdbeMemTooBig(pOut) ){ + sqlite3_result_error_toobig(pCtx); + } } SQLITE_API void sqlite3_result_zeroblob(sqlite3_context *pCtx, int n){ - assert( sqlite3_mutex_held(pCtx->pOut->db->mutex) ); - sqlite3VdbeMemSetZeroBlob(pCtx->pOut, n); + sqlite3_result_zeroblob64(pCtx, n>0 ? n : 0); } SQLITE_API int sqlite3_result_zeroblob64(sqlite3_context *pCtx, u64 n){ Mem *pOut = pCtx->pOut; assert( sqlite3_mutex_held(pOut->db->mutex) ); if( n>(u64)pOut->db->aLimit[SQLITE_LIMIT_LENGTH] ){ + sqlite3_result_error_toobig(pCtx); return SQLITE_TOOBIG; } #ifndef SQLITE_OMIT_INCRBLOB @@ -85624,8 +87912,8 @@ SQLITE_API void sqlite3_result_error_code(sqlite3_context *pCtx, int errCode){ if( pCtx->pVdbe ) pCtx->pVdbe->rcApp = errCode; #endif if( pCtx->pOut->flags & MEM_Null ){ - sqlite3VdbeMemSetStr(pCtx->pOut, sqlite3ErrStr(errCode), -1, - SQLITE_UTF8, SQLITE_STATIC); + setResultStrOrError(pCtx, sqlite3ErrStr(errCode), -1, SQLITE_UTF8, + SQLITE_STATIC); } } @@ -85699,80 +87987,83 @@ static int sqlite3Step(Vdbe *p){ int rc; assert(p); - if( p->iVdbeMagic!=VDBE_MAGIC_RUN ){ - /* We used to require that sqlite3_reset() be called before retrying - ** sqlite3_step() after any error or after SQLITE_DONE. But beginning - ** with version 3.7.0, we changed this so that sqlite3_reset() would - ** be called automatically instead of throwing the SQLITE_MISUSE error. - ** This "automatic-reset" change is not technically an incompatibility, - ** since any application that receives an SQLITE_MISUSE is broken by - ** definition. - ** - ** Nevertheless, some published applications that were originally written - ** for version 3.6.23 or earlier do in fact depend on SQLITE_MISUSE - ** returns, and those were broken by the automatic-reset change. As a - ** a work-around, the SQLITE_OMIT_AUTORESET compile-time restores the - ** legacy behavior of returning SQLITE_MISUSE for cases where the - ** previous sqlite3_step() returned something other than a SQLITE_LOCKED - ** or SQLITE_BUSY error. - */ -#ifdef SQLITE_OMIT_AUTORESET - if( (rc = p->rc&0xff)==SQLITE_BUSY || rc==SQLITE_LOCKED ){ - sqlite3_reset((sqlite3_stmt*)p); - }else{ - return SQLITE_MISUSE_BKPT; - } -#else - sqlite3_reset((sqlite3_stmt*)p); -#endif - } - - /* Check that malloc() has not failed. If it has, return early. */ db = p->db; - if( db->mallocFailed ){ - p->rc = SQLITE_NOMEM; - return SQLITE_NOMEM_BKPT; - } + if( p->eVdbeState!=VDBE_RUN_STATE ){ + restart_step: + if( p->eVdbeState==VDBE_READY_STATE ){ + if( p->expired ){ + p->rc = SQLITE_SCHEMA; + rc = SQLITE_ERROR; + if( (p->prepFlags & SQLITE_PREPARE_SAVESQL)!=0 ){ + /* If this statement was prepared using saved SQL and an + ** error has occurred, then return the error code in p->rc to the + ** caller. Set the error code in the database handle to the same + ** value. + */ + rc = sqlite3VdbeTransferError(p); + } + goto end_of_step; + } - if( p->pc<0 && p->expired ){ - p->rc = SQLITE_SCHEMA; - rc = SQLITE_ERROR; - if( (p->prepFlags & SQLITE_PREPARE_SAVESQL)!=0 ){ - /* If this statement was prepared using saved SQL and an - ** error has occurred, then return the error code in p->rc to the - ** caller. Set the error code in the database handle to the same value. + /* If there are no other statements currently running, then + ** reset the interrupt flag. This prevents a call to sqlite3_interrupt + ** from interrupting a statement that has not yet started. */ - rc = sqlite3VdbeTransferError(p); - } - goto end_of_step; - } - if( p->pc<0 ){ - /* If there are no other statements currently running, then - ** reset the interrupt flag. This prevents a call to sqlite3_interrupt - ** from interrupting a statement that has not yet started. - */ - if( db->nVdbeActive==0 ){ - AtomicStore(&db->u1.isInterrupted, 0); - } + if( db->nVdbeActive==0 ){ + AtomicStore(&db->u1.isInterrupted, 0); + } - assert( db->nVdbeWrite>0 || db->autoCommit==0 - || (db->nDeferredCons==0 && db->nDeferredImmCons==0) - ); + assert( db->nVdbeWrite>0 || db->autoCommit==0 + || (db->nDeferredCons==0 && db->nDeferredImmCons==0) + ); #ifndef SQLITE_OMIT_TRACE - if( (db->mTrace & (SQLITE_TRACE_PROFILE|SQLITE_TRACE_XPROFILE))!=0 - && !db->init.busy && p->zSql ){ - sqlite3OsCurrentTimeInt64(db->pVfs, &p->startTime); - }else{ - assert( p->startTime==0 ); - } + if( (db->mTrace & (SQLITE_TRACE_PROFILE|SQLITE_TRACE_XPROFILE))!=0 + && !db->init.busy && p->zSql ){ + sqlite3OsCurrentTimeInt64(db->pVfs, &p->startTime); + }else{ + assert( p->startTime==0 ); + } #endif - db->nVdbeActive++; - if( p->readOnly==0 ) db->nVdbeWrite++; - if( p->bIsReader ) db->nVdbeRead++; - p->pc = 0; + db->nVdbeActive++; + if( p->readOnly==0 ) db->nVdbeWrite++; + if( p->bIsReader ) db->nVdbeRead++; + p->pc = 0; + p->eVdbeState = VDBE_RUN_STATE; + }else + + if( ALWAYS(p->eVdbeState==VDBE_HALT_STATE) ){ + /* We used to require that sqlite3_reset() be called before retrying + ** sqlite3_step() after any error or after SQLITE_DONE. But beginning + ** with version 3.7.0, we changed this so that sqlite3_reset() would + ** be called automatically instead of throwing the SQLITE_MISUSE error. + ** This "automatic-reset" change is not technically an incompatibility, + ** since any application that receives an SQLITE_MISUSE is broken by + ** definition. + ** + ** Nevertheless, some published applications that were originally written + ** for version 3.6.23 or earlier do in fact depend on SQLITE_MISUSE + ** returns, and those were broken by the automatic-reset change. As a + ** a work-around, the SQLITE_OMIT_AUTORESET compile-time restores the + ** legacy behavior of returning SQLITE_MISUSE for cases where the + ** previous sqlite3_step() returned something other than a SQLITE_LOCKED + ** or SQLITE_BUSY error. + */ +#ifdef SQLITE_OMIT_AUTORESET + if( (rc = p->rc&0xff)==SQLITE_BUSY || rc==SQLITE_LOCKED ){ + sqlite3_reset((sqlite3_stmt*)p); + }else{ + return SQLITE_MISUSE_BKPT; + } +#else + sqlite3_reset((sqlite3_stmt*)p); +#endif + assert( p->eVdbeState==VDBE_READY_STATE ); + goto restart_step; + } } + #ifdef SQLITE_DEBUG p->rcApp = SQLITE_OK; #endif @@ -85787,7 +88078,12 @@ static int sqlite3Step(Vdbe *p){ db->nVdbeExec--; } - if( rc!=SQLITE_ROW ){ + if( rc==SQLITE_ROW ){ + assert( p->rc==SQLITE_OK ); + assert( db->mallocFailed==0 ); + db->errCode = SQLITE_ROW; + return SQLITE_ROW; + }else{ #ifndef SQLITE_OMIT_TRACE /* If the statement completed successfully, invoke the profile callback */ checkProfileCallback(db, p); @@ -85839,7 +88135,6 @@ SQLITE_API int sqlite3_step(sqlite3_stmt *pStmt){ } db = v->db; sqlite3_mutex_enter(db->mutex); - v->doingRerun = 0; while( (rc = sqlite3Step(v))==SQLITE_SCHEMA && cnt++ < SQLITE_MAX_SCHEMA_RETRY ){ int savedPc = v->pc; @@ -85865,7 +88160,13 @@ SQLITE_API int sqlite3_step(sqlite3_stmt *pStmt){ break; } sqlite3_reset(pStmt); - if( savedPc>=0 ) v->doingRerun = 1; + if( savedPc>=0 ){ + /* Setting minWriteFileFormat to 254 is a signal to the OP_Init and + ** OP_Trace opcodes to *not* perform SQLITE_TRACE_STMT because it has + ** already been done once on a prior invocation that failed due to + ** SQLITE_SCHEMA. tag-20220401a */ + v->minWriteFileFormat = 254; + } assert( v->expired==0 ); } sqlite3_mutex_leave(db->mutex); @@ -86174,15 +88475,15 @@ static const Mem *columnNullValue(void){ #endif = { /* .u = */ {0}, + /* .z = */ (char*)0, + /* .n = */ (int)0, /* .flags = */ (u16)MEM_Null, /* .enc = */ (u8)0, /* .eSubtype = */ (u8)0, - /* .n = */ (int)0, - /* .z = */ (char*)0, - /* .zMalloc = */ (char*)0, + /* .db = */ (sqlite3*)0, /* .szMalloc = */ (int)0, /* .uTemp = */ (u32)0, - /* .db = */ (sqlite3*)0, + /* .zMalloc = */ (char*)0, /* .xDel = */ (void(*)(void*))0, #ifdef SQLITE_DEBUG /* .pScopyFrom = */ (Mem*)0, @@ -86473,25 +88774,24 @@ SQLITE_API const void *sqlite3_column_origin_name16(sqlite3_stmt *pStmt, int N){ ** The error code stored in database p->db is overwritten with the return ** value in any case. */ -static int vdbeUnbind(Vdbe *p, int i){ +static int vdbeUnbind(Vdbe *p, unsigned int i){ Mem *pVar; if( vdbeSafetyNotNull(p) ){ return SQLITE_MISUSE_BKPT; } sqlite3_mutex_enter(p->db->mutex); - if( p->iVdbeMagic!=VDBE_MAGIC_RUN || p->pc>=0 ){ + if( p->eVdbeState!=VDBE_READY_STATE ){ sqlite3Error(p->db, SQLITE_MISUSE); sqlite3_mutex_leave(p->db->mutex); sqlite3_log(SQLITE_MISUSE, "bind on a busy prepared statement: [%s]", p->zSql); return SQLITE_MISUSE_BKPT; } - if( i<1 || i>p->nVar ){ + if( i>=(unsigned int)p->nVar ){ sqlite3Error(p->db, SQLITE_RANGE); sqlite3_mutex_leave(p->db->mutex); return SQLITE_RANGE; } - i--; pVar = &p->aVar[i]; sqlite3VdbeMemRelease(pVar); pVar->flags = MEM_Null; @@ -86528,7 +88828,7 @@ static int bindText( Mem *pVar; int rc; - rc = vdbeUnbind(p, i); + rc = vdbeUnbind(p, (u32)(i-1)); if( rc==SQLITE_OK ){ if( zData!=0 ){ pVar = &p->aVar[i-1]; @@ -86577,7 +88877,7 @@ SQLITE_API int sqlite3_bind_blob64( SQLITE_API int sqlite3_bind_double(sqlite3_stmt *pStmt, int i, double rValue){ int rc; Vdbe *p = (Vdbe *)pStmt; - rc = vdbeUnbind(p, i); + rc = vdbeUnbind(p, (u32)(i-1)); if( rc==SQLITE_OK ){ sqlite3VdbeMemSetDouble(&p->aVar[i-1], rValue); sqlite3_mutex_leave(p->db->mutex); @@ -86590,7 +88890,7 @@ SQLITE_API int sqlite3_bind_int(sqlite3_stmt *p, int i, int iValue){ SQLITE_API int sqlite3_bind_int64(sqlite3_stmt *pStmt, int i, sqlite_int64 iValue){ int rc; Vdbe *p = (Vdbe *)pStmt; - rc = vdbeUnbind(p, i); + rc = vdbeUnbind(p, (u32)(i-1)); if( rc==SQLITE_OK ){ sqlite3VdbeMemSetInt64(&p->aVar[i-1], iValue); sqlite3_mutex_leave(p->db->mutex); @@ -86600,7 +88900,7 @@ SQLITE_API int sqlite3_bind_int64(sqlite3_stmt *pStmt, int i, sqlite_int64 iValu SQLITE_API int sqlite3_bind_null(sqlite3_stmt *pStmt, int i){ int rc; Vdbe *p = (Vdbe*)pStmt; - rc = vdbeUnbind(p, i); + rc = vdbeUnbind(p, (u32)(i-1)); if( rc==SQLITE_OK ){ sqlite3_mutex_leave(p->db->mutex); } @@ -86615,7 +88915,7 @@ SQLITE_API int sqlite3_bind_pointer( ){ int rc; Vdbe *p = (Vdbe*)pStmt; - rc = vdbeUnbind(p, i); + rc = vdbeUnbind(p, (u32)(i-1)); if( rc==SQLITE_OK ){ sqlite3VdbeMemSetPointer(&p->aVar[i-1], pPtr, zPTtype, xDestructor); sqlite3_mutex_leave(p->db->mutex); @@ -86693,7 +88993,7 @@ SQLITE_API int sqlite3_bind_value(sqlite3_stmt *pStmt, int i, const sqlite3_valu SQLITE_API int sqlite3_bind_zeroblob(sqlite3_stmt *pStmt, int i, int n){ int rc; Vdbe *p = (Vdbe *)pStmt; - rc = vdbeUnbind(p, i); + rc = vdbeUnbind(p, (u32)(i-1)); if( rc==SQLITE_OK ){ #ifndef SQLITE_OMIT_INCRBLOB sqlite3VdbeMemSetZeroBlob(&p->aVar[i-1], n); @@ -86832,7 +89132,7 @@ SQLITE_API int sqlite3_stmt_isexplain(sqlite3_stmt *pStmt){ */ SQLITE_API int sqlite3_stmt_busy(sqlite3_stmt *pStmt){ Vdbe *v = (Vdbe*)pStmt; - return v!=0 && v->iVdbeMagic==VDBE_MAGIC_RUN && v->pc>=0; + return v!=0 && v->eVdbeState==VDBE_RUN_STATE; } /* @@ -86853,7 +89153,7 @@ SQLITE_API sqlite3_stmt *sqlite3_next_stmt(sqlite3 *pDb, sqlite3_stmt *pStmt){ if( pStmt==0 ){ pNext = (sqlite3_stmt*)pDb->pVdbe; }else{ - pNext = (sqlite3_stmt*)((Vdbe*)pStmt)->pNext; + pNext = (sqlite3_stmt*)((Vdbe*)pStmt)->pVNext; } sqlite3_mutex_leave(pDb->mutex); return pNext; @@ -86878,9 +89178,11 @@ SQLITE_API int sqlite3_stmt_status(sqlite3_stmt *pStmt, int op, int resetFlag){ sqlite3_mutex_enter(db->mutex); v = 0; db->pnBytesFreed = (int*)&v; - sqlite3VdbeClearObject(db, pVdbe); - sqlite3DbFree(db, pVdbe); + assert( db->lookaside.pEnd==db->lookaside.pTrueEnd ); + db->lookaside.pEnd = db->lookaside.pStart; + sqlite3VdbeDelete(pVdbe); db->pnBytesFreed = 0; + db->lookaside.pEnd = db->lookaside.pTrueEnd; sqlite3_mutex_leave(db->mutex); }else{ v = pVdbe->aCounter[op]; @@ -87672,12 +89974,12 @@ static VdbeCursor *allocateCursor( int nByte; VdbeCursor *pCx = 0; nByte = - ROUND8(sizeof(VdbeCursor)) + 2*sizeof(u32)*nField + + ROUND8P(sizeof(VdbeCursor)) + 2*sizeof(u32)*nField + (eCurType==CURTYPE_BTREE?sqlite3BtreeCursorSize():0); assert( iCur>=0 && iCurnCursor ); if( p->apCsr[iCur] ){ /*OPTIMIZATION-IF-FALSE*/ - sqlite3VdbeFreeCursor(p, p->apCsr[iCur]); + sqlite3VdbeFreeCursorNN(p, p->apCsr[iCur]); p->apCsr[iCur] = 0; } @@ -87707,7 +90009,7 @@ static VdbeCursor *allocateCursor( pCx->aOffset = &pCx->aType[nField]; if( eCurType==CURTYPE_BTREE ){ pCx->uc.pCursor = (BtCursor*) - &pMem->z[ROUND8(sizeof(VdbeCursor))+2*sizeof(u32)*nField]; + &pMem->z[ROUND8P(sizeof(VdbeCursor))+2*sizeof(u32)*nField]; sqlite3BtreeCursorZero(pCx->uc.pCursor); } return pCx; @@ -87720,7 +90022,8 @@ static VdbeCursor *allocateCursor( ** return false. */ static int alsoAnInt(Mem *pRec, double rValue, i64 *piValue){ - i64 iValue = (double)rValue; + i64 iValue; + iValue = sqlite3RealToI64(rValue); if( sqlite3RealSameAsInt(rValue,iValue) ){ *piValue = iValue; return 1; @@ -87882,17 +90185,18 @@ static u16 SQLITE_NOINLINE computeNumericType(Mem *pMem){ ** But it does set pMem->u.r and pMem->u.i appropriately. */ static u16 numericType(Mem *pMem){ - if( pMem->flags & (MEM_Int|MEM_Real|MEM_IntReal) ){ + assert( (pMem->flags & MEM_Null)==0 + || pMem->db==0 || pMem->db->mallocFailed ); + if( pMem->flags & (MEM_Int|MEM_Real|MEM_IntReal|MEM_Null) ){ testcase( pMem->flags & MEM_Int ); testcase( pMem->flags & MEM_Real ); testcase( pMem->flags & MEM_IntReal ); - return pMem->flags & (MEM_Int|MEM_Real|MEM_IntReal); - } - if( pMem->flags & (MEM_Str|MEM_Blob) ){ - testcase( pMem->flags & MEM_Str ); - testcase( pMem->flags & MEM_Blob ); - return computeNumericType(pMem); + return pMem->flags & (MEM_Int|MEM_Real|MEM_IntReal|MEM_Null); } + assert( pMem->flags & (MEM_Str|MEM_Blob) ); + testcase( pMem->flags & MEM_Str ); + testcase( pMem->flags & MEM_Blob ); + return computeNumericType(pMem); return 0; } @@ -88146,7 +90450,7 @@ SQLITE_PRIVATE int sqlite3VdbeExec( #endif /*** INSERT STACK UNION HERE ***/ - assert( p->iVdbeMagic==VDBE_MAGIC_RUN ); /* sqlite3_step() verifies this */ + assert( p->eVdbeState==VDBE_RUN_STATE ); /* sqlite3_step() verifies this */ sqlite3VdbeEnter(p); #ifndef SQLITE_OMIT_PROGRESS_CALLBACK if( db->xProgress ){ @@ -88389,26 +90693,39 @@ case OP_Gosub: { /* jump */ pIn1->flags = MEM_Int; pIn1->u.i = (int)(pOp-aOp); REGISTER_TRACE(pOp->p1, pIn1); - - /* Most jump operations do a goto to this spot in order to update - ** the pOp pointer. */ -jump_to_p2: - assert( pOp->p2>0 ); /* There are never any jumps to instruction 0 */ - assert( pOp->p2nOp ); /* Jumps must be in range */ - pOp = &aOp[pOp->p2 - 1]; - break; + goto jump_to_p2_and_check_for_interrupt; } -/* Opcode: Return P1 * * * * +/* Opcode: Return P1 P2 P3 * * +** +** Jump to the address stored in register P1. If P1 is a return address +** register, then this accomplishes a return from a subroutine. +** +** If P3 is 1, then the jump is only taken if register P1 holds an integer +** values, otherwise execution falls through to the next opcode, and the +** OP_Return becomes a no-op. If P3 is 0, then register P1 must hold an +** integer or else an assert() is raised. P3 should be set to 1 when +** this opcode is used in combination with OP_BeginSubrtn, and set to 0 +** otherwise. +** +** The value in register P1 is unchanged by this opcode. ** -** Jump to the next instruction after the address in register P1. After -** the jump, register P1 becomes undefined. +** P2 is not used by the byte-code engine. However, if P2 is positive +** and also less than the current address, then the "EXPLAIN" output +** formatter in the CLI will indent all opcodes from the P2 opcode up +** to be not including the current Return. P2 should be the first opcode +** in the subroutine from which this opcode is returning. Thus the P2 +** value is a byte-code indentation hint. See tag-20220407a in +** wherecode.c and shell.c. */ case OP_Return: { /* in1 */ pIn1 = &aMem[pOp->p1]; - assert( pIn1->flags==MEM_Int ); - pOp = &aOp[pIn1->u.i]; - pIn1->flags = MEM_Undefined; + if( pIn1->flags & MEM_Int ){ + if( pOp->p3 ){ VdbeBranchTaken(1, 2); } + pOp = &aOp[pIn1->u.i]; + }else if( ALWAYS(pOp->p3) ){ + VdbeBranchTaken(0, 2); + } break; } @@ -88431,7 +90748,14 @@ case OP_InitCoroutine: { /* jump */ assert( !VdbeMemDynamic(pOut) ); pOut->u.i = pOp->p3 - 1; pOut->flags = MEM_Int; - if( pOp->p2 ) goto jump_to_p2; + if( pOp->p2==0 ) break; + + /* Most jump operations do a goto to this spot in order to update + ** the pOp pointer. */ +jump_to_p2: + assert( pOp->p2>0 ); /* There are never any jumps to instruction 0 */ + assert( pOp->p2nOp ); /* Jumps must be in range */ + pOp = &aOp[pOp->p2 - 1]; break; } @@ -88533,11 +90857,10 @@ case OP_Halt: { VdbeFrame *pFrame; int pcx; - pcx = (int)(pOp - aOp); #ifdef SQLITE_DEBUG if( pOp->p2==OE_Abort ){ sqlite3VdbeAssertAbortable(p); } #endif - if( pOp->p1==SQLITE_OK && p->pFrame ){ + if( p->pFrame && pOp->p1==SQLITE_OK ){ /* Halt the sub-program. Return control to the parent frame. */ pFrame = p->pFrame; p->pFrame = pFrame->pParent; @@ -88559,7 +90882,6 @@ case OP_Halt: { } p->rc = pOp->p1; p->errorAction = (u8)pOp->p2; - p->pc = pcx; assert( pOp->p5<=4 ); if( p->rc ){ if( pOp->p5 ){ @@ -88576,6 +90898,7 @@ case OP_Halt: { }else{ sqlite3VdbeError(p, "%s", pOp->p4.z); } + pcx = (int)(pOp - aOp); sqlite3_log(pOp->p1, "abort at %d in [%s]: %s", pcx, p->zSql, p->zErrMsg); } rc = sqlite3VdbeHalt(p); @@ -88701,6 +91024,28 @@ case OP_String: { /* out2 */ break; } +/* Opcode: BeginSubrtn * P2 * * * +** Synopsis: r[P2]=NULL +** +** Mark the beginning of a subroutine that can be entered in-line +** or that can be called using OP_Gosub. The subroutine should +** be terminated by an OP_Return instruction that has a P1 operand that +** is the same as the P2 operand to this opcode and that has P3 set to 1. +** If the subroutine is entered in-line, then the OP_Return will simply +** fall through. But if the subroutine is entered using OP_Gosub, then +** the OP_Return will jump back to the first instruction after the OP_Gosub. +** +** This routine works by loading a NULL into the P2 register. When the +** return address register contains a NULL, the OP_Return instruction is +** a no-op that simply falls through to the next instruction (assuming that +** the OP_Return opcode has a P3 value of 1). Thus if the subroutine is +** entered in-line, then the OP_Return will cause in-line execution to +** continue. But if the subroutine is entered via OP_Gosub, then the +** OP_Return will cause a return to the address following the OP_Gosub. +** +** This opcode is identical to OP_Null. It has a different name +** only to make the byte code easier to read and verify. +*/ /* Opcode: Null P1 P2 P3 * * ** Synopsis: r[P2..P3]=NULL ** @@ -88713,6 +91058,7 @@ case OP_String: { /* out2 */ ** NULL values will not compare equal even if SQLITE_NULLEQ is set on ** OP_Ne or OP_Eq. */ +case OP_BeginSubrtn: case OP_Null: { /* out2 */ int cnt; u16 nullFlag; @@ -88843,11 +91189,16 @@ case OP_Move: { break; } -/* Opcode: Copy P1 P2 P3 * * +/* Opcode: Copy P1 P2 P3 * P5 ** Synopsis: r[P2@P3+1]=r[P1@P3+1] ** ** Make a copy of registers P1..P1+P3 into registers P2..P2+P3. ** +** If the 0x0002 bit of P5 is set then also clear the MEM_Subtype flag in the +** destination. The 0x0001 bit of P5 indicates that this Copy opcode cannot +** be merged. The 0x0001 bit is used by the query planner and does not +** come into play during query execution. +** ** This instruction makes a deep copy of the value. A duplicate ** is made of any string or blob constant. See also OP_SCopy. */ @@ -88862,6 +91213,9 @@ case OP_Copy: { memAboutToChange(p, pOut); sqlite3VdbeMemShallowCopy(pOut, pIn1, MEM_Ephem); Deephemeralize(pOut); + if( (pOut->flags & MEM_Subtype)!=0 && (pOp->p5 & 0x0002)!=0 ){ + pOut->flags &= ~MEM_Subtype; + } #ifdef SQLITE_DEBUG pOut->pScopyFrom = 0; #endif @@ -88942,45 +91296,32 @@ case OP_FkCheck: { ** the result row. */ case OP_ResultRow: { - Mem *pMem; - int i; assert( p->nResColumn==pOp->p2 ); assert( pOp->p1>0 || CORRUPT_DB ); assert( pOp->p1+pOp->p2<=(p->nMem+1 - p->nCursor)+1 ); - /* Invalidate all ephemeral cursor row caches */ p->cacheCtr = (p->cacheCtr + 2)|1; - - /* Make sure the results of the current row are \000 terminated - ** and have an assigned type. The results are de-ephemeralized as - ** a side effect. - */ - pMem = p->pResultSet = &aMem[pOp->p1]; - for(i=0; ip2; i++){ - assert( memIsValid(&pMem[i]) ); - Deephemeralize(&pMem[i]); - assert( (pMem[i].flags & MEM_Ephem)==0 - || (pMem[i].flags & (MEM_Str|MEM_Blob))==0 ); - sqlite3VdbeMemNulTerminate(&pMem[i]); - REGISTER_TRACE(pOp->p1+i, &pMem[i]); + p->pResultSet = &aMem[pOp->p1]; #ifdef SQLITE_DEBUG - /* The registers in the result will not be used again when the - ** prepared statement restarts. This is because sqlite3_column() - ** APIs might have caused type conversions of made other changes to - ** the register values. Therefore, we can go ahead and break any - ** OP_SCopy dependencies. */ - pMem[i].pScopyFrom = 0; -#endif + { + Mem *pMem = p->pResultSet; + int i; + for(i=0; ip2; i++){ + assert( memIsValid(&pMem[i]) ); + REGISTER_TRACE(pOp->p1+i, &pMem[i]); + /* The registers in the result will not be used again when the + ** prepared statement restarts. This is because sqlite3_column() + ** APIs might have caused type conversions of made other changes to + ** the register values. Therefore, we can go ahead and break any + ** OP_SCopy dependencies. */ + pMem[i].pScopyFrom = 0; + } } +#endif if( db->mallocFailed ) goto no_mem; - if( db->mTrace & SQLITE_TRACE_ROW ){ db->trace.xV2(SQLITE_TRACE_ROW, db->pTraceArg, p, 0); } - - - /* Return SQLITE_ROW - */ p->pc = (int)(pOp - aOp) + 1; rc = SQLITE_ROW; goto vdbe_return; @@ -89035,7 +91376,7 @@ case OP_Concat: { /* same as TK_CONCAT, in1, in2, out3 */ if( nByte>db->aLimit[SQLITE_LIMIT_LENGTH] ){ goto too_big; } - if( sqlite3VdbeMemGrow(pOut, (int)nByte+3, pOut==pIn2) ){ + if( sqlite3VdbeMemGrow(pOut, (int)nByte+2, pOut==pIn2) ){ goto no_mem; } MemSetTypeFlag(pOut, MEM_Str); @@ -89047,9 +91388,9 @@ case OP_Concat: { /* same as TK_CONCAT, in1, in2, out3 */ memcpy(&pOut->z[pIn2->n], pIn1->z, pIn1->n); assert( (pIn1->flags & MEM_Dyn) == (flags1 & MEM_Dyn) ); pIn1->flags = flags1; + if( encoding>SQLITE_UTF8 ) nByte &= ~1; pOut->z[nByte]=0; pOut->z[nByte+1] = 0; - pOut->z[nByte+2] = 0; pOut->flags |= MEM_Term; pOut->n = (int)nByte; pOut->enc = encoding; @@ -89100,7 +91441,6 @@ case OP_Subtract: /* same as TK_MINUS, in1, in2, out3 */ case OP_Multiply: /* same as TK_STAR, in1, in2, out3 */ case OP_Divide: /* same as TK_SLASH, in1, in2, out3 */ case OP_Remainder: { /* same as TK_REM, in1, in2, out3 */ - u16 flags; /* Combined MEM_* flags from both inputs */ u16 type1; /* Numeric type of left operand */ u16 type2; /* Numeric type of right operand */ i64 iA; /* Integer value of left operand */ @@ -89109,12 +91449,12 @@ case OP_Remainder: { /* same as TK_REM, in1, in2, out3 */ double rB; /* Real value of right operand */ pIn1 = &aMem[pOp->p1]; - type1 = numericType(pIn1); + type1 = pIn1->flags; pIn2 = &aMem[pOp->p2]; - type2 = numericType(pIn2); + type2 = pIn2->flags; pOut = &aMem[pOp->p3]; - flags = pIn1->flags | pIn2->flags; if( (type1 & type2 & MEM_Int)!=0 ){ +int_math: iA = pIn1->u.i; iB = pIn2->u.i; switch( pOp->opcode ){ @@ -89136,9 +91476,12 @@ case OP_Remainder: { /* same as TK_REM, in1, in2, out3 */ } pOut->u.i = iB; MemSetTypeFlag(pOut, MEM_Int); - }else if( (flags & MEM_Null)!=0 ){ + }else if( ((type1 | type2) & MEM_Null)!=0 ){ goto arithmetic_result_is_null; }else{ + type1 = numericType(pIn1); + type2 = numericType(pIn2); + if( (type1 & type2 & MEM_Int)!=0 ) goto int_math; fp_math: rA = sqlite3VdbeRealValue(pIn1); rB = sqlite3VdbeRealValue(pIn2); @@ -89494,23 +91837,23 @@ case OP_Ge: { /* same as TK_GE, jump, in1, in3 */ assert( (pOp->p5 & SQLITE_AFF_MASK)!=SQLITE_AFF_TEXT || CORRUPT_DB ); /* Common case of comparison of two integers */ if( pIn3->u.i > pIn1->u.i ){ - iCompare = +1; if( sqlite3aGTb[pOp->opcode] ){ VdbeBranchTaken(1, (pOp->p5 & SQLITE_NULLEQ)?2:3); goto jump_to_p2; } + iCompare = +1; }else if( pIn3->u.i < pIn1->u.i ){ - iCompare = -1; if( sqlite3aLTb[pOp->opcode] ){ VdbeBranchTaken(1, (pOp->p5 & SQLITE_NULLEQ)?2:3); goto jump_to_p2; } + iCompare = -1; }else{ - iCompare = 0; if( sqlite3aEQb[pOp->opcode] ){ VdbeBranchTaken(1, (pOp->p5 & SQLITE_NULLEQ)?2:3); goto jump_to_p2; } + iCompare = 0; } VdbeBranchTaken(0, (pOp->p5 & SQLITE_NULLEQ)?2:3); break; @@ -89537,11 +91880,11 @@ case OP_Ge: { /* same as TK_GE, jump, in1, in3 */ ** then the result is always NULL. ** The jump is taken if the SQLITE_JUMPIFNULL bit is set. */ - iCompare = 1; /* Operands are not equal */ VdbeBranchTaken(2,3); if( pOp->p5 & SQLITE_JUMPIFNULL ){ goto jump_to_p2; } + iCompare = 1; /* Operands are not equal */ break; } }else{ @@ -89647,9 +91990,8 @@ case OP_ElseEq: { /* same as TK_ESCAPE, jump */ ** Set the permutation used by the OP_Compare operator in the next ** instruction. The permutation is stored in the P4 operand. ** -** The permutation is only valid until the next OP_Compare that has -** the OPFLAG_PERMUTE bit set in P5. Typically the OP_Permutation should -** occur immediately prior to the OP_Compare. +** The permutation is only valid for the next opcode which must be +** an OP_Compare that has the OPFLAG_PERMUTE bit set in P5. ** ** The first integer in the P4 integer array is the length of the array ** and does not become part of the permutation. @@ -89681,6 +92023,8 @@ case OP_Permutation: { ** The comparison is a sort comparison, so NULLs compare equal, ** NULLs are less than numbers, numbers are less than strings, ** and strings are less than blobs. +** +** This opcode must be immediately followed by an OP_Jump opcode. */ case OP_Compare: { int n; @@ -89739,6 +92083,7 @@ case OP_Compare: { break; } } + assert( pOp[1].opcode==OP_Jump ); break; } @@ -89747,8 +92092,11 @@ case OP_Compare: { ** Jump to the instruction at address P1, P2, or P3 depending on whether ** in the most recent OP_Compare instruction the P1 vector was less than ** equal to, or greater than the P2 vector, respectively. +** +** This opcode must immediately follow an OP_Compare opcode. */ case OP_Jump: { /* jump */ + assert( pOp>aOp && pOp[-1].opcode==OP_Compare ); if( iCompare<0 ){ VdbeBranchTaken(0,4); pOp = &aOp[pOp->p1 - 1]; }else if( iCompare==0 ){ @@ -89948,19 +92296,90 @@ case OP_IsNull: { /* same as TK_ISNULL, jump, in1 */ break; } -/* Opcode: IsNullOrType P1 P2 P3 * * -** Synopsis: if typeof(r[P1]) IN (P3,5) goto P2 +/* Opcode: IsType P1 P2 P3 P4 P5 +** Synopsis: if typeof(P1.P3) in P5 goto P2 +** +** Jump to P2 if the type of a column in a btree is one of the types specified +** by the P5 bitmask. +** +** P1 is normally a cursor on a btree for which the row decode cache is +** valid through at least column P3. In other words, there should have been +** a prior OP_Column for column P3 or greater. If the cursor is not valid, +** then this opcode might give spurious results. +** The the btree row has fewer than P3 columns, then use P4 as the +** datatype. +** +** If P1 is -1, then P3 is a register number and the datatype is taken +** from the value in that register. +** +** P5 is a bitmask of data types. SQLITE_INTEGER is the least significant +** (0x01) bit. SQLITE_FLOAT is the 0x02 bit. SQLITE_TEXT is 0x04. +** SQLITE_BLOB is 0x08. SQLITE_NULL is 0x10. +** +** Take the jump to address P2 if and only if the datatype of the +** value determined by P1 and P3 corresponds to one of the bits in the +** P5 bitmask. ** -** Jump to P2 if the value in register P1 is NULL or has a datatype P3. -** P3 is an integer which should be one of SQLITE_INTEGER, SQLITE_FLOAT, -** SQLITE_BLOB, SQLITE_NULL, or SQLITE_TEXT. */ -case OP_IsNullOrType: { /* jump, in1 */ - int doTheJump; - pIn1 = &aMem[pOp->p1]; - doTheJump = (pIn1->flags & MEM_Null)!=0 || sqlite3_value_type(pIn1)==pOp->p3; - VdbeBranchTaken( doTheJump, 2); - if( doTheJump ) goto jump_to_p2; +case OP_IsType: { /* jump */ + VdbeCursor *pC; + u16 typeMask; + u32 serialType; + + assert( pOp->p1>=(-1) && pOp->p1nCursor ); + assert( pOp->p1>=0 || (pOp->p3>=0 && pOp->p3<=(p->nMem+1 - p->nCursor)) ); + if( pOp->p1>=0 ){ + pC = p->apCsr[pOp->p1]; + assert( pC!=0 ); + assert( pOp->p3>=0 ); + if( pOp->p3nHdrParsed ){ + serialType = pC->aType[pOp->p3]; + if( serialType>=12 ){ + if( serialType&1 ){ + typeMask = 0x04; /* SQLITE_TEXT */ + }else{ + typeMask = 0x08; /* SQLITE_BLOB */ + } + }else{ + static const unsigned char aMask[] = { + 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x2, + 0x01, 0x01, 0x10, 0x10 + }; + testcase( serialType==0 ); + testcase( serialType==1 ); + testcase( serialType==2 ); + testcase( serialType==3 ); + testcase( serialType==4 ); + testcase( serialType==5 ); + testcase( serialType==6 ); + testcase( serialType==7 ); + testcase( serialType==8 ); + testcase( serialType==9 ); + testcase( serialType==10 ); + testcase( serialType==11 ); + typeMask = aMask[serialType]; + } + }else{ + typeMask = 1 << (pOp->p4.i - 1); + testcase( typeMask==0x01 ); + testcase( typeMask==0x02 ); + testcase( typeMask==0x04 ); + testcase( typeMask==0x08 ); + testcase( typeMask==0x10 ); + } + }else{ + assert( memIsValid(&aMem[pOp->p3]) ); + typeMask = 1 << (sqlite3_value_type((sqlite3_value*)&aMem[pOp->p3])-1); + testcase( typeMask==0x01 ); + testcase( typeMask==0x02 ); + testcase( typeMask==0x04 ); + testcase( typeMask==0x08 ); + testcase( typeMask==0x10 ); + } + VdbeBranchTaken( (typeMask & pOp->p5)!=0, 2); + if( typeMask & pOp->p5 ){ + goto jump_to_p2; + } break; } @@ -90003,11 +92422,14 @@ case OP_NotNull: { /* same as TK_NOTNULL, jump, in1 */ ** If it is, then set register P3 to NULL and jump immediately to P2. ** If P1 is not on a NULL row, then fall through without making any ** changes. +** +** If P1 is not an open cursor, then this opcode is a no-op. */ case OP_IfNullRow: { /* jump */ + VdbeCursor *pC; assert( pOp->p1>=0 && pOp->p1nCursor ); - assert( p->apCsr[pOp->p1]!=0 ); - if( p->apCsr[pOp->p1]->nullRow ){ + pC = p->apCsr[pOp->p1]; + if( ALWAYS(pC) && pC->nullRow ){ sqlite3VdbeMemSetNull(aMem + pOp->p3); goto jump_to_p2; } @@ -90053,12 +92475,12 @@ case OP_Offset: { /* out3 */ #endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */ /* Opcode: Column P1 P2 P3 P4 P5 -** Synopsis: r[P3]=PX +** Synopsis: r[P3]=PX cursor P1 column P2 ** ** Interpret the data that cursor P1 points to as a structure built using ** the MakeRecord instruction. (See the MakeRecord opcode for additional ** information about the format of the data.) Extract the P2-th column -** from this record. If there are less that (P2+1) +** from this record. If there are less than (P2+1) ** values in the record, extract a NULL. ** ** The value extracted is stored in register P3. @@ -90067,15 +92489,17 @@ case OP_Offset: { /* out3 */ ** if the P4 argument is a P4_MEM use the value of the P4 argument as ** the result. ** -** If the OPFLAG_LENGTHARG and OPFLAG_TYPEOFARG bits are set on P5 then -** the result is guaranteed to only be used as the argument of a length() -** or typeof() function, respectively. The loading of large blobs can be -** skipped for length() and all content loading can be skipped for typeof(). +** If the OPFLAG_LENGTHARG bit is set in P5 then the result is guaranteed +** to only be used by the length() function or the equivalent. The content +** of large blobs is not loaded, thus saving CPU cycles. If the +** OPFLAG_TYPEOFARG bit is set then the result will only be used by the +** typeof() function or the IS NULL or IS NOT NULL operators or the +** equivalent. In this case, all content loading can be omitted. */ case OP_Column: { u32 p2; /* column number to retrieve */ VdbeCursor *pC; /* The VDBE cursor */ - BtCursor *pCrsr; /* The BTree cursor */ + BtCursor *pCrsr; /* The B-Tree cursor corresponding to pC */ u32 *aOffset; /* aOffset[i] is offset to start of data for i-th column */ int len; /* The length of the serialized data for the column */ int i; /* Loop counter */ @@ -90089,21 +92513,14 @@ case OP_Column: { Mem *pReg; /* PseudoTable input register */ assert( pOp->p1>=0 && pOp->p1nCursor ); + assert( pOp->p3>0 && pOp->p3<=(p->nMem+1 - p->nCursor) ); pC = p->apCsr[pOp->p1]; - assert( pC!=0 ); p2 = (u32)pOp->p2; - /* If the cursor cache is stale (meaning it is not currently point at - ** the correct row) then bring it up-to-date by doing the necessary - ** B-Tree seek. */ - rc = sqlite3VdbeCursorMoveto(&pC, &p2); - if( rc ) goto abort_due_to_error; - - assert( pOp->p3>0 && pOp->p3<=(p->nMem+1 - p->nCursor) ); - pDest = &aMem[pOp->p3]; - memAboutToChange(p, pDest); +op_column_restart: assert( pC!=0 ); - assert( p2<(u32)pC->nField ); + assert( p2<(u32)pC->nField + || (pC->eCurType==CURTYPE_PSEUDO && pC->seekResult==0) ); aOffset = pC->aOffset; assert( aOffset==pC->aType+pC->nField ); assert( pC->eCurType!=CURTYPE_VTAB ); @@ -90112,21 +92529,37 @@ case OP_Column: { if( pC->cacheStatus!=p->cacheCtr ){ /*OPTIMIZATION-IF-FALSE*/ if( pC->nullRow ){ - if( pC->eCurType==CURTYPE_PSEUDO ){ + if( pC->eCurType==CURTYPE_PSEUDO && pC->seekResult>0 ){ /* For the special case of as pseudo-cursor, the seekResult field ** identifies the register that holds the record */ - assert( pC->seekResult>0 ); pReg = &aMem[pC->seekResult]; assert( pReg->flags & MEM_Blob ); assert( memIsValid(pReg) ); pC->payloadSize = pC->szRow = pReg->n; pC->aRow = (u8*)pReg->z; }else{ + pDest = &aMem[pOp->p3]; + memAboutToChange(p, pDest); sqlite3VdbeMemSetNull(pDest); goto op_column_out; } }else{ pCrsr = pC->uc.pCursor; + if( pC->deferredMoveto ){ + u32 iMap; + assert( !pC->isEphemeral ); + if( pC->ub.aAltMap && (iMap = pC->ub.aAltMap[1+p2])>0 ){ + pC = pC->pAltCursor; + p2 = iMap - 1; + goto op_column_restart; + } + rc = sqlite3VdbeFinishMoveto(pC); + if( rc ) goto abort_due_to_error; + }else if( sqlite3BtreeCursorHasMoved(pCrsr) ){ + rc = sqlite3VdbeHandleMovedCursor(pC); + if( rc ) goto abort_due_to_error; + goto op_column_restart; + } assert( pC->eCurType==CURTYPE_BTREE ); assert( pCrsr ); assert( sqlite3BtreeCursorIsValid(pCrsr) ); @@ -90134,15 +92567,15 @@ case OP_Column: { pC->aRow = sqlite3BtreePayloadFetch(pCrsr, &pC->szRow); assert( pC->szRow<=pC->payloadSize ); assert( pC->szRow<=65536 ); /* Maximum page size is 64KiB */ - if( pC->payloadSize > (u32)db->aLimit[SQLITE_LIMIT_LENGTH] ){ - goto too_big; - } } pC->cacheStatus = p->cacheCtr; - pC->iHdrOffset = getVarint32(pC->aRow, aOffset[0]); + if( (aOffset[0] = pC->aRow[0])<0x80 ){ + pC->iHdrOffset = 1; + }else{ + pC->iHdrOffset = sqlite3GetVarint32(pC->aRow, aOffset); + } pC->nHdrParsed = 0; - if( pC->szRowaRow does not have to hold the entire row, but it does at least ** need to cover the header of the record. If pC->aRow does not contain @@ -90182,6 +92615,10 @@ case OP_Column: { testcase( aOffset[0]==0 ); goto op_column_read_header; } + }else if( sqlite3BtreeCursorHasMoved(pC->uc.pCursor) ){ + rc = sqlite3VdbeHandleMovedCursor(pC); + if( rc ) goto abort_due_to_error; + goto op_column_restart; } /* Make sure at least the first p2+1 entries of the header have been @@ -90250,6 +92687,8 @@ case OP_Column: { ** columns. So the result will be either the default value or a NULL. */ if( pC->nHdrParsed<=p2 ){ + pDest = &aMem[pOp->p3]; + memAboutToChange(p, pDest); if( pOp->p4type==P4_MEM ){ sqlite3VdbeMemShallowCopy(pDest, pOp->p4.pMem, MEM_Static); }else{ @@ -90267,6 +92706,8 @@ case OP_Column: { */ assert( p2nHdrParsed ); assert( rc==SQLITE_OK ); + pDest = &aMem[pOp->p3]; + memAboutToChange(p, pDest); assert( sqlite3VdbeCheckMemInvariants(pDest) ); if( VdbeMemDynamic(pDest) ){ sqlite3VdbeMemSetNull(pDest); @@ -90287,6 +92728,7 @@ case OP_Column: { pDest->n = len = (t-12)/2; pDest->enc = encoding; if( pDest->szMalloc < len+2 ){ + if( len>db->aLimit[SQLITE_LIMIT_LENGTH] ) goto too_big; pDest->flags = MEM_Null; if( sqlite3VdbeMemGrow(pDest, len+2, 0) ) goto no_mem; }else{ @@ -90319,6 +92761,7 @@ case OP_Column: { */ sqlite3VdbeSerialGet((u8*)sqlite3CtypeMap, t, pDest); }else{ + if( len>db->aLimit[SQLITE_LIMIT_LENGTH] ) goto too_big; rc = sqlite3VdbeMemFromBtree(pC->uc.pCursor, aOffset[p2], len, pDest); if( rc!=SQLITE_OK ) goto abort_due_to_error; sqlite3VdbeSerialGet((const u8*)pDest->z, t, pDest); @@ -90531,7 +92974,6 @@ case OP_MakeRecord: { Mem *pLast; /* Last field of the record */ int nField; /* Number of fields in the record */ char *zAffinity; /* The affinity string for the record */ - int file_format; /* File format to use for encoding */ u32 len; /* Length of a field */ u8 *zHdr; /* Where to write next byte of the header */ u8 *zPayload; /* Where to write next byte of the payload */ @@ -90560,7 +93002,6 @@ case OP_MakeRecord: { pData0 = &aMem[nField]; nField = pOp->p2; pLast = &pData0[nField-1]; - file_format = p->minWriteFileFormat; /* Identify the output register */ assert( pOp->p3p1 || pOp->p3>=pOp->p1+pOp->p2 ); @@ -90662,7 +93103,7 @@ case OP_MakeRecord: { testcase( uu==2147483647 ); testcase( uu==2147483648LL ); testcase( uu==140737488355327LL ); testcase( uu==140737488355328LL ); if( uu<=127 ){ - if( (i&1)==i && file_format>=4 ){ + if( (i&1)==i && p->minWriteFileFormat>=4 ){ pRec->uTemp = 8+(u32)uu; }else{ nData++; @@ -90767,18 +93208,60 @@ case OP_MakeRecord: { zPayload = zHdr + nHdr; /* Write the record */ - zHdr += putVarint32(zHdr, nHdr); + if( nHdr<0x80 ){ + *(zHdr++) = nHdr; + }else{ + zHdr += sqlite3PutVarint(zHdr,nHdr); + } assert( pData0<=pLast ); pRec = pData0; - do{ + while( 1 /*exit-by-break*/ ){ serial_type = pRec->uTemp; /* EVIDENCE-OF: R-06529-47362 Following the size varint are one or more - ** additional varints, one per column. */ - zHdr += putVarint32(zHdr, serial_type); /* serial type */ - /* EVIDENCE-OF: R-64536-51728 The values for each column in the record + ** additional varints, one per column. + ** EVIDENCE-OF: R-64536-51728 The values for each column in the record ** immediately follow the header. */ - zPayload += sqlite3VdbeSerialPut(zPayload, pRec, serial_type); /* content */ - }while( (++pRec)<=pLast ); + if( serial_type<=7 ){ + *(zHdr++) = serial_type; + if( serial_type==0 ){ + /* NULL value. No change in zPayload */ + }else{ + u64 v; + u32 i; + if( serial_type==7 ){ + assert( sizeof(v)==sizeof(pRec->u.r) ); + memcpy(&v, &pRec->u.r, sizeof(v)); + swapMixedEndianFloat(v); + }else{ + v = pRec->u.i; + } + len = i = sqlite3SmallTypeSizes[serial_type]; + assert( i>0 ); + while( 1 /*exit-by-break*/ ){ + zPayload[--i] = (u8)(v&0xFF); + if( i==0 ) break; + v >>= 8; + } + zPayload += len; + } + }else if( serial_type<0x80 ){ + *(zHdr++) = serial_type; + if( serial_type>=14 && pRec->n>0 ){ + assert( pRec->z!=0 ); + memcpy(zPayload, pRec->z, pRec->n); + zPayload += pRec->n; + } + }else{ + zHdr += sqlite3PutVarint(zHdr, serial_type); + if( pRec->n ){ + assert( pRec->z!=0 ); + memcpy(zPayload, pRec->z, pRec->n); + zPayload += pRec->n; + } + } + if( pRec==pLast ) break; + pRec++; + } assert( nHdr==(int)(zHdr - (u8*)pOut->z) ); assert( nByte==(int)(zPayload - (u8*)pOut->z) ); @@ -90997,7 +93480,10 @@ case OP_Savepoint: { } } if( rc ) goto abort_due_to_error; - + if( p->eVdbeState==VDBE_HALT_STATE ){ + rc = SQLITE_DONE; + goto vdbe_return; + } break; } @@ -91101,6 +93587,7 @@ case OP_AutoCommit: { */ case OP_Transaction: { Btree *pBt; + Db *pDb; int iMeta = 0; assert( p->bIsReader ); @@ -91120,7 +93607,8 @@ case OP_Transaction: { } goto abort_due_to_error; } - pBt = db->aDb[pOp->p1].pBt; + pDb = &db->aDb[pOp->p1]; + pBt = pDb->pBt; if( pBt ){ rc = sqlite3BtreeBeginTrans(pBt, pOp->p2, &iMeta); @@ -91161,8 +93649,7 @@ case OP_Transaction: { assert( pOp->p5==0 || pOp->p4type==P4_INT32 ); if( rc==SQLITE_OK && pOp->p5 - && (iMeta!=pOp->p3 - || db->aDb[pOp->p1].pSchema->iGeneration!=pOp->p4.i) + && (iMeta!=pOp->p3 || pDb->pSchema->iGeneration!=pOp->p4.i) ){ /* ** IMPLEMENTATION-OF: R-03189-51135 As each SQL statement runs, the schema @@ -91189,6 +93676,11 @@ case OP_Transaction: { } p->expired = 1; rc = SQLITE_SCHEMA; + + /* Set changeCntOn to 0 to prevent the value returned by sqlite3_changes() + ** from being modified in sqlite3VdbeHalt(). If this statement is + ** reprepared, changeCntOn will be set again. */ + p->changeCntOn = 0; } if( rc ) goto abort_due_to_error; break; @@ -91255,7 +93747,7 @@ case OP_SetCookie: { rc = sqlite3BtreeUpdateMeta(pDb->pBt, pOp->p2, pOp->p3); if( pOp->p2==BTREE_SCHEMA_VERSION ){ /* When the schema cookie changes, record the new cookie internally */ - pDb->pSchema->schema_cookie = pOp->p3 - pOp->p5; + *(u32*)&pDb->pSchema->schema_cookie = *(u32*)&pOp->p3 - pOp->p5; db->mDbFlags |= DBFLAG_SchemaChange; sqlite3FkClearTriggerCache(db, pOp->p1); }else if( pOp->p2==BTREE_FILE_FORMAT ){ @@ -91488,8 +93980,8 @@ case OP_OpenDup: { pCx->pgnoRoot = pOrig->pgnoRoot; pCx->isOrdered = pOrig->isOrdered; pCx->ub.pBtx = pOrig->ub.pBtx; - pCx->hasBeenDuped = 1; - pOrig->hasBeenDuped = 1; + pCx->noReuse = 1; + pOrig->noReuse = 1; rc = sqlite3BtreeCursor(pCx->ub.pBtx, pCx->pgnoRoot, BTREE_WRCSR, pCx->pKeyInfo, pCx->uc.pCursor); /* The sqlite3BtreeCursor() routine can only fail for the first cursor @@ -91556,7 +94048,7 @@ case OP_OpenEphemeral: { aMem[pOp->p3].z = ""; } pCx = p->apCsr[pOp->p1]; - if( pCx && !pCx->hasBeenDuped && ALWAYS(pOp->p2<=pCx->nField) ){ + if( pCx && !pCx->noReuse && ALWAYS(pOp->p2<=pCx->nField) ){ /* If the ephermeral table is already open and has no duplicates from ** OP_OpenDup, then erase all existing content so that the table is ** empty again, rather than creating a new table. */ @@ -91941,7 +94433,13 @@ case OP_SeekGT: { /* jump, in3, group */ r.aMem = &aMem[pOp->p3]; #ifdef SQLITE_DEBUG - { int i; for(i=0; i0 ) REGISTER_TRACE(pOp->p3+i, &r.aMem[i]); + } + } #endif r.eqSeen = 0; rc = sqlite3BtreeIndexMoveto(pC->uc.pCursor, &r, &res); @@ -92004,7 +94502,7 @@ seek_not_found: } -/* Opcode: SeekScan P1 P2 * * * +/* Opcode: SeekScan P1 P2 * * P5 ** Synopsis: Scan-ahead up to P1 rows ** ** This opcode is a prefix opcode to OP_SeekGE. In other words, this @@ -92014,8 +94512,8 @@ seek_not_found: ** This opcode uses the P1 through P4 operands of the subsequent ** OP_SeekGE. In the text that follows, the operands of the subsequent ** OP_SeekGE opcode are denoted as SeekOP.P1 through SeekOP.P4. Only -** the P1 and P2 operands of this opcode are also used, and are called -** This.P1 and This.P2. +** the P1, P2 and P5 operands of this opcode are also used, and are called +** This.P1, This.P2 and This.P5. ** ** This opcode helps to optimize IN operators on a multi-column index ** where the IN operator is on the later terms of the index by avoiding @@ -92025,29 +94523,51 @@ seek_not_found: ** ** The SeekGE.P3 and SeekGE.P4 operands identify an unpacked key which ** is the desired entry that we want the cursor SeekGE.P1 to be pointing -** to. Call this SeekGE.P4/P5 row the "target". +** to. Call this SeekGE.P3/P4 row the "target". ** ** If the SeekGE.P1 cursor is not currently pointing to a valid row, ** then this opcode is a no-op and control passes through into the OP_SeekGE. ** ** If the SeekGE.P1 cursor is pointing to a valid row, then that row ** might be the target row, or it might be near and slightly before the -** target row. This opcode attempts to position the cursor on the target -** row by, perhaps by invoking sqlite3BtreeStep() on the cursor -** between 0 and This.P1 times. -** -** There are three possible outcomes from this opcode:

      -** -**
    1. If after This.P1 steps, the cursor is still pointing to a place that -** is earlier in the btree than the target row, then fall through -** into the subsquence OP_SeekGE opcode. -** -**
    2. If the cursor is successfully moved to the target row by 0 or more -** sqlite3BtreeNext() calls, then jump to This.P2, which will land just -** past the OP_IdxGT or OP_IdxGE opcode that follows the OP_SeekGE. -** -**
    3. If the cursor ends up past the target row (indicating the the target -** row does not exist in the btree) then jump to SeekOP.P2. +** target row, or it might be after the target row. If the cursor is +** currently before the target row, then this opcode attempts to position +** the cursor on or after the target row by invoking sqlite3BtreeStep() +** on the cursor between 1 and This.P1 times. +** +** The This.P5 parameter is a flag that indicates what to do if the +** cursor ends up pointing at a valid row that is past the target +** row. If This.P5 is false (0) then a jump is made to SeekGE.P2. If +** This.P5 is true (non-zero) then a jump is made to This.P2. The P5==0 +** case occurs when there are no inequality constraints to the right of +** the IN constraing. The jump to SeekGE.P2 ends the loop. The P5!=0 case +** occurs when there are inequality constraints to the right of the IN +** operator. In that case, the This.P2 will point either directly to or +** to setup code prior to the OP_IdxGT or OP_IdxGE opcode that checks for +** loop terminate. +** +** Possible outcomes from this opcode:
        +** +**
      1. If the cursor is initally not pointed to any valid row, then +** fall through into the subsequent OP_SeekGE opcode. +** +**
      2. If the cursor is left pointing to a row that is before the target +** row, even after making as many as This.P1 calls to +** sqlite3BtreeNext(), then also fall through into OP_SeekGE. +** +**
      3. If the cursor is left pointing at the target row, either because it +** was at the target row to begin with or because one or more +** sqlite3BtreeNext() calls moved the cursor to the target row, +** then jump to This.P2.., +** +**
      4. If the cursor started out before the target row and a call to +** to sqlite3BtreeNext() moved the cursor off the end of the index +** (indicating that the target row definitely does not exist in the +** btree) then jump to SeekGE.P2, ending the loop. +** +**
      5. If the cursor ends up on a valid row that is past the target row +** (indicating that the target row does not exist in the btree) then +** jump to SeekOP.P2 if This.P5==0 or to This.P2 if This.P5>0. **
      */ case OP_SeekScan: { @@ -92058,14 +94578,25 @@ case OP_SeekScan: { assert( pOp[1].opcode==OP_SeekGE ); - /* pOp->p2 points to the first instruction past the OP_IdxGT that - ** follows the OP_SeekGE. */ + /* If pOp->p5 is clear, then pOp->p2 points to the first instruction past the + ** OP_IdxGT that follows the OP_SeekGE. Otherwise, it points to the first + ** opcode past the OP_SeekGE itself. */ assert( pOp->p2>=(int)(pOp-aOp)+2 ); - assert( aOp[pOp->p2-1].opcode==OP_IdxGT || aOp[pOp->p2-1].opcode==OP_IdxGE ); - testcase( aOp[pOp->p2-1].opcode==OP_IdxGE ); - assert( pOp[1].p1==aOp[pOp->p2-1].p1 ); - assert( pOp[1].p2==aOp[pOp->p2-1].p2 ); - assert( pOp[1].p3==aOp[pOp->p2-1].p3 ); +#ifdef SQLITE_DEBUG + if( pOp->p5==0 ){ + /* There are no inequality constraints following the IN constraint. */ + assert( pOp[1].p1==aOp[pOp->p2-1].p1 ); + assert( pOp[1].p2==aOp[pOp->p2-1].p2 ); + assert( pOp[1].p3==aOp[pOp->p2-1].p3 ); + assert( aOp[pOp->p2-1].opcode==OP_IdxGT + || aOp[pOp->p2-1].opcode==OP_IdxGE ); + testcase( aOp[pOp->p2-1].opcode==OP_IdxGE ); + }else{ + /* There are inequality constraints. */ + assert( pOp->p2==(int)(pOp-aOp)+2 ); + assert( aOp[pOp->p2-1].opcode==OP_SeekGE ); + } +#endif assert( pOp->p1>0 ); pC = p->apCsr[pOp[1].p1]; @@ -92099,8 +94630,9 @@ case OP_SeekScan: { while(1){ rc = sqlite3VdbeIdxKeyCompare(db, pC, &r, &res); if( rc ) goto abort_due_to_error; - if( res>0 ){ + if( res>0 && pOp->p5==0 ){ seekscan_search_fail: + /* Jump to SeekGE.P2, ending the loop */ #ifdef SQLITE_DEBUG if( db->flags&SQLITE_VdbeTrace ){ printf("... %d steps and then skip\n", pOp->p1 - nStep); @@ -92110,7 +94642,8 @@ case OP_SeekScan: { pOp++; goto jump_to_p2; } - if( res==0 ){ + if( res>=0 ){ + /* Jump to This.P2, bypassing the OP_SeekGE opcode */ #ifdef SQLITE_DEBUG if( db->flags&SQLITE_VdbeTrace ){ printf("... %d steps and then success\n", pOp->p1 - nStep); @@ -92186,12 +94719,16 @@ case OP_SeekHit: { /* Opcode: IfNotOpen P1 P2 * * * ** Synopsis: if( !csr[P1] ) goto P2 ** -** If cursor P1 is not open, jump to instruction P2. Otherwise, fall through. +** If cursor P1 is not open or if P1 is set to a NULL row using the +** OP_NullRow opcode, then jump to instruction P2. Otherwise, fall through. */ case OP_IfNotOpen: { /* jump */ + VdbeCursor *pCur; + assert( pOp->p1>=0 && pOp->p1nCursor ); - VdbeBranchTaken(p->apCsr[pOp->p1]==0, 2); - if( !p->apCsr[pOp->p1] ){ + pCur = p->apCsr[pOp->p1]; + VdbeBranchTaken(pCur==0 || pCur->nullRow, 2); + if( pCur==0 || pCur->nullRow ){ goto jump_to_p2_and_check_for_interrupt; } break; @@ -92305,11 +94842,8 @@ case OP_NoConflict: /* jump, in3 */ case OP_NotFound: /* jump, in3 */ case OP_Found: { /* jump, in3 */ int alreadyExists; - int takeJump; int ii; VdbeCursor *pC; - int res; - UnpackedRecord *pFree; UnpackedRecord *pIdxKey; UnpackedRecord r; @@ -92324,14 +94858,15 @@ case OP_Found: { /* jump, in3 */ #ifdef SQLITE_DEBUG pC->seekOp = pOp->opcode; #endif - pIn3 = &aMem[pOp->p3]; + r.aMem = &aMem[pOp->p3]; assert( pC->eCurType==CURTYPE_BTREE ); assert( pC->uc.pCursor!=0 ); assert( pC->isTable==0 ); - if( pOp->p4.i>0 ){ + r.nField = (u16)pOp->p4.i; + if( r.nField>0 ){ + /* Key values in an array of registers */ r.pKeyInfo = pC->pKeyInfo; - r.nField = (u16)pOp->p4.i; - r.aMem = pIn3; + r.default_rc = 0; #ifdef SQLITE_DEBUG for(ii=0; iip3+ii, &r.aMem[ii]); } #endif - pIdxKey = &r; - pFree = 0; + rc = sqlite3BtreeIndexMoveto(pC->uc.pCursor, &r, &pC->seekResult); }else{ - assert( pIn3->flags & MEM_Blob ); - rc = ExpandBlob(pIn3); + /* Composite key generated by OP_MakeRecord */ + assert( r.aMem->flags & MEM_Blob ); + assert( pOp->opcode!=OP_NoConflict ); + rc = ExpandBlob(r.aMem); assert( rc==SQLITE_OK || rc==SQLITE_NOMEM ); if( rc ) goto no_mem; - pFree = pIdxKey = sqlite3VdbeAllocUnpackedRecord(pC->pKeyInfo); + pIdxKey = sqlite3VdbeAllocUnpackedRecord(pC->pKeyInfo); if( pIdxKey==0 ) goto no_mem; - sqlite3VdbeRecordUnpack(pC->pKeyInfo, pIn3->n, pIn3->z, pIdxKey); + sqlite3VdbeRecordUnpack(pC->pKeyInfo, r.aMem->n, r.aMem->z, pIdxKey); + pIdxKey->default_rc = 0; + rc = sqlite3BtreeIndexMoveto(pC->uc.pCursor, pIdxKey, &pC->seekResult); + sqlite3DbFreeNN(db, pIdxKey); } - pIdxKey->default_rc = 0; - takeJump = 0; - if( pOp->opcode==OP_NoConflict ){ - /* For the OP_NoConflict opcode, take the jump if any of the - ** input fields are NULL, since any key with a NULL will not - ** conflict */ - for(ii=0; iinField; ii++){ - if( pIdxKey->aMem[ii].flags & MEM_Null ){ - takeJump = 1; - break; - } - } - } - rc = sqlite3BtreeIndexMoveto(pC->uc.pCursor, pIdxKey, &res); - if( pFree ) sqlite3DbFreeNN(db, pFree); if( rc!=SQLITE_OK ){ goto abort_due_to_error; } - pC->seekResult = res; - alreadyExists = (res==0); + alreadyExists = (pC->seekResult==0); pC->nullRow = 1-alreadyExists; pC->deferredMoveto = 0; pC->cacheStatus = CACHE_STALE; @@ -92377,9 +94900,25 @@ case OP_Found: { /* jump, in3 */ VdbeBranchTaken(alreadyExists!=0,2); if( alreadyExists ) goto jump_to_p2; }else{ - VdbeBranchTaken(takeJump||alreadyExists==0,2); - if( takeJump || !alreadyExists ) goto jump_to_p2; - if( pOp->opcode==OP_IfNoHope ) pC->seekHit = pOp->p4.i; + if( !alreadyExists ){ + VdbeBranchTaken(1,2); + goto jump_to_p2; + } + if( pOp->opcode==OP_NoConflict ){ + /* For the OP_NoConflict opcode, take the jump if any of the + ** input fields are NULL, since any key with a NULL will not + ** conflict */ + for(ii=0; iiopcode==OP_IfNoHope ){ + pC->seekHit = pOp->p4.i; + } } break; } @@ -93070,7 +95609,7 @@ case OP_RowData: { } /* Opcode: Rowid P1 P2 * * * -** Synopsis: r[P2]=rowid +** Synopsis: r[P2]=PX rowid of P1 ** ** Store in register P2 an integer which is the key of the table entry that ** P1 is currently point to. @@ -93126,16 +95665,24 @@ case OP_Rowid: { /* out2 */ ** that occur while the cursor is on the null row will always ** write a NULL. ** -** Or, if P1 is a Pseudo-Cursor (a cursor opened using OP_OpenPseudo) -** just reset the cache for that cursor. This causes the row of -** content held by the pseudo-cursor to be reparsed. +** If cursor P1 is not previously opened, open it now to a special +** pseudo-cursor that always returns NULL for every column. */ case OP_NullRow: { VdbeCursor *pC; assert( pOp->p1>=0 && pOp->p1nCursor ); pC = p->apCsr[pOp->p1]; - assert( pC!=0 ); + if( pC==0 ){ + /* If the cursor is not already open, create a special kind of + ** pseudo-cursor that always gives null rows. */ + pC = allocateCursor(p, pOp->p1, 1, CURTYPE_PSEUDO); + if( pC==0 ) goto no_mem; + pC->seekResult = 0; + pC->isTable = 1; + pC->noReuse = 1; + pC->uc.pCursor = sqlite3BtreeFakeValidCursor(); + } pC->nullRow = 1; pC->cacheStatus = CACHE_STALE; if( pC->eCurType==CURTYPE_BTREE ){ @@ -93308,7 +95855,7 @@ case OP_Rewind: { /* jump */ break; } -/* Opcode: Next P1 P2 P3 P4 P5 +/* Opcode: Next P1 P2 P3 * P5 ** ** Advance cursor P1 so that it points to the next key/data pair in its ** table or index. If there are no more key/value pairs then fall through @@ -93327,15 +95874,12 @@ case OP_Rewind: { /* jump */ ** omitted if that index had been unique. P3 is usually 0. P3 is ** always either 0 or 1. ** -** P4 is always of type P4_ADVANCE. The function pointer points to -** sqlite3BtreeNext(). -** ** If P5 is positive and the jump is taken, then event counter ** number P5-1 in the prepared statement is incremented. ** ** See also: Prev */ -/* Opcode: Prev P1 P2 P3 P4 P5 +/* Opcode: Prev P1 P2 P3 * P5 ** ** Back up cursor P1 so that it points to the previous key/data pair in its ** table or index. If there is no previous key/value pairs then fall through @@ -93355,9 +95899,6 @@ case OP_Rewind: { /* jump */ ** omitted if that index had been unique. P3 is usually 0. P3 is ** always either 0 or 1. ** -** P4 is always of type P4_ADVANCE. The function pointer points to -** sqlite3BtreePrevious(). -** ** If P5 is positive and the jump is taken, then event counter ** number P5-1 in the prepared statement is incremented. */ @@ -93375,30 +95916,37 @@ case OP_SorterNext: { /* jump */ assert( isSorter(pC) ); rc = sqlite3VdbeSorterNext(db, pC); goto next_tail; + case OP_Prev: /* jump */ -case OP_Next: /* jump */ assert( pOp->p1>=0 && pOp->p1nCursor ); - assert( pOp->p5aCounter) ); + assert( pOp->p5==0 + || pOp->p5==SQLITE_STMTSTATUS_FULLSCAN_STEP + || pOp->p5==SQLITE_STMTSTATUS_AUTOINDEX); pC = p->apCsr[pOp->p1]; assert( pC!=0 ); assert( pC->deferredMoveto==0 ); assert( pC->eCurType==CURTYPE_BTREE ); - assert( pOp->opcode!=OP_Next || pOp->p4.xAdvance==sqlite3BtreeNext ); - assert( pOp->opcode!=OP_Prev || pOp->p4.xAdvance==sqlite3BtreePrevious ); + assert( pC->seekOp==OP_SeekLT || pC->seekOp==OP_SeekLE + || pC->seekOp==OP_Last || pC->seekOp==OP_IfNoHope + || pC->seekOp==OP_NullRow); + rc = sqlite3BtreePrevious(pC->uc.pCursor, pOp->p3); + goto next_tail; - /* The Next opcode is only used after SeekGT, SeekGE, Rewind, and Found. - ** The Prev opcode is only used after SeekLT, SeekLE, and Last. */ - assert( pOp->opcode!=OP_Next - || pC->seekOp==OP_SeekGT || pC->seekOp==OP_SeekGE +case OP_Next: /* jump */ + assert( pOp->p1>=0 && pOp->p1nCursor ); + assert( pOp->p5==0 + || pOp->p5==SQLITE_STMTSTATUS_FULLSCAN_STEP + || pOp->p5==SQLITE_STMTSTATUS_AUTOINDEX); + pC = p->apCsr[pOp->p1]; + assert( pC!=0 ); + assert( pC->deferredMoveto==0 ); + assert( pC->eCurType==CURTYPE_BTREE ); + assert( pC->seekOp==OP_SeekGT || pC->seekOp==OP_SeekGE || pC->seekOp==OP_Rewind || pC->seekOp==OP_Found || pC->seekOp==OP_NullRow|| pC->seekOp==OP_SeekRowid || pC->seekOp==OP_IfNoHope); - assert( pOp->opcode!=OP_Prev - || pC->seekOp==OP_SeekLT || pC->seekOp==OP_SeekLE - || pC->seekOp==OP_Last || pC->seekOp==OP_IfNoHope - || pC->seekOp==OP_NullRow); + rc = sqlite3BtreeNext(pC->uc.pCursor, pOp->p3); - rc = pOp->p4.xAdvance(pC->uc.pCursor, pOp->p3); next_tail: pC->cacheStatus = CACHE_STALE; VdbeBranchTaken(rc==SQLITE_OK,2); @@ -93585,9 +96133,9 @@ case OP_IdxRowid: { /* out2 */ assert( pOp->p1>=0 && pOp->p1nCursor ); pC = p->apCsr[pOp->p1]; assert( pC!=0 ); - assert( pC->eCurType==CURTYPE_BTREE ); + assert( pC->eCurType==CURTYPE_BTREE || IsNullCursor(pC) ); assert( pC->uc.pCursor!=0 ); - assert( pC->isTable==0 ); + assert( pC->isTable==0 || IsNullCursor(pC) ); assert( pC->deferredMoveto==0 ); assert( !pC->nullRow || pOp->opcode==OP_IdxRowid ); @@ -93595,10 +96143,10 @@ case OP_IdxRowid: { /* out2 */ ** of sqlite3VdbeCursorRestore() and sqlite3VdbeIdxRowid(). */ rc = sqlite3VdbeCursorRestore(pC); - /* sqlite3VbeCursorRestore() can only fail if the record has been deleted - ** out from under the cursor. That will never happens for an IdxRowid - ** or Seek opcode */ - if( NEVER(rc!=SQLITE_OK) ) goto abort_due_to_error; + /* sqlite3VdbeCursorRestore() may fail if the cursor has been disturbed + ** since it was last positioned and an error (e.g. OOM or an IO error) + ** occurs while trying to reposition it. */ + if( rc!=SQLITE_OK ) goto abort_due_to_error; if( !pC->nullRow ){ rowid = 0; /* Not needed. Only used to silence a warning. */ @@ -93616,6 +96164,7 @@ case OP_IdxRowid: { /* out2 */ pTabCur->nullRow = 0; pTabCur->movetoTarget = rowid; pTabCur->deferredMoveto = 1; + pTabCur->cacheStatus = CACHE_STALE; assert( pOp->p4type==P4_INTARRAY || pOp->p4.ai==0 ); assert( !pTabCur->isEphemeral ); pTabCur->ub.aAltMap = pOp->p4.ai; @@ -93750,7 +96299,7 @@ case OP_IdxGE: { /* jump */ rc = sqlite3VdbeMemFromBtreeZeroOffset(pCur, (u32)nCellKey, &m); if( rc ) goto abort_due_to_error; res = sqlite3VdbeRecordCompareWithSkip(m.n, m.z, &r, 0); - sqlite3VdbeMemRelease(&m); + sqlite3VdbeMemReleaseMalloc(&m); } /* End of inlined sqlite3VdbeIdxKeyCompare() */ @@ -94499,7 +97048,7 @@ case OP_IfPos: { /* jump, in1 */ ** Synopsis: if r[P1]>0 then r[P2]=r[P1]+max(0,r[P3]) else r[P2]=(-1) ** ** This opcode performs a commonly used computation associated with -** LIMIT and OFFSET process. r[P1] holds the limit counter. r[P3] +** LIMIT and OFFSET processing. r[P1] holds the limit counter. r[P3] ** holds the offset counter. The opcode computes the combined value ** of the LIMIT and OFFSET and stores that value in r[P2]. The r[P2] ** value computed is the total number of rows that will need to be @@ -94631,6 +97180,7 @@ case OP_AggStep: { pCtx->pVdbe = p; pCtx->skipFlag = 0; pCtx->isError = 0; + pCtx->enc = encoding; pCtx->argc = n; pOp->p4type = P4_FUNCCTX; pOp->p4.pCtx = pCtx; @@ -94760,9 +97310,6 @@ case OP_AggFinal: { } sqlite3VdbeChangeEncoding(pMem, encoding); UPDATE_MAX_BLOBSIZE(pMem); - if( sqlite3VdbeMemTooBig(pMem) ){ - goto too_big; - } break; } @@ -95270,7 +97817,6 @@ case OP_VColumn: { VdbeCursor *pCur = p->apCsr[pOp->p1]; assert( pCur!=0 ); - assert( pCur->eCurType==CURTYPE_VTAB ); assert( pOp->p3>0 && pOp->p3<=(p->nMem+1 - p->nCursor) ); pDest = &aMem[pOp->p3]; memAboutToChange(p, pDest); @@ -95278,11 +97824,13 @@ case OP_VColumn: { sqlite3VdbeMemSetNull(pDest); break; } + assert( pCur->eCurType==CURTYPE_VTAB ); pVtab = pCur->uc.pVCur->pVtab; pModule = pVtab->pModule; assert( pModule->xColumn ); memset(&sContext, 0, sizeof(sContext)); sContext.pOut = pDest; + sContext.enc = encoding; assert( pOp->p5==OPFLAG_NOCHNG || pOp->p5==0 ); if( pOp->p5 & OPFLAG_NOCHNG ){ sqlite3VdbeMemSetNull(pDest); @@ -95301,9 +97849,6 @@ case OP_VColumn: { REGISTER_TRACE(pOp->p3, pDest); UPDATE_MAX_BLOBSIZE(pDest); - if( sqlite3VdbeMemTooBig(pDest) ){ - goto too_big; - } if( rc ) goto abort_due_to_error; break; } @@ -95570,6 +98115,7 @@ case OP_Function: { /* group */ if( pCtx->pOut != pOut ){ pCtx->pVdbe = p; pCtx->pOut = pOut; + pCtx->enc = encoding; for(i=pCtx->argc-1; i>=0; i--) pCtx->argv[i] = &aMem[pOp->p2+i]; } assert( pCtx->pVdbe==p ); @@ -95596,17 +98142,27 @@ case OP_Function: { /* group */ if( rc ) goto abort_due_to_error; } - /* Copy the result of the function into register P3 */ - if( pOut->flags & (MEM_Str|MEM_Blob) ){ - sqlite3VdbeChangeEncoding(pOut, encoding); - if( sqlite3VdbeMemTooBig(pOut) ) goto too_big; - } + assert( (pOut->flags&MEM_Str)==0 + || pOut->enc==encoding + || db->mallocFailed ); + assert( !sqlite3VdbeMemTooBig(pOut) ); REGISTER_TRACE(pOp->p3, pOut); UPDATE_MAX_BLOBSIZE(pOut); break; } +/* Opcode: ClrSubtype P1 * * * * +** Synopsis: r[P1].subtype = 0 +** +** Clear the subtype from register P1. +*/ +case OP_ClrSubtype: { /* in1 */ + pIn1 = &aMem[pOp->p1]; + pIn1->flags &= ~MEM_Subtype; + break; +} + /* Opcode: FilterAdd P1 * P3 P4 * ** Synopsis: filter(P1) += key(P3@P4) ** @@ -95726,7 +98282,7 @@ case OP_Init: { /* jump */ #ifndef SQLITE_OMIT_TRACE if( (db->mTrace & (SQLITE_TRACE_STMT|SQLITE_TRACE_LEGACY))!=0 - && !p->doingRerun + && p->minWriteFileFormat!=254 /* tag-20220401a */ && (zTrace = (pOp->p4.z ? pOp->p4.z : p->zSql))!=0 ){ #ifndef SQLITE_OMIT_DEPRECATED @@ -95955,7 +98511,7 @@ abort_due_to_error: testcase( sqlite3GlobalConfig.xLog!=0 ); sqlite3_log(rc, "statement aborts at %d: [%s] %s", (int)(pOp - aOp), p->zSql, p->zErrMsg); - sqlite3VdbeHalt(p); + if( p->eVdbeState==VDBE_RUN_STATE ) sqlite3VdbeHalt(p); if( rc==SQLITE_IOERR_NOMEM ) sqlite3OomFault(db); if( rc==SQLITE_CORRUPT && db->autoCommit==0 ){ db->flags |= SQLITE_CorruptRdOnly; @@ -100090,6 +102646,8 @@ SQLITE_PRIVATE int sqlite3JournalOpen( ){ MemJournal *p = (MemJournal*)pJfd; + assert( zName || nSpill<0 || (flags & SQLITE_OPEN_EXCLUSIVE) ); + /* Zero the file-handle object. If nSpill was passed zero, initialize ** it using the sqlite3OsOpen() function of the underlying VFS. In this ** case none of the code in this module is executed as a result of calls @@ -100517,53 +103075,24 @@ static void resolveAlias( sqlite3ExprDelete(db, pDup); pDup = 0; }else{ + Expr temp; incrAggFunctionDepth(pDup, nSubquery); if( pExpr->op==TK_COLLATE ){ assert( !ExprHasProperty(pExpr, EP_IntValue) ); pDup = sqlite3ExprAddCollateString(pParse, pDup, pExpr->u.zToken); } - - /* Before calling sqlite3ExprDelete(), set the EP_Static flag. This - ** prevents ExprDelete() from deleting the Expr structure itself, - ** allowing it to be repopulated by the memcpy() on the following line. - ** The pExpr->u.zToken might point into memory that will be freed by the - ** sqlite3DbFree(db, pDup) on the last line of this block, so be sure to - ** make a copy of the token before doing the sqlite3DbFree(). - */ - ExprSetProperty(pExpr, EP_Static); - sqlite3ExprDelete(db, pExpr); - memcpy(pExpr, pDup, sizeof(*pExpr)); - if( !ExprHasProperty(pExpr, EP_IntValue) && pExpr->u.zToken!=0 ){ - assert( (pExpr->flags & (EP_Reduced|EP_TokenOnly))==0 ); - pExpr->u.zToken = sqlite3DbStrDup(db, pExpr->u.zToken); - pExpr->flags |= EP_MemToken; - } + memcpy(&temp, pDup, sizeof(Expr)); + memcpy(pDup, pExpr, sizeof(Expr)); + memcpy(pExpr, &temp, sizeof(Expr)); if( ExprHasProperty(pExpr, EP_WinFunc) ){ if( ALWAYS(pExpr->y.pWin!=0) ){ pExpr->y.pWin->pOwner = pExpr; } } - sqlite3DbFree(db, pDup); + sqlite3ExprDeferredDelete(pParse, pDup); } } - -/* -** Return TRUE if the name zCol occurs anywhere in the USING clause. -** -** Return FALSE if the USING clause is NULL or if it does not contain -** zCol. -*/ -static int nameInUsingClause(IdList *pUsing, const char *zCol){ - if( pUsing ){ - int k; - for(k=0; knId; k++){ - if( sqlite3StrICmp(pUsing->a[k].zName, zCol)==0 ) return 1; - } - } - return 0; -} - /* ** Subqueries stores the original database, table and column names for their ** result sets in ExprList.a[].zSpan, in the form "DATABASE.TABLE.COLUMN". @@ -100579,7 +103108,7 @@ SQLITE_PRIVATE int sqlite3MatchEName( ){ int n; const char *zSpan; - if( pItem->eEName!=ENAME_TAB ) return 0; + if( pItem->fg.eEName!=ENAME_TAB ) return 0; zSpan = pItem->zEName; for(n=0; ALWAYS(zSpan[n]) && zSpan[n]!='.'; n++){} if( zDb && (sqlite3StrNICmp(zSpan, zDb, n)!=0 || zDb[n]!=0) ){ @@ -100640,6 +103169,29 @@ SQLITE_PRIVATE Bitmask sqlite3ExprColUsed(Expr *pExpr){ } } +/* +** Create a new expression term for the column specified by pMatch and +** iColumn. Append this new expression term to the FULL JOIN Match set +** in *ppList. Create a new *ppList if this is the first term in the +** set. +*/ +static void extendFJMatch( + Parse *pParse, /* Parsing context */ + ExprList **ppList, /* ExprList to extend */ + SrcItem *pMatch, /* Source table containing the column */ + i16 iColumn /* The column number */ +){ + Expr *pNew = sqlite3ExprAlloc(pParse->db, TK_COLUMN, 0, 0); + if( pNew ){ + pNew->iTable = pMatch->iCursor; + pNew->iColumn = iColumn; + pNew->y.pTab = pMatch->pTab; + assert( (pMatch->fg.jointype & (JT_LEFT|JT_LTORJ))!=0 ); + ExprSetProperty(pNew, EP_CanBeNull); + *ppList = sqlite3ExprListAppend(pParse, *ppList, pNew); + } +} + /* ** Given the name of a column of the form X.Y.Z or Y.Z or just Z, look up ** that name in the set of source tables in pSrcList and make the pExpr @@ -100685,11 +103237,13 @@ static int lookupName( NameContext *pTopNC = pNC; /* First namecontext in the list */ Schema *pSchema = 0; /* Schema of the expression */ int eNewExprOp = TK_COLUMN; /* New value for pExpr->op on success */ - Table *pTab = 0; /* Table hold the row */ + Table *pTab = 0; /* Table holding the row */ Column *pCol; /* A column of pTab */ + ExprList *pFJMatch = 0; /* Matches for FULL JOIN .. USING */ assert( pNC ); /* the name context cannot be NULL. */ assert( zCol ); /* The Z in X.Y.Z cannot be NULL */ + assert( zDb==0 || zTab!=0 ); assert( !ExprHasProperty(pExpr, EP_TokenOnly|EP_Reduced) ); /* Initialize the node to no-match */ @@ -100738,26 +103292,65 @@ static int lookupName( pTab = pItem->pTab; assert( pTab!=0 && pTab->zName!=0 ); assert( pTab->nCol>0 || pParse->nErr ); - if( pItem->pSelect && (pItem->pSelect->selFlags & SF_NestedFrom)!=0 ){ + assert( (int)pItem->fg.isNestedFrom == IsNestedFrom(pItem->pSelect) ); + if( pItem->fg.isNestedFrom ){ + /* In this case, pItem is a subquery that has been formed from a + ** parenthesized subset of the FROM clause terms. Example: + ** .... FROM t1 LEFT JOIN (t2 RIGHT JOIN t3 USING(x)) USING(y) ... + ** \_________________________/ + ** This pItem -------------^ + */ int hit = 0; + assert( pItem->pSelect!=0 ); pEList = pItem->pSelect->pEList; + assert( pEList!=0 ); + assert( pEList->nExpr==pTab->nCol ); for(j=0; jnExpr; j++){ - if( sqlite3MatchEName(&pEList->a[j], zCol, zTab, zDb) ){ - cnt++; - cntTab = 2; - pMatch = pItem; - pExpr->iColumn = j; - hit = 1; + if( !sqlite3MatchEName(&pEList->a[j], zCol, zTab, zDb) ){ + continue; } + if( cnt>0 ){ + if( pItem->fg.isUsing==0 + || sqlite3IdListIndex(pItem->u3.pUsing, zCol)<0 + ){ + /* Two or more tables have the same column name which is + ** not joined by USING. This is an error. Signal as much + ** by clearing pFJMatch and letting cnt go above 1. */ + sqlite3ExprListDelete(db, pFJMatch); + pFJMatch = 0; + }else + if( (pItem->fg.jointype & JT_RIGHT)==0 ){ + /* An INNER or LEFT JOIN. Use the left-most table */ + continue; + }else + if( (pItem->fg.jointype & JT_LEFT)==0 ){ + /* A RIGHT JOIN. Use the right-most table */ + cnt = 0; + sqlite3ExprListDelete(db, pFJMatch); + pFJMatch = 0; + }else{ + /* For a FULL JOIN, we must construct a coalesce() func */ + extendFJMatch(pParse, &pFJMatch, pMatch, pExpr->iColumn); + } + } + cnt++; + cntTab = 2; + pMatch = pItem; + pExpr->iColumn = j; + pEList->a[j].fg.bUsed = 1; + hit = 1; + if( pEList->a[j].fg.bUsingTerm ) break; } if( hit || zTab==0 ) continue; } - if( zDb ){ - if( pTab->pSchema!=pSchema ) continue; - if( pSchema==0 && strcmp(zDb,"*")!=0 ) continue; - } + assert( zDb==0 || zTab!=0 ); if( zTab ){ - const char *zTabName = pItem->zAlias ? pItem->zAlias : pTab->zName; + const char *zTabName; + if( zDb ){ + if( pTab->pSchema!=pSchema ) continue; + if( pSchema==0 && strcmp(zDb,"*")!=0 ) continue; + } + zTabName = pItem->zAlias ? pItem->zAlias : pTab->zName; assert( zTabName!=0 ); if( sqlite3StrICmp(zTabName, zTab)!=0 ){ continue; @@ -100772,18 +103365,37 @@ static int lookupName( if( pCol->hName==hCol && sqlite3StrICmp(pCol->zCnName, zCol)==0 ){ - /* If there has been exactly one prior match and this match - ** is for the right-hand table of a NATURAL JOIN or is in a - ** USING clause, then skip this match. - */ - if( cnt==1 ){ - if( pItem->fg.jointype & JT_NATURAL ) continue; - if( nameInUsingClause(pItem->pUsing, zCol) ) continue; + if( cnt>0 ){ + if( pItem->fg.isUsing==0 + || sqlite3IdListIndex(pItem->u3.pUsing, zCol)<0 + ){ + /* Two or more tables have the same column name which is + ** not joined by USING. This is an error. Signal as much + ** by clearing pFJMatch and letting cnt go above 1. */ + sqlite3ExprListDelete(db, pFJMatch); + pFJMatch = 0; + }else + if( (pItem->fg.jointype & JT_RIGHT)==0 ){ + /* An INNER or LEFT JOIN. Use the left-most table */ + continue; + }else + if( (pItem->fg.jointype & JT_LEFT)==0 ){ + /* A RIGHT JOIN. Use the right-most table */ + cnt = 0; + sqlite3ExprListDelete(db, pFJMatch); + pFJMatch = 0; + }else{ + /* For a FULL JOIN, we must construct a coalesce() func */ + extendFJMatch(pParse, &pFJMatch, pMatch, pExpr->iColumn); + } } cnt++; pMatch = pItem; /* Substitute the rowid (column -1) for the INTEGER PRIMARY KEY */ pExpr->iColumn = j==pTab->iPKey ? -1 : (i16)j; + if( pItem->fg.isNestedFrom ){ + sqlite3SrcItemColumnUsed(pItem, j); + } break; } } @@ -100796,9 +103408,7 @@ static int lookupName( pExpr->iTable = pMatch->iCursor; assert( ExprUseYTab(pExpr) ); pExpr->y.pTab = pMatch->pTab; - /* RIGHT JOIN not (yet) supported */ - assert( (pMatch->fg.jointype & JT_RIGHT)==0 ); - if( (pMatch->fg.jointype & JT_LEFT)!=0 ){ + if( (pMatch->fg.jointype & (JT_LEFT|JT_LTORJ))!=0 ){ ExprSetProperty(pExpr, EP_CanBeNull); } pSchema = pExpr->y.pTab->pSchema; @@ -100952,7 +103562,7 @@ static int lookupName( assert( pEList!=0 ); for(j=0; jnExpr; j++){ char *zAs = pEList->a[j].zEName; - if( pEList->a[j].eEName==ENAME_NAME + if( pEList->a[j].fg.eEName==ENAME_NAME && sqlite3_stricmp(zAs, zCol)==0 ){ Expr *pOrig; @@ -101039,11 +103649,37 @@ static int lookupName( } /* - ** cnt==0 means there was not match. cnt>1 means there were two or - ** more matches. Either way, we have an error. + ** cnt==0 means there was not match. + ** cnt>1 means there were two or more matches. + ** + ** cnt==0 is always an error. cnt>1 is often an error, but might + ** be multiple matches for a NATURAL LEFT JOIN or a LEFT JOIN USING. */ + assert( pFJMatch==0 || cnt>0 ); + assert( !ExprHasProperty(pExpr, EP_xIsSelect|EP_IntValue) ); if( cnt!=1 ){ const char *zErr; + if( pFJMatch ){ + if( pFJMatch->nExpr==cnt-1 ){ + if( ExprHasProperty(pExpr,EP_Leaf) ){ + ExprClearProperty(pExpr,EP_Leaf); + }else{ + sqlite3ExprDelete(db, pExpr->pLeft); + pExpr->pLeft = 0; + sqlite3ExprDelete(db, pExpr->pRight); + pExpr->pRight = 0; + } + extendFJMatch(pParse, &pFJMatch, pMatch, pExpr->iColumn); + pExpr->op = TK_FUNCTION; + pExpr->u.zToken = "coalesce"; + pExpr->x.pList = pFJMatch; + cnt = 1; + goto lookupname_end; + }else{ + sqlite3ExprListDelete(db, pFJMatch); + pFJMatch = 0; + } + } zErr = cnt==0 ? "no such column" : "ambiguous column name"; if( zDb ){ sqlite3ErrorMsg(pParse, "%s: %s.%s.%s", zErr, zDb, zTab, zCol); @@ -101056,6 +103692,16 @@ static int lookupName( pParse->checkSchema = 1; pTopNC->nNcErr++; } + assert( pFJMatch==0 ); + + /* Remove all substructure from pExpr */ + if( !ExprHasProperty(pExpr,(EP_TokenOnly|EP_Leaf)) ){ + sqlite3ExprDelete(db, pExpr->pLeft); + pExpr->pLeft = 0; + sqlite3ExprDelete(db, pExpr->pRight); + pExpr->pRight = 0; + ExprSetProperty(pExpr, EP_Leaf); + } /* If a column from a table in pSrcList is referenced, then record ** this fact in the pSrcList.a[].colUsed bitmask. Column 0 causes @@ -101075,16 +103721,7 @@ static int lookupName( pMatch->colUsed |= sqlite3ExprColUsed(pExpr); } - /* Clean up and return - */ - if( !ExprHasProperty(pExpr,(EP_TokenOnly|EP_Leaf)) ){ - sqlite3ExprDelete(db, pExpr->pLeft); - pExpr->pLeft = 0; - sqlite3ExprDelete(db, pExpr->pRight); - pExpr->pRight = 0; - } pExpr->op = eNewExprOp; - ExprSetProperty(pExpr, EP_Leaf); lookupname_end: if( cnt==1 ){ assert( pNC!=0 ); @@ -101269,7 +103906,7 @@ static int resolveExprStep(Walker *pWalker, Expr *pExpr){ } sqlite3WalkExpr(pWalker, pExpr->pLeft); if( 0==sqlite3ExprCanBeNull(pExpr->pLeft) && !IN_RENAME_OBJECT ){ - testcase( ExprHasProperty(pExpr, EP_FromJoin) ); + testcase( ExprHasProperty(pExpr, EP_OuterON) ); assert( !ExprHasProperty(pExpr, EP_IntValue) ); if( pExpr->op==TK_NOTNULL ){ pExpr->u.zToken = "true"; @@ -101678,7 +104315,7 @@ static int resolveAsName( assert( !ExprHasProperty(pE, EP_IntValue) ); zCol = pE->u.zToken; for(i=0; inExpr; i++){ - if( pEList->a[i].eEName==ENAME_NAME + if( pEList->a[i].fg.eEName==ENAME_NAME && sqlite3_stricmp(pEList->a[i].zEName, zCol)==0 ){ return i+1; @@ -101799,7 +104436,7 @@ static int resolveCompoundOrderBy( return 1; } for(i=0; inExpr; i++){ - pOrderBy->a[i].done = 0; + pOrderBy->a[i].fg.done = 0; } pSelect->pNext = 0; while( pSelect->pPrior ){ @@ -101814,7 +104451,7 @@ static int resolveCompoundOrderBy( for(i=0, pItem=pOrderBy->a; inExpr; i++, pItem++){ int iCol = -1; Expr *pE, *pDup; - if( pItem->done ) continue; + if( pItem->fg.done ) continue; pE = sqlite3ExprSkipCollateAndLikely(pItem->pExpr); if( NEVER(pE==0) ) continue; if( sqlite3ExprIsInteger(pE, &iCol) ){ @@ -101867,7 +104504,7 @@ static int resolveCompoundOrderBy( sqlite3ExprDelete(db, pE); pItem->u.x.iOrderByCol = (u16)iCol; } - pItem->done = 1; + pItem->fg.done = 1; }else{ moreToDo = 1; } @@ -101875,7 +104512,7 @@ static int resolveCompoundOrderBy( pSelect = pSelect->pNext; } for(i=0; inExpr; i++){ - if( pOrderBy->a[i].done==0 ){ + if( pOrderBy->a[i].fg.done==0 ){ sqlite3ErrorMsg(pParse, "%r ORDER BY term does not match any " "column in the result set", i+1); return 1; @@ -102165,8 +104802,8 @@ static int resolveSelectStep(Walker *pWalker, Select *p){ sNC.uNC.pEList = p->pEList; sNC.ncFlags |= NC_UEList; if( p->pHaving ){ - if( !pGroupBy ){ - sqlite3ErrorMsg(pParse, "a GROUP BY clause is required before HAVING"); + if( (p->selFlags & SF_Aggregate)==0 ){ + sqlite3ErrorMsg(pParse, "HAVING clause on a non-aggregate query"); return WRC_Abort; } if( sqlite3ResolveExprNames(&sNC, p->pHaving) ) return WRC_Abort; @@ -102546,9 +105183,8 @@ SQLITE_PRIVATE char sqlite3ExprAffinity(const Expr *pExpr){ if( op==TK_REGISTER ) op = pExpr->op2; if( op==TK_COLUMN || op==TK_AGG_COLUMN ){ assert( ExprUseYTab(pExpr) ); - if( pExpr->y.pTab ){ - return sqlite3TableColumnAffinity(pExpr->y.pTab, pExpr->iColumn); - } + assert( pExpr->y.pTab!=0 ); + return sqlite3TableColumnAffinity(pExpr->y.pTab, pExpr->iColumn); } if( op==TK_SELECT ){ assert( ExprUseXSelect(pExpr) ); @@ -102666,17 +105302,14 @@ SQLITE_PRIVATE CollSeq *sqlite3ExprCollSeq(Parse *pParse, const Expr *pExpr){ int op = p->op; if( op==TK_REGISTER ) op = p->op2; if( op==TK_AGG_COLUMN || op==TK_COLUMN || op==TK_TRIGGER ){ + int j; assert( ExprUseYTab(p) ); - if( p->y.pTab!=0 ){ - /* op==TK_REGISTER && p->y.pTab!=0 happens when pExpr was originally - ** a TK_COLUMN but was previously evaluated and cached in a register */ - int j = p->iColumn; - if( j>=0 ){ - const char *zColl = sqlite3ColumnColl(&p->y.pTab->aCol[j]); - pColl = sqlite3FindCollSeq(db, ENC(db), zColl, 0); - } - break; + assert( p->y.pTab!=0 ); + if( (j = p->iColumn)>=0 ){ + const char *zColl = sqlite3ColumnColl(&p->y.pTab->aCol[j]); + pColl = sqlite3FindCollSeq(db, ENC(db), zColl, 0); } + break; } if( op==TK_CAST || op==TK_UPLUS ){ p = p->pLeft; @@ -103261,7 +105894,9 @@ static void heightOfSelect(const Select *pSelect, int *pnHeight){ */ static void exprSetHeight(Expr *p){ int nHeight = p->pLeft ? p->pLeft->nHeight : 0; - if( p->pRight && p->pRight->nHeight>nHeight ) nHeight = p->pRight->nHeight; + if( NEVER(p->pRight) && p->pRight->nHeight>nHeight ){ + nHeight = p->pRight->nHeight; + } if( ExprUseXSelect(p) ){ heightOfSelect(p->x.pSelect, &nHeight); }else if( p->x.pList ){ @@ -103404,15 +106039,26 @@ SQLITE_PRIVATE void sqlite3ExprAttachSubtrees( sqlite3ExprDelete(db, pLeft); sqlite3ExprDelete(db, pRight); }else{ + assert( ExprUseXList(pRoot) ); + assert( pRoot->x.pSelect==0 ); if( pRight ){ pRoot->pRight = pRight; pRoot->flags |= EP_Propagate & pRight->flags; +#if SQLITE_MAX_EXPR_DEPTH>0 + pRoot->nHeight = pRight->nHeight+1; + }else{ + pRoot->nHeight = 1; +#endif } if( pLeft ){ pRoot->pLeft = pLeft; pRoot->flags |= EP_Propagate & pLeft->flags; +#if SQLITE_MAX_EXPR_DEPTH>0 + if( pLeft->nHeight>=pRoot->nHeight ){ + pRoot->nHeight = pLeft->nHeight+1; + } +#endif } - exprSetHeight(pRoot); } } @@ -103560,6 +106206,7 @@ SQLITE_PRIVATE Expr *sqlite3ExprFunction( sqlite3ExprListDelete(db, pList); /* Avoid memory leak when malloc fails */ return 0; } + assert( !ExprHasProperty(pNew, EP_InnerON|EP_OuterON) ); pNew->w.iOfst = (int)(pToken->z - pParse->zTail); if( pList && pList->nExpr > pParse->db->aLimit[SQLITE_LIMIT_FUNCTION_ARG] @@ -103697,6 +106344,7 @@ SQLITE_PRIVATE void sqlite3ExprAssignVarNumber(Parse *pParse, Expr *pExpr, u32 n */ static SQLITE_NOINLINE void sqlite3ExprDeleteNN(sqlite3 *db, Expr *p){ assert( p!=0 ); + assert( db!=0 ); assert( !ExprUseUValue(p) || p->u.iValue>=0 ); assert( !ExprUseYWin(p) || !ExprUseYSub(p) ); assert( !ExprUseYWin(p) || p->y.pWin!=0 || db->mallocFailed ); @@ -103728,18 +106376,26 @@ static SQLITE_NOINLINE void sqlite3ExprDeleteNN(sqlite3 *db, Expr *p){ #endif } } - if( ExprHasProperty(p, EP_MemToken) ){ - assert( !ExprHasProperty(p, EP_IntValue) ); - sqlite3DbFree(db, p->u.zToken); - } if( !ExprHasProperty(p, EP_Static) ){ - sqlite3DbFreeNN(db, p); + sqlite3DbNNFreeNN(db, p); } } SQLITE_PRIVATE void sqlite3ExprDelete(sqlite3 *db, Expr *p){ if( p ) sqlite3ExprDeleteNN(db, p); } +/* +** Clear both elements of an OnOrUsing object +*/ +SQLITE_PRIVATE void sqlite3ClearOnOrUsing(sqlite3 *db, OnOrUsing *p){ + if( p==0 ){ + /* Nothing to clear */ + }else if( p->pOn ){ + sqlite3ExprDeleteNN(db, p->pOn); + }else if( p->pUsing ){ + sqlite3IdListDelete(db, p->pUsing); + } +} /* ** Arrange to cause pExpr to be deleted when the pParse is deleted. @@ -103752,8 +106408,9 @@ SQLITE_PRIVATE void sqlite3ExprDelete(sqlite3 *db, Expr *p){ ** pExpr to the pParse->pConstExpr list with a register number of 0. */ SQLITE_PRIVATE void sqlite3ExprDeferredDelete(Parse *pParse, Expr *pExpr){ - pParse->pConstExpr = - sqlite3ExprListAppend(pParse, pParse->pConstExpr, pExpr); + sqlite3ParserAddCleanup(pParse, + (void(*)(sqlite3*,void*))sqlite3ExprDelete, + pExpr); } /* Invoke sqlite3RenameExprUnmap() and sqlite3ExprDelete() on the @@ -103826,8 +106483,7 @@ static int dupedExprStructSize(const Expr *p, int flags){ nSize = EXPR_FULLSIZE; }else{ assert( !ExprHasProperty(p, EP_TokenOnly|EP_Reduced) ); - assert( !ExprHasProperty(p, EP_FromJoin) ); - assert( !ExprHasProperty(p, EP_MemToken) ); + assert( !ExprHasProperty(p, EP_OuterON) ); assert( !ExprHasVVAProperty(p, EP_NoReduce) ); if( p->pLeft || p->x.pList ){ nSize = EXPR_REDUCEDSIZE | EP_Reduced; @@ -103931,7 +106587,7 @@ static Expr *exprDup(sqlite3 *db, const Expr *p, int dupFlags, u8 **pzBuffer){ } /* Set the EP_Reduced, EP_TokenOnly, and EP_Static flags appropriately. */ - pNew->flags &= ~(EP_Reduced|EP_TokenOnly|EP_Static|EP_MemToken); + pNew->flags &= ~(EP_Reduced|EP_TokenOnly|EP_Static); pNew->flags |= nStructSize & (EP_Reduced|EP_TokenOnly); pNew->flags |= staticFlag; ExprClearVVAProperties(pNew); @@ -104006,6 +106662,7 @@ SQLITE_PRIVATE With *sqlite3WithDup(sqlite3 *db, With *p){ pRet->a[i].pSelect = sqlite3SelectDup(db, p->a[i].pSelect, 0); pRet->a[i].pCols = sqlite3ExprListDup(db, p->a[i].pCols, 0); pRet->a[i].zName = sqlite3DbStrDup(db, p->a[i].zName); + pRet->a[i].eM10d = p->a[i].eM10d; } } } @@ -104106,11 +106763,8 @@ SQLITE_PRIVATE ExprList *sqlite3ExprListDup(sqlite3 *db, const ExprList *p, int } } pItem->zEName = sqlite3DbStrDup(db, pOldItem->zEName); - pItem->sortFlags = pOldItem->sortFlags; - pItem->eEName = pOldItem->eEName; - pItem->done = 0; - pItem->bNulls = pOldItem->bNulls; - pItem->bSorterRef = pOldItem->bSorterRef; + pItem->fg = pOldItem->fg; + pItem->fg.done = 0; pItem->u = pOldItem->u; } return pNew; @@ -104162,8 +106816,12 @@ SQLITE_PRIVATE SrcList *sqlite3SrcListDup(sqlite3 *db, const SrcList *p, int fla pTab->nTabRef++; } pNewItem->pSelect = sqlite3SelectDup(db, pOldItem->pSelect, flags); - pNewItem->pOn = sqlite3ExprDup(db, pOldItem->pOn, flags); - pNewItem->pUsing = sqlite3IdListDup(db, pOldItem->pUsing); + if( pOldItem->fg.isUsing ){ + assert( pNewItem->fg.isUsing ); + pNewItem->u3.pUsing = sqlite3IdListDup(db, pOldItem->u3.pUsing); + }else{ + pNewItem->u3.pOn = sqlite3ExprDup(db, pOldItem->u3.pOn, flags); + } pNewItem->colUsed = pOldItem->colUsed; } return pNew; @@ -104173,22 +106831,16 @@ SQLITE_PRIVATE IdList *sqlite3IdListDup(sqlite3 *db, const IdList *p){ int i; assert( db!=0 ); if( p==0 ) return 0; - pNew = sqlite3DbMallocRawNN(db, sizeof(*pNew) ); + assert( p->eU4!=EU4_EXPR ); + pNew = sqlite3DbMallocRawNN(db, sizeof(*pNew)+(p->nId-1)*sizeof(p->a[0]) ); if( pNew==0 ) return 0; pNew->nId = p->nId; - pNew->a = sqlite3DbMallocRawNN(db, p->nId*sizeof(p->a[0]) ); - if( pNew->a==0 ){ - sqlite3DbFreeNN(db, pNew); - return 0; - } - /* Note that because the size of the allocation for p->a[] is not - ** necessarily a power of two, sqlite3IdListAppend() may not be called - ** on the duplicate created by this function. */ + pNew->eU4 = p->eU4; for(i=0; inId; i++){ struct IdList_item *pNewItem = &pNew->a[i]; - struct IdList_item *pOldItem = &p->a[i]; + const struct IdList_item *pOldItem = &p->a[i]; pNewItem->zName = sqlite3DbStrDup(db, pOldItem->zName); - pNewItem->idx = pOldItem->idx; + pNewItem->u4 = pOldItem->u4; } return pNew; } @@ -104412,16 +107064,16 @@ SQLITE_PRIVATE void sqlite3ExprListSetSortOrder(ExprList *p, int iSortOrder, int ); pItem = &p->a[p->nExpr-1]; - assert( pItem->bNulls==0 ); + assert( pItem->fg.bNulls==0 ); if( iSortOrder==SQLITE_SO_UNDEFINED ){ iSortOrder = SQLITE_SO_ASC; } - pItem->sortFlags = (u8)iSortOrder; + pItem->fg.sortFlags = (u8)iSortOrder; if( eNulls!=SQLITE_SO_UNDEFINED ){ - pItem->bNulls = 1; + pItem->fg.bNulls = 1; if( iSortOrder!=eNulls ){ - pItem->sortFlags |= KEYINFO_ORDER_BIGNULL; + pItem->fg.sortFlags |= KEYINFO_ORDER_BIGNULL; } } } @@ -104447,7 +107099,7 @@ SQLITE_PRIVATE void sqlite3ExprListSetName( assert( pList->nExpr>0 ); pItem = &pList->a[pList->nExpr-1]; assert( pItem->zEName==0 ); - assert( pItem->eEName==ENAME_NAME ); + assert( pItem->fg.eEName==ENAME_NAME ); pItem->zEName = sqlite3DbStrNDup(pParse->db, pName->z, pName->n); if( dequote ){ /* If dequote==0, then pName->z does not point to part of a DDL @@ -104482,7 +107134,7 @@ SQLITE_PRIVATE void sqlite3ExprListSetSpan( assert( pList->nExpr>0 ); if( pItem->zEName==0 ){ pItem->zEName = sqlite3DbSpanDup(db, zStart, zEnd); - pItem->eEName = ENAME_SPAN; + pItem->fg.eEName = ENAME_SPAN; } } } @@ -104511,12 +107163,13 @@ static SQLITE_NOINLINE void exprListDeleteNN(sqlite3 *db, ExprList *pList){ int i = pList->nExpr; struct ExprList_item *pItem = pList->a; assert( pList->nExpr>0 ); + assert( db!=0 ); do{ sqlite3ExprDelete(db, pItem->pExpr); - sqlite3DbFree(db, pItem->zEName); + if( pItem->zEName ) sqlite3DbNNFreeNN(db, pItem->zEName); pItem++; }while( --i>0 ); - sqlite3DbFreeNN(db, pList); + sqlite3DbNNFreeNN(db, pList); } SQLITE_PRIVATE void sqlite3ExprListDelete(sqlite3 *db, ExprList *pList){ if( pList ) exprListDeleteNN(db, pList); @@ -104654,9 +107307,9 @@ SQLITE_PRIVATE Expr *sqlite3ExprSimplifiedAndOr(Expr *pExpr){ static int exprNodeIsConstant(Walker *pWalker, Expr *pExpr){ /* If pWalker->eCode is 2 then any term of the expression that comes from - ** the ON or USING clauses of a left join disqualifies the expression + ** the ON or USING clauses of an outer join disqualifies the expression ** from being considered constant. */ - if( pWalker->eCode==2 && ExprHasProperty(pExpr, EP_FromJoin) ){ + if( pWalker->eCode==2 && ExprHasProperty(pExpr, EP_OuterON) ){ pWalker->eCode = 0; return WRC_Abort; } @@ -104779,7 +107432,7 @@ SQLITE_PRIVATE int sqlite3ExprIsTableConstant(Expr *p, int iCur){ ** Check pExpr to see if it is an invariant constraint on data source pSrc. ** This is an optimization. False negatives will perhaps cause slower ** queries, but false positives will yield incorrect answers. So when in -** double, return 0. +** doubt, return 0. ** ** To be an invariant constraint, the following must be true: ** @@ -104787,24 +107440,28 @@ SQLITE_PRIVATE int sqlite3ExprIsTableConstant(Expr *p, int iCur){ ** ** (2) pExpr cannot use subqueries or non-deterministic functions. ** -** (*) ** Not applicable to this branch ** +** (3) pSrc cannot be part of the left operand for a RIGHT JOIN. +** (Is there some way to relax this constraint?) ** ** (4) If pSrc is the right operand of a LEFT JOIN, then... ** (4a) pExpr must come from an ON clause.. -** (4b) and specifically the ON clause associated with the LEFT JOIN. + (4b) and specifically the ON clause associated with the LEFT JOIN. ** ** (5) If pSrc is not the right operand of a LEFT JOIN or the left ** operand of a RIGHT JOIN, then pExpr must be from the WHERE ** clause, not an ON clause. */ SQLITE_PRIVATE int sqlite3ExprIsTableConstraint(Expr *pExpr, const SrcItem *pSrc){ + if( pSrc->fg.jointype & JT_LTORJ ){ + return 0; /* rule (3) */ + } if( pSrc->fg.jointype & JT_LEFT ){ - if( !ExprHasProperty(pExpr, EP_FromJoin) ) return 0; /* rule (4a) */ - if( pExpr->w.iRightJoinTable!=pSrc->iCursor ) return 0; /* rule (4b) */ + if( !ExprHasProperty(pExpr, EP_OuterON) ) return 0; /* rule (4a) */ + if( pExpr->w.iJoin!=pSrc->iCursor ) return 0; /* rule (4b) */ }else{ - if( ExprHasProperty(pExpr, EP_FromJoin) ) return 0; /* rule (5) */ + if( ExprHasProperty(pExpr, EP_OuterON) ) return 0; /* rule (5) */ } - return sqlite3ExprIsTableConstant(pExpr, pSrc->iCursor); /* rules (1), (2) */ + return sqlite3ExprIsTableConstant(pExpr, pSrc->iCursor); /* rules (1), (2) */ } @@ -105134,7 +107791,7 @@ static int sqlite3InRhsIsConstant(Expr *pIn){ ** all members of the RHS set, skipping duplicates. ** ** A cursor is opened on the b-tree object that is the RHS of the IN operator -** and pX->iTable is set to the index of that cursor. +** and the *piTab parameter is set to the index of that cursor. ** ** The returned value of this function indicates the b-tree type, as follows: ** @@ -105154,7 +107811,10 @@ static int sqlite3InRhsIsConstant(Expr *pIn){ ** If the RHS of the IN operator is a list or a more complex subquery, then ** an ephemeral table might need to be generated from the RHS and then ** pX->iTable made to point to the ephemeral table instead of an -** existing table. +** existing table. In this case, the creation and initialization of the +** ephmeral table might be put inside of a subroutine, the EP_Subrtn flag +** will be set on pX and the pX->y.sub fields will be set to show where +** the subroutine is coded. ** ** The inFlags parameter must contain, at a minimum, one of the bits ** IN_INDEX_MEMBERSHIP or IN_INDEX_LOOP but not both. If inFlags contains @@ -105215,12 +107875,13 @@ SQLITE_PRIVATE int sqlite3FindInIndex( ){ Select *p; /* SELECT to the right of IN operator */ int eType = 0; /* Type of RHS table. IN_INDEX_* */ - int iTab = pParse->nTab++; /* Cursor of the RHS table */ + int iTab; /* Cursor of the RHS table */ int mustBeUnique; /* True if RHS must be unique */ Vdbe *v = sqlite3GetVdbe(pParse); /* Virtual machine being coded */ assert( pX->op==TK_IN ); mustBeUnique = (inFlags & IN_INDEX_LOOP)!=0; + iTab = pParse->nTab++; /* If the RHS of this IN(...) operator is a SELECT, and if it matters ** whether or not the SELECT result contains NULL values, check whether @@ -105386,6 +108047,8 @@ SQLITE_PRIVATE int sqlite3FindInIndex( && ExprUseXList(pX) && (!sqlite3InRhsIsConstant(pX) || pX->x.pList->nExpr<=2) ){ + pParse->nTab--; /* Back out the allocation of the unused cursor */ + iTab = -1; /* Cursor is not allocated */ eType = IN_INDEX_NOOP; } @@ -105552,6 +108215,7 @@ SQLITE_PRIVATE void sqlite3CodeRhsOfIN( assert( ExprUseYSub(pExpr) ); sqlite3VdbeAddOp2(v, OP_Gosub, pExpr->y.sub.regReturn, pExpr->y.sub.iAddr); + assert( iTab!=pExpr->iTable ); sqlite3VdbeAddOp2(v, OP_OpenDup, iTab, pExpr->iTable); sqlite3VdbeJumpHere(v, addrOnce); return; @@ -105563,8 +108227,7 @@ SQLITE_PRIVATE void sqlite3CodeRhsOfIN( assert( !ExprHasProperty(pExpr, EP_TokenOnly|EP_Reduced) ); pExpr->y.sub.regReturn = ++pParse->nMem; pExpr->y.sub.iAddr = - sqlite3VdbeAddOp2(v, OP_Integer, 0, pExpr->y.sub.regReturn) + 1; - VdbeComment((v, "return address")); + sqlite3VdbeAddOp2(v, OP_BeginSubrtn, 0, pExpr->y.sub.regReturn) + 1; addrOnce = sqlite3VdbeAddOp0(v, OP_Once); VdbeCoverage(v); } @@ -105666,6 +108329,7 @@ SQLITE_PRIVATE void sqlite3CodeRhsOfIN( ** expression we need to rerun this code each time. */ if( addrOnce && !sqlite3ExprIsConstant(pE2) ){ + sqlite3VdbeChangeToNoop(v, addrOnce-1); sqlite3VdbeChangeToNoop(v, addrOnce); ExprClearProperty(pExpr, EP_Subrtn); addrOnce = 0; @@ -105683,11 +108347,15 @@ SQLITE_PRIVATE void sqlite3CodeRhsOfIN( sqlite3VdbeChangeP4(v, addr, (void *)pKeyInfo, P4_KEYINFO); } if( addrOnce ){ + sqlite3VdbeAddOp1(v, OP_NullRow, iTab); sqlite3VdbeJumpHere(v, addrOnce); /* Subroutine return */ assert( ExprUseYSub(pExpr) ); - sqlite3VdbeAddOp1(v, OP_Return, pExpr->y.sub.regReturn); - sqlite3VdbeChangeP1(v, pExpr->y.sub.iAddr-1, sqlite3VdbeCurrentAddr(v)-1); + assert( sqlite3VdbeGetOp(v,pExpr->y.sub.iAddr-1)->opcode==OP_BeginSubrtn + || pParse->nErr ); + sqlite3VdbeAddOp3(v, OP_Return, pExpr->y.sub.regReturn, + pExpr->y.sub.iAddr, 1); + VdbeCoverage(v); sqlite3ClearTempRegCache(pParse); } } @@ -105741,9 +108409,7 @@ SQLITE_PRIVATE int sqlite3CodeSubselect(Parse *pParse, Expr *pExpr){ ExprSetProperty(pExpr, EP_Subrtn); pExpr->y.sub.regReturn = ++pParse->nMem; pExpr->y.sub.iAddr = - sqlite3VdbeAddOp2(v, OP_Integer, 0, pExpr->y.sub.regReturn) + 1; - VdbeComment((v, "return address")); - + sqlite3VdbeAddOp2(v, OP_BeginSubrtn, 0, pExpr->y.sub.regReturn) + 1; /* The evaluation of the EXISTS/SELECT must be repeated every time it ** is encountered if any of the following is true: @@ -105795,7 +108461,7 @@ SQLITE_PRIVATE int sqlite3CodeSubselect(Parse *pParse, Expr *pExpr){ pLimit = sqlite3PExpr(pParse, TK_NE, sqlite3ExprDup(db, pSel->pLimit->pLeft, 0), pLimit); } - sqlite3ExprDelete(db, pSel->pLimit->pLeft); + sqlite3ExprDeferredDelete(pParse, pSel->pLimit->pLeft); pSel->pLimit->pLeft = pLimit; }else{ /* If there is no pre-existing limit add a limit of 1 */ @@ -105816,8 +108482,11 @@ SQLITE_PRIVATE int sqlite3CodeSubselect(Parse *pParse, Expr *pExpr){ /* Subroutine return */ assert( ExprUseYSub(pExpr) ); - sqlite3VdbeAddOp1(v, OP_Return, pExpr->y.sub.regReturn); - sqlite3VdbeChangeP1(v, pExpr->y.sub.iAddr-1, sqlite3VdbeCurrentAddr(v)-1); + assert( sqlite3VdbeGetOp(v,pExpr->y.sub.iAddr-1)->opcode==OP_BeginSubrtn + || pParse->nErr ); + sqlite3VdbeAddOp3(v, OP_Return, pExpr->y.sub.regReturn, + pExpr->y.sub.iAddr, 1); + VdbeCoverage(v); sqlite3ClearTempRegCache(pParse); return rReg; } @@ -106245,12 +108914,10 @@ SQLITE_PRIVATE void sqlite3ExprCodeGetColumnOfTable( ){ Column *pCol; assert( v!=0 ); - if( pTab==0 ){ - sqlite3VdbeAddOp3(v, OP_Column, iTabCur, iCol, regOut); - return; - } + assert( pTab!=0 ); if( iCol<0 || iCol==pTab->iPKey ){ sqlite3VdbeAddOp2(v, OP_Rowid, iTabCur, regOut); + VdbeComment((v, "%s.rowid", pTab->zName)); }else{ int op; int x; @@ -106305,7 +108972,7 @@ SQLITE_PRIVATE int sqlite3ExprCodeGetColumn( assert( pParse->pVdbe!=0 ); sqlite3ExprCodeGetColumnOfTable(pParse->pVdbe, pTab, iTable, iColumn, iReg); if( p5 ){ - VdbeOp *pOp = sqlite3VdbeGetOp(pParse->pVdbe,-1); + VdbeOp *pOp = sqlite3VdbeGetLastOp(pParse->pVdbe); if( pOp->opcode==OP_Column ) pOp->p5 = p5; } return iReg; @@ -106374,7 +109041,7 @@ static int exprCodeVector(Parse *pParse, Expr *p, int *piFreeable){ ** so that a subsequent copy will not be merged into this one. */ static void setDoNotMergeFlagOnCopy(Vdbe *v){ - if( sqlite3VdbeGetOp(v, -1)->opcode==OP_Copy ){ + if( sqlite3VdbeGetLastOp(v)->opcode==OP_Copy ){ sqlite3VdbeChangeP5(v, 1); /* Tag trailing OP_Copy as not mergable */ } } @@ -106421,7 +109088,17 @@ static int exprCodeInlineFunction( caseExpr.x.pList = pFarg; return sqlite3ExprCodeTarget(pParse, &caseExpr, target); } - +#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC + case INLINEFUNC_sqlite_offset: { + Expr *pArg = pFarg->a[0].pExpr; + if( pArg->op==TK_COLUMN && pArg->iTable>=0 ){ + sqlite3VdbeAddOp3(v, OP_Offset, pArg->iTable, pArg->iColumn, target); + }else{ + sqlite3VdbeAddOp2(v, OP_Null, 0, target); + } + break; + } +#endif default: { /* The UNLIKELY() function is a no-op. The result is the value ** of the first argument. @@ -106487,6 +109164,53 @@ static int exprCodeInlineFunction( return target; } +/* +** Check to see if pExpr is one of the indexed expressions on pParse->pIdxExpr. +** If it is, then resolve the expression by reading from the index and +** return the register into which the value has been read. If pExpr is +** not an indexed expression, then return negative. +*/ +static SQLITE_NOINLINE int sqlite3IndexedExprLookup( + Parse *pParse, /* The parsing context */ + Expr *pExpr, /* The expression to potentially bypass */ + int target /* Where to store the result of the expression */ +){ + IndexedExpr *p; + Vdbe *v; + for(p=pParse->pIdxExpr; p; p=p->pIENext){ + int iDataCur = p->iDataCur; + if( iDataCur<0 ) continue; + if( pParse->iSelfTab ){ + if( p->iDataCur!=pParse->iSelfTab-1 ) continue; + iDataCur = -1; + } + if( sqlite3ExprCompare(0, pExpr, p->pExpr, iDataCur)!=0 ) continue; + v = pParse->pVdbe; + assert( v!=0 ); + if( p->bMaybeNullRow ){ + /* If the index is on a NULL row due to an outer join, then we + ** cannot extract the value from the index. The value must be + ** computed using the original expression. */ + int addr = sqlite3VdbeCurrentAddr(v); + sqlite3VdbeAddOp3(v, OP_IfNullRow, p->iIdxCur, addr+3, target); + VdbeCoverage(v); + sqlite3VdbeAddOp3(v, OP_Column, p->iIdxCur, p->iIdxCol, target); + VdbeComment((v, "%s expr-column %d", p->zIdxName, p->iIdxCol)); + sqlite3VdbeGoto(v, 0); + p = pParse->pIdxExpr; + pParse->pIdxExpr = 0; + sqlite3ExprCode(pParse, pExpr, target); + pParse->pIdxExpr = p; + sqlite3VdbeJumpHere(v, addr+2); + }else{ + sqlite3VdbeAddOp3(v, OP_Column, p->iIdxCur, p->iIdxCol, target); + VdbeComment((v, "%s expr-column %d", p->zIdxName, p->iIdxCol)); + } + return target; + } + return -1; /* Not found */ +} + /* ** Generate code into the current Vdbe to evaluate the given @@ -106515,6 +109239,11 @@ SQLITE_PRIVATE int sqlite3ExprCodeTarget(Parse *pParse, Expr *pExpr, int target) expr_code_doover: if( pExpr==0 ){ op = TK_NULL; + }else if( pParse->pIdxExpr!=0 + && !ExprHasProperty(pExpr, EP_Leaf) + && (r1 = sqlite3IndexedExprLookup(pParse, pExpr, target))>=0 + ){ + return r1; }else{ assert( !ExprHasVVAProperty(pExpr,EP_Immutable) ); op = pExpr->op; @@ -106535,7 +109264,7 @@ expr_code_doover: pCol->iSorterColumn, target); if( pCol->iColumn<0 ){ VdbeComment((v,"%s.rowid",pTab->zName)); - }else{ + }else if( ALWAYS(pTab!=0) ){ VdbeComment((v,"%s.%s", pTab->zName, pTab->aCol[pCol->iColumn].zCnName)); if( pTab->aCol[pCol->iColumn].affinity==SQLITE_AFF_REAL ){ @@ -106560,11 +109289,8 @@ expr_code_doover: int aff; iReg = sqlite3ExprCodeTarget(pParse, pExpr->pLeft,target); assert( ExprUseYTab(pExpr) ); - if( pExpr->y.pTab ){ - aff = sqlite3TableColumnAffinity(pExpr->y.pTab, pExpr->iColumn); - }else{ - aff = pExpr->affExpr; - } + assert( pExpr->y.pTab!=0 ); + aff = sqlite3TableColumnAffinity(pExpr->y.pTab, pExpr->iColumn); if( aff>SQLITE_AFF_BLOB ){ static const char zAff[] = "B\000C\000D\000E"; assert( SQLITE_AFF_BLOB=='A' ); @@ -106626,12 +109352,10 @@ expr_code_doover: } } assert( ExprUseYTab(pExpr) ); + assert( pExpr->y.pTab!=0 ); iReg = sqlite3ExprCodeGetColumn(pParse, pExpr->y.pTab, pExpr->iColumn, iTab, target, pExpr->op2); - if( pExpr->y.pTab==0 && pExpr->affExpr==SQLITE_AFF_REAL ){ - sqlite3VdbeAddOp1(v, OP_RealAffinity, iReg); - } return iReg; } case TK_INTEGER: { @@ -106960,20 +109684,8 @@ expr_code_doover: if( !pColl ) pColl = db->pDfltColl; sqlite3VdbeAddOp4(v, OP_CollSeq, 0, 0, 0, (char *)pColl, P4_COLLSEQ); } -#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC - if( (pDef->funcFlags & SQLITE_FUNC_OFFSET)!=0 && ALWAYS(pFarg!=0) ){ - Expr *pArg = pFarg->a[0].pExpr; - if( pArg->op==TK_COLUMN ){ - sqlite3VdbeAddOp3(v, OP_Offset, pArg->iTable, pArg->iColumn, target); - }else{ - sqlite3VdbeAddOp2(v, OP_Null, 0, target); - } - }else -#endif - { - sqlite3VdbeAddFunctionCall(pParse, constMask, r1, target, nFarg, - pDef, pExpr->op2); - } + sqlite3VdbeAddFunctionCall(pParse, constMask, r1, target, nFarg, + pDef, pExpr->op2); if( nFarg ){ if( constMask==0 ){ sqlite3ReleaseTempRange(pParse, r1, nFarg); @@ -107003,16 +109715,18 @@ expr_code_doover: } case TK_SELECT_COLUMN: { int n; - if( pExpr->pLeft->iTable==0 ){ - pExpr->pLeft->iTable = sqlite3CodeSubselect(pParse, pExpr->pLeft); + Expr *pLeft = pExpr->pLeft; + if( pLeft->iTable==0 || pParse->withinRJSubrtn > pLeft->op2 ){ + pLeft->iTable = sqlite3CodeSubselect(pParse, pLeft); + pLeft->op2 = pParse->withinRJSubrtn; } - assert( pExpr->pLeft->op==TK_SELECT || pExpr->pLeft->op==TK_ERROR ); - n = sqlite3ExprVectorSize(pExpr->pLeft); + assert( pLeft->op==TK_SELECT || pLeft->op==TK_ERROR ); + n = sqlite3ExprVectorSize(pLeft); if( pExpr->iTable!=n ){ sqlite3ErrorMsg(pParse, "%d columns assigned %d values", pExpr->iTable, n); } - return pExpr->pLeft->iTable + pExpr->iColumn; + return pLeft->iTable + pExpr->iColumn; } case TK_IN: { int destIfFalse = sqlite3VdbeMakeLabel(pParse); @@ -107043,8 +109757,24 @@ expr_code_doover: exprCodeBetween(pParse, pExpr, target, 0, 0); return target; } + case TK_COLLATE: { + if( !ExprHasProperty(pExpr, EP_Collate) + && ALWAYS(pExpr->pLeft) + && pExpr->pLeft->op==TK_FUNCTION + ){ + inReg = sqlite3ExprCodeTarget(pParse, pExpr->pLeft, target); + if( inReg!=target ){ + sqlite3VdbeAddOp2(v, OP_SCopy, inReg, target); + inReg = target; + } + sqlite3VdbeAddOp1(v, OP_ClrSubtype, inReg); + return inReg; + }else{ + pExpr = pExpr->pLeft; + goto expr_code_doover; /* 2018-04-28: Prevent deep recursion. */ + } + } case TK_SPAN: - case TK_COLLATE: case TK_UPLUS: { pExpr = pExpr->pLeft; goto expr_code_doover; /* 2018-04-28: Prevent deep recursion. OSSFuzz. */ @@ -107124,6 +109854,21 @@ expr_code_doover: case TK_IF_NULL_ROW: { int addrINR; u8 okConstFactor = pParse->okConstFactor; + AggInfo *pAggInfo = pExpr->pAggInfo; + if( pAggInfo ){ + assert( pExpr->iAgg>=0 && pExpr->iAggnColumn ); + if( !pAggInfo->directMode ){ + inReg = pAggInfo->aCol[pExpr->iAgg].iMem; + break; + } + if( pExpr->pAggInfo->useSortingIdx ){ + sqlite3VdbeAddOp3(v, OP_Column, pAggInfo->sortingIdxPTab, + pAggInfo->aCol[pExpr->iAgg].iSorterColumn, + target); + inReg = target; + break; + } + } addrINR = sqlite3VdbeAddOp1(v, OP_IfNullRow, pExpr->iTable); /* Temporarily disable factoring of constant expressions, since ** even though expressions may appear to be constant, they are not @@ -107285,7 +110030,9 @@ SQLITE_PRIVATE int sqlite3ExprCodeRunJustOnce( struct ExprList_item *pItem; int i; for(pItem=p->a, i=p->nExpr; i>0; pItem++, i--){ - if( pItem->reusable && sqlite3ExprCompare(0,pItem->pExpr,pExpr,-1)==0 ){ + if( pItem->fg.reusable + && sqlite3ExprCompare(0,pItem->pExpr,pExpr,-1)==0 + ){ return pItem->u.iConstExprReg; } } @@ -107308,7 +110055,7 @@ SQLITE_PRIVATE int sqlite3ExprCodeRunJustOnce( p = sqlite3ExprListAppend(pParse, p, pExpr); if( p ){ struct ExprList_item *pItem = &p->a[p->nExpr-1]; - pItem->reusable = regDest<0; + pItem->fg.reusable = regDest<0; if( regDest<0 ) regDest = ++pParse->nMem; pItem->u.iConstExprReg = regDest; } @@ -107442,7 +110189,7 @@ SQLITE_PRIVATE int sqlite3ExprCodeExprList( for(pItem=pList->a, i=0; ipExpr; #ifdef SQLITE_ENABLE_SORTER_REFERENCES - if( pItem->bSorterRef ){ + if( pItem->fg.bSorterRef ){ i--; n--; }else @@ -107463,7 +110210,7 @@ SQLITE_PRIVATE int sqlite3ExprCodeExprList( if( inReg!=target+i ){ VdbeOp *pOp; if( copyOp==OP_Copy - && (pOp=sqlite3VdbeGetOp(v, -1))->opcode==OP_Copy + && (pOp=sqlite3VdbeGetLastOp(v))->opcode==OP_Copy && pOp->p1+pOp->p3+1==inReg && pOp->p2+pOp->p3+1==target+i && pOp->p5==0 /* The do-not-merge flag must be clear */ @@ -107536,8 +110283,8 @@ static void exprCodeBetween( ** so that the sqlite3ExprCodeTarget() routine will not attempt to move ** it into the Parse.pConstExpr list. We should use a new bit for this, ** for clarity, but we are out of bits in the Expr.flags field so we - ** have to reuse the EP_FromJoin bit. Bummer. */ - pDel->flags |= EP_FromJoin; + ** have to reuse the EP_OuterON bit. Bummer. */ + pDel->flags |= EP_OuterON; sqlite3ExprCodeTarget(pParse, &exprAnd, dest); } sqlite3ReleaseTempReg(pParse, regFree1); @@ -107662,6 +110409,7 @@ SQLITE_PRIVATE void sqlite3ExprIfTrue(Parse *pParse, Expr *pExpr, int dest, int assert( TK_ISNULL==OP_IsNull ); testcase( op==TK_ISNULL ); assert( TK_NOTNULL==OP_NotNull ); testcase( op==TK_NOTNULL ); r1 = sqlite3ExprCodeTemp(pParse, pExpr->pLeft, ®Free1); + sqlite3VdbeTypeofColumn(v, r1); sqlite3VdbeAddOp2(v, op, r1, dest); VdbeCoverageIf(v, op==TK_ISNULL); VdbeCoverageIf(v, op==TK_NOTNULL); @@ -107836,6 +110584,7 @@ SQLITE_PRIVATE void sqlite3ExprIfFalse(Parse *pParse, Expr *pExpr, int dest, int case TK_ISNULL: case TK_NOTNULL: { r1 = sqlite3ExprCodeTemp(pParse, pExpr->pLeft, ®Free1); + sqlite3VdbeTypeofColumn(v, r1); sqlite3VdbeAddOp2(v, op, r1, dest); testcase( op==TK_ISNULL ); VdbeCoverageIf(v, op==TK_ISNULL); testcase( op==TK_NOTNULL ); VdbeCoverageIf(v, op==TK_NOTNULL); @@ -107989,7 +110738,13 @@ SQLITE_PRIVATE int sqlite3ExprCompare( if( pB->op==TK_COLLATE && sqlite3ExprCompare(pParse, pA,pB->pLeft,iTab)<2 ){ return 1; } - return 2; + if( pA->op==TK_AGG_COLUMN && pB->op==TK_COLUMN + && pB->iTable<0 && pA->iTable==iTab + ){ + /* fall through */ + }else{ + return 2; + } } assert( !ExprHasProperty(pA, EP_IntValue) ); assert( !ExprHasProperty(pB, EP_IntValue) ); @@ -108067,7 +110822,7 @@ SQLITE_PRIVATE int sqlite3ExprListCompare(const ExprList *pA, const ExprList *pB int res; Expr *pExprA = pA->a[i].pExpr; Expr *pExprB = pB->a[i].pExpr; - if( pA->a[i].sortFlags!=pB->a[i].sortFlags ) return 1; + if( pA->a[i].fg.sortFlags!=pB->a[i].fg.sortFlags ) return 1; if( (res = sqlite3ExprCompare(0, pExprA, pExprB, iTab)) ) return res; } return 0; @@ -108222,7 +110977,7 @@ SQLITE_PRIVATE int sqlite3ExprImpliesExpr( static int impliesNotNullRow(Walker *pWalker, Expr *pExpr){ testcase( pExpr->op==TK_AGG_COLUMN ); testcase( pExpr->op==TK_AGG_FUNCTION ); - if( ExprHasProperty(pExpr, EP_FromJoin) ) return WRC_Prune; + if( ExprHasProperty(pExpr, EP_OuterON) ) return WRC_Prune; switch( pExpr->op ){ case TK_ISNOT: case TK_ISNULL: @@ -108291,10 +111046,10 @@ static int impliesNotNullRow(Walker *pWalker, Expr *pExpr){ assert( pLeft->op!=TK_COLUMN || ExprUseYTab(pLeft) ); assert( pRight->op!=TK_COLUMN || ExprUseYTab(pRight) ); if( (pLeft->op==TK_COLUMN - && pLeft->y.pTab!=0 + && ALWAYS(pLeft->y.pTab!=0) && IsVirtual(pLeft->y.pTab)) || (pRight->op==TK_COLUMN - && pRight->y.pTab!=0 + && ALWAYS(pRight->y.pTab!=0) && IsVirtual(pRight->y.pTab)) ){ return WRC_Prune; @@ -108319,8 +111074,8 @@ static int impliesNotNullRow(Walker *pWalker, Expr *pExpr){ ** False positives are not allowed, however. A false positive may result ** in an incorrect answer. ** -** Terms of p that are marked with EP_FromJoin (and hence that come from -** the ON or USING clauses of LEFT JOINS) are excluded from the analysis. +** Terms of p that are marked with EP_OuterON (and hence that come from +** the ON or USING clauses of OUTER JOINS) are excluded from the analysis. ** ** This routine is used to check if a LEFT JOIN can be converted into ** an ordinary JOIN. The p argument is the WHERE clause. If the WHERE @@ -108499,6 +111254,7 @@ static int exprRefToSrcList(Walker *pWalker, Expr *pExpr){ SQLITE_PRIVATE int sqlite3ReferencesSrcList(Parse *pParse, Expr *pExpr, SrcList *pSrcList){ Walker w; struct RefSrcList x; + assert( pParse->db!=0 ); memset(&w, 0, sizeof(w)); memset(&x, 0, sizeof(x)); w.xExprCallback = exprRefToSrcList; @@ -108515,7 +111271,7 @@ SQLITE_PRIVATE int sqlite3ReferencesSrcList(Parse *pParse, Expr *pExpr, SrcList sqlite3WalkExpr(&w, pExpr->y.pWin->pFilter); } #endif - sqlite3DbFree(pParse->db, x.aiExclude); + if( x.aiExclude ) sqlite3DbNNFreeNN(pParse->db, x.aiExclude); if( w.eCode & 0x01 ){ return 1; }else if( w.eCode ){ @@ -108546,8 +111302,8 @@ static int agginfoPersistExprCb(Walker *pWalker, Expr *pExpr){ int iAgg = pExpr->iAgg; Parse *pParse = pWalker->pParse; sqlite3 *db = pParse->db; - assert( pExpr->op==TK_AGG_COLUMN || pExpr->op==TK_AGG_FUNCTION ); - if( pExpr->op==TK_AGG_COLUMN ){ + if( pExpr->op!=TK_AGG_FUNCTION ){ + assert( pExpr->op==TK_AGG_COLUMN || pExpr->op==TK_IF_NULL_ROW ); assert( iAgg>=0 && iAggnColumn ); if( pAggInfo->aCol[iAgg].pCExpr==pExpr ){ pExpr = sqlite3ExprDup(db, pExpr, 0); @@ -108557,6 +111313,7 @@ static int agginfoPersistExprCb(Walker *pWalker, Expr *pExpr){ } } }else{ + assert( pExpr->op==TK_AGG_FUNCTION ); assert( iAgg>=0 && iAggnFunc ); if( pAggInfo->aFunc[iAgg].pFExpr==pExpr ){ pExpr = sqlite3ExprDup(db, pExpr, 0); @@ -108627,10 +111384,12 @@ static int analyzeAggregate(Walker *pWalker, Expr *pExpr){ assert( pNC->ncFlags & NC_UAggInfo ); switch( pExpr->op ){ + case TK_IF_NULL_ROW: case TK_AGG_COLUMN: case TK_COLUMN: { testcase( pExpr->op==TK_AGG_COLUMN ); testcase( pExpr->op==TK_COLUMN ); + testcase( pExpr->op==TK_IF_NULL_ROW ); /* Check to see if the column is in one of the tables in the FROM ** clause of the aggregate query */ if( ALWAYS(pSrcList!=0) ){ @@ -108648,8 +111407,10 @@ static int analyzeAggregate(Walker *pWalker, Expr *pExpr){ int k; pCol = pAggInfo->aCol; for(k=0; knColumn; k++, pCol++){ - if( pCol->iTable==pExpr->iTable && - pCol->iColumn==pExpr->iColumn ){ + if( pCol->iTable==pExpr->iTable + && pCol->iColumn==pExpr->iColumn + && pExpr->op!=TK_IF_NULL_ROW + ){ break; } } @@ -108664,15 +111425,17 @@ static int analyzeAggregate(Walker *pWalker, Expr *pExpr){ pCol->iMem = ++pParse->nMem; pCol->iSorterColumn = -1; pCol->pCExpr = pExpr; - if( pAggInfo->pGroupBy ){ + if( pAggInfo->pGroupBy && pExpr->op!=TK_IF_NULL_ROW ){ int j, n; ExprList *pGB = pAggInfo->pGroupBy; struct ExprList_item *pTerm = pGB->a; n = pGB->nExpr; for(j=0; jpExpr; - if( pE->op==TK_COLUMN && pE->iTable==pExpr->iTable && - pE->iColumn==pExpr->iColumn ){ + if( pE->op==TK_COLUMN + && pE->iTable==pExpr->iTable + && pE->iColumn==pExpr->iColumn + ){ pCol->iSorterColumn = j; break; } @@ -108689,7 +111452,9 @@ static int analyzeAggregate(Walker *pWalker, Expr *pExpr){ */ ExprSetVVAProperty(pExpr, EP_NoReduce); pExpr->pAggInfo = pAggInfo; - pExpr->op = TK_AGG_COLUMN; + if( pExpr->op==TK_COLUMN ){ + pExpr->op = TK_AGG_COLUMN; + } pExpr->iAgg = (i16)k; break; } /* endif pExpr->iTable==pItem->iCursor */ @@ -109734,11 +112499,10 @@ static void unmapColumnIdlistNames( Parse *pParse, const IdList *pIdList ){ - if( pIdList ){ - int ii; - for(ii=0; iinId; ii++){ - sqlite3RenameTokenRemap(pParse, 0, (const void*)pIdList->a[ii].zName); - } + int ii; + assert( pIdList!=0 ); + for(ii=0; iinId; ii++){ + sqlite3RenameTokenRemap(pParse, 0, (const void*)pIdList->a[ii].zName); } } @@ -109757,7 +112521,7 @@ static int renameUnmapSelectCb(Walker *pWalker, Select *p){ if( ALWAYS(p->pEList) ){ ExprList *pList = p->pEList; for(i=0; inExpr; i++){ - if( pList->a[i].zEName && pList->a[i].eEName==ENAME_NAME ){ + if( pList->a[i].zEName && pList->a[i].fg.eEName==ENAME_NAME ){ sqlite3RenameTokenRemap(pParse, 0, (void*)pList->a[i].zEName); } } @@ -109766,8 +112530,11 @@ static int renameUnmapSelectCb(Walker *pWalker, Select *p){ SrcList *pSrc = p->pSrc; for(i=0; inSrc; i++){ sqlite3RenameTokenRemap(pParse, 0, (void*)pSrc->a[i].zName); - sqlite3WalkExpr(pWalker, pSrc->a[i].pOn); - unmapColumnIdlistNames(pParse, pSrc->a[i].pUsing); + if( pSrc->a[i].fg.isUsing==0 ){ + sqlite3WalkExpr(pWalker, pSrc->a[i].u3.pOn); + }else{ + unmapColumnIdlistNames(pParse, pSrc->a[i].u3.pUsing); + } } } @@ -109803,7 +112570,7 @@ SQLITE_PRIVATE void sqlite3RenameExprlistUnmap(Parse *pParse, ExprList *pEList){ sWalker.xExprCallback = renameUnmapExprCb; sqlite3WalkExprList(&sWalker, pEList); for(i=0; inExpr; i++){ - if( ALWAYS(pEList->a[i].eEName==ENAME_NAME) ){ + if( ALWAYS(pEList->a[i].fg.eEName==ENAME_NAME) ){ sqlite3RenameTokenRemap(pParse, 0, (void*)pEList->a[i].zEName); } } @@ -109961,7 +112728,7 @@ static void renameColumnElistNames( int i; for(i=0; inExpr; i++){ const char *zName = pEList->a[i].zEName; - if( ALWAYS(pEList->a[i].eEName==ENAME_NAME) + if( ALWAYS(pEList->a[i].fg.eEName==ENAME_NAME) && ALWAYS(zName!=0) && 0==sqlite3_stricmp(zName, zOld) ){ @@ -110191,27 +112958,33 @@ static int renameResolveTrigger(Parse *pParse){ if( rc==SQLITE_OK && pStep->zTarget ){ SrcList *pSrc = sqlite3TriggerStepSrc(pParse, pStep); if( pSrc ){ - int i; - for(i=0; inSrc && rc==SQLITE_OK; i++){ - SrcItem *p = &pSrc->a[i]; - p->iCursor = pParse->nTab++; - if( p->pSelect ){ - sqlite3SelectPrep(pParse, p->pSelect, 0); - sqlite3ExpandSubquery(pParse, p); - assert( i>0 ); - assert( pStep->pFrom->a[i-1].pSelect ); - sqlite3SelectPrep(pParse, pStep->pFrom->a[i-1].pSelect, 0); - }else{ - p->pTab = sqlite3LocateTableItem(pParse, 0, p); - if( p->pTab==0 ){ - rc = SQLITE_ERROR; - }else{ - p->pTab->nTabRef++; - rc = sqlite3ViewGetColumnNames(pParse, p->pTab); + Select *pSel = sqlite3SelectNew( + pParse, pStep->pExprList, pSrc, 0, 0, 0, 0, 0, 0 + ); + if( pSel==0 ){ + pStep->pExprList = 0; + pSrc = 0; + rc = SQLITE_NOMEM; + }else{ + sqlite3SelectPrep(pParse, pSel, 0); + rc = pParse->nErr ? SQLITE_ERROR : SQLITE_OK; + assert( pStep->pExprList==0 || pStep->pExprList==pSel->pEList ); + assert( pSrc==pSel->pSrc ); + if( pStep->pExprList ) pSel->pEList = 0; + pSel->pSrc = 0; + sqlite3SelectDelete(db, pSel); + } + if( pStep->pFrom ){ + int i; + for(i=0; ipFrom->nSrc && rc==SQLITE_OK; i++){ + SrcItem *p = &pStep->pFrom->a[i]; + if( p->pSelect ){ + sqlite3SelectPrep(pParse, p->pSelect, 0); } } } - if( rc==SQLITE_OK && db->mallocFailed ){ + + if( db->mallocFailed ){ rc = SQLITE_NOMEM; } sNC.pSrcList = pSrc; @@ -110663,6 +113436,15 @@ static void renameTableFunc( if( pStep->zTarget && 0==sqlite3_stricmp(pStep->zTarget, zOld) ){ renameTokenFind(&sParse, &sCtx, pStep->zTarget); } + if( pStep->pFrom ){ + int i; + for(i=0; ipFrom->nSrc; i++){ + SrcItem *pItem = &pStep->pFrom->a[i]; + if( 0==sqlite3_stricmp(pItem->zName, zOld) ){ + renameTokenFind(&sParse, &sCtx, pItem->zName); + } + } + } } } } @@ -111986,9 +114768,14 @@ static void statGet( ** * "WHERE a=? AND b=?" matches 2 rows. ** ** If D is the count of distinct values and K is the total number of - ** rows, then each estimate is computed as: + ** rows, then each estimate is usually computed as: ** ** I = (K+D-1)/D + ** + ** In other words, I is K/D rounded up to the next whole integer. + ** However, if I is between 1.0 and 1.1 (in other words if I is + ** close to 1.0 but just a little larger) then do not round up but + ** instead keep the I value at 1.0. */ sqlite3_str sStat; /* Text of the constructed "stat" line */ int i; /* Loop counter */ @@ -111999,6 +114786,7 @@ static void statGet( for(i=0; inKeyCol; i++){ u64 nDistinct = p->current.anDLt[i] + 1; u64 iVal = (p->nRow + nDistinct - 1) / nDistinct; + if( iVal==2 && p->nRow*10 <= nDistinct*11 ) iVal = 1; sqlite3_str_appendf(&sStat, " %llu", iVal); assert( p->current.anEq[i] ); } @@ -112086,6 +114874,7 @@ static void analyzeVdbeCommentIndexWithColumnName( if( NEVER(i==XN_ROWID) ){ VdbeComment((v,"%s.rowid",pIdx->zName)); }else if( i==XN_EXPR ){ + assert( pIdx->bHasExpr ); VdbeComment((v,"%s.expr(%d)",pIdx->zName, k)); }else{ VdbeComment((v,"%s.%s", pIdx->zName, pIdx->pTable->aCol[i].zCnName)); @@ -112162,7 +114951,7 @@ static void analyzeOneTable( memcpy(pStat1->zName, "sqlite_stat1", 13); pStat1->nCol = 3; pStat1->iPKey = -1; - sqlite3VdbeAddOp4(pParse->pVdbe, OP_Noop, 0, 0, 0,(char*)pStat1,P4_DYNBLOB); + sqlite3VdbeAddOp4(pParse->pVdbe, OP_Noop, 0, 0, 0,(char*)pStat1,P4_DYNAMIC); } #endif @@ -113552,7 +116341,11 @@ static int fixSelectCb(Walker *p, Select *pSelect){ pItem->fg.fromDDL = 1; } #if !defined(SQLITE_OMIT_VIEW) || !defined(SQLITE_OMIT_TRIGGER) - if( sqlite3WalkExpr(&pFix->w, pList->a[i].pOn) ) return WRC_Abort; + if( pList->a[i].fg.isUsing==0 + && sqlite3WalkExpr(&pFix->w, pList->a[i].u3.pOn) + ){ + return WRC_Abort; + } #endif } if( pSelect->pWith ){ @@ -114084,6 +116877,7 @@ SQLITE_PRIVATE int sqlite3DbMaskAllZero(yDbMask m){ SQLITE_PRIVATE void sqlite3FinishCoding(Parse *pParse){ sqlite3 *db; Vdbe *v; + int iDb, i; assert( pParse->pToplevel==0 ); db = pParse->db; @@ -114113,12 +116907,9 @@ SQLITE_PRIVATE void sqlite3FinishCoding(Parse *pParse){ if( pParse->bReturning ){ Returning *pReturning = pParse->u1.pReturning; int addrRewind; - int i; int reg; - if( NEVER(pReturning->nRetCol==0) ){ - assert( CORRUPT_DB ); - }else{ + if( pReturning->nRetCol ){ sqlite3VdbeAddOp0(v, OP_FkCheck); addrRewind = sqlite3VdbeAddOp1(v, OP_Rewind, pReturning->iRetCur); @@ -114152,76 +116943,69 @@ SQLITE_PRIVATE void sqlite3FinishCoding(Parse *pParse){ ** transaction on each used database and to verify the schema cookie ** on each used database. */ - if( db->mallocFailed==0 - && (DbMaskNonZero(pParse->cookieMask) || pParse->pConstExpr) - ){ - int iDb, i; - assert( sqlite3VdbeGetOp(v, 0)->opcode==OP_Init ); - sqlite3VdbeJumpHere(v, 0); - for(iDb=0; iDbnDb; iDb++){ - Schema *pSchema; - if( DbMaskTest(pParse->cookieMask, iDb)==0 ) continue; - sqlite3VdbeUsesBtree(v, iDb); - pSchema = db->aDb[iDb].pSchema; - sqlite3VdbeAddOp4Int(v, - OP_Transaction, /* Opcode */ - iDb, /* P1 */ - DbMaskTest(pParse->writeMask,iDb), /* P2 */ - pSchema->schema_cookie, /* P3 */ - pSchema->iGeneration /* P4 */ - ); - if( db->init.busy==0 ) sqlite3VdbeChangeP5(v, 1); - VdbeComment((v, - "usesStmtJournal=%d", pParse->mayAbort && pParse->isMultiWrite)); - } + assert( pParse->nErr>0 || sqlite3VdbeGetOp(v, 0)->opcode==OP_Init ); + sqlite3VdbeJumpHere(v, 0); + assert( db->nDb>0 ); + iDb = 0; + do{ + Schema *pSchema; + if( DbMaskTest(pParse->cookieMask, iDb)==0 ) continue; + sqlite3VdbeUsesBtree(v, iDb); + pSchema = db->aDb[iDb].pSchema; + sqlite3VdbeAddOp4Int(v, + OP_Transaction, /* Opcode */ + iDb, /* P1 */ + DbMaskTest(pParse->writeMask,iDb), /* P2 */ + pSchema->schema_cookie, /* P3 */ + pSchema->iGeneration /* P4 */ + ); + if( db->init.busy==0 ) sqlite3VdbeChangeP5(v, 1); + VdbeComment((v, + "usesStmtJournal=%d", pParse->mayAbort && pParse->isMultiWrite)); + }while( ++iDbnDb ); #ifndef SQLITE_OMIT_VIRTUALTABLE - for(i=0; inVtabLock; i++){ - char *vtab = (char *)sqlite3GetVTable(db, pParse->apVtabLock[i]); - sqlite3VdbeAddOp4(v, OP_VBegin, 0, 0, 0, vtab, P4_VTAB); - } - pParse->nVtabLock = 0; + for(i=0; inVtabLock; i++){ + char *vtab = (char *)sqlite3GetVTable(db, pParse->apVtabLock[i]); + sqlite3VdbeAddOp4(v, OP_VBegin, 0, 0, 0, vtab, P4_VTAB); + } + pParse->nVtabLock = 0; #endif - /* Once all the cookies have been verified and transactions opened, - ** obtain the required table-locks. This is a no-op unless the - ** shared-cache feature is enabled. - */ - codeTableLocks(pParse); + /* Once all the cookies have been verified and transactions opened, + ** obtain the required table-locks. This is a no-op unless the + ** shared-cache feature is enabled. + */ + codeTableLocks(pParse); - /* Initialize any AUTOINCREMENT data structures required. - */ - sqlite3AutoincrementBegin(pParse); + /* Initialize any AUTOINCREMENT data structures required. + */ + sqlite3AutoincrementBegin(pParse); - /* Code constant expressions that where factored out of inner loops. - ** - ** The pConstExpr list might also contain expressions that we simply - ** want to keep around until the Parse object is deleted. Such - ** expressions have iConstExprReg==0. Do not generate code for - ** those expressions, of course. - */ - if( pParse->pConstExpr ){ - ExprList *pEL = pParse->pConstExpr; - pParse->okConstFactor = 0; - for(i=0; inExpr; i++){ - int iReg = pEL->a[i].u.iConstExprReg; - if( iReg>0 ){ - sqlite3ExprCode(pParse, pEL->a[i].pExpr, iReg); - } - } + /* Code constant expressions that where factored out of inner loops. + ** + ** The pConstExpr list might also contain expressions that we simply + ** want to keep around until the Parse object is deleted. Such + ** expressions have iConstExprReg==0. Do not generate code for + ** those expressions, of course. + */ + if( pParse->pConstExpr ){ + ExprList *pEL = pParse->pConstExpr; + pParse->okConstFactor = 0; + for(i=0; inExpr; i++){ + int iReg = pEL->a[i].u.iConstExprReg; + sqlite3ExprCode(pParse, pEL->a[i].pExpr, iReg); } + } - if( pParse->bReturning ){ - Returning *pRet = pParse->u1.pReturning; - if( NEVER(pRet->nRetCol==0) ){ - assert( CORRUPT_DB ); - }else{ - sqlite3VdbeAddOp2(v, OP_OpenEphemeral, pRet->iRetCur, pRet->nRetCol); - } + if( pParse->bReturning ){ + Returning *pRet = pParse->u1.pReturning; + if( pRet->nRetCol ){ + sqlite3VdbeAddOp2(v, OP_OpenEphemeral, pRet->iRetCur, pRet->nRetCol); } - - /* Finally, jump back to the beginning of the executable code. */ - sqlite3VdbeGoto(v, 1); } + + /* Finally, jump back to the beginning of the executable code. */ + sqlite3VdbeGoto(v, 1); } /* Get the VDBE program ready for execution @@ -114277,8 +117061,6 @@ SQLITE_PRIVATE void sqlite3NestedParse(Parse *pParse, const char *zFormat, ...){ memset(PARSE_TAIL(pParse), 0, PARSE_TAIL_SZ); db->mDbFlags |= DBFLAG_PreferBuiltin; sqlite3RunParser(pParse, zSql); - sqlite3DbFree(db, pParse->zErrMsg); - pParse->zErrMsg = 0; db->mDbFlags = savedDbFlags; sqlite3DbFree(db, zSql); memcpy(PARSE_TAIL(pParse), saveBuf, PARSE_TAIL_SZ); @@ -114408,7 +117190,7 @@ SQLITE_PRIVATE Table *sqlite3LocateTable( /* If zName is the not the name of a table in the schema created using ** CREATE, then check to see if it is the name of an virtual table that ** can be an eponymous virtual table. */ - if( pParse->disableVtab==0 && db->init.busy==0 ){ + if( (pParse->prepFlags & SQLITE_PREPARE_NO_VTAB)==0 && db->init.busy==0 ){ Module *pMod = (Module*)sqlite3HashFind(&db->aModule, zName); if( pMod==0 && sqlite3_strnicmp(zName, "pragma_", 7)==0 ){ pMod = sqlite3PragmaVtabRegister(db, zName); @@ -114421,7 +117203,7 @@ SQLITE_PRIVATE Table *sqlite3LocateTable( #endif if( flags & LOCATE_NOERR ) return 0; pParse->checkSchema = 1; - }else if( IsVirtual(p) && pParse->disableVtab ){ + }else if( IsVirtual(p) && (pParse->prepFlags & SQLITE_PREPARE_NO_VTAB)!=0 ){ p = 0; } @@ -114730,16 +117512,17 @@ SQLITE_PRIVATE void sqlite3DeleteColumnNames(sqlite3 *db, Table *pTable){ int i; Column *pCol; assert( pTable!=0 ); + assert( db!=0 ); if( (pCol = pTable->aCol)!=0 ){ for(i=0; inCol; i++, pCol++){ assert( pCol->zCnName==0 || pCol->hName==sqlite3StrIHash(pCol->zCnName) ); sqlite3DbFree(db, pCol->zCnName); } - sqlite3DbFree(db, pTable->aCol); + sqlite3DbNNFreeNN(db, pTable->aCol); if( IsOrdinaryTable(pTable) ){ sqlite3ExprListDelete(db, pTable->u.tab.pDfltList); } - if( db==0 || db->pnBytesFreed==0 ){ + if( db->pnBytesFreed==0 ){ pTable->aCol = 0; pTable->nCol = 0; if( IsOrdinaryTable(pTable) ){ @@ -114776,7 +117559,8 @@ static void SQLITE_NOINLINE deleteTable(sqlite3 *db, Table *pTable){ ** a Table object that was going to be marked ephemeral. So do not check ** that no lookaside memory is used in this case either. */ int nLookaside = 0; - if( db && !db->mallocFailed && (pTable->tabFlags & TF_Ephemeral)==0 ){ + assert( db!=0 ); + if( !db->mallocFailed && (pTable->tabFlags & TF_Ephemeral)==0 ){ nLookaside = sqlite3LookasideUsed(db, 0); } #endif @@ -114786,7 +117570,7 @@ static void SQLITE_NOINLINE deleteTable(sqlite3 *db, Table *pTable){ pNext = pIndex->pNext; assert( pIndex->pSchema==pTable->pSchema || (IsVirtual(pTable) && pIndex->idxType!=SQLITE_IDXTYPE_APPDEF) ); - if( (db==0 || db->pnBytesFreed==0) && !IsVirtual(pTable) ){ + if( db->pnBytesFreed==0 && !IsVirtual(pTable) ){ char *zName = pIndex->zName; TESTONLY ( Index *pOld = ) sqlite3HashInsert( &pIndex->pSchema->idxHash, zName, 0 @@ -114823,8 +117607,9 @@ static void SQLITE_NOINLINE deleteTable(sqlite3 *db, Table *pTable){ } SQLITE_PRIVATE void sqlite3DeleteTable(sqlite3 *db, Table *pTable){ /* Do not delete the table until the reference count reaches zero. */ + assert( db!=0 ); if( !pTable ) return; - if( ((!db || db->pnBytesFreed==0) && (--pTable->nTabRef)>0) ) return; + if( db->pnBytesFreed==0 && (--pTable->nTabRef)>0 ) return; deleteTable(db, pTable); } @@ -115836,7 +118621,7 @@ SQLITE_PRIVATE void sqlite3AddPrimaryKey( pTab->keyConf = (u8)onError; assert( autoInc==0 || autoInc==1 ); pTab->tabFlags |= autoInc*TF_Autoincrement; - if( pList ) pParse->iPkSortOrder = pList->a[0].sortFlags; + if( pList ) pParse->iPkSortOrder = pList->a[0].fg.sortFlags; (void)sqlite3HasExplicitNulls(pParse, pList); }else if( autoInc ){ #ifndef SQLITE_OMIT_AUTOINCREMENT @@ -116228,7 +119013,8 @@ static int isDupColumn(Index *pIdx, int nKey, Index *pPk, int iCol){ /* Recompute the colNotIdxed field of the Index. ** ** colNotIdxed is a bitmask that has a 0 bit representing each indexed -** columns that are within the first 63 columns of the table. The +** columns that are within the first 63 columns of the table and a 1 for +** all other bits (all columns that are not in the index). The ** high-order bit of colNotIdxed is always 1. All unindexed columns ** of the table have a 1. ** @@ -116256,7 +119042,7 @@ static void recomputeColumnsNotIndexed(Index *pIdx){ } } pIdx->colNotIdxed = ~m; - assert( (pIdx->colNotIdxed>>63)==1 ); + assert( (pIdx->colNotIdxed>>63)==1 ); /* See note-20221022-a */ } /* @@ -116330,7 +119116,7 @@ static void convertToWithoutRowidTable(Parse *pParse, Table *pTab){ if( IN_RENAME_OBJECT ){ sqlite3RenameTokenRemap(pParse, pList->a[0].pExpr, &pTab->iPKey); } - pList->a[0].sortFlags = pParse->iPkSortOrder; + pList->a[0].fg.sortFlags = pParse->iPkSortOrder; assert( pParse->pNewTable==pTab ); pTab->iPKey = -1; sqlite3CreateIndex(pParse, 0, 0, 0, pList, pTab->keyConf, 0, 0, 0, 0, @@ -116997,11 +119783,10 @@ create_view_fail: ** the columns of the view in the pTable structure. Return the number ** of errors. If an error is seen leave an error message in pParse->zErrMsg. */ -SQLITE_PRIVATE int sqlite3ViewGetColumnNames(Parse *pParse, Table *pTable){ +static SQLITE_NOINLINE int viewGetColumnNames(Parse *pParse, Table *pTable){ Table *pSelTab; /* A fake table from which we get the result set */ Select *pSel; /* Copy of the SELECT that implements the view */ int nErr = 0; /* Number of errors encountered */ - int n; /* Temporarily holds the number of cursors assigned */ sqlite3 *db = pParse->db; /* Database connection for malloc errors */ #ifndef SQLITE_OMIT_VIRTUALTABLE int rc; @@ -117023,9 +119808,10 @@ SQLITE_PRIVATE int sqlite3ViewGetColumnNames(Parse *pParse, Table *pTable){ #ifndef SQLITE_OMIT_VIEW /* A positive nCol means the columns names for this view are - ** already known. + ** already known. This routine is not called unless either the + ** table is virtual or nCol is zero. */ - if( pTable->nCol>0 ) return 0; + assert( pTable->nCol<=0 ); /* A negative nCol is a special marker meaning that we are currently ** trying to compute the column names. If we enter this routine with @@ -117059,8 +119845,9 @@ SQLITE_PRIVATE int sqlite3ViewGetColumnNames(Parse *pParse, Table *pTable){ pSel = sqlite3SelectDup(db, pTable->u.view.pSelect, 0); if( pSel ){ u8 eParseMode = pParse->eParseMode; + int nTab = pParse->nTab; + int nSelect = pParse->nSelect; pParse->eParseMode = PARSE_MODE_NORMAL; - n = pParse->nTab; sqlite3SrcListAssignCursors(pParse, pSel->pSrc); pTable->nCol = -1; DisableLookaside; @@ -117072,7 +119859,8 @@ SQLITE_PRIVATE int sqlite3ViewGetColumnNames(Parse *pParse, Table *pTable){ #else pSelTab = sqlite3ResultSetOfSelect(pParse, pSel, SQLITE_AFF_NONE); #endif - pParse->nTab = n; + pParse->nTab = nTab; + pParse->nSelect = nSelect; if( pSelTab==0 ){ pTable->nCol = 0; nErr++; @@ -117119,6 +119907,11 @@ SQLITE_PRIVATE int sqlite3ViewGetColumnNames(Parse *pParse, Table *pTable){ #endif /* SQLITE_OMIT_VIEW */ return nErr; } +SQLITE_PRIVATE int sqlite3ViewGetColumnNames(Parse *pParse, Table *pTable){ + assert( pTable!=0 ); + if( !IsVirtual(pTable) && pTable->nCol>0 ) return 0; + return viewGetColumnNames(pParse, pTable); +} #endif /* !defined(SQLITE_OMIT_VIEW) || !defined(SQLITE_OMIT_VIRTUALTABLE) */ #ifndef SQLITE_OMIT_VIEW @@ -117817,8 +120610,8 @@ SQLITE_PRIVATE int sqlite3HasExplicitNulls(Parse *pParse, ExprList *pList){ if( pList ){ int i; for(i=0; inExpr; i++){ - if( pList->a[i].bNulls ){ - u8 sf = pList->a[i].sortFlags; + if( pList->a[i].fg.bNulls ){ + u8 sf = pList->a[i].fg.sortFlags; sqlite3ErrorMsg(pParse, "unsupported use of NULLS %s", (sf==0 || sf==3) ? "FIRST" : "LAST" ); @@ -117984,7 +120777,7 @@ SQLITE_PRIVATE void sqlite3CreateIndex( } if( !IN_RENAME_OBJECT ){ if( !db->init.busy ){ - if( sqlite3FindTable(db, zName, 0)!=0 ){ + if( sqlite3FindTable(db, zName, pDb->zDbSName)!=0 ){ sqlite3ErrorMsg(pParse, "there is already a table named %s", zName); goto exit_create_index; } @@ -118137,6 +120930,7 @@ SQLITE_PRIVATE void sqlite3CreateIndex( j = XN_EXPR; pIndex->aiColumn[i] = XN_EXPR; pIndex->uniqNotNull = 0; + pIndex->bHasExpr = 1; }else{ j = pCExpr->iColumn; assert( j<=0x7fff ); @@ -118148,6 +120942,7 @@ SQLITE_PRIVATE void sqlite3CreateIndex( } if( pTab->aCol[j].colFlags & COLFLAG_VIRTUAL ){ pIndex->bHasVCol = 1; + pIndex->bHasExpr = 1; } } pIndex->aiColumn[i] = (i16)j; @@ -118171,7 +120966,7 @@ SQLITE_PRIVATE void sqlite3CreateIndex( goto exit_create_index; } pIndex->azColl[i] = zColl; - requestedSortOrder = pListItem->sortFlags & sortOrderMask; + requestedSortOrder = pListItem->fg.sortFlags & sortOrderMask; pIndex->aSortOrder[i] = (u8)requestedSortOrder; } @@ -118614,18 +121409,17 @@ SQLITE_PRIVATE IdList *sqlite3IdListAppend(Parse *pParse, IdList *pList, Token * if( pList==0 ){ pList = sqlite3DbMallocZero(db, sizeof(IdList) ); if( pList==0 ) return 0; + }else{ + IdList *pNew; + pNew = sqlite3DbRealloc(db, pList, + sizeof(IdList) + pList->nId*sizeof(pList->a)); + if( pNew==0 ){ + sqlite3IdListDelete(db, pList); + return 0; + } + pList = pNew; } - pList->a = sqlite3ArrayAllocate( - db, - pList->a, - sizeof(pList->a[0]), - &pList->nId, - &i - ); - if( i<0 ){ - sqlite3IdListDelete(db, pList); - return 0; - } + i = pList->nId++; pList->a[i].zName = sqlite3NameFromToken(db, pToken); if( IN_RENAME_OBJECT && pList->a[i].zName ){ sqlite3RenameTokenMap(pParse, (void*)pList->a[i].zName, pToken); @@ -118638,12 +121432,13 @@ SQLITE_PRIVATE IdList *sqlite3IdListAppend(Parse *pParse, IdList *pList, Token * */ SQLITE_PRIVATE void sqlite3IdListDelete(sqlite3 *db, IdList *pList){ int i; + assert( db!=0 ); if( pList==0 ) return; + assert( pList->eU4!=EU4_EXPR ); /* EU4_EXPR mode is not currently used */ for(i=0; inId; i++){ sqlite3DbFree(db, pList->a[i].zName); } - sqlite3DbFree(db, pList->a); - sqlite3DbFreeNN(db, pList); + sqlite3DbNNFreeNN(db, pList); } /* @@ -118652,7 +121447,7 @@ SQLITE_PRIVATE void sqlite3IdListDelete(sqlite3 *db, IdList *pList){ */ SQLITE_PRIVATE int sqlite3IdListIndex(IdList *pList, const char *zName){ int i; - if( pList==0 ) return -1; + assert( pList!=0 ); for(i=0; inId; i++){ if( sqlite3StrICmp(pList->a[i].zName, zName)==0 ) return i; } @@ -118846,19 +121641,23 @@ SQLITE_PRIVATE void sqlite3SrcListAssignCursors(Parse *pParse, SrcList *pList){ SQLITE_PRIVATE void sqlite3SrcListDelete(sqlite3 *db, SrcList *pList){ int i; SrcItem *pItem; + assert( db!=0 ); if( pList==0 ) return; for(pItem=pList->a, i=0; inSrc; i++, pItem++){ - if( pItem->zDatabase ) sqlite3DbFreeNN(db, pItem->zDatabase); - sqlite3DbFree(db, pItem->zName); - if( pItem->zAlias ) sqlite3DbFreeNN(db, pItem->zAlias); + if( pItem->zDatabase ) sqlite3DbNNFreeNN(db, pItem->zDatabase); + if( pItem->zName ) sqlite3DbNNFreeNN(db, pItem->zName); + if( pItem->zAlias ) sqlite3DbNNFreeNN(db, pItem->zAlias); if( pItem->fg.isIndexedBy ) sqlite3DbFree(db, pItem->u1.zIndexedBy); if( pItem->fg.isTabFunc ) sqlite3ExprListDelete(db, pItem->u1.pFuncArg); sqlite3DeleteTable(db, pItem->pTab); if( pItem->pSelect ) sqlite3SelectDelete(db, pItem->pSelect); - if( pItem->pOn ) sqlite3ExprDelete(db, pItem->pOn); - if( pItem->pUsing ) sqlite3IdListDelete(db, pItem->pUsing); + if( pItem->fg.isUsing ){ + sqlite3IdListDelete(db, pItem->u3.pUsing); + }else if( pItem->u3.pOn ){ + sqlite3ExprDelete(db, pItem->u3.pOn); + } } - sqlite3DbFreeNN(db, pList); + sqlite3DbNNFreeNN(db, pList); } /* @@ -118884,14 +121683,13 @@ SQLITE_PRIVATE SrcList *sqlite3SrcListAppendFromTerm( Token *pDatabase, /* Name of the database containing pTable */ Token *pAlias, /* The right-hand side of the AS subexpression */ Select *pSubquery, /* A subquery used in place of a table name */ - Expr *pOn, /* The ON clause of a join */ - IdList *pUsing /* The USING clause of a join */ + OnOrUsing *pOnUsing /* Either the ON clause or the USING clause */ ){ SrcItem *pItem; sqlite3 *db = pParse->db; - if( !p && (pOn || pUsing) ){ + if( !p && pOnUsing!=0 && (pOnUsing->pOn || pOnUsing->pUsing) ){ sqlite3ErrorMsg(pParse, "a JOIN clause is required before %s", - (pOn ? "ON" : "USING") + (pOnUsing->pOn ? "ON" : "USING") ); goto append_from_error; } @@ -118911,15 +121709,27 @@ SQLITE_PRIVATE SrcList *sqlite3SrcListAppendFromTerm( if( pAlias->n ){ pItem->zAlias = sqlite3NameFromToken(db, pAlias); } - pItem->pSelect = pSubquery; - pItem->pOn = pOn; - pItem->pUsing = pUsing; + if( pSubquery ){ + pItem->pSelect = pSubquery; + if( pSubquery->selFlags & SF_NestedFrom ){ + pItem->fg.isNestedFrom = 1; + } + } + assert( pOnUsing==0 || pOnUsing->pOn==0 || pOnUsing->pUsing==0 ); + assert( pItem->fg.isUsing==0 ); + if( pOnUsing==0 ){ + pItem->u3.pOn = 0; + }else if( pOnUsing->pUsing ){ + pItem->fg.isUsing = 1; + pItem->u3.pUsing = pOnUsing->pUsing; + }else{ + pItem->u3.pOn = pOnUsing->pOn; + } return p; append_from_error: assert( p==0 ); - sqlite3ExprDelete(db, pOn); - sqlite3IdListDelete(db, pUsing); + sqlite3ClearOnOrUsing(db, pOnUsing); sqlite3SelectDelete(db, pSubquery); return 0; } @@ -118964,6 +121774,7 @@ SQLITE_PRIVATE SrcList *sqlite3SrcListAppendList(Parse *pParse, SrcList *p1, Src p1 = pNew; memcpy(&p1->a[1], p2->a, p2->nSrc*sizeof(SrcItem)); sqlite3DbFree(pParse->db, p2); + p1->a[0].fg.jointype |= (JT_LTORJ & p1->a[1].fg.jointype); } } return p1; @@ -119000,14 +121811,34 @@ SQLITE_PRIVATE void sqlite3SrcListFuncArgs(Parse *pParse, SrcList *p, ExprList * ** The operator is "natural cross join". The A and B operands are stored ** in p->a[0] and p->a[1], respectively. The parser initially stores the ** operator with A. This routine shifts that operator over to B. +** +** Additional changes: +** +** * All tables to the left of the right-most RIGHT JOIN are tagged with +** JT_LTORJ (mnemonic: Left Table Of Right Join) so that the +** code generator can easily tell that the table is part of +** the left operand of at least one RIGHT JOIN. */ -SQLITE_PRIVATE void sqlite3SrcListShiftJoinType(SrcList *p){ - if( p ){ - int i; - for(i=p->nSrc-1; i>0; i--){ - p->a[i].fg.jointype = p->a[i-1].fg.jointype; - } +SQLITE_PRIVATE void sqlite3SrcListShiftJoinType(Parse *pParse, SrcList *p){ + (void)pParse; + if( p && p->nSrc>1 ){ + int i = p->nSrc-1; + u8 allFlags = 0; + do{ + allFlags |= p->a[i].fg.jointype = p->a[i-1].fg.jointype; + }while( (--i)>0 ); p->a[0].fg.jointype = 0; + + /* All terms to the left of a RIGHT JOIN should be tagged with the + ** JT_LTORJ flags */ + if( allFlags & JT_RIGHT ){ + for(i=p->nSrc-1; ALWAYS(i>0) && (p->a[i].fg.jointype&JT_RIGHT)==0; i--){} + i--; + assert( i>=0 ); + do{ + p->a[i].fg.jointype |= JT_LTORJ; + }while( (--i)>=0 ); + } } } @@ -120078,19 +122909,21 @@ SQLITE_PRIVATE void sqlite3SchemaClear(void *p){ Hash temp2; HashElem *pElem; Schema *pSchema = (Schema *)p; + sqlite3 xdb; + memset(&xdb, 0, sizeof(xdb)); temp1 = pSchema->tblHash; temp2 = pSchema->trigHash; sqlite3HashInit(&pSchema->trigHash); sqlite3HashClear(&pSchema->idxHash); for(pElem=sqliteHashFirst(&temp2); pElem; pElem=sqliteHashNext(pElem)){ - sqlite3DeleteTrigger(0, (Trigger*)sqliteHashData(pElem)); + sqlite3DeleteTrigger(&xdb, (Trigger*)sqliteHashData(pElem)); } sqlite3HashClear(&temp2); sqlite3HashInit(&pSchema->tblHash); for(pElem=sqliteHashFirst(&temp1); pElem; pElem=sqliteHashNext(pElem)){ Table *pTab = sqliteHashData(pElem); - sqlite3DeleteTable(0, pTab); + sqlite3DeleteTable(&xdb, pTab); } sqlite3HashClear(&temp1); sqlite3HashClear(&pSchema->fkeyHash); @@ -120189,18 +123022,42 @@ SQLITE_PRIVATE void sqlite3CodeChangeCount(Vdbe *v, int regCounter, const char * ** 1) It is a virtual table and no implementation of the xUpdate method ** has been provided ** -** 2) It is a system table (i.e. sqlite_schema), this call is not +** 2) A trigger is currently being coded and the table is a virtual table +** that is SQLITE_VTAB_DIRECTONLY or if PRAGMA trusted_schema=OFF and +** the table is not SQLITE_VTAB_INNOCUOUS. +** +** 3) It is a system table (i.e. sqlite_schema), this call is not ** part of a nested parse and writable_schema pragma has not ** been specified ** -** 3) The table is a shadow table, the database connection is in +** 4) The table is a shadow table, the database connection is in ** defensive mode, and the current sqlite3_prepare() ** is for a top-level SQL statement. */ +static int vtabIsReadOnly(Parse *pParse, Table *pTab){ + if( sqlite3GetVTable(pParse->db, pTab)->pMod->pModule->xUpdate==0 ){ + return 1; + } + + /* Within triggers: + ** * Do not allow DELETE, INSERT, or UPDATE of SQLITE_VTAB_DIRECTONLY + ** virtual tables + ** * Only allow DELETE, INSERT, or UPDATE of non-SQLITE_VTAB_INNOCUOUS + ** virtual tables if PRAGMA trusted_schema=ON. + */ + if( pParse->pToplevel!=0 + && pTab->u.vtab.p->eVtabRisk > + ((pParse->db->flags & SQLITE_TrustedSchema)!=0) + ){ + sqlite3ErrorMsg(pParse, "unsafe use of virtual table \"%s\"", + pTab->zName); + } + return 0; +} static int tabIsReadOnly(Parse *pParse, Table *pTab){ sqlite3 *db; if( IsVirtual(pTab) ){ - return sqlite3GetVTable(pParse->db, pTab)->pMod->pModule->xUpdate==0; + return vtabIsReadOnly(pParse, pTab); } if( (pTab->tabFlags & (TF_Readonly|TF_Shadow))==0 ) return 0; db = pParse->db; @@ -120212,9 +123069,11 @@ static int tabIsReadOnly(Parse *pParse, Table *pTab){ } /* -** Check to make sure the given table is writable. If it is not -** writable, generate an error message and return 1. If it is -** writable return 0; +** Check to make sure the given table is writable. +** +** If pTab is not writable -> generate an error message and return 1. +** If pTab is writable but other errors have occurred -> return 1. +** If pTab is writable and no prior errors -> return 0; */ SQLITE_PRIVATE int sqlite3IsReadOnly(Parse *pParse, Table *pTab, int viewOk){ if( tabIsReadOnly(pParse, pTab) ){ @@ -120256,8 +123115,8 @@ SQLITE_PRIVATE void sqlite3MaterializeView( assert( pFrom->nSrc==1 ); pFrom->a[0].zName = sqlite3DbStrDup(db, pView->zName); pFrom->a[0].zDatabase = sqlite3DbStrDup(db, db->aDb[iDb].zDbSName); - assert( pFrom->a[0].pOn==0 ); - assert( pFrom->a[0].pUsing==0 ); + assert( pFrom->a[0].fg.isUsing==0 ); + assert( pFrom->a[0].u3.pOn==0 ); } pSel = sqlite3SelectNew(pParse, 0, pFrom, pWhere, 0, 0, pOrderBy, SF_IncludeHidden, pLimit); @@ -120428,7 +123287,6 @@ SQLITE_PRIVATE void sqlite3DeleteFrom( assert( db->mallocFailed==0 ); assert( pTabList->nSrc==1 ); - /* Locate the table which we want to delete. This table has to be ** put in an SrcList structure because some of the subroutines we ** will be calling are designed to work with multiple tables and expect @@ -120453,6 +123311,14 @@ SQLITE_PRIVATE void sqlite3DeleteFrom( # define isView 0 #endif +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x10000 ){ + sqlite3TreeViewLine(0, "In sqlite3Delete() at %s:%d", __FILE__, __LINE__); + sqlite3TreeViewDelete(pParse->pWith, pTabList, pWhere, + pOrderBy, pLimit, pTrigger); + } +#endif + #ifdef SQLITE_ENABLE_UPDATE_DELETE_LIMIT if( !isView ){ pWhere = sqlite3LimitWhere( @@ -120568,9 +123434,10 @@ SQLITE_PRIVATE void sqlite3DeleteFrom( } for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){ assert( pIdx->pSchema==pTab->pSchema ); - sqlite3VdbeAddOp2(v, OP_Clear, pIdx->tnum, iDb); if( IsPrimaryKeyIndex(pIdx) && !HasRowid(pTab) ){ - sqlite3VdbeChangeP3(v, -1, memCnt ? memCnt : -1); + sqlite3VdbeAddOp3(v, OP_Clear, pIdx->tnum, iDb, memCnt ? memCnt : -1); + }else{ + sqlite3VdbeAddOp2(v, OP_Clear, pIdx->tnum, iDb); } } }else @@ -120770,7 +123637,7 @@ delete_from_cleanup: sqlite3ExprListDelete(db, pOrderBy); sqlite3ExprDelete(db, pLimit); #endif - sqlite3DbFree(db, aToOpen); + if( aToOpen ) sqlite3DbNNFreeNN(db, aToOpen); return; } /* Make sure "isView" and other macros defined above are undefined. Otherwise @@ -121853,7 +124720,7 @@ static int patternCompare( ** c but in the other case and search the input string for either ** c or cx. */ - if( c<=0x80 ){ + if( c<0x80 ){ char zStop[3]; int bMatch; if( noCase ){ @@ -121936,7 +124803,13 @@ static int patternCompare( ** non-zero if there is no match. */ SQLITE_API int sqlite3_strglob(const char *zGlobPattern, const char *zString){ - return patternCompare((u8*)zGlobPattern, (u8*)zString, &globInfo, '['); + if( zString==0 ){ + return zGlobPattern!=0; + }else if( zGlobPattern==0 ){ + return 1; + }else { + return patternCompare((u8*)zGlobPattern, (u8*)zString, &globInfo, '['); + } } /* @@ -121944,7 +124817,13 @@ SQLITE_API int sqlite3_strglob(const char *zGlobPattern, const char *zString){ ** a miss - like strcmp(). */ SQLITE_API int sqlite3_strlike(const char *zPattern, const char *zStr, unsigned int esc){ - return patternCompare((u8*)zPattern, (u8*)zStr, &likeInfoNorm, esc); + if( zStr==0 ){ + return zPattern!=0; + }else if( zPattern==0 ){ + return 1; + }else{ + return patternCompare((u8*)zPattern, (u8*)zStr, &likeInfoNorm, esc); + } } /* @@ -123211,11 +126090,11 @@ static void logFunc( switch( SQLITE_PTR_TO_INT(sqlite3_user_data(context)) ){ case 1: /* Convert from natural logarithm to log base 10 */ - ans *= 1.0/M_LN10; + ans /= M_LN10; break; case 2: /* Convert from natural logarithm to log base 2 */ - ans *= 1.0/M_LN2; + ans /= M_LN2; break; default: break; @@ -123354,8 +126233,7 @@ SQLITE_PRIVATE void sqlite3RegisterBuiltinFunctions(void){ INLINE_FUNC(likelihood, 2, INLINEFUNC_unlikely, SQLITE_FUNC_UNLIKELY), INLINE_FUNC(likely, 1, INLINEFUNC_unlikely, SQLITE_FUNC_UNLIKELY), #ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC - {1, SQLITE_FUNC_BUILTIN|SQLITE_UTF8|SQLITE_FUNC_OFFSET|SQLITE_FUNC_TYPEOF, - 0, 0, noopFunc, 0, 0, 0, "sqlite_offset", {0} }, + INLINE_FUNC(sqlite_offset, 1, INLINEFUNC_sqlite_offset, 0 ), #endif FUNCTION(ltrim, 1, 1, 0, trimFunc ), FUNCTION(ltrim, 2, 1, 0, trimFunc ), @@ -123890,7 +126768,6 @@ static void fkLookupParent( }else{ int nCol = pFKey->nCol; int regTemp = sqlite3GetTempRange(pParse, nCol); - int regRec = sqlite3GetTempReg(pParse); sqlite3VdbeAddOp3(v, OP_OpenRead, iCur, pIdx->tnum, iDb); sqlite3VdbeSetP4KeyInfo(pParse, pIdx); @@ -123930,11 +126807,10 @@ static void fkLookupParent( sqlite3VdbeGoto(v, iOk); } - sqlite3VdbeAddOp4(v, OP_MakeRecord, regTemp, nCol, regRec, + sqlite3VdbeAddOp4(v, OP_Affinity, regTemp, nCol, 0, sqlite3IndexAffinityStr(pParse->db,pIdx), nCol); - sqlite3VdbeAddOp4Int(v, OP_Found, iCur, iOk, regRec, 0); VdbeCoverage(v); - - sqlite3ReleaseTempReg(pParse, regRec); + sqlite3VdbeAddOp4Int(v, OP_Found, iCur, iOk, regTemp, nCol); + VdbeCoverage(v); sqlite3ReleaseTempRange(pParse, regTemp, nCol); } } @@ -124036,14 +126912,10 @@ static Expr *exprTableColumn( ** Operation | FK type | Action taken ** -------------------------------------------------------------------------- ** DELETE immediate Increment the "immediate constraint counter". -** Or, if the ON (UPDATE|DELETE) action is RESTRICT, -** throw a "FOREIGN KEY constraint failed" exception. ** ** INSERT immediate Decrement the "immediate constraint counter". ** ** DELETE deferred Increment the "deferred constraint counter". -** Or, if the ON (UPDATE|DELETE) action is RESTRICT, -** throw a "FOREIGN KEY constraint failed" exception. ** ** INSERT deferred Decrement the "deferred constraint counter". ** @@ -124691,9 +127563,9 @@ SQLITE_PRIVATE int sqlite3FkRequired( ** ** It returns a pointer to a Trigger structure containing a trigger ** equivalent to the ON UPDATE or ON DELETE action specified by pFKey. -** If the action is "NO ACTION" or "RESTRICT", then a NULL pointer is -** returned (these actions require no special handling by the triggers -** sub-system, code for them is created by fkScanChildren()). +** If the action is "NO ACTION" then a NULL pointer is returned (these actions +** require no special handling by the triggers sub-system, code for them is +** created by fkScanChildren()). ** ** For example, if pFKey is the foreign key and pTab is table "p" in ** the following schema: @@ -124822,18 +127694,23 @@ static Trigger *fkActionTrigger( nFrom = sqlite3Strlen30(zFrom); if( action==OE_Restrict ){ + int iDb = sqlite3SchemaToIndex(db, pTab->pSchema); Token tFrom; + Token tDb; Expr *pRaise; tFrom.z = zFrom; tFrom.n = nFrom; + tDb.z = db->aDb[iDb].zDbSName; + tDb.n = sqlite3Strlen30(tDb.z); + pRaise = sqlite3Expr(db, TK_RAISE, "FOREIGN KEY constraint failed"); if( pRaise ){ pRaise->affExpr = OE_Abort; } pSelect = sqlite3SelectNew(pParse, sqlite3ExprListAppend(pParse, 0, pRaise), - sqlite3SrcListAppend(pParse, 0, &tFrom, 0), + sqlite3SrcListAppend(pParse, 0, &tDb, &tFrom), pWhere, 0, 0, 0, 0, 0 ); @@ -124940,11 +127817,12 @@ SQLITE_PRIVATE void sqlite3FkDelete(sqlite3 *db, Table *pTab){ FKey *pNext; /* Copy of pFKey->pNextFrom */ assert( IsOrdinaryTable(pTab) ); + assert( db!=0 ); for(pFKey=pTab->u.tab.pFKey; pFKey; pFKey=pNext){ assert( db==0 || sqlite3SchemaMutexHeld(db, 0, pTab->pSchema) ); /* Remove the FK from the fkeyHash hash table. */ - if( !db || db->pnBytesFreed==0 ){ + if( db->pnBytesFreed==0 ){ if( pFKey->pPrevTo ){ pFKey->pPrevTo->pNextTo = pFKey->pNextTo; }else{ @@ -125074,6 +127952,7 @@ SQLITE_PRIVATE const char *sqlite3IndexAffinityStr(sqlite3 *db, Index *pIdx){ aff = SQLITE_AFF_INTEGER; }else{ assert( x==XN_EXPR ); + assert( pIdx->bHasExpr ); assert( pIdx->aColExpr!=0 ); aff = sqlite3ExprAffinity(pIdx->aColExpr->a[n].pExpr); } @@ -125087,6 +127966,28 @@ SQLITE_PRIVATE const char *sqlite3IndexAffinityStr(sqlite3 *db, Index *pIdx){ return pIdx->zColAff; } +/* +** Compute an affinity string for a table. Space is obtained +** from sqlite3DbMalloc(). The caller is responsible for freeing +** the space when done. +*/ +SQLITE_PRIVATE char *sqlite3TableAffinityStr(sqlite3 *db, const Table *pTab){ + char *zColAff; + zColAff = (char *)sqlite3DbMallocRaw(db, pTab->nCol+1); + if( zColAff ){ + int i, j; + for(i=j=0; inCol; i++){ + if( (pTab->aCol[i].colFlags & COLFLAG_VIRTUAL)==0 ){ + zColAff[j++] = pTab->aCol[i].affinity; + } + } + do{ + zColAff[j--] = 0; + }while( j>=0 && zColAff[j]<=SQLITE_AFF_BLOB ); + } + return zColAff; +} + /* ** Make changes to the evolving bytecode to do affinity transformations ** of values that are about to be gathered into a row for table pTab. @@ -125128,7 +128029,7 @@ SQLITE_PRIVATE const char *sqlite3IndexAffinityStr(sqlite3 *db, Index *pIdx){ ** Apply the type checking to that array of registers. */ SQLITE_PRIVATE void sqlite3TableAffinity(Vdbe *v, Table *pTab, int iReg){ - int i, j; + int i; char *zColAff; if( pTab->tabFlags & TF_Strict ){ if( iReg==0 ){ @@ -125137,7 +128038,7 @@ SQLITE_PRIVATE void sqlite3TableAffinity(Vdbe *v, Table *pTab, int iReg){ ** OP_MakeRecord is found */ VdbeOp *pPrev; sqlite3VdbeAppendP4(v, pTab, P4_TABLE); - pPrev = sqlite3VdbeGetOp(v, -1); + pPrev = sqlite3VdbeGetLastOp(v); assert( pPrev!=0 ); assert( pPrev->opcode==OP_MakeRecord || sqlite3VdbeDb(v)->mallocFailed ); pPrev->opcode = OP_TypeCheck; @@ -125151,22 +128052,11 @@ SQLITE_PRIVATE void sqlite3TableAffinity(Vdbe *v, Table *pTab, int iReg){ } zColAff = pTab->zColAff; if( zColAff==0 ){ - sqlite3 *db = sqlite3VdbeDb(v); - zColAff = (char *)sqlite3DbMallocRaw(0, pTab->nCol+1); + zColAff = sqlite3TableAffinityStr(0, pTab); if( !zColAff ){ - sqlite3OomFault(db); + sqlite3OomFault(sqlite3VdbeDb(v)); return; } - - for(i=j=0; inCol; i++){ - assert( pTab->aCol[i].affinity!=0 || sqlite3VdbeParser(v)->nErr>0 ); - if( (pTab->aCol[i].colFlags & COLFLAG_VIRTUAL)==0 ){ - zColAff[j++] = pTab->aCol[i].affinity; - } - } - do{ - zColAff[j--] = 0; - }while( j>=0 && zColAff[j]<=SQLITE_AFF_BLOB ); pTab->zColAff = zColAff; } assert( zColAff!=0 ); @@ -125175,7 +128065,7 @@ SQLITE_PRIVATE void sqlite3TableAffinity(Vdbe *v, Table *pTab, int iReg){ if( iReg ){ sqlite3VdbeAddOp4(v, OP_Affinity, iReg, i, 0, zColAff, i); }else{ - assert( sqlite3VdbeGetOp(v, -1)->opcode==OP_MakeRecord + assert( sqlite3VdbeGetLastOp(v)->opcode==OP_MakeRecord || sqlite3VdbeDb(v)->mallocFailed ); sqlite3VdbeChangeP4(v, -1, zColAff, i); } @@ -125261,7 +128151,7 @@ SQLITE_PRIVATE void sqlite3ComputeGeneratedColumns( */ sqlite3TableAffinity(pParse->pVdbe, pTab, iRegStore); if( (pTab->tabFlags & TF_HasStored)!=0 ){ - pOp = sqlite3VdbeGetOp(pParse->pVdbe,-1); + pOp = sqlite3VdbeGetLastOp(pParse->pVdbe); if( pOp->opcode==OP_Affinity ){ /* Change the OP_Affinity argument to '@' (NONE) for all stored ** columns. '@' is the no-op affinity and those columns have not @@ -125743,6 +128633,14 @@ SQLITE_PRIVATE void sqlite3Insert( #endif assert( (pTrigger && tmask) || (pTrigger==0 && tmask==0) ); +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x10000 ){ + sqlite3TreeViewLine(0, "In sqlite3Insert() at %s:%d", __FILE__, __LINE__); + sqlite3TreeViewInsert(pParse->pWith, pTabList, pColumn, pSelect, pList, + onError, pUpsert, pTrigger); + } +#endif + /* If pTab is really a view, make sure it has been initialized. ** ViewGetColumnNames() is a no-op if pTab is not a view. */ @@ -125821,13 +128719,15 @@ SQLITE_PRIVATE void sqlite3Insert( */ bIdListInOrder = (pTab->tabFlags & (TF_OOOHidden|TF_HasStored))==0; if( pColumn ){ + assert( pColumn->eU4!=EU4_EXPR ); + pColumn->eU4 = EU4_IDX; for(i=0; inId; i++){ - pColumn->a[i].idx = -1; + pColumn->a[i].u4.idx = -1; } for(i=0; inId; i++){ for(j=0; jnCol; j++){ if( sqlite3StrICmp(pColumn->a[i].zName, pTab->aCol[j].zCnName)==0 ){ - pColumn->a[i].idx = j; + pColumn->a[i].u4.idx = j; if( i!=j ) bIdListInOrder = 0; if( j==pTab->iPKey ){ ipkColumn = i; assert( !withoutRowid ); @@ -126129,7 +129029,8 @@ SQLITE_PRIVATE void sqlite3Insert( } } if( pColumn ){ - for(j=0; jnId && pColumn->a[j].idx!=i; j++){} + assert( pColumn->eU4==EU4_IDX ); + for(j=0; jnId && pColumn->a[j].u4.idx!=i; j++){} if( j>=pColumn->nId ){ /* A column not named in the insert column list gets its ** default value */ @@ -126156,7 +129057,12 @@ SQLITE_PRIVATE void sqlite3Insert( sqlite3VdbeAddOp2(v, OP_SCopy, regFromSelect+k, iRegStore); } }else{ - sqlite3ExprCode(pParse, pList->a[k].pExpr, iRegStore); + Expr *pX = pList->a[k].pExpr; + int y = sqlite3ExprCodeTarget(pParse, pX, iRegStore); + if( y!=iRegStore ){ + sqlite3VdbeAddOp2(v, + ExprHasProperty(pX, EP_Subquery) ? OP_Copy : OP_SCopy, y, iRegStore); + } } } @@ -126293,7 +129199,9 @@ SQLITE_PRIVATE void sqlite3Insert( sqlite3GenerateConstraintChecks(pParse, pTab, aRegIdx, iDataCur, iIdxCur, regIns, 0, ipkColumn>=0, onError, endOfLoop, &isReplace, 0, pUpsert ); - sqlite3FkCheck(pParse, pTab, 0, regIns, 0, 0); + if( db->flags & SQLITE_ForeignKeys ){ + sqlite3FkCheck(pParse, pTab, 0, regIns, 0, 0); + } /* Set the OPFLAG_USESEEKRESULT flag if either (a) there are no REPLACE ** constraints or (b) there are no triggers and this table is not a @@ -126377,7 +129285,7 @@ insert_cleanup: sqlite3UpsertDelete(db, pUpsert); sqlite3SelectDelete(db, pSelect); sqlite3IdListDelete(db, pColumn); - sqlite3DbFree(db, aRegIdx); + if( aRegIdx ) sqlite3DbNNFreeNN(db, aRegIdx); } /* Make sure "isView" and other macros defined above are undefined. Otherwise @@ -127260,7 +130168,7 @@ SQLITE_PRIVATE void sqlite3GenerateConstraintChecks( if( isUpdate ){ /* If currently processing the PRIMARY KEY of a WITHOUT ROWID ** table, only conflict if the new PRIMARY KEY values are actually - ** different from the old. + ** different from the old. See TH3 withoutrowid04.test. ** ** For a UNIQUE index, only conflict if the PRIMARY KEY values ** of the matched index row are different from the original PRIMARY @@ -128604,9 +131512,9 @@ struct sqlite3_api_routines { const char *(*filename_journal)(const char*); const char *(*filename_wal)(const char*); /* Version 3.32.0 and later */ - char *(*create_filename)(const char*,const char*,const char*, + const char *(*create_filename)(const char*,const char*,const char*, int,const char**); - void (*free_filename)(char*); + void (*free_filename)(const char*); sqlite3_file *(*database_file_object)(const char*); /* Version 3.34.0 and later */ int (*txn_state)(sqlite3*,const char*); @@ -128624,6 +131532,14 @@ struct sqlite3_api_routines { int (*vtab_in)(sqlite3_index_info*,int,int); int (*vtab_in_first)(sqlite3_value*,sqlite3_value**); int (*vtab_in_next)(sqlite3_value*,sqlite3_value**); + /* Version 3.39.0 and later */ + int (*deserialize)(sqlite3*,const char*,unsigned char*, + sqlite3_int64,sqlite3_int64,unsigned); + unsigned char *(*serialize)(sqlite3*,const char *,sqlite3_int64*, + unsigned int); + const char *(*db_name)(sqlite3*,int); + /* Version 3.40.0 and later */ + int (*value_encoding)(sqlite3_value*); }; /* @@ -128942,6 +131858,14 @@ typedef int (*sqlite3_loadext_entry)( #define sqlite3_vtab_in sqlite3_api->vtab_in #define sqlite3_vtab_in_first sqlite3_api->vtab_in_first #define sqlite3_vtab_in_next sqlite3_api->vtab_in_next +/* Version 3.39.0 and later */ +#ifndef SQLITE_OMIT_DESERIALIZE +#define sqlite3_deserialize sqlite3_api->deserialize +#define sqlite3_serialize sqlite3_api->serialize +#endif +#define sqlite3_db_name sqlite3_api->db_name +/* Version 3.40.0 and later */ +#define sqlite3_value_encoding sqlite3_api->value_encoding #endif /* !defined(SQLITE_CORE) && !defined(SQLITE_OMIT_LOAD_EXTENSION) */ #if !defined(SQLITE_CORE) && !defined(SQLITE_OMIT_LOAD_EXTENSION) @@ -129433,11 +132357,30 @@ static const sqlite3_api_routines sqlite3Apis = { sqlite3_autovacuum_pages, /* Version 3.38.0 and later */ sqlite3_error_offset, +#ifndef SQLITE_OMIT_VIRTUALTABLE sqlite3_vtab_rhs_value, sqlite3_vtab_distinct, sqlite3_vtab_in, sqlite3_vtab_in_first, - sqlite3_vtab_in_next + sqlite3_vtab_in_next, +#else + 0, + 0, + 0, + 0, + 0, +#endif + /* Version 3.39.0 and later */ +#ifndef SQLITE_OMIT_DESERIALIZE + sqlite3_deserialize, + sqlite3_serialize, +#else + 0, + 0, +#endif + sqlite3_db_name, + /* Version 3.40.0 and later */ + sqlite3_value_encoding }; /* True if x is the directory separator character @@ -130108,7 +133051,7 @@ static const PragmaName aPragmaName[] = { #if !defined(SQLITE_OMIT_SCHEMA_PRAGMAS) {/* zName: */ "database_list", /* ePragTyp: */ PragTyp_DATABASE_LIST, - /* ePragFlg: */ PragFlg_NeedSchema|PragFlg_Result0, + /* ePragFlg: */ PragFlg_Result0, /* ColNames: */ 47, 3, /* iArg: */ 0 }, #endif @@ -130796,15 +133739,16 @@ static void pragmaFunclistLine( int isBuiltin, /* True if this is a built-in function */ int showInternFuncs /* True if showing internal functions */ ){ + u32 mask = + SQLITE_DETERMINISTIC | + SQLITE_DIRECTONLY | + SQLITE_SUBTYPE | + SQLITE_INNOCUOUS | + SQLITE_FUNC_INTERNAL + ; + if( showInternFuncs ) mask = 0xffffffff; for(; p; p=p->pNext){ const char *zType; - static const u32 mask = - SQLITE_DETERMINISTIC | - SQLITE_DIRECTONLY | - SQLITE_SUBTYPE | - SQLITE_INNOCUOUS | - SQLITE_FUNC_INTERNAL - ; static const char *azEnc[] = { 0, "utf8", "utf16le", "utf16be" }; assert( SQLITE_FUNC_ENCMASK==0x3 ); @@ -131296,7 +134240,7 @@ SQLITE_PRIVATE void sqlite3Pragma( */ #ifndef SQLITE_OMIT_AUTOVACUUM case PragTyp_INCREMENTAL_VACUUM: { - int iLimit, addr; + int iLimit = 0, addr; if( zRight==0 || !sqlite3GetInt32(zRight, &iLimit) || iLimit<=0 ){ iLimit = 0x7fffffff; } @@ -131453,6 +134397,7 @@ SQLITE_PRIVATE void sqlite3Pragma( ** */ case PragTyp_TEMP_STORE_DIRECTORY: { + sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); if( !zRight ){ returnSingleText(v, sqlite3_temp_directory); }else{ @@ -131462,6 +134407,7 @@ SQLITE_PRIVATE void sqlite3Pragma( rc = sqlite3OsAccess(db->pVfs, zRight, SQLITE_ACCESS_READWRITE, &res); if( rc!=SQLITE_OK || res==0 ){ sqlite3ErrorMsg(pParse, "not a writable directory"); + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); goto pragma_out; } } @@ -131479,6 +134425,7 @@ SQLITE_PRIVATE void sqlite3Pragma( } #endif /* SQLITE_OMIT_WSD */ } + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); break; } @@ -131497,6 +134444,7 @@ SQLITE_PRIVATE void sqlite3Pragma( ** */ case PragTyp_DATA_STORE_DIRECTORY: { + sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); if( !zRight ){ returnSingleText(v, sqlite3_data_directory); }else{ @@ -131506,6 +134454,7 @@ SQLITE_PRIVATE void sqlite3Pragma( rc = sqlite3OsAccess(db->pVfs, zRight, SQLITE_ACCESS_READWRITE, &res); if( rc!=SQLITE_OK || res==0 ){ sqlite3ErrorMsg(pParse, "not a writable directory"); + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); goto pragma_out; } } @@ -131517,6 +134466,7 @@ SQLITE_PRIVATE void sqlite3Pragma( } #endif /* SQLITE_OMIT_WSD */ } + sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_TEMPDIR)); break; } #endif @@ -131984,7 +134934,6 @@ SQLITE_PRIVATE void sqlite3Pragma( HashElem *k; /* Loop counter: Next table in schema */ int x; /* result variable */ int regResult; /* 3 registers to hold a result row */ - int regKey; /* Register to hold key for checking the FK */ int regRow; /* Registers to hold a row from pTab */ int addrTop; /* Top of a loop checking foreign keys */ int addrOk; /* Jump here if the key is OK */ @@ -131992,7 +134941,6 @@ SQLITE_PRIVATE void sqlite3Pragma( regResult = pParse->nMem+1; pParse->nMem += 4; - regKey = ++pParse->nMem; regRow = ++pParse->nMem; k = sqliteHashFirst(&db->aDb[iDb].pSchema->tblHash); while( k ){ @@ -132059,9 +135007,9 @@ SQLITE_PRIVATE void sqlite3Pragma( /* Generate code to query the parent index for a matching parent ** key. If a match is found, jump to addrOk. */ if( pIdx ){ - sqlite3VdbeAddOp4(v, OP_MakeRecord, regRow, pFK->nCol, regKey, + sqlite3VdbeAddOp4(v, OP_Affinity, regRow, pFK->nCol, 0, sqlite3IndexAffinityStr(db,pIdx), pFK->nCol); - sqlite3VdbeAddOp4Int(v, OP_Found, i, addrOk, regKey, 0); + sqlite3VdbeAddOp4Int(v, OP_Found, i, addrOk, regRow, pFK->nCol); VdbeCoverage(v); }else if( pParent ){ int jmp = sqlite3VdbeCurrentAddr(v)+2; @@ -132232,15 +135180,24 @@ SQLITE_PRIVATE void sqlite3Pragma( for(x=sqliteHashFirst(pTbls); x; x=sqliteHashNext(x)){ Table *pTab = sqliteHashData(x); Index *pIdx, *pPk; - Index *pPrior = 0; + Index *pPrior = 0; /* Previous index */ int loopTop; int iDataCur, iIdxCur; int r1 = -1; - int bStrict; + int bStrict; /* True for a STRICT table */ + int r2; /* Previous key for WITHOUT ROWID tables */ + int mxCol; /* Maximum non-virtual column number */ if( !IsOrdinaryTable(pTab) ) continue; if( pObjTab && pObjTab!=pTab ) continue; - pPk = HasRowid(pTab) ? 0 : sqlite3PrimaryKeyIndex(pTab); + if( isQuick || HasRowid(pTab) ){ + pPk = 0; + r2 = 0; + }else{ + pPk = sqlite3PrimaryKeyIndex(pTab); + r2 = sqlite3GetTempRange(pParse, pPk->nKeyCol); + sqlite3VdbeAddOp3(v, OP_Null, 1, r2, r2+pPk->nKeyCol-1); + } sqlite3OpenTableAndIndices(pParse, pTab, OP_OpenRead, 0, 1, 0, &iDataCur, &iIdxCur); /* reg[7] counts the number of entries in the table. @@ -132254,52 +135211,157 @@ SQLITE_PRIVATE void sqlite3Pragma( assert( sqlite3NoTempsInRange(pParse,1,7+j) ); sqlite3VdbeAddOp2(v, OP_Rewind, iDataCur, 0); VdbeCoverage(v); loopTop = sqlite3VdbeAddOp2(v, OP_AddImm, 7, 1); + + /* Fetch the right-most column from the table. This will cause + ** the entire record header to be parsed and sanity checked. It + ** will also prepopulate the cursor column cache that is used + ** by the OP_IsType code, so it is a required step. + */ + mxCol = pTab->nCol-1; + while( mxCol>=0 + && ((pTab->aCol[mxCol].colFlags & COLFLAG_VIRTUAL)!=0 + || pTab->iPKey==mxCol) ) mxCol--; + if( mxCol>=0 ){ + sqlite3ExprCodeGetColumnOfTable(v, pTab, iDataCur, mxCol, 3); + sqlite3VdbeTypeofColumn(v, 3); + } + if( !isQuick ){ - /* Sanity check on record header decoding */ - sqlite3VdbeAddOp3(v, OP_Column, iDataCur, pTab->nNVCol-1,3); - sqlite3VdbeChangeP5(v, OPFLAG_TYPEOFARG); - VdbeComment((v, "(right-most column)")); + if( pPk ){ + /* Verify WITHOUT ROWID keys are in ascending order */ + int a1; + char *zErr; + a1 = sqlite3VdbeAddOp4Int(v, OP_IdxGT, iDataCur, 0,r2,pPk->nKeyCol); + VdbeCoverage(v); + sqlite3VdbeAddOp1(v, OP_IsNull, r2); VdbeCoverage(v); + zErr = sqlite3MPrintf(db, + "row not in PRIMARY KEY order for %s", + pTab->zName); + sqlite3VdbeAddOp4(v, OP_String8, 0, 3, 0, zErr, P4_DYNAMIC); + integrityCheckResultRow(v); + sqlite3VdbeJumpHere(v, a1); + sqlite3VdbeJumpHere(v, a1+1); + for(j=0; jnKeyCol; j++){ + sqlite3ExprCodeLoadIndexColumn(pParse, pPk, iDataCur, j, r2+j); + } + } } - /* Verify that all NOT NULL columns really are NOT NULL. At the - ** same time verify the type of the content of STRICT tables */ + /* Verify datatypes for all columns: + ** + ** (1) NOT NULL columns may not contain a NULL + ** (2) Datatype must be exact for non-ANY columns in STRICT tables + ** (3) Datatype for TEXT columns in non-STRICT tables must be + ** NULL, TEXT, or BLOB. + ** (4) Datatype for numeric columns in non-STRICT tables must not + ** be a TEXT value that can be losslessly converted to numeric. + */ bStrict = (pTab->tabFlags & TF_Strict)!=0; for(j=0; jnCol; j++){ char *zErr; - Column *pCol = pTab->aCol + j; - int doError, jmp2; + Column *pCol = pTab->aCol + j; /* The column to be checked */ + int labelError; /* Jump here to report an error */ + int labelOk; /* Jump here if all looks ok */ + int p1, p3, p4; /* Operands to the OP_IsType opcode */ + int doTypeCheck; /* Check datatypes (besides NOT NULL) */ + if( j==pTab->iPKey ) continue; - if( pCol->notNull==0 && !bStrict ) continue; - doError = bStrict ? sqlite3VdbeMakeLabel(pParse) : 0; - sqlite3ExprCodeGetColumnOfTable(v, pTab, iDataCur, j, 3); - if( sqlite3VdbeGetOp(v,-1)->opcode==OP_Column ){ - sqlite3VdbeChangeP5(v, OPFLAG_TYPEOFARG); + if( bStrict ){ + doTypeCheck = pCol->eCType>COLTYPE_ANY; + }else{ + doTypeCheck = pCol->affinity>SQLITE_AFF_BLOB; } + if( pCol->notNull==0 && !doTypeCheck ) continue; + + /* Compute the operands that will be needed for OP_IsType */ + p4 = SQLITE_NULL; + if( pCol->colFlags & COLFLAG_VIRTUAL ){ + sqlite3ExprCodeGetColumnOfTable(v, pTab, iDataCur, j, 3); + p1 = -1; + p3 = 3; + }else{ + if( pCol->iDflt ){ + sqlite3_value *pDfltValue = 0; + sqlite3ValueFromExpr(db, sqlite3ColumnExpr(pTab,pCol), ENC(db), + pCol->affinity, &pDfltValue); + if( pDfltValue ){ + p4 = sqlite3_value_type(pDfltValue); + sqlite3ValueFree(pDfltValue); + } + } + p1 = iDataCur; + if( !HasRowid(pTab) ){ + testcase( j!=sqlite3TableColumnToStorage(pTab, j) ); + p3 = sqlite3TableColumnToIndex(sqlite3PrimaryKeyIndex(pTab), j); + }else{ + p3 = sqlite3TableColumnToStorage(pTab,j); + testcase( p3!=j); + } + } + + labelError = sqlite3VdbeMakeLabel(pParse); + labelOk = sqlite3VdbeMakeLabel(pParse); if( pCol->notNull ){ - jmp2 = sqlite3VdbeAddOp1(v, OP_NotNull, 3); VdbeCoverage(v); + /* (1) NOT NULL columns may not contain a NULL */ + int jmp2 = sqlite3VdbeAddOp4Int(v, OP_IsType, p1, labelOk, p3, p4); + sqlite3VdbeChangeP5(v, 0x0f); + VdbeCoverage(v); zErr = sqlite3MPrintf(db, "NULL value in %s.%s", pTab->zName, pCol->zCnName); sqlite3VdbeAddOp4(v, OP_String8, 0, 3, 0, zErr, P4_DYNAMIC); - if( bStrict && pCol->eCType!=COLTYPE_ANY ){ - sqlite3VdbeGoto(v, doError); + if( doTypeCheck ){ + sqlite3VdbeGoto(v, labelError); + sqlite3VdbeJumpHere(v, jmp2); }else{ - integrityCheckResultRow(v); + /* VDBE byte code will fall thru */ } - sqlite3VdbeJumpHere(v, jmp2); } - if( (pTab->tabFlags & TF_Strict)!=0 - && pCol->eCType!=COLTYPE_ANY - ){ - jmp2 = sqlite3VdbeAddOp3(v, OP_IsNullOrType, 3, 0, - sqlite3StdTypeMap[pCol->eCType-1]); + if( bStrict && doTypeCheck ){ + /* (2) Datatype must be exact for non-ANY columns in STRICT tables*/ + static unsigned char aStdTypeMask[] = { + 0x1f, /* ANY */ + 0x18, /* BLOB */ + 0x11, /* INT */ + 0x11, /* INTEGER */ + 0x13, /* REAL */ + 0x14 /* TEXT */ + }; + sqlite3VdbeAddOp4Int(v, OP_IsType, p1, labelOk, p3, p4); + assert( pCol->eCType>=1 && pCol->eCType<=sizeof(aStdTypeMask) ); + sqlite3VdbeChangeP5(v, aStdTypeMask[pCol->eCType-1]); VdbeCoverage(v); zErr = sqlite3MPrintf(db, "non-%s value in %s.%s", sqlite3StdType[pCol->eCType-1], pTab->zName, pTab->aCol[j].zCnName); sqlite3VdbeAddOp4(v, OP_String8, 0, 3, 0, zErr, P4_DYNAMIC); - sqlite3VdbeResolveLabel(v, doError); - integrityCheckResultRow(v); - sqlite3VdbeJumpHere(v, jmp2); + }else if( !bStrict && pCol->affinity==SQLITE_AFF_TEXT ){ + /* (3) Datatype for TEXT columns in non-STRICT tables must be + ** NULL, TEXT, or BLOB. */ + sqlite3VdbeAddOp4Int(v, OP_IsType, p1, labelOk, p3, p4); + sqlite3VdbeChangeP5(v, 0x1c); /* NULL, TEXT, or BLOB */ + VdbeCoverage(v); + zErr = sqlite3MPrintf(db, "NUMERIC value in %s.%s", + pTab->zName, pTab->aCol[j].zCnName); + sqlite3VdbeAddOp4(v, OP_String8, 0, 3, 0, zErr, P4_DYNAMIC); + }else if( !bStrict && pCol->affinity>=SQLITE_AFF_NUMERIC ){ + /* (4) Datatype for numeric columns in non-STRICT tables must not + ** be a TEXT value that can be converted to numeric. */ + sqlite3VdbeAddOp4Int(v, OP_IsType, p1, labelOk, p3, p4); + sqlite3VdbeChangeP5(v, 0x1b); /* NULL, INT, FLOAT, or BLOB */ + VdbeCoverage(v); + if( p1>=0 ){ + sqlite3ExprCodeGetColumnOfTable(v, pTab, iDataCur, j, 3); + } + sqlite3VdbeAddOp4(v, OP_Affinity, 3, 1, 0, "C", P4_STATIC); + sqlite3VdbeAddOp4Int(v, OP_IsType, -1, labelOk, 3, p4); + sqlite3VdbeChangeP5(v, 0x1c); /* NULL, TEXT, or BLOB */ + VdbeCoverage(v); + zErr = sqlite3MPrintf(db, "TEXT value in %s.%s", + pTab->zName, pTab->aCol[j].zCnName); + sqlite3VdbeAddOp4(v, OP_String8, 0, 3, 0, zErr, P4_DYNAMIC); } + sqlite3VdbeResolveLabel(v, labelError); + integrityCheckResultRow(v); + sqlite3VdbeResolveLabel(v, labelOk); } /* Verify CHECK constraints */ if( pTab->pCheck && (db->flags & SQLITE_IgnoreChecks)==0 ){ @@ -132387,6 +135449,9 @@ SQLITE_PRIVATE void sqlite3Pragma( integrityCheckResultRow(v); sqlite3VdbeJumpHere(v, addr); } + if( pPk ){ + sqlite3ReleaseTempRange(pParse, r2, pPk->nKeyCol); + } } } } @@ -132537,6 +135602,11 @@ SQLITE_PRIVATE void sqlite3Pragma( aOp[1].p2 = iCookie; aOp[1].p3 = sqlite3Atoi(zRight); aOp[1].p5 = 1; + if( iCookie==BTREE_SCHEMA_VERSION && (db->flags & SQLITE_Defensive)!=0 ){ + /* Do not allow the use of PRAGMA schema_version=VALUE in defensive + ** mode. Change the OP_SetCookie opcode into a no-op. */ + aOp[1].opcode = OP_Noop; + } }else{ /* Read the specified cookie value */ static const VdbeOpList readCookie[] = { @@ -133785,15 +136855,15 @@ SQLITE_PRIVATE void sqlite3ParseObjectReset(Parse *pParse){ assert( db->pParse==pParse ); assert( pParse->nested==0 ); #ifndef SQLITE_OMIT_SHARED_CACHE - sqlite3DbFree(db, pParse->aTableLock); + if( pParse->aTableLock ) sqlite3DbNNFreeNN(db, pParse->aTableLock); #endif while( pParse->pCleanup ){ ParseCleanup *pCleanup = pParse->pCleanup; pParse->pCleanup = pCleanup->pNext; pCleanup->xCleanup(db, pCleanup->pPtr); - sqlite3DbFreeNN(db, pCleanup); + sqlite3DbNNFreeNN(db, pCleanup); } - sqlite3DbFree(db, pParse->aLabel); + if( pParse->aLabel ) sqlite3DbNNFreeNN(db, pParse->aLabel); if( pParse->pConstExpr ){ sqlite3ExprListDelete(db, pParse->pConstExpr); } @@ -133916,7 +136986,7 @@ static int sqlite3Prepare( sParse.disableLookaside++; DisableLookaside; } - sParse.disableVtab = (prepFlags & SQLITE_PREPARE_NO_VTAB)!=0; + sParse.prepFlags = prepFlags & 0xff; /* Check to verify that it is possible to get a read lock on all ** database schemas. The inability to get a read lock indicates that @@ -133957,7 +137027,9 @@ static int sqlite3Prepare( } } - sqlite3VtabUnlockList(db); +#ifndef SQLITE_OMIT_VIRTUALTABLE + if( db->pDisconnect ) sqlite3VtabUnlockList(db); +#endif if( nBytes>=0 && (nBytes==0 || zSql[nBytes-1]!=0) ){ char *zSqlCopy; @@ -134297,7 +137369,7 @@ SQLITE_API int sqlite3_prepare16_v3( */ typedef struct DistinctCtx DistinctCtx; struct DistinctCtx { - u8 isTnct; /* True if the DISTINCT keyword is present */ + u8 isTnct; /* 0: Not distinct. 1: DISTICT 2: DISTINCT and ORDER BY */ u8 eTnctType; /* One of the WHERE_DISTINCT_* operators */ int tabTnct; /* Ephemeral table used for DISTINCT processing */ int addrTnct; /* Address of OP_OpenEphemeral opcode for tabTnct */ @@ -134352,6 +137424,7 @@ struct SortCtx { ** If bFree==0, Leave the first Select object unfreed */ static void clearSelect(sqlite3 *db, Select *p, int bFree){ + assert( db!=0 ); while( p ){ Select *pPrior = p->pPrior; sqlite3ExprListDelete(db, p->pEList); @@ -134371,7 +137444,7 @@ static void clearSelect(sqlite3 *db, Select *p, int bFree){ sqlite3WindowUnlinkFromSelect(p->pWin); } #endif - if( bFree ) sqlite3DbFreeNN(db, p); + if( bFree ) sqlite3DbNNFreeNN(db, p); p = pPrior; bFree = 1; } @@ -134480,6 +137553,52 @@ static Select *findRightmost(Select *p){ ** ** If an illegal or unsupported join type is seen, then still return ** a join type, but put an error in the pParse structure. +** +** These are the valid join types: +** +** +** pA pB pC Return Value +** ------- ----- ----- ------------ +** CROSS - - JT_CROSS +** INNER - - JT_INNER +** LEFT - - JT_LEFT|JT_OUTER +** LEFT OUTER - JT_LEFT|JT_OUTER +** RIGHT - - JT_RIGHT|JT_OUTER +** RIGHT OUTER - JT_RIGHT|JT_OUTER +** FULL - - JT_LEFT|JT_RIGHT|JT_OUTER +** FULL OUTER - JT_LEFT|JT_RIGHT|JT_OUTER +** NATURAL INNER - JT_NATURAL|JT_INNER +** NATURAL LEFT - JT_NATURAL|JT_LEFT|JT_OUTER +** NATURAL LEFT OUTER JT_NATURAL|JT_LEFT|JT_OUTER +** NATURAL RIGHT - JT_NATURAL|JT_RIGHT|JT_OUTER +** NATURAL RIGHT OUTER JT_NATURAL|JT_RIGHT|JT_OUTER +** NATURAL FULL - JT_NATURAL|JT_LEFT|JT_RIGHT +** NATURAL FULL OUTER JT_NATRUAL|JT_LEFT|JT_RIGHT +** +** To preserve historical compatibly, SQLite also accepts a variety +** of other non-standard and in many cases non-sensical join types. +** This routine makes as much sense at it can from the nonsense join +** type and returns a result. Examples of accepted nonsense join types +** include but are not limited to: +** +** INNER CROSS JOIN -> same as JOIN +** NATURAL CROSS JOIN -> same as NATURAL JOIN +** OUTER LEFT JOIN -> same as LEFT JOIN +** LEFT NATURAL JOIN -> same as NATURAL LEFT JOIN +** LEFT RIGHT JOIN -> same as FULL JOIN +** RIGHT OUTER FULL JOIN -> same as FULL JOIN +** CROSS CROSS CROSS JOIN -> same as JOIN +** +** The only restrictions on the join type name are: +** +** * "INNER" cannot appear together with "OUTER", "LEFT", "RIGHT", +** or "FULL". +** +** * "CROSS" cannot appear together with "OUTER", "LEFT", "RIGHT, +** or "FULL". +** +** * If "OUTER" is present then there must also be one of +** "LEFT", "RIGHT", or "FULL" */ SQLITE_PRIVATE int sqlite3JoinType(Parse *pParse, Token *pA, Token *pB, Token *pC){ int jointype = 0; @@ -134492,13 +137611,13 @@ SQLITE_PRIVATE int sqlite3JoinType(Parse *pParse, Token *pA, Token *pB, Token *p u8 nChar; /* Length of the keyword in characters */ u8 code; /* Join type mask */ } aKeyword[] = { - /* natural */ { 0, 7, JT_NATURAL }, - /* left */ { 6, 4, JT_LEFT|JT_OUTER }, - /* outer */ { 10, 5, JT_OUTER }, - /* right */ { 14, 5, JT_RIGHT|JT_OUTER }, - /* full */ { 19, 4, JT_LEFT|JT_RIGHT|JT_OUTER }, - /* inner */ { 23, 5, JT_INNER }, - /* cross */ { 28, 5, JT_INNER|JT_CROSS }, + /* (0) natural */ { 0, 7, JT_NATURAL }, + /* (1) left */ { 6, 4, JT_LEFT|JT_OUTER }, + /* (2) outer */ { 10, 5, JT_OUTER }, + /* (3) right */ { 14, 5, JT_RIGHT|JT_OUTER }, + /* (4) full */ { 19, 4, JT_LEFT|JT_RIGHT|JT_OUTER }, + /* (5) inner */ { 23, 5, JT_INNER }, + /* (6) cross */ { 28, 5, JT_INNER|JT_CROSS }, }; int i, j; apAll[0] = pA; @@ -134521,18 +137640,15 @@ SQLITE_PRIVATE int sqlite3JoinType(Parse *pParse, Token *pA, Token *pB, Token *p } if( (jointype & (JT_INNER|JT_OUTER))==(JT_INNER|JT_OUTER) || - (jointype & JT_ERROR)!=0 + (jointype & JT_ERROR)!=0 || + (jointype & (JT_OUTER|JT_LEFT|JT_RIGHT))==JT_OUTER ){ - const char *zSp = " "; - assert( pB!=0 ); - if( pC==0 ){ zSp++; } - sqlite3ErrorMsg(pParse, "unknown or unsupported join type: " - "%T %T%s%T", pA, pB, zSp, pC); - jointype = JT_INNER; - }else if( (jointype & JT_OUTER)!=0 - && (jointype & (JT_LEFT|JT_RIGHT))!=JT_LEFT ){ - sqlite3ErrorMsg(pParse, - "RIGHT and FULL OUTER JOINs are not currently supported"); + const char *zSp1 = " "; + const char *zSp2 = " "; + if( pB==0 ){ zSp1++; } + if( pC==0 ){ zSp2++; } + sqlite3ErrorMsg(pParse, "unknown join type: " + "%T%s%T%s%T", pA, zSp1, pB, zSp2, pC); jointype = JT_INNER; } return jointype; @@ -134553,8 +137669,25 @@ SQLITE_PRIVATE int sqlite3ColumnIndex(Table *pTab, const char *zCol){ } /* -** Search the first N tables in pSrc, from left to right, looking for a -** table that has a column named zCol. +** Mark a subquery result column as having been used. +*/ +SQLITE_PRIVATE void sqlite3SrcItemColumnUsed(SrcItem *pItem, int iCol){ + assert( pItem!=0 ); + assert( (int)pItem->fg.isNestedFrom == IsNestedFrom(pItem->pSelect) ); + if( pItem->fg.isNestedFrom ){ + ExprList *pResults; + assert( pItem->pSelect!=0 ); + pResults = pItem->pSelect->pEList; + assert( pResults!=0 ); + assert( iCol>=0 && iColnExpr ); + pResults->a[iCol].fg.bUsed = 1; + } +} + +/* +** Search the tables iStart..iEnd (inclusive) in pSrc, looking for a +** table that has a column named zCol. The search is left-to-right. +** The first match found is returned. ** ** When found, set *piTab and *piCol to the table index and column index ** of the matching column and return TRUE. @@ -134563,22 +137696,27 @@ SQLITE_PRIVATE int sqlite3ColumnIndex(Table *pTab, const char *zCol){ */ static int tableAndColumnIndex( SrcList *pSrc, /* Array of tables to search */ - int N, /* Number of tables in pSrc->a[] to search */ + int iStart, /* First member of pSrc->a[] to check */ + int iEnd, /* Last member of pSrc->a[] to check */ const char *zCol, /* Name of the column we are looking for */ int *piTab, /* Write index of pSrc->a[] here */ int *piCol, /* Write index of pSrc->a[*piTab].pTab->aCol[] here */ - int bIgnoreHidden /* True to ignore hidden columns */ + int bIgnoreHidden /* Ignore hidden columns */ ){ int i; /* For looping over tables in pSrc */ int iCol; /* Index of column matching zCol */ + assert( iEndnSrc ); + assert( iStart>=0 ); assert( (piTab==0)==(piCol==0) ); /* Both or neither are NULL */ - for(i=0; ia[i].pTab, zCol); if( iCol>=0 && (bIgnoreHidden==0 || IsHiddenColumn(&pSrc->a[i].pTab->aCol[iCol])==0) ){ if( piTab ){ + sqlite3SrcItemColumnUsed(&pSrc->a[i], iCol); *piTab = i; *piCol = iCol; } @@ -134589,66 +137727,19 @@ static int tableAndColumnIndex( } /* -** This function is used to add terms implied by JOIN syntax to the -** WHERE clause expression of a SELECT statement. The new term, which -** is ANDed with the existing WHERE clause, is of the form: -** -** (tab1.col1 = tab2.col2) -** -** where tab1 is the iSrc'th table in SrcList pSrc and tab2 is the -** (iSrc+1)'th. Column col1 is column iColLeft of tab1, and col2 is -** column iColRight of tab2. -*/ -static void addWhereTerm( - Parse *pParse, /* Parsing context */ - SrcList *pSrc, /* List of tables in FROM clause */ - int iLeft, /* Index of first table to join in pSrc */ - int iColLeft, /* Index of column in first table */ - int iRight, /* Index of second table in pSrc */ - int iColRight, /* Index of column in second table */ - int isOuterJoin, /* True if this is an OUTER join */ - Expr **ppWhere /* IN/OUT: The WHERE clause to add to */ -){ - sqlite3 *db = pParse->db; - Expr *pE1; - Expr *pE2; - Expr *pEq; - - assert( iLeftnSrc>iRight ); - assert( pSrc->a[iLeft].pTab ); - assert( pSrc->a[iRight].pTab ); - - pE1 = sqlite3CreateColumnExpr(db, pSrc, iLeft, iColLeft); - pE2 = sqlite3CreateColumnExpr(db, pSrc, iRight, iColRight); - - pEq = sqlite3PExpr(pParse, TK_EQ, pE1, pE2); - assert( pE2!=0 || pEq==0 ); /* Due to db->mallocFailed test - ** in sqlite3DbMallocRawNN() called from - ** sqlite3PExpr(). */ - if( pEq && isOuterJoin ){ - ExprSetProperty(pEq, EP_FromJoin); - assert( !ExprHasProperty(pEq, EP_TokenOnly|EP_Reduced) ); - ExprSetVVAProperty(pEq, EP_NoReduce); - pEq->w.iRightJoinTable = pE2->iTable; - } - *ppWhere = sqlite3ExprAnd(pParse, *ppWhere, pEq); -} - -/* -** Set the EP_FromJoin property on all terms of the given expression. -** And set the Expr.w.iRightJoinTable to iTable for every term in the +** Set the EP_OuterON property on all terms of the given expression. +** And set the Expr.w.iJoin to iTable for every term in the ** expression. ** -** The EP_FromJoin property is used on terms of an expression to tell -** the LEFT OUTER JOIN processing logic that this term is part of the +** The EP_OuterON property is used on terms of an expression to tell +** the OUTER JOIN processing logic that this term is part of the ** join restriction specified in the ON or USING clause and not a part ** of the more general WHERE clause. These terms are moved over to the ** WHERE clause during join processing but we need to remember that they ** originated in the ON or USING clause. ** -** The Expr.w.iRightJoinTable tells the WHERE clause processing that the -** expression depends on table w.iRightJoinTable even if that table is not +** The Expr.w.iJoin tells the WHERE clause processing that the +** expression depends on table w.iJoin even if that table is not ** explicitly mentioned in the expression. That information is needed ** for cases like this: ** @@ -134661,39 +137752,48 @@ static void addWhereTerm( ** after the t1 loop and rows with t1.x!=5 will never appear in ** the output, which is incorrect. */ -SQLITE_PRIVATE void sqlite3SetJoinExpr(Expr *p, int iTable){ +SQLITE_PRIVATE void sqlite3SetJoinExpr(Expr *p, int iTable, u32 joinFlag){ + assert( joinFlag==EP_OuterON || joinFlag==EP_InnerON ); while( p ){ - ExprSetProperty(p, EP_FromJoin); + ExprSetProperty(p, joinFlag); assert( !ExprHasProperty(p, EP_TokenOnly|EP_Reduced) ); ExprSetVVAProperty(p, EP_NoReduce); - p->w.iRightJoinTable = iTable; + p->w.iJoin = iTable; if( p->op==TK_FUNCTION ){ assert( ExprUseXList(p) ); if( p->x.pList ){ int i; for(i=0; ix.pList->nExpr; i++){ - sqlite3SetJoinExpr(p->x.pList->a[i].pExpr, iTable); + sqlite3SetJoinExpr(p->x.pList->a[i].pExpr, iTable, joinFlag); } } } - sqlite3SetJoinExpr(p->pLeft, iTable); + sqlite3SetJoinExpr(p->pLeft, iTable, joinFlag); p = p->pRight; } } -/* Undo the work of sqlite3SetJoinExpr(). In the expression p, convert every -** term that is marked with EP_FromJoin and w.iRightJoinTable==iTable into -** an ordinary term that omits the EP_FromJoin mark. +/* Undo the work of sqlite3SetJoinExpr(). This is used when a LEFT JOIN +** is simplified into an ordinary JOIN, and when an ON expression is +** "pushed down" into the WHERE clause of a subquery. +** +** Convert every term that is marked with EP_OuterON and w.iJoin==iTable into +** an ordinary term that omits the EP_OuterON mark. Or if iTable<0, then +** just clear every EP_OuterON and EP_InnerON mark from the expression tree. ** -** This happens when a LEFT JOIN is simplified into an ordinary JOIN. +** If nullable is true, that means that Expr p might evaluate to NULL even +** if it is a reference to a NOT NULL column. This can happen, for example, +** if the table that p references is on the left side of a RIGHT JOIN. +** If nullable is true, then take care to not remove the EP_CanBeNull bit. +** See forum thread https://sqlite.org/forum/forumpost/b40696f50145d21c */ -static void unsetJoinExpr(Expr *p, int iTable){ +static void unsetJoinExpr(Expr *p, int iTable, int nullable){ while( p ){ - if( ExprHasProperty(p, EP_FromJoin) - && (iTable<0 || p->w.iRightJoinTable==iTable) ){ - ExprClearProperty(p, EP_FromJoin); + if( iTable<0 || (ExprHasProperty(p, EP_OuterON) && p->w.iJoin==iTable) ){ + ExprClearProperty(p, EP_OuterON|EP_InnerON); + if( iTable>=0 ) ExprSetProperty(p, EP_InnerON); } - if( p->op==TK_COLUMN && p->iTable==iTable ){ + if( p->op==TK_COLUMN && p->iTable==iTable && !nullable ){ ExprClearProperty(p, EP_CanBeNull); } if( p->op==TK_FUNCTION ){ @@ -134701,30 +137801,37 @@ static void unsetJoinExpr(Expr *p, int iTable){ if( p->x.pList ){ int i; for(i=0; ix.pList->nExpr; i++){ - unsetJoinExpr(p->x.pList->a[i].pExpr, iTable); + unsetJoinExpr(p->x.pList->a[i].pExpr, iTable, nullable); } } } - unsetJoinExpr(p->pLeft, iTable); + unsetJoinExpr(p->pLeft, iTable, nullable); p = p->pRight; } } /* ** This routine processes the join information for a SELECT statement. -** ON and USING clauses are converted into extra terms of the WHERE clause. -** NATURAL joins also create extra WHERE clause terms. +** +** * A NATURAL join is converted into a USING join. After that, we +** do not need to be concerned with NATURAL joins and we only have +** think about USING joins. +** +** * ON and USING clauses result in extra terms being added to the +** WHERE clause to enforce the specified constraints. The extra +** WHERE clause terms will be tagged with EP_OuterON or +** EP_InnerON so that we know that they originated in ON/USING. ** ** The terms of a FROM clause are contained in the Select.pSrc structure. ** The left most table is the first entry in Select.pSrc. The right-most ** table is the last entry. The join operator is held in the entry to -** the left. Thus entry 0 contains the join operator for the join between +** the right. Thus entry 1 contains the join operator for the join between ** entries 0 and 1. Any ON or USING clauses associated with the join are -** also attached to the left entry. +** also attached to the right entry. ** ** This routine returns the number of errors encountered. */ -static int sqliteProcessJoin(Parse *pParse, Select *p){ +static int sqlite3ProcessJoin(Parse *pParse, Select *p){ SrcList *pSrc; /* All tables in the FROM clause */ int i, j; /* Loop counters */ SrcItem *pLeft; /* Left table being joined */ @@ -134735,49 +137842,41 @@ static int sqliteProcessJoin(Parse *pParse, Select *p){ pRight = &pLeft[1]; for(i=0; inSrc-1; i++, pRight++, pLeft++){ Table *pRightTab = pRight->pTab; - int isOuter; + u32 joinType; if( NEVER(pLeft->pTab==0 || pRightTab==0) ) continue; - isOuter = (pRight->fg.jointype & JT_OUTER)!=0; + joinType = (pRight->fg.jointype & JT_OUTER)!=0 ? EP_OuterON : EP_InnerON; - /* When the NATURAL keyword is present, add WHERE clause terms for - ** every column that the two tables have in common. + /* If this is a NATURAL join, synthesize an approprate USING clause + ** to specify which columns should be joined. */ if( pRight->fg.jointype & JT_NATURAL ){ - if( pRight->pOn || pRight->pUsing ){ + IdList *pUsing = 0; + if( pRight->fg.isUsing || pRight->u3.pOn ){ sqlite3ErrorMsg(pParse, "a NATURAL join may not have " "an ON or USING clause", 0); return 1; } for(j=0; jnCol; j++){ char *zName; /* Name of column in the right table */ - int iLeft; /* Matching left table */ - int iLeftCol; /* Matching column in the left table */ if( IsHiddenColumn(&pRightTab->aCol[j]) ) continue; zName = pRightTab->aCol[j].zCnName; - if( tableAndColumnIndex(pSrc, i+1, zName, &iLeft, &iLeftCol, 1) ){ - addWhereTerm(pParse, pSrc, iLeft, iLeftCol, i+1, j, - isOuter, &p->pWhere); + if( tableAndColumnIndex(pSrc, 0, i, zName, 0, 0, 1) ){ + pUsing = sqlite3IdListAppend(pParse, pUsing, 0); + if( pUsing ){ + assert( pUsing->nId>0 ); + assert( pUsing->a[pUsing->nId-1].zName==0 ); + pUsing->a[pUsing->nId-1].zName = sqlite3DbStrDup(pParse->db, zName); + } } } - } - - /* Disallow both ON and USING clauses in the same join - */ - if( pRight->pOn && pRight->pUsing ){ - sqlite3ErrorMsg(pParse, "cannot have both ON and USING " - "clauses in the same join"); - return 1; - } - - /* Add the ON clause to the end of the WHERE clause, connected by - ** an AND operator. - */ - if( pRight->pOn ){ - if( isOuter ) sqlite3SetJoinExpr(pRight->pOn, pRight->iCursor); - p->pWhere = sqlite3ExprAnd(pParse, p->pWhere, pRight->pOn); - pRight->pOn = 0; + if( pUsing ){ + pRight->fg.isUsing = 1; + pRight->fg.isSynthUsing = 1; + pRight->u3.pUsing = pUsing; + } + if( pParse->nErr ) return 1; } /* Create extra terms on the WHERE clause for each column named @@ -134787,27 +137886,88 @@ static int sqliteProcessJoin(Parse *pParse, Select *p){ ** Report an error if any column mentioned in the USING clause is ** not contained in both tables to be joined. */ - if( pRight->pUsing ){ - IdList *pList = pRight->pUsing; + if( pRight->fg.isUsing ){ + IdList *pList = pRight->u3.pUsing; + sqlite3 *db = pParse->db; + assert( pList!=0 ); for(j=0; jnId; j++){ char *zName; /* Name of the term in the USING clause */ int iLeft; /* Table on the left with matching column name */ int iLeftCol; /* Column number of matching column on the left */ int iRightCol; /* Column number of matching column on the right */ + Expr *pE1; /* Reference to the column on the LEFT of the join */ + Expr *pE2; /* Reference to the column on the RIGHT of the join */ + Expr *pEq; /* Equality constraint. pE1 == pE2 */ zName = pList->a[j].zName; iRightCol = sqlite3ColumnIndex(pRightTab, zName); if( iRightCol<0 - || !tableAndColumnIndex(pSrc, i+1, zName, &iLeft, &iLeftCol, 0) + || tableAndColumnIndex(pSrc, 0, i, zName, &iLeft, &iLeftCol, + pRight->fg.isSynthUsing)==0 ){ sqlite3ErrorMsg(pParse, "cannot join using column %s - column " "not present in both tables", zName); return 1; } - addWhereTerm(pParse, pSrc, iLeft, iLeftCol, i+1, iRightCol, - isOuter, &p->pWhere); + pE1 = sqlite3CreateColumnExpr(db, pSrc, iLeft, iLeftCol); + sqlite3SrcItemColumnUsed(&pSrc->a[iLeft], iLeftCol); + if( (pSrc->a[0].fg.jointype & JT_LTORJ)!=0 ){ + /* This branch runs if the query contains one or more RIGHT or FULL + ** JOINs. If only a single table on the left side of this join + ** contains the zName column, then this branch is a no-op. + ** But if there are two or more tables on the left side + ** of the join, construct a coalesce() function that gathers all + ** such tables. Raise an error if more than one of those references + ** to zName is not also within a prior USING clause. + ** + ** We really ought to raise an error if there are two or more + ** non-USING references to zName on the left of an INNER or LEFT + ** JOIN. But older versions of SQLite do not do that, so we avoid + ** adding a new error so as to not break legacy applications. + */ + ExprList *pFuncArgs = 0; /* Arguments to the coalesce() */ + static const Token tkCoalesce = { "coalesce", 8 }; + while( tableAndColumnIndex(pSrc, iLeft+1, i, zName, &iLeft, &iLeftCol, + pRight->fg.isSynthUsing)!=0 ){ + if( pSrc->a[iLeft].fg.isUsing==0 + || sqlite3IdListIndex(pSrc->a[iLeft].u3.pUsing, zName)<0 + ){ + sqlite3ErrorMsg(pParse, "ambiguous reference to %s in USING()", + zName); + break; + } + pFuncArgs = sqlite3ExprListAppend(pParse, pFuncArgs, pE1); + pE1 = sqlite3CreateColumnExpr(db, pSrc, iLeft, iLeftCol); + sqlite3SrcItemColumnUsed(&pSrc->a[iLeft], iLeftCol); + } + if( pFuncArgs ){ + pFuncArgs = sqlite3ExprListAppend(pParse, pFuncArgs, pE1); + pE1 = sqlite3ExprFunction(pParse, pFuncArgs, &tkCoalesce, 0); + } + } + pE2 = sqlite3CreateColumnExpr(db, pSrc, i+1, iRightCol); + sqlite3SrcItemColumnUsed(pRight, iRightCol); + pEq = sqlite3PExpr(pParse, TK_EQ, pE1, pE2); + assert( pE2!=0 || pEq==0 ); + if( pEq ){ + ExprSetProperty(pEq, joinType); + assert( !ExprHasProperty(pEq, EP_TokenOnly|EP_Reduced) ); + ExprSetVVAProperty(pEq, EP_NoReduce); + pEq->w.iJoin = pE2->iTable; + } + p->pWhere = sqlite3ExprAnd(pParse, p->pWhere, pEq); } } + + /* Add the ON clause to the end of the WHERE clause, connected by + ** an AND operator. + */ + else if( pRight->u3.pOn ){ + sqlite3SetJoinExpr(pRight->u3.pOn, pRight->iCursor, joinType); + p->pWhere = sqlite3ExprAnd(pParse, p->pWhere, pRight->u3.pOn); + pRight->u3.pOn = 0; + pRight->fg.isOn = 1; + } } return 0; } @@ -135196,7 +138356,7 @@ static void fixDistinctOpenEph( ** retrieved directly from table t1. If the values are very large, this ** can be more efficient than storing them directly in the sorter records. ** -** The ExprList_item.bSorterRef flag is set for each expression in pEList +** The ExprList_item.fg.bSorterRef flag is set for each expression in pEList ** for which the sorter-reference optimization should be enabled. ** Additionally, the pSort->aDefer[] array is populated with entries ** for all cursors required to evaluate all selected expressions. Finally. @@ -135256,7 +138416,7 @@ static void selectExprDefer( nDefer++; } } - pItem->bSorterRef = 1; + pItem->fg.bSorterRef = 1; } } } @@ -135387,7 +138547,7 @@ static void selectInnerLoop( for(i=0; inExpr; i++){ if( pEList->a[i].u.x.iOrderByCol>0 #ifdef SQLITE_ENABLE_SORTER_REFERENCES - || pEList->a[i].bSorterRef + || pEList->a[i].fg.bSorterRef #endif ){ nResultCol--; @@ -135475,6 +138635,9 @@ static void selectInnerLoop( testcase( eDest==SRT_Fifo ); testcase( eDest==SRT_DistFifo ); sqlite3VdbeAddOp3(v, OP_MakeRecord, regResult, nResultCol, r1+nPrefixReg); + if( pDest->zAffSdst ){ + sqlite3VdbeChangeP4(v, -1, pDest->zAffSdst, nResultCol); + } #ifndef SQLITE_OMIT_CTE if( eDest==SRT_DistFifo ){ /* If the destination is DistFifo, then cursor (iParm+1) is open @@ -135690,9 +138853,10 @@ SQLITE_PRIVATE KeyInfo *sqlite3KeyInfoAlloc(sqlite3 *db, int N, int X){ */ SQLITE_PRIVATE void sqlite3KeyInfoUnref(KeyInfo *p){ if( p ){ + assert( p->db!=0 ); assert( p->nRef>0 ); p->nRef--; - if( p->nRef==0 ) sqlite3DbFreeNN(p->db, p); + if( p->nRef==0 ) sqlite3DbNNFreeNN(p->db, p); } } @@ -135749,7 +138913,7 @@ SQLITE_PRIVATE KeyInfo *sqlite3KeyInfoFromExprList( assert( sqlite3KeyInfoIsWriteable(pInfo) ); for(i=iStart, pItem=pList->a+iStart; iaColl[i-iStart] = sqlite3ExprNNCollSeq(pParse, pItem->pExpr); - pInfo->aSortFlags[i-iStart] = pItem->sortFlags; + pInfo->aSortFlags[i-iStart] = pItem->fg.sortFlags; } } return pInfo; @@ -135877,7 +139041,7 @@ static void generateSortTail( if( addrOnce ) sqlite3VdbeJumpHere(v, addrOnce); addr = 1 + sqlite3VdbeAddOp2(v, OP_SorterSort, iTab, addrBreak); VdbeCoverage(v); - codeOffset(v, p->iOffset, addrContinue); + assert( p->iLimit==0 && p->iOffset==0 ); sqlite3VdbeAddOp3(v, OP_SorterData, iTab, regSortOut, iSortTab); bSeq = 0; }else{ @@ -135885,10 +139049,13 @@ static void generateSortTail( codeOffset(v, p->iOffset, addrContinue); iSortTab = iTab; bSeq = 1; + if( p->iOffset>0 ){ + sqlite3VdbeAddOp2(v, OP_AddImm, p->iLimit, -1); + } } for(i=0, iCol=nKey+bSeq-1; i=0; i--){ #ifdef SQLITE_ENABLE_SORTER_REFERENCES - if( aOutEx[i].bSorterRef ){ + if( aOutEx[i].fg.bSorterRef ){ sqlite3ExprCode(pParse, aOutEx[i].pExpr, regRow+i); }else #endif @@ -136009,9 +139176,6 @@ static void generateSortTail( ** Return a pointer to a string containing the 'declaration type' of the ** expression pExpr. The string may be treated as static by the caller. ** -** Also try to estimate the size of the returned value and return that -** result in *pEstWidth. -** ** The declaration type is the exact datatype definition extracted from the ** original CREATE TABLE statement if the expression is a column. The ** declaration type for a ROWID field is INTEGER. Exactly when an expression @@ -136291,7 +139455,7 @@ SQLITE_PRIVATE void sqlite3GenerateColumnNames( assert( p->op!=TK_AGG_COLUMN ); /* Agg processing has not run yet */ assert( p->op!=TK_COLUMN || (ExprUseYTab(p) && p->y.pTab!=0) ); /* Covering idx not yet coded */ - if( pEList->a[i].zEName && pEList->a[i].eEName==ENAME_NAME ){ + if( pEList->a[i].zEName && pEList->a[i].fg.eEName==ENAME_NAME ){ /* An AS clause always takes first priority */ char *zName = pEList->a[i].zEName; sqlite3VdbeSetColName(v, i, COLNAME_NAME, zName, SQLITE_TRANSIENT); @@ -136376,22 +139540,25 @@ SQLITE_PRIVATE int sqlite3ColumnsFromExprList( *paCol = aCol; for(i=0, pCol=aCol; imallocFailed; i++, pCol++){ + struct ExprList_item *pX = &pEList->a[i]; + struct ExprList_item *pCollide; /* Get an appropriate name for the column */ - if( (zName = pEList->a[i].zEName)!=0 && pEList->a[i].eEName==ENAME_NAME ){ + if( (zName = pX->zEName)!=0 && pX->fg.eEName==ENAME_NAME ){ /* If the column contains an "AS " phrase, use as the name */ }else{ - Expr *pColExpr = sqlite3ExprSkipCollateAndLikely(pEList->a[i].pExpr); + Expr *pColExpr = sqlite3ExprSkipCollateAndLikely(pX->pExpr); while( ALWAYS(pColExpr!=0) && pColExpr->op==TK_DOT ){ pColExpr = pColExpr->pRight; assert( pColExpr!=0 ); } if( pColExpr->op==TK_COLUMN && ALWAYS( ExprUseYTab(pColExpr) ) - && (pTab = pColExpr->y.pTab)!=0 + && ALWAYS( pColExpr->y.pTab!=0 ) ){ /* For columns use the column name name */ int iCol = pColExpr->iColumn; + pTab = pColExpr->y.pTab; if( iCol<0 ) iCol = pTab->iPKey; zName = iCol>=0 ? pTab->aCol[iCol].zCnName : "rowid"; }else if( pColExpr->op==TK_ID ){ @@ -136399,7 +139566,7 @@ SQLITE_PRIVATE int sqlite3ColumnsFromExprList( zName = pColExpr->u.zToken; }else{ /* Use the original text of the column expression as its name */ - zName = pEList->a[i].zEName; + assert( zName==pX->zEName ); /* pointer comparison intended */ } } if( zName && !sqlite3IsTrueOrFalse(zName) ){ @@ -136412,7 +139579,10 @@ SQLITE_PRIVATE int sqlite3ColumnsFromExprList( ** append an integer to the name so that it becomes unique. */ cnt = 0; - while( zName && sqlite3HashFind(&ht, zName)!=0 ){ + while( zName && (pCollide = sqlite3HashFind(&ht, zName))!=0 ){ + if( pCollide->fg.bUsingTerm ){ + pCol->colFlags |= COLFLAG_NOEXPAND; + } nName = sqlite3Strlen30(zName); if( nName>0 ){ for(j=nName-1; j>0 && sqlite3Isdigit(zName[j]); j--){} @@ -136423,8 +139593,11 @@ SQLITE_PRIVATE int sqlite3ColumnsFromExprList( } pCol->zCnName = zName; pCol->hName = sqlite3StrIHash(zName); + if( pX->fg.bNoExpand ){ + pCol->colFlags |= COLFLAG_NOEXPAND; + } sqlite3ColumnPropertiesFromName(0, pCol); - if( zName && sqlite3HashInsert(&ht, zName, pCol)==pCol ){ + if( zName && sqlite3HashInsert(&ht, zName, pX)==pX ){ sqlite3OomFault(db); } } @@ -136681,7 +139854,7 @@ static KeyInfo *multiSelectOrderByKeyInfo(Parse *pParse, Select *p, int nExtra){ } assert( sqlite3KeyInfoIsWriteable(pRet) ); pRet->aColl[i] = pColl; - pRet->aSortFlags[i] = pOrderBy->a[i].sortFlags; + pRet->aSortFlags[i] = pOrderBy->a[i].fg.sortFlags; } } @@ -136899,7 +140072,7 @@ static int multiSelectOrderBy( ** The "LIMIT of exactly 1" case of condition (1) comes about when a VALUES ** clause occurs within scalar expression (ex: "SELECT (VALUES(1),(2),(3))"). ** The sqlite3CodeSubselect will have added the LIMIT 1 clause in tht case. -** Since the limit is exactly 1, we only need to evalutes the left-most VALUES. +** Since the limit is exactly 1, we only need to evaluate the left-most VALUES. */ static int multiSelectValues( Parse *pParse, /* Parsing context */ @@ -137868,10 +141041,11 @@ static int multiSelectOrderBy( */ sqlite3VdbeResolveLabel(v, labelEnd); - /* Reassembly the compound query so that it will be freed correctly - ** by the calling function */ + /* Make arrangements to free the 2nd and subsequent arms of the compound + ** after the parse has finished */ if( pSplit->pPrior ){ - sqlite3SelectDelete(db, pSplit->pPrior); + sqlite3ParserAddCleanup(pParse, + (void(*)(sqlite3*,void*))sqlite3SelectDelete, pSplit->pPrior); } pSplit->pPrior = pPrior; pPrior->pNext = pSplit; @@ -137892,13 +141066,42 @@ static int multiSelectOrderBy( ** ** All references to columns in table iTable are to be replaced by corresponding ** expressions in pEList. +** +** ## About "isOuterJoin": +** +** The isOuterJoin column indicates that the replacement will occur into a +** position in the parent that NULL-able due to an OUTER JOIN. Either the +** target slot in the parent is the right operand of a LEFT JOIN, or one of +** the left operands of a RIGHT JOIN. In either case, we need to potentially +** bypass the substituted expression with OP_IfNullRow. +** +** Suppose the original expression is an integer constant. Even though the table +** has the nullRow flag set, because the expression is an integer constant, +** it will not be NULLed out. So instead, we insert an OP_IfNullRow opcode +** that checks to see if the nullRow flag is set on the table. If the nullRow +** flag is set, then the value in the register is set to NULL and the original +** expression is bypassed. If the nullRow flag is not set, then the original +** expression runs to populate the register. +** +** Example where this is needed: +** +** CREATE TABLE t1(a INTEGER PRIMARY KEY, b INT); +** CREATE TABLE t2(x INT UNIQUE); +** +** SELECT a,b,m,x FROM t1 LEFT JOIN (SELECT 59 AS m,x FROM t2) ON b=x; +** +** When the subquery on the right side of the LEFT JOIN is flattened, we +** have to add OP_IfNullRow in front of the OP_Integer that implements the +** "m" value of the subquery so that a NULL will be loaded instead of 59 +** when processing a non-matched row of the left. */ typedef struct SubstContext { Parse *pParse; /* The parsing context */ int iTable; /* Replace references to this table */ int iNewTable; /* New table number */ - int isLeftJoin; /* Add TK_IF_NULL_ROW opcodes on each replacement */ + int isOuterJoin; /* Add TK_IF_NULL_ROW opcodes on each replacement */ ExprList *pEList; /* Replacement expressions */ + ExprList *pCList; /* Collation sequences for replacement expr */ } SubstContext; /* Forward Declarations */ @@ -137923,10 +141126,11 @@ static Expr *substExpr( Expr *pExpr /* Expr in which substitution occurs */ ){ if( pExpr==0 ) return 0; - if( ExprHasProperty(pExpr, EP_FromJoin) - && pExpr->w.iRightJoinTable==pSubst->iTable + if( ExprHasProperty(pExpr, EP_OuterON|EP_InnerON) + && pExpr->w.iJoin==pSubst->iTable ){ - pExpr->w.iRightJoinTable = pSubst->iNewTable; + testcase( ExprHasProperty(pExpr, EP_InnerON) ); + pExpr->w.iJoin = pSubst->iNewTable; } if( pExpr->op==TK_COLUMN && pExpr->iTable==pSubst->iTable @@ -137939,19 +141143,21 @@ static Expr *substExpr( #endif { Expr *pNew; - Expr *pCopy = pSubst->pEList->a[pExpr->iColumn].pExpr; + int iColumn = pExpr->iColumn; + Expr *pCopy = pSubst->pEList->a[iColumn].pExpr; Expr ifNullRow; - assert( pSubst->pEList!=0 && pExpr->iColumnpEList->nExpr ); + assert( pSubst->pEList!=0 && iColumnpEList->nExpr ); assert( pExpr->pRight==0 ); if( sqlite3ExprIsVector(pCopy) ){ sqlite3VectorErrorMsg(pSubst->pParse, pCopy); }else{ sqlite3 *db = pSubst->pParse->db; - if( pSubst->isLeftJoin && pCopy->op!=TK_COLUMN ){ + if( pSubst->isOuterJoin && pCopy->op!=TK_COLUMN ){ memset(&ifNullRow, 0, sizeof(ifNullRow)); ifNullRow.op = TK_IF_NULL_ROW; ifNullRow.pLeft = pCopy; ifNullRow.iTable = pSubst->iNewTable; + ifNullRow.iColumn = -99; ifNullRow.flags = EP_IfNullRow; pCopy = &ifNullRow; } @@ -137961,22 +141167,33 @@ static Expr *substExpr( sqlite3ExprDelete(db, pNew); return pExpr; } - if( pSubst->isLeftJoin ){ + if( pSubst->isOuterJoin ){ ExprSetProperty(pNew, EP_CanBeNull); } - if( ExprHasProperty(pExpr,EP_FromJoin) ){ - sqlite3SetJoinExpr(pNew, pExpr->w.iRightJoinTable); + if( ExprHasProperty(pExpr,EP_OuterON|EP_InnerON) ){ + sqlite3SetJoinExpr(pNew, pExpr->w.iJoin, + pExpr->flags & (EP_OuterON|EP_InnerON)); } sqlite3ExprDelete(db, pExpr); pExpr = pNew; + if( pExpr->op==TK_TRUEFALSE ){ + pExpr->u.iValue = sqlite3ExprTruthValue(pExpr); + pExpr->op = TK_INTEGER; + ExprSetProperty(pExpr, EP_IntValue); + } /* Ensure that the expression now has an implicit collation sequence, ** just as it did when it was a column of a view or sub-query. */ - if( pExpr->op!=TK_COLUMN && pExpr->op!=TK_COLLATE ){ - CollSeq *pColl = sqlite3ExprCollSeq(pSubst->pParse, pExpr); - pExpr = sqlite3ExprAddCollateString(pSubst->pParse, pExpr, - (pColl ? pColl->zName : "BINARY") + { + CollSeq *pNat = sqlite3ExprCollSeq(pSubst->pParse, pExpr); + CollSeq *pColl = sqlite3ExprCollSeq(pSubst->pParse, + pSubst->pCList->a[iColumn].pExpr ); + if( pNat!=pColl || (pExpr->op!=TK_COLUMN && pExpr->op!=TK_COLLATE) ){ + pExpr = sqlite3ExprAddCollateString(pSubst->pParse, pExpr, + (pColl ? pColl->zName : "BINARY") + ); + } } ExprClearProperty(pExpr, EP_Collate); } @@ -138129,8 +141346,8 @@ static int renumberCursorsCb(Walker *pWalker, Expr *pExpr){ if( op==TK_COLUMN || op==TK_IF_NULL_ROW ){ renumberCursorDoMapping(pWalker, &pExpr->iTable); } - if( ExprHasProperty(pExpr, EP_FromJoin) ){ - renumberCursorDoMapping(pWalker, &pExpr->w.iRightJoinTable); + if( ExprHasProperty(pExpr, EP_OuterON) ){ + renumberCursorDoMapping(pWalker, &pExpr->w.iJoin); } return WRC_Continue; } @@ -138169,6 +141386,18 @@ static void renumberCursors( } #endif /* !defined(SQLITE_OMIT_SUBQUERY) || !defined(SQLITE_OMIT_VIEW) */ +/* +** If pSel is not part of a compound SELECT, return a pointer to its +** expression list. Otherwise, return a pointer to the expression list +** of the leftmost SELECT in the compound. +*/ +static ExprList *findLeftmostExprlist(Select *pSel){ + while( pSel->pPrior ){ + pSel = pSel->pPrior; + } + return pSel->pEList; +} + #if !defined(SQLITE_OMIT_SUBQUERY) || !defined(SQLITE_OMIT_VIEW) /* ** This routine attempts to flatten subqueries as a performance optimization. @@ -138213,8 +141442,10 @@ static void renumberCursors( ** (3a) the subquery may not be a join and ** (3b) the FROM clause of the subquery may not contain a virtual ** table and -** (3c) the outer query may not be an aggregate. +** (**) Was: "The outer query may not have a GROUP BY." This case +** is now managed correctly ** (3d) the outer query may not be DISTINCT. +** See also (26) for restrictions on RIGHT JOIN. ** ** (4) The subquery can not be DISTINCT. ** @@ -138266,6 +141497,11 @@ static void renumberCursors( ** (17d2) DISTINCT ** (17e) the subquery may not contain window functions, and ** (17f) the subquery must not be the RHS of a LEFT JOIN. +** (17g) either the subquery is the first element of the outer +** query or there are no RIGHT or FULL JOINs in any arm +** of the subquery. (This is a duplicate of condition (27b).) +** (17h) The corresponding result set expressions in all arms of the +** compound must have the same affinity. ** ** The parent and sub-query may contain WHERE clauses. Subject to ** rules (11), (13) and (14), they may also contain ORDER BY, @@ -138313,6 +141549,17 @@ static void renumberCursors( ** function in the select list or ORDER BY clause, flattening ** is not attempted. ** +** (26) The subquery may not be the right operand of a RIGHT JOIN. +** See also (3) for restrictions on LEFT JOIN. +** +** (27) The subquery may not contain a FULL or RIGHT JOIN unless it +** is the first element of the parent query. Two subcases: +** (27a) the subquery is not a compound query. +** (27b) the subquery is a compound query and the RIGHT JOIN occurs +** in any arm of the compound query. (See also (17g).) +** +** (28) The subquery is not a MATERIALIZED CTE. +** ** ** In this routine, the "p" parameter is a pointer to the outer query. ** The subquery is p->pSrc->a[iFrom]. isAgg is true if the outer query @@ -138338,7 +141585,7 @@ static int flattenSubquery( SrcList *pSubSrc; /* The FROM clause of the subquery */ int iParent; /* VDBE cursor number of the pSub result set temp table */ int iNewParent = -1;/* Replacement table for iParent */ - int isLeftJoin = 0; /* True if pSub is the right side of a LEFT JOIN */ + int isOuterJoin = 0; /* True if pSub is the right side of a LEFT JOIN */ int i; /* Loop counter */ Expr *pWhere; /* The WHERE clause */ SrcItem *pSubitem; /* The subquery */ @@ -138404,32 +141651,26 @@ static int flattenSubquery( ** ** which is not at all the same thing. ** - ** If the subquery is the right operand of a LEFT JOIN, then the outer - ** query cannot be an aggregate. (3c) This is an artifact of the way - ** aggregates are processed - there is no mechanism to determine if - ** the LEFT JOIN table should be all-NULL. - ** ** See also tickets #306, #350, and #3300. */ - if( (pSubitem->fg.jointype & JT_OUTER)!=0 ){ - isLeftJoin = 1; - if( pSubSrc->nSrc>1 /* (3a) */ - || isAgg /* (3b) */ - || IsVirtual(pSubSrc->a[0].pTab) /* (3c) */ - || (p->selFlags & SF_Distinct)!=0 /* (3d) */ + if( (pSubitem->fg.jointype & (JT_OUTER|JT_LTORJ))!=0 ){ + if( pSubSrc->nSrc>1 /* (3a) */ + || IsVirtual(pSubSrc->a[0].pTab) /* (3b) */ + || (p->selFlags & SF_Distinct)!=0 /* (3d) */ + || (pSubitem->fg.jointype & JT_RIGHT)!=0 /* (26) */ ){ return 0; } + isOuterJoin = 1; } -#ifdef SQLITE_EXTRA_IFNULLROW - else if( iFrom>0 && !isAgg ){ - /* Setting isLeftJoin to -1 causes OP_IfNullRow opcodes to be generated for - ** every reference to any result column from subquery in a join, even - ** though they are not necessary. This will stress-test the OP_IfNullRow - ** opcode. */ - isLeftJoin = -1; + + assert( pSubSrc->nSrc>0 ); /* True by restriction (7) */ + if( iFrom>0 && (pSubSrc->a[0].fg.jointype & JT_LTORJ)!=0 ){ + return 0; /* Restriction (27a) */ + } + if( pSubitem->fg.isCte && pSubitem->u2.pCteUse->eM10d==M10d_Yes ){ + return 0; /* (28) */ } -#endif /* Restriction (17): If the sub-query is a compound SELECT, then it must ** use only the UNION ALL operator. And none of the simple select queries @@ -138437,10 +141678,11 @@ static int flattenSubquery( ** queries. */ if( pSub->pPrior ){ + int ii; if( pSub->pOrderBy ){ return 0; /* Restriction (20) */ } - if( isAgg || (p->selFlags & SF_Distinct)!=0 || isLeftJoin>0 ){ + if( isAgg || (p->selFlags & SF_Distinct)!=0 || isOuterJoin>0 ){ return 0; /* (17d1), (17d2), or (17f) */ } for(pSub1=pSub; pSub1; pSub1=pSub1->pPrior){ @@ -138458,12 +141700,17 @@ static int flattenSubquery( ){ return 0; } + if( iFrom>0 && (pSub1->pSrc->a[0].fg.jointype & JT_LTORJ)!=0 ){ + /* Without this restriction, the JT_LTORJ flag would end up being + ** omitted on left-hand tables of the right join that is being + ** flattened. */ + return 0; /* Restrictions (17g), (27b) */ + } testcase( pSub1->pSrc->nSrc>1 ); } /* Restriction (18). */ if( p->pOrderBy ){ - int ii; for(ii=0; iipOrderBy->nExpr; ii++){ if( p->pOrderBy->a[ii].u.x.iOrderByCol==0 ) return 0; } @@ -138472,8 +141719,24 @@ static int flattenSubquery( /* Restriction (23) */ if( (p->selFlags & SF_Recursive) ) return 0; + /* Restriction (17h) */ + for(ii=0; iipEList->nExpr; ii++){ + char aff; + assert( pSub->pEList->a[ii].pExpr!=0 ); + aff = sqlite3ExprAffinity(pSub->pEList->a[ii].pExpr); + for(pSub1=pSub->pPrior; pSub1; pSub1=pSub1->pPrior){ + assert( pSub1->pEList!=0 ); + assert( pSub1->pEList->nExpr>ii ); + assert( pSub1->pEList->a[ii].pExpr!=0 ); + if( sqlite3ExprAffinity(pSub1->pEList->a[ii].pExpr)!=aff ){ + return 0; + } + } + } + if( pSrc->nSrc>1 ){ if( pParse->nSelect>500 ) return 0; + if( OptimizationDisabled(db, SQLITE_FlttnUnionAll) ) return 0; aCsrMap = sqlite3DbMallocZero(db, ((i64)pParse->nTab+1)*sizeof(int)); if( aCsrMap ) aCsrMap[0] = pParse->nTab; } @@ -138498,7 +141761,7 @@ static int flattenSubquery( pSubitem->zName = 0; pSubitem->zAlias = 0; pSubitem->pSelect = 0; - assert( pSubitem->pOn==0 ); + assert( pSubitem->fg.isUsing!=0 || pSubitem->u3.pOn==0 ); /* If the sub-query is a compound SELECT statement, then (by restrictions ** 17 and 18 above) it must be a UNION ALL and the parent query must @@ -138608,6 +141871,7 @@ static int flattenSubquery( for(pParent=p; pParent; pParent=pParent->pPrior, pSub=pSub->pPrior){ int nSubSrc; u8 jointype = 0; + u8 ltorj = pSrc->a[iFrom].fg.jointype & JT_LTORJ; assert( pSub!=0 ); pSubSrc = pSub->pSrc; /* FROM clause of subquery */ nSubSrc = pSubSrc->nSrc; /* Number of terms in subquery FROM clause */ @@ -138642,13 +141906,16 @@ static int flattenSubquery( ** outer query. */ for(i=0; ia[i+iFrom].pUsing); - assert( pSrc->a[i+iFrom].fg.isTabFunc==0 ); - pSrc->a[i+iFrom] = pSubSrc->a[i]; + SrcItem *pItem = &pSrc->a[i+iFrom]; + if( pItem->fg.isUsing ) sqlite3IdListDelete(db, pItem->u3.pUsing); + assert( pItem->fg.isTabFunc==0 ); + *pItem = pSubSrc->a[i]; + pItem->fg.jointype |= ltorj; iNewParent = pSubSrc->a[i].iCursor; memset(&pSubSrc->a[i], 0, sizeof(pSubSrc->a[i])); } - pSrc->a[iFrom].fg.jointype = jointype; + pSrc->a[iFrom].fg.jointype &= JT_LTORJ; + pSrc->a[iFrom].fg.jointype |= jointype | ltorj; /* Now begin substituting subquery result set expressions for ** references to the iParent in the outer query. @@ -138683,8 +141950,8 @@ static int flattenSubquery( } pWhere = pSub->pWhere; pSub->pWhere = 0; - if( isLeftJoin>0 ){ - sqlite3SetJoinExpr(pWhere, iNewParent); + if( isOuterJoin>0 ){ + sqlite3SetJoinExpr(pWhere, iNewParent, EP_OuterON); } if( pWhere ){ if( pParent->pWhere ){ @@ -138698,8 +141965,9 @@ static int flattenSubquery( x.pParse = pParse; x.iTable = iParent; x.iNewTable = iNewParent; - x.isLeftJoin = isLeftJoin; + x.isOuterJoin = isOuterJoin; x.pEList = pSub->pEList; + x.pCList = findLeftmostExprlist(pSub); substSelect(&x, pParent, 0); } @@ -138719,7 +141987,7 @@ static int flattenSubquery( pSub->pLimit = 0; } - /* Recompute the SrcList_item.colUsed masks for the flattened + /* Recompute the SrcItem.colUsed masks for the flattened ** tables. */ for(i=0; ia[i+iFrom]); @@ -138733,8 +142001,8 @@ static int flattenSubquery( sqlite3WalkSelect(&w,pSub1); sqlite3SelectDelete(db, pSub1); -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x100 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x100 ){ SELECTTRACE(0x100,pParse,p,("After flattening:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -138755,6 +142023,8 @@ struct WhereConst { int nConst; /* Number for COLUMN=CONSTANT terms */ int nChng; /* Number of times a constant is propagated */ int bHasAffBlob; /* At least one column in apExpr[] as affinity BLOB */ + u32 mExcludeOn; /* Which ON expressions to exclude from considertion. + ** Either EP_OuterON or EP_InnerON|EP_OuterON */ Expr **apExpr; /* [i*2] is COLUMN and [i*2+1] is VALUE */ }; @@ -138817,7 +142087,11 @@ static void constInsert( static void findConstInWhere(WhereConst *pConst, Expr *pExpr){ Expr *pRight, *pLeft; if( NEVER(pExpr==0) ) return; - if( ExprHasProperty(pExpr, EP_FromJoin) ) return; + if( ExprHasProperty(pExpr, pConst->mExcludeOn) ){ + testcase( ExprHasProperty(pExpr, EP_OuterON) ); + testcase( ExprHasProperty(pExpr, EP_InnerON) ); + return; + } if( pExpr->op==TK_AND ){ findConstInWhere(pConst, pExpr->pRight); findConstInWhere(pConst, pExpr->pLeft); @@ -138853,9 +142127,10 @@ static int propagateConstantExprRewriteOne( int i; if( pConst->pOomFault[0] ) return WRC_Prune; if( pExpr->op!=TK_COLUMN ) return WRC_Continue; - if( ExprHasProperty(pExpr, EP_FixedCol|EP_FromJoin) ){ + if( ExprHasProperty(pExpr, EP_FixedCol|pConst->mExcludeOn) ){ testcase( ExprHasProperty(pExpr, EP_FixedCol) ); - testcase( ExprHasProperty(pExpr, EP_FromJoin) ); + testcase( ExprHasProperty(pExpr, EP_OuterON) ); + testcase( ExprHasProperty(pExpr, EP_InnerON) ); return WRC_Continue; } for(i=0; inConst; i++){ @@ -138979,6 +142254,17 @@ static int propagateConstants( x.nChng = 0; x.apExpr = 0; x.bHasAffBlob = 0; + if( ALWAYS(p->pSrc!=0) + && p->pSrc->nSrc>0 + && (p->pSrc->a[0].fg.jointype & JT_LTORJ)!=0 + ){ + /* Do not propagate constants on any ON clause if there is a + ** RIGHT JOIN anywhere in the query */ + x.mExcludeOn = EP_InnerON | EP_OuterON; + }else{ + /* Do not propagate constants through the ON clause of a LEFT JOIN */ + x.mExcludeOn = EP_OuterON; + } findConstInWhere(&x, p->pWhere); if( x.nConst ){ memset(&w, 0, sizeof(w)); @@ -139091,6 +142377,13 @@ static int pushDownWindowCheck(Parse *pParse, Select *pSubq, Expr *pExpr){ ** be materialized. (This restriction is implemented in the calling ** routine.) ** +** (8) The subquery may not be a compound that uses UNION, INTERSECT, +** or EXCEPT. (We could, perhaps, relax this restriction to allow +** this case if none of the comparisons operators between left and +** right arms of the compound use a collation other than BINARY. +** But it is a lot of work to check that case for an obscure and +** minor optimization, so we omit it for now.) +** ** Return 0 if no changes are made and non-zero if one or more WHERE clause ** terms are duplicated into the subquery. */ @@ -139104,11 +142397,16 @@ static int pushDownWhereTerms( int nChng = 0; if( pWhere==0 ) return 0; if( pSubq->selFlags & (SF_Recursive|SF_MultiPart) ) return 0; + if( pSrc->fg.jointype & (JT_LTORJ|JT_RIGHT) ) return 0; #ifndef SQLITE_OMIT_WINDOWFUNC if( pSubq->pPrior ){ Select *pSel; for(pSel=pSubq; pSel; pSel=pSel->pPrior){ + u8 op = pSel->op; + assert( op==TK_ALL || op==TK_SELECT + || op==TK_UNION || op==TK_INTERSECT || op==TK_EXCEPT ); + if( op!=TK_ALL && op!=TK_SELECT ) return 0; /* restriction (8) */ if( pSel->pWin ) return 0; /* restriction (6b) */ } }else{ @@ -139139,13 +142437,13 @@ static int pushDownWhereTerms( #if 0 /* Legacy code. Checks now done by sqlite3ExprIsTableConstraint() */ if( isLeftJoin - && (ExprHasProperty(pWhere,EP_FromJoin)==0 - || pWhere->w.iRightJoinTable!=iCursor) + && (ExprHasProperty(pWhere,EP_OuterON)==0 + || pWhere->w.iJoin!=iCursor) ){ return 0; /* restriction (4) */ } - if( ExprHasProperty(pWhere,EP_FromJoin) - && pWhere->w.iRightJoinTable!=iCursor + if( ExprHasProperty(pWhere,EP_OuterON) + && pWhere->w.iJoin!=iCursor ){ return 0; /* restriction (5) */ } @@ -139157,12 +142455,13 @@ static int pushDownWhereTerms( while( pSubq ){ SubstContext x; pNew = sqlite3ExprDup(pParse->db, pWhere, 0); - unsetJoinExpr(pNew, -1); + unsetJoinExpr(pNew, -1, 1); x.pParse = pParse; x.iTable = pSrc->iCursor; x.iNewTable = pSrc->iCursor; - x.isLeftJoin = 0; + x.isOuterJoin = 0; x.pEList = pSubq->pEList; + x.pCList = findLeftmostExprlist(pSubq); pNew = substExpr(&x, pNew); #ifndef SQLITE_OMIT_WINDOWFUNC if( pSubq->pWin && 0==pushDownWindowCheck(pParse, pSubq, pNew) ){ @@ -139234,7 +142533,7 @@ static u8 minMaxQuery(sqlite3 *db, Expr *pFunc, ExprList **ppMinMax){ } *ppMinMax = pOrderBy = sqlite3ExprListDup(db, pEList, 0); assert( pOrderBy!=0 || db->mallocFailed ); - if( pOrderBy ) pOrderBy->a[0].sortFlags = sortFlags; + if( pOrderBy ) pOrderBy->a[0].fg.sortFlags = sortFlags; return eRet; } @@ -139266,6 +142565,7 @@ static Table *isSimpleCount(Select *p, AggInfo *pAggInfo){ || p->pSrc->nSrc!=1 || p->pSrc->a[0].pSelect || pAggInfo->nFunc!=1 + || p->pHaving ){ return 0; } @@ -139370,7 +142670,7 @@ static int convertCompoundSelectToSubquery(Walker *pWalker, Select *p){ pNew = sqlite3DbMallocZero(db, sizeof(*pNew) ); if( pNew==0 ) return WRC_Abort; memset(&dummy, 0, sizeof(dummy)); - pNewSrc = sqlite3SrcListAppendFromTerm(pParse,0,0,0,&dummy,pNew,0,0); + pNewSrc = sqlite3SrcListAppendFromTerm(pParse,0,0,0,&dummy,pNew,0); if( pNewSrc==0 ) return WRC_Abort; *pNew = *p; p->pSrc = pNewSrc; @@ -139686,9 +142986,9 @@ SQLITE_PRIVATE void sqlite3SelectPopWith(Walker *pWalker, Select *p){ #endif /* -** The SrcList_item structure passed as the second argument represents a +** The SrcItem structure passed as the second argument represents a ** sub-query in the FROM clause of a SELECT statement. This function -** allocates and populates the SrcList_item.pTab object. If successful, +** allocates and populates the SrcItem.pTab object. If successful, ** SQLITE_OK is returned. Otherwise, if an OOM error is encountered, ** SQLITE_NOMEM. */ @@ -139703,7 +143003,7 @@ SQLITE_PRIVATE int sqlite3ExpandSubquery(Parse *pParse, SrcItem *pFrom){ if( pFrom->zAlias ){ pTab->zName = sqlite3DbStrDup(pParse->db, pFrom->zAlias); }else{ - pTab->zName = sqlite3MPrintf(pParse->db, "subquery_%u", pSel->selId); + pTab->zName = sqlite3MPrintf(pParse->db, "%!S", pFrom); } while( pSel->pPrior ){ pSel = pSel->pPrior; } sqlite3ColumnsFromExprList(pParse, pSel->pEList,&pTab->nCol,&pTab->aCol); @@ -139715,11 +143015,35 @@ SQLITE_PRIVATE int sqlite3ExpandSubquery(Parse *pParse, SrcItem *pFrom){ #else pTab->tabFlags |= TF_Ephemeral; /* Legacy compatibility mode */ #endif + return pParse->nErr ? SQLITE_ERROR : SQLITE_OK; +} - return pParse->nErr ? SQLITE_ERROR : SQLITE_OK; +/* +** Check the N SrcItem objects to the right of pBase. (N might be zero!) +** If any of those SrcItem objects have a USING clause containing zName +** then return true. +** +** If N is zero, or none of the N SrcItem objects to the right of pBase +** contains a USING clause, or if none of the USING clauses contain zName, +** then return false. +*/ +static int inAnyUsingClause( + const char *zName, /* Name we are looking for */ + SrcItem *pBase, /* The base SrcItem. Looking at pBase[1] and following */ + int N /* How many SrcItems to check */ +){ + while( N>0 ){ + N--; + pBase++; + if( pBase->fg.isUsing==0 ) continue; + if( NEVER(pBase->u3.pUsing==0) ) continue; + if( sqlite3IdListIndex(pBase->u3.pUsing, zName)>=0 ) return 1; + } + return 0; } + /* ** This routine is a Walker callback for "expanding" a SELECT statement. ** "Expanding" means to do the following: @@ -139869,7 +143193,7 @@ static int selectExpander(Walker *pWalker, Select *p){ /* Process NATURAL keywords, and ON and USING clauses of joins. */ assert( db->mallocFailed==0 || pParse->nErr!=0 ); - if( pParse->nErr || sqliteProcessJoin(pParse, p) ){ + if( pParse->nErr || sqlite3ProcessJoin(pParse, p) ){ return WRC_Abort; } @@ -139917,7 +143241,7 @@ static int selectExpander(Walker *pWalker, Select *p){ pNew = sqlite3ExprListAppend(pParse, pNew, a[k].pExpr); if( pNew ){ pNew->a[pNew->nExpr-1].zEName = a[k].zEName; - pNew->a[pNew->nExpr-1].eEName = a[k].eEName; + pNew->a[pNew->nExpr-1].fg.eEName = a[k].fg.eEName; a[k].zEName = 0; } a[k].pExpr = 0; @@ -139932,32 +143256,60 @@ static int selectExpander(Walker *pWalker, Select *p){ zTName = pE->pLeft->u.zToken; } for(i=0, pFrom=pTabList->a; inSrc; i++, pFrom++){ - Table *pTab = pFrom->pTab; - Select *pSub = pFrom->pSelect; - char *zTabName = pFrom->zAlias; - const char *zSchemaName = 0; - int iDb; - if( zTabName==0 ){ + Table *pTab = pFrom->pTab; /* Table for this data source */ + ExprList *pNestedFrom; /* Result-set of a nested FROM clause */ + char *zTabName; /* AS name for this data source */ + const char *zSchemaName = 0; /* Schema name for this data source */ + int iDb; /* Schema index for this data src */ + IdList *pUsing; /* USING clause for pFrom[1] */ + + if( (zTabName = pFrom->zAlias)==0 ){ zTabName = pTab->zName; } if( db->mallocFailed ) break; - if( pSub==0 || (pSub->selFlags & SF_NestedFrom)==0 ){ - pSub = 0; + assert( (int)pFrom->fg.isNestedFrom == IsNestedFrom(pFrom->pSelect) ); + if( pFrom->fg.isNestedFrom ){ + assert( pFrom->pSelect!=0 ); + pNestedFrom = pFrom->pSelect->pEList; + assert( pNestedFrom!=0 ); + assert( pNestedFrom->nExpr==pTab->nCol ); + }else{ if( zTName && sqlite3StrICmp(zTName, zTabName)!=0 ){ continue; } + pNestedFrom = 0; iDb = sqlite3SchemaToIndex(db, pTab->pSchema); zSchemaName = iDb>=0 ? db->aDb[iDb].zDbSName : "*"; } + if( i+1nSrc + && pFrom[1].fg.isUsing + && (selFlags & SF_NestedFrom)!=0 + ){ + int ii; + pUsing = pFrom[1].u3.pUsing; + for(ii=0; iinId; ii++){ + const char *zUName = pUsing->a[ii].zName; + pRight = sqlite3Expr(db, TK_ID, zUName); + pNew = sqlite3ExprListAppend(pParse, pNew, pRight); + if( pNew ){ + struct ExprList_item *pX = &pNew->a[pNew->nExpr-1]; + assert( pX->zEName==0 ); + pX->zEName = sqlite3MPrintf(db,"..%s", zUName); + pX->fg.eEName = ENAME_TAB; + pX->fg.bUsingTerm = 1; + } + } + }else{ + pUsing = 0; + } for(j=0; jnCol; j++){ char *zName = pTab->aCol[j].zCnName; - char *zColname; /* The computed column name */ - char *zToFree; /* Malloced string that needs to be freed */ - Token sColname; /* Computed column name as a token */ + struct ExprList_item *pX; /* Newly added ExprList term */ assert( zName ); - if( zTName && pSub - && sqlite3MatchEName(&pSub->pEList->a[j], 0, zTName, 0)==0 + if( zTName + && pNestedFrom + && sqlite3MatchEName(&pNestedFrom->a[j], 0, zTName, 0)==0 ){ continue; } @@ -139971,57 +143323,75 @@ static int selectExpander(Walker *pWalker, Select *p){ ){ continue; } + if( (pTab->aCol[j].colFlags & COLFLAG_NOEXPAND)!=0 + && zTName==0 + && (selFlags & (SF_NestedFrom))==0 + ){ + continue; + } tableSeen = 1; - if( i>0 && zTName==0 ){ - if( (pFrom->fg.jointype & JT_NATURAL)!=0 - && tableAndColumnIndex(pTabList, i, zName, 0, 0, 1) + if( i>0 && zTName==0 && (selFlags & SF_NestedFrom)==0 ){ + if( pFrom->fg.isUsing + && sqlite3IdListIndex(pFrom->u3.pUsing, zName)>=0 ){ - /* In a NATURAL join, omit the join columns from the - ** table to the right of the join */ - continue; - } - if( sqlite3IdListIndex(pFrom->pUsing, zName)>=0 ){ /* In a join with a USING clause, omit columns in the ** using clause from the table on the right. */ continue; } } pRight = sqlite3Expr(db, TK_ID, zName); - zColname = zName; - zToFree = 0; - if( longNames || pTabList->nSrc>1 ){ + if( (pTabList->nSrc>1 + && ( (pFrom->fg.jointype & JT_LTORJ)==0 + || (selFlags & SF_NestedFrom)!=0 + || !inAnyUsingClause(zName,pFrom,pTabList->nSrc-i-1) + ) + ) + || IN_RENAME_OBJECT + ){ Expr *pLeft; pLeft = sqlite3Expr(db, TK_ID, zTabName); pExpr = sqlite3PExpr(pParse, TK_DOT, pLeft, pRight); + if( IN_RENAME_OBJECT && pE->pLeft ){ + sqlite3RenameTokenRemap(pParse, pLeft, pE->pLeft); + } if( zSchemaName ){ pLeft = sqlite3Expr(db, TK_ID, zSchemaName); pExpr = sqlite3PExpr(pParse, TK_DOT, pLeft, pExpr); } - if( longNames ){ - zColname = sqlite3MPrintf(db, "%s.%s", zTabName, zName); - zToFree = zColname; - } }else{ pExpr = pRight; } pNew = sqlite3ExprListAppend(pParse, pNew, pExpr); - sqlite3TokenInit(&sColname, zColname); - sqlite3ExprListSetName(pParse, pNew, &sColname, 0); - if( pNew && (p->selFlags & SF_NestedFrom)!=0 && !IN_RENAME_OBJECT ){ - struct ExprList_item *pX = &pNew->a[pNew->nExpr-1]; - sqlite3DbFree(db, pX->zEName); - if( pSub ){ - pX->zEName = sqlite3DbStrDup(db, pSub->pEList->a[j].zEName); + if( pNew==0 ){ + break; /* OOM */ + } + pX = &pNew->a[pNew->nExpr-1]; + assert( pX->zEName==0 ); + if( (selFlags & SF_NestedFrom)!=0 && !IN_RENAME_OBJECT ){ + if( pNestedFrom ){ + pX->zEName = sqlite3DbStrDup(db, pNestedFrom->a[j].zEName); testcase( pX->zEName==0 ); }else{ pX->zEName = sqlite3MPrintf(db, "%s.%s.%s", - zSchemaName, zTabName, zColname); + zSchemaName, zTabName, zName); testcase( pX->zEName==0 ); } - pX->eEName = ENAME_TAB; + pX->fg.eEName = ENAME_TAB; + if( (pFrom->fg.isUsing + && sqlite3IdListIndex(pFrom->u3.pUsing, zName)>=0) + || (pUsing && sqlite3IdListIndex(pUsing, zName)>=0) + || (pTab->aCol[j].colFlags & COLFLAG_NOEXPAND)!=0 + ){ + pX->fg.bNoExpand = 1; + } + }else if( longNames ){ + pX->zEName = sqlite3MPrintf(db, "%s.%s", zTabName, zName); + pX->fg.eEName = ENAME_NAME; + }else{ + pX->zEName = sqlite3DbStrDup(db, zName); + pX->fg.eEName = ENAME_NAME; } - sqlite3DbFree(db, zToFree); } } if( !tableSeen ){ @@ -140045,6 +143415,12 @@ static int selectExpander(Walker *pWalker, Select *p){ p->selFlags |= SF_ComplexResult; } } +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x100 ){ + SELECTTRACE(0x100,pParse,p,("After result-set wildcard expansion:\n")); + sqlite3TreeViewSelect(0, p, 0); + } +#endif return WRC_Continue; } @@ -140435,8 +143811,8 @@ static void havingToWhere(Parse *pParse, Select *p){ sWalker.xExprCallback = havingToWhereExprCb; sWalker.u.pSelect = p; sqlite3WalkExpr(&sWalker, p->pHaving); -#if SELECTTRACE_ENABLED - if( sWalker.eCode && (sqlite3SelectTrace & 0x100)!=0 ){ +#if TREETRACE_ENABLED + if( sWalker.eCode && (sqlite3TreeTrace & 0x100)!=0 ){ SELECTTRACE(0x100,pParse,p,("Move HAVING terms into WHERE:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -140445,7 +143821,7 @@ static void havingToWhere(Parse *pParse, Select *p){ /* ** Check to see if the pThis entry of pTabList is a self-join of a prior view. -** If it is, then return the SrcList_item for the prior view. If it is not, +** If it is, then return the SrcItem for the prior view. If it is not, ** then return 0. */ static SrcItem *isSelfJoinView( @@ -140568,8 +143944,8 @@ static int countOfViewOptimization(Parse *pParse, Select *p){ p->pEList->a[0].pExpr = pExpr; p->selFlags &= ~SF_Aggregate; -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x400 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x400 ){ SELECTTRACE(0x400,pParse,p,("After count-of-view optimization:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -140578,6 +143954,29 @@ static int countOfViewOptimization(Parse *pParse, Select *p){ } #endif /* SQLITE_COUNTOFVIEW_OPTIMIZATION */ +/* +** If any term of pSrc, or any SF_NestedFrom sub-query, is not the same +** as pSrcItem but has the same alias as p0, then return true. +** Otherwise return false. +*/ +static int sameSrcAlias(SrcItem *p0, SrcList *pSrc){ + int i; + for(i=0; inSrc; i++){ + SrcItem *p1 = &pSrc->a[i]; + if( p1==p0 ) continue; + if( p0->pTab==p1->pTab && 0==sqlite3_stricmp(p0->zAlias, p1->zAlias) ){ + return 1; + } + if( p1->pSelect + && (p1->pSelect->selFlags & SF_NestedFrom)!=0 + && sameSrcAlias(p0, p1->pSelect->pSrc) + ){ + return 1; + } + } + return 0; +} + /* ** Generate code for the SELECT statement given in the p argument. ** @@ -140622,10 +144021,14 @@ SQLITE_PRIVATE int sqlite3Select( } assert( db->mallocFailed==0 ); if( sqlite3AuthCheck(pParse, SQLITE_SELECT, 0, 0, 0) ) return 1; -#if SELECTTRACE_ENABLED +#if TREETRACE_ENABLED SELECTTRACE(1,pParse,p, ("begin processing:\n", pParse->addrExplain)); - if( sqlite3SelectTrace & 0x100 ){ - sqlite3TreeViewSelect(0, p, 0); + if( sqlite3TreeTrace & 0x10100 ){ + if( (sqlite3TreeTrace & 0x10001)==0x10000 ){ + sqlite3TreeViewLine(0, "In sqlite3Select() at %s:%d", + __FILE__, __LINE__); + } + sqlite3ShowSelect(p); } #endif @@ -140639,9 +144042,9 @@ SQLITE_PRIVATE int sqlite3Select( pDest->eDest==SRT_DistQueue || pDest->eDest==SRT_DistFifo ); /* All of these destinations are also able to ignore the ORDER BY clause */ if( p->pOrderBy ){ -#if SELECTTRACE_ENABLED +#if TREETRACE_ENABLED SELECTTRACE(1,pParse,p, ("dropping superfluous ORDER BY:\n")); - if( sqlite3SelectTrace & 0x100 ){ + if( sqlite3TreeTrace & 0x100 ){ sqlite3TreeViewExprList(0, p->pOrderBy, 0, "ORDERBY"); } #endif @@ -140660,8 +144063,8 @@ SQLITE_PRIVATE int sqlite3Select( } assert( db->mallocFailed==0 ); assert( p->pEList!=0 ); -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x104 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x104 ){ SELECTTRACE(0x104,pParse,p, ("after name resolution:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -140678,15 +144081,12 @@ SQLITE_PRIVATE int sqlite3Select( ** disallow it altogether. */ if( p->selFlags & SF_UFSrcCheck ){ SrcItem *p0 = &p->pSrc->a[0]; - for(i=1; ipSrc->nSrc; i++){ - SrcItem *p1 = &p->pSrc->a[i]; - if( p0->pTab==p1->pTab && 0==sqlite3_stricmp(p0->zAlias, p1->zAlias) ){ - sqlite3ErrorMsg(pParse, - "target object/alias may not appear in FROM clause: %s", - p0->zAlias ? p0->zAlias : p0->pTab->zName - ); - goto select_end; - } + if( sameSrcAlias(p0, p->pSrc) ){ + sqlite3ErrorMsg(pParse, + "target object/alias may not appear in FROM clause: %s", + p0->zAlias ? p0->zAlias : p0->pTab->zName + ); + goto select_end; } /* Clear the SF_UFSrcCheck flag. The check has already been performed, @@ -140705,8 +144105,8 @@ SQLITE_PRIVATE int sqlite3Select( assert( pParse->nErr ); goto select_end; } -#if SELECTTRACE_ENABLED - if( p->pWin && (sqlite3SelectTrace & 0x108)!=0 ){ +#if TREETRACE_ENABLED + if( p->pWin && (sqlite3TreeTrace & 0x108)!=0 ){ SELECTTRACE(0x104,pParse,p, ("after window rewrite:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -140734,14 +144134,16 @@ SQLITE_PRIVATE int sqlite3Select( /* Convert LEFT JOIN into JOIN if there are terms of the right table ** of the LEFT JOIN used in the WHERE clause. */ - if( (pItem->fg.jointype & JT_LEFT)!=0 + if( (pItem->fg.jointype & (JT_LEFT|JT_RIGHT))==JT_LEFT && sqlite3ExprImpliesNonNullRow(p->pWhere, pItem->iCursor) && OptimizationEnabled(db, SQLITE_SimplifyJoin) ){ SELECTTRACE(0x100,pParse,p, ("LEFT-JOIN simplifies to JOIN on term %d\n",i)); pItem->fg.jointype &= ~(JT_LEFT|JT_OUTER); - unsetJoinExpr(p->pWhere, pItem->iCursor); + assert( pItem->iCursor>=0 ); + unsetJoinExpr(p->pWhere, pItem->iCursor, + pTabList->a[0].fg.jointype & JT_LTORJ); } /* No futher action if this term of the FROM clause is no a subquery */ @@ -140794,7 +144196,9 @@ SQLITE_PRIVATE int sqlite3Select( ){ SELECTTRACE(0x100,pParse,p, ("omit superfluous ORDER BY on %r FROM-clause subquery\n",i+1)); - sqlite3ExprListDelete(db, pSub->pOrderBy); + sqlite3ParserAddCleanup(pParse, + (void(*)(sqlite3*,void*))sqlite3ExprListDelete, + pSub->pOrderBy); pSub->pOrderBy = 0; } @@ -140820,7 +144224,7 @@ SQLITE_PRIVATE int sqlite3Select( && i==0 && (p->selFlags & SF_ComplexResult)!=0 && (pTabList->nSrc==1 - || (pTabList->a[1].fg.jointype&(JT_LEFT|JT_CROSS))!=0) + || (pTabList->a[1].fg.jointype&(JT_OUTER|JT_CROSS))!=0) ){ continue; } @@ -140844,9 +144248,9 @@ SQLITE_PRIVATE int sqlite3Select( */ if( p->pPrior ){ rc = multiSelect(pParse, p, pDest); -#if SELECTTRACE_ENABLED +#if TREETRACE_ENABLED SELECTTRACE(0x1,pParse,p,("end compound-select processing\n")); - if( (sqlite3SelectTrace & 0x2000)!=0 && ExplainQueryPlanParent(pParse)==0 ){ + if( (sqlite3TreeTrace & 0x2000)!=0 && ExplainQueryPlanParent(pParse)==0 ){ sqlite3TreeViewSelect(0, p, 0); } #endif @@ -140865,8 +144269,8 @@ SQLITE_PRIVATE int sqlite3Select( && OptimizationEnabled(db, SQLITE_PropagateConst) && propagateConstants(pParse, p) ){ -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x100 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x100 ){ SELECTTRACE(0x100,pParse,p,("After constant propagation:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -140944,8 +144348,8 @@ SQLITE_PRIVATE int sqlite3Select( || (pItem->u2.pCteUse->eM10d!=M10d_Yes && pItem->u2.pCteUse->nUse<2)) && pushDownWhereTerms(pParse, pSub, p->pWhere, pItem) ){ -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x100 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x100 ){ SELECTTRACE(0x100,pParse,p, ("After WHERE-clause push-down into subquery %d:\n", pSub->selId)); sqlite3TreeViewSelect(0, p, 0); @@ -140961,18 +144365,19 @@ SQLITE_PRIVATE int sqlite3Select( /* Generate code to implement the subquery ** - ** The subquery is implemented as a co-routine if: + ** The subquery is implemented as a co-routine if all of the following are + ** true: + ** ** (1) the subquery is guaranteed to be the outer loop (so that ** it does not need to be computed more than once), and ** (2) the subquery is not a CTE that should be materialized - ** - ** TODO: Are there other reasons beside (1) and (2) to use a co-routine - ** implementation? + ** (3) the subquery is not part of a left operand for a RIGHT JOIN */ if( i==0 && (pTabList->nSrc==1 - || (pTabList->a[1].fg.jointype&(JT_LEFT|JT_CROSS))!=0) /* (1) */ - && (pItem->fg.isCte==0 || pItem->u2.pCteUse->eM10d!=M10d_Yes) /* (2) */ + || (pTabList->a[1].fg.jointype&(JT_OUTER|JT_CROSS))!=0) /* (1) */ + && (pItem->fg.isCte==0 || pItem->u2.pCteUse->eM10d!=M10d_Yes) /* (2) */ + && (pTabList->a[0].fg.jointype & JT_LTORJ)==0 /* (3) */ ){ /* Implement a co-routine that will return a single row of the result ** set on each invocation. @@ -141018,11 +144423,11 @@ SQLITE_PRIVATE int sqlite3Select( ** the same view can reuse the materialization. */ int topAddr; int onceAddr = 0; - int retAddr; pItem->regReturn = ++pParse->nMem; - topAddr = sqlite3VdbeAddOp2(v, OP_Integer, 0, pItem->regReturn); + topAddr = sqlite3VdbeAddOp0(v, OP_Goto); pItem->addrFillSub = topAddr+1; + pItem->fg.isMaterialized = 1; if( pItem->fg.isCorrelated==0 ){ /* If the subquery is not correlated and if we are not inside of ** a trigger, then we only need to compute the value of the subquery @@ -141034,12 +144439,15 @@ SQLITE_PRIVATE int sqlite3Select( } sqlite3SelectDestInit(&dest, SRT_EphemTab, pItem->iCursor); ExplainQueryPlan((pParse, 1, "MATERIALIZE %!S", pItem)); + dest.zAffSdst = sqlite3TableAffinityStr(db, pItem->pTab); sqlite3Select(pParse, pSub, &dest); + sqlite3DbFree(db, dest.zAffSdst); + dest.zAffSdst = 0; pItem->pTab->nRowLogEst = pSub->nSelectRow; if( onceAddr ) sqlite3VdbeJumpHere(v, onceAddr); - retAddr = sqlite3VdbeAddOp1(v, OP_Return, pItem->regReturn); + sqlite3VdbeAddOp2(v, OP_Return, pItem->regReturn, topAddr+1); VdbeComment((v, "end %!S", pItem)); - sqlite3VdbeChangeP1(v, topAddr, retAddr); + sqlite3VdbeJumpHere(v, topAddr); sqlite3ClearTempRegCache(pParse); if( pItem->fg.isCte && pItem->fg.isCorrelated==0 ){ CteUse *pCteUse = pItem->u2.pCteUse; @@ -141063,8 +144471,8 @@ SQLITE_PRIVATE int sqlite3Select( pHaving = p->pHaving; sDistinct.isTnct = (p->selFlags & SF_Distinct)!=0; -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x400 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x400 ){ SELECTTRACE(0x400,pParse,p,("After all FROM-clause analysis:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -141098,9 +144506,10 @@ SQLITE_PRIVATE int sqlite3Select( ** the sDistinct.isTnct is still set. Hence, isTnct represents the ** original setting of the SF_Distinct flag, not the current setting */ assert( sDistinct.isTnct ); + sDistinct.isTnct = 2; -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x400 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x400 ){ SELECTTRACE(0x400,pParse,p,("Transform DISTINCT into GROUP BY:\n")); sqlite3TreeViewSelect(0, p, 0); } @@ -141133,6 +144542,18 @@ SQLITE_PRIVATE int sqlite3Select( */ if( pDest->eDest==SRT_EphemTab ){ sqlite3VdbeAddOp2(v, OP_OpenEphemeral, pDest->iSDParm, pEList->nExpr); + if( p->selFlags & SF_NestedFrom ){ + /* Delete or NULL-out result columns that will never be used */ + int ii; + for(ii=pEList->nExpr-1; ii>0 && pEList->a[ii].fg.bUsed==0; ii--){ + sqlite3ExprDelete(db, pEList->a[ii].pExpr); + sqlite3DbFree(db, pEList->a[ii].zEName); + pEList->nExpr--; + } + for(ii=0; iinExpr; ii++){ + if( pEList->a[ii].fg.bUsed==0 ) pEList->a[ii].pExpr->op = TK_NULL; + } + } } /* Set the limiter. @@ -141141,7 +144562,7 @@ SQLITE_PRIVATE int sqlite3Select( if( (p->selFlags & SF_FixedLimit)==0 ){ p->nSelectRow = 320; /* 4 billion rows */ } - computeLimitRegisters(pParse, p, iEnd); + if( p->pLimit ) computeLimitRegisters(pParse, p, iEnd); if( p->iLimit==0 && sSort.addrSortIndex>=0 ){ sqlite3VdbeChangeOpcode(v, sSort.addrSortIndex, OP_SorterOpen); sSort.sortFlags |= SORTFLAG_UseSorter; @@ -141282,8 +144703,9 @@ SQLITE_PRIVATE int sqlite3Select( ** ORDER BY to maximize the chances of rows being delivered in an ** order that makes the ORDER BY redundant. */ for(ii=0; iinExpr; ii++){ - u8 sortFlags = sSort.pOrderBy->a[ii].sortFlags & KEYINFO_ORDER_DESC; - pGroupBy->a[ii].sortFlags = sortFlags; + u8 sortFlags; + sortFlags = sSort.pOrderBy->a[ii].fg.sortFlags & KEYINFO_ORDER_DESC; + pGroupBy->a[ii].fg.sortFlags = sortFlags; } if( sqlite3ExprListCompare(pGroupBy, sSort.pOrderBy, -1)==0 ){ orderByGrp = 1; @@ -141352,8 +144774,8 @@ SQLITE_PRIVATE int sqlite3Select( } pAggInfo->mxReg = pParse->nMem; if( db->mallocFailed ) goto select_end; -#if SELECTTRACE_ENABLED - if( sqlite3SelectTrace & 0x400 ){ +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x400 ){ int ii; SELECTTRACE(0x400,pParse,p,("After aggregate analysis %p:\n", pAggInfo)); sqlite3TreeViewSelect(0, p, 0); @@ -141362,8 +144784,13 @@ SQLITE_PRIVATE int sqlite3Select( sqlite3TreeViewExprList(0, pMinMaxOrderBy, 0, "ORDERBY"); } for(ii=0; iinColumn; ii++){ - sqlite3DebugPrintf("agg-column[%d] iMem=%d\n", - ii, pAggInfo->aCol[ii].iMem); + struct AggInfo_col *pCol = &pAggInfo->aCol[ii]; + sqlite3DebugPrintf( + "agg-column[%d] pTab=%s iTable=%d iColumn=%d iMem=%d" + " iSorterColumn=%d\n", + ii, pCol->pTab ? pCol->pTab->zName : "NULL", + pCol->iTable, pCol->iColumn, pCol->iMem, + pCol->iSorterColumn); sqlite3TreeViewExpr(0, pAggInfo->aCol[ii].pCExpr, 0); } for(ii=0; iinFunc; ii++){ @@ -141441,7 +144868,8 @@ SQLITE_PRIVATE int sqlite3Select( sqlite3VdbeAddOp2(v, OP_Gosub, regReset, addrReset); SELECTTRACE(1,pParse,p,("WhereBegin\n")); pWInfo = sqlite3WhereBegin(pParse, pTabList, pWhere, pGroupBy, pDistinct, - 0, (WHERE_GROUPBY|(orderByGrp ? WHERE_SORTBYGROUP : 0)|distFlag), 0 + p, (sDistinct.isTnct==2 ? WHERE_DISTINCTBY : WHERE_GROUPBY) + | (orderByGrp ? WHERE_SORTBYGROUP : 0) | distFlag, 0 ); if( pWInfo==0 ){ sqlite3ExprListDelete(db, pDistinct); @@ -141483,15 +144911,15 @@ SQLITE_PRIVATE int sqlite3Select( regBase = sqlite3GetTempRange(pParse, nCol); sqlite3ExprCodeExprList(pParse, pGroupBy, regBase, 0, 0); j = nGroupBy; + pAggInfo->directMode = 1; for(i=0; inColumn; i++){ struct AggInfo_col *pCol = &pAggInfo->aCol[i]; if( pCol->iSorterColumn>=j ){ - int r1 = j + regBase; - sqlite3ExprCodeGetColumnOfTable(v, - pCol->pTab, pCol->iTable, pCol->iColumn, r1); + sqlite3ExprCode(pParse, pCol->pCExpr, j + regBase); j++; } } + pAggInfo->directMode = 0; regRecord = sqlite3GetTempReg(pParse); sqlite3VdbeAddOp3(v, OP_MakeRecord, regBase, nCol, regRecord); sqlite3VdbeAddOp2(v, OP_SorterInsert, pAggInfo->sortingIdx, regRecord); @@ -141623,7 +145051,7 @@ SQLITE_PRIVATE int sqlite3Select( VdbeComment((v, "indicate accumulator empty")); sqlite3VdbeAddOp1(v, OP_Return, regReset); - if( eDist!=WHERE_DISTINCT_NOOP ){ + if( distFlag!=0 && eDist!=WHERE_DISTINCT_NOOP ){ struct AggInfo_func *pF = &pAggInfo->aFunc[0]; fixDistinctOpenEph(pParse, eDist, pF->iDistinct, pF->iDistAddr); } @@ -141739,7 +145167,7 @@ SQLITE_PRIVATE int sqlite3Select( SELECTTRACE(1,pParse,p,("WhereBegin\n")); pWInfo = sqlite3WhereBegin(pParse, pTabList, pWhere, pMinMaxOrderBy, - pDistinct, 0, minMaxFlag|distFlag, 0); + pDistinct, p, minMaxFlag|distFlag, 0); if( pWInfo==0 ){ goto select_end; } @@ -141747,8 +145175,10 @@ SQLITE_PRIVATE int sqlite3Select( eDist = sqlite3WhereIsDistinct(pWInfo); updateAccumulator(pParse, regAcc, pAggInfo, eDist); if( eDist!=WHERE_DISTINCT_NOOP ){ - struct AggInfo_func *pF = &pAggInfo->aFunc[0]; - fixDistinctOpenEph(pParse, eDist, pF->iDistinct, pF->iDistAddr); + struct AggInfo_func *pF = pAggInfo->aFunc; + if( pF ){ + fixDistinctOpenEph(pParse, eDist, pF->iDistinct, pF->iDistAddr); + } } if( regAcc ) sqlite3VdbeAddOp2(v, OP_Integer, 1, regAcc); @@ -141815,9 +145245,9 @@ select_end: } #endif -#if SELECTTRACE_ENABLED +#if TREETRACE_ENABLED SELECTTRACE(0x1,pParse,p,("end processing\n")); - if( (sqlite3SelectTrace & 0x2000)!=0 && ExplainQueryPlanParent(pParse)==0 ){ + if( (sqlite3TreeTrace & 0x2000)!=0 && ExplainQueryPlanParent(pParse)==0 ){ sqlite3TreeViewSelect(0, p, 0); } #endif @@ -142082,9 +145512,7 @@ SQLITE_PRIVATE Trigger *sqlite3TriggerList(Parse *pParse, Table *pTab){ Trigger *pList; /* List of triggers to return */ HashElem *p; /* Loop variable for TEMP triggers */ - if( pParse->disableTriggers ){ - return 0; - } + assert( pParse->disableTriggers==0 ); pTmpSchema = pParse->db->aDb[1].pSchema; p = sqliteHashFirst(&pTmpSchema->trigHash); pList = pTab->pTrigger; @@ -142093,15 +145521,14 @@ SQLITE_PRIVATE Trigger *sqlite3TriggerList(Parse *pParse, Table *pTab){ if( pTrig->pTabSchema==pTab->pSchema && pTrig->table && 0==sqlite3StrICmp(pTrig->table, pTab->zName) - && pTrig->pTabSchema!=pTmpSchema + && (pTrig->pTabSchema!=pTmpSchema || pTrig->bReturning) ){ pTrig->pNext = pList; pList = pTrig; - }else if( pTrig->op==TK_RETURNING + }else if( pTrig->op==TK_RETURNING ){ #ifndef SQLITE_OMIT_VIRTUALTABLE - && pParse->db->pVtabCtx==0 + assert( pParse->db->pVtabCtx==0 ); #endif - ){ assert( pParse->bReturning ); assert( &(pParse->u1.pReturning->retTrig) == pTrig ); pTrig->table = pTab->zName; @@ -142384,6 +145811,23 @@ SQLITE_PRIVATE void sqlite3FinishTrigger( Vdbe *v; char *z; + /* If this is a new CREATE TABLE statement, and if shadow tables + ** are read-only, and the trigger makes a change to a shadow table, + ** then raise an error - do not allow the trigger to be created. */ + if( sqlite3ReadOnlyShadowTables(db) ){ + TriggerStep *pStep; + for(pStep=pTrig->step_list; pStep; pStep=pStep->pNext){ + if( pStep->zTarget!=0 + && sqlite3ShadowTableName(db, pStep->zTarget) + ){ + sqlite3ErrorMsg(pParse, + "trigger \"%s\" may not write to shadow table \"%s\"", + pTrig->zName, pStep->zTarget); + goto triggerfinish_cleanup; + } + } + } + /* Make an entry in the sqlite_schema table */ v = sqlite3GetVdbe(pParse); if( v==0 ) goto triggerfinish_cleanup; @@ -142547,7 +145991,7 @@ SQLITE_PRIVATE TriggerStep *sqlite3TriggerInsertStep( SQLITE_PRIVATE TriggerStep *sqlite3TriggerUpdateStep( Parse *pParse, /* Parser */ Token *pTableName, /* Name of the table to be updated */ - SrcList *pFrom, + SrcList *pFrom, /* FROM clause for an UPDATE-FROM, or NULL */ ExprList *pEList, /* The SET clause: list of column and new values */ Expr *pWhere, /* The WHERE clause */ u8 orconf, /* The conflict algorithm. (OE_Abort, OE_Ignore, etc) */ @@ -142760,13 +146204,22 @@ static int checkColumnOverlap(IdList *pIdList, ExprList *pEList){ return 0; } +/* +** Return true if any TEMP triggers exist +*/ +static int tempTriggersExist(sqlite3 *db){ + if( NEVER(db->aDb[1].pSchema==0) ) return 0; + if( sqliteHashFirst(&db->aDb[1].pSchema->trigHash)==0 ) return 0; + return 1; +} + /* ** Return a list of all triggers on table pTab if there exists at least ** one trigger that must be fired when an operation of type 'op' is ** performed on the table, and, if that operation is an UPDATE, if at ** least one of the columns in pChanges is being modified. */ -SQLITE_PRIVATE Trigger *sqlite3TriggersExist( +static SQLITE_NOINLINE Trigger *triggersReallyExist( Parse *pParse, /* Parse context */ Table *pTab, /* The table the contains the triggers */ int op, /* one of TK_DELETE, TK_INSERT, TK_UPDATE */ @@ -142829,6 +146282,22 @@ exit_triggers_exist: } return (mask ? pList : 0); } +SQLITE_PRIVATE Trigger *sqlite3TriggersExist( + Parse *pParse, /* Parse context */ + Table *pTab, /* The table the contains the triggers */ + int op, /* one of TK_DELETE, TK_INSERT, TK_UPDATE */ + ExprList *pChanges, /* Columns that change in an UPDATE statement */ + int *pMask /* OUT: Mask of TRIGGER_BEFORE|TRIGGER_AFTER */ +){ + assert( pTab!=0 ); + if( (pTab->pTrigger==0 && !tempTriggersExist(pParse->db)) + || pParse->disableTriggers + ){ + if( pMask ) *pMask = 0; + return 0; + } + return triggersReallyExist(pParse,pTab,op,pChanges,pMask); +} /* ** Convert the pStep->zTarget string into a SrcList and return a pointer @@ -142858,6 +146327,14 @@ SQLITE_PRIVATE SrcList *sqlite3TriggerStepSrc( } if( pStep->pFrom ){ SrcList *pDup = sqlite3SrcListDup(db, pStep->pFrom, 0); + if( pDup && pDup->nSrc>1 && !IN_RENAME_OBJECT ){ + Select *pSubquery; + Token as; + pSubquery = sqlite3SelectNew(pParse,0,pDup,0,0,0,0,SF_NestedFrom,0); + as.n = 0; + as.z = 0; + pDup = sqlite3SrcListAppendFromTerm(pParse,0,0,0,&as,pSubquery,0); + } pSrc = sqlite3SrcListAppendList(pParse, pSrc, pDup); } }else{ @@ -142913,7 +146390,7 @@ static ExprList *sqlite3ExpandReturning( if( !db->mallocFailed ){ struct ExprList_item *pItem = &pNew->a[pNew->nExpr-1]; pItem->zEName = sqlite3DbStrDup(db, pTab->aCol[jj].zCnName); - pItem->eEName = ENAME_NAME; + pItem->fg.eEName = ENAME_NAME; } } }else{ @@ -142922,7 +146399,7 @@ static ExprList *sqlite3ExpandReturning( if( !db->mallocFailed && ALWAYS(pList->a[i].zEName!=0) ){ struct ExprList_item *pItem = &pNew->a[pNew->nExpr-1]; pItem->zEName = sqlite3DbStrDup(db, pList->a[i].zEName); - pItem->eEName = pList->a[i].eEName; + pItem->fg.eEName = pList->a[i].fg.eEName; } } } @@ -143174,7 +146651,7 @@ static TriggerPrg *codeRowTrigger( sSubParse.zAuthContext = pTrigger->zName; sSubParse.eTriggerOp = pTrigger->op; sSubParse.nQueryLoop = pParse->nQueryLoop; - sSubParse.disableVtab = pParse->disableVtab; + sSubParse.prepFlags = pParse->prepFlags; v = sqlite3GetVdbe(&sSubParse); if( v ){ @@ -143520,11 +146997,14 @@ static void updateVirtualTable( ** it has been converted into REAL. */ SQLITE_PRIVATE void sqlite3ColumnDefault(Vdbe *v, Table *pTab, int i, int iReg){ + Column *pCol; assert( pTab!=0 ); - if( !IsView(pTab) ){ + assert( pTab->nCol>i ); + pCol = &pTab->aCol[i]; + if( pCol->iDflt ){ sqlite3_value *pValue = 0; u8 enc = ENC(sqlite3VdbeDb(v)); - Column *pCol = &pTab->aCol[i]; + assert( !IsView(pTab) ); VdbeComment((v, "%s.%s", pTab->zName, pCol->zCnName)); assert( inCol ); sqlite3ValueFromExpr(sqlite3VdbeDb(v), @@ -143535,7 +147015,7 @@ SQLITE_PRIVATE void sqlite3ColumnDefault(Vdbe *v, Table *pTab, int i, int iReg){ } } #ifndef SQLITE_OMIT_FLOATING_POINT - if( pTab->aCol[i].affinity==SQLITE_AFF_REAL && !IsVirtual(pTab) ){ + if( pCol->affinity==SQLITE_AFF_REAL && !IsVirtual(pTab) ){ sqlite3VdbeAddOp1(v, OP_RealAffinity, iReg); } #endif @@ -143837,6 +147317,14 @@ SQLITE_PRIVATE void sqlite3Update( # define isView 0 #endif +#if TREETRACE_ENABLED + if( sqlite3TreeTrace & 0x10000 ){ + sqlite3TreeViewLine(0, "In sqlite3Update() at %s:%d", __FILE__, __LINE__); + sqlite3TreeViewUpdate(pParse->pWith, pTabList, pChanges, pWhere, + onError, pOrderBy, pLimit, pUpsert, pTrigger); + } +#endif + /* If there was a FROM clause, set nChangeFrom to the number of expressions ** in the change-list. Otherwise, set it to 0. There cannot be a FROM ** clause if this function is being called to generate code for part of @@ -144481,7 +147969,7 @@ SQLITE_PRIVATE void sqlite3Update( }else{ sqlite3VdbeAddOp3(v, OP_NotExists, iDataCur, labelContinue,regOldRowid); } - VdbeCoverageNeverTaken(v); + VdbeCoverage(v); } /* Do FK constraint checks. */ @@ -144967,6 +148455,7 @@ SQLITE_PRIVATE int sqlite3UpsertAnalyzeTarget( if( pIdx->aiColumn[ii]==XN_EXPR ){ assert( pIdx->aColExpr!=0 ); assert( pIdx->aColExpr->nExpr>ii ); + assert( pIdx->bHasExpr ); pExpr = pIdx->aColExpr->a[ii].pExpr; if( pExpr->op!=TK_COLLATE ){ sCol[0].pLeft = pExpr; @@ -145280,6 +148769,7 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( int nDb; /* Number of attached databases */ const char *zDbMain; /* Schema name of database to vacuum */ const char *zOut; /* Name of output file */ + u32 pgflags = PAGER_SYNCHRONOUS_OFF; /* sync flags for output db */ if( !db->autoCommit ){ sqlite3SetString(pzErrMsg, db, "cannot VACUUM from within a transaction"); @@ -145351,12 +148841,17 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( goto end_of_vacuum; } db->mDbFlags |= DBFLAG_VacuumInto; + + /* For a VACUUM INTO, the pager-flags are set to the same values as + ** they are for the database being vacuumed, except that PAGER_CACHESPILL + ** is always set. */ + pgflags = db->aDb[iDb].safety_level | (db->flags & PAGER_FLAGS_MASK); } nRes = sqlite3BtreeGetRequestedReserve(pMain); sqlite3BtreeSetCacheSize(pTemp, db->aDb[iDb].pSchema->cache_size); sqlite3BtreeSetSpillSize(pTemp, sqlite3BtreeSetSpillSize(pMain,0)); - sqlite3BtreeSetPagerFlags(pTemp, PAGER_SYNCHRONOUS_OFF|PAGER_CACHESPILL); + sqlite3BtreeSetPagerFlags(pTemp, pgflags|PAGER_CACHESPILL); /* Begin a transaction and take an exclusive lock on the main database ** file. This is done before the sqlite3BtreeGetPageSize(pMain) call below, @@ -145487,6 +148982,7 @@ SQLITE_PRIVATE SQLITE_NOINLINE int sqlite3RunVacuum( assert( rc==SQLITE_OK ); if( pOut==0 ){ + nRes = sqlite3BtreeGetRequestedReserve(pTemp); rc = sqlite3BtreeSetPageSize(pMain, sqlite3BtreeGetPageSize(pTemp), nRes,1); } @@ -145868,7 +149364,8 @@ SQLITE_PRIVATE void sqlite3VtabUnlockList(sqlite3 *db){ */ SQLITE_PRIVATE void sqlite3VtabClear(sqlite3 *db, Table *p){ assert( IsVirtual(p) ); - if( !db || db->pnBytesFreed==0 ) vtabDisconnectAll(0, p); + assert( db!=0 ); + if( db->pnBytesFreed==0 ) vtabDisconnectAll(0, p); if( p->u.vtab.azArg ){ int i; for(i=0; iu.vtab.nArg; i++){ @@ -146668,7 +150165,7 @@ SQLITE_PRIVATE FuncDef *sqlite3VtabOverloadFunction( if( pExpr->op!=TK_COLUMN ) return pDef; assert( ExprUseYTab(pExpr) ); pTab = pExpr->y.pTab; - if( pTab==0 ) return pDef; + if( NEVER(pTab==0) ) return pDef; if( !IsVirtual(pTab) ) return pDef; pVtab = sqlite3GetVTable(db, pTab)->pVtab; assert( pVtab!=0 ); @@ -146929,6 +150426,28 @@ typedef struct WhereLoopBuilder WhereLoopBuilder; typedef struct WhereScan WhereScan; typedef struct WhereOrCost WhereOrCost; typedef struct WhereOrSet WhereOrSet; +typedef struct WhereMemBlock WhereMemBlock; +typedef struct WhereRightJoin WhereRightJoin; + +/* +** This object is a header on a block of allocated memory that will be +** automatically freed when its WInfo oject is destructed. +*/ +struct WhereMemBlock { + WhereMemBlock *pNext; /* Next block in the chain */ + u64 sz; /* Bytes of space */ +}; + +/* +** Extra information attached to a WhereLevel that is a RIGHT JOIN. +*/ +struct WhereRightJoin { + int iMatch; /* Cursor used to determine prior matched rows */ + int regBloom; /* Bloom filter for iRJMatch */ + int regReturn; /* Return register for the interior subroutine */ + int addrSubrtn; /* Starting address for the interior subroutine */ + int endSubrtn; /* The last opcode in the interior subroutine */ +}; /* ** This object contains information needed to implement a single nested @@ -146962,6 +150481,7 @@ struct WhereLevel { int addrLikeRep; /* LIKE range processing address */ #endif int regFilter; /* Bloom filter */ + WhereRightJoin *pRJ; /* Extra information for RIGHT JOIN */ u8 iFrom; /* Which entry in the FROM clause */ u8 op, p3, p5; /* Opcode, P3 & P5 of the opcode that ends the loop */ int p1, p2; /* Operands of the opcode used to end the loop */ @@ -147252,7 +150772,7 @@ struct WhereAndInfo { ** between VDBE cursor numbers and bits of the bitmasks in WhereTerm. ** ** The VDBE cursor numbers are small integers contained in -** SrcList_item.iCursor and Expr.iTable fields. For any given WHERE +** SrcItem.iCursor and Expr.iTable fields. For any given WHERE ** clause, the cursor numbers might not begin with 0 and they might ** contain gaps in the numbering sequence. But we want to make maximum ** use of the bits in our bitmasks. This structure provides a mapping @@ -147323,20 +150843,6 @@ struct WhereLoopBuilder { # define SQLITE_QUERY_PLANNER_LIMIT_INCR 1000 #endif -/* -** Each instance of this object records a change to a single node -** in an expression tree to cause that node to point to a column -** of an index rather than an expression or a virtual column. All -** such transformations need to be undone at the end of WHERE clause -** processing. -*/ -typedef struct WhereExprMod WhereExprMod; -struct WhereExprMod { - WhereExprMod *pNext; /* Next translation on a list of them all */ - Expr *pExpr; /* The Expr node that was transformed */ - Expr orig; /* Original value of the Expr node */ -}; - /* ** The WHERE clause processing routine has two halves. The ** first part does the start of the WHERE loop and the second @@ -147352,10 +150858,10 @@ struct WhereInfo { SrcList *pTabList; /* List of tables in the join */ ExprList *pOrderBy; /* The ORDER BY clause or NULL */ ExprList *pResultSet; /* Result set of the query */ +#if WHERETRACE_ENABLED Expr *pWhere; /* The complete WHERE clause */ -#ifndef SQLITE_OMIT_VIRTUALTABLE - Select *pLimit; /* Used to access LIMIT expr/registers for vtabs */ #endif + Select *pSelect; /* The entire SELECT statement containing WHERE */ int aiCurOnePass[2]; /* OP_OpenWrite cursors for the ONEPASS opt */ int iContinue; /* Jump here to continue with next record */ int iBreak; /* Jump here to break out of the loop */ @@ -147374,7 +150880,7 @@ struct WhereInfo { int iTop; /* The very beginning of the WHERE loop */ int iEndWhere; /* End of the WHERE clause itself */ WhereLoop *pLoops; /* List of all WhereLoop objects */ - WhereExprMod *pExprMods; /* Expression modifications */ + WhereMemBlock *pMemToFree;/* Memory to free when this object destroyed */ Bitmask revMask; /* Mask of ORDER BY terms that need reversing */ WhereClause sWC; /* Decomposition of the WHERE clause */ WhereMaskSet sMaskSet; /* Map cursor numbers to bitmasks */ @@ -147400,6 +150906,8 @@ SQLITE_PRIVATE WhereTerm *sqlite3WhereFindTerm( u32 op, /* Mask of WO_xx values describing operator */ Index *pIdx /* Must be compatible with this index, if not NULL */ ); +SQLITE_PRIVATE void *sqlite3WhereMalloc(WhereInfo *pWInfo, u64 nByte); +SQLITE_PRIVATE void *sqlite3WhereRealloc(WhereInfo *pWInfo, void *pOld, u64 nByte); /* wherecode.c: */ #ifndef SQLITE_OMIT_EXPLAIN @@ -147436,6 +150944,11 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( WhereLevel *pLevel, /* The current level pointer */ Bitmask notReady /* Which tables are currently available */ ); +SQLITE_PRIVATE SQLITE_NOINLINE void sqlite3WhereRightJoinLoop( + WhereInfo *pWInfo, + int iLevel, + WhereLevel *pLevel +); /* whereexpr.c: */ SQLITE_PRIVATE void sqlite3WhereClauseInit(WhereClause*,WhereInfo*); @@ -147478,8 +150991,9 @@ SQLITE_PRIVATE void sqlite3WhereTabFuncArgs(Parse*, SrcItem*, WhereClause*); #define WO_AND 0x0400 /* Two or more AND-connected terms */ #define WO_EQUIV 0x0800 /* Of the form A==B, both columns */ #define WO_NOOP 0x1000 /* This term does not restrict search space */ +#define WO_ROWVAL 0x2000 /* A row-value term */ -#define WO_ALL 0x1fff /* Mask of all possible WO_* values */ +#define WO_ALL 0x3fff /* Mask of all possible WO_* values */ #define WO_SINGLE 0x01ff /* Mask of all non-compound WO_* values */ /* @@ -147513,6 +151027,7 @@ SQLITE_PRIVATE void sqlite3WhereTabFuncArgs(Parse*, SrcItem*, WhereClause*); #define WHERE_BLOOMFILTER 0x00400000 /* Consider using a Bloom-filter */ #define WHERE_SELFCULL 0x00800000 /* nOut reduced by extra WHERE terms */ #define WHERE_OMIT_OFFSET 0x01000000 /* Set offset counter to zero */ +#define WHERE_VIEWSCAN 0x02000000 /* A full-scan of a VIEW or subquery */ #endif /* !defined(SQLITE_WHEREINT_H) */ @@ -147703,6 +151218,9 @@ SQLITE_PRIVATE int sqlite3WhereExplainOneScan( pLoop->u.vtab.idxNum, pLoop->u.vtab.idxStr); } #endif + if( pItem->fg.jointype & JT_LEFT ){ + sqlite3_str_appendf(&str, " LEFT-JOIN"); + } #ifdef SQLITE_EXPLAIN_ESTIMATED_ROWS if( pLoop->nOut>=10 ){ sqlite3_str_appendf(&str, " (~%llu rows)", @@ -147846,7 +151364,7 @@ static void disableTerm(WhereLevel *pLevel, WhereTerm *pTerm){ int nLoop = 0; assert( pTerm!=0 ); while( (pTerm->wtFlags & TERM_CODED)==0 - && (pLevel->iLeftJoin==0 || ExprHasProperty(pTerm->pExpr, EP_FromJoin)) + && (pLevel->iLeftJoin==0 || ExprHasProperty(pTerm->pExpr, EP_OuterON)) && (pLevel->notReady & pTerm->prereqAll)==0 ){ if( nLoop && (pTerm->wtFlags & TERM_LIKE)!=0 ){ @@ -148107,16 +151625,22 @@ static int codeEqualityTerm( if( !ExprUseXSelect(pX) || pX->x.pSelect->pEList->nExpr==1 ){ eType = sqlite3FindInIndex(pParse, pX, IN_INDEX_LOOP, 0, 0, &iTab); }else{ - sqlite3 *db = pParse->db; - pX = removeUnindexableInClauseTerms(pParse, iEq, pLoop, pX); - - if( !db->mallocFailed ){ - aiMap = (int*)sqlite3DbMallocZero(pParse->db, sizeof(int)*nEq); + Expr *pExpr = pTerm->pExpr; + if( pExpr->iTable==0 || !ExprHasProperty(pExpr, EP_Subrtn) ){ + sqlite3 *db = pParse->db; + pX = removeUnindexableInClauseTerms(pParse, iEq, pLoop, pX); + if( !db->mallocFailed ){ + aiMap = (int*)sqlite3DbMallocZero(pParse->db, sizeof(int)*nEq); + eType = sqlite3FindInIndex(pParse, pX, IN_INDEX_LOOP, 0, aiMap,&iTab); + pExpr->iTable = iTab; + } + sqlite3ExprDelete(db, pX); + }else{ + int n = sqlite3ExprVectorSize(pX->pLeft); + aiMap = (int*)sqlite3DbMallocZero(pParse->db, sizeof(int)*MAX(nEq,n)); eType = sqlite3FindInIndex(pParse, pX, IN_INDEX_LOOP, 0, aiMap, &iTab); - pTerm->pExpr->iTable = iTab; } - sqlite3ExprDelete(db, pX); - pX = pTerm->pExpr; + pX = pExpr; } if( eType==IN_INDEX_INDEX_DESC ){ @@ -148139,8 +151663,9 @@ static int codeEqualityTerm( i = pLevel->u.in.nIn; pLevel->u.in.nIn += nEq; pLevel->u.in.aInLoop = - sqlite3DbReallocOrFree(pParse->db, pLevel->u.in.aInLoop, - sizeof(pLevel->u.in.aInLoop[0])*pLevel->u.in.nIn); + sqlite3WhereRealloc(pTerm->pWC->pWInfo, + pLevel->u.in.aInLoop, + sizeof(pLevel->u.in.aInLoop[0])*pLevel->u.in.nIn); pIn = pLevel->u.in.aInLoop; if( pIn ){ int iMap = 0; /* Index in aiMap[] */ @@ -148382,7 +151907,7 @@ static void whereLikeOptimizationStringFixup( if( pTerm->wtFlags & TERM_LIKEOPT ){ VdbeOp *pOp; assert( pLevel->iLikeRepCntr>0 ); - pOp = sqlite3VdbeGetOp(v, -1); + pOp = sqlite3VdbeGetLastOp(v); assert( pOp!=0 ); assert( pOp->opcode==OP_String8 || pTerm->pWC->pWInfo->pParse->db->mallocFailed ); @@ -148561,8 +152086,8 @@ static void codeCursorHint( */ if( pTabItem->fg.jointype & JT_LEFT ){ Expr *pExpr = pTerm->pExpr; - if( !ExprHasProperty(pExpr, EP_FromJoin) - || pExpr->w.iRightJoinTable!=pTabItem->iCursor + if( !ExprHasProperty(pExpr, EP_OuterON) + || pExpr->w.iJoin!=pTabItem->iCursor ){ sWalker.eCode = 0; sWalker.xExprCallback = codeCursorHintIsOrFunction; @@ -148570,7 +152095,7 @@ static void codeCursorHint( if( sWalker.eCode ) continue; } }else{ - if( ExprHasProperty(pTerm->pExpr, EP_FromJoin) ) continue; + if( ExprHasProperty(pTerm->pExpr, EP_OuterON) ) continue; } /* All terms in pWLoop->aLTerm[] except pEndRange are used to initialize @@ -148618,13 +152143,21 @@ static void codeCursorHint( ** ** OP_DeferredSeek $iCur $iRowid ** +** Which causes a seek on $iCur to the row with rowid $iRowid. +** ** However, if the scan currently being coded is a branch of an OR-loop and -** the statement currently being coded is a SELECT, then P3 of OP_DeferredSeek -** is set to iIdxCur and P4 is set to point to an array of integers -** containing one entry for each column of the table cursor iCur is open -** on. For each table column, if the column is the i'th column of the -** index, then the corresponding array entry is set to (i+1). If the column -** does not appear in the index at all, the array entry is set to 0. +** the statement currently being coded is a SELECT, then additional information +** is added that might allow OP_Column to omit the seek and instead do its +** lookup on the index, thus avoiding an expensive seek operation. To +** enable this optimization, the P3 of OP_DeferredSeek is set to iIdxCur +** and P4 is set to an array of integers containing one entry for each column +** in the table. For each table column, if the column is the i'th +** column of the index, then the corresponding array entry is set to (i+1). +** If the column does not appear in the index at all, the array entry is set +** to 0. The OP_Column opcode can check this array to see if the column it +** wants is in the index and if it is, it will substitute the index cursor +** and column number and continue with those new values, rather than seeking +** the table cursor. */ static void codeDeferredSeek( WhereInfo *pWInfo, /* Where clause context */ @@ -148640,7 +152173,7 @@ static void codeDeferredSeek( pWInfo->bDeferredSeek = 1; sqlite3VdbeAddOp3(v, OP_DeferredSeek, iIdxCur, 0, iCur); - if( (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE) + if( (pWInfo->wctrlFlags & (WHERE_OR_SUBCLAUSE|WHERE_RIGHT_JOIN)) && DbMaskAllZero(sqlite3ParseToplevel(pParse)->writeMask) ){ int i; @@ -148698,144 +152231,6 @@ static void codeExprOrVector(Parse *pParse, Expr *p, int iReg, int nReg){ } } -/* An instance of the IdxExprTrans object carries information about a -** mapping from an expression on table columns into a column in an index -** down through the Walker. -*/ -typedef struct IdxExprTrans { - Expr *pIdxExpr; /* The index expression */ - int iTabCur; /* The cursor of the corresponding table */ - int iIdxCur; /* The cursor for the index */ - int iIdxCol; /* The column for the index */ - int iTabCol; /* The column for the table */ - WhereInfo *pWInfo; /* Complete WHERE clause information */ - sqlite3 *db; /* Database connection (for malloc()) */ -} IdxExprTrans; - -/* -** Preserve pExpr on the WhereETrans list of the WhereInfo. -*/ -static void preserveExpr(IdxExprTrans *pTrans, Expr *pExpr){ - WhereExprMod *pNew; - pNew = sqlite3DbMallocRaw(pTrans->db, sizeof(*pNew)); - if( pNew==0 ) return; - pNew->pNext = pTrans->pWInfo->pExprMods; - pTrans->pWInfo->pExprMods = pNew; - pNew->pExpr = pExpr; - memcpy(&pNew->orig, pExpr, sizeof(*pExpr)); -} - -/* The walker node callback used to transform matching expressions into -** a reference to an index column for an index on an expression. -** -** If pExpr matches, then transform it into a reference to the index column -** that contains the value of pExpr. -*/ -static int whereIndexExprTransNode(Walker *p, Expr *pExpr){ - IdxExprTrans *pX = p->u.pIdxTrans; - if( sqlite3ExprCompare(0, pExpr, pX->pIdxExpr, pX->iTabCur)==0 ){ - pExpr = sqlite3ExprSkipCollate(pExpr); - preserveExpr(pX, pExpr); - pExpr->affExpr = sqlite3ExprAffinity(pExpr); - pExpr->op = TK_COLUMN; - pExpr->iTable = pX->iIdxCur; - pExpr->iColumn = pX->iIdxCol; - testcase( ExprHasProperty(pExpr, EP_Skip) ); - testcase( ExprHasProperty(pExpr, EP_Unlikely) ); - ExprClearProperty(pExpr, EP_Skip|EP_Unlikely|EP_WinFunc|EP_Subrtn); - pExpr->y.pTab = 0; - return WRC_Prune; - }else{ - return WRC_Continue; - } -} - -#ifndef SQLITE_OMIT_GENERATED_COLUMNS -/* A walker node callback that translates a column reference to a table -** into a corresponding column reference of an index. -*/ -static int whereIndexExprTransColumn(Walker *p, Expr *pExpr){ - if( pExpr->op==TK_COLUMN ){ - IdxExprTrans *pX = p->u.pIdxTrans; - if( pExpr->iTable==pX->iTabCur && pExpr->iColumn==pX->iTabCol ){ - assert( ExprUseYTab(pExpr) && pExpr->y.pTab!=0 ); - preserveExpr(pX, pExpr); - pExpr->affExpr = sqlite3TableColumnAffinity(pExpr->y.pTab,pExpr->iColumn); - pExpr->iTable = pX->iIdxCur; - pExpr->iColumn = pX->iIdxCol; - pExpr->y.pTab = 0; - } - } - return WRC_Continue; -} -#endif /* SQLITE_OMIT_GENERATED_COLUMNS */ - -/* -** For an indexes on expression X, locate every instance of expression X -** in pExpr and change that subexpression into a reference to the appropriate -** column of the index. -** -** 2019-10-24: Updated to also translate references to a VIRTUAL column in -** the table into references to the corresponding (stored) column of the -** index. -*/ -static void whereIndexExprTrans( - Index *pIdx, /* The Index */ - int iTabCur, /* Cursor of the table that is being indexed */ - int iIdxCur, /* Cursor of the index itself */ - WhereInfo *pWInfo /* Transform expressions in this WHERE clause */ -){ - int iIdxCol; /* Column number of the index */ - ExprList *aColExpr; /* Expressions that are indexed */ - Table *pTab; - Walker w; - IdxExprTrans x; - aColExpr = pIdx->aColExpr; - if( aColExpr==0 && !pIdx->bHasVCol ){ - /* The index does not reference any expressions or virtual columns - ** so no translations are needed. */ - return; - } - pTab = pIdx->pTable; - memset(&w, 0, sizeof(w)); - w.u.pIdxTrans = &x; - x.iTabCur = iTabCur; - x.iIdxCur = iIdxCur; - x.pWInfo = pWInfo; - x.db = pWInfo->pParse->db; - for(iIdxCol=0; iIdxColnColumn; iIdxCol++){ - i16 iRef = pIdx->aiColumn[iIdxCol]; - if( iRef==XN_EXPR ){ - assert( aColExpr!=0 && aColExpr->a[iIdxCol].pExpr!=0 ); - x.pIdxExpr = aColExpr->a[iIdxCol].pExpr; - if( sqlite3ExprIsConstant(x.pIdxExpr) ) continue; - w.xExprCallback = whereIndexExprTransNode; -#ifndef SQLITE_OMIT_GENERATED_COLUMNS - }else if( iRef>=0 - && (pTab->aCol[iRef].colFlags & COLFLAG_VIRTUAL)!=0 - && ((pTab->aCol[iRef].colFlags & COLFLAG_HASCOLL)==0 - || sqlite3StrICmp(sqlite3ColumnColl(&pTab->aCol[iRef]), - sqlite3StrBINARY)==0) - ){ - /* Check to see if there are direct references to generated columns - ** that are contained in the index. Pulling the generated column - ** out of the index is an optimization only - the main table is always - ** available if the index cannot be used. To avoid unnecessary - ** complication, omit this optimization if the collating sequence for - ** the column is non-standard */ - x.iTabCol = iRef; - w.xExprCallback = whereIndexExprTransColumn; -#endif /* SQLITE_OMIT_GENERATED_COLUMNS */ - }else{ - continue; - } - x.iIdxCol = iIdxCol; - sqlite3WalkExpr(&w, pWInfo->pWhere); - sqlite3WalkExprList(&w, pWInfo->pOrderBy); - sqlite3WalkExprList(&w, pWInfo->pResultSet); - } -} - /* ** The pTruth expression is always true because it is the WHERE clause ** a partial index that is driving a query loop. Look through all of the @@ -148904,6 +152299,8 @@ static SQLITE_NOINLINE void filterPullDown( testcase( pTerm->wtFlags & TERM_VIRTUAL ); regRowid = sqlite3GetTempReg(pParse); regRowid = codeEqualityTerm(pParse, pTerm, pLevel, 0, 0, regRowid); + sqlite3VdbeAddOp2(pParse->pVdbe, OP_MustBeInt, regRowid, addrNxt); + VdbeCoverage(pParse->pVdbe); sqlite3VdbeAddOp4Int(pParse->pVdbe, OP_Filter, pLevel->regFilter, addrNxt, regRowid, 1); VdbeCoverage(pParse->pVdbe); @@ -148996,7 +152393,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( ** initialize a memory cell that records if this table matches any ** row of the left table of the join. */ - assert( (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE) + assert( (pWInfo->wctrlFlags & (WHERE_OR_SUBCLAUSE|WHERE_RIGHT_JOIN)) || pLevel->iFrom>0 || (pTabItem[0].fg.jointype & JT_LEFT)==0 ); if( pLevel->iFrom>0 && (pTabItem[0].fg.jointype & JT_LEFT)!=0 ){ @@ -149007,7 +152404,10 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( /* Compute a safe address to jump to if we discover that the table for ** this loop is empty and can never contribute content. */ - for(j=iLevel; j>0 && pWInfo->a[j].iLeftJoin==0; j--){} + for(j=iLevel; j>0; j--){ + if( pWInfo->a[j].iLeftJoin ) break; + if( pWInfo->a[j].pRJ ) break; + } addrHalt = pWInfo->a[j].addrBrk; /* Special case of a FROM clause subquery implemented as a co-routine */ @@ -149052,9 +152452,9 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( && pLoop->u.vtab.bOmitOffset ){ assert( pTerm->eOperator==WO_AUX ); - assert( pWInfo->pLimit!=0 ); - assert( pWInfo->pLimit->iOffset>0 ); - sqlite3VdbeAddOp2(v, OP_Integer, 0, pWInfo->pLimit->iOffset); + assert( pWInfo->pSelect!=0 ); + assert( pWInfo->pSelect->iOffset>0 ); + sqlite3VdbeAddOp2(v, OP_Integer, 0, pWInfo->pSelect->iOffset); VdbeComment((v,"Zero OFFSET counter")); } } @@ -149162,6 +152562,8 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( if( iRowidReg!=iReleaseReg ) sqlite3ReleaseTempReg(pParse, iReleaseReg); addrNxt = pLevel->addrNxt; if( pLevel->regFilter ){ + sqlite3VdbeAddOp2(v, OP_MustBeInt, iRowidReg, addrNxt); + VdbeCoverage(v); sqlite3VdbeAddOp4Int(v, OP_Filter, pLevel->regFilter, addrNxt, iRowidReg, 1); VdbeCoverage(v); @@ -149513,6 +152915,11 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( ** guess. */ addrSeekScan = sqlite3VdbeAddOp1(v, OP_SeekScan, (pIdx->aiRowLogEst[0]+9)/10); + if( pRangeStart ){ + sqlite3VdbeChangeP5(v, 1); + sqlite3VdbeChangeP2(v, addrSeekScan, sqlite3VdbeCurrentAddr(v)+1); + addrSeekScan = 0; + } VdbeCoverage(v); } sqlite3VdbeAddOp4Int(v, op, iIdxCur, addrNxt, regBase, nConstraint); @@ -149588,8 +152995,8 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( } nConstraint++; } - sqlite3DbFree(db, zStartAff); - sqlite3DbFree(db, zEndAff); + if( zStartAff ) sqlite3DbNNFreeNN(db, zStartAff); + if( zEndAff ) sqlite3DbNNFreeNN(db, zEndAff); /* Top of the loop body */ if( pLevel->p2==0 ) pLevel->p2 = sqlite3VdbeCurrentAddr(v); @@ -149634,7 +153041,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( /* Seek the table cursor, if required */ omitTable = (pLoop->wsFlags & WHERE_IDX_ONLY)!=0 - && (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE)==0; + && (pWInfo->wctrlFlags & (WHERE_OR_SUBCLAUSE|WHERE_RIGHT_JOIN))==0; if( omitTable ){ /* pIdx is a covering index. No need to access the main table. */ }else if( HasRowid(pIdx->pTable) ){ @@ -149651,27 +153058,6 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( } if( pLevel->iLeftJoin==0 ){ - /* If pIdx is an index on one or more expressions, then look through - ** all the expressions in pWInfo and try to transform matching expressions - ** into reference to index columns. Also attempt to translate references - ** to virtual columns in the table into references to (stored) columns - ** of the index. - ** - ** Do not do this for the RHS of a LEFT JOIN. This is because the - ** expression may be evaluated after OP_NullRow has been executed on - ** the cursor. In this case it is important to do the full evaluation, - ** as the result of the expression may not be NULL, even if all table - ** column values are. https://www.sqlite.org/src/info/7fa8049685b50b5a - ** - ** Also, do not do this when processing one index an a multi-index - ** OR clause, since the transformation will become invalid once we - ** move forward to the next index. - ** https://sqlite.org/src/info/4e8e4857d32d401f - */ - if( (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE)==0 ){ - whereIndexExprTrans(pIdx, iCur, iIdxCur, pWInfo); - } - /* If a partial index is driving the loop, try to eliminate WHERE clause ** terms from the query that must be true due to the WHERE clause of ** the partial index. @@ -149687,7 +153073,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( /* The following assert() is not a requirement, merely an observation: ** The OR-optimization doesn't work for the right hand table of ** a LEFT JOIN: */ - assert( (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE)==0 ); + assert( (pWInfo->wctrlFlags & (WHERE_OR_SUBCLAUSE|WHERE_RIGHT_JOIN))==0 ); } /* Record the instruction used to terminate the loop. */ @@ -149784,7 +153170,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( int nNotReady; /* The number of notReady tables */ SrcItem *origSrc; /* Original list of tables */ nNotReady = pWInfo->nLevel - iLevel - 1; - pOrTab = sqlite3StackAllocRaw(db, + pOrTab = sqlite3DbMallocRawNN(db, sizeof(*pOrTab)+ nNotReady*sizeof(pOrTab->a[0])); if( pOrTab==0 ) return notReady; pOrTab->nAlloc = (u8)(nNotReady + 1); @@ -149891,7 +153277,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( Expr *pDelete; /* Local copy of OR clause term */ int jmp1 = 0; /* Address of jump operation */ testcase( (pTabItem[0].fg.jointype & JT_LEFT)!=0 - && !ExprHasProperty(pOrExpr, EP_FromJoin) + && !ExprHasProperty(pOrExpr, EP_OuterON) ); /* See TH3 vtab25.400 and ticket 614b25314c766238 */ pDelete = pOrExpr = sqlite3ExprDup(db, pOrExpr, 0); if( db->mallocFailed ){ @@ -150029,7 +153415,15 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( sqlite3VdbeGoto(v, pLevel->addrBrk); sqlite3VdbeResolveLabel(v, iLoopBody); - if( pWInfo->nLevel>1 ){ sqlite3StackFree(db, pOrTab); } + /* Set the P2 operand of the OP_Return opcode that will end the current + ** loop to point to this spot, which is the top of the next containing + ** loop. The byte-code formatter will use that P2 value as a hint to + ** indent everything in between the this point and the final OP_Return. + ** See tag-20220407a in vdbe.c and shell.c */ + assert( pLevel->op==OP_Return ); + pLevel->p2 = sqlite3VdbeCurrentAddr(v); + + if( pWInfo->nLevel>1 ){ sqlite3DbFreeNN(db, pOrTab); } if( !untestedTerms ) disableTerm(pLevel, pTerm); }else #endif /* SQLITE_OMIT_OR_OPTIMIZATION */ @@ -150091,10 +153485,22 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( } pE = pTerm->pExpr; assert( pE!=0 ); - if( (pTabItem->fg.jointype&JT_LEFT) && !ExprHasProperty(pE,EP_FromJoin) ){ - continue; + if( pTabItem->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT) ){ + if( !ExprHasProperty(pE,EP_OuterON|EP_InnerON) ){ + /* Defer processing WHERE clause constraints until after outer + ** join processing. tag-20220513a */ + continue; + }else if( (pTabItem->fg.jointype & JT_LEFT)==JT_LEFT + && !ExprHasProperty(pE,EP_OuterON) ){ + continue; + }else{ + Bitmask m = sqlite3WhereGetMask(&pWInfo->sMaskSet, pE->w.iJoin); + if( m & pLevel->notReady ){ + /* An ON clause that is not ripe */ + continue; + } + } } - if( iLoop==1 && !sqlite3ExprCoveredByIndex(pE, pLevel->iTabCur, pIdx) ){ iNext = 2; continue; @@ -150153,7 +153559,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( if( (pTerm->eOperator & (WO_EQ|WO_IS))==0 ) continue; if( (pTerm->eOperator & WO_EQUIV)==0 ) continue; if( pTerm->leftCursor!=iCur ) continue; - if( pTabItem->fg.jointype & JT_LEFT ) continue; + if( pTabItem->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT) ) continue; pE = pTerm->pExpr; #ifdef WHERETRACE_ENABLED /* 0x800 */ if( sqlite3WhereTrace & 0x800 ){ @@ -150161,7 +153567,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( sqlite3WhereTermPrint(pTerm, pWC->nTerm-j); } #endif - assert( !ExprHasProperty(pE, EP_FromJoin) ); + assert( !ExprHasProperty(pE, EP_OuterON) ); assert( (pTerm->prereqRight & pLevel->notReady)!=0 ); assert( (pTerm->eOperator & (WO_OR|WO_AND))==0 ); pAlt = sqlite3WhereFindTerm(pWC, iCur, pTerm->u.x.leftColumn, notReady, @@ -150184,6 +153590,47 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( pAlt->wtFlags |= TERM_CODED; } + /* For a RIGHT OUTER JOIN, record the fact that the current row has + ** been matched at least once. + */ + if( pLevel->pRJ ){ + Table *pTab; + int nPk; + int r; + int jmp1 = 0; + WhereRightJoin *pRJ = pLevel->pRJ; + + /* pTab is the right-hand table of the RIGHT JOIN. Generate code that + ** will record that the current row of that table has been matched at + ** least once. This is accomplished by storing the PK for the row in + ** both the iMatch index and the regBloom Bloom filter. + */ + pTab = pWInfo->pTabList->a[pLevel->iFrom].pTab; + if( HasRowid(pTab) ){ + r = sqlite3GetTempRange(pParse, 2); + sqlite3ExprCodeGetColumnOfTable(v, pTab, pLevel->iTabCur, -1, r+1); + nPk = 1; + }else{ + int iPk; + Index *pPk = sqlite3PrimaryKeyIndex(pTab); + nPk = pPk->nKeyCol; + r = sqlite3GetTempRange(pParse, nPk+1); + for(iPk=0; iPkaiColumn[iPk]; + sqlite3ExprCodeGetColumnOfTable(v, pTab, iCur, iCol,r+1+iPk); + } + } + jmp1 = sqlite3VdbeAddOp4Int(v, OP_Found, pRJ->iMatch, 0, r+1, nPk); + VdbeCoverage(v); + VdbeComment((v, "match against %s", pTab->zName)); + sqlite3VdbeAddOp3(v, OP_MakeRecord, r+1, nPk, r); + sqlite3VdbeAddOp4Int(v, OP_IdxInsert, pRJ->iMatch, r, r+1, nPk); + sqlite3VdbeAddOp4Int(v, OP_FilterAdd, pRJ->regBloom, 0, r+1, nPk); + sqlite3VdbeChangeP5(v, OPFLAG_USESEEKRESULT); + sqlite3VdbeJumpHere(v, jmp1); + sqlite3ReleaseTempRange(pParse, r, nPk+1); + } + /* For a LEFT OUTER JOIN, generate code that will record the fact that ** at least one row of the right table has matched the left table. */ @@ -150191,6 +153638,30 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( pLevel->addrFirst = sqlite3VdbeCurrentAddr(v); sqlite3VdbeAddOp2(v, OP_Integer, 1, pLevel->iLeftJoin); VdbeComment((v, "record LEFT JOIN hit")); + if( pLevel->pRJ==0 ){ + goto code_outer_join_constraints; /* WHERE clause constraints */ + } + } + + if( pLevel->pRJ ){ + /* Create a subroutine used to process all interior loops and code + ** of the RIGHT JOIN. During normal operation, the subroutine will + ** be in-line with the rest of the code. But at the end, a separate + ** loop will run that invokes this subroutine for unmatched rows + ** of pTab, with all tables to left begin set to NULL. + */ + WhereRightJoin *pRJ = pLevel->pRJ; + sqlite3VdbeAddOp2(v, OP_BeginSubrtn, 0, pRJ->regReturn); + pRJ->addrSubrtn = sqlite3VdbeCurrentAddr(v); + assert( pParse->withinRJSubrtn < 255 ); + pParse->withinRJSubrtn++; + + /* WHERE clause constraints must be deferred until after outer join + ** row elimination has completed, since WHERE clause constraints apply + ** to the results of the OUTER JOIN. The following loop generates the + ** appropriate WHERE clause constraint checks. tag-20220513a. + */ + code_outer_join_constraints: for(pTerm=pWC->a, j=0; jnBase; j++, pTerm++){ testcase( pTerm->wtFlags & TERM_VIRTUAL ); testcase( pTerm->wtFlags & TERM_CODED ); @@ -150199,6 +153670,7 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( assert( pWInfo->untestedTerms ); continue; } + if( pTabItem->fg.jointype & JT_LTORJ ) continue; assert( pTerm->pExpr ); sqlite3ExprIfFalse(pParse, pTerm->pExpr, addrCont, SQLITE_JUMPIFNULL); pTerm->wtFlags |= TERM_CODED; @@ -150219,6 +153691,96 @@ SQLITE_PRIVATE Bitmask sqlite3WhereCodeOneLoopStart( return pLevel->notReady; } +/* +** Generate the code for the loop that finds all non-matched terms +** for a RIGHT JOIN. +*/ +SQLITE_PRIVATE SQLITE_NOINLINE void sqlite3WhereRightJoinLoop( + WhereInfo *pWInfo, + int iLevel, + WhereLevel *pLevel +){ + Parse *pParse = pWInfo->pParse; + Vdbe *v = pParse->pVdbe; + WhereRightJoin *pRJ = pLevel->pRJ; + Expr *pSubWhere = 0; + WhereClause *pWC = &pWInfo->sWC; + WhereInfo *pSubWInfo; + WhereLoop *pLoop = pLevel->pWLoop; + SrcItem *pTabItem = &pWInfo->pTabList->a[pLevel->iFrom]; + SrcList sFrom; + Bitmask mAll = 0; + int k; + + ExplainQueryPlan((pParse, 1, "RIGHT-JOIN %s", pTabItem->pTab->zName)); + sqlite3VdbeNoJumpsOutsideSubrtn(v, pRJ->addrSubrtn, pRJ->endSubrtn, + pRJ->regReturn); + for(k=0; ka[k].pWLoop->maskSelf; + sqlite3VdbeAddOp1(v, OP_NullRow, pWInfo->a[k].iTabCur); + iIdxCur = pWInfo->a[k].iIdxCur; + if( iIdxCur ){ + sqlite3VdbeAddOp1(v, OP_NullRow, iIdxCur); + } + } + if( (pTabItem->fg.jointype & JT_LTORJ)==0 ){ + mAll |= pLoop->maskSelf; + for(k=0; knTerm; k++){ + WhereTerm *pTerm = &pWC->a[k]; + if( (pTerm->wtFlags & (TERM_VIRTUAL|TERM_SLICE))!=0 + && pTerm->eOperator!=WO_ROWVAL + ){ + break; + } + if( pTerm->prereqAll & ~mAll ) continue; + if( ExprHasProperty(pTerm->pExpr, EP_OuterON|EP_InnerON) ) continue; + pSubWhere = sqlite3ExprAnd(pParse, pSubWhere, + sqlite3ExprDup(pParse->db, pTerm->pExpr, 0)); + } + } + sFrom.nSrc = 1; + sFrom.nAlloc = 1; + memcpy(&sFrom.a[0], pTabItem, sizeof(SrcItem)); + sFrom.a[0].fg.jointype = 0; + assert( pParse->withinRJSubrtn < 100 ); + pParse->withinRJSubrtn++; + pSubWInfo = sqlite3WhereBegin(pParse, &sFrom, pSubWhere, 0, 0, 0, + WHERE_RIGHT_JOIN, 0); + if( pSubWInfo ){ + int iCur = pLevel->iTabCur; + int r = ++pParse->nMem; + int nPk; + int jmp; + int addrCont = sqlite3WhereContinueLabel(pSubWInfo); + Table *pTab = pTabItem->pTab; + if( HasRowid(pTab) ){ + sqlite3ExprCodeGetColumnOfTable(v, pTab, iCur, -1, r); + nPk = 1; + }else{ + int iPk; + Index *pPk = sqlite3PrimaryKeyIndex(pTab); + nPk = pPk->nKeyCol; + pParse->nMem += nPk - 1; + for(iPk=0; iPkaiColumn[iPk]; + sqlite3ExprCodeGetColumnOfTable(v, pTab, iCur, iCol,r+iPk); + } + } + jmp = sqlite3VdbeAddOp4Int(v, OP_Filter, pRJ->regBloom, 0, r, nPk); + VdbeCoverage(v); + sqlite3VdbeAddOp4Int(v, OP_Found, pRJ->iMatch, addrCont, r, nPk); + VdbeCoverage(v); + sqlite3VdbeJumpHere(v, jmp); + sqlite3VdbeAddOp2(v, OP_Gosub, pRJ->regReturn, pRJ->addrSubrtn); + sqlite3WhereEnd(pSubWInfo); + } + sqlite3ExprDelete(pParse->db, pSubWhere); + ExplainQueryPlanPop(pParse); + assert( pParse->withinRJSubrtn>0 ); + pParse->withinRJSubrtn--; +} + /************** End of wherecode.c *******************************************/ /************** Begin file whereexpr.c ***************************************/ /* @@ -150287,7 +153849,7 @@ static int whereClauseInsert(WhereClause *pWC, Expr *p, u16 wtFlags){ if( pWC->nTerm>=pWC->nSlot ){ WhereTerm *pOld = pWC->a; sqlite3 *db = pWC->pWInfo->pParse->db; - pWC->a = sqlite3DbMallocRawNN(db, sizeof(pWC->a[0])*pWC->nSlot*2 ); + pWC->a = sqlite3WhereMalloc(pWC->pWInfo, sizeof(pWC->a[0])*pWC->nSlot*2 ); if( pWC->a==0 ){ if( wtFlags & TERM_DYNAMIC ){ sqlite3ExprDelete(db, p); @@ -150296,10 +153858,7 @@ static int whereClauseInsert(WhereClause *pWC, Expr *p, u16 wtFlags){ return 0; } memcpy(pWC->a, pOld, sizeof(pWC->a[0])*pWC->nTerm); - if( pOld!=pWC->aStatic ){ - sqlite3DbFree(db, pOld); - } - pWC->nSlot = sqlite3DbMallocSize(db, pWC->a)/sizeof(pWC->a[0]); + pWC->nSlot = pWC->nSlot*2; } pTerm = &pWC->a[idx = pWC->nTerm++]; if( (wtFlags & TERM_VIRTUAL)==0 ) pWC->nBase = pWC->nTerm; @@ -150492,7 +154051,7 @@ static int isLikeOrGlob( if( pLeft->op!=TK_COLUMN || sqlite3ExprAffinity(pLeft)!=SQLITE_AFF_TEXT || (ALWAYS( ExprUseYTab(pLeft) ) - && pLeft->y.pTab + && ALWAYS(pLeft->y.pTab) && IsVirtual(pLeft->y.pTab)) /* Might be numeric */ ){ int isNum; @@ -150609,8 +154168,7 @@ static int isAuxiliaryVtabOperator( ** MATCH(expression,vtab_column) */ pCol = pList->a[1].pExpr; - assert( pCol->op!=TK_COLUMN || ExprUseYTab(pCol) ); - testcase( pCol->op==TK_COLUMN && pCol->y.pTab==0 ); + assert( pCol->op!=TK_COLUMN || (ExprUseYTab(pCol) && pCol->y.pTab!=0) ); if( ExprIsVtab(pCol) ){ for(i=0; ia[0].pExpr; assert( pCol->op!=TK_COLUMN || ExprUseYTab(pCol) ); - testcase( pCol->op==TK_COLUMN && pCol->y.pTab==0 ); + assert( pCol->op!=TK_COLUMN || (ExprUseYTab(pCol) && pCol->y.pTab!=0) ); if( ExprIsVtab(pCol) ){ sqlite3_vtab *pVtab; sqlite3_module *pMod; @@ -150660,13 +154218,12 @@ static int isAuxiliaryVtabOperator( int res = 0; Expr *pLeft = pExpr->pLeft; Expr *pRight = pExpr->pRight; - assert( pLeft->op!=TK_COLUMN || ExprUseYTab(pLeft) ); - testcase( pLeft->op==TK_COLUMN && pLeft->y.pTab==0 ); + assert( pLeft->op!=TK_COLUMN || (ExprUseYTab(pLeft) && pLeft->y.pTab!=0) ); if( ExprIsVtab(pLeft) ){ res++; } - assert( pRight==0 || pRight->op!=TK_COLUMN || ExprUseYTab(pRight) ); - testcase( pRight && pRight->op==TK_COLUMN && pRight->y.pTab==0 ); + assert( pRight==0 || pRight->op!=TK_COLUMN + || (ExprUseYTab(pRight) && pRight->y.pTab!=0) ); if( pRight && ExprIsVtab(pRight) ){ res++; SWAP(Expr*, pLeft, pRight); @@ -150687,9 +154244,9 @@ static int isAuxiliaryVtabOperator( ** a join, then transfer the appropriate markings over to derived. */ static void transferJoinMarkings(Expr *pDerived, Expr *pBase){ - if( pDerived ){ - pDerived->flags |= pBase->flags & EP_FromJoin; - pDerived->w.iRightJoinTable = pBase->w.iRightJoinTable; + if( pDerived && ExprHasProperty(pBase, EP_OuterON|EP_InnerON) ){ + pDerived->flags |= pBase->flags & (EP_OuterON|EP_InnerON); + pDerived->w.iJoin = pBase->w.iJoin; } } @@ -151143,7 +154700,7 @@ static int termIsEquivalence(Parse *pParse, Expr *pExpr){ CollSeq *pColl; if( !OptimizationEnabled(pParse->db, SQLITE_Transitive) ) return 0; if( pExpr->op!=TK_EQ && pExpr->op!=TK_IS ) return 0; - if( ExprHasProperty(pExpr, EP_FromJoin) ) return 0; + if( ExprHasProperty(pExpr, EP_OuterON) ) return 0; aff1 = sqlite3ExprAffinity(pExpr->pLeft); aff2 = sqlite3ExprAffinity(pExpr->pRight); if( aff1!=aff2 @@ -151174,7 +154731,9 @@ static Bitmask exprSelectUsage(WhereMaskSet *pMaskSet, Select *pS){ int i; for(i=0; inSrc; i++){ mask |= exprSelectUsage(pMaskSet, pSrc->a[i].pSelect); - mask |= sqlite3WhereExprUsage(pMaskSet, pSrc->a[i].pOn); + if( pSrc->a[i].fg.isUsing==0 ){ + mask |= sqlite3WhereExprUsage(pMaskSet, pSrc->a[i].u3.pOn); + } if( pSrc->a[i].fg.isTabFunc ){ mask |= sqlite3WhereExprListUsage(pMaskSet, pSrc->a[i].u1.pFuncArg); } @@ -151213,6 +154772,7 @@ static SQLITE_NOINLINE int exprMightBeIndexed2( if( pIdx->aColExpr==0 ) continue; for(i=0; inKeyCol; i++){ if( pIdx->aiColumn[i]!=XN_EXPR ) continue; + assert( pIdx->bHasExpr ); if( sqlite3ExprCompareSkip(pExpr, pIdx->aColExpr->a[i].pExpr, iCur)==0 ){ aiCurCol[0] = iCur; aiCurCol[1] = XN_EXPR; @@ -151329,18 +154889,32 @@ static void exprAnalyze( if( prereqAll!=sqlite3WhereExprUsageNN(pMaskSet, pExpr) ){ printf("\n*** Incorrect prereqAll computed for:\n"); sqlite3TreeViewExpr(0,pExpr,0); - abort(); + assert( 0 ); } #endif - if( ExprHasProperty(pExpr, EP_FromJoin) ){ - Bitmask x = sqlite3WhereGetMask(pMaskSet, pExpr->w.iRightJoinTable); - prereqAll |= x; - extraRight = x-1; /* ON clause terms may not be used with an index - ** on left table of a LEFT JOIN. Ticket #3015 */ - if( (prereqAll>>1)>=x ){ - sqlite3ErrorMsg(pParse, "ON clause references tables to its right"); - return; + if( ExprHasProperty(pExpr, EP_OuterON|EP_InnerON) ){ + Bitmask x = sqlite3WhereGetMask(pMaskSet, pExpr->w.iJoin); + if( ExprHasProperty(pExpr, EP_OuterON) ){ + prereqAll |= x; + extraRight = x-1; /* ON clause terms may not be used with an index + ** on left table of a LEFT JOIN. Ticket #3015 */ + if( (prereqAll>>1)>=x ){ + sqlite3ErrorMsg(pParse, "ON clause references tables to its right"); + return; + } + }else if( (prereqAll>>1)>=x ){ + /* The ON clause of an INNER JOIN references a table to its right. + ** Most other SQL database engines raise an error. But SQLite versions + ** 3.0 through 3.38 just put the ON clause constraint into the WHERE + ** clause and carried on. Beginning with 3.39, raise an error only + ** if there is a RIGHT or FULL JOIN in the query. This makes SQLite + ** more like other systems, and also preserves legacy. */ + if( ALWAYS(pSrc->nSrc>0) && (pSrc->a[0].fg.jointype & JT_LTORJ)!=0 ){ + sqlite3ErrorMsg(pParse, "ON clause references tables to its right"); + return; + } + ExprClearProperty(pExpr, EP_InnerON); } } pTerm->prereqAll = prereqAll; @@ -151408,7 +154982,7 @@ static void exprAnalyze( pNew->eOperator = (operatorMask(pDup->op) + eExtraOp) & opMask; }else if( op==TK_ISNULL - && !ExprHasProperty(pExpr,EP_FromJoin) + && !ExprHasProperty(pExpr,EP_OuterON) && 0==sqlite3ExprCanBeNull(pLeft) ){ assert( !ExprHasProperty(pExpr, EP_IntValue) ); @@ -151479,7 +155053,7 @@ static void exprAnalyze( else if( pExpr->op==TK_NOTNULL ){ if( pExpr->pLeft->op==TK_COLUMN && pExpr->pLeft->iColumn>=0 - && !ExprHasProperty(pExpr, EP_FromJoin) + && !ExprHasProperty(pExpr, EP_OuterON) ){ Expr *pNewExpr; Expr *pLeft = pExpr->pLeft; @@ -151627,7 +155201,7 @@ static void exprAnalyze( } pTerm = &pWC->a[idxTerm]; pTerm->wtFlags |= TERM_CODED|TERM_VIRTUAL; /* Disable the original */ - pTerm->eOperator = 0; + pTerm->eOperator = WO_ROWVAL; } /* If there is a vector IN term - e.g. "(a, b) IN (SELECT ...)" - create @@ -151683,9 +155257,9 @@ static void exprAnalyze( Expr *pNewExpr; pNewExpr = sqlite3PExpr(pParse, TK_MATCH, 0, sqlite3ExprDup(db, pRight, 0)); - if( ExprHasProperty(pExpr, EP_FromJoin) && pNewExpr ){ - ExprSetProperty(pNewExpr, EP_FromJoin); - pNewExpr->w.iRightJoinTable = pExpr->w.iRightJoinTable; + if( ExprHasProperty(pExpr, EP_OuterON) && pNewExpr ){ + ExprSetProperty(pNewExpr, EP_OuterON); + pNewExpr->w.iJoin = pExpr->w.iJoin; } idxNew = whereClauseInsert(pWC, pNewExpr, TERM_VIRTUAL|TERM_DYNAMIC); testcase( idxNew==0 ); @@ -151812,9 +155386,9 @@ static void whereAddLimitExpr( ** exist only so that they may be passed to the xBestIndex method of the ** single virtual table in the FROM clause of the SELECT. */ -SQLITE_PRIVATE void sqlite3WhereAddLimit(WhereClause *pWC, Select *p){ - assert( p==0 || (p->pGroupBy==0 && (p->selFlags & SF_Aggregate)==0) ); - if( (p && p->pLimit) /* 1 */ +SQLITE_PRIVATE void SQLITE_NOINLINE sqlite3WhereAddLimit(WhereClause *pWC, Select *p){ + assert( p!=0 && p->pLimit!=0 ); /* 1 -- checked by caller */ + if( p->pGroupBy==0 && (p->selFlags & (SF_Distinct|SF_Aggregate))==0 /* 2 */ && (p->pSrc->nSrc==1 && IsVirtual(p->pSrc->a[0].pTab)) /* 3 */ ){ @@ -151828,7 +155402,7 @@ SQLITE_PRIVATE void sqlite3WhereAddLimit(WhereClause *pWC, Select *p){ /* This term is a vector operation that has been decomposed into ** other, subsequent terms. It can be ignored. See tag-20220128a */ assert( pWC->a[ii].wtFlags & TERM_VIRTUAL ); - assert( pWC->a[ii].eOperator==0 ); + assert( pWC->a[ii].eOperator==WO_ROWVAL ); continue; } if( pWC->a[ii].leftCursor!=iCsr ) return; @@ -151840,7 +155414,7 @@ SQLITE_PRIVATE void sqlite3WhereAddLimit(WhereClause *pWC, Select *p){ Expr *pExpr = pOrderBy->a[ii].pExpr; if( pExpr->op!=TK_COLUMN ) return; if( pExpr->iTable!=iCsr ) return; - if( pOrderBy->a[ii].sortFlags & KEYINFO_ORDER_BIGNULL ) return; + if( pOrderBy->a[ii].fg.sortFlags & KEYINFO_ORDER_BIGNULL ) return; } } @@ -151907,9 +155481,6 @@ SQLITE_PRIVATE void sqlite3WhereClauseClear(WhereClause *pWC){ a++; } } - if( pWC->a!=pWC->aStatic ){ - sqlite3DbFree(db, pWC->a); - } } @@ -152036,6 +155607,7 @@ SQLITE_PRIVATE void sqlite3WhereTabFuncArgs( if( pArgs==0 ) return; for(j=k=0; jnExpr; j++){ Expr *pRhs; + u32 joinType; while( knCol && (pTab->aCol[k].colFlags & COLFLAG_HIDDEN)==0 ){k++;} if( k>=pTab->nCol ){ sqlite3ErrorMsg(pParse, "too many arguments on %s() - max %d", @@ -152052,9 +155624,12 @@ SQLITE_PRIVATE void sqlite3WhereTabFuncArgs( pRhs = sqlite3PExpr(pParse, TK_UPLUS, sqlite3ExprDup(pParse->db, pArgs->a[j].pExpr, 0), 0); pTerm = sqlite3PExpr(pParse, TK_EQ, pColRef, pRhs); - if( pItem->fg.jointype & JT_LEFT ){ - sqlite3SetJoinExpr(pTerm, pItem->iCursor); + if( pItem->fg.jointype & (JT_LEFT|JT_LTORJ) ){ + joinType = EP_OuterON; + }else{ + joinType = EP_InnerON; } + sqlite3SetJoinExpr(pTerm, pItem->iCursor, joinType); whereClauseInsert(pWC, pTerm, TERM_DYNAMIC); } } @@ -152130,7 +155705,7 @@ SQLITE_PRIVATE int sqlite3WhereIsDistinct(WhereInfo *pWInfo){ ** block sorting is required. */ SQLITE_PRIVATE int sqlite3WhereIsOrdered(WhereInfo *pWInfo){ - return pWInfo->nOBSat; + return pWInfo->nOBSat<0 ? 0 : pWInfo->nOBSat; } /* @@ -152165,7 +155740,7 @@ SQLITE_PRIVATE int sqlite3WhereOrderByLimitOptLabel(WhereInfo *pWInfo){ } pInner = &pWInfo->a[pWInfo->nLevel-1]; assert( pInner->addrNxt!=0 ); - return pInner->addrNxt; + return pInner->pRJ ? pWInfo->iContinue : pInner->addrNxt; } /* @@ -152316,6 +155891,30 @@ SQLITE_PRIVATE Bitmask sqlite3WhereGetMask(WhereMaskSet *pMaskSet, int iCursor){ return 0; } +/* Allocate memory that is automatically freed when pWInfo is freed. +*/ +SQLITE_PRIVATE void *sqlite3WhereMalloc(WhereInfo *pWInfo, u64 nByte){ + WhereMemBlock *pBlock; + pBlock = sqlite3DbMallocRawNN(pWInfo->pParse->db, nByte+sizeof(*pBlock)); + if( pBlock ){ + pBlock->pNext = pWInfo->pMemToFree; + pBlock->sz = nByte; + pWInfo->pMemToFree = pBlock; + pBlock++; + } + return (void*)pBlock; +} +SQLITE_PRIVATE void *sqlite3WhereRealloc(WhereInfo *pWInfo, void *pOld, u64 nByte){ + void *pNew = sqlite3WhereMalloc(pWInfo, nByte); + if( pNew && pOld ){ + WhereMemBlock *pOldBlk = (WhereMemBlock*)pOld; + pOldBlk--; + assert( pOldBlk->szsz); + } + return pNew; +} + /* ** Create a new mask for cursor iCursor. ** @@ -152369,7 +155968,7 @@ static WhereTerm *whereScanNext(WhereScan *pScan){ && (iColumn!=XN_EXPR || sqlite3ExprCompareSkip(pTerm->pExpr->pLeft, pScan->pIdxExpr,iCur)==0) - && (pScan->iEquiv<=1 || !ExprHasProperty(pTerm->pExpr, EP_FromJoin)) + && (pScan->iEquiv<=1 || !ExprHasProperty(pTerm->pExpr, EP_OuterON)) ){ if( (pTerm->eOperator & WO_EQUIV)!=0 && pScan->nEquivaiCur) @@ -152721,6 +156320,7 @@ static void translateColumnToCopy( pOp->p1 = pOp->p2 + iRegister; pOp->p2 = pOp->p3; pOp->p3 = 0; + pOp->p5 = 2; /* Cause the MEM_Subtype flag to be cleared */ }else if( pOp->opcode==OP_Rowid ){ pOp->opcode = OP_Sequence; pOp->p1 = iAutoidxCur; @@ -152781,6 +156381,43 @@ static void whereTraceIndexInfoOutputs(sqlite3_index_info *p){ #define whereTraceIndexInfoOutputs(A) #endif +/* +** We know that pSrc is an operand of an outer join. Return true if +** pTerm is a constraint that is compatible with that join. +** +** pTerm must be EP_OuterON if pSrc is the right operand of an +** outer join. pTerm can be either EP_OuterON or EP_InnerON if pSrc +** is the left operand of a RIGHT join. +** +** See https://sqlite.org/forum/forumpost/206d99a16dd9212f +** for an example of a WHERE clause constraints that may not be used on +** the right table of a RIGHT JOIN because the constraint implies a +** not-NULL condition on the left table of the RIGHT JOIN. +*/ +static int constraintCompatibleWithOuterJoin( + const WhereTerm *pTerm, /* WHERE clause term to check */ + const SrcItem *pSrc /* Table we are trying to access */ +){ + assert( (pSrc->fg.jointype&(JT_LEFT|JT_LTORJ|JT_RIGHT))!=0 ); /* By caller */ + testcase( (pSrc->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT))==JT_LEFT ); + testcase( (pSrc->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT))==JT_LTORJ ); + testcase( ExprHasProperty(pTerm->pExpr, EP_OuterON) ) + testcase( ExprHasProperty(pTerm->pExpr, EP_InnerON) ); + if( !ExprHasProperty(pTerm->pExpr, EP_OuterON|EP_InnerON) + || pTerm->pExpr->w.iJoin != pSrc->iCursor + ){ + return 0; + } + if( (pSrc->fg.jointype & (JT_LEFT|JT_RIGHT))!=0 + && ExprHasProperty(pTerm->pExpr, EP_InnerON) + ){ + return 0; + } + return 1; +} + + + #ifndef SQLITE_OMIT_AUTOMATIC_INDEX /* ** Return TRUE if the WHERE clause term pTerm is of a form where it @@ -152795,14 +156432,11 @@ static int termCanDriveIndex( char aff; if( pTerm->leftCursor!=pSrc->iCursor ) return 0; if( (pTerm->eOperator & (WO_EQ|WO_IS))==0 ) return 0; - if( (pSrc->fg.jointype & JT_LEFT) - && !ExprHasProperty(pTerm->pExpr, EP_FromJoin) - && (pTerm->eOperator & WO_IS) + assert( (pSrc->fg.jointype & JT_RIGHT)==0 ); + if( (pSrc->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT))!=0 + && !constraintCompatibleWithOuterJoin(pTerm,pSrc) ){ - /* Cannot use an IS term from the WHERE clause as an index driver for - ** the RHS of a LEFT JOIN. Such a term can only be used if it is from - ** the ON clause. */ - return 0; + return 0; /* See https://sqlite.org/forum/forumpost/51e6959f61 */ } if( (pTerm->prereqRight & notReady)!=0 ) return 0; assert( (pTerm->eOperator & (WO_OR|WO_AND))==0 ); @@ -153143,7 +156777,7 @@ static SQLITE_NOINLINE void sqlite3ConstructBloomFilter( const SrcItem *pTabItem; pLevel = &pWInfo->a[iLevel]; pTabItem = &pWInfo->pTabList->a[pLevel->iFrom]; - if( pTabItem->fg.jointype & JT_LEFT ) continue; + if( pTabItem->fg.jointype & (JT_LEFT|JT_LTORJ) ) continue; pLoop = pLevel->pWLoop; if( NEVER(pLoop==0) ) continue; if( pLoop->prereq & notReady ) continue; @@ -153214,12 +156848,8 @@ static sqlite3_index_info *allocateIndexInfo( assert( (pTerm->eOperator & (WO_OR|WO_AND))==0 ); assert( pTerm->u.x.leftColumn>=XN_ROWID ); assert( pTerm->u.x.leftColumnnCol ); - - /* tag-20191211-002: WHERE-clause constraints are not useful to the - ** right-hand table of a LEFT JOIN. See tag-20191211-001 for the - ** equivalent restriction for ordinary tables. */ - if( (pSrc->fg.jointype & JT_LEFT)!=0 - && !ExprHasProperty(pTerm->pExpr, EP_FromJoin) + if( (pSrc->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT))!=0 + && !constraintCompatibleWithOuterJoin(pTerm,pSrc) ){ continue; } @@ -153244,7 +156874,7 @@ static sqlite3_index_info *allocateIndexInfo( } /* Virtual tables are unable to deal with NULLS FIRST */ - if( pOrderBy->a[i].sortFlags & KEYINFO_ORDER_BIGNULL ) break; + if( pOrderBy->a[i].fg.sortFlags & KEYINFO_ORDER_BIGNULL ) break; /* First case - a direct column references without a COLLATE operator */ if( pExpr->op==TK_COLUMN && pExpr->iTable==pSrc->iCursor ){ @@ -153274,8 +156904,10 @@ static sqlite3_index_info *allocateIndexInfo( } if( i==n ){ nOrderBy = n; - if( (pWInfo->wctrlFlags & (WHERE_GROUPBY|WHERE_DISTINCTBY)) ){ - eDistinct = 1 + ((pWInfo->wctrlFlags & WHERE_DISTINCTBY)!=0); + if( (pWInfo->wctrlFlags & WHERE_DISTINCTBY) ){ + eDistinct = 2 + ((pWInfo->wctrlFlags & WHERE_SORTBYGROUP)!=0); + }else if( pWInfo->wctrlFlags & WHERE_GROUPBY ){ + eDistinct = 1; } } } @@ -153354,7 +156986,7 @@ static sqlite3_index_info *allocateIndexInfo( || (pExpr->op==TK_COLLATE && pExpr->pLeft->op==TK_COLUMN && pExpr->iColumn==pExpr->pLeft->iColumn) ); pIdxOrderBy[j].iColumn = pExpr->iColumn; - pIdxOrderBy[j].desc = pOrderBy->a[i].sortFlags & KEYINFO_ORDER_DESC; + pIdxOrderBy[j].desc = pOrderBy->a[i].fg.sortFlags & KEYINFO_ORDER_DESC; j++; } pIdxInfo->nOrderBy = j; @@ -153458,7 +157090,7 @@ static int whereKeyStats( #endif assert( pRec!=0 ); assert( pIdx->nSample>0 ); - assert( pRec->nField>0 && pRec->nField<=pIdx->nSampleCol ); + assert( pRec->nField>0 ); /* Do a binary search to find the first sample greater than or equal ** to pRec. If pRec contains a single field, the set of samples to search @@ -153504,7 +157136,7 @@ static int whereKeyStats( ** it is extended to two fields. The duplicates that this creates do not ** cause any problems. */ - nField = pRec->nField; + nField = MIN(pRec->nField, pIdx->nSample); iCol = 0; iSample = pIdx->nSample * nField; do{ @@ -153592,7 +157224,7 @@ static int whereKeyStats( ** is larger than all samples in the array. */ tRowcnt iUpper, iGap; if( i>=pIdx->nSample ){ - iUpper = sqlite3LogEstToInt(pIdx->aiRowLogEst[0]); + iUpper = pIdx->nRowEst0; }else{ iUpper = aSample[i].anLt[iCol]; } @@ -154095,7 +157727,7 @@ SQLITE_PRIVATE void sqlite3WhereTermPrint(WhereTerm *pTerm, int iTerm){ memcpy(zType, "....", 5); if( pTerm->wtFlags & TERM_VIRTUAL ) zType[0] = 'V'; if( pTerm->eOperator & WO_EQUIV ) zType[1] = 'E'; - if( ExprHasProperty(pTerm->pExpr, EP_FromJoin) ) zType[2] = 'L'; + if( ExprHasProperty(pTerm->pExpr, EP_OuterON) ) zType[2] = 'L'; if( pTerm->wtFlags & TERM_CODED ) zType[3] = 'C'; if( pTerm->eOperator & WO_SINGLE ){ assert( (pTerm->eOperator & (WO_OR|WO_AND))==0 ); @@ -154221,12 +157853,18 @@ static void whereLoopClearUnion(sqlite3 *db, WhereLoop *p){ } /* -** Deallocate internal memory used by a WhereLoop object +** Deallocate internal memory used by a WhereLoop object. Leave the +** object in an initialized state, as if it had been newly allocated. */ static void whereLoopClear(sqlite3 *db, WhereLoop *p){ - if( p->aLTerm!=p->aLTermSpace ) sqlite3DbFreeNN(db, p->aLTerm); + if( p->aLTerm!=p->aLTermSpace ){ + sqlite3DbFreeNN(db, p->aLTerm); + p->aLTerm = p->aLTermSpace; + p->nLSlot = ArraySize(p->aLTermSpace); + } whereLoopClearUnion(db, p); - whereLoopInit(p); + p->nLTerm = 0; + p->wsFlags = 0; } /* @@ -154250,7 +157888,9 @@ static int whereLoopResize(sqlite3 *db, WhereLoop *p, int n){ */ static int whereLoopXfer(sqlite3 *db, WhereLoop *pTo, WhereLoop *pFrom){ whereLoopClearUnion(db, pTo); - if( whereLoopResize(db, pTo, pFrom->nLTerm) ){ + if( pFrom->nLTerm > pTo->nLSlot + && whereLoopResize(db, pTo, pFrom->nLTerm) + ){ memset(pTo, 0, WHERE_LOOP_XFER_SZ); return SQLITE_NOMEM_BKPT; } @@ -154268,42 +157908,29 @@ static int whereLoopXfer(sqlite3 *db, WhereLoop *pTo, WhereLoop *pFrom){ ** Delete a WhereLoop object */ static void whereLoopDelete(sqlite3 *db, WhereLoop *p){ + assert( db!=0 ); whereLoopClear(db, p); - sqlite3DbFreeNN(db, p); + sqlite3DbNNFreeNN(db, p); } /* ** Free a WhereInfo structure */ static void whereInfoFree(sqlite3 *db, WhereInfo *pWInfo){ - int i; assert( pWInfo!=0 ); - for(i=0; inLevel; i++){ - WhereLevel *pLevel = &pWInfo->a[i]; - if( pLevel->pWLoop && (pLevel->pWLoop->wsFlags & WHERE_IN_ABLE)!=0 ){ - assert( (pLevel->pWLoop->wsFlags & WHERE_MULTI_OR)==0 ); - sqlite3DbFree(db, pLevel->u.in.aInLoop); - } - } + assert( db!=0 ); sqlite3WhereClauseClear(&pWInfo->sWC); while( pWInfo->pLoops ){ WhereLoop *p = pWInfo->pLoops; pWInfo->pLoops = p->pNextLoop; whereLoopDelete(db, p); } - assert( pWInfo->pExprMods==0 ); - sqlite3DbFreeNN(db, pWInfo); -} - -/* Undo all Expr node modifications -*/ -static void whereUndoExprMods(WhereInfo *pWInfo){ - while( pWInfo->pExprMods ){ - WhereExprMod *p = pWInfo->pExprMods; - pWInfo->pExprMods = p->pNext; - memcpy(p->pExpr, &p->orig, sizeof(p->orig)); - sqlite3DbFree(pWInfo->pParse->db, p); + while( pWInfo->pMemToFree ){ + WhereMemBlock *pNext = pWInfo->pMemToFree->pNext; + sqlite3DbNNFreeNN(db, pWInfo->pMemToFree); + pWInfo->pMemToFree = pNext; } + sqlite3DbNNFreeNN(db, pWInfo); } /* @@ -154660,10 +158287,11 @@ static void whereLoopOutputAdjust( ** ** 2022-03-24: Self-culling only applies if either the extra terms ** are straight comparison operators that are non-true with NULL - ** operand, or if the loop is not a LEFT JOIN. + ** operand, or if the loop is not an OUTER JOIN. */ if( (pTerm->eOperator & 0x3f)!=0 - || (pWC->pWInfo->pTabList->a[pLoop->iTab].fg.jointype & JT_LEFT)==0 + || (pWC->pWInfo->pTabList->a[pLoop->iTab].fg.jointype + & (JT_LEFT|JT_LTORJ))==0 ){ pLoop->wsFlags |= WHERE_SELFCULL; } @@ -154869,15 +158497,11 @@ static int whereLoopAddBtreeIndex( ** to mix with a lower range bound from some other source */ if( pTerm->wtFlags & TERM_LIKEOPT && pTerm->eOperator==WO_LT ) continue; - /* tag-20191211-001: Do not allow constraints from the WHERE clause to - ** be used by the right table of a LEFT JOIN. Only constraints in the - ** ON clause are allowed. See tag-20191211-002 for the vtab equivalent. */ - if( (pSrc->fg.jointype & JT_LEFT)!=0 - && !ExprHasProperty(pTerm->pExpr, EP_FromJoin) + if( (pSrc->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT))!=0 + && !constraintCompatibleWithOuterJoin(pTerm,pSrc) ){ continue; } - if( IsUniqueIndex(pProbe) && saved_nEq==pProbe->nKeyCol-1 ){ pBuilder->bldFlags1 |= SQLITE_BLDF1_UNIQUE; }else{ @@ -154888,7 +158512,11 @@ static int whereLoopAddBtreeIndex( pNew->u.btree.nBtm = saved_nBtm; pNew->u.btree.nTop = saved_nTop; pNew->nLTerm = saved_nLTerm; - if( whereLoopResize(db, pNew, pNew->nLTerm+1) ) break; /* OOM */ + if( pNew->nLTerm>=pNew->nLSlot + && whereLoopResize(db, pNew, pNew->nLTerm+1) + ){ + break; /* OOM while trying to enlarge the pNew->aLTerm array */ + } pNew->aLTerm[pNew->nLTerm++] = pTerm; pNew->prereq = (saved_prereq | pTerm->prereqRight) & ~pNew->maskSelf; @@ -154981,38 +158609,39 @@ static int whereLoopAddBtreeIndex( if( scan.iEquiv>1 ) pNew->wsFlags |= WHERE_TRANSCONS; }else if( eOp & WO_ISNULL ){ pNew->wsFlags |= WHERE_COLUMN_NULL; - }else if( eOp & (WO_GT|WO_GE) ){ - testcase( eOp & WO_GT ); - testcase( eOp & WO_GE ); - pNew->wsFlags |= WHERE_COLUMN_RANGE|WHERE_BTM_LIMIT; - pNew->u.btree.nBtm = whereRangeVectorLen( - pParse, pSrc->iCursor, pProbe, saved_nEq, pTerm - ); - pBtm = pTerm; - pTop = 0; - if( pTerm->wtFlags & TERM_LIKEOPT ){ - /* Range constraints that come from the LIKE optimization are - ** always used in pairs. */ - pTop = &pTerm[1]; - assert( (pTop-(pTerm->pWC->a))pWC->nTerm ); - assert( pTop->wtFlags & TERM_LIKEOPT ); - assert( pTop->eOperator==WO_LT ); - if( whereLoopResize(db, pNew, pNew->nLTerm+1) ) break; /* OOM */ - pNew->aLTerm[pNew->nLTerm++] = pTop; - pNew->wsFlags |= WHERE_TOP_LIMIT; - pNew->u.btree.nTop = 1; - } - }else{ - assert( eOp & (WO_LT|WO_LE) ); - testcase( eOp & WO_LT ); - testcase( eOp & WO_LE ); - pNew->wsFlags |= WHERE_COLUMN_RANGE|WHERE_TOP_LIMIT; - pNew->u.btree.nTop = whereRangeVectorLen( + }else{ + int nVecLen = whereRangeVectorLen( pParse, pSrc->iCursor, pProbe, saved_nEq, pTerm ); - pTop = pTerm; - pBtm = (pNew->wsFlags & WHERE_BTM_LIMIT)!=0 ? - pNew->aLTerm[pNew->nLTerm-2] : 0; + if( eOp & (WO_GT|WO_GE) ){ + testcase( eOp & WO_GT ); + testcase( eOp & WO_GE ); + pNew->wsFlags |= WHERE_COLUMN_RANGE|WHERE_BTM_LIMIT; + pNew->u.btree.nBtm = nVecLen; + pBtm = pTerm; + pTop = 0; + if( pTerm->wtFlags & TERM_LIKEOPT ){ + /* Range constraints that come from the LIKE optimization are + ** always used in pairs. */ + pTop = &pTerm[1]; + assert( (pTop-(pTerm->pWC->a))pWC->nTerm ); + assert( pTop->wtFlags & TERM_LIKEOPT ); + assert( pTop->eOperator==WO_LT ); + if( whereLoopResize(db, pNew, pNew->nLTerm+1) ) break; /* OOM */ + pNew->aLTerm[pNew->nLTerm++] = pTop; + pNew->wsFlags |= WHERE_TOP_LIMIT; + pNew->u.btree.nTop = 1; + } + }else{ + assert( eOp & (WO_LT|WO_LE) ); + testcase( eOp & WO_LT ); + testcase( eOp & WO_LE ); + pNew->wsFlags |= WHERE_COLUMN_RANGE|WHERE_TOP_LIMIT; + pNew->u.btree.nTop = nVecLen; + pTop = pTerm; + pBtm = (pNew->wsFlags & WHERE_BTM_LIMIT)!=0 ? + pNew->aLTerm[pNew->nLTerm-2] : 0; + } } /* At this point pNew->nOut is set to the number of rows expected to @@ -155227,23 +158856,26 @@ static int indexMightHelpWithOrderBy( */ static int whereUsablePartialIndex( int iTab, /* The table for which we want an index */ - int isLeft, /* True if iTab is the right table of a LEFT JOIN */ + u8 jointype, /* The JT_* flags on the join */ WhereClause *pWC, /* The WHERE clause of the query */ Expr *pWhere /* The WHERE clause from the partial index */ ){ int i; WhereTerm *pTerm; - Parse *pParse = pWC->pWInfo->pParse; + Parse *pParse; + + if( jointype & JT_LTORJ ) return 0; + pParse = pWC->pWInfo->pParse; while( pWhere->op==TK_AND ){ - if( !whereUsablePartialIndex(iTab,isLeft,pWC,pWhere->pLeft) ) return 0; + if( !whereUsablePartialIndex(iTab,jointype,pWC,pWhere->pLeft) ) return 0; pWhere = pWhere->pRight; } if( pParse->db->flags & SQLITE_EnableQPSG ) pParse = 0; for(i=0, pTerm=pWC->a; inTerm; i++, pTerm++){ Expr *pExpr; pExpr = pTerm->pExpr; - if( (!ExprHasProperty(pExpr, EP_FromJoin) || pExpr->w.iRightJoinTable==iTab) - && (isLeft==0 || ExprHasProperty(pExpr, EP_FromJoin)) + if( (!ExprHasProperty(pExpr, EP_OuterON) || pExpr->w.iJoin==iTab) + && ((jointype & JT_OUTER)==0 || ExprHasProperty(pExpr, EP_OuterON)) && sqlite3ExprImpliesExpr(pParse, pExpr, pWhere, iTab) && (pTerm->wtFlags & TERM_VNULL)==0 ){ @@ -155253,6 +158885,94 @@ static int whereUsablePartialIndex( return 0; } +/* +** Structure passed to the whereIsCoveringIndex Walker callback. +*/ +struct CoveringIndexCheck { + Index *pIdx; /* The index */ + int iTabCur; /* Cursor number for the corresponding table */ +}; + +/* +** Information passed in is pWalk->u.pCovIdxCk. Call is pCk. +** +** If the Expr node references the table with cursor pCk->iTabCur, then +** make sure that column is covered by the index pCk->pIdx. We know that +** all columns less than 63 (really BMS-1) are covered, so we don't need +** to check them. But we do need to check any column at 63 or greater. +** +** If the index does not cover the column, then set pWalk->eCode to +** non-zero and return WRC_Abort to stop the search. +** +** If this node does not disprove that the index can be a covering index, +** then just return WRC_Continue, to continue the search. +*/ +static int whereIsCoveringIndexWalkCallback(Walker *pWalk, Expr *pExpr){ + int i; /* Loop counter */ + const Index *pIdx; /* The index of interest */ + const i16 *aiColumn; /* Columns contained in the index */ + u16 nColumn; /* Number of columns in the index */ + if( pExpr->op!=TK_COLUMN && pExpr->op!=TK_AGG_COLUMN ) return WRC_Continue; + if( pExpr->iColumn<(BMS-1) ) return WRC_Continue; + if( pExpr->iTable!=pWalk->u.pCovIdxCk->iTabCur ) return WRC_Continue; + pIdx = pWalk->u.pCovIdxCk->pIdx; + aiColumn = pIdx->aiColumn; + nColumn = pIdx->nColumn; + for(i=0; iiColumn ) return WRC_Continue; + } + pWalk->eCode = 1; + return WRC_Abort; +} + + +/* +** pIdx is an index that covers all of the low-number columns used by +** pWInfo->pSelect (columns from 0 through 62). But there are columns +** in pWInfo->pSelect beyond 62. This routine tries to answer the question +** of whether pIdx covers *all* columns in the query. +** +** Return 0 if pIdx is a covering index. Return non-zero if pIdx is +** not a covering index or if we are unable to determine if pIdx is a +** covering index. +** +** This routine is an optimization. It is always safe to return non-zero. +** But returning zero when non-zero should have been returned can lead to +** incorrect bytecode and assertion faults. +*/ +static SQLITE_NOINLINE u32 whereIsCoveringIndex( + WhereInfo *pWInfo, /* The WHERE clause context */ + Index *pIdx, /* Index that is being tested */ + int iTabCur /* Cursor for the table being indexed */ +){ + int i; + struct CoveringIndexCheck ck; + Walker w; + if( pWInfo->pSelect==0 ){ + /* We don't have access to the full query, so we cannot check to see + ** if pIdx is covering. Assume it is not. */ + return 1; + } + for(i=0; inColumn; i++){ + if( pIdx->aiColumn[i]>=BMS-1 ) break; + } + if( i>=pIdx->nColumn ){ + /* pIdx does not index any columns greater than 62, but we know from + ** colMask that columns greater than 62 are used, so this is not a + ** covering index */ + return 1; + } + ck.pIdx = pIdx; + ck.iTabCur = iTabCur; + memset(&w, 0, sizeof(w)); + w.xExprCallback = whereIsCoveringIndexWalkCallback; + w.xSelectCallback = sqlite3SelectWalkNoop; + w.u.pCovIdxCk = &ck; + w.eCode = 0; + sqlite3WalkSelect(&w, pWInfo->pSelect); + return w.eCode; +} + /* ** Add all WhereLoop objects for a single table of the join where the table ** is identified by pBuilder->pNew->iTab. That table is guaranteed to be @@ -155352,13 +159072,14 @@ static int whereLoopAddBtree( #ifndef SQLITE_OMIT_AUTOMATIC_INDEX /* Automatic indexes */ if( !pBuilder->pOrSet /* Not part of an OR optimization */ - && (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE)==0 + && (pWInfo->wctrlFlags & (WHERE_RIGHT_JOIN|WHERE_OR_SUBCLAUSE))==0 && (pWInfo->pParse->db->flags & SQLITE_AutoIndex)!=0 && !pSrc->fg.isIndexedBy /* Has no INDEXED BY clause */ && !pSrc->fg.notIndexed /* Has no NOT INDEXED clause */ && HasRowid(pTab) /* Not WITHOUT ROWID table. (FIXME: Why not?) */ && !pSrc->fg.isCorrelated /* Not a correlated subquery */ && !pSrc->fg.isRecursive /* Not a recursive common table expression. */ + && (pSrc->fg.jointype & JT_RIGHT)==0 /* Not the right tab of a RIGHT JOIN */ ){ /* Generate auto-index WhereLoops */ LogEst rLogSize; /* Logarithm of the number of rows in the table */ @@ -155408,9 +159129,8 @@ static int whereLoopAddBtree( for(; rc==SQLITE_OK && pProbe; pProbe=(pSrc->fg.isIndexedBy ? 0 : pProbe->pNext), iSortIdx++ ){ - int isLeft = (pSrc->fg.jointype & JT_OUTER)!=0; if( pProbe->pPartIdxWhere!=0 - && !whereUsablePartialIndex(pSrc->iCursor, isLeft, pWC, + && !whereUsablePartialIndex(pSrc->iCursor, pSrc->fg.jointype, pWC, pProbe->pPartIdxWhere) ){ testcase( pNew->iTab!=pSrc->iCursor ); /* See ticket [98d973b8f5] */ @@ -155455,6 +159175,9 @@ static int whereLoopAddBtree( #else pNew->rRun = rSize + 16; #endif + if( IsView(pTab) || (pTab->tabFlags & TF_Ephemeral)!=0 ){ + pNew->wsFlags |= WHERE_VIEWSCAN; + } ApplyCostMultiplier(pNew->rRun, pTab->costMult); whereLoopOutputAdjust(pWC, pNew, rSize); rc = whereLoopInsert(pBuilder, pNew); @@ -155467,6 +159190,9 @@ static int whereLoopAddBtree( m = 0; }else{ m = pSrc->colUsed & pProbe->colNotIdxed; + if( m==TOPBIT ){ + m = whereIsCoveringIndex(pWInfo, pProbe, pSrc->iCursor); + } pNew->wsFlags = (m==0) ? (WHERE_IDX_ONLY|WHERE_INDEXED) : WHERE_INDEXED; } @@ -155518,7 +159244,14 @@ static int whereLoopAddBtree( } ApplyCostMultiplier(pNew->rRun, pTab->costMult); whereLoopOutputAdjust(pWC, pNew, rSize); - rc = whereLoopInsert(pBuilder, pNew); + if( (pSrc->fg.jointype & JT_RIGHT)!=0 && pProbe->aColExpr ){ + /* Do not do an SCAN of a index-on-expression in a RIGHT JOIN + ** because the cursor used to access the index might not be + ** positioned to the correct row during the right-join no-match + ** loop. */ + }else{ + rc = whereLoopInsert(pBuilder, pNew); + } pNew->nOut = rSize; if( rc ) break; } @@ -155693,6 +159426,7 @@ static int whereLoopAddVirtualOne( *pbIn = 1; assert( (mExclude & WO_IN)==0 ); } + assert( pbRetryLimit || !isLimitTerm(pTerm) ); if( isLimitTerm(pTerm) && *pbIn ){ /* If there is an IN(...) term handled as an == (separate call to ** xFilter for each value on the RHS of the IN) and a LIMIT or @@ -155840,9 +159574,7 @@ SQLITE_API int sqlite3_vtab_rhs_value( */ SQLITE_API int sqlite3_vtab_distinct(sqlite3_index_info *pIdxInfo){ HiddenIndexInfo *pHidden = (HiddenIndexInfo*)&pIdxInfo[1]; - assert( pHidden->eDistinct==0 - || pHidden->eDistinct==1 - || pHidden->eDistinct==2 ); + assert( pHidden->eDistinct>=0 && pHidden->eDistinct<=3 ); return pHidden->eDistinct; } @@ -155850,15 +159582,26 @@ SQLITE_API int sqlite3_vtab_distinct(sqlite3_index_info *pIdxInfo){ && !defined(SQLITE_OMIT_VIRTUALTABLE) /* ** Cause the prepared statement that is associated with a call to -** xBestIndex to open write transactions on all attached schemas. +** xBestIndex to potentiall use all schemas. If the statement being +** prepared is read-only, then just start read transactions on all +** schemas. But if this is a write operation, start writes on all +** schemas. +** ** This is used by the (built-in) sqlite_dbpage virtual table. */ -SQLITE_PRIVATE void sqlite3VtabWriteAll(sqlite3_index_info *pIdxInfo){ +SQLITE_PRIVATE void sqlite3VtabUsesAllSchemas(sqlite3_index_info *pIdxInfo){ HiddenIndexInfo *pHidden = (HiddenIndexInfo*)&pIdxInfo[1]; Parse *pParse = pHidden->pParse; int nDb = pParse->db->nDb; int i; - for(i=0; iwriteMask ){ + for(i=0; ipTabList->a + pNew->iTab; iCur = pItem->iCursor; + /* The multi-index OR optimization does not work for RIGHT and FULL JOIN */ + if( pItem->fg.jointype & JT_RIGHT ) return SQLITE_OK; + for(pTerm=pWC->a; pTermeOperator & WO_OR)!=0 && (pTerm->u.pOrInfo->indexable & pNew->maskSelf)!=0 @@ -156154,29 +159900,50 @@ static int whereLoopAddAll(WhereLoopBuilder *pBuilder){ SrcItem *pEnd = &pTabList->a[pWInfo->nLevel]; sqlite3 *db = pWInfo->pParse->db; int rc = SQLITE_OK; + int bFirstPastRJ = 0; + int hasRightJoin = 0; WhereLoop *pNew; + /* Loop over the tables in the join, from left to right */ pNew = pBuilder->pNew; - whereLoopInit(pNew); + + /* Verify that pNew has already been initialized */ + assert( pNew->nLTerm==0 ); + assert( pNew->wsFlags==0 ); + assert( pNew->nLSlot>=ArraySize(pNew->aLTermSpace) ); + assert( pNew->aLTerm!=0 ); + pBuilder->iPlanLimit = SQLITE_QUERY_PLANNER_LIMIT; for(iTab=0, pItem=pTabList->a; pItemiTab = iTab; pBuilder->iPlanLimit += SQLITE_QUERY_PLANNER_LIMIT_INCR; pNew->maskSelf = sqlite3WhereGetMask(&pWInfo->sMaskSet, pItem->iCursor); - if( (pItem->fg.jointype & (JT_LEFT|JT_CROSS))!=0 ){ - /* This condition is true when pItem is the FROM clause term on the - ** right-hand-side of a LEFT or CROSS JOIN. */ - mPrereq = mPrior; - }else{ + if( bFirstPastRJ + || (pItem->fg.jointype & (JT_OUTER|JT_CROSS|JT_LTORJ))!=0 + ){ + /* Add prerequisites to prevent reordering of FROM clause terms + ** across CROSS joins and outer joins. The bFirstPastRJ boolean + ** prevents the right operand of a RIGHT JOIN from being swapped with + ** other elements even further to the right. + ** + ** The JT_LTORJ case and the hasRightJoin flag work together to + ** prevent FROM-clause terms from moving from the right side of + ** a LEFT JOIN over to the left side of that join if the LEFT JOIN + ** is itself on the left side of a RIGHT JOIN. + */ + if( pItem->fg.jointype & JT_LTORJ ) hasRightJoin = 1; + mPrereq |= mPrior; + bFirstPastRJ = (pItem->fg.jointype & JT_RIGHT)!=0; + }else if( !hasRightJoin ){ mPrereq = 0; } #ifndef SQLITE_OMIT_VIRTUALTABLE if( IsVirtual(pItem->pTab) ){ SrcItem *p; for(p=&pItem[1]; pfg.jointype & (JT_LEFT|JT_CROSS)) ){ + if( mUnusable || (p->fg.jointype & (JT_OUTER|JT_CROSS)) ){ mUnusable |= sqlite3WhereGetMask(&pWInfo->sMaskSet, p->iCursor); } } @@ -156301,7 +160068,9 @@ static i8 wherePathSatisfiesOrderBy( pLoop = pLast; } if( pLoop->wsFlags & WHERE_VIRTUALTABLE ){ - if( pLoop->u.vtab.isOrdered && (wctrlFlags & WHERE_DISTINCTBY)==0 ){ + if( pLoop->u.vtab.isOrdered + && ((wctrlFlags&(WHERE_DISTINCTBY|WHERE_SORTBYGROUP))!=WHERE_DISTINCTBY) + ){ obSat = obDone; } break; @@ -156479,16 +160248,18 @@ static i8 wherePathSatisfiesOrderBy( /* Make sure the sort order is compatible in an ORDER BY clause. ** Sort order is irrelevant for a GROUP BY clause. */ if( revSet ){ - if( (rev ^ revIdx)!=(pOrderBy->a[i].sortFlags&KEYINFO_ORDER_DESC) ){ + if( (rev ^ revIdx) + != (pOrderBy->a[i].fg.sortFlags&KEYINFO_ORDER_DESC) + ){ isMatch = 0; } }else{ - rev = revIdx ^ (pOrderBy->a[i].sortFlags & KEYINFO_ORDER_DESC); + rev = revIdx ^ (pOrderBy->a[i].fg.sortFlags & KEYINFO_ORDER_DESC); if( rev ) *pRevMask |= MASKBIT(iLoop); revSet = 1; } } - if( isMatch && (pOrderBy->a[i].sortFlags & KEYINFO_ORDER_BIGNULL) ){ + if( isMatch && (pOrderBy->a[i].fg.sortFlags & KEYINFO_ORDER_BIGNULL) ){ if( j==pLoop->u.btree.nEq ){ pLoop->wsFlags |= WHERE_BIGNULL_SORT; }else{ @@ -156568,7 +160339,7 @@ static i8 wherePathSatisfiesOrderBy( ** SELECT * FROM t1 GROUP BY y,x ORDER BY y,x; -- IsSorted()==0 */ SQLITE_PRIVATE int sqlite3WhereIsSorted(WhereInfo *pWInfo){ - assert( pWInfo->wctrlFlags & WHERE_GROUPBY ); + assert( pWInfo->wctrlFlags & (WHERE_GROUPBY|WHERE_DISTINCTBY) ); assert( pWInfo->wctrlFlags & WHERE_SORTBYGROUP ); return pWInfo->sorted; } @@ -156647,7 +160418,6 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ int mxChoice; /* Maximum number of simultaneous paths tracked */ int nLoop; /* Number of terms in the join */ Parse *pParse; /* Parsing context */ - sqlite3 *db; /* The database connection */ int iLoop; /* Loop counter over the terms of the join */ int ii, jj; /* Loop counters */ int mxI = 0; /* Index of next entry to replace */ @@ -156666,7 +160436,6 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ int nSpace; /* Bytes of space allocated at pSpace */ pParse = pWInfo->pParse; - db = pParse->db; nLoop = pWInfo->nLevel; /* TUNING: For simple queries, only the best path is tracked. ** For 2-way joins, the 5 best paths are followed. @@ -156689,7 +160458,7 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ /* Allocate and initialize space for aTo, aFrom and aSortCost[] */ nSpace = (sizeof(WherePath)+sizeof(WhereLoop*)*nLoop)*mxChoice*2; nSpace += sizeof(LogEst) * nOrderBy; - pSpace = sqlite3DbMallocRawNN(db, nSpace); + pSpace = sqlite3StackAllocRawNN(pParse->db, nSpace); if( pSpace==0 ) return SQLITE_NOMEM_BKPT; aTo = (WherePath*)pSpace; aFrom = aTo+mxChoice; @@ -156739,9 +160508,9 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ LogEst nOut; /* Rows visited by (pFrom+pWLoop) */ LogEst rCost; /* Cost of path (pFrom+pWLoop) */ LogEst rUnsorted; /* Unsorted cost of (pFrom+pWLoop) */ - i8 isOrdered = pFrom->isOrdered; /* isOrdered for (pFrom+pWLoop) */ + i8 isOrdered; /* isOrdered for (pFrom+pWLoop) */ Bitmask maskNew; /* Mask of src visited by (..) */ - Bitmask revMask = 0; /* Mask of rev-order loops for (..) */ + Bitmask revMask; /* Mask of rev-order loops for (..) */ if( (pWLoop->prereq & ~pFrom->maskLoop)!=0 ) continue; if( (pWLoop->maskSelf & pFrom->maskLoop)!=0 ) continue; @@ -156760,7 +160529,9 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ rUnsorted = sqlite3LogEstAdd(rUnsorted, pFrom->rUnsorted); nOut = pFrom->nRow + pWLoop->nOut; maskNew = pFrom->maskLoop | pWLoop->maskSelf; + isOrdered = pFrom->isOrdered; if( isOrdered<0 ){ + revMask = 0; isOrdered = wherePathSatisfiesOrderBy(pWInfo, pWInfo->pOrderBy, pFrom, pWInfo->wctrlFlags, iLoop, pWLoop, &revMask); @@ -156788,6 +160559,13 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ rUnsorted -= 2; /* TUNING: Slight bias in favor of no-sort plans */ } + /* TUNING: A full-scan of a VIEW or subquery in the outer loop + ** is not so bad. */ + if( iLoop==0 && (pWLoop->wsFlags & WHERE_VIEWSCAN)!=0 ){ + rCost += -10; + nOut += -30; + } + /* Check to see if pWLoop should be added to the set of ** mxChoice best-so-far paths. ** @@ -156938,7 +160716,7 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ if( nFrom==0 ){ sqlite3ErrorMsg(pParse, "no query solution"); - sqlite3DbFreeNN(db, pSpace); + sqlite3StackFreeNN(pParse->db, pSpace); return SQLITE_ERROR; } @@ -156969,12 +160747,12 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ } pWInfo->bOrderedInnerLoop = 0; if( pWInfo->pOrderBy ){ + pWInfo->nOBSat = pFrom->isOrdered; if( pWInfo->wctrlFlags & WHERE_DISTINCTBY ){ if( pFrom->isOrdered==pWInfo->pOrderBy->nExpr ){ pWInfo->eDistinct = WHERE_DISTINCT_ORDERED; } }else{ - pWInfo->nOBSat = pFrom->isOrdered; pWInfo->revMask = pFrom->revLoop; if( pWInfo->nOBSat<=0 ){ pWInfo->nOBSat = 0; @@ -157020,7 +160798,7 @@ static int wherePathSolver(WhereInfo *pWInfo, LogEst nRowEst){ pWInfo->nRowOut = pFrom->nRow; /* Free temporary memory and return success */ - sqlite3DbFreeNN(db, pSpace); + sqlite3StackFreeNN(pParse->db, pSpace); return SQLITE_OK; } @@ -157053,7 +160831,11 @@ static int whereShortCut(WhereLoopBuilder *pBuilder){ pItem = pWInfo->pTabList->a; pTab = pItem->pTab; if( IsVirtual(pTab) ) return 0; - if( pItem->fg.isIndexedBy ) return 0; + if( pItem->fg.isIndexedBy || pItem->fg.notIndexed ){ + testcase( pItem->fg.isIndexedBy ); + testcase( pItem->fg.notIndexed ); + return 0; + } iCur = pItem->iCursor; pWC = &pWInfo->sWC; pLoop = pBuilder->pNew; @@ -157226,7 +161008,7 @@ static SQLITE_NOINLINE Bitmask whereOmitNoopJoin( WhereLoop *pLoop; pLoop = pWInfo->a[i].pWLoop; pItem = &pWInfo->pTabList->a[pLoop->iTab]; - if( (pItem->fg.jointype & JT_LEFT)==0 ) continue; + if( (pItem->fg.jointype & (JT_LEFT|JT_RIGHT))!=JT_LEFT ) continue; if( (pWInfo->wctrlFlags & WHERE_WANT_DISTINCT)==0 && (pLoop->wsFlags & WHERE_ONEROW)==0 ){ @@ -157236,8 +161018,8 @@ static SQLITE_NOINLINE Bitmask whereOmitNoopJoin( pEnd = pWInfo->sWC.a + pWInfo->sWC.nTerm; for(pTerm=pWInfo->sWC.a; pTermprereqAll & pLoop->maskSelf)!=0 ){ - if( !ExprHasProperty(pTerm->pExpr, EP_FromJoin) - || pTerm->pExpr->w.iRightJoinTable!=pItem->iCursor + if( !ExprHasProperty(pTerm->pExpr, EP_OuterON) + || pTerm->pExpr->w.iJoin!=pItem->iCursor ){ break; } @@ -157315,6 +161097,77 @@ static SQLITE_NOINLINE void whereCheckIfBloomFilterIsUseful( } } +/* +** This is an sqlite3ParserAddCleanup() callback that is invoked to +** free the Parse->pIdxExpr list when the Parse object is destroyed. +*/ +static void whereIndexedExprCleanup(sqlite3 *db, void *pObject){ + Parse *pParse = (Parse*)pObject; + while( pParse->pIdxExpr!=0 ){ + IndexedExpr *p = pParse->pIdxExpr; + pParse->pIdxExpr = p->pIENext; + sqlite3ExprDelete(db, p->pExpr); + sqlite3DbFreeNN(db, p); + } +} + +/* +** The index pIdx is used by a query and contains one or more expressions. +** In other words pIdx is an index on an expression. iIdxCur is the cursor +** number for the index and iDataCur is the cursor number for the corresponding +** table. +** +** This routine adds IndexedExpr entries to the Parse->pIdxExpr field for +** each of the expressions in the index so that the expression code generator +** will know to replace occurrences of the indexed expression with +** references to the corresponding column of the index. +*/ +static SQLITE_NOINLINE void whereAddIndexedExpr( + Parse *pParse, /* Add IndexedExpr entries to pParse->pIdxExpr */ + Index *pIdx, /* The index-on-expression that contains the expressions */ + int iIdxCur, /* Cursor number for pIdx */ + SrcItem *pTabItem /* The FROM clause entry for the table */ +){ + int i; + IndexedExpr *p; + Table *pTab; + assert( pIdx->bHasExpr ); + pTab = pIdx->pTable; + for(i=0; inColumn; i++){ + Expr *pExpr; + int j = pIdx->aiColumn[i]; + int bMaybeNullRow; + if( j==XN_EXPR ){ + pExpr = pIdx->aColExpr->a[i].pExpr; + testcase( pTabItem->fg.jointype & JT_LEFT ); + testcase( pTabItem->fg.jointype & JT_RIGHT ); + testcase( pTabItem->fg.jointype & JT_LTORJ ); + bMaybeNullRow = (pTabItem->fg.jointype & (JT_LEFT|JT_LTORJ|JT_RIGHT))!=0; + }else if( j>=0 && (pTab->aCol[j].colFlags & COLFLAG_VIRTUAL)!=0 ){ + pExpr = sqlite3ColumnExpr(pTab, &pTab->aCol[j]); + bMaybeNullRow = 0; + }else{ + continue; + } + if( sqlite3ExprIsConstant(pExpr) ) continue; + p = sqlite3DbMallocRaw(pParse->db, sizeof(IndexedExpr)); + if( p==0 ) break; + p->pIENext = pParse->pIdxExpr; + p->pExpr = sqlite3ExprDup(pParse->db, pExpr, 0); + p->iDataCur = pTabItem->iCursor; + p->iIdxCur = iIdxCur; + p->iIdxCol = i; + p->bMaybeNullRow = bMaybeNullRow; +#ifdef SQLITE_ENABLE_EXPLAIN_COMMENTS + p->zIdxName = pIdx->zName; +#endif + pParse->pIdxExpr = p; + if( p->pIENext==0 ){ + sqlite3ParserAddCleanup(pParse, whereIndexedExprCleanup, pParse); + } + } +} + /* ** Generate the beginning of the loop used for WHERE clause processing. ** The return value is a pointer to an opaque structure that contains @@ -157409,7 +161262,7 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( Expr *pWhere, /* The WHERE clause */ ExprList *pOrderBy, /* An ORDER BY (or GROUP BY) clause, or NULL */ ExprList *pResultSet, /* Query result set. Req'd for DISTINCT */ - Select *pLimit, /* Use this LIMIT/OFFSET clause, if any */ + Select *pSelect, /* The entire SELECT statement */ u16 wctrlFlags, /* The WHERE_* flags defined in sqliteInt.h */ int iAuxArg /* If WHERE_OR_SUBCLAUSE is set, index cursor number ** If WHERE_USE_LIMIT, then the limit amount */ @@ -157468,7 +161321,7 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( ** field (type Bitmask) it must be aligned on an 8-byte boundary on ** some architectures. Hence the ROUND8() below. */ - nByteWInfo = ROUND8(sizeof(WhereInfo)+(nTabList-1)*sizeof(WhereLevel)); + nByteWInfo = ROUND8P(sizeof(WhereInfo)+(nTabList-1)*sizeof(WhereLevel)); pWInfo = sqlite3DbMallocRawNN(db, nByteWInfo + sizeof(WhereLoop)); if( db->mallocFailed ){ sqlite3DbFree(db, pWInfo); @@ -157478,7 +161331,9 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( pWInfo->pParse = pParse; pWInfo->pTabList = pTabList; pWInfo->pOrderBy = pOrderBy; +#if WHERETRACE_ENABLED pWInfo->pWhere = pWhere; +#endif pWInfo->pResultSet = pResultSet; pWInfo->aiCurOnePass[0] = pWInfo->aiCurOnePass[1] = -1; pWInfo->nLevel = nTabList; @@ -157486,9 +161341,7 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( pWInfo->wctrlFlags = wctrlFlags; pWInfo->iLimit = iAuxArg; pWInfo->savedNQueryLoop = pParse->nQueryLoop; -#ifndef SQLITE_OMIT_VIRTUALTABLE - pWInfo->pLimit = pLimit; -#endif + pWInfo->pSelect = pSelect; memset(&pWInfo->nOBSat, 0, offsetof(WhereInfo,sWC) - offsetof(WhereInfo,nOBSat)); memset(&pWInfo->a[0], 0, sizeof(WhereLoop)+nTabList*sizeof(WhereLevel)); @@ -157557,8 +161410,10 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( /* Analyze all of the subexpressions. */ sqlite3WhereExprAnalyze(pTabList, &pWInfo->sWC); - sqlite3WhereAddLimit(&pWInfo->sWC, pLimit); - if( db->mallocFailed ) goto whereBeginError; + if( pSelect && pSelect->pLimit ){ + sqlite3WhereAddLimit(&pWInfo->sWC, pSelect); + } + if( pParse->nErr ) goto whereBeginError; /* Special case: WHERE terms that do not refer to any tables in the join ** (constant expressions). Evaluate each such term, and jump over all the @@ -157790,8 +161645,10 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( /* noop */ }else #endif - if( (pLoop->wsFlags & WHERE_IDX_ONLY)==0 - && (wctrlFlags & WHERE_OR_SUBCLAUSE)==0 ){ + if( ((pLoop->wsFlags & WHERE_IDX_ONLY)==0 + && (wctrlFlags & WHERE_OR_SUBCLAUSE)==0) + || (pTabItem->fg.jointype & (JT_LTORJ|JT_RIGHT))!=0 + ){ int op = OP_OpenRead; if( pWInfo->eOnePass!=ONEPASS_OFF ){ op = OP_OpenWrite; @@ -157858,8 +161715,12 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( op = OP_ReopenIdx; }else{ iIndexCur = pParse->nTab++; + if( pIx->bHasExpr && OptimizationEnabled(db, SQLITE_IndexedExpr) ){ + whereAddIndexedExpr(pParse, pIx, iIndexCur, pTabItem); + } } pLevel->iIdxCur = iIndexCur; + assert( pIx!=0 ); assert( pIx->pSchema==pTab->pSchema ); assert( iIndexCur>=0 ); if( op ){ @@ -157893,6 +161754,37 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( } } if( iDb>=0 ) sqlite3CodeVerifySchema(pParse, iDb); + if( (pTabItem->fg.jointype & JT_RIGHT)!=0 + && (pLevel->pRJ = sqlite3WhereMalloc(pWInfo, sizeof(WhereRightJoin)))!=0 + ){ + WhereRightJoin *pRJ = pLevel->pRJ; + pRJ->iMatch = pParse->nTab++; + pRJ->regBloom = ++pParse->nMem; + sqlite3VdbeAddOp2(v, OP_Blob, 65536, pRJ->regBloom); + pRJ->regReturn = ++pParse->nMem; + sqlite3VdbeAddOp2(v, OP_Null, 0, pRJ->regReturn); + assert( pTab==pTabItem->pTab ); + if( HasRowid(pTab) ){ + KeyInfo *pInfo; + sqlite3VdbeAddOp2(v, OP_OpenEphemeral, pRJ->iMatch, 1); + pInfo = sqlite3KeyInfoAlloc(pParse->db, 1, 0); + if( pInfo ){ + pInfo->aColl[0] = 0; + pInfo->aSortFlags[0] = 0; + sqlite3VdbeAppendP4(v, pInfo, P4_KEYINFO); + } + }else{ + Index *pPk = sqlite3PrimaryKeyIndex(pTab); + sqlite3VdbeAddOp2(v, OP_OpenEphemeral, pRJ->iMatch, pPk->nKeyCol); + sqlite3VdbeSetP4KeyInfo(pParse, pPk); + } + pLoop->wsFlags &= ~WHERE_IDX_ONLY; + /* The nature of RIGHT JOIN processing is such that it messes up + ** the output order. So omit any ORDER BY/GROUP BY elimination + ** optimizations. We need to do an actual sort for RIGHT JOIN. */ + pWInfo->nOBSat = 0; + pWInfo->eDistinct = WHERE_DISTINCT_UNORDERED; + } } pWInfo->iTop = sqlite3VdbeCurrentAddr(v); if( db->mallocFailed ) goto whereBeginError; @@ -157904,9 +161796,20 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( for(ii=0; iinErr ) goto whereBeginError; pLevel = &pWInfo->a[ii]; wsFlags = pLevel->pWLoop->wsFlags; + pSrc = &pTabList->a[pLevel->iFrom]; + if( pSrc->fg.isMaterialized ){ + if( pSrc->fg.isCorrelated ){ + sqlite3VdbeAddOp2(v, OP_Gosub, pSrc->regReturn, pSrc->addrFillSub); + }else{ + int iOnce = sqlite3VdbeAddOp0(v, OP_Once); VdbeCoverage(v); + sqlite3VdbeAddOp2(v, OP_Gosub, pSrc->regReturn, pSrc->addrFillSub); + sqlite3VdbeJumpHere(v, iOnce); + } + } if( (wsFlags & (WHERE_AUTO_INDEX|WHERE_BLOOMFILTER))!=0 ){ if( (wsFlags & WHERE_AUTO_INDEX)!=0 ){ #ifndef SQLITE_OMIT_AUTOMATIC_INDEX @@ -157937,8 +161840,6 @@ SQLITE_PRIVATE WhereInfo *sqlite3WhereBegin( /* Jump here if malloc fails */ whereBeginError: if( pWInfo ){ - testcase( pWInfo->pExprMods!=0 ); - whereUndoExprMods(pWInfo); pParse->nQueryLoop = pWInfo->savedNQueryLoop; whereInfoFree(db, pWInfo); } @@ -157998,6 +161899,7 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ SrcList *pTabList = pWInfo->pTabList; sqlite3 *db = pParse->db; int iEnd = sqlite3VdbeCurrentAddr(v); + int nRJ = 0; /* Generate loop termination code. */ @@ -158005,6 +161907,17 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ for(i=pWInfo->nLevel-1; i>=0; i--){ int addr; pLevel = &pWInfo->a[i]; + if( pLevel->pRJ ){ + /* Terminate the subroutine that forms the interior of the loop of + ** the RIGHT JOIN table */ + WhereRightJoin *pRJ = pLevel->pRJ; + sqlite3VdbeResolveLabel(v, pLevel->addrCont); + pLevel->addrCont = 0; + pRJ->endSubrtn = sqlite3VdbeCurrentAddr(v); + sqlite3VdbeAddOp3(v, OP_Return, pRJ->regReturn, pRJ->addrSubrtn, 1); + VdbeCoverage(v); + nRJ++; + } pLoop = pLevel->pWLoop; if( pLevel->op!=OP_Noop ){ #ifndef SQLITE_DISABLE_SKIPAHEAD_DISTINCT @@ -158032,7 +161945,7 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ } #endif /* SQLITE_DISABLE_SKIPAHEAD_DISTINCT */ /* The common case: Advance to the next row */ - sqlite3VdbeResolveLabel(v, pLevel->addrCont); + if( pLevel->addrCont ) sqlite3VdbeResolveLabel(v, pLevel->addrCont); sqlite3VdbeAddOp3(v, pLevel->op, pLevel->p1, pLevel->p2, pLevel->p3); sqlite3VdbeChangeP5(v, pLevel->p5); VdbeCoverage(v); @@ -158047,7 +161960,7 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ #ifndef SQLITE_DISABLE_SKIPAHEAD_DISTINCT if( addrSeek ) sqlite3VdbeJumpHere(v, addrSeek); #endif - }else{ + }else if( pLevel->addrCont ){ sqlite3VdbeResolveLabel(v, pLevel->addrCont); } if( (pLoop->wsFlags & WHERE_IN_ABLE)!=0 && pLevel->u.in.nIn>0 ){ @@ -158097,6 +162010,10 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ } } sqlite3VdbeResolveLabel(v, pLevel->addrBrk); + if( pLevel->pRJ ){ + sqlite3VdbeAddOp3(v, OP_Return, pLevel->pRJ->regReturn, 0, 1); + VdbeCoverage(v); + } if( pLevel->addrSkip ){ sqlite3VdbeGoto(v, pLevel->addrSkip); VdbeComment((v, "next skip-scan on %s", pLoop->u.btree.pIndex->zName)); @@ -158140,11 +162057,6 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ pWInfo->pTabList->a[pLevel->iFrom].pTab->zName)); } - /* The "break" point is here, just past the end of the outer loop. - ** Set it. - */ - sqlite3VdbeResolveLabel(v, pWInfo->iBreak); - assert( pWInfo->nLevel<=pTabList->nSrc ); for(i=0, pLevel=pWInfo->a; inLevel; i++, pLevel++){ int k, last; @@ -158155,6 +162067,15 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ assert( pTab!=0 ); pLoop = pLevel->pWLoop; + /* Do RIGHT JOIN processing. Generate code that will output the + ** unmatched rows of the right operand of the RIGHT JOIN with + ** all of the columns of the left operand set to NULL. + */ + if( pLevel->pRJ ){ + sqlite3WhereRightJoinLoop(pWInfo, i, pLevel); + continue; + } + /* For a co-routine, change all OP_Column references to the table of ** the co-routine into OP_Copy of result contained in a register. ** OP_Rowid becomes OP_Null. @@ -158166,29 +162087,6 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ continue; } -#ifdef SQLITE_ENABLE_EARLY_CURSOR_CLOSE - /* Close all of the cursors that were opened by sqlite3WhereBegin. - ** Except, do not close cursors that will be reused by the OR optimization - ** (WHERE_OR_SUBCLAUSE). And do not close the OP_OpenWrite cursors - ** created for the ONEPASS optimization. - */ - if( (pTab->tabFlags & TF_Ephemeral)==0 - && !IsView(pTab) - && (pWInfo->wctrlFlags & WHERE_OR_SUBCLAUSE)==0 - ){ - int ws = pLoop->wsFlags; - if( pWInfo->eOnePass==ONEPASS_OFF && (ws & WHERE_IDX_ONLY)==0 ){ - sqlite3VdbeAddOp1(v, OP_Close, pTabItem->iCursor); - } - if( (ws & WHERE_INDEXED)!=0 - && (ws & (WHERE_IPK|WHERE_AUTO_INDEX))==0 - && pLevel->iIdxCur!=pWInfo->aiCurOnePass[1] - ){ - sqlite3VdbeAddOp1(v, OP_Close, pLevel->iIdxCur); - } - } -#endif - /* If this scan uses an index, make VDBE code substitutions to read data ** from the index instead of from the table where possible. In some cases ** this optimization prevents the table from ever being read, which can @@ -158213,6 +162111,16 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ }else{ last = pWInfo->iEndWhere; } + if( pIdx->bHasExpr ){ + IndexedExpr *p = pParse->pIdxExpr; + while( p ){ + if( p->iIdxCur==pLevel->iIdxCur ){ + p->iDataCur = -1; + p->iIdxCur = -1; + } + p = p->pIENext; + } + } k = pLevel->addrBody + 1; #ifdef SQLITE_DEBUG if( db->flags & SQLITE_VdbeAddopTrace ){ @@ -158289,11 +162197,16 @@ SQLITE_PRIVATE void sqlite3WhereEnd(WhereInfo *pWInfo){ } } + /* The "break" point is here, just past the end of the outer loop. + ** Set it. + */ + sqlite3VdbeResolveLabel(v, pWInfo->iBreak); + /* Final cleanup */ - if( pWInfo->pExprMods ) whereUndoExprMods(pWInfo); pParse->nQueryLoop = pWInfo->savedNQueryLoop; whereInfoFree(db, pWInfo); + pParse->withinRJSubrtn -= nRJ; return; } @@ -159025,7 +162938,7 @@ SQLITE_PRIVATE void sqlite3WindowUpdate( } } } - pWin->pFunc = pFunc; + pWin->pWFunc = pFunc; } /* @@ -159201,7 +163114,6 @@ static ExprList *exprListAppendList( for(i=0; inExpr; i++){ sqlite3 *db = pParse->db; Expr *pDup = sqlite3ExprDup(db, pAppend->a[i].pExpr, 0); - assert( pDup==0 || !ExprHasProperty(pDup, EP_MemToken) ); if( db->mallocFailed ){ sqlite3ExprDelete(db, pDup); break; @@ -159217,7 +163129,7 @@ static ExprList *exprListAppendList( } } pList = sqlite3ExprListAppend(pParse, pList, pDup); - if( pList ) pList->a[nInit+i].sortFlags = pAppend->a[i].sortFlags; + if( pList ) pList->a[nInit+i].fg.sortFlags = pAppend->a[i].fg.sortFlags; } } return pList; @@ -159337,9 +163249,9 @@ SQLITE_PRIVATE int sqlite3WindowRewrite(Parse *pParse, Select *p){ for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ ExprList *pArgs; assert( ExprUseXList(pWin->pOwner) ); - assert( pWin->pFunc!=0 ); + assert( pWin->pWFunc!=0 ); pArgs = pWin->pOwner->x.pList; - if( pWin->pFunc->funcFlags & SQLITE_FUNC_SUBTYPE ){ + if( pWin->pWFunc->funcFlags & SQLITE_FUNC_SUBTYPE ){ selectWindowRewriteEList(pParse, pMWin, pSrc, pArgs, pTab, &pSublist); pWin->iArgCol = (pSublist ? pSublist->nExpr : 0); pWin->bExprArgs = 1; @@ -159721,7 +163633,7 @@ SQLITE_PRIVATE void sqlite3WindowCodeInit(Parse *pParse, Select *pSelect){ } for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ - FuncDef *p = pWin->pFunc; + FuncDef *p = pWin->pWFunc; if( (p->funcFlags & SQLITE_FUNC_MINMAX) && pWin->eStart!=TK_UNBOUNDED ){ /* The inline versions of min() and max() require a single ephemeral ** table and 3 registers. The registers are used as follows: @@ -159738,7 +163650,7 @@ SQLITE_PRIVATE void sqlite3WindowCodeInit(Parse *pParse, Select *pSelect){ pWin->csrApp = pParse->nTab++; pWin->regApp = pParse->nMem+1; pParse->nMem += 3; - if( pKeyInfo && pWin->pFunc->zName[1]=='i' ){ + if( pKeyInfo && pWin->pWFunc->zName[1]=='i' ){ assert( pKeyInfo->aSortFlags[0]==0 ); pKeyInfo->aSortFlags[0] = KEYINFO_ORDER_DESC; } @@ -159961,7 +163873,7 @@ static void windowAggStep( Vdbe *v = sqlite3GetVdbe(pParse); Window *pWin; for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ - FuncDef *pFunc = pWin->pFunc; + FuncDef *pFunc = pWin->pWFunc; int regArg; int nArg = pWin->bExprArgs ? 0 : windowArgCount(pWin); int i; @@ -160075,7 +163987,7 @@ static void windowAggFinal(WindowCodeArg *p, int bFin){ for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ if( pMWin->regStartRowid==0 - && (pWin->pFunc->funcFlags & SQLITE_FUNC_MINMAX) + && (pWin->pWFunc->funcFlags & SQLITE_FUNC_MINMAX) && (pWin->eStart!=TK_UNBOUNDED) ){ sqlite3VdbeAddOp2(v, OP_Null, 0, pWin->regResult); @@ -160089,12 +164001,12 @@ static void windowAggFinal(WindowCodeArg *p, int bFin){ int nArg = windowArgCount(pWin); if( bFin ){ sqlite3VdbeAddOp2(v, OP_AggFinal, pWin->regAccum, nArg); - sqlite3VdbeAppendP4(v, pWin->pFunc, P4_FUNCDEF); + sqlite3VdbeAppendP4(v, pWin->pWFunc, P4_FUNCDEF); sqlite3VdbeAddOp2(v, OP_Copy, pWin->regAccum, pWin->regResult); sqlite3VdbeAddOp2(v, OP_Null, 0, pWin->regAccum); }else{ sqlite3VdbeAddOp3(v, OP_AggValue,pWin->regAccum,nArg,pWin->regResult); - sqlite3VdbeAppendP4(v, pWin->pFunc, P4_FUNCDEF); + sqlite3VdbeAppendP4(v, pWin->pWFunc, P4_FUNCDEF); } } } @@ -160223,7 +164135,7 @@ static void windowReturnOneRow(WindowCodeArg *p){ Window *pWin; for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ - FuncDef *pFunc = pWin->pFunc; + FuncDef *pFunc = pWin->pWFunc; assert( ExprUseXList(pWin->pOwner) ); if( pFunc->zName==nth_valueName || pFunc->zName==first_valueName @@ -160295,7 +164207,7 @@ static int windowInitAccum(Parse *pParse, Window *pMWin){ int nArg = 0; Window *pWin; for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ - FuncDef *pFunc = pWin->pFunc; + FuncDef *pFunc = pWin->pWFunc; assert( pWin->regAccum ); sqlite3VdbeAddOp2(v, OP_Null, 0, pWin->regAccum); nArg = MAX(nArg, windowArgCount(pWin)); @@ -160325,7 +164237,7 @@ static int windowCacheFrame(Window *pMWin){ Window *pWin; if( pMWin->regStartRowid ) return 1; for(pWin=pMWin; pWin; pWin=pWin->pNextWin){ - FuncDef *pFunc = pWin->pFunc; + FuncDef *pFunc = pWin->pWFunc; if( (pFunc->zName==nth_valueName) || (pFunc->zName==first_valueName) || (pFunc->zName==leadName) @@ -160418,7 +164330,7 @@ static void windowCodeRangeTest( assert( op==OP_Ge || op==OP_Gt || op==OP_Le ); assert( pOrderBy && pOrderBy->nExpr==1 ); - if( pOrderBy->a[0].sortFlags & KEYINFO_ORDER_DESC ){ + if( pOrderBy->a[0].fg.sortFlags & KEYINFO_ORDER_DESC ){ switch( op ){ case OP_Ge: op = OP_Le; break; case OP_Gt: op = OP_Lt; break; @@ -160451,7 +164363,7 @@ static void windowCodeRangeTest( ** Additionally, if either reg1 or reg2 are NULL but the jump to lbl is ** not taken, control jumps over the comparison operator coded below this ** block. */ - if( pOrderBy->a[0].sortFlags & KEYINFO_ORDER_BIGNULL ){ + if( pOrderBy->a[0].fg.sortFlags & KEYINFO_ORDER_BIGNULL ){ /* This block runs if reg1 contains a NULL. */ int addr = sqlite3VdbeAddOp1(v, OP_NotNull, reg1); VdbeCoverage(v); switch( op ){ @@ -160472,10 +164384,9 @@ static void windowCodeRangeTest( /* This block runs if reg1 is not NULL, but reg2 is. */ sqlite3VdbeJumpHere(v, addr); - sqlite3VdbeAddOp2(v, OP_IsNull, reg2, lbl); VdbeCoverage(v); - if( op==OP_Gt || op==OP_Ge ){ - sqlite3VdbeChangeP2(v, -1, addrDone); - } + sqlite3VdbeAddOp2(v, OP_IsNull, reg2, + (op==OP_Gt || op==OP_Ge) ? addrDone : lbl); + VdbeCoverage(v); } /* Register reg1 currently contains csr1.peerVal (the peer-value from csr1). @@ -160683,7 +164594,7 @@ SQLITE_PRIVATE Window *sqlite3WindowDup(sqlite3 *db, Expr *pOwner, Window *p){ pNew->zName = sqlite3DbStrDup(db, p->zName); pNew->zBase = sqlite3DbStrDup(db, p->zBase); pNew->pFilter = sqlite3ExprDup(db, p->pFilter, 0); - pNew->pFunc = p->pFunc; + pNew->pWFunc = p->pWFunc; pNew->pPartition = sqlite3ExprListDup(db, p->pPartition, 0); pNew->pOrderBy = sqlite3ExprListDup(db, p->pOrderBy, 0); pNew->eFrmType = p->eFrmType; @@ -161569,7 +165480,7 @@ static void updateDeleteLimitError( p->affExpr = 0; p->flags = EP_Leaf; ExprClearVVAProperties(p); - p->iAgg = -1; + /* p->iAgg = -1; // Not required */ p->pLeft = p->pRight = 0; p->pAggInfo = 0; memset(&p->x, 0, sizeof(p->x)); @@ -161902,6 +165813,7 @@ typedef union { With* yy521; const char* yy522; Expr* yy528; + OnOrUsing yy561; struct FrameBound yy595; } YYMINORTYPE; #ifndef YYSTACKDEPTH @@ -161918,18 +165830,18 @@ typedef union { #define sqlite3ParserCTX_FETCH Parse *pParse=yypParser->pParse; #define sqlite3ParserCTX_STORE yypParser->pParse=pParse; #define YYFALLBACK 1 -#define YYNSTATE 578 -#define YYNRULE 402 -#define YYNRULE_WITH_ACTION 340 +#define YYNSTATE 580 +#define YYNRULE 405 +#define YYNRULE_WITH_ACTION 342 #define YYNTOKEN 185 -#define YY_MAX_SHIFT 577 -#define YY_MIN_SHIFTREDUCE 835 -#define YY_MAX_SHIFTREDUCE 1236 -#define YY_ERROR_ACTION 1237 -#define YY_ACCEPT_ACTION 1238 -#define YY_NO_ACTION 1239 -#define YY_MIN_REDUCE 1240 -#define YY_MAX_REDUCE 1641 +#define YY_MAX_SHIFT 579 +#define YY_MIN_SHIFTREDUCE 839 +#define YY_MAX_SHIFTREDUCE 1243 +#define YY_ERROR_ACTION 1244 +#define YY_ACCEPT_ACTION 1245 +#define YY_NO_ACTION 1246 +#define YY_MIN_REDUCE 1247 +#define YY_MAX_REDUCE 1651 /************* End control #defines *******************************************/ #define YY_NLOOKAHEAD ((int)(sizeof(yy_lookahead)/sizeof(yy_lookahead[0]))) @@ -161996,429 +165908,432 @@ typedef union { ** yy_default[] Default action for each state. ** *********** Begin parsing tables **********************************************/ -#define YY_ACTTAB_COUNT (2071) +#define YY_ACTTAB_COUNT (2101) static const YYACTIONTYPE yy_action[] = { - /* 0 */ 570, 1311, 570, 1290, 201, 201, 570, 116, 112, 222, - /* 10 */ 570, 1311, 381, 570, 116, 112, 222, 401, 412, 413, - /* 20 */ 1264, 382, 1273, 41, 41, 41, 41, 1416, 1521, 71, - /* 30 */ 71, 971, 1262, 41, 41, 495, 71, 71, 272, 972, - /* 40 */ 298, 480, 298, 123, 124, 114, 1214, 1214, 1048, 1051, - /* 50 */ 1040, 1040, 121, 121, 122, 122, 122, 122, 547, 413, - /* 60 */ 1238, 1, 1, 577, 2, 1242, 552, 116, 112, 222, - /* 70 */ 309, 484, 142, 552, 1276, 528, 116, 112, 222, 1324, - /* 80 */ 421, 527, 551, 123, 124, 114, 1214, 1214, 1048, 1051, - /* 90 */ 1040, 1040, 121, 121, 122, 122, 122, 122, 428, 116, - /* 100 */ 112, 222, 120, 120, 120, 120, 119, 119, 118, 118, - /* 110 */ 118, 117, 113, 448, 277, 277, 277, 277, 564, 564, - /* 120 */ 564, 1562, 380, 1564, 1190, 379, 1161, 567, 1161, 567, - /* 130 */ 413, 1562, 541, 252, 219, 1557, 99, 141, 453, 6, - /* 140 */ 369, 233, 120, 120, 120, 120, 119, 119, 118, 118, - /* 150 */ 118, 117, 113, 448, 123, 124, 114, 1214, 1214, 1048, - /* 160 */ 1051, 1040, 1040, 121, 121, 122, 122, 122, 122, 138, - /* 170 */ 289, 1190, 1550, 452, 118, 118, 118, 117, 113, 448, - /* 180 */ 125, 1190, 1191, 1192, 144, 469, 338, 570, 150, 127, - /* 190 */ 448, 122, 122, 122, 122, 115, 120, 120, 120, 120, - /* 200 */ 119, 119, 118, 118, 118, 117, 113, 448, 458, 423, - /* 210 */ 13, 13, 215, 120, 120, 120, 120, 119, 119, 118, - /* 220 */ 118, 118, 117, 113, 448, 426, 308, 561, 1190, 1191, - /* 230 */ 1192, 445, 444, 413, 1275, 122, 122, 122, 122, 120, - /* 240 */ 120, 120, 120, 119, 119, 118, 118, 118, 117, 113, - /* 250 */ 448, 1547, 98, 1037, 1037, 1049, 1052, 123, 124, 114, - /* 260 */ 1214, 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, - /* 270 */ 122, 122, 570, 410, 409, 1190, 570, 413, 1221, 319, - /* 280 */ 1221, 80, 81, 120, 120, 120, 120, 119, 119, 118, - /* 290 */ 118, 118, 117, 113, 448, 70, 70, 1190, 1608, 71, - /* 300 */ 71, 123, 124, 114, 1214, 1214, 1048, 1051, 1040, 1040, - /* 310 */ 121, 121, 122, 122, 122, 122, 120, 120, 120, 120, - /* 320 */ 119, 119, 118, 118, 118, 117, 113, 448, 1041, 210, - /* 330 */ 1190, 369, 1190, 1191, 1192, 245, 552, 403, 508, 505, - /* 340 */ 504, 108, 562, 138, 4, 520, 937, 437, 503, 217, - /* 350 */ 518, 526, 356, 883, 1190, 1191, 1192, 387, 565, 570, - /* 360 */ 120, 120, 120, 120, 119, 119, 118, 118, 118, 117, - /* 370 */ 113, 448, 277, 277, 16, 16, 1602, 445, 444, 153, - /* 380 */ 413, 449, 13, 13, 1283, 567, 1218, 1190, 1191, 1192, - /* 390 */ 1007, 1220, 264, 559, 1578, 186, 570, 431, 138, 1219, - /* 400 */ 308, 561, 476, 138, 123, 124, 114, 1214, 1214, 1048, - /* 410 */ 1051, 1040, 1040, 121, 121, 122, 122, 122, 122, 55, - /* 420 */ 55, 417, 1027, 511, 1221, 1190, 1221, 478, 106, 106, - /* 430 */ 1316, 1316, 1190, 171, 570, 388, 107, 384, 449, 572, - /* 440 */ 571, 434, 1547, 1017, 336, 553, 569, 263, 280, 364, - /* 450 */ 514, 359, 513, 250, 495, 308, 561, 71, 71, 355, - /* 460 */ 308, 561, 378, 120, 120, 120, 120, 119, 119, 118, - /* 470 */ 118, 118, 117, 113, 448, 1017, 1017, 1019, 1020, 27, - /* 480 */ 277, 277, 1190, 1191, 1192, 1156, 570, 532, 413, 1190, - /* 490 */ 1191, 1192, 352, 567, 552, 1264, 537, 521, 1156, 1520, - /* 500 */ 317, 1156, 285, 554, 489, 573, 570, 573, 486, 51, - /* 510 */ 51, 207, 123, 124, 114, 1214, 1214, 1048, 1051, 1040, - /* 520 */ 1040, 121, 121, 122, 122, 122, 122, 171, 1416, 13, - /* 530 */ 13, 413, 277, 277, 1190, 509, 119, 119, 118, 118, - /* 540 */ 118, 117, 113, 448, 433, 567, 522, 220, 519, 1556, - /* 550 */ 369, 550, 1190, 6, 536, 123, 124, 114, 1214, 1214, - /* 560 */ 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, 122, - /* 570 */ 145, 120, 120, 120, 120, 119, 119, 118, 118, 118, - /* 580 */ 117, 113, 448, 245, 570, 478, 508, 505, 504, 570, - /* 590 */ 1485, 1190, 1191, 1192, 1314, 1314, 503, 1190, 149, 429, - /* 600 */ 1190, 484, 413, 274, 369, 956, 876, 56, 56, 1190, - /* 610 */ 1191, 1192, 71, 71, 120, 120, 120, 120, 119, 119, - /* 620 */ 118, 118, 118, 117, 113, 448, 123, 124, 114, 1214, - /* 630 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 640 */ 122, 413, 545, 1556, 83, 869, 98, 6, 932, 533, - /* 650 */ 852, 547, 151, 931, 1190, 1191, 1192, 1190, 1191, 1192, - /* 660 */ 290, 1547, 187, 1637, 399, 123, 124, 114, 1214, 1214, - /* 670 */ 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, 122, - /* 680 */ 570, 958, 570, 457, 957, 120, 120, 120, 120, 119, - /* 690 */ 119, 118, 118, 118, 117, 113, 448, 1156, 221, 1190, - /* 700 */ 335, 457, 456, 13, 13, 13, 13, 1007, 369, 467, - /* 710 */ 1156, 193, 413, 1156, 386, 1547, 1174, 32, 297, 478, - /* 720 */ 195, 1531, 5, 956, 120, 120, 120, 120, 119, 119, - /* 730 */ 118, 118, 118, 117, 113, 448, 123, 124, 114, 1214, - /* 740 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 750 */ 122, 413, 1071, 423, 1190, 1028, 1190, 1191, 1192, 1190, - /* 760 */ 423, 336, 464, 322, 548, 1549, 446, 446, 446, 570, - /* 770 */ 3, 117, 113, 448, 457, 123, 124, 114, 1214, 1214, - /* 780 */ 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, 122, - /* 790 */ 1477, 570, 15, 15, 293, 120, 120, 120, 120, 119, - /* 800 */ 119, 118, 118, 118, 117, 113, 448, 1190, 570, 1490, - /* 810 */ 1416, 1190, 1191, 1192, 13, 13, 1190, 1191, 1192, 1548, - /* 820 */ 271, 271, 413, 286, 308, 561, 1012, 1490, 1492, 196, - /* 830 */ 288, 71, 71, 567, 120, 120, 120, 120, 119, 119, - /* 840 */ 118, 118, 118, 117, 113, 448, 123, 124, 114, 1214, - /* 850 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 860 */ 122, 413, 201, 1091, 1190, 1191, 1192, 1328, 304, 1533, - /* 870 */ 392, 278, 278, 454, 568, 406, 926, 926, 570, 567, - /* 880 */ 570, 430, 495, 484, 567, 123, 124, 114, 1214, 1214, - /* 890 */ 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, 122, - /* 900 */ 1490, 71, 71, 13, 13, 120, 120, 120, 120, 119, - /* 910 */ 119, 118, 118, 118, 117, 113, 448, 570, 549, 570, - /* 920 */ 1581, 577, 2, 1242, 1096, 1096, 492, 1484, 309, 1529, - /* 930 */ 142, 328, 413, 840, 841, 842, 312, 1324, 305, 367, - /* 940 */ 43, 43, 57, 57, 120, 120, 120, 120, 119, 119, - /* 950 */ 118, 118, 118, 117, 113, 448, 123, 124, 114, 1214, - /* 960 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 970 */ 122, 12, 277, 277, 570, 1156, 413, 576, 432, 1242, - /* 980 */ 469, 338, 296, 478, 309, 567, 142, 249, 1156, 308, - /* 990 */ 561, 1156, 325, 1324, 327, 495, 459, 71, 71, 233, - /* 1000 */ 283, 101, 114, 1214, 1214, 1048, 1051, 1040, 1040, 121, - /* 1010 */ 121, 122, 122, 122, 122, 120, 120, 120, 120, 119, - /* 1020 */ 119, 118, 118, 118, 117, 113, 448, 1112, 277, 277, - /* 1030 */ 1416, 452, 398, 1234, 443, 277, 277, 248, 247, 246, - /* 1040 */ 1323, 567, 1113, 313, 198, 294, 495, 1322, 567, 468, - /* 1050 */ 570, 1431, 398, 1134, 1027, 233, 418, 1114, 295, 120, - /* 1060 */ 120, 120, 120, 119, 119, 118, 118, 118, 117, 113, - /* 1070 */ 448, 1018, 104, 71, 71, 1017, 326, 500, 912, 570, - /* 1080 */ 277, 277, 277, 277, 1112, 1265, 419, 452, 913, 365, - /* 1090 */ 1575, 1319, 413, 567, 956, 567, 9, 202, 255, 1113, - /* 1100 */ 316, 491, 44, 44, 249, 563, 419, 1017, 1017, 1019, - /* 1110 */ 447, 1235, 413, 1607, 1114, 901, 123, 124, 114, 1214, - /* 1120 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 1130 */ 122, 1235, 413, 1211, 215, 558, 123, 124, 114, 1214, - /* 1140 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 1150 */ 122, 1135, 1635, 474, 1635, 255, 123, 111, 114, 1214, - /* 1160 */ 1214, 1048, 1051, 1040, 1040, 121, 121, 122, 122, 122, - /* 1170 */ 122, 1135, 1636, 418, 1636, 120, 120, 120, 120, 119, - /* 1180 */ 119, 118, 118, 118, 117, 113, 448, 221, 209, 355, - /* 1190 */ 1211, 1211, 147, 1430, 495, 120, 120, 120, 120, 119, - /* 1200 */ 119, 118, 118, 118, 117, 113, 448, 1260, 543, 523, - /* 1210 */ 892, 555, 956, 12, 570, 120, 120, 120, 120, 119, - /* 1220 */ 119, 118, 118, 118, 117, 113, 448, 542, 570, 864, - /* 1230 */ 1133, 365, 1575, 350, 1360, 413, 1167, 58, 58, 343, - /* 1240 */ 1359, 512, 277, 277, 277, 277, 277, 277, 1211, 893, - /* 1250 */ 1133, 59, 59, 463, 367, 567, 570, 567, 96, 567, - /* 1260 */ 124, 114, 1214, 1214, 1048, 1051, 1040, 1040, 121, 121, - /* 1270 */ 122, 122, 122, 122, 570, 1416, 570, 281, 1190, 60, - /* 1280 */ 60, 110, 396, 396, 395, 266, 393, 864, 1167, 849, - /* 1290 */ 570, 485, 570, 440, 345, 1156, 348, 61, 61, 62, - /* 1300 */ 62, 971, 227, 1554, 315, 435, 544, 6, 1156, 972, - /* 1310 */ 570, 1156, 314, 45, 45, 46, 46, 516, 120, 120, - /* 1320 */ 120, 120, 119, 119, 118, 118, 118, 117, 113, 448, - /* 1330 */ 420, 173, 1536, 47, 47, 1190, 1191, 1192, 108, 562, - /* 1340 */ 329, 4, 229, 1555, 932, 570, 441, 6, 570, 931, - /* 1350 */ 164, 570, 1294, 137, 1194, 565, 570, 1553, 570, 1093, - /* 1360 */ 570, 6, 570, 1093, 535, 570, 872, 8, 49, 49, - /* 1370 */ 228, 50, 50, 570, 63, 63, 570, 461, 449, 64, - /* 1380 */ 64, 65, 65, 14, 14, 66, 66, 411, 129, 129, - /* 1390 */ 559, 570, 462, 570, 1509, 490, 67, 67, 570, 52, - /* 1400 */ 52, 550, 411, 471, 539, 414, 226, 1027, 570, 538, - /* 1410 */ 308, 561, 1194, 411, 68, 68, 69, 69, 570, 1027, - /* 1420 */ 570, 53, 53, 872, 1018, 106, 106, 529, 1017, 570, - /* 1430 */ 1508, 159, 159, 107, 455, 449, 572, 571, 475, 307, - /* 1440 */ 1017, 160, 160, 76, 76, 570, 1552, 470, 411, 411, - /* 1450 */ 6, 1229, 54, 54, 482, 276, 219, 570, 891, 890, - /* 1460 */ 1017, 1017, 1019, 84, 206, 1210, 230, 282, 72, 72, - /* 1470 */ 333, 487, 1017, 1017, 1019, 1020, 27, 1580, 1178, 451, - /* 1480 */ 130, 130, 281, 148, 105, 38, 103, 396, 396, 395, - /* 1490 */ 266, 393, 570, 1130, 849, 400, 570, 108, 562, 570, - /* 1500 */ 4, 311, 570, 30, 17, 570, 279, 227, 570, 315, - /* 1510 */ 108, 562, 472, 4, 565, 73, 73, 314, 570, 157, - /* 1520 */ 157, 570, 131, 131, 530, 132, 132, 565, 128, 128, - /* 1530 */ 570, 158, 158, 570, 31, 291, 570, 449, 334, 525, - /* 1540 */ 98, 152, 152, 424, 136, 136, 1009, 229, 254, 559, - /* 1550 */ 449, 483, 340, 135, 135, 164, 133, 133, 137, 134, - /* 1560 */ 134, 879, 559, 539, 570, 477, 570, 254, 540, 479, - /* 1570 */ 339, 254, 98, 898, 899, 228, 539, 570, 1027, 570, - /* 1580 */ 1078, 538, 210, 232, 106, 106, 1356, 75, 75, 77, - /* 1590 */ 77, 1027, 107, 344, 449, 572, 571, 106, 106, 1017, - /* 1600 */ 74, 74, 42, 42, 570, 107, 347, 449, 572, 571, - /* 1610 */ 414, 501, 1017, 251, 363, 308, 561, 1139, 353, 879, - /* 1620 */ 98, 1074, 349, 251, 362, 1595, 351, 48, 48, 1021, - /* 1630 */ 1307, 1017, 1017, 1019, 1020, 27, 1293, 1291, 1078, 455, - /* 1640 */ 965, 929, 254, 110, 1017, 1017, 1019, 1020, 27, 1178, - /* 1650 */ 451, 974, 975, 281, 108, 562, 1292, 4, 396, 396, - /* 1660 */ 395, 266, 393, 1347, 1090, 849, 1090, 1089, 862, 1089, - /* 1670 */ 146, 565, 930, 358, 110, 303, 368, 557, 227, 1368, - /* 1680 */ 315, 108, 562, 1415, 4, 1343, 496, 1021, 314, 1354, - /* 1690 */ 1569, 556, 1421, 1272, 449, 204, 1263, 1251, 565, 1250, - /* 1700 */ 1252, 1588, 269, 1340, 371, 373, 559, 375, 11, 212, - /* 1710 */ 397, 225, 321, 284, 1402, 460, 287, 331, 229, 332, - /* 1720 */ 292, 449, 324, 216, 337, 1407, 164, 481, 377, 137, - /* 1730 */ 1406, 404, 506, 559, 1290, 1027, 361, 1481, 199, 1591, - /* 1740 */ 211, 106, 106, 936, 1480, 1229, 228, 560, 175, 107, - /* 1750 */ 200, 449, 572, 571, 258, 391, 1017, 1528, 1526, 223, - /* 1760 */ 1226, 422, 1027, 83, 208, 79, 82, 184, 106, 106, - /* 1770 */ 1486, 126, 1397, 550, 169, 320, 107, 1403, 449, 572, - /* 1780 */ 571, 414, 177, 1017, 1390, 323, 308, 561, 1017, 1017, - /* 1790 */ 1019, 1020, 27, 465, 35, 235, 100, 562, 499, 4, - /* 1800 */ 179, 180, 181, 466, 182, 96, 402, 1409, 473, 1408, - /* 1810 */ 455, 36, 1411, 565, 188, 1017, 1017, 1019, 1020, 27, - /* 1820 */ 405, 1475, 488, 239, 89, 494, 270, 192, 1497, 342, - /* 1830 */ 241, 497, 346, 242, 515, 243, 449, 1253, 1310, 1309, - /* 1840 */ 407, 91, 436, 1308, 883, 217, 438, 439, 559, 524, - /* 1850 */ 531, 408, 1351, 1606, 1301, 301, 1280, 1605, 360, 1279, - /* 1860 */ 1278, 1604, 1574, 302, 95, 366, 370, 372, 1300, 1352, - /* 1870 */ 1350, 374, 256, 257, 442, 10, 1349, 1027, 1461, 385, - /* 1880 */ 97, 1375, 102, 106, 106, 534, 1560, 34, 1559, 574, - /* 1890 */ 1184, 107, 265, 449, 572, 571, 267, 268, 1017, 203, - /* 1900 */ 1333, 383, 389, 1332, 390, 575, 376, 1248, 1243, 1513, - /* 1910 */ 161, 143, 1374, 1514, 1512, 162, 299, 1511, 163, 213, - /* 1920 */ 836, 214, 78, 450, 205, 310, 224, 1088, 140, 1086, - /* 1930 */ 1017, 1017, 1019, 1020, 27, 318, 306, 176, 165, 1210, - /* 1940 */ 178, 231, 915, 234, 330, 1102, 183, 166, 167, 425, - /* 1950 */ 427, 185, 85, 86, 87, 168, 88, 415, 1105, 236, - /* 1960 */ 174, 237, 416, 1101, 154, 18, 238, 341, 1223, 240, - /* 1970 */ 254, 493, 190, 1094, 37, 189, 851, 498, 362, 244, - /* 1980 */ 354, 510, 191, 90, 170, 502, 19, 20, 507, 93, - /* 1990 */ 881, 357, 92, 300, 894, 155, 517, 218, 1172, 156, - /* 2000 */ 1054, 959, 1141, 94, 39, 1140, 273, 275, 964, 194, - /* 2010 */ 110, 1158, 1162, 1160, 253, 21, 1166, 7, 1146, 33, - /* 2020 */ 22, 197, 23, 24, 25, 1165, 546, 26, 98, 1069, - /* 2030 */ 1055, 1053, 1057, 1111, 1058, 1110, 259, 260, 28, 40, - /* 2040 */ 1180, 1022, 863, 109, 29, 566, 394, 1179, 139, 172, - /* 2050 */ 925, 261, 1239, 1239, 1239, 1239, 1239, 1239, 1239, 1239, - /* 2060 */ 262, 1239, 1239, 1239, 1239, 1597, 1239, 1239, 1239, 1239, - /* 2070 */ 1596, + /* 0 */ 572, 208, 572, 118, 115, 229, 572, 118, 115, 229, + /* 10 */ 572, 1318, 381, 1297, 412, 566, 566, 566, 572, 413, + /* 20 */ 382, 1318, 1280, 41, 41, 41, 41, 208, 1530, 71, + /* 30 */ 71, 975, 423, 41, 41, 495, 303, 279, 303, 976, + /* 40 */ 401, 71, 71, 125, 126, 80, 1221, 1221, 1054, 1057, + /* 50 */ 1044, 1044, 123, 123, 124, 124, 124, 124, 480, 413, + /* 60 */ 1245, 1, 1, 579, 2, 1249, 554, 118, 115, 229, + /* 70 */ 317, 484, 146, 484, 528, 118, 115, 229, 533, 1331, + /* 80 */ 421, 527, 142, 125, 126, 80, 1221, 1221, 1054, 1057, + /* 90 */ 1044, 1044, 123, 123, 124, 124, 124, 124, 118, 115, + /* 100 */ 229, 327, 122, 122, 122, 122, 121, 121, 120, 120, + /* 110 */ 120, 119, 116, 448, 284, 284, 284, 284, 446, 446, + /* 120 */ 446, 1571, 380, 1573, 1196, 379, 1167, 569, 1167, 569, + /* 130 */ 413, 1571, 541, 259, 226, 448, 101, 145, 453, 316, + /* 140 */ 563, 240, 122, 122, 122, 122, 121, 121, 120, 120, + /* 150 */ 120, 119, 116, 448, 125, 126, 80, 1221, 1221, 1054, + /* 160 */ 1057, 1044, 1044, 123, 123, 124, 124, 124, 124, 142, + /* 170 */ 294, 1196, 343, 452, 120, 120, 120, 119, 116, 448, + /* 180 */ 127, 1196, 1197, 1198, 148, 445, 444, 572, 119, 116, + /* 190 */ 448, 124, 124, 124, 124, 117, 122, 122, 122, 122, + /* 200 */ 121, 121, 120, 120, 120, 119, 116, 448, 458, 113, + /* 210 */ 13, 13, 550, 122, 122, 122, 122, 121, 121, 120, + /* 220 */ 120, 120, 119, 116, 448, 426, 316, 563, 1196, 1197, + /* 230 */ 1198, 149, 1228, 413, 1228, 124, 124, 124, 124, 122, + /* 240 */ 122, 122, 122, 121, 121, 120, 120, 120, 119, 116, + /* 250 */ 448, 469, 346, 1041, 1041, 1055, 1058, 125, 126, 80, + /* 260 */ 1221, 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, + /* 270 */ 124, 124, 1283, 526, 222, 1196, 572, 413, 224, 518, + /* 280 */ 175, 82, 83, 122, 122, 122, 122, 121, 121, 120, + /* 290 */ 120, 120, 119, 116, 448, 1011, 16, 16, 1196, 133, + /* 300 */ 133, 125, 126, 80, 1221, 1221, 1054, 1057, 1044, 1044, + /* 310 */ 123, 123, 124, 124, 124, 124, 122, 122, 122, 122, + /* 320 */ 121, 121, 120, 120, 120, 119, 116, 448, 1045, 550, + /* 330 */ 1196, 377, 1196, 1197, 1198, 252, 1438, 403, 508, 505, + /* 340 */ 504, 111, 564, 570, 4, 930, 930, 437, 503, 344, + /* 350 */ 464, 330, 364, 398, 1241, 1196, 1197, 1198, 567, 572, + /* 360 */ 122, 122, 122, 122, 121, 121, 120, 120, 120, 119, + /* 370 */ 116, 448, 284, 284, 373, 1584, 1611, 445, 444, 154, + /* 380 */ 413, 449, 71, 71, 1290, 569, 1225, 1196, 1197, 1198, + /* 390 */ 85, 1227, 271, 561, 547, 519, 1565, 572, 98, 1226, + /* 400 */ 6, 1282, 476, 142, 125, 126, 80, 1221, 1221, 1054, + /* 410 */ 1057, 1044, 1044, 123, 123, 124, 124, 124, 124, 554, + /* 420 */ 13, 13, 1031, 511, 1228, 1196, 1228, 553, 109, 109, + /* 430 */ 222, 572, 1242, 175, 572, 431, 110, 197, 449, 574, + /* 440 */ 573, 434, 1556, 1021, 325, 555, 1196, 270, 287, 372, + /* 450 */ 514, 367, 513, 257, 71, 71, 547, 71, 71, 363, + /* 460 */ 316, 563, 1617, 122, 122, 122, 122, 121, 121, 120, + /* 470 */ 120, 120, 119, 116, 448, 1021, 1021, 1023, 1024, 27, + /* 480 */ 284, 284, 1196, 1197, 1198, 1162, 572, 1616, 413, 905, + /* 490 */ 190, 554, 360, 569, 554, 941, 537, 521, 1162, 520, + /* 500 */ 417, 1162, 556, 1196, 1197, 1198, 572, 548, 1558, 51, + /* 510 */ 51, 214, 125, 126, 80, 1221, 1221, 1054, 1057, 1044, + /* 520 */ 1044, 123, 123, 124, 124, 124, 124, 1196, 478, 135, + /* 530 */ 135, 413, 284, 284, 1494, 509, 121, 121, 120, 120, + /* 540 */ 120, 119, 116, 448, 1011, 569, 522, 217, 545, 1565, + /* 550 */ 316, 563, 142, 6, 536, 125, 126, 80, 1221, 1221, + /* 560 */ 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, 124, + /* 570 */ 1559, 122, 122, 122, 122, 121, 121, 120, 120, 120, + /* 580 */ 119, 116, 448, 489, 1196, 1197, 1198, 486, 281, 1271, + /* 590 */ 961, 252, 1196, 377, 508, 505, 504, 1196, 344, 575, + /* 600 */ 1196, 575, 413, 292, 503, 961, 880, 191, 484, 316, + /* 610 */ 563, 388, 290, 384, 122, 122, 122, 122, 121, 121, + /* 620 */ 120, 120, 120, 119, 116, 448, 125, 126, 80, 1221, + /* 630 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 640 */ 124, 413, 398, 1140, 1196, 873, 100, 284, 284, 1196, + /* 650 */ 1197, 1198, 377, 1097, 1196, 1197, 1198, 1196, 1197, 1198, + /* 660 */ 569, 459, 32, 377, 233, 125, 126, 80, 1221, 1221, + /* 670 */ 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, 124, + /* 680 */ 1437, 963, 572, 228, 962, 122, 122, 122, 122, 121, + /* 690 */ 121, 120, 120, 120, 119, 116, 448, 1162, 228, 1196, + /* 700 */ 157, 1196, 1197, 1198, 1557, 13, 13, 301, 961, 1236, + /* 710 */ 1162, 153, 413, 1162, 377, 1587, 1180, 5, 373, 1584, + /* 720 */ 433, 1242, 3, 961, 122, 122, 122, 122, 121, 121, + /* 730 */ 120, 120, 120, 119, 116, 448, 125, 126, 80, 1221, + /* 740 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 750 */ 124, 413, 208, 571, 1196, 1032, 1196, 1197, 1198, 1196, + /* 760 */ 392, 856, 155, 1556, 286, 406, 1102, 1102, 492, 572, + /* 770 */ 469, 346, 1323, 1323, 1556, 125, 126, 80, 1221, 1221, + /* 780 */ 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, 124, + /* 790 */ 129, 572, 13, 13, 378, 122, 122, 122, 122, 121, + /* 800 */ 121, 120, 120, 120, 119, 116, 448, 302, 572, 457, + /* 810 */ 532, 1196, 1197, 1198, 13, 13, 1196, 1197, 1198, 1301, + /* 820 */ 467, 1271, 413, 1321, 1321, 1556, 1016, 457, 456, 200, + /* 830 */ 299, 71, 71, 1269, 122, 122, 122, 122, 121, 121, + /* 840 */ 120, 120, 120, 119, 116, 448, 125, 126, 80, 1221, + /* 850 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 860 */ 124, 413, 227, 1077, 1162, 284, 284, 423, 312, 278, + /* 870 */ 278, 285, 285, 1423, 410, 409, 386, 1162, 569, 572, + /* 880 */ 1162, 1200, 569, 1604, 569, 125, 126, 80, 1221, 1221, + /* 890 */ 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, 124, + /* 900 */ 457, 1486, 13, 13, 1540, 122, 122, 122, 122, 121, + /* 910 */ 121, 120, 120, 120, 119, 116, 448, 201, 572, 358, + /* 920 */ 1590, 579, 2, 1249, 844, 845, 846, 1566, 317, 1216, + /* 930 */ 146, 6, 413, 255, 254, 253, 206, 1331, 9, 1200, + /* 940 */ 262, 71, 71, 428, 122, 122, 122, 122, 121, 121, + /* 950 */ 120, 120, 120, 119, 116, 448, 125, 126, 80, 1221, + /* 960 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 970 */ 124, 572, 284, 284, 572, 1217, 413, 578, 313, 1249, + /* 980 */ 353, 1300, 356, 423, 317, 569, 146, 495, 529, 1647, + /* 990 */ 399, 375, 495, 1331, 70, 70, 1299, 71, 71, 240, + /* 1000 */ 1329, 104, 80, 1221, 1221, 1054, 1057, 1044, 1044, 123, + /* 1010 */ 123, 124, 124, 124, 124, 122, 122, 122, 122, 121, + /* 1020 */ 121, 120, 120, 120, 119, 116, 448, 1118, 284, 284, + /* 1030 */ 432, 452, 1529, 1217, 443, 284, 284, 1493, 1356, 311, + /* 1040 */ 478, 569, 1119, 975, 495, 495, 217, 1267, 569, 1542, + /* 1050 */ 572, 976, 207, 572, 1031, 240, 387, 1120, 523, 122, + /* 1060 */ 122, 122, 122, 121, 121, 120, 120, 120, 119, 116, + /* 1070 */ 448, 1022, 107, 71, 71, 1021, 13, 13, 916, 572, + /* 1080 */ 1499, 572, 284, 284, 97, 530, 495, 452, 917, 1330, + /* 1090 */ 1326, 549, 413, 284, 284, 569, 151, 209, 1499, 1501, + /* 1100 */ 262, 454, 55, 55, 56, 56, 569, 1021, 1021, 1023, + /* 1110 */ 447, 336, 413, 531, 12, 295, 125, 126, 80, 1221, + /* 1120 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 1130 */ 124, 351, 413, 868, 1538, 1217, 125, 126, 80, 1221, + /* 1140 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 1150 */ 124, 1141, 1645, 478, 1645, 375, 125, 114, 80, 1221, + /* 1160 */ 1221, 1054, 1057, 1044, 1044, 123, 123, 124, 124, 124, + /* 1170 */ 124, 1499, 333, 478, 335, 122, 122, 122, 122, 121, + /* 1180 */ 121, 120, 120, 120, 119, 116, 448, 203, 1423, 572, + /* 1190 */ 1298, 868, 468, 1217, 440, 122, 122, 122, 122, 121, + /* 1200 */ 121, 120, 120, 120, 119, 116, 448, 557, 1141, 1646, + /* 1210 */ 543, 1646, 15, 15, 896, 122, 122, 122, 122, 121, + /* 1220 */ 121, 120, 120, 120, 119, 116, 448, 572, 298, 542, + /* 1230 */ 1139, 1423, 1563, 1564, 1335, 413, 6, 6, 1173, 1272, + /* 1240 */ 419, 320, 284, 284, 1423, 512, 569, 529, 300, 461, + /* 1250 */ 43, 43, 572, 897, 12, 569, 334, 482, 429, 411, + /* 1260 */ 126, 80, 1221, 1221, 1054, 1057, 1044, 1044, 123, 123, + /* 1270 */ 124, 124, 124, 124, 572, 57, 57, 288, 1196, 1423, + /* 1280 */ 500, 462, 396, 396, 395, 273, 393, 1139, 1562, 853, + /* 1290 */ 1173, 411, 6, 572, 321, 1162, 474, 44, 44, 1561, + /* 1300 */ 1118, 430, 234, 6, 323, 256, 544, 256, 1162, 435, + /* 1310 */ 572, 1162, 322, 17, 491, 1119, 58, 58, 122, 122, + /* 1320 */ 122, 122, 121, 121, 120, 120, 120, 119, 116, 448, + /* 1330 */ 1120, 216, 485, 59, 59, 1196, 1197, 1198, 111, 564, + /* 1340 */ 324, 4, 236, 460, 530, 572, 237, 460, 572, 441, + /* 1350 */ 168, 560, 424, 141, 483, 567, 572, 293, 572, 1099, + /* 1360 */ 572, 293, 572, 1099, 535, 572, 876, 8, 60, 60, + /* 1370 */ 235, 61, 61, 572, 418, 572, 418, 572, 449, 62, + /* 1380 */ 62, 45, 45, 46, 46, 47, 47, 199, 49, 49, + /* 1390 */ 561, 572, 363, 572, 100, 490, 50, 50, 63, 63, + /* 1400 */ 64, 64, 565, 419, 539, 414, 572, 1031, 572, 538, + /* 1410 */ 316, 563, 316, 563, 65, 65, 14, 14, 572, 1031, + /* 1420 */ 572, 516, 936, 876, 1022, 109, 109, 935, 1021, 66, + /* 1430 */ 66, 131, 131, 110, 455, 449, 574, 573, 420, 177, + /* 1440 */ 1021, 132, 132, 67, 67, 572, 471, 572, 936, 475, + /* 1450 */ 1368, 283, 226, 935, 315, 1367, 411, 572, 463, 411, + /* 1460 */ 1021, 1021, 1023, 239, 411, 86, 213, 1354, 52, 52, + /* 1470 */ 68, 68, 1021, 1021, 1023, 1024, 27, 1589, 1184, 451, + /* 1480 */ 69, 69, 288, 97, 108, 1545, 106, 396, 396, 395, + /* 1490 */ 273, 393, 572, 883, 853, 887, 572, 111, 564, 470, + /* 1500 */ 4, 572, 152, 30, 38, 572, 1136, 234, 400, 323, + /* 1510 */ 111, 564, 531, 4, 567, 53, 53, 322, 572, 163, + /* 1520 */ 163, 572, 341, 472, 164, 164, 337, 567, 76, 76, + /* 1530 */ 572, 289, 1518, 572, 31, 1517, 572, 449, 342, 487, + /* 1540 */ 100, 54, 54, 348, 72, 72, 296, 236, 1084, 561, + /* 1550 */ 449, 883, 1364, 134, 134, 168, 73, 73, 141, 161, + /* 1560 */ 161, 1578, 561, 539, 572, 319, 572, 352, 540, 1013, + /* 1570 */ 477, 261, 261, 895, 894, 235, 539, 572, 1031, 572, + /* 1580 */ 479, 538, 261, 371, 109, 109, 525, 136, 136, 130, + /* 1590 */ 130, 1031, 110, 370, 449, 574, 573, 109, 109, 1021, + /* 1600 */ 162, 162, 156, 156, 572, 110, 1084, 449, 574, 573, + /* 1610 */ 414, 355, 1021, 572, 357, 316, 563, 572, 347, 572, + /* 1620 */ 100, 501, 361, 258, 100, 902, 903, 140, 140, 359, + /* 1630 */ 1314, 1021, 1021, 1023, 1024, 27, 139, 139, 366, 455, + /* 1640 */ 137, 137, 138, 138, 1021, 1021, 1023, 1024, 27, 1184, + /* 1650 */ 451, 572, 376, 288, 111, 564, 1025, 4, 396, 396, + /* 1660 */ 395, 273, 393, 572, 1145, 853, 572, 1080, 572, 258, + /* 1670 */ 496, 567, 572, 211, 75, 75, 559, 966, 234, 261, + /* 1680 */ 323, 111, 564, 933, 4, 113, 77, 77, 322, 74, + /* 1690 */ 74, 42, 42, 1377, 449, 48, 48, 1422, 567, 978, + /* 1700 */ 979, 1096, 1095, 1096, 1095, 866, 561, 150, 934, 1350, + /* 1710 */ 113, 1362, 558, 1428, 1025, 1279, 1270, 1258, 236, 1257, + /* 1720 */ 1259, 449, 1597, 1347, 308, 276, 168, 309, 11, 141, + /* 1730 */ 397, 310, 232, 561, 1409, 1031, 339, 291, 329, 219, + /* 1740 */ 340, 109, 109, 940, 297, 1414, 235, 345, 481, 110, + /* 1750 */ 506, 449, 574, 573, 332, 1413, 1021, 404, 1297, 369, + /* 1760 */ 223, 1490, 1031, 1489, 1359, 1360, 1358, 1357, 109, 109, + /* 1770 */ 204, 1600, 1236, 562, 265, 218, 110, 205, 449, 574, + /* 1780 */ 573, 414, 391, 1021, 1537, 179, 316, 563, 1021, 1021, + /* 1790 */ 1023, 1024, 27, 230, 1535, 1233, 79, 564, 85, 4, + /* 1800 */ 422, 215, 552, 81, 84, 188, 1410, 128, 1404, 550, + /* 1810 */ 455, 35, 328, 567, 173, 1021, 1021, 1023, 1024, 27, + /* 1820 */ 181, 1495, 1397, 331, 465, 183, 184, 185, 186, 466, + /* 1830 */ 499, 242, 98, 402, 1416, 1418, 449, 1415, 473, 36, + /* 1840 */ 192, 488, 405, 1506, 246, 91, 494, 196, 561, 1484, + /* 1850 */ 350, 497, 277, 354, 248, 249, 111, 564, 1260, 4, + /* 1860 */ 250, 407, 515, 436, 1317, 1308, 93, 1316, 1315, 887, + /* 1870 */ 1307, 224, 1583, 567, 438, 524, 439, 1031, 263, 264, + /* 1880 */ 442, 1615, 10, 109, 109, 1287, 408, 1614, 1286, 368, + /* 1890 */ 1285, 110, 1613, 449, 574, 573, 449, 306, 1021, 307, + /* 1900 */ 374, 1382, 1569, 1470, 1381, 385, 105, 314, 561, 99, + /* 1910 */ 1568, 534, 34, 576, 1190, 272, 1340, 551, 383, 274, + /* 1920 */ 1339, 210, 389, 390, 275, 577, 1255, 1250, 415, 165, + /* 1930 */ 1021, 1021, 1023, 1024, 27, 147, 1522, 1031, 166, 1523, + /* 1940 */ 416, 1521, 178, 109, 109, 1520, 304, 167, 840, 450, + /* 1950 */ 220, 110, 221, 449, 574, 573, 212, 78, 1021, 318, + /* 1960 */ 231, 1094, 1092, 144, 180, 326, 169, 1216, 241, 182, + /* 1970 */ 919, 338, 238, 1108, 187, 170, 171, 425, 427, 189, + /* 1980 */ 87, 88, 89, 90, 172, 1111, 243, 1107, 244, 158, + /* 1990 */ 1021, 1021, 1023, 1024, 27, 18, 245, 1230, 493, 349, + /* 2000 */ 1100, 261, 247, 193, 194, 37, 370, 855, 498, 251, + /* 2010 */ 195, 510, 92, 19, 174, 362, 502, 20, 507, 885, + /* 2020 */ 365, 898, 94, 305, 159, 95, 517, 96, 1178, 160, + /* 2030 */ 1060, 1147, 39, 1146, 225, 280, 282, 970, 198, 964, + /* 2040 */ 113, 1164, 1168, 260, 1166, 21, 1172, 7, 22, 1152, + /* 2050 */ 33, 23, 24, 25, 1171, 546, 26, 202, 100, 102, + /* 2060 */ 1075, 103, 1061, 1059, 1063, 1117, 1064, 1116, 266, 267, + /* 2070 */ 28, 40, 929, 1026, 867, 112, 29, 568, 394, 143, + /* 2080 */ 1186, 268, 176, 1185, 269, 1246, 1246, 1246, 1246, 1246, + /* 2090 */ 1246, 1246, 1246, 1246, 1246, 1606, 1246, 1246, 1246, 1246, + /* 2100 */ 1605, }; static const YYCODETYPE yy_lookahead[] = { - /* 0 */ 193, 223, 193, 225, 193, 193, 193, 274, 275, 276, - /* 10 */ 193, 233, 219, 193, 274, 275, 276, 206, 206, 19, - /* 20 */ 193, 219, 216, 216, 217, 216, 217, 193, 295, 216, - /* 30 */ 217, 31, 205, 216, 217, 193, 216, 217, 213, 39, - /* 40 */ 228, 193, 230, 43, 44, 45, 46, 47, 48, 49, + /* 0 */ 193, 193, 193, 274, 275, 276, 193, 274, 275, 276, + /* 10 */ 193, 223, 219, 225, 206, 210, 211, 212, 193, 19, + /* 20 */ 219, 233, 216, 216, 217, 216, 217, 193, 295, 216, + /* 30 */ 217, 31, 193, 216, 217, 193, 228, 213, 230, 39, + /* 40 */ 206, 216, 217, 43, 44, 45, 46, 47, 48, 49, /* 50 */ 50, 51, 52, 53, 54, 55, 56, 57, 193, 19, /* 60 */ 185, 186, 187, 188, 189, 190, 253, 274, 275, 276, - /* 70 */ 195, 193, 197, 253, 216, 262, 274, 275, 276, 204, - /* 80 */ 238, 204, 262, 43, 44, 45, 46, 47, 48, 49, - /* 90 */ 50, 51, 52, 53, 54, 55, 56, 57, 264, 274, - /* 100 */ 275, 276, 102, 103, 104, 105, 106, 107, 108, 109, + /* 70 */ 195, 193, 197, 193, 261, 274, 275, 276, 253, 204, + /* 80 */ 238, 204, 81, 43, 44, 45, 46, 47, 48, 49, + /* 90 */ 50, 51, 52, 53, 54, 55, 56, 57, 274, 275, + /* 100 */ 276, 262, 102, 103, 104, 105, 106, 107, 108, 109, /* 110 */ 110, 111, 112, 113, 239, 240, 239, 240, 210, 211, /* 120 */ 212, 314, 315, 314, 59, 316, 86, 252, 88, 252, - /* 130 */ 19, 314, 315, 256, 257, 309, 25, 72, 296, 313, - /* 140 */ 193, 266, 102, 103, 104, 105, 106, 107, 108, 109, + /* 130 */ 19, 314, 315, 256, 257, 113, 25, 72, 296, 138, + /* 140 */ 139, 266, 102, 103, 104, 105, 106, 107, 108, 109, /* 150 */ 110, 111, 112, 113, 43, 44, 45, 46, 47, 48, /* 160 */ 49, 50, 51, 52, 53, 54, 55, 56, 57, 81, - /* 170 */ 292, 59, 307, 298, 108, 109, 110, 111, 112, 113, - /* 180 */ 69, 116, 117, 118, 72, 128, 129, 193, 241, 22, + /* 170 */ 292, 59, 292, 298, 108, 109, 110, 111, 112, 113, + /* 180 */ 69, 116, 117, 118, 72, 106, 107, 193, 111, 112, /* 190 */ 113, 54, 55, 56, 57, 58, 102, 103, 104, 105, - /* 200 */ 106, 107, 108, 109, 110, 111, 112, 113, 120, 193, - /* 210 */ 216, 217, 25, 102, 103, 104, 105, 106, 107, 108, + /* 200 */ 106, 107, 108, 109, 110, 111, 112, 113, 120, 25, + /* 210 */ 216, 217, 145, 102, 103, 104, 105, 106, 107, 108, /* 220 */ 109, 110, 111, 112, 113, 231, 138, 139, 116, 117, - /* 230 */ 118, 106, 107, 19, 216, 54, 55, 56, 57, 102, + /* 230 */ 118, 164, 153, 19, 155, 54, 55, 56, 57, 102, /* 240 */ 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - /* 250 */ 113, 304, 25, 46, 47, 48, 49, 43, 44, 45, + /* 250 */ 113, 128, 129, 46, 47, 48, 49, 43, 44, 45, /* 260 */ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, - /* 270 */ 56, 57, 193, 106, 107, 59, 193, 19, 153, 263, - /* 280 */ 155, 67, 24, 102, 103, 104, 105, 106, 107, 108, - /* 290 */ 109, 110, 111, 112, 113, 216, 217, 59, 230, 216, + /* 270 */ 56, 57, 216, 193, 25, 59, 193, 19, 165, 166, + /* 280 */ 193, 67, 24, 102, 103, 104, 105, 106, 107, 108, + /* 290 */ 109, 110, 111, 112, 113, 73, 216, 217, 59, 216, /* 300 */ 217, 43, 44, 45, 46, 47, 48, 49, 50, 51, /* 310 */ 52, 53, 54, 55, 56, 57, 102, 103, 104, 105, - /* 320 */ 106, 107, 108, 109, 110, 111, 112, 113, 121, 142, - /* 330 */ 59, 193, 116, 117, 118, 119, 253, 204, 122, 123, - /* 340 */ 124, 19, 20, 81, 22, 262, 108, 19, 132, 165, - /* 350 */ 166, 193, 24, 126, 116, 117, 118, 278, 36, 193, + /* 320 */ 106, 107, 108, 109, 110, 111, 112, 113, 121, 145, + /* 330 */ 59, 193, 116, 117, 118, 119, 273, 204, 122, 123, + /* 340 */ 124, 19, 20, 134, 22, 136, 137, 19, 132, 127, + /* 350 */ 128, 129, 24, 22, 23, 116, 117, 118, 36, 193, /* 360 */ 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, - /* 370 */ 112, 113, 239, 240, 216, 217, 215, 106, 107, 241, + /* 370 */ 112, 113, 239, 240, 311, 312, 215, 106, 107, 241, /* 380 */ 19, 59, 216, 217, 223, 252, 115, 116, 117, 118, - /* 390 */ 73, 120, 26, 71, 193, 22, 193, 231, 81, 128, - /* 400 */ 138, 139, 269, 81, 43, 44, 45, 46, 47, 48, - /* 410 */ 49, 50, 51, 52, 53, 54, 55, 56, 57, 216, - /* 420 */ 217, 198, 100, 95, 153, 59, 155, 193, 106, 107, - /* 430 */ 235, 236, 59, 193, 193, 249, 114, 251, 116, 117, - /* 440 */ 118, 113, 304, 121, 127, 204, 193, 119, 120, 121, - /* 450 */ 122, 123, 124, 125, 193, 138, 139, 216, 217, 131, - /* 460 */ 138, 139, 193, 102, 103, 104, 105, 106, 107, 108, + /* 390 */ 151, 120, 26, 71, 193, 308, 309, 193, 149, 128, + /* 400 */ 313, 216, 269, 81, 43, 44, 45, 46, 47, 48, + /* 410 */ 49, 50, 51, 52, 53, 54, 55, 56, 57, 253, + /* 420 */ 216, 217, 100, 95, 153, 59, 155, 261, 106, 107, + /* 430 */ 25, 193, 101, 193, 193, 231, 114, 25, 116, 117, + /* 440 */ 118, 113, 304, 121, 193, 204, 59, 119, 120, 121, + /* 450 */ 122, 123, 124, 125, 216, 217, 193, 216, 217, 131, + /* 460 */ 138, 139, 230, 102, 103, 104, 105, 106, 107, 108, /* 470 */ 109, 110, 111, 112, 113, 153, 154, 155, 156, 157, - /* 480 */ 239, 240, 116, 117, 118, 76, 193, 193, 19, 116, - /* 490 */ 117, 118, 23, 252, 253, 193, 87, 204, 89, 238, - /* 500 */ 193, 92, 268, 262, 281, 203, 193, 205, 285, 216, + /* 480 */ 239, 240, 116, 117, 118, 76, 193, 23, 19, 25, + /* 490 */ 22, 253, 23, 252, 253, 108, 87, 204, 89, 261, + /* 500 */ 198, 92, 261, 116, 117, 118, 193, 306, 307, 216, /* 510 */ 217, 150, 43, 44, 45, 46, 47, 48, 49, 50, - /* 520 */ 51, 52, 53, 54, 55, 56, 57, 193, 193, 216, - /* 530 */ 217, 19, 239, 240, 59, 23, 106, 107, 108, 109, - /* 540 */ 110, 111, 112, 113, 231, 252, 253, 193, 308, 309, - /* 550 */ 193, 145, 59, 313, 145, 43, 44, 45, 46, 47, + /* 520 */ 51, 52, 53, 54, 55, 56, 57, 59, 193, 216, + /* 530 */ 217, 19, 239, 240, 283, 23, 106, 107, 108, 109, + /* 540 */ 110, 111, 112, 113, 73, 252, 253, 142, 308, 309, + /* 550 */ 138, 139, 81, 313, 145, 43, 44, 45, 46, 47, /* 560 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - /* 570 */ 164, 102, 103, 104, 105, 106, 107, 108, 109, 110, - /* 580 */ 111, 112, 113, 119, 193, 193, 122, 123, 124, 193, - /* 590 */ 283, 116, 117, 118, 235, 236, 132, 59, 241, 264, - /* 600 */ 59, 193, 19, 23, 193, 25, 23, 216, 217, 116, - /* 610 */ 117, 118, 216, 217, 102, 103, 104, 105, 106, 107, + /* 570 */ 307, 102, 103, 104, 105, 106, 107, 108, 109, 110, + /* 580 */ 111, 112, 113, 281, 116, 117, 118, 285, 23, 193, + /* 590 */ 25, 119, 59, 193, 122, 123, 124, 59, 127, 203, + /* 600 */ 59, 205, 19, 268, 132, 25, 23, 22, 193, 138, + /* 610 */ 139, 249, 204, 251, 102, 103, 104, 105, 106, 107, /* 620 */ 108, 109, 110, 111, 112, 113, 43, 44, 45, 46, /* 630 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 640 */ 57, 19, 308, 309, 151, 23, 25, 313, 135, 253, - /* 650 */ 21, 193, 241, 140, 116, 117, 118, 116, 117, 118, - /* 660 */ 268, 304, 22, 301, 302, 43, 44, 45, 46, 47, + /* 640 */ 57, 19, 22, 23, 59, 23, 25, 239, 240, 116, + /* 650 */ 117, 118, 193, 11, 116, 117, 118, 116, 117, 118, + /* 660 */ 252, 269, 22, 193, 15, 43, 44, 45, 46, 47, /* 670 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - /* 680 */ 193, 143, 193, 193, 143, 102, 103, 104, 105, 106, + /* 680 */ 273, 143, 193, 118, 143, 102, 103, 104, 105, 106, /* 690 */ 107, 108, 109, 110, 111, 112, 113, 76, 118, 59, - /* 700 */ 292, 211, 212, 216, 217, 216, 217, 73, 193, 80, - /* 710 */ 89, 25, 19, 92, 193, 304, 23, 22, 231, 193, - /* 720 */ 231, 193, 22, 143, 102, 103, 104, 105, 106, 107, + /* 700 */ 241, 116, 117, 118, 304, 216, 217, 292, 143, 60, + /* 710 */ 89, 241, 19, 92, 193, 193, 23, 22, 311, 312, + /* 720 */ 231, 101, 22, 143, 102, 103, 104, 105, 106, 107, /* 730 */ 108, 109, 110, 111, 112, 113, 43, 44, 45, 46, /* 740 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 750 */ 57, 19, 123, 193, 59, 23, 116, 117, 118, 59, - /* 760 */ 193, 127, 128, 129, 306, 307, 210, 211, 212, 193, - /* 770 */ 22, 111, 112, 113, 284, 43, 44, 45, 46, 47, + /* 750 */ 57, 19, 193, 193, 59, 23, 116, 117, 118, 59, + /* 760 */ 201, 21, 241, 304, 22, 206, 127, 128, 129, 193, + /* 770 */ 128, 129, 235, 236, 304, 43, 44, 45, 46, 47, /* 780 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - /* 790 */ 161, 193, 216, 217, 268, 102, 103, 104, 105, 106, - /* 800 */ 107, 108, 109, 110, 111, 112, 113, 59, 193, 193, - /* 810 */ 193, 116, 117, 118, 216, 217, 116, 117, 118, 304, - /* 820 */ 239, 240, 19, 263, 138, 139, 23, 211, 212, 231, - /* 830 */ 263, 216, 217, 252, 102, 103, 104, 105, 106, 107, + /* 790 */ 22, 193, 216, 217, 193, 102, 103, 104, 105, 106, + /* 800 */ 107, 108, 109, 110, 111, 112, 113, 231, 193, 193, + /* 810 */ 193, 116, 117, 118, 216, 217, 116, 117, 118, 226, + /* 820 */ 80, 193, 19, 235, 236, 304, 23, 211, 212, 231, + /* 830 */ 204, 216, 217, 205, 102, 103, 104, 105, 106, 107, /* 840 */ 108, 109, 110, 111, 112, 113, 43, 44, 45, 46, /* 850 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 860 */ 57, 19, 193, 11, 116, 117, 118, 240, 253, 193, - /* 870 */ 201, 239, 240, 193, 134, 206, 136, 137, 193, 252, - /* 880 */ 193, 264, 193, 193, 252, 43, 44, 45, 46, 47, + /* 860 */ 57, 19, 193, 123, 76, 239, 240, 193, 253, 239, + /* 870 */ 240, 239, 240, 193, 106, 107, 193, 89, 252, 193, + /* 880 */ 92, 59, 252, 141, 252, 43, 44, 45, 46, 47, /* 890 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - /* 900 */ 284, 216, 217, 216, 217, 102, 103, 104, 105, 106, - /* 910 */ 107, 108, 109, 110, 111, 112, 113, 193, 231, 193, - /* 920 */ 187, 188, 189, 190, 127, 128, 129, 238, 195, 193, - /* 930 */ 197, 16, 19, 7, 8, 9, 193, 204, 253, 193, - /* 940 */ 216, 217, 216, 217, 102, 103, 104, 105, 106, 107, + /* 900 */ 284, 161, 216, 217, 193, 102, 103, 104, 105, 106, + /* 910 */ 107, 108, 109, 110, 111, 112, 113, 231, 193, 16, + /* 920 */ 187, 188, 189, 190, 7, 8, 9, 309, 195, 25, + /* 930 */ 197, 313, 19, 127, 128, 129, 262, 204, 22, 117, + /* 940 */ 24, 216, 217, 263, 102, 103, 104, 105, 106, 107, /* 950 */ 108, 109, 110, 111, 112, 113, 43, 44, 45, 46, /* 960 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 970 */ 57, 213, 239, 240, 193, 76, 19, 188, 232, 190, - /* 980 */ 128, 129, 292, 193, 195, 252, 197, 46, 89, 138, - /* 990 */ 139, 92, 77, 204, 79, 193, 269, 216, 217, 266, + /* 970 */ 57, 193, 239, 240, 193, 59, 19, 188, 253, 190, + /* 980 */ 77, 226, 79, 193, 195, 252, 197, 193, 19, 301, + /* 990 */ 302, 193, 193, 204, 216, 217, 226, 216, 217, 266, /* 1000 */ 204, 159, 45, 46, 47, 48, 49, 50, 51, 52, /* 1010 */ 53, 54, 55, 56, 57, 102, 103, 104, 105, 106, /* 1020 */ 107, 108, 109, 110, 111, 112, 113, 12, 239, 240, - /* 1030 */ 193, 298, 22, 23, 253, 239, 240, 127, 128, 129, - /* 1040 */ 238, 252, 27, 193, 286, 204, 193, 204, 252, 291, - /* 1050 */ 193, 273, 22, 23, 100, 266, 115, 42, 268, 102, + /* 1030 */ 232, 298, 238, 117, 253, 239, 240, 238, 259, 260, + /* 1040 */ 193, 252, 27, 31, 193, 193, 142, 204, 252, 193, + /* 1050 */ 193, 39, 262, 193, 100, 266, 278, 42, 204, 102, /* 1060 */ 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, - /* 1070 */ 113, 117, 159, 216, 217, 121, 161, 19, 63, 193, - /* 1080 */ 239, 240, 239, 240, 12, 208, 209, 298, 73, 311, - /* 1090 */ 312, 238, 19, 252, 25, 252, 22, 24, 24, 27, - /* 1100 */ 193, 264, 216, 217, 46, 208, 209, 153, 154, 155, - /* 1110 */ 253, 101, 19, 23, 42, 25, 43, 44, 45, 46, + /* 1070 */ 113, 117, 159, 216, 217, 121, 216, 217, 63, 193, + /* 1080 */ 193, 193, 239, 240, 115, 116, 193, 298, 73, 238, + /* 1090 */ 238, 231, 19, 239, 240, 252, 22, 24, 211, 212, + /* 1100 */ 24, 193, 216, 217, 216, 217, 252, 153, 154, 155, + /* 1110 */ 253, 16, 19, 144, 213, 268, 43, 44, 45, 46, /* 1120 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 1130 */ 57, 101, 19, 59, 25, 63, 43, 44, 45, 46, + /* 1130 */ 57, 238, 19, 59, 193, 59, 43, 44, 45, 46, /* 1140 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 1150 */ 57, 22, 23, 115, 25, 24, 43, 44, 45, 46, + /* 1150 */ 57, 22, 23, 193, 25, 193, 43, 44, 45, 46, /* 1160 */ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - /* 1170 */ 57, 22, 23, 115, 25, 102, 103, 104, 105, 106, - /* 1180 */ 107, 108, 109, 110, 111, 112, 113, 118, 150, 131, - /* 1190 */ 59, 117, 22, 273, 193, 102, 103, 104, 105, 106, - /* 1200 */ 107, 108, 109, 110, 111, 112, 113, 204, 66, 204, - /* 1210 */ 35, 204, 143, 213, 193, 102, 103, 104, 105, 106, - /* 1220 */ 107, 108, 109, 110, 111, 112, 113, 85, 193, 59, - /* 1230 */ 101, 311, 312, 16, 193, 19, 94, 216, 217, 238, - /* 1240 */ 193, 66, 239, 240, 239, 240, 239, 240, 117, 74, - /* 1250 */ 101, 216, 217, 193, 193, 252, 193, 252, 149, 252, + /* 1170 */ 57, 284, 77, 193, 79, 102, 103, 104, 105, 106, + /* 1180 */ 107, 108, 109, 110, 111, 112, 113, 286, 193, 193, + /* 1190 */ 193, 117, 291, 117, 232, 102, 103, 104, 105, 106, + /* 1200 */ 107, 108, 109, 110, 111, 112, 113, 204, 22, 23, + /* 1210 */ 66, 25, 216, 217, 35, 102, 103, 104, 105, 106, + /* 1220 */ 107, 108, 109, 110, 111, 112, 113, 193, 268, 85, + /* 1230 */ 101, 193, 309, 309, 240, 19, 313, 313, 94, 208, + /* 1240 */ 209, 193, 239, 240, 193, 66, 252, 19, 268, 244, + /* 1250 */ 216, 217, 193, 74, 213, 252, 161, 19, 263, 254, /* 1260 */ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, - /* 1270 */ 54, 55, 56, 57, 193, 193, 193, 5, 59, 216, - /* 1280 */ 217, 25, 10, 11, 12, 13, 14, 117, 146, 17, - /* 1290 */ 193, 291, 193, 232, 77, 76, 79, 216, 217, 216, - /* 1300 */ 217, 31, 30, 309, 32, 130, 87, 313, 89, 39, - /* 1310 */ 193, 92, 40, 216, 217, 216, 217, 108, 102, 103, + /* 1270 */ 54, 55, 56, 57, 193, 216, 217, 5, 59, 193, + /* 1280 */ 19, 244, 10, 11, 12, 13, 14, 101, 309, 17, + /* 1290 */ 146, 254, 313, 193, 193, 76, 115, 216, 217, 309, + /* 1300 */ 12, 263, 30, 313, 32, 46, 87, 46, 89, 130, + /* 1310 */ 193, 92, 40, 22, 263, 27, 216, 217, 102, 103, /* 1320 */ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, - /* 1330 */ 299, 300, 193, 216, 217, 116, 117, 118, 19, 20, - /* 1340 */ 193, 22, 70, 309, 135, 193, 264, 313, 193, 140, - /* 1350 */ 78, 193, 226, 81, 59, 36, 193, 309, 193, 29, - /* 1360 */ 193, 313, 193, 33, 145, 193, 59, 48, 216, 217, - /* 1370 */ 98, 216, 217, 193, 216, 217, 193, 244, 59, 216, - /* 1380 */ 217, 216, 217, 216, 217, 216, 217, 254, 216, 217, - /* 1390 */ 71, 193, 244, 193, 193, 65, 216, 217, 193, 216, - /* 1400 */ 217, 145, 254, 244, 85, 133, 15, 100, 193, 90, - /* 1410 */ 138, 139, 117, 254, 216, 217, 216, 217, 193, 100, - /* 1420 */ 193, 216, 217, 116, 117, 106, 107, 19, 121, 193, - /* 1430 */ 193, 216, 217, 114, 162, 116, 117, 118, 244, 244, - /* 1440 */ 121, 216, 217, 216, 217, 193, 309, 129, 254, 254, - /* 1450 */ 313, 60, 216, 217, 19, 256, 257, 193, 120, 121, - /* 1460 */ 153, 154, 155, 149, 150, 25, 24, 99, 216, 217, - /* 1470 */ 152, 193, 153, 154, 155, 156, 157, 0, 1, 2, - /* 1480 */ 216, 217, 5, 22, 158, 24, 160, 10, 11, 12, - /* 1490 */ 13, 14, 193, 23, 17, 25, 193, 19, 20, 193, - /* 1500 */ 22, 133, 193, 22, 22, 193, 22, 30, 193, 32, - /* 1510 */ 19, 20, 129, 22, 36, 216, 217, 40, 193, 216, - /* 1520 */ 217, 193, 216, 217, 116, 216, 217, 36, 216, 217, - /* 1530 */ 193, 216, 217, 193, 53, 152, 193, 59, 23, 19, - /* 1540 */ 25, 216, 217, 61, 216, 217, 23, 70, 25, 71, - /* 1550 */ 59, 116, 193, 216, 217, 78, 216, 217, 81, 216, - /* 1560 */ 217, 59, 71, 85, 193, 23, 193, 25, 90, 23, - /* 1570 */ 23, 25, 25, 7, 8, 98, 85, 193, 100, 193, - /* 1580 */ 59, 90, 142, 141, 106, 107, 193, 216, 217, 216, - /* 1590 */ 217, 100, 114, 193, 116, 117, 118, 106, 107, 121, - /* 1600 */ 216, 217, 216, 217, 193, 114, 193, 116, 117, 118, - /* 1610 */ 133, 23, 121, 25, 121, 138, 139, 97, 23, 117, - /* 1620 */ 25, 23, 193, 25, 131, 141, 193, 216, 217, 59, - /* 1630 */ 193, 153, 154, 155, 156, 157, 226, 193, 117, 162, - /* 1640 */ 23, 23, 25, 25, 153, 154, 155, 156, 157, 1, - /* 1650 */ 2, 83, 84, 5, 19, 20, 226, 22, 10, 11, - /* 1660 */ 12, 13, 14, 258, 153, 17, 155, 153, 23, 155, - /* 1670 */ 25, 36, 23, 193, 25, 255, 193, 236, 30, 193, - /* 1680 */ 32, 19, 20, 193, 22, 193, 288, 117, 40, 193, - /* 1690 */ 318, 193, 193, 193, 59, 242, 193, 193, 36, 193, - /* 1700 */ 193, 193, 287, 255, 255, 255, 71, 255, 243, 214, - /* 1710 */ 191, 297, 267, 245, 271, 259, 259, 293, 70, 246, - /* 1720 */ 246, 59, 267, 229, 245, 271, 78, 293, 259, 81, - /* 1730 */ 271, 271, 220, 71, 225, 100, 219, 219, 249, 196, - /* 1740 */ 243, 106, 107, 108, 219, 60, 98, 280, 297, 114, - /* 1750 */ 249, 116, 117, 118, 141, 245, 121, 200, 200, 297, - /* 1760 */ 38, 200, 100, 151, 150, 294, 294, 22, 106, 107, - /* 1770 */ 283, 148, 250, 145, 43, 249, 114, 272, 116, 117, - /* 1780 */ 118, 133, 234, 121, 250, 249, 138, 139, 153, 154, - /* 1790 */ 155, 156, 157, 18, 270, 199, 19, 20, 18, 22, - /* 1800 */ 237, 237, 237, 200, 237, 149, 246, 272, 246, 272, - /* 1810 */ 162, 270, 234, 36, 234, 153, 154, 155, 156, 157, - /* 1820 */ 246, 246, 200, 199, 158, 62, 200, 22, 290, 289, - /* 1830 */ 199, 221, 200, 199, 115, 199, 59, 200, 218, 218, - /* 1840 */ 221, 22, 64, 218, 126, 165, 24, 113, 71, 305, - /* 1850 */ 144, 221, 261, 224, 227, 282, 218, 224, 218, 220, - /* 1860 */ 218, 218, 312, 282, 115, 221, 260, 260, 227, 261, - /* 1870 */ 261, 260, 200, 91, 82, 22, 261, 100, 277, 200, - /* 1880 */ 147, 265, 158, 106, 107, 146, 317, 25, 317, 202, - /* 1890 */ 13, 114, 194, 116, 117, 118, 194, 6, 121, 248, - /* 1900 */ 250, 249, 247, 250, 246, 192, 260, 192, 192, 213, - /* 1910 */ 207, 222, 265, 213, 213, 207, 222, 213, 207, 214, - /* 1920 */ 4, 214, 213, 3, 22, 163, 15, 23, 16, 23, - /* 1930 */ 153, 154, 155, 156, 157, 139, 279, 151, 130, 25, - /* 1940 */ 142, 24, 20, 144, 16, 1, 142, 130, 130, 61, - /* 1950 */ 37, 151, 53, 53, 53, 130, 53, 303, 116, 34, - /* 1960 */ 300, 141, 303, 1, 5, 22, 115, 161, 75, 141, - /* 1970 */ 25, 41, 115, 68, 24, 68, 20, 19, 131, 125, - /* 1980 */ 23, 96, 22, 22, 37, 67, 22, 22, 67, 149, - /* 1990 */ 59, 24, 22, 67, 28, 23, 22, 141, 23, 23, - /* 2000 */ 23, 143, 23, 25, 22, 97, 23, 23, 116, 22, - /* 2010 */ 25, 88, 75, 86, 34, 34, 75, 44, 23, 22, - /* 2020 */ 34, 25, 34, 34, 34, 93, 24, 34, 25, 23, - /* 2030 */ 23, 23, 23, 23, 11, 23, 25, 22, 22, 22, - /* 2040 */ 1, 23, 23, 22, 22, 25, 15, 1, 23, 25, - /* 2050 */ 135, 141, 319, 319, 319, 319, 319, 319, 319, 319, - /* 2060 */ 141, 319, 319, 319, 319, 141, 319, 319, 319, 319, - /* 2070 */ 141, 319, 319, 319, 319, 319, 319, 319, 319, 319, - /* 2080 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, - /* 2090 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, - /* 2100 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, + /* 1330 */ 42, 150, 291, 216, 217, 116, 117, 118, 19, 20, + /* 1340 */ 193, 22, 70, 260, 116, 193, 24, 264, 193, 263, + /* 1350 */ 78, 63, 61, 81, 116, 36, 193, 260, 193, 29, + /* 1360 */ 193, 264, 193, 33, 145, 193, 59, 48, 216, 217, + /* 1370 */ 98, 216, 217, 193, 115, 193, 115, 193, 59, 216, + /* 1380 */ 217, 216, 217, 216, 217, 216, 217, 255, 216, 217, + /* 1390 */ 71, 193, 131, 193, 25, 65, 216, 217, 216, 217, + /* 1400 */ 216, 217, 208, 209, 85, 133, 193, 100, 193, 90, + /* 1410 */ 138, 139, 138, 139, 216, 217, 216, 217, 193, 100, + /* 1420 */ 193, 108, 135, 116, 117, 106, 107, 140, 121, 216, + /* 1430 */ 217, 216, 217, 114, 162, 116, 117, 118, 299, 300, + /* 1440 */ 121, 216, 217, 216, 217, 193, 244, 193, 135, 244, + /* 1450 */ 193, 256, 257, 140, 244, 193, 254, 193, 193, 254, + /* 1460 */ 153, 154, 155, 141, 254, 149, 150, 258, 216, 217, + /* 1470 */ 216, 217, 153, 154, 155, 156, 157, 0, 1, 2, + /* 1480 */ 216, 217, 5, 115, 158, 193, 160, 10, 11, 12, + /* 1490 */ 13, 14, 193, 59, 17, 126, 193, 19, 20, 129, + /* 1500 */ 22, 193, 22, 22, 24, 193, 23, 30, 25, 32, + /* 1510 */ 19, 20, 144, 22, 36, 216, 217, 40, 193, 216, + /* 1520 */ 217, 193, 152, 129, 216, 217, 193, 36, 216, 217, + /* 1530 */ 193, 99, 193, 193, 53, 193, 193, 59, 23, 193, + /* 1540 */ 25, 216, 217, 193, 216, 217, 152, 70, 59, 71, + /* 1550 */ 59, 117, 193, 216, 217, 78, 216, 217, 81, 216, + /* 1560 */ 217, 318, 71, 85, 193, 133, 193, 193, 90, 23, + /* 1570 */ 23, 25, 25, 120, 121, 98, 85, 193, 100, 193, + /* 1580 */ 23, 90, 25, 121, 106, 107, 19, 216, 217, 216, + /* 1590 */ 217, 100, 114, 131, 116, 117, 118, 106, 107, 121, + /* 1600 */ 216, 217, 216, 217, 193, 114, 117, 116, 117, 118, + /* 1610 */ 133, 193, 121, 193, 193, 138, 139, 193, 23, 193, + /* 1620 */ 25, 23, 23, 25, 25, 7, 8, 216, 217, 193, + /* 1630 */ 193, 153, 154, 155, 156, 157, 216, 217, 193, 162, + /* 1640 */ 216, 217, 216, 217, 153, 154, 155, 156, 157, 1, + /* 1650 */ 2, 193, 193, 5, 19, 20, 59, 22, 10, 11, + /* 1660 */ 12, 13, 14, 193, 97, 17, 193, 23, 193, 25, + /* 1670 */ 288, 36, 193, 242, 216, 217, 236, 23, 30, 25, + /* 1680 */ 32, 19, 20, 23, 22, 25, 216, 217, 40, 216, + /* 1690 */ 217, 216, 217, 193, 59, 216, 217, 193, 36, 83, + /* 1700 */ 84, 153, 153, 155, 155, 23, 71, 25, 23, 193, + /* 1710 */ 25, 193, 193, 193, 117, 193, 193, 193, 70, 193, + /* 1720 */ 193, 59, 193, 255, 255, 287, 78, 255, 243, 81, + /* 1730 */ 191, 255, 297, 71, 271, 100, 293, 245, 267, 214, + /* 1740 */ 246, 106, 107, 108, 246, 271, 98, 245, 293, 114, + /* 1750 */ 220, 116, 117, 118, 267, 271, 121, 271, 225, 219, + /* 1760 */ 229, 219, 100, 219, 259, 259, 259, 259, 106, 107, + /* 1770 */ 249, 196, 60, 280, 141, 243, 114, 249, 116, 117, + /* 1780 */ 118, 133, 245, 121, 200, 297, 138, 139, 153, 154, + /* 1790 */ 155, 156, 157, 297, 200, 38, 19, 20, 151, 22, + /* 1800 */ 200, 150, 140, 294, 294, 22, 272, 148, 250, 145, + /* 1810 */ 162, 270, 249, 36, 43, 153, 154, 155, 156, 157, + /* 1820 */ 234, 283, 250, 249, 18, 237, 237, 237, 237, 200, + /* 1830 */ 18, 199, 149, 246, 272, 234, 59, 272, 246, 270, + /* 1840 */ 234, 200, 246, 290, 199, 158, 62, 22, 71, 246, + /* 1850 */ 289, 221, 200, 200, 199, 199, 19, 20, 200, 22, + /* 1860 */ 199, 221, 115, 64, 218, 227, 22, 218, 218, 126, + /* 1870 */ 227, 165, 312, 36, 24, 305, 113, 100, 200, 91, + /* 1880 */ 82, 224, 22, 106, 107, 218, 221, 224, 220, 218, + /* 1890 */ 218, 114, 218, 116, 117, 118, 59, 282, 121, 282, + /* 1900 */ 221, 265, 317, 277, 265, 200, 158, 279, 71, 147, + /* 1910 */ 317, 146, 25, 202, 13, 194, 250, 140, 249, 194, + /* 1920 */ 250, 248, 247, 246, 6, 192, 192, 192, 303, 207, + /* 1930 */ 153, 154, 155, 156, 157, 222, 213, 100, 207, 213, + /* 1940 */ 303, 213, 300, 106, 107, 213, 222, 207, 4, 3, + /* 1950 */ 214, 114, 214, 116, 117, 118, 22, 213, 121, 163, + /* 1960 */ 15, 23, 23, 16, 151, 139, 130, 25, 144, 142, + /* 1970 */ 20, 16, 24, 1, 142, 130, 130, 61, 37, 151, + /* 1980 */ 53, 53, 53, 53, 130, 116, 34, 1, 141, 5, + /* 1990 */ 153, 154, 155, 156, 157, 22, 115, 75, 41, 161, + /* 2000 */ 68, 25, 141, 68, 115, 24, 131, 20, 19, 125, + /* 2010 */ 22, 96, 22, 22, 37, 23, 67, 22, 67, 59, + /* 2020 */ 24, 28, 22, 67, 23, 149, 22, 25, 23, 23, + /* 2030 */ 23, 23, 22, 97, 141, 23, 23, 116, 22, 143, + /* 2040 */ 25, 88, 75, 34, 86, 34, 75, 44, 34, 23, + /* 2050 */ 22, 34, 34, 34, 93, 24, 34, 25, 25, 142, + /* 2060 */ 23, 142, 23, 23, 23, 23, 11, 23, 25, 22, + /* 2070 */ 22, 22, 135, 23, 23, 22, 22, 25, 15, 23, + /* 2080 */ 1, 141, 25, 1, 141, 319, 319, 319, 319, 319, + /* 2090 */ 319, 319, 319, 319, 319, 141, 319, 319, 319, 319, + /* 2100 */ 141, 319, 319, 319, 319, 319, 319, 319, 319, 319, /* 2110 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, /* 2120 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, /* 2130 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, @@ -162433,177 +166348,180 @@ static const YYCODETYPE yy_lookahead[] = { /* 2220 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, /* 2230 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, /* 2240 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, - /* 2250 */ 319, 319, 319, 319, 319, 319, + /* 2250 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, + /* 2260 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, + /* 2270 */ 319, 319, 319, 319, 319, 319, 319, 319, 319, 319, + /* 2280 */ 319, 319, 319, 319, 319, 319, }; -#define YY_SHIFT_COUNT (577) +#define YY_SHIFT_COUNT (579) #define YY_SHIFT_MIN (0) -#define YY_SHIFT_MAX (2046) +#define YY_SHIFT_MAX (2082) static const unsigned short int yy_shift_ofst[] = { - /* 0 */ 1648, 1477, 1272, 322, 322, 262, 1319, 1478, 1491, 1662, - /* 10 */ 1662, 1662, 317, 0, 0, 214, 1093, 1662, 1662, 1662, - /* 20 */ 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, - /* 30 */ 271, 271, 1219, 1219, 216, 88, 262, 262, 262, 262, - /* 40 */ 262, 40, 111, 258, 361, 469, 512, 583, 622, 693, + /* 0 */ 1648, 1477, 1272, 322, 322, 1, 1319, 1478, 1491, 1837, + /* 10 */ 1837, 1837, 471, 0, 0, 214, 1093, 1837, 1837, 1837, + /* 20 */ 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, + /* 30 */ 271, 271, 1219, 1219, 216, 88, 1, 1, 1, 1, + /* 40 */ 1, 40, 111, 258, 361, 469, 512, 583, 622, 693, /* 50 */ 732, 803, 842, 913, 1073, 1093, 1093, 1093, 1093, 1093, /* 60 */ 1093, 1093, 1093, 1093, 1093, 1093, 1093, 1093, 1093, 1093, /* 70 */ 1093, 1093, 1093, 1113, 1093, 1216, 957, 957, 1635, 1662, - /* 80 */ 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, - /* 90 */ 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, - /* 100 */ 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, - /* 110 */ 1662, 1662, 1662, 1662, 1777, 1662, 1662, 1662, 1662, 1662, - /* 120 */ 1662, 1662, 1662, 1662, 1662, 1662, 1662, 1662, 137, 181, - /* 130 */ 181, 181, 181, 181, 94, 430, 66, 65, 112, 366, - /* 140 */ 475, 475, 629, 1058, 475, 475, 125, 125, 475, 686, - /* 150 */ 686, 686, 660, 686, 57, 184, 184, 77, 77, 2071, - /* 160 */ 2071, 328, 328, 328, 493, 373, 373, 373, 373, 1015, - /* 170 */ 1015, 409, 366, 1129, 1149, 475, 475, 475, 475, 475, - /* 180 */ 475, 475, 475, 475, 475, 475, 475, 475, 475, 475, - /* 190 */ 475, 475, 475, 475, 475, 621, 621, 475, 852, 899, - /* 200 */ 899, 1295, 1295, 406, 851, 2071, 2071, 2071, 2071, 2071, - /* 210 */ 2071, 2071, 1307, 954, 954, 640, 464, 695, 238, 700, - /* 220 */ 538, 541, 748, 475, 475, 475, 475, 475, 475, 475, - /* 230 */ 475, 475, 475, 634, 475, 475, 475, 475, 475, 475, - /* 240 */ 475, 475, 475, 475, 475, 475, 1175, 1175, 1175, 475, - /* 250 */ 475, 475, 580, 475, 475, 475, 1074, 1142, 475, 475, - /* 260 */ 1072, 475, 475, 475, 475, 475, 475, 475, 475, 797, - /* 270 */ 1330, 740, 1131, 1131, 1131, 1131, 1069, 740, 740, 1209, - /* 280 */ 167, 926, 1391, 1038, 1314, 187, 1408, 1314, 1408, 1435, - /* 290 */ 1109, 1038, 1038, 1109, 1038, 187, 1435, 227, 1090, 941, - /* 300 */ 1270, 1270, 1270, 1408, 1256, 1256, 1326, 1440, 513, 1461, - /* 310 */ 1685, 1685, 1613, 1613, 1722, 1722, 1613, 1612, 1614, 1745, - /* 320 */ 1623, 1628, 1731, 1623, 1628, 1775, 1775, 1775, 1775, 1613, - /* 330 */ 1780, 1656, 1614, 1614, 1656, 1745, 1731, 1656, 1731, 1656, - /* 340 */ 1613, 1780, 1666, 1763, 1613, 1780, 1805, 1613, 1780, 1613, - /* 350 */ 1780, 1805, 1719, 1719, 1719, 1778, 1819, 1819, 1805, 1719, - /* 360 */ 1718, 1719, 1778, 1719, 1719, 1680, 1822, 1734, 1734, 1805, - /* 370 */ 1706, 1749, 1706, 1749, 1706, 1749, 1706, 1749, 1613, 1782, - /* 380 */ 1782, 1792, 1792, 1623, 1628, 1853, 1613, 1724, 1623, 1733, - /* 390 */ 1739, 1656, 1862, 1877, 1877, 1891, 1891, 1891, 2071, 2071, - /* 400 */ 2071, 2071, 2071, 2071, 2071, 2071, 2071, 2071, 2071, 2071, - /* 410 */ 2071, 2071, 2071, 207, 915, 1010, 1030, 1217, 910, 1170, - /* 420 */ 1470, 1368, 1481, 1442, 1318, 1383, 1515, 1482, 1523, 1542, - /* 430 */ 1546, 1547, 1588, 1595, 1502, 1338, 1566, 1493, 1520, 1521, - /* 440 */ 1598, 1617, 1568, 1618, 1511, 1514, 1645, 1649, 1570, 1484, - /* 450 */ 1916, 1920, 1902, 1762, 1911, 1912, 1904, 1906, 1796, 1786, - /* 460 */ 1808, 1914, 1914, 1917, 1798, 1922, 1799, 1928, 1944, 1804, - /* 470 */ 1817, 1914, 1818, 1888, 1913, 1914, 1800, 1899, 1900, 1901, - /* 480 */ 1903, 1825, 1842, 1925, 1820, 1962, 1959, 1943, 1851, 1806, - /* 490 */ 1905, 1945, 1907, 1893, 1930, 1828, 1857, 1950, 1956, 1958, - /* 500 */ 1847, 1854, 1960, 1918, 1961, 1964, 1957, 1965, 1921, 1931, - /* 510 */ 1967, 1885, 1966, 1970, 1926, 1947, 1972, 1840, 1974, 1975, - /* 520 */ 1976, 1977, 1978, 1979, 1982, 1908, 1856, 1983, 1984, 1892, - /* 530 */ 1980, 1987, 1858, 1985, 1981, 1986, 1988, 1989, 1923, 1937, - /* 540 */ 1927, 1973, 1941, 1932, 1990, 1995, 1997, 2002, 1996, 2003, - /* 550 */ 1993, 2006, 1985, 2007, 2008, 2009, 2010, 2011, 2012, 2015, - /* 560 */ 2023, 2016, 2017, 2018, 2019, 2021, 2022, 2020, 1915, 1910, - /* 570 */ 1919, 1924, 1929, 2024, 2025, 2031, 2039, 2046, + /* 80 */ 1777, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, + /* 90 */ 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, + /* 100 */ 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, + /* 110 */ 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, + /* 120 */ 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, 1837, + /* 130 */ 137, 181, 181, 181, 181, 181, 181, 181, 94, 430, + /* 140 */ 66, 65, 112, 366, 533, 533, 740, 1261, 533, 533, + /* 150 */ 79, 79, 533, 412, 412, 412, 77, 412, 123, 113, + /* 160 */ 113, 22, 22, 2101, 2101, 328, 328, 328, 239, 468, + /* 170 */ 468, 468, 468, 1015, 1015, 409, 366, 1129, 1186, 533, + /* 180 */ 533, 533, 533, 533, 533, 533, 533, 533, 533, 533, + /* 190 */ 533, 533, 533, 533, 533, 533, 533, 533, 533, 969, + /* 200 */ 621, 621, 533, 642, 788, 788, 1228, 1228, 822, 822, + /* 210 */ 67, 1274, 2101, 2101, 2101, 2101, 2101, 2101, 2101, 1307, + /* 220 */ 954, 954, 585, 472, 640, 387, 695, 538, 541, 700, + /* 230 */ 533, 533, 533, 533, 533, 533, 533, 533, 533, 533, + /* 240 */ 222, 533, 533, 533, 533, 533, 533, 533, 533, 533, + /* 250 */ 533, 533, 533, 1179, 1179, 1179, 533, 533, 533, 565, + /* 260 */ 533, 533, 533, 916, 1144, 533, 533, 1288, 533, 533, + /* 270 */ 533, 533, 533, 533, 533, 533, 639, 1330, 209, 1076, + /* 280 */ 1076, 1076, 1076, 580, 209, 209, 1313, 768, 917, 649, + /* 290 */ 1181, 1316, 405, 1316, 1238, 249, 1181, 1181, 249, 1181, + /* 300 */ 405, 1238, 1369, 464, 1259, 1012, 1012, 1012, 1368, 1368, + /* 310 */ 1368, 1368, 184, 184, 1326, 904, 1287, 1480, 1712, 1712, + /* 320 */ 1633, 1633, 1757, 1757, 1633, 1647, 1651, 1783, 1659, 1664, + /* 330 */ 1771, 1659, 1664, 1806, 1806, 1806, 1806, 1633, 1812, 1683, + /* 340 */ 1651, 1651, 1683, 1783, 1771, 1683, 1771, 1683, 1633, 1812, + /* 350 */ 1687, 1784, 1633, 1812, 1825, 1633, 1812, 1633, 1812, 1825, + /* 360 */ 1747, 1747, 1747, 1799, 1844, 1844, 1825, 1747, 1743, 1747, + /* 370 */ 1799, 1747, 1747, 1706, 1850, 1763, 1763, 1825, 1633, 1788, + /* 380 */ 1788, 1798, 1798, 1659, 1664, 1860, 1633, 1748, 1659, 1762, + /* 390 */ 1765, 1683, 1887, 1901, 1901, 1918, 1918, 1918, 2101, 2101, + /* 400 */ 2101, 2101, 2101, 2101, 2101, 2101, 2101, 2101, 2101, 2101, + /* 410 */ 2101, 2101, 2101, 207, 1095, 331, 620, 903, 806, 1074, + /* 420 */ 1483, 1432, 1481, 1322, 1370, 1394, 1515, 1291, 1546, 1547, + /* 430 */ 1557, 1595, 1598, 1599, 1434, 1453, 1618, 1462, 1567, 1489, + /* 440 */ 1644, 1654, 1616, 1660, 1548, 1549, 1682, 1685, 1597, 742, + /* 450 */ 1944, 1946, 1934, 1796, 1945, 1947, 1938, 1939, 1826, 1813, + /* 460 */ 1836, 1942, 1942, 1948, 1827, 1950, 1824, 1955, 1972, 1832, + /* 470 */ 1845, 1942, 1846, 1916, 1941, 1942, 1828, 1927, 1928, 1929, + /* 480 */ 1930, 1854, 1869, 1952, 1847, 1986, 1984, 1973, 1881, 1838, + /* 490 */ 1932, 1976, 1935, 1922, 1957, 1861, 1889, 1981, 1987, 1989, + /* 500 */ 1875, 1884, 1988, 1949, 1990, 1991, 1992, 1995, 1951, 1960, + /* 510 */ 1996, 1915, 1993, 2000, 1956, 1977, 2001, 1876, 2004, 2005, + /* 520 */ 2006, 2007, 2002, 2008, 2010, 1936, 1893, 2012, 2013, 1921, + /* 530 */ 2009, 2016, 1896, 2015, 2011, 2014, 2017, 2018, 1953, 1967, + /* 540 */ 1958, 2003, 1971, 1961, 2019, 2026, 2028, 2031, 2032, 2033, + /* 550 */ 2022, 1917, 1919, 2037, 2015, 2039, 2040, 2041, 2042, 2043, + /* 560 */ 2044, 2047, 2055, 2048, 2049, 2050, 2051, 2053, 2054, 2052, + /* 570 */ 1937, 1940, 1943, 1954, 1959, 2057, 2056, 2063, 2079, 2082, }; #define YY_REDUCE_COUNT (412) -#define YY_REDUCE_MIN (-267) -#define YY_REDUCE_MAX (1716) +#define YY_REDUCE_MIN (-271) +#define YY_REDUCE_MAX (1744) static const short yy_reduce_ofst[] = { /* 0 */ -125, 733, 789, 241, 293, -123, -193, -191, -183, -187, - /* 10 */ -180, 83, 133, -207, -198, -267, -175, -6, 166, 313, - /* 20 */ 487, 396, 489, 598, 615, 685, 687, 79, 781, 857, - /* 30 */ 490, 616, 240, 334, -188, 796, 841, 843, 1003, 1005, - /* 40 */ 1007, -260, -260, -260, -260, -260, -260, -260, -260, -260, - /* 50 */ -260, -260, -260, -260, -260, -260, -260, -260, -260, -260, - /* 60 */ -260, -260, -260, -260, -260, -260, -260, -260, -260, -260, - /* 70 */ -260, -260, -260, -260, -260, -260, -260, -260, 158, 203, - /* 80 */ 391, 576, 724, 726, 886, 1021, 1035, 1063, 1081, 1083, - /* 90 */ 1097, 1099, 1117, 1152, 1155, 1158, 1163, 1165, 1167, 1169, - /* 100 */ 1172, 1180, 1183, 1198, 1200, 1205, 1215, 1225, 1227, 1236, - /* 110 */ 1252, 1264, 1299, 1303, 1306, 1309, 1312, 1315, 1325, 1328, - /* 120 */ 1337, 1340, 1343, 1371, 1373, 1384, 1386, 1411, -260, -260, - /* 130 */ -260, -260, -260, -260, -260, -260, -260, -53, 138, 302, - /* 140 */ -158, 357, 223, -222, 411, 458, -92, 556, 669, 581, - /* 150 */ 632, 581, -260, 632, 758, 778, 920, -260, -260, -260, - /* 160 */ -260, 161, 161, 161, 307, 234, 392, 526, 790, 195, - /* 170 */ 359, -174, -173, 362, 362, -189, 16, 560, 567, 261, - /* 180 */ 689, 802, 853, -122, -166, 408, 335, 617, 690, 837, - /* 190 */ 1001, 746, 1061, 515, 1082, 994, 1034, -135, 1000, 1048, - /* 200 */ 1137, 877, 897, 186, 627, 1031, 1133, 1148, 1159, 1194, - /* 210 */ 1199, 1195, -194, -142, 18, -152, 68, 201, 253, 269, - /* 220 */ 294, 354, 521, 528, 676, 680, 736, 743, 850, 907, - /* 230 */ 1041, 1047, 1060, 727, 1139, 1147, 1201, 1237, 1278, 1359, - /* 240 */ 1393, 1400, 1413, 1429, 1433, 1437, 1126, 1410, 1430, 1444, - /* 250 */ 1480, 1483, 1405, 1486, 1490, 1492, 1420, 1372, 1496, 1498, - /* 260 */ 1441, 1499, 253, 1500, 1503, 1504, 1506, 1507, 1508, 1398, - /* 270 */ 1415, 1453, 1448, 1449, 1450, 1452, 1405, 1453, 1453, 1465, - /* 280 */ 1495, 1519, 1414, 1443, 1445, 1468, 1456, 1455, 1457, 1424, - /* 290 */ 1473, 1454, 1459, 1474, 1460, 1479, 1434, 1512, 1494, 1509, - /* 300 */ 1517, 1518, 1525, 1469, 1489, 1501, 1467, 1510, 1497, 1543, - /* 310 */ 1451, 1462, 1557, 1558, 1471, 1472, 1561, 1487, 1505, 1524, - /* 320 */ 1522, 1526, 1548, 1534, 1536, 1563, 1564, 1565, 1567, 1603, - /* 330 */ 1596, 1560, 1535, 1537, 1562, 1541, 1578, 1574, 1580, 1575, - /* 340 */ 1622, 1624, 1538, 1540, 1626, 1631, 1610, 1632, 1634, 1637, - /* 350 */ 1636, 1619, 1620, 1621, 1625, 1627, 1629, 1633, 1630, 1638, - /* 360 */ 1639, 1640, 1641, 1642, 1643, 1550, 1544, 1573, 1581, 1644, - /* 370 */ 1591, 1606, 1608, 1607, 1609, 1611, 1615, 1646, 1672, 1569, - /* 380 */ 1571, 1616, 1647, 1650, 1652, 1601, 1679, 1657, 1653, 1651, - /* 390 */ 1655, 1658, 1687, 1698, 1702, 1713, 1715, 1716, 1654, 1659, - /* 400 */ 1660, 1703, 1696, 1700, 1701, 1704, 1708, 1689, 1694, 1705, - /* 410 */ 1707, 1709, 1711, + /* 10 */ 166, 238, 133, -207, -199, -267, -176, -6, 204, 489, + /* 20 */ 576, -175, 598, 686, 615, 725, 860, 778, 781, 857, + /* 30 */ 616, 887, 87, 240, -192, 408, 626, 796, 843, 854, + /* 40 */ 1003, -271, -271, -271, -271, -271, -271, -271, -271, -271, + /* 50 */ -271, -271, -271, -271, -271, -271, -271, -271, -271, -271, + /* 60 */ -271, -271, -271, -271, -271, -271, -271, -271, -271, -271, + /* 70 */ -271, -271, -271, -271, -271, -271, -271, -271, 80, 83, + /* 80 */ 313, 886, 888, 996, 1034, 1059, 1081, 1100, 1117, 1152, + /* 90 */ 1155, 1163, 1165, 1167, 1169, 1172, 1180, 1182, 1184, 1198, + /* 100 */ 1200, 1213, 1215, 1225, 1227, 1252, 1254, 1264, 1299, 1303, + /* 110 */ 1308, 1312, 1325, 1328, 1337, 1340, 1343, 1371, 1373, 1384, + /* 120 */ 1386, 1411, 1420, 1424, 1426, 1458, 1470, 1473, 1475, 1479, + /* 130 */ -271, -271, -271, -271, -271, -271, -271, -271, -271, -271, + /* 140 */ -271, 138, 459, 396, -158, 470, 302, -212, 521, 201, + /* 150 */ -195, -92, 559, 630, 632, 630, -271, 632, 901, 63, + /* 160 */ 407, -271, -271, -271, -271, 161, 161, 161, 251, 335, + /* 170 */ 847, 960, 980, 537, 588, 618, 628, 688, 688, -166, + /* 180 */ -161, 674, 790, 794, 799, 851, 852, -122, 680, -120, + /* 190 */ 995, 1038, 415, 1051, 893, 798, 962, 400, 1086, 779, + /* 200 */ 923, 924, 263, 1041, 979, 990, 1083, 1097, 1031, 1194, + /* 210 */ 362, 994, 1139, 1005, 1037, 1202, 1205, 1195, 1210, -194, + /* 220 */ 56, 185, -135, 232, 522, 560, 601, 617, 669, 683, + /* 230 */ 711, 856, 908, 941, 1048, 1101, 1147, 1257, 1262, 1265, + /* 240 */ 392, 1292, 1333, 1339, 1342, 1346, 1350, 1359, 1374, 1418, + /* 250 */ 1421, 1436, 1437, 593, 755, 770, 997, 1445, 1459, 1209, + /* 260 */ 1500, 1504, 1516, 1132, 1243, 1518, 1519, 1440, 1520, 560, + /* 270 */ 1522, 1523, 1524, 1526, 1527, 1529, 1382, 1438, 1431, 1468, + /* 280 */ 1469, 1472, 1476, 1209, 1431, 1431, 1485, 1525, 1539, 1435, + /* 290 */ 1463, 1471, 1492, 1487, 1443, 1494, 1474, 1484, 1498, 1486, + /* 300 */ 1502, 1455, 1530, 1531, 1533, 1540, 1542, 1544, 1505, 1506, + /* 310 */ 1507, 1508, 1521, 1528, 1493, 1537, 1532, 1575, 1488, 1496, + /* 320 */ 1584, 1594, 1509, 1510, 1600, 1538, 1534, 1541, 1558, 1563, + /* 330 */ 1586, 1572, 1574, 1588, 1589, 1590, 1591, 1629, 1632, 1587, + /* 340 */ 1562, 1565, 1592, 1569, 1601, 1596, 1606, 1603, 1641, 1645, + /* 350 */ 1553, 1561, 1652, 1655, 1630, 1653, 1656, 1658, 1661, 1640, + /* 360 */ 1646, 1649, 1650, 1638, 1657, 1663, 1665, 1667, 1668, 1671, + /* 370 */ 1643, 1672, 1674, 1560, 1570, 1615, 1617, 1679, 1678, 1585, + /* 380 */ 1593, 1636, 1639, 1666, 1669, 1626, 1705, 1628, 1670, 1673, + /* 390 */ 1675, 1677, 1711, 1721, 1725, 1733, 1734, 1735, 1625, 1637, + /* 400 */ 1642, 1722, 1723, 1726, 1728, 1732, 1731, 1713, 1724, 1736, + /* 410 */ 1738, 1744, 1740, }; static const YYACTIONTYPE yy_default[] = { - /* 0 */ 1641, 1641, 1641, 1470, 1237, 1348, 1237, 1237, 1237, 1470, - /* 10 */ 1470, 1470, 1237, 1378, 1378, 1523, 1270, 1237, 1237, 1237, - /* 20 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1469, 1237, 1237, - /* 30 */ 1237, 1237, 1558, 1558, 1237, 1237, 1237, 1237, 1237, 1237, - /* 40 */ 1237, 1237, 1387, 1237, 1394, 1237, 1237, 1237, 1237, 1237, - /* 50 */ 1471, 1472, 1237, 1237, 1237, 1522, 1524, 1487, 1401, 1400, - /* 60 */ 1399, 1398, 1505, 1365, 1392, 1385, 1389, 1465, 1466, 1464, - /* 70 */ 1468, 1472, 1471, 1237, 1388, 1435, 1449, 1434, 1237, 1237, - /* 80 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 90 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 100 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 110 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 120 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1443, 1448, - /* 130 */ 1455, 1447, 1444, 1437, 1436, 1438, 1439, 1237, 1237, 1261, - /* 140 */ 1237, 1237, 1258, 1312, 1237, 1237, 1237, 1237, 1237, 1542, - /* 150 */ 1541, 1237, 1440, 1237, 1270, 1429, 1428, 1452, 1441, 1451, - /* 160 */ 1450, 1530, 1594, 1593, 1488, 1237, 1237, 1237, 1237, 1237, - /* 170 */ 1237, 1558, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 180 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 190 */ 1237, 1237, 1237, 1237, 1237, 1558, 1558, 1237, 1270, 1558, - /* 200 */ 1558, 1266, 1266, 1372, 1237, 1537, 1339, 1339, 1339, 1339, - /* 210 */ 1348, 1339, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 220 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1527, 1525, 1237, - /* 230 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 240 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 250 */ 1237, 1237, 1237, 1237, 1237, 1237, 1344, 1237, 1237, 1237, - /* 260 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1587, 1237, - /* 270 */ 1500, 1326, 1344, 1344, 1344, 1344, 1346, 1327, 1325, 1338, - /* 280 */ 1271, 1244, 1633, 1404, 1393, 1345, 1367, 1393, 1367, 1630, - /* 290 */ 1391, 1404, 1404, 1391, 1404, 1345, 1630, 1287, 1610, 1282, - /* 300 */ 1378, 1378, 1378, 1367, 1372, 1372, 1467, 1345, 1338, 1237, - /* 310 */ 1633, 1633, 1353, 1353, 1632, 1632, 1353, 1488, 1617, 1413, - /* 320 */ 1386, 1372, 1315, 1386, 1372, 1321, 1321, 1321, 1321, 1353, - /* 330 */ 1255, 1391, 1617, 1617, 1391, 1413, 1315, 1391, 1315, 1391, - /* 340 */ 1353, 1255, 1504, 1627, 1353, 1255, 1478, 1353, 1255, 1353, - /* 350 */ 1255, 1478, 1313, 1313, 1313, 1302, 1237, 1237, 1478, 1313, - /* 360 */ 1287, 1313, 1302, 1313, 1313, 1576, 1237, 1482, 1482, 1478, - /* 370 */ 1371, 1366, 1371, 1366, 1371, 1366, 1371, 1366, 1353, 1568, - /* 380 */ 1568, 1381, 1381, 1386, 1372, 1473, 1353, 1237, 1386, 1384, - /* 390 */ 1382, 1391, 1305, 1590, 1590, 1586, 1586, 1586, 1638, 1638, - /* 400 */ 1537, 1603, 1270, 1270, 1270, 1270, 1603, 1289, 1289, 1271, - /* 410 */ 1271, 1270, 1603, 1237, 1237, 1237, 1237, 1237, 1237, 1598, - /* 420 */ 1237, 1532, 1489, 1357, 1237, 1237, 1237, 1237, 1237, 1237, - /* 430 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1543, 1237, - /* 440 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1418, - /* 450 */ 1237, 1240, 1534, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 460 */ 1237, 1395, 1396, 1358, 1237, 1237, 1237, 1237, 1237, 1237, - /* 470 */ 1237, 1410, 1237, 1237, 1237, 1405, 1237, 1237, 1237, 1237, - /* 480 */ 1237, 1237, 1237, 1237, 1629, 1237, 1237, 1237, 1237, 1237, - /* 490 */ 1237, 1503, 1502, 1237, 1237, 1355, 1237, 1237, 1237, 1237, - /* 500 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1285, - /* 510 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 520 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 530 */ 1237, 1237, 1237, 1383, 1237, 1237, 1237, 1237, 1237, 1237, - /* 540 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1573, 1373, - /* 550 */ 1237, 1237, 1620, 1237, 1237, 1237, 1237, 1237, 1237, 1237, - /* 560 */ 1237, 1237, 1237, 1237, 1237, 1237, 1237, 1614, 1329, 1420, - /* 570 */ 1237, 1419, 1423, 1259, 1237, 1249, 1237, 1237, + /* 0 */ 1651, 1651, 1651, 1479, 1244, 1355, 1244, 1244, 1244, 1479, + /* 10 */ 1479, 1479, 1244, 1385, 1385, 1532, 1277, 1244, 1244, 1244, + /* 20 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1478, 1244, 1244, + /* 30 */ 1244, 1244, 1567, 1567, 1244, 1244, 1244, 1244, 1244, 1244, + /* 40 */ 1244, 1244, 1394, 1244, 1401, 1244, 1244, 1244, 1244, 1244, + /* 50 */ 1480, 1481, 1244, 1244, 1244, 1531, 1533, 1496, 1408, 1407, + /* 60 */ 1406, 1405, 1514, 1373, 1399, 1392, 1396, 1474, 1475, 1473, + /* 70 */ 1477, 1481, 1480, 1244, 1395, 1442, 1458, 1441, 1244, 1244, + /* 80 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 90 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 100 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 110 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 120 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 130 */ 1450, 1457, 1456, 1455, 1464, 1454, 1451, 1444, 1443, 1445, + /* 140 */ 1446, 1244, 1244, 1268, 1244, 1244, 1265, 1319, 1244, 1244, + /* 150 */ 1244, 1244, 1244, 1551, 1550, 1244, 1447, 1244, 1277, 1436, + /* 160 */ 1435, 1461, 1448, 1460, 1459, 1539, 1603, 1602, 1497, 1244, + /* 170 */ 1244, 1244, 1244, 1244, 1244, 1567, 1244, 1244, 1244, 1244, + /* 180 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 190 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1375, + /* 200 */ 1567, 1567, 1244, 1277, 1567, 1567, 1376, 1376, 1273, 1273, + /* 210 */ 1379, 1244, 1546, 1346, 1346, 1346, 1346, 1355, 1346, 1244, + /* 220 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 230 */ 1244, 1244, 1244, 1244, 1536, 1534, 1244, 1244, 1244, 1244, + /* 240 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 250 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 260 */ 1244, 1244, 1244, 1351, 1244, 1244, 1244, 1244, 1244, 1244, + /* 270 */ 1244, 1244, 1244, 1244, 1244, 1596, 1244, 1509, 1333, 1351, + /* 280 */ 1351, 1351, 1351, 1353, 1334, 1332, 1345, 1278, 1251, 1643, + /* 290 */ 1411, 1400, 1352, 1400, 1640, 1398, 1411, 1411, 1398, 1411, + /* 300 */ 1352, 1640, 1294, 1619, 1289, 1385, 1385, 1385, 1375, 1375, + /* 310 */ 1375, 1375, 1379, 1379, 1476, 1352, 1345, 1244, 1643, 1643, + /* 320 */ 1361, 1361, 1642, 1642, 1361, 1497, 1627, 1420, 1393, 1379, + /* 330 */ 1322, 1393, 1379, 1328, 1328, 1328, 1328, 1361, 1262, 1398, + /* 340 */ 1627, 1627, 1398, 1420, 1322, 1398, 1322, 1398, 1361, 1262, + /* 350 */ 1513, 1637, 1361, 1262, 1487, 1361, 1262, 1361, 1262, 1487, + /* 360 */ 1320, 1320, 1320, 1309, 1244, 1244, 1487, 1320, 1294, 1320, + /* 370 */ 1309, 1320, 1320, 1585, 1244, 1491, 1491, 1487, 1361, 1577, + /* 380 */ 1577, 1388, 1388, 1393, 1379, 1482, 1361, 1244, 1393, 1391, + /* 390 */ 1389, 1398, 1312, 1599, 1599, 1595, 1595, 1595, 1648, 1648, + /* 400 */ 1546, 1612, 1277, 1277, 1277, 1277, 1612, 1296, 1296, 1278, + /* 410 */ 1278, 1277, 1612, 1244, 1244, 1244, 1244, 1244, 1244, 1607, + /* 420 */ 1244, 1541, 1498, 1365, 1244, 1244, 1244, 1244, 1244, 1244, + /* 430 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1552, 1244, + /* 440 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1425, + /* 450 */ 1244, 1247, 1543, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 460 */ 1244, 1402, 1403, 1366, 1244, 1244, 1244, 1244, 1244, 1244, + /* 470 */ 1244, 1417, 1244, 1244, 1244, 1412, 1244, 1244, 1244, 1244, + /* 480 */ 1244, 1244, 1244, 1244, 1639, 1244, 1244, 1244, 1244, 1244, + /* 490 */ 1244, 1512, 1511, 1244, 1244, 1363, 1244, 1244, 1244, 1244, + /* 500 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1292, + /* 510 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 520 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, + /* 530 */ 1244, 1244, 1244, 1390, 1244, 1244, 1244, 1244, 1244, 1244, + /* 540 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1582, 1380, + /* 550 */ 1244, 1244, 1244, 1244, 1630, 1244, 1244, 1244, 1244, 1244, + /* 560 */ 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1244, 1623, + /* 570 */ 1336, 1427, 1244, 1426, 1430, 1266, 1244, 1256, 1244, 1244, }; /********** End of lemon-generated parsing tables *****************************/ @@ -163155,12 +167073,12 @@ static const char *const yyTokenName[] = { /* 256 */ "seltablist", /* 257 */ "stl_prefix", /* 258 */ "joinop", - /* 259 */ "indexed_opt", - /* 260 */ "on_opt", - /* 261 */ "using_opt", - /* 262 */ "exprlist", - /* 263 */ "xfullname", - /* 264 */ "idlist", + /* 259 */ "on_using", + /* 260 */ "indexed_by", + /* 261 */ "exprlist", + /* 262 */ "xfullname", + /* 263 */ "idlist", + /* 264 */ "indexed_opt", /* 265 */ "nulls", /* 266 */ "with", /* 267 */ "where_opt_ret", @@ -163331,29 +167249,29 @@ static const char *const yyRuleName[] = { /* 106 */ "from ::= FROM seltablist", /* 107 */ "stl_prefix ::= seltablist joinop", /* 108 */ "stl_prefix ::=", - /* 109 */ "seltablist ::= stl_prefix nm dbnm as indexed_opt on_opt using_opt", - /* 110 */ "seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_opt using_opt", - /* 111 */ "seltablist ::= stl_prefix LP select RP as on_opt using_opt", - /* 112 */ "seltablist ::= stl_prefix LP seltablist RP as on_opt using_opt", - /* 113 */ "dbnm ::=", - /* 114 */ "dbnm ::= DOT nm", - /* 115 */ "fullname ::= nm", - /* 116 */ "fullname ::= nm DOT nm", - /* 117 */ "xfullname ::= nm", - /* 118 */ "xfullname ::= nm DOT nm", - /* 119 */ "xfullname ::= nm DOT nm AS nm", - /* 120 */ "xfullname ::= nm AS nm", - /* 121 */ "joinop ::= COMMA|JOIN", - /* 122 */ "joinop ::= JOIN_KW JOIN", - /* 123 */ "joinop ::= JOIN_KW nm JOIN", - /* 124 */ "joinop ::= JOIN_KW nm nm JOIN", - /* 125 */ "on_opt ::= ON expr", - /* 126 */ "on_opt ::=", - /* 127 */ "indexed_opt ::=", - /* 128 */ "indexed_opt ::= INDEXED BY nm", - /* 129 */ "indexed_opt ::= NOT INDEXED", - /* 130 */ "using_opt ::= USING LP idlist RP", - /* 131 */ "using_opt ::=", + /* 109 */ "seltablist ::= stl_prefix nm dbnm as on_using", + /* 110 */ "seltablist ::= stl_prefix nm dbnm as indexed_by on_using", + /* 111 */ "seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_using", + /* 112 */ "seltablist ::= stl_prefix LP select RP as on_using", + /* 113 */ "seltablist ::= stl_prefix LP seltablist RP as on_using", + /* 114 */ "dbnm ::=", + /* 115 */ "dbnm ::= DOT nm", + /* 116 */ "fullname ::= nm", + /* 117 */ "fullname ::= nm DOT nm", + /* 118 */ "xfullname ::= nm", + /* 119 */ "xfullname ::= nm DOT nm", + /* 120 */ "xfullname ::= nm DOT nm AS nm", + /* 121 */ "xfullname ::= nm AS nm", + /* 122 */ "joinop ::= COMMA|JOIN", + /* 123 */ "joinop ::= JOIN_KW JOIN", + /* 124 */ "joinop ::= JOIN_KW nm JOIN", + /* 125 */ "joinop ::= JOIN_KW nm nm JOIN", + /* 126 */ "on_using ::= ON expr", + /* 127 */ "on_using ::= USING LP idlist RP", + /* 128 */ "on_using ::=", + /* 129 */ "indexed_opt ::=", + /* 130 */ "indexed_by ::= INDEXED BY nm", + /* 131 */ "indexed_by ::= NOT INDEXED", /* 132 */ "orderby_opt ::=", /* 133 */ "orderby_opt ::= ORDER BY sortlist", /* 134 */ "sortlist ::= sortlist COMMA expr sortorder nulls", @@ -163431,199 +167349,202 @@ static const char *const yyRuleName[] = { /* 206 */ "expr ::= expr NOT NULL", /* 207 */ "expr ::= expr IS expr", /* 208 */ "expr ::= expr IS NOT expr", - /* 209 */ "expr ::= NOT expr", - /* 210 */ "expr ::= BITNOT expr", - /* 211 */ "expr ::= PLUS|MINUS expr", - /* 212 */ "expr ::= expr PTR expr", - /* 213 */ "between_op ::= BETWEEN", - /* 214 */ "between_op ::= NOT BETWEEN", - /* 215 */ "expr ::= expr between_op expr AND expr", - /* 216 */ "in_op ::= IN", - /* 217 */ "in_op ::= NOT IN", - /* 218 */ "expr ::= expr in_op LP exprlist RP", - /* 219 */ "expr ::= LP select RP", - /* 220 */ "expr ::= expr in_op LP select RP", - /* 221 */ "expr ::= expr in_op nm dbnm paren_exprlist", - /* 222 */ "expr ::= EXISTS LP select RP", - /* 223 */ "expr ::= CASE case_operand case_exprlist case_else END", - /* 224 */ "case_exprlist ::= case_exprlist WHEN expr THEN expr", - /* 225 */ "case_exprlist ::= WHEN expr THEN expr", - /* 226 */ "case_else ::= ELSE expr", - /* 227 */ "case_else ::=", - /* 228 */ "case_operand ::= expr", - /* 229 */ "case_operand ::=", - /* 230 */ "exprlist ::=", - /* 231 */ "nexprlist ::= nexprlist COMMA expr", - /* 232 */ "nexprlist ::= expr", - /* 233 */ "paren_exprlist ::=", - /* 234 */ "paren_exprlist ::= LP exprlist RP", - /* 235 */ "cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt", - /* 236 */ "uniqueflag ::= UNIQUE", - /* 237 */ "uniqueflag ::=", - /* 238 */ "eidlist_opt ::=", - /* 239 */ "eidlist_opt ::= LP eidlist RP", - /* 240 */ "eidlist ::= eidlist COMMA nm collate sortorder", - /* 241 */ "eidlist ::= nm collate sortorder", - /* 242 */ "collate ::=", - /* 243 */ "collate ::= COLLATE ID|STRING", - /* 244 */ "cmd ::= DROP INDEX ifexists fullname", - /* 245 */ "cmd ::= VACUUM vinto", - /* 246 */ "cmd ::= VACUUM nm vinto", - /* 247 */ "vinto ::= INTO expr", - /* 248 */ "vinto ::=", - /* 249 */ "cmd ::= PRAGMA nm dbnm", - /* 250 */ "cmd ::= PRAGMA nm dbnm EQ nmnum", - /* 251 */ "cmd ::= PRAGMA nm dbnm LP nmnum RP", - /* 252 */ "cmd ::= PRAGMA nm dbnm EQ minus_num", - /* 253 */ "cmd ::= PRAGMA nm dbnm LP minus_num RP", - /* 254 */ "plus_num ::= PLUS INTEGER|FLOAT", - /* 255 */ "minus_num ::= MINUS INTEGER|FLOAT", - /* 256 */ "cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END", - /* 257 */ "trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause", - /* 258 */ "trigger_time ::= BEFORE|AFTER", - /* 259 */ "trigger_time ::= INSTEAD OF", - /* 260 */ "trigger_time ::=", - /* 261 */ "trigger_event ::= DELETE|INSERT", - /* 262 */ "trigger_event ::= UPDATE", - /* 263 */ "trigger_event ::= UPDATE OF idlist", - /* 264 */ "when_clause ::=", - /* 265 */ "when_clause ::= WHEN expr", - /* 266 */ "trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI", - /* 267 */ "trigger_cmd_list ::= trigger_cmd SEMI", - /* 268 */ "trnm ::= nm DOT nm", - /* 269 */ "tridxby ::= INDEXED BY nm", - /* 270 */ "tridxby ::= NOT INDEXED", - /* 271 */ "trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt", - /* 272 */ "trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt", - /* 273 */ "trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt", - /* 274 */ "trigger_cmd ::= scanpt select scanpt", - /* 275 */ "expr ::= RAISE LP IGNORE RP", - /* 276 */ "expr ::= RAISE LP raisetype COMMA nm RP", - /* 277 */ "raisetype ::= ROLLBACK", - /* 278 */ "raisetype ::= ABORT", - /* 279 */ "raisetype ::= FAIL", - /* 280 */ "cmd ::= DROP TRIGGER ifexists fullname", - /* 281 */ "cmd ::= ATTACH database_kw_opt expr AS expr key_opt", - /* 282 */ "cmd ::= DETACH database_kw_opt expr", - /* 283 */ "key_opt ::=", - /* 284 */ "key_opt ::= KEY expr", - /* 285 */ "cmd ::= REINDEX", - /* 286 */ "cmd ::= REINDEX nm dbnm", - /* 287 */ "cmd ::= ANALYZE", - /* 288 */ "cmd ::= ANALYZE nm dbnm", - /* 289 */ "cmd ::= ALTER TABLE fullname RENAME TO nm", - /* 290 */ "cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist", - /* 291 */ "cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm", - /* 292 */ "add_column_fullname ::= fullname", - /* 293 */ "cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm", - /* 294 */ "cmd ::= create_vtab", - /* 295 */ "cmd ::= create_vtab LP vtabarglist RP", - /* 296 */ "create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm", - /* 297 */ "vtabarg ::=", - /* 298 */ "vtabargtoken ::= ANY", - /* 299 */ "vtabargtoken ::= lp anylist RP", - /* 300 */ "lp ::= LP", - /* 301 */ "with ::= WITH wqlist", - /* 302 */ "with ::= WITH RECURSIVE wqlist", - /* 303 */ "wqas ::= AS", - /* 304 */ "wqas ::= AS MATERIALIZED", - /* 305 */ "wqas ::= AS NOT MATERIALIZED", - /* 306 */ "wqitem ::= nm eidlist_opt wqas LP select RP", - /* 307 */ "wqlist ::= wqitem", - /* 308 */ "wqlist ::= wqlist COMMA wqitem", - /* 309 */ "windowdefn_list ::= windowdefn", - /* 310 */ "windowdefn_list ::= windowdefn_list COMMA windowdefn", - /* 311 */ "windowdefn ::= nm AS LP window RP", - /* 312 */ "window ::= PARTITION BY nexprlist orderby_opt frame_opt", - /* 313 */ "window ::= nm PARTITION BY nexprlist orderby_opt frame_opt", - /* 314 */ "window ::= ORDER BY sortlist frame_opt", - /* 315 */ "window ::= nm ORDER BY sortlist frame_opt", - /* 316 */ "window ::= frame_opt", - /* 317 */ "window ::= nm frame_opt", - /* 318 */ "frame_opt ::=", - /* 319 */ "frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt", - /* 320 */ "frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt", - /* 321 */ "range_or_rows ::= RANGE|ROWS|GROUPS", - /* 322 */ "frame_bound_s ::= frame_bound", - /* 323 */ "frame_bound_s ::= UNBOUNDED PRECEDING", - /* 324 */ "frame_bound_e ::= frame_bound", - /* 325 */ "frame_bound_e ::= UNBOUNDED FOLLOWING", - /* 326 */ "frame_bound ::= expr PRECEDING|FOLLOWING", - /* 327 */ "frame_bound ::= CURRENT ROW", - /* 328 */ "frame_exclude_opt ::=", - /* 329 */ "frame_exclude_opt ::= EXCLUDE frame_exclude", - /* 330 */ "frame_exclude ::= NO OTHERS", - /* 331 */ "frame_exclude ::= CURRENT ROW", - /* 332 */ "frame_exclude ::= GROUP|TIES", - /* 333 */ "window_clause ::= WINDOW windowdefn_list", - /* 334 */ "filter_over ::= filter_clause over_clause", - /* 335 */ "filter_over ::= over_clause", - /* 336 */ "filter_over ::= filter_clause", - /* 337 */ "over_clause ::= OVER LP window RP", - /* 338 */ "over_clause ::= OVER nm", - /* 339 */ "filter_clause ::= FILTER LP WHERE expr RP", - /* 340 */ "input ::= cmdlist", - /* 341 */ "cmdlist ::= cmdlist ecmd", - /* 342 */ "cmdlist ::= ecmd", - /* 343 */ "ecmd ::= SEMI", - /* 344 */ "ecmd ::= cmdx SEMI", - /* 345 */ "ecmd ::= explain cmdx SEMI", - /* 346 */ "trans_opt ::=", - /* 347 */ "trans_opt ::= TRANSACTION", - /* 348 */ "trans_opt ::= TRANSACTION nm", - /* 349 */ "savepoint_opt ::= SAVEPOINT", - /* 350 */ "savepoint_opt ::=", - /* 351 */ "cmd ::= create_table create_table_args", - /* 352 */ "table_option_set ::= table_option", - /* 353 */ "columnlist ::= columnlist COMMA columnname carglist", - /* 354 */ "columnlist ::= columnname carglist", - /* 355 */ "nm ::= ID|INDEXED", - /* 356 */ "nm ::= STRING", - /* 357 */ "nm ::= JOIN_KW", - /* 358 */ "typetoken ::= typename", - /* 359 */ "typename ::= ID|STRING", - /* 360 */ "signed ::= plus_num", - /* 361 */ "signed ::= minus_num", - /* 362 */ "carglist ::= carglist ccons", - /* 363 */ "carglist ::=", - /* 364 */ "ccons ::= NULL onconf", - /* 365 */ "ccons ::= GENERATED ALWAYS AS generated", - /* 366 */ "ccons ::= AS generated", - /* 367 */ "conslist_opt ::= COMMA conslist", - /* 368 */ "conslist ::= conslist tconscomma tcons", - /* 369 */ "conslist ::= tcons", - /* 370 */ "tconscomma ::=", - /* 371 */ "defer_subclause_opt ::= defer_subclause", - /* 372 */ "resolvetype ::= raisetype", - /* 373 */ "selectnowith ::= oneselect", - /* 374 */ "oneselect ::= values", - /* 375 */ "sclp ::= selcollist COMMA", - /* 376 */ "as ::= ID|STRING", - /* 377 */ "returning ::=", - /* 378 */ "expr ::= term", - /* 379 */ "likeop ::= LIKE_KW|MATCH", - /* 380 */ "exprlist ::= nexprlist", - /* 381 */ "nmnum ::= plus_num", - /* 382 */ "nmnum ::= nm", - /* 383 */ "nmnum ::= ON", - /* 384 */ "nmnum ::= DELETE", - /* 385 */ "nmnum ::= DEFAULT", - /* 386 */ "plus_num ::= INTEGER|FLOAT", - /* 387 */ "foreach_clause ::=", - /* 388 */ "foreach_clause ::= FOR EACH ROW", - /* 389 */ "trnm ::= nm", - /* 390 */ "tridxby ::=", - /* 391 */ "database_kw_opt ::= DATABASE", - /* 392 */ "database_kw_opt ::=", - /* 393 */ "kwcolumn_opt ::=", - /* 394 */ "kwcolumn_opt ::= COLUMNKW", - /* 395 */ "vtabarglist ::= vtabarg", - /* 396 */ "vtabarglist ::= vtabarglist COMMA vtabarg", - /* 397 */ "vtabarg ::= vtabarg vtabargtoken", - /* 398 */ "anylist ::=", - /* 399 */ "anylist ::= anylist LP anylist RP", - /* 400 */ "anylist ::= anylist ANY", - /* 401 */ "with ::=", + /* 209 */ "expr ::= expr IS NOT DISTINCT FROM expr", + /* 210 */ "expr ::= expr IS DISTINCT FROM expr", + /* 211 */ "expr ::= NOT expr", + /* 212 */ "expr ::= BITNOT expr", + /* 213 */ "expr ::= PLUS|MINUS expr", + /* 214 */ "expr ::= expr PTR expr", + /* 215 */ "between_op ::= BETWEEN", + /* 216 */ "between_op ::= NOT BETWEEN", + /* 217 */ "expr ::= expr between_op expr AND expr", + /* 218 */ "in_op ::= IN", + /* 219 */ "in_op ::= NOT IN", + /* 220 */ "expr ::= expr in_op LP exprlist RP", + /* 221 */ "expr ::= LP select RP", + /* 222 */ "expr ::= expr in_op LP select RP", + /* 223 */ "expr ::= expr in_op nm dbnm paren_exprlist", + /* 224 */ "expr ::= EXISTS LP select RP", + /* 225 */ "expr ::= CASE case_operand case_exprlist case_else END", + /* 226 */ "case_exprlist ::= case_exprlist WHEN expr THEN expr", + /* 227 */ "case_exprlist ::= WHEN expr THEN expr", + /* 228 */ "case_else ::= ELSE expr", + /* 229 */ "case_else ::=", + /* 230 */ "case_operand ::= expr", + /* 231 */ "case_operand ::=", + /* 232 */ "exprlist ::=", + /* 233 */ "nexprlist ::= nexprlist COMMA expr", + /* 234 */ "nexprlist ::= expr", + /* 235 */ "paren_exprlist ::=", + /* 236 */ "paren_exprlist ::= LP exprlist RP", + /* 237 */ "cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt", + /* 238 */ "uniqueflag ::= UNIQUE", + /* 239 */ "uniqueflag ::=", + /* 240 */ "eidlist_opt ::=", + /* 241 */ "eidlist_opt ::= LP eidlist RP", + /* 242 */ "eidlist ::= eidlist COMMA nm collate sortorder", + /* 243 */ "eidlist ::= nm collate sortorder", + /* 244 */ "collate ::=", + /* 245 */ "collate ::= COLLATE ID|STRING", + /* 246 */ "cmd ::= DROP INDEX ifexists fullname", + /* 247 */ "cmd ::= VACUUM vinto", + /* 248 */ "cmd ::= VACUUM nm vinto", + /* 249 */ "vinto ::= INTO expr", + /* 250 */ "vinto ::=", + /* 251 */ "cmd ::= PRAGMA nm dbnm", + /* 252 */ "cmd ::= PRAGMA nm dbnm EQ nmnum", + /* 253 */ "cmd ::= PRAGMA nm dbnm LP nmnum RP", + /* 254 */ "cmd ::= PRAGMA nm dbnm EQ minus_num", + /* 255 */ "cmd ::= PRAGMA nm dbnm LP minus_num RP", + /* 256 */ "plus_num ::= PLUS INTEGER|FLOAT", + /* 257 */ "minus_num ::= MINUS INTEGER|FLOAT", + /* 258 */ "cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END", + /* 259 */ "trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause", + /* 260 */ "trigger_time ::= BEFORE|AFTER", + /* 261 */ "trigger_time ::= INSTEAD OF", + /* 262 */ "trigger_time ::=", + /* 263 */ "trigger_event ::= DELETE|INSERT", + /* 264 */ "trigger_event ::= UPDATE", + /* 265 */ "trigger_event ::= UPDATE OF idlist", + /* 266 */ "when_clause ::=", + /* 267 */ "when_clause ::= WHEN expr", + /* 268 */ "trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI", + /* 269 */ "trigger_cmd_list ::= trigger_cmd SEMI", + /* 270 */ "trnm ::= nm DOT nm", + /* 271 */ "tridxby ::= INDEXED BY nm", + /* 272 */ "tridxby ::= NOT INDEXED", + /* 273 */ "trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt", + /* 274 */ "trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt", + /* 275 */ "trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt", + /* 276 */ "trigger_cmd ::= scanpt select scanpt", + /* 277 */ "expr ::= RAISE LP IGNORE RP", + /* 278 */ "expr ::= RAISE LP raisetype COMMA nm RP", + /* 279 */ "raisetype ::= ROLLBACK", + /* 280 */ "raisetype ::= ABORT", + /* 281 */ "raisetype ::= FAIL", + /* 282 */ "cmd ::= DROP TRIGGER ifexists fullname", + /* 283 */ "cmd ::= ATTACH database_kw_opt expr AS expr key_opt", + /* 284 */ "cmd ::= DETACH database_kw_opt expr", + /* 285 */ "key_opt ::=", + /* 286 */ "key_opt ::= KEY expr", + /* 287 */ "cmd ::= REINDEX", + /* 288 */ "cmd ::= REINDEX nm dbnm", + /* 289 */ "cmd ::= ANALYZE", + /* 290 */ "cmd ::= ANALYZE nm dbnm", + /* 291 */ "cmd ::= ALTER TABLE fullname RENAME TO nm", + /* 292 */ "cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist", + /* 293 */ "cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm", + /* 294 */ "add_column_fullname ::= fullname", + /* 295 */ "cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm", + /* 296 */ "cmd ::= create_vtab", + /* 297 */ "cmd ::= create_vtab LP vtabarglist RP", + /* 298 */ "create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm", + /* 299 */ "vtabarg ::=", + /* 300 */ "vtabargtoken ::= ANY", + /* 301 */ "vtabargtoken ::= lp anylist RP", + /* 302 */ "lp ::= LP", + /* 303 */ "with ::= WITH wqlist", + /* 304 */ "with ::= WITH RECURSIVE wqlist", + /* 305 */ "wqas ::= AS", + /* 306 */ "wqas ::= AS MATERIALIZED", + /* 307 */ "wqas ::= AS NOT MATERIALIZED", + /* 308 */ "wqitem ::= nm eidlist_opt wqas LP select RP", + /* 309 */ "wqlist ::= wqitem", + /* 310 */ "wqlist ::= wqlist COMMA wqitem", + /* 311 */ "windowdefn_list ::= windowdefn", + /* 312 */ "windowdefn_list ::= windowdefn_list COMMA windowdefn", + /* 313 */ "windowdefn ::= nm AS LP window RP", + /* 314 */ "window ::= PARTITION BY nexprlist orderby_opt frame_opt", + /* 315 */ "window ::= nm PARTITION BY nexprlist orderby_opt frame_opt", + /* 316 */ "window ::= ORDER BY sortlist frame_opt", + /* 317 */ "window ::= nm ORDER BY sortlist frame_opt", + /* 318 */ "window ::= frame_opt", + /* 319 */ "window ::= nm frame_opt", + /* 320 */ "frame_opt ::=", + /* 321 */ "frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt", + /* 322 */ "frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt", + /* 323 */ "range_or_rows ::= RANGE|ROWS|GROUPS", + /* 324 */ "frame_bound_s ::= frame_bound", + /* 325 */ "frame_bound_s ::= UNBOUNDED PRECEDING", + /* 326 */ "frame_bound_e ::= frame_bound", + /* 327 */ "frame_bound_e ::= UNBOUNDED FOLLOWING", + /* 328 */ "frame_bound ::= expr PRECEDING|FOLLOWING", + /* 329 */ "frame_bound ::= CURRENT ROW", + /* 330 */ "frame_exclude_opt ::=", + /* 331 */ "frame_exclude_opt ::= EXCLUDE frame_exclude", + /* 332 */ "frame_exclude ::= NO OTHERS", + /* 333 */ "frame_exclude ::= CURRENT ROW", + /* 334 */ "frame_exclude ::= GROUP|TIES", + /* 335 */ "window_clause ::= WINDOW windowdefn_list", + /* 336 */ "filter_over ::= filter_clause over_clause", + /* 337 */ "filter_over ::= over_clause", + /* 338 */ "filter_over ::= filter_clause", + /* 339 */ "over_clause ::= OVER LP window RP", + /* 340 */ "over_clause ::= OVER nm", + /* 341 */ "filter_clause ::= FILTER LP WHERE expr RP", + /* 342 */ "input ::= cmdlist", + /* 343 */ "cmdlist ::= cmdlist ecmd", + /* 344 */ "cmdlist ::= ecmd", + /* 345 */ "ecmd ::= SEMI", + /* 346 */ "ecmd ::= cmdx SEMI", + /* 347 */ "ecmd ::= explain cmdx SEMI", + /* 348 */ "trans_opt ::=", + /* 349 */ "trans_opt ::= TRANSACTION", + /* 350 */ "trans_opt ::= TRANSACTION nm", + /* 351 */ "savepoint_opt ::= SAVEPOINT", + /* 352 */ "savepoint_opt ::=", + /* 353 */ "cmd ::= create_table create_table_args", + /* 354 */ "table_option_set ::= table_option", + /* 355 */ "columnlist ::= columnlist COMMA columnname carglist", + /* 356 */ "columnlist ::= columnname carglist", + /* 357 */ "nm ::= ID|INDEXED", + /* 358 */ "nm ::= STRING", + /* 359 */ "nm ::= JOIN_KW", + /* 360 */ "typetoken ::= typename", + /* 361 */ "typename ::= ID|STRING", + /* 362 */ "signed ::= plus_num", + /* 363 */ "signed ::= minus_num", + /* 364 */ "carglist ::= carglist ccons", + /* 365 */ "carglist ::=", + /* 366 */ "ccons ::= NULL onconf", + /* 367 */ "ccons ::= GENERATED ALWAYS AS generated", + /* 368 */ "ccons ::= AS generated", + /* 369 */ "conslist_opt ::= COMMA conslist", + /* 370 */ "conslist ::= conslist tconscomma tcons", + /* 371 */ "conslist ::= tcons", + /* 372 */ "tconscomma ::=", + /* 373 */ "defer_subclause_opt ::= defer_subclause", + /* 374 */ "resolvetype ::= raisetype", + /* 375 */ "selectnowith ::= oneselect", + /* 376 */ "oneselect ::= values", + /* 377 */ "sclp ::= selcollist COMMA", + /* 378 */ "as ::= ID|STRING", + /* 379 */ "indexed_opt ::= indexed_by", + /* 380 */ "returning ::=", + /* 381 */ "expr ::= term", + /* 382 */ "likeop ::= LIKE_KW|MATCH", + /* 383 */ "exprlist ::= nexprlist", + /* 384 */ "nmnum ::= plus_num", + /* 385 */ "nmnum ::= nm", + /* 386 */ "nmnum ::= ON", + /* 387 */ "nmnum ::= DELETE", + /* 388 */ "nmnum ::= DEFAULT", + /* 389 */ "plus_num ::= INTEGER|FLOAT", + /* 390 */ "foreach_clause ::=", + /* 391 */ "foreach_clause ::= FOR EACH ROW", + /* 392 */ "trnm ::= nm", + /* 393 */ "tridxby ::=", + /* 394 */ "database_kw_opt ::= DATABASE", + /* 395 */ "database_kw_opt ::=", + /* 396 */ "kwcolumn_opt ::=", + /* 397 */ "kwcolumn_opt ::= COLUMNKW", + /* 398 */ "vtabarglist ::= vtabarg", + /* 399 */ "vtabarglist ::= vtabarglist COMMA vtabarg", + /* 400 */ "vtabarg ::= vtabarg vtabargtoken", + /* 401 */ "anylist ::=", + /* 402 */ "anylist ::= anylist LP anylist RP", + /* 403 */ "anylist ::= anylist ANY", + /* 404 */ "with ::=", }; #endif /* NDEBUG */ @@ -163761,7 +167682,6 @@ sqlite3SelectDelete(pParse->db, (yypminor->yy47)); case 217: /* expr */ case 246: /* where_opt */ case 248: /* having_opt */ - case 260: /* on_opt */ case 267: /* where_opt_ret */ case 278: /* case_operand */ case 280: /* case_else */ @@ -163781,7 +167701,7 @@ sqlite3ExprDelete(pParse->db, (yypminor->yy528)); case 249: /* orderby_opt */ case 253: /* nexprlist */ case 254: /* sclp */ - case 262: /* exprlist */ + case 261: /* exprlist */ case 268: /* setlist */ case 277: /* paren_exprlist */ case 279: /* case_exprlist */ @@ -163794,7 +167714,7 @@ sqlite3ExprListDelete(pParse->db, (yypminor->yy322)); case 245: /* from */ case 256: /* seltablist */ case 257: /* stl_prefix */ - case 263: /* xfullname */ + case 262: /* xfullname */ { sqlite3SrcListDelete(pParse->db, (yypminor->yy131)); } @@ -163810,8 +167730,7 @@ sqlite3WithDelete(pParse->db, (yypminor->yy521)); sqlite3WindowListDelete(pParse->db, (yypminor->yy41)); } break; - case 261: /* using_opt */ - case 264: /* idlist */ + case 263: /* idlist */ case 270: /* idlist_opt */ { sqlite3IdListDelete(pParse->db, (yypminor->yy254)); @@ -164241,29 +168160,29 @@ static const YYCODETYPE yyRuleInfoLhs[] = { 245, /* (106) from ::= FROM seltablist */ 257, /* (107) stl_prefix ::= seltablist joinop */ 257, /* (108) stl_prefix ::= */ - 256, /* (109) seltablist ::= stl_prefix nm dbnm as indexed_opt on_opt using_opt */ - 256, /* (110) seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_opt using_opt */ - 256, /* (111) seltablist ::= stl_prefix LP select RP as on_opt using_opt */ - 256, /* (112) seltablist ::= stl_prefix LP seltablist RP as on_opt using_opt */ - 200, /* (113) dbnm ::= */ - 200, /* (114) dbnm ::= DOT nm */ - 238, /* (115) fullname ::= nm */ - 238, /* (116) fullname ::= nm DOT nm */ - 263, /* (117) xfullname ::= nm */ - 263, /* (118) xfullname ::= nm DOT nm */ - 263, /* (119) xfullname ::= nm DOT nm AS nm */ - 263, /* (120) xfullname ::= nm AS nm */ - 258, /* (121) joinop ::= COMMA|JOIN */ - 258, /* (122) joinop ::= JOIN_KW JOIN */ - 258, /* (123) joinop ::= JOIN_KW nm JOIN */ - 258, /* (124) joinop ::= JOIN_KW nm nm JOIN */ - 260, /* (125) on_opt ::= ON expr */ - 260, /* (126) on_opt ::= */ - 259, /* (127) indexed_opt ::= */ - 259, /* (128) indexed_opt ::= INDEXED BY nm */ - 259, /* (129) indexed_opt ::= NOT INDEXED */ - 261, /* (130) using_opt ::= USING LP idlist RP */ - 261, /* (131) using_opt ::= */ + 256, /* (109) seltablist ::= stl_prefix nm dbnm as on_using */ + 256, /* (110) seltablist ::= stl_prefix nm dbnm as indexed_by on_using */ + 256, /* (111) seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_using */ + 256, /* (112) seltablist ::= stl_prefix LP select RP as on_using */ + 256, /* (113) seltablist ::= stl_prefix LP seltablist RP as on_using */ + 200, /* (114) dbnm ::= */ + 200, /* (115) dbnm ::= DOT nm */ + 238, /* (116) fullname ::= nm */ + 238, /* (117) fullname ::= nm DOT nm */ + 262, /* (118) xfullname ::= nm */ + 262, /* (119) xfullname ::= nm DOT nm */ + 262, /* (120) xfullname ::= nm DOT nm AS nm */ + 262, /* (121) xfullname ::= nm AS nm */ + 258, /* (122) joinop ::= COMMA|JOIN */ + 258, /* (123) joinop ::= JOIN_KW JOIN */ + 258, /* (124) joinop ::= JOIN_KW nm JOIN */ + 258, /* (125) joinop ::= JOIN_KW nm nm JOIN */ + 259, /* (126) on_using ::= ON expr */ + 259, /* (127) on_using ::= USING LP idlist RP */ + 259, /* (128) on_using ::= */ + 264, /* (129) indexed_opt ::= */ + 260, /* (130) indexed_by ::= INDEXED BY nm */ + 260, /* (131) indexed_by ::= NOT INDEXED */ 249, /* (132) orderby_opt ::= */ 249, /* (133) orderby_opt ::= ORDER BY sortlist */ 231, /* (134) sortlist ::= sortlist COMMA expr sortorder nulls */ @@ -164307,8 +168226,8 @@ static const YYCODETYPE yyRuleInfoLhs[] = { 269, /* (172) insert_cmd ::= REPLACE */ 270, /* (173) idlist_opt ::= */ 270, /* (174) idlist_opt ::= LP idlist RP */ - 264, /* (175) idlist ::= idlist COMMA nm */ - 264, /* (176) idlist ::= nm */ + 263, /* (175) idlist ::= idlist COMMA nm */ + 263, /* (176) idlist ::= nm */ 217, /* (177) expr ::= LP expr RP */ 217, /* (178) expr ::= ID|INDEXED */ 217, /* (179) expr ::= JOIN_KW */ @@ -164341,199 +168260,202 @@ static const YYCODETYPE yyRuleInfoLhs[] = { 217, /* (206) expr ::= expr NOT NULL */ 217, /* (207) expr ::= expr IS expr */ 217, /* (208) expr ::= expr IS NOT expr */ - 217, /* (209) expr ::= NOT expr */ - 217, /* (210) expr ::= BITNOT expr */ - 217, /* (211) expr ::= PLUS|MINUS expr */ - 217, /* (212) expr ::= expr PTR expr */ - 275, /* (213) between_op ::= BETWEEN */ - 275, /* (214) between_op ::= NOT BETWEEN */ - 217, /* (215) expr ::= expr between_op expr AND expr */ - 276, /* (216) in_op ::= IN */ - 276, /* (217) in_op ::= NOT IN */ - 217, /* (218) expr ::= expr in_op LP exprlist RP */ - 217, /* (219) expr ::= LP select RP */ - 217, /* (220) expr ::= expr in_op LP select RP */ - 217, /* (221) expr ::= expr in_op nm dbnm paren_exprlist */ - 217, /* (222) expr ::= EXISTS LP select RP */ - 217, /* (223) expr ::= CASE case_operand case_exprlist case_else END */ - 279, /* (224) case_exprlist ::= case_exprlist WHEN expr THEN expr */ - 279, /* (225) case_exprlist ::= WHEN expr THEN expr */ - 280, /* (226) case_else ::= ELSE expr */ - 280, /* (227) case_else ::= */ - 278, /* (228) case_operand ::= expr */ - 278, /* (229) case_operand ::= */ - 262, /* (230) exprlist ::= */ - 253, /* (231) nexprlist ::= nexprlist COMMA expr */ - 253, /* (232) nexprlist ::= expr */ - 277, /* (233) paren_exprlist ::= */ - 277, /* (234) paren_exprlist ::= LP exprlist RP */ - 190, /* (235) cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt */ - 281, /* (236) uniqueflag ::= UNIQUE */ - 281, /* (237) uniqueflag ::= */ - 221, /* (238) eidlist_opt ::= */ - 221, /* (239) eidlist_opt ::= LP eidlist RP */ - 232, /* (240) eidlist ::= eidlist COMMA nm collate sortorder */ - 232, /* (241) eidlist ::= nm collate sortorder */ - 282, /* (242) collate ::= */ - 282, /* (243) collate ::= COLLATE ID|STRING */ - 190, /* (244) cmd ::= DROP INDEX ifexists fullname */ - 190, /* (245) cmd ::= VACUUM vinto */ - 190, /* (246) cmd ::= VACUUM nm vinto */ - 283, /* (247) vinto ::= INTO expr */ - 283, /* (248) vinto ::= */ - 190, /* (249) cmd ::= PRAGMA nm dbnm */ - 190, /* (250) cmd ::= PRAGMA nm dbnm EQ nmnum */ - 190, /* (251) cmd ::= PRAGMA nm dbnm LP nmnum RP */ - 190, /* (252) cmd ::= PRAGMA nm dbnm EQ minus_num */ - 190, /* (253) cmd ::= PRAGMA nm dbnm LP minus_num RP */ - 211, /* (254) plus_num ::= PLUS INTEGER|FLOAT */ - 212, /* (255) minus_num ::= MINUS INTEGER|FLOAT */ - 190, /* (256) cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END */ - 285, /* (257) trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause */ - 287, /* (258) trigger_time ::= BEFORE|AFTER */ - 287, /* (259) trigger_time ::= INSTEAD OF */ - 287, /* (260) trigger_time ::= */ - 288, /* (261) trigger_event ::= DELETE|INSERT */ - 288, /* (262) trigger_event ::= UPDATE */ - 288, /* (263) trigger_event ::= UPDATE OF idlist */ - 290, /* (264) when_clause ::= */ - 290, /* (265) when_clause ::= WHEN expr */ - 286, /* (266) trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI */ - 286, /* (267) trigger_cmd_list ::= trigger_cmd SEMI */ - 292, /* (268) trnm ::= nm DOT nm */ - 293, /* (269) tridxby ::= INDEXED BY nm */ - 293, /* (270) tridxby ::= NOT INDEXED */ - 291, /* (271) trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt */ - 291, /* (272) trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt */ - 291, /* (273) trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt */ - 291, /* (274) trigger_cmd ::= scanpt select scanpt */ - 217, /* (275) expr ::= RAISE LP IGNORE RP */ - 217, /* (276) expr ::= RAISE LP raisetype COMMA nm RP */ - 236, /* (277) raisetype ::= ROLLBACK */ - 236, /* (278) raisetype ::= ABORT */ - 236, /* (279) raisetype ::= FAIL */ - 190, /* (280) cmd ::= DROP TRIGGER ifexists fullname */ - 190, /* (281) cmd ::= ATTACH database_kw_opt expr AS expr key_opt */ - 190, /* (282) cmd ::= DETACH database_kw_opt expr */ - 295, /* (283) key_opt ::= */ - 295, /* (284) key_opt ::= KEY expr */ - 190, /* (285) cmd ::= REINDEX */ - 190, /* (286) cmd ::= REINDEX nm dbnm */ - 190, /* (287) cmd ::= ANALYZE */ - 190, /* (288) cmd ::= ANALYZE nm dbnm */ - 190, /* (289) cmd ::= ALTER TABLE fullname RENAME TO nm */ - 190, /* (290) cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist */ - 190, /* (291) cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm */ - 296, /* (292) add_column_fullname ::= fullname */ - 190, /* (293) cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm */ - 190, /* (294) cmd ::= create_vtab */ - 190, /* (295) cmd ::= create_vtab LP vtabarglist RP */ - 298, /* (296) create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm */ - 300, /* (297) vtabarg ::= */ - 301, /* (298) vtabargtoken ::= ANY */ - 301, /* (299) vtabargtoken ::= lp anylist RP */ - 302, /* (300) lp ::= LP */ - 266, /* (301) with ::= WITH wqlist */ - 266, /* (302) with ::= WITH RECURSIVE wqlist */ - 305, /* (303) wqas ::= AS */ - 305, /* (304) wqas ::= AS MATERIALIZED */ - 305, /* (305) wqas ::= AS NOT MATERIALIZED */ - 304, /* (306) wqitem ::= nm eidlist_opt wqas LP select RP */ - 241, /* (307) wqlist ::= wqitem */ - 241, /* (308) wqlist ::= wqlist COMMA wqitem */ - 306, /* (309) windowdefn_list ::= windowdefn */ - 306, /* (310) windowdefn_list ::= windowdefn_list COMMA windowdefn */ - 307, /* (311) windowdefn ::= nm AS LP window RP */ - 308, /* (312) window ::= PARTITION BY nexprlist orderby_opt frame_opt */ - 308, /* (313) window ::= nm PARTITION BY nexprlist orderby_opt frame_opt */ - 308, /* (314) window ::= ORDER BY sortlist frame_opt */ - 308, /* (315) window ::= nm ORDER BY sortlist frame_opt */ - 308, /* (316) window ::= frame_opt */ - 308, /* (317) window ::= nm frame_opt */ - 309, /* (318) frame_opt ::= */ - 309, /* (319) frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt */ - 309, /* (320) frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt */ - 313, /* (321) range_or_rows ::= RANGE|ROWS|GROUPS */ - 315, /* (322) frame_bound_s ::= frame_bound */ - 315, /* (323) frame_bound_s ::= UNBOUNDED PRECEDING */ - 316, /* (324) frame_bound_e ::= frame_bound */ - 316, /* (325) frame_bound_e ::= UNBOUNDED FOLLOWING */ - 314, /* (326) frame_bound ::= expr PRECEDING|FOLLOWING */ - 314, /* (327) frame_bound ::= CURRENT ROW */ - 317, /* (328) frame_exclude_opt ::= */ - 317, /* (329) frame_exclude_opt ::= EXCLUDE frame_exclude */ - 318, /* (330) frame_exclude ::= NO OTHERS */ - 318, /* (331) frame_exclude ::= CURRENT ROW */ - 318, /* (332) frame_exclude ::= GROUP|TIES */ - 251, /* (333) window_clause ::= WINDOW windowdefn_list */ - 273, /* (334) filter_over ::= filter_clause over_clause */ - 273, /* (335) filter_over ::= over_clause */ - 273, /* (336) filter_over ::= filter_clause */ - 312, /* (337) over_clause ::= OVER LP window RP */ - 312, /* (338) over_clause ::= OVER nm */ - 311, /* (339) filter_clause ::= FILTER LP WHERE expr RP */ - 185, /* (340) input ::= cmdlist */ - 186, /* (341) cmdlist ::= cmdlist ecmd */ - 186, /* (342) cmdlist ::= ecmd */ - 187, /* (343) ecmd ::= SEMI */ - 187, /* (344) ecmd ::= cmdx SEMI */ - 187, /* (345) ecmd ::= explain cmdx SEMI */ - 192, /* (346) trans_opt ::= */ - 192, /* (347) trans_opt ::= TRANSACTION */ - 192, /* (348) trans_opt ::= TRANSACTION nm */ - 194, /* (349) savepoint_opt ::= SAVEPOINT */ - 194, /* (350) savepoint_opt ::= */ - 190, /* (351) cmd ::= create_table create_table_args */ - 203, /* (352) table_option_set ::= table_option */ - 201, /* (353) columnlist ::= columnlist COMMA columnname carglist */ - 201, /* (354) columnlist ::= columnname carglist */ - 193, /* (355) nm ::= ID|INDEXED */ - 193, /* (356) nm ::= STRING */ - 193, /* (357) nm ::= JOIN_KW */ - 208, /* (358) typetoken ::= typename */ - 209, /* (359) typename ::= ID|STRING */ - 210, /* (360) signed ::= plus_num */ - 210, /* (361) signed ::= minus_num */ - 207, /* (362) carglist ::= carglist ccons */ - 207, /* (363) carglist ::= */ - 215, /* (364) ccons ::= NULL onconf */ - 215, /* (365) ccons ::= GENERATED ALWAYS AS generated */ - 215, /* (366) ccons ::= AS generated */ - 202, /* (367) conslist_opt ::= COMMA conslist */ - 228, /* (368) conslist ::= conslist tconscomma tcons */ - 228, /* (369) conslist ::= tcons */ - 229, /* (370) tconscomma ::= */ - 233, /* (371) defer_subclause_opt ::= defer_subclause */ - 235, /* (372) resolvetype ::= raisetype */ - 239, /* (373) selectnowith ::= oneselect */ - 240, /* (374) oneselect ::= values */ - 254, /* (375) sclp ::= selcollist COMMA */ - 255, /* (376) as ::= ID|STRING */ - 272, /* (377) returning ::= */ - 217, /* (378) expr ::= term */ - 274, /* (379) likeop ::= LIKE_KW|MATCH */ - 262, /* (380) exprlist ::= nexprlist */ - 284, /* (381) nmnum ::= plus_num */ - 284, /* (382) nmnum ::= nm */ - 284, /* (383) nmnum ::= ON */ - 284, /* (384) nmnum ::= DELETE */ - 284, /* (385) nmnum ::= DEFAULT */ - 211, /* (386) plus_num ::= INTEGER|FLOAT */ - 289, /* (387) foreach_clause ::= */ - 289, /* (388) foreach_clause ::= FOR EACH ROW */ - 292, /* (389) trnm ::= nm */ - 293, /* (390) tridxby ::= */ - 294, /* (391) database_kw_opt ::= DATABASE */ - 294, /* (392) database_kw_opt ::= */ - 297, /* (393) kwcolumn_opt ::= */ - 297, /* (394) kwcolumn_opt ::= COLUMNKW */ - 299, /* (395) vtabarglist ::= vtabarg */ - 299, /* (396) vtabarglist ::= vtabarglist COMMA vtabarg */ - 300, /* (397) vtabarg ::= vtabarg vtabargtoken */ - 303, /* (398) anylist ::= */ - 303, /* (399) anylist ::= anylist LP anylist RP */ - 303, /* (400) anylist ::= anylist ANY */ - 266, /* (401) with ::= */ + 217, /* (209) expr ::= expr IS NOT DISTINCT FROM expr */ + 217, /* (210) expr ::= expr IS DISTINCT FROM expr */ + 217, /* (211) expr ::= NOT expr */ + 217, /* (212) expr ::= BITNOT expr */ + 217, /* (213) expr ::= PLUS|MINUS expr */ + 217, /* (214) expr ::= expr PTR expr */ + 275, /* (215) between_op ::= BETWEEN */ + 275, /* (216) between_op ::= NOT BETWEEN */ + 217, /* (217) expr ::= expr between_op expr AND expr */ + 276, /* (218) in_op ::= IN */ + 276, /* (219) in_op ::= NOT IN */ + 217, /* (220) expr ::= expr in_op LP exprlist RP */ + 217, /* (221) expr ::= LP select RP */ + 217, /* (222) expr ::= expr in_op LP select RP */ + 217, /* (223) expr ::= expr in_op nm dbnm paren_exprlist */ + 217, /* (224) expr ::= EXISTS LP select RP */ + 217, /* (225) expr ::= CASE case_operand case_exprlist case_else END */ + 279, /* (226) case_exprlist ::= case_exprlist WHEN expr THEN expr */ + 279, /* (227) case_exprlist ::= WHEN expr THEN expr */ + 280, /* (228) case_else ::= ELSE expr */ + 280, /* (229) case_else ::= */ + 278, /* (230) case_operand ::= expr */ + 278, /* (231) case_operand ::= */ + 261, /* (232) exprlist ::= */ + 253, /* (233) nexprlist ::= nexprlist COMMA expr */ + 253, /* (234) nexprlist ::= expr */ + 277, /* (235) paren_exprlist ::= */ + 277, /* (236) paren_exprlist ::= LP exprlist RP */ + 190, /* (237) cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt */ + 281, /* (238) uniqueflag ::= UNIQUE */ + 281, /* (239) uniqueflag ::= */ + 221, /* (240) eidlist_opt ::= */ + 221, /* (241) eidlist_opt ::= LP eidlist RP */ + 232, /* (242) eidlist ::= eidlist COMMA nm collate sortorder */ + 232, /* (243) eidlist ::= nm collate sortorder */ + 282, /* (244) collate ::= */ + 282, /* (245) collate ::= COLLATE ID|STRING */ + 190, /* (246) cmd ::= DROP INDEX ifexists fullname */ + 190, /* (247) cmd ::= VACUUM vinto */ + 190, /* (248) cmd ::= VACUUM nm vinto */ + 283, /* (249) vinto ::= INTO expr */ + 283, /* (250) vinto ::= */ + 190, /* (251) cmd ::= PRAGMA nm dbnm */ + 190, /* (252) cmd ::= PRAGMA nm dbnm EQ nmnum */ + 190, /* (253) cmd ::= PRAGMA nm dbnm LP nmnum RP */ + 190, /* (254) cmd ::= PRAGMA nm dbnm EQ minus_num */ + 190, /* (255) cmd ::= PRAGMA nm dbnm LP minus_num RP */ + 211, /* (256) plus_num ::= PLUS INTEGER|FLOAT */ + 212, /* (257) minus_num ::= MINUS INTEGER|FLOAT */ + 190, /* (258) cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END */ + 285, /* (259) trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause */ + 287, /* (260) trigger_time ::= BEFORE|AFTER */ + 287, /* (261) trigger_time ::= INSTEAD OF */ + 287, /* (262) trigger_time ::= */ + 288, /* (263) trigger_event ::= DELETE|INSERT */ + 288, /* (264) trigger_event ::= UPDATE */ + 288, /* (265) trigger_event ::= UPDATE OF idlist */ + 290, /* (266) when_clause ::= */ + 290, /* (267) when_clause ::= WHEN expr */ + 286, /* (268) trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI */ + 286, /* (269) trigger_cmd_list ::= trigger_cmd SEMI */ + 292, /* (270) trnm ::= nm DOT nm */ + 293, /* (271) tridxby ::= INDEXED BY nm */ + 293, /* (272) tridxby ::= NOT INDEXED */ + 291, /* (273) trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt */ + 291, /* (274) trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt */ + 291, /* (275) trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt */ + 291, /* (276) trigger_cmd ::= scanpt select scanpt */ + 217, /* (277) expr ::= RAISE LP IGNORE RP */ + 217, /* (278) expr ::= RAISE LP raisetype COMMA nm RP */ + 236, /* (279) raisetype ::= ROLLBACK */ + 236, /* (280) raisetype ::= ABORT */ + 236, /* (281) raisetype ::= FAIL */ + 190, /* (282) cmd ::= DROP TRIGGER ifexists fullname */ + 190, /* (283) cmd ::= ATTACH database_kw_opt expr AS expr key_opt */ + 190, /* (284) cmd ::= DETACH database_kw_opt expr */ + 295, /* (285) key_opt ::= */ + 295, /* (286) key_opt ::= KEY expr */ + 190, /* (287) cmd ::= REINDEX */ + 190, /* (288) cmd ::= REINDEX nm dbnm */ + 190, /* (289) cmd ::= ANALYZE */ + 190, /* (290) cmd ::= ANALYZE nm dbnm */ + 190, /* (291) cmd ::= ALTER TABLE fullname RENAME TO nm */ + 190, /* (292) cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist */ + 190, /* (293) cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm */ + 296, /* (294) add_column_fullname ::= fullname */ + 190, /* (295) cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm */ + 190, /* (296) cmd ::= create_vtab */ + 190, /* (297) cmd ::= create_vtab LP vtabarglist RP */ + 298, /* (298) create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm */ + 300, /* (299) vtabarg ::= */ + 301, /* (300) vtabargtoken ::= ANY */ + 301, /* (301) vtabargtoken ::= lp anylist RP */ + 302, /* (302) lp ::= LP */ + 266, /* (303) with ::= WITH wqlist */ + 266, /* (304) with ::= WITH RECURSIVE wqlist */ + 305, /* (305) wqas ::= AS */ + 305, /* (306) wqas ::= AS MATERIALIZED */ + 305, /* (307) wqas ::= AS NOT MATERIALIZED */ + 304, /* (308) wqitem ::= nm eidlist_opt wqas LP select RP */ + 241, /* (309) wqlist ::= wqitem */ + 241, /* (310) wqlist ::= wqlist COMMA wqitem */ + 306, /* (311) windowdefn_list ::= windowdefn */ + 306, /* (312) windowdefn_list ::= windowdefn_list COMMA windowdefn */ + 307, /* (313) windowdefn ::= nm AS LP window RP */ + 308, /* (314) window ::= PARTITION BY nexprlist orderby_opt frame_opt */ + 308, /* (315) window ::= nm PARTITION BY nexprlist orderby_opt frame_opt */ + 308, /* (316) window ::= ORDER BY sortlist frame_opt */ + 308, /* (317) window ::= nm ORDER BY sortlist frame_opt */ + 308, /* (318) window ::= frame_opt */ + 308, /* (319) window ::= nm frame_opt */ + 309, /* (320) frame_opt ::= */ + 309, /* (321) frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt */ + 309, /* (322) frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt */ + 313, /* (323) range_or_rows ::= RANGE|ROWS|GROUPS */ + 315, /* (324) frame_bound_s ::= frame_bound */ + 315, /* (325) frame_bound_s ::= UNBOUNDED PRECEDING */ + 316, /* (326) frame_bound_e ::= frame_bound */ + 316, /* (327) frame_bound_e ::= UNBOUNDED FOLLOWING */ + 314, /* (328) frame_bound ::= expr PRECEDING|FOLLOWING */ + 314, /* (329) frame_bound ::= CURRENT ROW */ + 317, /* (330) frame_exclude_opt ::= */ + 317, /* (331) frame_exclude_opt ::= EXCLUDE frame_exclude */ + 318, /* (332) frame_exclude ::= NO OTHERS */ + 318, /* (333) frame_exclude ::= CURRENT ROW */ + 318, /* (334) frame_exclude ::= GROUP|TIES */ + 251, /* (335) window_clause ::= WINDOW windowdefn_list */ + 273, /* (336) filter_over ::= filter_clause over_clause */ + 273, /* (337) filter_over ::= over_clause */ + 273, /* (338) filter_over ::= filter_clause */ + 312, /* (339) over_clause ::= OVER LP window RP */ + 312, /* (340) over_clause ::= OVER nm */ + 311, /* (341) filter_clause ::= FILTER LP WHERE expr RP */ + 185, /* (342) input ::= cmdlist */ + 186, /* (343) cmdlist ::= cmdlist ecmd */ + 186, /* (344) cmdlist ::= ecmd */ + 187, /* (345) ecmd ::= SEMI */ + 187, /* (346) ecmd ::= cmdx SEMI */ + 187, /* (347) ecmd ::= explain cmdx SEMI */ + 192, /* (348) trans_opt ::= */ + 192, /* (349) trans_opt ::= TRANSACTION */ + 192, /* (350) trans_opt ::= TRANSACTION nm */ + 194, /* (351) savepoint_opt ::= SAVEPOINT */ + 194, /* (352) savepoint_opt ::= */ + 190, /* (353) cmd ::= create_table create_table_args */ + 203, /* (354) table_option_set ::= table_option */ + 201, /* (355) columnlist ::= columnlist COMMA columnname carglist */ + 201, /* (356) columnlist ::= columnname carglist */ + 193, /* (357) nm ::= ID|INDEXED */ + 193, /* (358) nm ::= STRING */ + 193, /* (359) nm ::= JOIN_KW */ + 208, /* (360) typetoken ::= typename */ + 209, /* (361) typename ::= ID|STRING */ + 210, /* (362) signed ::= plus_num */ + 210, /* (363) signed ::= minus_num */ + 207, /* (364) carglist ::= carglist ccons */ + 207, /* (365) carglist ::= */ + 215, /* (366) ccons ::= NULL onconf */ + 215, /* (367) ccons ::= GENERATED ALWAYS AS generated */ + 215, /* (368) ccons ::= AS generated */ + 202, /* (369) conslist_opt ::= COMMA conslist */ + 228, /* (370) conslist ::= conslist tconscomma tcons */ + 228, /* (371) conslist ::= tcons */ + 229, /* (372) tconscomma ::= */ + 233, /* (373) defer_subclause_opt ::= defer_subclause */ + 235, /* (374) resolvetype ::= raisetype */ + 239, /* (375) selectnowith ::= oneselect */ + 240, /* (376) oneselect ::= values */ + 254, /* (377) sclp ::= selcollist COMMA */ + 255, /* (378) as ::= ID|STRING */ + 264, /* (379) indexed_opt ::= indexed_by */ + 272, /* (380) returning ::= */ + 217, /* (381) expr ::= term */ + 274, /* (382) likeop ::= LIKE_KW|MATCH */ + 261, /* (383) exprlist ::= nexprlist */ + 284, /* (384) nmnum ::= plus_num */ + 284, /* (385) nmnum ::= nm */ + 284, /* (386) nmnum ::= ON */ + 284, /* (387) nmnum ::= DELETE */ + 284, /* (388) nmnum ::= DEFAULT */ + 211, /* (389) plus_num ::= INTEGER|FLOAT */ + 289, /* (390) foreach_clause ::= */ + 289, /* (391) foreach_clause ::= FOR EACH ROW */ + 292, /* (392) trnm ::= nm */ + 293, /* (393) tridxby ::= */ + 294, /* (394) database_kw_opt ::= DATABASE */ + 294, /* (395) database_kw_opt ::= */ + 297, /* (396) kwcolumn_opt ::= */ + 297, /* (397) kwcolumn_opt ::= COLUMNKW */ + 299, /* (398) vtabarglist ::= vtabarg */ + 299, /* (399) vtabarglist ::= vtabarglist COMMA vtabarg */ + 300, /* (400) vtabarg ::= vtabarg vtabargtoken */ + 303, /* (401) anylist ::= */ + 303, /* (402) anylist ::= anylist LP anylist RP */ + 303, /* (403) anylist ::= anylist ANY */ + 266, /* (404) with ::= */ }; /* For rule J, yyRuleInfoNRhs[J] contains the negative of the number @@ -164648,29 +168570,29 @@ static const signed char yyRuleInfoNRhs[] = { -2, /* (106) from ::= FROM seltablist */ -2, /* (107) stl_prefix ::= seltablist joinop */ 0, /* (108) stl_prefix ::= */ - -7, /* (109) seltablist ::= stl_prefix nm dbnm as indexed_opt on_opt using_opt */ - -9, /* (110) seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_opt using_opt */ - -7, /* (111) seltablist ::= stl_prefix LP select RP as on_opt using_opt */ - -7, /* (112) seltablist ::= stl_prefix LP seltablist RP as on_opt using_opt */ - 0, /* (113) dbnm ::= */ - -2, /* (114) dbnm ::= DOT nm */ - -1, /* (115) fullname ::= nm */ - -3, /* (116) fullname ::= nm DOT nm */ - -1, /* (117) xfullname ::= nm */ - -3, /* (118) xfullname ::= nm DOT nm */ - -5, /* (119) xfullname ::= nm DOT nm AS nm */ - -3, /* (120) xfullname ::= nm AS nm */ - -1, /* (121) joinop ::= COMMA|JOIN */ - -2, /* (122) joinop ::= JOIN_KW JOIN */ - -3, /* (123) joinop ::= JOIN_KW nm JOIN */ - -4, /* (124) joinop ::= JOIN_KW nm nm JOIN */ - -2, /* (125) on_opt ::= ON expr */ - 0, /* (126) on_opt ::= */ - 0, /* (127) indexed_opt ::= */ - -3, /* (128) indexed_opt ::= INDEXED BY nm */ - -2, /* (129) indexed_opt ::= NOT INDEXED */ - -4, /* (130) using_opt ::= USING LP idlist RP */ - 0, /* (131) using_opt ::= */ + -5, /* (109) seltablist ::= stl_prefix nm dbnm as on_using */ + -6, /* (110) seltablist ::= stl_prefix nm dbnm as indexed_by on_using */ + -8, /* (111) seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_using */ + -6, /* (112) seltablist ::= stl_prefix LP select RP as on_using */ + -6, /* (113) seltablist ::= stl_prefix LP seltablist RP as on_using */ + 0, /* (114) dbnm ::= */ + -2, /* (115) dbnm ::= DOT nm */ + -1, /* (116) fullname ::= nm */ + -3, /* (117) fullname ::= nm DOT nm */ + -1, /* (118) xfullname ::= nm */ + -3, /* (119) xfullname ::= nm DOT nm */ + -5, /* (120) xfullname ::= nm DOT nm AS nm */ + -3, /* (121) xfullname ::= nm AS nm */ + -1, /* (122) joinop ::= COMMA|JOIN */ + -2, /* (123) joinop ::= JOIN_KW JOIN */ + -3, /* (124) joinop ::= JOIN_KW nm JOIN */ + -4, /* (125) joinop ::= JOIN_KW nm nm JOIN */ + -2, /* (126) on_using ::= ON expr */ + -4, /* (127) on_using ::= USING LP idlist RP */ + 0, /* (128) on_using ::= */ + 0, /* (129) indexed_opt ::= */ + -3, /* (130) indexed_by ::= INDEXED BY nm */ + -2, /* (131) indexed_by ::= NOT INDEXED */ 0, /* (132) orderby_opt ::= */ -3, /* (133) orderby_opt ::= ORDER BY sortlist */ -5, /* (134) sortlist ::= sortlist COMMA expr sortorder nulls */ @@ -164748,199 +168670,202 @@ static const signed char yyRuleInfoNRhs[] = { -3, /* (206) expr ::= expr NOT NULL */ -3, /* (207) expr ::= expr IS expr */ -4, /* (208) expr ::= expr IS NOT expr */ - -2, /* (209) expr ::= NOT expr */ - -2, /* (210) expr ::= BITNOT expr */ - -2, /* (211) expr ::= PLUS|MINUS expr */ - -3, /* (212) expr ::= expr PTR expr */ - -1, /* (213) between_op ::= BETWEEN */ - -2, /* (214) between_op ::= NOT BETWEEN */ - -5, /* (215) expr ::= expr between_op expr AND expr */ - -1, /* (216) in_op ::= IN */ - -2, /* (217) in_op ::= NOT IN */ - -5, /* (218) expr ::= expr in_op LP exprlist RP */ - -3, /* (219) expr ::= LP select RP */ - -5, /* (220) expr ::= expr in_op LP select RP */ - -5, /* (221) expr ::= expr in_op nm dbnm paren_exprlist */ - -4, /* (222) expr ::= EXISTS LP select RP */ - -5, /* (223) expr ::= CASE case_operand case_exprlist case_else END */ - -5, /* (224) case_exprlist ::= case_exprlist WHEN expr THEN expr */ - -4, /* (225) case_exprlist ::= WHEN expr THEN expr */ - -2, /* (226) case_else ::= ELSE expr */ - 0, /* (227) case_else ::= */ - -1, /* (228) case_operand ::= expr */ - 0, /* (229) case_operand ::= */ - 0, /* (230) exprlist ::= */ - -3, /* (231) nexprlist ::= nexprlist COMMA expr */ - -1, /* (232) nexprlist ::= expr */ - 0, /* (233) paren_exprlist ::= */ - -3, /* (234) paren_exprlist ::= LP exprlist RP */ - -12, /* (235) cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt */ - -1, /* (236) uniqueflag ::= UNIQUE */ - 0, /* (237) uniqueflag ::= */ - 0, /* (238) eidlist_opt ::= */ - -3, /* (239) eidlist_opt ::= LP eidlist RP */ - -5, /* (240) eidlist ::= eidlist COMMA nm collate sortorder */ - -3, /* (241) eidlist ::= nm collate sortorder */ - 0, /* (242) collate ::= */ - -2, /* (243) collate ::= COLLATE ID|STRING */ - -4, /* (244) cmd ::= DROP INDEX ifexists fullname */ - -2, /* (245) cmd ::= VACUUM vinto */ - -3, /* (246) cmd ::= VACUUM nm vinto */ - -2, /* (247) vinto ::= INTO expr */ - 0, /* (248) vinto ::= */ - -3, /* (249) cmd ::= PRAGMA nm dbnm */ - -5, /* (250) cmd ::= PRAGMA nm dbnm EQ nmnum */ - -6, /* (251) cmd ::= PRAGMA nm dbnm LP nmnum RP */ - -5, /* (252) cmd ::= PRAGMA nm dbnm EQ minus_num */ - -6, /* (253) cmd ::= PRAGMA nm dbnm LP minus_num RP */ - -2, /* (254) plus_num ::= PLUS INTEGER|FLOAT */ - -2, /* (255) minus_num ::= MINUS INTEGER|FLOAT */ - -5, /* (256) cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END */ - -11, /* (257) trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause */ - -1, /* (258) trigger_time ::= BEFORE|AFTER */ - -2, /* (259) trigger_time ::= INSTEAD OF */ - 0, /* (260) trigger_time ::= */ - -1, /* (261) trigger_event ::= DELETE|INSERT */ - -1, /* (262) trigger_event ::= UPDATE */ - -3, /* (263) trigger_event ::= UPDATE OF idlist */ - 0, /* (264) when_clause ::= */ - -2, /* (265) when_clause ::= WHEN expr */ - -3, /* (266) trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI */ - -2, /* (267) trigger_cmd_list ::= trigger_cmd SEMI */ - -3, /* (268) trnm ::= nm DOT nm */ - -3, /* (269) tridxby ::= INDEXED BY nm */ - -2, /* (270) tridxby ::= NOT INDEXED */ - -9, /* (271) trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt */ - -8, /* (272) trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt */ - -6, /* (273) trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt */ - -3, /* (274) trigger_cmd ::= scanpt select scanpt */ - -4, /* (275) expr ::= RAISE LP IGNORE RP */ - -6, /* (276) expr ::= RAISE LP raisetype COMMA nm RP */ - -1, /* (277) raisetype ::= ROLLBACK */ - -1, /* (278) raisetype ::= ABORT */ - -1, /* (279) raisetype ::= FAIL */ - -4, /* (280) cmd ::= DROP TRIGGER ifexists fullname */ - -6, /* (281) cmd ::= ATTACH database_kw_opt expr AS expr key_opt */ - -3, /* (282) cmd ::= DETACH database_kw_opt expr */ - 0, /* (283) key_opt ::= */ - -2, /* (284) key_opt ::= KEY expr */ - -1, /* (285) cmd ::= REINDEX */ - -3, /* (286) cmd ::= REINDEX nm dbnm */ - -1, /* (287) cmd ::= ANALYZE */ - -3, /* (288) cmd ::= ANALYZE nm dbnm */ - -6, /* (289) cmd ::= ALTER TABLE fullname RENAME TO nm */ - -7, /* (290) cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist */ - -6, /* (291) cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm */ - -1, /* (292) add_column_fullname ::= fullname */ - -8, /* (293) cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm */ - -1, /* (294) cmd ::= create_vtab */ - -4, /* (295) cmd ::= create_vtab LP vtabarglist RP */ - -8, /* (296) create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm */ - 0, /* (297) vtabarg ::= */ - -1, /* (298) vtabargtoken ::= ANY */ - -3, /* (299) vtabargtoken ::= lp anylist RP */ - -1, /* (300) lp ::= LP */ - -2, /* (301) with ::= WITH wqlist */ - -3, /* (302) with ::= WITH RECURSIVE wqlist */ - -1, /* (303) wqas ::= AS */ - -2, /* (304) wqas ::= AS MATERIALIZED */ - -3, /* (305) wqas ::= AS NOT MATERIALIZED */ - -6, /* (306) wqitem ::= nm eidlist_opt wqas LP select RP */ - -1, /* (307) wqlist ::= wqitem */ - -3, /* (308) wqlist ::= wqlist COMMA wqitem */ - -1, /* (309) windowdefn_list ::= windowdefn */ - -3, /* (310) windowdefn_list ::= windowdefn_list COMMA windowdefn */ - -5, /* (311) windowdefn ::= nm AS LP window RP */ - -5, /* (312) window ::= PARTITION BY nexprlist orderby_opt frame_opt */ - -6, /* (313) window ::= nm PARTITION BY nexprlist orderby_opt frame_opt */ - -4, /* (314) window ::= ORDER BY sortlist frame_opt */ - -5, /* (315) window ::= nm ORDER BY sortlist frame_opt */ - -1, /* (316) window ::= frame_opt */ - -2, /* (317) window ::= nm frame_opt */ - 0, /* (318) frame_opt ::= */ - -3, /* (319) frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt */ - -6, /* (320) frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt */ - -1, /* (321) range_or_rows ::= RANGE|ROWS|GROUPS */ - -1, /* (322) frame_bound_s ::= frame_bound */ - -2, /* (323) frame_bound_s ::= UNBOUNDED PRECEDING */ - -1, /* (324) frame_bound_e ::= frame_bound */ - -2, /* (325) frame_bound_e ::= UNBOUNDED FOLLOWING */ - -2, /* (326) frame_bound ::= expr PRECEDING|FOLLOWING */ - -2, /* (327) frame_bound ::= CURRENT ROW */ - 0, /* (328) frame_exclude_opt ::= */ - -2, /* (329) frame_exclude_opt ::= EXCLUDE frame_exclude */ - -2, /* (330) frame_exclude ::= NO OTHERS */ - -2, /* (331) frame_exclude ::= CURRENT ROW */ - -1, /* (332) frame_exclude ::= GROUP|TIES */ - -2, /* (333) window_clause ::= WINDOW windowdefn_list */ - -2, /* (334) filter_over ::= filter_clause over_clause */ - -1, /* (335) filter_over ::= over_clause */ - -1, /* (336) filter_over ::= filter_clause */ - -4, /* (337) over_clause ::= OVER LP window RP */ - -2, /* (338) over_clause ::= OVER nm */ - -5, /* (339) filter_clause ::= FILTER LP WHERE expr RP */ - -1, /* (340) input ::= cmdlist */ - -2, /* (341) cmdlist ::= cmdlist ecmd */ - -1, /* (342) cmdlist ::= ecmd */ - -1, /* (343) ecmd ::= SEMI */ - -2, /* (344) ecmd ::= cmdx SEMI */ - -3, /* (345) ecmd ::= explain cmdx SEMI */ - 0, /* (346) trans_opt ::= */ - -1, /* (347) trans_opt ::= TRANSACTION */ - -2, /* (348) trans_opt ::= TRANSACTION nm */ - -1, /* (349) savepoint_opt ::= SAVEPOINT */ - 0, /* (350) savepoint_opt ::= */ - -2, /* (351) cmd ::= create_table create_table_args */ - -1, /* (352) table_option_set ::= table_option */ - -4, /* (353) columnlist ::= columnlist COMMA columnname carglist */ - -2, /* (354) columnlist ::= columnname carglist */ - -1, /* (355) nm ::= ID|INDEXED */ - -1, /* (356) nm ::= STRING */ - -1, /* (357) nm ::= JOIN_KW */ - -1, /* (358) typetoken ::= typename */ - -1, /* (359) typename ::= ID|STRING */ - -1, /* (360) signed ::= plus_num */ - -1, /* (361) signed ::= minus_num */ - -2, /* (362) carglist ::= carglist ccons */ - 0, /* (363) carglist ::= */ - -2, /* (364) ccons ::= NULL onconf */ - -4, /* (365) ccons ::= GENERATED ALWAYS AS generated */ - -2, /* (366) ccons ::= AS generated */ - -2, /* (367) conslist_opt ::= COMMA conslist */ - -3, /* (368) conslist ::= conslist tconscomma tcons */ - -1, /* (369) conslist ::= tcons */ - 0, /* (370) tconscomma ::= */ - -1, /* (371) defer_subclause_opt ::= defer_subclause */ - -1, /* (372) resolvetype ::= raisetype */ - -1, /* (373) selectnowith ::= oneselect */ - -1, /* (374) oneselect ::= values */ - -2, /* (375) sclp ::= selcollist COMMA */ - -1, /* (376) as ::= ID|STRING */ - 0, /* (377) returning ::= */ - -1, /* (378) expr ::= term */ - -1, /* (379) likeop ::= LIKE_KW|MATCH */ - -1, /* (380) exprlist ::= nexprlist */ - -1, /* (381) nmnum ::= plus_num */ - -1, /* (382) nmnum ::= nm */ - -1, /* (383) nmnum ::= ON */ - -1, /* (384) nmnum ::= DELETE */ - -1, /* (385) nmnum ::= DEFAULT */ - -1, /* (386) plus_num ::= INTEGER|FLOAT */ - 0, /* (387) foreach_clause ::= */ - -3, /* (388) foreach_clause ::= FOR EACH ROW */ - -1, /* (389) trnm ::= nm */ - 0, /* (390) tridxby ::= */ - -1, /* (391) database_kw_opt ::= DATABASE */ - 0, /* (392) database_kw_opt ::= */ - 0, /* (393) kwcolumn_opt ::= */ - -1, /* (394) kwcolumn_opt ::= COLUMNKW */ - -1, /* (395) vtabarglist ::= vtabarg */ - -3, /* (396) vtabarglist ::= vtabarglist COMMA vtabarg */ - -2, /* (397) vtabarg ::= vtabarg vtabargtoken */ - 0, /* (398) anylist ::= */ - -4, /* (399) anylist ::= anylist LP anylist RP */ - -2, /* (400) anylist ::= anylist ANY */ - 0, /* (401) with ::= */ + -6, /* (209) expr ::= expr IS NOT DISTINCT FROM expr */ + -5, /* (210) expr ::= expr IS DISTINCT FROM expr */ + -2, /* (211) expr ::= NOT expr */ + -2, /* (212) expr ::= BITNOT expr */ + -2, /* (213) expr ::= PLUS|MINUS expr */ + -3, /* (214) expr ::= expr PTR expr */ + -1, /* (215) between_op ::= BETWEEN */ + -2, /* (216) between_op ::= NOT BETWEEN */ + -5, /* (217) expr ::= expr between_op expr AND expr */ + -1, /* (218) in_op ::= IN */ + -2, /* (219) in_op ::= NOT IN */ + -5, /* (220) expr ::= expr in_op LP exprlist RP */ + -3, /* (221) expr ::= LP select RP */ + -5, /* (222) expr ::= expr in_op LP select RP */ + -5, /* (223) expr ::= expr in_op nm dbnm paren_exprlist */ + -4, /* (224) expr ::= EXISTS LP select RP */ + -5, /* (225) expr ::= CASE case_operand case_exprlist case_else END */ + -5, /* (226) case_exprlist ::= case_exprlist WHEN expr THEN expr */ + -4, /* (227) case_exprlist ::= WHEN expr THEN expr */ + -2, /* (228) case_else ::= ELSE expr */ + 0, /* (229) case_else ::= */ + -1, /* (230) case_operand ::= expr */ + 0, /* (231) case_operand ::= */ + 0, /* (232) exprlist ::= */ + -3, /* (233) nexprlist ::= nexprlist COMMA expr */ + -1, /* (234) nexprlist ::= expr */ + 0, /* (235) paren_exprlist ::= */ + -3, /* (236) paren_exprlist ::= LP exprlist RP */ + -12, /* (237) cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt */ + -1, /* (238) uniqueflag ::= UNIQUE */ + 0, /* (239) uniqueflag ::= */ + 0, /* (240) eidlist_opt ::= */ + -3, /* (241) eidlist_opt ::= LP eidlist RP */ + -5, /* (242) eidlist ::= eidlist COMMA nm collate sortorder */ + -3, /* (243) eidlist ::= nm collate sortorder */ + 0, /* (244) collate ::= */ + -2, /* (245) collate ::= COLLATE ID|STRING */ + -4, /* (246) cmd ::= DROP INDEX ifexists fullname */ + -2, /* (247) cmd ::= VACUUM vinto */ + -3, /* (248) cmd ::= VACUUM nm vinto */ + -2, /* (249) vinto ::= INTO expr */ + 0, /* (250) vinto ::= */ + -3, /* (251) cmd ::= PRAGMA nm dbnm */ + -5, /* (252) cmd ::= PRAGMA nm dbnm EQ nmnum */ + -6, /* (253) cmd ::= PRAGMA nm dbnm LP nmnum RP */ + -5, /* (254) cmd ::= PRAGMA nm dbnm EQ minus_num */ + -6, /* (255) cmd ::= PRAGMA nm dbnm LP minus_num RP */ + -2, /* (256) plus_num ::= PLUS INTEGER|FLOAT */ + -2, /* (257) minus_num ::= MINUS INTEGER|FLOAT */ + -5, /* (258) cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END */ + -11, /* (259) trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause */ + -1, /* (260) trigger_time ::= BEFORE|AFTER */ + -2, /* (261) trigger_time ::= INSTEAD OF */ + 0, /* (262) trigger_time ::= */ + -1, /* (263) trigger_event ::= DELETE|INSERT */ + -1, /* (264) trigger_event ::= UPDATE */ + -3, /* (265) trigger_event ::= UPDATE OF idlist */ + 0, /* (266) when_clause ::= */ + -2, /* (267) when_clause ::= WHEN expr */ + -3, /* (268) trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI */ + -2, /* (269) trigger_cmd_list ::= trigger_cmd SEMI */ + -3, /* (270) trnm ::= nm DOT nm */ + -3, /* (271) tridxby ::= INDEXED BY nm */ + -2, /* (272) tridxby ::= NOT INDEXED */ + -9, /* (273) trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt */ + -8, /* (274) trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt */ + -6, /* (275) trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt */ + -3, /* (276) trigger_cmd ::= scanpt select scanpt */ + -4, /* (277) expr ::= RAISE LP IGNORE RP */ + -6, /* (278) expr ::= RAISE LP raisetype COMMA nm RP */ + -1, /* (279) raisetype ::= ROLLBACK */ + -1, /* (280) raisetype ::= ABORT */ + -1, /* (281) raisetype ::= FAIL */ + -4, /* (282) cmd ::= DROP TRIGGER ifexists fullname */ + -6, /* (283) cmd ::= ATTACH database_kw_opt expr AS expr key_opt */ + -3, /* (284) cmd ::= DETACH database_kw_opt expr */ + 0, /* (285) key_opt ::= */ + -2, /* (286) key_opt ::= KEY expr */ + -1, /* (287) cmd ::= REINDEX */ + -3, /* (288) cmd ::= REINDEX nm dbnm */ + -1, /* (289) cmd ::= ANALYZE */ + -3, /* (290) cmd ::= ANALYZE nm dbnm */ + -6, /* (291) cmd ::= ALTER TABLE fullname RENAME TO nm */ + -7, /* (292) cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist */ + -6, /* (293) cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm */ + -1, /* (294) add_column_fullname ::= fullname */ + -8, /* (295) cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm */ + -1, /* (296) cmd ::= create_vtab */ + -4, /* (297) cmd ::= create_vtab LP vtabarglist RP */ + -8, /* (298) create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm */ + 0, /* (299) vtabarg ::= */ + -1, /* (300) vtabargtoken ::= ANY */ + -3, /* (301) vtabargtoken ::= lp anylist RP */ + -1, /* (302) lp ::= LP */ + -2, /* (303) with ::= WITH wqlist */ + -3, /* (304) with ::= WITH RECURSIVE wqlist */ + -1, /* (305) wqas ::= AS */ + -2, /* (306) wqas ::= AS MATERIALIZED */ + -3, /* (307) wqas ::= AS NOT MATERIALIZED */ + -6, /* (308) wqitem ::= nm eidlist_opt wqas LP select RP */ + -1, /* (309) wqlist ::= wqitem */ + -3, /* (310) wqlist ::= wqlist COMMA wqitem */ + -1, /* (311) windowdefn_list ::= windowdefn */ + -3, /* (312) windowdefn_list ::= windowdefn_list COMMA windowdefn */ + -5, /* (313) windowdefn ::= nm AS LP window RP */ + -5, /* (314) window ::= PARTITION BY nexprlist orderby_opt frame_opt */ + -6, /* (315) window ::= nm PARTITION BY nexprlist orderby_opt frame_opt */ + -4, /* (316) window ::= ORDER BY sortlist frame_opt */ + -5, /* (317) window ::= nm ORDER BY sortlist frame_opt */ + -1, /* (318) window ::= frame_opt */ + -2, /* (319) window ::= nm frame_opt */ + 0, /* (320) frame_opt ::= */ + -3, /* (321) frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt */ + -6, /* (322) frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt */ + -1, /* (323) range_or_rows ::= RANGE|ROWS|GROUPS */ + -1, /* (324) frame_bound_s ::= frame_bound */ + -2, /* (325) frame_bound_s ::= UNBOUNDED PRECEDING */ + -1, /* (326) frame_bound_e ::= frame_bound */ + -2, /* (327) frame_bound_e ::= UNBOUNDED FOLLOWING */ + -2, /* (328) frame_bound ::= expr PRECEDING|FOLLOWING */ + -2, /* (329) frame_bound ::= CURRENT ROW */ + 0, /* (330) frame_exclude_opt ::= */ + -2, /* (331) frame_exclude_opt ::= EXCLUDE frame_exclude */ + -2, /* (332) frame_exclude ::= NO OTHERS */ + -2, /* (333) frame_exclude ::= CURRENT ROW */ + -1, /* (334) frame_exclude ::= GROUP|TIES */ + -2, /* (335) window_clause ::= WINDOW windowdefn_list */ + -2, /* (336) filter_over ::= filter_clause over_clause */ + -1, /* (337) filter_over ::= over_clause */ + -1, /* (338) filter_over ::= filter_clause */ + -4, /* (339) over_clause ::= OVER LP window RP */ + -2, /* (340) over_clause ::= OVER nm */ + -5, /* (341) filter_clause ::= FILTER LP WHERE expr RP */ + -1, /* (342) input ::= cmdlist */ + -2, /* (343) cmdlist ::= cmdlist ecmd */ + -1, /* (344) cmdlist ::= ecmd */ + -1, /* (345) ecmd ::= SEMI */ + -2, /* (346) ecmd ::= cmdx SEMI */ + -3, /* (347) ecmd ::= explain cmdx SEMI */ + 0, /* (348) trans_opt ::= */ + -1, /* (349) trans_opt ::= TRANSACTION */ + -2, /* (350) trans_opt ::= TRANSACTION nm */ + -1, /* (351) savepoint_opt ::= SAVEPOINT */ + 0, /* (352) savepoint_opt ::= */ + -2, /* (353) cmd ::= create_table create_table_args */ + -1, /* (354) table_option_set ::= table_option */ + -4, /* (355) columnlist ::= columnlist COMMA columnname carglist */ + -2, /* (356) columnlist ::= columnname carglist */ + -1, /* (357) nm ::= ID|INDEXED */ + -1, /* (358) nm ::= STRING */ + -1, /* (359) nm ::= JOIN_KW */ + -1, /* (360) typetoken ::= typename */ + -1, /* (361) typename ::= ID|STRING */ + -1, /* (362) signed ::= plus_num */ + -1, /* (363) signed ::= minus_num */ + -2, /* (364) carglist ::= carglist ccons */ + 0, /* (365) carglist ::= */ + -2, /* (366) ccons ::= NULL onconf */ + -4, /* (367) ccons ::= GENERATED ALWAYS AS generated */ + -2, /* (368) ccons ::= AS generated */ + -2, /* (369) conslist_opt ::= COMMA conslist */ + -3, /* (370) conslist ::= conslist tconscomma tcons */ + -1, /* (371) conslist ::= tcons */ + 0, /* (372) tconscomma ::= */ + -1, /* (373) defer_subclause_opt ::= defer_subclause */ + -1, /* (374) resolvetype ::= raisetype */ + -1, /* (375) selectnowith ::= oneselect */ + -1, /* (376) oneselect ::= values */ + -2, /* (377) sclp ::= selcollist COMMA */ + -1, /* (378) as ::= ID|STRING */ + -1, /* (379) indexed_opt ::= indexed_by */ + 0, /* (380) returning ::= */ + -1, /* (381) expr ::= term */ + -1, /* (382) likeop ::= LIKE_KW|MATCH */ + -1, /* (383) exprlist ::= nexprlist */ + -1, /* (384) nmnum ::= plus_num */ + -1, /* (385) nmnum ::= nm */ + -1, /* (386) nmnum ::= ON */ + -1, /* (387) nmnum ::= DELETE */ + -1, /* (388) nmnum ::= DEFAULT */ + -1, /* (389) plus_num ::= INTEGER|FLOAT */ + 0, /* (390) foreach_clause ::= */ + -3, /* (391) foreach_clause ::= FOR EACH ROW */ + -1, /* (392) trnm ::= nm */ + 0, /* (393) tridxby ::= */ + -1, /* (394) database_kw_opt ::= DATABASE */ + 0, /* (395) database_kw_opt ::= */ + 0, /* (396) kwcolumn_opt ::= */ + -1, /* (397) kwcolumn_opt ::= COLUMNKW */ + -1, /* (398) vtabarglist ::= vtabarg */ + -3, /* (399) vtabarglist ::= vtabarglist COMMA vtabarg */ + -2, /* (400) vtabarg ::= vtabarg vtabargtoken */ + 0, /* (401) anylist ::= */ + -4, /* (402) anylist ::= anylist LP anylist RP */ + -2, /* (403) anylist ::= anylist ANY */ + 0, /* (404) with ::= */ }; static void yy_accept(yyParser*); /* Forward Declaration */ @@ -165000,7 +168925,7 @@ static YYACTIONTYPE yy_reduce( case 5: /* transtype ::= DEFERRED */ case 6: /* transtype ::= IMMEDIATE */ yytestcase(yyruleno==6); case 7: /* transtype ::= EXCLUSIVE */ yytestcase(yyruleno==7); - case 321: /* range_or_rows ::= RANGE|ROWS|GROUPS */ yytestcase(yyruleno==321); + case 323: /* range_or_rows ::= RANGE|ROWS|GROUPS */ yytestcase(yyruleno==323); {yymsp[0].minor.yy394 = yymsp[0].major; /*A-overwrites-X*/} break; case 8: /* cmd ::= COMMIT|END trans_opt */ @@ -165037,7 +168962,7 @@ static YYACTIONTYPE yy_reduce( case 72: /* defer_subclause_opt ::= */ yytestcase(yyruleno==72); case 81: /* ifexists ::= */ yytestcase(yyruleno==81); case 98: /* distinct ::= */ yytestcase(yyruleno==98); - case 242: /* collate ::= */ yytestcase(yyruleno==242); + case 244: /* collate ::= */ yytestcase(yyruleno==244); {yymsp[1].minor.yy394 = 0;} break; case 16: /* ifnotexists ::= IF NOT EXISTS */ @@ -165221,9 +169146,9 @@ static YYACTIONTYPE yy_reduce( break; case 63: /* init_deferred_pred_opt ::= INITIALLY DEFERRED */ case 80: /* ifexists ::= IF EXISTS */ yytestcase(yyruleno==80); - case 214: /* between_op ::= NOT BETWEEN */ yytestcase(yyruleno==214); - case 217: /* in_op ::= NOT IN */ yytestcase(yyruleno==217); - case 243: /* collate ::= COLLATE ID|STRING */ yytestcase(yyruleno==243); + case 216: /* between_op ::= NOT BETWEEN */ yytestcase(yyruleno==216); + case 219: /* in_op ::= NOT IN */ yytestcase(yyruleno==219); + case 245: /* collate ::= COLLATE ID|STRING */ yytestcase(yyruleno==245); {yymsp[-1].minor.yy394 = 1;} break; case 64: /* init_deferred_pred_opt ::= INITIALLY IMMEDIATE */ @@ -165308,7 +169233,7 @@ static YYACTIONTYPE yy_reduce( Token x; x.n = 0; parserDoubleLinkSelect(pParse, pRhs); - pFrom = sqlite3SrcListAppendFromTerm(pParse,0,0,0,&x,pRhs,0,0); + pFrom = sqlite3SrcListAppendFromTerm(pParse,0,0,0,&x,pRhs,0); pRhs = sqlite3SelectNew(pParse,0,pFrom,0,0,0,0,0,0); } if( pRhs ){ @@ -165373,9 +169298,9 @@ static YYACTIONTYPE yy_reduce( case 99: /* sclp ::= */ case 132: /* orderby_opt ::= */ yytestcase(yyruleno==132); case 142: /* groupby_opt ::= */ yytestcase(yyruleno==142); - case 230: /* exprlist ::= */ yytestcase(yyruleno==230); - case 233: /* paren_exprlist ::= */ yytestcase(yyruleno==233); - case 238: /* eidlist_opt ::= */ yytestcase(yyruleno==238); + case 232: /* exprlist ::= */ yytestcase(yyruleno==232); + case 235: /* paren_exprlist ::= */ yytestcase(yyruleno==235); + case 240: /* eidlist_opt ::= */ yytestcase(yyruleno==240); {yymsp[1].minor.yy322 = 0;} break; case 100: /* selcollist ::= sclp scanpt expr scanpt as */ @@ -165400,9 +169325,9 @@ static YYACTIONTYPE yy_reduce( } break; case 103: /* as ::= AS nm */ - case 114: /* dbnm ::= DOT nm */ yytestcase(yyruleno==114); - case 254: /* plus_num ::= PLUS INTEGER|FLOAT */ yytestcase(yyruleno==254); - case 255: /* minus_num ::= MINUS INTEGER|FLOAT */ yytestcase(yyruleno==255); + case 115: /* dbnm ::= DOT nm */ yytestcase(yyruleno==115); + case 256: /* plus_num ::= PLUS INTEGER|FLOAT */ yytestcase(yyruleno==256); + case 257: /* minus_num ::= MINUS INTEGER|FLOAT */ yytestcase(yyruleno==257); {yymsp[-1].minor.yy0 = yymsp[0].minor.yy0;} break; case 105: /* from ::= */ @@ -165412,7 +169337,7 @@ static YYACTIONTYPE yy_reduce( case 106: /* from ::= FROM seltablist */ { yymsp[-1].minor.yy131 = yymsp[0].minor.yy131; - sqlite3SrcListShiftJoinType(yymsp[-1].minor.yy131); + sqlite3SrcListShiftJoinType(pParse,yymsp[-1].minor.yy131); } break; case 107: /* stl_prefix ::= seltablist joinop */ @@ -165420,35 +169345,43 @@ static YYACTIONTYPE yy_reduce( if( ALWAYS(yymsp[-1].minor.yy131 && yymsp[-1].minor.yy131->nSrc>0) ) yymsp[-1].minor.yy131->a[yymsp[-1].minor.yy131->nSrc-1].fg.jointype = (u8)yymsp[0].minor.yy394; } break; - case 109: /* seltablist ::= stl_prefix nm dbnm as indexed_opt on_opt using_opt */ + case 109: /* seltablist ::= stl_prefix nm dbnm as on_using */ +{ + yymsp[-4].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-4].minor.yy131,&yymsp[-3].minor.yy0,&yymsp[-2].minor.yy0,&yymsp[-1].minor.yy0,0,&yymsp[0].minor.yy561); +} + break; + case 110: /* seltablist ::= stl_prefix nm dbnm as indexed_by on_using */ { - yymsp[-6].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-6].minor.yy131,&yymsp[-5].minor.yy0,&yymsp[-4].minor.yy0,&yymsp[-3].minor.yy0,0,yymsp[-1].minor.yy528,yymsp[0].minor.yy254); - sqlite3SrcListIndexedBy(pParse, yymsp[-6].minor.yy131, &yymsp[-2].minor.yy0); + yymsp[-5].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-5].minor.yy131,&yymsp[-4].minor.yy0,&yymsp[-3].minor.yy0,&yymsp[-2].minor.yy0,0,&yymsp[0].minor.yy561); + sqlite3SrcListIndexedBy(pParse, yymsp[-5].minor.yy131, &yymsp[-1].minor.yy0); } break; - case 110: /* seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_opt using_opt */ + case 111: /* seltablist ::= stl_prefix nm dbnm LP exprlist RP as on_using */ { - yymsp[-8].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-8].minor.yy131,&yymsp[-7].minor.yy0,&yymsp[-6].minor.yy0,&yymsp[-2].minor.yy0,0,yymsp[-1].minor.yy528,yymsp[0].minor.yy254); - sqlite3SrcListFuncArgs(pParse, yymsp[-8].minor.yy131, yymsp[-4].minor.yy322); + yymsp[-7].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-7].minor.yy131,&yymsp[-6].minor.yy0,&yymsp[-5].minor.yy0,&yymsp[-1].minor.yy0,0,&yymsp[0].minor.yy561); + sqlite3SrcListFuncArgs(pParse, yymsp[-7].minor.yy131, yymsp[-3].minor.yy322); } break; - case 111: /* seltablist ::= stl_prefix LP select RP as on_opt using_opt */ + case 112: /* seltablist ::= stl_prefix LP select RP as on_using */ { - yymsp[-6].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-6].minor.yy131,0,0,&yymsp[-2].minor.yy0,yymsp[-4].minor.yy47,yymsp[-1].minor.yy528,yymsp[0].minor.yy254); + yymsp[-5].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-5].minor.yy131,0,0,&yymsp[-1].minor.yy0,yymsp[-3].minor.yy47,&yymsp[0].minor.yy561); } break; - case 112: /* seltablist ::= stl_prefix LP seltablist RP as on_opt using_opt */ + case 113: /* seltablist ::= stl_prefix LP seltablist RP as on_using */ { - if( yymsp[-6].minor.yy131==0 && yymsp[-2].minor.yy0.n==0 && yymsp[-1].minor.yy528==0 && yymsp[0].minor.yy254==0 ){ - yymsp[-6].minor.yy131 = yymsp[-4].minor.yy131; - }else if( yymsp[-4].minor.yy131->nSrc==1 ){ - yymsp[-6].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-6].minor.yy131,0,0,&yymsp[-2].minor.yy0,0,yymsp[-1].minor.yy528,yymsp[0].minor.yy254); - if( yymsp[-6].minor.yy131 ){ - SrcItem *pNew = &yymsp[-6].minor.yy131->a[yymsp[-6].minor.yy131->nSrc-1]; - SrcItem *pOld = yymsp[-4].minor.yy131->a; + if( yymsp[-5].minor.yy131==0 && yymsp[-1].minor.yy0.n==0 && yymsp[0].minor.yy561.pOn==0 && yymsp[0].minor.yy561.pUsing==0 ){ + yymsp[-5].minor.yy131 = yymsp[-3].minor.yy131; + }else if( yymsp[-3].minor.yy131->nSrc==1 ){ + yymsp[-5].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-5].minor.yy131,0,0,&yymsp[-1].minor.yy0,0,&yymsp[0].minor.yy561); + if( yymsp[-5].minor.yy131 ){ + SrcItem *pNew = &yymsp[-5].minor.yy131->a[yymsp[-5].minor.yy131->nSrc-1]; + SrcItem *pOld = yymsp[-3].minor.yy131->a; pNew->zName = pOld->zName; pNew->zDatabase = pOld->zDatabase; pNew->pSelect = pOld->pSelect; + if( pNew->pSelect && (pNew->pSelect->selFlags & SF_NestedFrom)!=0 ){ + pNew->fg.isNestedFrom = 1; + } if( pOld->fg.isTabFunc ){ pNew->u1.pFuncArg = pOld->u1.pFuncArg; pOld->u1.pFuncArg = 0; @@ -165458,94 +169391,78 @@ static YYACTIONTYPE yy_reduce( pOld->zName = pOld->zDatabase = 0; pOld->pSelect = 0; } - sqlite3SrcListDelete(pParse->db, yymsp[-4].minor.yy131); + sqlite3SrcListDelete(pParse->db, yymsp[-3].minor.yy131); }else{ Select *pSubquery; - sqlite3SrcListShiftJoinType(yymsp[-4].minor.yy131); - pSubquery = sqlite3SelectNew(pParse,0,yymsp[-4].minor.yy131,0,0,0,0,SF_NestedFrom,0); - yymsp[-6].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-6].minor.yy131,0,0,&yymsp[-2].minor.yy0,pSubquery,yymsp[-1].minor.yy528,yymsp[0].minor.yy254); + sqlite3SrcListShiftJoinType(pParse,yymsp[-3].minor.yy131); + pSubquery = sqlite3SelectNew(pParse,0,yymsp[-3].minor.yy131,0,0,0,0,SF_NestedFrom,0); + yymsp[-5].minor.yy131 = sqlite3SrcListAppendFromTerm(pParse,yymsp[-5].minor.yy131,0,0,&yymsp[-1].minor.yy0,pSubquery,&yymsp[0].minor.yy561); } } break; - case 113: /* dbnm ::= */ - case 127: /* indexed_opt ::= */ yytestcase(yyruleno==127); + case 114: /* dbnm ::= */ + case 129: /* indexed_opt ::= */ yytestcase(yyruleno==129); {yymsp[1].minor.yy0.z=0; yymsp[1].minor.yy0.n=0;} break; - case 115: /* fullname ::= nm */ + case 116: /* fullname ::= nm */ { yylhsminor.yy131 = sqlite3SrcListAppend(pParse,0,&yymsp[0].minor.yy0,0); if( IN_RENAME_OBJECT && yylhsminor.yy131 ) sqlite3RenameTokenMap(pParse, yylhsminor.yy131->a[0].zName, &yymsp[0].minor.yy0); } yymsp[0].minor.yy131 = yylhsminor.yy131; break; - case 116: /* fullname ::= nm DOT nm */ + case 117: /* fullname ::= nm DOT nm */ { yylhsminor.yy131 = sqlite3SrcListAppend(pParse,0,&yymsp[-2].minor.yy0,&yymsp[0].minor.yy0); if( IN_RENAME_OBJECT && yylhsminor.yy131 ) sqlite3RenameTokenMap(pParse, yylhsminor.yy131->a[0].zName, &yymsp[0].minor.yy0); } yymsp[-2].minor.yy131 = yylhsminor.yy131; break; - case 117: /* xfullname ::= nm */ + case 118: /* xfullname ::= nm */ {yymsp[0].minor.yy131 = sqlite3SrcListAppend(pParse,0,&yymsp[0].minor.yy0,0); /*A-overwrites-X*/} break; - case 118: /* xfullname ::= nm DOT nm */ + case 119: /* xfullname ::= nm DOT nm */ {yymsp[-2].minor.yy131 = sqlite3SrcListAppend(pParse,0,&yymsp[-2].minor.yy0,&yymsp[0].minor.yy0); /*A-overwrites-X*/} break; - case 119: /* xfullname ::= nm DOT nm AS nm */ + case 120: /* xfullname ::= nm DOT nm AS nm */ { yymsp[-4].minor.yy131 = sqlite3SrcListAppend(pParse,0,&yymsp[-4].minor.yy0,&yymsp[-2].minor.yy0); /*A-overwrites-X*/ if( yymsp[-4].minor.yy131 ) yymsp[-4].minor.yy131->a[0].zAlias = sqlite3NameFromToken(pParse->db, &yymsp[0].minor.yy0); } break; - case 120: /* xfullname ::= nm AS nm */ + case 121: /* xfullname ::= nm AS nm */ { yymsp[-2].minor.yy131 = sqlite3SrcListAppend(pParse,0,&yymsp[-2].minor.yy0,0); /*A-overwrites-X*/ if( yymsp[-2].minor.yy131 ) yymsp[-2].minor.yy131->a[0].zAlias = sqlite3NameFromToken(pParse->db, &yymsp[0].minor.yy0); } break; - case 121: /* joinop ::= COMMA|JOIN */ + case 122: /* joinop ::= COMMA|JOIN */ { yymsp[0].minor.yy394 = JT_INNER; } break; - case 122: /* joinop ::= JOIN_KW JOIN */ + case 123: /* joinop ::= JOIN_KW JOIN */ {yymsp[-1].minor.yy394 = sqlite3JoinType(pParse,&yymsp[-1].minor.yy0,0,0); /*X-overwrites-A*/} break; - case 123: /* joinop ::= JOIN_KW nm JOIN */ + case 124: /* joinop ::= JOIN_KW nm JOIN */ {yymsp[-2].minor.yy394 = sqlite3JoinType(pParse,&yymsp[-2].minor.yy0,&yymsp[-1].minor.yy0,0); /*X-overwrites-A*/} break; - case 124: /* joinop ::= JOIN_KW nm nm JOIN */ + case 125: /* joinop ::= JOIN_KW nm nm JOIN */ {yymsp[-3].minor.yy394 = sqlite3JoinType(pParse,&yymsp[-3].minor.yy0,&yymsp[-2].minor.yy0,&yymsp[-1].minor.yy0);/*X-overwrites-A*/} break; - case 125: /* on_opt ::= ON expr */ - case 145: /* having_opt ::= HAVING expr */ yytestcase(yyruleno==145); - case 152: /* where_opt ::= WHERE expr */ yytestcase(yyruleno==152); - case 154: /* where_opt_ret ::= WHERE expr */ yytestcase(yyruleno==154); - case 226: /* case_else ::= ELSE expr */ yytestcase(yyruleno==226); - case 247: /* vinto ::= INTO expr */ yytestcase(yyruleno==247); -{yymsp[-1].minor.yy528 = yymsp[0].minor.yy528;} + case 126: /* on_using ::= ON expr */ +{yymsp[-1].minor.yy561.pOn = yymsp[0].minor.yy528; yymsp[-1].minor.yy561.pUsing = 0;} break; - case 126: /* on_opt ::= */ - case 144: /* having_opt ::= */ yytestcase(yyruleno==144); - case 146: /* limit_opt ::= */ yytestcase(yyruleno==146); - case 151: /* where_opt ::= */ yytestcase(yyruleno==151); - case 153: /* where_opt_ret ::= */ yytestcase(yyruleno==153); - case 227: /* case_else ::= */ yytestcase(yyruleno==227); - case 229: /* case_operand ::= */ yytestcase(yyruleno==229); - case 248: /* vinto ::= */ yytestcase(yyruleno==248); -{yymsp[1].minor.yy528 = 0;} + case 127: /* on_using ::= USING LP idlist RP */ +{yymsp[-3].minor.yy561.pOn = 0; yymsp[-3].minor.yy561.pUsing = yymsp[-1].minor.yy254;} + break; + case 128: /* on_using ::= */ +{yymsp[1].minor.yy561.pOn = 0; yymsp[1].minor.yy561.pUsing = 0;} break; - case 128: /* indexed_opt ::= INDEXED BY nm */ + case 130: /* indexed_by ::= INDEXED BY nm */ {yymsp[-2].minor.yy0 = yymsp[0].minor.yy0;} break; - case 129: /* indexed_opt ::= NOT INDEXED */ + case 131: /* indexed_by ::= NOT INDEXED */ {yymsp[-1].minor.yy0.z=0; yymsp[-1].minor.yy0.n=1;} break; - case 130: /* using_opt ::= USING LP idlist RP */ -{yymsp[-3].minor.yy254 = yymsp[-1].minor.yy254;} - break; - case 131: /* using_opt ::= */ - case 173: /* idlist_opt ::= */ yytestcase(yyruleno==173); -{yymsp[1].minor.yy254 = 0;} - break; case 133: /* orderby_opt ::= ORDER BY sortlist */ case 143: /* groupby_opt ::= GROUP BY nexprlist */ yytestcase(yyruleno==143); {yymsp[-2].minor.yy322 = yymsp[0].minor.yy322;} @@ -165578,6 +169495,22 @@ static YYACTIONTYPE yy_reduce( case 140: /* nulls ::= NULLS LAST */ {yymsp[-1].minor.yy394 = SQLITE_SO_DESC;} break; + case 144: /* having_opt ::= */ + case 146: /* limit_opt ::= */ yytestcase(yyruleno==146); + case 151: /* where_opt ::= */ yytestcase(yyruleno==151); + case 153: /* where_opt_ret ::= */ yytestcase(yyruleno==153); + case 229: /* case_else ::= */ yytestcase(yyruleno==229); + case 231: /* case_operand ::= */ yytestcase(yyruleno==231); + case 250: /* vinto ::= */ yytestcase(yyruleno==250); +{yymsp[1].minor.yy528 = 0;} + break; + case 145: /* having_opt ::= HAVING expr */ + case 152: /* where_opt ::= WHERE expr */ yytestcase(yyruleno==152); + case 154: /* where_opt_ret ::= WHERE expr */ yytestcase(yyruleno==154); + case 228: /* case_else ::= ELSE expr */ yytestcase(yyruleno==228); + case 249: /* vinto ::= INTO expr */ yytestcase(yyruleno==249); +{yymsp[-1].minor.yy528 = yymsp[0].minor.yy528;} + break; case 147: /* limit_opt ::= LIMIT expr */ {yymsp[-1].minor.yy528 = sqlite3PExpr(pParse,TK_LIMIT,yymsp[0].minor.yy528,0);} break; @@ -165609,7 +169542,18 @@ static YYACTIONTYPE yy_reduce( case 157: /* cmd ::= with UPDATE orconf xfullname indexed_opt SET setlist from where_opt_ret orderby_opt limit_opt */ { sqlite3SrcListIndexedBy(pParse, yymsp[-7].minor.yy131, &yymsp[-6].minor.yy0); - yymsp[-7].minor.yy131 = sqlite3SrcListAppendList(pParse, yymsp[-7].minor.yy131, yymsp[-3].minor.yy131); + if( yymsp[-3].minor.yy131 ){ + SrcList *pFromClause = yymsp[-3].minor.yy131; + if( pFromClause->nSrc>1 ){ + Select *pSubquery; + Token as; + pSubquery = sqlite3SelectNew(pParse,0,pFromClause,0,0,0,0,SF_NestedFrom,0); + as.n = 0; + as.z = 0; + pFromClause = sqlite3SrcListAppendFromTerm(pParse,0,0,0,&as,pSubquery,0); + } + yymsp[-7].minor.yy131 = sqlite3SrcListAppendList(pParse, yymsp[-7].minor.yy131, pFromClause); + } sqlite3ExprListCheckLength(pParse,yymsp[-4].minor.yy322,"set list"); #ifndef SQLITE_ENABLE_UPDATE_DELETE_LIMIT if( yymsp[-1].minor.yy322 || yymsp[0].minor.yy528 ){ @@ -165675,6 +169619,9 @@ static YYACTIONTYPE yy_reduce( case 170: /* returning ::= RETURNING selcollist */ {sqlite3AddReturning(pParse,yymsp[0].minor.yy322);} break; + case 173: /* idlist_opt ::= */ +{yymsp[1].minor.yy254 = 0;} + break; case 174: /* idlist_opt ::= LP idlist RP */ {yymsp[-2].minor.yy254 = yymsp[-1].minor.yy254;} break; @@ -165860,17 +169807,29 @@ static YYACTIONTYPE yy_reduce( binaryToUnaryIfNull(pParse, yymsp[0].minor.yy528, yymsp[-3].minor.yy528, TK_NOTNULL); } break; - case 209: /* expr ::= NOT expr */ - case 210: /* expr ::= BITNOT expr */ yytestcase(yyruleno==210); + case 209: /* expr ::= expr IS NOT DISTINCT FROM expr */ +{ + yymsp[-5].minor.yy528 = sqlite3PExpr(pParse,TK_IS,yymsp[-5].minor.yy528,yymsp[0].minor.yy528); + binaryToUnaryIfNull(pParse, yymsp[0].minor.yy528, yymsp[-5].minor.yy528, TK_ISNULL); +} + break; + case 210: /* expr ::= expr IS DISTINCT FROM expr */ +{ + yymsp[-4].minor.yy528 = sqlite3PExpr(pParse,TK_ISNOT,yymsp[-4].minor.yy528,yymsp[0].minor.yy528); + binaryToUnaryIfNull(pParse, yymsp[0].minor.yy528, yymsp[-4].minor.yy528, TK_NOTNULL); +} + break; + case 211: /* expr ::= NOT expr */ + case 212: /* expr ::= BITNOT expr */ yytestcase(yyruleno==212); {yymsp[-1].minor.yy528 = sqlite3PExpr(pParse, yymsp[-1].major, yymsp[0].minor.yy528, 0);/*A-overwrites-B*/} break; - case 211: /* expr ::= PLUS|MINUS expr */ + case 213: /* expr ::= PLUS|MINUS expr */ { yymsp[-1].minor.yy528 = sqlite3PExpr(pParse, yymsp[-1].major==TK_PLUS ? TK_UPLUS : TK_UMINUS, yymsp[0].minor.yy528, 0); /*A-overwrites-B*/ } break; - case 212: /* expr ::= expr PTR expr */ + case 214: /* expr ::= expr PTR expr */ { ExprList *pList = sqlite3ExprListAppend(pParse, 0, yymsp[-2].minor.yy528); pList = sqlite3ExprListAppend(pParse, pList, yymsp[0].minor.yy528); @@ -165878,11 +169837,11 @@ static YYACTIONTYPE yy_reduce( } yymsp[-2].minor.yy528 = yylhsminor.yy528; break; - case 213: /* between_op ::= BETWEEN */ - case 216: /* in_op ::= IN */ yytestcase(yyruleno==216); + case 215: /* between_op ::= BETWEEN */ + case 218: /* in_op ::= IN */ yytestcase(yyruleno==218); {yymsp[0].minor.yy394 = 0;} break; - case 215: /* expr ::= expr between_op expr AND expr */ + case 217: /* expr ::= expr between_op expr AND expr */ { ExprList *pList = sqlite3ExprListAppend(pParse,0, yymsp[-2].minor.yy528); pList = sqlite3ExprListAppend(pParse,pList, yymsp[0].minor.yy528); @@ -165895,7 +169854,7 @@ static YYACTIONTYPE yy_reduce( if( yymsp[-3].minor.yy394 ) yymsp[-4].minor.yy528 = sqlite3PExpr(pParse, TK_NOT, yymsp[-4].minor.yy528, 0); } break; - case 218: /* expr ::= expr in_op LP exprlist RP */ + case 220: /* expr ::= expr in_op LP exprlist RP */ { if( yymsp[-1].minor.yy322==0 ){ /* Expressions of the form @@ -165907,7 +169866,8 @@ static YYACTIONTYPE yy_reduce( ** regardless of the value of expr1. */ sqlite3ExprUnmapAndDelete(pParse, yymsp[-4].minor.yy528); - yymsp[-4].minor.yy528 = sqlite3Expr(pParse->db, TK_INTEGER, yymsp[-3].minor.yy394 ? "1" : "0"); + yymsp[-4].minor.yy528 = sqlite3Expr(pParse->db, TK_STRING, yymsp[-3].minor.yy394 ? "true" : "false"); + if( yymsp[-4].minor.yy528 ) sqlite3ExprIdToTrueFalse(yymsp[-4].minor.yy528); }else{ Expr *pRHS = yymsp[-1].minor.yy322->a[0].pExpr; if( yymsp[-1].minor.yy322->nExpr==1 && sqlite3ExprIsConstant(pRHS) && yymsp[-4].minor.yy528->op!=TK_VECTOR ){ @@ -165935,20 +169895,20 @@ static YYACTIONTYPE yy_reduce( } } break; - case 219: /* expr ::= LP select RP */ + case 221: /* expr ::= LP select RP */ { yymsp[-2].minor.yy528 = sqlite3PExpr(pParse, TK_SELECT, 0, 0); sqlite3PExprAddSelect(pParse, yymsp[-2].minor.yy528, yymsp[-1].minor.yy47); } break; - case 220: /* expr ::= expr in_op LP select RP */ + case 222: /* expr ::= expr in_op LP select RP */ { yymsp[-4].minor.yy528 = sqlite3PExpr(pParse, TK_IN, yymsp[-4].minor.yy528, 0); sqlite3PExprAddSelect(pParse, yymsp[-4].minor.yy528, yymsp[-1].minor.yy47); if( yymsp[-3].minor.yy394 ) yymsp[-4].minor.yy528 = sqlite3PExpr(pParse, TK_NOT, yymsp[-4].minor.yy528, 0); } break; - case 221: /* expr ::= expr in_op nm dbnm paren_exprlist */ + case 223: /* expr ::= expr in_op nm dbnm paren_exprlist */ { SrcList *pSrc = sqlite3SrcListAppend(pParse, 0,&yymsp[-2].minor.yy0,&yymsp[-1].minor.yy0); Select *pSelect = sqlite3SelectNew(pParse, 0,pSrc,0,0,0,0,0,0); @@ -165958,14 +169918,14 @@ static YYACTIONTYPE yy_reduce( if( yymsp[-3].minor.yy394 ) yymsp[-4].minor.yy528 = sqlite3PExpr(pParse, TK_NOT, yymsp[-4].minor.yy528, 0); } break; - case 222: /* expr ::= EXISTS LP select RP */ + case 224: /* expr ::= EXISTS LP select RP */ { Expr *p; p = yymsp[-3].minor.yy528 = sqlite3PExpr(pParse, TK_EXISTS, 0, 0); sqlite3PExprAddSelect(pParse, p, yymsp[-1].minor.yy47); } break; - case 223: /* expr ::= CASE case_operand case_exprlist case_else END */ + case 225: /* expr ::= CASE case_operand case_exprlist case_else END */ { yymsp[-4].minor.yy528 = sqlite3PExpr(pParse, TK_CASE, yymsp[-3].minor.yy528, 0); if( yymsp[-4].minor.yy528 ){ @@ -165977,32 +169937,32 @@ static YYACTIONTYPE yy_reduce( } } break; - case 224: /* case_exprlist ::= case_exprlist WHEN expr THEN expr */ + case 226: /* case_exprlist ::= case_exprlist WHEN expr THEN expr */ { yymsp[-4].minor.yy322 = sqlite3ExprListAppend(pParse,yymsp[-4].minor.yy322, yymsp[-2].minor.yy528); yymsp[-4].minor.yy322 = sqlite3ExprListAppend(pParse,yymsp[-4].minor.yy322, yymsp[0].minor.yy528); } break; - case 225: /* case_exprlist ::= WHEN expr THEN expr */ + case 227: /* case_exprlist ::= WHEN expr THEN expr */ { yymsp[-3].minor.yy322 = sqlite3ExprListAppend(pParse,0, yymsp[-2].minor.yy528); yymsp[-3].minor.yy322 = sqlite3ExprListAppend(pParse,yymsp[-3].minor.yy322, yymsp[0].minor.yy528); } break; - case 228: /* case_operand ::= expr */ + case 230: /* case_operand ::= expr */ {yymsp[0].minor.yy528 = yymsp[0].minor.yy528; /*A-overwrites-X*/} break; - case 231: /* nexprlist ::= nexprlist COMMA expr */ + case 233: /* nexprlist ::= nexprlist COMMA expr */ {yymsp[-2].minor.yy322 = sqlite3ExprListAppend(pParse,yymsp[-2].minor.yy322,yymsp[0].minor.yy528);} break; - case 232: /* nexprlist ::= expr */ + case 234: /* nexprlist ::= expr */ {yymsp[0].minor.yy322 = sqlite3ExprListAppend(pParse,0,yymsp[0].minor.yy528); /*A-overwrites-Y*/} break; - case 234: /* paren_exprlist ::= LP exprlist RP */ - case 239: /* eidlist_opt ::= LP eidlist RP */ yytestcase(yyruleno==239); + case 236: /* paren_exprlist ::= LP exprlist RP */ + case 241: /* eidlist_opt ::= LP eidlist RP */ yytestcase(yyruleno==241); {yymsp[-2].minor.yy322 = yymsp[-1].minor.yy322;} break; - case 235: /* cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt */ + case 237: /* cmd ::= createkw uniqueflag INDEX ifnotexists nm dbnm ON nm LP sortlist RP where_opt */ { sqlite3CreateIndex(pParse, &yymsp[-7].minor.yy0, &yymsp[-6].minor.yy0, sqlite3SrcListAppend(pParse,0,&yymsp[-4].minor.yy0,0), yymsp[-2].minor.yy322, yymsp[-10].minor.yy394, @@ -166012,48 +169972,48 @@ static YYACTIONTYPE yy_reduce( } } break; - case 236: /* uniqueflag ::= UNIQUE */ - case 278: /* raisetype ::= ABORT */ yytestcase(yyruleno==278); + case 238: /* uniqueflag ::= UNIQUE */ + case 280: /* raisetype ::= ABORT */ yytestcase(yyruleno==280); {yymsp[0].minor.yy394 = OE_Abort;} break; - case 237: /* uniqueflag ::= */ + case 239: /* uniqueflag ::= */ {yymsp[1].minor.yy394 = OE_None;} break; - case 240: /* eidlist ::= eidlist COMMA nm collate sortorder */ + case 242: /* eidlist ::= eidlist COMMA nm collate sortorder */ { yymsp[-4].minor.yy322 = parserAddExprIdListTerm(pParse, yymsp[-4].minor.yy322, &yymsp[-2].minor.yy0, yymsp[-1].minor.yy394, yymsp[0].minor.yy394); } break; - case 241: /* eidlist ::= nm collate sortorder */ + case 243: /* eidlist ::= nm collate sortorder */ { yymsp[-2].minor.yy322 = parserAddExprIdListTerm(pParse, 0, &yymsp[-2].minor.yy0, yymsp[-1].minor.yy394, yymsp[0].minor.yy394); /*A-overwrites-Y*/ } break; - case 244: /* cmd ::= DROP INDEX ifexists fullname */ + case 246: /* cmd ::= DROP INDEX ifexists fullname */ {sqlite3DropIndex(pParse, yymsp[0].minor.yy131, yymsp[-1].minor.yy394);} break; - case 245: /* cmd ::= VACUUM vinto */ + case 247: /* cmd ::= VACUUM vinto */ {sqlite3Vacuum(pParse,0,yymsp[0].minor.yy528);} break; - case 246: /* cmd ::= VACUUM nm vinto */ + case 248: /* cmd ::= VACUUM nm vinto */ {sqlite3Vacuum(pParse,&yymsp[-1].minor.yy0,yymsp[0].minor.yy528);} break; - case 249: /* cmd ::= PRAGMA nm dbnm */ + case 251: /* cmd ::= PRAGMA nm dbnm */ {sqlite3Pragma(pParse,&yymsp[-1].minor.yy0,&yymsp[0].minor.yy0,0,0);} break; - case 250: /* cmd ::= PRAGMA nm dbnm EQ nmnum */ + case 252: /* cmd ::= PRAGMA nm dbnm EQ nmnum */ {sqlite3Pragma(pParse,&yymsp[-3].minor.yy0,&yymsp[-2].minor.yy0,&yymsp[0].minor.yy0,0);} break; - case 251: /* cmd ::= PRAGMA nm dbnm LP nmnum RP */ + case 253: /* cmd ::= PRAGMA nm dbnm LP nmnum RP */ {sqlite3Pragma(pParse,&yymsp[-4].minor.yy0,&yymsp[-3].minor.yy0,&yymsp[-1].minor.yy0,0);} break; - case 252: /* cmd ::= PRAGMA nm dbnm EQ minus_num */ + case 254: /* cmd ::= PRAGMA nm dbnm EQ minus_num */ {sqlite3Pragma(pParse,&yymsp[-3].minor.yy0,&yymsp[-2].minor.yy0,&yymsp[0].minor.yy0,1);} break; - case 253: /* cmd ::= PRAGMA nm dbnm LP minus_num RP */ + case 255: /* cmd ::= PRAGMA nm dbnm LP minus_num RP */ {sqlite3Pragma(pParse,&yymsp[-4].minor.yy0,&yymsp[-3].minor.yy0,&yymsp[-1].minor.yy0,1);} break; - case 256: /* cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END */ + case 258: /* cmd ::= createkw trigger_decl BEGIN trigger_cmd_list END */ { Token all; all.z = yymsp[-3].minor.yy0.z; @@ -166061,50 +170021,50 @@ static YYACTIONTYPE yy_reduce( sqlite3FinishTrigger(pParse, yymsp[-1].minor.yy33, &all); } break; - case 257: /* trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause */ + case 259: /* trigger_decl ::= temp TRIGGER ifnotexists nm dbnm trigger_time trigger_event ON fullname foreach_clause when_clause */ { sqlite3BeginTrigger(pParse, &yymsp[-7].minor.yy0, &yymsp[-6].minor.yy0, yymsp[-5].minor.yy394, yymsp[-4].minor.yy180.a, yymsp[-4].minor.yy180.b, yymsp[-2].minor.yy131, yymsp[0].minor.yy528, yymsp[-10].minor.yy394, yymsp[-8].minor.yy394); yymsp[-10].minor.yy0 = (yymsp[-6].minor.yy0.n==0?yymsp[-7].minor.yy0:yymsp[-6].minor.yy0); /*A-overwrites-T*/ } break; - case 258: /* trigger_time ::= BEFORE|AFTER */ + case 260: /* trigger_time ::= BEFORE|AFTER */ { yymsp[0].minor.yy394 = yymsp[0].major; /*A-overwrites-X*/ } break; - case 259: /* trigger_time ::= INSTEAD OF */ + case 261: /* trigger_time ::= INSTEAD OF */ { yymsp[-1].minor.yy394 = TK_INSTEAD;} break; - case 260: /* trigger_time ::= */ + case 262: /* trigger_time ::= */ { yymsp[1].minor.yy394 = TK_BEFORE; } break; - case 261: /* trigger_event ::= DELETE|INSERT */ - case 262: /* trigger_event ::= UPDATE */ yytestcase(yyruleno==262); + case 263: /* trigger_event ::= DELETE|INSERT */ + case 264: /* trigger_event ::= UPDATE */ yytestcase(yyruleno==264); {yymsp[0].minor.yy180.a = yymsp[0].major; /*A-overwrites-X*/ yymsp[0].minor.yy180.b = 0;} break; - case 263: /* trigger_event ::= UPDATE OF idlist */ + case 265: /* trigger_event ::= UPDATE OF idlist */ {yymsp[-2].minor.yy180.a = TK_UPDATE; yymsp[-2].minor.yy180.b = yymsp[0].minor.yy254;} break; - case 264: /* when_clause ::= */ - case 283: /* key_opt ::= */ yytestcase(yyruleno==283); + case 266: /* when_clause ::= */ + case 285: /* key_opt ::= */ yytestcase(yyruleno==285); { yymsp[1].minor.yy528 = 0; } break; - case 265: /* when_clause ::= WHEN expr */ - case 284: /* key_opt ::= KEY expr */ yytestcase(yyruleno==284); + case 267: /* when_clause ::= WHEN expr */ + case 286: /* key_opt ::= KEY expr */ yytestcase(yyruleno==286); { yymsp[-1].minor.yy528 = yymsp[0].minor.yy528; } break; - case 266: /* trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI */ + case 268: /* trigger_cmd_list ::= trigger_cmd_list trigger_cmd SEMI */ { assert( yymsp[-2].minor.yy33!=0 ); yymsp[-2].minor.yy33->pLast->pNext = yymsp[-1].minor.yy33; yymsp[-2].minor.yy33->pLast = yymsp[-1].minor.yy33; } break; - case 267: /* trigger_cmd_list ::= trigger_cmd SEMI */ + case 269: /* trigger_cmd_list ::= trigger_cmd SEMI */ { assert( yymsp[-1].minor.yy33!=0 ); yymsp[-1].minor.yy33->pLast = yymsp[-1].minor.yy33; } break; - case 268: /* trnm ::= nm DOT nm */ + case 270: /* trnm ::= nm DOT nm */ { yymsp[-2].minor.yy0 = yymsp[0].minor.yy0; sqlite3ErrorMsg(pParse, @@ -166112,39 +170072,39 @@ static YYACTIONTYPE yy_reduce( "statements within triggers"); } break; - case 269: /* tridxby ::= INDEXED BY nm */ + case 271: /* tridxby ::= INDEXED BY nm */ { sqlite3ErrorMsg(pParse, "the INDEXED BY clause is not allowed on UPDATE or DELETE statements " "within triggers"); } break; - case 270: /* tridxby ::= NOT INDEXED */ + case 272: /* tridxby ::= NOT INDEXED */ { sqlite3ErrorMsg(pParse, "the NOT INDEXED clause is not allowed on UPDATE or DELETE statements " "within triggers"); } break; - case 271: /* trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt */ + case 273: /* trigger_cmd ::= UPDATE orconf trnm tridxby SET setlist from where_opt scanpt */ {yylhsminor.yy33 = sqlite3TriggerUpdateStep(pParse, &yymsp[-6].minor.yy0, yymsp[-2].minor.yy131, yymsp[-3].minor.yy322, yymsp[-1].minor.yy528, yymsp[-7].minor.yy394, yymsp[-8].minor.yy0.z, yymsp[0].minor.yy522);} yymsp[-8].minor.yy33 = yylhsminor.yy33; break; - case 272: /* trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt */ + case 274: /* trigger_cmd ::= scanpt insert_cmd INTO trnm idlist_opt select upsert scanpt */ { yylhsminor.yy33 = sqlite3TriggerInsertStep(pParse,&yymsp[-4].minor.yy0,yymsp[-3].minor.yy254,yymsp[-2].minor.yy47,yymsp[-6].minor.yy394,yymsp[-1].minor.yy444,yymsp[-7].minor.yy522,yymsp[0].minor.yy522);/*yylhsminor.yy33-overwrites-yymsp[-6].minor.yy394*/ } yymsp[-7].minor.yy33 = yylhsminor.yy33; break; - case 273: /* trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt */ + case 275: /* trigger_cmd ::= DELETE FROM trnm tridxby where_opt scanpt */ {yylhsminor.yy33 = sqlite3TriggerDeleteStep(pParse, &yymsp[-3].minor.yy0, yymsp[-1].minor.yy528, yymsp[-5].minor.yy0.z, yymsp[0].minor.yy522);} yymsp[-5].minor.yy33 = yylhsminor.yy33; break; - case 274: /* trigger_cmd ::= scanpt select scanpt */ + case 276: /* trigger_cmd ::= scanpt select scanpt */ {yylhsminor.yy33 = sqlite3TriggerSelectStep(pParse->db, yymsp[-1].minor.yy47, yymsp[-2].minor.yy522, yymsp[0].minor.yy522); /*yylhsminor.yy33-overwrites-yymsp[-1].minor.yy47*/} yymsp[-2].minor.yy33 = yylhsminor.yy33; break; - case 275: /* expr ::= RAISE LP IGNORE RP */ + case 277: /* expr ::= RAISE LP IGNORE RP */ { yymsp[-3].minor.yy528 = sqlite3PExpr(pParse, TK_RAISE, 0, 0); if( yymsp[-3].minor.yy528 ){ @@ -166152,7 +170112,7 @@ static YYACTIONTYPE yy_reduce( } } break; - case 276: /* expr ::= RAISE LP raisetype COMMA nm RP */ + case 278: /* expr ::= RAISE LP raisetype COMMA nm RP */ { yymsp[-5].minor.yy528 = sqlite3ExprAlloc(pParse->db, TK_RAISE, &yymsp[-1].minor.yy0, 1); if( yymsp[-5].minor.yy528 ) { @@ -166160,118 +170120,118 @@ static YYACTIONTYPE yy_reduce( } } break; - case 277: /* raisetype ::= ROLLBACK */ + case 279: /* raisetype ::= ROLLBACK */ {yymsp[0].minor.yy394 = OE_Rollback;} break; - case 279: /* raisetype ::= FAIL */ + case 281: /* raisetype ::= FAIL */ {yymsp[0].minor.yy394 = OE_Fail;} break; - case 280: /* cmd ::= DROP TRIGGER ifexists fullname */ + case 282: /* cmd ::= DROP TRIGGER ifexists fullname */ { sqlite3DropTrigger(pParse,yymsp[0].minor.yy131,yymsp[-1].minor.yy394); } break; - case 281: /* cmd ::= ATTACH database_kw_opt expr AS expr key_opt */ + case 283: /* cmd ::= ATTACH database_kw_opt expr AS expr key_opt */ { sqlite3Attach(pParse, yymsp[-3].minor.yy528, yymsp[-1].minor.yy528, yymsp[0].minor.yy528); } break; - case 282: /* cmd ::= DETACH database_kw_opt expr */ + case 284: /* cmd ::= DETACH database_kw_opt expr */ { sqlite3Detach(pParse, yymsp[0].minor.yy528); } break; - case 285: /* cmd ::= REINDEX */ + case 287: /* cmd ::= REINDEX */ {sqlite3Reindex(pParse, 0, 0);} break; - case 286: /* cmd ::= REINDEX nm dbnm */ + case 288: /* cmd ::= REINDEX nm dbnm */ {sqlite3Reindex(pParse, &yymsp[-1].minor.yy0, &yymsp[0].minor.yy0);} break; - case 287: /* cmd ::= ANALYZE */ + case 289: /* cmd ::= ANALYZE */ {sqlite3Analyze(pParse, 0, 0);} break; - case 288: /* cmd ::= ANALYZE nm dbnm */ + case 290: /* cmd ::= ANALYZE nm dbnm */ {sqlite3Analyze(pParse, &yymsp[-1].minor.yy0, &yymsp[0].minor.yy0);} break; - case 289: /* cmd ::= ALTER TABLE fullname RENAME TO nm */ + case 291: /* cmd ::= ALTER TABLE fullname RENAME TO nm */ { sqlite3AlterRenameTable(pParse,yymsp[-3].minor.yy131,&yymsp[0].minor.yy0); } break; - case 290: /* cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist */ + case 292: /* cmd ::= ALTER TABLE add_column_fullname ADD kwcolumn_opt columnname carglist */ { yymsp[-1].minor.yy0.n = (int)(pParse->sLastToken.z-yymsp[-1].minor.yy0.z) + pParse->sLastToken.n; sqlite3AlterFinishAddColumn(pParse, &yymsp[-1].minor.yy0); } break; - case 291: /* cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm */ + case 293: /* cmd ::= ALTER TABLE fullname DROP kwcolumn_opt nm */ { sqlite3AlterDropColumn(pParse, yymsp[-3].minor.yy131, &yymsp[0].minor.yy0); } break; - case 292: /* add_column_fullname ::= fullname */ + case 294: /* add_column_fullname ::= fullname */ { disableLookaside(pParse); sqlite3AlterBeginAddColumn(pParse, yymsp[0].minor.yy131); } break; - case 293: /* cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm */ + case 295: /* cmd ::= ALTER TABLE fullname RENAME kwcolumn_opt nm TO nm */ { sqlite3AlterRenameColumn(pParse, yymsp[-5].minor.yy131, &yymsp[-2].minor.yy0, &yymsp[0].minor.yy0); } break; - case 294: /* cmd ::= create_vtab */ + case 296: /* cmd ::= create_vtab */ {sqlite3VtabFinishParse(pParse,0);} break; - case 295: /* cmd ::= create_vtab LP vtabarglist RP */ + case 297: /* cmd ::= create_vtab LP vtabarglist RP */ {sqlite3VtabFinishParse(pParse,&yymsp[0].minor.yy0);} break; - case 296: /* create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm */ + case 298: /* create_vtab ::= createkw VIRTUAL TABLE ifnotexists nm dbnm USING nm */ { sqlite3VtabBeginParse(pParse, &yymsp[-3].minor.yy0, &yymsp[-2].minor.yy0, &yymsp[0].minor.yy0, yymsp[-4].minor.yy394); } break; - case 297: /* vtabarg ::= */ + case 299: /* vtabarg ::= */ {sqlite3VtabArgInit(pParse);} break; - case 298: /* vtabargtoken ::= ANY */ - case 299: /* vtabargtoken ::= lp anylist RP */ yytestcase(yyruleno==299); - case 300: /* lp ::= LP */ yytestcase(yyruleno==300); + case 300: /* vtabargtoken ::= ANY */ + case 301: /* vtabargtoken ::= lp anylist RP */ yytestcase(yyruleno==301); + case 302: /* lp ::= LP */ yytestcase(yyruleno==302); {sqlite3VtabArgExtend(pParse,&yymsp[0].minor.yy0);} break; - case 301: /* with ::= WITH wqlist */ - case 302: /* with ::= WITH RECURSIVE wqlist */ yytestcase(yyruleno==302); + case 303: /* with ::= WITH wqlist */ + case 304: /* with ::= WITH RECURSIVE wqlist */ yytestcase(yyruleno==304); { sqlite3WithPush(pParse, yymsp[0].minor.yy521, 1); } break; - case 303: /* wqas ::= AS */ + case 305: /* wqas ::= AS */ {yymsp[0].minor.yy516 = M10d_Any;} break; - case 304: /* wqas ::= AS MATERIALIZED */ + case 306: /* wqas ::= AS MATERIALIZED */ {yymsp[-1].minor.yy516 = M10d_Yes;} break; - case 305: /* wqas ::= AS NOT MATERIALIZED */ + case 307: /* wqas ::= AS NOT MATERIALIZED */ {yymsp[-2].minor.yy516 = M10d_No;} break; - case 306: /* wqitem ::= nm eidlist_opt wqas LP select RP */ + case 308: /* wqitem ::= nm eidlist_opt wqas LP select RP */ { yymsp[-5].minor.yy385 = sqlite3CteNew(pParse, &yymsp[-5].minor.yy0, yymsp[-4].minor.yy322, yymsp[-1].minor.yy47, yymsp[-3].minor.yy516); /*A-overwrites-X*/ } break; - case 307: /* wqlist ::= wqitem */ + case 309: /* wqlist ::= wqitem */ { yymsp[0].minor.yy521 = sqlite3WithAdd(pParse, 0, yymsp[0].minor.yy385); /*A-overwrites-X*/ } break; - case 308: /* wqlist ::= wqlist COMMA wqitem */ + case 310: /* wqlist ::= wqlist COMMA wqitem */ { yymsp[-2].minor.yy521 = sqlite3WithAdd(pParse, yymsp[-2].minor.yy521, yymsp[0].minor.yy385); } break; - case 309: /* windowdefn_list ::= windowdefn */ + case 311: /* windowdefn_list ::= windowdefn */ { yylhsminor.yy41 = yymsp[0].minor.yy41; } yymsp[0].minor.yy41 = yylhsminor.yy41; break; - case 310: /* windowdefn_list ::= windowdefn_list COMMA windowdefn */ + case 312: /* windowdefn_list ::= windowdefn_list COMMA windowdefn */ { assert( yymsp[0].minor.yy41!=0 ); sqlite3WindowChain(pParse, yymsp[0].minor.yy41, yymsp[-2].minor.yy41); @@ -166280,7 +170240,7 @@ static YYACTIONTYPE yy_reduce( } yymsp[-2].minor.yy41 = yylhsminor.yy41; break; - case 311: /* windowdefn ::= nm AS LP window RP */ + case 313: /* windowdefn ::= nm AS LP window RP */ { if( ALWAYS(yymsp[-1].minor.yy41) ){ yymsp[-1].minor.yy41->zName = sqlite3DbStrNDup(pParse->db, yymsp[-4].minor.yy0.z, yymsp[-4].minor.yy0.n); @@ -166289,90 +170249,90 @@ static YYACTIONTYPE yy_reduce( } yymsp[-4].minor.yy41 = yylhsminor.yy41; break; - case 312: /* window ::= PARTITION BY nexprlist orderby_opt frame_opt */ + case 314: /* window ::= PARTITION BY nexprlist orderby_opt frame_opt */ { yymsp[-4].minor.yy41 = sqlite3WindowAssemble(pParse, yymsp[0].minor.yy41, yymsp[-2].minor.yy322, yymsp[-1].minor.yy322, 0); } break; - case 313: /* window ::= nm PARTITION BY nexprlist orderby_opt frame_opt */ + case 315: /* window ::= nm PARTITION BY nexprlist orderby_opt frame_opt */ { yylhsminor.yy41 = sqlite3WindowAssemble(pParse, yymsp[0].minor.yy41, yymsp[-2].minor.yy322, yymsp[-1].minor.yy322, &yymsp[-5].minor.yy0); } yymsp[-5].minor.yy41 = yylhsminor.yy41; break; - case 314: /* window ::= ORDER BY sortlist frame_opt */ + case 316: /* window ::= ORDER BY sortlist frame_opt */ { yymsp[-3].minor.yy41 = sqlite3WindowAssemble(pParse, yymsp[0].minor.yy41, 0, yymsp[-1].minor.yy322, 0); } break; - case 315: /* window ::= nm ORDER BY sortlist frame_opt */ + case 317: /* window ::= nm ORDER BY sortlist frame_opt */ { yylhsminor.yy41 = sqlite3WindowAssemble(pParse, yymsp[0].minor.yy41, 0, yymsp[-1].minor.yy322, &yymsp[-4].minor.yy0); } yymsp[-4].minor.yy41 = yylhsminor.yy41; break; - case 316: /* window ::= frame_opt */ - case 335: /* filter_over ::= over_clause */ yytestcase(yyruleno==335); + case 318: /* window ::= frame_opt */ + case 337: /* filter_over ::= over_clause */ yytestcase(yyruleno==337); { yylhsminor.yy41 = yymsp[0].minor.yy41; } yymsp[0].minor.yy41 = yylhsminor.yy41; break; - case 317: /* window ::= nm frame_opt */ + case 319: /* window ::= nm frame_opt */ { yylhsminor.yy41 = sqlite3WindowAssemble(pParse, yymsp[0].minor.yy41, 0, 0, &yymsp[-1].minor.yy0); } yymsp[-1].minor.yy41 = yylhsminor.yy41; break; - case 318: /* frame_opt ::= */ + case 320: /* frame_opt ::= */ { yymsp[1].minor.yy41 = sqlite3WindowAlloc(pParse, 0, TK_UNBOUNDED, 0, TK_CURRENT, 0, 0); } break; - case 319: /* frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt */ + case 321: /* frame_opt ::= range_or_rows frame_bound_s frame_exclude_opt */ { yylhsminor.yy41 = sqlite3WindowAlloc(pParse, yymsp[-2].minor.yy394, yymsp[-1].minor.yy595.eType, yymsp[-1].minor.yy595.pExpr, TK_CURRENT, 0, yymsp[0].minor.yy516); } yymsp[-2].minor.yy41 = yylhsminor.yy41; break; - case 320: /* frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt */ + case 322: /* frame_opt ::= range_or_rows BETWEEN frame_bound_s AND frame_bound_e frame_exclude_opt */ { yylhsminor.yy41 = sqlite3WindowAlloc(pParse, yymsp[-5].minor.yy394, yymsp[-3].minor.yy595.eType, yymsp[-3].minor.yy595.pExpr, yymsp[-1].minor.yy595.eType, yymsp[-1].minor.yy595.pExpr, yymsp[0].minor.yy516); } yymsp[-5].minor.yy41 = yylhsminor.yy41; break; - case 322: /* frame_bound_s ::= frame_bound */ - case 324: /* frame_bound_e ::= frame_bound */ yytestcase(yyruleno==324); + case 324: /* frame_bound_s ::= frame_bound */ + case 326: /* frame_bound_e ::= frame_bound */ yytestcase(yyruleno==326); {yylhsminor.yy595 = yymsp[0].minor.yy595;} yymsp[0].minor.yy595 = yylhsminor.yy595; break; - case 323: /* frame_bound_s ::= UNBOUNDED PRECEDING */ - case 325: /* frame_bound_e ::= UNBOUNDED FOLLOWING */ yytestcase(yyruleno==325); - case 327: /* frame_bound ::= CURRENT ROW */ yytestcase(yyruleno==327); + case 325: /* frame_bound_s ::= UNBOUNDED PRECEDING */ + case 327: /* frame_bound_e ::= UNBOUNDED FOLLOWING */ yytestcase(yyruleno==327); + case 329: /* frame_bound ::= CURRENT ROW */ yytestcase(yyruleno==329); {yylhsminor.yy595.eType = yymsp[-1].major; yylhsminor.yy595.pExpr = 0;} yymsp[-1].minor.yy595 = yylhsminor.yy595; break; - case 326: /* frame_bound ::= expr PRECEDING|FOLLOWING */ + case 328: /* frame_bound ::= expr PRECEDING|FOLLOWING */ {yylhsminor.yy595.eType = yymsp[0].major; yylhsminor.yy595.pExpr = yymsp[-1].minor.yy528;} yymsp[-1].minor.yy595 = yylhsminor.yy595; break; - case 328: /* frame_exclude_opt ::= */ + case 330: /* frame_exclude_opt ::= */ {yymsp[1].minor.yy516 = 0;} break; - case 329: /* frame_exclude_opt ::= EXCLUDE frame_exclude */ + case 331: /* frame_exclude_opt ::= EXCLUDE frame_exclude */ {yymsp[-1].minor.yy516 = yymsp[0].minor.yy516;} break; - case 330: /* frame_exclude ::= NO OTHERS */ - case 331: /* frame_exclude ::= CURRENT ROW */ yytestcase(yyruleno==331); + case 332: /* frame_exclude ::= NO OTHERS */ + case 333: /* frame_exclude ::= CURRENT ROW */ yytestcase(yyruleno==333); {yymsp[-1].minor.yy516 = yymsp[-1].major; /*A-overwrites-X*/} break; - case 332: /* frame_exclude ::= GROUP|TIES */ + case 334: /* frame_exclude ::= GROUP|TIES */ {yymsp[0].minor.yy516 = yymsp[0].major; /*A-overwrites-X*/} break; - case 333: /* window_clause ::= WINDOW windowdefn_list */ + case 335: /* window_clause ::= WINDOW windowdefn_list */ { yymsp[-1].minor.yy41 = yymsp[0].minor.yy41; } break; - case 334: /* filter_over ::= filter_clause over_clause */ + case 336: /* filter_over ::= filter_clause over_clause */ { if( yymsp[0].minor.yy41 ){ yymsp[0].minor.yy41->pFilter = yymsp[-1].minor.yy528; @@ -166383,7 +170343,7 @@ static YYACTIONTYPE yy_reduce( } yymsp[-1].minor.yy41 = yylhsminor.yy41; break; - case 336: /* filter_over ::= filter_clause */ + case 338: /* filter_over ::= filter_clause */ { yylhsminor.yy41 = (Window*)sqlite3DbMallocZero(pParse->db, sizeof(Window)); if( yylhsminor.yy41 ){ @@ -166395,13 +170355,13 @@ static YYACTIONTYPE yy_reduce( } yymsp[0].minor.yy41 = yylhsminor.yy41; break; - case 337: /* over_clause ::= OVER LP window RP */ + case 339: /* over_clause ::= OVER LP window RP */ { yymsp[-3].minor.yy41 = yymsp[-1].minor.yy41; assert( yymsp[-3].minor.yy41!=0 ); } break; - case 338: /* over_clause ::= OVER nm */ + case 340: /* over_clause ::= OVER nm */ { yymsp[-1].minor.yy41 = (Window*)sqlite3DbMallocZero(pParse->db, sizeof(Window)); if( yymsp[-1].minor.yy41 ){ @@ -166409,72 +170369,73 @@ static YYACTIONTYPE yy_reduce( } } break; - case 339: /* filter_clause ::= FILTER LP WHERE expr RP */ + case 341: /* filter_clause ::= FILTER LP WHERE expr RP */ { yymsp[-4].minor.yy528 = yymsp[-1].minor.yy528; } break; default: - /* (340) input ::= cmdlist */ yytestcase(yyruleno==340); - /* (341) cmdlist ::= cmdlist ecmd */ yytestcase(yyruleno==341); - /* (342) cmdlist ::= ecmd (OPTIMIZED OUT) */ assert(yyruleno!=342); - /* (343) ecmd ::= SEMI */ yytestcase(yyruleno==343); - /* (344) ecmd ::= cmdx SEMI */ yytestcase(yyruleno==344); - /* (345) ecmd ::= explain cmdx SEMI (NEVER REDUCES) */ assert(yyruleno!=345); - /* (346) trans_opt ::= */ yytestcase(yyruleno==346); - /* (347) trans_opt ::= TRANSACTION */ yytestcase(yyruleno==347); - /* (348) trans_opt ::= TRANSACTION nm */ yytestcase(yyruleno==348); - /* (349) savepoint_opt ::= SAVEPOINT */ yytestcase(yyruleno==349); - /* (350) savepoint_opt ::= */ yytestcase(yyruleno==350); - /* (351) cmd ::= create_table create_table_args */ yytestcase(yyruleno==351); - /* (352) table_option_set ::= table_option (OPTIMIZED OUT) */ assert(yyruleno!=352); - /* (353) columnlist ::= columnlist COMMA columnname carglist */ yytestcase(yyruleno==353); - /* (354) columnlist ::= columnname carglist */ yytestcase(yyruleno==354); - /* (355) nm ::= ID|INDEXED */ yytestcase(yyruleno==355); - /* (356) nm ::= STRING */ yytestcase(yyruleno==356); - /* (357) nm ::= JOIN_KW */ yytestcase(yyruleno==357); - /* (358) typetoken ::= typename */ yytestcase(yyruleno==358); - /* (359) typename ::= ID|STRING */ yytestcase(yyruleno==359); - /* (360) signed ::= plus_num (OPTIMIZED OUT) */ assert(yyruleno!=360); - /* (361) signed ::= minus_num (OPTIMIZED OUT) */ assert(yyruleno!=361); - /* (362) carglist ::= carglist ccons */ yytestcase(yyruleno==362); - /* (363) carglist ::= */ yytestcase(yyruleno==363); - /* (364) ccons ::= NULL onconf */ yytestcase(yyruleno==364); - /* (365) ccons ::= GENERATED ALWAYS AS generated */ yytestcase(yyruleno==365); - /* (366) ccons ::= AS generated */ yytestcase(yyruleno==366); - /* (367) conslist_opt ::= COMMA conslist */ yytestcase(yyruleno==367); - /* (368) conslist ::= conslist tconscomma tcons */ yytestcase(yyruleno==368); - /* (369) conslist ::= tcons (OPTIMIZED OUT) */ assert(yyruleno!=369); - /* (370) tconscomma ::= */ yytestcase(yyruleno==370); - /* (371) defer_subclause_opt ::= defer_subclause (OPTIMIZED OUT) */ assert(yyruleno!=371); - /* (372) resolvetype ::= raisetype (OPTIMIZED OUT) */ assert(yyruleno!=372); - /* (373) selectnowith ::= oneselect (OPTIMIZED OUT) */ assert(yyruleno!=373); - /* (374) oneselect ::= values */ yytestcase(yyruleno==374); - /* (375) sclp ::= selcollist COMMA */ yytestcase(yyruleno==375); - /* (376) as ::= ID|STRING */ yytestcase(yyruleno==376); - /* (377) returning ::= */ yytestcase(yyruleno==377); - /* (378) expr ::= term (OPTIMIZED OUT) */ assert(yyruleno!=378); - /* (379) likeop ::= LIKE_KW|MATCH */ yytestcase(yyruleno==379); - /* (380) exprlist ::= nexprlist */ yytestcase(yyruleno==380); - /* (381) nmnum ::= plus_num (OPTIMIZED OUT) */ assert(yyruleno!=381); - /* (382) nmnum ::= nm (OPTIMIZED OUT) */ assert(yyruleno!=382); - /* (383) nmnum ::= ON */ yytestcase(yyruleno==383); - /* (384) nmnum ::= DELETE */ yytestcase(yyruleno==384); - /* (385) nmnum ::= DEFAULT */ yytestcase(yyruleno==385); - /* (386) plus_num ::= INTEGER|FLOAT */ yytestcase(yyruleno==386); - /* (387) foreach_clause ::= */ yytestcase(yyruleno==387); - /* (388) foreach_clause ::= FOR EACH ROW */ yytestcase(yyruleno==388); - /* (389) trnm ::= nm */ yytestcase(yyruleno==389); - /* (390) tridxby ::= */ yytestcase(yyruleno==390); - /* (391) database_kw_opt ::= DATABASE */ yytestcase(yyruleno==391); - /* (392) database_kw_opt ::= */ yytestcase(yyruleno==392); - /* (393) kwcolumn_opt ::= */ yytestcase(yyruleno==393); - /* (394) kwcolumn_opt ::= COLUMNKW */ yytestcase(yyruleno==394); - /* (395) vtabarglist ::= vtabarg */ yytestcase(yyruleno==395); - /* (396) vtabarglist ::= vtabarglist COMMA vtabarg */ yytestcase(yyruleno==396); - /* (397) vtabarg ::= vtabarg vtabargtoken */ yytestcase(yyruleno==397); - /* (398) anylist ::= */ yytestcase(yyruleno==398); - /* (399) anylist ::= anylist LP anylist RP */ yytestcase(yyruleno==399); - /* (400) anylist ::= anylist ANY */ yytestcase(yyruleno==400); - /* (401) with ::= */ yytestcase(yyruleno==401); + /* (342) input ::= cmdlist */ yytestcase(yyruleno==342); + /* (343) cmdlist ::= cmdlist ecmd */ yytestcase(yyruleno==343); + /* (344) cmdlist ::= ecmd (OPTIMIZED OUT) */ assert(yyruleno!=344); + /* (345) ecmd ::= SEMI */ yytestcase(yyruleno==345); + /* (346) ecmd ::= cmdx SEMI */ yytestcase(yyruleno==346); + /* (347) ecmd ::= explain cmdx SEMI (NEVER REDUCES) */ assert(yyruleno!=347); + /* (348) trans_opt ::= */ yytestcase(yyruleno==348); + /* (349) trans_opt ::= TRANSACTION */ yytestcase(yyruleno==349); + /* (350) trans_opt ::= TRANSACTION nm */ yytestcase(yyruleno==350); + /* (351) savepoint_opt ::= SAVEPOINT */ yytestcase(yyruleno==351); + /* (352) savepoint_opt ::= */ yytestcase(yyruleno==352); + /* (353) cmd ::= create_table create_table_args */ yytestcase(yyruleno==353); + /* (354) table_option_set ::= table_option (OPTIMIZED OUT) */ assert(yyruleno!=354); + /* (355) columnlist ::= columnlist COMMA columnname carglist */ yytestcase(yyruleno==355); + /* (356) columnlist ::= columnname carglist */ yytestcase(yyruleno==356); + /* (357) nm ::= ID|INDEXED */ yytestcase(yyruleno==357); + /* (358) nm ::= STRING */ yytestcase(yyruleno==358); + /* (359) nm ::= JOIN_KW */ yytestcase(yyruleno==359); + /* (360) typetoken ::= typename */ yytestcase(yyruleno==360); + /* (361) typename ::= ID|STRING */ yytestcase(yyruleno==361); + /* (362) signed ::= plus_num (OPTIMIZED OUT) */ assert(yyruleno!=362); + /* (363) signed ::= minus_num (OPTIMIZED OUT) */ assert(yyruleno!=363); + /* (364) carglist ::= carglist ccons */ yytestcase(yyruleno==364); + /* (365) carglist ::= */ yytestcase(yyruleno==365); + /* (366) ccons ::= NULL onconf */ yytestcase(yyruleno==366); + /* (367) ccons ::= GENERATED ALWAYS AS generated */ yytestcase(yyruleno==367); + /* (368) ccons ::= AS generated */ yytestcase(yyruleno==368); + /* (369) conslist_opt ::= COMMA conslist */ yytestcase(yyruleno==369); + /* (370) conslist ::= conslist tconscomma tcons */ yytestcase(yyruleno==370); + /* (371) conslist ::= tcons (OPTIMIZED OUT) */ assert(yyruleno!=371); + /* (372) tconscomma ::= */ yytestcase(yyruleno==372); + /* (373) defer_subclause_opt ::= defer_subclause (OPTIMIZED OUT) */ assert(yyruleno!=373); + /* (374) resolvetype ::= raisetype (OPTIMIZED OUT) */ assert(yyruleno!=374); + /* (375) selectnowith ::= oneselect (OPTIMIZED OUT) */ assert(yyruleno!=375); + /* (376) oneselect ::= values */ yytestcase(yyruleno==376); + /* (377) sclp ::= selcollist COMMA */ yytestcase(yyruleno==377); + /* (378) as ::= ID|STRING */ yytestcase(yyruleno==378); + /* (379) indexed_opt ::= indexed_by (OPTIMIZED OUT) */ assert(yyruleno!=379); + /* (380) returning ::= */ yytestcase(yyruleno==380); + /* (381) expr ::= term (OPTIMIZED OUT) */ assert(yyruleno!=381); + /* (382) likeop ::= LIKE_KW|MATCH */ yytestcase(yyruleno==382); + /* (383) exprlist ::= nexprlist */ yytestcase(yyruleno==383); + /* (384) nmnum ::= plus_num (OPTIMIZED OUT) */ assert(yyruleno!=384); + /* (385) nmnum ::= nm (OPTIMIZED OUT) */ assert(yyruleno!=385); + /* (386) nmnum ::= ON */ yytestcase(yyruleno==386); + /* (387) nmnum ::= DELETE */ yytestcase(yyruleno==387); + /* (388) nmnum ::= DEFAULT */ yytestcase(yyruleno==388); + /* (389) plus_num ::= INTEGER|FLOAT */ yytestcase(yyruleno==389); + /* (390) foreach_clause ::= */ yytestcase(yyruleno==390); + /* (391) foreach_clause ::= FOR EACH ROW */ yytestcase(yyruleno==391); + /* (392) trnm ::= nm */ yytestcase(yyruleno==392); + /* (393) tridxby ::= */ yytestcase(yyruleno==393); + /* (394) database_kw_opt ::= DATABASE */ yytestcase(yyruleno==394); + /* (395) database_kw_opt ::= */ yytestcase(yyruleno==395); + /* (396) kwcolumn_opt ::= */ yytestcase(yyruleno==396); + /* (397) kwcolumn_opt ::= COLUMNKW */ yytestcase(yyruleno==397); + /* (398) vtabarglist ::= vtabarg */ yytestcase(yyruleno==398); + /* (399) vtabarglist ::= vtabarglist COMMA vtabarg */ yytestcase(yyruleno==399); + /* (400) vtabarg ::= vtabarg vtabargtoken */ yytestcase(yyruleno==400); + /* (401) anylist ::= */ yytestcase(yyruleno==401); + /* (402) anylist ::= anylist LP anylist RP */ yytestcase(yyruleno==402); + /* (403) anylist ::= anylist ANY */ yytestcase(yyruleno==403); + /* (404) with ::= */ yytestcase(yyruleno==404); break; /********** End reduce actions ************************************************/ }; @@ -167921,6 +171882,7 @@ SQLITE_PRIVATE int sqlite3RunParser(Parse *pParse, const char *zSql){ mxSqlLen -= n; if( mxSqlLen<0 ){ pParse->rc = SQLITE_TOOBIG; + pParse->nErr++; break; } #ifndef SQLITE_OMIT_WINDOWFUNC @@ -168017,7 +171979,7 @@ SQLITE_PRIVATE int sqlite3RunParser(Parse *pParse, const char *zSql){ if( pParse->pNewTrigger && !IN_RENAME_OBJECT ){ sqlite3DeleteTrigger(db, pParse->pNewTrigger); } - sqlite3DbFree(db, pParse->pVList); + if( pParse->pVList ) sqlite3DbNNFreeNN(db, pParse->pVList); db->pParse = pParentParse; assert( nErr==0 || pParse->rc!=SQLITE_OK ); return nErr; @@ -169373,18 +173335,19 @@ static int setupLookaside(sqlite3 *db, void *pBuf, int sz, int cnt){ db->lookaside.bMalloced = pBuf==0 ?1:0; db->lookaside.nSlot = nBig+nSm; }else{ - db->lookaside.pStart = db; + db->lookaside.pStart = 0; #ifndef SQLITE_OMIT_TWOSIZE_LOOKASIDE db->lookaside.pSmallInit = 0; db->lookaside.pSmallFree = 0; - db->lookaside.pMiddle = db; + db->lookaside.pMiddle = 0; #endif /* SQLITE_OMIT_TWOSIZE_LOOKASIDE */ - db->lookaside.pEnd = db; + db->lookaside.pEnd = 0; db->lookaside.bDisable = 1; db->lookaside.sz = 0; db->lookaside.bMalloced = 0; db->lookaside.nSlot = 0; } + db->lookaside.pTrueEnd = db->lookaside.pEnd; assert( sqlite3LookasideUsed(db,0)==0 ); #endif /* SQLITE_OMIT_LOOKASIDE */ return SQLITE_OK; @@ -169463,6 +173426,7 @@ SQLITE_API int sqlite3_db_cacheflush(sqlite3 *db){ SQLITE_API int sqlite3_db_config(sqlite3 *db, int op, ...){ va_list ap; int rc; + sqlite3_mutex_enter(db->mutex); va_start(ap, op); switch( op ){ case SQLITE_DBCONFIG_MAINDBNAME: { @@ -169528,6 +173492,7 @@ SQLITE_API int sqlite3_db_config(sqlite3 *db, int op, ...){ } } va_end(ap); + sqlite3_mutex_leave(db->mutex); return rc; } @@ -170664,7 +174629,7 @@ SQLITE_API int sqlite3_overload_function( rc = sqlite3FindFunction(db, zName, nArg, SQLITE_UTF8, 0)!=0; sqlite3_mutex_leave(db->mutex); if( rc ) return SQLITE_OK; - zCopy = sqlite3_mprintf(zName); + zCopy = sqlite3_mprintf("%s", zName); if( zCopy==0 ) return SQLITE_NOMEM; return sqlite3_create_function_v2(db, zName, nArg, SQLITE_UTF8, zCopy, sqlite3InvalidFunction, 0, 0, sqlite3_free); @@ -171898,6 +175863,19 @@ static int openDatabase( goto opendb_out; } +#if SQLITE_OS_UNIX && defined(SQLITE_OS_KV_OPTIONAL) + /* Process magic filenames ":localStorage:" and ":sessionStorage:" */ + if( zFilename && zFilename[0]==':' ){ + if( strcmp(zFilename, ":localStorage:")==0 ){ + zFilename = "file:local?vfs=kvvfs"; + flags |= SQLITE_OPEN_URI; + }else if( strcmp(zFilename, ":sessionStorage:")==0 ){ + zFilename = "file:session?vfs=kvvfs"; + flags |= SQLITE_OPEN_URI; + } + } +#endif /* SQLITE_OS_UNIX && defined(SQLITE_OS_KV_OPTIONAL) */ + /* Parse the filename/URI argument ** ** Only allow sensible combinations of bits in the flags argument. @@ -171928,6 +175906,12 @@ static int openDatabase( sqlite3_free(zErrMsg); goto opendb_out; } + assert( db->pVfs!=0 ); +#if SQLITE_OS_KV || defined(SQLITE_OS_KV_OPTIONAL) + if( sqlite3_stricmp(db->pVfs->zName, "kvvfs")==0 ){ + db->temp_store = 2; + } +#endif /* Open the backend database driver */ rc = sqlite3BtreeOpen(db->pVfs, zOpen, db, &db->aDb[0].pBt, 0, @@ -172477,6 +176461,9 @@ SQLITE_API int sqlite3_file_control(sqlite3 *db, const char *zDbName, int op, vo sqlite3BtreeSetPageSize(pBtree, 0, iNew, 0); } rc = SQLITE_OK; + }else if( op==SQLITE_FCNTL_RESET_CACHE ){ + sqlite3BtreeClearCache(pBtree); + rc = SQLITE_OK; }else{ int nSave = db->busyHandler.nBusy; rc = sqlite3OsFileControl(fd, op, pArg); @@ -172646,6 +176633,28 @@ SQLITE_API int sqlite3_test_control(int op, ...){ volatile int x = 0; assert( /*side-effects-ok*/ (x = va_arg(ap,int))!=0 ); rc = x; +#if defined(SQLITE_DEBUG) + /* Invoke these debugging routines so that the compiler does not + ** issue "defined but not used" warnings. */ + if( x==9999 ){ + sqlite3ShowExpr(0); + sqlite3ShowExpr(0); + sqlite3ShowExprList(0); + sqlite3ShowIdList(0); + sqlite3ShowSrcList(0); + sqlite3ShowWith(0); + sqlite3ShowUpsert(0); + sqlite3ShowTriggerStep(0); + sqlite3ShowTriggerStepList(0); + sqlite3ShowTrigger(0); + sqlite3ShowTriggerList(0); +#ifndef SQLITE_OMIT_WINDOWFUNC + sqlite3ShowWindow(0); + sqlite3ShowWinFunc(0); +#endif + sqlite3ShowSelect(0); + } +#endif break; } @@ -172907,8 +176916,8 @@ SQLITE_API int sqlite3_test_control(int op, ...){ ** ** "ptr" is a pointer to a u32. ** - ** op==0 Store the current sqlite3SelectTrace in *ptr - ** op==1 Set sqlite3SelectTrace to the value *ptr + ** op==0 Store the current sqlite3TreeTrace in *ptr + ** op==1 Set sqlite3TreeTrace to the value *ptr ** op==3 Store the current sqlite3WhereTrace in *ptr ** op==3 Set sqlite3WhereTrace to the value *ptr */ @@ -172916,10 +176925,10 @@ SQLITE_API int sqlite3_test_control(int op, ...){ int opTrace = va_arg(ap, int); u32 *ptr = va_arg(ap, u32*); switch( opTrace ){ - case 0: *ptr = sqlite3SelectTrace; break; - case 1: sqlite3SelectTrace = *ptr; break; - case 2: *ptr = sqlite3WhereTrace; break; - case 3: sqlite3WhereTrace = *ptr; break; + case 0: *ptr = sqlite3TreeTrace; break; + case 1: sqlite3TreeTrace = *ptr; break; + case 2: *ptr = sqlite3WhereTrace; break; + case 3: sqlite3WhereTrace = *ptr; break; } break; } @@ -172936,10 +176945,12 @@ SQLITE_API int sqlite3_test_control(int op, ...){ case SQLITE_TESTCTRL_LOGEST: { double rIn = va_arg(ap, double); LogEst rLogEst = sqlite3LogEstFromDouble(rIn); - u64 iInt = sqlite3LogEstToInt(rLogEst); - va_arg(ap, int*)[0] = rLogEst; - va_arg(ap, u64*)[0] = iInt; - va_arg(ap, int*)[0] = sqlite3LogEst(iInt); + int *pI1 = va_arg(ap,int*); + u64 *pU64 = va_arg(ap,u64*); + int *pI2 = va_arg(ap,int*); + *pI1 = rLogEst; + *pU64 = sqlite3LogEstToInt(rLogEst); + *pI2 = sqlite3LogEst(*pU64); break; } @@ -173013,7 +177024,7 @@ static char *appendText(char *p, const char *z){ ** Memory layout must be compatible with that generated by the pager ** and expected by sqlite3_uri_parameter() and databaseName(). */ -SQLITE_API char *sqlite3_create_filename( +SQLITE_API const char *sqlite3_create_filename( const char *zDatabase, const char *zJournal, const char *zWal, @@ -173049,10 +177060,10 @@ SQLITE_API char *sqlite3_create_filename( ** error to call this routine with any parameter other than a pointer ** previously obtained from sqlite3_create_filename() or a NULL pointer. */ -SQLITE_API void sqlite3_free_filename(char *p){ +SQLITE_API void sqlite3_free_filename(const char *p){ if( p==0 ) return; - p = (char*)databaseName(p); - sqlite3_free(p - 4); + p = databaseName(p); + sqlite3_free((char*)p - 4); } @@ -173154,6 +177165,24 @@ SQLITE_PRIVATE Btree *sqlite3DbNameToBtree(sqlite3 *db, const char *zDbName){ return iDb<0 ? 0 : db->aDb[iDb].pBt; } +/* +** Return the name of the N-th database schema. Return NULL if N is out +** of range. +*/ +SQLITE_API const char *sqlite3_db_name(sqlite3 *db, int N){ +#ifdef SQLITE_ENABLE_API_ARMOR + if( !sqlite3SafetyCheckOk(db) ){ + (void)SQLITE_MISUSE_BKPT; + return 0; + } +#endif + if( N<0 || N>=db->nDb ){ + return 0; + }else{ + return db->aDb[N].zDbSName; + } +} + /* ** Return the filename of the database associated with a database ** connection. @@ -173285,8 +177314,8 @@ SQLITE_API int sqlite3_snapshot_open( */ SQLITE_API int sqlite3_snapshot_recover(sqlite3 *db, const char *zDb){ int rc = SQLITE_ERROR; - int iDb; #ifndef SQLITE_OMIT_WAL + int iDb; #ifdef SQLITE_ENABLE_API_ARMOR if( !sqlite3SafetyCheckOk(db) ){ @@ -174841,7 +178870,7 @@ struct Fts3MultiSegReader { int nAdvance; /* How many seg-readers to advance */ Fts3SegFilter *pFilter; /* Pointer to filter object */ char *aBuffer; /* Buffer to merge doclists in */ - int nBuffer; /* Allocated size of aBuffer[] in bytes */ + i64 nBuffer; /* Allocated size of aBuffer[] in bytes */ int iColFilter; /* If >=0, filter for this column */ int bRestart; @@ -177537,7 +181566,7 @@ static int fts3TermSelectMerge( ** ** Similar padding is added in the fts3DoclistOrMerge() function. */ - pTS->aaOutput[0] = sqlite3_malloc(nDoclist + FTS3_VARINT_MAX + 1); + pTS->aaOutput[0] = sqlite3_malloc64((i64)nDoclist + FTS3_VARINT_MAX + 1); pTS->anOutput[0] = nDoclist; if( pTS->aaOutput[0] ){ memcpy(pTS->aaOutput[0], aDoclist, nDoclist); @@ -178957,8 +182986,7 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ char *aPoslist = 0; /* Position list for deferred tokens */ int nPoslist = 0; /* Number of bytes in aPoslist */ int iPrev = -1; /* Token number of previous deferred token */ - - assert( pPhrase->doclist.bFreeList==0 ); + char *aFree = (pPhrase->doclist.bFreeList ? pPhrase->doclist.pList : 0); for(iToken=0; iTokennToken; iToken++){ Fts3PhraseToken *pToken = &pPhrase->aToken[iToken]; @@ -178972,6 +183000,7 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ if( pList==0 ){ sqlite3_free(aPoslist); + sqlite3_free(aFree); pPhrase->doclist.pList = 0; pPhrase->doclist.nList = 0; return SQLITE_OK; @@ -178992,6 +183021,7 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ nPoslist = (int)(aOut - aPoslist); if( nPoslist==0 ){ sqlite3_free(aPoslist); + sqlite3_free(aFree); pPhrase->doclist.pList = 0; pPhrase->doclist.nList = 0; return SQLITE_OK; @@ -179024,13 +183054,14 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ nDistance = iPrev - nMaxUndeferred; } - aOut = (char *)sqlite3_malloc(nPoslist+8); + aOut = (char *)sqlite3Fts3MallocZero(nPoslist+FTS3_BUFFER_PADDING); if( !aOut ){ sqlite3_free(aPoslist); return SQLITE_NOMEM; } pPhrase->doclist.pList = aOut; + assert( p1 && p2 ); if( fts3PoslistPhraseMerge(&aOut, nDistance, 0, 1, &p1, &p2) ){ pPhrase->doclist.bFreeList = 1; pPhrase->doclist.nList = (int)(aOut - pPhrase->doclist.pList); @@ -179043,6 +183074,7 @@ static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){ } } + if( pPhrase->doclist.pList!=aFree ) sqlite3_free(aFree); return SQLITE_OK; } #endif /* SQLITE_DISABLE_FTS4_DEFERRED */ @@ -179391,7 +183423,7 @@ static int fts3EvalIncrPhraseNext( if( bEof==0 ){ int nList = 0; int nByte = a[p->nToken-1].nList; - char *aDoclist = sqlite3_malloc(nByte+FTS3_BUFFER_PADDING); + char *aDoclist = sqlite3_malloc64((i64)nByte+FTS3_BUFFER_PADDING); if( !aDoclist ) return SQLITE_NOMEM; memcpy(aDoclist, a[p->nToken-1].pList, nByte+1); memset(&aDoclist[nByte], 0, FTS3_BUFFER_PADDING); @@ -180217,11 +184249,10 @@ static int fts3EvalTestExpr( default: { #ifndef SQLITE_DISABLE_FTS4_DEFERRED - if( pCsr->pDeferred - && (pExpr->iDocid==pCsr->iPrevId || pExpr->bDeferred) - ){ + if( pCsr->pDeferred && (pExpr->bDeferred || ( + pExpr->iDocid==pCsr->iPrevId && pExpr->pPhrase->doclist.pList + ))){ Fts3Phrase *pPhrase = pExpr->pPhrase; - assert( pExpr->bDeferred || pPhrase->doclist.bFreeList==0 ); if( pExpr->bDeferred ){ fts3EvalInvalidatePoslist(pPhrase); } @@ -183628,7 +187659,7 @@ static int porterNext( if( n>c->nAllocated ){ char *pNew; c->nAllocated = n+20; - pNew = sqlite3_realloc(c->zToken, c->nAllocated); + pNew = sqlite3_realloc64(c->zToken, c->nAllocated); if( !pNew ) return SQLITE_NOMEM; c->zToken = pNew; } @@ -184380,7 +188411,7 @@ static int simpleNext( if( n>c->nTokenAllocated ){ char *pNew; c->nTokenAllocated = n+20; - pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); + pNew = sqlite3_realloc64(c->pToken, c->nTokenAllocated); if( !pNew ) return SQLITE_NOMEM; c->pToken = pNew; } @@ -185542,7 +189573,7 @@ static int fts3PendingListAppendVarint( /* Allocate or grow the PendingList as required. */ if( !p ){ - p = sqlite3_malloc(sizeof(*p) + 100); + p = sqlite3_malloc64(sizeof(*p) + 100); if( !p ){ return SQLITE_NOMEM; } @@ -185551,14 +189582,14 @@ static int fts3PendingListAppendVarint( p->nData = 0; } else if( p->nData+FTS3_VARINT_MAX+1>p->nSpace ){ - int nNew = p->nSpace * 2; - p = sqlite3_realloc(p, sizeof(*p) + nNew); + i64 nNew = p->nSpace * 2; + p = sqlite3_realloc64(p, sizeof(*p) + nNew); if( !p ){ sqlite3_free(*pp); *pp = 0; return SQLITE_NOMEM; } - p->nSpace = nNew; + p->nSpace = (int)nNew; p->aData = (char *)&p[1]; } @@ -186115,7 +190146,7 @@ SQLITE_PRIVATE int sqlite3Fts3ReadBlock( int nByte = sqlite3_blob_bytes(p->pSegments); *pnBlob = nByte; if( paBlob ){ - char *aByte = sqlite3_malloc(nByte + FTS3_NODE_PADDING); + char *aByte = sqlite3_malloc64((i64)nByte + FTS3_NODE_PADDING); if( !aByte ){ rc = SQLITE_NOMEM; }else{ @@ -186232,7 +190263,7 @@ static int fts3SegReaderNext( int nTerm = fts3HashKeysize(pElem); if( (nTerm+1)>pReader->nTermAlloc ){ sqlite3_free(pReader->zTerm); - pReader->zTerm = (char*)sqlite3_malloc((nTerm+1)*2); + pReader->zTerm = (char*)sqlite3_malloc64(((i64)nTerm+1)*2); if( !pReader->zTerm ) return SQLITE_NOMEM; pReader->nTermAlloc = (nTerm+1)*2; } @@ -186240,7 +190271,7 @@ static int fts3SegReaderNext( pReader->zTerm[nTerm] = '\0'; pReader->nTerm = nTerm; - aCopy = (char*)sqlite3_malloc(nCopy); + aCopy = (char*)sqlite3_malloc64(nCopy); if( !aCopy ) return SQLITE_NOMEM; memcpy(aCopy, pList->aData, nCopy); pReader->nNode = pReader->nDoclist = nCopy; @@ -186527,7 +190558,7 @@ SQLITE_PRIVATE int sqlite3Fts3SegReaderNew( nExtra = nRoot + FTS3_NODE_PADDING; } - pReader = (Fts3SegReader *)sqlite3_malloc(sizeof(Fts3SegReader) + nExtra); + pReader = (Fts3SegReader *)sqlite3_malloc64(sizeof(Fts3SegReader) + nExtra); if( !pReader ){ return SQLITE_NOMEM; } @@ -186619,7 +190650,7 @@ SQLITE_PRIVATE int sqlite3Fts3SegReaderPending( if( nElem==nAlloc ){ Fts3HashElem **aElem2; nAlloc += 16; - aElem2 = (Fts3HashElem **)sqlite3_realloc( + aElem2 = (Fts3HashElem **)sqlite3_realloc64( aElem, nAlloc*sizeof(Fts3HashElem *) ); if( !aElem2 ){ @@ -186953,7 +190984,7 @@ static int fts3NodeAddTerm( ** this is not expected to be a serious problem. */ assert( pTree->aData==(char *)&pTree[1] ); - pTree->aData = (char *)sqlite3_malloc(nReq); + pTree->aData = (char *)sqlite3_malloc64(nReq); if( !pTree->aData ){ return SQLITE_NOMEM; } @@ -186971,7 +191002,7 @@ static int fts3NodeAddTerm( if( isCopyTerm ){ if( pTree->nMalloczMalloc, nTerm*2); + char *zNew = sqlite3_realloc64(pTree->zMalloc, (i64)nTerm*2); if( !zNew ){ return SQLITE_NOMEM; } @@ -186997,7 +191028,7 @@ static int fts3NodeAddTerm( ** now. Instead, the term is inserted into the parent of pTree. If pTree ** has no parent, one is created here. */ - pNew = (SegmentNode *)sqlite3_malloc(sizeof(SegmentNode) + p->nNodeSize); + pNew = (SegmentNode *)sqlite3_malloc64(sizeof(SegmentNode) + p->nNodeSize); if( !pNew ){ return SQLITE_NOMEM; } @@ -187135,7 +191166,7 @@ static int fts3SegWriterAdd( ){ int nPrefix; /* Size of term prefix in bytes */ int nSuffix; /* Size of term suffix in bytes */ - int nReq; /* Number of bytes required on leaf page */ + i64 nReq; /* Number of bytes required on leaf page */ int nData; SegmentWriter *pWriter = *ppWriter; @@ -187144,13 +191175,13 @@ static int fts3SegWriterAdd( sqlite3_stmt *pStmt; /* Allocate the SegmentWriter structure */ - pWriter = (SegmentWriter *)sqlite3_malloc(sizeof(SegmentWriter)); + pWriter = (SegmentWriter *)sqlite3_malloc64(sizeof(SegmentWriter)); if( !pWriter ) return SQLITE_NOMEM; memset(pWriter, 0, sizeof(SegmentWriter)); *ppWriter = pWriter; /* Allocate a buffer in which to accumulate data */ - pWriter->aData = (char *)sqlite3_malloc(p->nNodeSize); + pWriter->aData = (char *)sqlite3_malloc64(p->nNodeSize); if( !pWriter->aData ) return SQLITE_NOMEM; pWriter->nSize = p->nNodeSize; @@ -187225,7 +191256,7 @@ static int fts3SegWriterAdd( ** the buffer to make it large enough. */ if( nReq>pWriter->nSize ){ - char *aNew = sqlite3_realloc(pWriter->aData, nReq); + char *aNew = sqlite3_realloc64(pWriter->aData, nReq); if( !aNew ) return SQLITE_NOMEM; pWriter->aData = aNew; pWriter->nSize = nReq; @@ -187250,7 +191281,7 @@ static int fts3SegWriterAdd( */ if( isCopyTerm ){ if( nTerm>pWriter->nMalloc ){ - char *zNew = sqlite3_realloc(pWriter->zMalloc, nTerm*2); + char *zNew = sqlite3_realloc64(pWriter->zMalloc, (i64)nTerm*2); if( !zNew ){ return SQLITE_NOMEM; } @@ -187558,12 +191589,12 @@ static void fts3ColumnFilter( static int fts3MsrBufferData( Fts3MultiSegReader *pMsr, /* Multi-segment-reader handle */ char *pList, - int nList + i64 nList ){ if( nList>pMsr->nBuffer ){ char *pNew; pMsr->nBuffer = nList*2; - pNew = (char *)sqlite3_realloc(pMsr->aBuffer, pMsr->nBuffer); + pNew = (char *)sqlite3_realloc64(pMsr->aBuffer, pMsr->nBuffer); if( !pNew ) return SQLITE_NOMEM; pMsr->aBuffer = pNew; } @@ -187619,7 +191650,7 @@ SQLITE_PRIVATE int sqlite3Fts3MsrIncrNext( fts3SegReaderSort(pMsr->apSegment, nMerge, j, xCmp); if( nList>0 && fts3SegReaderIsPending(apSegment[0]) ){ - rc = fts3MsrBufferData(pMsr, pList, nList+1); + rc = fts3MsrBufferData(pMsr, pList, (i64)nList+1); if( rc!=SQLITE_OK ) return rc; assert( (pMsr->aBuffer[nList] & 0xFE)==0x00 ); pList = pMsr->aBuffer; @@ -187756,11 +191787,11 @@ SQLITE_PRIVATE int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr){ return SQLITE_OK; } -static int fts3GrowSegReaderBuffer(Fts3MultiSegReader *pCsr, int nReq){ +static int fts3GrowSegReaderBuffer(Fts3MultiSegReader *pCsr, i64 nReq){ if( nReq>pCsr->nBuffer ){ char *aNew; pCsr->nBuffer = nReq*2; - aNew = sqlite3_realloc(pCsr->aBuffer, pCsr->nBuffer); + aNew = sqlite3_realloc64(pCsr->aBuffer, pCsr->nBuffer); if( !aNew ){ return SQLITE_NOMEM; } @@ -187851,7 +191882,8 @@ SQLITE_PRIVATE int sqlite3Fts3SegReaderStep( ){ pCsr->nDoclist = apSegment[0]->nDoclist; if( fts3SegReaderIsPending(apSegment[0]) ){ - rc = fts3MsrBufferData(pCsr, apSegment[0]->aDoclist, pCsr->nDoclist); + rc = fts3MsrBufferData(pCsr, apSegment[0]->aDoclist, + (i64)pCsr->nDoclist); pCsr->aDoclist = pCsr->aBuffer; }else{ pCsr->aDoclist = apSegment[0]->aDoclist; @@ -187904,7 +191936,8 @@ SQLITE_PRIVATE int sqlite3Fts3SegReaderStep( nByte = sqlite3Fts3VarintLen(iDelta) + (isRequirePos?nList+1:0); - rc = fts3GrowSegReaderBuffer(pCsr, nByte+nDoclist+FTS3_NODE_PADDING); + rc = fts3GrowSegReaderBuffer(pCsr, + (i64)nByte+nDoclist+FTS3_NODE_PADDING); if( rc ) return rc; if( isFirst ){ @@ -187930,7 +191963,7 @@ SQLITE_PRIVATE int sqlite3Fts3SegReaderStep( fts3SegReaderSort(apSegment, nMerge, j, xCmp); } if( nDoclist>0 ){ - rc = fts3GrowSegReaderBuffer(pCsr, nDoclist+FTS3_NODE_PADDING); + rc = fts3GrowSegReaderBuffer(pCsr, (i64)nDoclist+FTS3_NODE_PADDING); if( rc ) return rc; memset(&pCsr->aBuffer[nDoclist], 0, FTS3_NODE_PADDING); pCsr->aDoclist = pCsr->aBuffer; @@ -188643,7 +192676,7 @@ struct NodeReader { static void blobGrowBuffer(Blob *pBlob, int nMin, int *pRc){ if( *pRc==SQLITE_OK && nMin>pBlob->nAlloc ){ int nAlloc = nMin; - char *a = (char *)sqlite3_realloc(pBlob->a, nAlloc); + char *a = (char *)sqlite3_realloc64(pBlob->a, nAlloc); if( a ){ pBlob->nAlloc = nAlloc; pBlob->a = a; @@ -188792,6 +192825,8 @@ static int fts3IncrmergePush( pBlk->n += sqlite3Fts3PutVarint(&pBlk->a[pBlk->n], nPrefix); } pBlk->n += sqlite3Fts3PutVarint(&pBlk->a[pBlk->n], nSuffix); + assert( nPrefix+nSuffix<=nTerm ); + assert( nPrefix>=0 ); memcpy(&pBlk->a[pBlk->n], &zTerm[nPrefix], nSuffix); pBlk->n += nSuffix; @@ -188914,6 +192949,7 @@ static int fts3IncrmergeAppend( pLeaf = &pWriter->aNodeWriter[0]; nPrefix = fts3PrefixCompress(pLeaf->key.a, pLeaf->key.n, zTerm, nTerm); nSuffix = nTerm - nPrefix; + if(nSuffix<=0 ) return FTS_CORRUPT_VTAB; nSpace = sqlite3Fts3VarintLen(nPrefix); nSpace += sqlite3Fts3VarintLen(nSuffix) + nSuffix; @@ -189437,7 +193473,7 @@ static int fts3RepackSegdirLevel( if( nIdx>=nAlloc ){ int *aNew; nAlloc += 16; - aNew = sqlite3_realloc(aIdx, nAlloc*sizeof(int)); + aNew = sqlite3_realloc64(aIdx, nAlloc*sizeof(int)); if( !aNew ){ rc = SQLITE_NOMEM; break; @@ -189811,7 +193847,7 @@ SQLITE_PRIVATE int sqlite3Fts3Incrmerge(Fts3Table *p, int nMerge, int nMin){ /* Allocate space for the cursor, filter and writer objects */ const int nAlloc = sizeof(*pCsr) + sizeof(*pFilter) + sizeof(*pWriter); - pWriter = (IncrmergeWriter *)sqlite3_malloc(nAlloc); + pWriter = (IncrmergeWriter *)sqlite3_malloc64(nAlloc); if( !pWriter ) return SQLITE_NOMEM; pFilter = (Fts3SegFilter *)&pWriter[1]; pCsr = (Fts3MultiSegReader *)&pFilter[1]; @@ -190447,7 +194483,7 @@ SQLITE_PRIVATE int sqlite3Fts3DeferredTokenList( return SQLITE_OK; } - pRet = (char *)sqlite3_malloc(p->pList->nData); + pRet = (char *)sqlite3_malloc64(p->pList->nData); if( !pRet ) return SQLITE_NOMEM; nSkip = sqlite3Fts3GetVarint(p->pList->aData, &dummy); @@ -190467,7 +194503,7 @@ SQLITE_PRIVATE int sqlite3Fts3DeferToken( int iCol /* Column that token must appear in (or -1) */ ){ Fts3DeferredToken *pDeferred; - pDeferred = sqlite3_malloc(sizeof(*pDeferred)); + pDeferred = sqlite3_malloc64(sizeof(*pDeferred)); if( !pDeferred ){ return SQLITE_NOMEM; } @@ -194871,7 +198907,7 @@ static JsonNode *jsonMergePatch( if( pPatch->eType!=JSON_OBJECT ){ return pPatch; } - assert( iTarget>=0 && iTargetnNode ); + assert( iTargetnNode ); pTarget = &pParse->aNode[iTarget]; assert( (pPatch->jnFlags & JNODE_APPEND)==0 ); if( pTarget->eType!=JSON_OBJECT ){ @@ -199182,7 +203218,7 @@ static int rtreeUpdate( rtreeReference(pRtree); assert(nData>=1); - cell.iRowid = 0; /* Used only to suppress a compiler warning */ + memset(&cell, 0, sizeof(cell)); /* Constraint handling. A write operation on an r-tree table may return ** SQLITE_CONSTRAINT for two reasons: @@ -202046,7 +206082,7 @@ static int geopolyUpdate( sqlite3_free(p); nChange = 1; } - for(jj=1; jjnAux; jj++){ + for(jj=1; jjdbRbu, "main", SQLITE_FCNTL_RBUCNT, (void*)p); if( p->zState==0 ){ const char *zFile = sqlite3_db_filename(p->dbRbu, "main"); - p->zState = rbuMPrintf(p, "file://%s-vacuum?modeof=%s", zFile, zFile); + p->zState = rbuMPrintf(p, "file:///%s-vacuum?modeof=%s", zFile, zFile); } } @@ -207022,32 +211089,7 @@ static void rbuMoveOalFile(sqlite3rbu *p){ } if( p->rc==SQLITE_OK ){ -#if defined(_WIN32_WCE) - { - LPWSTR zWideOal; - LPWSTR zWideWal; - - zWideOal = rbuWinUtf8ToUnicode(zOal); - if( zWideOal ){ - zWideWal = rbuWinUtf8ToUnicode(zWal); - if( zWideWal ){ - if( MoveFileW(zWideOal, zWideWal) ){ - p->rc = SQLITE_OK; - }else{ - p->rc = SQLITE_IOERR; - } - sqlite3_free(zWideWal); - }else{ - p->rc = SQLITE_IOERR_NOMEM; - } - sqlite3_free(zWideOal); - }else{ - p->rc = SQLITE_IOERR_NOMEM; - } - } -#else - p->rc = rename(zOal, zWal) ? SQLITE_IOERR : SQLITE_OK; -#endif + p->rc = p->xRename(p->pRenameArg, zOal, zWal); } if( p->rc!=SQLITE_OK @@ -207786,6 +211828,7 @@ static sqlite3rbu *openRbuHandle( /* Create the custom VFS. */ memset(p, 0, sizeof(sqlite3rbu)); + sqlite3rbu_rename_handler(p, 0, 0); rbuCreateVfs(p); /* Open the target, RBU and state databases */ @@ -208177,6 +212220,54 @@ SQLITE_API int sqlite3rbu_savestate(sqlite3rbu *p){ return rc; } +/* +** Default xRename callback for RBU. +*/ +static int xDefaultRename(void *pArg, const char *zOld, const char *zNew){ + int rc = SQLITE_OK; +#if defined(_WIN32_WCE) + { + LPWSTR zWideOld; + LPWSTR zWideNew; + + zWideOld = rbuWinUtf8ToUnicode(zOld); + if( zWideOld ){ + zWideNew = rbuWinUtf8ToUnicode(zNew); + if( zWideNew ){ + if( MoveFileW(zWideOld, zWideNew) ){ + rc = SQLITE_OK; + }else{ + rc = SQLITE_IOERR; + } + sqlite3_free(zWideNew); + }else{ + rc = SQLITE_IOERR_NOMEM; + } + sqlite3_free(zWideOld); + }else{ + rc = SQLITE_IOERR_NOMEM; + } + } +#else + rc = rename(zOld, zNew) ? SQLITE_IOERR : SQLITE_OK; +#endif + return rc; +} + +SQLITE_API void sqlite3rbu_rename_handler( + sqlite3rbu *pRbu, + void *pArg, + int (*xRename)(void *pArg, const char *zOld, const char *zNew) +){ + if( xRename ){ + pRbu->xRename = xRename; + pRbu->pRenameArg = pArg; + }else{ + pRbu->xRename = xDefaultRename; + pRbu->pRenameArg = 0; + } +} + /************************************************************************** ** Beginning of RBU VFS shim methods. The VFS shim modifies the behaviour ** of a standard VFS in the following ways: @@ -209285,7 +213376,7 @@ struct StatTable { */ static int statConnect( sqlite3 *db, - void *pAux __maybe_unused, + void *pAux __maybe_unused, int argc, const char *const*argv, sqlite3_vtab **ppVtab, char **pzErr @@ -210189,7 +214280,7 @@ static int dbpageBestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){ ){ pIdxInfo->orderByConsumed = 1; } - sqlite3VtabWriteAll(pIdxInfo); + sqlite3VtabUsesAllSchemas(pIdxInfo); return SQLITE_OK; } @@ -210307,12 +214398,18 @@ static int dbpageColumn( } case 1: { /* data */ DbPage *pDbPage = 0; - rc = sqlite3PagerGet(pCsr->pPager, pCsr->pgno, (DbPage**)&pDbPage, 0); - if( rc==SQLITE_OK ){ - sqlite3_result_blob(ctx, sqlite3PagerGetData(pDbPage), pCsr->szPage, - SQLITE_TRANSIENT); + if( pCsr->pgno==((PENDING_BYTE/pCsr->szPage)+1) ){ + /* The pending byte page. Assume it is zeroed out. Attempting to + ** request this page from the page is an SQLITE_CORRUPT error. */ + sqlite3_result_zeroblob(ctx, pCsr->szPage); + }else{ + rc = sqlite3PagerGet(pCsr->pPager, pCsr->pgno, (DbPage**)&pDbPage, 0); + if( rc==SQLITE_OK ){ + sqlite3_result_blob(ctx, sqlite3PagerGetData(pDbPage), pCsr->szPage, + SQLITE_TRANSIENT); + } + sqlite3PagerUnref(pDbPage); } - sqlite3PagerUnref(pDbPage); break; } default: { /* schema */ @@ -210321,7 +214418,7 @@ static int dbpageColumn( break; } } - return SQLITE_OK; + return rc; } static int dbpageRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ @@ -210367,7 +214464,7 @@ static int dbpageUpdate( goto update_fail; } pBt = pTab->db->aDb[iDb].pBt; - if( pgno<1 || pBt==0 || pgno>(int)sqlite3BtreeLastPage(pBt) ){ + if( pgno<1 || pBt==0 || pgno>sqlite3BtreeLastPage(pBt) ){ zErr = "bad page number"; goto update_fail; } @@ -210381,11 +214478,12 @@ static int dbpageUpdate( pPager = sqlite3BtreePager(pBt); rc = sqlite3PagerGet(pPager, pgno, (DbPage**)&pDbPage, 0); if( rc==SQLITE_OK ){ - rc = sqlite3PagerWrite(pDbPage); - if( rc==SQLITE_OK ){ - memcpy(sqlite3PagerGetData(pDbPage), - sqlite3_value_blob(argv[3]), - szPage); + const void *pData = sqlite3_value_blob(argv[3]); + assert( pData!=0 || pTab->db->mallocFailed ); + if( pData + && (rc = sqlite3PagerWrite(pDbPage))==SQLITE_OK + ){ + memcpy(sqlite3PagerGetData(pDbPage), pData, szPage); } } sqlite3PagerUnref(pDbPage); @@ -210405,11 +214503,12 @@ static int dbpageBegin(sqlite3_vtab *pVtab){ DbpageTable *pTab = (DbpageTable *)pVtab; sqlite3 *db = pTab->db; int i; - for(i=0; inDb; i++){ + int rc = SQLITE_OK; + for(i=0; rc==SQLITE_OK && inDb; i++){ Btree *pBt = db->aDb[i].pBt; - if( pBt ) sqlite3BtreeBeginTrans(pBt, 1, 0); + if( pBt ) rc = sqlite3BtreeBeginTrans(pBt, 1, 0); } - return SQLITE_OK; + return rc; } @@ -213780,6 +217879,22 @@ static int sessionChangesetNextOne( if( p->op==SQLITE_INSERT ) p->op = SQLITE_DELETE; else if( p->op==SQLITE_DELETE ) p->op = SQLITE_INSERT; } + + /* If this is an UPDATE that is part of a changeset, then check that + ** there are no fields in the old.* record that are not (a) PK fields, + ** or (b) also present in the new.* record. + ** + ** Such records are technically corrupt, but the rebaser was at one + ** point generating them. Under most circumstances this is benign, but + ** can cause spurious SQLITE_RANGE errors when applying the changeset. */ + if( p->bPatchset==0 && p->op==SQLITE_UPDATE){ + for(i=0; inCol; i++){ + if( p->abPK[i]==0 && p->apValue[i+p->nCol]==0 ){ + sqlite3ValueFree(p->apValue[i]); + p->apValue[i] = 0; + } + } + } } return SQLITE_ROW; @@ -215976,7 +220091,7 @@ static void sessionAppendPartialUpdate( if( !pIter->abPK[i] && a1[0] ) bData = 1; memcpy(pOut, a1, n1); pOut += n1; - }else if( a2[0]!=0xFF ){ + }else if( a2[0]!=0xFF && a1[0] ){ bData = 1; memcpy(pOut, a2, n2); pOut += n2; @@ -217133,7 +221248,7 @@ static void sqlite3Fts5BufferAppendPrintf(int *, Fts5Buffer*, char *zFmt, ...); static char *sqlite3Fts5Mprintf(int *pRc, const char *zFmt, ...); #define fts5BufferZero(x) sqlite3Fts5BufferZero(x) -#define fts5BufferAppendVarint(a,b,c) sqlite3Fts5BufferAppendVarint(a,b,c) +#define fts5BufferAppendVarint(a,b,c) sqlite3Fts5BufferAppendVarint(a,b,(i64)c) #define fts5BufferFree(a) sqlite3Fts5BufferFree(a) #define fts5BufferAppendBlob(a,b,c,d) sqlite3Fts5BufferAppendBlob(a,b,c,d) #define fts5BufferSet(a,b,c,d) sqlite3Fts5BufferSet(a,b,c,d) @@ -222917,6 +227032,9 @@ static Fts5ExprNearset *sqlite3Fts5ParseNearset( }else{ if( pRet->nPhrase>0 ){ Fts5ExprPhrase *pLast = pRet->apPhrase[pRet->nPhrase-1]; + assert( pParse!=0 ); + assert( pParse->apPhrase!=0 ); + assert( pParse->nPhrase>=2 ); assert( pLast==pParse->apPhrase[pParse->nPhrase-2] ); if( pPhrase->nTerm==0 ){ fts5ExprPhraseFree(pPhrase); @@ -225216,7 +229334,7 @@ struct Fts5Index { sqlite3_stmt *pWriter; /* "INSERT ... %_data VALUES(?,?)" */ sqlite3_stmt *pDeleter; /* "DELETE FROM %_data ... id>=? AND id<=?" */ sqlite3_stmt *pIdxWriter; /* "INSERT ... %_idx VALUES(?,?,?,?)" */ - sqlite3_stmt *pIdxDeleter; /* "DELETE FROM %_idx WHERE segid=? */ + sqlite3_stmt *pIdxDeleter; /* "DELETE FROM %_idx WHERE segid=?" */ sqlite3_stmt *pIdxSelect; int nRead; /* Total number of blocks read */ @@ -229007,7 +233125,9 @@ static void fts5WriteAppendRowid( fts5BufferAppendVarint(&p->rc, &pPage->buf, iRowid); }else{ assert_nc( p->rc || iRowid>pWriter->iPrevRowid ); - fts5BufferAppendVarint(&p->rc, &pPage->buf, iRowid - pWriter->iPrevRowid); + fts5BufferAppendVarint(&p->rc, &pPage->buf, + (u64)iRowid - (u64)pWriter->iPrevRowid + ); } pWriter->iPrevRowid = iRowid; pWriter->bFirstRowidInDoclist = 0; @@ -229771,7 +233891,7 @@ static int sqlite3Fts5IndexMerge(Fts5Index *p, int nMerge){ static void fts5AppendRowid( Fts5Index *p, - i64 iDelta, + u64 iDelta, Fts5Iter *pUnused, Fts5Buffer *pBuf ){ @@ -229781,7 +233901,7 @@ static void fts5AppendRowid( static void fts5AppendPoslist( Fts5Index *p, - i64 iDelta, + u64 iDelta, Fts5Iter *pMulti, Fts5Buffer *pBuf ){ @@ -229856,10 +233976,10 @@ static void fts5MergeAppendDocid( } #endif -#define fts5MergeAppendDocid(pBuf, iLastRowid, iRowid) { \ - assert( (pBuf)->n!=0 || (iLastRowid)==0 ); \ - fts5BufferSafeAppendVarint((pBuf), (iRowid) - (iLastRowid)); \ - (iLastRowid) = (iRowid); \ +#define fts5MergeAppendDocid(pBuf, iLastRowid, iRowid) { \ + assert( (pBuf)->n!=0 || (iLastRowid)==0 ); \ + fts5BufferSafeAppendVarint((pBuf), (u64)(iRowid) - (u64)(iLastRowid)); \ + (iLastRowid) = (iRowid); \ } /* @@ -230130,7 +234250,7 @@ static void fts5SetupPrefixIter( int nMerge = 1; void (*xMerge)(Fts5Index*, Fts5Buffer*, int, Fts5Buffer*); - void (*xAppend)(Fts5Index*, i64, Fts5Iter*, Fts5Buffer*); + void (*xAppend)(Fts5Index*, u64, Fts5Iter*, Fts5Buffer*); if( p->pConfig->eDetail==FTS5_DETAIL_NONE ){ xMerge = fts5MergeRowidLists; xAppend = fts5AppendRowid; @@ -230169,7 +234289,7 @@ static void fts5SetupPrefixIter( Fts5SegIter *pSeg = &p1->aSeg[ p1->aFirst[1].iFirst ]; p1->xSetOutputs(p1, pSeg); if( p1->base.nData ){ - xAppend(p, p1->base.iRowid-iLastRowid, p1, &doclist); + xAppend(p, (u64)p1->base.iRowid-(u64)iLastRowid, p1, &doclist); iLastRowid = p1->base.iRowid; } } @@ -230217,7 +234337,7 @@ static void fts5SetupPrefixIter( iLastRowid = 0; } - xAppend(p, p1->base.iRowid-iLastRowid, p1, &doclist); + xAppend(p, (u64)p1->base.iRowid-(u64)iLastRowid, p1, &doclist); iLastRowid = p1->base.iRowid; } @@ -231196,6 +235316,7 @@ static int sqlite3Fts5IndexIntegrityCheck(Fts5Index *p, u64 cksum, int bUseCksum /* If this is a new term, query for it. Update cksum3 with the results. */ fts5TestTerm(p, &term, z, n, cksum2, &cksum3); + if( p->rc ) break; if( eDetail==FTS5_DETAIL_NONE ){ if( 0==fts5MultiIterIsEmpty(p, pIter) ){ @@ -232000,7 +236121,7 @@ static void fts5CheckTransactionState(Fts5FullTable *p, int op, int iSavepoint){ break; case FTS5_SYNC: - assert( p->ts.eState==1 ); + assert( p->ts.eState==1 || p->ts.eState==2 ); p->ts.eState = 2; break; @@ -232015,21 +236136,21 @@ static void fts5CheckTransactionState(Fts5FullTable *p, int op, int iSavepoint){ break; case FTS5_SAVEPOINT: - assert( p->ts.eState==1 ); + assert( p->ts.eState>=1 ); assert( iSavepoint>=0 ); assert( iSavepoint>=p->ts.iSavepoint ); p->ts.iSavepoint = iSavepoint; break; case FTS5_RELEASE: - assert( p->ts.eState==1 ); + assert( p->ts.eState>=1 ); assert( iSavepoint>=0 ); assert( iSavepoint<=p->ts.iSavepoint ); p->ts.iSavepoint = iSavepoint-1; break; case FTS5_ROLLBACKTO: - assert( p->ts.eState==1 ); + assert( p->ts.eState>=1 ); assert( iSavepoint>=-1 ); /* The following assert() can fail if another vtab strikes an error ** within an xSavepoint() call then SQLite calls xRollbackTo() - without @@ -233365,7 +237486,7 @@ static int fts5UpdateMethod( int rc = SQLITE_OK; /* Return code */ /* A transaction must be open when this is called. */ - assert( pTab->ts.eState==1 ); + assert( pTab->ts.eState==1 || pTab->ts.eState==2 ); assert( pVtab->zErrMsg==0 ); assert( nArg==1 || nArg==(2+pConfig->nCol+2) ); @@ -234533,7 +238654,7 @@ static void fts5SourceIdFunc( ){ assert( nArg==0 ); UNUSED_PARAM2(nArg, apUnused); - sqlite3_result_text(pCtx, "fts5: 2022-05-06 15:25:27 78d9c993d404cdfaa7fdd2973fa1052e3da9f66215cff9c5540ebe55c407d9fe", -1, SQLITE_TRANSIENT); + sqlite3_result_text(pCtx, "fts5: 2022-12-28 14:03:47 df5c253c0b3dd24916e4ec7cf77d3db5294cc9fd45ae7b9c5e82ad8197f38a24", -1, SQLITE_TRANSIENT); } /* @@ -239204,6 +243325,16 @@ SQLITE_EXTENSION_INIT1 #ifndef SQLITE_OMIT_VIRTUALTABLE + +#define STMT_NUM_INTEGER_COLUMN 10 +typedef struct StmtRow StmtRow; +struct StmtRow { + sqlite3_int64 iRowid; /* Rowid value */ + char *zSql; /* column "sql" */ + int aCol[STMT_NUM_INTEGER_COLUMN+1]; /* all other column values */ + StmtRow *pNext; /* Next row to return */ +}; + /* stmt_vtab is a subclass of sqlite3_vtab which will ** serve as the underlying representation of a stmt virtual table */ @@ -239221,8 +243352,7 @@ typedef struct stmt_cursor stmt_cursor; struct stmt_cursor { sqlite3_vtab_cursor base; /* Base class - must be first */ sqlite3 *db; /* Database connection for this cursor */ - sqlite3_stmt *pStmt; /* Statement cursor is currently pointing at */ - sqlite3_int64 iRowid; /* The rowid */ + StmtRow *pRow; /* Current row */ }; /* @@ -239266,7 +243396,7 @@ static int stmtConnect( "CREATE TABLE x(sql,ncol,ro,busy,nscan,nsort,naidx,nstep," "reprep,run,mem)"); if( rc==SQLITE_OK ){ - pNew = sqlite3_malloc( sizeof(*pNew) ); + pNew = sqlite3_malloc64( sizeof(*pNew) ); *ppVtab = (sqlite3_vtab*)pNew; if( pNew==0 ) return SQLITE_NOMEM; memset(pNew, 0, sizeof(*pNew)); @@ -239288,7 +243418,7 @@ static int stmtDisconnect(sqlite3_vtab *pVtab){ */ static int stmtOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){ stmt_cursor *pCur; - pCur = sqlite3_malloc( sizeof(*pCur) ); + pCur = sqlite3_malloc64( sizeof(*pCur) ); if( pCur==0 ) return SQLITE_NOMEM; memset(pCur, 0, sizeof(*pCur)); pCur->db = ((stmt_vtab*)p)->db; @@ -239296,10 +243426,21 @@ static int stmtOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor){ return SQLITE_OK; } +static void stmtCsrReset(stmt_cursor *pCur){ + StmtRow *pRow = 0; + StmtRow *pNext = 0; + for(pRow=pCur->pRow; pRow; pRow=pNext){ + pNext = pRow->pNext; + sqlite3_free(pRow); + } + pCur->pRow = 0; +} + /* ** Destructor for a stmt_cursor. */ static int stmtClose(sqlite3_vtab_cursor *cur){ + stmtCsrReset((stmt_cursor*)cur); sqlite3_free(cur); return SQLITE_OK; } @@ -239310,8 +243451,9 @@ static int stmtClose(sqlite3_vtab_cursor *cur){ */ static int stmtNext(sqlite3_vtab_cursor *cur){ stmt_cursor *pCur = (stmt_cursor*)cur; - pCur->iRowid++; - pCur->pStmt = sqlite3_next_stmt(pCur->db, pCur->pStmt); + StmtRow *pNext = pCur->pRow->pNext; + sqlite3_free(pCur->pRow); + pCur->pRow = pNext; return SQLITE_OK; } @@ -239325,39 +243467,11 @@ static int stmtColumn( int i /* Which column to return */ ){ stmt_cursor *pCur = (stmt_cursor*)cur; - switch( i ){ - case STMT_COLUMN_SQL: { - sqlite3_result_text(ctx, sqlite3_sql(pCur->pStmt), -1, SQLITE_TRANSIENT); - break; - } - case STMT_COLUMN_NCOL: { - sqlite3_result_int(ctx, sqlite3_column_count(pCur->pStmt)); - break; - } - case STMT_COLUMN_RO: { - sqlite3_result_int(ctx, sqlite3_stmt_readonly(pCur->pStmt)); - break; - } - case STMT_COLUMN_BUSY: { - sqlite3_result_int(ctx, sqlite3_stmt_busy(pCur->pStmt)); - break; - } - default: { - assert( i==STMT_COLUMN_MEM ); - i = SQLITE_STMTSTATUS_MEMUSED + - STMT_COLUMN_NSCAN - SQLITE_STMTSTATUS_FULLSCAN_STEP; - /* Fall thru */ - } - case STMT_COLUMN_NSCAN: - case STMT_COLUMN_NSORT: - case STMT_COLUMN_NAIDX: - case STMT_COLUMN_NSTEP: - case STMT_COLUMN_REPREP: - case STMT_COLUMN_RUN: { - sqlite3_result_int(ctx, sqlite3_stmt_status(pCur->pStmt, - i-STMT_COLUMN_NSCAN+SQLITE_STMTSTATUS_FULLSCAN_STEP, 0)); - break; - } + StmtRow *pRow = pCur->pRow; + if( i==STMT_COLUMN_SQL ){ + sqlite3_result_text(ctx, pRow->zSql, -1, SQLITE_TRANSIENT); + }else{ + sqlite3_result_int(ctx, pRow->aCol[i]); } return SQLITE_OK; } @@ -239368,7 +243482,7 @@ static int stmtColumn( */ static int stmtRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ stmt_cursor *pCur = (stmt_cursor*)cur; - *pRowid = pCur->iRowid; + *pRowid = pCur->pRow->iRowid; return SQLITE_OK; } @@ -239378,7 +243492,7 @@ static int stmtRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){ */ static int stmtEof(sqlite3_vtab_cursor *cur){ stmt_cursor *pCur = (stmt_cursor*)cur; - return pCur->pStmt==0; + return pCur->pRow==0; } /* @@ -239393,9 +243507,53 @@ static int stmtFilter( int argc, sqlite3_value **argv ){ stmt_cursor *pCur = (stmt_cursor *)pVtabCursor; - pCur->pStmt = 0; - pCur->iRowid = 0; - return stmtNext(pVtabCursor); + sqlite3_stmt *p = 0; + sqlite3_int64 iRowid = 1; + StmtRow **ppRow = 0; + + stmtCsrReset(pCur); + ppRow = &pCur->pRow; + for(p=sqlite3_next_stmt(pCur->db, 0); p; p=sqlite3_next_stmt(pCur->db, p)){ + const char *zSql = sqlite3_sql(p); + sqlite3_int64 nSql = zSql ? strlen(zSql)+1 : 0; + StmtRow *pNew = (StmtRow*)sqlite3_malloc64(sizeof(StmtRow) + nSql); + + if( pNew==0 ) return SQLITE_NOMEM; + memset(pNew, 0, sizeof(StmtRow)); + if( zSql ){ + pNew->zSql = (char*)&pNew[1]; + memcpy(pNew->zSql, zSql, nSql); + } + pNew->aCol[STMT_COLUMN_NCOL] = sqlite3_column_count(p); + pNew->aCol[STMT_COLUMN_RO] = sqlite3_stmt_readonly(p); + pNew->aCol[STMT_COLUMN_BUSY] = sqlite3_stmt_busy(p); + pNew->aCol[STMT_COLUMN_NSCAN] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_FULLSCAN_STEP, 0 + ); + pNew->aCol[STMT_COLUMN_NSORT] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_SORT, 0 + ); + pNew->aCol[STMT_COLUMN_NAIDX] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_AUTOINDEX, 0 + ); + pNew->aCol[STMT_COLUMN_NSTEP] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_VM_STEP, 0 + ); + pNew->aCol[STMT_COLUMN_REPREP] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_REPREPARE, 0 + ); + pNew->aCol[STMT_COLUMN_RUN] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_RUN, 0 + ); + pNew->aCol[STMT_COLUMN_MEM] = sqlite3_stmt_status( + p, SQLITE_STMTSTATUS_MEMUSED, 0 + ); + pNew->iRowid = iRowid++; + *ppRow = pNew; + ppRow = &pNew->pNext; + } + + return SQLITE_OK; } /* diff --git a/database/sqlite/sqlite3.h b/database/sqlite/sqlite3.h index de393da9d..24b916750 100644 --- a/database/sqlite/sqlite3.h +++ b/database/sqlite/sqlite3.h @@ -146,9 +146,9 @@ extern "C" { ** [sqlite3_libversion_number()], [sqlite3_sourceid()], ** [sqlite_version()] and [sqlite_source_id()]. */ -#define SQLITE_VERSION "3.38.5" -#define SQLITE_VERSION_NUMBER 3038005 -#define SQLITE_SOURCE_ID "2022-05-06 15:25:27 78d9c993d404cdfaa7fdd2973fa1052e3da9f66215cff9c5540ebe55c407d9fe" +#define SQLITE_VERSION "3.40.1" +#define SQLITE_VERSION_NUMBER 3040001 +#define SQLITE_SOURCE_ID "2022-12-28 14:03:47 df5c253c0b3dd24916e4ec7cf77d3db5294cc9fd45ae7b9c5e82ad8197f38a24" /* ** CAPI3REF: Run-Time Library Version Numbers @@ -670,13 +670,17 @@ SQLITE_API int sqlite3_exec( ** ** SQLite uses one of these integer values as the second ** argument to calls it makes to the xLock() and xUnlock() methods -** of an [sqlite3_io_methods] object. +** of an [sqlite3_io_methods] object. These values are ordered from +** lest restrictive to most restrictive. +** +** The argument to xLock() is always SHARED or higher. The argument to +** xUnlock is either SHARED or NONE. */ -#define SQLITE_LOCK_NONE 0 -#define SQLITE_LOCK_SHARED 1 -#define SQLITE_LOCK_RESERVED 2 -#define SQLITE_LOCK_PENDING 3 -#define SQLITE_LOCK_EXCLUSIVE 4 +#define SQLITE_LOCK_NONE 0 /* xUnlock() only */ +#define SQLITE_LOCK_SHARED 1 /* xLock() or xUnlock() */ +#define SQLITE_LOCK_RESERVED 2 /* xLock() only */ +#define SQLITE_LOCK_PENDING 3 /* xLock() only */ +#define SQLITE_LOCK_EXCLUSIVE 4 /* xLock() only */ /* ** CAPI3REF: Synchronization Type Flags @@ -754,7 +758,14 @@ struct sqlite3_file { **
    4. [SQLITE_LOCK_PENDING], or **
    5. [SQLITE_LOCK_EXCLUSIVE]. ** -** xLock() increases the lock. xUnlock() decreases the lock. +** xLock() upgrades the database file lock. In other words, xLock() moves the +** database file lock in the direction NONE toward EXCLUSIVE. The argument to +** xLock() is always on of SHARED, RESERVED, PENDING, or EXCLUSIVE, never +** SQLITE_LOCK_NONE. If the database file lock is already at or above the +** requested lock, then the call to xLock() is a no-op. +** xUnlock() downgrades the database file lock to either SHARED or NONE. +* If the lock is already at or below the requested lock state, then the call +** to xUnlock() is a no-op. ** The xCheckReservedLock() method checks whether any database connection, ** either in this process or in some other process, is holding a RESERVED, ** PENDING, or EXCLUSIVE lock on the file. It returns true @@ -859,9 +870,8 @@ struct sqlite3_io_methods { ** opcode causes the xFileControl method to write the current state of ** the lock (one of [SQLITE_LOCK_NONE], [SQLITE_LOCK_SHARED], ** [SQLITE_LOCK_RESERVED], [SQLITE_LOCK_PENDING], or [SQLITE_LOCK_EXCLUSIVE]) -** into an integer that the pArg argument points to. This capability -** is used during testing and is only available when the SQLITE_TEST -** compile-time option is used. +** into an integer that the pArg argument points to. +** This capability is only available if SQLite is compiled with [SQLITE_DEBUG]. ** **
    6. [[SQLITE_FCNTL_SIZE_HINT]] ** The [SQLITE_FCNTL_SIZE_HINT] opcode is used by SQLite to give the VFS @@ -1182,6 +1192,12 @@ struct sqlite3_io_methods { ** **
    7. [[SQLITE_FCNTL_CKSM_FILE]] ** Used by the cksmvfs VFS module only. +** +**
    8. [[SQLITE_FCNTL_RESET_CACHE]] +** If there is currently no transaction open on the database, and the +** database is not a temp db, then this file-control purges the contents +** of the in-memory page cache. If there is an open transaction, or if +** the db is a temp-db, it is a no-op, not an error. ** */ #define SQLITE_FCNTL_LOCKSTATE 1 @@ -1224,6 +1240,7 @@ struct sqlite3_io_methods { #define SQLITE_FCNTL_CKPT_START 39 #define SQLITE_FCNTL_EXTERNAL_READER 40 #define SQLITE_FCNTL_CKSM_FILE 41 +#define SQLITE_FCNTL_RESET_CACHE 42 /* deprecated names */ #define SQLITE_GET_LOCKPROXYFILE SQLITE_FCNTL_GET_LOCKPROXYFILE @@ -1253,6 +1270,26 @@ typedef struct sqlite3_mutex sqlite3_mutex; */ typedef struct sqlite3_api_routines sqlite3_api_routines; +/* +** CAPI3REF: File Name +** +** Type [sqlite3_filename] is used by SQLite to pass filenames to the +** xOpen method of a [VFS]. It may be cast to (const char*) and treated +** as a normal, nul-terminated, UTF-8 buffer containing the filename, but +** may also be passed to special APIs such as: +** +**
        +**
      • sqlite3_filename_database() +**
      • sqlite3_filename_journal() +**
      • sqlite3_filename_wal() +**
      • sqlite3_uri_parameter() +**
      • sqlite3_uri_boolean() +**
      • sqlite3_uri_int64() +**
      • sqlite3_uri_key() +**
      +*/ +typedef const char *sqlite3_filename; + /* ** CAPI3REF: OS Interface Object ** @@ -1431,7 +1468,7 @@ struct sqlite3_vfs { sqlite3_vfs *pNext; /* Next registered VFS */ const char *zName; /* Name of this virtual file system */ void *pAppData; /* Pointer to application-specific data */ - int (*xOpen)(sqlite3_vfs*, const char *zName, sqlite3_file*, + int (*xOpen)(sqlite3_vfs*, sqlite3_filename zName, sqlite3_file*, int flags, int *pOutFlags); int (*xDelete)(sqlite3_vfs*, const char *zName, int syncDir); int (*xAccess)(sqlite3_vfs*, const char *zName, int flags, int *pResOut); @@ -2309,6 +2346,7 @@ struct sqlite3_mem_methods { **
        **
      • The [PRAGMA writable_schema=ON] statement. **
      • The [PRAGMA journal_mode=OFF] statement. +**
      • The [PRAGMA schema_version=N] statement. **
      • Writes to the [sqlite_dbpage] virtual table. **
      • Direct writes to [shadow tables]. **
      @@ -3424,6 +3462,9 @@ SQLITE_API void sqlite3_progress_handler(sqlite3*, int, int(*)(void*), void*); **
      The database is opened [shared cache] enabled, overriding ** the default shared cache setting provided by ** [sqlite3_enable_shared_cache()].)^ +** The [use of shared cache mode is discouraged] and hence shared cache +** capabilities may be omitted from many builds of SQLite. In such cases, +** this option is a no-op. ** ** ^(
      [SQLITE_OPEN_PRIVATECACHE]
      **
      The database is opened [shared cache] disabled, overriding @@ -3439,7 +3480,7 @@ SQLITE_API void sqlite3_progress_handler(sqlite3*, int, int(*)(void*), void*); ** to return an extended result code.
      ** ** [[OPEN_NOFOLLOW]] ^(
      [SQLITE_OPEN_NOFOLLOW]
      -**
      The database filename is not allowed to be a symbolic link
      +**
      The database filename is not allowed to contain a symbolic link
      ** )^ ** ** If the 3rd parameter to sqlite3_open_v2() is not one of the @@ -3698,10 +3739,10 @@ SQLITE_API int sqlite3_open_v2( ** ** See the [URI filename] documentation for additional information. */ -SQLITE_API const char *sqlite3_uri_parameter(const char *zFilename, const char *zParam); -SQLITE_API int sqlite3_uri_boolean(const char *zFile, const char *zParam, int bDefault); -SQLITE_API sqlite3_int64 sqlite3_uri_int64(const char*, const char*, sqlite3_int64); -SQLITE_API const char *sqlite3_uri_key(const char *zFilename, int N); +SQLITE_API const char *sqlite3_uri_parameter(sqlite3_filename z, const char *zParam); +SQLITE_API int sqlite3_uri_boolean(sqlite3_filename z, const char *zParam, int bDefault); +SQLITE_API sqlite3_int64 sqlite3_uri_int64(sqlite3_filename, const char*, sqlite3_int64); +SQLITE_API const char *sqlite3_uri_key(sqlite3_filename z, int N); /* ** CAPI3REF: Translate filenames @@ -3730,9 +3771,9 @@ SQLITE_API const char *sqlite3_uri_key(const char *zFilename, int N); ** return value from [sqlite3_db_filename()], then the result is ** undefined and is likely a memory access violation. */ -SQLITE_API const char *sqlite3_filename_database(const char*); -SQLITE_API const char *sqlite3_filename_journal(const char*); -SQLITE_API const char *sqlite3_filename_wal(const char*); +SQLITE_API const char *sqlite3_filename_database(sqlite3_filename); +SQLITE_API const char *sqlite3_filename_journal(sqlite3_filename); +SQLITE_API const char *sqlite3_filename_wal(sqlite3_filename); /* ** CAPI3REF: Database File Corresponding To A Journal @@ -3798,14 +3839,14 @@ SQLITE_API sqlite3_file *sqlite3_database_file_object(const char*); ** then the corresponding [sqlite3_module.xClose() method should also be ** invoked prior to calling sqlite3_free_filename(Y). */ -SQLITE_API char *sqlite3_create_filename( +SQLITE_API sqlite3_filename sqlite3_create_filename( const char *zDatabase, const char *zJournal, const char *zWal, int nParam, const char **azParam ); -SQLITE_API void sqlite3_free_filename(char*); +SQLITE_API void sqlite3_free_filename(sqlite3_filename); /* ** CAPI3REF: Error Codes And Messages @@ -5508,6 +5549,16 @@ SQLITE_API SQLITE_DEPRECATED int sqlite3_memory_alarm(void(*)(void*,sqlite3_int6 ** then the conversion is performed. Otherwise no conversion occurs. ** The [SQLITE_INTEGER | datatype] after conversion is returned.)^ ** +** ^(The sqlite3_value_encoding(X) interface returns one of [SQLITE_UTF8], +** [SQLITE_UTF16BE], or [SQLITE_UTF16LE] according to the current encoding +** of the value X, assuming that X has type TEXT.)^ If sqlite3_value_type(X) +** returns something other than SQLITE_TEXT, then the return value from +** sqlite3_value_encoding(X) is meaningless. ^Calls to +** sqlite3_value_text(X), sqlite3_value_text16(X), sqlite3_value_text16be(X), +** sqlite3_value_text16le(X), sqlite3_value_bytes(X), or +** sqlite3_value_bytes16(X) might change the encoding of the value X and +** thus change the return from subsequent calls to sqlite3_value_encoding(X). +** ** ^Within the [xUpdate] method of a [virtual table], the ** sqlite3_value_nochange(X) interface returns true if and only if ** the column corresponding to X is unchanged by the UPDATE operation @@ -5572,6 +5623,7 @@ SQLITE_API int sqlite3_value_type(sqlite3_value*); SQLITE_API int sqlite3_value_numeric_type(sqlite3_value*); SQLITE_API int sqlite3_value_nochange(sqlite3_value*); SQLITE_API int sqlite3_value_frombind(sqlite3_value*); +SQLITE_API int sqlite3_value_encoding(sqlite3_value*); /* ** CAPI3REF: Finding The Subtype Of SQL Values @@ -5593,7 +5645,8 @@ SQLITE_API unsigned int sqlite3_value_subtype(sqlite3_value*); ** object D and returns a pointer to that copy. ^The [sqlite3_value] returned ** is a [protected sqlite3_value] object even if the input is not. ** ^The sqlite3_value_dup(V) interface returns NULL if V is NULL or if a -** memory allocation fails. +** memory allocation fails. ^If V is a [pointer value], then the result +** of sqlite3_value_dup(V) is a NULL value. ** ** ^The sqlite3_value_free(V) interface frees an [sqlite3_value] object ** previously obtained from [sqlite3_value_dup()]. ^If V is a NULL pointer @@ -5624,7 +5677,7 @@ SQLITE_API void sqlite3_value_free(sqlite3_value*); ** ** ^The sqlite3_aggregate_context(C,N) routine returns a NULL pointer ** when first called if N is less than or equal to zero or if a memory -** allocate error occurs. +** allocation error occurs. ** ** ^(The amount of space allocated by sqlite3_aggregate_context(C,N) is ** determined by the N parameter on first successful call. Changing the @@ -5829,9 +5882,10 @@ typedef void (*sqlite3_destructor_type)(void*); ** of [SQLITE_UTF8], [SQLITE_UTF16], [SQLITE_UTF16BE], or [SQLITE_UTF16LE]. ** ^SQLite takes the text result from the application from ** the 2nd parameter of the sqlite3_result_text* interfaces. -** ^If the 3rd parameter to the sqlite3_result_text* interfaces -** is negative, then SQLite takes result text from the 2nd parameter -** through the first zero character. +** ^If the 3rd parameter to any of the sqlite3_result_text* interfaces +** other than sqlite3_result_text64() is negative, then SQLite computes +** the string length itself by searching the 2nd parameter for the first +** zero character. ** ^If the 3rd parameter to the sqlite3_result_text* interfaces ** is non-negative, then as many bytes (not characters) of the text ** pointed to by the 2nd parameter are taken as the application-defined @@ -6275,6 +6329,28 @@ SQLITE_API int sqlite3_get_autocommit(sqlite3*); */ SQLITE_API sqlite3 *sqlite3_db_handle(sqlite3_stmt*); +/* +** CAPI3REF: Return The Schema Name For A Database Connection +** METHOD: sqlite3 +** +** ^The sqlite3_db_name(D,N) interface returns a pointer to the schema name +** for the N-th database on database connection D, or a NULL pointer of N is +** out of range. An N value of 0 means the main database file. An N of 1 is +** the "temp" schema. Larger values of N correspond to various ATTACH-ed +** databases. +** +** Space to hold the string that is returned by sqlite3_db_name() is managed +** by SQLite itself. The string might be deallocated by any operation that +** changes the schema, including [ATTACH] or [DETACH] or calls to +** [sqlite3_serialize()] or [sqlite3_deserialize()], even operations that +** occur on a different thread. Applications that need to +** remember the string long-term should make their own copy. Applications that +** are accessing the same database connection simultaneously on multiple +** threads should mutex-protect calls to this API and should make their own +** private copy of the result prior to releasing the mutex. +*/ +SQLITE_API const char *sqlite3_db_name(sqlite3 *db, int N); + /* ** CAPI3REF: Return The Filename For A Database Connection ** METHOD: sqlite3 @@ -6305,7 +6381,7 @@ SQLITE_API sqlite3 *sqlite3_db_handle(sqlite3_stmt*); **
    9. [sqlite3_filename_wal()] ** */ -SQLITE_API const char *sqlite3_db_filename(sqlite3 *db, const char *zDbName); +SQLITE_API sqlite3_filename sqlite3_db_filename(sqlite3 *db, const char *zDbName); /* ** CAPI3REF: Determine if a database is read-only @@ -6442,7 +6518,7 @@ SQLITE_API void *sqlite3_rollback_hook(sqlite3*, void(*)(void *), void*); ** function C that is invoked prior to each autovacuum of the database ** file. ^The callback is passed a copy of the generic data pointer (P), ** the schema-name of the attached database that is being autovacuumed, -** the the size of the database file in pages, the number of free pages, +** the size of the database file in pages, the number of free pages, ** and the number of bytes per page, respectively. The callback should ** return the number of free pages that should be removed by the ** autovacuum. ^If the callback returns zero, then no autovacuum happens. @@ -6563,6 +6639,11 @@ SQLITE_API void *sqlite3_update_hook( ** to the same database. Sharing is enabled if the argument is true ** and disabled if the argument is false.)^ ** +** This interface is omitted if SQLite is compiled with +** [-DSQLITE_OMIT_SHARED_CACHE]. The [-DSQLITE_OMIT_SHARED_CACHE] +** compile-time option is recommended because the +** [use of shared cache mode is discouraged]. +** ** ^Cache sharing is enabled and disabled for an entire process. ** This is a change as of SQLite [version 3.5.0] ([dateof:3.5.0]). ** In prior versions of SQLite, @@ -6661,7 +6742,7 @@ SQLITE_API int sqlite3_db_release_memory(sqlite3*); ** ^The soft heap limit may not be greater than the hard heap limit. ** ^If the hard heap limit is enabled and if sqlite3_soft_heap_limit(N) ** is invoked with a value of N that is greater than the hard heap limit, -** the the soft heap limit is set to the value of the hard heap limit. +** the soft heap limit is set to the value of the hard heap limit. ** ^The soft heap limit is automatically enabled whenever the hard heap ** limit is enabled. ^When sqlite3_hard_heap_limit64(N) is invoked and ** the soft heap limit is outside the range of 1..N, then the soft heap @@ -8956,7 +9037,7 @@ typedef struct sqlite3_backup sqlite3_backup; ** if the application incorrectly accesses the destination [database connection] ** and so no error code is reported, but the operations may malfunction ** nevertheless. Use of the destination database connection while a -** backup is in progress might also also cause a mutex deadlock. +** backup is in progress might also cause a mutex deadlock. ** ** If running in [shared cache mode], the application must ** guarantee that the shared cache used by the destination database @@ -9384,7 +9465,7 @@ SQLITE_API int sqlite3_wal_checkpoint_v2( */ #define SQLITE_CHECKPOINT_PASSIVE 0 /* Do as much as possible w/o blocking */ #define SQLITE_CHECKPOINT_FULL 1 /* Wait for writers, then checkpoint */ -#define SQLITE_CHECKPOINT_RESTART 2 /* Like FULL but wait for for readers */ +#define SQLITE_CHECKPOINT_RESTART 2 /* Like FULL but wait for readers */ #define SQLITE_CHECKPOINT_TRUNCATE 3 /* Like RESTART but also truncate WAL */ /* @@ -9554,8 +9635,8 @@ SQLITE_API SQLITE_EXPERIMENTAL const char *sqlite3_vtab_collation(sqlite3_index_ ** of a [virtual table] implementation. The result of calling this ** interface from outside of xBestIndex() is undefined and probably harmful. ** -** ^The sqlite3_vtab_distinct() interface returns an integer that is -** either 0, 1, or 2. The integer returned by sqlite3_vtab_distinct() +** ^The sqlite3_vtab_distinct() interface returns an integer between 0 and +** 3. The integer returned by sqlite3_vtab_distinct() ** gives the virtual table additional information about how the query ** planner wants the output to be ordered. As long as the virtual table ** can meet the ordering requirements of the query planner, it may set @@ -9587,6 +9668,13 @@ SQLITE_API SQLITE_EXPERIMENTAL const char *sqlite3_vtab_collation(sqlite3_index_ ** that have the same value for all columns identified by "aOrderBy". ** ^However omitting the extra rows is optional. ** This mode is used for a DISTINCT query. +**
    10. +** ^(If the sqlite3_vtab_distinct() interface returns 3, that means +** that the query planner needs only distinct rows but it does need the +** rows to be sorted.)^ ^The virtual table implementation is free to omit +** rows that are identical in all aOrderBy columns, if it wants to, but +** it is not required to omit any rows. This mode is used for queries +** that have both DISTINCT and ORDER BY clauses. **

    ** ** ^For the purposes of comparing virtual table output values to see if the diff --git a/database/sqlite/sqlite_aclk.c b/database/sqlite/sqlite_aclk.c index 7e3a9b2eb..3b0c40522 100644 --- a/database/sqlite/sqlite_aclk.c +++ b/database/sqlite/sqlite_aclk.c @@ -10,10 +10,140 @@ void sanity_check(void) { BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < ACLK_MAX_ENUMERATIONS_DEFINED); } -const char *aclk_sync_config[] = { +static int sql_check_aclk_table(void *data, int argc, char **argv, char **column) +{ + struct aclk_database_worker_config *wc = data; + UNUSED(argc); + UNUSED(column); - NULL, -}; + debug(D_ACLK_SYNC,"Scheduling aclk sync table check for node %s", (char *) argv[0]); + struct aclk_database_cmd cmd; + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = ACLK_DATABASE_DELETE_HOST; + cmd.data = strdupz((char *) argv[0]); + aclk_database_enq_cmd_noblock(wc, &cmd); + return 0; +} + +#define SQL_SELECT_ACLK_ACTIVE_LIST "SELECT REPLACE(SUBSTR(name,19),'_','-') FROM sqlite_schema " \ + "WHERE name LIKE 'aclk_chart_latest_%' AND type IN ('table');" + +static void sql_check_aclk_table_list(struct aclk_database_worker_config *wc) +{ + char *err_msg = NULL; + debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist"); + int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ACTIVE_LIST, sql_check_aclk_table, (void *) wc, &err_msg); + if (rc != SQLITE_OK) { + error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg); + sqlite3_free(err_msg); + } +} + +static void sql_maint_aclk_sync_database(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +{ + UNUSED(cmd); + + debug(D_ACLK, "Checking database for %s", wc->host_guid); + + BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite); + + buffer_sprintf(sql,"DELETE FROM aclk_alert_%s WHERE date_submitted IS NOT NULL AND " + "CAST(date_cloud_ack AS INT) < unixepoch()-%d;", wc->uuid_str, ACLK_DELETE_ACK_ALERTS_INTERNAL); + db_execute(buffer_tostring(sql)); + + buffer_free(sql); +} + + +#define SQL_SELECT_HOST_BY_UUID "SELECT host_id FROM host WHERE host_id = @host_id;" + +static int is_host_available(uuid_t *host_id) +{ + sqlite3_stmt *res = NULL; + int rc; + + if (unlikely(!db_meta)) { + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + error_report("Database has not been initialized"); + return 1; + } + + rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_HOST_BY_UUID, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to select node instance information for a node"); + return 1; + } + + rc = sqlite3_bind_blob(res, 1, host_id, sizeof(*host_id), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter to select node instance information"); + goto failed; + } + rc = sqlite3_step_monitored(res); + +failed: + if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) + error_report("Failed to finalize the prepared statement when checking host existence"); + + return (rc == SQLITE_ROW); +} + +// OPCODE: ACLK_DATABASE_DELETE_HOST +void sql_delete_aclk_table_list(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +{ + UNUSED(wc); + char uuid_str[GUID_LEN + 1]; + char host_str[GUID_LEN + 1]; + + int rc; + uuid_t host_uuid; + char *host_guid = (char *)cmd.data; + + if (unlikely(!host_guid)) + return; + + rc = uuid_parse(host_guid, host_uuid); + freez(host_guid); + if (rc) + return; + + uuid_unparse_lower(host_uuid, host_str); + uuid_unparse_lower_fix(&host_uuid, uuid_str); + + debug(D_ACLK_SYNC, "Checking if I should delete aclk tables for node %s", host_str); + + if (is_host_available(&host_uuid)) { + debug(D_ACLK_SYNC, "Host %s exists, not deleting aclk sync tables", host_str); + return; + } + + debug(D_ACLK_SYNC, "Host %s does NOT exist, can delete aclk sync tables", host_str); + + sqlite3_stmt *res = NULL; + BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite); + + buffer_sprintf(sql,"SELECT 'drop '||type||' IF EXISTS '||name||';' FROM sqlite_schema " \ + "WHERE name LIKE 'aclk_%%_%s' AND type IN ('table', 'trigger', 'index');", uuid_str); + + rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement to clean up aclk tables"); + goto fail; + } + buffer_flush(sql); + + while (sqlite3_step_monitored(res) == SQLITE_ROW) + buffer_strcat(sql, (char *) sqlite3_column_text(res, 0)); + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement to clean up aclk tables, rc = %d", rc); + + db_execute(buffer_tostring(sql)); + +fail: + buffer_free(sql); +} uv_mutex_t aclk_async_lock; struct aclk_database_worker_config *aclk_thread_head = NULL; @@ -38,7 +168,6 @@ void aclk_add_worker_thread(struct aclk_database_worker_config *wc) aclk_thread_head = wc; } uv_mutex_unlock(&aclk_async_lock); - return; } void aclk_del_worker_thread(struct aclk_database_worker_config *wc) @@ -53,7 +182,6 @@ void aclk_del_worker_thread(struct aclk_database_worker_config *wc) if (*tmp) *tmp = wc->next; uv_mutex_unlock(&aclk_async_lock); - return; } int aclk_worker_thread_exists(char *guid) @@ -199,7 +327,6 @@ void aclk_sync_exit_all() uv_mutex_unlock(&aclk_async_lock); } -#ifdef ENABLE_ACLK enum { IDX_HOST_ID, IDX_HOSTNAME, @@ -228,6 +355,8 @@ static int create_host_callback(void *data, int argc, char **argv, char **column uuid_unparse_lower(*(uuid_t *)argv[IDX_HOST_ID], guid); struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info)); + __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED); + system_info->hops = str2i((const char *) argv[IDX_HOPS]); sql_build_host_system_info((uuid_t *)argv[IDX_HOST_ID], system_info); @@ -268,9 +397,9 @@ static int create_host_callback(void *data, int argc, char **argv, char **column #endif return 0; } -#endif -int aclk_start_sync_thread(void *data, int argc, char **argv, char **column) +#ifdef ENABLE_ACLK +static int aclk_start_sync_thread(void *data, int argc, char **argv, char **column) { char uuid_str[GUID_LEN + 1]; UNUSED(data); @@ -286,10 +415,9 @@ int aclk_start_sync_thread(void *data, int argc, char **argv, char **column) sql_create_aclk_table(host, (uuid_t *) argv[0], (uuid_t *) argv[1]); return 0; } - +#endif void sql_aclk_sync_init(void) { -#ifdef ENABLE_ACLK char *err_msg = NULL; int rc; @@ -301,21 +429,7 @@ void sql_aclk_sync_init(void) return; } - info("SQLite aclk sync initialization"); - - for (int i = 0; aclk_sync_config[i]; i++) { - debug(D_ACLK_SYNC, "Executing %s", aclk_sync_config[i]); - rc = sqlite3_exec_monitored(db_meta, aclk_sync_config[i], 0, 0, &err_msg); - if (rc != SQLITE_OK) { - error_report("SQLite error aclk sync initialization setup, rc = %d (%s)", rc, err_msg); - error_report("SQLite failed statement %s", aclk_sync_config[i]); - sqlite3_free(err_msg); - return; - } - } - info("SQLite aclk sync initialization completed"); - fatal_assert(0 == uv_mutex_init(&aclk_async_lock)); - + info("Creating archived hosts"); rc = sqlite3_exec_monitored(db_meta, "SELECT host_id, hostname, registry_hostname, update_every, os, " "timezone, tags, hops, memory_mode, abbrev_timezone, utc_offset, program_name, " "program_version, entries, health_enabled FROM host WHERE hops >0;", @@ -325,14 +439,16 @@ void sql_aclk_sync_init(void) sqlite3_free(err_msg); } +#ifdef ENABLE_ACLK + fatal_assert(0 == uv_mutex_init(&aclk_async_lock)); rc = sqlite3_exec_monitored(db_meta, "SELECT ni.host_id, ni.node_id FROM host h, node_instance ni WHERE " "h.host_id = ni.host_id AND ni.node_id IS NOT NULL;", aclk_start_sync_thread, NULL, &err_msg); if (rc != SQLITE_OK) { error_report("SQLite error when starting ACLK sync threads, rc = %d (%s)", rc, err_msg); sqlite3_free(err_msg); } + info("ACLK sync initialization completed"); #endif - return; } static void async_cb(uv_async_t *handle) @@ -374,10 +490,9 @@ static void timer_cb(uv_timer_t* handle) #endif } -#define MAX_CMD_BATCH_SIZE (256) - -void aclk_database_worker(void *arg) +static void aclk_database_worker(void *arg) { + service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); worker_register("ACLKSYNC"); worker_register_job_name(ACLK_DATABASE_NOOP, "noop"); worker_register_job_name(ACLK_DATABASE_ORPHAN_HOST, "node orphan"); @@ -398,15 +513,12 @@ void aclk_database_worker(void *arg) enum aclk_database_opcode opcode; uv_timer_t timer_req; struct aclk_database_cmd cmd; - unsigned cmd_batch_size; - - //aclk_database_init_cmd_queue(wc); char threadname[NETDATA_THREAD_NAME_MAX+1]; if (wc->host) - snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "AS_%s", rrdhost_hostname(wc->host)); + snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "ACLK[%s]", rrdhost_hostname(wc->host)); else { - snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "AS_%s", wc->uuid_str); + snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "ACLK[%s]", wc->uuid_str); threadname[11] = '\0'; } uv_thread_set_name_np(wc->thread, threadname); @@ -449,17 +561,13 @@ void aclk_database_worker(void *arg) uv_run(loop, UV_RUN_DEFAULT); /* wait for commands */ - cmd_batch_size = 0; do { - if (unlikely(cmd_batch_size >= MAX_CMD_BATCH_SIZE)) - break; cmd = aclk_database_deq_cmd(wc); if (netdata_exit) break; opcode = cmd.opcode; - ++cmd_batch_size; if(likely(opcode != ACLK_DATABASE_NOOP)) worker_is_busy(opcode); @@ -535,7 +643,7 @@ void aclk_database_worker(void *arg) wc->host = rrdhost_find_by_guid(wc->host_guid); if (wc->host) { info("HOST %s (%s) detected as active", rrdhost_hostname(wc->host), wc->host_guid); - snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "AS_%s", rrdhost_hostname(wc->host)); + snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "ACLK[%s]", rrdhost_hostname(wc->host)); uv_thread_set_name_np(wc->thread, threadname); wc->host->dbsync_worker = wc; if (unlikely(!wc->hostname)) @@ -584,10 +692,8 @@ void aclk_database_worker(void *arg) info("Shutting down ACLK sync event loop complete for host %s", wc->host_guid); /* TODO: don't let the API block by waiting to enqueue commands */ uv_cond_destroy(&wc->cmd_cond); -/* uv_mutex_destroy(&wc->cmd_mutex); */ - //fatal_assert(0 == uv_loop_close(loop)); - int rc; + int rc; do { rc = uv_loop_close(loop); } while (rc != UV_EBUSY); @@ -628,7 +734,7 @@ void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id) uuid_unparse_lower(*host_uuid, host_guid); - BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE); + BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf(sql, TABLE_ACLK_ALERT, uuid_str); db_execute(buffer_tostring(sql)); @@ -648,6 +754,10 @@ void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id) if (likely(host)) { host->dbsync_worker = (void *)wc; wc->hostname = strdupz(rrdhost_hostname(host)); + if (node_id && !host->node_id) { + host->node_id = mallocz(sizeof(*host->node_id)); + uuid_copy(*host->node_id, *node_id); + } } else wc->hostname = get_hostname_by_node_id(wc->node_id); @@ -663,142 +773,4 @@ void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id) UNUSED(host_uuid); UNUSED(node_id); #endif - return; -} - -void sql_maint_aclk_sync_database(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) -{ - UNUSED(cmd); - - debug(D_ACLK, "Checking database for %s", wc->host_guid); - - BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE); - - buffer_sprintf(sql,"DELETE FROM aclk_alert_%s WHERE date_submitted IS NOT NULL AND " - "CAST(date_cloud_ack AS INT) < unixepoch()-%d;", wc->uuid_str, ACLK_DELETE_ACK_ALERTS_INTERNAL); - db_execute(buffer_tostring(sql)); - - buffer_free(sql); - return; -} - -#define SQL_SELECT_HOST_BY_UUID "SELECT host_id FROM host WHERE host_id = @host_id;" - -static int is_host_available(uuid_t *host_id) -{ - sqlite3_stmt *res = NULL; - int rc; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - error_report("Database has not been initialized"); - return 1; - } - - rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_HOST_BY_UUID, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to select node instance information for a node"); - return 1; - } - - rc = sqlite3_bind_blob(res, 1, host_id, sizeof(*host_id), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id parameter to select node instance information"); - goto failed; - } - rc = sqlite3_step_monitored(res); - - failed: - if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when checking host existence"); - - return (rc == SQLITE_ROW); -} - -// OPCODE: ACLK_DATABASE_DELETE_HOST -void sql_delete_aclk_table_list(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) -{ - UNUSED(wc); - char uuid_str[GUID_LEN + 1]; - char host_str[GUID_LEN + 1]; - - int rc; - uuid_t host_uuid; - char *host_guid = (char *)cmd.data; - - if (unlikely(!host_guid)) - return; - - rc = uuid_parse(host_guid, host_uuid); - freez(host_guid); - if (rc) - return; - - uuid_unparse_lower(host_uuid, host_str); - uuid_unparse_lower_fix(&host_uuid, uuid_str); - - debug(D_ACLK_SYNC, "Checking if I should delete aclk tables for node %s", host_str); - - if (is_host_available(&host_uuid)) { - debug(D_ACLK_SYNC, "Host %s exists, not deleting aclk sync tables", host_str); - return; - } - - debug(D_ACLK_SYNC, "Host %s does NOT exist, can delete aclk sync tables", host_str); - - sqlite3_stmt *res = NULL; - BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE); - - buffer_sprintf(sql,"SELECT 'drop '||type||' IF EXISTS '||name||';' FROM sqlite_schema " \ - "WHERE name LIKE 'aclk_%%_%s' AND type IN ('table', 'trigger', 'index');", uuid_str); - - rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); - if (rc != SQLITE_OK) { - error_report("Failed to prepare statement to clean up aclk tables"); - goto fail; - } - buffer_flush(sql); - - while (sqlite3_step_monitored(res) == SQLITE_ROW) - buffer_strcat(sql, (char *) sqlite3_column_text(res, 0)); - - rc = sqlite3_finalize(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize statement to clean up aclk tables, rc = %d", rc); - - db_execute(buffer_tostring(sql)); - -fail: - buffer_free(sql); - return; -} - -static int sql_check_aclk_table(void *data, int argc, char **argv, char **column) -{ - struct aclk_database_worker_config *wc = data; - UNUSED(argc); - UNUSED(column); - - debug(D_ACLK_SYNC,"Scheduling aclk sync table check for node %s", (char *) argv[0]); - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_DELETE_HOST; - cmd.data = strdupz((char *) argv[0]); - aclk_database_enq_cmd_noblock(wc, &cmd); - return 0; -} - -#define SQL_SELECT_ACLK_ACTIVE_LIST "SELECT REPLACE(SUBSTR(name,19),'_','-') FROM sqlite_schema " \ - "WHERE name LIKE 'aclk_chart_latest_%' AND type IN ('table');" - -void sql_check_aclk_table_list(struct aclk_database_worker_config *wc) -{ - char *err_msg = NULL; - debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist"); - int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ACTIVE_LIST, sql_check_aclk_table, (void *) wc, &err_msg); - if (rc != SQLITE_OK) { - error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg); - sqlite3_free(err_msg); - } - return; -} +} \ No newline at end of file diff --git a/database/sqlite/sqlite_aclk.h b/database/sqlite/sqlite_aclk.h index 06d5d0270..208177e45 100644 --- a/database/sqlite/sqlite_aclk.h +++ b/database/sqlite/sqlite_aclk.h @@ -99,7 +99,7 @@ struct aclk_database_cmd { struct aclk_completion *completion; }; -#define ACLK_DATABASE_CMD_Q_MAX_SIZE (16384) +#define ACLK_DATABASE_CMD_Q_MAX_SIZE (1024) struct aclk_database_cmdqueue { unsigned head, tail; @@ -166,9 +166,6 @@ int aclk_database_enq_cmd_noblock(struct aclk_database_worker_config *wc, struct void aclk_database_enq_cmd(struct aclk_database_worker_config *wc, struct aclk_database_cmd *cmd); void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id); void sql_aclk_sync_init(void); -void sql_check_aclk_table_list(struct aclk_database_worker_config *wc); -void sql_delete_aclk_table_list(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void sql_maint_aclk_sync_database(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); int claimed(); void aclk_sync_exit_all(); struct aclk_database_worker_config *find_inactive_wc_by_node_id(char *node_id); diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c index 47663a8d1..ce284ebc3 100644 --- a/database/sqlite/sqlite_aclk_alert.c +++ b/database/sqlite/sqlite_aclk_alert.c @@ -43,6 +43,34 @@ void update_filtered(ALARM_ENTRY *ae, uint32_t unique_id, char *uuid_str) { ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; } +static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char *uuid_str) { + sqlite3_stmt *res = NULL; + int rc = 0; + bool ret = false; + + char sql[ACLK_SYNC_QUERY_SIZE]; + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "select hl.unique_id from health_log_%s hl, alert_hash ah where hl.unique_id = %u " \ + "and hl.config_hash_id = ah.hash_id " \ + "and ah.warn is null and ah.crit is null;", uuid_str, unique_id); + + rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to check for alert variables."); + return false; + } + + rc = sqlite3_step_monitored(res); + if (likely(rc == SQLITE_ROW)) { + ret = true; + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when trying to check for alert variables, rc = %d", rc); + + return ret; +} + #define MAX_REMOVED_PERIOD 86400 //decide if some events should be sent or not int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) @@ -59,6 +87,9 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) if (unlikely(uuid_is_null(ae->config_hash_id))) return 0; + if (is_event_from_alert_variable_config(ae->unique_id, uuid_str)) + return 0; + char sql[ACLK_SYNC_QUERY_SIZE]; uuid_t config_hash_id; RRDCALC_STATUS status; @@ -133,6 +164,9 @@ done: // and handle both cases int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter) { + if(!service_running(SERVICE_ACLK)) + return 0; + if (!claimed()) return 0; @@ -153,7 +187,7 @@ int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter) char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf( sql, @@ -242,7 +276,7 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d return; } - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); if (wc->alerts_start_seq_id != 0) { buffer_sprintf( @@ -267,20 +301,38 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d sqlite3_stmt *res = NULL; - buffer_sprintf(sql, "select aa.sequence_id, hl.unique_id, hl.alarm_id, hl.config_hash_id, hl.updated_by_id, hl.when_key, \ - hl.duration, hl.non_clear_duration, hl.flags, hl.exec_run_timestamp, hl.delay_up_to_timestamp, hl.name, \ - hl.chart, hl.family, hl.exec, hl.recipient, hl.source, hl.units, hl.info, hl.exec_code, hl.new_status, \ - hl.old_status, hl.delay, hl.new_value, hl.old_value, hl.last_repeat, hl.chart_context \ - from health_log_%s hl, aclk_alert_%s aa \ - where hl.unique_id = aa.alert_unique_id and aa.date_submitted is null \ - order by aa.sequence_id asc limit %d;", wc->uuid_str, wc->uuid_str, limit); + buffer_sprintf(sql, "select aa.sequence_id, hl.unique_id, hl.alarm_id, hl.config_hash_id, hl.updated_by_id, hl.when_key, " \ + " hl.duration, hl.non_clear_duration, hl.flags, hl.exec_run_timestamp, hl.delay_up_to_timestamp, hl.name, " \ + " hl.chart, hl.family, hl.exec, hl.recipient, hl.source, hl.units, hl.info, hl.exec_code, hl.new_status, " \ + " hl.old_status, hl.delay, hl.new_value, hl.old_value, hl.last_repeat, hl.chart_context " \ + " from health_log_%s hl, aclk_alert_%s aa " \ + " where hl.unique_id = aa.alert_unique_id and aa.date_submitted is null " \ + " order by aa.sequence_id asc limit %d;", wc->uuid_str, wc->uuid_str, limit); rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); if (rc != SQLITE_OK) { - error_report("Failed to prepare statement when trying to send an alert update via ACLK"); - buffer_free(sql); - freez(claim_id); - return; + + // Try to create tables + if (wc->host) + sql_create_health_log_table(wc->host); + + BUFFER *sql_fix = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + buffer_sprintf(sql_fix, TABLE_ACLK_ALERT, wc->uuid_str); + db_execute(buffer_tostring(sql_fix)); + buffer_flush(sql_fix); + buffer_sprintf(sql_fix, INDEX_ACLK_ALERT, wc->uuid_str, wc->uuid_str); + db_execute(buffer_tostring(sql_fix)); + buffer_free(sql_fix); + + // Try again + rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to send an alert update via ACLK"); + + buffer_free(sql); + freez(claim_id); + return; + } } char uuid_str[GUID_LEN + 1]; @@ -311,7 +363,7 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d alarm_log.utc_offset = wc->host->utc_offset; alarm_log.timezone = strdupz(rrdhost_abbrev_timezone(wc->host)); alarm_log.exec_path = sqlite3_column_bytes(res, 14) > 0 ? strdupz((char *)sqlite3_column_text(res, 14)) : - strdupz((char *)string2str(wc->host->health_default_exec)); + strdupz((char *)string2str(wc->host->health.health_default_exec)); alarm_log.conf_source = strdupz((char *)sqlite3_column_text(res, 16)); char *edit_command = sqlite3_column_bytes(res, 16) > 0 ? @@ -407,7 +459,7 @@ void sql_queue_existing_alerts_to_aclk(RRDHOST *host) { char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf(sql,"delete from aclk_alert_%s; " \ "insert into aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ @@ -484,7 +536,7 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a struct timeval first_timestamp; struct timeval last_timestamp; - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); sqlite3_stmt *res = NULL; @@ -528,7 +580,7 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a alarm_log.node_id = wc->node_id; alarm_log.log_entries = log_entries; alarm_log.status = wc->alert_updates == 0 ? 2 : 1; - alarm_log.enabled = (int)host->health_enabled; + alarm_log.enabled = (int)host->health.health_enabled; wc->alert_sequence_id = last_sequence; @@ -541,6 +593,8 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a freez(claim_id); buffer_free(sql); + + aclk_alert_reloaded = 1; #endif return; @@ -651,7 +705,7 @@ int aclk_push_alert_config_event(struct aclk_database_worker_config *wc, struct alarm_config.p_db_lookup_dimensions = sqlite3_column_bytes(res, 27) > 0 ? strdupz((char *)sqlite3_column_text(res, 27)) : NULL; alarm_config.p_db_lookup_method = sqlite3_column_bytes(res, 28) > 0 ? strdupz((char *)sqlite3_column_text(res, 28)) : NULL; - BUFFER *tmp_buf = buffer_create(1024); + BUFFER *tmp_buf = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); buffer_data_options2string(tmp_buf, sqlite3_column_int(res, 29)); alarm_config.p_db_lookup_options = strdupz((char *)buffer_tostring(tmp_buf)); buffer_free(tmp_buf); @@ -706,7 +760,7 @@ void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start (struct aclk_database_worker_config *)host->dbsync_worker : (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); - if (unlikely(!host->health_enabled)) { + if (unlikely(!host->health.health_enabled)) { log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); return; } @@ -735,7 +789,7 @@ void sql_process_queue_removed_alerts_to_aclk(struct aclk_database_worker_config { UNUSED(cmd); - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf(sql,"insert into aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ "select unique_id alert_unique_id, unixepoch(), unique_id alert_unique_id from health_log_%s " \ @@ -813,7 +867,7 @@ void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id, uint64_t sn void aclk_mark_alert_cloud_ack(char *uuid_str, uint64_t alerts_ack_sequence_id) { - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); if (alerts_ack_sequence_id != 0) { buffer_sprintf( @@ -846,7 +900,7 @@ void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_EN alarm_log->utc_offset = host->utc_offset; alarm_log->timezone = strdupz(rrdhost_abbrev_timezone(host)); - alarm_log->exec_path = ae->exec ? strdupz(ae_exec(ae)) : strdupz((char *)string2str(host->health_default_exec)); + alarm_log->exec_path = ae->exec ? strdupz(ae_exec(ae)) : strdupz((char *)string2str(host->health.health_default_exec)); alarm_log->conf_source = ae->source ? strdupz(ae_source(ae)) : strdupz((char *)""); alarm_log->command = strdupz((char *)edit_command); @@ -1022,7 +1076,7 @@ void sql_aclk_alert_clean_dead_entries(RRDHOST *host) char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf(sql,"delete from aclk_alert_%s where filtered_alert_unique_id not in " " (select unique_id from health_log_%s); ", uuid_str, uuid_str); @@ -1048,7 +1102,7 @@ int get_proto_alert_status(RRDHOST *host, struct proto_alert_status *proto_alert proto_alert_status->alert_updates = wc->alert_updates; proto_alert_status->alerts_batch_id = wc->alerts_batch_id; - BUFFER *sql = buffer_create(1024); + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); sqlite3_stmt *res = NULL; buffer_sprintf(sql, "SELECT MIN(sequence_id), MAX(sequence_id), " \ diff --git a/database/sqlite/sqlite_context.c b/database/sqlite/sqlite_context.c index deca84584..892292cc7 100644 --- a/database/sqlite/sqlite_context.c +++ b/database/sqlite/sqlite_context.c @@ -283,8 +283,8 @@ void ctx_get_context_list(uuid_t *host_uuid, void (*dict_cb)(VERSIONED_CONTEXT_D context_data.chart_type = (char *) sqlite3_column_text(res, 3); context_data.units = (char *) sqlite3_column_text(res, 4); context_data.priority = sqlite3_column_int64(res, 5); - context_data.first_time_t = sqlite3_column_int64(res, 6); - context_data.last_time_t = sqlite3_column_int64(res, 7); + context_data.first_time_s = sqlite3_column_int64(res, 6); + context_data.last_time_s = sqlite3_column_int64(res, 7); context_data.deleted = sqlite3_column_int(res, 8); context_data.family = (char *) sqlite3_column_text(res, 9); dict_cb(&context_data, data); @@ -360,13 +360,13 @@ int ctx_store_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) goto skip_store; } - rc = sqlite3_bind_int64(res, 8, (time_t) context_data->first_time_t); + rc = sqlite3_bind_int64(res, 8, (time_t) context_data->first_time_s); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind first_time_t to store context details"); goto skip_store; } - rc = sqlite3_bind_int64(res, 9, (time_t) context_data->last_time_t); + rc = sqlite3_bind_int64(res, 9, (time_t) context_data->last_time_s); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind last_time_t to store context details"); goto skip_store; @@ -478,8 +478,8 @@ static void dict_ctx_get_context_list_cb(VERSIONED_CONTEXT_DATA *context_data, v context_data->chart_type, context_data->units, context_data->priority, - context_data->first_time_t, - context_data->last_time_t, + context_data->first_time_s, + context_data->last_time_s, context_data->deleted, context_data->family); } @@ -504,8 +504,8 @@ int ctx_unittest(void) context_data.family = strdupz("TestContextFamily"); context_data.priority = 50000; context_data.deleted = 0; - context_data.first_time_t = 1657781000; - context_data.last_time_t = 1657781100; + context_data.first_time_s = 1657781000; + context_data.last_time_s = 1657781100; context_data.version = now_realtime_usec(); if (likely(!ctx_store_context(&host_uuid, &context_data))) @@ -519,8 +519,8 @@ int ctx_unittest(void) info("Entry %s not inserted", context_data.id); // This will change end time - context_data.first_time_t = 1657781000; - context_data.last_time_t = 1657782001; + context_data.first_time_s = 1657781000; + context_data.last_time_s = 1657782001; if (likely(!ctx_update_context(&host_uuid, &context_data))) info("Entry %s updated", context_data.id); else @@ -530,8 +530,8 @@ int ctx_unittest(void) info("List context end after insert"); // This will change start time - context_data.first_time_t = 1657782000; - context_data.last_time_t = 1657782001; + context_data.first_time_s = 1657782000; + context_data.last_time_s = 1657782001; if (likely(!ctx_update_context(&host_uuid, &context_data))) info("Entry %s updated", context_data.id); else diff --git a/database/sqlite/sqlite_context.h b/database/sqlite/sqlite_context.h index 2e52b9bf8..2586916ea 100644 --- a/database/sqlite/sqlite_context.h +++ b/database/sqlite/sqlite_context.h @@ -45,8 +45,8 @@ typedef struct versioned_context_data { uint64_t priority; // the chart priority of the context - uint64_t first_time_t; // the first entry in the database, in seconds - uint64_t last_time_t; // the last point in the database, in seconds + uint64_t first_time_s; // the first entry in the database, in seconds + uint64_t last_time_s; // the last point in the database, in seconds bool deleted; // true when this is deleted diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index ce5487fbf..1d03cfc2a 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -22,9 +22,8 @@ const char *database_config[] = { "multiplier int, divisor int , algorithm int, options text);", "CREATE TABLE IF NOT EXISTS metadata_migration(filename text, file_size, date_created int);", - "CREATE INDEX IF NOT EXISTS ind_d1 on dimension (chart_id, id, name);", - "CREATE INDEX IF NOT EXISTS ind_c1 on chart (host_id, id, type, name);", - "CREATE INDEX IF NOT EXISTS ind_c2 on chart (host_id, context);", + "CREATE INDEX IF NOT EXISTS ind_d2 on dimension (chart_id);", + "CREATE INDEX IF NOT EXISTS ind_c3 on chart (host_id);", "CREATE TABLE IF NOT EXISTS chart_label(chart_id blob, source_type int, label_key text, " "label_value text, date_created int, PRIMARY KEY (chart_id, label_key));", "CREATE TABLE IF NOT EXISTS node_instance (host_id blob PRIMARY KEY, claim_id, node_id, date_created);", @@ -55,6 +54,9 @@ const char *database_cleanup[] = { "DELETE FROM host_info WHERE host_id NOT IN (SELECT host_id FROM host);", "DELETE FROM host_label WHERE host_id NOT IN (SELECT host_id FROM host);", "DROP TRIGGER IF EXISTS tr_dim_del;", + "DROP INDEX IF EXISTS ind_d1;", + "DROP INDEX IF EXISTS ind_c1;", + "DROP INDEX IF EXISTS ind_c2;", NULL }; @@ -504,211 +506,6 @@ skip: return result; } - - -// -// Support for archived charts (TO BE REMOVED) -// -#define SELECT_DIMENSION "select d.id, d.name from dimension d where d.chart_id = @chart_uuid;" - -static void sql_rrdim2json(sqlite3_stmt *res_dim, uuid_t *chart_uuid, BUFFER *wb, size_t *dimensions_count) -{ - int rc; - - rc = sqlite3_bind_blob(res_dim, 1, chart_uuid, sizeof(*chart_uuid), SQLITE_STATIC); - if (rc != SQLITE_OK) - return; - - int dimensions = 0; - buffer_sprintf(wb, "\t\t\t\"dimensions\": {\n"); - - while (sqlite3_step_monitored(res_dim) == SQLITE_ROW) { - if (dimensions) - buffer_strcat(wb, ",\n\t\t\t\t\""); - else - buffer_strcat(wb, "\t\t\t\t\""); - buffer_strcat_jsonescape(wb, (const char *) sqlite3_column_text(res_dim, 0)); - buffer_strcat(wb, "\": { \"name\": \""); - buffer_strcat_jsonescape(wb, (const char *) sqlite3_column_text(res_dim, 1)); - buffer_strcat(wb, "\" }"); - dimensions++; - } - *dimensions_count += dimensions; - buffer_sprintf(wb, "\n\t\t\t}"); -} - -#define SELECT_CHART "select chart_id, id, name, type, family, context, title, priority, plugin, " \ - "module, unit, chart_type, update_every from chart " \ - "where host_id = @host_uuid and chart_id not in (select chart_id from chart_active) order by chart_id asc;" - -void sql_rrdset2json(RRDHOST *host, BUFFER *wb) -{ - // time_t first_entry_t = 0; //= rrdset_first_entry_t(st); - // time_t last_entry_t = 0; //rrdset_last_entry_t(st); - static char *custom_dashboard_info_js_filename = NULL; - int rc; - - sqlite3_stmt *res_chart = NULL; - sqlite3_stmt *res_dim = NULL; - time_t now = now_realtime_sec(); - - rc = sqlite3_prepare_v2(db_meta, SELECT_CHART, -1, &res_chart, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to fetch host archived charts"); - return; - } - - rc = sqlite3_bind_blob(res_chart, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host parameter to fetch archived charts"); - goto failed; - } - - rc = sqlite3_prepare_v2(db_meta, SELECT_DIMENSION, -1, &res_dim, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to fetch chart archived dimensions"); - goto failed; - }; - - if(unlikely(!custom_dashboard_info_js_filename)) - custom_dashboard_info_js_filename = config_get(CONFIG_SECTION_WEB, "custom dashboard_info.js", ""); - - buffer_sprintf(wb, "{\n" - "\t\"hostname\": \"%s\"" - ",\n\t\"version\": \"%s\"" - ",\n\t\"release_channel\": \"%s\"" - ",\n\t\"os\": \"%s\"" - ",\n\t\"timezone\": \"%s\"" - ",\n\t\"update_every\": %d" - ",\n\t\"history\": %ld" - ",\n\t\"memory_mode\": \"%s\"" - ",\n\t\"custom_info\": \"%s\"" - ",\n\t\"charts\": {" - , rrdhost_hostname(host) - , rrdhost_program_version(host) - , get_release_channel() - , rrdhost_os(host) - , rrdhost_timezone(host) - , host->rrd_update_every - , host->rrd_history_entries - , rrd_memory_mode_name(host->rrd_memory_mode) - , custom_dashboard_info_js_filename - ); - - size_t c = 0; - size_t dimensions = 0; - - while (sqlite3_step_monitored(res_chart) == SQLITE_ROW) { - char id[512]; - sprintf(id, "%s.%s", sqlite3_column_text(res_chart, 3), sqlite3_column_text(res_chart, 1)); - RRDSET *st = rrdset_find(host, id); - if (st && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) - continue; - - if (c) - buffer_strcat(wb, ",\n\t\t\""); - else - buffer_strcat(wb, "\n\t\t\""); - c++; - - buffer_strcat(wb, id); - buffer_strcat(wb, "\": "); - - buffer_sprintf( - wb, - "\t\t{\n" - "\t\t\t\"id\": \"%s\",\n" - "\t\t\t\"name\": \"%s\",\n" - "\t\t\t\"type\": \"%s\",\n" - "\t\t\t\"family\": \"%s\",\n" - "\t\t\t\"context\": \"%s\",\n" - "\t\t\t\"title\": \"%s (%s)\",\n" - "\t\t\t\"priority\": %ld,\n" - "\t\t\t\"plugin\": \"%s\",\n" - "\t\t\t\"module\": \"%s\",\n" - "\t\t\t\"enabled\": %s,\n" - "\t\t\t\"units\": \"%s\",\n" - "\t\t\t\"data_url\": \"/api/v1/data?chart=%s\",\n" - "\t\t\t\"chart_type\": \"%s\",\n", - id //sqlite3_column_text(res_chart, 1) - , - id // sqlite3_column_text(res_chart, 2) - , - sqlite3_column_text(res_chart, 3), sqlite3_column_text(res_chart, 4), sqlite3_column_text(res_chart, 5), - sqlite3_column_text(res_chart, 6), id //sqlite3_column_text(res_chart, 2) - , - (long ) sqlite3_column_int(res_chart, 7), - (const char *) sqlite3_column_text(res_chart, 8) ? (const char *) sqlite3_column_text(res_chart, 8) : (char *) "", - (const char *) sqlite3_column_text(res_chart, 9) ? (const char *) sqlite3_column_text(res_chart, 9) : (char *) "", (char *) "false", - (const char *) sqlite3_column_text(res_chart, 10), id //sqlite3_column_text(res_chart, 2) - , - rrdset_type_name(sqlite3_column_int(res_chart, 11))); - - sql_rrdim2json(res_dim, (uuid_t *) sqlite3_column_blob(res_chart, 0), wb, &dimensions); - - rc = sqlite3_reset(res_dim); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to reset the prepared statement when reading archived chart dimensions"); - buffer_strcat(wb, "\n\t\t}"); - } - - buffer_sprintf(wb - , "\n\t}" - ",\n\t\"charts_count\": %zu" - ",\n\t\"dimensions_count\": %zu" - ",\n\t\"alarms_count\": %zu" - ",\n\t\"rrd_memory_bytes\": %zu" - ",\n\t\"hosts_count\": %zu" - ",\n\t\"hosts\": [" - , c - , dimensions - , (size_t) 0 - , (size_t) 0 - , rrd_hosts_available - ); - - if(unlikely(rrd_hosts_available > 1)) { - rrd_rdlock(); - - size_t found = 0; - RRDHOST *h; - rrdhost_foreach_read(h) { - if(!rrdhost_should_be_removed(h, host, now) && !rrdhost_flag_check(h, RRDHOST_FLAG_ARCHIVED)) { - buffer_sprintf(wb - , "%s\n\t\t{" - "\n\t\t\t\"hostname\": \"%s\"" - "\n\t\t}" - , (found > 0) ? "," : "" - , rrdhost_hostname(h) - ); - - found++; - } - } - - rrd_unlock(); - } - else { - buffer_sprintf(wb - , "\n\t\t{" - "\n\t\t\t\"hostname\": \"%s\"" - "\n\t\t}" - , rrdhost_hostname(host) - ); - } - - buffer_sprintf(wb, "\n\t]\n}\n"); - - rc = sqlite3_finalize(res_dim); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when reading archived chart dimensions"); - -failed: - rc = sqlite3_finalize(res_chart); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when reading archived charts"); -} - void db_execute(const char *cmd) { int rc; @@ -732,116 +529,6 @@ void db_execute(const char *cmd) } } -#define SELECT_MIGRATED_FILE "select 1 from metadata_migration where filename = @path;" - -int file_is_migrated(char *path) -{ - sqlite3_stmt *res = NULL; - int rc; - - rc = sqlite3_prepare_v2(db_meta, SELECT_MIGRATED_FILE, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to fetch host"); - return 0; - } - - rc = sqlite3_bind_text(res, 1, path, -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind filename parameter to check migration"); - return 0; - } - - rc = sqlite3_step_monitored(res); - - if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when checking if metadata file is migrated"); - - return (rc == SQLITE_ROW); -} - -#define STORE_MIGRATED_FILE "insert or replace into metadata_migration (filename, file_size, date_created) " \ - "values (@file, @size, unixepoch());" - -void add_migrated_file(char *path, uint64_t file_size) -{ - sqlite3_stmt *res = NULL; - int rc; - - rc = sqlite3_prepare_v2(db_meta, STORE_MIGRATED_FILE, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to fetch host"); - return; - } - - rc = sqlite3_bind_text(res, 1, path, -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind filename parameter to store migration information"); - return; - } - - rc = sqlite3_bind_int64(res, 2, (sqlite_int64) file_size); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind size parameter to store migration information"); - return; - } - - rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to store migrated file, rc = %d", rc); - - if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when checking if metadata file is migrated"); -} - - - -#define SQL_STORE_CLAIM_ID "insert into node_instance " \ - "(host_id, claim_id, date_created) values (@host_id, @claim_id, unixepoch()) " \ - "on conflict(host_id) do update set claim_id = excluded.claim_id;" - -void store_claim_id(uuid_t *host_id, uuid_t *claim_id) -{ - sqlite3_stmt *res = NULL; - int rc; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - error_report("Database has not been initialized"); - return; - } - - rc = sqlite3_prepare_v2(db_meta, SQL_STORE_CLAIM_ID, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement store chart labels"); - return; - } - - rc = sqlite3_bind_blob(res, 1, host_id, sizeof(*host_id), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id parameter to store node instance information"); - goto failed; - } - - if (claim_id) - rc = sqlite3_bind_blob(res, 2, claim_id, sizeof(*claim_id), SQLITE_STATIC); - else - rc = sqlite3_bind_null(res, 2); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind claim_id parameter to store node instance information"); - goto failed; - } - - rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to store node instance information, rc = %d", rc); - -failed: - if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when storing node instance information"); - - return; -} - static inline void set_host_node_id(RRDHOST *host, uuid_t *node_id) { if (unlikely(!host)) @@ -1268,112 +955,3 @@ int sql_metadata_cache_stats(int op) netdata_thread_enable_cancelability(); return count; } - -#define SQL_FIND_CHART_UUID \ - "SELECT chart_id FROM chart WHERE host_id = @host AND type=@type AND id=@id AND (name IS NULL OR name=@name) AND chart_id IS NOT NULL;" - -#define SQL_FIND_DIMENSION_UUID \ - "SELECT dim_id FROM dimension WHERE chart_id=@chart AND id=@id AND name=@name AND LENGTH(dim_id)=16;" - - -//Do a database lookup to find the UUID of a chart -//If found store it in store_uuid and return 0 -int sql_find_chart_uuid(RRDHOST *host, RRDSET *st, uuid_t *store_uuid) -{ - static __thread sqlite3_stmt *res = NULL; - int rc; - - const char *name = string2str(st->parts.name); - - if (unlikely(!db_meta) && default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) - return 1; - - if (unlikely(!res)) { - rc = prepare_statement(db_meta, SQL_FIND_CHART_UUID, &res); - if (rc != SQLITE_OK) { - error_report("Failed to prepare statement to lookup chart UUID in the database"); - return 1; - } - } - - rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = sqlite3_bind_text(res, 2, string2str(st->parts.type), -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = sqlite3_bind_text(res, 3, string2str(st->parts.id), -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = sqlite3_bind_text(res, 4, name && *name ? name : string2str(st->parts.id), -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - int status = 1; - rc = sqlite3_step_monitored(res); - if (likely(rc == SQLITE_ROW)) { - uuid_copy(*store_uuid, sqlite3_column_blob(res, 0)); - status = 0; - } - - rc = sqlite3_reset(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to reset statement when searching for a chart UUID, rc = %d", rc); - - return status; - -bind_fail: - error_report("Failed to bind input parameter to perform chart UUID database lookup, rc = %d", rc); - rc = sqlite3_reset(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to reset statement when searching for a chart UUID, rc = %d", rc); - return 1; -} - -int sql_find_dimension_uuid(RRDSET *st, RRDDIM *rd, uuid_t *store_uuid) -{ - static __thread sqlite3_stmt *res = NULL; - int rc; - int status = 1; - - if (unlikely(!db_meta) && default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) - return 1; - - if (unlikely(!res)) { - rc = prepare_statement(db_meta, SQL_FIND_DIMENSION_UUID, &res); - if (rc != SQLITE_OK) { - error_report("Failed to bind prepare statement to lookup dimension UUID in the database"); - return 1; - } - } - - rc = sqlite3_bind_blob(res, 1, st->chart_uuid, sizeof(*st->chart_uuid), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = sqlite3_bind_text(res, 2, rrddim_id(rd), -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = sqlite3_bind_text(res, 3, rrddim_name(rd), -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = sqlite3_step_monitored(res); - if (likely(rc == SQLITE_ROW)) { - uuid_copy(*store_uuid, *((uuid_t *) sqlite3_column_blob(res, 0))); - status = 0; - } - - rc = sqlite3_reset(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to reset statement find dimension uuid, rc = %d", rc); - return status; - -bind_fail: - error_report("Failed to bind input parameter to perform dimension UUID database lookup, rc = %d", rc); - return 1; -} diff --git a/database/sqlite/sqlite_functions.h b/database/sqlite/sqlite_functions.h index 5731d5c9e..40abd010d 100644 --- a/database/sqlite/sqlite_functions.h +++ b/database/sqlite/sqlite_functions.h @@ -54,9 +54,7 @@ void sql_close_database(void); int bind_text_null(sqlite3_stmt *res, int position, const char *text, bool can_be_null); int prepare_statement(sqlite3 *database, const char *query, sqlite3_stmt **statement); int execute_insert(sqlite3_stmt *res); -int file_is_migrated(char *path); int exec_statement_with_uuid(const char *sql, uuid_t *uuid); -void add_migrated_file(char *path, uint64_t file_size); void db_execute(const char *cmd); // Look up functions @@ -65,16 +63,11 @@ int get_host_id(uuid_t *node_id, uuid_t *host_id); struct node_instance_list *get_node_list(void); void sql_load_node_id(RRDHOST *host); char *get_hostname_by_node_id(char *node_id); -int sql_find_chart_uuid(RRDHOST *host, RRDSET *st, uuid_t *store_uuid); -int sql_find_dimension_uuid(RRDSET *st, RRDDIM *rd, uuid_t *store_uuid); // Help build archived hosts in memory when agent starts void sql_build_host_system_info(uuid_t *host_id, struct rrdhost_system_info *system_info); DICTIONARY *sql_load_host_labels(uuid_t *host_id); -// For queries: To be removed when context queries are implemented -void sql_rrdset2json(RRDHOST *host, BUFFER *wb); - // TODO: move to metadata int update_node_id(uuid_t *host_id, uuid_t *node_id); diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index c189305b8..471fa3add 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -61,8 +61,12 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("HEALTH [%s]: Failed to prepare statement for SQL_UPDATE_HEALTH_LOG", rrdhost_hostname(host)); - return; + sql_create_health_log_table(host); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); + return; + } } rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) ae->updated_by_id); @@ -103,8 +107,6 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) error_report("HEALTH [%s]: Failed to finalize the prepared statement for updating health log.", rrdhost_hostname(host)); - - return; } /* Health related SQL queries @@ -134,8 +136,12 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); - return; + sql_create_health_log_table(host); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); + return; + } } rc = sqlite3_bind_text(res, 1, rrdhost_hostname(host), -1, SQLITE_STATIC); @@ -337,13 +343,11 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { } ae->flags |= HEALTH_ENTRY_FLAG_SAVED; - host->health_log_entries_written++; + host->health.health_log_entries_written++; failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) error_report("HEALTH [%s]: Failed to finalize the prepared statement for inserting to health log.", rrdhost_hostname(host)); - - return; } void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) @@ -369,7 +373,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { if(rotate_every < 100) rotate_every = 100; } - if(likely(host->health_log_entries_written < rotate_every)) { + if(likely(host->health.health_log_entries_written < rotate_every)) { return; } @@ -382,7 +386,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health_log_entries_written - rotate_every))); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { @@ -398,7 +402,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize the prepared statement to cleanup health log table"); - host->health_log_entries_written = rotate_every; + host->health.health_log_entries_written = rotate_every; sql_aclk_alert_clean_dead_entries(host); } @@ -431,13 +435,13 @@ void sql_health_alarm_log_count(RRDHOST *host) { rc = sqlite3_step_monitored(res); if (likely(rc == SQLITE_ROW)) - host->health_log_entries_written = (size_t) sqlite3_column_int64(res, 0); + host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize the prepared statement to count health log entries from db"); - info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health_log_entries_written); + info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written); } #define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \ @@ -537,8 +541,6 @@ void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) error_report("HEALTH [N/A]: Failed to finalize the prepared statement for injecting removed event."); - return; - } #define SQL_SELECT_MAX_UNIQUE_ID(guid) "SELECT MAX(unique_id) from health_log_%s", guid @@ -612,7 +614,7 @@ void sql_health_alarm_log_load(RRDHOST *host) { ssize_t errored = 0, loaded = 0; char command[MAX_HEALTH_SQL_SIZE + 1]; - host->health_log_entries_written = 0; + host->health.health_log_entries_written = 0; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) diff --git a/database/sqlite/sqlite_metadata.c b/database/sqlite/sqlite_metadata.c index 4eb212152..35f928ffa 100644 --- a/database/sqlite/sqlite_metadata.c +++ b/database/sqlite/sqlite_metadata.c @@ -4,9 +4,9 @@ // SQL statements -#define SQL_STORE_CLAIM_ID "insert into node_instance " \ - "(host_id, claim_id, date_created) values (@host_id, @claim_id, unixepoch()) " \ - "on conflict(host_id) do update set claim_id = excluded.claim_id;" +#define SQL_STORE_CLAIM_ID "INSERT INTO node_instance " \ + "(host_id, claim_id, date_created) VALUES (@host_id, @claim_id, unixepoch()) " \ + "ON CONFLICT(host_id) DO UPDATE SET claim_id = excluded.claim_id;" #define SQL_DELETE_HOST_LABELS "DELETE FROM host_label WHERE host_id = @uuid;" @@ -56,24 +56,13 @@ #define MAX_METADATA_CLEANUP (500) // Maximum metadata write operations (e.g deletes before retrying) #define METADATA_MAX_BATCH_SIZE (512) // Maximum commands to execute before running the event loop -#define METADATA_MAX_TRANSACTION_BATCH (128) // Maximum commands to add in a transaction enum metadata_opcode { METADATA_DATABASE_NOOP = 0, METADATA_DATABASE_TIMER, - METADATA_ADD_CHART, - METADATA_ADD_CHART_LABEL, - METADATA_ADD_DIMENSION, METADATA_DEL_DIMENSION, - METADATA_ADD_DIMENSION_OPTION, - METADATA_ADD_HOST_SYSTEM_INFO, - METADATA_ADD_HOST_INFO, METADATA_STORE_CLAIM_ID, - METADATA_STORE_HOST_LABELS, - METADATA_STORE_BUFFER, - - METADATA_SKIP_TRANSACTION, // Dummy -- OPCODES less than this one can be in a tranasction - + METADATA_ADD_HOST_INFO, METADATA_SCAN_HOSTS, METADATA_MAINTENANCE, METADATA_SYNC_SHUTDOWN, @@ -105,14 +94,14 @@ typedef enum { struct metadata_wc { uv_thread_t thread; + uv_loop_t *loop; + uv_async_t async; + uv_timer_t timer_req; time_t check_metadata_after; time_t check_hosts_after; volatile unsigned queue_size; - uv_loop_t *loop; - uv_async_t async; METADATA_FLAG flags; uint64_t row_id; - uv_timer_t timer_req; struct completion init_complete; /* FIFO command queue */ uv_mutex_t cmd_mutex; @@ -339,7 +328,7 @@ static int sql_store_host_info(RRDHOST *host) if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_int(res, ++param, (int ) host->health_enabled); + rc = sqlite3_bind_int(res, ++param, (int ) host->health.health_enabled); if (unlikely(rc != SQLITE_OK)) goto bind_fail; @@ -383,7 +372,7 @@ static BUFFER *sql_store_host_system_info(RRDHOST *host) if (unlikely(!system_info)) return NULL; - BUFFER *work_buffer = buffer_create(1024); + BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); struct query_build key_data = {.sql = work_buffer, .count = 0}; uuid_unparse_lower(host->host_uuid, key_data.uuid_str); @@ -417,49 +406,6 @@ static BUFFER *sql_store_host_system_info(RRDHOST *host) } -/* - * Store set option for a dimension - */ -static int sql_set_dimension_option(uuid_t *dim_uuid, char *option) -{ - sqlite3_stmt *res = NULL; - int rc; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) - return 0; - error_report("Database has not been initialized"); - return 1; - } - - rc = sqlite3_prepare_v2(db_meta, "UPDATE dimension SET options = @options WHERE dim_id = @dim_id", -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to update dimension options"); - return 0; - }; - - rc = sqlite3_bind_blob(res, 2, dim_uuid, sizeof(*dim_uuid), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - if (!option || !strcmp(option,"unhide")) - rc = sqlite3_bind_null(res, 1); - else - rc = sqlite3_bind_text(res, 1, option, -1, SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) - goto bind_fail; - - rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to update dimension option, rc = %d", rc); - -bind_fail: - rc = sqlite3_finalize(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize statement in update dimension options, rc = %d", rc); - return 0; -} - /* * Store a chart in the database */ @@ -665,22 +611,26 @@ bind_fail: return 1; } -static bool dimension_can_be_deleted(uuid_t *dim_uuid) +static bool dimension_can_be_deleted(uuid_t *dim_uuid __maybe_unused) { #ifdef ENABLE_DBENGINE - bool no_retention = true; - for (size_t tier = 0; tier < storage_tiers; tier++) { - if (!multidb_ctx[tier]) - continue; - time_t first_time_t = 0, last_time_t = 0; - if (rrdeng_metric_retention_by_uuid((void *) multidb_ctx[tier], dim_uuid, &first_time_t, &last_time_t) == 0) { - if (first_time_t > 0) { - no_retention = false; - break; + if(dbengine_enabled) { + bool no_retention = true; + for (size_t tier = 0; tier < storage_tiers; tier++) { + if (!multidb_ctx[tier]) + continue; + time_t first_time_t = 0, last_time_t = 0; + if (rrdeng_metric_retention_by_uuid((void *) multidb_ctx[tier], dim_uuid, &first_time_t, &last_time_t)) { + if (first_time_t > 0) { + no_retention = false; + break; + } } } + return no_retention; } - return no_retention; + else + return false; #else return false; #endif @@ -736,6 +686,16 @@ skip_run: error_report("Failed to finalize the prepared statement when reading dimensions"); } +static void cleanup_health_log(void) +{ + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) + continue; + sql_health_alarm_log_cleanup(host); + } + dfe_done(host); +} // // EVENT LOOP STARTS HERE @@ -817,7 +777,7 @@ static void metadata_enq_cmd(struct metadata_wc *wc, struct metadata_cmd *cmd) (void) uv_async_send(&wc->async); } -static struct metadata_cmd metadata_deq_cmd(struct metadata_wc *wc, enum metadata_opcode *next_opcode) +static struct metadata_cmd metadata_deq_cmd(struct metadata_wc *wc) { struct metadata_cmd ret; unsigned queue_size; @@ -828,7 +788,6 @@ static struct metadata_cmd metadata_deq_cmd(struct metadata_wc *wc, enum metadat memset(&ret, 0, sizeof(ret)); ret.opcode = METADATA_DATABASE_NOOP; ret.completion = NULL; - *next_opcode = METADATA_DATABASE_NOOP; } else { /* dequeue command */ ret = wc->cmd_queue.cmd_array[wc->cmd_queue.head]; @@ -840,10 +799,6 @@ static struct metadata_cmd metadata_deq_cmd(struct metadata_wc *wc, enum metadat wc->cmd_queue.head + 1 : 0; } wc->queue_size = queue_size - 1; - if (wc->queue_size > 0) - *next_opcode = wc->cmd_queue.cmd_array[wc->cmd_queue.head].opcode; - else - *next_opcode = METADATA_DATABASE_NOOP; /* wake up producers */ uv_cond_signal(&wc->cmd_cond); } @@ -892,10 +847,16 @@ static void after_metadata_cleanup(uv_work_t *req, int status) struct metadata_wc *wc = req->data; metadata_flag_clear(wc, METADATA_FLAG_CLEANUP); } + static void start_metadata_cleanup(uv_work_t *req) { + register_libuv_worker_jobs(); + + worker_is_busy(UV_EVENT_METADATA_CLEANUP); struct metadata_wc *wc = req->data; check_dimension_metadata(wc); + cleanup_health_log(); + worker_is_idle(); } struct scan_metadata_payload { @@ -920,13 +881,13 @@ static void after_metadata_hosts(uv_work_t *req, int status __maybe_unused) freez(data); } -static bool metadata_scan_host(RRDHOST *host, uint32_t max_count) { +static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, size_t *query_counter) { RRDSET *st; int rc; bool more_to_do = false; uint32_t scan_count = 1; - BUFFER *work_buffer = buffer_create(1024); + BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); rrdset_foreach_reentrant(st, host) { if (scan_count == max_count) { @@ -934,6 +895,8 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count) { break; } if(rrdset_flag_check(st, RRDSET_FLAG_METADATA_UPDATE)) { + (*query_counter)++; + rrdset_flag_clear(st, RRDSET_FLAG_METADATA_UPDATE); scan_count++; @@ -963,8 +926,15 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count) { RRDDIM *rd; rrddim_foreach_read(rd, st) { if(rrddim_flag_check(rd, RRDDIM_FLAG_METADATA_UPDATE)) { + (*query_counter)++; + rrddim_flag_clear(rd, RRDDIM_FLAG_METADATA_UPDATE); + if (rrddim_option_check(rd, RRDDIM_OPTION_HIDDEN)) + rrddim_flag_set(rd, RRDDIM_FLAG_META_HIDDEN); + else + rrddim_flag_clear(rd, RRDDIM_FLAG_META_HIDDEN); + rc = sql_store_dimension( &rd->metric_uuid, &rd->rrdset->chart_uuid, @@ -990,52 +960,119 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count) { // Worker thread to scan hosts for pending metadata to store static void start_metadata_hosts(uv_work_t *req __maybe_unused) { + register_libuv_worker_jobs(); + RRDHOST *host; struct scan_metadata_payload *data = req->data; struct metadata_wc *wc = data->wc; + usec_t all_started_ut = now_monotonic_usec(); (void)all_started_ut; + internal_error(true, "METADATA: checking all hosts..."); + bool run_again = false; + worker_is_busy(UV_EVENT_METADATA_STORE); + + if (!data->max_count) + db_execute("BEGIN TRANSACTION;"); dfe_start_reentrant(rrdhost_root_index, host) { if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED) || !rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_UPDATE)) continue; - internal_error(true, "METADATA: Scanning host %s", rrdhost_hostname(host)); + + size_t query_counter = 0; (void)query_counter; + usec_t started_ut = now_monotonic_usec(); (void)started_ut; + rrdhost_flag_clear(host,RRDHOST_FLAG_METADATA_UPDATE); - if (unlikely(metadata_scan_host(host, data->max_count))) { + + if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_LABELS))) { + rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_LABELS); + int rc = exec_statement_with_uuid(SQL_DELETE_HOST_LABELS, &host->host_uuid); + if (likely(rc == SQLITE_OK)) { + BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + struct query_build tmp = {.sql = work_buffer, .count = 0}; + uuid_unparse_lower(host->host_uuid, tmp.uuid_str); + rrdlabels_walkthrough_read(host->rrdlabels, host_label_store_to_sql_callback, &tmp); + db_execute(buffer_tostring(work_buffer)); + buffer_free(work_buffer); + query_counter++; + } + } + + if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_CLAIMID))) { + rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_CLAIMID); + uuid_t uuid; + + if (likely(host->aclk_state.claimed_id && !uuid_parse(host->aclk_state.claimed_id, uuid))) + store_claim_id(&host->host_uuid, &uuid); + else + store_claim_id(&host->host_uuid, NULL); + + query_counter++; + } + + if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_INFO))) { + rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_INFO); + + BUFFER *work_buffer = sql_store_host_system_info(host); + if(work_buffer) { + db_execute(buffer_tostring(work_buffer)); + buffer_free(work_buffer); + query_counter++; + } + + int rc = sql_store_host_info(host); + if (unlikely(rc)) + error_report("METADATA: 'host:%s': failed to store host info", string2str(host->hostname)); + else + query_counter++; + } + + if (data->max_count) + db_execute("BEGIN TRANSACTION;"); + if (unlikely(metadata_scan_host(host, data->max_count, &query_counter))) { run_again = true; rrdhost_flag_set(host,RRDHOST_FLAG_METADATA_UPDATE); - internal_error(true,"METADATA: Rescheduling host %s to run; more charts to store", rrdhost_hostname(host)); + internal_error(true,"METADATA: 'host:%s': scheduling another run, more charts to store", rrdhost_hostname(host)); } + if (data->max_count) + db_execute("COMMIT TRANSACTION;"); + + usec_t ended_ut = now_monotonic_usec(); (void)ended_ut; + internal_error(true, "METADATA: 'host:%s': saved metadata with %zu SQL statements, in %0.2f ms", + rrdhost_hostname(host), query_counter, + (double)(ended_ut - started_ut) / USEC_PER_MS); } dfe_done(host); + if (!data->max_count) + db_execute("COMMIT TRANSACTION;"); + + usec_t all_ended_ut = now_monotonic_usec(); (void)all_ended_ut; + internal_error(true, "METADATA: checking all hosts completed in %0.2f ms", + (double)(all_ended_ut - all_started_ut) / USEC_PER_MS); + if (unlikely(run_again)) wc->check_hosts_after = now_realtime_sec() + METADATA_HOST_CHECK_IMMEDIATE; else wc->check_hosts_after = now_realtime_sec() + METADATA_HOST_CHECK_INTERVAL; + worker_is_idle(); } static void metadata_event_loop(void *arg) { + service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); worker_register("METASYNC"); worker_register_job_name(METADATA_DATABASE_NOOP, "noop"); worker_register_job_name(METADATA_DATABASE_TIMER, "timer"); - worker_register_job_name(METADATA_ADD_CHART, "add chart"); - worker_register_job_name(METADATA_ADD_CHART_LABEL, "add chart label"); - worker_register_job_name(METADATA_ADD_DIMENSION, "add dimension"); worker_register_job_name(METADATA_DEL_DIMENSION, "delete dimension"); - worker_register_job_name(METADATA_ADD_DIMENSION_OPTION, "dimension option"); - worker_register_job_name(METADATA_ADD_HOST_SYSTEM_INFO, "host system info"); - worker_register_job_name(METADATA_ADD_HOST_INFO, "host info"); worker_register_job_name(METADATA_STORE_CLAIM_ID, "add claim id"); - worker_register_job_name(METADATA_STORE_HOST_LABELS, "host labels"); + worker_register_job_name(METADATA_ADD_HOST_INFO, "add host info"); worker_register_job_name(METADATA_MAINTENANCE, "maintenance"); - int ret; uv_loop_t *loop; unsigned cmd_batch_size; struct metadata_wc *wc = arg; - enum metadata_opcode opcode, next_opcode; + enum metadata_opcode opcode; uv_work_t metadata_cleanup_worker; uv_thread_set_name_np(wc->thread, "METASYNC"); @@ -1073,20 +1110,12 @@ static void metadata_event_loop(void *arg) wc->check_hosts_after = now_realtime_sec() + METADATA_HOST_CHECK_FIRST_CHECK; int shutdown = 0; - int in_transaction = 0; - int commands_in_transaction = 0; - // This can be used in the event loop for all opcodes (not workers) - BUFFER *work_buffer = buffer_create(1024); wc->row_id = 0; completion_mark_complete(&wc->init_complete); while (shutdown == 0 || (wc->flags & METADATA_WORKER_BUSY)) { - RRDDIM *rd = NULL; - RRDSET *st = NULL; - RRDHOST *host = NULL; - DICTIONARY_ITEM *dict_item = NULL; - BUFFER *buffer = NULL; uuid_t *uuid; + RRDHOST *host = NULL; int rc; worker_is_idle(); @@ -1098,7 +1127,7 @@ static void metadata_event_loop(void *arg) if (unlikely(cmd_batch_size >= METADATA_MAX_BATCH_SIZE)) break; - cmd = metadata_deq_cmd(wc, &next_opcode); + cmd = metadata_deq_cmd(wc); opcode = cmd.opcode; if (unlikely(opcode == METADATA_DATABASE_NOOP && metadata_flag_check(wc, METADATA_FLAG_SHUTDOWN))) { @@ -1108,130 +1137,38 @@ static void metadata_event_loop(void *arg) ++cmd_batch_size; - // If we are not in transaction and this command is the same with the next ; start a transaction - if (!in_transaction && opcode < METADATA_SKIP_TRANSACTION && opcode == next_opcode) { - if (opcode != METADATA_DATABASE_NOOP) { - in_transaction = 1; - db_execute("BEGIN TRANSACTION;"); - } - } - - if (likely(in_transaction)) { - commands_in_transaction++; - } - if (likely(opcode != METADATA_DATABASE_NOOP)) - worker_is_busy(opcode); + worker_is_busy(opcode); switch (opcode) { case METADATA_DATABASE_NOOP: case METADATA_DATABASE_TIMER: break; - case METADATA_ADD_CHART: - dict_item = (DICTIONARY_ITEM * ) cmd.param[0]; - st = (RRDSET *) dictionary_acquired_item_value(dict_item); - - rc = sql_store_chart( - &st->chart_uuid, - &st->rrdhost->host_uuid, - string2str(st->parts.type), - string2str(st->parts.id), - string2str(st->parts.name), - rrdset_family(st), - rrdset_context(st), - rrdset_title(st), - rrdset_units(st), - rrdset_plugin_name(st), - rrdset_module_name(st), - st->priority, - st->update_every, - st->chart_type, - st->rrd_memory_mode, - st->entries); - - if (unlikely(rc)) - error_report("Failed to store chart %s", rrdset_id(st)); - - dictionary_acquired_item_release(st->rrdhost->rrdset_root_index, dict_item); - break; - case METADATA_ADD_CHART_LABEL: - dict_item = (DICTIONARY_ITEM * ) cmd.param[0]; - st = (RRDSET *) dictionary_acquired_item_value(dict_item); - check_and_update_chart_labels(st, work_buffer); - dictionary_acquired_item_release(st->rrdhost->rrdset_root_index, dict_item); - break; - case METADATA_ADD_DIMENSION: - dict_item = (DICTIONARY_ITEM * ) cmd.param[0]; - rd = (RRDDIM *) dictionary_acquired_item_value(dict_item); - - rc = sql_store_dimension( - &rd->metric_uuid, - &rd->rrdset->chart_uuid, - string2str(rd->id), - string2str(rd->name), - rd->multiplier, - rd->divisor, - rd->algorithm, - rrddim_option_check(rd, RRDDIM_OPTION_HIDDEN)); - if (unlikely(rc)) - error_report("Failed to store dimension %s", rrddim_id(rd)); - - dictionary_acquired_item_release(rd->rrdset->rrddim_root_index, dict_item); - break; case METADATA_DEL_DIMENSION: uuid = (uuid_t *) cmd.param[0]; if (likely(dimension_can_be_deleted(uuid))) delete_dimension_uuid(uuid); freez(uuid); break; - case METADATA_ADD_DIMENSION_OPTION: - dict_item = (DICTIONARY_ITEM * ) cmd.param[0]; - rd = (RRDDIM *) dictionary_acquired_item_value(dict_item); - rc = sql_set_dimension_option( - &rd->metric_uuid, rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN) ? "hidden" : NULL); - if (unlikely(rc)) - error_report("Failed to store dimension option for %s", string2str(rd->id)); - dictionary_acquired_item_release(rd->rrdset->rrddim_root_index, dict_item); - break; - case METADATA_ADD_HOST_SYSTEM_INFO: - buffer = (BUFFER *) cmd.param[0]; - db_execute(buffer_tostring(buffer)); - buffer_free(buffer); - break; - case METADATA_ADD_HOST_INFO: - dict_item = (DICTIONARY_ITEM * ) cmd.param[0]; - host = (RRDHOST *) dictionary_acquired_item_value(dict_item); - rc = sql_store_host_info(host); - if (unlikely(rc)) - error_report("Failed to store host info in the database for %s", string2str(host->hostname)); - dictionary_acquired_item_release(rrdhost_root_index, dict_item); - break; case METADATA_STORE_CLAIM_ID: store_claim_id((uuid_t *) cmd.param[0], (uuid_t *) cmd.param[1]); freez((void *) cmd.param[0]); freez((void *) cmd.param[1]); break; - case METADATA_STORE_HOST_LABELS: - dict_item = (DICTIONARY_ITEM * ) cmd.param[0]; - host = (RRDHOST *) dictionary_acquired_item_value(dict_item); - rc = exec_statement_with_uuid(SQL_DELETE_HOST_LABELS, &host->host_uuid); - - if (likely(rc == SQLITE_OK)) { - buffer_flush(work_buffer); - struct query_build tmp = {.sql = work_buffer, .count = 0}; - uuid_unparse_lower(host->host_uuid, tmp.uuid_str); - rrdlabels_walkthrough_read(host->rrdlabels, host_label_store_to_sql_callback, &tmp); - db_execute(buffer_tostring(work_buffer)); - } - - dictionary_acquired_item_release(rrdhost_root_index, dict_item); + case METADATA_ADD_HOST_INFO: + host = (RRDHOST *) cmd.param[0]; + rc = sql_store_host_info(host); + if (unlikely(rc)) + error_report("Failed to store host info in the database for %s", string2str(host->hostname)); break; - case METADATA_SCAN_HOSTS: if (unlikely(metadata_flag_check(wc, METADATA_FLAG_SCANNING_HOSTS))) break; + if (unittest_running) + break; + struct scan_metadata_payload *data = mallocz(sizeof(*data)); data->request.data = data; data->wc = wc; @@ -1242,7 +1179,7 @@ static void metadata_event_loop(void *arg) cmd.completion = NULL; // Do not complete after launching worker (worker will do) } else - data->max_count = 1000; + data->max_count = 5000; metadata_flag_set(wc, METADATA_FLAG_SCANNING_HOSTS); if (unlikely( @@ -1255,11 +1192,6 @@ static void metadata_event_loop(void *arg) metadata_flag_clear(wc, METADATA_FLAG_SCANNING_HOSTS); } break; - case METADATA_STORE_BUFFER: - buffer = (BUFFER *) cmd.param[0]; - db_execute(buffer_tostring(buffer)); - buffer_free(buffer); - break; case METADATA_MAINTENANCE: if (unlikely(metadata_flag_check(wc, METADATA_FLAG_CLEANUP))) break; @@ -1279,11 +1211,6 @@ static void metadata_event_loop(void *arg) default: break; } - if (in_transaction && (commands_in_transaction >= METADATA_MAX_TRANSACTION_BATCH || opcode != next_opcode)) { - in_transaction = 0; - db_execute("COMMIT TRANSACTION;"); - commands_in_transaction = 0; - } if (cmd.completion) completion_mark_complete(cmd.completion); @@ -1302,8 +1229,6 @@ static void metadata_event_loop(void *arg) uv_run(loop, UV_RUN_DEFAULT); uv_cond_destroy(&wc->cmd_cond); - /* uv_mutex_destroy(&wc->cmd_mutex); */ - //fatal_assert(0 == uv_loop_close(loop)); int rc; do { @@ -1313,7 +1238,6 @@ static void metadata_event_loop(void *arg) freez(loop); worker_unregister(); - buffer_free(work_buffer); info("METADATA: Shutting down event loop"); completion_mark_complete(&wc->init_complete); return; @@ -1408,50 +1332,6 @@ static inline void queue_metadata_cmd(enum metadata_opcode opcode, const void *p } // Public -void metaqueue_chart_update(RRDSET *st) -{ - const DICTIONARY_ITEM *acquired_st = dictionary_get_and_acquire_item(st->rrdhost->rrdset_root_index, string2str(st->id)); - queue_metadata_cmd(METADATA_ADD_CHART, acquired_st, NULL); -} - -// -// RD may not be collected, so we may store it needlessly -void metaqueue_dimension_update(RRDDIM *rd) -{ - const DICTIONARY_ITEM *acquired_rd = - dictionary_get_and_acquire_item(rd->rrdset->rrddim_root_index, string2str(rd->id)); - - if (unlikely(rrdset_flag_check(rd->rrdset, RRDSET_FLAG_METADATA_UPDATE))) { - metaqueue_chart_update(rd->rrdset); - rrdset_flag_clear(rd->rrdset, RRDSET_FLAG_METADATA_UPDATE); - } - - queue_metadata_cmd(METADATA_ADD_DIMENSION, acquired_rd, NULL); -} - -void metaqueue_dimension_update_flags(RRDDIM *rd) -{ - const DICTIONARY_ITEM *acquired_rd = - dictionary_get_and_acquire_item(rd->rrdset->rrddim_root_index, string2str(rd->id)); - queue_metadata_cmd(METADATA_ADD_DIMENSION_OPTION, acquired_rd, NULL); -} - -void metaqueue_host_update_system_info(RRDHOST *host) -{ - BUFFER *work_buffer = sql_store_host_system_info(host); - - if (unlikely(!work_buffer)) - return; - - queue_metadata_cmd(METADATA_ADD_HOST_SYSTEM_INFO, work_buffer, NULL); -} - -void metaqueue_host_update_info(const char *machine_guid) -{ - const DICTIONARY_ITEM *acquired_host = dictionary_get_and_acquire_item(rrdhost_root_index, machine_guid); - queue_metadata_cmd(METADATA_ADD_HOST_INFO, acquired_host, NULL); -} - void metaqueue_delete_dimension_uuid(uuid_t *uuid) { if (unlikely(!metasync_worker.loop)) @@ -1477,24 +1357,13 @@ void metaqueue_store_claim_id(uuid_t *host_uuid, uuid_t *claim_uuid) queue_metadata_cmd(METADATA_STORE_CLAIM_ID, local_host_uuid, local_claim_uuid); } -void metaqueue_store_host_labels(const char *machine_guid) +void metaqueue_host_update_info(RRDHOST *host) { - const DICTIONARY_ITEM *acquired_host = dictionary_get_and_acquire_item(rrdhost_root_index, machine_guid); - queue_metadata_cmd(METADATA_STORE_HOST_LABELS, acquired_host, NULL); -} - -void metaqueue_buffer(BUFFER *buffer) -{ - queue_metadata_cmd(METADATA_STORE_BUFFER, buffer, NULL); -} - -void metaqueue_chart_labels(RRDSET *st) -{ - const DICTIONARY_ITEM *acquired_st = dictionary_get_and_acquire_item(st->rrdhost->rrdset_root_index, string2str(st->id)); - queue_metadata_cmd(METADATA_ADD_CHART_LABEL, acquired_st, NULL); + if (unlikely(!metasync_worker.loop)) + return; + queue_metadata_cmd(METADATA_ADD_HOST_INFO, host, NULL); } - // // unitests // @@ -1542,7 +1411,7 @@ static void *metadata_unittest_threads(void) tu.join = 0; for (int i = 0; i < threads_to_create; i++) { char buf[100 + 1]; - snprintf(buf, 100, "meta%d", i); + snprintf(buf, 100, "META[%d]", i); netdata_thread_create( &threads[i], buf, @@ -1558,7 +1427,6 @@ static void *metadata_unittest_threads(void) void *retval; netdata_thread_join(threads[i], &retval); } -// uv_async_send(&metasync_worker.async); sleep_usec(5 * USEC_PER_SEC); fprintf(stderr, "Added %u elements, processed %u\n", tu.added, tu.processed); diff --git a/database/sqlite/sqlite_metadata.h b/database/sqlite/sqlite_metadata.h index 9293facf8..d578b7a8f 100644 --- a/database/sqlite/sqlite_metadata.h +++ b/database/sqlite/sqlite_metadata.h @@ -11,17 +11,10 @@ void metadata_sync_init(void); void metadata_sync_shutdown(void); void metadata_sync_shutdown_prepare(void); -void metaqueue_dimension_update(RRDDIM *rd); -void metaqueue_chart_update(RRDSET *st); -void metaqueue_dimension_update_flags(RRDDIM *rd); -void metaqueue_host_update_system_info(RRDHOST *host); -void metaqueue_host_update_info(const char *machine_guid); void metaqueue_delete_dimension_uuid(uuid_t *uuid); void metaqueue_store_claim_id(uuid_t *host_uuid, uuid_t *claim_uuid); -void metaqueue_store_host_labels(const char *machine_guid); -void metaqueue_chart_labels(RRDSET *st); +void metaqueue_host_update_info(RRDHOST *host); void migrate_localhost(uuid_t *host_uuid); -void metaqueue_buffer(BUFFER *buffer); // UNIT TEST int metadata_unittest(void); diff --git a/database/storage_engine.c b/database/storage_engine.c index edf017db4..c5ba86552 100644 --- a/database/storage_engine.c +++ b/database/storage_engine.c @@ -6,23 +6,24 @@ #include "engine/rrdengineapi.h" #endif -#define im_collect_ops { \ - .init = rrddim_collect_init,\ - .store_metric = rrddim_collect_store_metric,\ - .flush = rrddim_store_metric_flush,\ - .finalize = rrddim_collect_finalize, \ +#define im_collect_ops { \ + .init = rrddim_collect_init, \ + .store_metric = rrddim_collect_store_metric, \ + .flush = rrddim_store_metric_flush, \ + .finalize = rrddim_collect_finalize, \ .change_collection_frequency = rrddim_store_metric_change_collection_frequency, \ - .metrics_group_get = rrddim_metrics_group_get, \ - .metrics_group_release = rrddim_metrics_group_release, \ + .metrics_group_get = rrddim_metrics_group_get, \ + .metrics_group_release = rrddim_metrics_group_release, \ } -#define im_query_ops { \ - .init = rrddim_query_init, \ - .next_metric = rrddim_query_next_metric, \ - .is_finished = rrddim_query_is_finished, \ - .finalize = rrddim_query_finalize, \ - .latest_time = rrddim_query_latest_time, \ - .oldest_time = rrddim_query_oldest_time \ +#define im_query_ops { \ + .init = rrddim_query_init, \ + .next_metric = rrddim_query_next_metric, \ + .is_finished = rrddim_query_is_finished, \ + .finalize = rrddim_query_finalize, \ + .latest_time_s = rrddim_query_latest_time_s, \ + .oldest_time_s = rrddim_query_oldest_time_s, \ + .align_to_optimal_before = rrddim_query_align_to_optimal_before, \ } static STORAGE_ENGINE engines[] = { @@ -34,8 +35,9 @@ static STORAGE_ENGINE engines[] = { .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, + .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, .collect_ops = im_collect_ops, - .query_ops = im_query_ops + .query_ops = im_query_ops, } }, { @@ -46,8 +48,9 @@ static STORAGE_ENGINE engines[] = { .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, + .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, .collect_ops = im_collect_ops, - .query_ops = im_query_ops + .query_ops = im_query_ops, } }, { @@ -58,8 +61,9 @@ static STORAGE_ENGINE engines[] = { .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, + .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, .collect_ops = im_collect_ops, - .query_ops = im_query_ops + .query_ops = im_query_ops, } }, { @@ -70,8 +74,9 @@ static STORAGE_ENGINE engines[] = { .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, + .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, .collect_ops = im_collect_ops, - .query_ops = im_query_ops + .query_ops = im_query_ops, } }, { @@ -82,8 +87,9 @@ static STORAGE_ENGINE engines[] = { .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, + .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, .collect_ops = im_collect_ops, - .query_ops = im_query_ops + .query_ops = im_query_ops, } }, #ifdef ENABLE_DBENGINE @@ -95,6 +101,7 @@ static STORAGE_ENGINE engines[] = { .metric_get_or_create = rrdeng_metric_get_or_create, .metric_dup = rrdeng_metric_dup, .metric_release = rrdeng_metric_release, + .metric_retention_by_uuid = rrdeng_metric_retention_by_uuid, .collect_ops = { .init = rrdeng_store_metric_init, .store_metric = rrdeng_store_metric_next, @@ -109,8 +116,9 @@ static STORAGE_ENGINE engines[] = { .next_metric = rrdeng_load_metric_next, .is_finished = rrdeng_load_metric_is_finished, .finalize = rrdeng_load_metric_finalize, - .latest_time = rrdeng_metric_latest_time, - .oldest_time = rrdeng_metric_oldest_time + .latest_time_s = rrdeng_metric_latest_time, + .oldest_time_s = rrdeng_metric_oldest_time, + .align_to_optimal_before = rrdeng_load_align_to_optimal_before, } } }, -- cgit v1.2.3