diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-10-13 08:36:33 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-10-13 08:36:33 +0000 |
commit | a30a849b78fa4fe8552141b7b2802d1af1b18c09 (patch) | |
tree | fab3c8bf29bf2d565595d4fa6a9413916ff02fee /database | |
parent | Adding upstream version 1.17.1. (diff) | |
download | netdata-a30a849b78fa4fe8552141b7b2802d1af1b18c09.tar.xz netdata-a30a849b78fa4fe8552141b7b2802d1af1b18c09.zip |
Adding upstream version 1.18.0.upstream/1.18.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | database/README.md | 206 | ||||
-rw-r--r-- | database/engine/README.md | 186 | ||||
-rw-r--r-- | database/engine/pagecache.c | 36 | ||||
-rw-r--r-- | database/engine/pagecache.h | 38 | ||||
-rw-r--r-- | database/engine/rrdengine.c | 69 | ||||
-rw-r--r-- | database/engine/rrdengineapi.c | 42 | ||||
-rw-r--r-- | database/engine/rrdengineapi.h | 2 | ||||
-rw-r--r-- | database/engine/rrdenginelib.c | 4 | ||||
-rw-r--r-- | database/rrd.c | 4 | ||||
-rw-r--r-- | database/rrd.h | 3 | ||||
-rw-r--r-- | database/rrdcalc.c | 215 | ||||
-rw-r--r-- | database/rrdcalc.h | 11 | ||||
-rw-r--r-- | database/rrdcalctemplate.c | 36 | ||||
-rw-r--r-- | database/rrdcalctemplate.h | 8 | ||||
-rw-r--r-- | database/rrddim.c | 52 | ||||
-rw-r--r-- | database/rrdhost.c | 14 | ||||
-rw-r--r-- | database/rrdset.c | 2 |
17 files changed, 613 insertions, 315 deletions
diff --git a/database/README.md b/database/README.md index 2fcb69b67..143615a0e 100644 --- a/database/README.md +++ b/database/README.md @@ -1,59 +1,54 @@ # Database -Although `netdata` does all its calculations using `long double`, it stores all values using -a [custom-made 32-bit number](../libnetdata/storage_number/). +Although `netdata` does all its calculations using `long double`, it stores all values using a [custom-made 32-bit +number](../libnetdata/storage_number/). -So, for each dimension of a chart, Netdata will need: `4 bytes for the value * the entries -of its history`. It will not store any other data for each value in the time series database. -Since all its values are stored in a time series with fixed step, the time each value -corresponds can be calculated at run time, using the position of a value in the round robin database. +So, for each dimension of a chart, Netdata will need: `4 bytes for the value * the entries of its history`. It will not +store any other data for each value in the time series database. Since all its values are stored in a time series with +fixed step, the time each value corresponds can be calculated at run time, using the position of a value in the round +robin database. -The default history is 3.600 entries, thus it will need 14.4KB for each chart dimension. -If you need 1.000 dimensions, they will occupy just 14.4MB. +The default history is 3.600 entries, thus it will need 14.4KB for each chart dimension. If you need 1.000 dimensions, +they will occupy just 14.4MB. -Of course, 3.600 entries is a very short history, especially if data collection frequency is set -to 1 second. You will have just one hour of data. +Of course, 3.600 entries is a very short history, especially if data collection frequency is set to 1 second. You will +have just one hour of data. -For a day of data and 1.000 dimensions, you will need: 86.400 seconds * 4 bytes * 1.000 -dimensions = 345MB of RAM. +For a day of data and 1.000 dimensions, you will need: `86.400 seconds * 4 bytes * 1.000 dimensions = 345MB of RAM`. -One option you have to lower this number is to use -**[Memory Deduplication - Kernel Same Page Merging - KSM](#ksm)**. Another possibility is to -use the **[Database Engine](engine/)**. +One option you have to lower this number is to use **[Memory Deduplication - Kernel Same Page Merging - KSM](#ksm)**. +Another possibility is to use the **[Database Engine](engine/)**. ## Memory modes Currently Netdata supports 6 memory modes: -1. `ram`, data are purely in memory. Data are never saved on disk. This mode uses `mmap()` and - supports [KSM](#ksm). +1. `ram`, data are purely in memory. Data are never saved on disk. This mode uses `mmap()` and supports [KSM](#ksm). -2. `save`, (the default) data are only in RAM while Netdata runs and are saved to / loaded from - disk on Netdata restart. It also uses `mmap()` and supports [KSM](#ksm). +2. `save`, data are only in RAM while Netdata runs and are saved to / loaded from disk on Netdata + restart. It also uses `mmap()` and supports [KSM](#ksm). -3. `map`, data are in memory mapped files. This works like the swap. Keep in mind though, this - will have a constant write on your disk. When Netdata writes data on its memory, the Linux kernel - marks the related memory pages as dirty and automatically starts updating them on disk. - Unfortunately we cannot control how frequently this works. The Linux kernel uses exactly the - same algorithm it uses for its swap memory. Check below for additional information on running a - dedicated central Netdata server. This mode uses `mmap()` but does not support [KSM](#ksm). +3. `map`, data are in memory mapped files. This works like the swap. Keep in mind though, this will have a constant + write on your disk. When Netdata writes data on its memory, the Linux kernel marks the related memory pages as dirty + and automatically starts updating them on disk. Unfortunately we cannot control how frequently this works. The Linux + kernel uses exactly the same algorithm it uses for its swap memory. Check below for additional information on + running a dedicated central Netdata server. This mode uses `mmap()` but does not support [KSM](#ksm). 4. `none`, without a database (collected metrics can only be streamed to another Netdata). -5. `alloc`, like `ram` but it uses `calloc()` and does not support [KSM](#ksm). This mode is the - fallback for all others except `none`. +5. `alloc`, like `ram` but it uses `calloc()` and does not support [KSM](#ksm). This mode is the fallback for all + others except `none`. -6. `dbengine`, data are in database files. The [Database Engine](engine/) works like a traditional - database. There is some amount of RAM dedicated to data caching and indexing and the rest of - the data reside compressed on disk. The number of history entries is not fixed in this case, - but depends on the configured disk space and the effective compression ratio of the data stored. - This is the **only mode** that supports changing the data collection update frequency - (`update_every`) **without losing** the previously stored metrics. - For more details see [here](engine/). +6. `dbengine`, (the default) data are in database files. The [Database Engine](engine/) works like a traditional + database. There is some amount of RAM dedicated to data caching and indexing and the rest of the data reside + compressed on disk. The number of history entries is not fixed in this case, but depends on the configured disk + space and the effective compression ratio of the data stored. This is the **only mode** that supports changing the + data collection update frequency (`update_every`) **without losing** the previously stored metrics. For more details + see [here](engine/). You can select the memory mode by editing `netdata.conf` and setting: -``` +```conf [global] # ram, save (the default, save on exit, load on start), map (swap like) memory mode = save @@ -69,64 +64,60 @@ Embedded devices usually have very limited RAM resources available. There are 2 settings for you to tweak: 1. `update every`, which controls the data collection frequency -2. `history`, which controls the size of the database in RAM +2. `history`, which controls the size of the database in RAM (except for `memory mode = dbengine`) -By default `update every = 1` and `history = 3600`. This gives you an hour of data with per -second updates. +By default `update every = 1` and `history = 3600`. This gives you an hour of data with per second updates. -If you set `update every = 2` and `history = 1800`, you will still have an hour of data, but -collected once every 2 seconds. This will **cut in half** both CPU and RAM resources consumed -by Netdata. Of course experiment a bit. On very weak devices you might have to use -`update every = 5` and `history = 720` (still 1 hour of data, but 1/5 of the CPU and RAM resources). +If you set `update every = 2` and `history = 1800`, you will still have an hour of data, but collected once every 2 +seconds. This will **cut in half** both CPU and RAM resources consumed by Netdata. Of course experiment a bit. On very +weak devices you might have to use `update every = 5` and `history = 720` (still 1 hour of data, but 1/5 of the CPU and +RAM resources). -You can also disable [data collection plugins](../collectors) you don't need. -Disabling such plugins will also free both CPU and RAM resources. +You can also disable [data collection plugins](../collectors) you don't need. Disabling such plugins will also free both +CPU and RAM resources. ## Running a dedicated central Netdata server -Netdata allows streaming data between Netdata nodes. This allows us to have a central Netdata -server that will maintain the entire database for all nodes, and will also run health checks/alarms -for all nodes. +Netdata allows streaming data between Netdata nodes. This allows us to have a central Netdata server that will maintain +the entire database for all nodes, and will also run health checks/alarms for all nodes. -For this central Netdata, memory size can be a problem. Fortunately, Netdata supports several -memory modes. **One interesting option** for this setup is `memory mode = map`. +For this central Netdata, memory size can be a problem. Fortunately, Netdata supports several memory modes. **One +interesting option** for this setup is `memory mode = map`. ### map -In this mode, the database of Netdata is stored in memory mapped files. Netdata continues to read -and write the database in memory, but the kernel automatically loads and saves memory pages from/to -disk. +In this mode, the database of Netdata is stored in memory mapped files. Netdata continues to read and write the database +in memory, but the kernel automatically loads and saves memory pages from/to disk. -**We suggest _not_ to use this mode on nodes that run other applications.** There will always be -dirty memory to be synced and this syncing process may influence the way other applications work. -This mode however is useful when we need a central Netdata server that would normally need huge -amounts of memory. Using memory mode `map` we can overcome all memory restrictions. +**We suggest _not_ to use this mode on nodes that run other applications.** There will always be dirty memory to be +synced and this syncing process may influence the way other applications work. This mode however is useful when we need +a central Netdata server that would normally need huge amounts of memory. Using memory mode `map` we can overcome all +memory restrictions. -There are a few kernel options that provide finer control on the way this syncing works. But before -explaining them, a brief introduction of how Netdata database works is needed. +There are a few kernel options that provide finer control on the way this syncing works. But before explaining them, a +brief introduction of how Netdata database works is needed. For each chart, Netdata maps the following files: -1. `chart/main.db`, this is the file that maintains chart information. Every time data are collected - for a chart, this is updated. -2. `chart/dimension_name.db`, this is the file for each dimension. At its beginning there is a - header, followed by the round robin database where metrics are stored. +1. `chart/main.db`, this is the file that maintains chart information. Every time data are collected for a chart, this + is updated. +2. `chart/dimension_name.db`, this is the file for each dimension. At its beginning there is a header, followed by the + round robin database where metrics are stored. So, every time Netdata collects data, the following pages will become dirty: 1. the chart file 2. the header part of all dimension files -3. if the collected metrics are stored far enough in the dimension file, another page will - become dirty, for each dimension +3. if the collected metrics are stored far enough in the dimension file, another page will become dirty, for each + dimension -Each page in Linux is 4KB. So, with 200 charts and 1000 dimensions, there will be 1200 to 2200 4KB -pages dirty pages every second. Of course 1200 of them will always be dirty (the chart header and -the dimensions headers) and 1000 will be dirty for about 1000 seconds (4 bytes per metric, 4KB per -page, so 1000 seconds, or 16 minutes per page). +Each page in Linux is 4KB. So, with 200 charts and 1000 dimensions, there will be 1200 to 2200 4KB pages dirty pages +every second. Of course 1200 of them will always be dirty (the chart header and the dimensions headers) and 1000 will be +dirty for about 1000 seconds (4 bytes per metric, 4KB per page, so 1000 seconds, or 16 minutes per page). -Hopefully, the Linux kernel does not sync all these data every second. The frequency they are -synced is controlled by `/proc/sys/vm/dirty_expire_centisecs` or the -`sysctl` `vm.dirty_expire_centisecs`. The default on most systems is 3000 (30 seconds). +Hopefully, the Linux kernel does not sync all these data every second. The frequency they are synced is controlled by +`/proc/sys/vm/dirty_expire_centisecs` or the `sysctl` `vm.dirty_expire_centisecs`. The default on most systems is 3000 +(30 seconds). On a busy server centralizing metrics from 20+ servers you will experience this: @@ -134,62 +125,59 @@ On a busy server centralizing metrics from 20+ servers you will experience this: As you can see, there is quite some stress (this is `iowait`) every 30 seconds. -A simple solution is to increase this time to 10 minutes (60000). This is the same system -with this setting in 10 minutes: +A simple solution is to increase this time to 10 minutes (60000). This is the same system with this setting in 10 +minutes: ![image](https://cloud.githubusercontent.com/assets/2662304/23834784/d2304f72-0764-11e7-8389-fb830ffd973a.png) -Of course, setting this to 10 minutes means that data on disk might be up to 10 minutes old if you -get an abnormal shutdown. +Of course, setting this to 10 minutes means that data on disk might be up to 10 minutes old if you get an abnormal +shutdown. There are 2 more options to tweak: 1. `dirty_background_ratio`, by default `10`. 2. `dirty_ratio`, by default `20`. -These control the amount of memory that should be dirty for disk syncing to be triggered. -On dedicated Netdata servers, you can use: `80` and `90` respectively, so that all RAM is given -to Netdata. +These control the amount of memory that should be dirty for disk syncing to be triggered. On dedicated Netdata servers, +you can use: `80` and `90` respectively, so that all RAM is given to Netdata. -With these settings, you can expect a little `iowait` spike once every 10 minutes and in case -of system crash, data on disk will be up to 10 minutes old. +With these settings, you can expect a little `iowait` spike once every 10 minutes and in case of system crash, data on +disk will be up to 10 minutes old. ![image](https://cloud.githubusercontent.com/assets/2662304/23835030/ba4bf506-0768-11e7-9bc6-3b23e080c69f.png) -To have these settings automatically applied on boot, create the file `/etc/sysctl.d/netdata-memory.conf` with these contents: +To have these settings automatically applied on boot, create the file `/etc/sysctl.d/netdata-memory.conf` with these +contents: -``` +```conf vm.dirty_expire_centisecs = 60000 vm.dirty_background_ratio = 80 vm.dirty_ratio = 90 vm.dirty_writeback_centisecs = 0 ``` -There is another memory mode to help overcome the memory size problem. What is **most interesting -for this setup** is `memory mode = dbengine`. +There is another memory mode to help overcome the memory size problem. What is **most interesting for this setup** is +`memory mode = dbengine`. ### dbengine -In this mode, the database of Netdata is stored in database files. The [Database Engine](engine/) -works like a traditional database. There is some amount of RAM dedicated to data caching and -indexing and the rest of the data reside compressed on disk. The number of history entries is not -fixed in this case, but depends on the configured disk space and the effective compression ratio -of the data stored. +In this mode, the database of Netdata is stored in database files. The [Database Engine](engine/) works like a +traditional database. There is some amount of RAM dedicated to data caching and indexing and the rest of the data reside +compressed on disk. The number of history entries is not fixed in this case, but depends on the configured disk space +and the effective compression ratio of the data stored. -We suggest to use **this** mode on nodes that also run other applications. The Database Engine uses -direct I/O to avoid polluting the OS filesystem caches and does not generate excessive I/O traffic -so as to create the minimum possible interference with other applications. Using memory mode -`dbengine` we can overcome most memory restrictions. For more details see [here](engine/). +We suggest to use **this** mode on nodes that also run other applications. The Database Engine uses direct I/O to avoid +polluting the OS filesystem caches and does not generate excessive I/O traffic so as to create the minimum possible +interference with other applications. Using memory mode `dbengine` we can overcome most memory restrictions. For more +details see [here](engine/). ## KSM -Netdata offers all its round robin database to kernel for deduplication -(except for `memory mode = dbengine`). +Netdata offers all its round robin database to kernel for deduplication (except for `memory mode = dbengine`). -In the past KSM has been criticized for consuming a lot of CPU resources. -Although this is true when KSM is used for deduplicating certain applications, it is not true with -netdata, since the Netdata memory is written very infrequently (if you have 24 hours of metrics in -netdata, each byte at the in-memory database will be updated just once per day). +In the past KSM has been criticized for consuming a lot of CPU resources. Although this is true when KSM is used for +deduplicating certain applications, it is not true with netdata, since the Netdata memory is written very infrequently +(if you have 24 hours of metrics in netdata, each byte at the in-memory database will be updated just once per day). KSM is a solution that will provide 60+% memory savings to Netdata. @@ -203,15 +191,20 @@ CONFIG_KSM=y When KSM is enabled at the kernel is just available for the user to enable it. -So, if you build a kernel with `CONFIG_KSM=y` you will just get a few files in `/sys/kernel/mm/ksm`. Nothing else happens. There is no performance penalty (apart I guess from the memory this code occupies into the kernel). +So, if you build a kernel with `CONFIG_KSM=y` you will just get a few files in `/sys/kernel/mm/ksm`. Nothing else +happens. There is no performance penalty (apart I guess from the memory this code occupies into the kernel). The files that `CONFIG_KSM=y` offers include: -- `/sys/kernel/mm/ksm/run` by default `0`. You have to set this to `1` for the kernel to spawn `ksmd`. -- `/sys/kernel/mm/ksm/sleep_millisecs`, by default `20`. The frequency ksmd should evaluate memory for deduplication. -- `/sys/kernel/mm/ksm/pages_to_scan`, by default `100`. The amount of pages ksmd will evaluate on each run. +- `/sys/kernel/mm/ksm/run` by default `0`. You have to set this to `1` for the + kernel to spawn `ksmd`. +- `/sys/kernel/mm/ksm/sleep_millisecs`, by default `20`. The frequency ksmd + should evaluate memory for deduplication. +- `/sys/kernel/mm/ksm/pages_to_scan`, by default `100`. The amount of pages + ksmd will evaluate on each run. -So, by default `ksmd` is just disabled. It will not harm performance and the user/admin can control the CPU resources he/she is willing `ksmd` to use. +So, by default `ksmd` is just disabled. It will not harm performance and the user/admin can control the CPU resources +he/she is willing `ksmd` to use. ### Run `ksmd` kernel daemon @@ -222,7 +215,8 @@ echo 1 >/sys/kernel/mm/ksm/run echo 1000 >/sys/kernel/mm/ksm/sleep_millisecs ``` -With these settings ksmd does not even appear in the running process list (it will run once per second and evaluate 100 pages for de-duplication). +With these settings ksmd does not even appear in the running process list (it will run once per second and evaluate 100 +pages for de-duplication). Put the above lines in your boot sequence (`/etc/rc.local` or equivalent) to have `ksmd` run at boot. @@ -232,4 +226,4 @@ Netdata will create charts for kernel memory de-duplication performance, like th ![image](https://cloud.githubusercontent.com/assets/2662304/11998786/eb23ae54-aab6-11e5-94d4-e848e8a5c56a.png) -[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdatabase%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdatabase%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>)
\ No newline at end of file diff --git a/database/engine/README.md b/database/engine/README.md index 7791a549f..78f3b15ec 100644 --- a/database/engine/README.md +++ b/database/engine/README.md @@ -1,18 +1,17 @@ # Database engine -The Database Engine works like a traditional -database. There is some amount of RAM dedicated to data caching and indexing and the rest of -the data reside compressed on disk. The number of history entries is not fixed in this case, -but depends on the configured disk space and the effective compression ratio of the data stored. -This is the **only mode** that supports changing the data collection update frequency -(`update_every`) **without losing** the previously stored metrics. +The Database Engine works like a traditional database. There is some amount of RAM dedicated to data caching and +indexing and the rest of the data reside compressed on disk. The number of history entries is not fixed in this case, +but depends on the configured disk space and the effective compression ratio of the data stored. This is the **only +mode** that supports changing the data collection update frequency (`update_every`) **without losing** the previously +stored metrics. ## Files -With the DB engine memory mode the metric data are stored in database files. These files are -organized in pairs, the datafiles and their corresponding journalfiles, e.g.: +With the DB engine memory mode the metric data are stored in database files. These files are organized in pairs, the +datafiles and their corresponding journalfiles, e.g.: -``` +```sh datafile-1-0000000001.ndf journalfile-1-0000000001.njf datafile-1-0000000002.ndf @@ -22,21 +21,19 @@ journalfile-1-0000000003.njf ... ``` -They are located under their host's cache directory in the directory `./dbengine` -(e.g. for localhost the default location is `/var/cache/netdata/dbengine/*`). The higher -numbered filenames contain more recent metric data. The user can safely delete some pairs -of files when Netdata is stopped to manually free up some space. +They are located under their host's cache directory in the directory `./dbengine` (e.g. for localhost the default +location is `/var/cache/netdata/dbengine/*`). The higher numbered filenames contain more recent metric data. The user +can safely delete some pairs of files when Netdata is stopped to manually free up some space. _Users should_ **back up** _their `./dbengine` folders if they consider this data to be important._ ## Configuration -There is one DB engine instance per Netdata host/node. That is, there is one `./dbengine` folder -per node, and all charts of `dbengine` memory mode in such a host share the same storage space -and DB engine instance memory state. You can select the memory mode for localhost by editing -netdata.conf and setting: +There is one DB engine instance per Netdata host/node. That is, there is one `./dbengine` folder per node, and all +charts of `dbengine` memory mode in such a host share the same storage space and DB engine instance memory state. You +can select the memory mode for localhost by editing netdata.conf and setting: -``` +```conf [global] memory mode = dbengine ``` @@ -44,110 +41,157 @@ netdata.conf and setting: For setting the memory mode for the rest of the nodes you should look at [streaming](../../streaming/). -The `history` configuration option is meaningless for `memory mode = dbengine` and is ignored -for any metrics being stored in the DB engine. +The `history` configuration option is meaningless for `memory mode = dbengine` and is ignored for any metrics being +stored in the DB engine. -All DB engine instances, for localhost and all other streaming recipient nodes inherit their -configuration from `netdata.conf`: +All DB engine instances, for localhost and all other streaming recipient nodes inherit their configuration from +`netdata.conf`: -``` +```conf [global] page cache size = 32 dbengine disk space = 256 ``` -The above values are the default and minimum values for Page Cache size and DB engine disk space -quota. Both numbers are in **MiB**. All DB engine instances will allocate the configured resources -separately. +The above values are the default and minimum values for Page Cache size and DB engine disk space quota. Both numbers are +in **MiB**. All DB engine instances will allocate the configured resources separately. -The `page cache size` option determines the amount of RAM in **MiB** that is dedicated to caching -Netdata metric values themselves. +The `page cache size` option determines the amount of RAM in **MiB** that is dedicated to caching Netdata metric values +themselves as far as queries are concerned. The total page cache size will be greater since data collection itself will +consume additional memory as is described in the [Memory requirements](#memory-requirements) section. -The `dbengine disk space` option determines the amount of disk space in **MiB** that is dedicated -to storing Netdata metric values and all related metadata describing them. +The `dbengine disk space` option determines the amount of disk space in **MiB** that is dedicated to storing Netdata +metric values and all related metadata describing them. ## Operation -The DB engine stores chart metric values in 4096-byte pages in memory. Each chart dimension gets -its own page to store consecutive values generated from the data collectors. Those pages comprise -the **Page Cache**. +The DB engine stores chart metric values in 4096-byte pages in memory. Each chart dimension gets its own page to store +consecutive values generated from the data collectors. Those pages comprise the **Page Cache**. -When those pages fill up they are slowly compressed and flushed to disk. -It can take `4096 / 4 = 1024 seconds = 17 minutes`, for a chart dimension that is being collected -every 1 second, to fill a page. Pages can be cut short when we stop Netdata or the DB engine -instance so as to not lose the data. When we query the DB engine for data we trigger disk read -I/O requests that fill the Page Cache with the requested pages and potentially evict cold -(not recently used) pages. +When those pages fill up they are slowly compressed and flushed to disk. It can take `4096 / 4 = 1024 seconds = 17 +minutes`, for a chart dimension that is being collected every 1 second, to fill a page. Pages can be cut short when we +stop Netdata or the DB engine instance so as to not lose the data. When we query the DB engine for data we trigger disk +read I/O requests that fill the Page Cache with the requested pages and potentially evict cold (not recently used) +pages. -When the disk quota is exceeded the oldest values are removed from the DB engine at real time, by -automatically deleting the oldest datafile and journalfile pair. Any corresponding pages residing -in the Page Cache will also be invalidated and removed. The DB engine logic will try to maintain -between 10 and 20 file pairs at any point in time. +When the disk quota is exceeded the oldest values are removed from the DB engine at real time, by automatically deleting +the oldest datafile and journalfile pair. Any corresponding pages residing in the Page Cache will also be invalidated +and removed. The DB engine logic will try to maintain between 10 and 20 file pairs at any point in time. -The Database Engine uses direct I/O to avoid polluting the OS filesystem caches and does not -generate excessive I/O traffic so as to create the minimum possible interference with other -applications. +The Database Engine uses direct I/O to avoid polluting the OS filesystem caches and does not generate excessive I/O +traffic so as to create the minimum possible interference with other applications. ## Memory requirements -Using memory mode `dbengine` we can overcome most memory restrictions and store a dataset that -is much larger than the available memory. +Using memory mode `dbengine` we can overcome most memory restrictions and store a dataset that is much larger than the +available memory. -There are explicit memory requirements **per** DB engine **instance**, meaning **per** Netdata -**node** (e.g. localhost and streaming recipient nodes): +There are explicit memory requirements **per** DB engine **instance**, meaning **per** Netdata **node** (e.g. localhost +and streaming recipient nodes): -- `page cache size` must be at least `#dimensions-being-collected x 4096 x 2` bytes. +- The total page cache memory footprint will be an additional `#dimensions-being-collected x 4096 x 2` bytes over what + the user configured with `page cache size`. - an additional `#pages-on-disk x 4096 x 0.03` bytes of RAM are allocated for metadata. - roughly speaking this is 3% of the uncompressed disk space taken by the DB files. - - for very highly compressible data (compression ratio > 90%) this RAM overhead - is comparable to the disk space footprint. + - for very highly compressible data (compression ratio > 90%) this RAM overhead is comparable to the disk space + footprint. -An important observation is that RAM usage depends on both the `page cache size` and the -`dbengine disk space` options. +An important observation is that RAM usage depends on both the `page cache size` and the `dbengine disk space` options. ## File descriptor requirements -The Database Engine may keep a **significant** amount of files open per instance (e.g. per streaming -slave or master server). When configuring your system you should make sure there are at least 50 -file descriptors available per `dbengine` instance. +The Database Engine may keep a **significant** amount of files open per instance (e.g. per streaming slave or master +server). When configuring your system you should make sure there are at least 50 file descriptors available per +`dbengine` instance. -Netdata allocates 25% of the available file descriptors to its Database Engine instances. This means that only 25% -of the file descriptors that are available to the Netdata service are accessible by dbengine instances. -You should take that into account when configuring your service -or system-wide file descriptor limits. You can roughly estimate that the Netdata service needs 2048 file -descriptors for every 10 streaming slave hosts when streaming is configured to use `memory mode = dbengine`. +Netdata allocates 25% of the available file descriptors to its Database Engine instances. This means that only 25% of +the file descriptors that are available to the Netdata service are accessible by dbengine instances. You should take +that into account when configuring your service or system-wide file descriptor limits. You can roughly estimate that the +Netdata service needs 2048 file descriptors for every 10 streaming slave hosts when streaming is configured to use +`memory mode = dbengine`. -If for example one wants to allocate 65536 file descriptors to the Netdata service on a systemd system -one needs to override the Netdata service by running `sudo systemctl edit netdata` and creating a -file with contents: +If for example one wants to allocate 65536 file descriptors to the Netdata service on a systemd system one needs to +override the Netdata service by running `sudo systemctl edit netdata` and creating a file with contents: -``` +```sh [Service] LimitNOFILE=65536 ``` For other types of services one can add the line: -``` +```sh ulimit -n 65536 ``` -at the beginning of the service file. Alternatively you can change the system-wide limits of the kernel by changing `/etc/sysctl.conf`. For linux that would be: +at the beginning of the service file. Alternatively you can change the system-wide limits of the kernel by changing + `/etc/sysctl.conf`. For linux that would be: -``` +```conf fs.file-max = 65536 ``` In FreeBSD and OS X you change the lines like this: -``` +```conf kern.maxfilesperproc=65536 kern.maxfiles=65536 ``` You can apply the settings by running `sysctl -p` or by rebooting. +## Evaluation + +We have evaluated the performance of the `dbengine` API that the netdata daemon uses internally. This is **not** the +web API of netdata. Our benchmarks ran on a **single** `dbengine` instance, multiple of which can be running in a +netdata master server. We used a server with an AMD Ryzen Threadripper 2950X 16-Core Processor and 2 disk drives, a +Seagate Constellation ES.3 2TB magnetic HDD and a SAMSUNG MZQLB960HAJR-00007 960GB NAND Flash SSD. + +For our workload, we defined 32 charts with 128 metrics each, giving us a total of 4096 metrics. We defined 1 worker +thread per chart (32 threads) that generates new data points with a data generation interval of 1 second. The time axis +of the time-series is emulated and accelerated so that the worker threads can generate as many data points as possible +without delays. + +We also defined 32 worker threads that perform queries on random metrics with semi-random time ranges. The +starting time of the query is randomly selected between the beginning of the time-series and the time of the latest data +point. The ending time is randomly selected between 1 second and 1 hour after the starting time. The pseudo-random +numbers are generated with a uniform distribution. + +The data are written to the database at the same time as they are read from it. This is a concurrent read/write mixed +workload with a duration of 60 seconds. The faster `dbengine` runs, the bigger the dataset size becomes since more +data points will be generated. We set a page cache size of 64MiB for the two disk-bound scenarios. This way, the dataset +size of the metric data is much bigger than the RAM that is being used for caching so as to trigger I/O requests most +of the time. In our final scenario, we set the page cache size to 16 GiB. That way, the dataset fits in the page cache +so as to avoid all disk bottlenecks. + +The reported numbers are the following: + +| device | page cache | dataset | reads/sec | writes/sec | +| :---: | :---: | ---: | ---: | ---: | +| HDD | 64 MiB | 4.1 GiB | 813K | 18.0M | +| SSD | 64 MiB | 9.8 GiB | 1.7M | 43.0M | +| N/A | 16 GiB | 6.8 GiB |118.2M | 30.2M | + +where "reads/sec" is the number of metric data points being read from the database via its API per second and +"writes/sec" is the number of metric data points being written to the database per second. + +Notice that the HDD numbers are pretty high and not much slower than the SSD numbers. This is thanks to the database +engine design being optimized for rotating media. In the database engine disk I/O requests are: + +- asynchronous to mask the high I/O latency of HDDs. +- mostly large to reduce the amount of HDD seeking time. +- mostly sequential to reduce the amount of HDD seeking time. +- compressed to reduce the amount of required throughput. + +As a result, the HDD is not thousands of times slower than the SSD, which is typical for other workloads. + +An interesting observation to make is that the CPU-bound run (16 GiB page cache) generates fewer data than the SSD run +(6.8 GiB vs 9.8 GiB). The reason is that the 32 reader threads in the SSD scenario are more frequently blocked by I/O, +and generate a read load of 1.7M/sec, whereas in the CPU-bound scenario the read load is 70 times higher at 118M/sec. +Consequently, there is a significant degree of interference by the reader threads, that slow down the writer threads. +This is also possible because the interference effects are greater than the SSD impact on data generation throughput. + [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdatabase%2Fengine%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index 457bcb218..a419ba981 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -209,9 +209,31 @@ static void pg_cache_release_pages(struct rrdengine_instance *ctx, unsigned numb pg_cache_release_pages_unsafe(ctx, number); uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); } + +/* + * This function returns the maximum number of pages allowed in the page cache. + * The caller must hold the page cache lock. + */ +static inline unsigned long pg_cache_hard_limit(struct rrdengine_instance *ctx) +{ + /* it's twice the number of producers since we pin 2 pages per producer */ + return ctx->max_cache_pages + 2 * (unsigned long)ctx->stats.metric_API_producers; +} + +/* + * This function returns the low watermark number of pages in the page cache. The page cache should strive to keep the + * number of pages below that number. + * The caller must hold the page cache lock. + */ +static inline unsigned long pg_cache_soft_limit(struct rrdengine_instance *ctx) +{ + /* it's twice the number of producers since we pin 2 pages per producer */ + return ctx->cache_pages_low_watermark + 2 * (unsigned long)ctx->stats.metric_API_producers; +} + /* * This function will block until it reserves #number populated pages. - * It will trigger evictions or dirty page flushing if the ctx->max_cache_pages limit is hit. + * It will trigger evictions or dirty page flushing if the pg_cache_hard_limit() limit is hit. */ static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned number) { @@ -223,10 +245,10 @@ static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned numb assert(number < ctx->max_cache_pages); uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - if (pg_cache->populated_pages + number >= ctx->max_cache_pages + 1) + if (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) debug(D_RRDENGINE, "==Page cache full. Reserving %u pages.==", number); - while (pg_cache->populated_pages + number >= ctx->max_cache_pages + 1) { + while (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) { if (!pg_cache_try_evict_one_page_unsafe(ctx)) { /* failed to evict */ @@ -260,7 +282,7 @@ static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned numb /* * This function will attempt to reserve #number populated pages. - * It may trigger evictions if the ctx->cache_pages_low_watermark limit is hit. + * It may trigger evictions if the pg_cache_soft_limit() limit is hit. * Returns 0 on failure and 1 on success. */ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned number) @@ -272,7 +294,7 @@ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned n assert(number < ctx->max_cache_pages); uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); - if (pg_cache->populated_pages + number >= ctx->cache_pages_low_watermark + 1) { + if (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1) { debug(D_RRDENGINE, "==Page cache full. Trying to reserve %u pages.==", number); @@ -280,11 +302,11 @@ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned n if (!pg_cache_try_evict_one_page_unsafe(ctx)) break; ++count; - } while (pg_cache->populated_pages + number >= ctx->cache_pages_low_watermark + 1); + } while (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1); debug(D_RRDENGINE, "Evicted %u pages.", count); } - if (pg_cache->populated_pages + number < ctx->max_cache_pages + 1) { + if (pg_cache->populated_pages + number < pg_cache_hard_limit(ctx) + 1) { pg_cache->populated_pages += number; ret = 1; /* success */ } diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index d464211e9..ab1a5c1ad 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -183,4 +183,42 @@ extern void free_page_cache(struct rrdengine_instance *ctx); extern void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr); extern void pg_cache_update_metric_times(struct pg_cache_page_index *page_index); +static inline void + pg_cache_atomic_get_pg_info(struct rrdeng_page_descr *descr, usec_t *end_timep, uint32_t *page_lengthp) +{ + usec_t end_time, old_end_time; + uint32_t page_length; + + if (NULL == descr->extent) { + /* this page is currently being modified, get consistent info locklessly */ + do { + end_time = descr->end_time; + __sync_synchronize(); + old_end_time = end_time; + page_length = descr->page_length; + __sync_synchronize(); + end_time = descr->end_time; + __sync_synchronize(); + } while ((end_time != old_end_time || (end_time & 1) != 0)); + + *end_timep = end_time; + *page_lengthp = page_length; + } else { + *end_timep = descr->end_time; + *page_lengthp = descr->page_length; + } +} + +/* The caller must hold a reference to the page and must have already set the new data */ +static inline void pg_cache_atomic_set_pg_info(struct rrdeng_page_descr *descr, usec_t end_time, uint32_t page_length) +{ + assert(!(end_time & 1)); + __sync_synchronize(); + descr->end_time |= 1; /* mark start of uncertainty period by adding 1 microsecond */ + __sync_synchronize(); + descr->page_length = page_length; + __sync_synchronize(); + descr->end_time = end_time; /* mark end of uncertainty period */ +} + #endif /* NETDATA_PAGECACHE_H */ diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index 36d917541..896d71f16 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -5,6 +5,8 @@ rrdeng_stats_t global_io_errors = 0; rrdeng_stats_t global_fs_errors = 0; +rrdeng_stats_t global_pg_cache_warnings = 0; +rrdeng_stats_t global_pg_cache_errors = 0; rrdeng_stats_t rrdeng_reserved_file_descriptors = 0; void sanity_check(void) @@ -251,13 +253,10 @@ void flush_pages_cb(uv_fs_t* req) { struct rrdengine_worker_config* wc = req->loop->data; struct rrdengine_instance *ctx = wc->ctx; - struct page_cache *pg_cache = &ctx->pg_cache; struct extent_io_descriptor *xt_io_descr; struct rrdeng_page_descr *descr; struct page_cache_descr *pg_cache_descr; - int ret; unsigned i, count; - Word_t commit_id; xt_io_descr = req->data; if (req->result < 0) { @@ -277,13 +276,6 @@ void flush_pages_cb(uv_fs_t* req) /* care, we don't hold the descriptor mutex */ descr = xt_io_descr->descr_array[i]; - uv_rwlock_wrlock(&pg_cache->commited_page_index.lock); - commit_id = xt_io_descr->descr_commit_idx_array[i]; - ret = JudyLDel(&pg_cache->commited_page_index.JudyL_array, commit_id, PJE0); - assert(1 == ret); - --pg_cache->commited_page_index.nr_commited_pages; - uv_rwlock_wrunlock(&pg_cache->commited_page_index.lock); - pg_cache_replaceQ_insert(ctx, descr); rrdeng_page_descr_mutex_lock(ctx, descr); @@ -331,7 +323,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct if (force) { debug(D_RRDENGINE, "Asynchronous flushing of extent has been forced by page pressure."); } - uv_rwlock_rdlock(&pg_cache->commited_page_index.lock); + uv_rwlock_wrlock(&pg_cache->commited_page_index.lock); for (Index = 0, count = 0, uncompressed_payload_length = 0, PValue = JudyLFirst(pg_cache->commited_page_index.JudyL_array, &Index, PJE0), descr = unlikely(NULL == PValue) ? NULL : *PValue ; @@ -340,11 +332,15 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct PValue = JudyLNext(pg_cache->commited_page_index.JudyL_array, &Index, PJE0), descr = unlikely(NULL == PValue) ? NULL : *PValue) { + uint8_t page_write_pending; + assert(0 != descr->page_length); + page_write_pending = 0; rrdeng_page_descr_mutex_lock(ctx, descr); pg_cache_descr = descr->pg_cache_descr; if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING)) { + page_write_pending = 1; /* care, no reference being held */ pg_cache_descr->flags |= RRD_PAGE_WRITE_PENDING; uncompressed_payload_length += descr->page_length; @@ -352,8 +348,14 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct eligible_pages[count++] = descr; } rrdeng_page_descr_mutex_unlock(ctx, descr); + + if (page_write_pending) { + ret = JudyLDel(&pg_cache->commited_page_index.JudyL_array, Index, PJE0); + assert(1 == ret); + --pg_cache->commited_page_index.nr_commited_pages; + } } - uv_rwlock_rdunlock(&pg_cache->commited_page_index.lock); + uv_rwlock_wrunlock(&pg_cache->commited_page_index.lock); if (!count) { debug(D_RRDENGINE, "%s: no pages eligible for flushing.", __func__); @@ -813,47 +815,6 @@ error_after_loop_init: complete(&ctx->rrdengine_completion); } - -#define NR_PAGES (256) -static void basic_functional_test(struct rrdengine_instance *ctx) -{ - int i, j, failed_validations; - uuid_t uuid[NR_PAGES]; - void *buf; - struct rrdeng_page_descr *handle[NR_PAGES]; - char uuid_str[UUID_STR_LEN]; - char backup[NR_PAGES][UUID_STR_LEN * 100]; /* backup storage for page data verification */ - - for (i = 0 ; i < NR_PAGES ; ++i) { - uuid_generate(uuid[i]); - uuid_unparse_lower(uuid[i], uuid_str); -// fprintf(stderr, "Generated uuid[%d]=%s\n", i, uuid_str); - buf = rrdeng_create_page(ctx, &uuid[i], &handle[i]); - /* Each page contains 10 times its own UUID stringified */ - for (j = 0 ; j < 100 ; ++j) { - strcpy(buf + UUID_STR_LEN * j, uuid_str); - strcpy(backup[i] + UUID_STR_LEN * j, uuid_str); - } - rrdeng_commit_page(ctx, handle[i], (Word_t)i); - } - fprintf(stderr, "\n********** CREATED %d METRIC PAGES ***********\n\n", NR_PAGES); - failed_validations = 0; - for (i = 0 ; i < NR_PAGES ; ++i) { - buf = rrdeng_get_latest_page(ctx, &uuid[i], (void **)&handle[i]); - if (NULL == buf) { - ++failed_validations; - fprintf(stderr, "Page %d was LOST.\n", i); - } - if (memcmp(backup[i], buf, UUID_STR_LEN * 100)) { - ++failed_validations; - fprintf(stderr, "Page %d data comparison with backup FAILED validation.\n", i); - } - rrdeng_put_page(ctx, handle[i]); - } - fprintf(stderr, "\n********** CORRECTLY VALIDATED %d/%d METRIC PAGES ***********\n\n", - NR_PAGES - failed_validations, NR_PAGES); - -} /* C entry point for development purposes * make "LDFLAGS=-errdengine_main" */ @@ -866,8 +827,6 @@ void rrdengine_main(void) if (ret) { exit(ret); } - basic_functional_test(ctx); - rrdeng_exit(ctx); fprintf(stderr, "Hello world!"); exit(0); diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index bf373f31c..5fa23d8fd 100644 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -4,7 +4,7 @@ /* Default global database instance */ static struct rrdengine_instance default_global_ctx; -int default_rrdeng_page_cache_mb = RRDENG_MIN_PAGE_CACHE_SIZE_MB; +int default_rrdeng_page_cache_mb = 32; int default_rrdeng_disk_quota_mb = RRDENG_MIN_DISK_SPACE_MB; /* @@ -95,9 +95,8 @@ void rrdeng_store_metric_flush_current_page(RRDDIM *rd) if (likely(descr->page_length)) { int ret, page_is_empty; -#ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); -#endif + if (handle->prev_descr) { /* unpin old second page */ pg_cache_put(ctx, handle->prev_descr); @@ -185,16 +184,14 @@ void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number n } page = descr->pg_cache_descr->page; page[descr->page_length / sizeof(number)] = number; - descr->end_time = point_in_time; - descr->page_length += sizeof(number); + pg_cache_atomic_set_pg_info(descr, point_in_time, descr->page_length + sizeof(number)); + if (perfect_page_alignment) rd->rrdset->rrddim_page_alignment = descr->page_length; if (unlikely(INVALID_TIME == descr->start_time)) { descr->start_time = point_in_time; -#ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_producers, 1); -#endif pg_cache_insert(ctx, handle->page_index, descr); } else { pg_cache_add_new_metric_time(handle->page_index, descr); @@ -312,8 +309,9 @@ unsigned rrdeng_variable_step_boundaries(RRDSET *st, time_t start_time, time_t e curr = &page_info_array[i]; *pginfo_to_points(curr) = 0; /* initialize to invalid page */ *pginfo_to_dt(curr) = 0; /* no known data collection interval yet */ - if (unlikely(INVALID_TIME == curr->start_time || INVALID_TIME == curr->end_time)) { - info("Ignoring page with invalid timestamp."); + if (unlikely(INVALID_TIME == curr->start_time || INVALID_TIME == curr->end_time || + curr->end_time < curr->start_time)) { + info("Ignoring page with invalid timestamps."); prev = old_prev; continue; } @@ -366,7 +364,7 @@ unsigned rrdeng_variable_step_boundaries(RRDSET *st, time_t start_time, time_t e continue; } - if (unlikely(0 == dt)) { /* unknown data collection interval */ + if (unlikely(0 == *pginfo_to_dt(curr))) { /* unknown data collection interval */ assert(1 == page_points); if (likely(NULL != prev)) { /* get interval from previous page */ @@ -454,7 +452,8 @@ storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle struct rrdeng_page_descr *descr; storage_number *page, ret; unsigned position, entries; - usec_t next_page_time, current_position_time; + usec_t next_page_time, current_position_time, page_end_time; + uint32_t page_length; handle = &rrdimm_handle->rrdeng; if (unlikely(INVALID_TIME == handle->next_page_time)) { @@ -464,15 +463,17 @@ storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle if (unlikely(NULL == (descr = handle->descr))) { /* it's the first call */ next_page_time = handle->next_page_time * USEC_PER_SEC; + } else { + pg_cache_atomic_get_pg_info(descr, &page_end_time, &page_length); } position = handle->position + 1; if (unlikely(NULL == descr || - position >= (descr->page_length / sizeof(storage_number)))) { + position >= (page_length / sizeof(storage_number)))) { /* We need to get a new page */ if (descr) { /* Drop old page's reference */ - handle->next_page_time = (descr->end_time / USEC_PER_SEC) + 1; + handle->next_page_time = (page_end_time / USEC_PER_SEC) + 1; if (unlikely(handle->next_page_time > rrdimm_handle->end_time)) { goto no_more_metrics; } @@ -492,26 +493,27 @@ storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, 1); #endif handle->descr = descr; + pg_cache_atomic_get_pg_info(descr, &page_end_time, &page_length); if (unlikely(INVALID_TIME == descr->start_time || - INVALID_TIME == descr->end_time)) { + INVALID_TIME == page_end_time)) { goto no_more_metrics; } - if (unlikely(descr->start_time != descr->end_time && next_page_time > descr->start_time)) { + if (unlikely(descr->start_time != page_end_time && next_page_time > descr->start_time)) { /* we're in the middle of the page somewhere */ - entries = descr->page_length / sizeof(storage_number); - position = ((uint64_t)(next_page_time - descr->start_time)) * entries / - (descr->end_time - descr->start_time + 1); + entries = page_length / sizeof(storage_number); + position = ((uint64_t)(next_page_time - descr->start_time)) * (entries - 1) / + (page_end_time - descr->start_time); } else { position = 0; } } page = descr->pg_cache_descr->page; ret = page[position]; - entries = descr->page_length / sizeof(storage_number); + entries = page_length / sizeof(storage_number); if (entries > 1) { usec_t dt; - dt = (descr->end_time - descr->start_time) / (entries - 1); + dt = (page_end_time - descr->start_time) / (entries - 1); current_position_time = descr->start_time + position * dt; } else { current_position_time = descr->start_time; diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 9b1ab1874..c876705e4 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -5,7 +5,7 @@ #include "rrdengine.h" -#define RRDENG_MIN_PAGE_CACHE_SIZE_MB (32) +#define RRDENG_MIN_PAGE_CACHE_SIZE_MB (8) #define RRDENG_MIN_DISK_SPACE_MB (256) #define RRDENG_NR_STATS (33) diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c index 96504b275..1a04dc2a4 100644 --- a/database/engine/rrdenginelib.c +++ b/database/engine/rrdenginelib.c @@ -8,7 +8,7 @@ void print_page_cache_descr(struct rrdeng_page_descr *descr) { struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; char uuid_str[UUID_STR_LEN]; - char str[BUFSIZE]; + char str[BUFSIZE + 1]; int pos = 0; uuid_unparse_lower(*descr->id, uuid_str); @@ -31,7 +31,7 @@ void print_page_cache_descr(struct rrdeng_page_descr *descr) void print_page_descr(struct rrdeng_page_descr *descr) { char uuid_str[UUID_STR_LEN]; - char str[BUFSIZE]; + char str[BUFSIZE + 1]; int pos = 0; uuid_unparse_lower(*descr->id, uuid_str); diff --git a/database/rrd.c b/database/rrd.c index 31ad3f07e..dcab65189 100644 --- a/database/rrd.c +++ b/database/rrd.c @@ -15,7 +15,11 @@ int rrd_delete_unupdated_dimensions = 0; int default_rrd_update_every = UPDATE_EVERY; int default_rrd_history_entries = RRD_DEFAULT_HISTORY_ENTRIES; +#ifdef ENABLE_DBENGINE +RRD_MEMORY_MODE default_rrd_memory_mode = RRD_MEMORY_MODE_DBENGINE; +#else RRD_MEMORY_MODE default_rrd_memory_mode = RRD_MEMORY_MODE_SAVE; +#endif int gap_when_lost_iterations_above = 1; diff --git a/database/rrd.h b/database/rrd.h index 39e881252..e335f0dd0 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -697,6 +697,7 @@ struct rrdhost { // RRDCALCs may be linked to charts at any point // (charts may or may not exist when these are loaded) RRDCALC *alarms; + RRDCALC *alarms_with_foreach; avl_tree_lock alarms_idx_health_log; avl_tree_lock alarms_idx_name; @@ -709,6 +710,7 @@ struct rrdhost { // these are used to create alarms when charts // are created or renamed, that match them RRDCALCTEMPLATE *templates; + RRDCALCTEMPLATE *alarms_template_with_foreach; // ------------------------------------------------------------------------ @@ -1008,6 +1010,7 @@ static inline time_t rrdset_slot2time(RRDSET *st, size_t slot) { // ---------------------------------------------------------------------------- // RRD DIMENSION functions +extern void rrdcalc_link_to_rrddim(RRDDIM *rd, RRDSET *st, RRDHOST *host); extern RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collected_number multiplier, collected_number divisor, RRD_ALGORITHM algorithm, RRD_MEMORY_MODE memory_mode); #define rrddim_add(st, id, name, multiplier, divisor, algorithm) rrddim_add_custom(st, id, name, multiplier, divisor, algorithm, (st)->rrd_memory_mode) diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 908fc2ebf..9f16ce374 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -255,6 +255,53 @@ inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const ch return host->health_log.next_alarm_id++; } +/** + * Alarm name with dimension + * + * Change the name of the current alarm appending a new diagram. + * + * @param name the alarm name + * @param namelen is the length of the previous vector. + * @param dim the dimension of the chart. + * @param dimlen is the length of the previous vector. + * + * @return It returns the new name on success and the old otherwise + */ +char *alarm_name_with_dim(char *name, size_t namelen, const char *dim, size_t dimlen) { + char *newname,*move; + + newname = malloc(namelen + dimlen + 2); + if(newname) { + move = newname; + memcpy(move, name, namelen); + move += namelen; + + *move++ = '_'; + memcpy(move, dim, dimlen); + move += dimlen; + *move = '\0'; + } else { + newname = name; + } + + return newname; +} + +/** + * Remove pipe comma + * + * Remove the pipes and commas converting to space. + * + * @param str the string to change. + */ +void dimension_remove_pipe_comma(char *str) { + while(*str) { + if(*str == '|' || *str == ',') *str = ' '; + + str++; + } +} + inline void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc) { rrdhost_check_rdlock(host); @@ -282,24 +329,39 @@ inline void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc) { rc->critical->rrdcalc = rc; } - // link it to the host - if(likely(host->alarms)) { - // append it - RRDCALC *t; - for(t = host->alarms; t && t->next ; t = t->next) ; - t->next = rc; - } - else { - host->alarms = rc; - } + if(!rc->foreachdim) { + // link it to the host alarms list + if(likely(host->alarms)) { + // append it + RRDCALC *t; + for(t = host->alarms; t && t->next ; t = t->next) ; + t->next = rc; + } + else { + host->alarms = rc; + } - // link it to its chart - RRDSET *st; - rrdset_foreach_read(st, host) { - if(rrdcalc_is_matching_this_rrdset(rc, st)) { - rrdsetcalc_link(st, rc); - break; + // link it to its chart + RRDSET *st; + rrdset_foreach_read(st, host) { + if(rrdcalc_is_matching_this_rrdset(rc, st)) { + rrdsetcalc_link(st, rc); + break; + } + } + } else { + //link it case there is a foreach + if(likely(host->alarms_with_foreach)) { + // append it + RRDCALC *t; + for(t = host->alarms_with_foreach; t && t->next ; t = t->next) ; + t->next = rc; } + else { + host->alarms_with_foreach = rc; + } + + //I am not linking this alarm direct to the host here, this will be done when the children is created } } @@ -311,13 +373,19 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, RRDCALC *rc = callocz(1, sizeof(RRDCALC)); rc->next_event_id = 1; - rc->id = rrdcalc_get_unique_id(host, chart, rt->name, &rc->next_event_id); rc->name = strdupz(rt->name); rc->hash = simple_hash(rc->name); rc->chart = strdupz(chart); rc->hash_chart = simple_hash(rc->chart); + rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); + if(rt->dimensions) rc->dimensions = strdupz(rt->dimensions); + if(rt->foreachdim) { + rc->foreachdim = strdupz(rt->foreachdim); + rc->spdim = health_pattern_from_foreach(rc->foreachdim); + } + rc->foreachcounter = rt->foreachcounter; rc->green = rt->green; rc->red = rt->red; @@ -361,7 +429,7 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source); } - debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", + debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", (rc->chart)?rc->chart:"NOCHART", rc->name, (rc->exec)?rc->exec:"DEFAULT", @@ -373,6 +441,7 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, rc->before, rc->options, (rc->dimensions)?rc->dimensions:"NONE", + (rc->foreachdim)?rc->foreachdim:"NONE", rc->update_every, (rc->calculation)?rc->calculation->parsed_as:"NONE", (rc->warning)?rc->warning->parsed_as:"NONE", @@ -387,18 +456,94 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, ); rrdcalc_add_to_host(host, rc); - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc); - if (rdcmp != rc) { - error("Cannot insert the alarm index ID %s",rc->name); + if(!rt->foreachdim) { + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc); + if (rdcmp != rc) { + error("Cannot insert the alarm index ID %s",rc->name); + } } return rc; } +/** + * Create from RRDCALC + * + * Create a new alarm using another alarm as template. + * + * @param rc is the alarm that will be used as source + * @param host is the host structure. + * @param name is the newest chart name. + * @param dimension is the current dimension + * @param foreachdim the whole list of dimension + * + * @return it returns the new alarm changed. + */ +inline RRDCALC *rrdcalc_create_from_rrdcalc(RRDCALC *rc, RRDHOST *host, const char *name, const char *dimension) { + RRDCALC *newrc = callocz(1, sizeof(RRDCALC)); + + newrc->next_event_id = 1; + newrc->id = rrdcalc_get_unique_id(host, rc->chart, name, &rc->next_event_id); + newrc->name = (char *)name; + newrc->hash = simple_hash(newrc->name); + newrc->chart = strdupz(rc->chart); + newrc->hash_chart = simple_hash(rc->chart); + + newrc->dimensions = strdupz(dimension); + newrc->foreachdim = NULL; + rc->foreachcounter++; + newrc->foreachcounter = rc->foreachcounter; + + newrc->green = rc->green; + newrc->red = rc->red; + newrc->value = NAN; + newrc->old_value = NAN; + + newrc->delay_up_duration = rc->delay_up_duration; + newrc->delay_down_duration = rc->delay_down_duration; + newrc->delay_max_duration = rc->delay_max_duration; + newrc->delay_multiplier = rc->delay_multiplier; + + newrc->last_repeat = 0; + newrc->warn_repeat_every = rc->warn_repeat_every; + newrc->crit_repeat_every = rc->crit_repeat_every; + + newrc->group = rc->group; + newrc->after = rc->after; + newrc->before = rc->before; + newrc->update_every = rc->update_every; + newrc->options = rc->options; + + if(rc->exec) newrc->exec = strdupz(rc->exec); + if(rc->recipient) newrc->recipient = strdupz(rc->recipient); + if(rc->source) newrc->source = strdupz(rc->source); + if(rc->units) newrc->units = strdupz(rc->units); + if(rc->info) newrc->info = strdupz(rc->info); + + if(rc->calculation) { + newrc->calculation = expression_parse(rc->calculation->source, NULL, NULL); + if(!newrc->calculation) + error("Health alarm '%s.%s': failed to parse calculation expression '%s'", rc->chart, rc->name, rc->calculation->source); + } + + if(rc->warning) { + newrc->warning = expression_parse(rc->warning->source, NULL, NULL); + if(!newrc->warning) + error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", rc->chart, rc->name, rc->warning->source); + } + + if(rc->critical) { + newrc->critical = expression_parse(rc->critical->source, NULL, NULL); + if(!newrc->critical) + error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", rc->chart, rc->name, rc->critical->source); + } + + return newrc; +} + void rrdcalc_free(RRDCALC *rc) { if(unlikely(!rc)) return; - expression_free(rc->calculation); expression_free(rc->warning); expression_free(rc->critical); @@ -407,11 +552,13 @@ void rrdcalc_free(RRDCALC *rc) { freez(rc->chart); freez(rc->family); freez(rc->dimensions); + freez(rc->foreachdim); freez(rc->exec); freez(rc->recipient); freez(rc->source); freez(rc->units); freez(rc->info); + simple_pattern_free(rc->spdim); freez(rc); } @@ -437,21 +584,19 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) { error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname); } - if (rc) { - RRDCALC *rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_health_log, (avl *)rc); - if (rdcmp) { - rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc); - if (!rdcmp) { - error("Cannot remove the health alarm index from health_log"); - } + RRDCALC *rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_health_log, (avl *)rc); + if (rdcmp) { + rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc); + if (!rdcmp) { + error("Cannot remove the health alarm index from health_log"); } + } - rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_name, (avl *)rc); - if (rdcmp) { - rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc); - if (!rdcmp) { - error("Cannot remove the health alarm index from idx_name"); - } + rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_name, (avl *)rc); + if (rdcmp) { + rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc); + if (!rdcmp) { + error("Cannot remove the health alarm index from idx_name"); } } diff --git a/database/rrdcalc.h b/database/rrdcalc.h index f0c34b543..e0b632597 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -37,7 +37,7 @@ struct rrdcalc { uint32_t next_event_id; // the next event id that will be used for this alarm char *name; // the name of this alarm - uint32_t hash; + uint32_t hash; // the hash of the alarm name char *exec; // the command to execute when this alarm switches state char *recipient; // the recipient of the alarm (the first parameter to exec) @@ -59,7 +59,11 @@ struct rrdcalc { // database lookup settings char *dimensions; // the chart dimensions - RRDR_GROUPING group; // grouping method: average, max, etc. + char *foreachdim; // the group of dimensions that the `foreach` will be applied. + SIMPLE_PATTERN *spdim; // used if and only if there is a simple pattern for the chart. + int foreachcounter; // the number of alarms created with foreachdim, this also works as an id of the + // children + RRDR_GROUPING group; // grouping method: average, max, etc. int before; // ending point in time-series int after; // starting point in time-series uint32_t options; // calculation options @@ -148,7 +152,10 @@ extern void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc); extern int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name); extern uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id); extern RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart); +extern RRDCALC *rrdcalc_create_from_rrdcalc(RRDCALC *rc, RRDHOST *host, const char *name, const char *dimension); extern void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc); +extern void dimension_remove_pipe_comma(char *str); +extern char *alarm_name_with_dim(char *name, size_t namelen, const char *dim, size_t dimlen); static inline int rrdcalc_isrepeating(RRDCALC *rc) { if (unlikely(rc->warn_repeat_every > 0 || rc->crit_repeat_every > 0)) { diff --git a/database/rrdcalctemplate.c b/database/rrdcalctemplate.c index f2b9767c6..f7a085561 100644 --- a/database/rrdcalctemplate.c +++ b/database/rrdcalctemplate.c @@ -5,23 +5,35 @@ // ---------------------------------------------------------------------------- // RRDCALCTEMPLATE management +/** + * RRDCALC TEMPLATE LINK MATCHING + * + * @param rt is the template used to create the chart. + * @param st is the chart where the alarm will be attached. + */ +void rrdcalctemplate_link_matching_test(RRDCALCTEMPLATE *rt, RRDSET *st, RRDHOST *host ) { + if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context) + && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) { + RRDCALC *rc = rrdcalc_create_from_template(host, rt, st->id); + if(unlikely(!rc)) + info("Health tried to create alarm from template '%s' on chart '%s' of host '%s', but it failed", rt->name, st->id, host->hostname); +#ifdef NETDATA_INTERNAL_CHECKS + else if(rc->rrdset != st && !rc->foreachdim) //When we have a template with foreadhdim, the child will be added to the index late + error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id); +#endif + } +} void rrdcalctemplate_link_matching(RRDSET *st) { RRDHOST *host = st->rrdhost; RRDCALCTEMPLATE *rt; for(rt = host->templates; rt ; rt = rt->next) { - if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context) - && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) { - RRDCALC *rc = rrdcalc_create_from_template(host, rt, st->id); - if(unlikely(!rc)) - info("Health tried to create alarm from template '%s' on chart '%s' of host '%s', but it failed", rt->name, st->id, host->hostname); + rrdcalctemplate_link_matching_test(rt, st, host); + } -#ifdef NETDATA_INTERNAL_CHECKS - else if(rc->rrdset != st) - error("Health alarm '%s.%s' should be linked to chart '%s', but it is not", rc->chart?rc->chart:"NOCHART", rc->name, st->id); -#endif - } + for(rt = host->alarms_template_with_foreach; rt ; rt = rt->next) { + rrdcalctemplate_link_matching_test(rt, st, host); } } @@ -43,6 +55,8 @@ inline void rrdcalctemplate_free(RRDCALCTEMPLATE *rt) { freez(rt->units); freez(rt->info); freez(rt->dimensions); + freez(rt->foreachdim); + simple_pattern_free(rt->spdim); freez(rt); } @@ -67,5 +81,3 @@ inline void rrdcalctemplate_unlink_and_free(RRDHOST *host, RRDCALCTEMPLATE *rt) rrdcalctemplate_free(rt); } - - diff --git a/database/rrdcalctemplate.h b/database/rrdcalctemplate.h index 92bb4138e..676b4cf64 100644 --- a/database/rrdcalctemplate.h +++ b/database/rrdcalctemplate.h @@ -35,7 +35,11 @@ struct rrdcalctemplate { // database lookup settings char *dimensions; // the chart dimensions - RRDR_GROUPING group; // grouping method: average, max, etc. + char *foreachdim; // the group of dimensions that the lookup will be applied. + SIMPLE_PATTERN *spdim; // used if and only if there is a simple pattern for the chart. + int foreachcounter; // the number of alarms created with foreachdim, this also works as an id of the + // children + RRDR_GROUPING group; // grouping method: average, max, etc. int before; // ending point in time-series int after; // starting point in time-series uint32_t options; // calculation options @@ -70,5 +74,5 @@ extern void rrdcalctemplate_link_matching(RRDSET *st); extern void rrdcalctemplate_free(RRDCALCTEMPLATE *rt); extern void rrdcalctemplate_unlink_and_free(RRDHOST *host, RRDCALCTEMPLATE *rt); - +extern void rrdcalctemplate_create_alarms(RRDHOST *host, RRDCALCTEMPLATE *rt, RRDSET *st); #endif //NETDATA_RRDCALCTEMPLATE_H diff --git a/database/rrddim.c b/database/rrddim.c index 019ca34a1..8ab5a7237 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -156,7 +156,37 @@ static time_t rrddim_query_oldest_time(RRDDIM *rd) { // ---------------------------------------------------------------------------- // RRDDIM create a dimension +void rrdcalc_link_to_rrddim(RRDDIM *rd, RRDSET *st, RRDHOST *host) { + RRDCALC *rrdc; + for (rrdc = host->alarms_with_foreach; rrdc ; rrdc = rrdc->next) { + if (simple_pattern_matches(rrdc->spdim, rd->id) || simple_pattern_matches(rrdc->spdim, rd->name)) { + if (!strcmp(rrdc->chart, st->name)) { + char *usename = alarm_name_with_dim(rrdc->name, strlen(rrdc->name), rd->name, strlen(rd->name)); + if (usename) { + if(rrdcalc_exists(host, st->name, usename, 0, 0)){ + freez(usename); + continue; + } + + RRDCALC *child = rrdcalc_create_from_rrdcalc(rrdc, host, usename, rd->name); + if (child) { + rrdcalc_add_to_host(host, child); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)child); + if (rdcmp != child) { + error("Cannot insert the alarm index ID %s",child->name); + } + } else { + error("Cannot allocate a new alarm."); + rrdc->foreachcounter--; + } + } + } + } + } +} + RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collected_number multiplier, collected_number divisor, RRD_ALGORITHM algorithm, RRD_MEMORY_MODE memory_mode) { + RRDHOST *host = st->rrdhost; rrdset_wrlock(st); rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); @@ -175,7 +205,6 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte return rd; } - RRDHOST *host = st->rrdhost; char filename[FILENAME_MAX + 1]; char fullfilename[FILENAME_MAX + 1]; @@ -371,7 +400,28 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte if(unlikely(rrddim_index_add(st, rd) != rd)) error("RRDDIM: INTERNAL ERROR: attempt to index duplicate dimension '%s' on chart '%s'", rd->id, st->id); + if (host->alarms_with_foreach || host->alarms_template_with_foreach) { + int count = 0; + int hostlocked; + for (count = 0 ; count < 5 ; count++) { + hostlocked = netdata_rwlock_trywrlock(&host->rrdhost_rwlock); + if (!hostlocked) { + rrdcalc_link_to_rrddim(rd, st, host); + rrdhost_unlock(host); + break; + } else if (hostlocked != EBUSY) { + error("Cannot lock host to create an alarm for the dimension."); + } + usleep(200000); + } + + if (count == 5) { + error("Failed to create an alarm for dimension %s of chart %s 5 times. Skipping alarm." + , rd->name, st->name); + } + } rrdset_unlock(st); + return(rd); } diff --git a/database/rrdhost.c b/database/rrdhost.c index d6252d206..9075787b0 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -598,9 +598,23 @@ void rrdhost_free(RRDHOST *host) { while(host->alarms) rrdcalc_unlink_and_free(host, host->alarms); + RRDCALC *rc,*nc; + for(rc = host->alarms_with_foreach; rc ; rc = nc) { + nc = rc->next; + rrdcalc_free(rc); + } + host->alarms_with_foreach = NULL; + while(host->templates) rrdcalctemplate_unlink_and_free(host, host->templates); + RRDCALCTEMPLATE *rt,*next; + for(rt = host->alarms_template_with_foreach; rt ; rt = next) { + next = rt->next; + rrdcalctemplate_free(rt); + } + host->alarms_template_with_foreach = NULL; + debug(D_RRD_CALLS, "RRDHOST: Cleaning up remaining host variables for host '%s'", host->hostname); rrdvar_free_remaining_variables(host, &host->rrdvar_root_index); diff --git a/database/rrdset.c b/database/rrdset.c index f8962b2fb..26df8d737 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -150,7 +150,7 @@ int rrdset_set_name(RRDSET *st, const char *name) { rrdset_strncpyz_name(b, n, CONFIG_MAX_VALUE); if(rrdset_index_find_name(host, b, 0)) { - error("RRDSET: chart name '%s' on host '%s' already exists.", b, host->hostname); + info("RRDSET: chart name '%s' on host '%s' already exists.", b, host->hostname); return 0; } |