diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-05-08 16:27:04 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-05-08 16:27:04 +0000 |
commit | a836a244a3d2bdd4da1ee2641e3e957850668cea (patch) | |
tree | cb87c75b3677fab7144f868435243f864048a1e6 /database | |
parent | Adding upstream version 1.38.1. (diff) | |
download | netdata-a836a244a3d2bdd4da1ee2641e3e957850668cea.tar.xz netdata-a836a244a3d2bdd4da1ee2641e3e957850668cea.zip |
Adding upstream version 1.39.0.upstream/1.39.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'database')
63 files changed, 8331 insertions, 6723 deletions
diff --git a/database/Makefile.am b/database/Makefile.am index dc87e61b..21b4896d 100644 --- a/database/Makefile.am +++ b/database/Makefile.am @@ -7,6 +7,7 @@ SUBDIRS = \ engine \ ram \ sqlite \ + contexts \ $(NULL) dist_noinst_DATA = \ diff --git a/database/README.md b/database/README.md index becd4165..eb708162 100644 --- a/database/README.md +++ b/database/README.md @@ -1,13 +1,3 @@ -<!-- -title: "Database" -description: "The Netdata Agent leverages multiple, user-configurable time-series databases that use RAM and/or disk to store metrics on any type of node." -custom_edit_url: "https://github.com/netdata/netdata/edit/master/database/README.md" -sidebar_label: "Database" -learn_status: "Published" -learn_topic_type: "Tasks" -learn_rel_path: "Setup" ---> - # Database Netdata is fully capable of long-term metrics storage, at per-second granularity, via its default database engine @@ -42,13 +32,13 @@ The default mode `[db].mode = dbengine` has been designed to scale for longer re for parent Agents in the _Parent - Child_ setups The other available database modes are designed to minimize resource utilization and should only be considered on -[Parent - Child](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/how-streaming-works.mdx) setups at the children side and only when the +[Parent - Child](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/enable-streaming.md) setups at the children side and only when the resource constraints are very strict. So, - On a single node setup, use `[db].mode = dbengine`. -- On a [Parent - Child](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/how-streaming-works.mdx) setup, use `[db].mode = dbengine` on the +- On a [Parent - Child](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/enable-streaming.md) setup, use `[db].mode = dbengine` on the parent to increase retention, a more resource efficient mode like, `dbengine` with light retention settings, and `save`, `ram` or `none` modes for the children to minimize resource utilization. @@ -68,7 +58,7 @@ Metrics retention is controlled only by the disk space allocated to storing metr CPU required by the agent to query longer timeframes. Since Netdata Agents usually run on the edge, on production systems, Netdata Agent **parents** should be considered. -When having a [**parent - child**](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/how-streaming-works.mdx) setup, the child (the +When having a [**parent - child**](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/enable-streaming.md) setup, the child (the Netdata Agent running on a production system) delegates all of its functions, including longer metrics retention and querying, to the parent node that can dedicate more resources to this task. A single Netdata Agent parent can centralize multiple children Netdata Agents (dozens, hundreds, or even thousands depending on its available resources). @@ -144,8 +134,30 @@ Put the above lines in your boot sequence (`/etc/rc.local` or equivalent) to hav ### Monitoring Kernel Memory de-duplication performance -Netdata will create charts for kernel memory de-duplication performance, like this: +Netdata will create charts for kernel memory de-duplication performance, the **deduper (ksm)** charts can be seen under the **Memory** section in the Netdata UI. + +#### KSM summary + +The summary gives you a quick idea of how much savings (in terms of bytes and in terms of percentage) KSM is able to achieve. + +![image](https://user-images.githubusercontent.com/24860547/199454880-123ae7c4-071a-4811-95b8-18cf4e4f60a2.png) + +#### KSM pages merge performance + +This chart indicates the performance of page merging. **Shared** indicates used shared pages, **Unshared** indicates memory no longer shared (pages are unique but repeatedly checked for merging), **Sharing** indicates memory currently shared(how many more sites are sharing the pages, i.e. how much saved) and **Volatile** indicates volatile pages (changing too fast to be placed in a tree). + +A high ratio of Sharing to Shared indicates good sharing, but a high ratio of Unshared to Sharing indicates wasted effort. + +![image](https://user-images.githubusercontent.com/24860547/199455374-d63fd2c2-e12b-4ddf-947b-35371215eb05.png) + +#### KSM savings + +This chart shows the amount of memory saved by KSM. **Savings** indicates saved memory. **Offered** indicates memory marked as mergeable. + +![image](https://user-images.githubusercontent.com/24860547/199455604-43cd9248-1f6e-4c31-be56-e0b9e432f48a.png) -![image](https://cloud.githubusercontent.com/assets/2662304/11998786/eb23ae54-aab6-11e5-94d4-e848e8a5c56a.png) +#### KSM effectiveness +This chart tells you how well KSM is doing at what it is supposed to. It does this by charting the percentage of the mergeable pages that are currently merged. +![image](https://user-images.githubusercontent.com/24860547/199455770-4d7991ff-6b7e-4d96-9d23-33ffc572b370.png) diff --git a/database/contexts/Makefile.am b/database/contexts/Makefile.am new file mode 100644 index 00000000..59250a99 --- /dev/null +++ b/database/contexts/Makefile.am @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + +SUBDIRS = \ + $(NULL) + +dist_noinst_DATA = \ + README.md \ + $(NULL) diff --git a/database/contexts/README.md b/database/contexts/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/database/contexts/README.md diff --git a/database/contexts/api_v1.c b/database/contexts/api_v1.c new file mode 100644 index 00000000..daf945ee --- /dev/null +++ b/database/contexts/api_v1.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +static void rrd_flags_to_buffer_json_array_items(RRD_FLAGS flags, BUFFER *wb) { + if(flags & RRD_FLAG_QUEUED_FOR_HUB) + buffer_json_add_array_item_string(wb, "QUEUED"); + + if(flags & RRD_FLAG_DELETED) + buffer_json_add_array_item_string(wb, "DELETED"); + + if(flags & RRD_FLAG_COLLECTED) + buffer_json_add_array_item_string(wb, "COLLECTED"); + + if(flags & RRD_FLAG_UPDATED) + buffer_json_add_array_item_string(wb, "UPDATED"); + + if(flags & RRD_FLAG_ARCHIVED) + buffer_json_add_array_item_string(wb, "ARCHIVED"); + + if(flags & RRD_FLAG_OWN_LABELS) + buffer_json_add_array_item_string(wb, "OWN_LABELS"); + + if(flags & RRD_FLAG_LIVE_RETENTION) + buffer_json_add_array_item_string(wb, "LIVE_RETENTION"); + + if(flags & RRD_FLAG_HIDDEN) + buffer_json_add_array_item_string(wb, "HIDDEN"); + + if(flags & RRD_FLAG_QUEUED_FOR_PP) + buffer_json_add_array_item_string(wb, "PENDING_UPDATES"); +} + +// ---------------------------------------------------------------------------- +// /api/v1/context(s) API + +struct rrdcontext_to_json { + BUFFER *wb; + RRDCONTEXT_TO_JSON_OPTIONS options; + time_t after; + time_t before; + SIMPLE_PATTERN *chart_label_key; + SIMPLE_PATTERN *chart_labels_filter; + SIMPLE_PATTERN *chart_dimensions; + size_t written; + time_t now; + time_t combined_first_time_s; + time_t combined_last_time_s; + RRD_FLAGS combined_flags; +}; + +static inline int rrdmetric_to_json_callback(const DICTIONARY_ITEM *item, void *value, void *data) { + const char *id = dictionary_acquired_item_name(item); + struct rrdcontext_to_json * t = data; + RRDMETRIC *rm = value; + BUFFER *wb = t->wb; + RRDCONTEXT_TO_JSON_OPTIONS options = t->options; + time_t after = t->after; + time_t before = t->before; + + if(unlikely(rrd_flag_is_deleted(rm) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) + return 0; + + if(after && (!rm->last_time_s || after > rm->last_time_s)) + return 0; + + if(before && (!rm->first_time_s || before < rm->first_time_s)) + return 0; + + if(t->chart_dimensions + && !simple_pattern_matches_string(t->chart_dimensions, rm->id) + && rm->name != rm->id + && !simple_pattern_matches_string(t->chart_dimensions, rm->name)) + return 0; + + if(t->written) { + t->combined_first_time_s = MIN(t->combined_first_time_s, rm->first_time_s); + t->combined_last_time_s = MAX(t->combined_last_time_s, rm->last_time_s); + t->combined_flags |= rrd_flags_get(rm); + } + else { + t->combined_first_time_s = rm->first_time_s; + t->combined_last_time_s = rm->last_time_s; + t->combined_flags = rrd_flags_get(rm); + } + + buffer_json_member_add_object(wb, id); + + if(options & RRDCONTEXT_OPTION_SHOW_UUIDS) { + char uuid[UUID_STR_LEN]; + uuid_unparse(rm->uuid, uuid); + buffer_json_member_add_string(wb, "uuid", uuid); + } + + buffer_json_member_add_string(wb, "name", string2str(rm->name)); + buffer_json_member_add_time_t(wb, "first_time_t", rm->first_time_s); + buffer_json_member_add_time_t(wb, "last_time_t", rrd_flag_is_collected(rm) ? (long long)t->now : (long long)rm->last_time_s); + buffer_json_member_add_boolean(wb, "collected", rrd_flag_is_collected(rm)); + + if(options & RRDCONTEXT_OPTION_SHOW_DELETED) + buffer_json_member_add_boolean(wb, "deleted", rrd_flag_is_deleted(rm)); + + if(options & RRDCONTEXT_OPTION_SHOW_FLAGS) { + buffer_json_member_add_array(wb, "flags"); + rrd_flags_to_buffer_json_array_items(rrd_flags_get(rm), wb); + buffer_json_array_close(wb); + } + + buffer_json_object_close(wb); + t->written++; + return 1; +} + +static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void *value, void *data) { + const char *id = dictionary_acquired_item_name(item); + + struct rrdcontext_to_json *t_parent = data; + RRDINSTANCE *ri = value; + BUFFER *wb = t_parent->wb; + RRDCONTEXT_TO_JSON_OPTIONS options = t_parent->options; + time_t after = t_parent->after; + time_t before = t_parent->before; + bool has_filter = t_parent->chart_label_key || t_parent->chart_labels_filter || t_parent->chart_dimensions; + + if(unlikely(rrd_flag_is_deleted(ri) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) + return 0; + + if(after && (!ri->last_time_s || after > ri->last_time_s)) + return 0; + + if(before && (!ri->first_time_s || before < ri->first_time_s)) + return 0; + + if(t_parent->chart_label_key && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, t_parent->chart_label_key, + '\0', NULL)) + return 0; + + if(t_parent->chart_labels_filter && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, + t_parent->chart_labels_filter, ':', + NULL)) + return 0; + + time_t first_time_s = ri->first_time_s; + time_t last_time_s = ri->last_time_s; + RRD_FLAGS flags = rrd_flags_get(ri); + + BUFFER *wb_metrics = NULL; + if(options & RRDCONTEXT_OPTION_SHOW_METRICS || t_parent->chart_dimensions) { + + wb_metrics = buffer_create(4096, &netdata_buffers_statistics.buffers_api); + buffer_json_initialize(wb_metrics, "\"", "\"", wb->json.depth + 2, false, false); + + struct rrdcontext_to_json t_metrics = { + .wb = wb_metrics, + .options = options, + .chart_label_key = t_parent->chart_label_key, + .chart_labels_filter = t_parent->chart_labels_filter, + .chart_dimensions = t_parent->chart_dimensions, + .after = after, + .before = before, + .written = 0, + .now = t_parent->now, + }; + dictionary_walkthrough_read(ri->rrdmetrics, rrdmetric_to_json_callback, &t_metrics); + + if(has_filter && !t_metrics.written) { + buffer_free(wb_metrics); + return 0; + } + + first_time_s = t_metrics.combined_first_time_s; + last_time_s = t_metrics.combined_last_time_s; + flags = t_metrics.combined_flags; + } + + if(t_parent->written) { + t_parent->combined_first_time_s = MIN(t_parent->combined_first_time_s, first_time_s); + t_parent->combined_last_time_s = MAX(t_parent->combined_last_time_s, last_time_s); + t_parent->combined_flags |= flags; + } + else { + t_parent->combined_first_time_s = first_time_s; + t_parent->combined_last_time_s = last_time_s; + t_parent->combined_flags = flags; + } + + buffer_json_member_add_object(wb, id); + + if(options & RRDCONTEXT_OPTION_SHOW_UUIDS) { + char uuid[UUID_STR_LEN]; + uuid_unparse(ri->uuid, uuid); + buffer_json_member_add_string(wb, "uuid", uuid); + } + + buffer_json_member_add_string(wb, "name", string2str(ri->name)); + buffer_json_member_add_string(wb, "context", string2str(ri->rc->id)); + buffer_json_member_add_string(wb, "title", string2str(ri->title)); + buffer_json_member_add_string(wb, "units", string2str(ri->units)); + buffer_json_member_add_string(wb, "family", string2str(ri->family)); + buffer_json_member_add_string(wb, "chart_type", rrdset_type_name(ri->chart_type)); + buffer_json_member_add_uint64(wb, "priority", ri->priority); + buffer_json_member_add_time_t(wb, "update_every", ri->update_every_s); + buffer_json_member_add_time_t(wb, "first_time_t", first_time_s); + buffer_json_member_add_time_t(wb, "last_time_t", (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_s); + buffer_json_member_add_boolean(wb, "collected", flags & RRD_FLAG_COLLECTED); + + if(options & RRDCONTEXT_OPTION_SHOW_DELETED) + buffer_json_member_add_boolean(wb, "deleted", rrd_flag_is_deleted(ri)); + + if(options & RRDCONTEXT_OPTION_SHOW_FLAGS) { + buffer_json_member_add_array(wb, "flags"); + rrd_flags_to_buffer_json_array_items(rrd_flags_get(ri), wb); + buffer_json_array_close(wb); + } + + if(options & RRDCONTEXT_OPTION_SHOW_LABELS && ri->rrdlabels && dictionary_entries(ri->rrdlabels)) { + buffer_json_member_add_object(wb, "labels"); + rrdlabels_to_buffer_json_members(ri->rrdlabels, wb); + buffer_json_object_close(wb); + } + + if(wb_metrics) { + buffer_json_member_add_object(wb, "dimensions"); + buffer_fast_strcat(wb, buffer_tostring(wb_metrics), buffer_strlen(wb_metrics)); + buffer_json_object_close(wb); + + buffer_free(wb_metrics); + } + + buffer_json_object_close(wb); + t_parent->written++; + return 1; +} + +static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void *value, void *data) { + const char *id = dictionary_acquired_item_name(item); + struct rrdcontext_to_json *t_parent = data; + RRDCONTEXT *rc = value; + BUFFER *wb = t_parent->wb; + RRDCONTEXT_TO_JSON_OPTIONS options = t_parent->options; + time_t after = t_parent->after; + time_t before = t_parent->before; + bool has_filter = t_parent->chart_label_key || t_parent->chart_labels_filter || t_parent->chart_dimensions; + + if(unlikely(rrd_flag_check(rc, RRD_FLAG_HIDDEN) && !(options & RRDCONTEXT_OPTION_SHOW_HIDDEN))) + return 0; + + if(unlikely(rrd_flag_is_deleted(rc) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) + return 0; + + if(options & RRDCONTEXT_OPTION_DEEPSCAN) + rrdcontext_recalculate_context_retention(rc, RRD_FLAG_NONE, false); + + if(after && (!rc->last_time_s || after > rc->last_time_s)) + return 0; + + if(before && (!rc->first_time_s || before < rc->first_time_s)) + return 0; + + time_t first_time_s = rc->first_time_s; + time_t last_time_s = rc->last_time_s; + RRD_FLAGS flags = rrd_flags_get(rc); + + BUFFER *wb_instances = NULL; + if((options & (RRDCONTEXT_OPTION_SHOW_LABELS|RRDCONTEXT_OPTION_SHOW_INSTANCES|RRDCONTEXT_OPTION_SHOW_METRICS)) + || t_parent->chart_label_key + || t_parent->chart_labels_filter + || t_parent->chart_dimensions) { + + wb_instances = buffer_create(4096, &netdata_buffers_statistics.buffers_api); + buffer_json_initialize(wb_instances, "\"", "\"", wb->json.depth + 2, false, false); + + struct rrdcontext_to_json t_instances = { + .wb = wb_instances, + .options = options, + .chart_label_key = t_parent->chart_label_key, + .chart_labels_filter = t_parent->chart_labels_filter, + .chart_dimensions = t_parent->chart_dimensions, + .after = after, + .before = before, + .written = 0, + .now = t_parent->now, + }; + dictionary_walkthrough_read(rc->rrdinstances, rrdinstance_to_json_callback, &t_instances); + + if(has_filter && !t_instances.written) { + buffer_free(wb_instances); + return 0; + } + + first_time_s = t_instances.combined_first_time_s; + last_time_s = t_instances.combined_last_time_s; + flags = t_instances.combined_flags; + } + + if(!(options & RRDCONTEXT_OPTION_SKIP_ID)) + buffer_json_member_add_object(wb, id); + + rrdcontext_lock(rc); + + buffer_json_member_add_string(wb, "title", string2str(rc->title)); + buffer_json_member_add_string(wb, "units", string2str(rc->units)); + buffer_json_member_add_string(wb, "family", string2str(rc->family)); + buffer_json_member_add_string(wb, "chart_type", rrdset_type_name(rc->chart_type)); + buffer_json_member_add_uint64(wb, "priority", rc->priority); + buffer_json_member_add_time_t(wb, "first_time_t", first_time_s); + buffer_json_member_add_time_t(wb, "last_time_t", (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_s); + buffer_json_member_add_boolean(wb, "collected", (flags & RRD_FLAG_COLLECTED)); + + if(options & RRDCONTEXT_OPTION_SHOW_DELETED) + buffer_json_member_add_boolean(wb, "deleted", rrd_flag_is_deleted(rc)); + + if(options & RRDCONTEXT_OPTION_SHOW_FLAGS) { + buffer_json_member_add_array(wb, "flags"); + rrd_flags_to_buffer_json_array_items(rrd_flags_get(rc), wb); + buffer_json_array_close(wb); + } + + if(options & RRDCONTEXT_OPTION_SHOW_QUEUED) { + buffer_json_member_add_array(wb, "queued_reasons"); + rrd_reasons_to_buffer_json_array_items(rc->queue.queued_flags, wb); + buffer_json_array_close(wb); + + buffer_json_member_add_time_t(wb, "last_queued", (time_t)(rc->queue.queued_ut / USEC_PER_SEC)); + buffer_json_member_add_time_t(wb, "scheduled_dispatch", (time_t)(rc->queue.scheduled_dispatch_ut / USEC_PER_SEC)); + buffer_json_member_add_time_t(wb, "last_dequeued", (time_t)(rc->queue.dequeued_ut / USEC_PER_SEC)); + buffer_json_member_add_uint64(wb, "dispatches", rc->queue.dispatches); + buffer_json_member_add_uint64(wb, "hub_version", rc->hub.version); + buffer_json_member_add_uint64(wb, "version", rc->version); + + buffer_json_member_add_array(wb, "pp_reasons"); + rrd_reasons_to_buffer_json_array_items(rc->pp.queued_flags, wb); + buffer_json_array_close(wb); + + buffer_json_member_add_time_t(wb, "pp_last_queued", (time_t)(rc->pp.queued_ut / USEC_PER_SEC)); + buffer_json_member_add_time_t(wb, "pp_last_dequeued", (time_t)(rc->pp.dequeued_ut / USEC_PER_SEC)); + buffer_json_member_add_uint64(wb, "pp_executed", rc->pp.executions); + } + + rrdcontext_unlock(rc); + + if(wb_instances) { + buffer_json_member_add_object(wb, "charts"); + buffer_fast_strcat(wb, buffer_tostring(wb_instances), buffer_strlen(wb_instances)); + buffer_json_object_close(wb); + + buffer_free(wb_instances); + } + + if(!(options & RRDCONTEXT_OPTION_SKIP_ID)) + buffer_json_object_close(wb); + + t_parent->written++; + return 1; +} + +int rrdcontext_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, const char *context, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions) { + if(!host->rrdctx.contexts) { + error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host)); + return HTTP_RESP_NOT_FOUND; + } + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(host->rrdctx.contexts, context); + if(!rca) return HTTP_RESP_NOT_FOUND; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + if(after != 0 && before != 0) + rrdr_relative_window_to_absolute(&after, &before, NULL); + + buffer_json_initialize(wb, "\"", "\"", 0, true, false); + struct rrdcontext_to_json t_contexts = { + .wb = wb, + .options = options|RRDCONTEXT_OPTION_SKIP_ID, + .chart_label_key = chart_label_key, + .chart_labels_filter = chart_labels_filter, + .chart_dimensions = chart_dimensions, + .after = after, + .before = before, + .written = 0, + .now = now_realtime_sec(), + }; + rrdcontext_to_json_callback((DICTIONARY_ITEM *)rca, rc, &t_contexts); + buffer_json_finalize(wb); + + rrdcontext_release(rca); + + if(!t_contexts.written) + return HTTP_RESP_NOT_FOUND; + + return HTTP_RESP_OK; +} + +int rrdcontexts_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions) { + if(!host->rrdctx.contexts) { + error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host)); + return HTTP_RESP_NOT_FOUND; + } + + char node_uuid[UUID_STR_LEN] = ""; + + if(host->node_id) + uuid_unparse(*host->node_id, node_uuid); + + if(after != 0 && before != 0) + rrdr_relative_window_to_absolute(&after, &before, NULL); + + buffer_json_initialize(wb, "\"", "\"", 0, true, false); + buffer_json_member_add_string(wb, "hostname", rrdhost_hostname(host)); + buffer_json_member_add_string(wb, "machine_guid", host->machine_guid); + buffer_json_member_add_string(wb, "node_id", node_uuid); + buffer_json_member_add_string(wb, "claim_id", host->aclk_state.claimed_id ? host->aclk_state.claimed_id : ""); + + if(options & RRDCONTEXT_OPTION_SHOW_LABELS) { + buffer_json_member_add_object(wb, "host_labels"); + rrdlabels_to_buffer_json_members(host->rrdlabels, wb); + buffer_json_object_close(wb); + } + + buffer_json_member_add_object(wb, "contexts"); + struct rrdcontext_to_json t_contexts = { + .wb = wb, + .options = options, + .chart_label_key = chart_label_key, + .chart_labels_filter = chart_labels_filter, + .chart_dimensions = chart_dimensions, + .after = after, + .before = before, + .written = 0, + .now = now_realtime_sec(), + }; + dictionary_walkthrough_read(host->rrdctx.contexts, rrdcontext_to_json_callback, &t_contexts); + buffer_json_object_close(wb); + + buffer_json_finalize(wb); + + return HTTP_RESP_OK; +} + diff --git a/database/contexts/api_v2.c b/database/contexts/api_v2.c new file mode 100644 index 00000000..a08d1509 --- /dev/null +++ b/database/contexts/api_v2.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +#include "aclk/aclk_capas.h" + +// ---------------------------------------------------------------------------- +// /api/v2/contexts API + +typedef enum __attribute__ ((__packed__)) { + FTS_MATCHED_NONE = 0, + FTS_MATCHED_HOST, + FTS_MATCHED_CONTEXT, + FTS_MATCHED_INSTANCE, + FTS_MATCHED_DIMENSION, + FTS_MATCHED_LABEL, + FTS_MATCHED_ALERT, + FTS_MATCHED_ALERT_INFO, + FTS_MATCHED_FAMILY, + FTS_MATCHED_TITLE, + FTS_MATCHED_UNITS, +} FTS_MATCH; + +static const char *fts_match_to_string(FTS_MATCH match) { + switch(match) { + case FTS_MATCHED_HOST: + return "HOST"; + + case FTS_MATCHED_CONTEXT: + return "CONTEXT"; + + case FTS_MATCHED_INSTANCE: + return "INSTANCE"; + + case FTS_MATCHED_DIMENSION: + return "DIMENSION"; + + case FTS_MATCHED_ALERT: + return "ALERT"; + + case FTS_MATCHED_ALERT_INFO: + return "ALERT_INFO"; + + case FTS_MATCHED_LABEL: + return "LABEL"; + + case FTS_MATCHED_FAMILY: + return "FAMILY"; + + case FTS_MATCHED_TITLE: + return "TITLE"; + + case FTS_MATCHED_UNITS: + return "UNITS"; + + default: + return "NONE"; + } +} + +struct rrdcontext_to_json_v2_entry { + size_t count; + STRING *id; + STRING *family; + uint32_t priority; + time_t first_time_s; + time_t last_time_s; + RRD_FLAGS flags; + FTS_MATCH match; +}; + +typedef struct full_text_search_index { + size_t searches; + size_t string_searches; + size_t char_searches; +} FTS_INDEX; + +static inline bool full_text_search_string(FTS_INDEX *fts, SIMPLE_PATTERN *q, STRING *ptr) { + fts->searches++; + fts->string_searches++; + return simple_pattern_matches_string(q, ptr); +} + +static inline bool full_text_search_char(FTS_INDEX *fts, SIMPLE_PATTERN *q, char *ptr) { + fts->searches++; + fts->char_searches++; + return simple_pattern_matches(q, ptr); +} + +struct rrdcontext_to_json_v2_data { + BUFFER *wb; + struct api_v2_contexts_request *request; + DICTIONARY *ctx; + + CONTEXTS_V2_OPTIONS options; + struct query_versions versions; + + struct { + SIMPLE_PATTERN *scope_pattern; + SIMPLE_PATTERN *pattern; + size_t ni; + } nodes; + + struct { + SIMPLE_PATTERN *scope_pattern; + SIMPLE_PATTERN *pattern; + } contexts; + + struct { + FTS_MATCH host_match; + char host_node_id_str[UUID_STR_LEN]; + SIMPLE_PATTERN *pattern; + FTS_INDEX fts; + } q; + + struct query_timings timings; +}; + +static FTS_MATCH rrdcontext_to_json_v2_full_text_search(struct rrdcontext_to_json_v2_data *ctl, RRDCONTEXT *rc, SIMPLE_PATTERN *q) { + if(unlikely(full_text_search_string(&ctl->q.fts, q, rc->id) || + full_text_search_string(&ctl->q.fts, q, rc->family))) + return FTS_MATCHED_CONTEXT; + + if(unlikely(full_text_search_string(&ctl->q.fts, q, rc->title))) + return FTS_MATCHED_TITLE; + + if(unlikely(full_text_search_string(&ctl->q.fts, q, rc->units))) + return FTS_MATCHED_UNITS; + + FTS_MATCH matched = FTS_MATCHED_NONE; + RRDINSTANCE *ri; + dfe_start_read(rc->rrdinstances, ri) { + if(matched) break; + + if(unlikely(full_text_search_string(&ctl->q.fts, q, ri->id)) || + (ri->name != ri->id && full_text_search_string(&ctl->q.fts, q, ri->name))) { + matched = FTS_MATCHED_INSTANCE; + break; + } + + RRDMETRIC *rm; + dfe_start_read(ri->rrdmetrics, rm) { + if(unlikely(full_text_search_string(&ctl->q.fts, q, rm->id)) || + (rm->name != rm->id && full_text_search_string(&ctl->q.fts, q, rm->name))) { + matched = FTS_MATCHED_DIMENSION; + break; + } + } + dfe_done(rm); + + size_t label_searches = 0; + if(unlikely(ri->rrdlabels && dictionary_entries(ri->rrdlabels) && + rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, q, ':', &label_searches))) { + ctl->q.fts.searches += label_searches; + ctl->q.fts.char_searches += label_searches; + matched = FTS_MATCHED_LABEL; + break; + } + ctl->q.fts.searches += label_searches; + ctl->q.fts.char_searches += label_searches; + + if(ri->rrdset) { + RRDSET *st = ri->rrdset; + netdata_rwlock_rdlock(&st->alerts.rwlock); + for (RRDCALC *rcl = st->alerts.base; rcl; rcl = rcl->next) { + if(unlikely(full_text_search_string(&ctl->q.fts, q, rcl->name))) { + matched = FTS_MATCHED_ALERT; + break; + } + + if(unlikely(full_text_search_string(&ctl->q.fts, q, rcl->info))) { + matched = FTS_MATCHED_ALERT_INFO; + break; + } + } + netdata_rwlock_unlock(&st->alerts.rwlock); + } + } + dfe_done(ri); + return matched; +} + +static ssize_t rrdcontext_to_json_v2_add_context(void *data, RRDCONTEXT_ACQUIRED *rca, bool queryable_context __maybe_unused) { + struct rrdcontext_to_json_v2_data *ctl = data; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + FTS_MATCH match = ctl->q.host_match; + if((ctl->options & CONTEXTS_V2_SEARCH) && ctl->q.pattern) { + match = rrdcontext_to_json_v2_full_text_search(ctl, rc, ctl->q.pattern); + + if(match == FTS_MATCHED_NONE) + return 0; + } + + struct rrdcontext_to_json_v2_entry t = { + .count = 0, + .id = rc->id, + .family = string_dup(rc->family), + .priority = rc->priority, + .first_time_s = rc->first_time_s, + .last_time_s = rc->last_time_s, + .flags = rc->flags, + .match = match, + }, *z = dictionary_set(ctl->ctx, string2str(rc->id), &t, sizeof(t)); + + if(!z->count) { + // we just added this + z->count = 1; + } + else { + // it is already in there + z->count++; + z->flags |= rc->flags; + + if(z->priority > rc->priority) + z->priority = rc->priority; + + if(z->first_time_s > rc->first_time_s) + z->first_time_s = rc->first_time_s; + + if(z->last_time_s < rc->last_time_s) + z->last_time_s = rc->last_time_s; + + if(z->family != rc->family) { + z->family = string_2way_merge(z->family, rc->family); + } + } + + return 1; +} + +void buffer_json_node_add_v2(BUFFER *wb, RRDHOST *host, size_t ni, usec_t duration_ut) { + buffer_json_member_add_string(wb, "mg", host->machine_guid); + if(host->node_id) + buffer_json_member_add_uuid(wb, "nd", host->node_id); + buffer_json_member_add_string(wb, "nm", rrdhost_hostname(host)); + buffer_json_member_add_uint64(wb, "ni", ni); + buffer_json_member_add_object(wb, "st"); + buffer_json_member_add_uint64(wb, "ai", 0); + buffer_json_member_add_uint64(wb, "code", 200); + buffer_json_member_add_string(wb, "msg", ""); + if(duration_ut) + buffer_json_member_add_double(wb, "ms", (NETDATA_DOUBLE)duration_ut / 1000.0); + buffer_json_object_close(wb); +} + +static ssize_t rrdcontext_to_json_v2_add_host(void *data, RRDHOST *host, bool queryable_host) { + if(!queryable_host || !host->rrdctx.contexts) + // the host matches the 'scope_host' but does not match the 'host' patterns + // or the host does not have any contexts + return 0; + + struct rrdcontext_to_json_v2_data *ctl = data; + BUFFER *wb = ctl->wb; + + if(ctl->request->timeout_ms && now_monotonic_usec() > ctl->timings.received_ut + ctl->request->timeout_ms * USEC_PER_MS) + // timed out + return -2; + + if(ctl->request->interrupt_callback && ctl->request->interrupt_callback(ctl->request->interrupt_callback_data)) + // interrupted + return -1; + + bool host_matched = (ctl->options & CONTEXTS_V2_NODES); + bool do_contexts = (ctl->options & (CONTEXTS_V2_CONTEXTS | CONTEXTS_V2_SEARCH)); + + ctl->q.host_match = FTS_MATCHED_NONE; + if((ctl->options & CONTEXTS_V2_SEARCH)) { + // check if we match the host itself + if(ctl->q.pattern && ( + full_text_search_string(&ctl->q.fts, ctl->q.pattern, host->hostname) || + full_text_search_char(&ctl->q.fts, ctl->q.pattern, host->machine_guid) || + (ctl->q.pattern && full_text_search_char(&ctl->q.fts, ctl->q.pattern, ctl->q.host_node_id_str)))) { + ctl->q.host_match = FTS_MATCHED_HOST; + do_contexts = true; + } + } + + if(do_contexts) { + // save it + SIMPLE_PATTERN *old_q = ctl->q.pattern; + + if(ctl->q.host_match == FTS_MATCHED_HOST) + // do not do pattern matching on contexts - we matched the host itself + ctl->q.pattern = NULL; + + ssize_t added = query_scope_foreach_context( + host, ctl->request->scope_contexts, + ctl->contexts.scope_pattern, ctl->contexts.pattern, + rrdcontext_to_json_v2_add_context, queryable_host, ctl); + + // restore it + ctl->q.pattern = old_q; + + if(added == -1) + return -1; + + if(added) + host_matched = true; + } + + if(host_matched && (ctl->options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED | CONTEXTS_V2_DEBUG))) { + buffer_json_add_array_item_object(wb); + buffer_json_node_add_v2(wb, host, ctl->nodes.ni++, 0); + + if(ctl->options & CONTEXTS_V2_NODES_DETAILED) { + buffer_json_member_add_string(wb, "version", rrdhost_program_version(host)); + buffer_json_member_add_uint64(wb, "hops", host->system_info ? host->system_info->hops : (host == localhost) ? 0 : 1); + buffer_json_member_add_string(wb, "state", (host == localhost || !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN)) ? "reachable" : "stale"); + buffer_json_member_add_boolean(wb, "isDeleted", false); + + buffer_json_member_add_array(wb, "services"); + buffer_json_array_close(wb); + + buffer_json_member_add_array(wb, "nodeInstanceCapabilities"); + + struct capability *capas = aclk_get_node_instance_capas(host); + struct capability *capa = capas; + while(capa->name != NULL) { + buffer_json_add_array_item_object(wb); + buffer_json_member_add_string(wb, "name", capa->name); + buffer_json_member_add_uint64(wb, "version", capa->version); + buffer_json_member_add_boolean(wb, "enabled", capa->enabled); + buffer_json_object_close(wb); + capa++; + } + buffer_json_array_close(wb); + freez(capas); + + web_client_api_request_v1_info_summary_alarm_statuses(host, wb, "alarmCounters"); + + host_labels2json(host, wb, "hostLabels"); + + buffer_json_member_add_object(wb, "mlInfo"); + buffer_json_member_add_boolean(wb, "mlCapable", ml_capable(host)); + buffer_json_member_add_boolean(wb, "mlEnabled", ml_enabled(host)); + buffer_json_object_close(wb); + + if(host->system_info) { + buffer_json_member_add_string_or_empty(wb, "architecture", host->system_info->architecture); + buffer_json_member_add_string_or_empty(wb, "kernelName", host->system_info->kernel_name); + buffer_json_member_add_string_or_empty(wb, "kernelVersion", host->system_info->kernel_version); + buffer_json_member_add_string_or_empty(wb, "cpuFrequency", host->system_info->host_cpu_freq); + buffer_json_member_add_string_or_empty(wb, "cpus", host->system_info->host_cores); + buffer_json_member_add_string_or_empty(wb, "memory", host->system_info->host_ram_total); + buffer_json_member_add_string_or_empty(wb, "diskSpace", host->system_info->host_disk_space); + buffer_json_member_add_string_or_empty(wb, "container", host->system_info->container); + buffer_json_member_add_string_or_empty(wb, "virtualization", host->system_info->virtualization); + buffer_json_member_add_string_or_empty(wb, "os", host->system_info->host_os_id); + buffer_json_member_add_string_or_empty(wb, "osName", host->system_info->host_os_name); + buffer_json_member_add_string_or_empty(wb, "osVersion", host->system_info->host_os_version); + } + + buffer_json_member_add_object(wb, "status"); + + size_t receiver_hops = host->system_info ? host->system_info->hops : (host == localhost) ? 0 : 1; + buffer_json_member_add_object(wb, "collection"); + buffer_json_member_add_uint64(wb, "hops", receiver_hops); + buffer_json_member_add_boolean(wb, "online", host == localhost || !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN | RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED)); + buffer_json_member_add_boolean(wb, "replicating", rrdhost_receiver_replicating_charts(host)); + buffer_json_object_close(wb); // collection + + buffer_json_member_add_object(wb, "streaming"); + buffer_json_member_add_uint64(wb, "hops", host->sender ? host->sender->hops : receiver_hops + 1); + buffer_json_member_add_boolean(wb, "online", rrdhost_flag_check(host, RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED)); + buffer_json_member_add_boolean(wb, "replicating", rrdhost_sender_replicating_charts(host)); + buffer_json_object_close(wb); // streaming + + buffer_json_object_close(wb); // status + } + + buffer_json_object_close(wb); + } + + return host_matched ? 1 : 0; +} + +static void buffer_json_contexts_v2_options_to_array(BUFFER *wb, CONTEXTS_V2_OPTIONS options) { + if(options & CONTEXTS_V2_DEBUG) + buffer_json_add_array_item_string(wb, "debug"); + + if(options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED)) + buffer_json_add_array_item_string(wb, "nodes"); + + if(options & CONTEXTS_V2_CONTEXTS) + buffer_json_add_array_item_string(wb, "contexts"); + + if(options & CONTEXTS_V2_SEARCH) + buffer_json_add_array_item_string(wb, "search"); +} + +void buffer_json_query_timings(BUFFER *wb, const char *key, struct query_timings *timings) { + timings->finished_ut = now_monotonic_usec(); + if(!timings->executed_ut) + timings->executed_ut = timings->finished_ut; + if(!timings->preprocessed_ut) + timings->preprocessed_ut = timings->received_ut; + buffer_json_member_add_object(wb, key); + buffer_json_member_add_double(wb, "prep_ms", (NETDATA_DOUBLE)(timings->preprocessed_ut - timings->received_ut) / USEC_PER_MS); + buffer_json_member_add_double(wb, "query_ms", (NETDATA_DOUBLE)(timings->executed_ut - timings->preprocessed_ut) / USEC_PER_MS); + buffer_json_member_add_double(wb, "output_ms", (NETDATA_DOUBLE)(timings->finished_ut - timings->executed_ut) / USEC_PER_MS); + buffer_json_member_add_double(wb, "total_ms", (NETDATA_DOUBLE)(timings->finished_ut - timings->received_ut) / USEC_PER_MS); + buffer_json_member_add_double(wb, "cloud_ms", (NETDATA_DOUBLE)(timings->finished_ut - timings->received_ut) / USEC_PER_MS); + buffer_json_object_close(wb); +} + +void buffer_json_agents_array_v2(BUFFER *wb, struct query_timings *timings, time_t now_s) { + if(!now_s) + now_s = now_realtime_sec(); + + buffer_json_member_add_array(wb, "agents"); + buffer_json_add_array_item_object(wb); + buffer_json_member_add_string(wb, "mg", localhost->machine_guid); + buffer_json_member_add_uuid(wb, "nd", localhost->node_id); + buffer_json_member_add_string(wb, "nm", rrdhost_hostname(localhost)); + buffer_json_member_add_time_t(wb, "now", now_s); + buffer_json_member_add_uint64(wb, "ai", 0); + + if(timings) + buffer_json_query_timings(wb, "timings", timings); + + buffer_json_object_close(wb); + buffer_json_array_close(wb); +} + +void buffer_json_cloud_timings(BUFFER *wb, const char *key, struct query_timings *timings) { + buffer_json_member_add_object(wb, key); + buffer_json_member_add_double(wb, "routing_ms", 0.0); + buffer_json_member_add_double(wb, "node_max_ms", 0.0); + buffer_json_member_add_double(wb, "total_ms", (NETDATA_DOUBLE)(timings->finished_ut - timings->received_ut) / USEC_PER_MS); + buffer_json_object_close(wb); +} + +void contexts_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { + struct rrdcontext_to_json_v2_entry *z = value; + string_freez(z->family); +} + +int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTEXTS_V2_OPTIONS options) { + int resp = HTTP_RESP_OK; + + if(options & CONTEXTS_V2_SEARCH) + options |= CONTEXTS_V2_CONTEXTS; + + struct rrdcontext_to_json_v2_data ctl = { + .wb = wb, + .request = req, + .ctx = NULL, + .options = options, + .versions = { 0 }, + .nodes.scope_pattern = string_to_simple_pattern(req->scope_nodes), + .nodes.pattern = string_to_simple_pattern(req->nodes), + .contexts.pattern = string_to_simple_pattern(req->contexts), + .contexts.scope_pattern = string_to_simple_pattern(req->scope_contexts), + .q.pattern = string_to_simple_pattern_nocase(req->q), + .timings = { + .received_ut = now_monotonic_usec(), + } + }; + + if(options & CONTEXTS_V2_CONTEXTS) { + ctl.ctx = dictionary_create_advanced( + DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, NULL, + sizeof(struct rrdcontext_to_json_v2_entry)); + + dictionary_register_delete_callback(ctl.ctx, contexts_delete_callback, NULL); + } + + time_t now_s = now_realtime_sec(); + buffer_json_initialize(wb, "\"", "\"", 0, true, false); + buffer_json_member_add_uint64(wb, "api", 2); + + if(options & CONTEXTS_V2_DEBUG) { + buffer_json_member_add_object(wb, "request"); + + buffer_json_member_add_object(wb, "scope"); + buffer_json_member_add_string(wb, "scope_nodes", req->scope_nodes); + buffer_json_member_add_string(wb, "scope_contexts", req->scope_contexts); + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "selectors"); + buffer_json_member_add_string(wb, "nodes", req->nodes); + buffer_json_member_add_string(wb, "contexts", req->contexts); + buffer_json_object_close(wb); + + buffer_json_member_add_string(wb, "q", req->q); + buffer_json_member_add_array(wb, "options"); + buffer_json_contexts_v2_options_to_array(wb, options); + buffer_json_array_close(wb); + + buffer_json_object_close(wb); + } + + if(options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED | CONTEXTS_V2_DEBUG)) + buffer_json_member_add_array(wb, "nodes"); + + ssize_t ret = query_scope_foreach_host(ctl.nodes.scope_pattern, ctl.nodes.pattern, + rrdcontext_to_json_v2_add_host, &ctl, + &ctl.versions, ctl.q.host_node_id_str); + + if(unlikely(ret < 0)) { + buffer_flush(wb); + + if(ret == -2) { + buffer_strcat(wb, "query timeout"); + resp = HTTP_RESP_GATEWAY_TIMEOUT; + } + else { + buffer_strcat(wb, "query interrupted"); + resp = HTTP_RESP_BACKEND_FETCH_FAILED; + } + goto cleanup; + } + + if(options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED | CONTEXTS_V2_DEBUG)) + buffer_json_array_close(wb); + + ctl.timings.executed_ut = now_monotonic_usec(); + version_hashes_api_v2(wb, &ctl.versions); + + if(options & CONTEXTS_V2_CONTEXTS) { + buffer_json_member_add_object(wb, "contexts"); + struct rrdcontext_to_json_v2_entry *z; + dfe_start_read(ctl.ctx, z){ + bool collected = z->flags & RRD_FLAG_COLLECTED; + + buffer_json_member_add_object(wb, string2str(z->id)); + { + buffer_json_member_add_string(wb, "family", string2str(z->family)); + buffer_json_member_add_uint64(wb, "priority", z->priority); + buffer_json_member_add_time_t(wb, "first_entry", z->first_time_s); + buffer_json_member_add_time_t(wb, "last_entry", collected ? now_s : z->last_time_s); + buffer_json_member_add_boolean(wb, "live", collected); + if (options & CONTEXTS_V2_SEARCH) + buffer_json_member_add_string(wb, "match", fts_match_to_string(z->match)); + } + buffer_json_object_close(wb); + } + dfe_done(z); + buffer_json_object_close(wb); // contexts + } + + if(options & CONTEXTS_V2_SEARCH) { + buffer_json_member_add_object(wb, "searches"); + buffer_json_member_add_uint64(wb, "strings", ctl.q.fts.string_searches); + buffer_json_member_add_uint64(wb, "char", ctl.q.fts.char_searches); + buffer_json_member_add_uint64(wb, "total", ctl.q.fts.searches); + buffer_json_object_close(wb); + } + + buffer_json_agents_array_v2(wb, &ctl.timings, now_s); + buffer_json_cloud_timings(wb, "timings", &ctl.timings); + buffer_json_finalize(wb); + +cleanup: + dictionary_destroy(ctl.ctx); + simple_pattern_free(ctl.nodes.scope_pattern); + simple_pattern_free(ctl.nodes.pattern); + simple_pattern_free(ctl.contexts.pattern); + simple_pattern_free(ctl.contexts.scope_pattern); + simple_pattern_free(ctl.q.pattern); + + return resp; +} + diff --git a/database/contexts/context.c b/database/contexts/context.c new file mode 100644 index 00000000..f941050d --- /dev/null +++ b/database/contexts/context.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +inline const char *rrdcontext_acquired_id(RRDCONTEXT_ACQUIRED *rca) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return string2str(rc->id); +} + +inline bool rrdcontext_acquired_belongs_to_host(RRDCONTEXT_ACQUIRED *rca, RRDHOST *host) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return rc->rrdhost == host; +} + +// ---------------------------------------------------------------------------- +// RRDCONTEXT + +static void rrdcontext_freez(RRDCONTEXT *rc) { + string_freez(rc->id); + string_freez(rc->title); + string_freez(rc->units); + string_freez(rc->family); +} + +static void rrdcontext_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdhost) { + RRDHOST *host = (RRDHOST *)rrdhost; + RRDCONTEXT *rc = (RRDCONTEXT *)value; + + rc->rrdhost = host; + rc->flags = rc->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS; // no need for atomics at constructor + + if(rc->hub.version) { + // we are loading data from the SQL database + + if(rc->version) + error("RRDCONTEXT: context '%s' is already initialized with version %"PRIu64", but it is loaded again from SQL with version %"PRIu64"", string2str(rc->id), rc->version, rc->hub.version); + + // IMPORTANT + // replace all string pointers in rc->hub with our own versions + // the originals are coming from a tmp allocation of sqlite + + string_freez(rc->id); + rc->id = string_strdupz(rc->hub.id); + rc->hub.id = string2str(rc->id); + + string_freez(rc->title); + rc->title = string_strdupz(rc->hub.title); + rc->hub.title = string2str(rc->title); + + string_freez(rc->units); + rc->units = string_strdupz(rc->hub.units); + rc->hub.units = string2str(rc->units); + + string_freez(rc->family); + rc->family = string_strdupz(rc->hub.family); + rc->hub.family = string2str(rc->family); + + rc->chart_type = rrdset_type_id(rc->hub.chart_type); + rc->hub.chart_type = rrdset_type_name(rc->chart_type); + + rc->version = rc->hub.version; + rc->priority = rc->hub.priority; + rc->first_time_s = (time_t)rc->hub.first_time_s; + rc->last_time_s = (time_t)rc->hub.last_time_s; + + if(rc->hub.deleted || !rc->hub.first_time_s) + rrd_flag_set_deleted(rc, RRD_FLAG_NONE); + else { + if (rc->last_time_s == 0) + rrd_flag_set_collected(rc); + else + rrd_flag_set_archived(rc); + } + + rc->flags |= RRD_FLAG_UPDATE_REASON_LOAD_SQL; // no need for atomics at constructor + } + else { + // we are adding this context now for the first time + rc->version = now_realtime_sec(); + } + + rrdinstances_create_in_rrdcontext(rc); + netdata_mutex_init(&rc->mutex); + + // signal the react callback to do the job + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); +} + +static void rrdcontext_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdhost __maybe_unused) { + + RRDCONTEXT *rc = (RRDCONTEXT *)value; + + rrdinstances_destroy_from_rrdcontext(rc); + netdata_mutex_destroy(&rc->mutex); + rrdcontext_freez(rc); +} + +static bool rrdcontext_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *rrdhost __maybe_unused) { + RRDCONTEXT *rc = (RRDCONTEXT *)old_value; + RRDCONTEXT *rc_new = (RRDCONTEXT *)new_value; + + //current rc is not archived, new_rc is archived, don't merge + if (!rrd_flag_is_archived(rc) && rrd_flag_is_archived(rc_new)) { + rrdcontext_freez(rc_new); + return false; + } + + rrdcontext_lock(rc); + + if(rc->title != rc_new->title) { + STRING *old_title = rc->title; + if (rrd_flag_is_archived(rc) && !rrd_flag_is_archived(rc_new)) + rc->title = string_dup(rc_new->title); + else + rc->title = string_2way_merge(rc->title, rc_new->title); + string_freez(old_title); + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(rc->units != rc_new->units) { + STRING *old_units = rc->units; + rc->units = string_dup(rc_new->units); + string_freez(old_units); + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(rc->family != rc_new->family) { + STRING *old_family = rc->family; + if (rrd_flag_is_archived(rc) && !rrd_flag_is_archived(rc_new)) + rc->family = string_dup(rc_new->family); + else + rc->family = string_2way_merge(rc->family, rc_new->family); + string_freez(old_family); + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(rc->chart_type != rc_new->chart_type) { + rc->chart_type = rc_new->chart_type; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(rc->priority != rc_new->priority) { + rc->priority = rc_new->priority; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + rrd_flag_set(rc, rc_new->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS); // no need for atomics on rc_new + + if(rrd_flag_is_collected(rc) && rrd_flag_is_archived(rc)) + rrd_flag_set_collected(rc); + + if(rrd_flag_is_updated(rc)) + rrd_flag_set(rc, RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT); + + rrdcontext_unlock(rc); + + // free the resources of the new one + rrdcontext_freez(rc_new); + + // the react callback will continue from here + return rrd_flag_is_updated(rc); +} + +static void rrdcontext_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdhost __maybe_unused) { + RRDCONTEXT *rc = (RRDCONTEXT *)value; + rrdcontext_trigger_updates(rc, __FUNCTION__ ); +} + +void rrdcontext_trigger_updates(RRDCONTEXT *rc, const char *function) { + if(rrd_flag_is_updated(rc) || !rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) + rrdcontext_queue_for_post_processing(rc, function, rc->flags); +} + +static void rrdcontext_hub_queue_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { + RRDCONTEXT *rc = context; + rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_HUB); + rc->queue.queued_ut = now_realtime_usec(); + rc->queue.queued_flags = rrd_flags_get(rc); +} + +static void rrdcontext_hub_queue_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { + RRDCONTEXT *rc = context; + rrd_flag_clear(rc, RRD_FLAG_QUEUED_FOR_HUB); +} + +static bool rrdcontext_hub_queue_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *new_context __maybe_unused, void *nothing __maybe_unused) { + // context and new_context are the same + // we just need to update the timings + RRDCONTEXT *rc = context; + rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_HUB); + rc->queue.queued_ut = now_realtime_usec(); + rc->queue.queued_flags |= rrd_flags_get(rc); + + return true; +} + +static void rrdcontext_post_processing_queue_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { + RRDCONTEXT *rc = context; + rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_PP); + rc->pp.queued_flags = rc->flags; + rc->pp.queued_ut = now_realtime_usec(); +} + +static void rrdcontext_post_processing_queue_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { + RRDCONTEXT *rc = context; + rrd_flag_clear(rc, RRD_FLAG_QUEUED_FOR_PP); + rc->pp.dequeued_ut = now_realtime_usec(); +} + +static bool rrdcontext_post_processing_queue_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *new_context __maybe_unused, void *nothing __maybe_unused) { + RRDCONTEXT *rc = context; + bool changed = false; + + if(!(rc->flags & RRD_FLAG_QUEUED_FOR_PP)) { + rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_PP); + changed = true; + } + + if(rc->pp.queued_flags != rc->flags) { + rc->pp.queued_flags |= rc->flags; + changed = true; + } + + return changed; +} + + +void rrdhost_create_rrdcontexts(RRDHOST *host) { + if(unlikely(!host)) return; + if(likely(host->rrdctx.contexts)) return; + + host->rrdctx.contexts = dictionary_create_advanced( + DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdcontext, sizeof(RRDCONTEXT)); + + dictionary_register_insert_callback(host->rrdctx.contexts, rrdcontext_insert_callback, host); + dictionary_register_delete_callback(host->rrdctx.contexts, rrdcontext_delete_callback, host); + dictionary_register_conflict_callback(host->rrdctx.contexts, rrdcontext_conflict_callback, host); + dictionary_register_react_callback(host->rrdctx.contexts, rrdcontext_react_callback, host); + + host->rrdctx.hub_queue = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE, &dictionary_stats_category_rrdcontext, 0); + dictionary_register_insert_callback(host->rrdctx.hub_queue, rrdcontext_hub_queue_insert_callback, NULL); + dictionary_register_delete_callback(host->rrdctx.hub_queue, rrdcontext_hub_queue_delete_callback, NULL); + dictionary_register_conflict_callback(host->rrdctx.hub_queue, rrdcontext_hub_queue_conflict_callback, NULL); + + host->rrdctx.pp_queue = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE, &dictionary_stats_category_rrdcontext, 0); + dictionary_register_insert_callback(host->rrdctx.pp_queue, rrdcontext_post_processing_queue_insert_callback, NULL); + dictionary_register_delete_callback(host->rrdctx.pp_queue, rrdcontext_post_processing_queue_delete_callback, NULL); + dictionary_register_conflict_callback(host->rrdctx.pp_queue, rrdcontext_post_processing_queue_conflict_callback, NULL); +} + +void rrdhost_destroy_rrdcontexts(RRDHOST *host) { + if(unlikely(!host)) return; + if(unlikely(!host->rrdctx.contexts)) return; + + DICTIONARY *old; + + if(host->rrdctx.hub_queue) { + old = host->rrdctx.hub_queue; + host->rrdctx.hub_queue = NULL; + + RRDCONTEXT *rc; + dfe_start_write(old, rc) { + dictionary_del(old, string2str(rc->id)); + } + dfe_done(rc); + dictionary_destroy(old); + } + + if(host->rrdctx.pp_queue) { + old = host->rrdctx.pp_queue; + host->rrdctx.pp_queue = NULL; + + RRDCONTEXT *rc; + dfe_start_write(old, rc) { + dictionary_del(old, string2str(rc->id)); + } + dfe_done(rc); + dictionary_destroy(old); + } + + old = host->rrdctx.contexts; + host->rrdctx.contexts = NULL; + dictionary_destroy(old); +} + diff --git a/database/contexts/instance.c b/database/contexts/instance.c new file mode 100644 index 00000000..665022af --- /dev/null +++ b/database/contexts/instance.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +// ---------------------------------------------------------------------------- +// helper one-liners for RRDINSTANCE + +bool rrdinstance_acquired_id_and_name_are_same(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return ri->id == ri->name; +} + +inline const char *rrdinstance_acquired_id(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return string2str(ri->id); +} + +inline const char *rrdinstance_acquired_name(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return string2str(ri->name); +} + +inline bool rrdinstance_acquired_has_name(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return (ri->name && ri->name != ri->id); +} + +inline const char *rrdinstance_acquired_units(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return string2str(ri->units); +} + +inline STRING *rrdinstance_acquired_units_dup(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return string_dup(ri->units); +} + +inline DICTIONARY *rrdinstance_acquired_labels(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return ri->rrdlabels; +} + +inline DICTIONARY *rrdinstance_acquired_functions(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + if(!ri->rrdset) return NULL; + return ri->rrdset->functions_view; +} + +inline RRDHOST *rrdinstance_acquired_rrdhost(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return ri->rc->rrdhost; +} + +inline bool rrdinstance_acquired_belongs_to_context(RRDINSTANCE_ACQUIRED *ria, RRDCONTEXT_ACQUIRED *rca) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return ri->rc == rc; +} + +inline time_t rrdinstance_acquired_update_every(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return ri->update_every_s; +} + +// ---------------------------------------------------------------------------- +// RRDINSTANCE + +static void rrdinstance_free(RRDINSTANCE *ri) { + + if(rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) + dictionary_destroy(ri->rrdlabels); + + rrdmetrics_destroy_from_rrdinstance(ri); + string_freez(ri->id); + string_freez(ri->name); + string_freez(ri->title); + string_freez(ri->units); + string_freez(ri->family); + + ri->id = NULL; + ri->name = NULL; + ri->title = NULL; + ri->units = NULL; + ri->family = NULL; + ri->rc = NULL; + ri->rrdlabels = NULL; + ri->rrdmetrics = NULL; + ri->rrdset = NULL; +} + +static void rrdinstance_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext) { + RRDINSTANCE *ri = value; + + // link it to its parent + ri->rc = rrdcontext; + + ri->flags = ri->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS; // no need for atomics + + if(!ri->name) + ri->name = string_dup(ri->id); + + if(ri->rrdset) { + ri->rrdlabels = ri->rrdset->rrdlabels; + ri->flags &= ~RRD_FLAG_OWN_LABELS; // no need of atomics at the constructor + } + else { + ri->rrdlabels = rrdlabels_create(); + ri->flags |= RRD_FLAG_OWN_LABELS; // no need of atomics at the constructor + } + + if(ri->rrdset) { + if(unlikely(rrdset_flag_check(ri->rrdset, RRDSET_FLAG_HIDDEN))) + ri->flags |= RRD_FLAG_HIDDEN; // no need of atomics at the constructor + else + ri->flags &= ~RRD_FLAG_HIDDEN; // no need of atomics at the constructor + } + + rrdmetrics_create_in_rrdinstance(ri); + + // signal the react callback to do the job + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); +} + +static void rrdinstance_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext __maybe_unused) { + RRDINSTANCE *ri = (RRDINSTANCE *)value; + + internal_error(ri->rrdset, "RRDINSTANCE: '%s' is freed but there is a RRDSET linked to it.", string2str(ri->id)); + + rrdinstance_free(ri); +} + +static bool rrdinstance_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *rrdcontext __maybe_unused) { + RRDINSTANCE *ri = (RRDINSTANCE *)old_value; + RRDINSTANCE *ri_new = (RRDINSTANCE *)new_value; + + internal_error(ri->id != ri_new->id, + "RRDINSTANCE: '%s' cannot change id to '%s'", + string2str(ri->id), string2str(ri_new->id)); + + if(uuid_memcmp(&ri->uuid, &ri_new->uuid) != 0) { +#ifdef NETDATA_INTERNAL_CHECKS + char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; + uuid_unparse(ri->uuid, uuid1); + uuid_unparse(ri_new->uuid, uuid2); + internal_error(true, "RRDINSTANCE: '%s' of host '%s' changed UUID from '%s' to '%s'", + string2str(ri->id), rrdhost_hostname(ri->rc->rrdhost), uuid1, uuid2); +#endif + + uuid_copy(ri->uuid, ri_new->uuid); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->rrdset && ri_new->rrdset && ri->rrdset != ri_new->rrdset) { + ri->rrdset = ri_new->rrdset; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LINKING); + } + +#ifdef NETDATA_INTERNAL_CHECKS + if(ri->rrdset && uuid_memcmp(&ri->uuid, &ri->rrdset->chart_uuid) != 0) { + char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; + uuid_unparse(ri->uuid, uuid1); + uuid_unparse(ri->rrdset->chart_uuid, uuid2); + internal_error(true, "RRDINSTANCE: '%s' is linked to RRDSET '%s' but they have different UUIDs. RRDINSTANCE has '%s', RRDSET has '%s'", string2str(ri->id), rrdset_id(ri->rrdset), uuid1, uuid2); + } +#endif + + if(ri->name != ri_new->name) { + STRING *old = ri->name; + ri->name = string_dup(ri_new->name); + string_freez(old); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->title != ri_new->title) { + STRING *old = ri->title; + ri->title = string_dup(ri_new->title); + string_freez(old); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->units != ri_new->units) { + STRING *old = ri->units; + ri->units = string_dup(ri_new->units); + string_freez(old); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->family != ri_new->family) { + STRING *old = ri->family; + ri->family = string_dup(ri_new->family); + string_freez(old); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->chart_type != ri_new->chart_type) { + ri->chart_type = ri_new->chart_type; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->priority != ri_new->priority) { + ri->priority = ri_new->priority; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->update_every_s != ri_new->update_every_s) { + ri->update_every_s = ri_new->update_every_s; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(ri->rrdset != ri_new->rrdset) { + ri->rrdset = ri_new->rrdset; + + if(ri->rrdset && rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) { + DICTIONARY *old = ri->rrdlabels; + ri->rrdlabels = ri->rrdset->rrdlabels; + rrd_flag_clear(ri, RRD_FLAG_OWN_LABELS); + rrdlabels_destroy(old); + } + else if(!ri->rrdset && !rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) { + ri->rrdlabels = rrdlabels_create(); + rrd_flag_set(ri, RRD_FLAG_OWN_LABELS); + } + } + + if(ri->rrdset) { + if(unlikely(rrdset_flag_check(ri->rrdset, RRDSET_FLAG_HIDDEN))) + rrd_flag_set(ri, RRD_FLAG_HIDDEN); + else + rrd_flag_clear(ri, RRD_FLAG_HIDDEN); + } + + rrd_flag_set(ri, ri_new->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS); // no need for atomics on ri_new + + if(rrd_flag_is_collected(ri) && rrd_flag_is_archived(ri)) + rrd_flag_set_collected(ri); + + if(rrd_flag_is_updated(ri)) + rrd_flag_set(ri, RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT); + + // free the new one + rrdinstance_free(ri_new); + + // the react callback will continue from here + return rrd_flag_is_updated(ri); +} + +static void rrdinstance_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext __maybe_unused) { + RRDINSTANCE *ri = value; + + rrdinstance_trigger_updates(ri, __FUNCTION__ ); +} + +void rrdinstances_create_in_rrdcontext(RRDCONTEXT *rc) { + if(unlikely(!rc || rc->rrdinstances)) return; + + rc->rrdinstances = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdcontext, sizeof(RRDINSTANCE)); + + dictionary_register_insert_callback(rc->rrdinstances, rrdinstance_insert_callback, rc); + dictionary_register_delete_callback(rc->rrdinstances, rrdinstance_delete_callback, rc); + dictionary_register_conflict_callback(rc->rrdinstances, rrdinstance_conflict_callback, rc); + dictionary_register_react_callback(rc->rrdinstances, rrdinstance_react_callback, rc); +} + +void rrdinstances_destroy_from_rrdcontext(RRDCONTEXT *rc) { + if(unlikely(!rc || !rc->rrdinstances)) return; + + dictionary_destroy(rc->rrdinstances); + rc->rrdinstances = NULL; +} + +void rrdinstance_trigger_updates(RRDINSTANCE *ri, const char *function) { + RRDSET *st = ri->rrdset; + + if(likely(st)) { + if(unlikely((unsigned int) st->priority != ri->priority)) { + ri->priority = st->priority; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + if(unlikely(st->update_every != ri->update_every_s)) { + ri->update_every_s = st->update_every; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + } + else if(unlikely(rrd_flag_is_collected(ri))) { + // there is no rrdset, but we have it as collected! + + rrd_flag_set_archived(ri); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LINKING); + } + + if(rrd_flag_is_updated(ri) || !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION)) { + rrd_flag_set_updated(ri->rc, RRD_FLAG_UPDATE_REASON_TRIGGERED); + rrdcontext_queue_for_post_processing(ri->rc, function, ri->flags); + } +} + +// ---------------------------------------------------------------------------- +// RRDINSTANCE HOOKS ON RRDSET + +inline void rrdinstance_from_rrdset(RRDSET *st) { + RRDCONTEXT trc = { + .id = string_dup(st->context), + .title = string_dup(st->title), + .units = string_dup(st->units), + .family = string_dup(st->family), + .priority = st->priority, + .chart_type = st->chart_type, + .flags = RRD_FLAG_NONE, // no need for atomics + .rrdhost = st->rrdhost, + }; + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_set_and_acquire_item(st->rrdhost->rrdctx.contexts, string2str(trc.id), &trc, sizeof(trc)); + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + RRDINSTANCE tri = { + .id = string_dup(st->id), + .name = string_dup(st->name), + .units = string_dup(st->units), + .family = string_dup(st->family), + .title = string_dup(st->title), + .chart_type = st->chart_type, + .priority = st->priority, + .update_every_s = st->update_every, + .flags = RRD_FLAG_NONE, // no need for atomics + .rrdset = st, + }; + uuid_copy(tri.uuid, st->chart_uuid); + + RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_set_and_acquire_item(rc->rrdinstances, string2str(tri.id), &tri, sizeof(tri)); + + RRDCONTEXT_ACQUIRED *rca_old = st->rrdcontext; + RRDINSTANCE_ACQUIRED *ria_old = st->rrdinstance; + + st->rrdcontext = rca; + st->rrdinstance = ria; + + if(rca == rca_old) { + rrdcontext_release(rca_old); + rca_old = NULL; + } + + if(ria == ria_old) { + rrdinstance_release(ria_old); + ria_old = NULL; + } + + if(rca_old && ria_old) { + // Oops! The chart changed context! + + // RRDCONTEXT *rc_old = rrdcontext_acquired_value(rca_old); + RRDINSTANCE *ri_old = rrdinstance_acquired_value(ria_old); + + // migrate all dimensions to the new metrics + RRDDIM *rd; + rrddim_foreach_read(rd, st) { + if (!rd->rrdmetric) continue; + + RRDMETRIC *rm_old = rrdmetric_acquired_value(rd->rrdmetric); + rrd_flags_replace(rm_old, RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + rm_old->rrddim = NULL; + rm_old->first_time_s = 0; + rm_old->last_time_s = 0; + + rrdmetric_release(rd->rrdmetric); + rd->rrdmetric = NULL; + + rrdmetric_from_rrddim(rd); + } + rrddim_foreach_done(rd); + + // mark the old instance, ready to be deleted + if(!rrd_flag_check(ri_old, RRD_FLAG_OWN_LABELS)) + ri_old->rrdlabels = rrdlabels_create(); + + rrd_flags_replace(ri_old, RRD_FLAG_OWN_LABELS|RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + ri_old->rrdset = NULL; + ri_old->first_time_s = 0; + ri_old->last_time_s = 0; + + rrdinstance_trigger_updates(ri_old, __FUNCTION__ ); + rrdinstance_release(ria_old); + + /* + // trigger updates on the old context + if(!dictionary_entries(rc_old->rrdinstances) && !dictionary_stats_referenced_items(rc_old->rrdinstances)) { + rrdcontext_lock(rc_old); + rc_old->flags = ((rc_old->flags & RRD_FLAG_QUEUED)?RRD_FLAG_QUEUED:RRD_FLAG_NONE)|RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION; + rc_old->first_time_s = 0; + rc_old->last_time_s = 0; + rrdcontext_unlock(rc_old); + rrdcontext_trigger_updates(rc_old, __FUNCTION__ ); + } + else + rrdcontext_trigger_updates(rc_old, __FUNCTION__ ); + */ + + rrdcontext_release(rca_old); + rca_old = NULL; + ria_old = NULL; + } + + if(rca_old || ria_old) + fatal("RRDCONTEXT: cannot switch rrdcontext without switching rrdinstance too"); +} + +#define rrdset_get_rrdinstance(st) rrdset_get_rrdinstance_with_trace(st, __FUNCTION__); +static inline RRDINSTANCE *rrdset_get_rrdinstance_with_trace(RRDSET *st, const char *function) { + if(unlikely(!st->rrdinstance)) { + error("RRDINSTANCE: RRDSET '%s' is not linked to an RRDINSTANCE at %s()", rrdset_id(st), function); + return NULL; + } + + RRDINSTANCE *ri = rrdinstance_acquired_value(st->rrdinstance); + if(unlikely(!ri)) { + error("RRDINSTANCE: RRDSET '%s' lost its link to an RRDINSTANCE at %s()", rrdset_id(st), function); + return NULL; + } + + if(unlikely(ri->rrdset != st)) + fatal("RRDINSTANCE: '%s' is not linked to RRDSET '%s' at %s()", string2str(ri->id), rrdset_id(st), function); + + return ri; +} + +inline void rrdinstance_rrdset_is_freed(RRDSET *st) { + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); + if(unlikely(!ri)) return; + + rrd_flag_set_archived(ri); + + if(!rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) { + ri->rrdlabels = rrdlabels_create(); + rrdlabels_copy(ri->rrdlabels, st->rrdlabels); + rrd_flag_set(ri, RRD_FLAG_OWN_LABELS); + } + + ri->rrdset = NULL; + + rrdinstance_trigger_updates(ri, __FUNCTION__ ); + + rrdinstance_release(st->rrdinstance); + st->rrdinstance = NULL; + + rrdcontext_release(st->rrdcontext); + st->rrdcontext = NULL; +} + +inline void rrdinstance_rrdset_has_updated_retention(RRDSET *st) { + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); + if(unlikely(!ri)) return; + + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION); + rrdinstance_trigger_updates(ri, __FUNCTION__ ); +} + +inline void rrdinstance_updated_rrdset_name(RRDSET *st) { + // the chart may not be initialized when this is called + if(unlikely(!st->rrdinstance)) return; + + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); + if(unlikely(!ri)) return; + + if(st->name != ri->name) { + STRING *old = ri->name; + ri->name = string_dup(st->name); + string_freez(old); + + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + rrdinstance_trigger_updates(ri, __FUNCTION__ ); + } +} + +inline void rrdinstance_updated_rrdset_flags_no_action(RRDINSTANCE *ri, RRDSET *st) { + if(unlikely(ri->rrdset != st)) + fatal("RRDCONTEXT: instance '%s' is not linked to chart '%s' on host '%s'", + string2str(ri->id), rrdset_id(st), rrdhost_hostname(st->rrdhost)); + + bool st_is_hidden = rrdset_flag_check(st, RRDSET_FLAG_HIDDEN); + bool ri_is_hidden = rrd_flag_check(ri, RRD_FLAG_HIDDEN); + + if(unlikely(st_is_hidden != ri_is_hidden)) { + if (unlikely(st_is_hidden && !ri_is_hidden)) + rrd_flag_set_updated(ri, RRD_FLAG_HIDDEN | RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + + else if (unlikely(!st_is_hidden && ri_is_hidden)) { + rrd_flag_clear(ri, RRD_FLAG_HIDDEN); + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + } +} + +inline void rrdinstance_updated_rrdset_flags(RRDSET *st) { + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); + if(unlikely(!ri)) return; + + if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED|RRDSET_FLAG_OBSOLETE))) + rrd_flag_set_archived(ri); + + rrdinstance_updated_rrdset_flags_no_action(ri, st); + + rrdinstance_trigger_updates(ri, __FUNCTION__ ); +} + +inline void rrdinstance_collected_rrdset(RRDSET *st) { + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); + if(unlikely(!ri)) { + rrdcontext_updated_rrdset(st); + ri = rrdset_get_rrdinstance(st); + if(unlikely(!ri)) + return; + } + + rrdinstance_updated_rrdset_flags_no_action(ri, st); + + if(unlikely(ri->internal.collected_metrics_count && !rrd_flag_is_collected(ri))) + rrd_flag_set_collected(ri); + + // we use this variable to detect BEGIN/END without SET + ri->internal.collected_metrics_count = 0; + + rrdinstance_trigger_updates(ri, __FUNCTION__ ); +} + diff --git a/database/contexts/internal.h b/database/contexts/internal.h new file mode 100644 index 00000000..9917d58e --- /dev/null +++ b/database/contexts/internal.h @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_RRDCONTEXT_INTERNAL_H +#define NETDATA_RRDCONTEXT_INTERNAL_H 1 + +#include "rrdcontext.h" +#include "../sqlite/sqlite_context.h" +#include "../../aclk/schema-wrappers/context.h" +#include "../../aclk/aclk_contexts_api.h" +#include "../../aclk/aclk.h" +#include "../storage_engine.h" + +#define MESSAGES_PER_BUNDLE_TO_SEND_TO_HUB_PER_HOST 5000 +#define FULL_RETENTION_SCAN_DELAY_AFTER_DB_ROTATION_SECS 120 +#define RRDCONTEXT_WORKER_THREAD_HEARTBEAT_USEC (1000 * USEC_PER_MS) +#define RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY 10 + +#define LOG_TRANSITIONS false + +#define WORKER_JOB_HOSTS 1 +#define WORKER_JOB_CHECK 2 +#define WORKER_JOB_SEND 3 +#define WORKER_JOB_DEQUEUE 4 +#define WORKER_JOB_RETENTION 5 +#define WORKER_JOB_QUEUED 6 +#define WORKER_JOB_CLEANUP 7 +#define WORKER_JOB_CLEANUP_DELETE 8 +#define WORKER_JOB_PP_METRIC 9 // post-processing metrics +#define WORKER_JOB_PP_INSTANCE 10 // post-processing instances +#define WORKER_JOB_PP_CONTEXT 11 // post-processing contexts +#define WORKER_JOB_HUB_QUEUE_SIZE 12 +#define WORKER_JOB_PP_QUEUE_SIZE 13 + + +typedef enum __attribute__ ((__packed__)) { + RRD_FLAG_NONE = 0, + RRD_FLAG_DELETED = (1 << 0), // this is a deleted object (metrics, instances, contexts) + RRD_FLAG_COLLECTED = (1 << 1), // this object is currently being collected + RRD_FLAG_UPDATED = (1 << 2), // this object has updates to propagate + RRD_FLAG_ARCHIVED = (1 << 3), // this object is not currently being collected + RRD_FLAG_OWN_LABELS = (1 << 4), // this instance has its own labels - not linked to an RRDSET + RRD_FLAG_LIVE_RETENTION = (1 << 5), // we have got live retention from the database + RRD_FLAG_QUEUED_FOR_HUB = (1 << 6), // this context is currently queued to be dispatched to hub + RRD_FLAG_QUEUED_FOR_PP = (1 << 7), // this context is currently queued to be post-processed + RRD_FLAG_HIDDEN = (1 << 8), // don't expose this to the hub or the API + + RRD_FLAG_UPDATE_REASON_TRIGGERED = (1 << 9), // the update was triggered by the child object + RRD_FLAG_UPDATE_REASON_LOAD_SQL = (1 << 10), // this object has just been loaded from SQL + RRD_FLAG_UPDATE_REASON_NEW_OBJECT = (1 << 11), // this object has just been created + RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT = (1 << 12), // we received an update on this object + RRD_FLAG_UPDATE_REASON_CHANGED_LINKING = (1 << 13), // an instance or a metric switched RRDSET or RRDDIM + RRD_FLAG_UPDATE_REASON_CHANGED_METADATA = (1 << 14), // this context or instance changed uuid, name, units, title, family, chart type, priority, update every, rrd changed flags + RRD_FLAG_UPDATE_REASON_ZERO_RETENTION = (1 << 15), // this object has no retention + RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T = (1 << 16), // this object changed its oldest time in the db + RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T = (1 << 17), // this object change its latest time in the db + RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED = (1 << 18), // this object has stopped being collected + RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED = (1 << 19), // this object has started being collected + RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD = (1 << 20), // this context belongs to a host that just disconnected + RRD_FLAG_UPDATE_REASON_UNUSED = (1 << 21), // this context is not used anymore + RRD_FLAG_UPDATE_REASON_DB_ROTATION = (1 << 22), // this context changed because of a db rotation + + // action to perform on an object + RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION = (1 << 30), // this object has to update its retention from the db +} RRD_FLAGS; + +struct rrdcontext_reason { + RRD_FLAGS flag; + const char *name; + usec_t delay_ut; +}; + +extern struct rrdcontext_reason rrdcontext_reasons[]; + +#define RRD_FLAG_ALL_UPDATE_REASONS ( \ + RRD_FLAG_UPDATE_REASON_TRIGGERED \ + |RRD_FLAG_UPDATE_REASON_LOAD_SQL \ + |RRD_FLAG_UPDATE_REASON_NEW_OBJECT \ + |RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT \ + |RRD_FLAG_UPDATE_REASON_CHANGED_LINKING \ + |RRD_FLAG_UPDATE_REASON_CHANGED_METADATA \ + |RRD_FLAG_UPDATE_REASON_ZERO_RETENTION \ + |RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T \ + |RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T \ + |RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED \ + |RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED \ + |RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD \ + |RRD_FLAG_UPDATE_REASON_DB_ROTATION \ + |RRD_FLAG_UPDATE_REASON_UNUSED \ + ) + +#define RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS ( \ + RRD_FLAG_ARCHIVED \ + |RRD_FLAG_HIDDEN \ + |RRD_FLAG_ALL_UPDATE_REASONS \ + ) + +#define RRD_FLAGS_REQUIRED_FOR_DELETIONS ( \ + RRD_FLAG_DELETED \ + |RRD_FLAG_LIVE_RETENTION \ +) + +#define RRD_FLAGS_PREVENTING_DELETIONS ( \ + RRD_FLAG_QUEUED_FOR_HUB \ + |RRD_FLAG_COLLECTED \ + |RRD_FLAG_QUEUED_FOR_PP \ +) + +// get all the flags of an object +#define rrd_flags_get(obj) __atomic_load_n(&((obj)->flags), __ATOMIC_SEQ_CST) + +// check if ANY of the given flags (bits) is set +#define rrd_flag_check(obj, flag) (rrd_flags_get(obj) & (flag)) + +// check if ALL the given flags (bits) are set +#define rrd_flag_check_all(obj, flag) (rrd_flag_check(obj, flag) == (flag)) + +// set one or more flags (bits) +#define rrd_flag_set(obj, flag) __atomic_or_fetch(&((obj)->flags), flag, __ATOMIC_SEQ_CST) + +// clear one or more flags (bits) +#define rrd_flag_clear(obj, flag) __atomic_and_fetch(&((obj)->flags), ~(flag), __ATOMIC_SEQ_CST) + +// replace the flags of an object, with the supplied ones +#define rrd_flags_replace(obj, all_flags) __atomic_store_n(&((obj)->flags), all_flags, __ATOMIC_SEQ_CST) + +static inline void +rrd_flag_add_remove_atomic(RRD_FLAGS *flags, RRD_FLAGS check, RRD_FLAGS conditionally_add, RRD_FLAGS always_remove) { + RRD_FLAGS expected, desired; + + do { + expected = *flags; + + desired = expected; + desired &= ~(always_remove); + + if(!(expected & check)) + desired |= (check | conditionally_add); + + } while(!__atomic_compare_exchange_n(flags, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)); +} + +#define rrd_flag_set_collected(obj) \ + rrd_flag_add_remove_atomic(&((obj)->flags) \ + /* check this flag */ \ + , RRD_FLAG_COLLECTED \ + \ + /* add these flags together with the above, if the above is not already set */ \ + , RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED | RRD_FLAG_UPDATED \ + \ + /* always remove these flags */ \ + , RRD_FLAG_ARCHIVED \ + | RRD_FLAG_DELETED \ + | RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED \ + | RRD_FLAG_UPDATE_REASON_ZERO_RETENTION \ + | RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD \ + ) + +#define rrd_flag_set_archived(obj) \ + rrd_flag_add_remove_atomic(&((obj)->flags) \ + /* check this flag */ \ + , RRD_FLAG_ARCHIVED \ + \ + /* add these flags together with the above, if the above is not already set */ \ + , RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED | RRD_FLAG_UPDATED \ + \ + /* always remove these flags */ \ + , RRD_FLAG_COLLECTED \ + | RRD_FLAG_DELETED \ + | RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED \ + | RRD_FLAG_UPDATE_REASON_ZERO_RETENTION \ + ) + +#define rrd_flag_set_deleted(obj, reason) \ + rrd_flag_add_remove_atomic(&((obj)->flags) \ + /* check this flag */ \ + , RRD_FLAG_DELETED \ + \ + /* add these flags together with the above, if the above is not already set */ \ + , RRD_FLAG_UPDATE_REASON_ZERO_RETENTION | RRD_FLAG_UPDATED | (reason) \ + \ + /* always remove these flags */ \ + , RRD_FLAG_ARCHIVED \ + | RRD_FLAG_COLLECTED \ + ) + +#define rrd_flag_is_collected(obj) rrd_flag_check(obj, RRD_FLAG_COLLECTED) +#define rrd_flag_is_archived(obj) rrd_flag_check(obj, RRD_FLAG_ARCHIVED) +#define rrd_flag_is_deleted(obj) rrd_flag_check(obj, RRD_FLAG_DELETED) +#define rrd_flag_is_updated(obj) rrd_flag_check(obj, RRD_FLAG_UPDATED) + +// mark an object as updated, providing reasons (additional bits) +#define rrd_flag_set_updated(obj, reason) rrd_flag_set(obj, RRD_FLAG_UPDATED | (reason)) + +// clear an object as being updated, clearing also all the reasons +#define rrd_flag_unset_updated(obj) rrd_flag_clear(obj, RRD_FLAG_UPDATED | RRD_FLAG_ALL_UPDATE_REASONS) + + +typedef struct rrdmetric { + uuid_t uuid; + + STRING *id; + STRING *name; + + RRDDIM *rrddim; + + time_t first_time_s; + time_t last_time_s; + RRD_FLAGS flags; + + struct rrdinstance *ri; +} RRDMETRIC; + +typedef struct rrdinstance { + uuid_t uuid; + + STRING *id; + STRING *name; + STRING *title; + STRING *units; + STRING *family; + uint32_t priority:24; + RRDSET_TYPE chart_type; + + RRD_FLAGS flags; // flags related to this instance + time_t first_time_s; + time_t last_time_s; + + time_t update_every_s; // data collection frequency + RRDSET *rrdset; // pointer to RRDSET when collected, or NULL + + DICTIONARY *rrdlabels; // linked to RRDSET->chart_labels or own version + + struct rrdcontext *rc; + DICTIONARY *rrdmetrics; + + struct { + uint32_t collected_metrics_count; // a temporary variable to detect BEGIN/END without SET + // don't use it for other purposes + // it goes up and then resets to zero, on every iteration + } internal; +} RRDINSTANCE; + +typedef struct rrdcontext { + uint64_t version; + + STRING *id; + STRING *title; + STRING *units; + STRING *family; + uint32_t priority; + RRDSET_TYPE chart_type; + + RRD_FLAGS flags; + time_t first_time_s; + time_t last_time_s; + + VERSIONED_CONTEXT_DATA hub; + + DICTIONARY *rrdinstances; + RRDHOST *rrdhost; + + struct { + RRD_FLAGS queued_flags; // the last flags that triggered the post-processing + usec_t queued_ut; // the last time this was queued + usec_t dequeued_ut; // the last time we sent (or deduplicated) this context + size_t executions; // how many times this context has been processed + } pp; + + struct { + RRD_FLAGS queued_flags; // the last flags that triggered the queueing + usec_t queued_ut; // the last time this was queued + usec_t delay_calc_ut; // the last time we calculated the scheduled_dispatched_ut + usec_t scheduled_dispatch_ut; // the time it was/is scheduled to be sent + usec_t dequeued_ut; // the last time we sent (or deduplicated) this context + size_t dispatches; // the number of times this has been dispatched to hub + } queue; + + netdata_mutex_t mutex; +} RRDCONTEXT; + + +// ---------------------------------------------------------------------------- +// helper one-liners for RRDMETRIC + +bool rrdmetric_update_retention(RRDMETRIC *rm); + +static inline RRDMETRIC *rrdmetric_acquired_value(RRDMETRIC_ACQUIRED *rma) { + return dictionary_acquired_item_value((DICTIONARY_ITEM *)rma); +} + +static inline RRDMETRIC_ACQUIRED *rrdmetric_acquired_dup(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return (RRDMETRIC_ACQUIRED *)dictionary_acquired_item_dup(rm->ri->rrdmetrics, (DICTIONARY_ITEM *)rma); +} + +static inline void rrdmetric_release(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + dictionary_acquired_item_release(rm->ri->rrdmetrics, (DICTIONARY_ITEM *)rma); +} + +void rrdmetric_rrddim_is_freed(RRDDIM *rd); +void rrdmetric_updated_rrddim_flags(RRDDIM *rd); +void rrdmetric_collected_rrddim(RRDDIM *rd); + +// ---------------------------------------------------------------------------- +// helper one-liners for RRDINSTANCE + +static inline RRDINSTANCE *rrdinstance_acquired_value(RRDINSTANCE_ACQUIRED *ria) { + return dictionary_acquired_item_value((DICTIONARY_ITEM *)ria); +} + +static inline RRDINSTANCE_ACQUIRED *rrdinstance_acquired_dup(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return (RRDINSTANCE_ACQUIRED *)dictionary_acquired_item_dup(ri->rc->rrdinstances, (DICTIONARY_ITEM *)ria); +} + +static inline void rrdinstance_release(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + dictionary_acquired_item_release(ri->rc->rrdinstances, (DICTIONARY_ITEM *)ria); +} + +void rrdinstance_from_rrdset(RRDSET *st); +void rrdinstance_rrdset_is_freed(RRDSET *st); +void rrdinstance_rrdset_has_updated_retention(RRDSET *st); +void rrdinstance_updated_rrdset_name(RRDSET *st); +void rrdinstance_updated_rrdset_flags_no_action(RRDINSTANCE *ri, RRDSET *st); +void rrdinstance_updated_rrdset_flags(RRDSET *st); +void rrdinstance_collected_rrdset(RRDSET *st); + +void rrdcontext_queue_for_post_processing(RRDCONTEXT *rc, const char *function, RRD_FLAGS flags); + +// ---------------------------------------------------------------------------- +// helper one-liners for RRDCONTEXT + +static inline RRDCONTEXT *rrdcontext_acquired_value(RRDCONTEXT_ACQUIRED *rca) { + return dictionary_acquired_item_value((DICTIONARY_ITEM *)rca); +} + +static inline RRDCONTEXT_ACQUIRED *rrdcontext_acquired_dup(RRDCONTEXT_ACQUIRED *rca) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return (RRDCONTEXT_ACQUIRED *)dictionary_acquired_item_dup(rc->rrdhost->rrdctx.contexts, (DICTIONARY_ITEM *)rca); +} + +static inline void rrdcontext_release(RRDCONTEXT_ACQUIRED *rca) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + dictionary_acquired_item_release(rc->rrdhost->rrdctx.contexts, (DICTIONARY_ITEM *)rca); +} + +// ---------------------------------------------------------------------------- +// Forward definitions + +void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs); +void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs); + +#define rrdcontext_lock(rc) netdata_mutex_lock(&((rc)->mutex)) +#define rrdcontext_unlock(rc) netdata_mutex_unlock(&((rc)->mutex)) + +void rrdinstance_trigger_updates(RRDINSTANCE *ri, const char *function); +void rrdcontext_trigger_updates(RRDCONTEXT *rc, const char *function); + +void rrdinstances_create_in_rrdcontext(RRDCONTEXT *rc); +void rrdinstances_destroy_from_rrdcontext(RRDCONTEXT *rc); + +void rrdmetrics_destroy_from_rrdinstance(RRDINSTANCE *ri); +void rrdmetrics_create_in_rrdinstance(RRDINSTANCE *ri); + +void rrdmetric_from_rrddim(RRDDIM *rd); + +void rrd_reasons_to_buffer_json_array_items(RRD_FLAGS flags, BUFFER *wb); + +#define rrdcontext_version_hash(host) rrdcontext_version_hash_with_callback(host, NULL, false, NULL) +uint64_t rrdcontext_version_hash_with_callback( + RRDHOST *host, + void (*callback)(RRDCONTEXT *, bool, void *), + bool snapshot, + void *bundle); + +void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe_unused, void *bundle __maybe_unused); + +#endif //NETDATA_RRDCONTEXT_INTERNAL_H diff --git a/database/contexts/metric.c b/database/contexts/metric.c new file mode 100644 index 00000000..80756b54 --- /dev/null +++ b/database/contexts/metric.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +static void rrdmetric_trigger_updates(RRDMETRIC *rm, const char *function); + +inline const char *rrdmetric_acquired_id(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return string2str(rm->id); +} + +inline const char *rrdmetric_acquired_name(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return string2str(rm->name); +} + +inline bool rrdmetric_acquired_has_name(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return (rm->name && rm->name != rm->id); +} + +inline STRING *rrdmetric_acquired_id_dup(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return string_dup(rm->id); +} + +inline STRING *rrdmetric_acquired_name_dup(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return string_dup(rm->name); +} + +inline NETDATA_DOUBLE rrdmetric_acquired_last_stored_value(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + + if(rm->rrddim) + return rm->rrddim->last_stored_value; + + return NAN; +} + +inline bool rrdmetric_acquired_belongs_to_instance(RRDMETRIC_ACQUIRED *rma, RRDINSTANCE_ACQUIRED *ria) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return rm->ri == ri; +} + +inline time_t rrdmetric_acquired_first_entry(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + return rm->first_time_s; +} + +inline time_t rrdmetric_acquired_last_entry(RRDMETRIC_ACQUIRED *rma) { + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + + if(rrd_flag_check(rm, RRD_FLAG_COLLECTED)) + return 0; + + return rm->last_time_s; +} + +// ---------------------------------------------------------------------------- +// RRDMETRIC + +// free the contents of RRDMETRIC. +// RRDMETRIC itself is managed by DICTIONARY - no need to free it here. +static void rrdmetric_free(RRDMETRIC *rm) { + string_freez(rm->id); + string_freez(rm->name); + + rm->id = NULL; + rm->name = NULL; + rm->ri = NULL; +} + +// called when this rrdmetric is inserted to the rrdmetrics dictionary of a rrdinstance +// the constructor of the rrdmetric object +static void rrdmetric_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdinstance) { + RRDMETRIC *rm = value; + + // link it to its parent + rm->ri = rrdinstance; + + // remove flags that we need to figure out at runtime + rm->flags = rm->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS; // no need for atomics + + // signal the react callback to do the job + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); +} + +// called when this rrdmetric is deleted from the rrdmetrics dictionary of a rrdinstance +// the destructor of the rrdmetric object +static void rrdmetric_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdinstance __maybe_unused) { + RRDMETRIC *rm = value; + + internal_error(rm->rrddim, "RRDMETRIC: '%s' is freed but there is a RRDDIM linked to it.", string2str(rm->id)); + + // free the resources + rrdmetric_free(rm); +} + +// called when the same rrdmetric is inserted again to the rrdmetrics dictionary of a rrdinstance +// while this is called, the dictionary is write locked, but there may be other users of the object +static bool rrdmetric_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *rrdinstance __maybe_unused) { + RRDMETRIC *rm = old_value; + RRDMETRIC *rm_new = new_value; + + internal_error(rm->id != rm_new->id, + "RRDMETRIC: '%s' cannot change id to '%s'", + string2str(rm->id), string2str(rm_new->id)); + + if(uuid_memcmp(&rm->uuid, &rm_new->uuid) != 0) { +#ifdef NETDATA_INTERNAL_CHECKS + char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; + uuid_unparse(rm->uuid, uuid1); + uuid_unparse(rm_new->uuid, uuid2); + + time_t old_first_time_s = 0; + time_t old_last_time_s = 0; + if(rrdmetric_update_retention(rm)) { + old_first_time_s = rm->first_time_s; + old_last_time_s = rm->last_time_s; + } + + uuid_copy(rm->uuid, rm_new->uuid); + + time_t new_first_time_s = 0; + time_t new_last_time_s = 0; + if(rrdmetric_update_retention(rm)) { + new_first_time_s = rm->first_time_s; + new_last_time_s = rm->last_time_s; + } + + internal_error(true, + "RRDMETRIC: '%s' of instance '%s' of host '%s' changed UUID from '%s' (retention %ld to %ld, %ld secs) to '%s' (retention %ld to %ld, %ld secs)" + , string2str(rm->id) + , string2str(rm->ri->id) + , rrdhost_hostname(rm->ri->rc->rrdhost) + , uuid1, old_first_time_s, old_last_time_s, old_last_time_s - old_first_time_s + , uuid2, new_first_time_s, new_last_time_s, new_last_time_s - new_first_time_s + ); +#else + uuid_copy(rm->uuid, rm_new->uuid); +#endif + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(rm->rrddim && rm_new->rrddim && rm->rrddim != rm_new->rrddim) { + rm->rrddim = rm_new->rrddim; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LINKING); + } + +#ifdef NETDATA_INTERNAL_CHECKS + if(rm->rrddim && uuid_memcmp(&rm->uuid, &rm->rrddim->metric_uuid) != 0) { + char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; + uuid_unparse(rm->uuid, uuid1); + uuid_unparse(rm_new->uuid, uuid2); + internal_error(true, "RRDMETRIC: '%s' is linked to RRDDIM '%s' but they have different UUIDs. RRDMETRIC has '%s', RRDDIM has '%s'", string2str(rm->id), rrddim_id(rm->rrddim), uuid1, uuid2); + } +#endif + + if(rm->rrddim != rm_new->rrddim) + rm->rrddim = rm_new->rrddim; + + if(rm->name != rm_new->name) { + STRING *old = rm->name; + rm->name = string_dup(rm_new->name); + string_freez(old); + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + + if(!rm->first_time_s || (rm_new->first_time_s && rm_new->first_time_s < rm->first_time_s)) { + rm->first_time_s = rm_new->first_time_s; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(!rm->last_time_s || (rm_new->last_time_s && rm_new->last_time_s > rm->last_time_s)) { + rm->last_time_s = rm_new->last_time_s; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set(rm, rm_new->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS); // no needs for atomics on rm_new + + if(rrd_flag_is_collected(rm) && rrd_flag_is_archived(rm)) + rrd_flag_set_collected(rm); + + if(rrd_flag_check(rm, RRD_FLAG_UPDATED)) + rrd_flag_set(rm, RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT); + + rrdmetric_free(rm_new); + + // the react callback will continue from here + return rrd_flag_is_updated(rm); +} + +// this is called after the insert or the conflict callbacks, +// but the dictionary is now unlocked +static void rrdmetric_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdinstance __maybe_unused) { + RRDMETRIC *rm = value; + rrdmetric_trigger_updates(rm, __FUNCTION__ ); +} + +void rrdmetrics_create_in_rrdinstance(RRDINSTANCE *ri) { + if(unlikely(!ri)) return; + if(likely(ri->rrdmetrics)) return; + + ri->rrdmetrics = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + &dictionary_stats_category_rrdcontext, sizeof(RRDMETRIC)); + + dictionary_register_insert_callback(ri->rrdmetrics, rrdmetric_insert_callback, ri); + dictionary_register_delete_callback(ri->rrdmetrics, rrdmetric_delete_callback, ri); + dictionary_register_conflict_callback(ri->rrdmetrics, rrdmetric_conflict_callback, ri); + dictionary_register_react_callback(ri->rrdmetrics, rrdmetric_react_callback, ri); +} + +void rrdmetrics_destroy_from_rrdinstance(RRDINSTANCE *ri) { + if(unlikely(!ri || !ri->rrdmetrics)) return; + dictionary_destroy(ri->rrdmetrics); + ri->rrdmetrics = NULL; +} + +// trigger post-processing of the rrdmetric, escalating changes to the rrdinstance it belongs +static void rrdmetric_trigger_updates(RRDMETRIC *rm, const char *function) { + if(unlikely(rrd_flag_is_collected(rm)) && (!rm->rrddim || rrd_flag_check(rm, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD))) + rrd_flag_set_archived(rm); + + if(rrd_flag_is_updated(rm) || !rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION)) { + rrd_flag_set_updated(rm->ri, RRD_FLAG_UPDATE_REASON_TRIGGERED); + rrdcontext_queue_for_post_processing(rm->ri->rc, function, rm->flags); + } +} + +// ---------------------------------------------------------------------------- +// RRDMETRIC HOOKS ON RRDDIM + +void rrdmetric_from_rrddim(RRDDIM *rd) { + if(unlikely(!rd->rrdset)) + fatal("RRDMETRIC: rrddim '%s' does not have a rrdset.", rrddim_id(rd)); + + if(unlikely(!rd->rrdset->rrdhost)) + fatal("RRDMETRIC: rrdset '%s' does not have a rrdhost", rrdset_id(rd->rrdset)); + + if(unlikely(!rd->rrdset->rrdinstance)) + fatal("RRDMETRIC: rrdset '%s' does not have a rrdinstance", rrdset_id(rd->rrdset)); + + RRDINSTANCE *ri = rrdinstance_acquired_value(rd->rrdset->rrdinstance); + + RRDMETRIC trm = { + .id = string_dup(rd->id), + .name = string_dup(rd->name), + .flags = RRD_FLAG_NONE, // no need for atomics + .rrddim = rd, + }; + uuid_copy(trm.uuid, rd->metric_uuid); + + RRDMETRIC_ACQUIRED *rma = (RRDMETRIC_ACQUIRED *)dictionary_set_and_acquire_item(ri->rrdmetrics, string2str(trm.id), &trm, sizeof(trm)); + + if(rd->rrdmetric) + rrdmetric_release(rd->rrdmetric); + + rd->rrdmetric = rma; +} + +#define rrddim_get_rrdmetric(rd) rrddim_get_rrdmetric_with_trace(rd, __FUNCTION__) +static inline RRDMETRIC *rrddim_get_rrdmetric_with_trace(RRDDIM *rd, const char *function) { + if(unlikely(!rd->rrdmetric)) { + error("RRDMETRIC: RRDDIM '%s' is not linked to an RRDMETRIC at %s()", rrddim_id(rd), function); + return NULL; + } + + RRDMETRIC *rm = rrdmetric_acquired_value(rd->rrdmetric); + if(unlikely(!rm)) { + error("RRDMETRIC: RRDDIM '%s' lost the link to its RRDMETRIC at %s()", rrddim_id(rd), function); + return NULL; + } + + if(unlikely(rm->rrddim != rd)) + fatal("RRDMETRIC: '%s' is not linked to RRDDIM '%s' at %s()", string2str(rm->id), rrddim_id(rd), function); + + return rm; +} + +inline void rrdmetric_rrddim_is_freed(RRDDIM *rd) { + RRDMETRIC *rm = rrddim_get_rrdmetric(rd); + if(unlikely(!rm)) return; + + if(unlikely(rrd_flag_is_collected(rm))) + rrd_flag_set_archived(rm); + + rm->rrddim = NULL; + rrdmetric_trigger_updates(rm, __FUNCTION__ ); + rrdmetric_release(rd->rrdmetric); + rd->rrdmetric = NULL; +} + +inline void rrdmetric_updated_rrddim_flags(RRDDIM *rd) { + RRDMETRIC *rm = rrddim_get_rrdmetric(rd); + if(unlikely(!rm)) return; + + if(unlikely(rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED|RRDDIM_FLAG_OBSOLETE))) { + if(unlikely(rrd_flag_is_collected(rm))) + rrd_flag_set_archived(rm); + } + + rrdmetric_trigger_updates(rm, __FUNCTION__ ); +} + +inline void rrdmetric_collected_rrddim(RRDDIM *rd) { + RRDMETRIC *rm = rrddim_get_rrdmetric(rd); + if(unlikely(!rm)) return; + + if(unlikely(!rrd_flag_is_collected(rm))) + rrd_flag_set_collected(rm); + + // we use this variable to detect BEGIN/END without SET + rm->ri->internal.collected_metrics_count++; + + rrdmetric_trigger_updates(rm, __FUNCTION__ ); +} + diff --git a/database/contexts/query_scope.c b/database/contexts/query_scope.c new file mode 100644 index 00000000..f3bcd0b3 --- /dev/null +++ b/database/contexts/query_scope.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +ssize_t query_scope_foreach_host(SIMPLE_PATTERN *scope_hosts_sp, SIMPLE_PATTERN *hosts_sp, + foreach_host_cb_t cb, void *data, + struct query_versions *versions, + char *host_node_id_str) { + char uuid[UUID_STR_LEN]; + if(!host_node_id_str) host_node_id_str = uuid; + host_node_id_str[0] = '\0'; + + RRDHOST *host; + ssize_t added = 0; + uint64_t v_hash = 0; + uint64_t h_hash = 0; + uint64_t a_hash = 0; + uint64_t t_hash = 0; + + dfe_start_read(rrdhost_root_index, host) { + if(host->node_id) + uuid_unparse_lower(*host->node_id, host_node_id_str); + else + host_node_id_str[0] = '\0'; + + SIMPLE_PATTERN_RESULT match = SP_MATCHED_POSITIVE; + if(scope_hosts_sp) { + match = simple_pattern_matches_string_extract(scope_hosts_sp, host->hostname, NULL, 0); + if(match == SP_NOT_MATCHED) { + match = simple_pattern_matches_extract(scope_hosts_sp, host->machine_guid, NULL, 0); + if(match == SP_NOT_MATCHED && *host_node_id_str) + match = simple_pattern_matches_extract(scope_hosts_sp, host_node_id_str, NULL, 0); + } + } + + if(match != SP_MATCHED_POSITIVE) + continue; + + dfe_unlock(host); + + if(hosts_sp) { + match = simple_pattern_matches_string_extract(hosts_sp, host->hostname, NULL, 0); + if(match == SP_NOT_MATCHED) { + match = simple_pattern_matches_extract(hosts_sp, host->machine_guid, NULL, 0); + if(match == SP_NOT_MATCHED && *host_node_id_str) + match = simple_pattern_matches_extract(hosts_sp, host_node_id_str, NULL, 0); + } + } + + bool queryable_host = (match == SP_MATCHED_POSITIVE); + + v_hash += dictionary_version(host->rrdctx.contexts); + h_hash += dictionary_version(host->rrdctx.hub_queue); + a_hash += dictionary_version(host->rrdcalc_root_index); + t_hash += __atomic_load_n(&host->health_transitions, __ATOMIC_RELAXED); + ssize_t ret = cb(data, host, queryable_host); + if(ret < 0) { + added = ret; + break; + } + added += ret; + } + dfe_done(host); + + if(versions) { + versions->contexts_hard_hash = v_hash; + versions->contexts_soft_hash = h_hash; + versions->alerts_hard_hash = a_hash; + versions->alerts_soft_hash = t_hash; + } + + return added; +} + +ssize_t query_scope_foreach_context(RRDHOST *host, const char *scope_contexts, SIMPLE_PATTERN *scope_contexts_sp, + SIMPLE_PATTERN *contexts_sp, foreach_context_cb_t cb, bool queryable_host, void *data) { + if(unlikely(!host->rrdctx.contexts)) + return 0; + + ssize_t added = 0; + + RRDCONTEXT_ACQUIRED *rca = NULL; + + if(scope_contexts) + rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(host->rrdctx.contexts, scope_contexts); + + if(likely(rca)) { + // we found it! + + bool queryable_context = queryable_host; + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + if(queryable_context && contexts_sp && !simple_pattern_matches_string(contexts_sp, rc->id)) + queryable_context = false; + + added = cb(data, rca, queryable_context); + + rrdcontext_release(rca); + } + else { + // Probably it is a pattern, we need to search for it... + RRDCONTEXT *rc; + dfe_start_read(host->rrdctx.contexts, rc) { + if(scope_contexts_sp && !simple_pattern_matches_string(scope_contexts_sp, rc->id)) + continue; + + dfe_unlock(rc); + + bool queryable_context = queryable_host; + if(queryable_context && contexts_sp && !simple_pattern_matches_string(contexts_sp, rc->id)) + queryable_context = false; + + ssize_t ret = cb(data, (RRDCONTEXT_ACQUIRED *)rc_dfe.item, queryable_context); + + if(ret < 0) { + added = ret; + break; + } + + added += ret; + } + dfe_done(rc); + } + + return added; +} + diff --git a/database/contexts/query_target.c b/database/contexts/query_target.c new file mode 100644 index 00000000..69386a3f --- /dev/null +++ b/database/contexts/query_target.c @@ -0,0 +1,1219 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +#define QUERY_TARGET_MAX_REALLOC_INCREASE 500 +#define query_target_realloc_size(size, start) \ + (size) ? ((size) < QUERY_TARGET_MAX_REALLOC_INCREASE ? (size) * 2 : (size) + QUERY_TARGET_MAX_REALLOC_INCREASE) : (start); + +static void query_metric_release(QUERY_TARGET *qt, QUERY_METRIC *qm); +static void query_dimension_release(QUERY_DIMENSION *qd); +static void query_instance_release(QUERY_INSTANCE *qi); +static void query_context_release(QUERY_CONTEXT *qc); +static void query_node_release(QUERY_NODE *qn); + +static __thread QUERY_TARGET *thread_qt = NULL; +static struct { + struct { + SPINLOCK spinlock; + size_t count; + QUERY_TARGET *base; + } available; + + struct { + SPINLOCK spinlock; + size_t count; + QUERY_TARGET *base; + } used; +} query_target_base = { + .available = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .base = NULL, + .count = 0, + }, + .used = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .base = NULL, + .count = 0, + }, +}; + +static void query_target_destroy(QUERY_TARGET *qt) { + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->query.size * sizeof(*qt->query.array), __ATOMIC_RELAXED); + freez(qt->query.array); + + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->dimensions.size * sizeof(*qt->dimensions.array), __ATOMIC_RELAXED); + freez(qt->dimensions.array); + + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->instances.size * sizeof(*qt->instances.array), __ATOMIC_RELAXED); + freez(qt->instances.array); + + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->contexts.size * sizeof(*qt->contexts.array), __ATOMIC_RELAXED); + freez(qt->contexts.array); + + __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->nodes.size * sizeof(*qt->nodes.array), __ATOMIC_RELAXED); + freez(qt->nodes.array); + + freez(qt); +} + +void query_target_release(QUERY_TARGET *qt) { + if(unlikely(!qt)) return; + + internal_fatal(!qt->internal.used, "QUERY TARGET: qt to be released is not used"); + + simple_pattern_free(qt->nodes.scope_pattern); + qt->nodes.scope_pattern = NULL; + + simple_pattern_free(qt->nodes.pattern); + qt->nodes.pattern = NULL; + + simple_pattern_free(qt->contexts.scope_pattern); + qt->contexts.scope_pattern = NULL; + + simple_pattern_free(qt->contexts.pattern); + qt->contexts.pattern = NULL; + + simple_pattern_free(qt->instances.pattern); + qt->instances.pattern = NULL; + + simple_pattern_free(qt->instances.chart_label_key_pattern); + qt->instances.chart_label_key_pattern = NULL; + + simple_pattern_free(qt->instances.labels_pattern); + qt->instances.labels_pattern = NULL; + + simple_pattern_free(qt->query.pattern); + qt->query.pattern = NULL; + + // release the query + for(size_t i = 0, used = qt->query.used; i < used ;i++) { + QUERY_METRIC *qm = query_metric(qt, i); + query_metric_release(qt, qm); + } + qt->query.used = 0; + + // release the dimensions + for(size_t i = 0, used = qt->dimensions.used; i < used ; i++) { + QUERY_DIMENSION *qd = query_dimension(qt, i); + query_dimension_release(qd); + } + qt->dimensions.used = 0; + + // release the instances + for(size_t i = 0, used = qt->instances.used; i < used ;i++) { + QUERY_INSTANCE *qi = query_instance(qt, i); + query_instance_release(qi); + } + qt->instances.used = 0; + + // release the contexts + for(size_t i = 0, used = qt->contexts.used; i < used ;i++) { + QUERY_CONTEXT *qc = query_context(qt, i); + rrdcontext_release(qc->rca); + qc->rca = NULL; + } + qt->contexts.used = 0; + + // release the nodes + for(size_t i = 0, used = qt->nodes.used; i < used ; i++) { + QUERY_NODE *qn = query_node(qt, i); + query_node_release(qn); + } + qt->nodes.used = 0; + + qt->db.minimum_latest_update_every_s = 0; + qt->db.first_time_s = 0; + qt->db.last_time_s = 0; + + for(size_t g = 0; g < MAX_QUERY_GROUP_BY_PASSES ;g++) + qt->group_by[g].used = 0; + + qt->id[0] = '\0'; + + netdata_spinlock_lock(&query_target_base.used.spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(query_target_base.used.base, qt, internal.prev, internal.next); + query_target_base.used.count--; + netdata_spinlock_unlock(&query_target_base.used.spinlock); + + qt->internal.used = false; + thread_qt = NULL; + + if (qt->internal.queries > 1000) { + query_target_destroy(qt); + } + else { + netdata_spinlock_lock(&query_target_base.available.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(query_target_base.available.base, qt, internal.prev, internal.next); + query_target_base.available.count++; + netdata_spinlock_unlock(&query_target_base.available.spinlock); + } +} + +static QUERY_TARGET *query_target_get(void) { + netdata_spinlock_lock(&query_target_base.available.spinlock); + QUERY_TARGET *qt = query_target_base.available.base; + if (qt) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(query_target_base.available.base, qt, internal.prev, internal.next); + query_target_base.available.count--; + } + netdata_spinlock_unlock(&query_target_base.available.spinlock); + + if(unlikely(!qt)) + qt = callocz(1, sizeof(*qt)); + + netdata_spinlock_lock(&query_target_base.used.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(query_target_base.used.base, qt, internal.prev, internal.next); + query_target_base.used.count++; + netdata_spinlock_unlock(&query_target_base.used.spinlock); + + qt->internal.used = true; + qt->internal.queries++; + thread_qt = qt; + + return qt; +} + +// this is used to release a query target from a cancelled thread +void query_target_free(void) { + query_target_release(thread_qt); +} + +// ---------------------------------------------------------------------------- +// query API + +typedef struct query_target_locals { + time_t start_s; + + QUERY_TARGET *qt; + + RRDSET *st; + + const char *scope_nodes; + const char *scope_contexts; + + const char *nodes; + const char *contexts; + const char *instances; + const char *dimensions; + const char *chart_label_key; + const char *labels; + const char *alerts; + + long long after; + long long before; + bool match_ids; + bool match_names; + + size_t metrics_skipped_due_to_not_matching_timeframe; + + char host_node_id_str[UUID_STR_LEN]; + QUERY_NODE *qn; // temp to pass on callbacks, ignore otherwise - no need to free +} QUERY_TARGET_LOCALS; + +struct storage_engine *query_metric_storage_engine(QUERY_TARGET *qt, QUERY_METRIC *qm, size_t tier) { + QUERY_NODE *qn = query_node(qt, qm->link.query_node_id); + return qn->rrdhost->db[tier].eng; +} + +static inline void query_metric_release(QUERY_TARGET *qt, QUERY_METRIC *qm) { + qm->plan.used = 0; + + // reset the tiers + for(size_t tier = 0; tier < storage_tiers ;tier++) { + if(qm->tiers[tier].db_metric_handle) { + STORAGE_ENGINE *eng = query_metric_storage_engine(qt, qm, tier); + eng->api.metric_release(qm->tiers[tier].db_metric_handle); + qm->tiers[tier].db_metric_handle = NULL; + } + } +} + +static bool query_metric_add(QUERY_TARGET_LOCALS *qtl, QUERY_NODE *qn, QUERY_CONTEXT *qc, + QUERY_INSTANCE *qi, size_t qd_slot, RRDMETRIC *rm, RRDR_DIMENSION_FLAGS options) { + QUERY_TARGET *qt = qtl->qt; + RRDINSTANCE *ri = rm->ri; + + time_t common_first_time_s = 0; + time_t common_last_time_s = 0; + time_t common_update_every_s = 0; + size_t tiers_added = 0; + + struct { + STORAGE_ENGINE *eng; + STORAGE_METRIC_HANDLE *db_metric_handle; + time_t db_first_time_s; + time_t db_last_time_s; + time_t db_update_every_s; + } tier_retention[storage_tiers]; + + for (size_t tier = 0; tier < storage_tiers; tier++) { + STORAGE_ENGINE *eng = qn->rrdhost->db[tier].eng; + tier_retention[tier].eng = eng; + tier_retention[tier].db_update_every_s = (time_t) (qn->rrdhost->db[tier].tier_grouping * ri->update_every_s); + + if(rm->rrddim && rm->rrddim->tiers[tier].db_metric_handle) + tier_retention[tier].db_metric_handle = eng->api.metric_dup(rm->rrddim->tiers[tier].db_metric_handle); + else + tier_retention[tier].db_metric_handle = eng->api.metric_get(qn->rrdhost->db[tier].instance, &rm->uuid); + + if(tier_retention[tier].db_metric_handle) { + tier_retention[tier].db_first_time_s = storage_engine_oldest_time_s(tier_retention[tier].eng->backend, tier_retention[tier].db_metric_handle); + tier_retention[tier].db_last_time_s = storage_engine_latest_time_s(tier_retention[tier].eng->backend, tier_retention[tier].db_metric_handle); + + if(!common_first_time_s) + common_first_time_s = tier_retention[tier].db_first_time_s; + else if(tier_retention[tier].db_first_time_s) + common_first_time_s = MIN(common_first_time_s, tier_retention[tier].db_first_time_s); + + if(!common_last_time_s) + common_last_time_s = tier_retention[tier].db_last_time_s; + else + common_last_time_s = MAX(common_last_time_s, tier_retention[tier].db_last_time_s); + + if(!common_update_every_s) + common_update_every_s = tier_retention[tier].db_update_every_s; + else if(tier_retention[tier].db_update_every_s) + common_update_every_s = MIN(common_update_every_s, tier_retention[tier].db_update_every_s); + + tiers_added++; + } + else { + tier_retention[tier].db_first_time_s = 0; + tier_retention[tier].db_last_time_s = 0; + tier_retention[tier].db_update_every_s = 0; + } + } + + for (size_t tier = 0; tier < storage_tiers; tier++) { + if(!qt->db.tiers[tier].update_every || (tier_retention[tier].db_update_every_s && tier_retention[tier].db_update_every_s < qt->db.tiers[tier].update_every)) + qt->db.tiers[tier].update_every = tier_retention[tier].db_update_every_s; + + if(!qt->db.tiers[tier].retention.first_time_s || (tier_retention[tier].db_first_time_s && tier_retention[tier].db_first_time_s < qt->db.tiers[tier].retention.first_time_s)) + qt->db.tiers[tier].retention.first_time_s = tier_retention[tier].db_first_time_s; + + if(!qt->db.tiers[tier].retention.last_time_s || (tier_retention[tier].db_last_time_s && tier_retention[tier].db_last_time_s > qt->db.tiers[tier].retention.last_time_s)) + qt->db.tiers[tier].retention.last_time_s = tier_retention[tier].db_last_time_s; + } + + bool timeframe_matches = + (tiers_added && + query_matches_retention(qt->window.after, qt->window.before, common_first_time_s, common_last_time_s, common_update_every_s)) + ? true : false; + + if(timeframe_matches) { + if(ri->rrdset) + ri->rrdset->last_accessed_time_s = qtl->start_s; + + if (qt->query.used == qt->query.size) { + size_t old_mem = qt->query.size * sizeof(*qt->query.array); + qt->query.size = query_target_realloc_size(qt->query.size, 4); + size_t new_mem = qt->query.size * sizeof(*qt->query.array); + qt->query.array = reallocz(qt->query.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); + } + QUERY_METRIC *qm = &qt->query.array[qt->query.used++]; + memset(qm, 0, sizeof(*qm)); + + qm->status = options; + + qm->link.query_node_id = qn->slot; + qm->link.query_context_id = qc->slot; + qm->link.query_instance_id = qi->slot; + qm->link.query_dimension_id = qd_slot; + + if (!qt->db.first_time_s || common_first_time_s < qt->db.first_time_s) + qt->db.first_time_s = common_first_time_s; + + if (!qt->db.last_time_s || common_last_time_s > qt->db.last_time_s) + qt->db.last_time_s = common_last_time_s; + + for (size_t tier = 0; tier < storage_tiers; tier++) { + internal_fatal(tier_retention[tier].eng != query_metric_storage_engine(qt, qm, tier), "QUERY TARGET: storage engine mismatch"); + qm->tiers[tier].db_metric_handle = tier_retention[tier].db_metric_handle; + qm->tiers[tier].db_first_time_s = tier_retention[tier].db_first_time_s; + qm->tiers[tier].db_last_time_s = tier_retention[tier].db_last_time_s; + qm->tiers[tier].db_update_every_s = tier_retention[tier].db_update_every_s; + } + + return true; + } + + // cleanup anything we allocated to the retention we will not use + for(size_t tier = 0; tier < storage_tiers ;tier++) { + if (tier_retention[tier].db_metric_handle) + tier_retention[tier].eng->api.metric_release(tier_retention[tier].db_metric_handle); + } + + return false; +} + +static inline bool rrdmetric_retention_matches_query(QUERY_TARGET *qt, RRDMETRIC *rm, time_t now_s) { + time_t first_time_s = rm->first_time_s; + time_t last_time_s = rrd_flag_is_collected(rm) ? now_s : rm->last_time_s; + time_t update_every_s = rm->ri->update_every_s; + return query_matches_retention(qt->window.after, qt->window.before, first_time_s, last_time_s, update_every_s); +} + +static inline void query_dimension_release(QUERY_DIMENSION *qd) { + rrdmetric_release(qd->rma); + qd->rma = NULL; +} + +static QUERY_DIMENSION *query_dimension_allocate(QUERY_TARGET *qt, RRDMETRIC_ACQUIRED *rma, QUERY_STATUS status, size_t priority) { + if(qt->dimensions.used == qt->dimensions.size) { + size_t old_mem = qt->dimensions.size * sizeof(*qt->dimensions.array); + qt->dimensions.size = query_target_realloc_size(qt->dimensions.size, 4); + size_t new_mem = qt->dimensions.size * sizeof(*qt->dimensions.array); + qt->dimensions.array = reallocz(qt->dimensions.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); + } + QUERY_DIMENSION *qd = &qt->dimensions.array[qt->dimensions.used]; + memset(qd, 0, sizeof(*qd)); + + qd->slot = qt->dimensions.used++; + qd->rma = rrdmetric_acquired_dup(rma); + qd->status = status; + qd->priority = priority; + + return qd; +} + +static bool query_dimension_add(QUERY_TARGET_LOCALS *qtl, QUERY_NODE *qn, QUERY_CONTEXT *qc, QUERY_INSTANCE *qi, + RRDMETRIC_ACQUIRED *rma, bool queryable_instance, size_t *metrics_added, size_t priority) { + QUERY_TARGET *qt = qtl->qt; + + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + if(rrd_flag_is_deleted(rm)) + return false; + + QUERY_STATUS status = QUERY_STATUS_NONE; + + bool undo = false; + if(!queryable_instance) { + if(rrdmetric_retention_matches_query(qt, rm, qtl->start_s)) { + qi->metrics.excluded++; + qc->metrics.excluded++; + qn->metrics.excluded++; + status |= QUERY_STATUS_EXCLUDED; + } + else + undo = true; + } + else { + RRDR_DIMENSION_FLAGS options = RRDR_DIMENSION_DEFAULT; + bool needed = false; + + if (qt->query.pattern) { + // the user asked for specific dimensions + + SIMPLE_PATTERN_RESULT ret = SP_NOT_MATCHED; + + if(qtl->match_ids) + ret = simple_pattern_matches_string_extract(qt->query.pattern, rm->id, NULL, 0); + + if(ret == SP_NOT_MATCHED && qtl->match_names && (rm->name != rm->id || !qtl->match_ids)) + ret = simple_pattern_matches_string_extract(qt->query.pattern, rm->name, NULL, 0); + + if(ret == SP_MATCHED_POSITIVE) { + needed = true; + options |= RRDR_DIMENSION_SELECTED | RRDR_DIMENSION_NONZERO; + } + else { + // the user selection does not match this dimension + // but, we may still need to query it + + if (query_target_needs_all_dimensions(qt)) { + // this is percentage calculation + // so, we need this dimension to calculate the percentage + needed = true; + options |= RRDR_DIMENSION_HIDDEN; + } + else { + // the user did not select this dimension + // and the calculation is not percentage + // so, no need to query it + ; + } + } + } + else { + // we don't have a dimensions pattern + // so this is a selected dimension + // if it is not hidden + + if(rrd_flag_check(rm, RRD_FLAG_HIDDEN) || (rm->rrddim && rrddim_option_check(rm->rrddim, RRDDIM_OPTION_HIDDEN))) { + // this is a hidden dimension + // we don't need to query it + status |= QUERY_STATUS_DIMENSION_HIDDEN; + options |= RRDR_DIMENSION_HIDDEN; + + if (query_target_needs_all_dimensions(qt)) { + // this is percentage calculation + // so, we need this dimension to calculate the percentage + needed = true; + } + } + else { + // this is a not hidden dimension + // and the user did not provide any selection for dimensions + // so, we need to query it + needed = true; + options |= RRDR_DIMENSION_SELECTED; + } + } + + if (needed) { + if(query_metric_add(qtl, qn, qc, qi, qt->dimensions.used, rm, options)) { + (*metrics_added)++; + + qi->metrics.selected++; + qc->metrics.selected++; + qn->metrics.selected++; + } + else { + undo = true; + qtl->metrics_skipped_due_to_not_matching_timeframe++; + } + } + else if(rrdmetric_retention_matches_query(qt, rm, qtl->start_s)) { + qi->metrics.excluded++; + qc->metrics.excluded++; + qn->metrics.excluded++; + status |= QUERY_STATUS_EXCLUDED; + } + else + undo = true; + } + + if(undo) + return false; + + query_dimension_allocate(qt, rma, status, priority); + return true; +} + +static inline STRING *rrdinstance_create_id_fqdn_v1(RRDINSTANCE_ACQUIRED *ria) { + if(unlikely(!ria)) + return NULL; + + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return string_dup(ri->id); +} + +static inline STRING *rrdinstance_create_name_fqdn_v1(RRDINSTANCE_ACQUIRED *ria) { + if(unlikely(!ria)) + return NULL; + + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return string_dup(ri->name); +} + +static inline STRING *rrdinstance_create_id_fqdn_v2(RRDINSTANCE_ACQUIRED *ria) { + if(unlikely(!ria)) + return NULL; + + char buffer[RRD_ID_LENGTH_MAX + 1]; + + RRDHOST *host = rrdinstance_acquired_rrdhost(ria); + snprintfz(buffer, RRD_ID_LENGTH_MAX, "%s@%s", rrdinstance_acquired_id(ria), host->machine_guid); + return string_strdupz(buffer); +} + +static inline STRING *rrdinstance_create_name_fqdn_v2(RRDINSTANCE_ACQUIRED *ria) { + if(unlikely(!ria)) + return NULL; + + char buffer[RRD_ID_LENGTH_MAX + 1]; + + RRDHOST *host = rrdinstance_acquired_rrdhost(ria); + snprintfz(buffer, RRD_ID_LENGTH_MAX, "%s@%s", rrdinstance_acquired_name(ria), rrdhost_hostname(host)); + return string_strdupz(buffer); +} + +inline STRING *query_instance_id_fqdn(QUERY_INSTANCE *qi, size_t version) { + if(!qi->id_fqdn) { + if (version <= 1) + qi->id_fqdn = rrdinstance_create_id_fqdn_v1(qi->ria); + else + qi->id_fqdn = rrdinstance_create_id_fqdn_v2(qi->ria); + } + + return qi->id_fqdn; +} + +inline STRING *query_instance_name_fqdn(QUERY_INSTANCE *qi, size_t version) { + if(!qi->name_fqdn) { + if (version <= 1) + qi->name_fqdn = rrdinstance_create_name_fqdn_v1(qi->ria); + else + qi->name_fqdn = rrdinstance_create_name_fqdn_v2(qi->ria); + } + + return qi->name_fqdn; +} + +RRDSET *rrdinstance_acquired_rrdset(RRDINSTANCE_ACQUIRED *ria) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + return ri->rrdset; +} + +const char *rrdcontext_acquired_units(RRDCONTEXT_ACQUIRED *rca) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return string2str(rc->units); +} + +RRDSET_TYPE rrdcontext_acquired_chart_type(RRDCONTEXT_ACQUIRED *rca) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return rc->chart_type; +} + +const char *rrdcontext_acquired_title(RRDCONTEXT_ACQUIRED *rca) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + return string2str(rc->title); +} + +static void query_target_eval_instance_rrdcalc(QUERY_TARGET_LOCALS *qtl __maybe_unused, + QUERY_NODE *qn, QUERY_CONTEXT *qc, QUERY_INSTANCE *qi) { + RRDSET *st = rrdinstance_acquired_rrdset(qi->ria); + if (st) { + netdata_rwlock_rdlock(&st->alerts.rwlock); + for (RRDCALC *rc = st->alerts.base; rc; rc = rc->next) { + switch(rc->status) { + case RRDCALC_STATUS_CLEAR: + qi->alerts.clear++; + qc->alerts.clear++; + qn->alerts.clear++; + break; + + case RRDCALC_STATUS_WARNING: + qi->alerts.warning++; + qc->alerts.warning++; + qn->alerts.warning++; + break; + + case RRDCALC_STATUS_CRITICAL: + qi->alerts.critical++; + qc->alerts.critical++; + qn->alerts.critical++; + break; + + default: + case RRDCALC_STATUS_UNINITIALIZED: + case RRDCALC_STATUS_UNDEFINED: + case RRDCALC_STATUS_REMOVED: + qi->alerts.other++; + qc->alerts.other++; + qn->alerts.other++; + break; + } + } + netdata_rwlock_unlock(&st->alerts.rwlock); + } +} + +static bool query_target_match_alert_pattern(RRDINSTANCE_ACQUIRED *ria, SIMPLE_PATTERN *pattern) { + if(!pattern) + return true; + + RRDSET *st = rrdinstance_acquired_rrdset(ria); + if (!st) + return false; + + BUFFER *wb = NULL; + bool matched = false; + netdata_rwlock_rdlock(&st->alerts.rwlock); + if (st->alerts.base) { + for (RRDCALC *rc = st->alerts.base; rc; rc = rc->next) { + SIMPLE_PATTERN_RESULT ret = simple_pattern_matches_string_extract(pattern, rc->name, NULL, 0); + + if(ret == SP_MATCHED_POSITIVE) { + matched = true; + break; + } + else if(ret == SP_MATCHED_NEGATIVE) + break; + + if (!wb) + wb = buffer_create(0, NULL); + else + buffer_flush(wb); + + buffer_fast_strcat(wb, string2str(rc->name), string_strlen(rc->name)); + buffer_fast_strcat(wb, ":", 1); + buffer_strcat(wb, rrdcalc_status2string(rc->status)); + + ret = simple_pattern_matches_buffer_extract(pattern, wb, NULL, 0); + + if(ret == SP_MATCHED_POSITIVE) { + matched = true; + break; + } + else if(ret == SP_MATCHED_NEGATIVE) + break; + } + } + netdata_rwlock_unlock(&st->alerts.rwlock); + + buffer_free(wb); + return matched; +} + +static inline void query_instance_release(QUERY_INSTANCE *qi) { + if(qi->ria) { + rrdinstance_release(qi->ria); + qi->ria = NULL; + } + + string_freez(qi->id_fqdn); + qi->id_fqdn = NULL; + + string_freez(qi->name_fqdn); + qi->name_fqdn = NULL; +} + +static inline QUERY_INSTANCE *query_instance_allocate(QUERY_TARGET *qt, RRDINSTANCE_ACQUIRED *ria, size_t qn_slot) { + if(qt->instances.used == qt->instances.size) { + size_t old_mem = qt->instances.size * sizeof(*qt->instances.array); + qt->instances.size = query_target_realloc_size(qt->instances.size, 2); + size_t new_mem = qt->instances.size * sizeof(*qt->instances.array); + qt->instances.array = reallocz(qt->instances.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); + } + QUERY_INSTANCE *qi = &qt->instances.array[qt->instances.used]; + memset(qi, 0, sizeof(*qi)); + + qi->slot = qt->instances.used; + qt->instances.used++; + qi->ria = rrdinstance_acquired_dup(ria); + qi->query_host_id = qn_slot; + + return qi; +} + +static inline SIMPLE_PATTERN_RESULT query_instance_matches(QUERY_INSTANCE *qi, + RRDINSTANCE *ri, + SIMPLE_PATTERN *instances_sp, + bool match_ids, + bool match_names, + size_t version, + char *host_node_id_str) { + SIMPLE_PATTERN_RESULT ret = SP_MATCHED_POSITIVE; + + if(instances_sp) { + ret = SP_NOT_MATCHED; + + if(match_ids) + ret = simple_pattern_matches_string_extract(instances_sp, ri->id, NULL, 0); + if (ret == SP_NOT_MATCHED && match_names && (ri->name != ri->id || !match_ids)) + ret = simple_pattern_matches_string_extract(instances_sp, ri->name, NULL, 0); + if (ret == SP_NOT_MATCHED && match_ids) + ret = simple_pattern_matches_string_extract(instances_sp, query_instance_id_fqdn(qi, version), NULL, 0); + if (ret == SP_NOT_MATCHED && match_names) + ret = simple_pattern_matches_string_extract(instances_sp, query_instance_name_fqdn(qi, version), NULL, 0); + + if (ret == SP_NOT_MATCHED && match_ids && host_node_id_str[0]) { + char buffer[RRD_ID_LENGTH_MAX + 1]; + snprintfz(buffer, RRD_ID_LENGTH_MAX, "%s@%s", rrdinstance_acquired_id(qi->ria), host_node_id_str); + ret = simple_pattern_matches_extract(instances_sp, buffer, NULL, 0); + } + } + + return ret; +} + +static inline bool query_instance_matches_labels(RRDINSTANCE *ri, SIMPLE_PATTERN *chart_label_key_sp, SIMPLE_PATTERN *labels_sp) { + if ((chart_label_key_sp && !rrdlabels_match_simple_pattern_parsed( + ri->rrdlabels, chart_label_key_sp, '\0', NULL)) || + (labels_sp && !rrdlabels_match_simple_pattern_parsed( + ri->rrdlabels, labels_sp, ':', NULL))) + return false; + + return true; +} + +static bool query_instance_add(QUERY_TARGET_LOCALS *qtl, QUERY_NODE *qn, QUERY_CONTEXT *qc, + RRDINSTANCE_ACQUIRED *ria, bool queryable_instance, bool filter_instances) { + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + if(rrd_flag_is_deleted(ri)) + return false; + + QUERY_TARGET *qt = qtl->qt; + QUERY_INSTANCE *qi = query_instance_allocate(qt, ria, qn->slot); + + if(qt->db.minimum_latest_update_every_s == 0 || ri->update_every_s < qt->db.minimum_latest_update_every_s) + qt->db.minimum_latest_update_every_s = ri->update_every_s; + + if(queryable_instance && filter_instances) + queryable_instance = (SP_MATCHED_POSITIVE == query_instance_matches( + qi, ri, qt->instances.pattern, qtl->match_ids, qtl->match_names, qt->request.version, qtl->host_node_id_str)); + + if(queryable_instance) + queryable_instance = query_instance_matches_labels(ri, qt->instances.chart_label_key_pattern, qt->instances.labels_pattern); + + if(queryable_instance) { + if(qt->instances.alerts_pattern && !query_target_match_alert_pattern(ria, qt->instances.alerts_pattern)) + queryable_instance = false; + } + + if(queryable_instance && qt->request.version >= 2) + query_target_eval_instance_rrdcalc(qtl, qn, qc, qi); + + size_t dimensions_added = 0, metrics_added = 0, priority = 0; + + if(unlikely(qt->request.rma)) { + if(query_dimension_add(qtl, qn, qc, qi, qt->request.rma, queryable_instance, &metrics_added, priority++)) + dimensions_added++; + } + else { + RRDMETRIC *rm; + dfe_start_read(ri->rrdmetrics, rm) { + if(query_dimension_add(qtl, qn, qc, qi, (RRDMETRIC_ACQUIRED *) rm_dfe.item, + queryable_instance, &metrics_added, priority++)) + dimensions_added++; + } + dfe_done(rm); + } + + if(!dimensions_added) { + qt->instances.used--; + query_instance_release(qi); + return false; + } + else { + if(metrics_added) { + qc->instances.selected++; + qn->instances.selected++; + } + else { + qc->instances.excluded++; + qn->instances.excluded++; + } + } + + return true; +} + +static inline void query_context_release(QUERY_CONTEXT *qc) { + rrdcontext_release(qc->rca); + qc->rca = NULL; +} + +static inline QUERY_CONTEXT *query_context_allocate(QUERY_TARGET *qt, RRDCONTEXT_ACQUIRED *rca) { + if(qt->contexts.used == qt->contexts.size) { + size_t old_mem = qt->contexts.size * sizeof(*qt->contexts.array); + qt->contexts.size = query_target_realloc_size(qt->contexts.size, 2); + size_t new_mem = qt->contexts.size * sizeof(*qt->contexts.array); + qt->contexts.array = reallocz(qt->contexts.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); + } + QUERY_CONTEXT *qc = &qt->contexts.array[qt->contexts.used]; + memset(qc, 0, sizeof(*qc)); + qc->slot = qt->contexts.used++; + qc->rca = rrdcontext_acquired_dup(rca); + + return qc; +} + +static ssize_t query_context_add(void *data, RRDCONTEXT_ACQUIRED *rca, bool queryable_context) { + QUERY_TARGET_LOCALS *qtl = data; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + if(rrd_flag_is_deleted(rc)) + return 0; + + QUERY_NODE *qn = qtl->qn; + QUERY_TARGET *qt = qtl->qt; + QUERY_CONTEXT *qc = query_context_allocate(qt, rca); + + ssize_t added = 0; + if(unlikely(qt->request.ria)) { + if(query_instance_add(qtl, qn, qc, qt->request.ria, queryable_context, false)) + added++; + } + else if(unlikely(qtl->st && qtl->st->rrdcontext == rca && qtl->st->rrdinstance)) { + if(query_instance_add(qtl, qn, qc, qtl->st->rrdinstance, queryable_context, false)) + added++; + } + else { + RRDINSTANCE *ri; + dfe_start_read(rc->rrdinstances, ri) { + if(query_instance_add(qtl, qn, qc, (RRDINSTANCE_ACQUIRED *) ri_dfe.item, queryable_context, true)) + added++; + } + dfe_done(ri); + } + + if(!added) { + query_context_release(qc); + qt->contexts.used--; + return 0; + } + + return added; +} + +static inline void query_node_release(QUERY_NODE *qn) { + qn->rrdhost = NULL; +} + +static inline QUERY_NODE *query_node_allocate(QUERY_TARGET *qt, RRDHOST *host) { + if(qt->nodes.used == qt->nodes.size) { + size_t old_mem = qt->nodes.size * sizeof(*qt->nodes.array); + qt->nodes.size = query_target_realloc_size(qt->nodes.size, 2); + size_t new_mem = qt->nodes.size * sizeof(*qt->nodes.array); + qt->nodes.array = reallocz(qt->nodes.array, new_mem); + + __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); + } + QUERY_NODE *qn = &qt->nodes.array[qt->nodes.used]; + memset(qn, 0, sizeof(*qn)); + + qn->slot = qt->nodes.used++; + qn->rrdhost = host; + + return qn; +} + +static ssize_t query_node_add(void *data, RRDHOST *host, bool queryable_host) { + QUERY_TARGET_LOCALS *qtl = data; + QUERY_TARGET *qt = qtl->qt; + QUERY_NODE *qn = query_node_allocate(qt, host); + + if(host->node_id) { + if(!qtl->host_node_id_str[0]) + uuid_unparse_lower(*host->node_id, qn->node_id); + else + memcpy(qn->node_id, qtl->host_node_id_str, sizeof(qn->node_id)); + } + else + qn->node_id[0] = '\0'; + + // is the chart given valid? + if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { + error("QUERY TARGET: RRDSET '%s' given, but it is not linked to rrdcontext structures. Linking it now.", rrdset_name(qtl->st)); + rrdinstance_from_rrdset(qtl->st); + + if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { + error("QUERY TARGET: RRDSET '%s' given, but failed to be linked to rrdcontext structures. Switching to context query.", + rrdset_name(qtl->st)); + + if (!is_valid_sp(qtl->instances)) + qtl->instances = rrdset_name(qtl->st); + + qtl->st = NULL; + } + } + + qtl->qn = qn; + + ssize_t added = 0; + if(unlikely(qt->request.rca)) { + if(query_context_add(qtl, qt->request.rca, true)) + added++; + } + else if(unlikely(qtl->st)) { + // single chart data queries + if(query_context_add(qtl, qtl->st->rrdcontext, true)) + added++; + } + else { + // context pattern queries + added = query_scope_foreach_context( + host, qtl->scope_contexts, + qt->contexts.scope_pattern, qt->contexts.pattern, + query_context_add, queryable_host, qtl); + + if(added < 0) + added = 0; + } + + qtl->qn = NULL; + + if(!added) { + query_node_release(qn); + qt->nodes.used--; + return false; + } + + return true; +} + +void query_target_generate_name(QUERY_TARGET *qt) { + char options_buffer[100 + 1]; + web_client_api_request_v1_data_options_to_string(options_buffer, 100, qt->request.options); + + char resampling_buffer[20 + 1] = ""; + if(qt->request.resampling_time > 1) + snprintfz(resampling_buffer, 20, "/resampling:%lld", (long long)qt->request.resampling_time); + + char tier_buffer[20 + 1] = ""; + if(qt->request.options & RRDR_OPTION_SELECTED_TIER) + snprintfz(tier_buffer, 20, "/tier:%zu", qt->request.tier); + + if(qt->request.st) + snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "chart://hosts:%s/instance:%s/dimensions:%s/after:%lld/before:%lld/points:%zu/group:%s%s/options:%s%s%s" + , rrdhost_hostname(qt->request.st->rrdhost) + , rrdset_name(qt->request.st) + , (qt->request.dimensions) ? qt->request.dimensions : "*" + , (long long)qt->request.after + , (long long)qt->request.before + , qt->request.points + , time_grouping_tostring(qt->request.time_group_method) + , qt->request.time_group_options ? qt->request.time_group_options : "" + , options_buffer + , resampling_buffer + , tier_buffer + ); + else if(qt->request.host && qt->request.rca && qt->request.ria && qt->request.rma) + snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "metric://hosts:%s/context:%s/instance:%s/dimension:%s/after:%lld/before:%lld/points:%zu/group:%s%s/options:%s%s%s" + , rrdhost_hostname(qt->request.host) + , rrdcontext_acquired_id(qt->request.rca) + , rrdinstance_acquired_id(qt->request.ria) + , rrdmetric_acquired_id(qt->request.rma) + , (long long)qt->request.after + , (long long)qt->request.before + , qt->request.points + , time_grouping_tostring(qt->request.time_group_method) + , qt->request.time_group_options ? qt->request.time_group_options : "" + , options_buffer + , resampling_buffer + , tier_buffer + ); + else if(qt->request.version >= 2) + snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "data_v2://scope_nodes:%s/scope_contexts:%s/nodes:%s/contexts:%s/instances:%s/labels:%s/dimensions:%s/after:%lld/before:%lld/points:%zu/time_group:%s%s/options:%s%s%s" + , qt->request.scope_nodes ? qt->request.scope_nodes : "*" + , qt->request.scope_contexts ? qt->request.scope_contexts : "*" + , qt->request.nodes ? qt->request.nodes : "*" + , (qt->request.contexts) ? qt->request.contexts : "*" + , (qt->request.instances) ? qt->request.instances : "*" + , (qt->request.labels) ? qt->request.labels : "*" + , (qt->request.dimensions) ? qt->request.dimensions : "*" + , (long long)qt->request.after + , (long long)qt->request.before + , qt->request.points + , time_grouping_tostring(qt->request.time_group_method) + , qt->request.time_group_options ? qt->request.time_group_options : "" + , options_buffer + , resampling_buffer + , tier_buffer + ); + else + snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "context://hosts:%s/contexts:%s/instances:%s/dimensions:%s/after:%lld/before:%lld/points:%zu/group:%s%s/options:%s%s%s" + , (qt->request.host) ? rrdhost_hostname(qt->request.host) : ((qt->request.nodes) ? qt->request.nodes : "*") + , (qt->request.contexts) ? qt->request.contexts : "*" + , (qt->request.instances) ? qt->request.instances : "*" + , (qt->request.dimensions) ? qt->request.dimensions : "*" + , (long long)qt->request.after + , (long long)qt->request.before + , qt->request.points + , time_grouping_tostring(qt->request.time_group_method) + , qt->request.time_group_options ? qt->request.time_group_options : "" + , options_buffer + , resampling_buffer + , tier_buffer + ); + + json_fix_string(qt->id); +} + +QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr) { + if(!service_running(ABILITY_DATA_QUERIES)) + return NULL; + + QUERY_TARGET *qt = query_target_get(); + + if(!qtr->received_ut) + qtr->received_ut = now_monotonic_usec(); + + qt->timings.received_ut = qtr->received_ut; + + if(qtr->nodes && !qtr->scope_nodes) + qtr->scope_nodes = qtr->nodes; + + if(qtr->contexts && !qtr->scope_contexts) + qtr->scope_contexts = qtr->contexts; + + memset(&qt->db, 0, sizeof(qt->db)); + qt->query_points = STORAGE_POINT_UNSET; + + // copy the request into query_thread_target + qt->request = *qtr; + + query_target_generate_name(qt); + qt->window.after = qt->request.after; + qt->window.before = qt->request.before; + + qt->window.options = qt->request.options; + if(query_target_has_percentage_of_instance(qt)) + qt->window.options &= ~RRDR_OPTION_PERCENTAGE; + + rrdr_relative_window_to_absolute(&qt->window.after, &qt->window.before, &qt->window.now); + + // prepare our local variables - we need these across all these functions + QUERY_TARGET_LOCALS qtl = { + .qt = qt, + .start_s = now_realtime_sec(), + .st = qt->request.st, + .scope_nodes = qt->request.scope_nodes, + .scope_contexts = qt->request.scope_contexts, + .nodes = qt->request.nodes, + .contexts = qt->request.contexts, + .instances = qt->request.instances, + .dimensions = qt->request.dimensions, + .chart_label_key = qt->request.chart_label_key, + .labels = qt->request.labels, + .alerts = qt->request.alerts, + }; + + RRDHOST *host = qt->request.host; + + // prepare all the patterns + qt->nodes.scope_pattern = string_to_simple_pattern(qtl.scope_nodes); + qt->nodes.pattern = string_to_simple_pattern(qtl.nodes); + + qt->contexts.pattern = string_to_simple_pattern(qtl.contexts); + qt->contexts.scope_pattern = string_to_simple_pattern(qtl.scope_contexts); + + qt->instances.pattern = string_to_simple_pattern(qtl.instances); + qt->query.pattern = string_to_simple_pattern(qtl.dimensions); + qt->instances.chart_label_key_pattern = string_to_simple_pattern(qtl.chart_label_key); + qt->instances.labels_pattern = string_to_simple_pattern(qtl.labels); + qt->instances.alerts_pattern = string_to_simple_pattern(qtl.alerts); + + qtl.match_ids = qt->request.options & RRDR_OPTION_MATCH_IDS; + qtl.match_names = qt->request.options & RRDR_OPTION_MATCH_NAMES; + if(likely(!qtl.match_ids && !qtl.match_names)) + qtl.match_ids = qtl.match_names = true; + + // verify that the chart belongs to the host we are interested + if(qtl.st) { + if (!host) { + // It is NULL, set it ourselves. + host = qtl.st->rrdhost; + } + else if (unlikely(host != qtl.st->rrdhost)) { + // Oops! A different host! + error("QUERY TARGET: RRDSET '%s' given does not belong to host '%s'. Switching query host to '%s'", + rrdset_name(qtl.st), rrdhost_hostname(host), rrdhost_hostname(qtl.st->rrdhost)); + host = qtl.st->rrdhost; + } + } + + if(host) { + if(host->node_id) + uuid_unparse_lower(*host->node_id, qtl.host_node_id_str); + else + qtl.host_node_id_str[0] = '\0'; + + // single host query + qt->versions.contexts_hard_hash = dictionary_version(host->rrdctx.contexts); + qt->versions.contexts_soft_hash = dictionary_version(host->rrdctx.hub_queue); + qt->versions.alerts_hard_hash = dictionary_version(host->rrdcalc_root_index); + qt->versions.alerts_soft_hash = __atomic_load_n(&host->health_transitions, __ATOMIC_RELAXED); + query_node_add(&qtl, host, true); + qtl.nodes = rrdhost_hostname(host); + } + else + query_scope_foreach_host(qt->nodes.scope_pattern, qt->nodes.pattern, + query_node_add, &qtl, + &qt->versions, + qtl.host_node_id_str); + + // we need the available db retention for this call + // so it has to be done last + query_target_calculate_window(qt); + + qt->timings.preprocessed_ut = now_monotonic_usec(); + + return qt; +} + +ssize_t weights_foreach_rrdmetric_in_context(RRDCONTEXT_ACQUIRED *rca, + SIMPLE_PATTERN *instances_sp, + SIMPLE_PATTERN *chart_label_key_sp, + SIMPLE_PATTERN *labels_sp, + SIMPLE_PATTERN *alerts_sp, + SIMPLE_PATTERN *dimensions_sp, + bool match_ids, bool match_names, + size_t version, + weights_add_metric_t cb, + void *data + ) { + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + if(!rc || rrd_flag_is_deleted(rc)) + return 0; + + char host_node_id_str[UUID_STR_LEN] = ""; + + bool proceed = true; + + ssize_t count = 0; + RRDINSTANCE *ri; + dfe_start_read(rc->rrdinstances, ri) { + if(rrd_flag_is_deleted(ri)) + continue; + + RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *) ri_dfe.item; + + if(instances_sp) { + QUERY_INSTANCE qi = { .ria = ria, }; + SIMPLE_PATTERN_RESULT ret = query_instance_matches(&qi, ri, instances_sp, match_ids, match_names, version, host_node_id_str); + qi.ria = NULL; + query_instance_release(&qi); + + if (ret != SP_MATCHED_POSITIVE) + continue; + } + + if(!query_instance_matches_labels(ri, chart_label_key_sp, labels_sp)) + continue; + + if(alerts_sp && !query_target_match_alert_pattern(ria, alerts_sp)) + continue; + + dfe_unlock(ri); + + RRDMETRIC *rm; + dfe_start_read(ri->rrdmetrics, rm) { + if(rrd_flag_is_deleted(rm)) + continue; + + if(dimensions_sp) { + SIMPLE_PATTERN_RESULT ret = SP_NOT_MATCHED; + + if (match_ids) + ret = simple_pattern_matches_string_extract(dimensions_sp, rm->id, NULL, 0); + + if (ret == SP_NOT_MATCHED && match_names && (rm->name != rm->id || !match_ids)) + ret = simple_pattern_matches_string_extract(dimensions_sp, rm->name, NULL, 0); + + if(ret != SP_MATCHED_POSITIVE) + continue; + } + + dfe_unlock(rm); + + RRDMETRIC_ACQUIRED *rma = (RRDMETRIC_ACQUIRED *)rm_dfe.item; + ssize_t ret = cb(data, rc->rrdhost, rca, ria, rma); + + if(ret < 0) { + proceed = false; + break; + } + + count += ret; + } + dfe_done(rm); + + if(unlikely(!proceed)) + break; + } + dfe_done(ri); + + return count; +} diff --git a/database/contexts/rrdcontext.c b/database/contexts/rrdcontext.c new file mode 100644 index 00000000..40a7e420 --- /dev/null +++ b/database/contexts/rrdcontext.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +// ---------------------------------------------------------------------------- +// visualizing flags + +struct rrdcontext_reason rrdcontext_reasons[] = { + // context related + {RRD_FLAG_UPDATE_REASON_TRIGGERED, "triggered transition", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_NEW_OBJECT, "object created", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT, "object updated", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_LOAD_SQL, "loaded from sql", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_CHANGED_METADATA, "changed metadata", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_ZERO_RETENTION, "has no retention", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T, "updated first_time_t", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T, "updated last_time_t", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED, "stopped collected", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED, "started collected", 5 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_UNUSED, "unused", 5 * USEC_PER_SEC }, + + // not context related + {RRD_FLAG_UPDATE_REASON_CHANGED_LINKING, "changed rrd link", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD, "child disconnected", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_DB_ROTATION, "db rotation", 65 * USEC_PER_SEC }, + {RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION, "updated retention", 65 * USEC_PER_SEC }, + + // terminator + {0, NULL, 0 }, +}; + +void rrd_reasons_to_buffer_json_array_items(RRD_FLAGS flags, BUFFER *wb) { + for(int i = 0, added = 0; rrdcontext_reasons[i].name ; i++) { + if (flags & rrdcontext_reasons[i].flag) { + buffer_json_add_array_item_string(wb, rrdcontext_reasons[i].name); + added++; + } + } +} +// ---------------------------------------------------------------------------- +// public API + +void rrdcontext_updated_rrddim(RRDDIM *rd) { + rrdmetric_from_rrddim(rd); +} + +void rrdcontext_removed_rrddim(RRDDIM *rd) { + rrdmetric_rrddim_is_freed(rd); +} + +void rrdcontext_updated_rrddim_algorithm(RRDDIM *rd) { + rrdmetric_updated_rrddim_flags(rd); +} + +void rrdcontext_updated_rrddim_multiplier(RRDDIM *rd) { + rrdmetric_updated_rrddim_flags(rd); +} + +void rrdcontext_updated_rrddim_divisor(RRDDIM *rd) { + rrdmetric_updated_rrddim_flags(rd); +} + +void rrdcontext_updated_rrddim_flags(RRDDIM *rd) { + rrdmetric_updated_rrddim_flags(rd); +} + +void rrdcontext_collected_rrddim(RRDDIM *rd) { + rrdmetric_collected_rrddim(rd); +} + +void rrdcontext_updated_rrdset(RRDSET *st) { + rrdinstance_from_rrdset(st); +} + +void rrdcontext_removed_rrdset(RRDSET *st) { + rrdinstance_rrdset_is_freed(st); +} + +void rrdcontext_updated_retention_rrdset(RRDSET *st) { + rrdinstance_rrdset_has_updated_retention(st); +} + +void rrdcontext_updated_rrdset_name(RRDSET *st) { + rrdinstance_updated_rrdset_name(st); +} + +void rrdcontext_updated_rrdset_flags(RRDSET *st) { + rrdinstance_updated_rrdset_flags(st); +} + +void rrdcontext_collected_rrdset(RRDSET *st) { + rrdinstance_collected_rrdset(st); +} + +void rrdcontext_host_child_connected(RRDHOST *host) { + (void)host; + + // no need to do anything here + ; +} + +usec_t rrdcontext_next_db_rotation_ut = 0; +void rrdcontext_db_rotation(void) { + // called when the db rotates its database + rrdcontext_next_db_rotation_ut = now_realtime_usec() + FULL_RETENTION_SCAN_DELAY_AFTER_DB_ROTATION_SECS * USEC_PER_SEC; +} + +int rrdcontext_find_dimension_uuid(RRDSET *st, const char *id, uuid_t *store_uuid) { + if(!st->rrdhost) return 1; + if(!st->context) return 2; + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(st->rrdhost->rrdctx.contexts, string2str(st->context)); + if(!rca) return 3; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_get_and_acquire_item(rc->rrdinstances, string2str(st->id)); + if(!ria) { + rrdcontext_release(rca); + return 4; + } + + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + + RRDMETRIC_ACQUIRED *rma = (RRDMETRIC_ACQUIRED *)dictionary_get_and_acquire_item(ri->rrdmetrics, id); + if(!rma) { + rrdinstance_release(ria); + rrdcontext_release(rca); + return 5; + } + + RRDMETRIC *rm = rrdmetric_acquired_value(rma); + + uuid_copy(*store_uuid, rm->uuid); + + rrdmetric_release(rma); + rrdinstance_release(ria); + rrdcontext_release(rca); + return 0; +} + +int rrdcontext_find_chart_uuid(RRDSET *st, uuid_t *store_uuid) { + if(!st->rrdhost) return 1; + if(!st->context) return 2; + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(st->rrdhost->rrdctx.contexts, string2str(st->context)); + if(!rca) return 3; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_get_and_acquire_item(rc->rrdinstances, string2str(st->id)); + if(!ria) { + rrdcontext_release(rca); + return 4; + } + + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + uuid_copy(*store_uuid, ri->uuid); + + rrdinstance_release(ria); + rrdcontext_release(rca); + return 0; +} + +void rrdcontext_host_child_disconnected(RRDHOST *host) { + rrdcontext_recalculate_host_retention(host, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD, false); +} + +int rrdcontext_foreach_instance_with_rrdset_in_context(RRDHOST *host, const char *context, int (*callback)(RRDSET *st, void *data), void *data) { + if(unlikely(!host || !context || !*context || !callback)) + return -1; + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(host->rrdctx.contexts, context); + if(unlikely(!rca)) return -1; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + if(unlikely(!rc)) return -1; + + int ret = 0; + RRDINSTANCE *ri; + dfe_start_read(rc->rrdinstances, ri) { + if(ri->rrdset) { + int r = callback(ri->rrdset, data); + if(r >= 0) ret += r; + else { + ret = r; + break; + } + } + } + dfe_done(ri); + + rrdcontext_release(rca); + + return ret; +} + +// ---------------------------------------------------------------------------- +// ACLK interface + +static bool rrdhost_check_our_claim_id(const char *claim_id) { + if(!localhost->aclk_state.claimed_id) return false; + return (strcasecmp(claim_id, localhost->aclk_state.claimed_id) == 0) ? true : false; +} + +static RRDHOST *rrdhost_find_by_node_id(const char *node_id) { + uuid_t uuid; + if (uuid_parse(node_id, uuid)) + return NULL; + + RRDHOST *host = NULL; + dfe_start_read(rrdhost_root_index, host) { + if(!host->node_id) continue; + + if(uuid_memcmp(&uuid, host->node_id) == 0) + break; + } + dfe_done(host); + + return host; +} + +void rrdcontext_hub_checkpoint_command(void *ptr) { + struct ctxs_checkpoint *cmd = ptr; + + if(!rrdhost_check_our_claim_id(cmd->claim_id)) { + error("RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", + cmd->claim_id, cmd->node_id, + localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", + cmd->claim_id); + + return; + } + + RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); + if(!host) { + error("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", + cmd->claim_id, cmd->node_id); + + return; + } + + if(rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { + info("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', while node '%s' has an active context streaming.", + cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); + + // disable it temporarily, so that our worker will not attempt to send messages in parallel + rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); + } + + uint64_t our_version_hash = rrdcontext_version_hash(host); + + if(cmd->version_hash != our_version_hash) { + error("RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". Sending snapshot of all contexts.", + cmd->version_hash, rrdhost_hostname(host), our_version_hash); + +#ifdef ENABLE_ACLK + // prepare the snapshot + char uuid[UUID_STR_LEN]; + uuid_unparse_lower(*host->node_id, uuid); + contexts_snapshot_t bundle = contexts_snapshot_new(cmd->claim_id, uuid, our_version_hash); + + // do a deep scan on every metric of the host to make sure all our data are updated + rrdcontext_recalculate_host_retention(host, RRD_FLAG_NONE, false); + + // calculate version hash and pack all the messages together in one go + our_version_hash = rrdcontext_version_hash_with_callback(host, rrdcontext_message_send_unsafe, true, bundle); + + // update the version + contexts_snapshot_set_version(bundle, our_version_hash); + + // send it + aclk_send_contexts_snapshot(bundle); +#endif + } + + internal_error(true, "RRDCONTEXT: host '%s' enabling streaming of contexts", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); + char node_str[UUID_STR_LEN]; + uuid_unparse_lower(*host->node_id, node_str); + log_access("ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED", node_str, rrdhost_hostname(host)); +} + +void rrdcontext_hub_stop_streaming_command(void *ptr) { + struct stop_streaming_ctxs *cmd = ptr; + + if(!rrdhost_check_our_claim_id(cmd->claim_id)) { + error("RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", + cmd->claim_id, cmd->node_id, + localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", + cmd->claim_id); + + return; + } + + RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); + if(!host) { + error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", + cmd->claim_id, cmd->node_id); + + return; + } + + if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { + error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but node '%s' does not have active context streaming. Ignoring command.", + cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); + + return; + } + + internal_error(true, "RRDCONTEXT: host '%s' disabling streaming of contexts", rrdhost_hostname(host)); + rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); +} + +bool rrdcontext_retention_match(RRDCONTEXT_ACQUIRED *rca, time_t after, time_t before) { + if(unlikely(!rca)) return false; + + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + if(rrd_flag_is_collected(rc)) + return query_matches_retention(after, before, rc->first_time_s, before > rc->last_time_s ? before : rc->last_time_s, 1); + else + return query_matches_retention(after, before, rc->first_time_s, rc->last_time_s, 1); +}
\ No newline at end of file diff --git a/database/contexts/rrdcontext.h b/database/contexts/rrdcontext.h new file mode 100644 index 00000000..5328483d --- /dev/null +++ b/database/contexts/rrdcontext.h @@ -0,0 +1,553 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_RRDCONTEXT_H +#define NETDATA_RRDCONTEXT_H 1 + +// ---------------------------------------------------------------------------- +// RRDMETRIC + +typedef struct rrdmetric_acquired RRDMETRIC_ACQUIRED; + +// ---------------------------------------------------------------------------- +// RRDINSTANCE + +typedef struct rrdinstance_acquired RRDINSTANCE_ACQUIRED; + +// ---------------------------------------------------------------------------- +// RRDCONTEXT + +typedef struct rrdcontext_acquired RRDCONTEXT_ACQUIRED; + +// ---------------------------------------------------------------------------- + +#include "../rrd.h" + +bool rrdinstance_acquired_id_and_name_are_same(RRDINSTANCE_ACQUIRED *ria); +const char *rrdmetric_acquired_id(RRDMETRIC_ACQUIRED *rma); +const char *rrdmetric_acquired_name(RRDMETRIC_ACQUIRED *rma); +bool rrdmetric_acquired_has_name(RRDMETRIC_ACQUIRED *rma); + +STRING *rrdmetric_acquired_id_dup(RRDMETRIC_ACQUIRED *rma); +STRING *rrdmetric_acquired_name_dup(RRDMETRIC_ACQUIRED *rma); + +NETDATA_DOUBLE rrdmetric_acquired_last_stored_value(RRDMETRIC_ACQUIRED *rma); +time_t rrdmetric_acquired_first_entry(RRDMETRIC_ACQUIRED *rma); +time_t rrdmetric_acquired_last_entry(RRDMETRIC_ACQUIRED *rma); +bool rrdmetric_acquired_belongs_to_instance(RRDMETRIC_ACQUIRED *rma, RRDINSTANCE_ACQUIRED *ria); + +const char *rrdinstance_acquired_id(RRDINSTANCE_ACQUIRED *ria); +const char *rrdinstance_acquired_name(RRDINSTANCE_ACQUIRED *ria); +bool rrdinstance_acquired_has_name(RRDINSTANCE_ACQUIRED *ria); +const char *rrdinstance_acquired_units(RRDINSTANCE_ACQUIRED *ria); +STRING *rrdinstance_acquired_units_dup(RRDINSTANCE_ACQUIRED *ria); +DICTIONARY *rrdinstance_acquired_labels(RRDINSTANCE_ACQUIRED *ria); +DICTIONARY *rrdinstance_acquired_functions(RRDINSTANCE_ACQUIRED *ria); +RRDHOST *rrdinstance_acquired_rrdhost(RRDINSTANCE_ACQUIRED *ria); +RRDSET *rrdinstance_acquired_rrdset(RRDINSTANCE_ACQUIRED *ria); + +bool rrdinstance_acquired_belongs_to_context(RRDINSTANCE_ACQUIRED *ria, RRDCONTEXT_ACQUIRED *rca); +time_t rrdinstance_acquired_update_every(RRDINSTANCE_ACQUIRED *ria); + +const char *rrdcontext_acquired_units(RRDCONTEXT_ACQUIRED *rca); +const char *rrdcontext_acquired_title(RRDCONTEXT_ACQUIRED *rca); +RRDSET_TYPE rrdcontext_acquired_chart_type(RRDCONTEXT_ACQUIRED *rca); + +// ---------------------------------------------------------------------------- +// public API for rrdhost + +void rrdhost_load_rrdcontext_data(RRDHOST *host); +void rrdhost_create_rrdcontexts(RRDHOST *host); +void rrdhost_destroy_rrdcontexts(RRDHOST *host); + +void rrdcontext_host_child_connected(RRDHOST *host); +void rrdcontext_host_child_disconnected(RRDHOST *host); + +int rrdcontext_foreach_instance_with_rrdset_in_context(RRDHOST *host, const char *context, int (*callback)(RRDSET *st, void *data), void *data); + +typedef enum { + RRDCONTEXT_OPTION_NONE = 0, + RRDCONTEXT_OPTION_SHOW_METRICS = (1 << 0), + RRDCONTEXT_OPTION_SHOW_INSTANCES = (1 << 1), + RRDCONTEXT_OPTION_SHOW_LABELS = (1 << 2), + RRDCONTEXT_OPTION_SHOW_QUEUED = (1 << 3), + RRDCONTEXT_OPTION_SHOW_FLAGS = (1 << 4), + RRDCONTEXT_OPTION_SHOW_DELETED = (1 << 5), + RRDCONTEXT_OPTION_DEEPSCAN = (1 << 6), + RRDCONTEXT_OPTION_SHOW_UUIDS = (1 << 7), + RRDCONTEXT_OPTION_SHOW_HIDDEN = (1 << 8), + RRDCONTEXT_OPTION_SKIP_ID = (1 << 31), // internal use +} RRDCONTEXT_TO_JSON_OPTIONS; + +#define RRDCONTEXT_OPTIONS_ALL (RRDCONTEXT_OPTION_SHOW_METRICS|RRDCONTEXT_OPTION_SHOW_INSTANCES|RRDCONTEXT_OPTION_SHOW_LABELS|RRDCONTEXT_OPTION_SHOW_QUEUED|RRDCONTEXT_OPTION_SHOW_FLAGS|RRDCONTEXT_OPTION_SHOW_DELETED|RRDCONTEXT_OPTION_SHOW_UUIDS|RRDCONTEXT_OPTION_SHOW_HIDDEN) + +int rrdcontext_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, const char *context, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions); +int rrdcontexts_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions); + +// ---------------------------------------------------------------------------- +// public API for rrdcontexts + +const char *rrdcontext_acquired_id(RRDCONTEXT_ACQUIRED *rca); +bool rrdcontext_acquired_belongs_to_host(RRDCONTEXT_ACQUIRED *rca, RRDHOST *host); + +// ---------------------------------------------------------------------------- +// public API for rrddims + +void rrdcontext_updated_rrddim(RRDDIM *rd); +void rrdcontext_removed_rrddim(RRDDIM *rd); +void rrdcontext_updated_rrddim_algorithm(RRDDIM *rd); +void rrdcontext_updated_rrddim_multiplier(RRDDIM *rd); +void rrdcontext_updated_rrddim_divisor(RRDDIM *rd); +void rrdcontext_updated_rrddim_flags(RRDDIM *rd); +void rrdcontext_collected_rrddim(RRDDIM *rd); +int rrdcontext_find_dimension_uuid(RRDSET *st, const char *id, uuid_t *store_uuid); + +// ---------------------------------------------------------------------------- +// public API for rrdsets + +void rrdcontext_updated_rrdset(RRDSET *st); +void rrdcontext_removed_rrdset(RRDSET *st); +void rrdcontext_updated_rrdset_name(RRDSET *st); +void rrdcontext_updated_rrdset_flags(RRDSET *st); +void rrdcontext_updated_retention_rrdset(RRDSET *st); +void rrdcontext_collected_rrdset(RRDSET *st); +int rrdcontext_find_chart_uuid(RRDSET *st, uuid_t *store_uuid); + +// ---------------------------------------------------------------------------- +// public API for ACLK + +void rrdcontext_hub_checkpoint_command(void *cmd); +void rrdcontext_hub_stop_streaming_command(void *cmd); + + +// ---------------------------------------------------------------------------- +// public API for threads + +void rrdcontext_db_rotation(void); +void *rrdcontext_main(void *); + +// ---------------------------------------------------------------------------- +// public API for queries + +typedef enum __attribute__ ((__packed__)) { + QUERY_STATUS_NONE = 0, + QUERY_STATUS_QUERIED = (1 << 0), + QUERY_STATUS_DIMENSION_HIDDEN = (1 << 1), + QUERY_STATUS_EXCLUDED = (1 << 2), + QUERY_STATUS_FAILED = (1 << 3), +} QUERY_STATUS; + +typedef struct query_plan_entry { + size_t tier; + time_t after; + time_t before; +} QUERY_PLAN_ENTRY; + +#define QUERY_PLANS_MAX (RRD_STORAGE_TIERS) + +typedef struct query_metrics_counts { // counts the number of metrics related to an object + size_t selected; // selected to be queried + size_t excluded; // not selected to be queried + size_t queried; // successfully queried + size_t failed; // failed to be queried +} QUERY_METRICS_COUNTS; + +typedef struct query_instances_counts { // counts the number of instances related to an object + size_t selected; // selected to be queried + size_t excluded; // not selected to be queried + size_t queried; // successfully queried + size_t failed; // failed to be queried +} QUERY_INSTANCES_COUNTS; + +typedef struct query_alerts_counts { // counts the number of alerts related to an object + size_t clear; // number of alerts in clear state + size_t warning; // number of alerts in warning state + size_t critical; // number of alerts in critical state + size_t other; // number of alerts in any other state +} QUERY_ALERTS_COUNTS; + +typedef struct query_node { + uint32_t slot; + RRDHOST *rrdhost; + char node_id[UUID_STR_LEN]; + usec_t duration_ut; + + STORAGE_POINT query_points; + QUERY_INSTANCES_COUNTS instances; + QUERY_METRICS_COUNTS metrics; + QUERY_ALERTS_COUNTS alerts; +} QUERY_NODE; + +typedef struct query_context { + uint32_t slot; + RRDCONTEXT_ACQUIRED *rca; + + STORAGE_POINT query_points; + QUERY_INSTANCES_COUNTS instances; + QUERY_METRICS_COUNTS metrics; + QUERY_ALERTS_COUNTS alerts; +} QUERY_CONTEXT; + +typedef struct query_instance { + uint32_t slot; + uint32_t query_host_id; + RRDINSTANCE_ACQUIRED *ria; + STRING *id_fqdn; // never access this directly - it is created on demand via query_instance_id_fqdn() + STRING *name_fqdn; // never access this directly - it is created on demand via query_instance_name_fqdn() + + STORAGE_POINT query_points; + QUERY_METRICS_COUNTS metrics; + QUERY_ALERTS_COUNTS alerts; +} QUERY_INSTANCE; + +typedef struct query_dimension { + uint32_t slot; + uint32_t priority; + RRDMETRIC_ACQUIRED *rma; + QUERY_STATUS status; +} QUERY_DIMENSION; + +typedef struct query_metric { + RRDR_DIMENSION_FLAGS status; + + struct query_metric_tier { + STORAGE_METRIC_HANDLE *db_metric_handle; + time_t db_first_time_s; // the oldest timestamp available for this tier + time_t db_last_time_s; // the latest timestamp available for this tier + time_t db_update_every_s; // latest update every for this tier + long weight; + } tiers[RRD_STORAGE_TIERS]; + + struct { + size_t used; + QUERY_PLAN_ENTRY array[QUERY_PLANS_MAX]; + } plan; + + struct { + uint32_t query_node_id; + uint32_t query_context_id; + uint32_t query_instance_id; + uint32_t query_dimension_id; + } link; + + STORAGE_POINT query_points; + + struct { + uint32_t slot; + uint32_t first_slot; + STRING *id; + STRING *name; + STRING *units; + } grouped_as; + + usec_t duration_ut; +} QUERY_METRIC; + +#define MAX_QUERY_TARGET_ID_LENGTH 255 +#define MAX_QUERY_GROUP_BY_PASSES 2 + +typedef bool (*qt_interrupt_callback_t)(void *data); + +struct group_by_pass { + RRDR_GROUP_BY group_by; + char *group_by_label; + RRDR_GROUP_BY_FUNCTION aggregation; +}; + +typedef struct query_target_request { + size_t version; + + const char *scope_nodes; + const char *scope_contexts; + + // selecting / filtering metrics to be queried + RRDHOST *host; // the host to be queried (can be NULL, hosts will be used) + RRDCONTEXT_ACQUIRED *rca; // the context to be queried (can be NULL) + RRDINSTANCE_ACQUIRED *ria; // the instance to be queried (can be NULL) + RRDMETRIC_ACQUIRED *rma; // the metric to be queried (can be NULL) + RRDSET *st; // the chart to be queried (NULL, for context queries) + const char *nodes; // hosts simple pattern + const char *contexts; // contexts simple pattern (context queries) + const char *instances; // charts simple pattern (for context queries) + const char *dimensions; // dimensions simple pattern + const char *chart_label_key; // select only the chart having this label key + const char *labels; // select only the charts having this combo of label key:value + const char *alerts; // select only the charts having this combo of alert name:status + + time_t after; // the requested timeframe + time_t before; // the requested timeframe + size_t points; // the requested number of points to be returned + + uint32_t format; // DATASOURCE_FORMAT + RRDR_OPTIONS options; + time_t timeout_ms; // the timeout of the query in milliseconds + + size_t tier; + QUERY_SOURCE query_source; + STORAGE_PRIORITY priority; + + // resampling metric values across time + time_t resampling_time; + + // grouping metric values across time + RRDR_TIME_GROUPING time_group_method; + const char *time_group_options; + + // group by across multiple time-series + struct group_by_pass group_by[MAX_QUERY_GROUP_BY_PASSES]; + + usec_t received_ut; + + qt_interrupt_callback_t interrupt_callback; + void *interrupt_callback_data; +} QUERY_TARGET_REQUEST; + +#define GROUP_BY_MAX_LABEL_KEYS 10 + +struct query_tier_statistics { + size_t queries; + size_t points; + time_t update_every; + struct { + time_t first_time_s; + time_t last_time_s; + } retention; +}; + +struct query_versions { + uint64_t contexts_hard_hash; + uint64_t contexts_soft_hash; + uint64_t alerts_hard_hash; + uint64_t alerts_soft_hash; +}; + +struct query_timings { + usec_t received_ut; + usec_t preprocessed_ut; + usec_t executed_ut; + usec_t finished_ut; +}; + +#define query_view_update_every(qt) ((qt)->window.group * (qt)->window.query_granularity) + +typedef struct query_target { + char id[MAX_QUERY_TARGET_ID_LENGTH + 1]; // query identifier (for logging) + QUERY_TARGET_REQUEST request; + + struct { + time_t now; // the current timestamp, the absolute max for any query timestamp + bool relative; // true when the request made with relative timestamps, true if it was absolute + bool aligned; + time_t after; // the absolute timestamp this query is about + time_t before; // the absolute timestamp this query is about + time_t query_granularity; + size_t points; // the number of points the query will return (maybe different from the request) + size_t group; + RRDR_TIME_GROUPING time_group_method; + const char *time_group_options; + size_t resampling_group; + NETDATA_DOUBLE resampling_divisor; + RRDR_OPTIONS options; + size_t tier; + } window; + + struct { + size_t queries[RRD_STORAGE_TIERS]; + time_t first_time_s; // the combined first_time_t of all metrics in the query, across all tiers + time_t last_time_s; // the combined last_time_T of all metrics in the query, across all tiers + time_t minimum_latest_update_every_s; // the min update every of the metrics in the query + struct query_tier_statistics tiers[RRD_STORAGE_TIERS]; + } db; + + struct { + QUERY_METRIC *array; // the metrics to be queried (all of them should be queried, no exceptions) + uint32_t used; // how many items of the array are used + uint32_t size; // the size of the array + SIMPLE_PATTERN *pattern; + } query; + + struct { + QUERY_DIMENSION *array; + uint32_t used; // how many items of the array are used + uint32_t size; // the size of the array + } dimensions; + + struct { + QUERY_INSTANCE *array; + uint32_t used; // how many items of the array are used + uint32_t size; // the size of the array + SIMPLE_PATTERN *pattern; + SIMPLE_PATTERN *labels_pattern; + SIMPLE_PATTERN *alerts_pattern; + SIMPLE_PATTERN *chart_label_key_pattern; + } instances; + + struct { + QUERY_CONTEXT *array; + uint32_t used; // how many items of the array are used + uint32_t size; // the size of the array + SIMPLE_PATTERN *pattern; + SIMPLE_PATTERN *scope_pattern; + } contexts; + + struct { + QUERY_NODE *array; + uint32_t used; // how many items of the array are used + uint32_t size; // the size of the array + SIMPLE_PATTERN *pattern; + SIMPLE_PATTERN *scope_pattern; + } nodes; + + struct { + size_t used; + char *label_keys[GROUP_BY_MAX_LABEL_KEYS * MAX_QUERY_GROUP_BY_PASSES]; + } group_by[MAX_QUERY_GROUP_BY_PASSES]; + + STORAGE_POINT query_points; + struct query_versions versions; + struct query_timings timings; + + struct { + SPINLOCK spinlock; + bool used; // when true, this query is currently being used + size_t queries; // how many query we have done so far with this QUERY_TARGET - not related to database queries + struct query_target *prev; + struct query_target *next; + } internal; +} QUERY_TARGET; + +static inline NEVERNULL QUERY_NODE *query_node(QUERY_TARGET *qt, size_t id) { + internal_fatal(id >= qt->nodes.used, "QUERY: invalid query host id"); + return &qt->nodes.array[id]; +} + +static inline NEVERNULL QUERY_CONTEXT *query_context(QUERY_TARGET *qt, size_t query_context_id) { + internal_fatal(query_context_id >= qt->contexts.used, "QUERY: invalid query context id"); + return &qt->contexts.array[query_context_id]; +} + +static inline NEVERNULL QUERY_INSTANCE *query_instance(QUERY_TARGET *qt, size_t query_instance_id) { + internal_fatal(query_instance_id >= qt->instances.used, "QUERY: invalid query instance id"); + return &qt->instances.array[query_instance_id]; +} + +static inline NEVERNULL QUERY_DIMENSION *query_dimension(QUERY_TARGET *qt, size_t query_dimension_id) { + internal_fatal(query_dimension_id >= qt->dimensions.used, "QUERY: invalid query dimension id"); + return &qt->dimensions.array[query_dimension_id]; +} + +static inline NEVERNULL QUERY_METRIC *query_metric(QUERY_TARGET *qt, size_t id) { + internal_fatal(id >= qt->query.used, "QUERY: invalid query metric id"); + return &qt->query.array[id]; +} + +static inline const char *query_metric_id(QUERY_TARGET *qt, QUERY_METRIC *qm) { + QUERY_DIMENSION *qd = query_dimension(qt, qm->link.query_dimension_id); + return rrdmetric_acquired_id(qd->rma); +} + +static inline const char *query_metric_name(QUERY_TARGET *qt, QUERY_METRIC *qm) { + QUERY_DIMENSION *qd = query_dimension(qt, qm->link.query_dimension_id); + return rrdmetric_acquired_name(qd->rma); +} + +struct storage_engine *query_metric_storage_engine(QUERY_TARGET *qt, QUERY_METRIC *qm, size_t tier); + +STRING *query_instance_id_fqdn(QUERY_INSTANCE *qi, size_t version); +STRING *query_instance_name_fqdn(QUERY_INSTANCE *qi, size_t version); + +void query_target_free(void); +void query_target_release(QUERY_TARGET *qt); + +QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr); + +struct api_v2_contexts_request { + char *scope_nodes; + char *scope_contexts; + char *nodes; + char *contexts; + char *q; + + time_t timeout_ms; + + qt_interrupt_callback_t interrupt_callback; + void *interrupt_callback_data; +}; + +typedef enum __attribute__ ((__packed__)) { + CONTEXTS_V2_DEBUG = (1 << 0), + CONTEXTS_V2_SEARCH = (1 << 1), + CONTEXTS_V2_NODES = (1 << 2), + CONTEXTS_V2_NODES_DETAILED = (1 << 3), + CONTEXTS_V2_CONTEXTS = (1 << 4), +} CONTEXTS_V2_OPTIONS; + +int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTEXTS_V2_OPTIONS options); + +RRDCONTEXT_TO_JSON_OPTIONS rrdcontext_to_json_parse_options(char *o); +void buffer_json_agents_array_v2(BUFFER *wb, struct query_timings *timings, time_t now_s); +void buffer_json_node_add_v2(BUFFER *wb, RRDHOST *host, size_t ni, usec_t duration_ut); +void buffer_json_query_timings(BUFFER *wb, const char *key, struct query_timings *timings); +void buffer_json_cloud_timings(BUFFER *wb, const char *key, struct query_timings *timings); + +// ---------------------------------------------------------------------------- +// scope + +typedef ssize_t (*foreach_host_cb_t)(void *data, RRDHOST *host, bool queryable); +ssize_t query_scope_foreach_host(SIMPLE_PATTERN *scope_hosts_sp, SIMPLE_PATTERN *hosts_sp, + foreach_host_cb_t cb, void *data, + struct query_versions *versions, + char *host_node_id_str); + +typedef ssize_t (*foreach_context_cb_t)(void *data, RRDCONTEXT_ACQUIRED *rca, bool queryable_context); +ssize_t query_scope_foreach_context(RRDHOST *host, const char *scope_contexts, SIMPLE_PATTERN *scope_contexts_sp, SIMPLE_PATTERN *contexts_sp, foreach_context_cb_t cb, bool queryable_host, void *data); + +// ---------------------------------------------------------------------------- +// public API for weights + +typedef ssize_t (*weights_add_metric_t)(void *data, RRDHOST *host, RRDCONTEXT_ACQUIRED *rca, RRDINSTANCE_ACQUIRED *ria, RRDMETRIC_ACQUIRED *rma); +ssize_t weights_foreach_rrdmetric_in_context(RRDCONTEXT_ACQUIRED *rca, + SIMPLE_PATTERN *instances_sp, + SIMPLE_PATTERN *chart_label_key_sp, + SIMPLE_PATTERN *labels_sp, + SIMPLE_PATTERN *alerts_sp, + SIMPLE_PATTERN *dimensions_sp, + bool match_ids, bool match_names, + size_t version, + weights_add_metric_t cb, + void *data); + +bool rrdcontext_retention_match(RRDCONTEXT_ACQUIRED *rca, time_t after, time_t before); + +#define query_matches_retention(after, before, first_entry_s, last_entry_s, update_every_s) \ + (((first_entry_s) - ((update_every_s) * 2) <= (before)) && \ + ((last_entry_s) + ((update_every_s) * 2) >= (after))) + +#define query_target_aggregatable(qt) ((qt)->window.options & RRDR_OPTION_RETURN_RAW) + +static inline bool query_target_has_percentage_of_instance(QUERY_TARGET *qt) { + for(size_t g = 0; g < MAX_QUERY_GROUP_BY_PASSES ;g++) + if(qt->request.group_by[g].group_by & RRDR_GROUP_BY_PERCENTAGE_OF_INSTANCE) + return true; + + return false; +} + +static inline bool query_target_needs_all_dimensions(QUERY_TARGET *qt) { + if(qt->request.options & RRDR_OPTION_PERCENTAGE) + return true; + + return query_target_has_percentage_of_instance(qt); +} + +static inline bool query_target_has_percentage_units(QUERY_TARGET *qt) { + if(qt->window.time_group_method == RRDR_GROUPING_CV) + return true; + + if((qt->request.options & RRDR_OPTION_PERCENTAGE) && !(qt->window.options & RRDR_OPTION_RETURN_RAW)) + return true; + + return query_target_has_percentage_of_instance(qt); +} + +#endif // NETDATA_RRDCONTEXT_H + diff --git a/database/contexts/worker.c b/database/contexts/worker.c new file mode 100644 index 00000000..22e28b2a --- /dev/null +++ b/database/contexts/worker.c @@ -0,0 +1,1094 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +static uint64_t rrdcontext_get_next_version(RRDCONTEXT *rc); + +static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused); + +static void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc); + +static void rrdcontext_dequeue_from_post_processing(RRDCONTEXT *rc); +static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAGS reason, bool worker_jobs); + +static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs); +static void rrdcontext_garbage_collect_for_all_hosts(void); + +extern usec_t rrdcontext_next_db_rotation_ut; + +// ---------------------------------------------------------------------------- +// load from SQL + +static void rrdinstance_load_clabel(SQL_CLABEL_DATA *sld, void *data) { + RRDINSTANCE *ri = data; + rrdlabels_add(ri->rrdlabels, sld->label_key, sld->label_value, sld->label_source); +} + +static void rrdinstance_load_dimension(SQL_DIMENSION_DATA *sd, void *data) { + RRDINSTANCE *ri = data; + + RRDMETRIC trm = { + .id = string_strdupz(sd->id), + .name = string_strdupz(sd->name), + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomic + }; + if(sd->hidden) trm.flags |= RRD_FLAG_HIDDEN; + + uuid_copy(trm.uuid, sd->dim_id); + + dictionary_set(ri->rrdmetrics, string2str(trm.id), &trm, sizeof(trm)); +} + +static void rrdinstance_load_chart_callback(SQL_CHART_DATA *sc, void *data) { + RRDHOST *host = data; + + RRDCONTEXT tc = { + .id = string_strdupz(sc->context), + .title = string_strdupz(sc->title), + .units = string_strdupz(sc->units), + .family = string_strdupz(sc->family), + .priority = sc->priority, + .chart_type = sc->chart_type, + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics + .rrdhost = host, + }; + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_set_and_acquire_item(host->rrdctx.contexts, string2str(tc.id), &tc, sizeof(tc)); + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + RRDINSTANCE tri = { + .id = string_strdupz(sc->id), + .name = string_strdupz(sc->name), + .title = string_strdupz(sc->title), + .units = string_strdupz(sc->units), + .family = string_strdupz(sc->family), + .chart_type = sc->chart_type, + .priority = sc->priority, + .update_every_s = sc->update_every, + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics + }; + uuid_copy(tri.uuid, sc->chart_id); + + RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_set_and_acquire_item(rc->rrdinstances, sc->id, &tri, sizeof(tri)); + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + + ctx_get_dimension_list(&ri->uuid, rrdinstance_load_dimension, ri); + ctx_get_label_list(&ri->uuid, rrdinstance_load_clabel, ri); + rrdinstance_trigger_updates(ri, __FUNCTION__ ); + rrdinstance_release(ria); + rrdcontext_release(rca); +} + +static void rrdcontext_load_context_callback(VERSIONED_CONTEXT_DATA *ctx_data, void *data) { + RRDHOST *host = data; + (void)host; + + RRDCONTEXT trc = { + .id = string_strdupz(ctx_data->id), + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics + + // no need to set more data here + // we only need the hub data + + .hub = *ctx_data, + }; + dictionary_set(host->rrdctx.contexts, string2str(trc.id), &trc, sizeof(trc)); +} + +void rrdhost_load_rrdcontext_data(RRDHOST *host) { + if(host->rrdctx.contexts) return; + + rrdhost_create_rrdcontexts(host); + ctx_get_context_list(&host->host_uuid, rrdcontext_load_context_callback, host); + ctx_get_chart_list(&host->host_uuid, rrdinstance_load_chart_callback, host); + + RRDCONTEXT *rc; + dfe_start_read(host->rrdctx.contexts, rc) { + rrdcontext_trigger_updates(rc, __FUNCTION__ ); + } + dfe_done(rc); + + rrdcontext_garbage_collect_single_host(host, false); +} + +// ---------------------------------------------------------------------------- +// version hash calculation + +uint64_t rrdcontext_version_hash_with_callback( + RRDHOST *host, + void (*callback)(RRDCONTEXT *, bool, void *), + bool snapshot, + void *bundle) { + + if(unlikely(!host || !host->rrdctx.contexts)) return 0; + + RRDCONTEXT *rc; + uint64_t hash = 0; + + // loop through all contexts of the host + dfe_start_read(host->rrdctx.contexts, rc) { + + rrdcontext_lock(rc); + + if(unlikely(rrd_flag_check(rc, RRD_FLAG_HIDDEN))) { + rrdcontext_unlock(rc); + continue; + } + + if(unlikely(callback)) + callback(rc, snapshot, bundle); + + // skip any deleted contexts + if(unlikely(rrd_flag_is_deleted(rc))) { + rrdcontext_unlock(rc); + continue; + } + + // we use rc->hub.* which has the latest + // metadata we have sent to the hub + + // if a context is currently queued, rc->hub.* does NOT + // reflect the queued changes. rc->hub.* is updated with + // their metadata, after messages are dispatched to hub. + + // when the context is being collected, + // rc->hub.last_time_t is already zero + + hash += rc->hub.version + rc->hub.last_time_s - rc->hub.first_time_s; + + rrdcontext_unlock(rc); + + } + dfe_done(rc); + + return hash; +} + +// ---------------------------------------------------------------------------- +// retention recalculation + +void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs) { + rrdcontext_post_process_updates(rc, true, reason, worker_jobs); +} + +void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs) { + if(unlikely(!host || !host->rrdctx.contexts)) return; + + RRDCONTEXT *rc; + dfe_start_read(host->rrdctx.contexts, rc) { + rrdcontext_recalculate_context_retention(rc, reason, worker_jobs); + } + dfe_done(rc); +} + +static void rrdcontext_recalculate_retention_all_hosts(void) { + rrdcontext_next_db_rotation_ut = 0; + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + worker_is_busy(WORKER_JOB_RETENTION); + rrdcontext_recalculate_host_retention(host, RRD_FLAG_UPDATE_REASON_DB_ROTATION, true); + } + dfe_done(host); +} + +// ---------------------------------------------------------------------------- +// garbage collector + +bool rrdmetric_update_retention(RRDMETRIC *rm) { + time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; + + if(rm->rrddim) { + min_first_time_t = rrddim_first_entry_s(rm->rrddim); + max_last_time_t = rrddim_last_entry_s(rm->rrddim); + } + else { + RRDHOST *rrdhost = rm->ri->rc->rrdhost; + for (size_t tier = 0; tier < storage_tiers; tier++) { + STORAGE_ENGINE *eng = rrdhost->db[tier].eng; + + time_t first_time_t, last_time_t; + if (eng->api.metric_retention_by_uuid(rrdhost->db[tier].instance, &rm->uuid, &first_time_t, &last_time_t)) { + if (first_time_t < min_first_time_t) + min_first_time_t = first_time_t; + + if (last_time_t > max_last_time_t) + max_last_time_t = last_time_t; + } + } + } + + if((min_first_time_t == LONG_MAX || min_first_time_t == 0) && max_last_time_t == 0) + return false; + + if(min_first_time_t == LONG_MAX) + min_first_time_t = 0; + + if(min_first_time_t > max_last_time_t) { + internal_error(true, "RRDMETRIC: retention of '%s' is flipped, first_time_t = %ld, last_time_t = %ld", string2str(rm->id), min_first_time_t, max_last_time_t); + time_t tmp = min_first_time_t; + min_first_time_t = max_last_time_t; + max_last_time_t = tmp; + } + + // check if retention changed + + if (min_first_time_t != rm->first_time_s) { + rm->first_time_s = min_first_time_t; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if (max_last_time_t != rm->last_time_s) { + rm->last_time_s = max_last_time_t; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(unlikely(!rm->first_time_s && !rm->last_time_s)) + rrd_flag_set_deleted(rm, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + + rrd_flag_set(rm, RRD_FLAG_LIVE_RETENTION); + + return true; +} + +static inline bool rrdmetric_should_be_deleted(RRDMETRIC *rm) { + if(likely(!rrd_flag_check(rm, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) + return false; + + if(likely(rrd_flag_check(rm, RRD_FLAGS_PREVENTING_DELETIONS))) + return false; + + if(likely(rm->rrddim)) + return false; + + rrdmetric_update_retention(rm); + if(rm->first_time_s || rm->last_time_s) + return false; + + return true; +} + +static inline bool rrdinstance_should_be_deleted(RRDINSTANCE *ri) { + if(likely(!rrd_flag_check(ri, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) + return false; + + if(likely(rrd_flag_check(ri, RRD_FLAGS_PREVENTING_DELETIONS))) + return false; + + if(likely(ri->rrdset)) + return false; + + if(unlikely(dictionary_referenced_items(ri->rrdmetrics) != 0)) + return false; + + if(unlikely(dictionary_entries(ri->rrdmetrics) != 0)) + return false; + + if(ri->first_time_s || ri->last_time_s) + return false; + + return true; +} + +static inline bool rrdcontext_should_be_deleted(RRDCONTEXT *rc) { + if(likely(!rrd_flag_check(rc, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) + return false; + + if(likely(rrd_flag_check(rc, RRD_FLAGS_PREVENTING_DELETIONS))) + return false; + + if(unlikely(dictionary_referenced_items(rc->rrdinstances) != 0)) + return false; + + if(unlikely(dictionary_entries(rc->rrdinstances) != 0)) + return false; + + if(unlikely(rc->first_time_s || rc->last_time_s)) + return false; + + return true; +} + +void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc) { + // we need to refresh the string pointers in rc->hub + // in case the context changed values + rc->hub.id = string2str(rc->id); + rc->hub.title = string2str(rc->title); + rc->hub.units = string2str(rc->units); + rc->hub.family = string2str(rc->family); + + // delete it from SQL + if(ctx_delete_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) + error("RRDCONTEXT: failed to delete context '%s' version %"PRIu64" from SQL.", rc->hub.id, rc->hub.version); +} + +static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs) { + + internal_error(true, "RRDCONTEXT: garbage collecting context structures of host '%s'", rrdhost_hostname(host)); + + RRDCONTEXT *rc; + dfe_start_reentrant(host->rrdctx.contexts, rc) { + if(unlikely(worker_jobs && !service_running(SERVICE_CONTEXT))) break; + + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP); + + rrdcontext_lock(rc); + + RRDINSTANCE *ri; + dfe_start_reentrant(rc->rrdinstances, ri) { + if(unlikely(worker_jobs && !service_running(SERVICE_CONTEXT))) break; + + RRDMETRIC *rm; + dfe_start_write(ri->rrdmetrics, rm) { + if(rrdmetric_should_be_deleted(rm)) { + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + if(!dictionary_del(ri->rrdmetrics, string2str(rm->id))) + error("RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(rm->id), + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + else + internal_error( + true, + "RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', deleted from rrdmetrics dictionary.", + string2str(rm->id), + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + } + } + dfe_done(rm); + + if(rrdinstance_should_be_deleted(ri)) { + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + if(!dictionary_del(rc->rrdinstances, string2str(ri->id))) + error("RRDCONTEXT: instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + else + internal_error( + true, + "RRDCONTEXT: instance '%s' of context '%s' of host '%s', deleted from rrdmetrics dictionary.", + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + } + } + dfe_done(ri); + + if(unlikely(rrdcontext_should_be_deleted(rc))) { + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + rrdcontext_dequeue_from_post_processing(rc); + rrdcontext_delete_from_sql_unsafe(rc); + + if(!dictionary_del(host->rrdctx.contexts, string2str(rc->id))) + error("RRDCONTEXT: context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(rc->id), + rrdhost_hostname(host)); + else + internal_error( + true, + "RRDCONTEXT: context '%s' of host '%s', deleted from rrdmetrics dictionary.", + string2str(rc->id), + rrdhost_hostname(host)); + } + + // the item is referenced in the dictionary + // so, it is still here to unlock, even if we have deleted it + rrdcontext_unlock(rc); + } + dfe_done(rc); +} + +static void rrdcontext_garbage_collect_for_all_hosts(void) { + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + rrdcontext_garbage_collect_single_host(host, true); + } + dfe_done(host); +} + +// ---------------------------------------------------------------------------- +// post processing + +static void rrdmetric_process_updates(RRDMETRIC *rm, bool force, RRD_FLAGS reason, bool worker_jobs) { + if(reason != RRD_FLAG_NONE) + rrd_flag_set_updated(rm, reason); + + if(!force && !rrd_flag_is_updated(rm) && rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION) && !rrd_flag_check(rm, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + return; + + if(worker_jobs) + worker_is_busy(WORKER_JOB_PP_METRIC); + + if(reason & RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD) { + rrd_flag_set_archived(rm); + rrd_flag_set(rm, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD); + } + if(rrd_flag_is_deleted(rm) && (reason & RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + rrd_flag_set_archived(rm); + + rrdmetric_update_retention(rm); + + rrd_flag_unset_updated(rm); +} + +static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FLAGS reason, bool worker_jobs) { + if(reason != RRD_FLAG_NONE) + rrd_flag_set_updated(ri, reason); + + if(!force && !rrd_flag_is_updated(ri) && rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION)) + return; + + if(worker_jobs) + worker_is_busy(WORKER_JOB_PP_INSTANCE); + + time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; + size_t metrics_active = 0, metrics_deleted = 0; + bool live_retention = true, currently_collected = false; + if(dictionary_entries(ri->rrdmetrics) > 0) { + RRDMETRIC *rm; + dfe_start_read((DICTIONARY *)ri->rrdmetrics, rm) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + RRD_FLAGS reason_to_pass = reason; + if(rrd_flag_check(ri, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + reason_to_pass |= RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION; + + rrdmetric_process_updates(rm, force, reason_to_pass, worker_jobs); + + if(unlikely(!rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION))) + live_retention = false; + + if (unlikely((rrdmetric_should_be_deleted(rm)))) { + metrics_deleted++; + continue; + } + + if(!currently_collected && rrd_flag_check(rm, RRD_FLAG_COLLECTED) && rm->first_time_s) + currently_collected = true; + + metrics_active++; + + if (rm->first_time_s && rm->first_time_s < min_first_time_t) + min_first_time_t = rm->first_time_s; + + if (rm->last_time_s && rm->last_time_s > max_last_time_t) + max_last_time_t = rm->last_time_s; + } + dfe_done(rm); + } + + if(unlikely(live_retention && !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) + rrd_flag_set(ri, RRD_FLAG_LIVE_RETENTION); + else if(unlikely(!live_retention && rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) + rrd_flag_clear(ri, RRD_FLAG_LIVE_RETENTION); + + if(unlikely(!metrics_active)) { + // no metrics available + + if(ri->first_time_s) { + ri->first_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(ri->last_time_s) { + ri->last_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set_deleted(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + // we have active metrics... + + if (unlikely(min_first_time_t == LONG_MAX)) + min_first_time_t = 0; + + if (unlikely(min_first_time_t == 0 || max_last_time_t == 0)) { + if(ri->first_time_s) { + ri->first_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(ri->last_time_s) { + ri->last_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(likely(live_retention)) + rrd_flag_set_deleted(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + rrd_flag_clear(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + + if (unlikely(ri->first_time_s != min_first_time_t)) { + ri->first_time_s = min_first_time_t; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if (unlikely(ri->last_time_s != max_last_time_t)) { + ri->last_time_s = max_last_time_t; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(likely(currently_collected)) + rrd_flag_set_collected(ri); + else + rrd_flag_set_archived(ri); + } + } + + rrd_flag_unset_updated(ri); +} + +static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAGS reason, bool worker_jobs) { + if(reason != RRD_FLAG_NONE) + rrd_flag_set_updated(rc, reason); + + if(worker_jobs) + worker_is_busy(WORKER_JOB_PP_CONTEXT); + + size_t min_priority_collected = LONG_MAX; + size_t min_priority_not_collected = LONG_MAX; + size_t min_priority = LONG_MAX; + time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; + size_t instances_active = 0, instances_deleted = 0; + bool live_retention = true, currently_collected = false, hidden = true; + if(dictionary_entries(rc->rrdinstances) > 0) { + RRDINSTANCE *ri; + dfe_start_reentrant(rc->rrdinstances, ri) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + RRD_FLAGS reason_to_pass = reason; + if(rrd_flag_check(rc, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + reason_to_pass |= RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION; + + rrdinstance_post_process_updates(ri, force, reason_to_pass, worker_jobs); + + if(unlikely(hidden && !rrd_flag_check(ri, RRD_FLAG_HIDDEN))) + hidden = false; + + if(unlikely(live_retention && !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) + live_retention = false; + + if (unlikely(rrdinstance_should_be_deleted(ri))) { + instances_deleted++; + continue; + } + + if(unlikely(!currently_collected && rrd_flag_is_collected(ri) && ri->first_time_s)) + currently_collected = true; + + internal_error(rc->units != ri->units, + "RRDCONTEXT: '%s' rrdinstance '%s' has different units, context '%s', instance '%s'", + string2str(rc->id), string2str(ri->id), + string2str(rc->units), string2str(ri->units)); + + instances_active++; + + if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY) { + if(rrd_flag_check(ri, RRD_FLAG_COLLECTED)) { + if(ri->priority < min_priority_collected) + min_priority_collected = ri->priority; + } + else { + if(ri->priority < min_priority_not_collected) + min_priority_not_collected = ri->priority; + } + } + + if (ri->first_time_s && ri->first_time_s < min_first_time_t) + min_first_time_t = ri->first_time_s; + + if (ri->last_time_s && ri->last_time_s > max_last_time_t) + max_last_time_t = ri->last_time_s; + } + dfe_done(ri); + + if(min_priority_collected != LONG_MAX) + // use the collected priority + min_priority = min_priority_collected; + else + // use the non-collected priority + min_priority = min_priority_not_collected; + } + + { + bool previous_hidden = rrd_flag_check(rc, RRD_FLAG_HIDDEN); + if (hidden != previous_hidden) { + if (hidden && !rrd_flag_check(rc, RRD_FLAG_HIDDEN)) + rrd_flag_set(rc, RRD_FLAG_HIDDEN); + else if (!hidden && rrd_flag_check(rc, RRD_FLAG_HIDDEN)) + rrd_flag_clear(rc, RRD_FLAG_HIDDEN); + } + + bool previous_live_retention = rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION); + if (live_retention != previous_live_retention) { + if (live_retention && !rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) + rrd_flag_set(rc, RRD_FLAG_LIVE_RETENTION); + else if (!live_retention && rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) + rrd_flag_clear(rc, RRD_FLAG_LIVE_RETENTION); + } + } + + rrdcontext_lock(rc); + rc->pp.executions++; + + if(unlikely(!instances_active)) { + // we had some instances, but they are gone now... + + if(rc->first_time_s) { + rc->first_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(rc->last_time_s) { + rc->last_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set_deleted(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + // we have some active instances... + + if (unlikely(min_first_time_t == LONG_MAX)) + min_first_time_t = 0; + + if (unlikely(min_first_time_t == 0 && max_last_time_t == 0)) { + if(rc->first_time_s) { + rc->first_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(rc->last_time_s) { + rc->last_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set_deleted(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + rrd_flag_clear(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + + if (unlikely(rc->first_time_s != min_first_time_t)) { + rc->first_time_s = min_first_time_t; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if (rc->last_time_s != max_last_time_t) { + rc->last_time_s = max_last_time_t; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(likely(currently_collected)) + rrd_flag_set_collected(rc); + else + rrd_flag_set_archived(rc); + } + + if (min_priority != LONG_MAX && rc->priority != min_priority) { + rc->priority = min_priority; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + } + + if(unlikely(rrd_flag_is_updated(rc) && rc->rrdhost->rrdctx.hub_queue)) { + if(check_if_cloud_version_changed_unsafe(rc, false)) { + rc->version = rrdcontext_get_next_version(rc); + dictionary_set((DICTIONARY *)rc->rrdhost->rrdctx.hub_queue, + string2str(rc->id), rc, sizeof(*rc)); + } + } + + rrd_flag_unset_updated(rc); + rrdcontext_unlock(rc); +} + +void rrdcontext_queue_for_post_processing(RRDCONTEXT *rc, const char *function __maybe_unused, RRD_FLAGS flags __maybe_unused) { + if(unlikely(!rc->rrdhost->rrdctx.pp_queue)) return; + + if(!rrd_flag_check(rc, RRD_FLAG_QUEUED_FOR_PP)) { + dictionary_set((DICTIONARY *)rc->rrdhost->rrdctx.pp_queue, + string2str(rc->id), + rc, + sizeof(*rc)); + +#if(defined(NETDATA_INTERNAL_CHECKS) && defined(LOG_POST_PROCESSING_QUEUE_INSERTIONS)) + { + BUFFER *wb_flags = buffer_create(1000); + rrd_flags_to_buffer(flags, wb_flags); + + BUFFER *wb_reasons = buffer_create(1000); + rrd_reasons_to_buffer(flags, wb_reasons); + + internal_error(true, "RRDCONTEXT: '%s' update triggered by function %s(), due to flags: %s, reasons: %s", + string2str(rc->id), function, + buffer_tostring(wb_flags), + buffer_tostring(wb_reasons)); + + buffer_free(wb_reasons); + buffer_free(wb_flags); + } +#endif + } +} + +static void rrdcontext_dequeue_from_post_processing(RRDCONTEXT *rc) { + if(unlikely(!rc->rrdhost->rrdctx.pp_queue)) return; + dictionary_del(rc->rrdhost->rrdctx.pp_queue, string2str(rc->id)); +} + +static void rrdcontext_post_process_queued_contexts(RRDHOST *host) { + if(unlikely(!host->rrdctx.pp_queue)) return; + + RRDCONTEXT *rc; + dfe_start_reentrant(host->rrdctx.pp_queue, rc) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + rrdcontext_dequeue_from_post_processing(rc); + rrdcontext_post_process_updates(rc, false, RRD_FLAG_NONE, true); + } + dfe_done(rc); +} + +// ---------------------------------------------------------------------------- +// dispatching contexts to cloud + +static uint64_t rrdcontext_get_next_version(RRDCONTEXT *rc) { + time_t now = now_realtime_sec(); + uint64_t version = MAX(rc->version, rc->hub.version); + version = MAX((uint64_t)now, version); + version++; + return version; +} + +void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe_unused, void *bundle __maybe_unused) { + + // save it, so that we know the last version we sent to hub + rc->version = rc->hub.version = rrdcontext_get_next_version(rc); + rc->hub.id = string2str(rc->id); + rc->hub.title = string2str(rc->title); + rc->hub.units = string2str(rc->units); + rc->hub.family = string2str(rc->family); + rc->hub.chart_type = rrdset_type_name(rc->chart_type); + rc->hub.priority = rc->priority; + rc->hub.first_time_s = rc->first_time_s; + rc->hub.last_time_s = rrd_flag_is_collected(rc) ? 0 : rc->last_time_s; + rc->hub.deleted = rrd_flag_is_deleted(rc) ? true : false; + +#ifdef ENABLE_ACLK + struct context_updated message = { + .id = rc->hub.id, + .version = rc->hub.version, + .title = rc->hub.title, + .units = rc->hub.units, + .family = rc->hub.family, + .chart_type = rc->hub.chart_type, + .priority = rc->hub.priority, + .first_entry = rc->hub.first_time_s, + .last_entry = rc->hub.last_time_s, + .deleted = rc->hub.deleted, + }; + + if(likely(!rrd_flag_check(rc, RRD_FLAG_HIDDEN))) { + if (snapshot) { + if (!rc->hub.deleted) + contexts_snapshot_add_ctx_update(bundle, &message); + } + else + contexts_updated_add_ctx_update(bundle, &message); + } +#endif + + // store it to SQL + + if(rrd_flag_is_deleted(rc)) + rrdcontext_delete_from_sql_unsafe(rc); + + else if (ctx_store_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) + error("RRDCONTEXT: failed to save context '%s' version %"PRIu64" to SQL.", rc->hub.id, rc->hub.version); +} + +static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused) { + bool id_changed = false, + title_changed = false, + units_changed = false, + family_changed = false, + chart_type_changed = false, + priority_changed = false, + first_time_changed = false, + last_time_changed = false, + deleted_changed = false; + + RRD_FLAGS flags = rrd_flags_get(rc); + + if(unlikely(string2str(rc->id) != rc->hub.id)) + id_changed = true; + + if(unlikely(string2str(rc->title) != rc->hub.title)) + title_changed = true; + + if(unlikely(string2str(rc->units) != rc->hub.units)) + units_changed = true; + + if(unlikely(string2str(rc->family) != rc->hub.family)) + family_changed = true; + + if(unlikely(rrdset_type_name(rc->chart_type) != rc->hub.chart_type)) + chart_type_changed = true; + + if(unlikely(rc->priority != rc->hub.priority)) + priority_changed = true; + + if(unlikely((uint64_t)rc->first_time_s != rc->hub.first_time_s)) + first_time_changed = true; + + if(unlikely((uint64_t)((flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s) != rc->hub.last_time_s)) + last_time_changed = true; + + if(unlikely(((flags & RRD_FLAG_DELETED) ? true : false) != rc->hub.deleted)) + deleted_changed = true; + + if(unlikely(id_changed || title_changed || units_changed || family_changed || chart_type_changed || priority_changed || first_time_changed || last_time_changed || deleted_changed)) { + + internal_error(LOG_TRANSITIONS, + "RRDCONTEXT: %s NEW VERSION '%s'%s of host '%s', version %"PRIu64", title '%s'%s, units '%s'%s, family '%s'%s, chart type '%s'%s, priority %u%s, first_time_t %ld%s, last_time_t %ld%s, deleted '%s'%s, (queued for %llu ms, expected %llu ms)", + sending?"SENDING":"QUEUE", + string2str(rc->id), id_changed ? " (CHANGED)" : "", + rrdhost_hostname(rc->rrdhost), + rc->version, + string2str(rc->title), title_changed ? " (CHANGED)" : "", + string2str(rc->units), units_changed ? " (CHANGED)" : "", + string2str(rc->family), family_changed ? " (CHANGED)" : "", + rrdset_type_name(rc->chart_type), chart_type_changed ? " (CHANGED)" : "", + rc->priority, priority_changed ? " (CHANGED)" : "", + rc->first_time_s, first_time_changed ? " (CHANGED)" : "", + (flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s, last_time_changed ? " (CHANGED)" : "", + (flags & RRD_FLAG_DELETED) ? "true" : "false", deleted_changed ? " (CHANGED)" : "", + sending ? (now_realtime_usec() - rc->queue.queued_ut) / USEC_PER_MS : 0, + sending ? (rc->queue.scheduled_dispatch_ut - rc->queue.queued_ut) / USEC_PER_MS : 0 + ); + + return true; + } + + return false; +} + +static inline usec_t rrdcontext_calculate_queued_dispatch_time_ut(RRDCONTEXT *rc, usec_t now_ut) { + + if(likely(rc->queue.delay_calc_ut >= rc->queue.queued_ut)) + return rc->queue.scheduled_dispatch_ut; + + RRD_FLAGS flags = rc->queue.queued_flags; + + usec_t delay = LONG_MAX; + int i; + struct rrdcontext_reason *reason; + for(i = 0, reason = &rrdcontext_reasons[i]; reason->name ; reason = &rrdcontext_reasons[++i]) { + if(unlikely(flags & reason->flag)) { + if(reason->delay_ut < delay) + delay = reason->delay_ut; + } + } + + if(unlikely(delay == LONG_MAX)) { + internal_error(true, "RRDCONTEXT: '%s', cannot find minimum delay of flags %x", string2str(rc->id), (unsigned int)flags); + delay = 60 * USEC_PER_SEC; + } + + rc->queue.delay_calc_ut = now_ut; + usec_t dispatch_ut = rc->queue.scheduled_dispatch_ut = rc->queue.queued_ut + delay; + return dispatch_ut; +} + +static void rrdcontext_dequeue_from_hub_queue(RRDCONTEXT *rc) { + dictionary_del(rc->rrdhost->rrdctx.hub_queue, string2str(rc->id)); +} + +static void rrdcontext_dispatch_queued_contexts_to_hub(RRDHOST *host, usec_t now_ut) { + + // check if we have received a streaming command for this host + if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS) || !aclk_connected || !host->rrdctx.hub_queue) + return; + + // check if there are queued items to send + if(!dictionary_entries(host->rrdctx.hub_queue)) + return; + + if(!host->node_id) + return; + + size_t messages_added = 0; + contexts_updated_t bundle = NULL; + + RRDCONTEXT *rc; + dfe_start_reentrant(host->rrdctx.hub_queue, rc) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + if(unlikely(messages_added >= MESSAGES_PER_BUNDLE_TO_SEND_TO_HUB_PER_HOST)) + break; + + worker_is_busy(WORKER_JOB_QUEUED); + usec_t dispatch_ut = rrdcontext_calculate_queued_dispatch_time_ut(rc, now_ut); + char *claim_id = get_agent_claimid(); + + if(unlikely(now_ut >= dispatch_ut) && claim_id) { + worker_is_busy(WORKER_JOB_CHECK); + + rrdcontext_lock(rc); + + if(check_if_cloud_version_changed_unsafe(rc, true)) { + worker_is_busy(WORKER_JOB_SEND); + +#ifdef ENABLE_ACLK + if(!bundle) { + // prepare the bundle to send the messages + char uuid[UUID_STR_LEN]; + uuid_unparse_lower(*host->node_id, uuid); + + bundle = contexts_updated_new(claim_id, uuid, 0, now_ut); + } +#endif + // update the hub data of the context, give a new version, pack the message + // and save an update to SQL + rrdcontext_message_send_unsafe(rc, false, bundle); + messages_added++; + + rc->queue.dispatches++; + rc->queue.dequeued_ut = now_ut; + } + else + rc->version = rc->hub.version; + + // remove it from the queue + worker_is_busy(WORKER_JOB_DEQUEUE); + rrdcontext_dequeue_from_hub_queue(rc); + + if(unlikely(rrdcontext_should_be_deleted(rc))) { + // this is a deleted context - delete it forever... + + worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + + rrdcontext_dequeue_from_post_processing(rc); + rrdcontext_delete_from_sql_unsafe(rc); + + STRING *id = string_dup(rc->id); + rrdcontext_unlock(rc); + + // delete it from the master dictionary + if(!dictionary_del(host->rrdctx.contexts, string2str(rc->id))) + error("RRDCONTEXT: '%s' of host '%s' failed to be deleted from rrdcontext dictionary.", + string2str(id), rrdhost_hostname(host)); + + string_freez(id); + } + else + rrdcontext_unlock(rc); + } + freez(claim_id); + } + dfe_done(rc); + +#ifdef ENABLE_ACLK + if(service_running(SERVICE_CONTEXT) && bundle) { + // we have a bundle to send messages + + // update the version hash + contexts_updated_update_version_hash(bundle, rrdcontext_version_hash(host)); + + // send it + aclk_send_contexts_updated(bundle); + } + else if(bundle) + contexts_updated_delete(bundle); +#endif + +} + +// ---------------------------------------------------------------------------- +// worker thread + +static void rrdcontext_main_cleanup(void *ptr) { + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; + static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; + + // custom code + worker_unregister(); + + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; +} + +void *rrdcontext_main(void *ptr) { + netdata_thread_cleanup_push(rrdcontext_main_cleanup, ptr); + + worker_register("RRDCONTEXT"); + worker_register_job_name(WORKER_JOB_HOSTS, "hosts"); + worker_register_job_name(WORKER_JOB_CHECK, "dedup checks"); + worker_register_job_name(WORKER_JOB_SEND, "sent contexts"); + worker_register_job_name(WORKER_JOB_DEQUEUE, "deduplicated contexts"); + worker_register_job_name(WORKER_JOB_RETENTION, "metrics retention"); + worker_register_job_name(WORKER_JOB_QUEUED, "queued contexts"); + worker_register_job_name(WORKER_JOB_CLEANUP, "cleanups"); + worker_register_job_name(WORKER_JOB_CLEANUP_DELETE, "deletes"); + worker_register_job_name(WORKER_JOB_PP_METRIC, "check metrics"); + worker_register_job_name(WORKER_JOB_PP_INSTANCE, "check instances"); + worker_register_job_name(WORKER_JOB_PP_CONTEXT, "check contexts"); + + worker_register_job_custom_metric(WORKER_JOB_HUB_QUEUE_SIZE, "hub queue size", "contexts", WORKER_METRIC_ABSOLUTE); + worker_register_job_custom_metric(WORKER_JOB_PP_QUEUE_SIZE, "post processing queue size", "contexts", WORKER_METRIC_ABSOLUTE); + + heartbeat_t hb; + heartbeat_init(&hb); + usec_t step = RRDCONTEXT_WORKER_THREAD_HEARTBEAT_USEC; + + while (service_running(SERVICE_CONTEXT)) { + worker_is_idle(); + heartbeat_next(&hb, step); + + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + usec_t now_ut = now_realtime_usec(); + + if(rrdcontext_next_db_rotation_ut && now_ut > rrdcontext_next_db_rotation_ut) { + rrdcontext_recalculate_retention_all_hosts(); + rrdcontext_garbage_collect_for_all_hosts(); + rrdcontext_next_db_rotation_ut = 0; + } + + size_t hub_queued_contexts_for_all_hosts = 0; + size_t pp_queued_contexts_for_all_hosts = 0; + + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + worker_is_busy(WORKER_JOB_HOSTS); + + if(host->rrdctx.pp_queue) { + pp_queued_contexts_for_all_hosts += dictionary_entries(host->rrdctx.pp_queue); + rrdcontext_post_process_queued_contexts(host); + dictionary_garbage_collect(host->rrdctx.pp_queue); + } + + if(host->rrdctx.hub_queue) { + hub_queued_contexts_for_all_hosts += dictionary_entries(host->rrdctx.hub_queue); + rrdcontext_dispatch_queued_contexts_to_hub(host, now_ut); + dictionary_garbage_collect(host->rrdctx.hub_queue); + } + + if (host->rrdctx.contexts) + dictionary_garbage_collect(host->rrdctx.contexts); + } + dfe_done(host); + + worker_set_metric(WORKER_JOB_HUB_QUEUE_SIZE, (NETDATA_DOUBLE)hub_queued_contexts_for_all_hosts); + worker_set_metric(WORKER_JOB_PP_QUEUE_SIZE, (NETDATA_DOUBLE)pp_queued_contexts_for_all_hosts); + } + + netdata_thread_cleanup_pop(1); + return NULL; +} diff --git a/database/engine/README.md b/database/engine/README.md index 664d4050..89001864 100644 --- a/database/engine/README.md +++ b/database/engine/README.md @@ -1,17 +1,9 @@ -<!-- -title: "Database engine" -description: "Netdata's highly-efficient database engine use both RAM and disk for distributed, long-term storage of per-second metrics." -custom_edit_url: "https://github.com/netdata/netdata/edit/master/database/engine/README.md" -sidebar_label: "Database engine" -learn_status: "Published" -learn_topic_type: "Concepts" -learn_rel_path: "Concepts" ---> - -# DBENGINE +# Database engine DBENGINE is the time-series database of Netdata. +![image](https://user-images.githubusercontent.com/2662304/233838474-d4f8f0b9-61dc-4409-a708-97d403cd153a.png) + ## Design ### Data Points @@ -118,53 +110,13 @@ Tiers are supported in Netdata Agents with version `netdata-1.35.0.138.nightly` Updating the higher **tiers** is automated, and it happens in real-time while data are being collected for **tier 0**. -When the Netdata Agent starts, during the first data collection of each metric, higher tiers are automatically **backfilled** with data from lower tiers, so that the aggregation they provide will be accurate. - -3 tiers are enabled by default in Netdata, with the following configuration: - -``` -[db] - mode = dbengine - - # per second data collection - update every = 1 - - # number of tiers used (1 to 5, 3 being default) - storage tiers = 3 - - # Tier 0, per second data - dbengine multihost disk space MB = 256 - - # Tier 1, per minute data - dbengine tier 1 multihost disk space MB = 128 - - # Tier 2, per hour data - dbengine tier 2 multihost disk space MB = 64 -``` - -The exact retention that can be achieved by each tier depends on the number of metrics collected. The more the metrics, the smaller the retention that will fit in a given size. The general rule is that Netdata needs about **1 byte per data point on disk for tier 0**, and **4 bytes per data point on disk for tier 1 and above**. - -So, for 1000 metrics collected per second and 256 MB for tier 0, Netdata will store about: +When the Netdata Agent starts, during the first data collection of each metric, higher tiers are automatically **backfilled** with +data from lower tiers, so that the aggregation they provide will be accurate. -``` -256MB on disk / 1 byte per point / 1000 metrics => 256k points per metric / 86400 seconds per day = about 3 days -``` - -At tier 1 (per minute): - -``` -128MB on disk / 4 bytes per point / 1000 metrics => 32k points per metric / (24 hours * 60 minutes) = about 22 days -``` +Configuring how the number of tiers and the disk space allocated to each tier is how you can +[change how long netdata stores metrics](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md). -At tier 2 (per hour): - -``` -64MB on disk / 4 bytes per point / 1000 metrics => 16k points per metric / 24 hours per day = about 2 years -``` - -Of course double the metrics, half the retention. There are more factors that affect retention. The number of ephemeral metrics (i.e. metrics that are collected for part of the time). The number of metrics that are usually constant over time (affecting compression efficiency). The number of restarts a Netdata Agents gets through time (because it has to break pages prematurely, increasing the metadata overhead). But the actual numbers should not deviate significantly from the above. - -### Data Loss +### Data loss Until **hot pages** and **dirty pages** are **flushed** to disk they are at risk (e.g. due to a crash, or power failure), as they are stored only in memory. @@ -172,36 +124,9 @@ power failure), as they are stored only in memory. The supported way of ensuring high data availability is the use of Netdata Parents to stream the data in real-time to multiple other Netdata agents. -## Memory Requirements - -DBENGINE memory is related to the number of metrics concurrently being collected, the retention of the metrics on disk in relation with the queries running, and the number of metrics for which retention is maintained. - -### Memory for concurrently collected metrics - -DBENGINE is automatically sized to use memory according to this equation: - -``` -memory in KiB = METRICS x (TIERS - 1) x 4KiB x 2 + 32768 KiB -``` - -Where: -- `METRICS`: the maximum number of concurrently collected metrics (dimensions) from the time the agent started. -- `TIERS`: the number of storage tiers configured, by default 3 ( `-1` when using 3+ tiers) -- `x 2`, to accommodate room for flushing data to disk -- `x 4KiB`, the data segment size of each metric -- `+ 32768 KiB`, 32 MB for operational caches - -So, for 2000 metrics (dimensions) in 3 storage tiers: +## Memory requirements and retention -``` -memory for 2k metrics = 2000 x (3 - 1) x 4 KiB x 2 + 32768 KiB = 64 MiB -``` - -For 100k concurrently collected metrics in 3 storage tiers: - -``` -memory for 100k metrics = 100000 x (3 - 1) x 4 KiB x 2 + 32768 KiB = 1.6 GiB -``` +See (change how long netdata stores metrics)[https://github.com/netdata/netdata/edit/master/docs/store/change-metrics-storage.md] #### Exceptions @@ -262,216 +187,6 @@ The time-ranges of the queries running control the amount of shared memory requi DBENGINE uses 150 bytes of memory for every metric for which retention is maintained but is not currently being collected. ---- - ---- OLD DOCS BELOW THIS POINT --- - ---- - - -## Legacy configuration - -### v1.35.1 and prior - -These versions of the Agent do not support [Tiers](#Tiers). You could change the metric retention for the parent and -all of its children only with the `dbengine multihost disk space MB` setting. This setting accounts the space allocation -for the parent node and all of its children. - -To configure the database engine, look for the `page cache size MB` and `dbengine multihost disk space MB` settings in -the `[db]` section of your `netdata.conf`. - -```conf -[db] - dbengine page cache size MB = 32 - dbengine multihost disk space MB = 256 -``` - -### v1.23.2 and prior - -_For Netdata Agents earlier than v1.23.2_, the Agent on the parent node uses one dbengine instance for itself, and another instance for every child node it receives metrics from. If you had four streaming nodes, you would have five instances in total (`1 parent + 4 child nodes = 5 instances`). - -The Agent allocates resources for each instance separately using the `dbengine disk space MB` (**deprecated**) setting. If `dbengine disk space MB`(**deprecated**) is set to the default `256`, each instance is given 256 MiB in disk space, which means the total disk space required to store all instances is, roughly, `256 MiB * 1 parent * 4 child nodes = 1280 MiB`. - -#### Backward compatibility - -All existing metrics belonging to child nodes are automatically converted to legacy dbengine instances and the localhost -metrics are transferred to the multihost dbengine instance. - -All new child nodes are automatically transferred to the multihost dbengine instance and share its page cache and disk -space. If you want to migrate a child node from its legacy dbengine instance to the multihost dbengine instance, you -must delete the instance's directory, which is located in `/var/cache/netdata/MACHINE_GUID/dbengine`, after stopping the -Agent. - -##### Information - -For more information about setting `[db].mode` on your nodes, in addition to other streaming configurations, see -[streaming](https://github.com/netdata/netdata/blob/master/streaming/README.md). - -## Requirements & limitations - -### Memory - -Using database mode `dbengine` we can overcome most memory restrictions and store a dataset that is much larger than the -available memory. - -There are explicit memory requirements **per** DB engine **instance**: - -- The total page cache memory footprint will be an additional `#dimensions-being-collected x 4096 x 2` bytes over what - the user configured with `dbengine page cache size MB`. - - -- an additional `#pages-on-disk x 4096 x 0.03` bytes of RAM are allocated for metadata. - - - roughly speaking this is 3% of the uncompressed disk space taken by the DB files. - - - for very highly compressible data (compression ratio > 90%) this RAM overhead is comparable to the disk space - footprint. - -An important observation is that RAM usage depends on both the `page cache size` and the `dbengine multihost disk space` -options. - -You can use -our [database engine calculator](https://github.com/netdata/netdata/blob/master/docs/store/change-metrics-storage.md#calculate-the-system-resources-ram-disk-space-needed-to-store-metrics) -to validate the memory requirements for your particular system(s) and configuration (**out-of-date**). - -### Disk space - -There are explicit disk space requirements **per** DB engine **instance**: - -- The total disk space footprint will be the maximum between `#dimensions-being-collected x 4096 x 2` bytes or what the - user configured with `dbengine multihost disk space` or `dbengine disk space`. - -### File descriptor - -The Database Engine may keep a **significant** amount of files open per instance (e.g. per streaming child or parent -server). When configuring your system you should make sure there are at least 50 file descriptors available per -`dbengine` instance. - -Netdata allocates 25% of the available file descriptors to its Database Engine instances. This means that only 25% of -the file descriptors that are available to the Netdata service are accessible by dbengine instances. You should take -that into account when configuring your service or system-wide file descriptor limits. You can roughly estimate that the -Netdata service needs 2048 file descriptors for every 10 streaming child hosts when streaming is configured to use -`[db].mode = dbengine`. - -If for example one wants to allocate 65536 file descriptors to the Netdata service on a systemd system one needs to -override the Netdata service by running `sudo systemctl edit netdata` and creating a file with contents: - -```sh -[Service] -LimitNOFILE=65536 -``` - -For other types of services one can add the line: - -```sh -ulimit -n 65536 -``` - -at the beginning of the service file. Alternatively you can change the system-wide limits of the kernel by changing -`/etc/sysctl.conf`. For linux that would be: - -```conf -fs.file-max = 65536 -``` - -In FreeBSD and OS X you change the lines like this: - -```conf -kern.maxfilesperproc=65536 -kern.maxfiles=65536 -``` - -You can apply the settings by running `sysctl -p` or by rebooting. - -## Files - -With the DB engine mode the metric data are stored in database files. These files are organized in pairs, the datafiles -and their corresponding journalfiles, e.g.: - -```sh -datafile-1-0000000001.ndf -journalfile-1-0000000001.njf -datafile-1-0000000002.ndf -journalfile-1-0000000002.njf -datafile-1-0000000003.ndf -journalfile-1-0000000003.njf -... -``` - -They are located under their host's cache directory in the directory `./dbengine` (e.g. for localhost the default -location is `/var/cache/netdata/dbengine/*`). The higher numbered filenames contain more recent metric data. The user -can safely delete some pairs of files when Netdata is stopped to manually free up some space. - -_Users should_ **back up** _their `./dbengine` folders if they consider this data to be important._ You can also set up -one or more [exporting connectors](https://github.com/netdata/netdata/blob/master/exporting/README.md) to send your Netdata metrics to other databases for long-term -storage at lower granularity. - -## Operation - -The DB engine stores chart metric values in 4096-byte pages in memory. Each chart dimension gets its own page to store -consecutive values generated from the data collectors. Those pages comprise the **Page Cache**. - -When those pages fill up, they are slowly compressed and flushed to disk. It can -take `4096 / 4 = 1024 seconds = 17 minutes`, for a chart dimension that is being collected every 1 second, to fill a -page. Pages can be cut short when we stop Netdata or the DB engine instance so as to not lose the data. When we query -the DB engine for data we trigger disk read I/O requests that fill the Page Cache with the requested pages and -potentially evict cold (not recently used) -pages. - -When the disk quota is exceeded the oldest values are removed from the DB engine at real time, by automatically deleting -the oldest datafile and journalfile pair. Any corresponding pages residing in the Page Cache will also be invalidated -and removed. The DB engine logic will try to maintain between 10 and 20 file pairs at any point in time. - -The Database Engine uses direct I/O to avoid polluting the OS filesystem caches and does not generate excessive I/O -traffic so as to create the minimum possible interference with other applications. - -## Evaluation - -We have evaluated the performance of the `dbengine` API that the netdata daemon uses internally. This is **not** the web -API of netdata. Our benchmarks ran on a **single** `dbengine` instance, multiple of which can be running in a Netdata -parent node. We used a server with an AMD Ryzen Threadripper 2950X 16-Core Processor and 2 disk drives, a Seagate -Constellation ES.3 2TB magnetic HDD and a SAMSUNG MZQLB960HAJR-00007 960GB NAND Flash SSD. - -For our workload, we defined 32 charts with 128 metrics each, giving us a total of 4096 metrics. We defined 1 worker -thread per chart (32 threads) that generates new data points with a data generation interval of 1 second. The time axis -of the time-series is emulated and accelerated so that the worker threads can generate as many data points as possible -without delays. - -We also defined 32 worker threads that perform queries on random metrics with semi-random time ranges. The starting time -of the query is randomly selected between the beginning of the time-series and the time of the latest data point. The -ending time is randomly selected between 1 second and 1 hour after the starting time. The pseudo-random numbers are -generated with a uniform distribution. - -The data are written to the database at the same time as they are read from it. This is a concurrent read/write mixed -workload with a duration of 60 seconds. The faster `dbengine` runs, the bigger the dataset size becomes since more data -points will be generated. We set a page cache size of 64MiB for the two disk-bound scenarios. This way, the dataset size -of the metric data is much bigger than the RAM that is being used for caching so as to trigger I/O requests most of the -time. In our final scenario, we set the page cache size to 16 GiB. That way, the dataset fits in the page cache so as to -avoid all disk bottlenecks. - -The reported numbers are the following: - -| device | page cache | dataset | reads/sec | writes/sec | -|:------:|:----------:|--------:|----------:|-----------:| -| HDD | 64 MiB | 4.1 GiB | 813K | 18.0M | -| SSD | 64 MiB | 9.8 GiB | 1.7M | 43.0M | -| N/A | 16 GiB | 6.8 GiB | 118.2M | 30.2M | - -where "reads/sec" is the number of metric data points being read from the database via its API per second and -"writes/sec" is the number of metric data points being written to the database per second. - -Notice that the HDD numbers are pretty high and not much slower than the SSD numbers. This is thanks to the database -engine design being optimized for rotating media. In the database engine disk I/O requests are: -- asynchronous to mask the high I/O latency of HDDs. -- mostly large to reduce the amount of HDD seeking time. -- mostly sequential to reduce the amount of HDD seeking time. -- compressed to reduce the amount of required throughput. -As a result, the HDD is not thousands of times slower than the SSD, which is typical for other workloads. -An interesting observation to make is that the CPU-bound run (16 GiB page cache) generates fewer data than the SSD run -(6.8 GiB vs 9.8 GiB). The reason is that the 32 reader threads in the SSD scenario are more frequently blocked by I/O, -and generate a read load of 1.7M/sec, whereas in the CPU-bound scenario the read load is 70 times higher at 118M/sec. -Consequently, there is a significant degree of interference by the reader threads, that slow down the writer threads. -This is also possible because the interference effects are greater than the SSD impact on data generation throughput. diff --git a/database/engine/cache.c b/database/engine/cache.c index 4091684b..bc3ba6b6 100644 --- a/database/engine/cache.c +++ b/database/engine/cache.c @@ -1189,6 +1189,9 @@ premature_exit: } static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { + internal_fatal(entry->start_time_s < 0 || entry->end_time_s < 0, + "DBENGINE CACHE: timestamps are negative"); + __atomic_add_fetch(&cache->stats.workers_add, 1, __ATOMIC_RELAXED); size_t partition = pgc_indexing_partition(cache, entry->metric_id); @@ -1199,6 +1202,12 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { PGC_PAGE *page; size_t spins = 0; + if(unlikely(entry->start_time_s < 0)) + entry->start_time_s = 0; + + if(unlikely(entry->end_time_s < 0)) + entry->end_time_s = 0; + do { if(++spins > 1) __atomic_add_fetch(&cache->stats.insert_spins, 1, __ATOMIC_RELAXED); @@ -1755,7 +1764,7 @@ PGC *pgc_create(const char *name, cache->config.max_dirty_pages_per_call = max_dirty_pages_per_flush; cache->config.pgc_save_init_cb = pgc_save_init_cb; cache->config.pgc_save_dirty_cb = pgc_save_dirty_cb; - cache->config.max_pages_per_inline_eviction = (max_pages_per_inline_eviction < 2) ? 2 : max_pages_per_inline_eviction; + cache->config.max_pages_per_inline_eviction = max_pages_per_inline_eviction; cache->config.max_skip_pages_per_inline_eviction = (max_skip_pages_per_inline_eviction < 2) ? 2 : max_skip_pages_per_inline_eviction; cache->config.max_flushes_inline = (max_flushes_inline < 1) ? 1 : max_flushes_inline; cache->config.partitions = partitions < 1 ? (size_t)get_netdata_cpus() : partitions; @@ -1946,7 +1955,7 @@ time_t pgc_page_update_every_s(PGC_PAGE *page) { time_t pgc_page_fix_update_every(PGC_PAGE *page, time_t update_every_s) { if(page->update_every_s == 0) - page->update_every_s = update_every_s; + page->update_every_s = (uint32_t) update_every_s; return page->update_every_s; } @@ -2083,7 +2092,7 @@ void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_ struct section_pages *sp = *section_pages_pptr; if(!netdata_spinlock_trylock(&sp->migration_to_v2_spinlock)) { - internal_fatal(true, "DBENGINE: migration to journal v2 is already running for this section"); + info("DBENGINE: migration to journal v2 for datafile %u is postponed, another jv2 indexer is already running for this section", datafile_fileno); pgc_ll_unlock(cache, &cache->hot); return; } diff --git a/database/engine/datafile.c b/database/engine/datafile.c index 286ae1e3..8c413d8d 100644 --- a/database/engine/datafile.c +++ b/database/engine/datafile.c @@ -34,17 +34,6 @@ static struct rrdengine_datafile *datafile_alloc_and_init(struct rrdengine_insta return datafile; } -void datafile_acquire_dup(struct rrdengine_datafile *df) { - netdata_spinlock_lock(&df->users.spinlock); - - if(!df->users.lockers) - fatal("DBENGINE: datafile is not acquired to duplicate"); - - df->users.lockers++; - - netdata_spinlock_unlock(&df->users.spinlock); -} - bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { bool ret; @@ -390,8 +379,8 @@ static int scan_data_files_cmp(const void *a, const void *b) /* Returns number of datafiles that were loaded or < 0 on error */ static int scan_data_files(struct rrdengine_instance *ctx) { - int ret; - unsigned tier, no, matched_files, i,failed_to_load; + int ret, matched_files, failed_to_load, i; + unsigned tier, no; uv_fs_t req; uv_dirent_t dent; struct rrdengine_datafile **datafiles, *datafile; diff --git a/database/engine/datafile.h b/database/engine/datafile.h index 274add91..a08f3ae0 100644 --- a/database/engine/datafile.h +++ b/database/engine/datafile.h @@ -70,7 +70,6 @@ struct rrdengine_datafile { } extent_queries; }; -void datafile_acquire_dup(struct rrdengine_datafile *df); bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); bool datafile_acquire_for_deletion(struct rrdengine_datafile *df); diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index de2b909c..9998ee54 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -40,7 +40,7 @@ static void update_metric_retention_and_granularity_by_uuid( .section = (Word_t) ctx, .first_time_s = first_time_s, .last_time_s = last_time_s, - .latest_update_every_s = update_every_s + .latest_update_every_s = (uint32_t) update_every_s }; uuid_copy(entry.uuid, *uuid); metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); @@ -617,7 +617,7 @@ static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, .section = (Word_t)ctx, .first_time_s = vd.start_time_s, .last_time_s = vd.end_time_s, - .latest_update_every_s = vd.update_every_s, + .latest_update_every_s = (uint32_t) vd.update_every_s, }; uuid_copy(entry.uuid, *temp_id); @@ -911,15 +911,10 @@ void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, st for (size_t i=0; i < entries; i++) { time_t start_time_s = header_start_time_s + metric->delta_start_s; time_t end_time_s = header_start_time_s + metric->delta_end_s; - time_t update_every_s = (metric->entries > 1) ? ((end_time_s - start_time_s) / (entries - 1)) : 0; + update_metric_retention_and_granularity_by_uuid( - ctx, &metric->uuid, start_time_s, end_time_s, update_every_s, now_s); + ctx, &metric->uuid, start_time_s, end_time_s, (time_t) metric->update_every_s, now_s); -#ifdef NETDATA_INTERNAL_CHECKS - struct journal_page_header *metric_list_header = (void *) (data_start + metric->page_offset); - fatal_assert(uuid_compare(metric_list_header->uuid, metric->uuid) == 0); - fatal_assert(metric->entries == metric_list_header->entries); -#endif metric++; } @@ -1038,7 +1033,7 @@ static int journalfile_metric_compare (const void *item1, const void *item2) const struct jv2_metrics_info *metric1 = ((struct journal_metric_list_to_sort *) item1)->metric_info; const struct jv2_metrics_info *metric2 = ((struct journal_metric_list_to_sort *) item2)->metric_info; - return uuid_compare(*(metric1->uuid), *(metric2->uuid)); + return memcmp(metric1->uuid, metric2->uuid, sizeof(uuid_t)); } @@ -1084,6 +1079,7 @@ void *journalfile_v2_write_metric_page(struct journal_v2_header *j2_header, void metric->page_offset = pages_offset; metric->delta_start_s = (uint32_t)(metric_info->first_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC)); metric->delta_end_s = (uint32_t)(metric_info->last_time_s - (time_t)(j2_header->start_time_ut / USEC_PER_SEC)); + metric->update_every_s = 0; return ++metric; } @@ -1128,7 +1124,7 @@ void *journalfile_v2_write_data_page(struct journal_v2_header *j2_header, void * data_page->delta_end_s = (uint32_t) (page_info->end_time_s - (time_t) (j2_header->start_time_ut) / USEC_PER_SEC); data_page->extent_index = page_info->extent_index; - data_page->update_every_s = page_info->update_every_s; + data_page->update_every_s = (uint32_t) page_info->update_every_s; data_page->page_length = (uint16_t) (ei ? ei->page_length : page_info->page_length); data_page->type = 0; @@ -1136,7 +1132,8 @@ void *journalfile_v2_write_data_page(struct journal_v2_header *j2_header, void * } // Must be recorded in metric_info->entries -void *journalfile_v2_write_descriptors(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info) +static void *journalfile_v2_write_descriptors(struct journal_v2_header *j2_header, void *data, struct jv2_metrics_info *metric_info, + struct journal_metric_list *current_metric) { Pvoid_t *PValue; @@ -1148,13 +1145,16 @@ void *journalfile_v2_write_descriptors(struct journal_v2_header *j2_header, void Word_t index_time = 0; bool first = true; struct jv2_page_info *page_info; + uint32_t update_every_s = 0; while ((PValue = JudyLFirstThenNext(JudyL_array, &index_time, &first))) { page_info = *PValue; // Write one descriptor and return the next data page location data_page = journalfile_v2_write_data_page(j2_header, (void *) data_page, page_info); + update_every_s = (uint32_t) page_info->update_every_s; if (NULL == data_page) break; } + current_metric->update_every_s = update_every_s; return data_page; } @@ -1291,6 +1291,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno // Calculate current UUID offset from start of file. We will store this in the data page header uint32_t uuid_offset = data - data_start; + struct journal_metric_list *current_metric = (void *) data; // Write the UUID we are processing data = (void *) journalfile_v2_write_metric_page(&j2_header, data, metric_info, pages_offset); if (unlikely(!data)) @@ -1308,7 +1309,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno uuid_offset); // Start writing descr @ time - void *page_trailer = journalfile_v2_write_descriptors(&j2_header, metric_page, metric_info); + void *page_trailer = journalfile_v2_write_descriptors(&j2_header, metric_page, metric_info, current_metric); if (unlikely(!page_trailer)) break; diff --git a/database/engine/journalfile.h b/database/engine/journalfile.h index 5fbcc90f..f6be6bcd 100644 --- a/database/engine/journalfile.h +++ b/database/engine/journalfile.h @@ -59,9 +59,9 @@ static inline uint64_t journalfile_current_size(struct rrdengine_journalfile *jo // Journal v2 structures -#define JOURVAL_V2_MAGIC (0x01221019) -#define JOURVAL_V2_REBUILD_MAGIC (0x00221019) -#define JOURVAL_V2_SKIP_MAGIC (0x02221019) +#define JOURVAL_V2_MAGIC (0x01230317) +#define JOURVAL_V2_REBUILD_MAGIC (0x00230317) +#define JOURVAL_V2_SKIP_MAGIC (0x02230317) struct journal_v2_block_trailer { union { @@ -93,13 +93,14 @@ struct journal_page_list { }; // UUID_LIST -// 32 bytes +// 36 bytes struct journal_metric_list { uuid_t uuid; - uint32_t entries; // Number of entries - uint32_t page_offset; // OFFSET that contains entries * struct( journal_page_list ) + uint32_t entries; // Number of entries + uint32_t page_offset; // OFFSET that contains entries * struct( journal_page_list ) uint32_t delta_start_s; // Min time of metric uint32_t delta_end_s; // Max time of metric (to be used to populate page_index) + uint32_t update_every_s; // Last update every for this metric in this journal (last page collected) }; // 16 bytes diff --git a/database/engine/journalfile.ksy b/database/engine/journalfile_v2.ksy.in index 858db83d..6a656bc4 100644 --- a/database/engine/journalfile.ksy +++ b/database/engine/journalfile_v2.ksy.in @@ -1,6 +1,9 @@ meta: - id: netdata_journalfile_v2 + id: journalfile_v2`'ifdef(`VIRT_MEMBERS',`_virtmemb') endian: le + application: netdata + file-extension: njfv2 + license: GPL-3.0-or-later seq: - id: journal_v2_header @@ -19,12 +22,14 @@ seq: - id: metric_trailer type: journal_v2_block_trailer - id: page_blocs - type: jounral_v2_page_blocs + type: journal_v2_page_block + repeat: expr + repeat-expr: _root.journal_v2_header.metric_count + - id: padding size: _root._io.size - _root._io.pos - 4 - id: journal_file_trailer type: journal_v2_block_trailer - types: journal_v2_metric_list: seq: @@ -38,11 +43,13 @@ types: type: u4 - id: delta_end_s type: u4 - instances: +ifdef(`VIRT_MEMBERS', +` instances: page_block: type: journal_v2_page_block io: _root._io pos: page_offset +')dnl journal_v2_page_hdr: seq: - id: crc @@ -69,11 +76,13 @@ types: type: u1 - id: reserved type: u1 - instances: +ifdef(`VIRT_MEMBERS', +` instances: extent: io: _root._io type: journal_v2_extent_list pos: _root.journal_v2_header.extent_offset + (extent_idx * 16) +')dnl journal_v2_header: seq: - id: magic @@ -106,11 +115,13 @@ types: type: u4 - id: data type: u8 - instances: +ifdef(`VIRT_MEMBERS', +` instances: trailer: io: _root._io type: journal_v2_block_trailer pos: _root._io.size - 4 +')dnl journal_v2_block_trailer: seq: - id: checksum @@ -137,8 +148,3 @@ types: repeat-expr: hdr.entries - id: block_trailer type: journal_v2_block_trailer - jounral_v2_page_blocs: - seq: - - id: blocs - type: journal_v2_page_block - repeat: eos diff --git a/database/engine/metric.c b/database/engine/metric.c index 9dc9d9eb..6b65df9b 100644 --- a/database/engine/metric.c +++ b/database/engine/metric.c @@ -105,7 +105,7 @@ static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) { } static inline bool metric_has_retention_unsafe(MRG *mrg __maybe_unused, METRIC *metric) { - bool has_retention = (metric->first_time_s || metric->latest_time_s_clean || metric->latest_time_s_hot); + bool has_retention = (metric->first_time_s > 0 || metric->latest_time_s_clean > 0 || metric->latest_time_s_hot > 0); if(has_retention && !(metric->flags & METRIC_FLAG_HAS_RETENTION)) { metric->flags |= METRIC_FLAG_HAS_RETENTION; @@ -210,8 +210,8 @@ static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { METRIC *metric = allocation; uuid_copy(metric->uuid, entry->uuid); metric->section = entry->section; - metric->first_time_s = entry->first_time_s; - metric->latest_time_s_clean = entry->last_time_s; + metric->first_time_s = MAX(0, entry->first_time_s); + metric->latest_time_s_clean = MAX(0, entry->last_time_s); metric->latest_time_s_hot = 0; metric->latest_update_every_s = entry->latest_update_every_s; metric->writer = 0; @@ -388,6 +388,11 @@ Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) { } bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { + internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative"); + + if(unlikely(first_time_s < 0)) + return false; + netdata_spinlock_lock(&metric->spinlock); metric->first_time_s = first_time_s; metric_has_retention_unsafe(mrg, metric); @@ -397,12 +402,25 @@ bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t } void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s) { - + internal_fatal(first_time_s < 0 || last_time_s < 0 || update_every_s < 0, + "DBENGINE METRIC: timestamp is negative"); internal_fatal(first_time_s > max_acceptable_collected_time(), "DBENGINE METRIC: metric first time is in the future"); internal_fatal(last_time_s > max_acceptable_collected_time(), "DBENGINE METRIC: metric last time is in the future"); + if(unlikely(first_time_s < 0)) + first_time_s = 0; + + if(unlikely(last_time_s < 0)) + last_time_s = 0; + + if(unlikely(update_every_s < 0)) + update_every_s = 0; + + if(unlikely(!first_time_s && !last_time_s && !update_every_s)) + return; + netdata_spinlock_lock(&metric->spinlock); if(unlikely(first_time_s && (!metric->first_time_s || first_time_s < metric->first_time_s))) @@ -412,16 +430,18 @@ void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t metric->latest_time_s_clean = last_time_s; if(likely(update_every_s)) - metric->latest_update_every_s = update_every_s; + metric->latest_update_every_s = (uint32_t) update_every_s; } else if(unlikely(!metric->latest_update_every_s && update_every_s)) - metric->latest_update_every_s = update_every_s; + metric->latest_update_every_s = (uint32_t) update_every_s; metric_has_retention_unsafe(mrg, metric); netdata_spinlock_unlock(&metric->spinlock); } bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { + internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative"); + bool ret = false; netdata_spinlock_lock(&metric->spinlock); @@ -474,6 +494,11 @@ void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *f } bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { + internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative"); + + if(unlikely(latest_time_s < 0)) + return false; + netdata_spinlock_lock(&metric->spinlock); // internal_fatal(latest_time_s > max_acceptable_collected_time(), @@ -487,9 +512,6 @@ bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, if(unlikely(!metric->first_time_s)) metric->first_time_s = latest_time_s; -// if(unlikely(metric->first_time_s > latest_time_s)) -// metric->first_time_s = latest_time_s; - metric_has_retention_unsafe(mrg, metric); netdata_spinlock_unlock(&metric->spinlock); return true; @@ -517,7 +539,7 @@ bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { page_first_time_s = pgc_page_start_time_s(page); page_end_time_s = pgc_page_end_time_s(page); - if ((is_hot || is_dirty) && page_first_time_s < min_first_time_s) + if ((is_hot || is_dirty) && page_first_time_s > 0 && page_first_time_s < min_first_time_s) min_first_time_s = page_first_time_s; if (is_dirty && page_end_time_s > max_end_time_s) @@ -548,18 +570,20 @@ bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { } bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { + internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative"); + // internal_fatal(latest_time_s > max_acceptable_collected_time(), // "DBENGINE METRIC: metric latest time is in the future"); + if(unlikely(latest_time_s < 0)) + return false; + netdata_spinlock_lock(&metric->spinlock); metric->latest_time_s_hot = latest_time_s; if(unlikely(!metric->first_time_s)) metric->first_time_s = latest_time_s; -// if(unlikely(metric->first_time_s > latest_time_s)) -// metric->first_time_s = latest_time_s; - metric_has_retention_unsafe(mrg, metric); netdata_spinlock_unlock(&metric->spinlock); return true; @@ -574,23 +598,27 @@ time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) { } bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { - if(!update_every_s) + internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); + + if(update_every_s <= 0) return false; netdata_spinlock_lock(&metric->spinlock); - metric->latest_update_every_s = update_every_s; + metric->latest_update_every_s = (uint32_t) update_every_s; netdata_spinlock_unlock(&metric->spinlock); return true; } bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { - if(!update_every_s) + internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); + + if(update_every_s <= 0) return false; netdata_spinlock_lock(&metric->spinlock); if(!metric->latest_update_every_s) - metric->latest_update_every_s = update_every_s; + metric->latest_update_every_s = (uint32_t) update_every_s; netdata_spinlock_unlock(&metric->spinlock); return true; diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index b4902d78..02d08a16 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -99,11 +99,6 @@ inline TIME_RANGE_COMPARE is_page_in_time_range(time_t page_first_time_s, time_t return PAGE_IS_IN_RANGE; } -static int journal_metric_uuid_compare(const void *key, const void *metric) -{ - return uuid_compare(*(uuid_t *) key, ((struct journal_metric_list *) metric)->uuid); -} - static inline struct page_details *pdc_find_page_for_time( Pcvoid_t PArray, time_t wanted_time_s, @@ -310,7 +305,7 @@ static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengin pd->first_time_s = page_start_time_s; pd->last_time_s = page_end_time_s; pd->page_length = page_length; - pd->update_every_s = page_update_every_s; + pd->update_every_s = (uint32_t) page_update_every_s; pd->page = (open_cache_mode) ? NULL : page; pd->status |= tags; @@ -581,7 +576,7 @@ static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METR .metric_id = metric_id, .start_time_s = page_first_time_s, .end_time_s = page_last_time_s, - .update_every_s = page_update_every_s, + .update_every_s = (uint32_t) page_update_every_s, .data = datafile, .size = 0, .custom_data = (uint8_t *) &ei, @@ -635,7 +630,7 @@ void add_page_details_from_journal_v2(PGC_PAGE *page, void *JudyL_pptr) { pd->last_time_s = pgc_page_end_time_s(page); pd->datafile.ptr = datafile; pd->page_length = ei->page_length; - pd->update_every_s = pgc_page_update_every_s(page); + pd->update_every_s = (uint32_t) pgc_page_update_every_s(page); pd->metric_id = metric_id; pd->status |= PDC_PAGE_DISK_PENDING | PDC_PAGE_SOURCE_JOURNAL_V2 | PDC_PAGE_DATAFILE_ACQUIRED; } @@ -774,7 +769,10 @@ inline void rrdeng_prep_wait(PDC *pdc) { } } -void rrdeng_prep_query(PDC *pdc) { +void rrdeng_prep_query(struct page_details_control *pdc, bool worker) { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_QUERY); + size_t pages_to_load = 0; pdc->page_list_JudyL = get_page_list(pdc->ctx, pdc->metric, pdc->start_time_s * USEC_PER_SEC, @@ -785,10 +783,10 @@ void rrdeng_prep_query(PDC *pdc) { if (pages_to_load && pdc->page_list_JudyL) { pdc_acquire(pdc); // we get 1 for the 1st worker in the chain: do_read_page_list_work() usec_t start_ut = now_monotonic_usec(); -// if(likely(priority == STORAGE_PRIORITY_BEST_EFFORT)) -// dbengine_load_page_list_directly(ctx, handle->pdc); -// else - pdc_route_asynchronously(pdc->ctx, pdc); + if(likely(pdc->priority == STORAGE_PRIORITY_SYNCHRONOUS)) + pdc_route_synchronously(pdc->ctx, pdc); + else + pdc_route_asynchronously(pdc->ctx, pdc); __atomic_add_fetch(&rrdeng_cache_efficiency_stats.prep_time_to_route, now_monotonic_usec() - start_ut, __ATOMIC_RELAXED); } else @@ -797,6 +795,9 @@ void rrdeng_prep_query(PDC *pdc) { completion_mark_complete(&pdc->prep_completion); pdc_release_and_destroy_if_unreferenced(pdc, true, true); + + if(worker) + worker_is_idle(); } /** @@ -827,7 +828,11 @@ void pg_cache_preload(struct rrdeng_query_handle *handle) { if(ctx_is_available_for_queries(handle->ctx)) { handle->pdc->refcount++; // we get 1 for the query thread and 1 for the prep thread - rrdeng_enq_cmd(handle->ctx, RRDENG_OPCODE_QUERY, handle->pdc, NULL, handle->priority, NULL, NULL); + + if(unlikely(handle->pdc->priority == STORAGE_PRIORITY_SYNCHRONOUS)) + rrdeng_prep_query(handle->pdc, false); + else + rrdeng_enq_cmd(handle->ctx, RRDENG_OPCODE_QUERY, handle->pdc, NULL, handle->priority, NULL, NULL); } else { completion_mark_complete(&handle->pdc->prep_completion); @@ -924,7 +929,8 @@ struct pgc_page *pg_cache_lookup_next( else { if (unlikely(page_update_every_s <= 0 || page_update_every_s > 86400)) { __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_update_every_fixed, 1, __ATOMIC_RELAXED); - pd->update_every_s = page_update_every_s = pgc_page_fix_update_every(page, last_update_every_s); + page_update_every_s = pgc_page_fix_update_every(page, last_update_every_s); + pd->update_every_s = (uint32_t) page_update_every_s; } size_t entries_by_size = page_entries_by_size(page_length, CTX_POINT_SIZE_BYTES(ctx)); @@ -1009,7 +1015,7 @@ void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s .metric_id = metric_id, .start_time_s = start_time_s, .end_time_s = end_time_s, - .update_every_s = update_every_s, + .update_every_s = (uint32_t) update_every_s, .size = 0, .data = datafile, .custom_data = (uint8_t *) &ext_io_data, @@ -1069,6 +1075,8 @@ void pgc_and_mrg_initialize(void) main_cache_size = target_cache_size - extent_cache_size; } + extent_cache_size += (size_t)(default_rrdeng_extent_cache_mb * 1024ULL * 1024ULL); + main_cache = pgc_create( "main_cache", main_cache_size, diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index 9ab7db07..5242db89 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -45,16 +45,14 @@ struct rrdeng_page_info { }; struct pg_alignment { - uint32_t page_position; uint32_t refcount; - uint16_t initial_slots; }; struct rrdeng_query_handle; struct page_details_control; void rrdeng_prep_wait(struct page_details_control *pdc); -void rrdeng_prep_query(struct page_details_control *pdc); +void rrdeng_prep_query(struct page_details_control *pdc, bool worker); void pg_cache_preload(struct rrdeng_query_handle *handle); struct pgc_page *pg_cache_lookup_next(struct rrdengine_instance *ctx, struct page_details_control *pdc, time_t now_s, time_t last_update_every_s, size_t *entries); void pgc_and_mrg_initialize(void); diff --git a/database/engine/pdc.c b/database/engine/pdc.c index 8b8e7195..42fb2f6d 100644 --- a/database/engine/pdc.c +++ b/database/engine/pdc.c @@ -692,8 +692,9 @@ VALIDATED_PAGE_DESCRIPTOR validate_page( vd.page_length > RRDENG_BLOCK_SIZE || vd.start_time_s > vd.end_time_s || (now_s && vd.end_time_s > now_s) || - vd.start_time_s == 0 || - vd.end_time_s == 0 || + vd.start_time_s <= 0 || + vd.end_time_s <= 0 || + vd.update_every_s < 0 || (vd.start_time_s == vd.end_time_s && vd.entries > 1) || (vd.update_every_s == 0 && vd.entries > 1) ) @@ -835,7 +836,7 @@ static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL * uuid_unparse_lower(descr->uuid, uuid); used_descr = true; } - else if (epdl) { + else { struct page_details *pd = NULL; Word_t start = 0; @@ -855,7 +856,7 @@ static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL * } } - if(!used_epdl && !used_descr && epdl && epdl->pdc) { + if(!used_epdl && !used_descr && epdl->pdc) { start_time_s = epdl->pdc->start_time_s; end_time_s = epdl->pdc->end_time_s; } @@ -1059,7 +1060,7 @@ static bool epdl_populate_pages_from_extent_data( .metric_id = metric_id, .start_time_s = vd.start_time_s, .end_time_s = vd.end_time_s, - .update_every_s = vd.update_every_s, + .update_every_s = (uint32_t) vd.update_every_s, .size = (size_t) ((page_data == DBENGINE_EMPTY_PAGE) ? 0 : vd.page_length), .data = page_data }; @@ -1150,6 +1151,9 @@ static inline void datafile_extent_read_free(void *buffer) { } void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker) { + if(worker) + worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP); + size_t *statistics_counter = NULL; PDC_PAGE_STATUS not_loaded_pages_tag = 0, loaded_pages_tag = 0; @@ -1172,9 +1176,6 @@ void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *e goto cleanup; } - if(worker) - worker_is_busy(UV_EVENT_DBENGINE_EXTENT_CACHE_LOOKUP); - bool extent_found_in_cache = false; void *extent_compressed_data = NULL; diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index d64868f0..7811a5ea 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -16,6 +16,24 @@ unsigned rrdeng_pages_per_extent = MAX_PAGES_PER_EXTENT; #error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2) #endif +struct rrdeng_cmd { + struct rrdengine_instance *ctx; + enum rrdeng_opcode opcode; + void *data; + struct completion *completion; + enum storage_priority priority; + dequeue_callback_t dequeue_cb; + + struct { + struct rrdeng_cmd *prev; + struct rrdeng_cmd *next; + } queue; +}; + +static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker); +static inline void worker_dispatch_extent_read(struct rrdeng_cmd cmd, bool from_worker); +static inline void worker_dispatch_query_prep(struct rrdeng_cmd cmd, bool from_worker); + struct rrdeng_main { uv_thread_t thread; uv_loop_t loop; @@ -45,7 +63,6 @@ struct rrdeng_main { struct { size_t dispatched; size_t executing; - size_t pending_cb; } atomics; } work_cmd; @@ -132,8 +149,22 @@ static void work_request_init(void) { ); } -static inline bool work_request_full(void) { - return __atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED) >= (size_t)(libuv_worker_threads - RESERVED_LIBUV_WORKER_THREADS); +enum LIBUV_WORKERS_STATUS { + LIBUV_WORKERS_RELAXED, + LIBUV_WORKERS_STRESSED, + LIBUV_WORKERS_CRITICAL, +}; + +static inline enum LIBUV_WORKERS_STATUS work_request_full(void) { + size_t dispatched = __atomic_load_n(&rrdeng_main.work_cmd.atomics.dispatched, __ATOMIC_RELAXED); + + if(dispatched >= (size_t)(libuv_worker_threads)) + return LIBUV_WORKERS_CRITICAL; + + else if(dispatched >= (size_t)(libuv_worker_threads - RESERVED_LIBUV_WORKER_THREADS)) + return LIBUV_WORKERS_STRESSED; + + return LIBUV_WORKERS_RELAXED; } static inline void work_done(struct rrdeng_work *work_request) { @@ -147,12 +178,38 @@ static void work_standard_worker(uv_work_t *req) { worker_is_busy(UV_EVENT_WORKER_INIT); struct rrdeng_work *work_request = req->data; + work_request->data = work_request->work_cb(work_request->ctx, work_request->data, work_request->completion, req); worker_is_idle(); + if(work_request->opcode == RRDENG_OPCODE_EXTENT_READ || work_request->opcode == RRDENG_OPCODE_QUERY) { + internal_fatal(work_request->after_work_cb != NULL, "DBENGINE: opcodes with a callback should not boosted"); + + while(1) { + struct rrdeng_cmd cmd = rrdeng_deq_cmd(true); + if (cmd.opcode == RRDENG_OPCODE_NOOP) + break; + + worker_is_busy(UV_EVENT_WORKER_INIT); + switch (cmd.opcode) { + case RRDENG_OPCODE_EXTENT_READ: + worker_dispatch_extent_read(cmd, true); + break; + + case RRDENG_OPCODE_QUERY: + worker_dispatch_query_prep(cmd, true); + break; + + default: + fatal("DBENGINE: Opcode should not be executed synchronously"); + break; + } + worker_is_idle(); + } + } + __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.dispatched, 1, __ATOMIC_RELAXED); __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.executing, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&rrdeng_main.work_cmd.atomics.pending_cb, 1, __ATOMIC_RELAXED); // signal the event loop a worker is available fatal_assert(0 == uv_async_send(&rrdeng_main.async)); @@ -167,7 +224,6 @@ static void after_work_standard_callback(uv_work_t* req, int status) { work_request->after_work_cb(work_request->ctx, work_request->data, work_request->completion, req, status); work_done(work_request); - __atomic_sub_fetch(&rrdeng_main.work_cmd.atomics.pending_cb, 1, __ATOMIC_RELAXED); worker_is_idle(); } @@ -369,20 +425,6 @@ void wal_release(WAL *wal) { // ---------------------------------------------------------------------------- // command queue cache -struct rrdeng_cmd { - struct rrdengine_instance *ctx; - enum rrdeng_opcode opcode; - void *data; - struct completion *completion; - enum storage_priority priority; - dequeue_callback_t dequeue_cb; - - struct { - struct rrdeng_cmd *prev; - struct rrdeng_cmd *next; - } queue; -}; - static void rrdeng_cmd_queue_init(void) { rrdeng_main.cmd_queue.ar = aral_create("dbengine-opcodes", sizeof(struct rrdeng_cmd), @@ -465,14 +507,33 @@ static inline bool rrdeng_cmd_has_waiting_opcodes_in_lower_priorities(STORAGE_PR return false; } -static inline struct rrdeng_cmd rrdeng_deq_cmd(void) { +#define opcode_empty (struct rrdeng_cmd) { \ + .ctx = NULL, \ + .opcode = RRDENG_OPCODE_NOOP, \ + .priority = STORAGE_PRIORITY_BEST_EFFORT, \ + .completion = NULL, \ + .data = NULL, \ +} + +static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker) { struct rrdeng_cmd *cmd = NULL; + enum LIBUV_WORKERS_STATUS status = work_request_full(); + + STORAGE_PRIORITY min_priority, max_priority; + min_priority = STORAGE_PRIORITY_INTERNAL_DBENGINE; + max_priority = (status != LIBUV_WORKERS_RELAXED) ? STORAGE_PRIORITY_INTERNAL_DBENGINE : STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE - 1; - STORAGE_PRIORITY max_priority = work_request_full() ? STORAGE_PRIORITY_INTERNAL_DBENGINE : STORAGE_PRIORITY_BEST_EFFORT; + if(from_worker) { + if(status == LIBUV_WORKERS_CRITICAL) + return opcode_empty; + + min_priority = STORAGE_PRIORITY_INTERNAL_QUERY_PREP; + max_priority = STORAGE_PRIORITY_BEST_EFFORT; + } // find an opcode to execute from the queue netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); - for(STORAGE_PRIORITY priority = STORAGE_PRIORITY_INTERNAL_DBENGINE; priority <= max_priority ; priority++) { + for(STORAGE_PRIORITY priority = min_priority; priority <= max_priority ; priority++) { cmd = rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority]; if(cmd) { @@ -508,13 +569,7 @@ static inline struct rrdeng_cmd rrdeng_deq_cmd(void) { aral_freez(rrdeng_main.cmd_queue.ar, cmd); } else - ret = (struct rrdeng_cmd) { - .ctx = NULL, - .opcode = RRDENG_OPCODE_NOOP, - .priority = STORAGE_PRIORITY_BEST_EFFORT, - .completion = NULL, - .data = NULL, - }; + ret = opcode_empty; return ret; } @@ -927,11 +982,6 @@ struct uuid_first_time_s { size_t df_index_oldest; }; -static int journal_metric_compare(const void *key, const void *metric) -{ - return uuid_compare(*(uuid_t *) key, ((struct journal_metric_list *) metric)->uuid); -} - struct rrdengine_datafile *datafile_release_and_acquire_next_for_retention(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { uv_rwlock_rdlock(&ctx->datafiles.rwlock); @@ -987,7 +1037,10 @@ void find_uuid_first_time( if (uuid_original_entry->df_matched > 3 || uuid_original_entry->pages_found > 5) continue; - struct journal_metric_list *live_entry = bsearch(uuid_original_entry->uuid,uuid_list,journal_metric_count,sizeof(*uuid_list), journal_metric_compare); + struct journal_metric_list *live_entry = + bsearch(uuid_original_entry->uuid,uuid_list,journal_metric_count, + sizeof(*uuid_list), journal_metric_uuid_compare); + if (!live_entry) { // Not found in this journal not_matching_bsearches++; @@ -1087,13 +1140,20 @@ void find_uuid_first_time( } static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile_to_delete, struct rrdengine_datafile *first_datafile_remaining, bool worker) { - __atomic_add_fetch(&rrdeng_cache_efficiency_stats.metrics_retention_started, 1, __ATOMIC_RELAXED); - if(worker) worker_is_busy(UV_EVENT_DBENGINE_FIND_ROTATED_METRICS); struct rrdengine_journalfile *journalfile = datafile_to_delete->journalfile; struct journal_v2_header *j2_header = journalfile_v2_data_acquire(journalfile, NULL, 0, 0); + + if (unlikely(!j2_header)) { + if (worker) + worker_is_idle(); + return; + } + + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.metrics_retention_started, 1, __ATOMIC_RELAXED); + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); size_t count = j2_header->metric_count; @@ -1348,14 +1408,9 @@ static void *cache_evict_tp_worker(struct rrdengine_instance *ctx __maybe_unused return data; } -static void after_prep_query(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { - ; -} - static void *query_prep_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *req __maybe_unused) { - worker_is_busy(UV_EVENT_DBENGINE_QUERY); PDC *pdc = data; - rrdeng_prep_query(pdc); + rrdeng_prep_query(pdc, true); return data; } @@ -1435,21 +1490,28 @@ static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __mayb worker_is_busy(UV_EVENT_DBENGINE_JOURNAL_INDEX); count = 0; while (datafile && datafile->fileno != ctx_last_fileno_get(ctx) && datafile->fileno != ctx_last_flush_fileno_get(ctx)) { + if(journalfile_v2_data_available(datafile->journalfile)) { + // journal file v2 is already there for this datafile + datafile = datafile->next; + continue; + } netdata_spinlock_lock(&datafile->writers.spinlock); bool available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? false : true; netdata_spinlock_unlock(&datafile->writers.spinlock); - if(!available) + if(!available) { + info("DBENGINE: journal file %u needs to be indexed, but it has writers working on it - skipping it for now", datafile->fileno); + datafile = datafile->next; continue; - - if (unlikely(!journalfile_v2_data_available(datafile->journalfile))) { - info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); - pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, - journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); - count++; } + info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); + pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, + journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); + + count++; + datafile = datafile->next; if (unlikely(!ctx_is_available_for_queries(ctx))) @@ -1472,10 +1534,6 @@ static void after_do_cache_evict(struct rrdengine_instance *ctx __maybe_unused, rrdeng_main.evictions_running--; } -static void after_extent_read(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { - ; -} - static void after_journal_v2_indexing(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t* req __maybe_unused, int status __maybe_unused) { __atomic_store_n(&ctx->atomic.migration_to_v2_running, false, __ATOMIC_RELAXED); rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); @@ -1604,6 +1662,26 @@ bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { return true; } +static inline void worker_dispatch_extent_read(struct rrdeng_cmd cmd, bool from_worker) { + struct rrdengine_instance *ctx = cmd.ctx; + EPDL *epdl = cmd.data; + + if(from_worker) + epdl_find_extent_and_populate_pages(ctx, epdl, true); + else + work_dispatch(ctx, epdl, NULL, cmd.opcode, extent_read_tp_worker, NULL); +} + +static inline void worker_dispatch_query_prep(struct rrdeng_cmd cmd, bool from_worker) { + struct rrdengine_instance *ctx = cmd.ctx; + PDC *pdc = cmd.data; + + if(from_worker) + rrdeng_prep_query(pdc, true); + else + work_dispatch(ctx, pdc, NULL, cmd.opcode, query_prep_tp_worker, NULL); +} + void dbengine_event_loop(void* arg) { sanity_check(); uv_thread_set_name_np(pthread_self(), "DBENGINE"); @@ -1661,25 +1739,19 @@ void dbengine_event_loop(void* arg) { /* wait for commands */ do { worker_is_busy(RRDENG_OPCODE_MAX); - cmd = rrdeng_deq_cmd(); + cmd = rrdeng_deq_cmd(RRDENG_OPCODE_NOOP); opcode = cmd.opcode; worker_is_busy(opcode); switch (opcode) { - case RRDENG_OPCODE_EXTENT_READ: { - struct rrdengine_instance *ctx = cmd.ctx; - EPDL *epdl = cmd.data; - work_dispatch(ctx, epdl, NULL, opcode, extent_read_tp_worker, after_extent_read); + case RRDENG_OPCODE_EXTENT_READ: + worker_dispatch_extent_read(cmd, false); break; - } - case RRDENG_OPCODE_QUERY: { - struct rrdengine_instance *ctx = cmd.ctx; - PDC *pdc = cmd.data; - work_dispatch(ctx, pdc, NULL, opcode, query_prep_tp_worker, after_prep_query); + case RRDENG_OPCODE_QUERY: + worker_dispatch_query_prep(cmd, false); break; - } case RRDENG_OPCODE_EXTENT_WRITE: { struct rrdengine_instance *ctx = cmd.ctx; diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 49266681..69e41235 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -160,9 +160,7 @@ struct jv2_page_info { }; typedef enum __attribute__ ((__packed__)) { - RRDENG_CHO_UNALIGNED = (1 << 0), // set when this metric is not page aligned according to page alignment - RRDENG_FIRST_PAGE_ALLOCATED = (1 << 1), // set when this metric has allocated its first page - RRDENG_1ST_METRIC_WRITER = (1 << 2), + RRDENG_1ST_METRIC_WRITER = (1 << 0), } RRDENG_COLLECT_HANDLE_OPTIONS; typedef enum __attribute__ ((__packed__)) { @@ -183,12 +181,17 @@ typedef enum __attribute__ ((__packed__)) { } RRDENG_COLLECT_PAGE_FLAGS; struct rrdeng_collect_handle { + struct storage_collect_handle common; // has to be first item + + RRDENG_COLLECT_PAGE_FLAGS page_flags; + RRDENG_COLLECT_HANDLE_OPTIONS options; + uint8_t type; + struct metric *metric; struct pgc_page *page; + void *data; + size_t data_size; struct pg_alignment *alignment; - RRDENG_COLLECT_HANDLE_OPTIONS options; - uint8_t type; - RRDENG_COLLECT_PAGE_FLAGS page_flags; uint32_t page_entries_max; uint32_t page_position; // keep track of the current page size, to make sure we don't exceed it usec_t page_start_time_ut; @@ -515,4 +518,8 @@ static inline time_t max_acceptable_collected_time(void) { void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool update_retention, bool worker); +static inline int journal_metric_uuid_compare(const void *key, const void *metric) { + return uuid_memcmp((uuid_t *)key, &(((struct journal_metric_list *) metric)->uuid)); +} + #endif /* NETDATA_RRDENGINE_H */ diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index 27497bbb..ddc306ed 100755 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -35,16 +35,16 @@ __attribute__((constructor)) void initialize_multidb_ctx(void) { multidb_ctx[4] = &multidb_ctx_storage_tier4; } -int default_rrdeng_page_fetch_timeout = 3; -int default_rrdeng_page_fetch_retries = 3; int db_engine_journal_check = 0; int default_rrdeng_disk_quota_mb = 256; int default_multidb_disk_quota_mb = 256; #if defined(ENV32BIT) int default_rrdeng_page_cache_mb = 16; +int default_rrdeng_extent_cache_mb = 0; #else int default_rrdeng_page_cache_mb = 32; +int default_rrdeng_extent_cache_mb = 0; #endif // ---------------------------------------------------------------------------- @@ -163,7 +163,7 @@ STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE } #ifdef NETDATA_INTERNAL_CHECKS - if(uuid_compare(rd->metric_uuid, *mrg_metric_uuid(main_mrg, metric)) != 0) { + if(uuid_memcmp(&rd->metric_uuid, mrg_metric_uuid(main_mrg, metric)) != 0) { char uuid1[UUID_STR_LEN + 1]; char uuid2[UUID_STR_LEN + 1]; @@ -255,8 +255,11 @@ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metri struct rrdeng_collect_handle *handle; handle = callocz(1, sizeof(struct rrdeng_collect_handle)); + handle->common.backend = STORAGE_ENGINE_BACKEND_DBENGINE; handle->metric = metric; handle->page = NULL; + handle->data = NULL; + handle->data_size = 0; handle->page_position = 0; handle->page_entries_max = 0; handle->update_every_ut = (usec_t)update_every * USEC_PER_SEC; @@ -340,6 +343,8 @@ void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_h handle->page_flags = 0; handle->page_position = 0; handle->page_entries_max = 0; + handle->data = NULL; + handle->data_size = 0; // important! // we should never zero page end time ut, because this will allow @@ -348,6 +353,8 @@ void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_h // handle->page_start_time_ut; check_and_fix_mrg_update_every(handle); + + timing_step(TIMING_STEP_DBENGINE_FLUSH_PAGE); } static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *handle, @@ -365,7 +372,7 @@ static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *ha .end_time_s = point_in_time_s, .size = data_size, .data = data, - .update_every_s = update_every_s, + .update_every_s = (uint32_t) update_every_s, .hot = true }; @@ -414,62 +421,57 @@ static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *ha handle->page_flags |= RRDENG_PAGE_CREATED_IN_FUTURE; check_and_fix_mrg_update_every(handle); + + timing_step(TIMING_STEP_DBENGINE_CREATE_NEW_PAGE); } -static void *rrdeng_alloc_new_metric_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { - struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); - size_t size; +static size_t aligned_allocation_entries(size_t max_slots, size_t target_slot, time_t now_s) { + size_t slots = target_slot; + size_t pos = (now_s % max_slots); - if(handle->options & RRDENG_FIRST_PAGE_ALLOCATED) { - // any page except the first - size = tier_page_size[ctx->config.tier]; - } - else { - size_t final_slots = 0; + if(pos > slots) + slots += max_slots - pos; - // the first page - handle->options |= RRDENG_FIRST_PAGE_ALLOCATED; - size_t max_size = tier_page_size[ctx->config.tier]; - size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx); + else if(pos < slots) + slots -= pos; - if(handle->alignment->initial_slots) { - final_slots = handle->alignment->initial_slots; - } - else { - max_slots -= 3; + else + slots = max_slots; - size_t smaller_slot = indexing_partition((Word_t)handle->alignment, max_slots); - final_slots = smaller_slot; + return slots; +} - time_t now_s = (time_t)(point_in_time_ut / USEC_PER_SEC); - size_t current_pos = (now_s % max_slots); +static void *rrdeng_alloc_new_metric_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); - if(current_pos > final_slots) - final_slots += max_slots - current_pos; + size_t max_size = tier_page_size[ctx->config.tier]; + size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx); - else if(current_pos < final_slots) - final_slots -= current_pos; + size_t slots = aligned_allocation_entries( + max_slots, + indexing_partition((Word_t) handle->alignment, max_slots), + (time_t) (point_in_time_ut / USEC_PER_SEC) + ); - if(final_slots < 3) { - final_slots += 3; - smaller_slot += 3; + if(slots < max_slots / 3) + slots = max_slots / 3; - if(smaller_slot >= max_slots) - smaller_slot -= max_slots; - } + if(slots < 3) + slots = 3; - max_slots += 3; - handle->alignment->initial_slots = smaller_slot + 3; + size_t size = slots * CTX_POINT_SIZE_BYTES(ctx); - internal_fatal(handle->alignment->initial_slots < 3 || handle->alignment->initial_slots >= max_slots, "ooops! wrong distribution of metrics across time"); - internal_fatal(final_slots < 3 || final_slots >= max_slots, "ooops! wrong distribution of metrics across time"); - } + // internal_error(true, "PAGE ALLOC %zu bytes (%zu max)", size, max_size); - size = final_slots * CTX_POINT_SIZE_BYTES(ctx); - } + internal_fatal(slots < 3 || slots > max_slots, "ooops! wrong distribution of metrics across time"); + internal_fatal(size > tier_page_size[ctx->config.tier] || size < CTX_POINT_SIZE_BYTES(ctx) * 2, "ooops! wrong page size"); *data_size = size; - return dbengine_page_alloc(size); + void *d = dbengine_page_alloc(size); + + timing_step(TIMING_STEP_DBENGINE_PAGE_ALLOC); + + return d; } static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_handle, @@ -484,75 +486,33 @@ static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); - bool perfect_page_alignment = false; - void *data; - size_t data_size; + if(unlikely(!handle->data)) + handle->data = rrdeng_alloc_new_metric_data(handle, &handle->data_size, point_in_time_ut); - if(likely(handle->page)) { - /* Make alignment decisions */ - if (handle->page_position == handle->alignment->page_position) { - /* this is the leading dimension that defines chart alignment */ - perfect_page_alignment = true; - } - - /* is the metric far enough out of alignment with the others? */ - if (unlikely(handle->page_position + 1 < handle->alignment->page_position)) - handle->options |= RRDENG_CHO_UNALIGNED; + timing_step(TIMING_STEP_DBENGINE_CHECK_DATA); - if (unlikely((handle->options & RRDENG_CHO_UNALIGNED) && - /* did the other metrics change page? */ - handle->alignment->page_position <= 1)) { - handle->options &= ~RRDENG_CHO_UNALIGNED; - handle->page_flags |= RRDENG_PAGE_UNALIGNED; - rrdeng_store_metric_flush_current_page(collection_handle); - - data = rrdeng_alloc_new_metric_data(handle, &data_size, point_in_time_ut); - } - else { - data = pgc_page_data(handle->page); - data_size = pgc_page_data_size(main_cache, handle->page); - } + if(likely(ctx->config.page_type == PAGE_METRICS)) { + storage_number *tier0_metric_data = handle->data; + tier0_metric_data[handle->page_position] = pack_storage_number(n, flags); + } + else if(likely(ctx->config.page_type == PAGE_TIER)) { + storage_number_tier1_t *tier12_metric_data = handle->data; + storage_number_tier1_t number_tier1; + number_tier1.sum_value = (float) n; + number_tier1.min_value = (float) min_value; + number_tier1.max_value = (float) max_value; + number_tier1.anomaly_count = anomaly_count; + number_tier1.count = count; + tier12_metric_data[handle->page_position] = number_tier1; } else - data = rrdeng_alloc_new_metric_data(handle, &data_size, point_in_time_ut); - - switch (ctx->config.page_type) { - case PAGE_METRICS: { - storage_number *tier0_metric_data = data; - tier0_metric_data[handle->page_position] = pack_storage_number(n, flags); - } - break; + fatal("DBENGINE: cannot store metric on unknown page type id %d", ctx->config.page_type); - case PAGE_TIER: { - storage_number_tier1_t *tier12_metric_data = data; - storage_number_tier1_t number_tier1; - number_tier1.sum_value = (float)n; - number_tier1.min_value = (float)min_value; - number_tier1.max_value = (float)max_value; - number_tier1.anomaly_count = anomaly_count; - number_tier1.count = count; - tier12_metric_data[handle->page_position] = number_tier1; - } - break; - - default: { - static bool logged = false; - if(!logged) { - error("DBENGINE: cannot store metric on unknown page type id %d", ctx->config.page_type); - logged = true; - } - } - break; - } + timing_step(TIMING_STEP_DBENGINE_PACK); if(unlikely(!handle->page)){ - rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, data, data_size); + rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, handle->data, handle->data_size); // handle->position is set to 1 already - - if (0 == handle->alignment->page_position) { - /* this is the leading dimension that defines chart alignment */ - perfect_page_alignment = true; - } } else { // update an existing page @@ -566,11 +526,12 @@ static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_ } } - if (perfect_page_alignment) - handle->alignment->page_position = handle->page_position; + timing_step(TIMING_STEP_DBENGINE_PAGE_FIN); // update the metric information mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, (time_t) (point_in_time_ut / USEC_PER_SEC)); + + timing_step(TIMING_STEP_DBENGINE_MRG_UPDATE); } static void store_metric_next_error_log(struct rrdeng_collect_handle *handle, usec_t point_in_time_ut, const char *msg) { @@ -612,6 +573,8 @@ void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, const uint16_t anomaly_count, const SN_FLAGS flags) { + timing_step(TIMING_STEP_RRDSET_STORE_METRIC); + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; #ifdef NETDATA_INTERNAL_CHECKS @@ -619,59 +582,62 @@ void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, handle->page_flags |= RRDENG_PAGE_FUTURE_POINT; #endif - if(likely(handle->page_end_time_ut + handle->update_every_ut == point_in_time_ut)) { + usec_t delta_ut = point_in_time_ut - handle->page_end_time_ut; + + if(likely(delta_ut == handle->update_every_ut)) { // happy path ; } + else if(unlikely(point_in_time_ut > handle->page_end_time_ut)) { + if(handle->page) { + if (unlikely(delta_ut < handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else if (unlikely(delta_ut % handle->update_every_ut)) { + handle->page_flags |= RRDENG_PAGE_STEP_UNALIGNED; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else { + size_t points_gap = delta_ut / handle->update_every_ut; + size_t page_remaining_points = handle->page_entries_max - handle->page_position; + + if (points_gap >= page_remaining_points) { + handle->page_flags |= RRDENG_PAGE_BIG_GAP; + rrdeng_store_metric_flush_current_page(collection_handle); + } + else { + // loop to fill the gap + handle->page_flags |= RRDENG_PAGE_GAP; + + usec_t stop_ut = point_in_time_ut - handle->update_every_ut; + for (usec_t this_ut = handle->page_end_time_ut + handle->update_every_ut; + this_ut <= stop_ut; + this_ut = handle->page_end_time_ut + handle->update_every_ut) { + rrdeng_store_metric_append_point( + collection_handle, + this_ut, + NAN, NAN, NAN, + 1, 0, + SN_EMPTY_SLOT); + } + } + } + } + } else if(unlikely(point_in_time_ut < handle->page_end_time_ut)) { handle->page_flags |= RRDENG_PAGE_PAST_COLLECTION; store_metric_next_error_log(handle, point_in_time_ut, "is older than the"); return; } - else if(unlikely(point_in_time_ut == handle->page_end_time_ut)) { + else /* if(unlikely(point_in_time_ut == handle->page_end_time_ut)) */ { handle->page_flags |= RRDENG_PAGE_REPEATED_COLLECTION; store_metric_next_error_log(handle, point_in_time_ut, "is at the same time as the"); return; } - else if(handle->page) { - usec_t delta_ut = point_in_time_ut - handle->page_end_time_ut; - - if(unlikely(delta_ut < handle->update_every_ut)) { - handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL; - rrdeng_store_metric_flush_current_page(collection_handle); - } - else if(unlikely(delta_ut % handle->update_every_ut)) { - handle->page_flags |= RRDENG_PAGE_STEP_UNALIGNED; - rrdeng_store_metric_flush_current_page(collection_handle); - } - else { - size_t points_gap = delta_ut / handle->update_every_ut; - size_t page_remaining_points = handle->page_entries_max - handle->page_position; - - if(points_gap >= page_remaining_points) { - handle->page_flags |= RRDENG_PAGE_BIG_GAP; - rrdeng_store_metric_flush_current_page(collection_handle); - } - else { - // loop to fill the gap - handle->page_flags |= RRDENG_PAGE_GAP; - - usec_t stop_ut = point_in_time_ut - handle->update_every_ut; - for(usec_t this_ut = handle->page_end_time_ut + handle->update_every_ut; - this_ut <= stop_ut ; - this_ut = handle->page_end_time_ut + handle->update_every_ut) { - rrdeng_store_metric_append_point( - collection_handle, - this_ut, - NAN, NAN, NAN, - 1, 0, - SN_EMPTY_SLOT); - } - } - } - } + timing_step(TIMING_STEP_DBENGINE_FIRST_CHECK); rrdeng_store_metric_append_point(collection_handle, point_in_time_ut, @@ -776,10 +742,10 @@ void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, handle = rrdeng_query_handle_get(); register_query_handle(handle); - if(unlikely(priority < STORAGE_PRIORITY_HIGH)) + if (unlikely(priority < STORAGE_PRIORITY_HIGH)) priority = STORAGE_PRIORITY_HIGH; - else if(unlikely(priority > STORAGE_PRIORITY_BEST_EFFORT)) - priority = STORAGE_PRIORITY_BEST_EFFORT; + else if (unlikely(priority >= STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE)) + priority = STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE - 1; handle->ctx = ctx; handle->metric = metric; @@ -809,6 +775,7 @@ void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, rrddim_handle->start_time_s = handle->start_time_s; rrddim_handle->end_time_s = handle->end_time_s; rrddim_handle->priority = priority; + rrddim_handle->backend = STORAGE_ENGINE_BACKEND_DBENGINE; pg_cache_preload(handle); @@ -824,6 +791,7 @@ void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, rrddim_handle->start_time_s = handle->start_time_s; rrddim_handle->end_time_s = 0; rrddim_handle->priority = priority; + rrddim_handle->backend = STORAGE_ENGINE_BACKEND_DBENGINE; } } @@ -906,6 +874,7 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim // We need to get a new page if (!rrdeng_load_page_next(rrddim_handle, false)) { + handle->now_s = rrddim_handle->end_time_s; storage_point_empty(sp, handle->now_s - handle->dt_s, handle->now_s); goto prepare_for_next_iteration; } diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index feb79b97..514954af 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -12,11 +12,8 @@ #define RRDENG_FD_BUDGET_PER_INSTANCE (50) -extern int db_engine_use_malloc; -extern int default_rrdeng_page_fetch_timeout; -extern int default_rrdeng_page_fetch_retries; extern int default_rrdeng_page_cache_mb; -extern int db_engine_journal_indexing; +extern int default_rrdeng_extent_cache_mb; extern int db_engine_journal_check; extern int default_rrdeng_disk_quota_mb; extern int default_multidb_disk_quota_mb; @@ -27,9 +24,6 @@ extern size_t tier_page_size[]; #define CTX_POINT_SIZE_BYTES(ctx) page_type_size[(ctx)->config.page_type] void rrdeng_generate_legacy_uuid(const char *dim_id, const char *chart_id, uuid_t *ret_uuid); -void rrdeng_convert_legacy_uuid_to_multihost(char machine_guid[GUID_LEN + 1], uuid_t *legacy_uuid, - uuid_t *ret_uuid); - STORAGE_METRIC_HANDLE *rrdeng_metric_get_or_create(RRDDIM *rd, STORAGE_INSTANCE *db_instance); STORAGE_METRIC_HANDLE *rrdeng_metric_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c index 7ec626c5..984a591e 100644 --- a/database/engine/rrdenginelib.c +++ b/database/engine/rrdenginelib.c @@ -1,72 +1,6 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -#define BUFSIZE (512) - -/* Caller must hold descriptor lock */ -//void print_page_cache_descr(struct rrdeng_page_descr *descr, const char *msg, bool log_debug) -//{ -// if(log_debug && !(debug_flags & D_RRDENGINE)) -// return; -// -// BUFFER *wb = buffer_create(512); -// -// if(!descr) { -// buffer_sprintf(wb, "DBENGINE: %s : descr is NULL", msg); -// } -// else { -// struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; -// char uuid_str[UUID_STR_LEN]; -// -// uuid_unparse_lower(*descr->id, uuid_str); -// buffer_sprintf(wb, "DBENGINE: %s : page(%p) metric:%s, len:%"PRIu32", time:%"PRIu64"->%"PRIu64", update_every:%u, type:%u, xt_offset:", -// msg, -// pg_cache_descr->page, uuid_str, -// descr->page_length, -// (uint64_t)descr->start_time_ut, -// (uint64_t)descr->end_time_ut, -// (uint32_t)descr->update_every_s, -// (uint32_t)descr->type -// ); -// if (!descr->extent) { -// buffer_strcat(wb, "N/A"); -// } else { -// buffer_sprintf(wb, "%"PRIu64, descr->extent->offset); -// } -// -// buffer_sprintf(wb, ", flags:0x%2.2lX refcnt:%u", pg_cache_descr->flags, pg_cache_descr->refcnt); -// } -// -// if(log_debug) -// debug(D_RRDENGINE, "%s", buffer_tostring(wb)); -// else -// internal_error(true, "%s", buffer_tostring(wb)); -// -// buffer_free(wb); -//} -// -//void print_page_descr(struct rrdeng_page_descr *descr) -//{ -// char uuid_str[UUID_STR_LEN]; -// char str[BUFSIZE + 1]; -// int pos = 0; -// -// uuid_unparse_lower(*descr->id, uuid_str); -// pos += snprintfz(str, BUFSIZE - pos, "id=%s\n" -// "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", -// uuid_str, -// descr->page_length, -// (uint64_t)descr->start_time_ut, -// (uint64_t)descr->end_time_ut); -// if (!descr->extent) { -// pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); -// } else { -// pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); -// } -// snprintfz(str + pos, BUFSIZE - pos, "\n\n"); -// fputs(str, stderr); -//} - int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size) { int ret; diff --git a/database/ram/README.md b/database/ram/README.md index 73562f0f..56cb7275 100644 --- a/database/ram/README.md +++ b/database/ram/README.md @@ -2,6 +2,10 @@ title: "RAM database modes" description: "Netdata's RAM database modes." custom_edit_url: https://github.com/netdata/netdata/edit/master/database/ram/README.md +sidebar_label: "RAM database modes" +learn_status: "Published" +learn_topic_type: "References" +learn_rel_path: "Developers/Database" --> -# RAM modes
\ No newline at end of file +# RAM database modes diff --git a/database/ram/rrddim_mem.c b/database/ram/rrddim_mem.c index 0f17d6cb..a417c5ae 100644 --- a/database/ram/rrddim_mem.c +++ b/database/ram/rrddim_mem.c @@ -143,6 +143,7 @@ STORAGE_COLLECT_HANDLE *rrddim_collect_init(STORAGE_METRIC_HANDLE *db_metric_han internal_fatal((uint32_t)mh->update_every_s != update_every, "RRDDIM: update requested does not match the dimension"); struct mem_collect_handle *ch = callocz(1, sizeof(struct mem_collect_handle)); + ch->common.backend = STORAGE_ENGINE_BACKEND_RRDDIM; ch->rd = rd; ch->db_metric_handle = db_metric_handle; @@ -204,7 +205,7 @@ static inline void rrddim_fill_the_gap(STORAGE_COLLECT_HANDLE *collection_handle void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, - NETDATA_DOUBLE number, + NETDATA_DOUBLE n, NETDATA_DOUBLE min_value __maybe_unused, NETDATA_DOUBLE max_value __maybe_unused, uint16_t count __maybe_unused, @@ -226,7 +227,7 @@ void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, if(unlikely(mh->last_updated_s && point_in_time_s - mh->update_every_s > mh->last_updated_s)) rrddim_fill_the_gap(collection_handle, point_in_time_s); - rd->db[mh->current_entry] = pack_storage_number(number, flags); + rd->db[mh->current_entry] = pack_storage_number(n, flags); mh->counter++; mh->current_entry = (mh->current_entry + 1) >= mh->entries ? 0 : mh->current_entry + 1; mh->last_updated_s = point_in_time_s; @@ -340,6 +341,7 @@ void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_e handle->start_time_s = start_time_s; handle->end_time_s = end_time_s; handle->priority = priority; + handle->backend = STORAGE_ENGINE_BACKEND_RRDDIM; struct mem_query_handle* h = mallocz(sizeof(struct mem_query_handle)); h->db_metric_handle = db_metric_handle; diff --git a/database/ram/rrddim_mem.h b/database/ram/rrddim_mem.h index 373a2bd7..a75814a0 100644 --- a/database/ram/rrddim_mem.h +++ b/database/ram/rrddim_mem.h @@ -6,6 +6,8 @@ #include "database/rrd.h" struct mem_collect_handle { + struct storage_collect_handle common; // has to be first item + STORAGE_METRIC_HANDLE *db_metric_handle; RRDDIM *rd; }; @@ -32,7 +34,7 @@ void rrddim_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS STORAGE_COLLECT_HANDLE *rrddim_collect_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg); void rrddim_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every); -void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, NETDATA_DOUBLE number, +void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, NETDATA_DOUBLE n, NETDATA_DOUBLE min_value, NETDATA_DOUBLE max_value, uint16_t count, diff --git a/database/rrd.h b/database/rrd.h index 42eeb165..0f67a3b7 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -30,9 +30,9 @@ typedef struct rrdhost_acquired RRDHOST_ACQUIRED; typedef struct rrdset_acquired RRDSET_ACQUIRED; typedef struct rrddim_acquired RRDDIM_ACQUIRED; -typedef struct ml_host ml_host_t; -typedef struct ml_chart ml_chart_t; -typedef struct ml_dimension ml_dimension_t; +typedef struct ml_host rrd_ml_host_t; +typedef struct ml_chart rrd_ml_chart_t; +typedef struct ml_dimension rrd_ml_dimension_t; typedef enum __attribute__ ((__packed__)) { QUERY_SOURCE_UNKNOWN = 0, @@ -54,6 +54,9 @@ typedef enum __attribute__ ((__packed__)) storage_priority { STORAGE_PRIORITY_LOW, STORAGE_PRIORITY_BEST_EFFORT, + // synchronous query, not to be dispatched to workers or queued + STORAGE_PRIORITY_SYNCHRONOUS, + STORAGE_PRIORITY_INTERNAL_MAX_DONT_USE, } STORAGE_PRIORITY; @@ -106,30 +109,39 @@ RRD_MEMORY_MODE rrd_memory_mode_id(const char *name); typedef struct storage_query_handle STORAGE_QUERY_HANDLE; +typedef enum __attribute__ ((__packed__)) { + STORAGE_ENGINE_BACKEND_RRDDIM = 1, + STORAGE_ENGINE_BACKEND_DBENGINE = 2, +} STORAGE_ENGINE_BACKEND; + +#define is_valid_backend(backend) ((backend) >= STORAGE_ENGINE_BACKEND_RRDDIM && (backend) <= STORAGE_ENGINE_BACKEND_DBENGINE) + // iterator state for RRD dimension data queries struct storage_engine_query_handle { time_t start_time_s; time_t end_time_s; STORAGE_PRIORITY priority; - STORAGE_QUERY_HANDLE* handle; + STORAGE_ENGINE_BACKEND backend; + STORAGE_QUERY_HANDLE *handle; }; -typedef struct storage_point { - NETDATA_DOUBLE min; // when count > 1, this is the minimum among them - NETDATA_DOUBLE max; // when count > 1, this is the maximum among them - NETDATA_DOUBLE sum; // the point sum - divided by count gives the average +// ---------------------------------------------------------------------------- +// chart types - // end_time - start_time = point duration - time_t start_time_s; // the time the point starts - time_t end_time_s; // the time the point ends +typedef enum __attribute__ ((__packed__)) rrdset_type { + RRDSET_TYPE_LINE = 0, + RRDSET_TYPE_AREA = 1, + RRDSET_TYPE_STACKED = 2, +} RRDSET_TYPE; - size_t count; // the number of original points aggregated - size_t anomaly_count; // the number of original points found anomalous +#define RRDSET_TYPE_LINE_NAME "line" +#define RRDSET_TYPE_AREA_NAME "area" +#define RRDSET_TYPE_STACKED_NAME "stacked" - SN_FLAGS flags; // flags stored with the point -} STORAGE_POINT; +RRDSET_TYPE rrdset_type_id(const char *name); +const char *rrdset_type_name(RRDSET_TYPE chart_type); -#include "rrdcontext.h" +#include "contexts/rrdcontext.h" extern bool unittest_running; extern bool dbengine_enabled; @@ -158,39 +170,23 @@ extern time_t rrdset_free_obsolete_time_s; #if defined(ENV32BIT) #define MIN_LIBUV_WORKER_THREADS 8 -#define MAX_LIBUV_WORKER_THREADS 64 +#define MAX_LIBUV_WORKER_THREADS 128 #define RESERVED_LIBUV_WORKER_THREADS 3 #else #define MIN_LIBUV_WORKER_THREADS 16 -#define MAX_LIBUV_WORKER_THREADS 128 +#define MAX_LIBUV_WORKER_THREADS 1024 #define RESERVED_LIBUV_WORKER_THREADS 6 #endif extern int libuv_worker_threads; +extern bool ieee754_doubles; -#define RRD_ID_LENGTH_MAX 200 +#define RRD_ID_LENGTH_MAX 1000 typedef long long total_number; #define TOTAL_NUMBER_FORMAT "%lld" // ---------------------------------------------------------------------------- -// chart types - -typedef enum __attribute__ ((__packed__)) rrdset_type { - RRDSET_TYPE_LINE = 0, - RRDSET_TYPE_AREA = 1, - RRDSET_TYPE_STACKED = 2, -} RRDSET_TYPE; - -#define RRDSET_TYPE_LINE_NAME "line" -#define RRDSET_TYPE_AREA_NAME "area" -#define RRDSET_TYPE_STACKED_NAME "stacked" - -RRDSET_TYPE rrdset_type_id(const char *name); -const char *rrdset_type_name(RRDSET_TYPE chart_type); - - -// ---------------------------------------------------------------------------- // algorithms types typedef enum __attribute__ ((__packed__)) rrd_algorithm { @@ -279,7 +275,11 @@ void rrdlabels_destroy(DICTIONARY *labels_dict); void rrdlabels_add(DICTIONARY *dict, const char *name, const char *value, RRDLABEL_SRC ls); void rrdlabels_add_pair(DICTIONARY *dict, const char *string, RRDLABEL_SRC ls); void rrdlabels_get_value_to_buffer_or_null(DICTIONARY *labels, BUFFER *wb, const char *key, const char *quote, const char *null); -void rrdlabels_get_value_to_char_or_null(DICTIONARY *labels, char **value, const char *key); +void rrdlabels_value_to_buffer_array_item_or_null(DICTIONARY *labels, BUFFER *wb, const char *key); +void rrdlabels_get_value_strdup_or_null(DICTIONARY *labels, char **value, const char *key); +void rrdlabels_get_value_strcpyz(DICTIONARY *labels, char *dst, size_t dst_len, const char *key); +STRING *rrdlabels_get_value_string_dup(DICTIONARY *labels, const char *key); +STRING *rrdlabels_get_value_to_buffer_or_unset(DICTIONARY *labels, BUFFER *wb, const char *key, const char *unset); void rrdlabels_flush(DICTIONARY *labels_dict); void rrdlabels_unmark_all(DICTIONARY *labels); @@ -290,8 +290,10 @@ int rrdlabels_sorted_walkthrough_read(DICTIONARY *labels, int (*callback)(const void rrdlabels_log_to_buffer(DICTIONARY *labels, BUFFER *wb); bool rrdlabels_match_simple_pattern(DICTIONARY *labels, const char *simple_pattern_txt); -bool rrdlabels_match_simple_pattern_parsed(DICTIONARY *labels, SIMPLE_PATTERN *pattern, char equal); + +bool rrdlabels_match_simple_pattern_parsed(DICTIONARY *labels, SIMPLE_PATTERN *pattern, char equal, size_t *searches); int rrdlabels_to_buffer(DICTIONARY *labels, BUFFER *wb, const char *before_each, const char *equal, const char *quote, const char *between_them, bool (*filter_callback)(const char *name, const char *value, RRDLABEL_SRC ls, void *data), void *filter_data, void (*name_sanitizer)(char *dst, const char *src, size_t dst_size), void (*value_sanitizer)(char *dst, const char *src, size_t dst_size)); +void rrdlabels_to_buffer_json_members(DICTIONARY *labels, BUFFER *wb); void rrdlabels_migrate_to_these(DICTIONARY *dst, DICTIONARY *src); void rrdlabels_copy(DICTIONARY *dst, DICTIONARY *src); @@ -307,19 +309,20 @@ bool exporting_labels_filter_callback(const char *name, const char *value, RRDLA // ---------------------------------------------------------------------------- // engine-specific iterator state for dimension data collection -typedef struct storage_collect_handle STORAGE_COLLECT_HANDLE; +typedef struct storage_collect_handle { + STORAGE_ENGINE_BACKEND backend; +} STORAGE_COLLECT_HANDLE; // ---------------------------------------------------------------------------- // Storage tier data for every dimension struct rrddim_tier { STORAGE_POINT virtual_point; - size_t tier_grouping; + STORAGE_ENGINE_BACKEND backend; + uint32_t tier_grouping; time_t next_point_end_time_s; STORAGE_METRIC_HANDLE *db_metric_handle; // the metric handle inside the database STORAGE_COLLECT_HANDLE *db_collection_handle; // the data collection handle - struct storage_engine_collect_ops *collect_ops; - struct storage_engine_query_ops *query_ops; }; void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s); @@ -354,7 +357,7 @@ struct rrddim { // ------------------------------------------------------------------------ // operational state members - ml_dimension_t *ml_dimension; // machine learning data about this dimension + rrd_ml_dimension_t *ml_dimension; // machine learning data about this dimension // ------------------------------------------------------------------------ // linking to siblings and parents @@ -417,82 +420,215 @@ size_t rrddim_memory_file_header_size(void); void rrddim_memory_file_save(RRDDIM *rd); -// ---------------------------------------------------------------------------- +// ------------------------------------------------------------------------ +// DATA COLLECTION STORAGE OPS -#define storage_point_unset(x) do { \ - (x).min = (x).max = (x).sum = NAN; \ - (x).count = 0; \ - (x).anomaly_count = 0; \ - (x).flags = SN_FLAG_NONE; \ - (x).start_time_s = 0; \ - (x).end_time_s = 0; \ - } while(0) - -#define storage_point_empty(x, start_s, end_s) do { \ - (x).min = (x).max = (x).sum = NAN; \ - (x).count = 1; \ - (x).anomaly_count = 0; \ - (x).flags = SN_FLAG_NONE; \ - (x).start_time_s = start_s; \ - (x).end_time_s = end_s; \ - } while(0) - -#define STORAGE_POINT_UNSET { .min = NAN, .max = NAN, .sum = NAN, .count = 0, .anomaly_count = 0, .flags = SN_FLAG_NONE, .start_time_s = 0, .end_time_s = 0 } - -#define storage_point_is_unset(x) (!(x).count) -#define storage_point_is_gap(x) (!netdata_double_isnumber((x).sum)) +STORAGE_METRICS_GROUP *rrdeng_metrics_group_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); +STORAGE_METRICS_GROUP *rrddim_metrics_group_get(STORAGE_INSTANCE *db_instance, uuid_t *uuid); +static inline STORAGE_METRICS_GROUP *storage_engine_metrics_group_get(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_INSTANCE *db_instance, uuid_t *uuid) { + internal_fatal(!is_valid_backend(backend), "STORAGE: invalid backend"); -// ------------------------------------------------------------------------ -// function pointers that handle data collection -struct storage_engine_collect_ops { - // an initialization function to run before starting collection - STORAGE_COLLECT_HANDLE *(*init)(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg); +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_metrics_group_get(db_instance, uuid); +#endif + return rrddim_metrics_group_get(db_instance, uuid); +} + +void rrdeng_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg); +void rrddim_metrics_group_release(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg); +static inline void storage_engine_metrics_group_release(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *smg) { + internal_fatal(!is_valid_backend(backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + rrdeng_metrics_group_release(db_instance, smg); + else +#endif + rrddim_metrics_group_release(db_instance, smg); +} - // run this to store each metric into the database - void (*store_metric)(STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time, NETDATA_DOUBLE number, NETDATA_DOUBLE min_value, - NETDATA_DOUBLE max_value, uint16_t count, uint16_t anomaly_count, SN_FLAGS flags); +STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg); +STORAGE_COLLECT_HANDLE *rrddim_collect_init(STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg); +static inline STORAGE_COLLECT_HANDLE *storage_metric_store_init(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_METRIC_HANDLE *db_metric_handle, uint32_t update_every, STORAGE_METRICS_GROUP *smg) { + internal_fatal(!is_valid_backend(backend), "STORAGE: invalid backend"); - // run this to flush / reset the current data collection sequence - void (*flush)(STORAGE_COLLECT_HANDLE *collection_handle); +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_store_metric_init(db_metric_handle, update_every, smg); +#endif + return rrddim_collect_init(db_metric_handle, update_every, smg); +} - // a finalization function to run after collection is over - // returns 1 if it's safe to delete the dimension - int (*finalize)(STORAGE_COLLECT_HANDLE *collection_handle); +void rrdeng_store_metric_next( + STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, + NETDATA_DOUBLE n, NETDATA_DOUBLE min_value, NETDATA_DOUBLE max_value, + uint16_t count, uint16_t anomaly_count, SN_FLAGS flags); - void (*change_collection_frequency)(STORAGE_COLLECT_HANDLE *collection_handle, int update_every); +void rrddim_collect_store_metric( + STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, + NETDATA_DOUBLE n, NETDATA_DOUBLE min_value, NETDATA_DOUBLE max_value, + uint16_t count, uint16_t anomaly_count, SN_FLAGS flags); + +static inline void storage_engine_store_metric( + STORAGE_COLLECT_HANDLE *collection_handle, usec_t point_in_time_ut, + NETDATA_DOUBLE n, NETDATA_DOUBLE min_value, NETDATA_DOUBLE max_value, + uint16_t count, uint16_t anomaly_count, SN_FLAGS flags) { + internal_fatal(!is_valid_backend(collection_handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(collection_handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_store_metric_next(collection_handle, point_in_time_ut, + n, min_value, max_value, + count, anomaly_count, flags); +#endif + return rrddim_collect_store_metric(collection_handle, point_in_time_ut, + n, min_value, max_value, + count, anomaly_count, flags); +} + +void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle); +void rrddim_store_metric_flush(STORAGE_COLLECT_HANDLE *collection_handle); +static inline void storage_engine_store_flush(STORAGE_COLLECT_HANDLE *collection_handle) { + if(unlikely(!collection_handle)) + return; + + internal_fatal(!is_valid_backend(collection_handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(collection_handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + rrdeng_store_metric_flush_current_page(collection_handle); + else +#endif + rrddim_store_metric_flush(collection_handle); +} + +int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle); +int rrddim_collect_finalize(STORAGE_COLLECT_HANDLE *collection_handle); +// a finalization function to run after collection is over +// returns 1 if it's safe to delete the dimension +static inline int storage_engine_store_finalize(STORAGE_COLLECT_HANDLE *collection_handle) { + internal_fatal(!is_valid_backend(collection_handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(collection_handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_store_metric_finalize(collection_handle); +#endif + + return rrddim_collect_finalize(collection_handle); +} + +void rrdeng_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every); +void rrddim_store_metric_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every); +static inline void storage_engine_store_change_collection_frequency(STORAGE_COLLECT_HANDLE *collection_handle, int update_every) { + internal_fatal(!is_valid_backend(collection_handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(collection_handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + rrdeng_store_metric_change_collection_frequency(collection_handle, update_every); + else +#endif + rrddim_store_metric_change_collection_frequency(collection_handle, update_every); +} - STORAGE_METRICS_GROUP *(*metrics_group_get)(STORAGE_INSTANCE *db_instance, uuid_t *uuid); - void (*metrics_group_release)(STORAGE_INSTANCE *db_instance, STORAGE_METRICS_GROUP *sa); -}; // ---------------------------------------------------------------------------- +// STORAGE ENGINE QUERY OPS + +time_t rrdeng_metric_oldest_time(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrddim_query_oldest_time_s(STORAGE_METRIC_HANDLE *db_metric_handle); +static inline time_t storage_engine_oldest_time_s(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_METRIC_HANDLE *db_metric_handle) { + internal_fatal(!is_valid_backend(backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_metric_oldest_time(db_metric_handle); +#endif + return rrddim_query_oldest_time_s(db_metric_handle); +} -// function pointers that handle database queries -struct storage_engine_query_ops { - // run this before starting a series of next_metric() database queries - void (*init)(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); +time_t rrdeng_metric_latest_time(STORAGE_METRIC_HANDLE *db_metric_handle); +time_t rrddim_query_latest_time_s(STORAGE_METRIC_HANDLE *db_metric_handle); +static inline time_t storage_engine_latest_time_s(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_METRIC_HANDLE *db_metric_handle) { + internal_fatal(!is_valid_backend(backend), "STORAGE: invalid backend"); - // run this to load each metric number from the database - STORAGE_POINT (*next_metric)(struct storage_engine_query_handle *handle); +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_metric_latest_time(db_metric_handle); +#endif + return rrddim_query_latest_time_s(db_metric_handle); +} - // run this to test if the series of next_metric() database queries is finished - int (*is_finished)(struct storage_engine_query_handle *handle); +void rrdeng_load_metric_init( + STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrddim_handle, + time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); - // run this after finishing a series of load_metric() database queries - void (*finalize)(struct storage_engine_query_handle *handle); +void rrddim_query_init( + STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, + time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority); - // get the timestamp of the last entry of this metric - time_t (*latest_time_s)(STORAGE_METRIC_HANDLE *db_metric_handle); +static inline void storage_engine_query_init( + STORAGE_ENGINE_BACKEND backend __maybe_unused, + STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *handle, + time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority) { + internal_fatal(!is_valid_backend(backend), "STORAGE: invalid backend"); - // get the timestamp of the first entry of this metric - time_t (*oldest_time_s)(STORAGE_METRIC_HANDLE *db_metric_handle); +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + rrdeng_load_metric_init(db_metric_handle, handle, start_time_s, end_time_s, priority); + else +#endif + rrddim_query_init(db_metric_handle, handle, start_time_s, end_time_s, priority); +} - // adapt 'before' timestamp to the optimal for the query - // can only move 'before' ahead (to the future) - time_t (*align_to_optimal_before)(struct storage_engine_query_handle *handle); -}; +STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim_handle); +STORAGE_POINT rrddim_query_next_metric(struct storage_engine_query_handle *handle); +static inline STORAGE_POINT storage_engine_query_next_metric(struct storage_engine_query_handle *handle) { + internal_fatal(!is_valid_backend(handle->backend), "STORAGE: invalid backend"); -typedef struct storage_engine STORAGE_ENGINE; +#ifdef ENABLE_DBENGINE + if(likely(handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_load_metric_next(handle); +#endif + return rrddim_query_next_metric(handle); +} + +int rrdeng_load_metric_is_finished(struct storage_engine_query_handle *rrddim_handle); +int rrddim_query_is_finished(struct storage_engine_query_handle *handle); +static inline int storage_engine_query_is_finished(struct storage_engine_query_handle *handle) { + internal_fatal(!is_valid_backend(handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_load_metric_is_finished(handle); +#endif + return rrddim_query_is_finished(handle); +} + +void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_handle); +void rrddim_query_finalize(struct storage_engine_query_handle *handle); +static inline void storage_engine_query_finalize(struct storage_engine_query_handle *handle) { + internal_fatal(!is_valid_backend(handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + rrdeng_load_metric_finalize(handle); + else +#endif + rrddim_query_finalize(handle); +} + +time_t rrdeng_load_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle); +time_t rrddim_query_align_to_optimal_before(struct storage_engine_query_handle *rrddim_handle); +static inline time_t storage_engine_align_to_optimal_before(struct storage_engine_query_handle *handle) { + internal_fatal(!is_valid_backend(handle->backend), "STORAGE: invalid backend"); + +#ifdef ENABLE_DBENGINE + if(likely(handle->backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_load_align_to_optimal_before(handle); +#endif + return rrddim_query_align_to_optimal_before(handle); +} // ------------------------------------------------------------------------ // function pointers for all APIs provided by a storage engine @@ -503,17 +639,14 @@ typedef struct storage_engine_api { void (*metric_release)(STORAGE_METRIC_HANDLE *); STORAGE_METRIC_HANDLE *(*metric_dup)(STORAGE_METRIC_HANDLE *); bool (*metric_retention_by_uuid)(STORAGE_INSTANCE *db_instance, uuid_t *uuid, time_t *first_entry_s, time_t *last_entry_s); - - // operations - struct storage_engine_collect_ops collect_ops; - struct storage_engine_query_ops query_ops; } STORAGE_ENGINE_API; -struct storage_engine { +typedef struct storage_engine { + STORAGE_ENGINE_BACKEND backend; RRD_MEMORY_MODE id; const char* name; STORAGE_ENGINE_API api; -}; +} STORAGE_ENGINE; STORAGE_ENGINE* storage_engine_get(RRD_MEMORY_MODE mmode); STORAGE_ENGINE* storage_engine_find(const char* name); @@ -617,7 +750,7 @@ struct rrdset { DICTIONARY *rrddimvar_root_index; // dimension variables // we use this dictionary to manage their allocation - ml_chart_t *ml_chart; + rrd_ml_chart_t *ml_chart; // ------------------------------------------------------------------------ // operational state members @@ -702,6 +835,13 @@ struct rrdset { RRDCALC *base; // double linked list of RRDCALC related to this RRDSET } alerts; + struct { + size_t pos; + size_t size; + size_t used; + RRDDIM_ACQUIRED **rda; + } pluginsd; + #ifdef NETDATA_LOG_REPLICATION_REQUESTS struct { bool log_next_data_collection; @@ -757,35 +897,41 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st_on_file, RRD_MEMORY_MODE m // and may lead to missing information. typedef enum __attribute__ ((__packed__)) rrdhost_flags { + + // Careful not to overlap with rrdhost_options to avoid bugs if + // rrdhost_flags_xxx is used instead of rrdhost_option_xxx or vice-versa // Orphan, Archived and Obsolete flags - RRDHOST_FLAG_ORPHAN = (1 << 10), // this host is orphan (not receiving data) - RRDHOST_FLAG_ARCHIVED = (1 << 11), // The host is archived, no collected charts yet - RRDHOST_FLAG_PENDING_OBSOLETE_CHARTS = (1 << 12), // the host has pending chart obsoletions - RRDHOST_FLAG_PENDING_OBSOLETE_DIMENSIONS = (1 << 13), // the host has pending dimension obsoletions + RRDHOST_FLAG_ORPHAN = (1 << 8), // this host is orphan (not receiving data) + RRDHOST_FLAG_ARCHIVED = (1 << 9), // The host is archived, no collected charts yet + RRDHOST_FLAG_PENDING_OBSOLETE_CHARTS = (1 << 10), // the host has pending chart obsoletions + RRDHOST_FLAG_PENDING_OBSOLETE_DIMENSIONS = (1 << 11), // the host has pending dimension obsoletions // Streaming sender - RRDHOST_FLAG_RRDPUSH_SENDER_INITIALIZED = (1 << 14), // the host has initialized rrdpush structures - RRDHOST_FLAG_RRDPUSH_SENDER_SPAWN = (1 << 15), // When set, the sender thread is running - RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED = (1 << 16), // When set, the host is connected to a parent - RRDHOST_FLAG_RRDPUSH_SENDER_READY_4_METRICS = (1 << 17), // when set, rrdset_done() should push metrics to parent - RRDHOST_FLAG_RRDPUSH_SENDER_LOGGED_STATUS = (1 << 18), // when set, we have logged the status of metrics streaming + RRDHOST_FLAG_RRDPUSH_SENDER_INITIALIZED = (1 << 12), // the host has initialized rrdpush structures + RRDHOST_FLAG_RRDPUSH_SENDER_SPAWN = (1 << 13), // When set, the sender thread is running + RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED = (1 << 14), // When set, the host is connected to a parent + RRDHOST_FLAG_RRDPUSH_SENDER_READY_4_METRICS = (1 << 15), // when set, rrdset_done() should push metrics to parent + RRDHOST_FLAG_RRDPUSH_SENDER_LOGGED_STATUS = (1 << 16), // when set, we have logged the status of metrics streaming // Health - RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 20), // contains charts and dims with uninitialized variables - RRDHOST_FLAG_INITIALIZED_HEALTH = (1 << 21), // the host has initialized health structures + RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 17), // contains charts and dims with uninitialized variables + RRDHOST_FLAG_INITIALIZED_HEALTH = (1 << 18), // the host has initialized health structures // Exporting - RRDHOST_FLAG_EXPORTING_SEND = (1 << 22), // send it to external databases - RRDHOST_FLAG_EXPORTING_DONT_SEND = (1 << 23), // don't send it to external databases + RRDHOST_FLAG_EXPORTING_SEND = (1 << 19), // send it to external databases + RRDHOST_FLAG_EXPORTING_DONT_SEND = (1 << 20), // don't send it to external databases // ACLK - RRDHOST_FLAG_ACLK_STREAM_CONTEXTS = (1 << 24), // when set, we should send ACLK stream context updates + RRDHOST_FLAG_ACLK_STREAM_CONTEXTS = (1 << 21), // when set, we should send ACLK stream context updates + RRDHOST_FLAG_ACLK_STREAM_ALERTS = (1 << 22), // set when the receiver part is disconnected // Metadata - RRDHOST_FLAG_METADATA_UPDATE = (1 << 25), // metadata needs to be stored in the database - RRDHOST_FLAG_METADATA_LABELS = (1 << 26), // metadata needs to be stored in the database - RRDHOST_FLAG_METADATA_INFO = (1 << 27), // metadata needs to be stored in the database - RRDHOST_FLAG_METADATA_CLAIMID = (1 << 28), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_UPDATE = (1 << 23), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_LABELS = (1 << 24), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_INFO = (1 << 25), // metadata needs to be stored in the database + RRDHOST_FLAG_PENDING_CONTEXT_LOAD = (1 << 26), // metadata needs to be stored in the database + RRDHOST_FLAG_CONTEXT_LOAD_IN_PROGRESS = (1 << 27), // metadata needs to be stored in the database + RRDHOST_FLAG_METADATA_CLAIMID = (1 << 28), // metadata needs to be stored in the database RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED = (1 << 29), // set when the receiver part is disconnected } RRDHOST_FLAGS; @@ -954,6 +1100,8 @@ struct rrdhost_system_info { int mc_version; }; +struct rrdhost_system_info *rrdhost_labels_to_system_info(DICTIONARY *labels); + struct rrdhost { char machine_guid[GUID_LEN + 1]; // the unique ID of this host @@ -982,13 +1130,12 @@ struct rrdhost { // the actual per tier is at .db[tier].mode char *cache_dir; // the directory to save RRD cache files - char *varlib_dir; // the directory to save health log struct { RRD_MEMORY_MODE mode; // the db mode for this tier STORAGE_ENGINE *eng; // the storage engine API for this tier STORAGE_INSTANCE *instance; // the db instance for this tier - size_t tier_grouping; // tier 0 iterations aggregated on this tier + uint32_t tier_grouping; // tier 0 iterations aggregated on this tier } db[RRD_STORAGE_TIERS]; struct rrdhost_system_info *system_info; // information collected from the host environment @@ -1011,7 +1158,7 @@ struct rrdhost { struct sender_state *sender; netdata_thread_t rrdpush_sender_thread; // the sender thread size_t rrdpush_sender_replicating_charts; // the number of charts currently being replicated to a parent - void *dbsync_worker; + void *aclk_sync_host_config; // ------------------------------------------------------------------------ // streaming of data from remote hosts - rrdpush receiver @@ -1042,6 +1189,7 @@ struct rrdhost { uint32_t health_last_processed_id; // the last processed health id from the log uint32_t health_max_unique_id; // the max alarm log unique id given for the host uint32_t health_max_alarm_id; // the max alarm id given for the host + size_t health_transitions; // the number of times an alert changed state // ------------------------------------------------------------------------ // locks @@ -1050,7 +1198,7 @@ struct rrdhost { // ------------------------------------------------------------------------ // ML handle - ml_host_t *ml_host; + rrd_ml_host_t *ml_host; // ------------------------------------------------------------------------ // Support for host-level labels @@ -1070,9 +1218,11 @@ struct rrdhost { DICTIONARY *rrdvars; // the host's chart variables index // this includes custom host variables - RRDCONTEXTS *rrdctx_hub_queue; - RRDCONTEXTS *rrdctx_post_processing_queue; - RRDCONTEXTS *rrdctx; + struct { + DICTIONARY *contexts; + DICTIONARY *hub_queue; + DICTIONARY *pp_queue; + } rrdctx; uuid_t host_uuid; // Global GUID for this host uuid_t *node_id; // Cloud node_id @@ -1110,6 +1260,10 @@ extern RRDHOST *localhost; extern DICTIONARY *rrdhost_root_index; size_t rrdhost_hosts_available(void); +RRDHOST_ACQUIRED *rrdhost_find_and_acquire(const char *machine_guid); +RRDHOST *rrdhost_acquired_to_rrdhost(RRDHOST_ACQUIRED *rha); +void rrdhost_acquired_release(RRDHOST_ACQUIRED *rha); + // ---------------------------------------------------------------------------- #define rrdhost_foreach_read(var) \ @@ -1145,6 +1299,7 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt RRDHOST *rrdhost_find_by_hostname(const char *hostname); RRDHOST *rrdhost_find_by_guid(const char *guid); +RRDHOST *find_host_by_node_id(char *node_id); RRDHOST *rrdhost_find_or_create( const char *hostname @@ -1217,6 +1372,11 @@ void rrdset_update_heterogeneous_flag(RRDSET *st); time_t rrdset_set_update_every_s(RRDSET *st, time_t update_every_s); RRDSET *rrdset_find(RRDHOST *host, const char *id); + +RRDSET_ACQUIRED *rrdset_find_and_acquire(RRDHOST *host, const char *id); +RRDSET *rrdset_acquired_to_rrdset(RRDSET_ACQUIRED *rsa); +void rrdset_acquired_release(RRDSET_ACQUIRED *rsa); + #define rrdset_find_localhost(id) rrdset_find(localhost, id) /* This will not return charts that are archived */ static inline RRDSET *rrdset_find_active_localhost(const char *id) @@ -1339,13 +1499,13 @@ void rrdset_delete_files(RRDSET *st); void rrdset_save(RRDSET *st); void rrdset_free(RRDSET *st); +void rrddim_free(RRDSET *st, RRDDIM *rd); + #ifdef NETDATA_RRD_INTERNALS char *rrdhost_cache_dir_for_rrdset_alloc(RRDHOST *host, const char *id); const char *rrdset_cache_dir(RRDSET *st); -void rrddim_free(RRDSET *st, RRDDIM *rd); - void rrdset_reset(RRDSET *st); void rrdset_delete_obsolete_dimensions(RRDSET *st); diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 76263582..3ee8719c 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -95,12 +95,12 @@ static STRING *rrdcalc_replace_variables_with_rrdset_labels(const char *line, RR temp = buf; } else if (!strncmp(var, RRDCALC_VAR_LABEL, RRDCALC_VAR_LABEL_LEN)) { - char label_val[RRDCALC_VAR_MAX + 1] = { 0 }; + char label_val[RRDCALC_VAR_MAX + RRDCALC_VAR_LABEL_LEN + 1] = { 0 }; strcpy(label_val, var+RRDCALC_VAR_LABEL_LEN); label_val[i - RRDCALC_VAR_LABEL_LEN - 1] = '\0'; if(likely(rc->rrdset && rc->rrdset->rrdlabels)) { - rrdlabels_get_value_to_char_or_null(rc->rrdset->rrdlabels, &lbl_value, label_val); + rrdlabels_get_value_strdup_or_null(rc->rrdset->rrdlabels, &lbl_value, label_val); if (lbl_value) { char *buf = find_and_replace(temp, var, lbl_value, m); freez(temp); @@ -245,6 +245,8 @@ static void rrdcalc_link_to_rrdset(RRDSET *st, RRDCALC *rc) { if(!rc->units) rc->units = string_dup(st->units); + rrdvar_store_for_chart(host, st); + rrdcalc_update_info_using_rrdset_labels(rc); time_t now = now_realtime_sec(); @@ -357,13 +359,14 @@ static inline bool rrdcalc_check_if_it_matches_rrdset(RRDCALC *rc, RRDSET *st) { && (rc->chart != st->name)) return false; - if (rc->module_pattern && !simple_pattern_matches(rc->module_pattern, rrdset_module_name(st))) + if (rc->module_pattern && !simple_pattern_matches_string(rc->module_pattern, st->module_name)) return false; - if (rc->plugin_pattern && !simple_pattern_matches(rc->plugin_pattern, rrdset_plugin_name(st))) + if (rc->plugin_pattern && !simple_pattern_matches_string(rc->plugin_pattern, st->module_name)) return false; - if (st->rrdhost->rrdlabels && rc->host_labels_pattern && !rrdlabels_match_simple_pattern_parsed(st->rrdhost->rrdlabels, rc->host_labels_pattern, '=')) + if (st->rrdhost->rrdlabels && rc->host_labels_pattern && !rrdlabels_match_simple_pattern_parsed( + st->rrdhost->rrdlabels, rc->host_labels_pattern, '=', NULL)) return false; return true; @@ -739,7 +742,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host if (!rc->host_labels) continue; - if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=')) { + if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=', NULL)) { log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", rrdcalc_name(rc), rrdhost_hostname(host), @@ -752,18 +755,15 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host } void rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts() { - rrd_rdlock(); - RRDHOST *host; - rrdhost_foreach_read(host) { + dfe_start_reentrant(rrdhost_root_index, host) { if (unlikely(!host->health.health_enabled)) continue; if (host->rrdlabels) rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); } - - rrd_unlock(); + dfe_done(host); } void rrdcalc_unlink_all_rrdset_alerts(RRDSET *st) { diff --git a/database/rrdcalc.h b/database/rrdcalc.h index 08d8beee..c6d6fd4e 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -31,7 +31,7 @@ typedef enum { typedef enum { // This list uses several other options from RRDR_OPTIONS for db lookups. // To add an item here, you need to reserve a bit in RRDR_OPTIONS. - RRDCALC_OPTION_NO_CLEAR_NOTIFICATION = 0x80000000, + RRDCALC_OPTION_NO_CLEAR_NOTIFICATION = RRDR_OPTION_HEALTH_RSRVD1, } RRDCALC_OPTIONS; #define RRDCALC_ALL_OPTIONS_EXCLUDING_THE_RRDR_ONES (RRDCALC_OPTION_NO_CLEAR_NOTIFICATION) @@ -77,7 +77,7 @@ struct rrdcalc { STRING *dimensions; // the chart dimensions STRING *foreach_dimension; // the group of dimensions that the `foreach` will be applied. SIMPLE_PATTERN *foreach_dimension_pattern; // used if and only if there is a simple pattern for the chart. - RRDR_GROUPING group; // grouping method: average, max, etc. + RRDR_TIME_GROUPING group; // grouping method: average, max, etc. int before; // ending point in time-series int after; // starting point in time-series RRDCALC_OPTIONS options; // configuration options diff --git a/database/rrdcalctemplate.c b/database/rrdcalctemplate.c index 4d7352b2..4dacb6c7 100644 --- a/database/rrdcalctemplate.c +++ b/database/rrdcalctemplate.c @@ -34,26 +34,29 @@ bool rrdcalctemplate_check_rrdset_conditions(RRDCALCTEMPLATE *rt, RRDSET *st, RR if(rt->foreach_dimension_pattern && !rrdset_number_of_dimensions(st)) return false; - if (rt->charts_pattern && !simple_pattern_matches(rt->charts_pattern, rrdset_name(st)) && !simple_pattern_matches(rt->charts_pattern, rrdset_id(st))) + if (rt->charts_pattern && !simple_pattern_matches_string(rt->charts_pattern, st->name) && !simple_pattern_matches_string(rt->charts_pattern, st->id)) return false; - if (rt->family_pattern && !simple_pattern_matches(rt->family_pattern, rrdset_family(st))) + if (rt->family_pattern && !simple_pattern_matches_string(rt->family_pattern, st->family)) return false; - if (rt->module_pattern && !simple_pattern_matches(rt->module_pattern, rrdset_module_name(st))) + if (rt->module_pattern && !simple_pattern_matches_string(rt->module_pattern, st->module_name)) return false; - if (rt->plugin_pattern && !simple_pattern_matches(rt->plugin_pattern, rrdset_plugin_name(st))) + if (rt->plugin_pattern && !simple_pattern_matches_string(rt->plugin_pattern, st->plugin_name)) return false; - if(host->rrdlabels && rt->host_labels_pattern && !rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rt->host_labels_pattern, '=')) + if(host->rrdlabels && rt->host_labels_pattern && !rrdlabels_match_simple_pattern_parsed(host->rrdlabels, + rt->host_labels_pattern, + '=', NULL)) return false; return true; } void rrdcalctemplate_check_rrddim_conditions_and_link(RRDCALCTEMPLATE *rt, RRDSET *st, RRDDIM *rd, RRDHOST *host) { - if (simple_pattern_matches(rt->foreach_dimension_pattern, rrddim_id(rd)) || simple_pattern_matches(rt->foreach_dimension_pattern, rrddim_name(rd))) { + if (simple_pattern_matches_string(rt->foreach_dimension_pattern, rd->id) || + simple_pattern_matches_string(rt->foreach_dimension_pattern, rd->name)) { char *overwrite_alert_name = rrdcalc_alert_name_with_dimension( rrdcalctemplate_name(rt), string_strlen(rt->name), rrddim_name(rd), string_strlen(rd->name)); rrdcalc_add_from_rrdcalctemplate(host, rt, st, overwrite_alert_name, rrddim_name(rd)); diff --git a/database/rrdcalctemplate.h b/database/rrdcalctemplate.h index 6212a42d..22cfe06e 100644 --- a/database/rrdcalctemplate.h +++ b/database/rrdcalctemplate.h @@ -50,7 +50,7 @@ struct rrdcalctemplate { STRING *dimensions; // the chart dimensions STRING *foreach_dimension; // the group of dimensions that the lookup will be applied. SIMPLE_PATTERN *foreach_dimension_pattern; // used if and only if there is a simple pattern for the chart. - RRDR_GROUPING group; // grouping method: average, max, etc. + RRDR_TIME_GROUPING group; // grouping method: average, max, etc. int before; // ending point in time-series int after; // starting point in time-series RRDCALC_OPTIONS options; // configuration options diff --git a/database/rrdcontext.c b/database/rrdcontext.c deleted file mode 100644 index 3f1ce73f..00000000 --- a/database/rrdcontext.c +++ /dev/null @@ -1,3993 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#include "rrdcontext.h" -#include "sqlite/sqlite_context.h" -#include "aclk/schema-wrappers/context.h" -#include "aclk/aclk_contexts_api.h" -#include "aclk/aclk.h" -#include "storage_engine.h" - -#define MESSAGES_PER_BUNDLE_TO_SEND_TO_HUB_PER_HOST 5000 -#define FULL_RETENTION_SCAN_DELAY_AFTER_DB_ROTATION_SECS 120 -#define RRDCONTEXT_WORKER_THREAD_HEARTBEAT_USEC (1000 * USEC_PER_MS) -#define RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY 10 - -#define LOG_TRANSITIONS false - -#define WORKER_JOB_HOSTS 1 -#define WORKER_JOB_CHECK 2 -#define WORKER_JOB_SEND 3 -#define WORKER_JOB_DEQUEUE 4 -#define WORKER_JOB_RETENTION 5 -#define WORKER_JOB_QUEUED 6 -#define WORKER_JOB_CLEANUP 7 -#define WORKER_JOB_CLEANUP_DELETE 8 -#define WORKER_JOB_PP_METRIC 9 // post-processing metrics -#define WORKER_JOB_PP_INSTANCE 10 // post-processing instances -#define WORKER_JOB_PP_CONTEXT 11 // post-processing contexts -#define WORKER_JOB_HUB_QUEUE_SIZE 12 -#define WORKER_JOB_PP_QUEUE_SIZE 13 - - -typedef enum __attribute__ ((__packed__)) { - RRD_FLAG_NONE = 0, - RRD_FLAG_DELETED = (1 << 0), // this is a deleted object (metrics, instances, contexts) - RRD_FLAG_COLLECTED = (1 << 1), // this object is currently being collected - RRD_FLAG_UPDATED = (1 << 2), // this object has updates to propagate - RRD_FLAG_ARCHIVED = (1 << 3), // this object is not currently being collected - RRD_FLAG_OWN_LABELS = (1 << 4), // this instance has its own labels - not linked to an RRDSET - RRD_FLAG_LIVE_RETENTION = (1 << 5), // we have got live retention from the database - RRD_FLAG_QUEUED_FOR_HUB = (1 << 6), // this context is currently queued to be dispatched to hub - RRD_FLAG_QUEUED_FOR_PP = (1 << 7), // this context is currently queued to be post-processed - RRD_FLAG_HIDDEN = (1 << 8), // don't expose this to the hub or the API - - RRD_FLAG_UPDATE_REASON_TRIGGERED = (1 << 9), // the update was triggered by the child object - RRD_FLAG_UPDATE_REASON_LOAD_SQL = (1 << 10), // this object has just been loaded from SQL - RRD_FLAG_UPDATE_REASON_NEW_OBJECT = (1 << 11), // this object has just been created - RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT = (1 << 12), // we received an update on this object - RRD_FLAG_UPDATE_REASON_CHANGED_LINKING = (1 << 13), // an instance or a metric switched RRDSET or RRDDIM - RRD_FLAG_UPDATE_REASON_CHANGED_METADATA = (1 << 14), // this context or instance changed uuid, name, units, title, family, chart type, priority, update every, rrd changed flags - RRD_FLAG_UPDATE_REASON_ZERO_RETENTION = (1 << 15), // this object has no retention - RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T = (1 << 16), // this object changed its oldest time in the db - RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T = (1 << 17), // this object change its latest time in the db - RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED = (1 << 18), // this object has stopped being collected - RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED = (1 << 19), // this object has started being collected - RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD = (1 << 20), // this context belongs to a host that just disconnected - RRD_FLAG_UPDATE_REASON_UNUSED = (1 << 21), // this context is not used anymore - RRD_FLAG_UPDATE_REASON_DB_ROTATION = (1 << 22), // this context changed because of a db rotation - - // action to perform on an object - RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION = (1 << 30), // this object has to update its retention from the db -} RRD_FLAGS; - -#define RRD_FLAG_ALL_UPDATE_REASONS ( \ - RRD_FLAG_UPDATE_REASON_TRIGGERED \ - |RRD_FLAG_UPDATE_REASON_LOAD_SQL \ - |RRD_FLAG_UPDATE_REASON_NEW_OBJECT \ - |RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT \ - |RRD_FLAG_UPDATE_REASON_CHANGED_LINKING \ - |RRD_FLAG_UPDATE_REASON_CHANGED_METADATA \ - |RRD_FLAG_UPDATE_REASON_ZERO_RETENTION \ - |RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T \ - |RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T \ - |RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED \ - |RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED \ - |RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD \ - |RRD_FLAG_UPDATE_REASON_DB_ROTATION \ - |RRD_FLAG_UPDATE_REASON_UNUSED \ - ) - -#define RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS ( \ - RRD_FLAG_ARCHIVED \ - |RRD_FLAG_HIDDEN \ - |RRD_FLAG_ALL_UPDATE_REASONS \ - ) - -#define RRD_FLAGS_REQUIRED_FOR_DELETIONS ( \ - RRD_FLAG_DELETED \ - |RRD_FLAG_LIVE_RETENTION \ -) - -#define RRD_FLAGS_PREVENTING_DELETIONS ( \ - RRD_FLAG_QUEUED_FOR_HUB \ - |RRD_FLAG_COLLECTED \ - |RRD_FLAG_QUEUED_FOR_PP \ -) - -// get all the flags of an object -#define rrd_flags_get(obj) __atomic_load_n(&((obj)->flags), __ATOMIC_SEQ_CST) - -// check if ANY of the given flags (bits) is set -#define rrd_flag_check(obj, flag) (rrd_flags_get(obj) & (flag)) - -// check if ALL the given flags (bits) are set -#define rrd_flag_check_all(obj, flag) (rrd_flag_check(obj, flag) == (flag)) - -// set one or more flags (bits) -#define rrd_flag_set(obj, flag) __atomic_or_fetch(&((obj)->flags), flag, __ATOMIC_SEQ_CST) - -// clear one or more flags (bits) -#define rrd_flag_clear(obj, flag) __atomic_and_fetch(&((obj)->flags), ~(flag), __ATOMIC_SEQ_CST) - -// replace the flags of an object, with the supplied ones -#define rrd_flags_replace(obj, all_flags) __atomic_store_n(&((obj)->flags), all_flags, __ATOMIC_SEQ_CST) - -static inline void -rrd_flag_add_remove_atomic(RRD_FLAGS *flags, RRD_FLAGS check, RRD_FLAGS conditionally_add, RRD_FLAGS always_remove) { - RRD_FLAGS expected, desired; - - do { - expected = *flags; - - desired = expected; - desired &= ~(always_remove); - - if(!(expected & check)) - desired |= (check | conditionally_add); - - } while(!__atomic_compare_exchange_n(flags, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)); -} - -#define rrd_flag_set_collected(obj) \ - rrd_flag_add_remove_atomic(&((obj)->flags) \ - /* check this flag */ \ - , RRD_FLAG_COLLECTED \ - \ - /* add these flags together with the above, if the above is not already set */ \ - , RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED | RRD_FLAG_UPDATED \ - \ - /* always remove these flags */ \ - , RRD_FLAG_ARCHIVED \ - | RRD_FLAG_DELETED \ - | RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED \ - | RRD_FLAG_UPDATE_REASON_ZERO_RETENTION \ - | RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD \ - ) - -#define rrd_flag_set_archived(obj) \ - rrd_flag_add_remove_atomic(&((obj)->flags) \ - /* check this flag */ \ - , RRD_FLAG_ARCHIVED \ - \ - /* add these flags together with the above, if the above is not already set */ \ - , RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED | RRD_FLAG_UPDATED \ - \ - /* always remove these flags */ \ - , RRD_FLAG_COLLECTED \ - | RRD_FLAG_DELETED \ - | RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED \ - | RRD_FLAG_UPDATE_REASON_ZERO_RETENTION \ - ) - -#define rrd_flag_set_deleted(obj, reason) \ - rrd_flag_add_remove_atomic(&((obj)->flags) \ - /* check this flag */ \ - , RRD_FLAG_DELETED \ - \ - /* add these flags together with the above, if the above is not already set */ \ - , RRD_FLAG_UPDATE_REASON_ZERO_RETENTION | RRD_FLAG_UPDATED | (reason) \ - \ - /* always remove these flags */ \ - , RRD_FLAG_ARCHIVED \ - | RRD_FLAG_COLLECTED \ - ) - -#define rrd_flag_is_collected(obj) rrd_flag_check(obj, RRD_FLAG_COLLECTED) -#define rrd_flag_is_archived(obj) rrd_flag_check(obj, RRD_FLAG_ARCHIVED) -#define rrd_flag_is_deleted(obj) rrd_flag_check(obj, RRD_FLAG_DELETED) -#define rrd_flag_is_updated(obj) rrd_flag_check(obj, RRD_FLAG_UPDATED) - -// mark an object as updated, providing reasons (additional bits) -#define rrd_flag_set_updated(obj, reason) rrd_flag_set(obj, RRD_FLAG_UPDATED | (reason)) - -// clear an object as being updated, clearing also all the reasons -#define rrd_flag_unset_updated(obj) rrd_flag_clear(obj, RRD_FLAG_UPDATED | RRD_FLAG_ALL_UPDATE_REASONS) - - -static struct rrdcontext_reason { - RRD_FLAGS flag; - const char *name; - usec_t delay_ut; -} rrdcontext_reasons[] = { - // context related - {RRD_FLAG_UPDATE_REASON_TRIGGERED, "triggered transition", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_NEW_OBJECT, "object created", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT, "object updated", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_LOAD_SQL, "loaded from sql", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_CHANGED_METADATA, "changed metadata", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_ZERO_RETENTION, "has no retention", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T, "updated first_time_t", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T, "updated last_time_t", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED, "stopped collected", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED, "started collected", 5 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_UNUSED, "unused", 5 * USEC_PER_SEC }, - - // not context related - {RRD_FLAG_UPDATE_REASON_CHANGED_LINKING, "changed rrd link", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD, "child disconnected", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_DB_ROTATION, "db rotation", 65 * USEC_PER_SEC }, - {RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION, "updated retention", 65 * USEC_PER_SEC }, - - // terminator - {0, NULL, 0 }, -}; - - -typedef struct rrdmetric { - uuid_t uuid; - - STRING *id; - STRING *name; - - RRDDIM *rrddim; - - time_t first_time_s; - time_t last_time_s; - RRD_FLAGS flags; - - struct rrdinstance *ri; -} RRDMETRIC; - -typedef struct rrdinstance { - uuid_t uuid; - - STRING *id; - STRING *name; - STRING *title; - STRING *units; - STRING *family; - uint32_t priority; - RRDSET_TYPE chart_type; - - RRD_FLAGS flags; // flags related to this instance - time_t first_time_s; - time_t last_time_s; - - time_t update_every_s; // data collection frequency - RRDSET *rrdset; // pointer to RRDSET when collected, or NULL - - DICTIONARY *rrdlabels; // linked to RRDSET->chart_labels or own version - - struct rrdcontext *rc; - DICTIONARY *rrdmetrics; - - struct { - uint32_t collected_metrics_count; // a temporary variable to detect BEGIN/END without SET - // don't use it for other purposes - // it goes up and then resets to zero, on every iteration - } internal; -} RRDINSTANCE; - -typedef struct rrdcontext { - uint64_t version; - - STRING *id; - STRING *title; - STRING *units; - STRING *family; - uint32_t priority; - RRDSET_TYPE chart_type; - - RRD_FLAGS flags; - time_t first_time_s; - time_t last_time_s; - - VERSIONED_CONTEXT_DATA hub; - - DICTIONARY *rrdinstances; - RRDHOST *rrdhost; - - struct { - RRD_FLAGS queued_flags; // the last flags that triggered the post-processing - usec_t queued_ut; // the last time this was queued - usec_t dequeued_ut; // the last time we sent (or deduplicated) this context - size_t executions; // how many times this context has been processed - } pp; - - struct { - RRD_FLAGS queued_flags; // the last flags that triggered the queueing - usec_t queued_ut; // the last time this was queued - usec_t delay_calc_ut; // the last time we calculated the scheduled_dispatched_ut - usec_t scheduled_dispatch_ut; // the time it was/is scheduled to be sent - usec_t dequeued_ut; // the last time we sent (or deduplicated) this context - size_t dispatches; // the number of times this has been dispatched to hub - } queue; - - netdata_mutex_t mutex; -} RRDCONTEXT; - -// ---------------------------------------------------------------------------- -// helper one-liners for RRDMETRIC - -static bool rrdmetric_update_retention(RRDMETRIC *rm); - -static inline RRDMETRIC *rrdmetric_acquired_value(RRDMETRIC_ACQUIRED *rma) { - return dictionary_acquired_item_value((DICTIONARY_ITEM *)rma); -} - -static inline RRDMETRIC_ACQUIRED *rrdmetric_acquired_dup(RRDMETRIC_ACQUIRED *rma) { - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - return (RRDMETRIC_ACQUIRED *)dictionary_acquired_item_dup(rm->ri->rrdmetrics, (DICTIONARY_ITEM *)rma); -} - -static inline void rrdmetric_release(RRDMETRIC_ACQUIRED *rma) { - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - dictionary_acquired_item_release(rm->ri->rrdmetrics, (DICTIONARY_ITEM *)rma); -} - -const char *rrdmetric_acquired_id(RRDMETRIC_ACQUIRED *rma) { - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - return string2str(rm->id); -} - -const char *rrdmetric_acquired_name(RRDMETRIC_ACQUIRED *rma) { - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - return string2str(rm->name); -} - -NETDATA_DOUBLE rrdmetric_acquired_last_stored_value(RRDMETRIC_ACQUIRED *rma) { - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - - if(rm->rrddim) - return rm->rrddim->last_stored_value; - - return NAN; -} - -// ---------------------------------------------------------------------------- -// helper one-liners for RRDINSTANCE - -static inline RRDINSTANCE *rrdinstance_acquired_value(RRDINSTANCE_ACQUIRED *ria) { - return dictionary_acquired_item_value((DICTIONARY_ITEM *)ria); -} - -static inline RRDINSTANCE_ACQUIRED *rrdinstance_acquired_dup(RRDINSTANCE_ACQUIRED *ria) { - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - return (RRDINSTANCE_ACQUIRED *)dictionary_acquired_item_dup(ri->rc->rrdinstances, (DICTIONARY_ITEM *)ria); -} - -static inline void rrdinstance_release(RRDINSTANCE_ACQUIRED *ria) { - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - dictionary_acquired_item_release(ri->rc->rrdinstances, (DICTIONARY_ITEM *)ria); -} - -const char *rrdinstance_acquired_id(RRDINSTANCE_ACQUIRED *ria) { - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - return string2str(ri->id); -} - -const char *rrdinstance_acquired_name(RRDINSTANCE_ACQUIRED *ria) { - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - return string2str(ri->name); -} - -DICTIONARY *rrdinstance_acquired_labels(RRDINSTANCE_ACQUIRED *ria) { - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - return ri->rrdlabels; -} - -DICTIONARY *rrdinstance_acquired_functions(RRDINSTANCE_ACQUIRED *ria) { - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - if(!ri->rrdset) return NULL; - return ri->rrdset->functions_view; -} - -// ---------------------------------------------------------------------------- -// helper one-liners for RRDCONTEXT - -static inline RRDCONTEXT *rrdcontext_acquired_value(RRDCONTEXT_ACQUIRED *rca) { - return dictionary_acquired_item_value((DICTIONARY_ITEM *)rca); -} - -const char *rrdcontext_acquired_id(RRDCONTEXT_ACQUIRED *rca) { - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - return string2str(rc->id); -} - -static inline RRDCONTEXT_ACQUIRED *rrdcontext_acquired_dup(RRDCONTEXT_ACQUIRED *rca) { - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - return (RRDCONTEXT_ACQUIRED *)dictionary_acquired_item_dup((DICTIONARY *)rc->rrdhost->rrdctx, (DICTIONARY_ITEM *)rca); -} - -static inline void rrdcontext_release(RRDCONTEXT_ACQUIRED *rca) { - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - dictionary_acquired_item_release((DICTIONARY *)rc->rrdhost->rrdctx, (DICTIONARY_ITEM *)rca); -} - -static void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs); -static void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs); - -#define rrdcontext_version_hash(host) rrdcontext_version_hash_with_callback(host, NULL, false, NULL) -static uint64_t rrdcontext_version_hash_with_callback(RRDHOST *host, void (*callback)(RRDCONTEXT *, bool, void *), bool snapshot, void *bundle); - -static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs); -static void rrdcontext_garbage_collect_for_all_hosts(void); - -#define rrdcontext_lock(rc) netdata_mutex_lock(&((rc)->mutex)) -#define rrdcontext_unlock(rc) netdata_mutex_unlock(&((rc)->mutex)) - -// ---------------------------------------------------------------------------- -// Forward definitions - -static uint64_t rrdcontext_get_next_version(RRDCONTEXT *rc); -static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused); -static void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe_unused, void *bundle __maybe_unused); - -static void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc); - -static void rrdcontext_dequeue_from_post_processing(RRDCONTEXT *rc); -static void rrdcontext_queue_for_post_processing(RRDCONTEXT *rc, const char *function, RRD_FLAGS flags); -static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAGS reason, bool worker_jobs); - -static void rrdmetric_trigger_updates(RRDMETRIC *rm, const char *function); -static void rrdinstance_trigger_updates(RRDINSTANCE *ri, const char *function); -static void rrdcontext_trigger_updates(RRDCONTEXT *rc, const char *function); - -// ---------------------------------------------------------------------------- -// visualizing flags - -static void rrd_flags_to_buffer(RRD_FLAGS flags, BUFFER *wb) { - if(flags & RRD_FLAG_QUEUED_FOR_HUB) - buffer_strcat(wb, "QUEUED "); - - if(flags & RRD_FLAG_DELETED) - buffer_strcat(wb, "DELETED "); - - if(flags & RRD_FLAG_COLLECTED) - buffer_strcat(wb, "COLLECTED "); - - if(flags & RRD_FLAG_UPDATED) - buffer_strcat(wb, "UPDATED "); - - if(flags & RRD_FLAG_ARCHIVED) - buffer_strcat(wb, "ARCHIVED "); - - if(flags & RRD_FLAG_OWN_LABELS) - buffer_strcat(wb, "OWN_LABELS "); - - if(flags & RRD_FLAG_LIVE_RETENTION) - buffer_strcat(wb, "LIVE_RETENTION "); - - if(flags & RRD_FLAG_HIDDEN) - buffer_strcat(wb, "HIDDEN "); - - if(flags & RRD_FLAG_QUEUED_FOR_PP) - buffer_strcat(wb, "PENDING_UPDATES "); -} - -static void rrd_reasons_to_buffer(RRD_FLAGS flags, BUFFER *wb) { - for(int i = 0, added = 0; rrdcontext_reasons[i].name ; i++) { - if (flags & rrdcontext_reasons[i].flag) { - if (added) - buffer_strcat(wb, ", "); - buffer_strcat(wb, rrdcontext_reasons[i].name); - added++; - } - } -} - -// ---------------------------------------------------------------------------- -// RRDMETRIC - -// free the contents of RRDMETRIC. -// RRDMETRIC itself is managed by DICTIONARY - no need to free it here. -static void rrdmetric_free(RRDMETRIC *rm) { - string_freez(rm->id); - string_freez(rm->name); - - rm->id = NULL; - rm->name = NULL; - rm->ri = NULL; -} - -// called when this rrdmetric is inserted to the rrdmetrics dictionary of a rrdinstance -// the constructor of the rrdmetric object -static void rrdmetric_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdinstance) { - RRDMETRIC *rm = value; - - // link it to its parent - rm->ri = rrdinstance; - - // remove flags that we need to figure out at runtime - rm->flags = rm->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS; // no need for atomics - - // signal the react callback to do the job - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); -} - -// called when this rrdmetric is deleted from the rrdmetrics dictionary of a rrdinstance -// the destructor of the rrdmetric object -static void rrdmetric_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdinstance __maybe_unused) { - RRDMETRIC *rm = value; - - internal_error(rm->rrddim, "RRDMETRIC: '%s' is freed but there is a RRDDIM linked to it.", string2str(rm->id)); - - // free the resources - rrdmetric_free(rm); -} - -// called when the same rrdmetric is inserted again to the rrdmetrics dictionary of a rrdinstance -// while this is called, the dictionary is write locked, but there may be other users of the object -static bool rrdmetric_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *rrdinstance __maybe_unused) { - RRDMETRIC *rm = old_value; - RRDMETRIC *rm_new = new_value; - - internal_error(rm->id != rm_new->id, - "RRDMETRIC: '%s' cannot change id to '%s'", - string2str(rm->id), string2str(rm_new->id)); - - if(uuid_compare(rm->uuid, rm_new->uuid) != 0) { -#ifdef NETDATA_INTERNAL_CHECKS - char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; - uuid_unparse(rm->uuid, uuid1); - uuid_unparse(rm_new->uuid, uuid2); - - time_t old_first_time_s = 0; - time_t old_last_time_s = 0; - if(rrdmetric_update_retention(rm)) { - old_first_time_s = rm->first_time_s; - old_last_time_s = rm->last_time_s; - } - - uuid_copy(rm->uuid, rm_new->uuid); - - time_t new_first_time_s = 0; - time_t new_last_time_s = 0; - if(rrdmetric_update_retention(rm)) { - new_first_time_s = rm->first_time_s; - new_last_time_s = rm->last_time_s; - } - - internal_error(true, - "RRDMETRIC: '%s' of instance '%s' of host '%s' changed UUID from '%s' (retention %ld to %ld, %ld secs) to '%s' (retention %ld to %ld, %ld secs)" - , string2str(rm->id) - , string2str(rm->ri->id) - , rrdhost_hostname(rm->ri->rc->rrdhost) - , uuid1, old_first_time_s, old_last_time_s, old_last_time_s - old_first_time_s - , uuid2, new_first_time_s, new_last_time_s, new_last_time_s - new_first_time_s - ); -#else - uuid_copy(rm->uuid, rm_new->uuid); -#endif - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(rm->rrddim && rm_new->rrddim && rm->rrddim != rm_new->rrddim) { - rm->rrddim = rm_new->rrddim; - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LINKING); - } - -#ifdef NETDATA_INTERNAL_CHECKS - if(rm->rrddim && uuid_compare(rm->uuid, rm->rrddim->metric_uuid) != 0) { - char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; - uuid_unparse(rm->uuid, uuid1); - uuid_unparse(rm_new->uuid, uuid2); - internal_error(true, "RRDMETRIC: '%s' is linked to RRDDIM '%s' but they have different UUIDs. RRDMETRIC has '%s', RRDDIM has '%s'", string2str(rm->id), rrddim_id(rm->rrddim), uuid1, uuid2); - } -#endif - - if(rm->rrddim != rm_new->rrddim) - rm->rrddim = rm_new->rrddim; - - if(rm->name != rm_new->name) { - STRING *old = rm->name; - rm->name = string_dup(rm_new->name); - string_freez(old); - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(!rm->first_time_s || (rm_new->first_time_s && rm_new->first_time_s < rm->first_time_s)) { - rm->first_time_s = rm_new->first_time_s; - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if(!rm->last_time_s || (rm_new->last_time_s && rm_new->last_time_s > rm->last_time_s)) { - rm->last_time_s = rm_new->last_time_s; - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - rrd_flag_set(rm, rm_new->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS); // no needs for atomics on rm_new - - if(rrd_flag_is_collected(rm) && rrd_flag_is_archived(rm)) - rrd_flag_set_collected(rm); - - if(rrd_flag_check(rm, RRD_FLAG_UPDATED)) - rrd_flag_set(rm, RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT); - - rrdmetric_free(rm_new); - - // the react callback will continue from here - return rrd_flag_is_updated(rm); -} - -// this is called after the insert or the conflict callbacks, -// but the dictionary is now unlocked -static void rrdmetric_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdinstance __maybe_unused) { - RRDMETRIC *rm = value; - rrdmetric_trigger_updates(rm, __FUNCTION__ ); -} - -static void rrdmetrics_create_in_rrdinstance(RRDINSTANCE *ri) { - if(unlikely(!ri)) return; - if(likely(ri->rrdmetrics)) return; - - ri->rrdmetrics = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, - &dictionary_stats_category_rrdcontext, sizeof(RRDMETRIC)); - - dictionary_register_insert_callback(ri->rrdmetrics, rrdmetric_insert_callback, ri); - dictionary_register_delete_callback(ri->rrdmetrics, rrdmetric_delete_callback, ri); - dictionary_register_conflict_callback(ri->rrdmetrics, rrdmetric_conflict_callback, ri); - dictionary_register_react_callback(ri->rrdmetrics, rrdmetric_react_callback, ri); -} - -static void rrdmetrics_destroy_from_rrdinstance(RRDINSTANCE *ri) { - if(unlikely(!ri || !ri->rrdmetrics)) return; - dictionary_destroy(ri->rrdmetrics); - ri->rrdmetrics = NULL; -} - -// trigger post-processing of the rrdmetric, escalating changes to the rrdinstance it belongs -static void rrdmetric_trigger_updates(RRDMETRIC *rm, const char *function) { - if(unlikely(rrd_flag_is_collected(rm)) && (!rm->rrddim || rrd_flag_check(rm, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD))) - rrd_flag_set_archived(rm); - - if(rrd_flag_is_updated(rm) || !rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION)) { - rrd_flag_set_updated(rm->ri, RRD_FLAG_UPDATE_REASON_TRIGGERED); - rrdcontext_queue_for_post_processing(rm->ri->rc, function, rm->flags); - } -} - -// ---------------------------------------------------------------------------- -// RRDMETRIC HOOKS ON RRDDIM - -static inline void rrdmetric_from_rrddim(RRDDIM *rd) { - if(unlikely(!rd->rrdset)) - fatal("RRDMETRIC: rrddim '%s' does not have a rrdset.", rrddim_id(rd)); - - if(unlikely(!rd->rrdset->rrdhost)) - fatal("RRDMETRIC: rrdset '%s' does not have a rrdhost", rrdset_id(rd->rrdset)); - - if(unlikely(!rd->rrdset->rrdinstance)) - fatal("RRDMETRIC: rrdset '%s' does not have a rrdinstance", rrdset_id(rd->rrdset)); - - RRDINSTANCE *ri = rrdinstance_acquired_value(rd->rrdset->rrdinstance); - - RRDMETRIC trm = { - .id = string_dup(rd->id), - .name = string_dup(rd->name), - .flags = RRD_FLAG_NONE, // no need for atomics - .rrddim = rd, - }; - uuid_copy(trm.uuid, rd->metric_uuid); - - RRDMETRIC_ACQUIRED *rma = (RRDMETRIC_ACQUIRED *)dictionary_set_and_acquire_item(ri->rrdmetrics, string2str(trm.id), &trm, sizeof(trm)); - - if(rd->rrdmetric) - rrdmetric_release(rd->rrdmetric); - - rd->rrdmetric = rma; -} - -#define rrddim_get_rrdmetric(rd) rrddim_get_rrdmetric_with_trace(rd, __FUNCTION__) -static inline RRDMETRIC *rrddim_get_rrdmetric_with_trace(RRDDIM *rd, const char *function) { - if(unlikely(!rd->rrdmetric)) { - error("RRDMETRIC: RRDDIM '%s' is not linked to an RRDMETRIC at %s()", rrddim_id(rd), function); - return NULL; - } - - RRDMETRIC *rm = rrdmetric_acquired_value(rd->rrdmetric); - if(unlikely(!rm)) { - error("RRDMETRIC: RRDDIM '%s' lost the link to its RRDMETRIC at %s()", rrddim_id(rd), function); - return NULL; - } - - if(unlikely(rm->rrddim != rd)) - fatal("RRDMETRIC: '%s' is not linked to RRDDIM '%s' at %s()", string2str(rm->id), rrddim_id(rd), function); - - return rm; -} - -static inline void rrdmetric_rrddim_is_freed(RRDDIM *rd) { - RRDMETRIC *rm = rrddim_get_rrdmetric(rd); - if(unlikely(!rm)) return; - - if(unlikely(rrd_flag_is_collected(rm))) - rrd_flag_set_archived(rm); - - rm->rrddim = NULL; - rrdmetric_trigger_updates(rm, __FUNCTION__ ); - rrdmetric_release(rd->rrdmetric); - rd->rrdmetric = NULL; -} - -static inline void rrdmetric_updated_rrddim_flags(RRDDIM *rd) { - RRDMETRIC *rm = rrddim_get_rrdmetric(rd); - if(unlikely(!rm)) return; - - if(unlikely(rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED|RRDDIM_FLAG_OBSOLETE))) { - if(unlikely(rrd_flag_is_collected(rm))) - rrd_flag_set_archived(rm); - } - - rrdmetric_trigger_updates(rm, __FUNCTION__ ); -} - -static inline void rrdmetric_collected_rrddim(RRDDIM *rd) { - RRDMETRIC *rm = rrddim_get_rrdmetric(rd); - if(unlikely(!rm)) return; - - if(unlikely(!rrd_flag_is_collected(rm))) - rrd_flag_set_collected(rm); - - // we use this variable to detect BEGIN/END without SET - rm->ri->internal.collected_metrics_count++; - - rrdmetric_trigger_updates(rm, __FUNCTION__ ); -} - -// ---------------------------------------------------------------------------- -// RRDINSTANCE - -static void rrdinstance_free(RRDINSTANCE *ri) { - - if(rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) - dictionary_destroy(ri->rrdlabels); - - rrdmetrics_destroy_from_rrdinstance(ri); - string_freez(ri->id); - string_freez(ri->name); - string_freez(ri->title); - string_freez(ri->units); - string_freez(ri->family); - - ri->id = NULL; - ri->name = NULL; - ri->title = NULL; - ri->units = NULL; - ri->family = NULL; - ri->rc = NULL; - ri->rrdlabels = NULL; - ri->rrdmetrics = NULL; - ri->rrdset = NULL; -} - -static void rrdinstance_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext) { - RRDINSTANCE *ri = value; - - // link it to its parent - ri->rc = rrdcontext; - - ri->flags = ri->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS; // no need for atomics - - if(!ri->name) - ri->name = string_dup(ri->id); - - if(ri->rrdset) { - ri->rrdlabels = ri->rrdset->rrdlabels; - ri->flags &= ~RRD_FLAG_OWN_LABELS; // no need of atomics at the constructor - } - else { - ri->rrdlabels = rrdlabels_create(); - ri->flags |= RRD_FLAG_OWN_LABELS; // no need of atomics at the constructor - } - - if(ri->rrdset) { - if(unlikely(rrdset_flag_check(ri->rrdset, RRDSET_FLAG_HIDDEN))) - ri->flags |= RRD_FLAG_HIDDEN; // no need of atomics at the constructor - else - ri->flags &= ~RRD_FLAG_HIDDEN; // no need of atomics at the constructor - } - - rrdmetrics_create_in_rrdinstance(ri); - - // signal the react callback to do the job - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); -} - -static void rrdinstance_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext __maybe_unused) { - RRDINSTANCE *ri = (RRDINSTANCE *)value; - - internal_error(ri->rrdset, "RRDINSTANCE: '%s' is freed but there is a RRDSET linked to it.", string2str(ri->id)); - - rrdinstance_free(ri); -} - -static bool rrdinstance_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *rrdcontext __maybe_unused) { - RRDINSTANCE *ri = (RRDINSTANCE *)old_value; - RRDINSTANCE *ri_new = (RRDINSTANCE *)new_value; - - internal_error(ri->id != ri_new->id, - "RRDINSTANCE: '%s' cannot change id to '%s'", - string2str(ri->id), string2str(ri_new->id)); - - if(uuid_compare(ri->uuid, ri_new->uuid) != 0) { -#ifdef NETDATA_INTERNAL_CHECKS - char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; - uuid_unparse(ri->uuid, uuid1); - uuid_unparse(ri_new->uuid, uuid2); - internal_error(true, "RRDINSTANCE: '%s' of host '%s' changed UUID from '%s' to '%s'", - string2str(ri->id), rrdhost_hostname(ri->rc->rrdhost), uuid1, uuid2); -#endif - - uuid_copy(ri->uuid, ri_new->uuid); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->rrdset && ri_new->rrdset && ri->rrdset != ri_new->rrdset) { - ri->rrdset = ri_new->rrdset; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LINKING); - } - -#ifdef NETDATA_INTERNAL_CHECKS - if(ri->rrdset && uuid_compare(ri->uuid, ri->rrdset->chart_uuid) != 0) { - char uuid1[UUID_STR_LEN], uuid2[UUID_STR_LEN]; - uuid_unparse(ri->uuid, uuid1); - uuid_unparse(ri->rrdset->chart_uuid, uuid2); - internal_error(true, "RRDINSTANCE: '%s' is linked to RRDSET '%s' but they have different UUIDs. RRDINSTANCE has '%s', RRDSET has '%s'", string2str(ri->id), rrdset_id(ri->rrdset), uuid1, uuid2); - } -#endif - - if(ri->name != ri_new->name) { - STRING *old = ri->name; - ri->name = string_dup(ri_new->name); - string_freez(old); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->title != ri_new->title) { - STRING *old = ri->title; - ri->title = string_dup(ri_new->title); - string_freez(old); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->units != ri_new->units) { - STRING *old = ri->units; - ri->units = string_dup(ri_new->units); - string_freez(old); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->family != ri_new->family) { - STRING *old = ri->family; - ri->family = string_dup(ri_new->family); - string_freez(old); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->chart_type != ri_new->chart_type) { - ri->chart_type = ri_new->chart_type; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->priority != ri_new->priority) { - ri->priority = ri_new->priority; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->update_every_s != ri_new->update_every_s) { - ri->update_every_s = ri_new->update_every_s; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(ri->rrdset != ri_new->rrdset) { - ri->rrdset = ri_new->rrdset; - - if(ri->rrdset && rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) { - DICTIONARY *old = ri->rrdlabels; - ri->rrdlabels = ri->rrdset->rrdlabels; - rrd_flag_clear(ri, RRD_FLAG_OWN_LABELS); - rrdlabels_destroy(old); - } - else if(!ri->rrdset && !rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) { - ri->rrdlabels = rrdlabels_create(); - rrd_flag_set(ri, RRD_FLAG_OWN_LABELS); - } - } - - if(ri->rrdset) { - if(unlikely(rrdset_flag_check(ri->rrdset, RRDSET_FLAG_HIDDEN))) - rrd_flag_set(ri, RRD_FLAG_HIDDEN); - else - rrd_flag_clear(ri, RRD_FLAG_HIDDEN); - } - - rrd_flag_set(ri, ri_new->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS); // no need for atomics on ri_new - - if(rrd_flag_is_collected(ri) && rrd_flag_is_archived(ri)) - rrd_flag_set_collected(ri); - - if(rrd_flag_is_updated(ri)) - rrd_flag_set(ri, RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT); - - // free the new one - rrdinstance_free(ri_new); - - // the react callback will continue from here - return rrd_flag_is_updated(ri); -} - -static void rrdinstance_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdcontext __maybe_unused) { - RRDINSTANCE *ri = value; - - rrdinstance_trigger_updates(ri, __FUNCTION__ ); -} - -void rrdinstances_create_in_rrdcontext(RRDCONTEXT *rc) { - if(unlikely(!rc || rc->rrdinstances)) return; - - rc->rrdinstances = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, - &dictionary_stats_category_rrdcontext, sizeof(RRDINSTANCE)); - - dictionary_register_insert_callback(rc->rrdinstances, rrdinstance_insert_callback, rc); - dictionary_register_delete_callback(rc->rrdinstances, rrdinstance_delete_callback, rc); - dictionary_register_conflict_callback(rc->rrdinstances, rrdinstance_conflict_callback, rc); - dictionary_register_react_callback(rc->rrdinstances, rrdinstance_react_callback, rc); -} - -void rrdinstances_destroy_from_rrdcontext(RRDCONTEXT *rc) { - if(unlikely(!rc || !rc->rrdinstances)) return; - - dictionary_destroy(rc->rrdinstances); - rc->rrdinstances = NULL; -} - -static void rrdinstance_trigger_updates(RRDINSTANCE *ri, const char *function) { - RRDSET *st = ri->rrdset; - - if(likely(st)) { - if(unlikely((unsigned int) st->priority != ri->priority)) { - ri->priority = st->priority; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - if(unlikely(st->update_every != ri->update_every_s)) { - ri->update_every_s = st->update_every; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - } - else if(unlikely(rrd_flag_is_collected(ri))) { - // there is no rrdset, but we have it as collected! - - rrd_flag_set_archived(ri); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LINKING); - } - - if(rrd_flag_is_updated(ri) || !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION)) { - rrd_flag_set_updated(ri->rc, RRD_FLAG_UPDATE_REASON_TRIGGERED); - rrdcontext_queue_for_post_processing(ri->rc, function, ri->flags); - } -} - -// ---------------------------------------------------------------------------- -// RRDINSTANCE HOOKS ON RRDSET - -static inline void rrdinstance_from_rrdset(RRDSET *st) { - RRDCONTEXT trc = { - .id = string_dup(st->context), - .title = string_dup(st->title), - .units = string_dup(st->units), - .family = string_dup(st->family), - .priority = st->priority, - .chart_type = st->chart_type, - .flags = RRD_FLAG_NONE, // no need for atomics - .rrdhost = st->rrdhost, - }; - - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_set_and_acquire_item((DICTIONARY *)st->rrdhost->rrdctx, string2str(trc.id), &trc, sizeof(trc)); - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - - RRDINSTANCE tri = { - .id = string_dup(st->id), - .name = string_dup(st->name), - .units = string_dup(st->units), - .family = string_dup(st->family), - .title = string_dup(st->title), - .chart_type = st->chart_type, - .priority = st->priority, - .update_every_s = st->update_every, - .flags = RRD_FLAG_NONE, // no need for atomics - .rrdset = st, - }; - uuid_copy(tri.uuid, st->chart_uuid); - - RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_set_and_acquire_item(rc->rrdinstances, string2str(tri.id), &tri, sizeof(tri)); - - RRDCONTEXT_ACQUIRED *rca_old = st->rrdcontext; - RRDINSTANCE_ACQUIRED *ria_old = st->rrdinstance; - - st->rrdcontext = rca; - st->rrdinstance = ria; - - if(rca == rca_old) { - rrdcontext_release(rca_old); - rca_old = NULL; - } - - if(ria == ria_old) { - rrdinstance_release(ria_old); - ria_old = NULL; - } - - if(rca_old && ria_old) { - // Oops! The chart changed context! - - // RRDCONTEXT *rc_old = rrdcontext_acquired_value(rca_old); - RRDINSTANCE *ri_old = rrdinstance_acquired_value(ria_old); - - // migrate all dimensions to the new metrics - RRDDIM *rd; - rrddim_foreach_read(rd, st) { - if (!rd->rrdmetric) continue; - - RRDMETRIC *rm_old = rrdmetric_acquired_value(rd->rrdmetric); - rrd_flags_replace(rm_old, RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - rm_old->rrddim = NULL; - rm_old->first_time_s = 0; - rm_old->last_time_s = 0; - - rrdmetric_release(rd->rrdmetric); - rd->rrdmetric = NULL; - - rrdmetric_from_rrddim(rd); - } - rrddim_foreach_done(rd); - - // mark the old instance, ready to be deleted - if(!rrd_flag_check(ri_old, RRD_FLAG_OWN_LABELS)) - ri_old->rrdlabels = rrdlabels_create(); - - rrd_flags_replace(ri_old, RRD_FLAG_OWN_LABELS|RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - ri_old->rrdset = NULL; - ri_old->first_time_s = 0; - ri_old->last_time_s = 0; - - rrdinstance_trigger_updates(ri_old, __FUNCTION__ ); - rrdinstance_release(ria_old); - - /* - // trigger updates on the old context - if(!dictionary_entries(rc_old->rrdinstances) && !dictionary_stats_referenced_items(rc_old->rrdinstances)) { - rrdcontext_lock(rc_old); - rc_old->flags = ((rc_old->flags & RRD_FLAG_QUEUED)?RRD_FLAG_QUEUED:RRD_FLAG_NONE)|RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION; - rc_old->first_time_s = 0; - rc_old->last_time_s = 0; - rrdcontext_unlock(rc_old); - rrdcontext_trigger_updates(rc_old, __FUNCTION__ ); - } - else - rrdcontext_trigger_updates(rc_old, __FUNCTION__ ); - */ - - rrdcontext_release(rca_old); - rca_old = NULL; - ria_old = NULL; - } - - if(rca_old || ria_old) - fatal("RRDCONTEXT: cannot switch rrdcontext without switching rrdinstance too"); -} - -#define rrdset_get_rrdinstance(st) rrdset_get_rrdinstance_with_trace(st, __FUNCTION__); -static inline RRDINSTANCE *rrdset_get_rrdinstance_with_trace(RRDSET *st, const char *function) { - if(unlikely(!st->rrdinstance)) { - error("RRDINSTANCE: RRDSET '%s' is not linked to an RRDINSTANCE at %s()", rrdset_id(st), function); - return NULL; - } - - RRDINSTANCE *ri = rrdinstance_acquired_value(st->rrdinstance); - if(unlikely(!ri)) { - error("RRDINSTANCE: RRDSET '%s' lost its link to an RRDINSTANCE at %s()", rrdset_id(st), function); - return NULL; - } - - if(unlikely(ri->rrdset != st)) - fatal("RRDINSTANCE: '%s' is not linked to RRDSET '%s' at %s()", string2str(ri->id), rrdset_id(st), function); - - return ri; -} - -static inline void rrdinstance_rrdset_is_freed(RRDSET *st) { - RRDINSTANCE *ri = rrdset_get_rrdinstance(st); - if(unlikely(!ri)) return; - - rrd_flag_set_archived(ri); - - if(!rrd_flag_check(ri, RRD_FLAG_OWN_LABELS)) { - ri->rrdlabels = rrdlabels_create(); - rrdlabels_copy(ri->rrdlabels, st->rrdlabels); - rrd_flag_set(ri, RRD_FLAG_OWN_LABELS); - } - - ri->rrdset = NULL; - - rrdinstance_trigger_updates(ri, __FUNCTION__ ); - - rrdinstance_release(st->rrdinstance); - st->rrdinstance = NULL; - - rrdcontext_release(st->rrdcontext); - st->rrdcontext = NULL; -} - -static inline void rrdinstance_rrdset_has_updated_retention(RRDSET *st) { - RRDINSTANCE *ri = rrdset_get_rrdinstance(st); - if(unlikely(!ri)) return; - - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION); - rrdinstance_trigger_updates(ri, __FUNCTION__ ); -} - -static inline void rrdinstance_updated_rrdset_name(RRDSET *st) { - // the chart may not be initialized when this is called - if(unlikely(!st->rrdinstance)) return; - - RRDINSTANCE *ri = rrdset_get_rrdinstance(st); - if(unlikely(!ri)) return; - - if(st->name != ri->name) { - STRING *old = ri->name; - ri->name = string_dup(st->name); - string_freez(old); - - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - rrdinstance_trigger_updates(ri, __FUNCTION__ ); - } -} - -static inline void rrdinstance_updated_rrdset_flags_no_action(RRDINSTANCE *ri, RRDSET *st) { - if(unlikely(ri->rrdset != st)) - fatal("RRDCONTEXT: instance '%s' is not linked to chart '%s' on host '%s'", - string2str(ri->id), rrdset_id(st), rrdhost_hostname(st->rrdhost)); - - bool st_is_hidden = rrdset_flag_check(st, RRDSET_FLAG_HIDDEN); - bool ri_is_hidden = rrd_flag_check(ri, RRD_FLAG_HIDDEN); - - if(unlikely(st_is_hidden != ri_is_hidden)) { - if (unlikely(st_is_hidden && !ri_is_hidden)) - rrd_flag_set_updated(ri, RRD_FLAG_HIDDEN | RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - - else if (unlikely(!st_is_hidden && ri_is_hidden)) { - rrd_flag_clear(ri, RRD_FLAG_HIDDEN); - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - } -} - -static inline void rrdinstance_updated_rrdset_flags(RRDSET *st) { - RRDINSTANCE *ri = rrdset_get_rrdinstance(st); - if(unlikely(!ri)) return; - - if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED|RRDSET_FLAG_OBSOLETE))) - rrd_flag_set_archived(ri); - - rrdinstance_updated_rrdset_flags_no_action(ri, st); - - rrdinstance_trigger_updates(ri, __FUNCTION__ ); -} - -static inline void rrdinstance_collected_rrdset(RRDSET *st) { - RRDINSTANCE *ri = rrdset_get_rrdinstance(st); - if(unlikely(!ri)) return; - - rrdinstance_updated_rrdset_flags_no_action(ri, st); - - if(unlikely(ri->internal.collected_metrics_count && !rrd_flag_is_collected(ri))) - rrd_flag_set_collected(ri); - - // we use this variable to detect BEGIN/END without SET - ri->internal.collected_metrics_count = 0; - - rrdinstance_trigger_updates(ri, __FUNCTION__ ); -} - -// ---------------------------------------------------------------------------- -// RRDCONTEXT - -static void rrdcontext_freez(RRDCONTEXT *rc) { - string_freez(rc->id); - string_freez(rc->title); - string_freez(rc->units); - string_freez(rc->family); -} - -static void rrdcontext_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdhost) { - RRDHOST *host = (RRDHOST *)rrdhost; - RRDCONTEXT *rc = (RRDCONTEXT *)value; - - rc->rrdhost = host; - rc->flags = rc->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS; // no need for atomics at constructor - - if(rc->hub.version) { - // we are loading data from the SQL database - - if(rc->version) - error("RRDCONTEXT: context '%s' is already initialized with version %"PRIu64", but it is loaded again from SQL with version %"PRIu64"", string2str(rc->id), rc->version, rc->hub.version); - - // IMPORTANT - // replace all string pointers in rc->hub with our own versions - // the originals are coming from a tmp allocation of sqlite - - string_freez(rc->id); - rc->id = string_strdupz(rc->hub.id); - rc->hub.id = string2str(rc->id); - - string_freez(rc->title); - rc->title = string_strdupz(rc->hub.title); - rc->hub.title = string2str(rc->title); - - string_freez(rc->units); - rc->units = string_strdupz(rc->hub.units); - rc->hub.units = string2str(rc->units); - - string_freez(rc->family); - rc->family = string_strdupz(rc->hub.family); - rc->hub.family = string2str(rc->family); - - rc->chart_type = rrdset_type_id(rc->hub.chart_type); - rc->hub.chart_type = rrdset_type_name(rc->chart_type); - - rc->version = rc->hub.version; - rc->priority = rc->hub.priority; - rc->first_time_s = (time_t)rc->hub.first_time_s; - rc->last_time_s = (time_t)rc->hub.last_time_s; - - if(rc->hub.deleted || !rc->hub.first_time_s) - rrd_flag_set_deleted(rc, RRD_FLAG_NONE); - else { - if (rc->last_time_s == 0) - rrd_flag_set_collected(rc); - else - rrd_flag_set_archived(rc); - } - - rc->flags |= RRD_FLAG_UPDATE_REASON_LOAD_SQL; // no need for atomics at constructor - } - else { - // we are adding this context now for the first time - rc->version = now_realtime_sec(); - } - - rrdinstances_create_in_rrdcontext(rc); - netdata_mutex_init(&rc->mutex); - - // signal the react callback to do the job - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); -} - -static void rrdcontext_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdhost __maybe_unused) { - - RRDCONTEXT *rc = (RRDCONTEXT *)value; - - rrdinstances_destroy_from_rrdcontext(rc); - netdata_mutex_destroy(&rc->mutex); - rrdcontext_freez(rc); -} - -static bool rrdcontext_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *rrdhost __maybe_unused) { - RRDCONTEXT *rc = (RRDCONTEXT *)old_value; - RRDCONTEXT *rc_new = (RRDCONTEXT *)new_value; - - //current rc is not archived, new_rc is archived, don't merge - if (!rrd_flag_is_archived(rc) && rrd_flag_is_archived(rc_new)) { - rrdcontext_freez(rc_new); - return false; - } - - rrdcontext_lock(rc); - - if(rc->title != rc_new->title) { - STRING *old_title = rc->title; - if (rrd_flag_is_archived(rc) && !rrd_flag_is_archived(rc_new)) - rc->title = string_dup(rc_new->title); - else - rc->title = string_2way_merge(rc->title, rc_new->title); - string_freez(old_title); - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(rc->units != rc_new->units) { - STRING *old_units = rc->units; - rc->units = string_dup(rc_new->units); - string_freez(old_units); - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(rc->family != rc_new->family) { - STRING *old_family = rc->family; - if (rrd_flag_is_archived(rc) && !rrd_flag_is_archived(rc_new)) - rc->family = string_dup(rc_new->family); - else - rc->family = string_2way_merge(rc->family, rc_new->family); - string_freez(old_family); - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(rc->chart_type != rc_new->chart_type) { - rc->chart_type = rc_new->chart_type; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - if(rc->priority != rc_new->priority) { - rc->priority = rc_new->priority; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - - rrd_flag_set(rc, rc_new->flags & RRD_FLAGS_ALLOWED_EXTERNALLY_ON_NEW_OBJECTS); // no need for atomics on rc_new - - if(rrd_flag_is_collected(rc) && rrd_flag_is_archived(rc)) - rrd_flag_set_collected(rc); - - if(rrd_flag_is_updated(rc)) - rrd_flag_set(rc, RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT); - - rrdcontext_unlock(rc); - - // free the resources of the new one - rrdcontext_freez(rc_new); - - // the react callback will continue from here - return rrd_flag_is_updated(rc); -} - -static void rrdcontext_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *rrdhost __maybe_unused) { - RRDCONTEXT *rc = (RRDCONTEXT *)value; - rrdcontext_trigger_updates(rc, __FUNCTION__ ); -} - -static void rrdcontext_trigger_updates(RRDCONTEXT *rc, const char *function) { - if(rrd_flag_is_updated(rc) || !rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) - rrdcontext_queue_for_post_processing(rc, function, rc->flags); -} - -static void rrdcontext_hub_queue_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { - RRDCONTEXT *rc = context; - rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_HUB); - rc->queue.queued_ut = now_realtime_usec(); - rc->queue.queued_flags = rrd_flags_get(rc); -} - -static void rrdcontext_hub_queue_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { - RRDCONTEXT *rc = context; - rrd_flag_clear(rc, RRD_FLAG_QUEUED_FOR_HUB); -} - -static bool rrdcontext_hub_queue_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *new_context __maybe_unused, void *nothing __maybe_unused) { - // context and new_context are the same - // we just need to update the timings - RRDCONTEXT *rc = context; - rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_HUB); - rc->queue.queued_ut = now_realtime_usec(); - rc->queue.queued_flags |= rrd_flags_get(rc); - - return true; -} - -static void rrdcontext_post_processing_queue_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { - RRDCONTEXT *rc = context; - rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_PP); - rc->pp.queued_flags = rc->flags; - rc->pp.queued_ut = now_realtime_usec(); -} - -static void rrdcontext_post_processing_queue_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *nothing __maybe_unused) { - RRDCONTEXT *rc = context; - rrd_flag_clear(rc, RRD_FLAG_QUEUED_FOR_PP); - rc->pp.dequeued_ut = now_realtime_usec(); -} - -static bool rrdcontext_post_processing_queue_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *context, void *new_context __maybe_unused, void *nothing __maybe_unused) { - RRDCONTEXT *rc = context; - bool changed = false; - - if(!(rc->flags & RRD_FLAG_QUEUED_FOR_PP)) { - rrd_flag_set(rc, RRD_FLAG_QUEUED_FOR_PP); - changed = true; - } - - if(rc->pp.queued_flags != rc->flags) { - rc->pp.queued_flags |= rc->flags; - changed = true; - } - - return changed; -} - -void rrdhost_create_rrdcontexts(RRDHOST *host) { - if(unlikely(!host)) return; - if(likely(host->rrdctx)) return; - - host->rrdctx = (RRDCONTEXTS *)dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, - &dictionary_stats_category_rrdcontext, sizeof(RRDCONTEXT)); - - dictionary_register_insert_callback((DICTIONARY *)host->rrdctx, rrdcontext_insert_callback, host); - dictionary_register_delete_callback((DICTIONARY *)host->rrdctx, rrdcontext_delete_callback, host); - dictionary_register_conflict_callback((DICTIONARY *)host->rrdctx, rrdcontext_conflict_callback, host); - dictionary_register_react_callback((DICTIONARY *)host->rrdctx, rrdcontext_react_callback, host); - - host->rrdctx_hub_queue = (RRDCONTEXTS *)dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE, &dictionary_stats_category_rrdcontext, 0); - dictionary_register_insert_callback((DICTIONARY *)host->rrdctx_hub_queue, rrdcontext_hub_queue_insert_callback, NULL); - dictionary_register_delete_callback((DICTIONARY *)host->rrdctx_hub_queue, rrdcontext_hub_queue_delete_callback, NULL); - dictionary_register_conflict_callback((DICTIONARY *)host->rrdctx_hub_queue, rrdcontext_hub_queue_conflict_callback, NULL); - - host->rrdctx_post_processing_queue = (RRDCONTEXTS *)dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_VALUE_LINK_DONT_CLONE, &dictionary_stats_category_rrdcontext, 0); - dictionary_register_insert_callback((DICTIONARY *)host->rrdctx_post_processing_queue, rrdcontext_post_processing_queue_insert_callback, NULL); - dictionary_register_delete_callback((DICTIONARY *)host->rrdctx_post_processing_queue, rrdcontext_post_processing_queue_delete_callback, NULL); - dictionary_register_conflict_callback((DICTIONARY *)host->rrdctx_post_processing_queue, rrdcontext_post_processing_queue_conflict_callback, NULL); -} - -void rrdhost_destroy_rrdcontexts(RRDHOST *host) { - if(unlikely(!host)) return; - if(unlikely(!host->rrdctx)) return; - - DICTIONARY *old; - - if(host->rrdctx_hub_queue) { - old = (DICTIONARY *)host->rrdctx_hub_queue; - host->rrdctx_hub_queue = NULL; - - RRDCONTEXT *rc; - dfe_start_write(old, rc) { - dictionary_del(old, string2str(rc->id)); - } - dfe_done(rc); - dictionary_destroy(old); - } - - if(host->rrdctx_post_processing_queue) { - old = (DICTIONARY *)host->rrdctx_post_processing_queue; - host->rrdctx_post_processing_queue = NULL; - - RRDCONTEXT *rc; - dfe_start_write(old, rc) { - dictionary_del(old, string2str(rc->id)); - } - dfe_done(rc); - dictionary_destroy(old); - } - - old = (DICTIONARY *)host->rrdctx; - host->rrdctx = NULL; - dictionary_destroy(old); -} - -// ---------------------------------------------------------------------------- -// public API - -void rrdcontext_updated_rrddim(RRDDIM *rd) { - rrdmetric_from_rrddim(rd); -} - -void rrdcontext_removed_rrddim(RRDDIM *rd) { - rrdmetric_rrddim_is_freed(rd); -} - -void rrdcontext_updated_rrddim_algorithm(RRDDIM *rd) { - rrdmetric_updated_rrddim_flags(rd); -} - -void rrdcontext_updated_rrddim_multiplier(RRDDIM *rd) { - rrdmetric_updated_rrddim_flags(rd); -} - -void rrdcontext_updated_rrddim_divisor(RRDDIM *rd) { - rrdmetric_updated_rrddim_flags(rd); -} - -void rrdcontext_updated_rrddim_flags(RRDDIM *rd) { - rrdmetric_updated_rrddim_flags(rd); -} - -void rrdcontext_collected_rrddim(RRDDIM *rd) { - rrdmetric_collected_rrddim(rd); -} - -void rrdcontext_updated_rrdset(RRDSET *st) { - rrdinstance_from_rrdset(st); -} - -void rrdcontext_removed_rrdset(RRDSET *st) { - rrdinstance_rrdset_is_freed(st); -} - -void rrdcontext_updated_retention_rrdset(RRDSET *st) { - rrdinstance_rrdset_has_updated_retention(st); -} - -void rrdcontext_updated_rrdset_name(RRDSET *st) { - rrdinstance_updated_rrdset_name(st); -} - -void rrdcontext_updated_rrdset_flags(RRDSET *st) { - rrdinstance_updated_rrdset_flags(st); -} - -void rrdcontext_collected_rrdset(RRDSET *st) { - rrdinstance_collected_rrdset(st); -} - -void rrdcontext_host_child_connected(RRDHOST *host) { - (void)host; - - // no need to do anything here - ; -} - -int rrdcontext_find_dimension_uuid(RRDSET *st, const char *id, uuid_t *store_uuid) { - if(!st->rrdhost) return 1; - if(!st->context) return 2; - - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item((DICTIONARY *)st->rrdhost->rrdctx, string2str(st->context)); - if(!rca) return 3; - - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - - RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_get_and_acquire_item(rc->rrdinstances, string2str(st->id)); - if(!ria) { - rrdcontext_release(rca); - return 4; - } - - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - - RRDMETRIC_ACQUIRED *rma = (RRDMETRIC_ACQUIRED *)dictionary_get_and_acquire_item(ri->rrdmetrics, id); - if(!rma) { - rrdinstance_release(ria); - rrdcontext_release(rca); - return 5; - } - - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - - uuid_copy(*store_uuid, rm->uuid); - - rrdmetric_release(rma); - rrdinstance_release(ria); - rrdcontext_release(rca); - return 0; -} - -int rrdcontext_find_chart_uuid(RRDSET *st, uuid_t *store_uuid) { - if(!st->rrdhost) return 1; - if(!st->context) return 2; - - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item((DICTIONARY *)st->rrdhost->rrdctx, string2str(st->context)); - if(!rca) return 3; - - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - - RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_get_and_acquire_item(rc->rrdinstances, string2str(st->id)); - if(!ria) { - rrdcontext_release(rca); - return 4; - } - - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - uuid_copy(*store_uuid, ri->uuid); - - rrdinstance_release(ria); - rrdcontext_release(rca); - return 0; -} - -void rrdcontext_host_child_disconnected(RRDHOST *host) { - rrdcontext_recalculate_host_retention(host, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD, false); -} - -static usec_t rrdcontext_next_db_rotation_ut = 0; -void rrdcontext_db_rotation(void) { - // called when the db rotates its database - rrdcontext_next_db_rotation_ut = now_realtime_usec() + FULL_RETENTION_SCAN_DELAY_AFTER_DB_ROTATION_SECS * USEC_PER_SEC; -} - -int rrdcontext_foreach_instance_with_rrdset_in_context(RRDHOST *host, const char *context, int (*callback)(RRDSET *st, void *data), void *data) { - if(unlikely(!host || !context || !*context || !callback)) - return -1; - - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item((DICTIONARY *)host->rrdctx, context); - if(unlikely(!rca)) return -1; - - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - if(unlikely(!rc)) return -1; - - int ret = 0; - RRDINSTANCE *ri; - dfe_start_read(rc->rrdinstances, ri) { - if(ri->rrdset) { - int r = callback(ri->rrdset, data); - if(r >= 0) ret += r; - else { - ret = r; - break; - } - } - } - dfe_done(ri); - - rrdcontext_release(rca); - - return ret; -} - -// ---------------------------------------------------------------------------- -// ACLK interface - -static bool rrdhost_check_our_claim_id(const char *claim_id) { - if(!localhost->aclk_state.claimed_id) return false; - return (strcasecmp(claim_id, localhost->aclk_state.claimed_id) == 0) ? true : false; -} - -static RRDHOST *rrdhost_find_by_node_id(const char *node_id) { - uuid_t uuid; - if (uuid_parse(node_id, uuid)) - return NULL; - - RRDHOST *host = NULL; - - rrd_rdlock(); - rrdhost_foreach_read(host) { - if(!host->node_id) continue; - - if(uuid_compare(uuid, *host->node_id) == 0) - break; - } - rrd_unlock(); - - return host; -} - -void rrdcontext_hub_checkpoint_command(void *ptr) { - struct ctxs_checkpoint *cmd = ptr; - - if(!rrdhost_check_our_claim_id(cmd->claim_id)) { - error("RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", - cmd->claim_id, cmd->node_id, - localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", - cmd->claim_id); - - return; - } - - RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); - if(!host) { - error("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", - cmd->claim_id, cmd->node_id); - - return; - } - - if(rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { - info("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', while node '%s' has an active context streaming.", - cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); - - // disable it temporarily, so that our worker will not attempt to send messages in parallel - rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); - } - - uint64_t our_version_hash = rrdcontext_version_hash(host); - - if(cmd->version_hash != our_version_hash) { - error("RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". Sending snapshot of all contexts.", - cmd->version_hash, rrdhost_hostname(host), our_version_hash); - -#ifdef ENABLE_ACLK - // prepare the snapshot - char uuid[UUID_STR_LEN]; - uuid_unparse_lower(*host->node_id, uuid); - contexts_snapshot_t bundle = contexts_snapshot_new(cmd->claim_id, uuid, our_version_hash); - - // do a deep scan on every metric of the host to make sure all our data are updated - rrdcontext_recalculate_host_retention(host, RRD_FLAG_NONE, false); - - // calculate version hash and pack all the messages together in one go - our_version_hash = rrdcontext_version_hash_with_callback(host, rrdcontext_message_send_unsafe, true, bundle); - - // update the version - contexts_snapshot_set_version(bundle, our_version_hash); - - // send it - aclk_send_contexts_snapshot(bundle); -#endif - } - - internal_error(true, "RRDCONTEXT: host '%s' enabling streaming of contexts", rrdhost_hostname(host)); - rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); - char node_str[UUID_STR_LEN]; - uuid_unparse_lower(*host->node_id, node_str); - log_access("ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED", node_str, rrdhost_hostname(host)); -} - -void rrdcontext_hub_stop_streaming_command(void *ptr) { - struct stop_streaming_ctxs *cmd = ptr; - - if(!rrdhost_check_our_claim_id(cmd->claim_id)) { - error("RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", - cmd->claim_id, cmd->node_id, - localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", - cmd->claim_id); - - return; - } - - RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); - if(!host) { - error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", - cmd->claim_id, cmd->node_id); - - return; - } - - if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { - error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but node '%s' does not have active context streaming. Ignoring command.", - cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); - - return; - } - - internal_error(true, "RRDCONTEXT: host '%s' disabling streaming of contexts", rrdhost_hostname(host)); - rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); -} - -// ---------------------------------------------------------------------------- -// web API - -struct rrdcontext_to_json { - BUFFER *wb; - RRDCONTEXT_TO_JSON_OPTIONS options; - time_t after; - time_t before; - SIMPLE_PATTERN *chart_label_key; - SIMPLE_PATTERN *chart_labels_filter; - SIMPLE_PATTERN *chart_dimensions; - size_t written; - time_t now; - time_t combined_first_time_s; - time_t combined_last_time_s; - RRD_FLAGS combined_flags; -}; - -static inline int rrdmetric_to_json_callback(const DICTIONARY_ITEM *item, void *value, void *data) { - const char *id = dictionary_acquired_item_name(item); - struct rrdcontext_to_json * t = data; - RRDMETRIC *rm = value; - BUFFER *wb = t->wb; - RRDCONTEXT_TO_JSON_OPTIONS options = t->options; - time_t after = t->after; - time_t before = t->before; - - if(unlikely(rrd_flag_is_deleted(rm) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) - return 0; - - if(after && (!rm->last_time_s || after > rm->last_time_s)) - return 0; - - if(before && (!rm->first_time_s || before < rm->first_time_s)) - return 0; - - if(t->chart_dimensions - && !simple_pattern_matches(t->chart_dimensions, string2str(rm->id)) - && !simple_pattern_matches(t->chart_dimensions, string2str(rm->name))) - return 0; - - if(t->written) { - buffer_strcat(wb, ",\n"); - t->combined_first_time_s = MIN(t->combined_first_time_s, rm->first_time_s); - t->combined_last_time_s = MAX(t->combined_last_time_s, rm->last_time_s); - t->combined_flags |= rrd_flags_get(rm); - } - else { - buffer_strcat(wb, "\n"); - t->combined_first_time_s = rm->first_time_s; - t->combined_last_time_s = rm->last_time_s; - t->combined_flags = rrd_flags_get(rm); - } - - buffer_sprintf(wb, "\t\t\t\t\t\t\"%s\": {", id); - - if(options & RRDCONTEXT_OPTION_SHOW_UUIDS) { - char uuid[UUID_STR_LEN]; - uuid_unparse(rm->uuid, uuid); - buffer_sprintf(wb, "\n\t\t\t\t\t\t\t\"uuid\":\"%s\",", uuid); - } - - buffer_sprintf(wb, - "\n\t\t\t\t\t\t\t\"name\":\"%s\"" - ",\n\t\t\t\t\t\t\t\"first_time_t\":%lld" - ",\n\t\t\t\t\t\t\t\"last_time_t\":%lld" - ",\n\t\t\t\t\t\t\t\"collected\":%s" - , string2str(rm->name) - , (long long)rm->first_time_s - , rrd_flag_is_collected(rm) ? (long long)t->now : (long long)rm->last_time_s - , rrd_flag_is_collected(rm) ? "true" : "false" - ); - - if(options & RRDCONTEXT_OPTION_SHOW_DELETED) { - buffer_sprintf(wb, - ",\n\t\t\t\t\t\t\t\"deleted\":%s" - , rrd_flag_is_deleted(rm) ? "true" : "false" - ); - } - - if(options & RRDCONTEXT_OPTION_SHOW_FLAGS) { - buffer_strcat(wb, ",\n\t\t\t\t\t\t\t\"flags\":\""); - rrd_flags_to_buffer(rrd_flags_get(rm), wb); - buffer_strcat(wb, "\""); - } - - buffer_strcat(wb, "\n\t\t\t\t\t\t}"); - t->written++; - return 1; -} - -static inline int rrdinstance_to_json_callback(const DICTIONARY_ITEM *item, void *value, void *data) { - const char *id = dictionary_acquired_item_name(item); - - struct rrdcontext_to_json *t_parent = data; - RRDINSTANCE *ri = value; - BUFFER *wb = t_parent->wb; - RRDCONTEXT_TO_JSON_OPTIONS options = t_parent->options; - time_t after = t_parent->after; - time_t before = t_parent->before; - bool has_filter = t_parent->chart_label_key || t_parent->chart_labels_filter || t_parent->chart_dimensions; - - if(unlikely(rrd_flag_is_deleted(ri) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) - return 0; - - if(after && (!ri->last_time_s || after > ri->last_time_s)) - return 0; - - if(before && (!ri->first_time_s || before < ri->first_time_s)) - return 0; - - if(t_parent->chart_label_key && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, t_parent->chart_label_key, '\0')) - return 0; - - if(t_parent->chart_labels_filter && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, t_parent->chart_labels_filter, ':')) - return 0; - - time_t first_time_s = ri->first_time_s; - time_t last_time_s = ri->last_time_s; - RRD_FLAGS flags = rrd_flags_get(ri); - - BUFFER *wb_metrics = NULL; - if(options & RRDCONTEXT_OPTION_SHOW_METRICS || t_parent->chart_dimensions) { - - wb_metrics = buffer_create(4096, &netdata_buffers_statistics.buffers_api); - - struct rrdcontext_to_json t_metrics = { - .wb = wb_metrics, - .options = options, - .chart_label_key = t_parent->chart_label_key, - .chart_labels_filter = t_parent->chart_labels_filter, - .chart_dimensions = t_parent->chart_dimensions, - .after = after, - .before = before, - .written = 0, - .now = t_parent->now, - }; - dictionary_walkthrough_read(ri->rrdmetrics, rrdmetric_to_json_callback, &t_metrics); - - if(has_filter && !t_metrics.written) { - buffer_free(wb_metrics); - return 0; - } - - first_time_s = t_metrics.combined_first_time_s; - last_time_s = t_metrics.combined_last_time_s; - flags = t_metrics.combined_flags; - } - - if(t_parent->written) { - buffer_strcat(wb, ",\n"); - t_parent->combined_first_time_s = MIN(t_parent->combined_first_time_s, first_time_s); - t_parent->combined_last_time_s = MAX(t_parent->combined_last_time_s, last_time_s); - t_parent->combined_flags |= flags; - } - else { - buffer_strcat(wb, "\n"); - t_parent->combined_first_time_s = first_time_s; - t_parent->combined_last_time_s = last_time_s; - t_parent->combined_flags = flags; - } - - buffer_sprintf(wb, "\t\t\t\t\"%s\": {", id); - - if(options & RRDCONTEXT_OPTION_SHOW_UUIDS) { - char uuid[UUID_STR_LEN]; - uuid_unparse(ri->uuid, uuid); - buffer_sprintf(wb,"\n\t\t\t\t\t\"uuid\":\"%s\",", uuid); - } - - buffer_sprintf(wb, - "\n\t\t\t\t\t\"name\":\"%s\"" - ",\n\t\t\t\t\t\"context\":\"%s\"" - ",\n\t\t\t\t\t\"title\":\"%s\"" - ",\n\t\t\t\t\t\"units\":\"%s\"" - ",\n\t\t\t\t\t\"family\":\"%s\"" - ",\n\t\t\t\t\t\"chart_type\":\"%s\"" - ",\n\t\t\t\t\t\"priority\":%u" - ",\n\t\t\t\t\t\"update_every\":%ld" - ",\n\t\t\t\t\t\"first_time_t\":%lld" - ",\n\t\t\t\t\t\"last_time_t\":%lld" - ",\n\t\t\t\t\t\"collected\":%s" - , string2str(ri->name) - , string2str(ri->rc->id) - , string2str(ri->title) - , string2str(ri->units) - , string2str(ri->family) - , rrdset_type_name(ri->chart_type) - , ri->priority - , ri->update_every_s - , (long long)first_time_s - , (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_s - , (flags & RRD_FLAG_COLLECTED) ? "true" : "false" - ); - - if(options & RRDCONTEXT_OPTION_SHOW_DELETED) { - buffer_sprintf(wb, - ",\n\t\t\t\t\t\"deleted\":%s" - , rrd_flag_is_deleted(ri) ? "true" : "false" - ); - } - - if(options & RRDCONTEXT_OPTION_SHOW_FLAGS) { - buffer_strcat(wb, ",\n\t\t\t\t\t\"flags\":\""); - rrd_flags_to_buffer(rrd_flags_get(ri), wb); - buffer_strcat(wb, "\""); - } - - if(options & RRDCONTEXT_OPTION_SHOW_LABELS && ri->rrdlabels && dictionary_entries(ri->rrdlabels)) { - buffer_sprintf(wb, ",\n\t\t\t\t\t\"labels\": {\n"); - rrdlabels_to_buffer(ri->rrdlabels, wb, "\t\t\t\t\t\t", ":", "\"", ",\n", NULL, NULL, NULL, NULL); - buffer_strcat(wb, "\n\t\t\t\t\t}"); - } - - if(wb_metrics) { - buffer_sprintf(wb, ",\n\t\t\t\t\t\"dimensions\": {"); - buffer_fast_strcat(wb, buffer_tostring(wb_metrics), buffer_strlen(wb_metrics)); - buffer_strcat(wb, "\n\t\t\t\t\t}"); - - buffer_free(wb_metrics); - } - - buffer_strcat(wb, "\n\t\t\t\t}"); - t_parent->written++; - return 1; -} - -static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void *value, void *data) { - const char *id = dictionary_acquired_item_name(item); - struct rrdcontext_to_json *t_parent = data; - RRDCONTEXT *rc = value; - BUFFER *wb = t_parent->wb; - RRDCONTEXT_TO_JSON_OPTIONS options = t_parent->options; - time_t after = t_parent->after; - time_t before = t_parent->before; - bool has_filter = t_parent->chart_label_key || t_parent->chart_labels_filter || t_parent->chart_dimensions; - - if(unlikely(rrd_flag_check(rc, RRD_FLAG_HIDDEN) && !(options & RRDCONTEXT_OPTION_SHOW_HIDDEN))) - return 0; - - if(unlikely(rrd_flag_is_deleted(rc) && !(options & RRDCONTEXT_OPTION_SHOW_DELETED))) - return 0; - - if(options & RRDCONTEXT_OPTION_DEEPSCAN) - rrdcontext_recalculate_context_retention(rc, RRD_FLAG_NONE, false); - - if(after && (!rc->last_time_s || after > rc->last_time_s)) - return 0; - - if(before && (!rc->first_time_s || before < rc->first_time_s)) - return 0; - - time_t first_time_s = rc->first_time_s; - time_t last_time_s = rc->last_time_s; - RRD_FLAGS flags = rrd_flags_get(rc); - - BUFFER *wb_instances = NULL; - if((options & (RRDCONTEXT_OPTION_SHOW_LABELS|RRDCONTEXT_OPTION_SHOW_INSTANCES|RRDCONTEXT_OPTION_SHOW_METRICS)) - || t_parent->chart_label_key - || t_parent->chart_labels_filter - || t_parent->chart_dimensions) { - - wb_instances = buffer_create(4096, &netdata_buffers_statistics.buffers_api); - - struct rrdcontext_to_json t_instances = { - .wb = wb_instances, - .options = options, - .chart_label_key = t_parent->chart_label_key, - .chart_labels_filter = t_parent->chart_labels_filter, - .chart_dimensions = t_parent->chart_dimensions, - .after = after, - .before = before, - .written = 0, - .now = t_parent->now, - }; - dictionary_walkthrough_read(rc->rrdinstances, rrdinstance_to_json_callback, &t_instances); - - if(has_filter && !t_instances.written) { - buffer_free(wb_instances); - return 0; - } - - first_time_s = t_instances.combined_first_time_s; - last_time_s = t_instances.combined_last_time_s; - flags = t_instances.combined_flags; - } - - if(t_parent->written) - buffer_strcat(wb, ",\n"); - else - buffer_strcat(wb, "\n"); - - if(options & RRDCONTEXT_OPTION_SKIP_ID) - buffer_sprintf(wb, "\t\t\{"); - else - buffer_sprintf(wb, "\t\t\"%s\": {", id); - - rrdcontext_lock(rc); - - buffer_sprintf(wb, - "\n\t\t\t\"title\":\"%s\"" - ",\n\t\t\t\"units\":\"%s\"" - ",\n\t\t\t\"family\":\"%s\"" - ",\n\t\t\t\"chart_type\":\"%s\"" - ",\n\t\t\t\"priority\":%u" - ",\n\t\t\t\"first_time_t\":%lld" - ",\n\t\t\t\"last_time_t\":%lld" - ",\n\t\t\t\"collected\":%s" - , string2str(rc->title) - , string2str(rc->units) - , string2str(rc->family) - , rrdset_type_name(rc->chart_type) - , rc->priority - , (long long)first_time_s - , (flags & RRD_FLAG_COLLECTED) ? (long long)t_parent->now : (long long)last_time_s - , (flags & RRD_FLAG_COLLECTED) ? "true" : "false" - ); - - if(options & RRDCONTEXT_OPTION_SHOW_DELETED) { - buffer_sprintf(wb, - ",\n\t\t\t\"deleted\":%s" - , rrd_flag_is_deleted(rc) ? "true" : "false" - ); - } - - if(options & RRDCONTEXT_OPTION_SHOW_FLAGS) { - buffer_strcat(wb, ",\n\t\t\t\"flags\":\""); - rrd_flags_to_buffer(rrd_flags_get(rc), wb); - buffer_strcat(wb, "\""); - } - - if(options & RRDCONTEXT_OPTION_SHOW_QUEUED) { - buffer_strcat(wb, ",\n\t\t\t\"queued_reasons\":\""); - rrd_reasons_to_buffer(rc->queue.queued_flags, wb); - buffer_strcat(wb, "\""); - - buffer_sprintf(wb, - ",\n\t\t\t\"last_queued\":%llu" - ",\n\t\t\t\"scheduled_dispatch\":%llu" - ",\n\t\t\t\"last_dequeued\":%llu" - ",\n\t\t\t\"dispatches\":%zu" - ",\n\t\t\t\"hub_version\":%"PRIu64"" - ",\n\t\t\t\"version\":%"PRIu64"" - , rc->queue.queued_ut / USEC_PER_SEC - , rc->queue.scheduled_dispatch_ut / USEC_PER_SEC - , rc->queue.dequeued_ut / USEC_PER_SEC - , rc->queue.dispatches - , rc->hub.version - , rc->version - ); - - buffer_strcat(wb, ",\n\t\t\t\"pp_reasons\":\""); - rrd_reasons_to_buffer(rc->pp.queued_flags, wb); - buffer_strcat(wb, "\""); - - buffer_sprintf(wb, - ",\n\t\t\t\"pp_last_queued\":%llu" - ",\n\t\t\t\"pp_last_dequeued\":%llu" - ",\n\t\t\t\"pp_executed\":%zu" - , rc->pp.queued_ut / USEC_PER_SEC - , rc->pp.dequeued_ut / USEC_PER_SEC - , rc->pp.executions - ); - } - - rrdcontext_unlock(rc); - - if(wb_instances) { - buffer_sprintf(wb, ",\n\t\t\t\"charts\": {"); - buffer_fast_strcat(wb, buffer_tostring(wb_instances), buffer_strlen(wb_instances)); - buffer_strcat(wb, "\n\t\t\t}"); - - buffer_free(wb_instances); - } - - buffer_strcat(wb, "\n\t\t}"); - t_parent->written++; - return 1; -} - -int rrdcontext_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, const char *context, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions) { - if(!host->rrdctx) { - error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host)); - return HTTP_RESP_NOT_FOUND; - } - - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item((DICTIONARY *)host->rrdctx, context); - if(!rca) return HTTP_RESP_NOT_FOUND; - - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - - if(after != 0 && before != 0) - rrdr_relative_window_to_absolute(&after, &before); - - struct rrdcontext_to_json t_contexts = { - .wb = wb, - .options = options|RRDCONTEXT_OPTION_SKIP_ID, - .chart_label_key = chart_label_key, - .chart_labels_filter = chart_labels_filter, - .chart_dimensions = chart_dimensions, - .after = after, - .before = before, - .written = 0, - .now = now_realtime_sec(), - }; - rrdcontext_to_json_callback((DICTIONARY_ITEM *)rca, rc, &t_contexts); - - rrdcontext_release(rca); - - if(!t_contexts.written) - return HTTP_RESP_NOT_FOUND; - - return HTTP_RESP_OK; -} - -int rrdcontexts_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions) { - if(!host->rrdctx) { - error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host)); - return HTTP_RESP_NOT_FOUND; - } - - char node_uuid[UUID_STR_LEN] = ""; - - if(host->node_id) - uuid_unparse(*host->node_id, node_uuid); - - if(after != 0 && before != 0) - rrdr_relative_window_to_absolute(&after, &before); - - buffer_sprintf(wb, "{\n" - "\t\"hostname\": \"%s\"" - ",\n\t\"machine_guid\": \"%s\"" - ",\n\t\"node_id\": \"%s\"" - ",\n\t\"claim_id\": \"%s\"" - , rrdhost_hostname(host) - , host->machine_guid - , node_uuid - , host->aclk_state.claimed_id ? host->aclk_state.claimed_id : "" - ); - - if(options & RRDCONTEXT_OPTION_SHOW_LABELS) { - buffer_sprintf(wb, ",\n\t\"host_labels\": {\n"); - rrdlabels_to_buffer(host->rrdlabels, wb, "\t\t", ":", "\"", ",\n", NULL, NULL, NULL, NULL); - buffer_strcat(wb, "\n\t}"); - } - - buffer_sprintf(wb, ",\n\t\"contexts\": {"); - struct rrdcontext_to_json t_contexts = { - .wb = wb, - .options = options, - .chart_label_key = chart_label_key, - .chart_labels_filter = chart_labels_filter, - .chart_dimensions = chart_dimensions, - .after = after, - .before = before, - .written = 0, - .now = now_realtime_sec(), - }; - dictionary_walkthrough_read((DICTIONARY *)host->rrdctx, rrdcontext_to_json_callback, &t_contexts); - - // close contexts, close main - buffer_strcat(wb, "\n\t}\n}"); - - return HTTP_RESP_OK; -} - -// ---------------------------------------------------------------------------- -// weights API - -static void metric_entry_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { - struct metric_entry *t = value; - t->rca = rrdcontext_acquired_dup(t->rca); - t->ria = rrdinstance_acquired_dup(t->ria); - t->rma = rrdmetric_acquired_dup(t->rma); -} -static void metric_entry_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { - struct metric_entry *t = value; - rrdcontext_release(t->rca); - rrdinstance_release(t->ria); - rrdmetric_release(t->rma); -} -static bool metric_entry_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value __maybe_unused, void *new_value __maybe_unused, void *data __maybe_unused) { - fatal("RRDCONTEXT: %s() detected a conflict on a metric pointer!", __FUNCTION__); - return false; -} - -DICTIONARY *rrdcontext_all_metrics_to_dict(RRDHOST *host, SIMPLE_PATTERN *contexts) { - if(!host || !host->rrdctx) - return NULL; - - DICTIONARY *dict = dictionary_create_advanced(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_DONT_OVERWRITE_VALUE, &dictionary_stats_category_rrdcontext, 0); - dictionary_register_insert_callback(dict, metric_entry_insert_callback, NULL); - dictionary_register_delete_callback(dict, metric_entry_delete_callback, NULL); - dictionary_register_conflict_callback(dict, metric_entry_conflict_callback, NULL); - - RRDCONTEXT *rc; - dfe_start_reentrant((DICTIONARY *)host->rrdctx, rc) { - if(rrd_flag_is_deleted(rc)) - continue; - - if(contexts && !simple_pattern_matches(contexts, string2str(rc->id))) - continue; - - RRDINSTANCE *ri; - dfe_start_read(rc->rrdinstances, ri) { - if(rrd_flag_is_deleted(ri)) - continue; - - RRDMETRIC *rm; - dfe_start_read(ri->rrdmetrics, rm) { - if(rrd_flag_is_deleted(rm)) - continue; - - struct metric_entry tmp = { - .rca = (RRDCONTEXT_ACQUIRED *)rc_dfe.item, - .ria = (RRDINSTANCE_ACQUIRED *)ri_dfe.item, - .rma = (RRDMETRIC_ACQUIRED *)rm_dfe.item, - }; - - char buffer[20 + 1]; - ssize_t len = snprintfz(buffer, 20, "%p", rm); - dictionary_set_advanced(dict, buffer, len + 1, &tmp, sizeof(struct metric_entry), NULL); - } - dfe_done(rm); - } - dfe_done(ri); - } - dfe_done(rc); - - return dict; -} - -// ---------------------------------------------------------------------------- -// query API - -typedef struct query_target_locals { - time_t start_s; - - QUERY_TARGET *qt; - - RRDSET *st; - - const char *hosts; - const char *contexts; - const char *charts; - const char *dimensions; - const char *chart_label_key; - const char *charts_labels_filter; - - long long after; - long long before; - bool match_ids; - bool match_names; - - RRDHOST *host; - RRDCONTEXT_ACQUIRED *rca; - RRDINSTANCE_ACQUIRED *ria; - - size_t metrics_skipped_due_to_not_matching_timeframe; -} QUERY_TARGET_LOCALS; - -static __thread QUERY_TARGET thread_query_target = {}; -void query_target_release(QUERY_TARGET *qt) { - if(unlikely(!qt)) return; - if(unlikely(!qt->used)) return; - - simple_pattern_free(qt->hosts.pattern); - qt->hosts.pattern = NULL; - - simple_pattern_free(qt->contexts.pattern); - qt->contexts.pattern = NULL; - - simple_pattern_free(qt->instances.pattern); - qt->instances.pattern = NULL; - - simple_pattern_free(qt->instances.chart_label_key_pattern); - qt->instances.chart_label_key_pattern = NULL; - - simple_pattern_free(qt->instances.charts_labels_filter_pattern); - qt->instances.charts_labels_filter_pattern = NULL; - - simple_pattern_free(qt->query.pattern); - qt->query.pattern = NULL; - - // release the query - for(size_t i = 0, used = qt->query.used; i < used ;i++) { - string_freez(qt->query.array[i].dimension.id); - qt->query.array[i].dimension.id = NULL; - - string_freez(qt->query.array[i].dimension.name); - qt->query.array[i].dimension.name = NULL; - - string_freez(qt->query.array[i].chart.id); - qt->query.array[i].chart.id = NULL; - - string_freez(qt->query.array[i].chart.name); - qt->query.array[i].chart.name = NULL; - - // reset the plans - for(size_t p = 0; p < qt->query.array[i].plan.used; p++) { - internal_fatal(qt->query.array[i].plan.array[p].initialized && - !qt->query.array[i].plan.array[p].finalized, - "QUERY: left-over initialized plan"); - - qt->query.array[i].plan.array[p].initialized = false; - qt->query.array[i].plan.array[p].finalized = false; - } - qt->query.array[i].plan.used = 0; - - // reset the tiers - for(size_t tier = 0; tier < storage_tiers ;tier++) { - if(qt->query.array[i].tiers[tier].db_metric_handle) { - STORAGE_ENGINE *eng = qt->query.array[i].tiers[tier].eng; - eng->api.metric_release(qt->query.array[i].tiers[tier].db_metric_handle); - qt->query.array[i].tiers[tier].db_metric_handle = NULL; - qt->query.array[i].tiers[tier].weight = 0; - qt->query.array[i].tiers[tier].eng = NULL; - } - } - } - - // release the metrics - for(size_t i = 0, used = qt->metrics.used; i < used ;i++) { - rrdmetric_release(qt->metrics.array[i]); - qt->metrics.array[i] = NULL; - } - - // release the instances - for(size_t i = 0, used = qt->instances.used; i < used ;i++) { - rrdinstance_release(qt->instances.array[i]); - qt->instances.array[i] = NULL; - } - - // release the contexts - for(size_t i = 0, used = qt->contexts.used; i < used ;i++) { - rrdcontext_release(qt->contexts.array[i]); - qt->contexts.array[i] = NULL; - } - - // release the hosts - for(size_t i = 0, used = qt->hosts.used; i < used ;i++) { - qt->hosts.array[i] = NULL; - } - - qt->query.used = 0; - qt->metrics.used = 0; - qt->instances.used = 0; - qt->contexts.used = 0; - qt->hosts.used = 0; - - qt->db.minimum_latest_update_every_s = 0; - qt->db.first_time_s = 0; - qt->db.last_time_s = 0; - - qt->id[0] = '\0'; - - qt->used = false; -} -void query_target_free(void) { - QUERY_TARGET *qt = &thread_query_target; - - if(qt->used) - query_target_release(qt); - - __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->query.size * sizeof(QUERY_METRIC), __ATOMIC_RELAXED); - freez(qt->query.array); - qt->query.array = NULL; - qt->query.size = 0; - - __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *), __ATOMIC_RELAXED); - freez(qt->metrics.array); - qt->metrics.array = NULL; - qt->metrics.size = 0; - - __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *), __ATOMIC_RELAXED); - freez(qt->instances.array); - qt->instances.array = NULL; - qt->instances.size = 0; - - __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *), __ATOMIC_RELAXED); - freez(qt->contexts.array); - qt->contexts.array = NULL; - qt->contexts.size = 0; - - __atomic_sub_fetch(&netdata_buffers_statistics.query_targets_size, qt->hosts.size * sizeof(RRDHOST *), __ATOMIC_RELAXED); - freez(qt->hosts.array); - qt->hosts.array = NULL; - qt->hosts.size = 0; -} - -static void query_target_add_metric(QUERY_TARGET_LOCALS *qtl, RRDMETRIC_ACQUIRED *rma, RRDINSTANCE *ri, - bool queryable_instance) { - QUERY_TARGET *qt = qtl->qt; - - RRDMETRIC *rm = rrdmetric_acquired_value(rma); - if(rrd_flag_is_deleted(rm)) - return; - - if(qt->metrics.used == qt->metrics.size) { - size_t old_mem = qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *); - qt->metrics.size = (qt->metrics.size) ? qt->metrics.size * 2 : 1; - size_t new_mem = qt->metrics.size * sizeof(RRDMETRIC_ACQUIRED *); - qt->metrics.array = reallocz(qt->metrics.array, new_mem); - - __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); - } - qt->metrics.array[qt->metrics.used++] = rrdmetric_acquired_dup(rma); - - if(!queryable_instance) - return; - - time_t common_first_time_s = 0; - time_t common_last_time_s = 0; - time_t common_update_every_s = 0; - size_t tiers_added = 0; - struct { - STORAGE_ENGINE *eng; - STORAGE_METRIC_HANDLE *db_metric_handle; - time_t db_first_time_s; - time_t db_last_time_s; - time_t db_update_every_s; - } tier_retention[storage_tiers]; - - for (size_t tier = 0; tier < storage_tiers; tier++) { - STORAGE_ENGINE *eng = qtl->host->db[tier].eng; - tier_retention[tier].eng = eng; - tier_retention[tier].db_update_every_s = (time_t) (qtl->host->db[tier].tier_grouping * ri->update_every_s); - - if(rm->rrddim && rm->rrddim->tiers[tier].db_metric_handle) - tier_retention[tier].db_metric_handle = eng->api.metric_dup(rm->rrddim->tiers[tier].db_metric_handle); - else - tier_retention[tier].db_metric_handle = eng->api.metric_get(qtl->host->db[tier].instance, &rm->uuid); - - if(tier_retention[tier].db_metric_handle) { - tier_retention[tier].db_first_time_s = tier_retention[tier].eng->api.query_ops.oldest_time_s(tier_retention[tier].db_metric_handle); - tier_retention[tier].db_last_time_s = tier_retention[tier].eng->api.query_ops.latest_time_s(tier_retention[tier].db_metric_handle); - - if(!common_first_time_s) - common_first_time_s = tier_retention[tier].db_first_time_s; - else if(tier_retention[tier].db_first_time_s) - common_first_time_s = MIN(common_first_time_s, tier_retention[tier].db_first_time_s); - - if(!common_last_time_s) - common_last_time_s = tier_retention[tier].db_last_time_s; - else - common_last_time_s = MAX(common_last_time_s, tier_retention[tier].db_last_time_s); - - if(!common_update_every_s) - common_update_every_s = tier_retention[tier].db_update_every_s; - else if(tier_retention[tier].db_update_every_s) - common_update_every_s = MIN(common_update_every_s, tier_retention[tier].db_update_every_s); - - tiers_added++; - } - else { - tier_retention[tier].db_first_time_s = 0; - tier_retention[tier].db_last_time_s = 0; - tier_retention[tier].db_update_every_s = 0; - } - } - - bool release_retention = true; - bool timeframe_matches = - (tiers_added - && (common_first_time_s - common_update_every_s * 2) <= qt->window.before - && (common_last_time_s + common_update_every_s * 2) >= qt->window.after - ) ? true : false; - - if(timeframe_matches) { - RRDR_DIMENSION_FLAGS options = RRDR_DIMENSION_DEFAULT; - - if (rrd_flag_check(rm, RRD_FLAG_HIDDEN) - || (rm->rrddim && rrddim_option_check(rm->rrddim, RRDDIM_OPTION_HIDDEN))) { - options |= RRDR_DIMENSION_HIDDEN; - options &= ~RRDR_DIMENSION_QUERIED; - } - - if (qt->query.pattern) { - // we have a dimensions pattern - // lets see if this dimension is selected - - if ((qtl->match_ids && simple_pattern_matches(qt->query.pattern, string2str(rm->id))) - || (qtl->match_names && simple_pattern_matches(qt->query.pattern, string2str(rm->name))) - ) { - // it matches the pattern - options |= (RRDR_DIMENSION_QUERIED | RRDR_DIMENSION_NONZERO); - options &= ~RRDR_DIMENSION_HIDDEN; - } - else { - // it does not match the pattern - options |= RRDR_DIMENSION_HIDDEN; - options &= ~RRDR_DIMENSION_QUERIED; - } - } - else { - // we don't have a dimensions pattern - // so this is a selected dimension - // if it is not hidden - if(!(options & RRDR_DIMENSION_HIDDEN)) - options |= RRDR_DIMENSION_QUERIED; - } - - if((options & RRDR_DIMENSION_HIDDEN) && (options & RRDR_DIMENSION_QUERIED)) - options &= ~RRDR_DIMENSION_HIDDEN; - - if(!(options & RRDR_DIMENSION_HIDDEN) || (qt->request.options & RRDR_OPTION_PERCENTAGE)) { - // we have a non-hidden dimension - // let's add it to the query metrics - - if(ri->rrdset) - ri->rrdset->last_accessed_time_s = qtl->start_s; - - if (qt->query.used == qt->query.size) { - size_t old_mem = qt->query.size * sizeof(QUERY_METRIC); - qt->query.size = (qt->query.size) ? qt->query.size * 2 : 1; - size_t new_mem = qt->query.size * sizeof(QUERY_METRIC); - qt->query.array = reallocz(qt->query.array, new_mem); - - __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); - } - QUERY_METRIC *qm = &qt->query.array[qt->query.used++]; - - qm->plan.used = 0; - qm->dimension.options = options; - - qm->link.host = qtl->host; - qm->link.rca = qtl->rca; - qm->link.ria = qtl->ria; - qm->link.rma = rma; - - qm->chart.id = string_dup(ri->id); - qm->chart.name = string_dup(ri->name); - - qm->dimension.id = string_dup(rm->id); - qm->dimension.name = string_dup(rm->name); - - if (!qt->db.first_time_s || common_first_time_s < qt->db.first_time_s) - qt->db.first_time_s = common_first_time_s; - - if (!qt->db.last_time_s || common_last_time_s > qt->db.last_time_s) - qt->db.last_time_s = common_last_time_s; - - for (size_t tier = 0; tier < storage_tiers; tier++) { - qm->tiers[tier].eng = tier_retention[tier].eng; - qm->tiers[tier].db_metric_handle = tier_retention[tier].db_metric_handle; - qm->tiers[tier].db_first_time_s = tier_retention[tier].db_first_time_s; - qm->tiers[tier].db_last_time_s = tier_retention[tier].db_last_time_s; - qm->tiers[tier].db_update_every_s = tier_retention[tier].db_update_every_s; - } - release_retention = false; - } - } - else - qtl->metrics_skipped_due_to_not_matching_timeframe++; - - if(release_retention) { - // cleanup anything we allocated to the retention we will not use - for(size_t tier = 0; tier < storage_tiers ;tier++) { - if (tier_retention[tier].db_metric_handle) - tier_retention[tier].eng->api.metric_release(tier_retention[tier].db_metric_handle); - } - } -} - -static void query_target_add_instance(QUERY_TARGET_LOCALS *qtl, RRDINSTANCE_ACQUIRED *ria, bool queryable_instance) { - QUERY_TARGET *qt = qtl->qt; - - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - if(rrd_flag_is_deleted(ri)) - return; - - if(qt->instances.used == qt->instances.size) { - size_t old_mem = qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *); - qt->instances.size = (qt->instances.size) ? qt->instances.size * 2 : 1; - size_t new_mem = qt->instances.size * sizeof(RRDINSTANCE_ACQUIRED *); - qt->instances.array = reallocz(qt->instances.array, new_mem); - - __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); - } - - qtl->ria = qt->instances.array[qt->instances.used++] = rrdinstance_acquired_dup(ria); - - if(qt->db.minimum_latest_update_every_s == 0 || ri->update_every_s < qt->db.minimum_latest_update_every_s) - qt->db.minimum_latest_update_every_s = ri->update_every_s; - - if(queryable_instance) { - if ((qt->instances.chart_label_key_pattern && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, qt->instances.chart_label_key_pattern, ':')) || - (qt->instances.charts_labels_filter_pattern && !rrdlabels_match_simple_pattern_parsed(ri->rrdlabels, qt->instances.charts_labels_filter_pattern, ':'))) - queryable_instance = false; - } - - size_t added = 0; - - if(unlikely(qt->request.rma)) { - query_target_add_metric(qtl, qt->request.rma, ri, queryable_instance); - added++; - } - else { - RRDMETRIC *rm; - dfe_start_read(ri->rrdmetrics, rm) { - query_target_add_metric(qtl, (RRDMETRIC_ACQUIRED *) rm_dfe.item, ri, queryable_instance); - added++; - } - dfe_done(rm); - } - - if(!added) { - qt->instances.used--; - rrdinstance_release(ria); - } -} - -static void query_target_add_context(QUERY_TARGET_LOCALS *qtl, RRDCONTEXT_ACQUIRED *rca) { - QUERY_TARGET *qt = qtl->qt; - - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - if(rrd_flag_is_deleted(rc)) - return; - - if(qt->contexts.used == qt->contexts.size) { - size_t old_mem = qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *); - qt->contexts.size = (qt->contexts.size) ? qt->contexts.size * 2 : 1; - size_t new_mem = qt->contexts.size * sizeof(RRDCONTEXT_ACQUIRED *); - qt->contexts.array = reallocz(qt->contexts.array, new_mem); - - __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); - } - qtl->rca = qt->contexts.array[qt->contexts.used++] = rrdcontext_acquired_dup(rca); - - size_t added = 0; - if(unlikely(qt->request.ria)) { - query_target_add_instance(qtl, qt->request.ria, true); - added++; - } - else if(unlikely(qtl->st && qtl->st->rrdcontext == rca && qtl->st->rrdinstance)) { - query_target_add_instance(qtl, qtl->st->rrdinstance, true); - added++; - } - else { - RRDINSTANCE *ri; - dfe_start_read(rc->rrdinstances, ri) { - bool queryable_instance = false; - if(!qt->instances.pattern - || (qtl->match_ids && simple_pattern_matches(qt->instances.pattern, string2str(ri->id))) - || (qtl->match_names && simple_pattern_matches(qt->instances.pattern, string2str(ri->name))) - ) - queryable_instance = true; - - query_target_add_instance(qtl, (RRDINSTANCE_ACQUIRED *)ri_dfe.item, queryable_instance); - added++; - } - dfe_done(ri); - } - - if(!added) { - qt->contexts.used--; - rrdcontext_release(rca); - } -} - -static void query_target_add_host(QUERY_TARGET_LOCALS *qtl, RRDHOST *host) { - QUERY_TARGET *qt = qtl->qt; - - if(qt->hosts.used == qt->hosts.size) { - size_t old_mem = qt->hosts.size * sizeof(RRDHOST *); - qt->hosts.size = (qt->hosts.size) ? qt->hosts.size * 2 : 1; - size_t new_mem = qt->hosts.size * sizeof(RRDHOST *); - qt->hosts.array = reallocz(qt->hosts.array, new_mem); - - __atomic_add_fetch(&netdata_buffers_statistics.query_targets_size, new_mem - old_mem, __ATOMIC_RELAXED); - } - qtl->host = qt->hosts.array[qt->hosts.used++] = host; - - // is the chart given valid? - if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { - error("QUERY TARGET: RRDSET '%s' given, because it is not linked to rrdcontext structures. Switching to context query.", rrdset_name(qtl->st)); - - if(!is_valid_sp(qtl->charts)) - qtl->charts = rrdset_name(qtl->st); - - qtl->st = NULL; - } - - size_t added = 0; - if(unlikely(qt->request.rca)) { - query_target_add_context(qtl, qt->request.rca); - added++; - } - else if(unlikely(qtl->st)) { - // single chart data queries - query_target_add_context(qtl, qtl->st->rrdcontext); - added++; - } - else { - // context pattern queries - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item((DICTIONARY *)qtl->host->rrdctx, qtl->contexts); - if(likely(rca)) { - // we found it! - query_target_add_context(qtl, rca); - rrdcontext_release(rca); - added++; - } - else { - // Probably it is a pattern, we need to search for it... - RRDCONTEXT *rc; - dfe_start_read((DICTIONARY *)qtl->host->rrdctx, rc) { - if(qt->contexts.pattern && !simple_pattern_matches(qt->contexts.pattern, string2str(rc->id))) - continue; - - query_target_add_context(qtl, (RRDCONTEXT_ACQUIRED *)rc_dfe.item); - added++; - } - dfe_done(rc); - } - } - - if(!added) { - qt->hosts.used--; - } -} - -void query_target_generate_name(QUERY_TARGET *qt) { - char options_buffer[100 + 1]; - web_client_api_request_v1_data_options_to_string(options_buffer, 100, qt->request.options); - - char resampling_buffer[20 + 1] = ""; - if(qt->request.resampling_time > 1) - snprintfz(resampling_buffer, 20, "/resampling:%lld", (long long)qt->request.resampling_time); - - char tier_buffer[20 + 1] = ""; - if(qt->request.options & RRDR_OPTION_SELECTED_TIER) - snprintfz(tier_buffer, 20, "/tier:%zu", qt->request.tier); - - if(qt->request.st) - snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "chart://host:%s/instance:%s/dimensions:%s/after:%lld/before:%lld/points:%zu/group:%s%s/options:%s%s%s" - , rrdhost_hostname(qt->request.st->rrdhost) - , rrdset_name(qt->request.st) - , (qt->request.dimensions) ? qt->request.dimensions : "*" - , (long long)qt->request.after - , (long long)qt->request.before - , qt->request.points - , web_client_api_request_v1_data_group_to_string(qt->request.group_method) - , qt->request.group_options?qt->request.group_options:"" - , options_buffer - , resampling_buffer - , tier_buffer - ); - else if(qt->request.host && qt->request.rca && qt->request.ria && qt->request.rma) - snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "metric://host:%s/context:%s/instance:%s/dimension:%s/after:%lld/before:%lld/points:%zu/group:%s%s/options:%s%s%s" - , rrdhost_hostname(qt->request.host) - , rrdcontext_acquired_id(qt->request.rca) - , rrdinstance_acquired_id(qt->request.ria) - , rrdmetric_acquired_id(qt->request.rma) - , (long long)qt->request.after - , (long long)qt->request.before - , qt->request.points - , web_client_api_request_v1_data_group_to_string(qt->request.group_method) - , qt->request.group_options?qt->request.group_options:"" - , options_buffer - , resampling_buffer - , tier_buffer - ); - else - snprintfz(qt->id, MAX_QUERY_TARGET_ID_LENGTH, "context://host:%s/contexts:%s/instances:%s/dimensions:%s/after:%lld/before:%lld/points:%zu/group:%s%s/options:%s%s%s" - , (qt->request.host) ? rrdhost_hostname(qt->request.host) : ((qt->request.hosts) ? qt->request.hosts : "*") - , (qt->request.contexts) ? qt->request.contexts : "*" - , (qt->request.charts) ? qt->request.charts : "*" - , (qt->request.dimensions) ? qt->request.dimensions : "*" - , (long long)qt->request.after - , (long long)qt->request.before - , qt->request.points - , web_client_api_request_v1_data_group_to_string(qt->request.group_method) - , qt->request.group_options?qt->request.group_options:"" - , options_buffer - , resampling_buffer - , tier_buffer - ); - - json_fix_string(qt->id); -} - -QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr) { - if(!service_running(ABILITY_DATA_QUERIES)) - return NULL; - - QUERY_TARGET *qt = &thread_query_target; - - if(qt->used) - fatal("QUERY TARGET: this query target is already used (%zu queries made with this QUERY_TARGET so far).", qt->queries); - - qt->used = true; - qt->queries++; - - // copy the request into query_thread_target - qt->request = *qtr; - - query_target_generate_name(qt); - qt->window.after = qt->request.after; - qt->window.before = qt->request.before; - rrdr_relative_window_to_absolute(&qt->window.after, &qt->window.before); - - // prepare our local variables - we need these across all these functions - QUERY_TARGET_LOCALS qtl = { - .qt = qt, - .start_s = now_realtime_sec(), - .host = qt->request.host, - .st = qt->request.st, - .hosts = qt->request.hosts, - .contexts = qt->request.contexts, - .charts = qt->request.charts, - .dimensions = qt->request.dimensions, - .chart_label_key = qt->request.chart_label_key, - .charts_labels_filter = qt->request.charts_labels_filter, - }; - - qt->db.minimum_latest_update_every_s = 0; // it will be updated by query_target_add_query() - - // prepare all the patterns - qt->hosts.pattern = is_valid_sp(qtl.hosts) ? simple_pattern_create(qtl.hosts, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; - qt->contexts.pattern = is_valid_sp(qtl.contexts) ? simple_pattern_create(qtl.contexts, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; - qt->instances.pattern = is_valid_sp(qtl.charts) ? simple_pattern_create(qtl.charts, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; - qt->query.pattern = is_valid_sp(qtl.dimensions) ? simple_pattern_create(qtl.dimensions, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; - qt->instances.chart_label_key_pattern = is_valid_sp(qtl.chart_label_key) ? simple_pattern_create(qtl.chart_label_key, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; - qt->instances.charts_labels_filter_pattern = is_valid_sp(qtl.charts_labels_filter) ? simple_pattern_create(qtl.charts_labels_filter, ",|\t\r\n\f\v", SIMPLE_PATTERN_EXACT) : NULL; - - qtl.match_ids = qt->request.options & RRDR_OPTION_MATCH_IDS; - qtl.match_names = qt->request.options & RRDR_OPTION_MATCH_NAMES; - if(likely(!qtl.match_ids && !qtl.match_names)) - qtl.match_ids = qtl.match_names = true; - - // verify that the chart belongs to the host we are interested - if(qtl.st) { - if (!qtl.host) { - // It is NULL, set it ourselves. - qtl.host = qtl.st->rrdhost; - } - else if (unlikely(qtl.host != qtl.st->rrdhost)) { - // Oops! A different host! - error("QUERY TARGET: RRDSET '%s' given does not belong to host '%s'. Switching query host to '%s'", - rrdset_name(qtl.st), rrdhost_hostname(qtl.host), rrdhost_hostname(qtl.st->rrdhost)); - qtl.host = qtl.st->rrdhost; - } - } - - if(qtl.host) { - // single host query - query_target_add_host(&qtl, qtl.host); - qtl.hosts = rrdhost_hostname(qtl.host); - } - else { - // multi host query - rrd_rdlock(); - rrdhost_foreach_read(qtl.host) { - if(!qt->hosts.pattern || simple_pattern_matches(qt->hosts.pattern, rrdhost_hostname(qtl.host))) - query_target_add_host(&qtl, qtl.host); - } - rrd_unlock(); - } - - // make sure everything is good - if(!qt->query.used || !qt->metrics.used || !qt->instances.used || !qt->contexts.used || !qt->hosts.used) { - internal_error( - true - , "QUERY TARGET: query '%s' does not have all the data required. " - "Matched %u hosts, %u contexts, %u instances, %u dimensions, %u metrics to query, " - "%zu metrics skipped because they don't have data in the desired time-frame. " - "Aborting it." - , qt->id - , qt->hosts.used - , qt->contexts.used - , qt->instances.used - , qt->metrics.used - , qt->query.used - , qtl.metrics_skipped_due_to_not_matching_timeframe - ); - - query_target_release(qt); - return NULL; - } - - if(!query_target_calculate_window(qt)) { - query_target_release(qt); - return NULL; - } - - return qt; -} - - -// ---------------------------------------------------------------------------- -// load from SQL - -static void rrdinstance_load_clabel(SQL_CLABEL_DATA *sld, void *data) { - RRDINSTANCE *ri = data; - rrdlabels_add(ri->rrdlabels, sld->label_key, sld->label_value, sld->label_source); -} - -static void rrdinstance_load_dimension(SQL_DIMENSION_DATA *sd, void *data) { - RRDINSTANCE *ri = data; - - RRDMETRIC trm = { - .id = string_strdupz(sd->id), - .name = string_strdupz(sd->name), - .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomic - }; - if(sd->hidden) trm.flags |= RRD_FLAG_HIDDEN; - - uuid_copy(trm.uuid, sd->dim_id); - - dictionary_set(ri->rrdmetrics, string2str(trm.id), &trm, sizeof(trm)); -} - -static void rrdinstance_load_chart_callback(SQL_CHART_DATA *sc, void *data) { - RRDHOST *host = data; - - RRDCONTEXT tc = { - .id = string_strdupz(sc->context), - .title = string_strdupz(sc->title), - .units = string_strdupz(sc->units), - .family = string_strdupz(sc->family), - .priority = sc->priority, - .chart_type = sc->chart_type, - .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics - .rrdhost = host, - }; - - RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_set_and_acquire_item((DICTIONARY *)host->rrdctx, string2str(tc.id), &tc, sizeof(tc)); - RRDCONTEXT *rc = rrdcontext_acquired_value(rca); - - RRDINSTANCE tri = { - .id = string_strdupz(sc->id), - .name = string_strdupz(sc->name), - .title = string_strdupz(sc->title), - .units = string_strdupz(sc->units), - .family = string_strdupz(sc->family), - .chart_type = sc->chart_type, - .priority = sc->priority, - .update_every_s = sc->update_every, - .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics - }; - uuid_copy(tri.uuid, sc->chart_id); - - RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_set_and_acquire_item(rc->rrdinstances, sc->id, &tri, sizeof(tri)); - RRDINSTANCE *ri = rrdinstance_acquired_value(ria); - - ctx_get_dimension_list(&ri->uuid, rrdinstance_load_dimension, ri); - ctx_get_label_list(&ri->uuid, rrdinstance_load_clabel, ri); - rrdinstance_trigger_updates(ri, __FUNCTION__ ); - rrdinstance_release(ria); - rrdcontext_release(rca); -} - -static void rrdcontext_load_context_callback(VERSIONED_CONTEXT_DATA *ctx_data, void *data) { - RRDHOST *host = data; - (void)host; - - RRDCONTEXT trc = { - .id = string_strdupz(ctx_data->id), - .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics - - // no need to set more data here - // we only need the hub data - - .hub = *ctx_data, - }; - dictionary_set((DICTIONARY *)host->rrdctx, string2str(trc.id), &trc, sizeof(trc)); -} - -void rrdhost_load_rrdcontext_data(RRDHOST *host) { - if(host->rrdctx) return; - - rrdhost_create_rrdcontexts(host); - ctx_get_context_list(&host->host_uuid, rrdcontext_load_context_callback, host); - ctx_get_chart_list(&host->host_uuid, rrdinstance_load_chart_callback, host); - - RRDCONTEXT *rc; - dfe_start_read((DICTIONARY *)host->rrdctx, rc) { - rrdcontext_trigger_updates(rc, __FUNCTION__ ); - } - dfe_done(rc); - - rrdcontext_garbage_collect_single_host(host, false); -} - -// ---------------------------------------------------------------------------- -// version hash calculation - -static uint64_t rrdcontext_version_hash_with_callback( - RRDHOST *host, - void (*callback)(RRDCONTEXT *, bool, void *), - bool snapshot, - void *bundle) { - - if(unlikely(!host || !host->rrdctx)) return 0; - - RRDCONTEXT *rc; - uint64_t hash = 0; - - // loop through all contexts of the host - dfe_start_read((DICTIONARY *)host->rrdctx, rc) { - - rrdcontext_lock(rc); - - if(unlikely(rrd_flag_check(rc, RRD_FLAG_HIDDEN))) { - rrdcontext_unlock(rc); - continue; - } - - if(unlikely(callback)) - callback(rc, snapshot, bundle); - - // skip any deleted contexts - if(unlikely(rrd_flag_is_deleted(rc))) { - rrdcontext_unlock(rc); - continue; - } - - // we use rc->hub.* which has the latest - // metadata we have sent to the hub - - // if a context is currently queued, rc->hub.* does NOT - // reflect the queued changes. rc->hub.* is updated with - // their metadata, after messages are dispatched to hub. - - // when the context is being collected, - // rc->hub.last_time_t is already zero - - hash += rc->hub.version + rc->hub.last_time_s - rc->hub.first_time_s; - - rrdcontext_unlock(rc); - - } - dfe_done(rc); - - return hash; -} - -// ---------------------------------------------------------------------------- -// retention recalculation - -static void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs) { - rrdcontext_post_process_updates(rc, true, reason, worker_jobs); -} - -static void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs) { - if(unlikely(!host || !host->rrdctx)) return; - - RRDCONTEXT *rc; - dfe_start_read((DICTIONARY *)host->rrdctx, rc) { - rrdcontext_recalculate_context_retention(rc, reason, worker_jobs); - } - dfe_done(rc); -} - -static void rrdcontext_recalculate_retention_all_hosts(void) { - rrdcontext_next_db_rotation_ut = 0; - rrd_rdlock(); - RRDHOST *host; - rrdhost_foreach_read(host) { - worker_is_busy(WORKER_JOB_RETENTION); - rrdcontext_recalculate_host_retention(host, RRD_FLAG_UPDATE_REASON_DB_ROTATION, true); - } - rrd_unlock(); -} - -// ---------------------------------------------------------------------------- -// garbage collector - -static bool rrdmetric_update_retention(RRDMETRIC *rm) { - time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; - - if(rm->rrddim) { - min_first_time_t = rrddim_first_entry_s(rm->rrddim); - max_last_time_t = rrddim_last_entry_s(rm->rrddim); - } - else { - RRDHOST *rrdhost = rm->ri->rc->rrdhost; - for (size_t tier = 0; tier < storage_tiers; tier++) { - STORAGE_ENGINE *eng = rrdhost->db[tier].eng; - - time_t first_time_t, last_time_t; - if (eng->api.metric_retention_by_uuid(rrdhost->db[tier].instance, &rm->uuid, &first_time_t, &last_time_t)) { - if (first_time_t < min_first_time_t) - min_first_time_t = first_time_t; - - if (last_time_t > max_last_time_t) - max_last_time_t = last_time_t; - } - } - } - - if((min_first_time_t == LONG_MAX || min_first_time_t == 0) && max_last_time_t == 0) - return false; - - if(min_first_time_t == LONG_MAX) - min_first_time_t = 0; - - if(min_first_time_t > max_last_time_t) { - internal_error(true, "RRDMETRIC: retention of '%s' is flipped, first_time_t = %ld, last_time_t = %ld", string2str(rm->id), min_first_time_t, max_last_time_t); - time_t tmp = min_first_time_t; - min_first_time_t = max_last_time_t; - max_last_time_t = tmp; - } - - // check if retention changed - - if (min_first_time_t != rm->first_time_s) { - rm->first_time_s = min_first_time_t; - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if (max_last_time_t != rm->last_time_s) { - rm->last_time_s = max_last_time_t; - rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - if(unlikely(!rm->first_time_s && !rm->last_time_s)) - rrd_flag_set_deleted(rm, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - - rrd_flag_set(rm, RRD_FLAG_LIVE_RETENTION); - - return true; -} - -static inline bool rrdmetric_should_be_deleted(RRDMETRIC *rm) { - if(likely(!rrd_flag_check(rm, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) - return false; - - if(likely(rrd_flag_check(rm, RRD_FLAGS_PREVENTING_DELETIONS))) - return false; - - if(likely(rm->rrddim)) - return false; - - rrdmetric_update_retention(rm); - if(rm->first_time_s || rm->last_time_s) - return false; - - return true; -} - -static inline bool rrdinstance_should_be_deleted(RRDINSTANCE *ri) { - if(likely(!rrd_flag_check(ri, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) - return false; - - if(likely(rrd_flag_check(ri, RRD_FLAGS_PREVENTING_DELETIONS))) - return false; - - if(likely(ri->rrdset)) - return false; - - if(unlikely(dictionary_referenced_items(ri->rrdmetrics) != 0)) - return false; - - if(unlikely(dictionary_entries(ri->rrdmetrics) != 0)) - return false; - - if(ri->first_time_s || ri->last_time_s) - return false; - - return true; -} - -static inline bool rrdcontext_should_be_deleted(RRDCONTEXT *rc) { - if(likely(!rrd_flag_check(rc, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) - return false; - - if(likely(rrd_flag_check(rc, RRD_FLAGS_PREVENTING_DELETIONS))) - return false; - - if(unlikely(dictionary_referenced_items(rc->rrdinstances) != 0)) - return false; - - if(unlikely(dictionary_entries(rc->rrdinstances) != 0)) - return false; - - if(unlikely(rc->first_time_s || rc->last_time_s)) - return false; - - return true; -} - -void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc) { - // we need to refresh the string pointers in rc->hub - // in case the context changed values - rc->hub.id = string2str(rc->id); - rc->hub.title = string2str(rc->title); - rc->hub.units = string2str(rc->units); - rc->hub.family = string2str(rc->family); - - // delete it from SQL - if(ctx_delete_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) - error("RRDCONTEXT: failed to delete context '%s' version %"PRIu64" from SQL.", rc->hub.id, rc->hub.version); -} - -static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs) { - - internal_error(true, "RRDCONTEXT: garbage collecting context structures of host '%s'", rrdhost_hostname(host)); - - RRDCONTEXT *rc; - dfe_start_reentrant((DICTIONARY *)host->rrdctx, rc) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP); - - rrdcontext_lock(rc); - - RRDINSTANCE *ri; - dfe_start_reentrant(rc->rrdinstances, ri) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - RRDMETRIC *rm; - dfe_start_write(ri->rrdmetrics, rm) { - if(rrdmetric_should_be_deleted(rm)) { - if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); - if(!dictionary_del(ri->rrdmetrics, string2str(rm->id))) - error("RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", - string2str(rm->id), - string2str(ri->id), - string2str(rc->id), - rrdhost_hostname(host)); - else - internal_error( - true, - "RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', deleted from rrdmetrics dictionary.", - string2str(rm->id), - string2str(ri->id), - string2str(rc->id), - rrdhost_hostname(host)); - } - } - dfe_done(rm); - - if(rrdinstance_should_be_deleted(ri)) { - if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); - if(!dictionary_del(rc->rrdinstances, string2str(ri->id))) - error("RRDCONTEXT: instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", - string2str(ri->id), - string2str(rc->id), - rrdhost_hostname(host)); - else - internal_error( - true, - "RRDCONTEXT: instance '%s' of context '%s' of host '%s', deleted from rrdmetrics dictionary.", - string2str(ri->id), - string2str(rc->id), - rrdhost_hostname(host)); - } - } - dfe_done(ri); - - if(unlikely(rrdcontext_should_be_deleted(rc))) { - if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); - rrdcontext_dequeue_from_post_processing(rc); - rrdcontext_delete_from_sql_unsafe(rc); - - if(!dictionary_del((DICTIONARY *)host->rrdctx, string2str(rc->id))) - error("RRDCONTEXT: context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", - string2str(rc->id), - rrdhost_hostname(host)); - else - internal_error( - true, - "RRDCONTEXT: context '%s' of host '%s', deleted from rrdmetrics dictionary.", - string2str(rc->id), - rrdhost_hostname(host)); - } - - // the item is referenced in the dictionary - // so, it is still here to unlock, even if we have deleted it - rrdcontext_unlock(rc); - } - dfe_done(rc); -} - -static void rrdcontext_garbage_collect_for_all_hosts(void) { - rrd_rdlock(); - RRDHOST *host; - rrdhost_foreach_read(host) { - rrdcontext_garbage_collect_single_host(host, true); - } - rrd_unlock(); -} - -// ---------------------------------------------------------------------------- -// post processing - -static void rrdmetric_process_updates(RRDMETRIC *rm, bool force, RRD_FLAGS reason, bool worker_jobs) { - if(reason != RRD_FLAG_NONE) - rrd_flag_set_updated(rm, reason); - - if(!force && !rrd_flag_is_updated(rm) && rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION) && !rrd_flag_check(rm, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) - return; - - if(worker_jobs) - worker_is_busy(WORKER_JOB_PP_METRIC); - - if(reason & RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD) { - rrd_flag_set_archived(rm); - rrd_flag_set(rm, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD); - } - if(rrd_flag_is_deleted(rm) && (reason & RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) - rrd_flag_set_archived(rm); - - rrdmetric_update_retention(rm); - - rrd_flag_unset_updated(rm); -} - -static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FLAGS reason, bool worker_jobs) { - if(reason != RRD_FLAG_NONE) - rrd_flag_set_updated(ri, reason); - - if(!force && !rrd_flag_is_updated(ri) && rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION)) - return; - - if(worker_jobs) - worker_is_busy(WORKER_JOB_PP_INSTANCE); - - time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; - size_t metrics_active = 0, metrics_deleted = 0; - bool live_retention = true, currently_collected = false; - if(dictionary_entries(ri->rrdmetrics) > 0) { - RRDMETRIC *rm; - dfe_start_read((DICTIONARY *)ri->rrdmetrics, rm) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - RRD_FLAGS reason_to_pass = reason; - if(rrd_flag_check(ri, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) - reason_to_pass |= RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION; - - rrdmetric_process_updates(rm, force, reason_to_pass, worker_jobs); - - if(unlikely(!rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION))) - live_retention = false; - - if (unlikely((rrdmetric_should_be_deleted(rm)))) { - metrics_deleted++; - continue; - } - - if(!currently_collected && rrd_flag_check(rm, RRD_FLAG_COLLECTED) && rm->first_time_s) - currently_collected = true; - - metrics_active++; - - if (rm->first_time_s && rm->first_time_s < min_first_time_t) - min_first_time_t = rm->first_time_s; - - if (rm->last_time_s && rm->last_time_s > max_last_time_t) - max_last_time_t = rm->last_time_s; - } - dfe_done(rm); - } - - if(unlikely(live_retention && !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) - rrd_flag_set(ri, RRD_FLAG_LIVE_RETENTION); - else if(unlikely(!live_retention && rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) - rrd_flag_clear(ri, RRD_FLAG_LIVE_RETENTION); - - if(unlikely(!metrics_active)) { - // no metrics available - - if(ri->first_time_s) { - ri->first_time_s = 0; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if(ri->last_time_s) { - ri->last_time_s = 0; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - rrd_flag_set_deleted(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - } - else { - // we have active metrics... - - if (unlikely(min_first_time_t == LONG_MAX)) - min_first_time_t = 0; - - if (unlikely(min_first_time_t == 0 || max_last_time_t == 0)) { - if(ri->first_time_s) { - ri->first_time_s = 0; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if(ri->last_time_s) { - ri->last_time_s = 0; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - if(likely(live_retention)) - rrd_flag_set_deleted(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - } - else { - rrd_flag_clear(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - - if (unlikely(ri->first_time_s != min_first_time_t)) { - ri->first_time_s = min_first_time_t; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if (unlikely(ri->last_time_s != max_last_time_t)) { - ri->last_time_s = max_last_time_t; - rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - if(likely(currently_collected)) - rrd_flag_set_collected(ri); - else - rrd_flag_set_archived(ri); - } - } - - rrd_flag_unset_updated(ri); -} - -static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAGS reason, bool worker_jobs) { - if(reason != RRD_FLAG_NONE) - rrd_flag_set_updated(rc, reason); - - if(worker_jobs) - worker_is_busy(WORKER_JOB_PP_CONTEXT); - - size_t min_priority_collected = LONG_MAX; - size_t min_priority_not_collected = LONG_MAX; - size_t min_priority = LONG_MAX; - time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; - size_t instances_active = 0, instances_deleted = 0; - bool live_retention = true, currently_collected = false, hidden = true; - if(dictionary_entries(rc->rrdinstances) > 0) { - RRDINSTANCE *ri; - dfe_start_reentrant(rc->rrdinstances, ri) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - RRD_FLAGS reason_to_pass = reason; - if(rrd_flag_check(rc, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) - reason_to_pass |= RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION; - - rrdinstance_post_process_updates(ri, force, reason_to_pass, worker_jobs); - - if(unlikely(hidden && !rrd_flag_check(ri, RRD_FLAG_HIDDEN))) - hidden = false; - - if(unlikely(live_retention && !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) - live_retention = false; - - if (unlikely(rrdinstance_should_be_deleted(ri))) { - instances_deleted++; - continue; - } - - if(unlikely(!currently_collected && rrd_flag_is_collected(ri) && ri->first_time_s)) - currently_collected = true; - - internal_error(rc->units != ri->units, - "RRDCONTEXT: '%s' rrdinstance '%s' has different units, context '%s', instance '%s'", - string2str(rc->id), string2str(ri->id), - string2str(rc->units), string2str(ri->units)); - - instances_active++; - - if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY) { - if(rrd_flag_check(ri, RRD_FLAG_COLLECTED)) { - if(ri->priority < min_priority_collected) - min_priority_collected = ri->priority; - } - else { - if(ri->priority < min_priority_not_collected) - min_priority_not_collected = ri->priority; - } - } - - if (ri->first_time_s && ri->first_time_s < min_first_time_t) - min_first_time_t = ri->first_time_s; - - if (ri->last_time_s && ri->last_time_s > max_last_time_t) - max_last_time_t = ri->last_time_s; - } - dfe_done(ri); - - if(min_priority_collected != LONG_MAX) - // use the collected priority - min_priority = min_priority_collected; - else - // use the non-collected priority - min_priority = min_priority_not_collected; - } - - { - bool previous_hidden = rrd_flag_check(rc, RRD_FLAG_HIDDEN); - if (hidden != previous_hidden) { - if (hidden && !rrd_flag_check(rc, RRD_FLAG_HIDDEN)) - rrd_flag_set(rc, RRD_FLAG_HIDDEN); - else if (!hidden && rrd_flag_check(rc, RRD_FLAG_HIDDEN)) - rrd_flag_clear(rc, RRD_FLAG_HIDDEN); - } - - bool previous_live_retention = rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION); - if (live_retention != previous_live_retention) { - if (live_retention && !rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) - rrd_flag_set(rc, RRD_FLAG_LIVE_RETENTION); - else if (!live_retention && rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) - rrd_flag_clear(rc, RRD_FLAG_LIVE_RETENTION); - } - } - - rrdcontext_lock(rc); - rc->pp.executions++; - - if(unlikely(!instances_active)) { - // we had some instances, but they are gone now... - - if(rc->first_time_s) { - rc->first_time_s = 0; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if(rc->last_time_s) { - rc->last_time_s = 0; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - rrd_flag_set_deleted(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - } - else { - // we have some active instances... - - if (unlikely(min_first_time_t == LONG_MAX)) - min_first_time_t = 0; - - if (unlikely(min_first_time_t == 0 && max_last_time_t == 0)) { - if(rc->first_time_s) { - rc->first_time_s = 0; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if(rc->last_time_s) { - rc->last_time_s = 0; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - rrd_flag_set_deleted(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - } - else { - rrd_flag_clear(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); - - if (unlikely(rc->first_time_s != min_first_time_t)) { - rc->first_time_s = min_first_time_t; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); - } - - if (rc->last_time_s != max_last_time_t) { - rc->last_time_s = max_last_time_t; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); - } - - if(likely(currently_collected)) - rrd_flag_set_collected(rc); - else - rrd_flag_set_archived(rc); - } - - if (min_priority != LONG_MAX && rc->priority != min_priority) { - rc->priority = min_priority; - rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); - } - } - - if(unlikely(rrd_flag_is_updated(rc) && rc->rrdhost->rrdctx_hub_queue)) { - if(check_if_cloud_version_changed_unsafe(rc, false)) { - rc->version = rrdcontext_get_next_version(rc); - dictionary_set((DICTIONARY *)rc->rrdhost->rrdctx_hub_queue, - string2str(rc->id), rc, sizeof(*rc)); - } - } - - rrd_flag_unset_updated(rc); - rrdcontext_unlock(rc); -} - -static void rrdcontext_queue_for_post_processing(RRDCONTEXT *rc, const char *function __maybe_unused, RRD_FLAGS flags __maybe_unused) { - if(unlikely(!rc->rrdhost->rrdctx_post_processing_queue)) return; - - if(!rrd_flag_check(rc, RRD_FLAG_QUEUED_FOR_PP)) { - dictionary_set((DICTIONARY *)rc->rrdhost->rrdctx_post_processing_queue, - string2str(rc->id), - rc, - sizeof(*rc)); - -#if(defined(NETDATA_INTERNAL_CHECKS) && defined(LOG_POST_PROCESSING_QUEUE_INSERTIONS)) - { - BUFFER *wb_flags = buffer_create(1000); - rrd_flags_to_buffer(flags, wb_flags); - - BUFFER *wb_reasons = buffer_create(1000); - rrd_reasons_to_buffer(flags, wb_reasons); - - internal_error(true, "RRDCONTEXT: '%s' update triggered by function %s(), due to flags: %s, reasons: %s", - string2str(rc->id), function, - buffer_tostring(wb_flags), - buffer_tostring(wb_reasons)); - - buffer_free(wb_reasons); - buffer_free(wb_flags); - } -#endif - } -} - -static void rrdcontext_dequeue_from_post_processing(RRDCONTEXT *rc) { - if(unlikely(!rc->rrdhost->rrdctx_post_processing_queue)) return; - dictionary_del((DICTIONARY *)rc->rrdhost->rrdctx_post_processing_queue, string2str(rc->id)); -} - -static void rrdcontext_post_process_queued_contexts(RRDHOST *host) { - if(unlikely(!host->rrdctx_post_processing_queue)) return; - - RRDCONTEXT *rc; - dfe_start_reentrant((DICTIONARY *)host->rrdctx_post_processing_queue, rc) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - rrdcontext_dequeue_from_post_processing(rc); - rrdcontext_post_process_updates(rc, false, RRD_FLAG_NONE, true); - } - dfe_done(rc); -} - -// ---------------------------------------------------------------------------- -// dispatching contexts to cloud - -static uint64_t rrdcontext_get_next_version(RRDCONTEXT *rc) { - time_t now = now_realtime_sec(); - uint64_t version = MAX(rc->version, rc->hub.version); - version = MAX((uint64_t)now, version); - version++; - return version; -} - -static void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe_unused, void *bundle __maybe_unused) { - - // save it, so that we know the last version we sent to hub - rc->version = rc->hub.version = rrdcontext_get_next_version(rc); - rc->hub.id = string2str(rc->id); - rc->hub.title = string2str(rc->title); - rc->hub.units = string2str(rc->units); - rc->hub.family = string2str(rc->family); - rc->hub.chart_type = rrdset_type_name(rc->chart_type); - rc->hub.priority = rc->priority; - rc->hub.first_time_s = rc->first_time_s; - rc->hub.last_time_s = rrd_flag_is_collected(rc) ? 0 : rc->last_time_s; - rc->hub.deleted = rrd_flag_is_deleted(rc) ? true : false; - -#ifdef ENABLE_ACLK - struct context_updated message = { - .id = rc->hub.id, - .version = rc->hub.version, - .title = rc->hub.title, - .units = rc->hub.units, - .family = rc->hub.family, - .chart_type = rc->hub.chart_type, - .priority = rc->hub.priority, - .first_entry = rc->hub.first_time_s, - .last_entry = rc->hub.last_time_s, - .deleted = rc->hub.deleted, - }; - - if(likely(!rrd_flag_check(rc, RRD_FLAG_HIDDEN))) { - if (snapshot) { - if (!rc->hub.deleted) - contexts_snapshot_add_ctx_update(bundle, &message); - } - else - contexts_updated_add_ctx_update(bundle, &message); - } -#endif - - // store it to SQL - - if(rrd_flag_is_deleted(rc)) - rrdcontext_delete_from_sql_unsafe(rc); - - else if (ctx_store_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) - error("RRDCONTEXT: failed to save context '%s' version %"PRIu64" to SQL.", rc->hub.id, rc->hub.version); -} - -static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused) { - bool id_changed = false, - title_changed = false, - units_changed = false, - family_changed = false, - chart_type_changed = false, - priority_changed = false, - first_time_changed = false, - last_time_changed = false, - deleted_changed = false; - - RRD_FLAGS flags = rrd_flags_get(rc); - - if(unlikely(string2str(rc->id) != rc->hub.id)) - id_changed = true; - - if(unlikely(string2str(rc->title) != rc->hub.title)) - title_changed = true; - - if(unlikely(string2str(rc->units) != rc->hub.units)) - units_changed = true; - - if(unlikely(string2str(rc->family) != rc->hub.family)) - family_changed = true; - - if(unlikely(rrdset_type_name(rc->chart_type) != rc->hub.chart_type)) - chart_type_changed = true; - - if(unlikely(rc->priority != rc->hub.priority)) - priority_changed = true; - - if(unlikely((uint64_t)rc->first_time_s != rc->hub.first_time_s)) - first_time_changed = true; - - if(unlikely((uint64_t)((flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s) != rc->hub.last_time_s)) - last_time_changed = true; - - if(unlikely(((flags & RRD_FLAG_DELETED) ? true : false) != rc->hub.deleted)) - deleted_changed = true; - - if(unlikely(id_changed || title_changed || units_changed || family_changed || chart_type_changed || priority_changed || first_time_changed || last_time_changed || deleted_changed)) { - - internal_error(LOG_TRANSITIONS, - "RRDCONTEXT: %s NEW VERSION '%s'%s of host '%s', version %"PRIu64", title '%s'%s, units '%s'%s, family '%s'%s, chart type '%s'%s, priority %u%s, first_time_t %ld%s, last_time_t %ld%s, deleted '%s'%s, (queued for %llu ms, expected %llu ms)", - sending?"SENDING":"QUEUE", - string2str(rc->id), id_changed ? " (CHANGED)" : "", - rrdhost_hostname(rc->rrdhost), - rc->version, - string2str(rc->title), title_changed ? " (CHANGED)" : "", - string2str(rc->units), units_changed ? " (CHANGED)" : "", - string2str(rc->family), family_changed ? " (CHANGED)" : "", - rrdset_type_name(rc->chart_type), chart_type_changed ? " (CHANGED)" : "", - rc->priority, priority_changed ? " (CHANGED)" : "", - rc->first_time_s, first_time_changed ? " (CHANGED)" : "", - (flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s, last_time_changed ? " (CHANGED)" : "", - (flags & RRD_FLAG_DELETED) ? "true" : "false", deleted_changed ? " (CHANGED)" : "", - sending ? (now_realtime_usec() - rc->queue.queued_ut) / USEC_PER_MS : 0, - sending ? (rc->queue.scheduled_dispatch_ut - rc->queue.queued_ut) / USEC_PER_MS : 0 - ); - - return true; - } - - return false; -} - -static inline usec_t rrdcontext_calculate_queued_dispatch_time_ut(RRDCONTEXT *rc, usec_t now_ut) { - - if(likely(rc->queue.delay_calc_ut >= rc->queue.queued_ut)) - return rc->queue.scheduled_dispatch_ut; - - RRD_FLAGS flags = rc->queue.queued_flags; - - usec_t delay = LONG_MAX; - int i; - struct rrdcontext_reason *reason; - for(i = 0, reason = &rrdcontext_reasons[i]; reason->name ; reason = &rrdcontext_reasons[++i]) { - if(unlikely(flags & reason->flag)) { - if(reason->delay_ut < delay) - delay = reason->delay_ut; - } - } - - if(unlikely(delay == LONG_MAX)) { - internal_error(true, "RRDCONTEXT: '%s', cannot find minimum delay of flags %x", string2str(rc->id), (unsigned int)flags); - delay = 60 * USEC_PER_SEC; - } - - rc->queue.delay_calc_ut = now_ut; - usec_t dispatch_ut = rc->queue.scheduled_dispatch_ut = rc->queue.queued_ut + delay; - return dispatch_ut; -} - -static void rrdcontext_dequeue_from_hub_queue(RRDCONTEXT *rc) { - dictionary_del((DICTIONARY *)rc->rrdhost->rrdctx_hub_queue, string2str(rc->id)); -} - -static void rrdcontext_dispatch_queued_contexts_to_hub(RRDHOST *host, usec_t now_ut) { - - // check if we have received a streaming command for this host - if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS) || !aclk_connected || !host->rrdctx_hub_queue) - return; - - // check if there are queued items to send - if(!dictionary_entries((DICTIONARY *)host->rrdctx_hub_queue)) - return; - - if(!host->node_id) - return; - - size_t messages_added = 0; - contexts_updated_t bundle = NULL; - - RRDCONTEXT *rc; - dfe_start_reentrant((DICTIONARY *)host->rrdctx_hub_queue, rc) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - if(unlikely(messages_added >= MESSAGES_PER_BUNDLE_TO_SEND_TO_HUB_PER_HOST)) - break; - - worker_is_busy(WORKER_JOB_QUEUED); - usec_t dispatch_ut = rrdcontext_calculate_queued_dispatch_time_ut(rc, now_ut); - char *claim_id = get_agent_claimid(); - - if(unlikely(now_ut >= dispatch_ut) && claim_id) { - worker_is_busy(WORKER_JOB_CHECK); - - rrdcontext_lock(rc); - - if(check_if_cloud_version_changed_unsafe(rc, true)) { - worker_is_busy(WORKER_JOB_SEND); - -#ifdef ENABLE_ACLK - if(!bundle) { - // prepare the bundle to send the messages - char uuid[UUID_STR_LEN]; - uuid_unparse_lower(*host->node_id, uuid); - - bundle = contexts_updated_new(claim_id, uuid, 0, now_ut); - } -#endif - // update the hub data of the context, give a new version, pack the message - // and save an update to SQL - rrdcontext_message_send_unsafe(rc, false, bundle); - messages_added++; - - rc->queue.dispatches++; - rc->queue.dequeued_ut = now_ut; - } - else - rc->version = rc->hub.version; - - // remove it from the queue - worker_is_busy(WORKER_JOB_DEQUEUE); - rrdcontext_dequeue_from_hub_queue(rc); - - if(unlikely(rrdcontext_should_be_deleted(rc))) { - // this is a deleted context - delete it forever... - - worker_is_busy(WORKER_JOB_CLEANUP_DELETE); - - rrdcontext_dequeue_from_post_processing(rc); - rrdcontext_delete_from_sql_unsafe(rc); - - STRING *id = string_dup(rc->id); - rrdcontext_unlock(rc); - - // delete it from the master dictionary - if(!dictionary_del((DICTIONARY *)host->rrdctx, string2str(rc->id))) - error("RRDCONTEXT: '%s' of host '%s' failed to be deleted from rrdcontext dictionary.", - string2str(id), rrdhost_hostname(host)); - - string_freez(id); - } - else - rrdcontext_unlock(rc); - } - freez(claim_id); - } - dfe_done(rc); - -#ifdef ENABLE_ACLK - if(service_running(SERVICE_CONTEXT) && bundle) { - // we have a bundle to send messages - - // update the version hash - contexts_updated_update_version_hash(bundle, rrdcontext_version_hash(host)); - - // send it - aclk_send_contexts_updated(bundle); - } - else if(bundle) - contexts_updated_delete(bundle); -#endif - -} - -// ---------------------------------------------------------------------------- -// worker thread - -static void rrdcontext_main_cleanup(void *ptr) { - struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; - static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - - // custom code - worker_unregister(); - - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; -} - -void *rrdcontext_main(void *ptr) { - netdata_thread_cleanup_push(rrdcontext_main_cleanup, ptr); - - worker_register("RRDCONTEXT"); - worker_register_job_name(WORKER_JOB_HOSTS, "hosts"); - worker_register_job_name(WORKER_JOB_CHECK, "dedup checks"); - worker_register_job_name(WORKER_JOB_SEND, "sent contexts"); - worker_register_job_name(WORKER_JOB_DEQUEUE, "deduplicated contexts"); - worker_register_job_name(WORKER_JOB_RETENTION, "metrics retention"); - worker_register_job_name(WORKER_JOB_QUEUED, "queued contexts"); - worker_register_job_name(WORKER_JOB_CLEANUP, "cleanups"); - worker_register_job_name(WORKER_JOB_CLEANUP_DELETE, "deletes"); - worker_register_job_name(WORKER_JOB_PP_METRIC, "check metrics"); - worker_register_job_name(WORKER_JOB_PP_INSTANCE, "check instances"); - worker_register_job_name(WORKER_JOB_PP_CONTEXT, "check contexts"); - - worker_register_job_custom_metric(WORKER_JOB_HUB_QUEUE_SIZE, "hub queue size", "contexts", WORKER_METRIC_ABSOLUTE); - worker_register_job_custom_metric(WORKER_JOB_PP_QUEUE_SIZE, "post processing queue size", "contexts", WORKER_METRIC_ABSOLUTE); - - heartbeat_t hb; - heartbeat_init(&hb); - usec_t step = RRDCONTEXT_WORKER_THREAD_HEARTBEAT_USEC; - - while (service_running(SERVICE_CONTEXT)) { - worker_is_idle(); - heartbeat_next(&hb, step); - - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - usec_t now_ut = now_realtime_usec(); - - if(rrdcontext_next_db_rotation_ut && now_ut > rrdcontext_next_db_rotation_ut) { - rrdcontext_recalculate_retention_all_hosts(); - rrdcontext_garbage_collect_for_all_hosts(); - rrdcontext_next_db_rotation_ut = 0; - } - - size_t hub_queued_contexts_for_all_hosts = 0; - size_t pp_queued_contexts_for_all_hosts = 0; - - rrd_rdlock(); - RRDHOST *host; - rrdhost_foreach_read(host) { - if(unlikely(!service_running(SERVICE_CONTEXT))) break; - - worker_is_busy(WORKER_JOB_HOSTS); - - if(host->rrdctx_post_processing_queue) { - pp_queued_contexts_for_all_hosts += - dictionary_entries((DICTIONARY *)host->rrdctx_post_processing_queue); - rrdcontext_post_process_queued_contexts(host); - } - - if(host->rrdctx_hub_queue) { - hub_queued_contexts_for_all_hosts += dictionary_entries((DICTIONARY *)host->rrdctx_hub_queue); - rrdcontext_dispatch_queued_contexts_to_hub(host, now_ut); - } - } - rrd_unlock(); - - worker_set_metric(WORKER_JOB_HUB_QUEUE_SIZE, (NETDATA_DOUBLE)hub_queued_contexts_for_all_hosts); - worker_set_metric(WORKER_JOB_PP_QUEUE_SIZE, (NETDATA_DOUBLE)pp_queued_contexts_for_all_hosts); - } - - netdata_thread_cleanup_pop(1); - return NULL; -} diff --git a/database/rrdcontext.h b/database/rrdcontext.h deleted file mode 100644 index eae37036..00000000 --- a/database/rrdcontext.h +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef NETDATA_RRDCONTEXT_H -#define NETDATA_RRDCONTEXT_H 1 - -// ---------------------------------------------------------------------------- -// RRDMETRIC - -typedef struct rrdmetric_acquired RRDMETRIC_ACQUIRED; - -// ---------------------------------------------------------------------------- -// RRDINSTANCE - -typedef struct rrdinstance_acquired RRDINSTANCE_ACQUIRED; - -// ---------------------------------------------------------------------------- -// RRDCONTEXT - -typedef struct rrdcontexts_dictionary RRDCONTEXTS; -typedef struct rrdcontext_acquired RRDCONTEXT_ACQUIRED; - -// ---------------------------------------------------------------------------- - -#include "rrd.h" - -const char *rrdmetric_acquired_id(RRDMETRIC_ACQUIRED *rma); -const char *rrdmetric_acquired_name(RRDMETRIC_ACQUIRED *rma); -NETDATA_DOUBLE rrdmetric_acquired_last_stored_value(RRDMETRIC_ACQUIRED *rma); - -const char *rrdinstance_acquired_id(RRDINSTANCE_ACQUIRED *ria); -const char *rrdinstance_acquired_name(RRDINSTANCE_ACQUIRED *ria); -DICTIONARY *rrdinstance_acquired_labels(RRDINSTANCE_ACQUIRED *ria); -DICTIONARY *rrdinstance_acquired_functions(RRDINSTANCE_ACQUIRED *ria); - -// ---------------------------------------------------------------------------- -// public API for rrdhost - -void rrdhost_load_rrdcontext_data(RRDHOST *host); -void rrdhost_create_rrdcontexts(RRDHOST *host); -void rrdhost_destroy_rrdcontexts(RRDHOST *host); - -void rrdcontext_host_child_connected(RRDHOST *host); -void rrdcontext_host_child_disconnected(RRDHOST *host); - -int rrdcontext_foreach_instance_with_rrdset_in_context(RRDHOST *host, const char *context, int (*callback)(RRDSET *st, void *data), void *data); - -typedef enum { - RRDCONTEXT_OPTION_NONE = 0, - RRDCONTEXT_OPTION_SHOW_METRICS = (1 << 0), - RRDCONTEXT_OPTION_SHOW_INSTANCES = (1 << 1), - RRDCONTEXT_OPTION_SHOW_LABELS = (1 << 2), - RRDCONTEXT_OPTION_SHOW_QUEUED = (1 << 3), - RRDCONTEXT_OPTION_SHOW_FLAGS = (1 << 4), - RRDCONTEXT_OPTION_SHOW_DELETED = (1 << 5), - RRDCONTEXT_OPTION_DEEPSCAN = (1 << 6), - RRDCONTEXT_OPTION_SHOW_UUIDS = (1 << 7), - RRDCONTEXT_OPTION_SHOW_HIDDEN = (1 << 8), - RRDCONTEXT_OPTION_SKIP_ID = (1 << 31), // internal use -} RRDCONTEXT_TO_JSON_OPTIONS; - -#define RRDCONTEXT_OPTIONS_ALL (RRDCONTEXT_OPTION_SHOW_METRICS|RRDCONTEXT_OPTION_SHOW_INSTANCES|RRDCONTEXT_OPTION_SHOW_LABELS|RRDCONTEXT_OPTION_SHOW_QUEUED|RRDCONTEXT_OPTION_SHOW_FLAGS|RRDCONTEXT_OPTION_SHOW_DELETED|RRDCONTEXT_OPTION_SHOW_UUIDS|RRDCONTEXT_OPTION_SHOW_HIDDEN) - -int rrdcontext_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, const char *context, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions); -int rrdcontexts_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions); - -// ---------------------------------------------------------------------------- -// public API for rrdcontexts - -const char *rrdcontext_acquired_id(RRDCONTEXT_ACQUIRED *rca); - -// ---------------------------------------------------------------------------- -// public API for rrddims - -void rrdcontext_updated_rrddim(RRDDIM *rd); -void rrdcontext_removed_rrddim(RRDDIM *rd); -void rrdcontext_updated_rrddim_algorithm(RRDDIM *rd); -void rrdcontext_updated_rrddim_multiplier(RRDDIM *rd); -void rrdcontext_updated_rrddim_divisor(RRDDIM *rd); -void rrdcontext_updated_rrddim_flags(RRDDIM *rd); -void rrdcontext_collected_rrddim(RRDDIM *rd); -int rrdcontext_find_dimension_uuid(RRDSET *st, const char *id, uuid_t *store_uuid); - -// ---------------------------------------------------------------------------- -// public API for rrdsets - -void rrdcontext_updated_rrdset(RRDSET *st); -void rrdcontext_removed_rrdset(RRDSET *st); -void rrdcontext_updated_rrdset_name(RRDSET *st); -void rrdcontext_updated_rrdset_flags(RRDSET *st); -void rrdcontext_updated_retention_rrdset(RRDSET *st); -void rrdcontext_collected_rrdset(RRDSET *st); -int rrdcontext_find_chart_uuid(RRDSET *st, uuid_t *store_uuid); - -// ---------------------------------------------------------------------------- -// public API for ACLK - -void rrdcontext_hub_checkpoint_command(void *cmd); -void rrdcontext_hub_stop_streaming_command(void *cmd); - - -// ---------------------------------------------------------------------------- -// public API for threads - -void rrdcontext_db_rotation(void); -void *rrdcontext_main(void *); - -// ---------------------------------------------------------------------------- -// public API for weights - -struct metric_entry { - RRDCONTEXT_ACQUIRED *rca; - RRDINSTANCE_ACQUIRED *ria; - RRDMETRIC_ACQUIRED *rma; -}; - -DICTIONARY *rrdcontext_all_metrics_to_dict(RRDHOST *host, SIMPLE_PATTERN *contexts); - -// ---------------------------------------------------------------------------- -// public API for queries - -typedef struct query_plan_entry { - size_t tier; - time_t after; - time_t before; - time_t expanded_after; - time_t expanded_before; - struct storage_engine_query_handle handle; - STORAGE_POINT (*next_metric)(struct storage_engine_query_handle *handle); - int (*is_finished)(struct storage_engine_query_handle *handle); - void (*finalize)(struct storage_engine_query_handle *handle); - bool initialized; - bool finalized; -} QUERY_PLAN_ENTRY; - -#define QUERY_PLANS_MAX (RRD_STORAGE_TIERS * 2) - -typedef struct query_metric { - struct query_metric_tier { - struct storage_engine *eng; - STORAGE_METRIC_HANDLE *db_metric_handle; - time_t db_first_time_s; // the oldest timestamp available for this tier - time_t db_last_time_s; // the latest timestamp available for this tier - time_t db_update_every_s; // latest update every for this tier - long weight; - } tiers[RRD_STORAGE_TIERS]; - - struct { - size_t used; - QUERY_PLAN_ENTRY array[QUERY_PLANS_MAX]; - } plan; - - struct { - RRDHOST *host; - RRDCONTEXT_ACQUIRED *rca; - RRDINSTANCE_ACQUIRED *ria; - RRDMETRIC_ACQUIRED *rma; - } link; - - struct { - STRING *id; - STRING *name; - RRDR_DIMENSION_FLAGS options; - } dimension; - - struct { - STRING *id; - STRING *name; - } chart; - -} QUERY_METRIC; - -#define MAX_QUERY_TARGET_ID_LENGTH 255 - -typedef struct query_target_request { - RRDHOST *host; // the host to be queried (can be NULL, hosts will be used) - RRDCONTEXT_ACQUIRED *rca; // the context to be queried (can be NULL) - RRDINSTANCE_ACQUIRED *ria; // the instance to be queried (can be NULL) - RRDMETRIC_ACQUIRED *rma; // the metric to be queried (can be NULL) - RRDSET *st; // the chart to be queried (NULL, for context queries) - const char *hosts; // hosts simple pattern - const char *contexts; // contexts simple pattern (context queries) - const char *charts; // charts simple pattern (for context queries) - const char *dimensions; // dimensions simple pattern - const char *chart_label_key; // select only the chart having this label key - const char *charts_labels_filter; // select only the charts having this combo of label key:value - time_t after; // the requested timeframe - time_t before; // the requested timeframe - size_t points; // the requested number of points - time_t timeout; // the timeout of the query in seconds - uint32_t format; // DATASOURCE_FORMAT - RRDR_OPTIONS options; - RRDR_GROUPING group_method; - const char *group_options; - time_t resampling_time; - size_t tier; - QUERY_SOURCE query_source; - STORAGE_PRIORITY priority; -} QUERY_TARGET_REQUEST; - -typedef struct query_target { - char id[MAX_QUERY_TARGET_ID_LENGTH + 1]; // query identifier (for logging) - QUERY_TARGET_REQUEST request; - - bool used; // when true, this query is currently being used - size_t queries; // how many query we have done so far - - struct { - bool relative; // true when the request made with relative timestamps, true if it was absolute - bool aligned; - time_t after; // the absolute timestamp this query is about - time_t before; // the absolute timestamp this query is about - time_t query_granularity; - size_t points; // the number of points the query will return (maybe different from the request) - size_t group; - RRDR_GROUPING group_method; - const char *group_options; - size_t resampling_group; - NETDATA_DOUBLE resampling_divisor; - RRDR_OPTIONS options; - size_t tier; - } window; - - struct { - time_t first_time_s; // the combined first_time_t of all metrics in the query, across all tiers - time_t last_time_s; // the combined last_time_T of all metrics in the query, across all tiers - time_t minimum_latest_update_every_s; // the min update every of the metrics in the query - } db; - - struct { - QUERY_METRIC *array; // the metrics to be queried (all of them should be queried, no exceptions) - uint32_t used; // how many items of the array are used - uint32_t size; // the size of the array - SIMPLE_PATTERN *pattern; - } query; - - struct { - RRDMETRIC_ACQUIRED **array; - uint32_t used; // how many items of the array are used - uint32_t size; // the size of the array - } metrics; - - struct { - RRDINSTANCE_ACQUIRED **array; - uint32_t used; // how many items of the array are used - uint32_t size; // the size of the array - SIMPLE_PATTERN *pattern; - SIMPLE_PATTERN *chart_label_key_pattern; - SIMPLE_PATTERN *charts_labels_filter_pattern; - } instances; - - struct { - RRDCONTEXT_ACQUIRED **array; - uint32_t used; // how many items of the array are used - uint32_t size; // the size of the array - SIMPLE_PATTERN *pattern; - } contexts; - - struct { - RRDHOST **array; - uint32_t used; // how many items of the array are used - uint32_t size; // the size of the array - SIMPLE_PATTERN *pattern; - } hosts; - -} QUERY_TARGET; - -void query_target_free(void); -void query_target_release(QUERY_TARGET *qt); - -QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr); - -#endif // NETDATA_RRDCONTEXT_H - diff --git a/database/rrddim.c b/database/rrddim.c index b8059b3c..496fdc61 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -94,9 +94,8 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v size_t initialized = 0; for(size_t tier = 0; tier < storage_tiers ; tier++) { STORAGE_ENGINE *eng = host->db[tier].eng; + rd->tiers[tier].backend = eng->backend; rd->tiers[tier].tier_grouping = host->db[tier].tier_grouping; - rd->tiers[tier].collect_ops = &eng->api.collect_ops; - rd->tiers[tier].query_ops = &eng->api.query_ops; rd->tiers[tier].db_metric_handle = eng->api.metric_get_or_create(rd, host->db[tier].instance); storage_point_unset(rd->tiers[tier].virtual_point); initialized++; @@ -116,7 +115,8 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v size_t initialized = 0; for (size_t tier = 0; tier < storage_tiers; tier++) { if (rd->tiers[tier].db_metric_handle) { - rd->tiers[tier].db_collection_handle = rd->tiers[tier].collect_ops->init(rd->tiers[tier].db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); + rd->tiers[tier].db_collection_handle = + storage_metric_store_init(rd->tiers[tier].backend, rd->tiers[tier].db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); initialized++; } } @@ -175,7 +175,7 @@ bool rrddim_finalize_collection_and_check_retention(RRDDIM *rd) { tiers_available++; - if(rd->tiers[tier].collect_ops->finalize(rd->tiers[tier].db_collection_handle)) + if(storage_engine_store_finalize(rd->tiers[tier].db_collection_handle)) tiers_said_no_retention++; rd->tiers[tier].db_collection_handle = NULL; @@ -253,7 +253,7 @@ static bool rrddim_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, for(size_t tier = 0; tier < storage_tiers ;tier++) { if (!rd->tiers[tier].db_collection_handle) rd->tiers[tier].db_collection_handle = - rd->tiers[tier].collect_ops->init(rd->tiers[tier].db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); + storage_metric_store_init(rd->tiers[tier].backend, rd->tiers[tier].db_metric_handle, st->rrdhost->db[tier].tier_grouping * st->update_every, rd->rrdset->storage_metrics_groups[tier]); } if(rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { @@ -320,7 +320,7 @@ inline RRDDIM *rrddim_find(RRDSET *st, const char *id) { } inline RRDDIM_ACQUIRED *rrddim_find_and_acquire(RRDSET *st, const char *id) { - debug(D_RRD_CALLS, "rrddim_find() for chart %s, dimension %s", rrdset_name(st), id); + debug(D_RRD_CALLS, "rrddim_find_and_acquire() for chart %s, dimension %s", rrdset_name(st), id); return (RRDDIM_ACQUIRED *)dictionary_get_and_acquire_item(st->rrddim_root_index, id); } @@ -416,7 +416,7 @@ time_t rrddim_last_entry_s_of_tier(RRDDIM *rd, size_t tier) { if(unlikely(tier > storage_tiers || !rd->tiers[tier].db_metric_handle)) return 0; - return rd->tiers[tier].query_ops->latest_time_s(rd->tiers[tier].db_metric_handle); + return storage_engine_latest_time_s(rd->tiers[tier].backend, rd->tiers[tier].db_metric_handle); } // get the timestamp of the last entry in the round-robin database @@ -438,7 +438,7 @@ time_t rrddim_first_entry_s_of_tier(RRDDIM *rd, size_t tier) { if(unlikely(tier > storage_tiers || !rd->tiers[tier].db_metric_handle)) return 0; - return rd->tiers[tier].query_ops->oldest_time_s(rd->tiers[tier].db_metric_handle); + return storage_engine_oldest_time_s(rd->tiers[tier].backend, rd->tiers[tier].db_metric_handle); } time_t rrddim_first_entry_s(RRDDIM *rd) { diff --git a/database/rrdfunctions.c b/database/rrdfunctions.c index a8341f87..cdba221a 100644 --- a/database/rrdfunctions.c +++ b/database/rrdfunctions.c @@ -496,7 +496,7 @@ static void rrd_function_call_wait_free(struct rrd_function_call_wait *tmp) { struct { const char *format; - uint8_t content_type; + HTTP_CONTENT_TYPE content_type; } function_formats[] = { { .format = "application/json", CT_APPLICATION_JSON }, { .format = "text/plain", CT_TEXT_PLAIN }, @@ -523,7 +523,7 @@ uint8_t functions_format_to_content_type(const char *format) { return CT_TEXT_PLAIN; } -const char *functions_content_type_to_format(uint8_t content_type) { +const char *functions_content_type_to_format(HTTP_CONTENT_TYPE content_type) { for (int i = 0; function_formats[i].format; i++) if (function_formats[i].content_type == content_type) return function_formats[i].format; @@ -537,7 +537,7 @@ int rrd_call_function_error(BUFFER *wb, const char *msg, int code) { buffer_flush(wb); buffer_sprintf(wb, "{\"status\":%d,\"error_message\":\"%s\"}", code, buffer); - wb->contenttype = CT_APPLICATION_JSON; + wb->content_type = CT_APPLICATION_JSON; buffer_no_cacheable(wb); return code; } @@ -632,7 +632,7 @@ int rrd_call_function_and_wait(RRDHOST *host, BUFFER *wb, int timeout, const cha bool we_should_free = true; BUFFER *temp_wb = buffer_create(PLUGINSD_LINE_MAX + 1, &netdata_buffers_statistics.buffers_functions); // we need it because we may give up on it - temp_wb->contenttype = wb->contenttype; + temp_wb->content_type = wb->content_type; code = rdcf->function(temp_wb, timeout, key, rdcf->collector_data, rrd_call_function_signal_when_ready, tmp); if (code == HTTP_RESP_OK) { netdata_mutex_lock(&tmp->mutex); @@ -647,7 +647,7 @@ int rrd_call_function_and_wait(RRDHOST *host, BUFFER *wb, int timeout, const cha if (tmp->data_are_ready) { // we have a response buffer_fast_strcat(wb, buffer_tostring(temp_wb), buffer_strlen(temp_wb)); - wb->contenttype = temp_wb->contenttype; + wb->content_type = temp_wb->content_type; wb->expires = temp_wb->expires; if(wb->expires) @@ -738,14 +738,29 @@ void chart_functions2json(RRDSET *st, BUFFER *wb, int tabs, const char *kq, cons functions2json(st->functions_view, wb, ident, kq, sq); } -void host_functions2json(RRDHOST *host, BUFFER *wb, int tabs, const char *kq, const char *sq) { +void host_functions2json(RRDHOST *host, BUFFER *wb) { if(!host || !host->functions) return; - char ident[tabs + 1]; - ident[tabs] = '\0'; - while(tabs) ident[--tabs] = '\t'; + buffer_json_member_add_object(wb, "functions"); + + struct rrd_collector_function *t; + dfe_start_read(host->functions, t) { + if(!t->collector->running) continue; + + buffer_json_member_add_object(wb, t_dfe.name); + buffer_json_member_add_string(wb, "help", string2str(t->help)); + buffer_json_member_add_int64(wb, "timeout", t->timeout); + buffer_json_member_add_array(wb, "options"); + if(t->options & RRD_FUNCTION_GLOBAL) + buffer_json_add_array_item_string(wb, "GLOBAL"); + if(t->options & RRD_FUNCTION_LOCAL) + buffer_json_add_array_item_string(wb, "LOCAL"); + buffer_json_array_close(wb); + buffer_json_object_close(wb); + } + dfe_done(t); - functions2json(host->functions, wb, ident, kq, sq); + buffer_json_object_close(wb); } void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst) { diff --git a/database/rrdfunctions.h b/database/rrdfunctions.h index f031ec34..920ada8d 100644 --- a/database/rrdfunctions.h +++ b/database/rrdfunctions.h @@ -26,10 +26,10 @@ void rrd_functions_expose_rrdpush(RRDSET *st, BUFFER *wb); void chart_functions2json(RRDSET *st, BUFFER *wb, int tabs, const char *kq, const char *sq); void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst); -void host_functions2json(RRDHOST *host, BUFFER *wb, int tabs, const char *kq, const char *sq); +void host_functions2json(RRDHOST *host, BUFFER *wb); uint8_t functions_format_to_content_type(const char *format); -const char *functions_content_type_to_format(uint8_t content_type); +const char *functions_content_type_to_format(HTTP_CONTENT_TYPE content_type); int rrd_call_function_error(BUFFER *wb, const char *msg, int code); #endif // NETDATA_RRDFUNCTIONS_H diff --git a/database/rrdhost.c b/database/rrdhost.c index 60b14c13..88e411de 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -41,6 +41,22 @@ bool is_storage_engine_shared(STORAGE_INSTANCE *engine __maybe_unused) { return false; } +RRDHOST *find_host_by_node_id(char *node_id) { + uuid_t node_uuid; + if (unlikely(!node_id || uuid_parse(node_id, node_uuid))) + return NULL; + + RRDHOST *host, *ret = NULL; + dfe_start_read(rrdhost_root_index, host) { + if (host->node_id && !(uuid_memcmp(host->node_id, &node_uuid))) { + ret = host; + break; + } + } + dfe_done(host); + + return ret; +} // ---------------------------------------------------------------------------- // RRDHOST indexes management @@ -62,6 +78,26 @@ static inline void rrdhost_init() { } } +RRDHOST_ACQUIRED *rrdhost_find_and_acquire(const char *machine_guid) { + debug(D_RRD_CALLS, "rrdhost_find_and_acquire() host %s", machine_guid); + + return (RRDHOST_ACQUIRED *)dictionary_get_and_acquire_item(rrdhost_root_index, machine_guid); +} + +RRDHOST *rrdhost_acquired_to_rrdhost(RRDHOST_ACQUIRED *rha) { + if(unlikely(!rha)) + return NULL; + + return (RRDHOST *) dictionary_acquired_item_value((const DICTIONARY_ITEM *)rha); +} + +void rrdhost_acquired_release(RRDHOST_ACQUIRED *rha) { + if(unlikely(!rha)) + return; + + dictionary_acquired_item_release(rrdhost_root_index, (const DICTIONARY_ITEM *)rha); +} + // ---------------------------------------------------------------------------- // RRDHOST index by UUID @@ -104,6 +140,17 @@ inline RRDHOST *rrdhost_find_by_hostname(const char *hostname) { return dictionary_get(rrdhost_root_index_hostname, hostname); } +static inline void rrdhost_index_del_hostname(RRDHOST *host) { + if(unlikely(!host->hostname)) return; + + if(rrdhost_option_check(host, RRDHOST_OPTION_INDEXED_HOSTNAME)) { + if(!dictionary_del(rrdhost_root_index_hostname, rrdhost_hostname(host))) + error("RRDHOST: %s() failed to delete hostname '%s' from index", __FUNCTION__, rrdhost_hostname(host)); + + rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_HOSTNAME); + } +} + static inline RRDHOST *rrdhost_index_add_hostname(RRDHOST *host) { if(!host->hostname) return host; @@ -111,24 +158,17 @@ static inline RRDHOST *rrdhost_index_add_hostname(RRDHOST *host) { if(ret_hostname == host) rrdhost_option_set(host, RRDHOST_OPTION_INDEXED_HOSTNAME); else { - rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_HOSTNAME); - error("RRDHOST: %s() host with hostname '%s' is already indexed", __FUNCTION__, rrdhost_hostname(host)); + //have the same hostname but it's not the same host + //keep the new one only if the old one is orphan or archived + if (rrdhost_flag_check(ret_hostname, RRDHOST_FLAG_ORPHAN) || rrdhost_flag_check(ret_hostname, RRDHOST_FLAG_ARCHIVED)) { + rrdhost_index_del_hostname(ret_hostname); + rrdhost_index_add_hostname(host); + } } return host; } -static inline void rrdhost_index_del_hostname(RRDHOST *host) { - if(unlikely(!host->hostname)) return; - - if(rrdhost_option_check(host, RRDHOST_OPTION_INDEXED_HOSTNAME)) { - if(!dictionary_del(rrdhost_root_index_hostname, rrdhost_hostname(host))) - error("RRDHOST: %s() failed to delete hostname '%s' from index", __FUNCTION__, rrdhost_hostname(host)); - - rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_HOSTNAME); - } -} - // ---------------------------------------------------------------------------- // RRDHOST - internal helpers @@ -225,7 +265,8 @@ static void rrdhost_initialize_rrdpush_sender(RRDHOST *host, rrdpush_destinations_init(host); host->rrdpush_send_api_key = strdupz(rrdpush_api_key); - host->rrdpush_send_charts_matching = simple_pattern_create(rrdpush_send_charts_matching, NULL, SIMPLE_PATTERN_EXACT); + host->rrdpush_send_charts_matching = simple_pattern_create(rrdpush_send_charts_matching, NULL, + SIMPLE_PATTERN_EXACT, true); rrdhost_option_set(host, RRDHOST_OPTION_SENDER_ENABLED); } @@ -329,10 +370,8 @@ int is_legacy = 1; rrdhost_option_set(host, RRDHOST_OPTION_DELETE_ORPHAN_HOST); char filename[FILENAME_MAX + 1]; - if(is_localhost) { + if(is_localhost) host->cache_dir = strdupz(netdata_configured_cache_dir); - host->varlib_dir = strdupz(netdata_configured_varlib_dir); - } else { // this is not localhost - append our GUID to localhost path if (is_in_multihost) { // don't append to cache dir in multihost @@ -349,9 +388,6 @@ int is_legacy = 1; if(r != 0 && errno != EEXIST) error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->cache_dir); } - - snprintfz(filename, FILENAME_MAX, "%s/%s", netdata_configured_varlib_dir, host->machine_guid); - host->varlib_dir = strdupz(filename); } // this is also needed for custom host variables - not only health @@ -366,7 +402,6 @@ int is_legacy = 1; rrdfamily_index_init(host); rrdcalctemplate_index_init(host); rrdcalc_rrdhost_index_init(host); - metaqueue_host_update_info(host); if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { #ifdef ENABLE_DBENGINE @@ -498,7 +533,6 @@ int is_legacy = 1; " (to '%s' with api key '%s')" ", health %s" ", cache_dir '%s'" - ", varlib_dir '%s'" ", alarms default handler '%s'" ", alarms default recipient '%s'" , rrdhost_hostname(host) @@ -517,20 +551,17 @@ int is_legacy = 1; , host->rrdpush_send_api_key?host->rrdpush_send_api_key:"" , host->health.health_enabled?"enabled":"disabled" , host->cache_dir - , host->varlib_dir , string2str(host->health.health_default_exec) , string2str(host->health.health_default_recipient) ); - if(!archived) - rrdhost_flag_set(host,RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); - - rrdhost_load_rrdcontext_data(host); - if (!archived) { + if(!archived) { + metaqueue_host_update_info(host); + rrdhost_load_rrdcontext_data(host); +// rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); ml_host_new(host); - ml_start_anomaly_detection_threads(host); } else - rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN); + rrdhost_flag_set(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD | RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN); return host; } @@ -582,6 +613,8 @@ static void rrdhost_update(RRDHOST *host if(strcmp(rrdhost_hostname(host), hostname) != 0) { info("Host '%s' has been renamed to '%s'. If this is not intentional it may mean multiple hosts are using the same machine_guid.", rrdhost_hostname(host), hostname); rrdhost_init_hostname(host, hostname, true); + } else { + rrdhost_index_add_hostname(host); } if(strcmp(rrdhost_program_name(host), program_name) != 0) { @@ -643,7 +676,6 @@ static void rrdhost_update(RRDHOST *host host->rrdpush_replication_step = rrdpush_replication_step; ml_host_new(host); - ml_start_anomaly_detection_threads(host); rrdhost_load_rrdcontext_data(host); info("Host %s is not in archived mode anymore", rrdhost_hostname(host)); @@ -681,6 +713,10 @@ RRDHOST *rrdhost_find_or_create( RRDHOST *host = rrdhost_find_by_guid(guid); if (unlikely(host && host->rrd_memory_mode != mode && rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED))) { + + if (likely(!archived && rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD))) + return host; + /* If a legacy memory mode instantiates all dbengine state must be discarded to avoid inconsistencies */ error("Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.", rrdhost_hostname(host), rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode)); @@ -720,31 +756,30 @@ RRDHOST *rrdhost_find_or_create( ); } else { - - rrdhost_update(host - , hostname - , registry_hostname - , guid - , os - , timezone - , abbrev_timezone - , utc_offset - , tags - , program_name - , program_version - , update_every - , history - , mode - , health_enabled - , rrdpush_enabled - , rrdpush_destination - , rrdpush_api_key - , rrdpush_send_charts_matching - , rrdpush_enable_replication - , rrdpush_seconds_to_replicate - , rrdpush_replication_step - , system_info); - + if (likely(!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD))) + rrdhost_update(host + , hostname + , registry_hostname + , guid + , os + , timezone + , abbrev_timezone + , utc_offset + , tags + , program_name + , program_version + , update_every + , history + , mode + , health_enabled + , rrdpush_enabled + , rrdpush_destination + , rrdpush_api_key + , rrdpush_send_charts_matching + , rrdpush_enable_replication + , rrdpush_seconds_to_replicate + , rrdpush_replication_step + , system_info); } return host; @@ -811,20 +846,6 @@ void dbengine_init(char *hostname) { bool parallel_initialization = (storage_tiers <= (size_t)get_netdata_cpus()) ? true : false; parallel_initialization = config_get_boolean(CONFIG_SECTION_DB, "dbengine parallel initialization", parallel_initialization); - default_rrdeng_page_fetch_timeout = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page fetch timeout secs", PAGE_CACHE_FETCH_WAIT_TIMEOUT); - if (default_rrdeng_page_fetch_timeout < 1) { - info("'dbengine page fetch timeout secs' cannot be %d, using 1", default_rrdeng_page_fetch_timeout); - default_rrdeng_page_fetch_timeout = 1; - config_set_number(CONFIG_SECTION_DB, "dbengine page fetch timeout secs", default_rrdeng_page_fetch_timeout); - } - - default_rrdeng_page_fetch_retries = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page fetch retries", MAX_PAGE_CACHE_FETCH_RETRIES); - if (default_rrdeng_page_fetch_retries < 1) { - info("\"dbengine page fetch retries\" found in netdata.conf cannot be %d, using 1", default_rrdeng_page_fetch_retries); - default_rrdeng_page_fetch_retries = 1; - config_set_number(CONFIG_SECTION_DB, "dbengine page fetch retries", default_rrdeng_page_fetch_retries); - } - struct dbengine_initialization tiers_init[RRD_STORAGE_TIERS] = {}; size_t created_tiers = 0; @@ -938,8 +959,10 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt rrdhost_init(); if (unlikely(sql_init_database(DB_CHECK_NONE, system_info ? 0 : 1))) { - if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { + set_late_global_environment(system_info); fatal("Failed to initialize SQLite"); + } info("Skipping SQLITE metadata initialization since memory mode is not dbengine"); } @@ -1071,7 +1094,7 @@ static void rrdhost_streaming_sender_structures_init(RRDHOST *host) host->sender->host = host; host->sender->buffer = cbuffer_new(CBUFFER_INITIAL_SIZE, 1024 * 1024, &netdata_buffers_statistics.cbuffers_streaming); - host->sender->capabilities = STREAM_OUR_CAPABILITIES; + host->sender->capabilities = stream_our_capabilities(); host->sender->rrdpush_sender_pipe[PIPE_READ] = -1; host->sender->rrdpush_sender_pipe[PIPE_WRITE] = -1; @@ -1160,7 +1183,6 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { rrdcalctemplate_index_destroy(host); // cleanup ML resources - ml_stop_anomaly_detection_threads(host); ml_host_delete(host); freez(host->exporting_flags); @@ -1182,21 +1204,6 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { return; } -#ifdef ENABLE_ACLK - struct aclk_database_worker_config *wc = host->dbsync_worker; - if (wc && !netdata_exit) { - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_ORPHAN_HOST; - struct aclk_completion compl ; - init_aclk_completion(&compl ); - cmd.completion = &compl ; - aclk_database_enq_cmd(wc, &cmd); - wait_for_aclk_completion(&compl ); - destroy_aclk_completion(&compl ); - } -#endif - // ------------------------------------------------------------------------ // free it @@ -1212,7 +1219,6 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { string_freez(host->program_version); rrdhost_system_info_free(host->system_info); freez(host->cache_dir); - freez(host->varlib_dir); freez(host->rrdpush_send_api_key); freez(host->rrdpush_send_destination); rrdpush_destinations_free(host); @@ -1226,16 +1232,14 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { rrdfamily_index_destroy(host); rrdfunctions_destroy(host); rrdvariables_destroy(host->rrdvars); + if (host == localhost) + rrdvariables_destroy(health_rrdvars); rrdhost_destroy_rrdcontexts(host); string_freez(host->hostname); __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED); freez(host); -#ifdef ENABLE_ACLK - if (wc) - wc->is_orphan = 0; -#endif } void rrdhost_free_all(void) { @@ -1253,11 +1257,10 @@ void rrdhost_free_all(void) { void rrd_finalize_collection_for_all_hosts(void) { RRDHOST *host; - rrd_wrlock(); - rrdhost_foreach_read(host) { + dfe_start_reentrant(rrdhost_root_index, host) { rrdhost_finalize_collection(host); } - rrd_unlock(); + dfe_done(host); } // ---------------------------------------------------------------------------- @@ -1278,6 +1281,33 @@ void rrdhost_save_charts(RRDHOST *host) { rrdset_foreach_done(st); } +struct rrdhost_system_info *rrdhost_labels_to_system_info(DICTIONARY *labels) { + struct rrdhost_system_info *info = callocz(1, sizeof(struct rrdhost_system_info)); + info->hops = 1; + + rrdlabels_get_value_strdup_or_null(labels, &info->cloud_provider_type, "_cloud_provider_type"); + rrdlabels_get_value_strdup_or_null(labels, &info->cloud_instance_type, "_cloud_instance_type"); + rrdlabels_get_value_strdup_or_null(labels, &info->cloud_instance_region, "_cloud_instance_region"); + rrdlabels_get_value_strdup_or_null(labels, &info->host_os_name, "_os_name"); + rrdlabels_get_value_strdup_or_null(labels, &info->host_os_version, "_os_version"); + rrdlabels_get_value_strdup_or_null(labels, &info->kernel_version, "_kernel_version"); + rrdlabels_get_value_strdup_or_null(labels, &info->host_cores, "_system_cores"); + rrdlabels_get_value_strdup_or_null(labels, &info->host_cpu_freq, "_system_cpu_freq"); + rrdlabels_get_value_strdup_or_null(labels, &info->host_ram_total, "_system_ram_total"); + rrdlabels_get_value_strdup_or_null(labels, &info->host_disk_space, "_system_disk_space"); + rrdlabels_get_value_strdup_or_null(labels, &info->architecture, "_architecture"); + rrdlabels_get_value_strdup_or_null(labels, &info->virtualization, "_virtualization"); + rrdlabels_get_value_strdup_or_null(labels, &info->container, "_container"); + rrdlabels_get_value_strdup_or_null(labels, &info->container_detection, "_container_detection"); + rrdlabels_get_value_strdup_or_null(labels, &info->virt_detection, "_virt_detection"); + rrdlabels_get_value_strdup_or_null(labels, &info->is_k8s_node, "_is_k8s_node"); + rrdlabels_get_value_strdup_or_null(labels, &info->install_type, "_install_type"); + rrdlabels_get_value_strdup_or_null(labels, &info->prebuilt_arch, "_prebuilt_arch"); + rrdlabels_get_value_strdup_or_null(labels, &info->prebuilt_dist, "_prebuilt_dist"); + + return info; +} + static void rrdhost_load_auto_labels(void) { DICTIONARY *labels = localhost->rrdlabels; @@ -1288,8 +1318,7 @@ static void rrdhost_load_auto_labels(void) { rrdlabels_add(labels, "_cloud_instance_type", localhost->system_info->cloud_instance_type, RRDLABEL_SRC_AUTO); if (localhost->system_info->cloud_instance_region) - rrdlabels_add( - labels, "_cloud_instance_region", localhost->system_info->cloud_instance_region, RRDLABEL_SRC_AUTO); + rrdlabels_add(labels, "_cloud_instance_region", localhost->system_info->cloud_instance_region, RRDLABEL_SRC_AUTO); if (localhost->system_info->host_os_name) rrdlabels_add(labels, "_os_name", localhost->system_info->host_os_name, RRDLABEL_SRC_AUTO); @@ -1353,13 +1382,12 @@ void rrdhost_set_is_parent_label(int count) { DICTIONARY *labels = localhost->rrdlabels; if (count == 0 || count == 1) { - rrdlabels_add( - labels, "_is_parent", (count) ? "true" : "false", RRDLABEL_SRC_AUTO); + rrdlabels_add(labels, "_is_parent", (count) ? "true" : "false", RRDLABEL_SRC_AUTO); //queue a node info #ifdef ENABLE_ACLK if (netdata_cloud_setting) { - aclk_queue_node_info(localhost); + aclk_queue_node_info(localhost, false); } #endif } @@ -1430,7 +1458,7 @@ void rrdhost_finalize_collection(RRDHOST *host) { info("RRD: 'host:%s' stopping data collection...", rrdhost_hostname(host)); RRDSET *st; - rrdset_foreach_write(st, host) + rrdset_foreach_read(st, host) rrdset_finalize_collection(st, true); rrdset_foreach_done(st); } diff --git a/database/rrdlabels.c b/database/rrdlabels.c index 4a9a6dae..f6abd602 100644 --- a/database/rrdlabels.c +++ b/database/rrdlabels.c @@ -399,7 +399,7 @@ size_t text_sanitize(unsigned char *dst, const unsigned char *src, size_t dst_si // find how big this character is (2-4 bytes) size_t utf_character_size = 2; - while(utf_character_size <= 4 && src[utf_character_size] && IS_UTF8_BYTE(src[utf_character_size]) && !IS_UTF8_STARTBYTE(src[utf_character_size])) + while(utf_character_size < 4 && src[utf_character_size] && IS_UTF8_BYTE(src[utf_character_size]) && !IS_UTF8_STARTBYTE(src[utf_character_size])) utf_character_size++; if(utf) { @@ -651,10 +651,24 @@ void rrdlabels_get_value_to_buffer_or_null(DICTIONARY *labels, BUFFER *wb, const dictionary_acquired_item_release(labels, acquired_item); } +void rrdlabels_value_to_buffer_array_item_or_null(DICTIONARY *labels, BUFFER *wb, const char *key) { + if(!labels) return; + + const DICTIONARY_ITEM *acquired_item = dictionary_get_and_acquire_item(labels, key); + RRDLABEL *lb = dictionary_acquired_item_value(acquired_item); + + if(lb && lb->label_value) + buffer_json_add_array_item_string(wb, string2str(lb->label_value)); + else + buffer_json_add_array_item_string(wb, NULL); + + dictionary_acquired_item_release(labels, acquired_item); +} + // ---------------------------------------------------------------------------- // rrdlabels_get_value_to_char_or_null() -void rrdlabels_get_value_to_char_or_null(DICTIONARY *labels, char **value, const char *key) { +void rrdlabels_get_value_strdup_or_null(DICTIONARY *labels, char **value, const char *key) { const DICTIONARY_ITEM *acquired_item = dictionary_get_and_acquire_item(labels, key); RRDLABEL *lb = dictionary_acquired_item_value(acquired_item); @@ -663,6 +677,46 @@ void rrdlabels_get_value_to_char_or_null(DICTIONARY *labels, char **value, const dictionary_acquired_item_release(labels, acquired_item); } +void rrdlabels_get_value_strcpyz(DICTIONARY *labels, char *dst, size_t dst_len, const char *key) { + const DICTIONARY_ITEM *acquired_item = dictionary_get_and_acquire_item(labels, key); + RRDLABEL *lb = dictionary_acquired_item_value(acquired_item); + + if(lb && lb->label_value) + strncpyz(dst, string2str(lb->label_value), dst_len); + else + dst[0] = '\0'; + + dictionary_acquired_item_release(labels, acquired_item); +} + +STRING *rrdlabels_get_value_string_dup(DICTIONARY *labels, const char *key) { + const DICTIONARY_ITEM *acquired_item = dictionary_get_and_acquire_item(labels, key); + RRDLABEL *lb = dictionary_acquired_item_value(acquired_item); + + STRING *ret = NULL; + if(lb && lb->label_value) + ret = string_dup(lb->label_value); + + dictionary_acquired_item_release(labels, acquired_item); + + return ret; +} + +STRING *rrdlabels_get_value_to_buffer_or_unset(DICTIONARY *labels, BUFFER *wb, const char *key, const char *unset) { + const DICTIONARY_ITEM *acquired_item = dictionary_get_and_acquire_item(labels, key); + RRDLABEL *lb = dictionary_acquired_item_value(acquired_item); + + STRING *ret = NULL; + if(lb && lb->label_value) + buffer_strcat(wb, string2str(lb->label_value)); + else + buffer_strcat(wb, unset); + + dictionary_acquired_item_release(labels, acquired_item); + + return ret; +} + // ---------------------------------------------------------------------------- // rrdlabels_unmark_all() // remove labels RRDLABEL_FLAG_OLD and RRDLABEL_FLAG_NEW from all dictionary items @@ -778,6 +832,7 @@ void rrdlabels_copy(DICTIONARY *dst, DICTIONARY *src) { // returns true when there are keys in the dictionary matching a simple pattern struct simple_pattern_match_name_value { + size_t searches; SIMPLE_PATTERN *pattern; char equal; }; @@ -788,6 +843,7 @@ static int simple_pattern_match_name_only_callback(const DICTIONARY_ITEM *item, (void)value; // we return -1 to stop the walkthrough on first match + t->searches++; if(simple_pattern_matches(t->pattern, name)) return -1; return 0; @@ -799,6 +855,7 @@ static int simple_pattern_match_name_and_value_callback(const DICTIONARY_ITEM *i RRDLABEL *lb = (RRDLABEL *)value; // we return -1 to stop the walkthrough on first match + t->searches++; if(simple_pattern_matches(t->pattern, name)) return -1; size_t len = RRDLABELS_MAX_NAME_LENGTH + RRDLABELS_MAX_VALUE_LENGTH + 2; // +1 for =, +1 for \0 @@ -817,28 +874,34 @@ static int simple_pattern_match_name_and_value_callback(const DICTIONARY_ITEM *i // terminate it *dst = '\0'; - if(simple_pattern_matches(t->pattern, tmp)) return -1; + t->searches++; + if(simple_pattern_matches_length_extract(t->pattern, tmp, dst - tmp, NULL, 0) == SP_MATCHED_POSITIVE) + return -1; return 0; } -bool rrdlabels_match_simple_pattern_parsed(DICTIONARY *labels, SIMPLE_PATTERN *pattern, char equal) { +bool rrdlabels_match_simple_pattern_parsed(DICTIONARY *labels, SIMPLE_PATTERN *pattern, char equal, size_t *searches) { if (!labels) return false; struct simple_pattern_match_name_value t = { + .searches = 0, .pattern = pattern, .equal = equal }; int ret = dictionary_walkthrough_read(labels, equal?simple_pattern_match_name_and_value_callback:simple_pattern_match_name_only_callback, &t); + if(searches) + *searches = t.searches; + return (ret == -1)?true:false; } bool rrdlabels_match_simple_pattern(DICTIONARY *labels, const char *simple_pattern_txt) { if (!labels) return false; - SIMPLE_PATTERN *pattern = simple_pattern_create(simple_pattern_txt, " ,|\t\r\n\f\v", SIMPLE_PATTERN_EXACT); + SIMPLE_PATTERN *pattern = simple_pattern_create(simple_pattern_txt, " ,|\t\r\n\f\v", SIMPLE_PATTERN_EXACT, true); char equal = '\0'; const char *s; @@ -849,7 +912,7 @@ bool rrdlabels_match_simple_pattern(DICTIONARY *labels, const char *simple_patte } } - bool ret = rrdlabels_match_simple_pattern_parsed(labels, pattern, equal); + bool ret = rrdlabels_match_simple_pattern_parsed(labels, pattern, equal, NULL); simple_pattern_free(pattern); @@ -959,6 +1022,14 @@ int rrdlabels_to_buffer(DICTIONARY *labels, BUFFER *wb, const char *before_each, return dictionary_walkthrough_read(labels, label_to_buffer_callback, (void *)&tmp); } +void rrdlabels_to_buffer_json_members(DICTIONARY *labels, BUFFER *wb) { + RRDLABEL *lb; + dfe_start_read(labels, lb) { + buffer_json_member_add_string(wb, lb_dfe.name, string2str(lb->label_value)); + } + dfe_done(lb); +} + void rrdset_update_rrdlabels(RRDSET *st, DICTIONARY *new_rrdlabels) { if(!st->rrdlabels) st->rrdlabels = rrdlabels_create(); diff --git a/database/rrdset.c b/database/rrdset.c index 57f962cd..2843bb33 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -158,7 +158,7 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v STORAGE_ENGINE *eng = st->rrdhost->db[tier].eng; if(!eng) continue; - st->storage_metrics_groups[tier] = eng->api.collect_ops.metrics_group_get(host->db[tier].instance, &st->chart_uuid); + st->storage_metrics_groups[tier] = storage_engine_metrics_group_get(eng->backend, host->db[tier].instance, &st->chart_uuid); } } @@ -184,6 +184,8 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v ml_chart_new(st); } +void pluginsd_rrdset_cleanup(RRDSET *st); + void rrdset_finalize_collection(RRDSET *st, bool dimensions_too) { RRDHOST *host = st->rrdhost; @@ -201,10 +203,12 @@ void rrdset_finalize_collection(RRDSET *st, bool dimensions_too) { if(!eng) continue; if(st->storage_metrics_groups[tier]) { - eng->api.collect_ops.metrics_group_release(host->db[tier].instance, st->storage_metrics_groups[tier]); + storage_engine_metrics_group_release(eng->backend, host->db[tier].instance, st->storage_metrics_groups[tier]); st->storage_metrics_groups[tier] = NULL; } } + + pluginsd_rrdset_cleanup(st); } // the destructor - the dictionary is write locked while this runs @@ -475,6 +479,27 @@ inline RRDSET *rrdset_find_byname(RRDHOST *host, const char *name) { return(st); } +RRDSET_ACQUIRED *rrdset_find_and_acquire(RRDHOST *host, const char *id) { + debug(D_RRD_CALLS, "rrdset_find_and_acquire() for host %s, chart %s", rrdhost_hostname(host), id); + + return (RRDSET_ACQUIRED *)dictionary_get_and_acquire_item(host->rrdset_root_index, id); +} + +RRDSET *rrdset_acquired_to_rrdset(RRDSET_ACQUIRED *rsa) { + if(unlikely(!rsa)) + return NULL; + + return (RRDSET *) dictionary_acquired_item_value((const DICTIONARY_ITEM *)rsa); +} + +void rrdset_acquired_release(RRDSET_ACQUIRED *rsa) { + if(unlikely(!rsa)) + return; + + RRDSET *rs = rrdset_acquired_to_rrdset(rsa); + dictionary_acquired_item_release(rs->rrdhost->rrdset_root_index, (const DICTIONARY_ITEM *)rsa); +} + // ---------------------------------------------------------------------------- // RRDSET - rename charts @@ -737,10 +762,8 @@ void rrdset_reset(RRDSET *st) { rd->collections_counter = 0; if(!rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { - for(size_t tier = 0; tier < storage_tiers ;tier++) { - if(rd->tiers[tier].db_collection_handle) - rd->tiers[tier].collect_ops->flush(rd->tiers[tier].db_collection_handle); - } + for(size_t tier = 0; tier < storage_tiers ;tier++) + storage_engine_store_flush(rd->tiers[tier].db_collection_handle); } } rrddim_foreach_done(rd); @@ -1116,7 +1139,7 @@ void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAG if (likely(!storage_point_is_unset(t->virtual_point))) { - t->collect_ops->store_metric( + storage_engine_store_metric( t->db_collection_handle, t->next_point_end_time_s * USEC_PER_SEC, t->virtual_point.sum, @@ -1127,7 +1150,7 @@ void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAG t->virtual_point.flags); } else { - t->collect_ops->store_metric( + storage_engine_store_metric( t->db_collection_handle, t->next_point_end_time_s * USEC_PER_SEC, NAN, @@ -1199,7 +1222,10 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, #endif // NETDATA_LOG_COLLECTION_ERRORS // store the metric on tier 0 - rd->tiers[0].collect_ops->store_metric(rd->tiers[0].db_collection_handle, point_end_time_ut, n, 0, 0, 1, 0, flags); + storage_engine_store_metric(rd->tiers[0].db_collection_handle, point_end_time_ut, + n, 0, 0, + 1, 0, flags); + rrdset_done_statistics_points_stored_per_tier[0]++; time_t now_s = (time_t)(point_end_time_ut / USEC_PER_SEC); @@ -1229,6 +1255,8 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, store_metric_at_tier(rd, tier, t, sp, point_end_time_ut); } + + rrdcontext_collected_rrddim(rd); } void store_metric_collection_completed() { @@ -1269,7 +1297,8 @@ void rrdset_thread_rda_free(void) { } static inline size_t rrdset_done_interpolate( - RRDSET *st + RRDSET_STREAM_BUFFER *rsb + , RRDSET *st , struct rda_item *rda_base , size_t rda_slots , usec_t update_every_ut @@ -1399,32 +1428,38 @@ static inline size_t rrdset_done_interpolate( time_t current_time_s = (time_t) (next_store_ut / USEC_PER_SEC); if(unlikely(!store_this_entry)) { - (void) ml_is_anomalous(rd, current_time_s, 0, false); + (void) ml_dimension_is_anomalous(rd, current_time_s, 0, false); + + if(rsb->wb && rsb->v2) + rrddim_push_metrics_v2(rsb, rd, next_store_ut, NAN, SN_FLAG_NONE); rrddim_store_metric(rd, next_store_ut, NAN, SN_FLAG_NONE); - rrdcontext_collected_rrddim(rd); continue; } if(likely(rd->updated && rd->collections_counter > 1 && iterations < gap_when_lost_iterations_above)) { uint32_t dim_storage_flags = storage_flags; - if (ml_is_anomalous(rd, current_time_s, new_value, true)) { + if (ml_dimension_is_anomalous(rd, current_time_s, new_value, true)) { // clear anomaly bit: 0 -> is anomalous, 1 -> not anomalous dim_storage_flags &= ~((storage_number)SN_FLAG_NOT_ANOMALOUS); } + if(rsb->wb && rsb->v2) + rrddim_push_metrics_v2(rsb, rd, next_store_ut, new_value, dim_storage_flags); + rrddim_store_metric(rd, next_store_ut, new_value, dim_storage_flags); - rrdcontext_collected_rrddim(rd); rd->last_stored_value = new_value; } else { - (void) ml_is_anomalous(rd, current_time_s, 0, false); + (void) ml_dimension_is_anomalous(rd, current_time_s, 0, false); rrdset_debug(st, "%s: STORE[%ld] = NON EXISTING ", rrddim_name(rd), current_entry); + if(rsb->wb && rsb->v2) + rrddim_push_metrics_v2(rsb, rd, next_store_ut, NAN, SN_FLAG_NONE); + rrddim_store_metric(rd, next_store_ut, NAN, SN_FLAG_NONE); - rrdcontext_collected_rrddim(rd); rd->last_stored_value = NAN; } @@ -1468,6 +1503,10 @@ void rrdset_done(RRDSET *st) { void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) { if(unlikely(!service_running(SERVICE_COLLECTORS))) return; + RRDSET_STREAM_BUFFER stream_buffer = { .wb = NULL, }; + if(unlikely(rrdhost_has_rrdpush_sender_enabled(st->rrdhost))) + stream_buffer = rrdset_push_metric_initialize(st, now.tv_sec); + netdata_spinlock_lock(&st->data_collection_lock); if (pending_rrdset_next) @@ -1489,10 +1528,10 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) update_every_ut = st->update_every * USEC_PER_SEC; // st->update_every in microseconds RRDSET_FLAGS rrdset_flags = rrdset_flag_check(st, ~0); - if(unlikely(rrdset_flags & RRDSET_FLAG_COLLECTION_FINISHED)) + if(unlikely(rrdset_flags & RRDSET_FLAG_COLLECTION_FINISHED)) { + netdata_spinlock_unlock(&st->data_collection_lock); return; - - netdata_thread_disable_cancelability(); + } if (unlikely(rrdset_flags & RRDSET_FLAG_OBSOLETE)) { error("Chart '%s' has the OBSOLETE flag set, but it is collected.", rrdset_id(st)); @@ -1500,8 +1539,7 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) } // check if the chart has a long time to be updated - if(unlikely(st->usec_since_last_update > st->entries * update_every_ut && - st->rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE && st->rrd_memory_mode != RRD_MEMORY_MODE_NONE)) { + if(unlikely(st->usec_since_last_update > MAX(st->entries, 60) * update_every_ut)) { info("host '%s', chart '%s': took too long to be updated (counter #%zu, update #%zu, %0.3" NETDATA_DOUBLE_MODIFIER " secs). Resetting it.", rrdhost_hostname(st->rrdhost), rrdset_id(st), st->counter, st->counter_done, (NETDATA_DOUBLE)st->usec_since_last_update / USEC_PER_SEC); rrdset_reset(st); @@ -1527,9 +1565,6 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) // calculate the proper last_collected_time, using usec_since_last_update last_collect_ut = rrdset_update_last_collected_time(st); } - if (unlikely(st->rrd_memory_mode == RRD_MEMORY_MODE_NONE)) { - goto after_first_database_work; - } // if this set has not been updated in the past // we fake the last_update time to be = now - usec_since_last_update @@ -1592,11 +1627,10 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) } } -after_first_database_work: st->counter_done++; - if(unlikely(rrdhost_has_rrdpush_sender_enabled(st->rrdhost))) - rrdset_done_push(st); + if(stream_buffer.wb && !stream_buffer.v2) + rrdset_push_metrics_v1(&stream_buffer, st); uint32_t has_reset_value = 0; @@ -1654,9 +1688,6 @@ after_first_database_work: rrddim_foreach_done(rd); rda_slots = dimensions; - if (unlikely(st->rrd_memory_mode == RRD_MEMORY_MODE_NONE)) - goto after_second_database_work; - rrdset_debug(st, "last_collect_ut = %0.3" NETDATA_DOUBLE_MODIFIER " (last collection time)", (NETDATA_DOUBLE)last_collect_ut/USEC_PER_SEC); rrdset_debug(st, "now_collect_ut = %0.3" NETDATA_DOUBLE_MODIFIER " (current collection time)", (NETDATA_DOUBLE)now_collect_ut/USEC_PER_SEC); rrdset_debug(st, "last_stored_ut = %0.3" NETDATA_DOUBLE_MODIFIER " (last updated time)", (NETDATA_DOUBLE)last_stored_ut/USEC_PER_SEC); @@ -1857,7 +1888,8 @@ after_first_database_work: // #endif rrdset_done_interpolate( - st + &stream_buffer + , st , rda_base , rda_slots , update_every_ut @@ -1869,7 +1901,6 @@ after_first_database_work: , has_reset_value ); -after_second_database_work: for(dim_id = 0, rda = rda_base ; dim_id < rda_slots ; ++dim_id, ++rda) { rd = rda->rd; if(unlikely(!rd)) continue; @@ -1928,6 +1959,7 @@ after_second_database_work: } netdata_spinlock_unlock(&st->data_collection_lock); + rrdset_push_metrics_finished(&stream_buffer, st); // ALL DONE ABOUT THE DATA UPDATE // -------------------------------------------------------------------- @@ -1955,8 +1987,6 @@ after_second_database_work: rrdcontext_collected_rrdset(st); - netdata_thread_enable_cancelability(); - store_metric_collection_completed(); } @@ -1965,18 +1995,20 @@ time_t rrdset_set_update_every_s(RRDSET *st, time_t update_every_s) { internal_error(true, "RRDSET '%s' switching update every from %d to %d", rrdset_id(st), (int)st->update_every, (int)update_every_s); - time_t prev_update_every_s = st->update_every; - st->update_every = update_every_s; + time_t prev_update_every_s = (time_t) st->update_every; + st->update_every = (int) update_every_s; // switch update every to the storage engine RRDDIM *rd; rrddim_foreach_read(rd, st) { for (size_t tier = 0; tier < storage_tiers; tier++) { if (rd->tiers[tier].db_collection_handle) - rd->tiers[tier].collect_ops->change_collection_frequency(rd->tiers[tier].db_collection_handle, (int)(st->rrdhost->db[tier].tier_grouping * st->update_every)); + storage_engine_store_change_collection_frequency( + rd->tiers[tier].db_collection_handle, + (int)(st->rrdhost->db[tier].tier_grouping * st->update_every)); } - assert(rd->update_every == prev_update_every_s && + assert(rd->update_every == (int) prev_update_every_s && "chart's update every differs from the update every of its dimensions"); rd->update_every = st->update_every; } diff --git a/database/rrdvar.c b/database/rrdvar.c index 72decbd4..914a5d6e 100644 --- a/database/rrdvar.c +++ b/database/rrdvar.c @@ -93,6 +93,15 @@ DICTIONARY *rrdvariables_create(void) { return dict; } +DICTIONARY *health_rrdvariables_create(void) { + DICTIONARY *dict = dictionary_create_advanced(DICT_OPTION_NONE, &dictionary_stats_category_rrdhealth, 0); + + dictionary_register_insert_callback(dict, rrdvar_insert_callback, NULL); + dictionary_register_delete_callback(dict, rrdvar_delete_callback, NULL); + + return dict; +} + void rrdvariables_destroy(DICTIONARY *dict) { dictionary_destroy(dict); } @@ -124,6 +133,19 @@ inline const RRDVAR_ACQUIRED *rrdvar_add_and_acquire(const char *scope __maybe_u return (const RRDVAR_ACQUIRED *)dictionary_set_and_acquire_item_advanced(dict, string2str(name), (ssize_t)string_strlen(name) + 1, NULL, sizeof(RRDVAR), &tmp); } +inline void rrdvar_add(const char *scope __maybe_unused, DICTIONARY *dict, STRING *name, RRDVAR_TYPE type, RRDVAR_FLAGS options, void *value) { + if(unlikely(!dict || !name)) return; + + struct rrdvar_constructor tmp = { + .name = name, + .value = value, + .type = type, + .options = options, + .react_action = RRDVAR_REACT_NONE, + }; + dictionary_set_advanced(dict, string2str(name), (ssize_t)string_strlen(name) + 1, NULL, sizeof(RRDVAR), &tmp); +} + void rrdvar_delete_all(DICTIONARY *dict) { dictionary_flush(dict); } @@ -211,6 +233,52 @@ NETDATA_DOUBLE rrdvar2number(const RRDVAR_ACQUIRED *rva) { } } +int health_variable_check(DICTIONARY *dict, RRDSET *st, RRDDIM *rd) { + if (!dict || !st || !rd) return 0; + + STRING *helper_str; + char helper[RRDVAR_MAX_LENGTH + 1]; + snprintfz(helper, RRDVAR_MAX_LENGTH, "%s.%s", string2str(st->name), string2str(rd->name)); + helper_str = string_strdupz(helper); + + const RRDVAR_ACQUIRED *rva; + rva = rrdvar_get_and_acquire(dict, helper_str); + if(rva) { + dictionary_acquired_item_release(dict, (const DICTIONARY_ITEM *)rva); + string_freez(helper_str); + return 1; + } + + string_freez(helper_str); + + return 0; +} + +void rrdvar_store_for_chart(RRDHOST *host, RRDSET *st) { + if (!st) return; + + if(!st->rrdfamily) + st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st)); + + if(!st->rrdvars) + st->rrdvars = rrdvariables_create(); + + rrddimvar_index_init(st); + + rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE); + + RRDDIM *rd; + rrddim_foreach_read(rd, st) { + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE); + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE); + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); + } + rrddim_foreach_done(rd); +} + int health_variable_lookup(STRING *variable, RRDCALC *rc, NETDATA_DOUBLE *result) { RRDSET *st = rc->rrdset; if(!st) return 0; diff --git a/database/rrdvar.h b/database/rrdvar.h index a511c732..3b6e9cb9 100644 --- a/database/rrdvar.h +++ b/database/rrdvar.h @@ -26,6 +26,7 @@ typedef enum rrdvar_options { RRDVAR_FLAG_RRDCALC_FAMILY_VAR = (1 << 4), // this is a an alarm variable, attached to a family RRDVAR_FLAG_RRDCALC_HOST_CHARTID_VAR = (1 << 5), // this is a an alarm variable, attached to the host, using the chart id RRDVAR_FLAG_RRDCALC_HOST_CHARTNAME_VAR = (1 << 6), // this is a an alarm variable, attached to the host, using the chart name + RRDVAR_FLAG_CONFIG_VAR = (1 << 7), // this is a an alarm variable, read from alarm config // this is 24 bit // to increase it you have to set change the bitfield in @@ -47,6 +48,7 @@ int rrdvar_fix_name(char *variable); STRING *rrdvar_name_to_string(const char *name); const RRDVAR_ACQUIRED *rrdvar_custom_host_variable_add_and_acquire(RRDHOST *host, const char *name); +void rrdvar_add(const char *scope __maybe_unused, DICTIONARY *dict, STRING *name, RRDVAR_TYPE type, RRDVAR_FLAGS options, void *value); void rrdvar_custom_host_variable_set(RRDHOST *host, const RRDVAR_ACQUIRED *rva, NETDATA_DOUBLE value); int rrdvar_walkthrough_read(DICTIONARY *dict, int (*callback)(const DICTIONARY_ITEM *item, void *rrdvar, void *data), void *data); @@ -60,8 +62,12 @@ const RRDVAR_ACQUIRED *rrdvar_add_and_acquire(const char *scope, DICTIONARY *dic void rrdvar_release_and_del(DICTIONARY *dict, const RRDVAR_ACQUIRED *rva); DICTIONARY *rrdvariables_create(void); +DICTIONARY *health_rrdvariables_create(void); void rrdvariables_destroy(DICTIONARY *dict); +void rrdvar_store_for_chart(RRDHOST *host, RRDSET *st); +int health_variable_check(DICTIONARY *dict, RRDSET *st, RRDDIM *rd); + void rrdvar_delete_all(DICTIONARY *dict); const char *rrdvar_name(const RRDVAR_ACQUIRED *rva); diff --git a/database/sqlite/sqlite_aclk.c b/database/sqlite/sqlite_aclk.c index 3b0c4052..a33e09f5 100644 --- a/database/sqlite/sqlite_aclk.c +++ b/database/sqlite/sqlite_aclk.c @@ -5,58 +5,176 @@ #include "sqlite_aclk_node.h" +struct aclk_sync_config_s { + uv_thread_t thread; + uv_loop_t loop; + uv_timer_t timer_req; + time_t cleanup_after; // Start a cleanup after this timestamp + uv_async_t async; + /* FIFO command queue */ + uv_mutex_t cmd_mutex; + uv_cond_t cmd_cond; + bool initialized; + volatile unsigned queue_size; + struct aclk_database_cmdqueue cmd_queue; +} aclk_sync_config = { 0 }; + + void sanity_check(void) { // make sure the compiler will stop on misconfigurations BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < ACLK_MAX_ENUMERATIONS_DEFINED); } -static int sql_check_aclk_table(void *data, int argc, char **argv, char **column) + +int aclk_database_enq_cmd_noblock(struct aclk_database_cmd *cmd) { - struct aclk_database_worker_config *wc = data; - UNUSED(argc); - UNUSED(column); + unsigned queue_size; - debug(D_ACLK_SYNC,"Scheduling aclk sync table check for node %s", (char *) argv[0]); - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_DELETE_HOST; - cmd.data = strdupz((char *) argv[0]); - aclk_database_enq_cmd_noblock(wc, &cmd); + /* wait for free space in queue */ + uv_mutex_lock(&aclk_sync_config.cmd_mutex); + if ((queue_size = aclk_sync_config.queue_size) == ACLK_DATABASE_CMD_Q_MAX_SIZE) { + uv_mutex_unlock(&aclk_sync_config.cmd_mutex); + return 1; + } + + fatal_assert(queue_size < ACLK_DATABASE_CMD_Q_MAX_SIZE); + /* enqueue command */ + aclk_sync_config.cmd_queue.cmd_array[aclk_sync_config.cmd_queue.tail] = *cmd; + aclk_sync_config.cmd_queue.tail = aclk_sync_config.cmd_queue.tail != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? + aclk_sync_config.cmd_queue.tail + 1 : 0; + aclk_sync_config.queue_size = queue_size + 1; + uv_mutex_unlock(&aclk_sync_config.cmd_mutex); return 0; } -#define SQL_SELECT_ACLK_ACTIVE_LIST "SELECT REPLACE(SUBSTR(name,19),'_','-') FROM sqlite_schema " \ - "WHERE name LIKE 'aclk_chart_latest_%' AND type IN ('table');" - -static void sql_check_aclk_table_list(struct aclk_database_worker_config *wc) +static void aclk_database_enq_cmd(struct aclk_database_cmd *cmd) { - char *err_msg = NULL; - debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist"); - int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ACTIVE_LIST, sql_check_aclk_table, (void *) wc, &err_msg); - if (rc != SQLITE_OK) { - error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg); - sqlite3_free(err_msg); + unsigned queue_size; + + /* wait for free space in queue */ + uv_mutex_lock(&aclk_sync_config.cmd_mutex); + while ((queue_size = aclk_sync_config.queue_size) == ACLK_DATABASE_CMD_Q_MAX_SIZE) { + uv_cond_wait(&aclk_sync_config.cmd_cond, &aclk_sync_config.cmd_mutex); } + fatal_assert(queue_size < ACLK_DATABASE_CMD_Q_MAX_SIZE); + /* enqueue command */ + aclk_sync_config.cmd_queue.cmd_array[aclk_sync_config.cmd_queue.tail] = *cmd; + aclk_sync_config.cmd_queue.tail = aclk_sync_config.cmd_queue.tail != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? + aclk_sync_config.cmd_queue.tail + 1 : 0; + aclk_sync_config.queue_size = queue_size + 1; + uv_mutex_unlock(&aclk_sync_config.cmd_mutex); + + /* wake up event loop */ + int rc = uv_async_send(&aclk_sync_config.async); + if (unlikely(rc)) + debug(D_ACLK_SYNC, "Failed to wake up event loop"); } -static void sql_maint_aclk_sync_database(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +enum { + IDX_HOST_ID, + IDX_HOSTNAME, + IDX_REGISTRY, + IDX_UPDATE_EVERY, + IDX_OS, + IDX_TIMEZONE, + IDX_TAGS, + IDX_HOPS, + IDX_MEMORY_MODE, + IDX_ABBREV_TIMEZONE, + IDX_UTC_OFFSET, + IDX_PROGRAM_NAME, + IDX_PROGRAM_VERSION, + IDX_ENTRIES, + IDX_HEALTH_ENABLED, +}; + +static int create_host_callback(void *data, int argc, char **argv, char **column) { - UNUSED(cmd); + int *number_of_chidren = data; + UNUSED(argc); + UNUSED(column); - debug(D_ACLK, "Checking database for %s", wc->host_guid); + char guid[UUID_STR_LEN]; + uuid_unparse_lower(*(uuid_t *)argv[IDX_HOST_ID], guid); - BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite); + struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info)); + __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED); - buffer_sprintf(sql,"DELETE FROM aclk_alert_%s WHERE date_submitted IS NOT NULL AND " - "CAST(date_cloud_ack AS INT) < unixepoch()-%d;", wc->uuid_str, ACLK_DELETE_ACK_ALERTS_INTERNAL); - db_execute(buffer_tostring(sql)); + system_info->hops = str2i((const char *) argv[IDX_HOPS]); - buffer_free(sql); + sql_build_host_system_info((uuid_t *)argv[IDX_HOST_ID], system_info); + + RRDHOST *host = rrdhost_find_or_create( + (const char *) argv[IDX_HOSTNAME] + , (const char *) argv[IDX_REGISTRY] + , guid + , (const char *) argv[IDX_OS] + , (const char *) argv[IDX_TIMEZONE] + , (const char *) argv[IDX_ABBREV_TIMEZONE] + , (int32_t) (argv[IDX_UTC_OFFSET] ? str2uint32_t(argv[IDX_UTC_OFFSET], NULL) : 0) + , (const char *) argv[IDX_TAGS] + , (const char *) (argv[IDX_PROGRAM_NAME] ? argv[IDX_PROGRAM_NAME] : "unknown") + , (const char *) (argv[IDX_PROGRAM_VERSION] ? argv[IDX_PROGRAM_VERSION] : "unknown") + , argv[IDX_UPDATE_EVERY] ? str2i(argv[IDX_UPDATE_EVERY]) : 1 + , argv[IDX_ENTRIES] ? str2i(argv[IDX_ENTRIES]) : 0 + , default_rrd_memory_mode + , 0 // health + , 0 // rrdpush enabled + , NULL //destination + , NULL // api key + , NULL // send charts matching + , false // rrdpush_enable_replication + , 0 // rrdpush_seconds_to_replicate + , 0 // rrdpush_replication_step + , system_info + , 1 + ); + if (likely(host)) + host->rrdlabels = sql_load_host_labels((uuid_t *)argv[IDX_HOST_ID]); + + (*number_of_chidren)++; + +#ifdef NETDATA_INTERNAL_CHECKS + char node_str[UUID_STR_LEN] = "<none>"; + if (likely(host->node_id)) + uuid_unparse_lower(*host->node_id, node_str); + internal_error(true, "Adding archived host \"%s\" with GUID \"%s\" node id = \"%s\"", rrdhost_hostname(host), host->machine_guid, node_str); +#endif + return 0; } +#ifdef ENABLE_ACLK +static struct aclk_database_cmd aclk_database_deq_cmd(void) +{ + struct aclk_database_cmd ret; + unsigned queue_size; -#define SQL_SELECT_HOST_BY_UUID "SELECT host_id FROM host WHERE host_id = @host_id;" + uv_mutex_lock(&aclk_sync_config.cmd_mutex); + queue_size = aclk_sync_config.queue_size; + if (queue_size == 0) { + memset(&ret, 0, sizeof(ret)); + ret.opcode = ACLK_DATABASE_NOOP; + ret.completion = NULL; + + } else { + /* dequeue command */ + ret = aclk_sync_config.cmd_queue.cmd_array[aclk_sync_config.cmd_queue.head]; + if (queue_size == 1) { + aclk_sync_config.cmd_queue.head = aclk_sync_config.cmd_queue.tail = 0; + } else { + aclk_sync_config.cmd_queue.head = aclk_sync_config.cmd_queue.head != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? + aclk_sync_config.cmd_queue.head + 1 : 0; + } + aclk_sync_config.queue_size = queue_size - 1; + /* wake up producers */ + uv_cond_signal(&aclk_sync_config.cmd_cond); + } + uv_mutex_unlock(&aclk_sync_config.cmd_mutex); + return ret; +} + +#define SQL_SELECT_HOST_BY_UUID "SELECT host_id FROM host WHERE host_id = @host_id;" static int is_host_available(uuid_t *host_id) { sqlite3_stmt *res = NULL; @@ -76,7 +194,7 @@ static int is_host_available(uuid_t *host_id) rc = sqlite3_bind_blob(res, 1, host_id, sizeof(*host_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id parameter to select node instance information"); + error_report("Failed to bind host_id parameter to check host existence"); goto failed; } rc = sqlite3_step_monitored(res); @@ -89,15 +207,13 @@ failed: } // OPCODE: ACLK_DATABASE_DELETE_HOST -void sql_delete_aclk_table_list(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +static void sql_delete_aclk_table_list(char *host_guid) { - UNUSED(wc); - char uuid_str[GUID_LEN + 1]; - char host_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; + char host_str[UUID_STR_LEN]; int rc; uuid_t host_uuid; - char *host_guid = (char *)cmd.data; if (unlikely(!host_guid)) return; @@ -139,273 +255,67 @@ void sql_delete_aclk_table_list(struct aclk_database_worker_config *wc, struct a if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize statement to clean up aclk tables, rc = %d", rc); - db_execute(buffer_tostring(sql)); + rc = db_execute(db_meta, buffer_tostring(sql)); + if (unlikely(rc)) + error("Failed to drop unused ACLK tables"); fail: buffer_free(sql); } -uv_mutex_t aclk_async_lock; -struct aclk_database_worker_config *aclk_thread_head = NULL; - -int claimed() +static int sql_check_aclk_table(void *data __maybe_unused, int argc __maybe_unused, char **argv __maybe_unused, char **column __maybe_unused) { - int rc; - rrdhost_aclk_state_lock(localhost); - rc = (localhost->aclk_state.claimed_id != NULL); - rrdhost_aclk_state_unlock(localhost); - return rc; -} - -void aclk_add_worker_thread(struct aclk_database_worker_config *wc) -{ - if (unlikely(!wc)) - return; - - uv_mutex_lock(&aclk_async_lock); - if (unlikely(!wc->host)) { - wc->next = aclk_thread_head; - aclk_thread_head = wc; - } - uv_mutex_unlock(&aclk_async_lock); + debug(D_ACLK_SYNC,"Scheduling aclk sync table check for node %s", (char *) argv[0]); + struct aclk_database_cmd cmd; + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = ACLK_DATABASE_DELETE_HOST; + cmd.param[0] = strdupz((char *) argv[0]); + aclk_database_enq_cmd_noblock(&cmd); + return 0; } -void aclk_del_worker_thread(struct aclk_database_worker_config *wc) -{ - if (unlikely(!wc)) - return; - - uv_mutex_lock(&aclk_async_lock); - struct aclk_database_worker_config **tmp = &aclk_thread_head; - while (*tmp && (*tmp) != wc) - tmp = &(*tmp)->next; - if (*tmp) - *tmp = wc->next; - uv_mutex_unlock(&aclk_async_lock); -} +#define SQL_SELECT_ACLK_ACTIVE_LIST "SELECT REPLACE(SUBSTR(name,19),'_','-') FROM sqlite_schema " \ + "WHERE name LIKE 'aclk_chart_latest_%' AND type IN ('table');" -int aclk_worker_thread_exists(char *guid) +static void sql_check_aclk_table_list(void) { - int rc = 0; - uv_mutex_lock(&aclk_async_lock); - - struct aclk_database_worker_config *tmp = aclk_thread_head; - - while (tmp && !rc) { - rc = strcmp(tmp->uuid_str, guid) == 0; - tmp = tmp->next; + char *err_msg = NULL; + debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist"); + int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ACTIVE_LIST, sql_check_aclk_table, NULL, &err_msg); + if (rc != SQLITE_OK) { + error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg); + sqlite3_free(err_msg); } - uv_mutex_unlock(&aclk_async_lock); - return rc; } -void aclk_database_init_cmd_queue(struct aclk_database_worker_config *wc) -{ - wc->cmd_queue.head = wc->cmd_queue.tail = 0; - wc->queue_size = 0; - fatal_assert(0 == uv_cond_init(&wc->cmd_cond)); - fatal_assert(0 == uv_mutex_init(&wc->cmd_mutex)); -} +#define SQL_ALERT_CLEANUP "DELETE FROM aclk_alert_%s WHERE date_submitted IS NOT NULL AND CAST(date_cloud_ack AS INT) < unixepoch()-%d;" -int aclk_database_enq_cmd_noblock(struct aclk_database_worker_config *wc, struct aclk_database_cmd *cmd) +static int sql_maint_aclk_sync_database(void *data __maybe_unused, int argc __maybe_unused, char **argv, char **column __maybe_unused) { - unsigned queue_size; - - /* wait for free space in queue */ - uv_mutex_lock(&wc->cmd_mutex); - if ((queue_size = wc->queue_size) == ACLK_DATABASE_CMD_Q_MAX_SIZE || wc->is_shutting_down) { - uv_mutex_unlock(&wc->cmd_mutex); - return 1; - } - - fatal_assert(queue_size < ACLK_DATABASE_CMD_Q_MAX_SIZE); - /* enqueue command */ - wc->cmd_queue.cmd_array[wc->cmd_queue.tail] = *cmd; - wc->cmd_queue.tail = wc->cmd_queue.tail != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? - wc->cmd_queue.tail + 1 : 0; - wc->queue_size = queue_size + 1; - uv_mutex_unlock(&wc->cmd_mutex); + char sql[512]; + snprintfz(sql,511, SQL_ALERT_CLEANUP, (char *) argv[0], ACLK_DELETE_ACK_ALERTS_INTERNAL); + if (unlikely(db_execute(db_meta, sql))) + error_report("Failed to clean stale ACLK alert entries"); return 0; } -void aclk_database_enq_cmd(struct aclk_database_worker_config *wc, struct aclk_database_cmd *cmd) -{ - unsigned queue_size; - - /* wait for free space in queue */ - uv_mutex_lock(&wc->cmd_mutex); - if (wc->is_shutting_down) { - uv_mutex_unlock(&wc->cmd_mutex); - return; - } - while ((queue_size = wc->queue_size) == ACLK_DATABASE_CMD_Q_MAX_SIZE) { - uv_cond_wait(&wc->cmd_cond, &wc->cmd_mutex); - } - fatal_assert(queue_size < ACLK_DATABASE_CMD_Q_MAX_SIZE); - /* enqueue command */ - wc->cmd_queue.cmd_array[wc->cmd_queue.tail] = *cmd; - wc->cmd_queue.tail = wc->cmd_queue.tail != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? - wc->cmd_queue.tail + 1 : 0; - wc->queue_size = queue_size + 1; - uv_mutex_unlock(&wc->cmd_mutex); - - /* wake up event loop */ - int rc = uv_async_send(&wc->async); - if (unlikely(rc)) - debug(D_ACLK_SYNC, "Failed to wake up event loop"); -} - -struct aclk_database_cmd aclk_database_deq_cmd(struct aclk_database_worker_config* wc) -{ - struct aclk_database_cmd ret; - unsigned queue_size; - uv_mutex_lock(&wc->cmd_mutex); - queue_size = wc->queue_size; - if (queue_size == 0 || wc->is_shutting_down) { - memset(&ret, 0, sizeof(ret)); - ret.opcode = ACLK_DATABASE_NOOP; - ret.completion = NULL; - if (wc->is_shutting_down) - uv_cond_signal(&wc->cmd_cond); - } else { - /* dequeue command */ - ret = wc->cmd_queue.cmd_array[wc->cmd_queue.head]; - if (queue_size == 1) { - wc->cmd_queue.head = wc->cmd_queue.tail = 0; - } else { - wc->cmd_queue.head = wc->cmd_queue.head != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? - wc->cmd_queue.head + 1 : 0; - } - wc->queue_size = queue_size - 1; - /* wake up producers */ - uv_cond_signal(&wc->cmd_cond); - } - uv_mutex_unlock(&wc->cmd_mutex); - - return ret; -} +#define SQL_SELECT_ACLK_ALERT_LIST "SELECT SUBSTR(name,12) FROM sqlite_schema WHERE name LIKE 'aclk_alert_%' AND type IN ('table');" -struct aclk_database_worker_config *find_inactive_wc_by_node_id(char *node_id) +static void sql_maint_aclk_sync_database_all(void) { - if (unlikely(!node_id)) - return NULL; - - uv_mutex_lock(&aclk_async_lock); - struct aclk_database_worker_config *wc = aclk_thread_head; - - while (wc) { - if (!strcmp(wc->node_id, node_id)) - break; - wc = wc->next; + char *err_msg = NULL; + debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist"); + int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ALERT_LIST, sql_maint_aclk_sync_database, NULL, &err_msg); + if (rc != SQLITE_OK) { + error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg); + sqlite3_free(err_msg); } - uv_mutex_unlock(&aclk_async_lock); - - return (wc); } -void aclk_sync_exit_all() -{ - rrd_rdlock(); - RRDHOST *host; - rrdhost_foreach_read(host) { - struct aclk_database_worker_config *wc = host->dbsync_worker; - if (wc) { - wc->is_shutting_down = 1; - (void) aclk_database_deq_cmd(wc); - uv_cond_signal(&wc->cmd_cond); - } - } - rrd_unlock(); - - uv_mutex_lock(&aclk_async_lock); - struct aclk_database_worker_config *wc = aclk_thread_head; - while (wc) { - wc->is_shutting_down = 1; - wc = wc->next; - } - uv_mutex_unlock(&aclk_async_lock); -} - -enum { - IDX_HOST_ID, - IDX_HOSTNAME, - IDX_REGISTRY, - IDX_UPDATE_EVERY, - IDX_OS, - IDX_TIMEZONE, - IDX_TAGS, - IDX_HOPS, - IDX_MEMORY_MODE, - IDX_ABBREV_TIMEZONE, - IDX_UTC_OFFSET, - IDX_PROGRAM_NAME, - IDX_PROGRAM_VERSION, - IDX_ENTRIES, - IDX_HEALTH_ENABLED, -}; - -static int create_host_callback(void *data, int argc, char **argv, char **column) -{ - UNUSED(data); - UNUSED(argc); - UNUSED(column); - - char guid[UUID_STR_LEN]; - uuid_unparse_lower(*(uuid_t *)argv[IDX_HOST_ID], guid); - - struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info)); - __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED); - - system_info->hops = str2i((const char *) argv[IDX_HOPS]); - - sql_build_host_system_info((uuid_t *)argv[IDX_HOST_ID], system_info); - - RRDHOST *host = rrdhost_find_or_create( - (const char *) argv[IDX_HOSTNAME] - , (const char *) argv[IDX_REGISTRY] - , guid - , (const char *) argv[IDX_OS] - , (const char *) argv[IDX_TIMEZONE] - , (const char *) argv[IDX_ABBREV_TIMEZONE] - , argv[IDX_UTC_OFFSET] ? str2uint32_t(argv[IDX_UTC_OFFSET]) : 0 - , (const char *) argv[IDX_TAGS] - , (const char *) (argv[IDX_PROGRAM_NAME] ? argv[IDX_PROGRAM_NAME] : "unknown") - , (const char *) (argv[IDX_PROGRAM_VERSION] ? argv[IDX_PROGRAM_VERSION] : "unknown") - , argv[3] ? str2i(argv[IDX_UPDATE_EVERY]) : 1 - , argv[13] ? str2i(argv[IDX_ENTRIES]) : 0 - , default_rrd_memory_mode - , 0 // health - , 0 // rrdpush enabled - , NULL //destination - , NULL // api key - , NULL // send charts matching - , false // rrdpush_enable_replication - , 0 // rrdpush_seconds_to_replicate - , 0 // rrdpush_replication_step - , system_info - , 1 - ); - if (likely(host)) - host->rrdlabels = sql_load_host_labels((uuid_t *)argv[IDX_HOST_ID]); - -#ifdef NETDATA_INTERNAL_CHECKS - char node_str[UUID_STR_LEN] = "<none>"; - if (likely(host->node_id)) - uuid_unparse_lower(*host->node_id, node_str); - internal_error(true, "Adding archived host \"%s\" with GUID \"%s\" node id = \"%s\"", rrdhost_hostname(host), host->machine_guid, node_str); -#endif - return 0; -} - -#ifdef ENABLE_ACLK -static int aclk_start_sync_thread(void *data, int argc, char **argv, char **column) +static int aclk_config_parameters(void *data __maybe_unused, int argc __maybe_unused, char **argv, char **column __maybe_unused) { char uuid_str[GUID_LEN + 1]; - UNUSED(data); - UNUSED(argc); - UNUSED(column); - uuid_unparse_lower(*((uuid_t *) argv[0]), uuid_str); RRDHOST *host = rrdhost_find_by_guid(uuid_str); @@ -415,156 +325,81 @@ static int aclk_start_sync_thread(void *data, int argc, char **argv, char **colu sql_create_aclk_table(host, (uuid_t *) argv[0], (uuid_t *) argv[1]); return 0; } -#endif -void sql_aclk_sync_init(void) -{ - char *err_msg = NULL; - int rc; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) { - return; - } - error_report("Database has not been initialized"); - return; - } - - info("Creating archived hosts"); - rc = sqlite3_exec_monitored(db_meta, "SELECT host_id, hostname, registry_hostname, update_every, os, " - "timezone, tags, hops, memory_mode, abbrev_timezone, utc_offset, program_name, " - "program_version, entries, health_enabled FROM host WHERE hops >0;", - create_host_callback, NULL, &err_msg); - if (rc != SQLITE_OK) { - error_report("SQLite error when loading archived hosts, rc = %d (%s)", rc, err_msg); - sqlite3_free(err_msg); - } - -#ifdef ENABLE_ACLK - fatal_assert(0 == uv_mutex_init(&aclk_async_lock)); - rc = sqlite3_exec_monitored(db_meta, "SELECT ni.host_id, ni.node_id FROM host h, node_instance ni WHERE " - "h.host_id = ni.host_id AND ni.node_id IS NOT NULL;", aclk_start_sync_thread, NULL, &err_msg); - if (rc != SQLITE_OK) { - error_report("SQLite error when starting ACLK sync threads, rc = %d (%s)", rc, err_msg); - sqlite3_free(err_msg); - } - info("ACLK sync initialization completed"); -#endif -} static void async_cb(uv_async_t *handle) { uv_stop(handle->loop); uv_update_time(handle->loop); - debug(D_ACLK_SYNC, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle)); } #define TIMER_PERIOD_MS (1000) -static void timer_cb(uv_timer_t* handle) +static void timer_cb(uv_timer_t *handle) { uv_stop(handle->loop); uv_update_time(handle->loop); -#ifdef ENABLE_ACLK - struct aclk_database_worker_config *wc = handle->data; + struct aclk_sync_config_s *config = handle->data; struct aclk_database_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_TIMER; - aclk_database_enq_cmd_noblock(wc, &cmd); time_t now = now_realtime_sec(); - if (wc->cleanup_after && wc->cleanup_after < now) { + if (config->cleanup_after && config->cleanup_after < now) { cmd.opcode = ACLK_DATABASE_CLEANUP; - if (!aclk_database_enq_cmd_noblock(wc, &cmd)) - wc->cleanup_after += ACLK_DATABASE_CLEANUP_INTERVAL; + if (!aclk_database_enq_cmd_noblock(&cmd)) + config->cleanup_after += ACLK_DATABASE_CLEANUP_INTERVAL; } if (aclk_connected) { - if (wc->alert_updates && !wc->pause_alert_updates) { - cmd.opcode = ACLK_DATABASE_PUSH_ALERT; - cmd.count = ACLK_MAX_ALERT_UPDATES; - aclk_database_enq_cmd_noblock(wc, &cmd); - } + cmd.opcode = ACLK_DATABASE_PUSH_ALERT; + aclk_database_enq_cmd_noblock(&cmd); + + aclk_check_node_info_and_collectors(); } -#endif } -static void aclk_database_worker(void *arg) +static void aclk_synchronization(void *arg __maybe_unused) { - service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); + struct aclk_sync_config_s *config = arg; + uv_thread_set_name_np(config->thread, "ACLKSYNC"); worker_register("ACLKSYNC"); + service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); + worker_register_job_name(ACLK_DATABASE_NOOP, "noop"); - worker_register_job_name(ACLK_DATABASE_ORPHAN_HOST, "node orphan"); - worker_register_job_name(ACLK_DATABASE_ALARM_HEALTH_LOG, "alert log"); worker_register_job_name(ACLK_DATABASE_CLEANUP, "cleanup"); worker_register_job_name(ACLK_DATABASE_DELETE_HOST, "node delete"); - worker_register_job_name(ACLK_DATABASE_NODE_INFO, "node info"); - worker_register_job_name(ACLK_DATABASE_NODE_COLLECTORS, "node collectors"); + worker_register_job_name(ACLK_DATABASE_NODE_STATE, "node state"); worker_register_job_name(ACLK_DATABASE_PUSH_ALERT, "alert push"); worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_CONFIG, "alert conf push"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_CHECKPOINT,"alert checkpoint"); worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, "alert snapshot"); worker_register_job_name(ACLK_DATABASE_QUEUE_REMOVED_ALERTS, "alerts check"); worker_register_job_name(ACLK_DATABASE_TIMER, "timer"); - struct aclk_database_worker_config *wc = arg; - uv_loop_t *loop; - int ret; - enum aclk_database_opcode opcode; - uv_timer_t timer_req; - struct aclk_database_cmd cmd; + uv_loop_t *loop = &config->loop; + fatal_assert(0 == uv_loop_init(loop)); + fatal_assert(0 == uv_async_init(loop, &config->async, async_cb)); - char threadname[NETDATA_THREAD_NAME_MAX+1]; - if (wc->host) - snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "ACLK[%s]", rrdhost_hostname(wc->host)); - else { - snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "ACLK[%s]", wc->uuid_str); - threadname[11] = '\0'; - } - uv_thread_set_name_np(wc->thread, threadname); - - loop = wc->loop = mallocz(sizeof(uv_loop_t)); - ret = uv_loop_init(loop); - if (ret) { - error("uv_loop_init(): %s", uv_strerror(ret)); - goto error_after_loop_init; - } - loop->data = wc; + fatal_assert(0 == uv_timer_init(loop, &config->timer_req)); + config->timer_req.data = config; + fatal_assert(0 == uv_timer_start(&config->timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); - ret = uv_async_init(wc->loop, &wc->async, async_cb); - if (ret) { - error("uv_async_init(): %s", uv_strerror(ret)); - goto error_after_async_init; - } - wc->async.data = wc; - - ret = uv_timer_init(loop, &timer_req); - if (ret) { - error("uv_timer_init(): %s", uv_strerror(ret)); - goto error_after_timer_init; - } - timer_req.data = wc; - fatal_assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); - - wc->node_info_send = 1; - info("Starting ACLK sync thread for host %s -- scratch area %lu bytes", wc->host_guid, (unsigned long int) sizeof(*wc)); - - memset(&cmd, 0, sizeof(cmd)); + info("Starting ACLK synchronization thread"); - wc->startup_time = now_realtime_sec(); - wc->cleanup_after = wc->startup_time + ACLK_DATABASE_CLEANUP_FIRST; + config->cleanup_after = now_realtime_sec() + ACLK_DATABASE_CLEANUP_FIRST; + config->initialized = true; - debug(D_ACLK_SYNC,"Node %s reports pending message count = %u", wc->node_id, wc->chart_payload_count); - - while (likely(!netdata_exit)) { + while (likely(service_running(SERVICE_ACLKSYNC))) { + enum aclk_database_opcode opcode; worker_is_idle(); uv_run(loop, UV_RUN_DEFAULT); /* wait for commands */ do { - cmd = aclk_database_deq_cmd(wc); + struct aclk_database_cmd cmd = aclk_database_deq_cmd(); - if (netdata_exit) + if (unlikely(!service_running(SERVICE_ACLKSYNC))) break; opcode = cmd.opcode; @@ -576,201 +411,216 @@ static void aclk_database_worker(void *arg) case ACLK_DATABASE_NOOP: /* the command queue was empty, do nothing */ break; - // MAINTENANCE case ACLK_DATABASE_CLEANUP: - debug(D_ACLK_SYNC, "Database cleanup for %s", wc->host_guid); - - if (wc->startup_time + ACLK_DATABASE_CLEANUP_FIRST + 2 < now_realtime_sec() && claimed() && aclk_connected) { - cmd.opcode = ACLK_DATABASE_NODE_INFO; - cmd.completion = NULL; - (void) aclk_database_enq_cmd_noblock(wc, &cmd); - } - - sql_maint_aclk_sync_database(wc, cmd); - if (wc->host == localhost) - sql_check_aclk_table_list(wc); + // Scan all aclk_alert_ tables and cleanup as needed + sql_maint_aclk_sync_database_all(); + sql_check_aclk_table_list(); break; case ACLK_DATABASE_DELETE_HOST: - debug(D_ACLK_SYNC,"Cleaning ACLK tables for %s", (char *) cmd.data); - sql_delete_aclk_table_list(wc, cmd); + sql_delete_aclk_table_list(cmd.param[0]); + break; +// NODE STATE + case ACLK_DATABASE_NODE_STATE:; + RRDHOST *host = cmd.param[0]; + int live = (host == localhost || host->receiver || !(rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN))) ? 1 : 0; + struct aclk_sync_host_config *ahc = host->aclk_sync_host_config; + if (unlikely(!ahc)) + sql_create_aclk_table(host, &host->host_uuid, host->node_id); + aclk_host_state_update(host, live); break; - // ALERTS case ACLK_DATABASE_PUSH_ALERT_CONFIG: - debug(D_ACLK_SYNC,"Pushing chart config info to the cloud for %s", wc->host_guid); - aclk_push_alert_config_event(wc, cmd); + aclk_push_alert_config_event(cmd.param[0], cmd.param[1]); break; case ACLK_DATABASE_PUSH_ALERT: - debug(D_ACLK_SYNC, "Pushing alert info to the cloud for %s", wc->host_guid); - aclk_push_alert_event(wc, cmd); - break; - case ACLK_DATABASE_ALARM_HEALTH_LOG: - debug(D_ACLK_SYNC, "Pushing alarm health log to the cloud for %s", wc->host_guid); - aclk_push_alarm_health_log(wc, cmd); + aclk_push_alert_events_for_all_hosts(); break; - case ACLK_DATABASE_PUSH_ALERT_SNAPSHOT: - debug(D_ACLK_SYNC, "Pushing alert snapshot to the cloud for node %s", wc->host_guid); - aclk_push_alert_snapshot_event(wc, cmd); + case ACLK_DATABASE_PUSH_ALERT_SNAPSHOT:; + aclk_push_alert_snapshot_event(cmd.param[0]); break; case ACLK_DATABASE_QUEUE_REMOVED_ALERTS: - debug(D_ACLK_SYNC, "Queueing removed alerts for node %s", wc->host_guid); - sql_process_queue_removed_alerts_to_aclk(wc, cmd); - break; - -// NODE OPERATIONS - case ACLK_DATABASE_NODE_INFO: - debug(D_ACLK_SYNC,"Sending node info for %s", wc->uuid_str); - sql_build_node_info(wc, cmd); - break; - case ACLK_DATABASE_NODE_COLLECTORS: - debug(D_ACLK_SYNC,"Sending node collectors info for %s", wc->uuid_str); - sql_build_node_collectors(wc); - break; -#ifdef ENABLE_ACLK - -// NODE_INSTANCE DETECTION - case ACLK_DATABASE_ORPHAN_HOST: - wc->host = NULL; - wc->is_orphan = 1; - aclk_add_worker_thread(wc); - break; -#endif - case ACLK_DATABASE_TIMER: - if (unlikely(localhost && !wc->host && !wc->is_orphan)) { - if (claimed()) { - wc->host = rrdhost_find_by_guid(wc->host_guid); - if (wc->host) { - info("HOST %s (%s) detected as active", rrdhost_hostname(wc->host), wc->host_guid); - snprintfz(threadname, NETDATA_THREAD_NAME_MAX, "ACLK[%s]", rrdhost_hostname(wc->host)); - uv_thread_set_name_np(wc->thread, threadname); - wc->host->dbsync_worker = wc; - if (unlikely(!wc->hostname)) - wc->hostname = strdupz(rrdhost_hostname(wc->host)); - aclk_del_worker_thread(wc); - wc->node_info_send = 1; - } - } - } - if (wc->node_info_send && localhost && claimed() && aclk_connected) { - cmd.opcode = ACLK_DATABASE_NODE_INFO; - cmd.completion = NULL; - wc->node_info_send = aclk_database_enq_cmd_noblock(wc, &cmd); - } - if (wc->node_collectors_send && wc->node_collectors_send + 30 < now_realtime_sec()) { - cmd.opcode = ACLK_DATABASE_NODE_COLLECTORS; - cmd.completion = NULL; - wc->node_collectors_send = aclk_database_enq_cmd_noblock(wc, &cmd); - } - if (localhost == wc->host) - (void) sqlite3_wal_checkpoint(db_meta, NULL); + sql_process_queue_removed_alerts_to_aclk(cmd.param[0]); break; default: debug(D_ACLK_SYNC, "%s: default.", __func__); break; } if (cmd.completion) - aclk_complete(cmd.completion); + completion_mark_complete(cmd.completion); } while (opcode != ACLK_DATABASE_NOOP); } - if (!uv_timer_stop(&timer_req)) - uv_close((uv_handle_t *)&timer_req, NULL); - - /* cleanup operations of the event loop */ - //info("Shutting down ACLK sync event loop for %s", wc->host_guid); + if (!uv_timer_stop(&config->timer_req)) + uv_close((uv_handle_t *)&config->timer_req, NULL); - /* - * uv_async_send after uv_close does not seem to crash in linux at the moment, - * it is however undocumented behaviour we need to be aware if this becomes - * an issue in the future. - */ - uv_close((uv_handle_t *)&wc->async, NULL); - uv_run(loop, UV_RUN_DEFAULT); + uv_close((uv_handle_t *)&config->async, NULL); +// uv_close((uv_handle_t *)&config->async_exit, NULL); + uv_cond_destroy(&config->cmd_cond); + (void) uv_loop_close(loop); - info("Shutting down ACLK sync event loop complete for host %s", wc->host_guid); - /* TODO: don't let the API block by waiting to enqueue commands */ - uv_cond_destroy(&wc->cmd_cond); - - int rc; - do { - rc = uv_loop_close(loop); - } while (rc != UV_EBUSY); - - freez(loop); + worker_unregister(); + service_exits(); + info("ACLK SYNC: Shutting down ACLK synchronization event loop"); +} - rrd_rdlock(); - if (likely(wc->host)) - wc->host->dbsync_worker = NULL; - freez(wc->hostname); - freez(wc); - rrd_unlock(); +static void aclk_synchronization_init(void) +{ + aclk_sync_config.cmd_queue.head = aclk_sync_config.cmd_queue.tail = 0; + aclk_sync_config.queue_size = 0; + fatal_assert(0 == uv_cond_init(&aclk_sync_config.cmd_cond)); + fatal_assert(0 == uv_mutex_init(&aclk_sync_config.cmd_mutex)); - worker_unregister(); - return; - -error_after_timer_init: - uv_close((uv_handle_t *)&wc->async, NULL); -error_after_async_init: - fatal_assert(0 == uv_loop_close(loop)); -error_after_loop_init: - freez(loop); - worker_unregister(); + fatal_assert(0 == uv_thread_create(&aclk_sync_config.thread, aclk_synchronization, &aclk_sync_config)); } +#endif // ------------------------------------------------------------- -void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id) +void sql_create_aclk_table(RRDHOST *host __maybe_unused, uuid_t *host_uuid __maybe_unused, uuid_t *node_id __maybe_unused) { #ifdef ENABLE_ACLK char uuid_str[GUID_LEN + 1]; char host_guid[GUID_LEN + 1]; + int rc; uuid_unparse_lower_fix(host_uuid, uuid_str); - - if (aclk_worker_thread_exists(uuid_str)) - return; - uuid_unparse_lower(*host_uuid, host_guid); - BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite); + char sql[ACLK_SYNC_QUERY_SIZE]; - buffer_sprintf(sql, TABLE_ACLK_ALERT, uuid_str); - db_execute(buffer_tostring(sql)); - buffer_flush(sql); - - buffer_sprintf(sql, INDEX_ACLK_ALERT, uuid_str, uuid_str); - db_execute(buffer_tostring(sql)); - - buffer_free(sql); + snprintfz(sql, ACLK_SYNC_QUERY_SIZE-1, TABLE_ACLK_ALERT, uuid_str); + rc = db_execute(db_meta, sql); + if (unlikely(rc)) + error_report("Failed to create ACLK alert table for host %s", host ? rrdhost_hostname(host) : host_guid); + else { + snprintfz(sql, ACLK_SYNC_QUERY_SIZE -1, INDEX_ACLK_ALERT, uuid_str, uuid_str); + rc = db_execute(db_meta, sql); + if (unlikely(rc)) + error_report("Failed to create ACLK alert table index for host %s", host ? string2str(host->hostname) : host_guid); + } + if (likely(host) && unlikely(host->aclk_sync_host_config)) + return; - if (likely(host) && unlikely(host->dbsync_worker)) + if (unlikely(!host)) return; - struct aclk_database_worker_config *wc = callocz(1, sizeof(struct aclk_database_worker_config)); + struct aclk_sync_host_config *wc = callocz(1, sizeof(struct aclk_sync_host_config)); if (node_id && !uuid_is_null(*node_id)) uuid_unparse_lower(*node_id, wc->node_id); - if (likely(host)) { - host->dbsync_worker = (void *)wc; - wc->hostname = strdupz(rrdhost_hostname(host)); - if (node_id && !host->node_id) { - host->node_id = mallocz(sizeof(*host->node_id)); - uuid_copy(*host->node_id, *node_id); - } + + host->aclk_sync_host_config = (void *)wc; + if (node_id && !host->node_id) { + host->node_id = mallocz(sizeof(*host->node_id)); + uuid_copy(*host->node_id, *node_id); } - else - wc->hostname = get_hostname_by_node_id(wc->node_id); + wc->host = host; strcpy(wc->uuid_str, uuid_str); - strcpy(wc->host_guid, host_guid); wc->alert_updates = 0; - aclk_database_init_cmd_queue(wc); - aclk_add_worker_thread(wc); - fatal_assert(0 == uv_thread_create(&(wc->thread), aclk_database_worker, wc)); -#else - UNUSED(host); - UNUSED(host_uuid); - UNUSED(node_id); + time_t now = now_realtime_sec(); + wc->node_info_send_time = (host == localhost || NULL == localhost) ? now - 25 : now; #endif -}
\ No newline at end of file +} + +#define SQL_FETCH_ALL_HOSTS "SELECT host_id, hostname, registry_hostname, update_every, os, " \ + "timezone, tags, hops, memory_mode, abbrev_timezone, utc_offset, program_name, " \ + "program_version, entries, health_enabled FROM host WHERE hops >0;" + +#define SQL_FETCH_ALL_INSTANCES "SELECT ni.host_id, ni.node_id FROM host h, node_instance ni " \ + "WHERE h.host_id = ni.host_id AND ni.node_id IS NOT NULL; " +void sql_aclk_sync_init(void) +{ + char *err_msg = NULL; + int rc; + + if (unlikely(!db_meta)) { + if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) { + return; + } + error_report("Database has not been initialized"); + return; + } + + info("Creating archived hosts"); + int number_of_children = 0; + rc = sqlite3_exec_monitored(db_meta, SQL_FETCH_ALL_HOSTS, create_host_callback, &number_of_children, &err_msg); + + if (rc != SQLITE_OK) { + error_report("SQLite error when loading archived hosts, rc = %d (%s)", rc, err_msg); + sqlite3_free(err_msg); + } + + info("Created %d archived hosts", number_of_children); + // Trigger host context load for hosts that have been created + metadata_queue_load_host_context(NULL); + +#ifdef ENABLE_ACLK + if (!number_of_children) + aclk_queue_node_info(localhost, true); + + rc = sqlite3_exec_monitored(db_meta, SQL_FETCH_ALL_INSTANCES,aclk_config_parameters, NULL,&err_msg); + + if (rc != SQLITE_OK) { + error_report("SQLite error when configuring host ACLK synchonization parameters, rc = %d (%s)", rc, err_msg); + sqlite3_free(err_msg); + } + aclk_synchronization_init(); + + info("ACLK sync initialization completed"); +#endif +} + +// Public + +static inline void queue_aclk_sync_cmd(enum aclk_database_opcode opcode, const void *param0, const void *param1) +{ + struct aclk_database_cmd cmd; + cmd.opcode = opcode; + cmd.param[0] = (void *) param0; + cmd.param[1] = (void *) param1; + cmd.completion = NULL; + aclk_database_enq_cmd(&cmd); +} + +// Public +void aclk_push_alert_config(const char *node_id, const char *config_hash) +{ + if (unlikely(!aclk_sync_config.initialized)) + return; + + queue_aclk_sync_cmd(ACLK_DATABASE_PUSH_ALERT_CONFIG, strdupz(node_id), strdupz(config_hash)); +} + +void aclk_push_node_alert_snapshot(const char *node_id) +{ + if (unlikely(!aclk_sync_config.initialized)) + return; + + queue_aclk_sync_cmd(ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, strdupz(node_id), NULL); +} + + +void aclk_push_node_removed_alerts(const char *node_id) +{ + if (unlikely(!aclk_sync_config.initialized)) + return; + + queue_aclk_sync_cmd(ACLK_DATABASE_QUEUE_REMOVED_ALERTS, strdupz(node_id), NULL); +} + +void schedule_node_info_update(RRDHOST *host __maybe_unused) +{ +#ifdef ENABLE_ACLK + if (unlikely(!host)) + return; + + struct aclk_database_cmd cmd; + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = ACLK_DATABASE_NODE_STATE; + cmd.param[0] = host; + cmd.completion = NULL; + aclk_database_enq_cmd(&cmd); +#endif +} diff --git a/database/sqlite/sqlite_aclk.h b/database/sqlite/sqlite_aclk.h index 208177e4..d555a0ce 100644 --- a/database/sqlite/sqlite_aclk.h +++ b/database/sqlite/sqlite_aclk.h @@ -18,45 +18,6 @@ #define ACLK_DELETE_ACK_ALERTS_INTERNAL (86400) #define ACLK_SYNC_QUERY_SIZE 512 -struct aclk_completion { - uv_mutex_t mutex; - uv_cond_t cond; - volatile unsigned completed; -}; - -static inline void init_aclk_completion(struct aclk_completion *p) -{ - p->completed = 0; - fatal_assert(0 == uv_cond_init(&p->cond)); - fatal_assert(0 == uv_mutex_init(&p->mutex)); -} - -static inline void destroy_aclk_completion(struct aclk_completion *p) -{ - uv_cond_destroy(&p->cond); - uv_mutex_destroy(&p->mutex); -} - -static inline void wait_for_aclk_completion(struct aclk_completion *p) -{ - uv_mutex_lock(&p->mutex); - while (0 == p->completed) { - uv_cond_wait(&p->cond, &p->mutex); - } - fatal_assert(1 == p->completed); - uv_mutex_unlock(&p->mutex); -} - -static inline void aclk_complete(struct aclk_completion *p) -{ - uv_mutex_lock(&p->mutex); - p->completed = 1; - uv_mutex_unlock(&p->mutex); - uv_cond_broadcast(&p->cond); -} - -extern uv_mutex_t aclk_async_lock; - static inline void uuid_unparse_lower_fix(uuid_t *uuid, char *out) { uuid_unparse_lower(*uuid, out); @@ -66,6 +27,12 @@ static inline void uuid_unparse_lower_fix(uuid_t *uuid, char *out) out[23] = '_'; } +static inline int claimed() +{ + return localhost->aclk_state.claimed_id != NULL; +} + + #define TABLE_ACLK_ALERT "CREATE TABLE IF NOT EXISTS aclk_alert_%s (sequence_id INTEGER PRIMARY KEY, " \ "alert_unique_id, date_created, date_submitted, date_cloud_ack, filtered_alert_unique_id NOT NULL, " \ "unique(alert_unique_id));" @@ -74,16 +41,14 @@ static inline void uuid_unparse_lower_fix(uuid_t *uuid, char *out) enum aclk_database_opcode { ACLK_DATABASE_NOOP = 0, - ACLK_DATABASE_ORPHAN_HOST, - ACLK_DATABASE_ALARM_HEALTH_LOG, ACLK_DATABASE_CLEANUP, ACLK_DATABASE_DELETE_HOST, - ACLK_DATABASE_NODE_INFO, + ACLK_DATABASE_NODE_STATE, ACLK_DATABASE_PUSH_ALERT, ACLK_DATABASE_PUSH_ALERT_CONFIG, ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, + ACLK_DATABASE_PUSH_ALERT_CHECKPOINT, ACLK_DATABASE_QUEUE_REMOVED_ALERTS, - ACLK_DATABASE_NODE_COLLECTORS, ACLK_DATABASE_TIMER, // leave this last @@ -93,10 +58,8 @@ enum aclk_database_opcode { struct aclk_database_cmd { enum aclk_database_opcode opcode; - void *data; - void *data_param; - int count; - struct aclk_completion *completion; + void *param[2]; + struct completion *completion; }; #define ACLK_DATABASE_CMD_Q_MAX_SIZE (1024) @@ -106,67 +69,27 @@ struct aclk_database_cmdqueue { struct aclk_database_cmd cmd_array[ACLK_DATABASE_CMD_Q_MAX_SIZE]; }; -struct aclk_database_worker_config { - uv_thread_t thread; - char uuid_str[GUID_LEN + 1]; - char node_id[GUID_LEN + 1]; - char host_guid[GUID_LEN + 1]; - char *hostname; // hostname to avoid constant lookups - time_t cleanup_after; // Start a cleanup after this timestamp - time_t startup_time; // When the sync thread started - uint64_t alerts_batch_id; // batch id for alerts to use - uint64_t alerts_start_seq_id; // cloud has asked to start streaming from - uint64_t alert_sequence_id; // last alert sequence_id - int pause_alert_updates; - uint32_t chart_payload_count; - uint64_t alerts_snapshot_id; //will contain the snapshot_id value if snapshot was requested - uint64_t alerts_ack_sequence_id; //last sequence_id ack'ed from cloud via sendsnapshot message - uv_loop_t *loop; +struct aclk_sync_host_config { RRDHOST *host; - uv_async_t async; - /* FIFO command queue */ - uv_mutex_t cmd_mutex; - uv_cond_t cmd_cond; - volatile unsigned queue_size; - struct aclk_database_cmdqueue cmd_queue; int alert_updates; - int node_info_send; + int alert_checkpoint_req; + int alert_queue_removed; + time_t node_info_send_time; time_t node_collectors_send; - volatile unsigned is_shutting_down; - volatile unsigned is_orphan; - struct aclk_database_worker_config *next; + char uuid_str[UUID_STR_LEN]; + char node_id[UUID_STR_LEN]; + char *alerts_snapshot_uuid; // will contain the snapshot_uuid value if snapshot was requested }; -static inline RRDHOST *find_host_by_node_id(char *node_id) -{ - uuid_t node_uuid; - if (unlikely(!node_id)) - return NULL; - - if (uuid_parse(node_id, node_uuid)) - return NULL; - - rrd_rdlock(); - RRDHOST *host, *ret = NULL; - rrdhost_foreach_read(host) { - if (host->node_id && !(uuid_compare(*host->node_id, node_uuid))) { - ret = host; - break; - } - } - rrd_unlock(); - - return ret; -} - - extern sqlite3 *db_meta; -int aclk_database_enq_cmd_noblock(struct aclk_database_worker_config *wc, struct aclk_database_cmd *cmd); -void aclk_database_enq_cmd(struct aclk_database_worker_config *wc, struct aclk_database_cmd *cmd); +int aclk_database_enq_cmd_noblock(struct aclk_database_cmd *cmd); void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id); void sql_aclk_sync_init(void); -int claimed(); -void aclk_sync_exit_all(); -struct aclk_database_worker_config *find_inactive_wc_by_node_id(char *node_id); +void aclk_push_alert_config(const char *node_id, const char *config_hash); +void aclk_push_node_alert_snapshot(const char *node_id); +void aclk_push_node_health_log(const char *node_id); +void aclk_push_node_removed_alerts(const char *node_id); +void schedule_node_info_update(RRDHOST *host); + #endif //NETDATA_SQLITE_ACLK_H diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c index ce284ebc..62f1df29 100644 --- a/database/sqlite/sqlite_aclk_alert.c +++ b/database/sqlite/sqlite_aclk_alert.c @@ -5,20 +5,20 @@ #ifdef ENABLE_ACLK #include "../../aclk/aclk_alarm_api.h" -#include "../../aclk/aclk.h" #endif +#define SQL_GET_ALERT_REMOVE_TIME "SELECT when_key FROM health_log_%s WHERE alarm_id = %u " \ + "AND unique_id > %u AND unique_id < %u " \ + "AND new_status = -2;" + time_t removed_when(uint32_t alarm_id, uint32_t before_unique_id, uint32_t after_unique_id, char *uuid_str) { sqlite3_stmt *res = NULL; - int rc = 0; time_t when = 0; char sql[ACLK_SYNC_QUERY_SIZE]; - snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "select when_key from health_log_%s where alarm_id = %u " \ - "and unique_id > %u and unique_id < %u " \ - "and new_status = -2;", uuid_str, alarm_id, after_unique_id, before_unique_id); + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_GET_ALERT_REMOVE_TIME, uuid_str, alarm_id, after_unique_id, before_unique_id); - rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to find removed gap."); return 0; @@ -36,22 +36,26 @@ time_t removed_when(uint32_t alarm_id, uint32_t before_unique_id, uint32_t after return when; } +#define SQL_UPDATE_FILTERED_ALERT "UPDATE aclk_alert_%s SET filtered_alert_unique_id = %u where filtered_alert_unique_id = %u" + void update_filtered(ALARM_ENTRY *ae, uint32_t unique_id, char *uuid_str) { char sql[ACLK_SYNC_QUERY_SIZE]; - snprintfz(sql, ACLK_SYNC_QUERY_SIZE-1, "UPDATE aclk_alert_%s SET filtered_alert_unique_id = %u where filtered_alert_unique_id = %u", uuid_str, ae->unique_id, unique_id); + snprintfz(sql, ACLK_SYNC_QUERY_SIZE-1, SQL_UPDATE_FILTERED_ALERT, uuid_str, ae->unique_id, unique_id); sqlite3_exec_monitored(db_meta, sql, 0, 0, NULL); ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; } +#define SQL_SELECT_ALERT_BY_UNIQUE_ID "SELECT hl.unique_id FROM health_log_%s hl, alert_hash ah WHERE hl.unique_id = %u " \ + "AND hl.config_hash_id = ah.hash_id " \ + "AND ah.warn IS NULL AND ah.crit IS NULL;" + static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char *uuid_str) { sqlite3_stmt *res = NULL; int rc = 0; bool ret = false; char sql[ACLK_SYNC_QUERY_SIZE]; - snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "select hl.unique_id from health_log_%s hl, alert_hash ah where hl.unique_id = %u " \ - "and hl.config_hash_id = ah.hash_id " \ - "and ah.warn is null and ah.crit is null;", uuid_str, unique_id); + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_SELECT_ALERT_BY_UNIQUE_ID, uuid_str, unique_id); rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { @@ -73,12 +77,18 @@ static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char #define MAX_REMOVED_PERIOD 86400 //decide if some events should be sent or not + +#define SQL_SELECT_ALERT_BY_ID "SELECT hl.new_status, hl.config_hash_id, hl.unique_id FROM health_log_%s hl, aclk_alert_%s aa " \ + "WHERE hl.unique_id = aa.filtered_alert_unique_id " \ + "AND hl.alarm_id = %u " \ + "ORDER BY alarm_event_id DESC LIMIT 1;" + int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) { sqlite3_stmt *res = NULL; - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - int send = 1, rc = 0; + int send = 1; if (ae->new_status == RRDCALC_STATUS_REMOVED || ae->new_status == RRDCALC_STATUS_UNINITIALIZED) { return 0; @@ -87,9 +97,6 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) if (unlikely(uuid_is_null(ae->config_hash_id))) return 0; - if (is_event_from_alert_variable_config(ae->unique_id, uuid_str)) - return 0; - char sql[ACLK_SYNC_QUERY_SIZE]; uuid_t config_hash_id; RRDCALC_STATUS status; @@ -97,12 +104,9 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) //get the previous sent event of this alarm_id //base the search on the last filtered event - snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "select hl.new_status, hl.config_hash_id, hl.unique_id from health_log_%s hl, aclk_alert_%s aa \ - where hl.unique_id = aa.filtered_alert_unique_id \ - and hl.alarm_id = %u \ - order by alarm_event_id desc LIMIT 1;", uuid_str, uuid_str, ae->alarm_id); + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_SELECT_ALERT_BY_ID, uuid_str, uuid_str, ae->alarm_id); - rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to filter alert events."); send = 1; @@ -126,7 +130,7 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) goto done; } - if (uuid_compare(ae->config_hash_id, config_hash_id)) { + if (uuid_memcmp(&ae->config_hash_id, &config_hash_id)) { send = 1; goto done; } @@ -162,6 +166,10 @@ done: // will replace call to aclk_update_alarm in health/health_log.c // and handle both cases + +#define SQL_QUEUE_ALERT_TO_CLOUD "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ + "VALUES (@alert_unique_id, unixepoch(), @alert_unique_id) ON CONFLICT (alert_unique_id) do nothing;" + int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter) { if(!service_running(SERVICE_ACLK)) @@ -182,27 +190,24 @@ int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter) } } - int rc = 0; - sqlite3_stmt *res_alert = NULL; - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + if (is_event_from_alert_variable_config(ae->unique_id, uuid_str)) + return 0; + + sqlite3_stmt *res_alert = NULL; + char sql[ACLK_SYNC_QUERY_SIZE]; - buffer_sprintf( - sql, - "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " - "VALUES (@alert_unique_id, unixepoch(), @alert_unique_id) on conflict (alert_unique_id) do nothing; ", - uuid_str); + snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_QUEUE_ALERT_TO_CLOUD, uuid_str); - rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res_alert, 0); + int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res_alert, 0); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to store alert event"); - buffer_free(sql); return 1; } - rc = sqlite3_bind_int(res_alert, 1, ae->unique_id); + rc = sqlite3_bind_int(res_alert, 1, (int) ae->unique_id); if (unlikely(rc != SQLITE_OK)) goto bind_fail; @@ -213,16 +218,12 @@ int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter) } ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; - struct aclk_database_worker_config *wc = (struct aclk_database_worker_config *)host->dbsync_worker; - if (wc) { - wc->pause_alert_updates = 0; - } + rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); bind_fail: if (unlikely(sqlite3_finalize(res_alert) != SQLITE_OK)) error_report("Failed to reset statement in store alert event, rc = %d", rc); - buffer_free(sql); return 0; } @@ -254,11 +255,10 @@ int rrdcalc_status_to_proto_enum(RRDCALC_STATUS status) #endif } -void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +void aclk_push_alert_event(struct aclk_sync_host_config *wc) { #ifndef ENABLE_ACLK UNUSED(wc); - UNUSED(cmd); #else int rc; @@ -278,26 +278,7 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); - if (wc->alerts_start_seq_id != 0) { - buffer_sprintf( - sql, - "UPDATE aclk_alert_%s SET date_submitted = NULL, date_cloud_ack = NULL WHERE sequence_id >= %"PRIu64 - "; UPDATE aclk_alert_%s SET date_cloud_ack = unixepoch() WHERE sequence_id < %"PRIu64 - " and date_cloud_ack is null " - "; UPDATE aclk_alert_%s SET date_submitted = unixepoch() WHERE sequence_id < %"PRIu64 - " and date_submitted is null", - wc->uuid_str, - wc->alerts_start_seq_id, - wc->uuid_str, - wc->alerts_start_seq_id, - wc->uuid_str, - wc->alerts_start_seq_id); - db_execute(buffer_tostring(sql)); - buffer_reset(sql); - wc->alerts_start_seq_id = 0; - } - - int limit = cmd.count > 0 ? cmd.count : 1; + int limit = ACLK_MAX_ALERT_UPDATES; sqlite3_stmt *res = NULL; @@ -318,10 +299,15 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d BUFFER *sql_fix = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf(sql_fix, TABLE_ACLK_ALERT, wc->uuid_str); - db_execute(buffer_tostring(sql_fix)); - buffer_flush(sql_fix); - buffer_sprintf(sql_fix, INDEX_ACLK_ALERT, wc->uuid_str, wc->uuid_str); - db_execute(buffer_tostring(sql_fix)); + rc = db_execute(db_meta, buffer_tostring(sql_fix)); + if (unlikely(rc)) + error_report("Failed to create ACLK alert table for host %s", rrdhost_hostname(wc->host)); + else { + buffer_flush(sql_fix); + buffer_sprintf(sql_fix, INDEX_ACLK_ALERT, wc->uuid_str, wc->uuid_str); + if (unlikely(db_execute(db_meta, buffer_tostring(sql_fix)))) + error_report("Failed to create ACLK alert table for host %s", rrdhost_hostname(wc->host)); + } buffer_free(sql_fix); // Try again @@ -353,8 +339,8 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d alarm_log.name = strdupz((char *)sqlite3_column_text(res, 11)); alarm_log.family = sqlite3_column_bytes(res, 13) > 0 ? strdupz((char *)sqlite3_column_text(res, 13)) : NULL; - alarm_log.batch_id = wc->alerts_batch_id; - alarm_log.sequence_id = (uint64_t) sqlite3_column_int64(res, 0); + //alarm_log.batch_id = wc->alerts_batch_id; + //alarm_log.sequence_id = (uint64_t) sqlite3_column_int64(res, 0); alarm_log.when = (time_t) sqlite3_column_int64(res, 5); uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 3)), uuid_str); @@ -429,19 +415,23 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d buffer_sprintf(sql, "UPDATE aclk_alert_%s SET date_submitted=unixepoch() " "WHERE date_submitted IS NULL AND sequence_id BETWEEN %" PRIu64 " AND %" PRIu64 ";", wc->uuid_str, first_sequence_id, last_sequence_id); - db_execute(buffer_tostring(sql)); + + if (unlikely(db_execute(db_meta, buffer_tostring(sql)))) + error_report("Failed to mark ACLK alert entries as submitted for host %s", rrdhost_hostname(wc->host)); + + // Mark to do one more check + rrdhost_flag_set(wc->host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); + } else { if (log_first_sequence_id) log_access( - "ACLK RES [%s (%s)]: ALERTS SENT from %" PRIu64 " to %" PRIu64 " batch=%" PRIu64, + "ACLK RES [%s (%s)]: ALERTS SENT from %" PRIu64 " to %" PRIu64 "", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", log_first_sequence_id, - log_last_sequence_id, - wc->alerts_batch_id); + log_last_sequence_id); log_first_sequence_id = 0; log_last_sequence_id = 0; - wc->pause_alert_updates = 1; } rc = sqlite3_finalize(res); @@ -451,8 +441,24 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d freez(claim_id); buffer_free(sql); #endif +} + +void aclk_push_alert_events_for_all_hosts(void) +{ + RRDHOST *host; + + dfe_start_reentrant(rrdhost_root_index, host) { + if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED) || !rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS)) + continue; - return; + internal_error(true, "ACLK SYNC: Scanning host %s", rrdhost_hostname(host)); + rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); + + struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + if (likely(wc)) + aclk_push_alert_event(wc); + } + dfe_done(host); } void sql_queue_existing_alerts_to_aclk(RRDHOST *host) @@ -467,137 +473,15 @@ void sql_queue_existing_alerts_to_aclk(RRDHOST *host) "where new_status <> 0 and new_status <> -2 and config_hash_id is not null and updated_by_id = 0 " \ "order by unique_id asc on conflict (alert_unique_id) do nothing;", uuid_str, uuid_str, uuid_str); - db_execute(buffer_tostring(sql)); - - buffer_free(sql); - - struct aclk_database_worker_config *wc = (struct aclk_database_worker_config *)host->dbsync_worker; - if (wc) { - wc->pause_alert_updates = 0; - } -} - -void aclk_send_alarm_health_log(char *node_id) -{ - if (unlikely(!node_id)) - return; - - struct aclk_database_worker_config *wc = find_inactive_wc_by_node_id(node_id); - - if (likely(!wc)) { - RRDHOST *host = find_host_by_node_id(node_id); - if (likely(host)) - wc = (struct aclk_database_worker_config *)host->dbsync_worker; - } - - if (!wc) { - log_access("ACLK REQ [%s (N/A)]: HEALTH LOG REQUEST RECEIVED FOR INVALID NODE", node_id); - return; - } - - log_access("ACLK REQ [%s (%s)]: HEALTH LOG REQUEST RECEIVED", node_id, wc->hostname ? wc->hostname : "N/A"); - - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_ALARM_HEALTH_LOG; - - aclk_database_enq_cmd(wc, &cmd); - return; -} - -void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) -{ - UNUSED(cmd); -#ifndef ENABLE_ACLK - UNUSED(wc); -#else - int rc; - - char *claim_id = get_agent_claimid(); - if (unlikely(!claim_id)) - return; - - RRDHOST *host = wc->host; - if (unlikely(!host)) { - host = find_host_by_node_id(wc->node_id); - - if (unlikely(!host)) { - log_access( - "AC [%s (N/A)]: ACLK synchronization thread for %s is not yet linked to HOST.", - wc->node_id, - wc->host_guid); - freez(claim_id); - return; - } - } - - uint64_t first_sequence = 0; - uint64_t last_sequence = 0; - struct timeval first_timestamp; - struct timeval last_timestamp; - - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); - - sqlite3_stmt *res = NULL; - - //TODO: make this better: include info from health log too - buffer_sprintf(sql, "SELECT MIN(sequence_id), MIN(date_created), MAX(sequence_id), MAX(date_created) " \ - "FROM aclk_alert_%s;", wc->uuid_str); - - rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); - if (rc != SQLITE_OK) { - error_report("Failed to prepare statement to get health log statistics from the database"); - buffer_free(sql); - freez(claim_id); - return; - } - - first_timestamp.tv_sec = 0; - first_timestamp.tv_usec = 0; - last_timestamp.tv_sec = 0; - last_timestamp.tv_usec = 0; - - while (sqlite3_step_monitored(res) == SQLITE_ROW) { - first_sequence = sqlite3_column_bytes(res, 0) > 0 ? (uint64_t) sqlite3_column_int64(res, 0) : 0; - if (sqlite3_column_bytes(res, 1) > 0) { - first_timestamp.tv_sec = sqlite3_column_int64(res, 1); - } - - last_sequence = sqlite3_column_bytes(res, 2) > 0 ? (uint64_t) sqlite3_column_int64(res, 2) : 0; - if (sqlite3_column_bytes(res, 3) > 0) { - last_timestamp.tv_sec = sqlite3_column_int64(res, 3); - } - } - - struct alarm_log_entries log_entries; - log_entries.first_seq_id = first_sequence; - log_entries.first_when = first_timestamp; - log_entries.last_seq_id = last_sequence; - log_entries.last_when = last_timestamp; - - struct alarm_log_health alarm_log; - alarm_log.claim_id = claim_id; - alarm_log.node_id = wc->node_id; - alarm_log.log_entries = log_entries; - alarm_log.status = wc->alert_updates == 0 ? 2 : 1; - alarm_log.enabled = (int)host->health.health_enabled; - - wc->alert_sequence_id = last_sequence; + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - aclk_send_alarm_log_health(&alarm_log); - log_access("ACLK RES [%s (%s)]: HEALTH LOG SENT from %"PRIu64" to %"PRIu64, wc->node_id, wc->hostname ? wc->hostname : "N/A", first_sequence, last_sequence); + if (unlikely(db_execute(db_meta, buffer_tostring(sql)))) + error_report("Failed to queue existing ACLK alert events for host %s", rrdhost_hostname(host)); - rc = sqlite3_finalize(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to reset statement to get health log statistics from the database, rc = %d", rc); + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - freez(claim_id); buffer_free(sql); - - aclk_alert_reloaded = 1; -#endif - - return; + rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); } void aclk_send_alarm_configuration(char *config_hash) @@ -605,22 +489,14 @@ void aclk_send_alarm_configuration(char *config_hash) if (unlikely(!config_hash)) return; - struct aclk_database_worker_config *wc = (struct aclk_database_worker_config *) localhost->dbsync_worker; + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *) localhost->aclk_sync_host_config; - if (unlikely(!wc)) { + if (unlikely(!wc)) return; - } log_access("ACLK REQ [%s (%s)]: Request to send alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash); - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_PUSH_ALERT_CONFIG; - cmd.data_param = (void *) strdupz(config_hash); - cmd.completion = NULL; - aclk_database_enq_cmd(wc, &cmd); - - return; + aclk_push_alert_config(wc->node_id, config_hash); } #define SQL_SELECT_ALERT_CONFIG "SELECT alarm, template, on_key, class, type, component, os, hosts, plugin," \ @@ -628,19 +504,31 @@ void aclk_send_alarm_configuration(char *config_hash) "options, host_labels, p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after," \ "p_db_lookup_before, p_update_every FROM alert_hash WHERE hash_id = @hash_id;" -int aclk_push_alert_config_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +int aclk_push_alert_config_event(char *node_id __maybe_unused, char *config_hash __maybe_unused) { - UNUSED(wc); -#ifndef ENABLE_ACLK - UNUSED(cmd); -#else int rc = 0; +#ifdef ENABLE_ACLK + CHECK_SQLITE_CONNECTION(db_meta); sqlite3_stmt *res = NULL; - char *config_hash = (char *) cmd.data_param; + struct aclk_sync_host_config *wc = NULL; + RRDHOST *host = find_host_by_node_id(node_id); + + if (unlikely(!host)) { + freez(config_hash); + freez(node_id); + return 1; + } + + wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (unlikely(!wc)) { + freez(config_hash); + freez(node_id); + return 1; + } rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_ALERT_CONFIG, -1, &res, 0); if (rc != SQLITE_OK) { @@ -723,7 +611,6 @@ int aclk_push_alert_config_event(struct aclk_database_worker_config *wc, struct if (likely(p_alarm_config.cfg_hash)) { log_access("ACLK RES [%s (%s)]: Sent alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash); aclk_send_provide_alarm_cfg(&p_alarm_config); - freez((char *) cmd.data_param); freez(p_alarm_config.cfg_hash); destroy_aclk_alarm_configuration(&alarm_config); } @@ -735,150 +622,125 @@ bind_fail: if (unlikely(rc != SQLITE_OK)) error_report("Failed to reset statement when pushing alarm config hash, rc = %d", rc); - return rc; + freez(config_hash); + freez(node_id); #endif - return 0; + return rc; } // Start streaming alerts -void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start_seq_id) +void aclk_start_alert_streaming(char *node_id, bool resets) { if (unlikely(!node_id)) return; - //log_access("ACLK REQ [%s (N/A)]: ALERTS STREAM from %"PRIu64" batch=%"PRIu64".", node_id, start_seq_id, batch_id); - uuid_t node_uuid; if (uuid_parse(node_id, node_uuid)) return; - struct aclk_database_worker_config *wc = NULL; RRDHOST *host = find_host_by_node_id(node_id); - if (likely(host)) { - wc = (struct aclk_database_worker_config *)host->dbsync_worker ? - (struct aclk_database_worker_config *)host->dbsync_worker : - (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); - if (unlikely(!host->health.health_enabled)) { - log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); - return; - } + if (unlikely(!host)) + return; - if (unlikely(batch_id == 1) && unlikely(start_seq_id == 1)) - sql_queue_existing_alerts_to_aclk(host); - } else - wc = (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); - - if (likely(wc)) { - log_access("ACLK REQ [%s (%s)]: ALERTS STREAM from %"PRIu64" batch=%"PRIu64, node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", start_seq_id, batch_id); - __sync_synchronize(); - wc->alerts_batch_id = batch_id; - wc->alerts_start_seq_id = start_seq_id; - wc->alert_updates = 1; - wc->pause_alert_updates = 0; - __sync_synchronize(); + struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + + if (unlikely(!wc)) + return; + + if (unlikely(!host->health.health_enabled)) { + log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); + return; } - else - log_access("ACLK STA [%s (N/A)]: ACLK synchronization thread is not active.", node_id); - return; + if (resets) { + log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED (RESET REQUESTED)", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); + sql_queue_existing_alerts_to_aclk(host); + } else + log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); + + wc->alert_updates = 1; + wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; } -void sql_process_queue_removed_alerts_to_aclk(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) -{ - UNUSED(cmd); +#define SQL_QUEUE_REMOVE_ALERTS "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ + "SELECT unique_id alert_unique_id, UNIXEPOCH(), unique_id alert_unique_id FROM health_log_%s " \ + "WHERE new_status = -2 AND updated_by_id = 0 AND unique_id NOT IN " \ + "(SELECT alert_unique_id FROM aclk_alert_%s) " \ + "AND config_hash_id NOT IN (select hash_id from alert_hash where warn is null and crit is null) " \ + "ORDER BY unique_id ASC " \ + "ON CONFLICT (alert_unique_id) DO NOTHING;" - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); +void sql_process_queue_removed_alerts_to_aclk(char *node_id) +{ + struct aclk_sync_host_config *wc; + RRDHOST *host = find_host_by_node_id(node_id); + freez(node_id); - buffer_sprintf(sql,"insert into aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ - "select unique_id alert_unique_id, unixepoch(), unique_id alert_unique_id from health_log_%s " \ - "where new_status = -2 and updated_by_id = 0 and unique_id not in " \ - "(select alert_unique_id from aclk_alert_%s) order by unique_id asc " \ - "on conflict (alert_unique_id) do nothing;", wc->uuid_str, wc->uuid_str, wc->uuid_str); + if (unlikely(!host || !(wc = host->aclk_sync_host_config))) + return; - db_execute(buffer_tostring(sql)); + char sql[ACLK_SYNC_QUERY_SIZE * 2]; - log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); + snprintfz(sql,ACLK_SYNC_QUERY_SIZE * 2 - 1, SQL_QUEUE_REMOVE_ALERTS, wc->uuid_str, wc->uuid_str, wc->uuid_str); - buffer_free(sql); + if (unlikely(db_execute(db_meta, sql))) { + log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS FAILED", wc->node_id, rrdhost_hostname(wc->host)); + error_report("Failed to queue ACLK alert removed entries for host %s", rrdhost_hostname(wc->host)); + } + else + log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, rrdhost_hostname(wc->host)); - wc->pause_alert_updates = 0; - return; + rrdhost_flag_set(wc->host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); + wc->alert_queue_removed = 0; } void sql_queue_removed_alerts_to_aclk(RRDHOST *host) { - if (unlikely(!host->dbsync_worker)) + if (unlikely(!host->aclk_sync_host_config)) return; - if (!claimed()) + if (!claimed() || !host->node_id) return; - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_QUEUE_REMOVED_ALERTS; - cmd.data = NULL; - cmd.data_param = NULL; - cmd.completion = NULL; - aclk_database_enq_cmd((struct aclk_database_worker_config *) host->dbsync_worker, &cmd); + char node_id[UUID_STR_LEN]; + uuid_unparse_lower(*host->node_id, node_id); + + aclk_push_node_removed_alerts(node_id); } -void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id, uint64_t snapshot_id, uint64_t sequence_id) +void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id __maybe_unused, char *snapshot_uuid) { - UNUSED(claim_id); - if (unlikely(!node_id)) - return; - uuid_t node_uuid; - if (uuid_parse(node_id, node_uuid)) + if (unlikely(!node_id || uuid_parse(node_id, node_uuid))) return; - struct aclk_database_worker_config *wc = NULL; RRDHOST *host = find_host_by_node_id(node_id); - if (likely(host)) - wc = (struct aclk_database_worker_config *)host->dbsync_worker; - - if (likely(wc)) { - log_access( - "IN [%s (%s)]: Request to send alerts snapshot, snapshot_id %" PRIu64 " and ack_sequence_id %" PRIu64, - wc->node_id, - wc->host ? rrdhost_hostname(wc->host) : "N/A", - snapshot_id, - sequence_id); - if (wc->alerts_snapshot_id == snapshot_id) - return; - __sync_synchronize(); - wc->alerts_snapshot_id = snapshot_id; - wc->alerts_ack_sequence_id = sequence_id; - __sync_synchronize(); - - struct aclk_database_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.opcode = ACLK_DATABASE_PUSH_ALERT_SNAPSHOT; - cmd.data_param = NULL; - cmd.completion = NULL; - aclk_database_enq_cmd(wc, &cmd); - } else - log_access("ACLK STA [%s (N/A)]: ACLK synchronization thread is not active.", node_id); - - return; -} + if (unlikely(!host)) { + log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id); + return; + } -void aclk_mark_alert_cloud_ack(char *uuid_str, uint64_t alerts_ack_sequence_id) -{ - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; - if (alerts_ack_sequence_id != 0) { - buffer_sprintf( - sql, - "UPDATE aclk_alert_%s SET date_cloud_ack = unixepoch() WHERE sequence_id <= %" PRIu64 "", - uuid_str, - alerts_ack_sequence_id); - db_execute(buffer_tostring(sql)); + if (unlikely(!wc)) { + log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id); + return; } - buffer_free(sql); + log_access( + "IN [%s (%s)]: Request to send alerts snapshot, snapshot_uuid %s", + node_id, + wc->host ? rrdhost_hostname(wc->host) : "N/A", + snapshot_uuid); + if (wc->alerts_snapshot_uuid && !strcmp(wc->alerts_snapshot_uuid,snapshot_uuid)) + return; + __sync_synchronize(); + wc->alerts_snapshot_uuid = strdupz(snapshot_uuid); + __sync_synchronize(); + + aclk_push_node_alert_snapshot(node_id); } #ifdef ENABLE_ACLK @@ -949,37 +811,41 @@ static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark) #endif #define ALARM_EVENTS_PER_CHUNK 10 -void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) { -#ifndef ENABLE_ACLK - UNUSED(wc); - UNUSED(cmd); -#else - UNUSED(cmd); - // we perhaps we don't need this for snapshots - if (unlikely(!wc->alert_updates)) { - log_access("ACLK STA [%s (%s)]: Ignoring alert snapshot event, updates have been turned off for this node.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); +#ifdef ENABLE_ACLK + RRDHOST *host = find_host_by_node_id(node_id); + + if (unlikely(!host)) { + log_access("AC [%s (N/A)]: Node id not found", node_id); + freez(node_id); return; } + freez(node_id); - if (unlikely(!wc->host)) { - error_report("ACLK synchronization thread for %s is not linked to HOST", wc->host_guid); + struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + + // we perhaps we don't need this for snapshots + if (unlikely(!wc->alert_updates)) { + log_access( + "ACLK STA [%s (%s)]: Ignoring alert snapshot event, updates have been turned off for this node.", + wc->node_id, + wc->host ? rrdhost_hostname(wc->host) : "N/A"); return; } - if (unlikely(!wc->alerts_snapshot_id)) + if (unlikely(!wc->alerts_snapshot_uuid)) return; char *claim_id = get_agent_claimid(); if (unlikely(!claim_id)) return; - log_access("ACLK REQ [%s (%s)]: Sending alerts snapshot, snapshot_id %" PRIu64, wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", wc->alerts_snapshot_id); + log_access("ACLK REQ [%s (%s)]: Sending alerts snapshot, snapshot_uuid %s", wc->node_id, rrdhost_hostname(wc->host), wc->alerts_snapshot_uuid); - aclk_mark_alert_cloud_ack(wc->uuid_str, wc->alerts_ack_sequence_id); - - RRDHOST *host = wc->host; uint32_t cnt = 0; + char uuid_str[UUID_STR_LEN]; + uuid_unparse_lower_fix(&host->host_uuid, uuid_str); netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); @@ -995,6 +861,9 @@ void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, stru if (have_recent_alarm(host, ae->alarm_id, ae->unique_id)) continue; + if (is_event_from_alert_variable_config(ae->unique_id, uuid_str)) + continue; + cnt++; } @@ -1008,7 +877,7 @@ void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, stru struct alarm_snapshot alarm_snap; alarm_snap.node_id = wc->node_id; alarm_snap.claim_id = claim_id; - alarm_snap.snapshot_id = wc->alerts_snapshot_id; + alarm_snap.snapshot_uuid = wc->alerts_snapshot_uuid; alarm_snap.chunks = chunks; alarm_snap.chunk = chunk; @@ -1024,6 +893,9 @@ void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, stru if (have_recent_alarm(host, ae->alarm_id, ae->unique_id)) continue; + if (is_event_from_alert_variable_config(ae->unique_id, uuid_str)) + continue; + cnt++; struct alarm_log_entry alarm_log; @@ -1047,7 +919,7 @@ void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, stru struct alarm_snapshot alarm_snap; alarm_snap.node_id = wc->node_id; alarm_snap.claim_id = claim_id; - alarm_snap.snapshot_id = wc->alerts_snapshot_id; + alarm_snap.snapshot_uuid = wc->alerts_snapshot_uuid; alarm_snap.chunks = chunks; alarm_snap.chunk = chunk; @@ -1061,73 +933,204 @@ void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, stru } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - wc->alerts_snapshot_id = 0; + wc->alerts_snapshot_uuid = NULL; freez(claim_id); #endif - return; } +#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE filtered_alert_unique_id NOT IN (SELECT unique_id FROM health_log_%s);" + void sql_aclk_alert_clean_dead_entries(RRDHOST *host) { if (!claimed()) return; - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + char sql[512]; + snprintfz(sql,511,SQL_DELETE_ALERT_ENTRIES, uuid_str, uuid_str); - buffer_sprintf(sql,"delete from aclk_alert_%s where filtered_alert_unique_id not in " - " (select unique_id from health_log_%s); ", uuid_str, uuid_str); - char *err_msg = NULL; - int rc = sqlite3_exec_monitored(db_meta, buffer_tostring(sql), NULL, NULL, &err_msg); + int rc = sqlite3_exec_monitored(db_meta, sql, NULL, NULL, &err_msg); if (rc != SQLITE_OK) { - error_report("Failed when trying to clean stale ACLK alert entries from aclk_alert_%s, error message \"%s""", - uuid_str, err_msg); + error_report("Failed when trying to clean stale ACLK alert entries from aclk_alert_%s, error message \"%s\"", uuid_str, err_msg); sqlite3_free(err_msg); } - buffer_free(sql); } +#define SQL_GET_MIN_MAX_ALERT_SEQ "SELECT MIN(sequence_id), MAX(sequence_id), " \ + "(SELECT MAX(sequence_id) FROM aclk_alert_%s WHERE date_submitted IS NOT NULL) " \ + "FROM aclk_alert_%s WHERE date_submitted IS NULL;" + int get_proto_alert_status(RRDHOST *host, struct proto_alert_status *proto_alert_status) { int rc; - struct aclk_database_worker_config *wc = NULL; - wc = (struct aclk_database_worker_config *)host->dbsync_worker; + struct aclk_sync_host_config *wc = NULL; + wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (!wc) return 1; proto_alert_status->alert_updates = wc->alert_updates; - proto_alert_status->alerts_batch_id = wc->alerts_batch_id; - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + char sql[ACLK_SYNC_QUERY_SIZE]; sqlite3_stmt *res = NULL; - buffer_sprintf(sql, "SELECT MIN(sequence_id), MAX(sequence_id), " \ - "(select MAX(sequence_id) from aclk_alert_%s where date_cloud_ack is not NULL), " \ - "(select MAX(sequence_id) from aclk_alert_%s where date_submitted is not NULL) " \ - "FROM aclk_alert_%s where date_submitted is null;", wc->uuid_str, wc->uuid_str, wc->uuid_str); + snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_GET_MIN_MAX_ALERT_SEQ, wc->uuid_str, wc->uuid_str); - rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to get alert log status from the database."); - buffer_free(sql); return 1; } while (sqlite3_step_monitored(res) == SQLITE_ROW) { proto_alert_status->pending_min_sequence_id = sqlite3_column_bytes(res, 0) > 0 ? (uint64_t) sqlite3_column_int64(res, 0) : 0; proto_alert_status->pending_max_sequence_id = sqlite3_column_bytes(res, 1) > 0 ? (uint64_t) sqlite3_column_int64(res, 1) : 0; - proto_alert_status->last_acked_sequence_id = sqlite3_column_bytes(res, 2) > 0 ? (uint64_t) sqlite3_column_int64(res, 2) : 0; - proto_alert_status->last_submitted_sequence_id = sqlite3_column_bytes(res, 3) > 0 ? (uint64_t) sqlite3_column_int64(res, 3) : 0; + proto_alert_status->last_submitted_sequence_id = sqlite3_column_bytes(res, 2) > 0 ? (uint64_t) sqlite3_column_int64(res, 2) : 0; } rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize statement to get alert log status from the database, rc = %d", rc); - buffer_free(sql); return 0; } + +void aclk_send_alarm_checkpoint(char *node_id, char *claim_id __maybe_unused) +{ + if (unlikely(!node_id)) + return; + + struct aclk_sync_host_config *wc = NULL; + RRDHOST *host = find_host_by_node_id(node_id); + + if (unlikely(!host)) + return; + + wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (unlikely(!wc)) { + log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", node_id); + return; + } + + log_access("ACLK REQ [%s (%s)]: ALERTS CHECKPOINT REQUEST RECEIVED", node_id, rrdhost_hostname(host)); + + wc->alert_checkpoint_req = SEND_CHECKPOINT_AFTER_HEALTH_LOOPS; +} + +typedef struct active_alerts { + char *name; + char *chart; + RRDCALC_STATUS status; +} active_alerts_t; + +static inline int compare_active_alerts(const void * a, const void * b) { + active_alerts_t *active_alerts_a = (active_alerts_t *)a; + active_alerts_t *active_alerts_b = (active_alerts_t *)b; + + if( !(strcmp(active_alerts_a->name, active_alerts_b->name)) ) + { + return strcmp(active_alerts_a->chart, active_alerts_b->chart); + } + else + return strcmp(active_alerts_a->name, active_alerts_b->name); +} + +void aclk_push_alarm_checkpoint(RRDHOST *host __maybe_unused) +{ +#ifdef ENABLE_ACLK + struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + if (unlikely(!wc)) { + log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", rrdhost_hostname(host)); + return; + } + + //TODO: make sure all pending events are sent. + if (rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS)) { + //postpone checkpoint send + wc->alert_checkpoint_req++; + log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT POSTPONED", rrdhost_hostname(host)); + return; + } + + //TODO: lock rc here, or make sure it's called when health decides + //count them + RRDCALC *rc; + uint32_t cnt = 0; + size_t len = 0; + active_alerts_t *active_alerts = NULL; + + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + + if (rc->status == RRDCALC_STATUS_WARNING || + rc->status == RRDCALC_STATUS_CRITICAL) { + + cnt++; + } + } + foreach_rrdcalc_in_rrdhost_done(rc); + + if (cnt) { + active_alerts = callocz(cnt, sizeof(active_alerts_t)); + cnt = 0; + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + + if (rc->status == RRDCALC_STATUS_WARNING || + rc->status == RRDCALC_STATUS_CRITICAL) { + + active_alerts[cnt].name = (char *)rrdcalc_name(rc); + len += string_strlen(rc->name); + active_alerts[cnt].chart = (char *)rrdcalc_chart_name(rc); + len += string_strlen(rc->chart); + active_alerts[cnt].status = rc->status; + len++; + cnt++; + } + } + foreach_rrdcalc_in_rrdhost_done(rc); + } + + BUFFER *alarms_to_hash; + if (cnt) { + qsort (active_alerts, cnt, sizeof(active_alerts_t), compare_active_alerts); + + alarms_to_hash = buffer_create(len, NULL); + for (uint32_t i=0;i<cnt;i++) { + buffer_strcat(alarms_to_hash, active_alerts[i].name); + buffer_strcat(alarms_to_hash, active_alerts[i].chart); + if (active_alerts[i].status == RRDCALC_STATUS_WARNING) + buffer_strcat(alarms_to_hash, "W"); + else if (active_alerts[i].status == RRDCALC_STATUS_CRITICAL) + buffer_strcat(alarms_to_hash, "C"); + } + } else { + alarms_to_hash = buffer_create(1, NULL); + buffer_strcat(alarms_to_hash, ""); + len = 0; + } + + char hash[SHA256_DIGEST_LENGTH + 1]; + if (hash256_string((const unsigned char *)buffer_tostring(alarms_to_hash), len, hash)) { + hash[SHA256_DIGEST_LENGTH] = 0; + + struct alarm_checkpoint alarm_checkpoint; + char *claim_id = get_agent_claimid(); + alarm_checkpoint.claim_id = claim_id; + alarm_checkpoint.node_id = wc->node_id; + alarm_checkpoint.checksum = (char *)hash; + + aclk_send_provide_alarm_checkpoint(&alarm_checkpoint); + log_access("ACLK RES [%s (%s)]: ALERTS CHECKPOINT SENT", wc->node_id, rrdhost_hostname(host)); + } else { + log_access("ACLK RES [%s (%s)]: FAILED TO CREATE ALERTS CHECKPOINT HASH", wc->node_id, rrdhost_hostname(host)); + } + wc->alert_checkpoint_req = 0; + buffer_free(alarms_to_hash); +#endif +} diff --git a/database/sqlite/sqlite_aclk_alert.h b/database/sqlite/sqlite_aclk_alert.h index 88a939e8..d7252aad 100644 --- a/database/sqlite/sqlite_aclk_alert.h +++ b/database/sqlite/sqlite_aclk_alert.h @@ -5,27 +5,31 @@ extern sqlite3 *db_meta; +#define SEND_REMOVED_AFTER_HEALTH_LOOPS 3 +#define SEND_CHECKPOINT_AFTER_HEALTH_LOOPS 4 + struct proto_alert_status { int alert_updates; - uint64_t alerts_batch_id; - uint64_t last_acked_sequence_id; uint64_t pending_min_sequence_id; uint64_t pending_max_sequence_id; uint64_t last_submitted_sequence_id; }; -int aclk_add_alert_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void aclk_send_alarm_health_log(char *node_id); -void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); +int aclk_add_alert_event(struct aclk_sync_host_config *wc, struct aclk_database_cmd cmd); +void aclk_push_alert_event(struct aclk_sync_host_config *wc); void aclk_send_alarm_configuration (char *config_hash); -int aclk_push_alert_config_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start_seq_id); +int aclk_push_alert_config_event(char *node_id, char *config_hash); +void aclk_start_alert_streaming(char *node_id, bool resets); void sql_queue_removed_alerts_to_aclk(RRDHOST *host); -void sql_process_queue_removed_alerts_to_aclk(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id, uint64_t snapshot_id, uint64_t sequence_id); +void sql_process_queue_removed_alerts_to_aclk(char *node_id); +void aclk_send_alarm_checkpoint(char *node_id, char *claim_id); +void aclk_push_alarm_checkpoint(RRDHOST *host); + +void aclk_push_alert_snapshot_event(char *node_id); +void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id, char *snapshot_uuid); int get_proto_alert_status(RRDHOST *host, struct proto_alert_status *proto_alert_status); int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter); +void aclk_push_alert_events_for_all_hosts(void); + #endif //NETDATA_SQLITE_ACLK_ALERT_H diff --git a/database/sqlite/sqlite_aclk_node.c b/database/sqlite/sqlite_aclk_node.c index afe77499..3817296d 100644 --- a/database/sqlite/sqlite_aclk_node.c +++ b/database/sqlite/sqlite_aclk_node.c @@ -25,12 +25,17 @@ DICTIONARY *collectors_from_charts(RRDHOST *host, DICTIONARY *dict) { return dict; } -#endif -void sql_build_node_collectors(struct aclk_database_worker_config *wc) +static void build_node_collectors(char *node_id __maybe_unused) { -#ifdef ENABLE_ACLK - if (!wc->host) + + RRDHOST *host = find_host_by_node_id(node_id); + + if (unlikely(!host)) + return; + + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *) host->aclk_sync_host_config; + if (unlikely(!wc)) return; struct update_node_collectors upd_node_collectors; @@ -39,48 +44,51 @@ void sql_build_node_collectors(struct aclk_database_worker_config *wc) upd_node_collectors.node_id = wc->node_id; upd_node_collectors.claim_id = get_agent_claimid(); - upd_node_collectors.node_collectors = collectors_from_charts(wc->host, dict); + upd_node_collectors.node_collectors = collectors_from_charts(host, dict); aclk_update_node_collectors(&upd_node_collectors); dictionary_destroy(dict); freez(upd_node_collectors.claim_id); - log_access("ACLK RES [%s (%s)]: NODE COLLECTORS SENT", wc->node_id, rrdhost_hostname(wc->host)); -#else - UNUSED(wc); -#endif - return; + log_access("ACLK RES [%s (%s)]: NODE COLLECTORS SENT", node_id, rrdhost_hostname(host)); + + freez(node_id); } -void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +static void build_node_info(char *node_id __maybe_unused) { - UNUSED(cmd); - -#ifdef ENABLE_ACLK struct update_node_info node_info; - if (!wc->host) { - wc->node_info_send = 1; + RRDHOST *host = find_host_by_node_id(node_id); + + if (unlikely((!host))) { + freez(node_id); + return; + } + + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *) host->aclk_sync_host_config; + + if (unlikely(!wc)) { + freez(node_id); return; } rrd_rdlock(); node_info.node_id = wc->node_id; node_info.claim_id = get_agent_claimid(); - node_info.machine_guid = wc->host_guid; + node_info.machine_guid = host->machine_guid; node_info.child = (wc->host != localhost); - node_info.ml_info.ml_capable = ml_capable(localhost); + node_info.ml_info.ml_capable = ml_capable(); node_info.ml_info.ml_enabled = ml_enabled(wc->host); node_info.node_instance_capabilities = aclk_get_node_instance_capas(wc->host); now_realtime_timeval(&node_info.updated_at); - RRDHOST *host = wc->host; char *host_version = NULL; if (host != localhost) { netdata_mutex_lock(&host->receiver_lock); - host_version = strdupz(host->receiver && host->receiver->program_version ? host->receiver->program_version : "unknown"); + host_version = strdupz(host->receiver && host->receiver->program_version ? host->receiver->program_version : rrdhost_program_version(host)); netdata_mutex_unlock(&host->receiver_lock); } @@ -91,7 +99,7 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat node_info.data.kernel_name = host->system_info->kernel_name; node_info.data.kernel_version = host->system_info->kernel_version; node_info.data.architecture = host->system_info->architecture; - node_info.data.cpus = host->system_info->host_cores ? str2uint32_t(host->system_info->host_cores) : 0; + node_info.data.cpus = host->system_info->host_cores ? str2uint32_t(host->system_info->host_cores, NULL) : 0; node_info.data.cpu_frequency = host->system_info->host_cpu_freq ? host->system_info->host_cpu_freq : "0"; node_info.data.memory = host->system_info->host_ram_total ? host->system_info->host_ram_total : "0"; node_info.data.disk_space = host->system_info->host_disk_space ? host->system_info->host_disk_space : "0"; @@ -101,7 +109,7 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat node_info.data.virtualization_type = host->system_info->virtualization ? host->system_info->virtualization : "unknown"; node_info.data.container_type = host->system_info->container ? host->system_info->container : "unknown"; node_info.data.custom_info = config_get(CONFIG_SECTION_WEB, "custom dashboard_info.js", ""); - node_info.data.machine_guid = wc->host_guid; + node_info.data.machine_guid = host->machine_guid; struct capability node_caps[] = { { .name = "ml", .version = host->system_info->ml_capable, .enabled = host->system_info->ml_enabled }, @@ -116,7 +124,7 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat node_info.data.host_labels_ptr = host->rrdlabels; aclk_update_node_info(&node_info); - log_access("ACLK RES [%s (%s)]: NODE INFO SENT for guid [%s] (%s)", wc->node_id, rrdhost_hostname(wc->host), wc->host_guid, wc->host == localhost ? "parent" : "child"); + log_access("ACLK RES [%s (%s)]: NODE INFO SENT for guid [%s] (%s)", wc->node_id, rrdhost_hostname(wc->host), host->machine_guid, wc->host == localhost ? "parent" : "child"); rrd_unlock(); freez(node_info.claim_id); @@ -124,9 +132,47 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat freez(host_version); wc->node_collectors_send = now_realtime_sec(); -#else - UNUSED(wc); -#endif + freez(node_id); + +} + - return; +void aclk_check_node_info_and_collectors(void) +{ + RRDHOST *host; + + if (unlikely(!aclk_connected)) + return; + + size_t pending = 0; + dfe_start_reentrant(rrdhost_root_index, host) { + + struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + if (unlikely(!wc)) + continue; + + if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD))) { + internal_error(true, "ACLK SYNC: Context still pending for %s", rrdhost_hostname(host)); + pending++; + continue; + } + + if (wc->node_info_send_time && wc->node_info_send_time + 30 < now_realtime_sec()) { + wc->node_info_send_time = 0; + build_node_info(strdupz(wc->node_id)); + internal_error(true, "ACLK SYNC: Sending node info for %s", rrdhost_hostname(host)); + } + + if (wc->node_collectors_send && wc->node_collectors_send + 30 < now_realtime_sec()) { + build_node_collectors(strdupz(wc->node_id)); + internal_error(true, "ACLK SYNC: Sending collectors for %s", rrdhost_hostname(host)); + wc->node_collectors_send = 0; + } + } + dfe_done(host); + + if(pending) + info("ACLK: %zu nodes are pending for contexts to load, skipped sending node info for them", pending); } + +#endif diff --git a/database/sqlite/sqlite_aclk_node.h b/database/sqlite/sqlite_aclk_node.h index c2c54f8c..6afdf8d7 100644 --- a/database/sqlite/sqlite_aclk_node.h +++ b/database/sqlite/sqlite_aclk_node.h @@ -3,6 +3,5 @@ #ifndef NETDATA_SQLITE_ACLK_NODE_H #define NETDATA_SQLITE_ACLK_NODE_H -void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void sql_build_node_collectors(struct aclk_database_worker_config *wc); +void aclk_check_node_info_and_collectors(void); #endif //NETDATA_SQLITE_ACLK_NODE_H diff --git a/database/sqlite/sqlite_context.c b/database/sqlite/sqlite_context.c index 892292cc..b72726dc 100644 --- a/database/sqlite/sqlite_context.c +++ b/database/sqlite/sqlite_context.c @@ -117,7 +117,6 @@ void sql_close_context_database(void) rc = sqlite3_close_v2(db_context_meta); if (unlikely(rc != SQLITE_OK)) error_report("Error %d while closing the context SQLite database, %s", rc, sqlite3_errstr(rc)); - return; } // @@ -243,8 +242,6 @@ failed: rc = sqlite3_reset(res); if (rc != SQLITE_OK) error_report("Failed to reset statement that fetches chart label data, rc = %d", rc); - - return; } // CONTEXT LIST @@ -372,9 +369,9 @@ int ctx_store_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) goto skip_store; } - rc = sqlite3_bind_int(res, 10, (time_t) context_data->deleted); + rc = sqlite3_bind_int(res, 10, context_data->deleted); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind last_time_t to store context details"); + error_report("Failed to bind deleted flag to store context details"); goto skip_store; } @@ -431,11 +428,11 @@ int ctx_delete_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) if (rc_stored != SQLITE_DONE) error_report("Failed to delete context %s, rc = %d", context_data->id, rc_stored); #ifdef NETDATA_INTERNAL_CHECKS - else { - char host_uuid_str[UUID_STR_LEN]; - uuid_unparse_lower(*host_uuid, host_uuid_str); - info("%s: Deleted context %s under host %s", __FUNCTION__ , context_data->id, host_uuid_str); - } + else { + char host_uuid_str[UUID_STR_LEN]; + uuid_unparse_lower(*host_uuid, host_uuid_str); + info("%s: Deleted context %s under host %s", __FUNCTION__, context_data->id, host_uuid_str); + } #endif skip_delete: @@ -449,6 +446,10 @@ skip_delete: int sql_context_cache_stats(int op) { int count, dummy; + + if (unlikely(!db_context_meta)) + return 0; + netdata_thread_disable_cancelability(); sqlite3_db_status(db_context_meta, op, &count, &dummy, 0); netdata_thread_enable_cancelability(); @@ -489,6 +490,8 @@ int ctx_unittest(void) uuid_t host_uuid; uuid_generate(host_uuid); + initialize_thread_key_pool(); + int rc = sql_init_context_database(1); if (rc != SQLITE_OK) @@ -556,6 +559,7 @@ int ctx_unittest(void) freez((void *)context_data.title); freez((void *)context_data.chart_type); freez((void *)context_data.family); + freez((void *)context_data.units); // The list should be empty info("List context start after delete"); diff --git a/database/sqlite/sqlite_db_migration.c b/database/sqlite/sqlite_db_migration.c index 8b1d0159..3132ae2d 100644 --- a/database/sqlite/sqlite_db_migration.c +++ b/database/sqlite/sqlite_db_migration.c @@ -7,7 +7,7 @@ static int return_int_cb(void *data, int argc, char **argv, char **column) int *status = data; UNUSED(argc); UNUSED(column); - *status = str2uint32_t(argv[0]); + *status = str2uint32_t(argv[0], NULL); return 0; } @@ -49,7 +49,7 @@ static int column_exists_in_table(const char *table, const char *column) } const char *database_migrate_v1_v2[] = { - "ALTER TABLE host ADD hops INTEGER;", + "ALTER TABLE host ADD hops INTEGER NOT NULL DEFAULT 0;", NULL }; diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index 1d03cfc2..2fca2dfc 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -49,7 +49,6 @@ const char *database_config[] = { const char *database_cleanup[] = { "DELETE FROM chart WHERE chart_id NOT IN (SELECT chart_id FROM dimension);", "DELETE FROM host WHERE host_id NOT IN (SELECT host_id FROM chart);", - "DELETE FROM chart_label WHERE chart_id NOT IN (SELECT chart_id FROM chart);", "DELETE FROM node_instance WHERE host_id NOT IN (SELECT host_id FROM host);", "DELETE FROM host_info WHERE host_id NOT IN (SELECT host_id FROM host);", "DELETE FROM host_label WHERE host_id NOT IN (SELECT host_id FROM host);", @@ -117,7 +116,6 @@ int execute_insert(sqlite3_stmt *res) break; } } - return rc; } @@ -156,6 +154,12 @@ static void release_statement(void *statement) error_report("Failed to finalize statement, rc = %d", rc); } +void initialize_thread_key_pool(void) +{ + for (int i = 0; i < MAX_PREPARED_STATEMENTS; i++) + (void)pthread_key_create(&key_pool[i], release_statement); +} + int prepare_statement(sqlite3 *database, const char *query, sqlite3_stmt **statement) { static __thread uint32_t keys_used = 0; @@ -448,8 +452,7 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) info("SQLite database initialization completed"); - for (int i = 0; i < MAX_PREPARED_STATEMENTS; i++) - (void)pthread_key_create(&key_pool[i], release_statement); + initialize_thread_key_pool(); rc = sqlite3_create_function(db_meta, "u2h", 1, SQLITE_ANY | SQLITE_DETERMINISTIC, 0, sqlite_uuid_parse, 0, 0); if (unlikely(rc != SQLITE_OK)) @@ -505,14 +508,15 @@ skip: error_report("Failed to finalize statement %s, rc = %d", sql, rc); return result; } - -void db_execute(const char *cmd) +// Return 0 OK +// Return 1 Failed +int db_execute(sqlite3 *db, const char *cmd) { int rc; int cnt = 0; while (cnt < SQL_MAX_RETRY) { char *err_msg; - rc = sqlite3_exec_monitored(db_meta, cmd, 0, 0, &err_msg); + rc = sqlite3_exec_monitored(db, cmd, 0, 0, &err_msg); if (rc != SQLITE_OK) { error_report("Failed to execute '%s', rc = %d (%s) -- attempt %d", cmd, rc, err_msg, cnt); sqlite3_free(err_msg); @@ -527,6 +531,7 @@ void db_execute(const char *cmd) ++cnt; } + return (rc != SQLITE_OK); } static inline void set_host_node_id(RRDHOST *host, uuid_t *node_id) @@ -540,7 +545,7 @@ static inline void set_host_node_id(RRDHOST *host, uuid_t *node_id) return; } - struct aclk_database_worker_config *wc = host->dbsync_worker; + struct aclk_sync_host_config *wc = host->aclk_sync_host_config; if (unlikely(!host->node_id)) host->node_id = mallocz(sizeof(*host->node_id)); @@ -594,7 +599,7 @@ int update_node_id(uuid_t *host_id, uuid_t *node_id) rrd_wrlock(); host = rrdhost_find_by_guid(host_guid); if (likely(host)) - set_host_node_id(host, node_id); + set_host_node_id(host, node_id); rrd_unlock(); failed: @@ -604,48 +609,6 @@ failed: return rc - 1; } -#define SQL_SELECT_HOSTNAME_BY_NODE_ID "SELECT h.hostname FROM node_instance ni, " \ -"host h WHERE ni.host_id = h.host_id AND ni.node_id = @node_id;" - -char *get_hostname_by_node_id(char *node) -{ - sqlite3_stmt *res = NULL; - char *hostname = NULL; - int rc; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - error_report("Database has not been initialized"); - return NULL; - } - - uuid_t node_id; - if (uuid_parse(node, node_id)) - return NULL; - - rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_HOSTNAME_BY_NODE_ID, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to fetch hostname by node id"); - return NULL; - } - - rc = sqlite3_bind_blob(res, 1, &node_id, sizeof(node_id), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id parameter to select node instance information"); - goto failed; - } - - rc = sqlite3_step_monitored(res); - if (likely(rc == SQLITE_ROW)) - hostname = strdupz((char *)sqlite3_column_text(res, 0)); - -failed: - if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when search for hostname by node id"); - - return hostname; -} - #define SQL_SELECT_HOST_BY_NODE_ID "select host_id from node_instance where node_id = @node_id;" int get_host_id(uuid_t *node_id, uuid_t *host_id) @@ -684,7 +647,7 @@ failed: return (rc == SQLITE_ROW) ? 0 : -1; } -#define SQL_SELECT_NODE_ID "select node_id from node_instance where host_id = @host_id and node_id not null;" +#define SQL_SELECT_NODE_ID "SELECT node_id FROM node_instance WHERE host_id = @host_id AND node_id IS NOT NULL;" int get_node_id(uuid_t *host_id, uuid_t *node_id) { @@ -720,8 +683,8 @@ failed: return (rc == SQLITE_ROW) ? 0 : -1; } -#define SQL_INVALIDATE_NODE_INSTANCES "update node_instance set node_id = NULL where exists " \ - "(select host_id from node_instance where host_id = @host_id and (@claim_id is null or claim_id <> @claim_id));" +#define SQL_INVALIDATE_NODE_INSTANCES "UPDATE node_instance SET node_id = NULL WHERE EXISTS " \ + "(SELECT host_id FROM node_instance WHERE host_id = @host_id AND (@claim_id IS NULL OR claim_id <> @claim_id));" void invalidate_node_instances(uuid_t *host_id, uuid_t *claim_id) { @@ -765,8 +728,8 @@ failed: error_report("Failed to finalize the prepared statement when invalidating node instance information"); } -#define SQL_GET_NODE_INSTANCE_LIST "select ni.node_id, ni.host_id, h.hostname " \ - "from node_instance ni, host h where ni.host_id = h.host_id;" +#define SQL_GET_NODE_INSTANCE_LIST "SELECT ni.node_id, ni.host_id, h.hostname " \ + "FROM node_instance ni, host h WHERE ni.host_id = h.host_id AND h.hops >=0;" struct node_instance_list *get_node_list(void) { @@ -805,13 +768,18 @@ struct node_instance_list *get_node_list(void) uuid_copy(node_list[row].node_id, *((uuid_t *)sqlite3_column_blob(res, 0))); if (sqlite3_column_bytes(res, 1) == sizeof(uuid_t)) { uuid_t *host_id = (uuid_t *)sqlite3_column_blob(res, 1); - uuid_copy(node_list[row].host_id, *host_id); - node_list[row].queryable = 1; uuid_unparse_lower(*host_id, host_guid); RRDHOST *host = rrdhost_find_by_guid(host_guid); - node_list[row].live = host && (host == localhost || host->receiver) ? 1 : 0; + if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD)) { + info("ACLK: 'host:%s' skipping get node list because context is initializing", rrdhost_hostname(host)); + continue; + } + uuid_copy(node_list[row].host_id, *host_id); + node_list[row].queryable = 1; + node_list[row].live = (host && (host == localhost || host->receiver + || !(rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN)))) ? 1 : 0; node_list[row].hops = (host && host->system_info) ? host->system_info->hops : - uuid_compare(*host_id, localhost->host_uuid) ? 1 : 0; + uuid_memcmp(host_id, &localhost->host_uuid) ? 1 : 0; node_list[row].hostname = sqlite3_column_bytes(res, 2) ? strdupz((char *)sqlite3_column_text(res, 2)) : NULL; } @@ -950,6 +918,10 @@ int bind_text_null(sqlite3_stmt *res, int position, const char *text, bool can_b int sql_metadata_cache_stats(int op) { int count, dummy; + + if (unlikely(!db_meta)) + return 0; + netdata_thread_disable_cancelability(); sqlite3_db_status(db_meta, op, &count, &dummy, 0); netdata_thread_enable_cancelability(); diff --git a/database/sqlite/sqlite_functions.h b/database/sqlite/sqlite_functions.h index 40abd010..ee63a397 100644 --- a/database/sqlite/sqlite_functions.h +++ b/database/sqlite/sqlite_functions.h @@ -55,7 +55,8 @@ int bind_text_null(sqlite3_stmt *res, int position, const char *text, bool can_b int prepare_statement(sqlite3 *database, const char *query, sqlite3_stmt **statement); int execute_insert(sqlite3_stmt *res); int exec_statement_with_uuid(const char *sql, uuid_t *uuid); -void db_execute(const char *cmd); +int db_execute(sqlite3 *database, const char *cmd); +void initialize_thread_key_pool(void); // Look up functions int get_node_id(uuid_t *host_id, uuid_t *node_id); diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index 471fa3ad..dd08f63e 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -12,7 +12,7 @@ #define SQL_CREATE_HEALTH_LOG_TABLE(guid) "CREATE TABLE IF NOT EXISTS health_log_%s(hostname text, unique_id int, alarm_id int, alarm_event_id int, config_hash_id blob, updated_by_id int, updates_id int, when_key int, duration int, non_clear_duration int, flags int, exec_run_timestamp int, delay_up_to_timestamp int, name text, chart text, family text, exec text, recipient text, source text, units text, info text, exec_code int, new_status real, old_status real, delay int, new_value double, old_value double, last_repeat int, class text, component text, type text, chart_context text);", guid int sql_create_health_log_table(RRDHOST *host) { int rc; - char *err_msg = NULL, command[MAX_HEALTH_SQL_SIZE + 1]; + char command[MAX_HEALTH_SQL_SIZE + 1]; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) @@ -25,18 +25,17 @@ int sql_create_health_log_table(RRDHOST *host) { snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CREATE_HEALTH_LOG_TABLE(uuid_str)); - rc = sqlite3_exec_monitored(db_meta, command, 0, 0, &err_msg); - if (rc != SQLITE_OK) { - error_report("HEALTH [%s]: SQLite error during creation of health log table, rc = %d (%s)", rrdhost_hostname(host), rc, err_msg); - sqlite3_free(err_msg); - return 1; + rc = db_execute(db_meta, command); + if (unlikely(rc)) + error_report("HEALTH [%s]: SQLite error during creation of health log table", rrdhost_hostname(host)); + else { + snprintfz(command, MAX_HEALTH_SQL_SIZE, "CREATE INDEX IF NOT EXISTS health_log_index_%s ON health_log_%s (unique_id); ", uuid_str, uuid_str); + rc = db_execute(db_meta, command); + if (unlikely(unlikely(rc))) + error_report("HEALTH [%s]: SQLite error during creation of health log table index", rrdhost_hostname(host)); } - snprintfz(command, MAX_HEALTH_SQL_SIZE, "CREATE INDEX IF NOT EXISTS " - "health_log_index_%s ON health_log_%s (unique_id); ", uuid_str, uuid_str); - db_execute(command); - - return 0; + return rc; } /* Health related SQL queries @@ -104,7 +103,7 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { error_report("HEALTH [%s]: Failed to update health log, rc = %d", rrdhost_hostname(host), rc); } - failed: +failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) error_report("HEALTH [%s]: Failed to finalize the prepared statement for updating health log.", rrdhost_hostname(host)); } @@ -345,7 +344,7 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { ae->flags |= HEALTH_ENTRY_FLAG_SAVED; host->health.health_log_entries_written++; - failed: +failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) error_report("HEALTH [%s]: Failed to finalize the prepared statement for inserting to health log.", rrdhost_hostname(host)); } @@ -452,7 +451,7 @@ void sql_health_alarm_log_count(RRDHOST *host) { #define SQL_INJECT_REMOVED_UPDATE(guid) "update health_log_%s set flags = flags | ?1, updated_by_id = ?2 where unique_id = ?3; ", guid void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm_event_id, uint32_t unique_id, uint32_t max_unique_id) { - int rc = 0; + int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; if (!alarm_id || !alarm_event_id || !unique_id || !max_unique_id) @@ -546,7 +545,7 @@ failed: #define SQL_SELECT_MAX_UNIQUE_ID(guid) "SELECT MAX(unique_id) from health_log_%s", guid uint32_t sql_get_max_unique_id (char *uuid_str) { - int rc = 0; + int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; uint32_t max_unique_id = 0; @@ -573,10 +572,9 @@ uint32_t sql_get_max_unique_id (char *uuid_str) #define SQL_SELECT_LAST_STATUSES(guid) "SELECT new_status, unique_id, alarm_id, alarm_event_id from health_log_%s group by alarm_id having max(alarm_event_id)", guid void sql_check_removed_alerts_state(char *uuid_str) { - int rc = 0; + int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; - RRDCALC_STATUS status; - uint32_t alarm_id = 0, alarm_event_id = 0, unique_id = 0, max_unique_id = 0; + uint32_t max_unique_id = 0; sqlite3_stmt *res = NULL; @@ -588,6 +586,9 @@ void sql_check_removed_alerts_state(char *uuid_str) } while (sqlite3_step_monitored(res) == SQLITE_ROW) { + uint32_t alarm_id, alarm_event_id, unique_id; + RRDCALC_STATUS status; + status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); unique_id = (uint32_t) sqlite3_column_int64(res, 1); alarm_id = (uint32_t) sqlite3_column_int64(res, 2); @@ -683,8 +684,7 @@ void sql_health_alarm_log_load(RRDHOST *host) { } // Check if we got last_repeat field - time_t last_repeat = 0; - last_repeat = (time_t)sqlite3_column_int64(res, 27); + time_t last_repeat = (time_t)sqlite3_column_int64(res, 27); rc = dictionary_get(all_rrdcalcs, (char *) sqlite3_column_text(res, 14)); if(unlikely(rc)) { @@ -847,196 +847,161 @@ int sql_store_alert_config_hash(uuid_t *hash_id, struct alert_config *cfg) } } - param++; - rc = sqlite3_bind_blob(res, param, hash_id, sizeof(*hash_id), SQLITE_STATIC); + rc = sqlite3_bind_blob(res, ++param, hash_id, sizeof(*hash_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->alarm, param); + rc = sqlite3_bind_string_or_null(res, cfg->alarm, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->template_key, param); + rc = sqlite3_bind_string_or_null(res, cfg->template_key, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->on, param); + rc = sqlite3_bind_string_or_null(res, cfg->on, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->classification, param); + rc = sqlite3_bind_string_or_null(res, cfg->classification, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->component, param); + rc = sqlite3_bind_string_or_null(res, cfg->component, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->type, param); + rc = sqlite3_bind_string_or_null(res, cfg->type, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->os, param); + rc = sqlite3_bind_string_or_null(res, cfg->os, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->host, param); + rc = sqlite3_bind_string_or_null(res, cfg->host, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->lookup, param); + rc = sqlite3_bind_string_or_null(res, cfg->lookup, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->every, param); + rc = sqlite3_bind_string_or_null(res, cfg->every, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->units, param); + rc = sqlite3_bind_string_or_null(res, cfg->units, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->calc, param); + rc = sqlite3_bind_string_or_null(res, cfg->calc, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->families, param); + rc = sqlite3_bind_string_or_null(res, cfg->families, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->plugin, param); + rc = sqlite3_bind_string_or_null(res, cfg->plugin, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->module, param); + rc = sqlite3_bind_string_or_null(res, cfg->module, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->charts, param); + rc = sqlite3_bind_string_or_null(res, cfg->charts, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->green, param); + rc = sqlite3_bind_string_or_null(res, cfg->green, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->red, param); + rc = sqlite3_bind_string_or_null(res, cfg->red, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->warn, param); + rc = sqlite3_bind_string_or_null(res, cfg->warn, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->crit, param); + rc = sqlite3_bind_string_or_null(res, cfg->crit, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->exec, param); + rc = sqlite3_bind_string_or_null(res, cfg->exec, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->to, param); + rc = sqlite3_bind_string_or_null(res, cfg->to, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->info, param); + rc = sqlite3_bind_string_or_null(res, cfg->info, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->delay, param); + rc = sqlite3_bind_string_or_null(res, cfg->delay, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->options, param); + rc = sqlite3_bind_string_or_null(res, cfg->options, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->repeat, param); + rc = sqlite3_bind_string_or_null(res, cfg->repeat, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->host_labels, param); + rc = sqlite3_bind_string_or_null(res, cfg->host_labels, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; if (cfg->p_db_lookup_after) { - param++; - rc = sqlite3_bind_string_or_null(res, cfg->p_db_lookup_dimensions, param); + rc = sqlite3_bind_string_or_null(res, cfg->p_db_lookup_dimensions, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_string_or_null(res, cfg->p_db_lookup_method, param); + rc = sqlite3_bind_string_or_null(res, cfg->p_db_lookup_method, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 31, cfg->p_db_lookup_options); + rc = sqlite3_bind_int(res, ++param, (int) cfg->p_db_lookup_options); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 32, cfg->p_db_lookup_after); + rc = sqlite3_bind_int(res, ++param, (int) cfg->p_db_lookup_after); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 33, cfg->p_db_lookup_before); + rc = sqlite3_bind_int(res, ++param, (int) cfg->p_db_lookup_before); if (unlikely(rc != SQLITE_OK)) goto bind_fail; } else { - param++; - rc = sqlite3_bind_null(res, 29); + rc = sqlite3_bind_null(res, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_null(res, 30); + + rc = sqlite3_bind_null(res, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_null(res, 31); + + rc = sqlite3_bind_null(res, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_null(res, 32); + + rc = sqlite3_bind_null(res, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_null(res, 33); + + rc = sqlite3_bind_null(res, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; } - param++; - rc = sqlite3_bind_int(res, 34, cfg->p_update_every); + rc = sqlite3_bind_int(res, ++param, cfg->p_update_every); if (unlikely(rc != SQLITE_OK)) goto bind_fail; @@ -1050,7 +1015,7 @@ int sql_store_alert_config_hash(uuid_t *hash_id, struct alert_config *cfg) return 0; - bind_fail: +bind_fail: error_report("Failed to bind parameter %d to store alert hash_id, rc = %d", param, rc); rc = sqlite3_reset(res); if (unlikely(rc != SQLITE_OK)) diff --git a/database/sqlite/sqlite_metadata.c b/database/sqlite/sqlite_metadata.c index 35f928ff..607d789a 100644 --- a/database/sqlite/sqlite_metadata.c +++ b/database/sqlite/sqlite_metadata.c @@ -64,9 +64,11 @@ enum metadata_opcode { METADATA_STORE_CLAIM_ID, METADATA_ADD_HOST_INFO, METADATA_SCAN_HOSTS, + METADATA_LOAD_HOST_CONTEXT, METADATA_MAINTENANCE, METADATA_SYNC_SHUTDOWN, METADATA_UNITTEST, + METADATA_ML_LOAD_MODELS, // leave this last // we need it to check for worker utilization METADATA_MAX_ENUMERATIONS_DEFINED @@ -154,19 +156,43 @@ static int chart_label_store_to_sql_callback(const char *name, const char *value return 1; } -static void check_and_update_chart_labels(RRDSET *st, BUFFER *work_buffer) +#define SQL_DELETE_CHART_LABEL "DELETE FROM chart_label WHERE chart_id = @chart_id;" +#define SQL_DELETE_CHART_LABEL_HISTORY "DELETE FROM chart_label WHERE date_created < %ld AND chart_id = @chart_id;" + +static void clean_old_chart_labels(RRDSET *st) +{ + char sql[512]; + time_t first_time_s = rrdset_first_entry_s(st); + + if (unlikely(!first_time_s)) + snprintfz(sql, 511,SQL_DELETE_CHART_LABEL); + else + snprintfz(sql, 511,SQL_DELETE_CHART_LABEL_HISTORY, first_time_s); + + int rc = exec_statement_with_uuid(sql, &st->chart_uuid); + if (unlikely(rc)) + error_report("METADATA: 'host:%s' Failed to clean old labels for chart %s", rrdhost_hostname(st->rrdhost), rrdset_name(st)); +} + +static int check_and_update_chart_labels(RRDSET *st, BUFFER *work_buffer, size_t *query_counter) { size_t old_version = st->rrdlabels_last_saved_version; size_t new_version = dictionary_version(st->rrdlabels); - if(new_version != old_version) { - buffer_flush(work_buffer); - struct query_build tmp = {.sql = work_buffer, .count = 0}; - uuid_unparse_lower(st->chart_uuid, tmp.uuid_str); - rrdlabels_walkthrough_read(st->rrdlabels, chart_label_store_to_sql_callback, &tmp); + if (new_version == old_version) + return 0; + + struct query_build tmp = {.sql = work_buffer, .count = 0}; + uuid_unparse_lower(st->chart_uuid, tmp.uuid_str); + rrdlabels_walkthrough_read(st->rrdlabels, chart_label_store_to_sql_callback, &tmp); + int rc = db_execute(db_meta, buffer_tostring(work_buffer)); + if (likely(!rc)) { st->rrdlabels_last_saved_version = new_version; - db_execute(buffer_tostring(work_buffer)); + (*query_counter)++; } + + clean_old_chart_labels(st); + return rc; } // Migrate all hosts with hops zero to this host_uuid @@ -177,12 +203,13 @@ void migrate_localhost(uuid_t *host_uuid) rc = exec_statement_with_uuid(MIGRATE_LOCALHOST_TO_NEW_MACHINE_GUID, host_uuid); if (!rc) rc = exec_statement_with_uuid(DELETE_NON_EXISTING_LOCALHOST, host_uuid); - if (!rc) - db_execute(DELETE_MISSING_NODE_INSTANCES); - + if (!rc) { + if (unlikely(db_execute(db_meta, DELETE_MISSING_NODE_INSTANCES))) + error_report("Failed to remove deleted hosts from node instances"); + } } -static void store_claim_id(uuid_t *host_id, uuid_t *claim_id) +static int store_claim_id(uuid_t *host_id, uuid_t *claim_id) { sqlite3_stmt *res = NULL; int rc; @@ -190,18 +217,18 @@ static void store_claim_id(uuid_t *host_id, uuid_t *claim_id) if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) error_report("Database has not been initialized"); - return; + return 1; } rc = sqlite3_prepare_v2(db_meta, SQL_STORE_CLAIM_ID, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement store chart labels"); - return; + error_report("Failed to prepare statement to store host claim id"); + return 1; } rc = sqlite3_bind_blob(res, 1, host_id, sizeof(*host_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id parameter to store node instance information"); + error_report("Failed to bind host_id parameter to store claim id"); goto failed; } @@ -210,17 +237,19 @@ static void store_claim_id(uuid_t *host_id, uuid_t *claim_id) else rc = sqlite3_bind_null(res, 2); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind claim_id parameter to store node instance information"); + error_report("Failed to bind claim_id parameter to host claim id"); goto failed; } rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to store node instance information, rc = %d", rc); + error_report("Failed to store host claim id rc = %d", rc); failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) - error_report("Failed to finalize the prepared statement when storing node instance information"); + error_report("Failed to finalize the prepared statement when storing a host claim id"); + + return rc != SQLITE_DONE; } static void delete_dimension_uuid(uuid_t *dimension_uuid) @@ -252,7 +281,7 @@ skip_execution: // // Store host and host system info information in the database -static int sql_store_host_info(RRDHOST *host) +static int store_host_metadata(RRDHOST *host) { static __thread sqlite3_stmt *res = NULL; int rc, param = 0; @@ -340,7 +369,7 @@ static int sql_store_host_info(RRDHOST *host) if (unlikely(rc != SQLITE_OK)) error_report("Failed to reset statement to store host %s, rc = %d", rrdhost_hostname(host), rc); - return !(store_rc == SQLITE_DONE); + return store_rc != SQLITE_DONE; bind_fail: error_report("Failed to bind %d parameter to store host %s, rc = %d", param, rrdhost_hostname(host), rc); rc = sqlite3_reset(res); @@ -349,7 +378,7 @@ bind_fail: return 1; } -static void sql_store_host_system_info_key_value(const char *name, const char *value, void *data) +static void add_host_sysinfo_key_value(const char *name, const char *value, void *data) { struct query_build *lb = data; @@ -365,44 +394,43 @@ static void sql_store_host_system_info_key_value(const char *name, const char *v lb->count++; } -static BUFFER *sql_store_host_system_info(RRDHOST *host) +static bool build_host_system_info_statements(RRDHOST *host, BUFFER *work_buffer) { struct rrdhost_system_info *system_info = host->system_info; if (unlikely(!system_info)) - return NULL; - - BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + return false; + buffer_flush(work_buffer); struct query_build key_data = {.sql = work_buffer, .count = 0}; uuid_unparse_lower(host->host_uuid, key_data.uuid_str); - sql_store_host_system_info_key_value("NETDATA_CONTAINER_OS_NAME", system_info->container_os_name, &key_data); - sql_store_host_system_info_key_value("NETDATA_CONTAINER_OS_ID", system_info->container_os_id, &key_data); - sql_store_host_system_info_key_value("NETDATA_CONTAINER_OS_ID_LIKE", system_info->container_os_id_like, &key_data); - sql_store_host_system_info_key_value("NETDATA_CONTAINER_OS_VERSION", system_info->container_os_version, &key_data); - sql_store_host_system_info_key_value("NETDATA_CONTAINER_OS_VERSION_ID", system_info->container_os_version_id, &key_data); - sql_store_host_system_info_key_value("NETDATA_CONTAINER_OS_DETECTION", system_info->host_os_detection, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_OS_NAME", system_info->host_os_name, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_OS_ID", system_info->host_os_id, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_OS_ID_LIKE", system_info->host_os_id_like, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_OS_VERSION", system_info->host_os_version, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_OS_VERSION_ID", system_info->host_os_version_id, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_OS_DETECTION", system_info->host_os_detection, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_KERNEL_NAME", system_info->kernel_name, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT", system_info->host_cores, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_CPU_FREQ", system_info->host_cpu_freq, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_TOTAL_RAM", system_info->host_ram_total, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_TOTAL_DISK_SIZE", system_info->host_disk_space, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_KERNEL_VERSION", system_info->kernel_version, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_ARCHITECTURE", system_info->architecture, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_VIRTUALIZATION", system_info->virtualization, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_VIRT_DETECTION", system_info->virt_detection, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_CONTAINER", system_info->container, &key_data); - sql_store_host_system_info_key_value("NETDATA_SYSTEM_CONTAINER_DETECTION", system_info->container_detection, &key_data); - sql_store_host_system_info_key_value("NETDATA_HOST_IS_K8S_NODE", system_info->is_k8s_node, &key_data); - - return work_buffer; + add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_NAME", system_info->container_os_name, &key_data); + add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_ID", system_info->container_os_id, &key_data); + add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_ID_LIKE", system_info->container_os_id_like, &key_data); + add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_VERSION", system_info->container_os_version, &key_data); + add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_VERSION_ID", system_info->container_os_version_id, &key_data); + add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_DETECTION", system_info->host_os_detection, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_OS_NAME", system_info->host_os_name, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_OS_ID", system_info->host_os_id, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_OS_ID_LIKE", system_info->host_os_id_like, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_OS_VERSION", system_info->host_os_version, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_OS_VERSION_ID", system_info->host_os_version_id, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_OS_DETECTION", system_info->host_os_detection, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_KERNEL_NAME", system_info->kernel_name, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT", system_info->host_cores, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_CPU_FREQ", system_info->host_cpu_freq, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_TOTAL_RAM", system_info->host_ram_total, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_TOTAL_DISK_SIZE", system_info->host_disk_space, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_KERNEL_VERSION", system_info->kernel_version, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_ARCHITECTURE", system_info->architecture, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_VIRTUALIZATION", system_info->virtualization, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_VIRT_DETECTION", system_info->virt_detection, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_CONTAINER", system_info->container, &key_data); + add_host_sysinfo_key_value("NETDATA_SYSTEM_CONTAINER_DETECTION", system_info->container_detection, &key_data); + add_host_sysinfo_key_value("NETDATA_HOST_IS_K8S_NODE", system_info->is_k8s_node, &key_data); + + return true; } @@ -410,13 +438,10 @@ static BUFFER *sql_store_host_system_info(RRDHOST *host) * Store a chart in the database */ -static int sql_store_chart( - uuid_t *chart_uuid, uuid_t *host_uuid, const char *type, const char *id, const char *name, const char *family, - const char *context, const char *title, const char *units, const char *plugin, const char *module, long priority, - int update_every, int chart_type, int memory_mode, long history_entries) +static int store_chart_metadata(RRDSET *st) { static __thread sqlite3_stmt *res = NULL; - int rc, param = 0; + int rc, param = 0, store_rc = 0; if (unlikely(!db_meta)) { if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) @@ -433,98 +458,83 @@ static int sql_store_chart( } } - param++; - rc = sqlite3_bind_blob(res, 1, chart_uuid, sizeof(*chart_uuid), SQLITE_STATIC); + rc = sqlite3_bind_blob(res, ++param, &st->chart_uuid, sizeof(st->chart_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_blob(res, 2, host_uuid, sizeof(*host_uuid), SQLITE_STATIC); + rc = sqlite3_bind_blob(res, ++param, &st->rrdhost->host_uuid, sizeof(st->rrdhost->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 3, type, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, string2str(st->parts.type), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 4, id, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, string2str(st->parts.id), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; + const char *name = string2str(st->parts.name); if (name && *name) - rc = sqlite3_bind_text(res, 5, name, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, name, -1, SQLITE_STATIC); else - rc = sqlite3_bind_null(res, 5); + rc = sqlite3_bind_null(res, ++param); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 6, family, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, rrdset_family(st), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 7, context, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, rrdset_context(st), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 8, title, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, rrdset_title(st), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 9, units, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, rrdset_units(st), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 10, plugin, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, rrdset_plugin_name(st), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_text(res, 11, module, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, rrdset_module_name(st), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 12, (int) priority); + rc = sqlite3_bind_int(res, ++param, (int) st->priority); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 13, update_every); + rc = sqlite3_bind_int(res, ++param, st->update_every); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 14, chart_type); + rc = sqlite3_bind_int(res, ++param, st->chart_type); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 15, memory_mode); + rc = sqlite3_bind_int(res, ++param, st->rrd_memory_mode); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - param++; - rc = sqlite3_bind_int(res, 16, (int) history_entries); + rc = sqlite3_bind_int(res, ++param, (int) st->entries); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to store chart, rc = %d", rc); + store_rc = execute_insert(res); + if (unlikely(store_rc != SQLITE_DONE)) + error_report("Failed to store chart, rc = %d", store_rc); rc = sqlite3_reset(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to reset statement in chart store function, rc = %d", rc); - return 0; + return store_rc != SQLITE_DONE; bind_fail: error_report("Failed to bind parameter %d to store chart, rc = %d", param, rc); @@ -537,9 +547,7 @@ bind_fail: /* * Store a dimension */ -static int sql_store_dimension( - uuid_t *dim_uuid, uuid_t *chart_uuid, const char *id, const char *name, collected_number multiplier, - collected_number divisor, int algorithm, bool hidden) +static int store_dimension_metadata(RRDDIM *rd) { static __thread sqlite3_stmt *res = NULL; int rc, param = 0; @@ -559,35 +567,35 @@ static int sql_store_dimension( } } - rc = sqlite3_bind_blob(res, ++param, dim_uuid, sizeof(*dim_uuid), SQLITE_STATIC); + rc = sqlite3_bind_blob(res, ++param, &rd->metric_uuid, sizeof(rd->metric_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_blob(res, ++param, chart_uuid, sizeof(*chart_uuid), SQLITE_STATIC); + rc = sqlite3_bind_blob(res, ++param, &rd->rrdset->chart_uuid, sizeof(rd->rrdset->chart_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_text(res, ++param, id, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, string2str(rd->id), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_text(res, ++param, name, -1, SQLITE_STATIC); + rc = sqlite3_bind_text(res, ++param, string2str(rd->name), -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_int(res, ++param, (int) multiplier); + rc = sqlite3_bind_int(res, ++param, (int) rd->multiplier); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_int(res, ++param, (int ) divisor); + rc = sqlite3_bind_int(res, ++param, (int ) rd->divisor); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_int(res, ++param, algorithm); + rc = sqlite3_bind_int(res, ++param, rd->algorithm); if (unlikely(rc != SQLITE_OK)) goto bind_fail; - if (hidden) + if (rrddim_option_check(rd, RRDDIM_OPTION_HIDDEN)) rc = sqlite3_bind_text(res, ++param, "hidden", -1, SQLITE_STATIC); else rc = sqlite3_bind_null(res, ++param); @@ -700,7 +708,6 @@ static void cleanup_health_log(void) // // EVENT LOOP STARTS HERE // -static uv_mutex_t metadata_async_lock; static void metadata_init_cmd_queue(struct metadata_wc *wc) { @@ -856,6 +863,7 @@ static void start_metadata_cleanup(uv_work_t *req) struct metadata_wc *wc = req->data; check_dimension_metadata(wc); cleanup_health_log(); + (void) sqlite3_wal_checkpoint(db_meta, NULL); worker_is_idle(); } @@ -863,9 +871,125 @@ struct scan_metadata_payload { uv_work_t request; struct metadata_wc *wc; struct completion *completion; + BUFFER *work_buffer; uint32_t max_count; }; +struct host_context_load_thread { + uv_thread_t thread; + RRDHOST *host; + bool busy; + bool finished; +}; + +static void restore_host_context(void *arg) +{ + struct host_context_load_thread *hclt = arg; + RRDHOST *host = hclt->host; + + usec_t started_ut = now_monotonic_usec(); (void)started_ut; + rrdhost_load_rrdcontext_data(host); + usec_t ended_ut = now_monotonic_usec(); (void)ended_ut; + + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD | RRDHOST_FLAG_CONTEXT_LOAD_IN_PROGRESS); + +#ifdef ENABLE_ACLK + aclk_queue_node_info(host, false); +#endif + + internal_error(true, "METADATA: 'host:%s' context load in %0.2f ms", rrdhost_hostname(host), + (double)(ended_ut - started_ut) / USEC_PER_MS); + + __atomic_store_n(&hclt->finished, true, __ATOMIC_RELEASE); +} + +// Callback after scan of hosts is done +static void after_start_host_load_context(uv_work_t *req, int status __maybe_unused) +{ + struct scan_metadata_payload *data = req->data; + freez(data); +} + +#define MAX_FIND_THREAD_RETRIES (10) + +static void cleanup_finished_threads(struct host_context_load_thread *hclt, size_t max_thread_slots, bool wait) +{ + for (size_t index = 0; index < max_thread_slots; index++) { + if (__atomic_load_n(&(hclt[index].finished), __ATOMIC_RELAXED) + || (wait && __atomic_load_n(&(hclt[index].busy), __ATOMIC_ACQUIRE))) { + int rc = uv_thread_join(&(hclt[index].thread)); + if (rc) + error("Failed to join thread, rc = %d",rc); + __atomic_store_n(&(hclt[index].busy), false, __ATOMIC_RELEASE); + __atomic_store_n(&(hclt[index].finished), false, __ATOMIC_RELEASE); + } + } +} + +static size_t find_available_thread_slot(struct host_context_load_thread *hclt, size_t max_thread_slots, size_t *found_index) +{ + size_t retries = MAX_FIND_THREAD_RETRIES; + while (retries--) { + size_t index = 0; + while (index < max_thread_slots) { + if (false == __atomic_load_n(&(hclt[index].busy), __ATOMIC_ACQUIRE)) { + *found_index = index; + return true; + } + index++; + } + sleep_usec(10 * USEC_PER_MS); + } + return false; +} + +static void start_all_host_load_context(uv_work_t *req __maybe_unused) +{ + register_libuv_worker_jobs(); + + struct scan_metadata_payload *data = req->data; + UNUSED(data); + + worker_is_busy(UV_EVENT_HOST_CONTEXT_LOAD); + usec_t started_ut = now_monotonic_usec(); (void)started_ut; + + RRDHOST *host; + + size_t max_threads = MIN(get_netdata_cpus() / 2, 6); + struct host_context_load_thread *hclt = callocz(max_threads, sizeof(*hclt)); + + size_t thread_index; + dfe_start_reentrant(rrdhost_root_index, host) { + if (rrdhost_flag_check(host, RRDHOST_FLAG_CONTEXT_LOAD_IN_PROGRESS) || + !rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD)) + continue; + + rrdhost_flag_set(host, RRDHOST_FLAG_CONTEXT_LOAD_IN_PROGRESS); + internal_error(true, "METADATA: 'host:%s' loading context", rrdhost_hostname(host)); + + cleanup_finished_threads(hclt, max_threads, false); + bool found_slot = find_available_thread_slot(hclt, max_threads, &thread_index); + + if (unlikely(!found_slot)) { + struct host_context_load_thread hclt_sync = {.host = host}; + restore_host_context(&hclt_sync); + } + else { + __atomic_store_n(&hclt[thread_index].busy, true, __ATOMIC_RELAXED); + hclt[thread_index].host = host; + assert(0 == uv_thread_create(&hclt[thread_index].thread, restore_host_context, &hclt[thread_index])); + } + } + dfe_done(host); + + cleanup_finished_threads(hclt, max_threads, true); + freez(hclt); + usec_t ended_ut = now_monotonic_usec(); (void)ended_ut; + internal_error(true, "METADATA: 'host:ALL' contexts loaded in %0.2f ms", (double)(ended_ut - started_ut) / USEC_PER_MS); + + worker_is_idle(); +} + // Callback after scan of hosts is done static void after_metadata_hosts(uv_work_t *req, int status __maybe_unused) { @@ -881,13 +1005,15 @@ static void after_metadata_hosts(uv_work_t *req, int status __maybe_unused) freez(data); } -static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, size_t *query_counter) { +static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, bool use_transaction, BUFFER *work_buffer, size_t *query_counter) { RRDSET *st; int rc; bool more_to_do = false; uint32_t scan_count = 1; - BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + + if (use_transaction) + (void)db_execute(db_meta, "BEGIN TRANSACTION;"); rrdset_foreach_reentrant(st, host) { if (scan_count == max_count) { @@ -900,27 +1026,16 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, size_t *query_ rrdset_flag_clear(st, RRDSET_FLAG_METADATA_UPDATE); scan_count++; - check_and_update_chart_labels(st, work_buffer); - - rc = sql_store_chart( - &st->chart_uuid, - &st->rrdhost->host_uuid, - string2str(st->parts.type), - string2str(st->parts.id), - string2str(st->parts.name), - rrdset_family(st), - rrdset_context(st), - rrdset_title(st), - rrdset_units(st), - rrdset_plugin_name(st), - rrdset_module_name(st), - st->priority, - st->update_every, - st->chart_type, - st->rrd_memory_mode, - st->entries); + buffer_flush(work_buffer); + rc = check_and_update_chart_labels(st, work_buffer, query_counter); + if (unlikely(rc)) + error_report("METADATA: 'host:%s': Failed to update labels for chart %s", rrdhost_hostname(host), rrdset_name(st)); + else + (*query_counter)++; + + rc = store_chart_metadata(st); if (unlikely(rc)) - internal_error(true, "METADATA: Failed to store chart metadata %s", string2str(st->id)); + error_report("METADATA: 'host:%s': Failed to store metadata for chart %s", rrdhost_hostname(host), rrdset_name(st)); } RRDDIM *rd; @@ -935,116 +1050,145 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, size_t *query_ else rrddim_flag_clear(rd, RRDDIM_FLAG_META_HIDDEN); - rc = sql_store_dimension( - &rd->metric_uuid, - &rd->rrdset->chart_uuid, - string2str(rd->id), - string2str(rd->name), - rd->multiplier, - rd->divisor, - rd->algorithm, - rrddim_option_check(rd, RRDDIM_OPTION_HIDDEN)); - + rc = store_dimension_metadata(rd); if (unlikely(rc)) - error_report("METADATA: Failed to store dimension %s", string2str(rd->id)); + error_report("METADATA: 'host:%s': Failed to dimension metadata for chart %s. dimension %s", + rrdhost_hostname(host), rrdset_name(st), + rrddim_name(rd)); } } rrddim_foreach_done(rd); } rrdset_foreach_done(st); - buffer_free(work_buffer); + if (use_transaction) + (void)db_execute(db_meta, "COMMIT TRANSACTION;"); + return more_to_do; } +static void store_host_and_system_info(RRDHOST *host, BUFFER *work_buffer, size_t *query_counter) +{ + bool free_work_buffer = (NULL == work_buffer); + + if (unlikely(free_work_buffer)) + work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + + if (build_host_system_info_statements(host, work_buffer)) { + int rc = db_execute(db_meta, buffer_tostring(work_buffer)); + if (unlikely(rc)) { + error_report("METADATA: 'host:%s': Failed to store host updated information in the database", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); + } + else { + if (likely(query_counter)) + (*query_counter)++; + } + } + + if (unlikely(store_host_metadata(host))) { + error_report("METADATA: 'host:%s': Failed to store host info in the database", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); + } + else { + if (likely(query_counter)) + (*query_counter)++; + } + + if (unlikely(free_work_buffer)) + buffer_free(work_buffer); +} + // Worker thread to scan hosts for pending metadata to store static void start_metadata_hosts(uv_work_t *req __maybe_unused) { register_libuv_worker_jobs(); RRDHOST *host; + int transaction_started = 0; struct scan_metadata_payload *data = req->data; struct metadata_wc *wc = data->wc; + BUFFER *work_buffer = data->work_buffer; usec_t all_started_ut = now_monotonic_usec(); (void)all_started_ut; internal_error(true, "METADATA: checking all hosts..."); + usec_t started_ut = now_monotonic_usec(); (void)started_ut; bool run_again = false; worker_is_busy(UV_EVENT_METADATA_STORE); if (!data->max_count) - db_execute("BEGIN TRANSACTION;"); + transaction_started = !db_execute(db_meta, "BEGIN TRANSACTION;"); + dfe_start_reentrant(rrdhost_root_index, host) { if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED) || !rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_UPDATE)) continue; size_t query_counter = 0; (void)query_counter; - usec_t started_ut = now_monotonic_usec(); (void)started_ut; rrdhost_flag_clear(host,RRDHOST_FLAG_METADATA_UPDATE); if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_LABELS))) { rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_LABELS); + int rc = exec_statement_with_uuid(SQL_DELETE_HOST_LABELS, &host->host_uuid); - if (likely(rc == SQLITE_OK)) { - BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + if (likely(!rc)) { + query_counter++; + + buffer_flush(work_buffer); struct query_build tmp = {.sql = work_buffer, .count = 0}; uuid_unparse_lower(host->host_uuid, tmp.uuid_str); rrdlabels_walkthrough_read(host->rrdlabels, host_label_store_to_sql_callback, &tmp); - db_execute(buffer_tostring(work_buffer)); - buffer_free(work_buffer); - query_counter++; + rc = db_execute(db_meta, buffer_tostring(work_buffer)); + + if (unlikely(rc)) { + error_report("METADATA: 'host:%s': failed to update metadata host labels", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_LABELS | RRDHOST_FLAG_METADATA_UPDATE); + } + else + query_counter++; + } else { + error_report("METADATA: 'host:%s': failed to delete old host labels", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_LABELS | RRDHOST_FLAG_METADATA_UPDATE); } } if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_CLAIMID))) { rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_CLAIMID); uuid_t uuid; - + int rc; if (likely(host->aclk_state.claimed_id && !uuid_parse(host->aclk_state.claimed_id, uuid))) - store_claim_id(&host->host_uuid, &uuid); + rc = store_claim_id(&host->host_uuid, &uuid); else - store_claim_id(&host->host_uuid, NULL); - - query_counter++; - } - - if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_INFO))) { - rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_INFO); - - BUFFER *work_buffer = sql_store_host_system_info(host); - if(work_buffer) { - db_execute(buffer_tostring(work_buffer)); - buffer_free(work_buffer); - query_counter++; - } + rc = store_claim_id(&host->host_uuid, NULL); - int rc = sql_store_host_info(host); if (unlikely(rc)) - error_report("METADATA: 'host:%s': failed to store host info", string2str(host->hostname)); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_CLAIMID | RRDHOST_FLAG_METADATA_UPDATE); else query_counter++; } + if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_INFO))) { + rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_INFO); + store_host_and_system_info(host, work_buffer, &query_counter); + } - if (data->max_count) - db_execute("BEGIN TRANSACTION;"); - if (unlikely(metadata_scan_host(host, data->max_count, &query_counter))) { + // For clarity + bool use_transaction = data->max_count; + if (unlikely(metadata_scan_host(host, data->max_count, use_transaction, work_buffer, &query_counter))) { run_again = true; rrdhost_flag_set(host,RRDHOST_FLAG_METADATA_UPDATE); internal_error(true,"METADATA: 'host:%s': scheduling another run, more charts to store", rrdhost_hostname(host)); } - if (data->max_count) - db_execute("COMMIT TRANSACTION;"); - usec_t ended_ut = now_monotonic_usec(); (void)ended_ut; internal_error(true, "METADATA: 'host:%s': saved metadata with %zu SQL statements, in %0.2f ms", rrdhost_hostname(host), query_counter, (double)(ended_ut - started_ut) / USEC_PER_MS); } dfe_done(host); - if (!data->max_count) - db_execute("COMMIT TRANSACTION;"); + + if (!data->max_count && transaction_started) + transaction_started = db_execute(db_meta, "COMMIT TRANSACTION;"); usec_t all_ended_ut = now_monotonic_usec(); (void)all_ended_ut; internal_error(true, "METADATA: checking all hosts completed in %0.2f ms", @@ -1059,7 +1203,6 @@ static void start_metadata_hosts(uv_work_t *req __maybe_unused) static void metadata_event_loop(void *arg) { - service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); worker_register("METASYNC"); worker_register_job_name(METADATA_DATABASE_NOOP, "noop"); worker_register_job_name(METADATA_DATABASE_TIMER, "timer"); @@ -1067,6 +1210,7 @@ static void metadata_event_loop(void *arg) worker_register_job_name(METADATA_STORE_CLAIM_ID, "add claim id"); worker_register_job_name(METADATA_ADD_HOST_INFO, "add host info"); worker_register_job_name(METADATA_MAINTENANCE, "maintenance"); + worker_register_job_name(METADATA_ML_LOAD_MODELS, "ml load models"); int ret; uv_loop_t *loop; @@ -1076,6 +1220,7 @@ static void metadata_event_loop(void *arg) uv_work_t metadata_cleanup_worker; uv_thread_set_name_np(wc->thread, "METASYNC"); +// service_register(SERVICE_THREAD_TYPE_EVENT_LOOP, NULL, NULL, NULL, true); loop = wc->loop = mallocz(sizeof(uv_loop_t)); ret = uv_loop_init(loop); if (ret) { @@ -1112,11 +1257,12 @@ static void metadata_event_loop(void *arg) int shutdown = 0; wc->row_id = 0; completion_mark_complete(&wc->init_complete); + BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + struct scan_metadata_payload *data; while (shutdown == 0 || (wc->flags & METADATA_WORKER_BUSY)) { uuid_t *uuid; RRDHOST *host = NULL; - int rc; worker_is_idle(); uv_run(loop, UV_RUN_DEFAULT); @@ -1145,6 +1291,11 @@ static void metadata_event_loop(void *arg) case METADATA_DATABASE_TIMER: break; + case METADATA_ML_LOAD_MODELS: { + RRDDIM *rd = (RRDDIM *) cmd.param[0]; + ml_dimension_load_models(rd); + break; + } case METADATA_DEL_DIMENSION: uuid = (uuid_t *) cmd.param[0]; if (likely(dimension_can_be_deleted(uuid))) @@ -1158,9 +1309,7 @@ static void metadata_event_loop(void *arg) break; case METADATA_ADD_HOST_INFO: host = (RRDHOST *) cmd.param[0]; - rc = sql_store_host_info(host); - if (unlikely(rc)) - error_report("Failed to store host info in the database for %s", string2str(host->hostname)); + store_host_and_system_info(host, NULL, NULL); break; case METADATA_SCAN_HOSTS: if (unlikely(metadata_flag_check(wc, METADATA_FLAG_SCANNING_HOSTS))) @@ -1169,10 +1318,11 @@ static void metadata_event_loop(void *arg) if (unittest_running) break; - struct scan_metadata_payload *data = mallocz(sizeof(*data)); + data = mallocz(sizeof(*data)); data->request.data = data; data->wc = wc; data->completion = cmd.completion; // Completion by the worker + data->work_buffer = work_buffer; if (unlikely(cmd.completion)) { data->max_count = 0; // 0 will process all pending updates @@ -1192,6 +1342,19 @@ static void metadata_event_loop(void *arg) metadata_flag_clear(wc, METADATA_FLAG_SCANNING_HOSTS); } break; + case METADATA_LOAD_HOST_CONTEXT:; + if (unittest_running) + break; + + data = callocz(1,sizeof(*data)); + data->request.data = data; + data->wc = wc; + if (unlikely( + uv_queue_work(loop,&data->request, start_all_host_load_context, + after_start_host_load_context))) { + freez(data); + } + break; case METADATA_MAINTENANCE: if (unlikely(metadata_flag_check(wc, METADATA_FLAG_CLEANUP))) break; @@ -1220,21 +1383,14 @@ static void metadata_event_loop(void *arg) if (!uv_timer_stop(&wc->timer_req)) uv_close((uv_handle_t *)&wc->timer_req, NULL); - /* - * uv_async_send after uv_close does not seem to crash in linux at the moment, - * it is however undocumented behaviour we need to be aware if this becomes - * an issue in the future. - */ uv_close((uv_handle_t *)&wc->async, NULL); - uv_run(loop, UV_RUN_DEFAULT); - uv_cond_destroy(&wc->cmd_cond); int rc; - do { rc = uv_loop_close(loop); } while (rc != UV_EBUSY); + buffer_free(work_buffer); freez(loop); worker_unregister(); @@ -1272,6 +1428,9 @@ void metadata_sync_shutdown(void) void metadata_sync_shutdown_prepare(void) { + if (unlikely(!metasync_worker.loop)) + return; + struct metadata_cmd cmd; memset(&cmd, 0, sizeof(cmd)); @@ -1303,8 +1462,6 @@ void metadata_sync_init(void) { struct metadata_wc *wc = &metasync_worker; - fatal_assert(0 == uv_mutex_init(&metadata_async_lock)); - memset(wc, 0, sizeof(*wc)); metadata_init_cmd_queue(wc); completion_init(&wc->init_complete); @@ -1364,6 +1521,20 @@ void metaqueue_host_update_info(RRDHOST *host) queue_metadata_cmd(METADATA_ADD_HOST_INFO, host, NULL); } +void metaqueue_ml_load_models(RRDDIM *rd) +{ + if (unlikely(!metasync_worker.loop)) + return; + queue_metadata_cmd(METADATA_ML_LOAD_MODELS, rd, NULL); +} + +void metadata_queue_load_host_context(RRDHOST *host) +{ + if (unlikely(!metasync_worker.loop)) + return; + queue_metadata_cmd(METADATA_LOAD_HOST_CONTEXT, host, NULL); +} + // // unitests // @@ -1419,7 +1590,7 @@ static void *metadata_unittest_threads(void) unittest_queue_metadata, &tu); } - uv_async_send(&metasync_worker.async); + (void) uv_async_send(&metasync_worker.async); sleep_usec(seconds_to_run * USEC_PER_SEC); __atomic_store_n(&tu.join, 1, __ATOMIC_RELAXED); diff --git a/database/sqlite/sqlite_metadata.h b/database/sqlite/sqlite_metadata.h index d578b7a8..6b0676ee 100644 --- a/database/sqlite/sqlite_metadata.h +++ b/database/sqlite/sqlite_metadata.h @@ -14,7 +14,9 @@ void metadata_sync_shutdown_prepare(void); void metaqueue_delete_dimension_uuid(uuid_t *uuid); void metaqueue_store_claim_id(uuid_t *host_uuid, uuid_t *claim_uuid); void metaqueue_host_update_info(RRDHOST *host); +void metaqueue_ml_load_models(RRDDIM *rd); void migrate_localhost(uuid_t *host_uuid); +void metadata_queue_load_host_context(RRDHOST *host); // UNIT TEST int metadata_unittest(void); diff --git a/database/storage_engine.c b/database/storage_engine.c index c5ba8655..19982382 100644 --- a/database/storage_engine.c +++ b/database/storage_engine.c @@ -6,120 +6,78 @@ #include "engine/rrdengineapi.h" #endif -#define im_collect_ops { \ - .init = rrddim_collect_init, \ - .store_metric = rrddim_collect_store_metric, \ - .flush = rrddim_store_metric_flush, \ - .finalize = rrddim_collect_finalize, \ - .change_collection_frequency = rrddim_store_metric_change_collection_frequency, \ - .metrics_group_get = rrddim_metrics_group_get, \ - .metrics_group_release = rrddim_metrics_group_release, \ -} - -#define im_query_ops { \ - .init = rrddim_query_init, \ - .next_metric = rrddim_query_next_metric, \ - .is_finished = rrddim_query_is_finished, \ - .finalize = rrddim_query_finalize, \ - .latest_time_s = rrddim_query_latest_time_s, \ - .oldest_time_s = rrddim_query_oldest_time_s, \ - .align_to_optimal_before = rrddim_query_align_to_optimal_before, \ -} - static STORAGE_ENGINE engines[] = { { .id = RRD_MEMORY_MODE_NONE, .name = RRD_MEMORY_MODE_NONE_NAME, + .backend = STORAGE_ENGINE_BACKEND_RRDDIM, .api = { .metric_get = rrddim_metric_get, .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, - .collect_ops = im_collect_ops, - .query_ops = im_query_ops, } }, { .id = RRD_MEMORY_MODE_RAM, .name = RRD_MEMORY_MODE_RAM_NAME, + .backend = STORAGE_ENGINE_BACKEND_RRDDIM, .api = { .metric_get = rrddim_metric_get, .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, - .collect_ops = im_collect_ops, - .query_ops = im_query_ops, } }, { .id = RRD_MEMORY_MODE_MAP, .name = RRD_MEMORY_MODE_MAP_NAME, + .backend = STORAGE_ENGINE_BACKEND_RRDDIM, .api = { .metric_get = rrddim_metric_get, .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, - .collect_ops = im_collect_ops, - .query_ops = im_query_ops, } }, { .id = RRD_MEMORY_MODE_SAVE, .name = RRD_MEMORY_MODE_SAVE_NAME, + .backend = STORAGE_ENGINE_BACKEND_RRDDIM, .api = { .metric_get = rrddim_metric_get, .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, - .collect_ops = im_collect_ops, - .query_ops = im_query_ops, } }, { .id = RRD_MEMORY_MODE_ALLOC, .name = RRD_MEMORY_MODE_ALLOC_NAME, + .backend = STORAGE_ENGINE_BACKEND_RRDDIM, .api = { .metric_get = rrddim_metric_get, .metric_get_or_create = rrddim_metric_get_or_create, .metric_dup = rrddim_metric_dup, .metric_release = rrddim_metric_release, .metric_retention_by_uuid = rrddim_metric_retention_by_uuid, - .collect_ops = im_collect_ops, - .query_ops = im_query_ops, } }, #ifdef ENABLE_DBENGINE { .id = RRD_MEMORY_MODE_DBENGINE, .name = RRD_MEMORY_MODE_DBENGINE_NAME, + .backend = STORAGE_ENGINE_BACKEND_DBENGINE, .api = { .metric_get = rrdeng_metric_get, .metric_get_or_create = rrdeng_metric_get_or_create, .metric_dup = rrdeng_metric_dup, .metric_release = rrdeng_metric_release, .metric_retention_by_uuid = rrdeng_metric_retention_by_uuid, - .collect_ops = { - .init = rrdeng_store_metric_init, - .store_metric = rrdeng_store_metric_next, - .flush = rrdeng_store_metric_flush_current_page, - .finalize = rrdeng_store_metric_finalize, - .change_collection_frequency = rrdeng_store_metric_change_collection_frequency, - .metrics_group_get = rrdeng_metrics_group_get, - .metrics_group_release = rrdeng_metrics_group_release, - }, - .query_ops = { - .init = rrdeng_load_metric_init, - .next_metric = rrdeng_load_metric_next, - .is_finished = rrdeng_load_metric_is_finished, - .finalize = rrdeng_load_metric_finalize, - .latest_time_s = rrdeng_metric_latest_time, - .oldest_time_s = rrdeng_metric_oldest_time, - .align_to_optimal_before = rrdeng_load_align_to_optimal_before, - } } }, #endif |