diff options
Diffstat (limited to '')
47 files changed, 3574 insertions, 2003 deletions
diff --git a/database/contexts/api_v2.c b/database/contexts/api_v2.c index d0b27a2aa..3ca49a319 100644 --- a/database/contexts/api_v2.c +++ b/database/contexts/api_v2.c @@ -507,7 +507,7 @@ static bool rrdcontext_matches_alert(struct rrdcontext_to_json_v2_data *ctl, RRD if (ctl->options & (CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES | CONTEXT_V2_OPTION_ALERTS_WITH_VALUES)) { char key[20 + 1]; - snprintfz(key, 20, "%p", rcl); + snprintfz(key, sizeof(key) - 1, "%p", rcl); struct sql_alert_instance_v2_entry z = { .ati = ati, @@ -616,10 +616,10 @@ static void rrdhost_receiver_to_json(BUFFER *wb, RRDHOST_STATUS *s, const char * buffer_json_member_add_object(wb, "source"); { char buf[1024 + 1]; - snprintfz(buf, 1024, "[%s]:%d%s", s->ingest.peers.local.ip, s->ingest.peers.local.port, s->ingest.ssl ? ":SSL" : ""); + snprintfz(buf, sizeof(buf) - 1, "[%s]:%d%s", s->ingest.peers.local.ip, s->ingest.peers.local.port, s->ingest.ssl ? ":SSL" : ""); buffer_json_member_add_string(wb, "local", buf); - snprintfz(buf, 1024, "[%s]:%d%s", s->ingest.peers.peer.ip, s->ingest.peers.peer.port, s->ingest.ssl ? ":SSL" : ""); + snprintfz(buf, sizeof(buf) - 1, "[%s]:%d%s", s->ingest.peers.peer.ip, s->ingest.peers.peer.port, s->ingest.ssl ? ":SSL" : ""); buffer_json_member_add_string(wb, "remote", buf); stream_capabilities_to_json_array(wb, s->ingest.capabilities, "capabilities"); @@ -659,10 +659,10 @@ static void rrdhost_sender_to_json(BUFFER *wb, RRDHOST_STATUS *s, const char *ke buffer_json_member_add_object(wb, "destination"); { char buf[1024 + 1]; - snprintfz(buf, 1024, "[%s]:%d%s", s->stream.peers.local.ip, s->stream.peers.local.port, s->stream.ssl ? ":SSL" : ""); + snprintfz(buf, sizeof(buf) - 1, "[%s]:%d%s", s->stream.peers.local.ip, s->stream.peers.local.port, s->stream.ssl ? ":SSL" : ""); buffer_json_member_add_string(wb, "local", buf); - snprintfz(buf, 1024, "[%s]:%d%s", s->stream.peers.peer.ip, s->stream.peers.peer.port, s->stream.ssl ? ":SSL" : ""); + snprintfz(buf, sizeof(buf) - 1, "[%s]:%d%s", s->stream.peers.peer.ip, s->stream.peers.peer.port, s->stream.ssl ? ":SSL" : ""); buffer_json_member_add_string(wb, "remote", buf); stream_capabilities_to_json_array(wb, s->stream.capabilities, "capabilities"); @@ -674,6 +674,7 @@ static void rrdhost_sender_to_json(BUFFER *wb, RRDHOST_STATUS *s, const char *ke buffer_json_member_add_uint64(wb, "metadata", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_METADATA]); buffer_json_member_add_uint64(wb, "functions", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_FUNCTIONS]); buffer_json_member_add_uint64(wb, "replication", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_REPLICATION]); + buffer_json_member_add_uint64(wb, "dyncfg", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_DYNCFG]); } buffer_json_object_close(wb); // traffic @@ -685,7 +686,7 @@ static void rrdhost_sender_to_json(BUFFER *wb, RRDHOST_STATUS *s, const char *ke { if (d->ssl) { - snprintfz(buf, 1024, "%s:SSL", string2str(d->destination)); + snprintfz(buf, sizeof(buf) - 1, "%s:SSL", string2str(d->destination)); buffer_json_member_add_string(wb, "destination", buf); } else diff --git a/database/contexts/instance.c b/database/contexts/instance.c index 8a60ce662..39837dbf6 100644 --- a/database/contexts/instance.c +++ b/database/contexts/instance.c @@ -329,11 +329,11 @@ inline void rrdinstance_from_rrdset(RRDSET *st) { RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_set_and_acquire_item(rc->rrdinstances, string2str(tri.id), &tri, sizeof(tri)); - RRDCONTEXT_ACQUIRED *rca_old = st->rrdcontext; - RRDINSTANCE_ACQUIRED *ria_old = st->rrdinstance; + RRDCONTEXT_ACQUIRED *rca_old = st->rrdcontexts.rrdcontext; + RRDINSTANCE_ACQUIRED *ria_old = st->rrdcontexts.rrdinstance; - st->rrdcontext = rca; - st->rrdinstance = ria; + st->rrdcontexts.rrdcontext = rca; + st->rrdcontexts.rrdinstance = ria; if(rca == rca_old) { rrdcontext_release(rca_old); @@ -354,16 +354,16 @@ inline void rrdinstance_from_rrdset(RRDSET *st) { // migrate all dimensions to the new metrics RRDDIM *rd; rrddim_foreach_read(rd, st) { - if (!rd->rrdmetric) continue; + if (!rd->rrdcontexts.rrdmetric) continue; - RRDMETRIC *rm_old = rrdmetric_acquired_value(rd->rrdmetric); + RRDMETRIC *rm_old = rrdmetric_acquired_value(rd->rrdcontexts.rrdmetric); rrd_flags_replace(rm_old, RRD_FLAG_DELETED|RRD_FLAG_UPDATED|RRD_FLAG_LIVE_RETENTION|RRD_FLAG_UPDATE_REASON_UNUSED|RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); rm_old->rrddim = NULL; rm_old->first_time_s = 0; rm_old->last_time_s = 0; - rrdmetric_release(rd->rrdmetric); - rd->rrdmetric = NULL; + rrdmetric_release(rd->rrdcontexts.rrdmetric); + rd->rrdcontexts.rrdmetric = NULL; rrdmetric_from_rrddim(rd); } @@ -406,12 +406,12 @@ inline void rrdinstance_from_rrdset(RRDSET *st) { #define rrdset_get_rrdinstance(st) rrdset_get_rrdinstance_with_trace(st, __FUNCTION__); static inline RRDINSTANCE *rrdset_get_rrdinstance_with_trace(RRDSET *st, const char *function) { - if(unlikely(!st->rrdinstance)) { + if(unlikely(!st->rrdcontexts.rrdinstance)) { netdata_log_error("RRDINSTANCE: RRDSET '%s' is not linked to an RRDINSTANCE at %s()", rrdset_id(st), function); return NULL; } - RRDINSTANCE *ri = rrdinstance_acquired_value(st->rrdinstance); + RRDINSTANCE *ri = rrdinstance_acquired_value(st->rrdcontexts.rrdinstance); if(unlikely(!ri)) { netdata_log_error("RRDINSTANCE: RRDSET '%s' lost its link to an RRDINSTANCE at %s()", rrdset_id(st), function); return NULL; @@ -439,14 +439,17 @@ inline void rrdinstance_rrdset_is_freed(RRDSET *st) { rrdinstance_trigger_updates(ri, __FUNCTION__ ); - rrdinstance_release(st->rrdinstance); - st->rrdinstance = NULL; + rrdinstance_release(st->rrdcontexts.rrdinstance); + st->rrdcontexts.rrdinstance = NULL; - rrdcontext_release(st->rrdcontext); - st->rrdcontext = NULL; + rrdcontext_release(st->rrdcontexts.rrdcontext); + st->rrdcontexts.rrdcontext = NULL; + st->rrdcontexts.collected = false; } inline void rrdinstance_rrdset_has_updated_retention(RRDSET *st) { + st->rrdcontexts.collected = false; + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); if(unlikely(!ri)) return; @@ -455,8 +458,10 @@ inline void rrdinstance_rrdset_has_updated_retention(RRDSET *st) { } inline void rrdinstance_updated_rrdset_name(RRDSET *st) { + st->rrdcontexts.collected = false; + // the chart may not be initialized when this is called - if(unlikely(!st->rrdinstance)) return; + if(unlikely(!st->rrdcontexts.rrdinstance)) return; RRDINSTANCE *ri = rrdset_get_rrdinstance(st); if(unlikely(!ri)) return; @@ -491,6 +496,8 @@ inline void rrdinstance_updated_rrdset_flags_no_action(RRDINSTANCE *ri, RRDSET * } inline void rrdinstance_updated_rrdset_flags(RRDSET *st) { + st->rrdcontexts.collected = false; + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); if(unlikely(!ri)) return; @@ -503,6 +510,11 @@ inline void rrdinstance_updated_rrdset_flags(RRDSET *st) { } inline void rrdinstance_collected_rrdset(RRDSET *st) { + if(st->rrdcontexts.collected) + return; + + st->rrdcontexts.collected = true; + RRDINSTANCE *ri = rrdset_get_rrdinstance(st); if(unlikely(!ri)) { rrdcontext_updated_rrdset(st); diff --git a/database/contexts/metric.c b/database/contexts/metric.c index 55efde4e9..0f0785972 100644 --- a/database/contexts/metric.c +++ b/database/contexts/metric.c @@ -239,10 +239,10 @@ void rrdmetric_from_rrddim(RRDDIM *rd) { if(unlikely(!rd->rrdset->rrdhost)) fatal("RRDMETRIC: rrdset '%s' does not have a rrdhost", rrdset_id(rd->rrdset)); - if(unlikely(!rd->rrdset->rrdinstance)) + if(unlikely(!rd->rrdset->rrdcontexts.rrdinstance)) fatal("RRDMETRIC: rrdset '%s' does not have a rrdinstance", rrdset_id(rd->rrdset)); - RRDINSTANCE *ri = rrdinstance_acquired_value(rd->rrdset->rrdinstance); + RRDINSTANCE *ri = rrdinstance_acquired_value(rd->rrdset->rrdcontexts.rrdinstance); RRDMETRIC trm = { .id = string_dup(rd->id), @@ -254,20 +254,21 @@ void rrdmetric_from_rrddim(RRDDIM *rd) { RRDMETRIC_ACQUIRED *rma = (RRDMETRIC_ACQUIRED *)dictionary_set_and_acquire_item(ri->rrdmetrics, string2str(trm.id), &trm, sizeof(trm)); - if(rd->rrdmetric) - rrdmetric_release(rd->rrdmetric); + if(rd->rrdcontexts.rrdmetric) + rrdmetric_release(rd->rrdcontexts.rrdmetric); - rd->rrdmetric = rma; + rd->rrdcontexts.rrdmetric = rma; + rd->rrdcontexts.collected = false; } #define rrddim_get_rrdmetric(rd) rrddim_get_rrdmetric_with_trace(rd, __FUNCTION__) static inline RRDMETRIC *rrddim_get_rrdmetric_with_trace(RRDDIM *rd, const char *function) { - if(unlikely(!rd->rrdmetric)) { + if(unlikely(!rd->rrdcontexts.rrdmetric)) { netdata_log_error("RRDMETRIC: RRDDIM '%s' is not linked to an RRDMETRIC at %s()", rrddim_id(rd), function); return NULL; } - RRDMETRIC *rm = rrdmetric_acquired_value(rd->rrdmetric); + RRDMETRIC *rm = rrdmetric_acquired_value(rd->rrdcontexts.rrdmetric); if(unlikely(!rm)) { netdata_log_error("RRDMETRIC: RRDDIM '%s' lost the link to its RRDMETRIC at %s()", rrddim_id(rd), function); return NULL; @@ -288,11 +289,14 @@ inline void rrdmetric_rrddim_is_freed(RRDDIM *rd) { rm->rrddim = NULL; rrdmetric_trigger_updates(rm, __FUNCTION__ ); - rrdmetric_release(rd->rrdmetric); - rd->rrdmetric = NULL; + rrdmetric_release(rd->rrdcontexts.rrdmetric); + rd->rrdcontexts.rrdmetric = NULL; + rd->rrdcontexts.collected = false; } inline void rrdmetric_updated_rrddim_flags(RRDDIM *rd) { + rd->rrdcontexts.collected = false; + RRDMETRIC *rm = rrddim_get_rrdmetric(rd); if(unlikely(!rm)) return; @@ -305,6 +309,11 @@ inline void rrdmetric_updated_rrddim_flags(RRDDIM *rd) { } inline void rrdmetric_collected_rrddim(RRDDIM *rd) { + if(rd->rrdcontexts.collected) + return; + + rd->rrdcontexts.collected = true; + RRDMETRIC *rm = rrddim_get_rrdmetric(rd); if(unlikely(!rm)) return; @@ -316,4 +325,3 @@ inline void rrdmetric_collected_rrddim(RRDDIM *rd) { rrdmetric_trigger_updates(rm, __FUNCTION__ ); } - diff --git a/database/contexts/query_target.c b/database/contexts/query_target.c index d969691dd..95abc3e65 100644 --- a/database/contexts/query_target.c +++ b/database/contexts/query_target.c @@ -835,8 +835,8 @@ static ssize_t query_context_add(void *data, RRDCONTEXT_ACQUIRED *rca, bool quer if(query_instance_add(qtl, qn, qc, qt->request.ria, queryable_context, false)) added++; } - else if(unlikely(qtl->st && qtl->st->rrdcontext == rca && qtl->st->rrdinstance)) { - if(query_instance_add(qtl, qn, qc, qtl->st->rrdinstance, queryable_context, false)) + else if(unlikely(qtl->st && qtl->st->rrdcontexts.rrdcontext == rca && qtl->st->rrdcontexts.rrdinstance)) { + if(query_instance_add(qtl, qn, qc, qtl->st->rrdcontexts.rrdinstance, queryable_context, false)) added++; } else { @@ -894,11 +894,11 @@ static ssize_t query_node_add(void *data, RRDHOST *host, bool queryable_host) { qn->node_id[0] = '\0'; // is the chart given valid? - if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { + if(unlikely(qtl->st && (!qtl->st->rrdcontexts.rrdinstance || !qtl->st->rrdcontexts.rrdcontext))) { netdata_log_error("QUERY TARGET: RRDSET '%s' given, but it is not linked to rrdcontext structures. Linking it now.", rrdset_name(qtl->st)); rrdinstance_from_rrdset(qtl->st); - if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { + if(unlikely(qtl->st && (!qtl->st->rrdcontexts.rrdinstance || !qtl->st->rrdcontexts.rrdcontext))) { netdata_log_error("QUERY TARGET: RRDSET '%s' given, but failed to be linked to rrdcontext structures. Switching to context query.", rrdset_name(qtl->st)); @@ -918,7 +918,7 @@ static ssize_t query_node_add(void *data, RRDHOST *host, bool queryable_host) { } else if(unlikely(qtl->st)) { // single chart data queries - if(query_context_add(qtl, qtl->st->rrdcontext, true)) + if(query_context_add(qtl, qtl->st->rrdcontexts.rrdcontext, true)) added++; } else { diff --git a/database/contexts/rrdcontext.c b/database/contexts/rrdcontext.c index 8538d17f2..9dee39be2 100644 --- a/database/contexts/rrdcontext.c +++ b/database/contexts/rrdcontext.c @@ -224,26 +224,31 @@ void rrdcontext_hub_checkpoint_command(void *ptr) { struct ctxs_checkpoint *cmd = ptr; if(!rrdhost_check_our_claim_id(cmd->claim_id)) { - netdata_log_error("RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", - cmd->claim_id, cmd->node_id, - localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", - cmd->claim_id); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', " + "but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", + cmd->claim_id, cmd->node_id, + localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", + cmd->claim_id); return; } RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); if(!host) { - netdata_log_error("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", - cmd->claim_id, - cmd->node_id); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', " + "but there is no node with such node id here. Ignoring command.", + cmd->claim_id, cmd->node_id); return; } if(rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { - netdata_log_info("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', while node '%s' has an active context streaming.", - cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', " + "while node '%s' has an active context streaming.", + cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); // disable it temporarily, so that our worker will not attempt to send messages in parallel rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); @@ -252,8 +257,10 @@ void rrdcontext_hub_checkpoint_command(void *ptr) { uint64_t our_version_hash = rrdcontext_version_hash(host); if(cmd->version_hash != our_version_hash) { - netdata_log_error("RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". Sending snapshot of all contexts.", - cmd->version_hash, rrdhost_hostname(host), our_version_hash); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". " + "Sending snapshot of all contexts.", + cmd->version_hash, rrdhost_hostname(host), our_version_hash); #ifdef ENABLE_ACLK // prepare the snapshot @@ -275,41 +282,55 @@ void rrdcontext_hub_checkpoint_command(void *ptr) { #endif } - internal_error(true, "RRDCONTEXT: host '%s' enabling streaming of contexts", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRDCONTEXT: host '%s' enabling streaming of contexts", + rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); char node_str[UUID_STR_LEN]; uuid_unparse_lower(*host->node_id, node_str); - netdata_log_access("ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED", node_str, rrdhost_hostname(host)); + nd_log(NDLS_ACCESS, NDLP_DEBUG, + "ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED", + node_str, rrdhost_hostname(host)); } void rrdcontext_hub_stop_streaming_command(void *ptr) { struct stop_streaming_ctxs *cmd = ptr; if(!rrdhost_check_our_claim_id(cmd->claim_id)) { - netdata_log_error("RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", - cmd->claim_id, cmd->node_id, - localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", - cmd->claim_id); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', " + "but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", + cmd->claim_id, cmd->node_id, + localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", + cmd->claim_id); return; } RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); if(!host) { - netdata_log_error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", - cmd->claim_id, cmd->node_id); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', " + "but there is no node with such node id here. Ignoring command.", + cmd->claim_id, cmd->node_id); return; } if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { - netdata_log_error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but node '%s' does not have active context streaming. Ignoring command.", - cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', " + "but node '%s' does not have active context streaming. Ignoring command.", + cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); return; } - internal_error(true, "RRDCONTEXT: host '%s' disabling streaming of contexts", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRDCONTEXT: host '%s' disabling streaming of contexts", + rrdhost_hostname(host)); + rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); } diff --git a/database/engine/cache.c b/database/engine/cache.c index 7a9ccf8d1..eb1c35298 100644 --- a/database/engine/cache.c +++ b/database/engine/cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later #include "cache.h" /* STATES AND TRANSITIONS @@ -1170,9 +1171,10 @@ static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evic if(all_of_them && !filter) { pgc_ll_lock(cache, &cache->clean); if(cache->clean.stats->entries) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE CACHE: cannot free all clean pages, %zu are still in the clean queue", - cache->clean.stats->entries); + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE, + "DBENGINE CACHE: cannot free all clean pages, %zu are still in the clean queue", + cache->clean.stats->entries); } pgc_ll_unlock(cache, &cache->clean); } @@ -1801,7 +1803,7 @@ PGC *pgc_create(const char *name, cache->aral = callocz(cache->config.partitions, sizeof(ARAL *)); for(size_t part = 0; part < cache->config.partitions ; part++) { char buf[100 +1]; - snprintfz(buf, 100, "%s[%zu]", name, part); + snprintfz(buf, sizeof(buf) - 1, "%s[%zu]", name, part); cache->aral[part] = aral_create( buf, sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page, @@ -1860,7 +1862,7 @@ void pgc_destroy(PGC *cache) { freez(cache->aral); #endif - + freez(cache->index); freez(cache); } } @@ -2517,7 +2519,7 @@ void unittest_stress_test(void) { for(size_t i = 0; i < pgc_uts.collect_threads ;i++) { collect_thread_ids[i] = i; char buffer[100 + 1]; - snprintfz(buffer, 100, "COLLECT_%zu", i); + snprintfz(buffer, sizeof(buffer) - 1, "COLLECT_%zu", i); netdata_thread_create(&collect_threads[i], buffer, NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, unittest_stress_test_collector, &collect_thread_ids[i]); @@ -2529,7 +2531,7 @@ void unittest_stress_test(void) { for(size_t i = 0; i < pgc_uts.query_threads ;i++) { query_thread_ids[i] = i; char buffer[100 + 1]; - snprintfz(buffer, 100, "QUERY_%zu", i); + snprintfz(buffer, sizeof(buffer) - 1, "QUERY_%zu", i); initstate_r(1, pgc_uts.rand_statebufs, 1024, &pgc_uts.random_data[i]); netdata_thread_create(&queries_threads[i], buffer, NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, diff --git a/database/engine/cache.h b/database/engine/cache.h index c10e09928..7cd7c0636 100644 --- a/database/engine/cache.h +++ b/database/engine/cache.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later #ifndef DBENGINE_CACHE_H #define DBENGINE_CACHE_H diff --git a/database/engine/datafile.c b/database/engine/datafile.c index fcda84bd6..7322039cd 100644 --- a/database/engine/datafile.c +++ b/database/engine/datafile.c @@ -160,7 +160,7 @@ bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { - (void) snprintfz(str, maxlen, "%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION, + (void) snprintfz(str, maxlen - 1, "%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION, datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); } @@ -338,7 +338,8 @@ static int load_data_file(struct rrdengine_datafile *datafile) ctx_fs_error(ctx); return fd; } - netdata_log_info("DBENGINE: initializing data file \"%s\".", path); + + nd_log_daemon(NDLP_DEBUG, "DBENGINE: initializing data file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); if (ret) @@ -354,7 +355,8 @@ static int load_data_file(struct rrdengine_datafile *datafile) datafile->file = file; datafile->pos = file_size; - netdata_log_info("DBENGINE: data file \"%s\" initialized (size:%"PRIu64").", path, file_size); + nd_log_daemon(NDLP_DEBUG, "DBENGINE: data file \"%s\" initialized (size:%" PRIu64 ").", path, file_size); + return 0; error: @@ -422,6 +424,7 @@ static int scan_data_files(struct rrdengine_instance *ctx) ctx->atomic.last_fileno = datafiles[matched_files - 1]->fileno; + netdata_log_info("DBENGINE: loading %d data/journal of tier %d...", matched_files, ctx->config.tier); for (failed_to_load = 0, i = 0 ; i < matched_files ; ++i) { uint8_t must_delete_pair = 0; @@ -479,14 +482,18 @@ int create_new_datafile_pair(struct rrdengine_instance *ctx, bool having_lock) int ret; char path[RRDENG_PATH_MAX]; - netdata_log_info("DBENGINE: creating new data and journal files in path %s", ctx->config.dbfiles_path); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "DBENGINE: creating new data and journal files in path %s", + ctx->config.dbfiles_path); + datafile = datafile_alloc_and_init(ctx, 1, fileno); ret = create_data_file(datafile); if(ret) goto error_after_datafile; generate_datafilepath(datafile, path, sizeof(path)); - netdata_log_info("DBENGINE: created data file \"%s\".", path); + nd_log(NDLS_DAEMON, NDLP_INFO, + "DBENGINE: created data file \"%s\".", path); journalfile = journalfile_alloc_and_init(datafile); ret = journalfile_create(journalfile, datafile); @@ -494,7 +501,8 @@ int create_new_datafile_pair(struct rrdengine_instance *ctx, bool having_lock) goto error_after_journalfile; journalfile_v1_generate_path(datafile, path, sizeof(path)); - netdata_log_info("DBENGINE: created journal file \"%s\".", path); + nd_log(NDLS_DAEMON, NDLP_INFO, + "DBENGINE: created journal file \"%s\".", path); ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); datafile_list_insert(ctx, datafile, having_lock); diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index abb9d2eb9..9005b81ca 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -67,7 +67,7 @@ void journalfile_v2_generate_path(struct rrdengine_datafile *datafile, char *str void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { - (void) snprintfz(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, + (void) snprintfz(str, maxlen - 1, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); } @@ -169,7 +169,7 @@ static void njfv2idx_add(struct rrdengine_datafile *datafile) { *PValue = datafile; break; } - } while(0); + } while(1); rw_spinlock_write_unlock(&datafile->ctx->njfv2idx.spinlock); } @@ -1013,7 +1013,7 @@ void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, st journalfile_v2_data_release(journalfile); usec_t ended_ut = now_monotonic_usec(); - netdata_log_info("DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms" + nd_log_daemon(NDLP_DEBUG, "DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms" , ctx->config.tier, journalfile->datafile->fileno , (double)data_size / 1024 / 1024 , (double)entries / 1000 @@ -1073,7 +1073,8 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal return 1; } - netdata_log_info("DBENGINE: checking integrity of '%s'", path_v2); + nd_log_daemon(NDLP_DEBUG, "DBENGINE: checking integrity of '%s'", path_v2); + usec_t validation_start_ut = now_monotonic_usec(); int rc = journalfile_v2_validate(data_start, journal_v2_file_size, journal_v1_file_size); if (unlikely(rc)) { @@ -1104,7 +1105,7 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal usec_t finished_ut = now_monotonic_usec(); - netdata_log_info("DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, " + nd_log_daemon(NDLP_DEBUG, "DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, " "mmap: %0.2f ms, validate: %0.2f ms" , path_v2 , (double)journal_v2_file_size / 1024 / 1024 @@ -1535,13 +1536,13 @@ int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfil } ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_jf_sb)); - netdata_log_info("DBENGINE: loading journal file '%s'", path); + nd_log_daemon(NDLP_DEBUG, "DBENGINE: loading journal file '%s'", path); max_id = journalfile_iterate_transactions(ctx, journalfile); __atomic_store_n(&ctx->atomic.transaction_id, MAX(__atomic_load_n(&ctx->atomic.transaction_id, __ATOMIC_RELAXED), max_id + 1), __ATOMIC_RELAXED); - netdata_log_info("DBENGINE: journal file '%s' loaded (size:%"PRIu64").", path, file_size); + nd_log_daemon(NDLP_DEBUG, "DBENGINE: journal file '%s' loaded (size:%" PRIu64 ").", path, file_size); bool is_last_file = (ctx_last_fileno_get(ctx) == journalfile->datafile->fileno); if (is_last_file && journalfile->datafile->pos <= rrdeng_target_data_file_size(ctx) / 3) { diff --git a/database/engine/metric.c b/database/engine/metric.c index 69b8f3116..2e132612e 100644 --- a/database/engine/metric.c +++ b/database/engine/metric.c @@ -1,30 +1,44 @@ +// SPDX-License-Identifier: GPL-3.0-or-later #include "metric.h" typedef int32_t REFCOUNT; #define REFCOUNT_DELETING (-100) -typedef enum __attribute__ ((__packed__)) { - METRIC_FLAG_HAS_RETENTION = (1 << 0), -} METRIC_FLAGS; - struct metric { uuid_t uuid; // never changes Word_t section; // never changes - time_t first_time_s; // - time_t latest_time_s_clean; // archived pages latest time - time_t latest_time_s_hot; // latest time of the currently collected page - uint32_t latest_update_every_s; // + time_t first_time_s; // the timestamp of the oldest point in the database + time_t latest_time_s_clean; // the timestamp of the newest point in the database + time_t latest_time_s_hot; // the timestamp of the latest point that has been collected (not yet stored) + uint32_t latest_update_every_s; // the latest data collection frequency pid_t writer; uint8_t partition; - METRIC_FLAGS flags; REFCOUNT refcount; - SPINLOCK spinlock; // protects all variable members // THIS IS allocated with malloc() // YOU HAVE TO INITIALIZE IT YOURSELF ! }; +#define set_metric_field_with_condition(field, value, condition) ({ \ + typeof(field) _current = __atomic_load_n(&(field), __ATOMIC_RELAXED); \ + typeof(field) _wanted = value; \ + bool did_it = true; \ + \ + do { \ + if((condition) && (_current != _wanted)) { \ + ; \ + } \ + else { \ + did_it = false; \ + break; \ + } \ + } while(!__atomic_compare_exchange_n(&(field), &_current, _wanted, \ + false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); \ + \ + did_it; \ +}) + static struct aral_statistics mrg_aral_statistics; struct mrg { @@ -73,9 +87,6 @@ static inline void MRG_STATS_DELETE_MISS(MRG *mrg, size_t partition) { #define mrg_index_write_lock(mrg, partition) rw_spinlock_write_lock(&(mrg)->index[partition].rw_spinlock) #define mrg_index_write_unlock(mrg, partition) rw_spinlock_write_unlock(&(mrg)->index[partition].rw_spinlock) -#define metric_lock(metric) spinlock_lock(&(metric)->spinlock) -#define metric_unlock(metric) spinlock_unlock(&(metric)->spinlock) - static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl, size_t partition) { if(mem_after_judyl > mem_before_judyl) __atomic_add_fetch(&mrg->index[partition].stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); @@ -97,40 +108,34 @@ static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) { return *n % mrg->partitions; } -static inline bool metric_has_retention_unsafe(MRG *mrg __maybe_unused, METRIC *metric) { - size_t partition = metric->partition; +static inline time_t mrg_metric_get_first_time_s_smart(MRG *mrg __maybe_unused, METRIC *metric) { + time_t first_time_s = __atomic_load_n(&metric->first_time_s, __ATOMIC_RELAXED); - bool has_retention = (metric->first_time_s > 0 || metric->latest_time_s_clean > 0 || metric->latest_time_s_hot > 0); + if(first_time_s <= 0) { + first_time_s = __atomic_load_n(&metric->latest_time_s_clean, __ATOMIC_RELAXED); + if(first_time_s <= 0) + first_time_s = __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED); - if(has_retention && !(metric->flags & METRIC_FLAG_HAS_RETENTION)) { - metric->flags |= METRIC_FLAG_HAS_RETENTION; - __atomic_add_fetch(&mrg->index[partition].stats.entries_with_retention, 1, __ATOMIC_RELAXED); - } - else if(!has_retention && (metric->flags & METRIC_FLAG_HAS_RETENTION)) { - metric->flags &= ~METRIC_FLAG_HAS_RETENTION; - __atomic_sub_fetch(&mrg->index[partition].stats.entries_with_retention, 1, __ATOMIC_RELAXED); + if(first_time_s <= 0) + first_time_s = 0; + else + __atomic_store_n(&metric->first_time_s, first_time_s, __ATOMIC_RELAXED); } - return has_retention; + return first_time_s; } -static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, bool having_spinlock) { +static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric) { size_t partition = metric->partition; + REFCOUNT expected = __atomic_load_n(&metric->refcount, __ATOMIC_RELAXED); REFCOUNT refcount; - if(!having_spinlock) - metric_lock(metric); - - if(unlikely(metric->refcount < 0)) - fatal("METRIC: refcount is %d (negative) during acquire", metric->refcount); - - refcount = ++metric->refcount; - - // update its retention flags - metric_has_retention_unsafe(mrg, metric); + do { + if(expected < 0) + fatal("METRIC: refcount is %d (negative) during acquire", metric->refcount); - if(!having_spinlock) - metric_unlock(metric); + refcount = expected + 1; + } while(!__atomic_compare_exchange_n(&metric->refcount, &expected, refcount, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); if(refcount == 1) __atomic_add_fetch(&mrg->index[partition].stats.entries_referenced, 1, __ATOMIC_RELAXED); @@ -141,28 +146,25 @@ static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, b } static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, METRIC *metric) { - bool ret = true; size_t partition = metric->partition; + REFCOUNT expected = __atomic_load_n(&metric->refcount, __ATOMIC_RELAXED); REFCOUNT refcount; - metric_lock(metric); - - if(unlikely(metric->refcount <= 0)) - fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount); - - refcount = --metric->refcount; - - if(likely(metric_has_retention_unsafe(mrg, metric) || refcount != 0)) - ret = false; + do { + if(expected <= 0) + fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount); - metric_unlock(metric); + refcount = expected - 1; + } while(!__atomic_compare_exchange_n(&metric->refcount, &expected, refcount, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); if(unlikely(!refcount)) __atomic_sub_fetch(&mrg->index[partition].stats.entries_referenced, 1, __ATOMIC_RELAXED); __atomic_sub_fetch(&mrg->index[partition].stats.current_references, 1, __ATOMIC_RELAXED); - return ret; + time_t first, last, ue; + mrg_metric_get_retention(mrg, metric, &first, &last, &ue); + return (!first || !last || first > last); } static inline METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { @@ -192,7 +194,7 @@ static inline METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *r if(unlikely(*PValue != NULL)) { METRIC *metric = *PValue; - metric_acquire(mrg, metric, false); + metric_acquire(mrg, metric); MRG_STATS_DUPLICATE_ADD(mrg, partition); @@ -215,10 +217,8 @@ static inline METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *r metric->latest_update_every_s = entry->latest_update_every_s; metric->writer = 0; metric->refcount = 0; - metric->flags = 0; metric->partition = partition; - spinlock_init(&metric->spinlock); - metric_acquire(mrg, metric, true); // no spinlock use required here + metric_acquire(mrg, metric); *PValue = metric; MRG_STATS_ADDED_METRIC(mrg, partition); @@ -252,7 +252,7 @@ static inline METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t sect METRIC *metric = *PValue; - metric_acquire(mrg, metric, false); + metric_acquire(mrg, metric); mrg_index_read_unlock(mrg, partition); @@ -363,7 +363,7 @@ inline bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) { } inline METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) { - metric_acquire(mrg, metric, false); + metric_acquire(mrg, metric); return metric; } @@ -389,10 +389,7 @@ inline bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, if(unlikely(first_time_s < 0)) return false; - metric_lock(metric); - metric->first_time_s = first_time_s; - metric_has_retention_unsafe(mrg, metric); - metric_unlock(metric); + __atomic_store_n(&metric->first_time_s, first_time_s, __ATOMIC_RELAXED); return true; } @@ -405,112 +402,56 @@ inline void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, internal_fatal(last_time_s > max_acceptable_collected_time(), "DBENGINE METRIC: metric last time is in the future"); - if(unlikely(first_time_s < 0)) - first_time_s = 0; - - if(unlikely(last_time_s < 0)) - last_time_s = 0; - - if(unlikely(update_every_s < 0)) - update_every_s = 0; - - if(unlikely(!first_time_s && !last_time_s && !update_every_s)) - return; + if(first_time_s > 0) + set_metric_field_with_condition(metric->first_time_s, first_time_s, _current <= 0 || _wanted < _current); - metric_lock(metric); - - if(unlikely(first_time_s && (!metric->first_time_s || first_time_s < metric->first_time_s))) - metric->first_time_s = first_time_s; - - if(likely(last_time_s && (!metric->latest_time_s_clean || last_time_s > metric->latest_time_s_clean))) { - metric->latest_time_s_clean = last_time_s; - - if(likely(update_every_s)) - metric->latest_update_every_s = (uint32_t) update_every_s; + if(last_time_s > 0) { + if(set_metric_field_with_condition(metric->latest_time_s_clean, last_time_s, _current <= 0 || _wanted > _current) && + update_every_s > 0) + // set the latest update every too + set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, true); } - else if(unlikely(!metric->latest_update_every_s && update_every_s)) - metric->latest_update_every_s = (uint32_t) update_every_s; - - metric_has_retention_unsafe(mrg, metric); - metric_unlock(metric); + else if(update_every_s > 0) + // set it only if it is invalid + set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, _current <= 0); } inline bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative"); - - bool ret = false; - - metric_lock(metric); - if(first_time_s > metric->first_time_s) { - metric->first_time_s = first_time_s; - ret = true; - } - metric_has_retention_unsafe(mrg, metric); - metric_unlock(metric); - - return ret; + return set_metric_field_with_condition(metric->first_time_s, first_time_s, _wanted > _current); } inline time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) { - time_t first_time_s; - - metric_lock(metric); - - if(unlikely(!metric->first_time_s)) { - if(metric->latest_time_s_clean) - metric->first_time_s = metric->latest_time_s_clean; - - else if(metric->latest_time_s_hot) - metric->first_time_s = metric->latest_time_s_hot; - } - - first_time_s = metric->first_time_s; - - metric_unlock(metric); - - return first_time_s; + return mrg_metric_get_first_time_s_smart(mrg, metric); } inline void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) { - metric_lock(metric); - - if(unlikely(!metric->first_time_s)) { - if(metric->latest_time_s_clean) - metric->first_time_s = metric->latest_time_s_clean; - - else if(metric->latest_time_s_hot) - metric->first_time_s = metric->latest_time_s_hot; - } - - *first_time_s = metric->first_time_s; - *last_time_s = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); - *update_every_s = metric->latest_update_every_s; + time_t clean = __atomic_load_n(&metric->latest_time_s_clean, __ATOMIC_RELAXED); + time_t hot = __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED); - metric_unlock(metric); + *last_time_s = MAX(clean, hot); + *first_time_s = mrg_metric_get_first_time_s_smart(mrg, metric); + *update_every_s = __atomic_load_n(&metric->latest_update_every_s, __ATOMIC_RELAXED); } inline bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative"); - if(unlikely(latest_time_s < 0)) - return false; - - metric_lock(metric); - // internal_fatal(latest_time_s > max_acceptable_collected_time(), // "DBENGINE METRIC: metric latest time is in the future"); // internal_fatal(metric->latest_time_s_clean > latest_time_s, // "DBENGINE METRIC: metric new clean latest time is older than the previous one"); - metric->latest_time_s_clean = latest_time_s; + if(latest_time_s > 0) { + if(set_metric_field_with_condition(metric->latest_time_s_clean, latest_time_s, true)) { + set_metric_field_with_condition(metric->first_time_s, latest_time_s, _current <= 0 || _wanted < _current); - if(unlikely(!metric->first_time_s)) - metric->first_time_s = latest_time_s; + return true; + } + } - metric_has_retention_unsafe(mrg, metric); - metric_unlock(metric); - return true; + return false; } // returns true when metric still has retention @@ -518,7 +459,6 @@ inline bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metr Word_t section = mrg_metric_section(mrg, metric); bool do_again = false; size_t countdown = 5; - bool ret = true; do { time_t min_first_time_s = LONG_MAX; @@ -547,22 +487,20 @@ inline bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metr if (min_first_time_s == LONG_MAX) min_first_time_s = 0; - metric_lock(metric); - if (--countdown && !min_first_time_s && metric->latest_time_s_hot) + if (--countdown && !min_first_time_s && __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED)) do_again = true; else { internal_error(!countdown, "METRIC: giving up on updating the retention of metric without disk retention"); do_again = false; - metric->first_time_s = min_first_time_s; - metric->latest_time_s_clean = max_end_time_s; - - ret = metric_has_retention_unsafe(mrg, metric); + set_metric_field_with_condition(metric->first_time_s, min_first_time_s, true); + set_metric_field_with_condition(metric->latest_time_s_clean, max_end_time_s, true); } - metric_unlock(metric); } while(do_again); - return ret; + time_t first, last, ue; + mrg_metric_get_retention(mrg, metric, &first, &last, &ue); + return (first && last && first < last); } inline bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { @@ -571,88 +509,80 @@ inline bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *me // internal_fatal(latest_time_s > max_acceptable_collected_time(), // "DBENGINE METRIC: metric latest time is in the future"); - if(unlikely(latest_time_s < 0)) - return false; - - metric_lock(metric); - metric->latest_time_s_hot = latest_time_s; - - if(unlikely(!metric->first_time_s)) - metric->first_time_s = latest_time_s; + if(likely(latest_time_s > 0)) { + __atomic_store_n(&metric->latest_time_s_hot, latest_time_s, __ATOMIC_RELAXED); + return true; + } - metric_has_retention_unsafe(mrg, metric); - metric_unlock(metric); - return true; + return false; } inline time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) { - time_t max; - metric_lock(metric); - max = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); - metric_unlock(metric); - return max; + time_t clean = __atomic_load_n(&metric->latest_time_s_clean, __ATOMIC_RELAXED); + time_t hot = __atomic_load_n(&metric->latest_time_s_hot, __ATOMIC_RELAXED); + + return MAX(clean, hot); } inline bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); - if(update_every_s <= 0) - return false; - - metric_lock(metric); - metric->latest_update_every_s = (uint32_t) update_every_s; - metric_unlock(metric); + if(update_every_s > 0) + return set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, true); - return true; + return false; } inline bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); - if(update_every_s <= 0) - return false; - - metric_lock(metric); - if(!metric->latest_update_every_s) - metric->latest_update_every_s = (uint32_t) update_every_s; - metric_unlock(metric); + if(update_every_s > 0) + return set_metric_field_with_condition(metric->latest_update_every_s, update_every_s, _current <= 0); - return true; + return false; } inline time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, METRIC *metric) { - time_t update_every_s; - - metric_lock(metric); - update_every_s = metric->latest_update_every_s; - metric_unlock(metric); - - return update_every_s; + return __atomic_load_n(&metric->latest_update_every_s, __ATOMIC_RELAXED); } inline bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) { - bool done = false; - metric_lock(metric); - if(!metric->writer) { - metric->writer = gettid(); + pid_t expected = __atomic_load_n(&metric->writer, __ATOMIC_RELAXED); + pid_t wanted = gettid(); + bool done = true; + + do { + if(expected != 0) { + done = false; + break; + } + } while(!__atomic_compare_exchange_n(&metric->writer, &expected, wanted, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); + + if(done) __atomic_add_fetch(&mrg->index[metric->partition].stats.writers, 1, __ATOMIC_RELAXED); - done = true; - } else __atomic_add_fetch(&mrg->index[metric->partition].stats.writers_conflicts, 1, __ATOMIC_RELAXED); - metric_unlock(metric); + return done; } inline bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) { - bool done = false; - metric_lock(metric); - if(metric->writer) { - metric->writer = 0; + // this function can be called from a different thread than the one than the writer + + pid_t expected = __atomic_load_n(&metric->writer, __ATOMIC_RELAXED); + pid_t wanted = 0; + bool done = true; + + do { + if(!expected) { + done = false; + break; + } + } while(!__atomic_compare_exchange_n(&metric->writer, &expected, wanted, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); + + if(done) __atomic_sub_fetch(&mrg->index[metric->partition].stats.writers, 1, __ATOMIC_RELAXED); - done = true; - } - metric_unlock(metric); + return done; } @@ -662,27 +592,30 @@ inline void mrg_update_metric_retention_and_granularity_by_uuid( time_t update_every_s, time_t now_s) { if(unlikely(last_time_s > now_s)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), " - "fixing last time to now", - first_time_s, last_time_s, now_s); + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), " + "fixing last time to now", + first_time_s, last_time_s, now_s); last_time_s = now_s; } if (unlikely(first_time_s > last_time_s)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), " - "fixing first time to last time", - first_time_s, last_time_s, now_s); + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), " + "fixing first time to last time", + first_time_s, last_time_s, now_s); first_time_s = last_time_s; } if (unlikely(first_time_s == 0 || last_time_s == 0)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), " - "using them as-is", - first_time_s, last_time_s, now_s); + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), " + "using them as-is", + first_time_s, last_time_s, now_s); } bool added = false; @@ -710,7 +643,6 @@ inline void mrg_get_statistics(MRG *mrg, struct mrg_statistics *s) { for(size_t i = 0; i < mrg->partitions ;i++) { s->entries += __atomic_load_n(&mrg->index[i].stats.entries, __ATOMIC_RELAXED); s->entries_referenced += __atomic_load_n(&mrg->index[i].stats.entries_referenced, __ATOMIC_RELAXED); - s->entries_with_retention += __atomic_load_n(&mrg->index[i].stats.entries_with_retention, __ATOMIC_RELAXED); s->size += __atomic_load_n(&mrg->index[i].stats.size, __ATOMIC_RELAXED); s->current_references += __atomic_load_n(&mrg->index[i].stats.current_references, __ATOMIC_RELAXED); s->additions += __atomic_load_n(&mrg->index[i].stats.additions, __ATOMIC_RELAXED); @@ -900,7 +832,7 @@ int mrg_unittest(void) { pthread_t th[threads]; for(size_t i = 0; i < threads ; i++) { char buf[15 + 1]; - snprintfz(buf, 15, "TH[%zu]", i); + snprintfz(buf, sizeof(buf) - 1, "TH[%zu]", i); netdata_thread_create(&th[i], buf, NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, mrg_stress, &t); diff --git a/database/engine/metric.h b/database/engine/metric.h index 5d5ebd7b1..dbb949301 100644 --- a/database/engine/metric.h +++ b/database/engine/metric.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-3.0-or-later #ifndef DBENGINE_METRIC_H #define DBENGINE_METRIC_H @@ -35,9 +36,6 @@ struct mrg_statistics { size_t entries_referenced; - MRG_CACHE_LINE_PADDING(1); - size_t entries_with_retention; - MRG_CACHE_LINE_PADDING(2); size_t current_references; diff --git a/database/engine/page.c b/database/engine/page.c new file mode 100644 index 000000000..b7a393483 --- /dev/null +++ b/database/engine/page.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "page.h" + +#include "libnetdata/libnetdata.h" + +typedef enum __attribute__((packed)) { + PAGE_OPTION_ALL_VALUES_EMPTY = (1 << 0), +} PAGE_OPTIONS; + +typedef enum __attribute__((packed)) { + PGD_STATE_CREATED_FROM_COLLECTOR = (1 << 0), + PGD_STATE_CREATED_FROM_DISK = (1 << 1), + PGD_STATE_SCHEDULED_FOR_FLUSHING = (1 << 2), + PGD_STATE_FLUSHED_TO_DISK = (1 << 3), +} PGD_STATES; + +typedef struct { + uint8_t *data; + uint32_t size; +} page_raw_t; + + +typedef struct { + size_t num_buffers; + gorilla_writer_t *writer; + int aral_index; +} page_gorilla_t; + +struct pgd { + // the page type + uint8_t type; + + // options related to the page + PAGE_OPTIONS options; + + PGD_STATES states; + + // the uses number of slots in the page + uint32_t used; + + // the total number of slots available in the page + uint32_t slots; + + union { + page_raw_t raw; + page_gorilla_t gorilla; + }; +}; + +// ---------------------------------------------------------------------------- +// memory management + +struct { + ARAL *aral_pgd; + ARAL *aral_data[RRD_STORAGE_TIERS]; + ARAL *aral_gorilla_buffer[4]; + ARAL *aral_gorilla_writer[4]; +} pgd_alloc_globals = {}; + +static ARAL *pgd_aral_data_lookup(size_t size) +{ + for (size_t tier = 0; tier < storage_tiers; tier++) + if (size == tier_page_size[tier]) + return pgd_alloc_globals.aral_data[tier]; + + return NULL; +} + +void pgd_init_arals(void) +{ + // pgd aral + { + char buf[20 + 1]; + snprintfz(buf, sizeof(buf) - 1, "pgd"); + + // FIXME: add stats + pgd_alloc_globals.aral_pgd = aral_create( + buf, + sizeof(struct pgd), + 64, + 512 * (sizeof(struct pgd)), + pgc_aral_statistics(), + NULL, NULL, false, false); + } + + // tier page aral + { + for (size_t i = storage_tiers; i > 0 ;i--) + { + size_t tier = storage_tiers - i; + + char buf[20 + 1]; + snprintfz(buf, sizeof(buf) - 1, "tier%zu-pages", tier); + + pgd_alloc_globals.aral_data[tier] = aral_create( + buf, + tier_page_size[tier], + 64, + 512 * (tier_page_size[tier]), + pgc_aral_statistics(), + NULL, NULL, false, false); + } + } + + // gorilla buffers aral + for (size_t i = 0; i != 4; i++) { + char buf[20 + 1]; + snprintfz(buf, sizeof(buf) - 1, "gbuffer-%zu", i); + + // FIXME: add stats + pgd_alloc_globals.aral_gorilla_buffer[i] = aral_create( + buf, + GORILLA_BUFFER_SIZE, + 64, + 512 * GORILLA_BUFFER_SIZE, + pgc_aral_statistics(), + NULL, NULL, false, false); + } + + // gorilla writers aral + for (size_t i = 0; i != 4; i++) { + char buf[20 + 1]; + snprintfz(buf, sizeof(buf) - 1, "gwriter-%zu", i); + + // FIXME: add stats + pgd_alloc_globals.aral_gorilla_writer[i] = aral_create( + buf, + sizeof(gorilla_writer_t), + 64, + 512 * sizeof(gorilla_writer_t), + pgc_aral_statistics(), + NULL, NULL, false, false); + } +} + +static void *pgd_data_aral_alloc(size_t size) +{ + ARAL *ar = pgd_aral_data_lookup(size); + if (!ar) + return mallocz(size); + else + return aral_mallocz(ar); +} + +static void pgd_data_aral_free(void *page, size_t size) +{ + ARAL *ar = pgd_aral_data_lookup(size); + if (!ar) + freez(page); + else + aral_freez(ar, page); +} + +// ---------------------------------------------------------------------------- +// management api + +PGD *pgd_create(uint8_t type, uint32_t slots) +{ + PGD *pg = aral_mallocz(pgd_alloc_globals.aral_pgd); + pg->type = type; + pg->used = 0; + pg->slots = slots; + pg->options = PAGE_OPTION_ALL_VALUES_EMPTY; + pg->states = PGD_STATE_CREATED_FROM_COLLECTOR; + + switch (type) { + case PAGE_METRICS: + case PAGE_TIER: { + uint32_t size = slots * page_type_size[type]; + + internal_fatal(!size || slots == 1, + "DBENGINE: invalid number of slots (%u) or page type (%u)", slots, type); + + pg->raw.size = size; + pg->raw.data = pgd_data_aral_alloc(size); + break; + } + case PAGE_GORILLA_METRICS: { + internal_fatal(slots == 1, + "DBENGINE: invalid number of slots (%u) or page type (%u)", slots, type); + + pg->slots = 8 * GORILLA_BUFFER_SLOTS; + + // allocate new gorilla writer + pg->gorilla.aral_index = gettid() % 4; + pg->gorilla.writer = aral_mallocz(pgd_alloc_globals.aral_gorilla_writer[pg->gorilla.aral_index]); + + // allocate new gorilla buffer + gorilla_buffer_t *gbuf = aral_mallocz(pgd_alloc_globals.aral_gorilla_buffer[pg->gorilla.aral_index]); + memset(gbuf, 0, GORILLA_BUFFER_SIZE); + global_statistics_gorilla_buffer_add_hot(); + + *pg->gorilla.writer = gorilla_writer_init(gbuf, GORILLA_BUFFER_SLOTS); + pg->gorilla.num_buffers = 1; + + break; + } + default: + fatal("Unknown page type: %uc", type); + } + + return pg; +} + +PGD *pgd_create_from_disk_data(uint8_t type, void *base, uint32_t size) +{ + if (!size) + return PGD_EMPTY; + + if (size < page_type_size[type]) + return PGD_EMPTY; + + PGD *pg = aral_mallocz(pgd_alloc_globals.aral_pgd); + + pg->type = type; + pg->states = PGD_STATE_CREATED_FROM_DISK; + pg->options = ~PAGE_OPTION_ALL_VALUES_EMPTY; + + switch (type) + { + case PAGE_METRICS: + case PAGE_TIER: + pg->raw.size = size; + pg->used = size / page_type_size[type]; + pg->slots = pg->used; + + pg->raw.data = pgd_data_aral_alloc(size); + memcpy(pg->raw.data, base, size); + break; + case PAGE_GORILLA_METRICS: + internal_fatal(size == 0, "Asked to create page with 0 data!!!"); + internal_fatal(size % sizeof(uint32_t), "Unaligned gorilla buffer size"); + internal_fatal(size % GORILLA_BUFFER_SIZE, "Expected size to be a multiple of %zu-bytes", GORILLA_BUFFER_SIZE); + + pg->raw.data = mallocz(size); + pg->raw.size = size; + + // TODO: rm this + memset(pg->raw.data, 0, size); + memcpy(pg->raw.data, base, size); + + uint32_t total_entries = gorilla_buffer_patch((void *) pg->raw.data); + + pg->used = total_entries; + pg->slots = pg->used; + break; + default: + fatal("Unknown page type: %uc", type); + } + + return pg; +} + +void pgd_free(PGD *pg) +{ + if (!pg) + return; + + if (pg == PGD_EMPTY) + return; + + switch (pg->type) + { + case PAGE_METRICS: + case PAGE_TIER: + pgd_data_aral_free(pg->raw.data, pg->raw.size); + break; + case PAGE_GORILLA_METRICS: { + if (pg->states & PGD_STATE_CREATED_FROM_DISK) + { + internal_fatal(pg->raw.data == NULL, "Tried to free gorilla PGD loaded from disk with NULL data"); + freez(pg->raw.data); + pg->raw.data = NULL; + } + else if ((pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) || + (pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) || + (pg->states & PGD_STATE_FLUSHED_TO_DISK)) + { + internal_fatal(pg->gorilla.writer == NULL, + "PGD does not have an active gorilla writer"); + + internal_fatal(pg->gorilla.num_buffers == 0, + "PGD does not have any gorilla buffers allocated"); + + while (true) { + gorilla_buffer_t *gbuf = gorilla_writer_drop_head_buffer(pg->gorilla.writer); + if (!gbuf) + break; + aral_freez(pgd_alloc_globals.aral_gorilla_buffer[pg->gorilla.aral_index], gbuf); + pg->gorilla.num_buffers -= 1; + } + + internal_fatal(pg->gorilla.num_buffers != 0, + "Could not free all gorilla writer buffers"); + + aral_freez(pgd_alloc_globals.aral_gorilla_writer[pg->gorilla.aral_index], pg->gorilla.writer); + pg->gorilla.writer = NULL; + } else { + fatal("pgd_free() called on gorilla page with unsupported state"); + // TODO: should we support any other states? + // if (!(pg->states & PGD_STATE_FLUSHED_TO_DISK)) + // fatal("pgd_free() is not supported yet for pages flushed to disk"); + } + + break; + } + default: + fatal("Unknown page type: %uc", pg->type); + } + + aral_freez(pgd_alloc_globals.aral_pgd, pg); +} + +// ---------------------------------------------------------------------------- +// utility functions + +uint32_t pgd_type(PGD *pg) +{ + return pg->type; +} + +bool pgd_is_empty(PGD *pg) +{ + if (!pg) + return true; + + if (pg == PGD_EMPTY) + return true; + + if (pg->used == 0) + return true; + + if (pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) + return true; + + return false; +} + +uint32_t pgd_slots_used(PGD *pg) +{ + if (!pg) + return 0; + + if (pg == PGD_EMPTY) + return 0; + + return pg->used; +} + +uint32_t pgd_memory_footprint(PGD *pg) +{ + if (!pg) + return 0; + + if (pg == PGD_EMPTY) + return 0; + + size_t footprint = 0; + switch (pg->type) { + case PAGE_METRICS: + case PAGE_TIER: + footprint = sizeof(PGD) + pg->raw.size; + break; + case PAGE_GORILLA_METRICS: { + if (pg->states & PGD_STATE_CREATED_FROM_DISK) + footprint = sizeof(PGD) + pg->raw.size; + else + footprint = sizeof(PGD) + sizeof(gorilla_writer_t) + (pg->gorilla.num_buffers * GORILLA_BUFFER_SIZE); + + break; + } + default: + fatal("Unknown page type: %uc", pg->type); + } + + return footprint; +} + +uint32_t pgd_disk_footprint(PGD *pg) +{ + if (!pgd_slots_used(pg)) + return 0; + + size_t size = 0; + + switch (pg->type) { + case PAGE_METRICS: + case PAGE_TIER: { + uint32_t used_size = pg->used * page_type_size[pg->type]; + internal_fatal(used_size > pg->raw.size, "Wrong disk footprint page size"); + size = used_size; + + break; + } + case PAGE_GORILLA_METRICS: { + if (pg->states & PGD_STATE_CREATED_FROM_COLLECTOR || + pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING || + pg->states & PGD_STATE_FLUSHED_TO_DISK) + { + internal_fatal(!pg->gorilla.writer, + "pgd_disk_footprint() not implemented for NULL gorilla writers"); + + internal_fatal(pg->gorilla.num_buffers == 0, + "Gorilla writer does not have any buffers"); + + size = pg->gorilla.num_buffers * GORILLA_BUFFER_SIZE; + + if (pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) { + global_statistics_tier0_disk_compressed_bytes(gorilla_writer_nbytes(pg->gorilla.writer)); + global_statistics_tier0_disk_uncompressed_bytes(gorilla_writer_entries(pg->gorilla.writer) * sizeof(storage_number)); + } + } else if (pg->states & PGD_STATE_CREATED_FROM_DISK) { + size = pg->raw.size; + } else { + fatal("Asked disk footprint on unknown page state"); + } + + break; + } + default: + fatal("Unknown page type: %uc", pg->type); + } + + internal_fatal(pg->states & PGD_STATE_CREATED_FROM_DISK, + "Disk footprint asked for page created from disk."); + pg->states = PGD_STATE_SCHEDULED_FOR_FLUSHING; + return size; +} + +void pgd_copy_to_extent(PGD *pg, uint8_t *dst, uint32_t dst_size) +{ + internal_fatal(pgd_disk_footprint(pg) != dst_size, "Wrong disk footprint size requested (need %u, available %u)", + pgd_disk_footprint(pg), dst_size); + + switch (pg->type) { + case PAGE_METRICS: + case PAGE_TIER: + memcpy(dst, pg->raw.data, dst_size); + break; + case PAGE_GORILLA_METRICS: { + if ((pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) == 0) + fatal("Copying to extent is supported only for PGDs that are scheduled for flushing."); + + internal_fatal(!pg->gorilla.writer, + "pgd_copy_to_extent() not implemented for NULL gorilla writers"); + + internal_fatal(pg->gorilla.num_buffers == 0, + "pgd_copy_to_extent() gorilla writer does not have any buffers"); + + bool ok = gorilla_writer_serialize(pg->gorilla.writer, dst, dst_size); + UNUSED(ok); + internal_fatal(!ok, + "pgd_copy_to_extent() tried to serialize pg=%p, gw=%p (with dst_size=%u bytes, num_buffers=%zu)", + pg, pg->gorilla.writer, dst_size, pg->gorilla.num_buffers); + break; + } + default: + fatal("Unknown page type: %uc", pg->type); + } + + pg->states = PGD_STATE_FLUSHED_TO_DISK; +} + +// ---------------------------------------------------------------------------- +// data collection + +void pgd_append_point(PGD *pg, + usec_t point_in_time_ut __maybe_unused, + NETDATA_DOUBLE n, + NETDATA_DOUBLE min_value, + NETDATA_DOUBLE max_value, + uint16_t count, + uint16_t anomaly_count, + SN_FLAGS flags, + uint32_t expected_slot) +{ + if (unlikely(pg->used >= pg->slots)) + fatal("DBENGINE: attempted to write beyond page size (page type %u, slots %u, used %u)", + pg->type, pg->slots, pg->used /* FIXME:, pg->size */); + + if (unlikely(pg->used != expected_slot)) + fatal("DBENGINE: page is not aligned to expected slot (used %u, expected %u)", + pg->used, expected_slot); + + if (!(pg->states & PGD_STATE_CREATED_FROM_COLLECTOR)) + fatal("DBENGINE: collection on page not created from a collector"); + + if (pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) + fatal("Data collection on page already scheduled for flushing"); + + switch (pg->type) { + case PAGE_METRICS: { + storage_number *tier0_metric_data = (storage_number *)pg->raw.data; + storage_number t = pack_storage_number(n, flags); + tier0_metric_data[pg->used++] = t; + + if ((pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) && does_storage_number_exist(t)) + pg->options &= ~PAGE_OPTION_ALL_VALUES_EMPTY; + + break; + } + case PAGE_TIER: { + storage_number_tier1_t *tier12_metric_data = (storage_number_tier1_t *)pg->raw.data; + storage_number_tier1_t t; + t.sum_value = (float) n; + t.min_value = (float) min_value; + t.max_value = (float) max_value; + t.anomaly_count = anomaly_count; + t.count = count; + tier12_metric_data[pg->used++] = t; + + if ((pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) && fpclassify(n) != FP_NAN) + pg->options &= ~PAGE_OPTION_ALL_VALUES_EMPTY; + + break; + } + case PAGE_GORILLA_METRICS: { + pg->used++; + storage_number t = pack_storage_number(n, flags); + + if ((pg->options & PAGE_OPTION_ALL_VALUES_EMPTY) && does_storage_number_exist(t)) + pg->options &= ~PAGE_OPTION_ALL_VALUES_EMPTY; + + bool ok = gorilla_writer_write(pg->gorilla.writer, t); + if (!ok) { + gorilla_buffer_t *new_buffer = aral_mallocz(pgd_alloc_globals.aral_gorilla_buffer[pg->gorilla.aral_index]); + memset(new_buffer, 0, GORILLA_BUFFER_SIZE); + + gorilla_writer_add_buffer(pg->gorilla.writer, new_buffer, GORILLA_BUFFER_SLOTS); + pg->gorilla.num_buffers += 1; + global_statistics_gorilla_buffer_add_hot(); + + ok = gorilla_writer_write(pg->gorilla.writer, t); + internal_fatal(ok == false, "Failed to writer value in newly allocated gorilla buffer."); + } + break; + } + default: + fatal("DBENGINE: unknown page type id %d", pg->type); + break; + } +} + +// ---------------------------------------------------------------------------- +// querying with cursor + +static void pgdc_seek(PGDC *pgdc, uint32_t position) +{ + PGD *pg = pgdc->pgd; + + switch (pg->type) { + case PAGE_METRICS: + case PAGE_TIER: + pgdc->slots = pgdc->pgd->used; + break; + case PAGE_GORILLA_METRICS: { + if (pg->states & PGD_STATE_CREATED_FROM_DISK) { + pgdc->slots = pgdc->pgd->slots; + pgdc->gr = gorilla_reader_init((void *) pg->raw.data); + } else { + if (!(pg->states & PGD_STATE_CREATED_FROM_COLLECTOR) && + !(pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) && + !(pg->states & PGD_STATE_FLUSHED_TO_DISK)) + fatal("pgdc_seek() currently is not supported for pages created from disk."); + + if (!pg->gorilla.writer) + fatal("Seeking from a page without an active gorilla writer is not supported (yet)."); + + pgdc->slots = gorilla_writer_entries(pg->gorilla.writer); + pgdc->gr = gorilla_writer_get_reader(pg->gorilla.writer); + } + + if (position > pgdc->slots) + position = pgdc->slots; + + for (uint32_t i = 0; i != position; i++) { + uint32_t value; + + bool ok = gorilla_reader_read(&pgdc->gr, &value); + + if (!ok) { + // this is fine, the reader will return empty points + break; + } + } + + break; + } + default: + fatal("DBENGINE: unknown page type id %d", pg->type); + break; + } +} + +void pgdc_reset(PGDC *pgdc, PGD *pgd, uint32_t position) +{ + // pgd might be null and position equal to UINT32_MAX + + pgdc->pgd = pgd; + pgdc->position = position; + + if (!pgd) + return; + + if (pgd == PGD_EMPTY) + return; + + if (position == UINT32_MAX) + return; + + pgdc_seek(pgdc, position); +} + +bool pgdc_get_next_point(PGDC *pgdc, uint32_t expected_position, STORAGE_POINT *sp) +{ + if (!pgdc->pgd || pgdc->pgd == PGD_EMPTY || pgdc->position >= pgdc->slots) + { + storage_point_empty(*sp, sp->start_time_s, sp->end_time_s); + return false; + } + + internal_fatal(pgdc->position != expected_position, "Wrong expected cursor position"); + + switch (pgdc->pgd->type) + { + case PAGE_METRICS: { + storage_number *array = (storage_number *) pgdc->pgd->raw.data; + storage_number n = array[pgdc->position++]; + + sp->min = sp->max = sp->sum = unpack_storage_number(n); + sp->flags = (SN_FLAGS)(n & SN_USER_FLAGS); + sp->count = 1; + sp->anomaly_count = is_storage_number_anomalous(n) ? 1 : 0; + + return true; + } + case PAGE_TIER: { + storage_number_tier1_t *array = (storage_number_tier1_t *) pgdc->pgd->raw.data; + storage_number_tier1_t n = array[pgdc->position++]; + + sp->flags = n.anomaly_count ? SN_FLAG_NONE : SN_FLAG_NOT_ANOMALOUS; + sp->count = n.count; + sp->anomaly_count = n.anomaly_count; + sp->min = n.min_value; + sp->max = n.max_value; + sp->sum = n.sum_value; + + return true; + } + case PAGE_GORILLA_METRICS: { + pgdc->position++; + + uint32_t n = 666666666; + bool ok = gorilla_reader_read(&pgdc->gr, &n); + if (ok) { + sp->min = sp->max = sp->sum = unpack_storage_number(n); + sp->flags = (SN_FLAGS)(n & SN_USER_FLAGS); + sp->count = 1; + sp->anomaly_count = is_storage_number_anomalous(n) ? 1 : 0; + } else { + storage_point_empty(*sp, sp->start_time_s, sp->end_time_s); + } + + return ok; + } + default: { + static bool logged = false; + if (!logged) + { + netdata_log_error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", pgd_type(pgdc->pgd)); + logged = true; + } + + storage_point_empty(*sp, sp->start_time_s, sp->end_time_s); + return false; + } + } +} diff --git a/database/engine/page.h b/database/engine/page.h new file mode 100644 index 000000000..32c87c580 --- /dev/null +++ b/database/engine/page.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef DBENGINE_PAGE_H +#define DBENGINE_PAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "libnetdata/libnetdata.h" + +typedef struct pgd_cursor { + struct pgd *pgd; + uint32_t position; + uint32_t slots; + + gorilla_reader_t gr; +} PGDC; + +#include "rrdengine.h" + +typedef struct pgd PGD; + +#define PGD_EMPTY (PGD *)(-1) + +void pgd_init_arals(void); + +PGD *pgd_create(uint8_t type, uint32_t slots); +PGD *pgd_create_from_disk_data(uint8_t type, void *base, uint32_t size); +void pgd_free(PGD *pg); + +uint32_t pgd_type(PGD *pg); +bool pgd_is_empty(PGD *pg); +uint32_t pgd_slots_used(PGD *pg); + +uint32_t pgd_memory_footprint(PGD *pg); +uint32_t pgd_disk_footprint(PGD *pg); + +void pgd_copy_to_extent(PGD *pg, uint8_t *dst, uint32_t dst_size); + +void pgd_append_point(PGD *pg, + usec_t point_in_time_ut, + NETDATA_DOUBLE n, + NETDATA_DOUBLE min_value, + NETDATA_DOUBLE max_value, + uint16_t count, + uint16_t anomaly_count, + SN_FLAGS flags, + uint32_t expected_slot); + +void pgdc_reset(PGDC *pgdc, PGD *pgd, uint32_t position); +bool pgdc_get_next_point(PGDC *pgdc, uint32_t expected_position, STORAGE_POINT *sp); + +#ifdef __cplusplus +} +#endif + +#endif // DBENGINE_PAGE_H diff --git a/database/engine/page_test.cc b/database/engine/page_test.cc new file mode 100644 index 000000000..d61299bc4 --- /dev/null +++ b/database/engine/page_test.cc @@ -0,0 +1,405 @@ +#include "page.h" +#include "page_test.h" + +#ifdef HAVE_GTEST + +#include <gtest/gtest.h> +#include <limits> +#include <random> + +bool operator==(const STORAGE_POINT lhs, const STORAGE_POINT rhs) { + if (lhs.min != rhs.min) + return false; + + if (lhs.max != rhs.max) + return false; + + if (lhs.sum != rhs.sum) + return false; + + if (lhs.start_time_s != rhs.start_time_s) + return false; + + if (lhs.end_time_s != rhs.end_time_s) + return false; + + if (lhs.count != rhs.count) + return false; + + if (lhs.flags != rhs.flags) + return false; + + return true; +} + +// TODO: use value-parameterized tests +// http://google.github.io/googletest/advanced.html#value-parameterized-tests +static uint8_t page_type = PAGE_GORILLA_METRICS; + +static size_t slots_for_page(size_t n) { + switch (page_type) { + case PAGE_METRICS: + return 1024; + case PAGE_GORILLA_METRICS: + return n; + default: + fatal("Slots requested for unsupported page: %uc", page_type); + } +} + +TEST(PGD, EmptyOrNull) { + PGD *pg = NULL; + + PGDC cursor; + STORAGE_POINT sp; + + EXPECT_TRUE(pgd_is_empty(pg)); + EXPECT_EQ(pgd_slots_used(pg), 0); + EXPECT_EQ(pgd_memory_footprint(pg), 0); + EXPECT_EQ(pgd_disk_footprint(pg), 0); + + pgdc_reset(&cursor, pg, 0); + EXPECT_FALSE(pgdc_get_next_point(&cursor, 0, &sp)); + + pgd_free(pg); + + pg = PGD_EMPTY; + + EXPECT_TRUE(pgd_is_empty(pg)); + EXPECT_EQ(pgd_slots_used(pg), 0); + EXPECT_EQ(pgd_memory_footprint(pg), 0); + EXPECT_EQ(pgd_disk_footprint(pg), 0); + EXPECT_FALSE(pgdc_get_next_point(&cursor, 0, &sp)); + + pgdc_reset(&cursor, pg, 0); + EXPECT_FALSE(pgdc_get_next_point(&cursor, 0, &sp)); + + pgd_free(pg); +} + +TEST(PGD, Create) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg = pgd_create(page_type, slots); + + EXPECT_EQ(pgd_type(pg), page_type); + EXPECT_TRUE(pgd_is_empty(pg)); + EXPECT_EQ(pgd_slots_used(pg), 0); + + for (size_t i = 0; i != slots; i++) { + pgd_append_point(pg, i, i, 0, 0, 1, 1, SN_DEFAULT_FLAGS, i); + EXPECT_FALSE(pgd_is_empty(pg)); + } + EXPECT_EQ(pgd_slots_used(pg), slots); + + EXPECT_DEATH( + pgd_append_point(pg, slots, slots, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slots), + ".*" + ); + + pgd_free(pg); +} + +TEST(PGD, CursorFullPage) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg = pgd_create(page_type, slots); + + for (size_t slot = 0; slot != slots; slot++) + pgd_append_point(pg, slot, slot, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot); + + for (size_t i = 0; i != 2; i++) { + PGDC cursor; + pgdc_reset(&cursor, pg, 0); + + STORAGE_POINT sp; + for (size_t slot = 0; slot != slots; slot++) { + EXPECT_TRUE(pgdc_get_next_point(&cursor, slot, &sp)); + + EXPECT_EQ(slot, static_cast<size_t>(sp.min)); + EXPECT_EQ(sp.min, sp.max); + EXPECT_EQ(sp.min, sp.sum); + EXPECT_EQ(sp.count, 1); + EXPECT_EQ(sp.anomaly_count, 0); + } + + EXPECT_FALSE(pgdc_get_next_point(&cursor, slots, &sp)); + } + + for (size_t i = 0; i != 2; i++) { + PGDC cursor; + pgdc_reset(&cursor, pg, slots / 2); + + STORAGE_POINT sp; + for (size_t slot = slots / 2; slot != slots; slot++) { + EXPECT_TRUE(pgdc_get_next_point(&cursor, slot, &sp)); + + EXPECT_EQ(slot, static_cast<size_t>(sp.min)); + EXPECT_EQ(sp.min, sp.max); + EXPECT_EQ(sp.min, sp.sum); + EXPECT_EQ(sp.count, 1); + EXPECT_EQ(sp.anomaly_count, 0); + } + + EXPECT_FALSE(pgdc_get_next_point(&cursor, slots, &sp)); + } + + // out of bounds seek + { + PGDC cursor; + pgdc_reset(&cursor, pg, 2 * slots); + + STORAGE_POINT sp; + EXPECT_FALSE(pgdc_get_next_point(&cursor, 2 * slots, &sp)); + } + + pgd_free(pg); +} + +TEST(PGD, CursorHalfPage) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg = pgd_create(page_type, slots); + + PGDC cursor; + STORAGE_POINT sp; + + // fill the 1st half of the page + for (size_t slot = 0; slot != slots / 2; slot++) + pgd_append_point(pg, slot, slot, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot); + + pgdc_reset(&cursor, pg, 0); + + for (size_t slot = 0; slot != slots / 2; slot++) { + EXPECT_TRUE(pgdc_get_next_point(&cursor, slot, &sp)); + + EXPECT_EQ(slot, static_cast<size_t>(sp.min)); + EXPECT_EQ(sp.min, sp.max); + EXPECT_EQ(sp.min, sp.sum); + EXPECT_EQ(sp.count, 1); + EXPECT_EQ(sp.anomaly_count, 0); + } + EXPECT_FALSE(pgdc_get_next_point(&cursor, slots / 2, &sp)); + + // reset pgdc to the end of the page, we should not be getting more + // points even if the page has grown in between. + + pgdc_reset(&cursor, pg, slots / 2); + + for (size_t slot = slots / 2; slot != slots; slot++) + pgd_append_point(pg, slot, slot, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot); + + for (size_t slot = slots / 2; slot != slots; slot++) + EXPECT_FALSE(pgdc_get_next_point(&cursor, slot, &sp)); + + EXPECT_FALSE(pgdc_get_next_point(&cursor, slots, &sp)); + + pgd_free(pg); +} + +TEST(PGD, MemoryFootprint) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg = pgd_create(page_type, slots); + + uint32_t footprint = 0; + switch (pgd_type(pg)) { + case PAGE_METRICS: + footprint = slots * sizeof(uint32_t); + break; + case PAGE_GORILLA_METRICS: + footprint = 128 * sizeof(uint32_t); + break; + default: + fatal("Uknown page type: %uc", pgd_type(pg)); + } + EXPECT_NEAR(pgd_memory_footprint(pg), footprint, 128); + + std::random_device rand_dev; + std::mt19937 gen(rand_dev()); + std::uniform_int_distribution<uint32_t> distr(std::numeric_limits<uint32_t>::min(), + std::numeric_limits<uint32_t>::max()); // define the range + + for (size_t slot = 0; slot != slots; slot++) { + uint32_t n = distr(gen); + pgd_append_point(pg, slot, n, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot); + } + + footprint = slots * sizeof(uint32_t); + + uint32_t abs_error = 0; + switch (pgd_type(pg)) { + case PAGE_METRICS: + abs_error = 128; + break; + case PAGE_GORILLA_METRICS: + abs_error = footprint / 10; + break; + default: + fatal("Uknown page type: %uc", pgd_type(pg)); + } + + EXPECT_NEAR(pgd_memory_footprint(pg), footprint, abs_error); +} + +TEST(PGD, DiskFootprint) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg = pgd_create(page_type, slots); + + std::random_device rand_dev; + std::mt19937 gen(rand_dev()); + std::uniform_int_distribution<uint32_t> distr(std::numeric_limits<uint32_t>::min(), + std::numeric_limits<uint32_t>::max()); // define the range + + size_t used_slots = 16; + + for (size_t slot = 0; slot != used_slots; slot++) { + uint32_t n = distr(gen); + pgd_append_point(pg, slot, n, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot); + } + + uint32_t footprint = 0; + switch (pgd_type(pg)) { + case PAGE_METRICS: + footprint = used_slots * sizeof(uint32_t); + break; + case PAGE_GORILLA_METRICS: + footprint = 128 * sizeof(uint32_t); + break; + default: + fatal("Uknown page type: %uc", pgd_type(pg)); + } + EXPECT_EQ(pgd_disk_footprint(pg), footprint); + + pgd_free(pg); + + pg = pgd_create(page_type, slots); + + used_slots = 128 + 64; + + for (size_t slot = 0; slot != used_slots; slot++) { + uint32_t n = distr(gen); + pgd_append_point(pg, slot, n, 0, 0, 1, 1, SN_DEFAULT_FLAGS, slot); + } + + switch (pgd_type(pg)) { + case PAGE_METRICS: + footprint = used_slots * sizeof(uint32_t); + break; + case PAGE_GORILLA_METRICS: + footprint = 2 * (128 * sizeof(uint32_t)); + break; + default: + fatal("Uknown page type: %uc", pgd_type(pg)); + } + EXPECT_EQ(pgd_disk_footprint(pg), footprint); + + pgd_free(pg); +} + +TEST(PGD, CopyToExtent) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg_collector = pgd_create(page_type, slots); + + uint32_t value = 666; + pgd_append_point(pg_collector, 0, value, 0, 0, 1, 0, SN_DEFAULT_FLAGS, 0); + + uint32_t size_in_bytes = pgd_disk_footprint(pg_collector); + EXPECT_EQ(size_in_bytes, 512); + + uint32_t size_in_words = size_in_bytes / sizeof(uint32_t); + alignas(sizeof(uintptr_t)) uint32_t disk_buffer[size_in_words]; + + for (size_t i = 0; i != size_in_words; i++) { + disk_buffer[i] = std::numeric_limits<uint32_t>::max(); + } + + pgd_copy_to_extent(pg_collector, (uint8_t *) &disk_buffer[0], size_in_bytes); + + EXPECT_EQ(disk_buffer[0], NULL); + EXPECT_EQ(disk_buffer[1], NULL); + EXPECT_EQ(disk_buffer[2], 1); + EXPECT_EQ(disk_buffer[3], 32); + storage_number sn = pack_storage_number(value, SN_DEFAULT_FLAGS); + EXPECT_EQ(disk_buffer[4], sn); + + // make sure the rest of the page is 0'ed so that it's amenable to compression + for (size_t i = 5; i != size_in_words; i++) + EXPECT_EQ(disk_buffer[i], 0); + + pgd_free(pg_collector); +} + +TEST(PGD, Roundtrip) { + size_t slots = slots_for_page(1024 * 1024); + PGD *pg_collector = pgd_create(page_type, slots); + + for (size_t i = 0; i != slots; i++) + pgd_append_point(pg_collector, i, i, 0, 0, 1, 1, SN_DEFAULT_FLAGS, i); + + uint32_t size_in_bytes = pgd_disk_footprint(pg_collector); + uint32_t size_in_words = size_in_bytes / sizeof(uint32_t); + + alignas(sizeof(uintptr_t)) uint32_t disk_buffer[size_in_words]; + for (size_t i = 0; i != size_in_words; i++) + disk_buffer[i] = std::numeric_limits<uint32_t>::max(); + + pgd_copy_to_extent(pg_collector, (uint8_t *) &disk_buffer[0], size_in_bytes); + + PGD *pg_disk = pgd_create_from_disk_data(page_type, &disk_buffer[0], size_in_bytes); + EXPECT_EQ(pgd_slots_used(pg_disk), slots); + + // Expected memory footprint is equal to the disk footprint + a couple + // bytes for the PGD metadata. + EXPECT_NEAR(pgd_memory_footprint(pg_disk), size_in_bytes, 128); + + // Do not allow calling disk footprint for pages created from disk. + EXPECT_DEATH(pgd_disk_footprint(pg_disk), ".*"); + + for (size_t i = 0; i != 10; i++) { + PGDC cursor_collector; + PGDC cursor_disk; + + pgdc_reset(&cursor_collector, pg_collector, i * 1024); + pgdc_reset(&cursor_disk, pg_disk, i * 1024); + + STORAGE_POINT sp_collector = {}; + STORAGE_POINT sp_disk = {}; + + for (size_t slot = i * 1024; slot != slots; slot++) { + EXPECT_TRUE(pgdc_get_next_point(&cursor_collector, slot, &sp_collector)); + EXPECT_TRUE(pgdc_get_next_point(&cursor_disk, slot, &sp_disk)); + + EXPECT_EQ(sp_collector, sp_disk); + } + + EXPECT_FALSE(pgdc_get_next_point(&cursor_collector, slots, &sp_collector)); + EXPECT_FALSE(pgdc_get_next_point(&cursor_disk, slots, &sp_disk)); + } + + pgd_free(pg_disk); + pgd_free(pg_collector); +} + +int pgd_test(int argc, char *argv[]) +{ + // Dummy/necessary initialization stuff + PGC *dummy_cache = pgc_create("pgd-tests-cache", 32 * 1024 * 1024, NULL, 64, NULL, NULL, + 10, 10, 1000, 10, PGC_OPTIONS_NONE, 1, 11); + pgd_init_arals(); + + ::testing::InitGoogleTest(&argc, argv); + int rc = RUN_ALL_TESTS(); + + pgc_destroy(dummy_cache); + + return rc; +} + +#else // HAVE_GTEST + +int pgd_test(int argc, char *argv[]) +{ + (void) argc; + (void) argv; + fprintf(stderr, "Can not run PGD tests because the agent was not build with support for google tests.\n"); + return 0; +} + +#endif // HAVE_GTEST diff --git a/database/engine/page_test.h b/database/engine/page_test.h new file mode 100644 index 000000000..30837f0ab --- /dev/null +++ b/database/engine/page_test.h @@ -0,0 +1,14 @@ +#ifndef PAGE_TEST_H +#define PAGE_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +int pgd_test(int argc, char *argv[]); + +#ifdef __cplusplus +} +#endif + +#endif /* PAGE_TEST_H */ diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index c608c3270..dab9cdd0d 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -12,8 +12,9 @@ struct rrdeng_cache_efficiency_stats rrdeng_cache_efficiency_stats = {}; static void main_cache_free_clean_page_callback(PGC *cache __maybe_unused, PGC_ENTRY entry __maybe_unused) { // Release storage associated with the page - dbengine_page_free(entry.data, entry.size); + pgd_free(entry.data); } + static void main_cache_flush_dirty_page_init_callback(PGC *cache __maybe_unused, Word_t section) { struct rrdengine_instance *ctx = (struct rrdengine_instance *) section; @@ -28,8 +29,6 @@ static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ struct rrdengine_instance *ctx = (struct rrdengine_instance *) entries_array[0].section; - size_t bytes_per_point = CTX_POINT_SIZE_BYTES(ctx); - struct page_descr_with_data *base = NULL; for (size_t Index = 0 ; Index < entries; Index++) { @@ -42,21 +41,15 @@ static void main_cache_flush_dirty_page_callback(PGC *cache __maybe_unused, PGC_ descr->start_time_ut = start_time_s * USEC_PER_SEC; descr->end_time_ut = end_time_s * USEC_PER_SEC; descr->update_every_s = entries_array[Index].update_every_s; - descr->type = ctx->config.page_type; - descr->page_length = (end_time_s - (start_time_s - descr->update_every_s)) / descr->update_every_s * bytes_per_point; + descr->pgd = pgc_page_data(pages_array[Index]); + descr->type = pgd_type(descr->pgd); + descr->page_length = pgd_disk_footprint(descr->pgd); - if(descr->page_length > entries_array[Index].size) { - descr->page_length = entries_array[Index].size; - - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE: page exceeds the maximum size, adjusting it to max."); - } - - descr->page = pgc_page_data(pages_array[Index]); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, descr, link.prev, link.next); - internal_fatal(descr->page_length > RRDENG_BLOCK_SIZE, "DBENGINE: faulty page length calculation"); + // TODO: ask @stelfrag/@ktsaou about this. + // internal_fatal(descr->page_length > RRDENG_BLOCK_SIZE, "DBENGINE: faulty page length calculation"); } struct completion completion; @@ -254,7 +247,6 @@ static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengin time_t page_start_time_s = pgc_page_start_time_s(page); time_t page_end_time_s = pgc_page_end_time_s(page); time_t page_update_every_s = pgc_page_update_every_s(page); - size_t page_length = pgc_page_data_size(cache, page); if(!page_update_every_s) page_update_every_s = dt_s; @@ -277,24 +269,10 @@ static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengin if (!PValue || PValue == PJERR) fatal("DBENGINE: corrupted judy array in %s()", __FUNCTION__ ); - if (unlikely(*PValue)) { - struct page_details *pd = *PValue; - UNUSED(pd); - -// internal_error( -// pd->first_time_s != page_first_time_s || -// pd->last_time_s != page_last_time_s || -// pd->update_every_s != page_update_every_s, -// "DBENGINE: duplicate page with different retention in %s cache " -// "1st: %ld to %ld, ue %u, size %u " -// "2nd: %ld to %ld, ue %ld size %zu " -// "- ignoring the second", -// cache == open_cache ? "open" : "main", -// pd->first_time_s, pd->last_time_s, pd->update_every_s, pd->page_length, -// page_first_time_s, page_last_time_s, page_update_every_s, page_length); - + if (unlikely(*PValue)) + // already exists in our list pgc_page_release(cache, page); - } + else { internal_fatal(pgc_page_metric(page) != metric_id, "Wrong metric id in page found in cache"); @@ -304,7 +282,6 @@ static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengin pd->metric_id = metric_id; pd->first_time_s = page_start_time_s; pd->last_time_s = page_end_time_s; - pd->page_length = page_length; pd->update_every_s = (uint32_t) page_update_every_s; pd->page = (open_cache_mode) ? NULL : page; pd->status |= tags; @@ -312,7 +289,7 @@ static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengin if((pd->page)) { pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED; - if(pgc_page_data(page) == DBENGINE_EMPTY_PAGE) + if(pgd_is_empty(pgc_page_data(page))) pd->status |= PDC_PAGE_EMPTY; } @@ -369,7 +346,7 @@ static void pgc_inject_gap(struct rrdengine_instance *ctx, METRIC *metric, time_ .end_time_s = MIN(end_time_s, db_last_time_s), .update_every_s = 0, .size = 0, - .data = DBENGINE_EMPTY_PAGE, + .data = PGD_EMPTY, }; if(page_entry.start_time_s >= page_entry.end_time_s) @@ -478,7 +455,7 @@ static size_t list_has_time_gaps( pd->status &= ~PDC_PAGE_DISK_PENDING; pd->status |= PDC_PAGE_READY | PDC_PAGE_PRELOADED | PDC_PAGE_PRELOADED_PASS4; - if(pgc_page_data(pd->page) == DBENGINE_EMPTY_PAGE) + if(pgd_is_empty(pgc_page_data(pd->page))) pd->status |= PDC_PAGE_EMPTY; } @@ -642,7 +619,6 @@ void add_page_details_from_journal_v2(PGC_PAGE *page, void *JudyL_pptr) { pd->first_time_s = pgc_page_start_time_s(page); pd->last_time_s = pgc_page_end_time_s(page); pd->datafile.ptr = datafile; - pd->page_length = ei->page_length; pd->update_every_s = (uint32_t) pgc_page_update_every_s(page); pd->metric_id = metric_id; pd->status |= PDC_PAGE_DISK_PENDING | PDC_PAGE_SOURCE_JOURNAL_V2 | PDC_PAGE_DATAFILE_ACQUIRED; @@ -917,7 +893,7 @@ struct pgc_page *pg_cache_lookup_next( } } - if(page && pgc_page_data(page) == DBENGINE_EMPTY_PAGE) + if(page && pgd_is_empty(pgc_page_data(page))) pdc_page_status_set(pd, PDC_PAGE_EMPTY); if(!page || pdc_page_status_check(pd, PDC_PAGE_QUERY_GLOBAL_SKIP_LIST | PDC_PAGE_EMPTY)) { @@ -930,7 +906,6 @@ struct pgc_page *pg_cache_lookup_next( time_t page_start_time_s = pgc_page_start_time_s(page); time_t page_end_time_s = pgc_page_end_time_s(page); time_t page_update_every_s = pgc_page_update_every_s(page); - size_t page_length = pgc_page_data_size(main_cache, page); if(unlikely(page_start_time_s == INVALID_TIME || page_end_time_s == INVALID_TIME)) { __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_zero_time_skipped, 1, __ATOMIC_RELAXED); @@ -939,13 +914,6 @@ struct pgc_page *pg_cache_lookup_next( pd->page = page = NULL; continue; } - else if(page_length > RRDENG_BLOCK_SIZE) { - __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_size_skipped, 1, __ATOMIC_RELAXED); - pgc_page_to_clean_evict_or_release(main_cache, page); - pdc_page_status_set(pd, PDC_PAGE_INVALID | PDC_PAGE_RELEASED); - pd->page = page = NULL; - continue; - } else { if (unlikely(page_update_every_s <= 0 || page_update_every_s > 86400)) { __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_invalid_update_every_fixed, 1, __ATOMIC_RELAXED); @@ -953,7 +921,7 @@ struct pgc_page *pg_cache_lookup_next( pd->update_every_s = (uint32_t) page_update_every_s; } - size_t entries_by_size = page_entries_by_size(page_length, CTX_POINT_SIZE_BYTES(ctx)); + size_t entries_by_size = pgd_slots_used(pgc_page_data(page)); size_t entries_by_time = page_entries_by_time(page_start_time_s, page_end_time_s, page_update_every_s); if(unlikely(entries_by_size < entries_by_time)) { time_t fixed_page_end_time_s = (time_t)(page_start_time_s + (entries_by_size - 1) * page_update_every_s); diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index 5242db89e..dbcbea53a 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -27,7 +27,7 @@ struct page_descr_with_data { uint8_t type; uint32_t update_every_s; uint32_t page_length; - uint8_t *page; + struct pgd *pgd; struct { struct page_descr_with_data *prev; diff --git a/database/engine/pdc.c b/database/engine/pdc.c index 7da568787..5fe205e64 100644 --- a/database/engine/pdc.c +++ b/database/engine/pdc.c @@ -629,14 +629,33 @@ void collect_page_flags_to_buffer(BUFFER *wb, RRDENG_COLLECT_PAGE_FLAGS flags) { } inline VALIDATED_PAGE_DESCRIPTOR validate_extent_page_descr(const struct rrdeng_extent_page_descr *descr, time_t now_s, time_t overwrite_zero_update_every_s, bool have_read_error) { + time_t start_time_s = (time_t) (descr->start_time_ut / USEC_PER_SEC); + + time_t end_time_s; + size_t entries; + + switch (descr->type) { + case PAGE_METRICS: + case PAGE_TIER: + end_time_s = descr->end_time_ut / USEC_PER_SEC; + entries = 0; + break; + case PAGE_GORILLA_METRICS: + end_time_s = start_time_s + descr->gorilla.delta_time_s; + entries = descr->gorilla.entries; + break; + default: + fatal("Unknown page type: %uc\n", descr->type); + } + return validate_page( (uuid_t *)descr->uuid, - (time_t) (descr->start_time_ut / USEC_PER_SEC), - (time_t) (descr->end_time_ut / USEC_PER_SEC), + start_time_s, + end_time_s, 0, descr->page_length, descr->type, - 0, + entries, now_s, overwrite_zero_update_every_s, have_read_error, @@ -666,13 +685,25 @@ VALIDATED_PAGE_DESCRIPTOR validate_page( .is_valid = true, }; - // always calculate entries by size vd.point_size = page_type_size[vd.type]; - vd.entries = page_entries_by_size(vd.page_length, vd.point_size); - - // allow to be called without entries (when loading pages from disk) - if(!entries) - entries = vd.entries; + switch (page_type) { + case PAGE_METRICS: + case PAGE_TIER: + // always calculate entries by size + vd.entries = page_entries_by_size(vd.page_length, vd.point_size); + + // allow to be called without entries (when loading pages from disk) + if(!entries) + entries = vd.entries; + break; + case PAGE_GORILLA_METRICS: + internal_fatal(entries == 0, "0 number of entries found on gorilla page"); + vd.entries = entries; + break; + default: + // TODO: should set vd.is_valid false instead? + fatal("Unknown page type: %uc", page_type); + } // allow to be called without update every (when loading pages from disk) if(!update_every_s) { @@ -687,19 +718,26 @@ VALIDATED_PAGE_DESCRIPTOR validate_page( bool updated = false; + size_t max_page_length = RRDENG_BLOCK_SIZE; + + // If gorilla can not compress the data we might end up needing slightly more + // than 4KiB. However, gorilla pages extend the page length by increments of + // 512 bytes. + max_page_length += ((page_type == PAGE_GORILLA_METRICS) * GORILLA_BUFFER_SIZE); + if( have_read_error || vd.page_length == 0 || - vd.page_length > RRDENG_BLOCK_SIZE || + vd.page_length > max_page_length || vd.start_time_s > vd.end_time_s || (now_s && vd.end_time_s > now_s) || vd.start_time_s <= 0 || vd.end_time_s <= 0 || vd.update_every_s < 0 || (vd.start_time_s == vd.end_time_s && vd.entries > 1) || - (vd.update_every_s == 0 && vd.entries > 1) - ) + (vd.update_every_s == 0 && vd.entries > 1)) + { vd.is_valid = false; - + } else { if(unlikely(vd.entries != entries || vd.update_every_s != update_every_s)) updated = true; @@ -734,7 +772,7 @@ VALIDATED_PAGE_DESCRIPTOR validate_page( if(unlikely(!vd.is_valid || updated)) { #ifndef NETDATA_INTERNAL_CHECKS - error_limit_static_global_var(erl, 1, 0); + nd_log_limit_static_global_var(erl, 1, 0); #endif char uuid_str[UUID_STR_LEN + 1]; uuid_unparse(*uuid, uuid_str); @@ -750,7 +788,7 @@ VALIDATED_PAGE_DESCRIPTOR validate_page( #ifdef NETDATA_INTERNAL_CHECKS internal_error(true, #else - error_limit(&erl, + nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, #endif "DBENGINE: metric '%s' %s invalid page of type %u " "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s)", @@ -770,7 +808,7 @@ VALIDATED_PAGE_DESCRIPTOR validate_page( #ifdef NETDATA_INTERNAL_CHECKS internal_error(true, #else - error_limit(&erl, + nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, #endif "DBENGINE: metric '%s' %s page of type %u " "from %ld to %ld (now %ld), update every %ld, page length %zu, entries %zu (flags: %s), " @@ -832,7 +870,15 @@ static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL * if (descr) { start_time_s = (time_t)(descr->start_time_ut / USEC_PER_SEC); - end_time_s = (time_t)(descr->end_time_ut / USEC_PER_SEC); + switch (descr->type) { + case PAGE_METRICS: + case PAGE_TIER: + end_time_s = (time_t)(descr->end_time_ut / USEC_PER_SEC); + break; + case PAGE_GORILLA_METRICS: + end_time_s = (time_t) start_time_s + (descr->gorilla.delta_time_s); + break; + } uuid_unparse_lower(descr->uuid, uuid); used_descr = true; } @@ -869,8 +915,8 @@ static void epdl_extent_loading_error_log(struct rrdengine_instance *ctx, EPDL * if(end_time_s) log_date(end_time_str, LOG_DATE_LENGTH, end_time_s); - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, "DBENGINE: error while reading extent from datafile %u of tier %d, at offset %" PRIu64 " (%u bytes) " "%s from %ld (%s) to %ld (%s) %s%s: " "%s", @@ -952,7 +998,9 @@ static bool epdl_populate_pages_from_extent_data( uncompressed_payload_length = 0; for (i = 0; i < count; ++i) { size_t page_length = header->descr[i].page_length; - if(page_length > RRDENG_BLOCK_SIZE) { + if (page_length > RRDENG_BLOCK_SIZE && (header->descr[i].type != PAGE_GORILLA_METRICS || + (header->descr[i].type == PAGE_GORILLA_METRICS && + (page_length - RRDENG_BLOCK_SIZE) % GORILLA_BUFFER_SIZE))) { have_read_error = true; break; } @@ -993,7 +1041,7 @@ static bool epdl_populate_pages_from_extent_data( if(!page_length || !start_time_s) { char log[200 + 1]; - snprintfz(log, 200, "page %u (out of %u) is EMPTY", i, count); + snprintfz(log, sizeof(log) - 1, "page %u (out of %u) is EMPTY", i, count); epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); continue; } @@ -1002,7 +1050,7 @@ static bool epdl_populate_pages_from_extent_data( Word_t metric_id = (Word_t)metric; if(!metric) { char log[200 + 1]; - snprintfz(log, 200, "page %u (out of %u) has unknown UUID", i, count); + snprintfz(log, sizeof(log) - 1, "page %u (out of %u) has unknown UUID", i, count); epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); continue; } @@ -1020,32 +1068,34 @@ static bool epdl_populate_pages_from_extent_data( if(worker) worker_is_busy(UV_EVENT_DBENGINE_EXTENT_PAGE_ALLOCATION); - void *page_data; + PGD *pgd; if (unlikely(!vd.is_valid)) { - page_data = DBENGINE_EMPTY_PAGE; + pgd = PGD_EMPTY; stats_load_invalid_page++; } else { if (RRD_NO_COMPRESSION == header->compression_algorithm) { - page_data = dbengine_page_alloc(vd.page_length); - memcpy(page_data, data + payload_offset + page_offset, (size_t) vd.page_length); + pgd = pgd_create_from_disk_data(header->descr[i].type, + data + payload_offset + page_offset, + vd.page_length); stats_load_uncompressed++; } else { if (unlikely(page_offset + vd.page_length > uncompressed_payload_length)) { char log[200 + 1]; - snprintfz(log, 200, "page %u (out of %u) offset %u + page length %zu, " + snprintfz(log, sizeof(log) - 1, "page %u (out of %u) offset %u + page length %zu, " "exceeds the uncompressed buffer size %u", i, count, page_offset, vd.page_length, uncompressed_payload_length); epdl_extent_loading_error_log(ctx, epdl, &header->descr[i], log); - page_data = DBENGINE_EMPTY_PAGE; + pgd = PGD_EMPTY; stats_load_invalid_page++; } else { - page_data = dbengine_page_alloc(vd.page_length); - memcpy(page_data, uncompressed_buf + page_offset, vd.page_length); + pgd = pgd_create_from_disk_data(header->descr[i].type, + uncompressed_buf + page_offset, + vd.page_length); stats_load_compressed++; } } @@ -1061,14 +1111,14 @@ static bool epdl_populate_pages_from_extent_data( .start_time_s = vd.start_time_s, .end_time_s = vd.end_time_s, .update_every_s = (uint32_t) vd.update_every_s, - .size = (size_t) ((page_data == DBENGINE_EMPTY_PAGE) ? 0 : vd.page_length), - .data = page_data + .size = pgd_memory_footprint(pgd), // the footprint of the entire PGD, for accurate memory management + .data = pgd, }; bool added = true; PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added); if (false == added) { - dbengine_page_free(page_data, vd.page_length); + pgd_free(pgd); stats_cache_hit_while_inserting++; stats_data_from_main_cache++; } @@ -1081,8 +1131,7 @@ static bool epdl_populate_pages_from_extent_data( pgc_page_dup(main_cache, page); pd->page = page; - pd->page_length = pgc_page_data_size(main_cache, page); - pdc_page_status_set(pd, PDC_PAGE_READY | tags | ((page_data == DBENGINE_EMPTY_PAGE) ? PDC_PAGE_EMPTY : 0)); + pdc_page_status_set(pd, PDC_PAGE_READY | tags | (pgd_is_empty(pgd) ? PDC_PAGE_EMPTY : 0)); pd = pd->load.next; } while(pd); diff --git a/database/engine/rrddiskprotocol.h b/database/engine/rrddiskprotocol.h index 5b4be9498..86b41f0b3 100644 --- a/database/engine/rrddiskprotocol.h +++ b/database/engine/rrddiskprotocol.h @@ -3,6 +3,8 @@ #ifndef NETDATA_RRDDISKPROTOCOL_H #define NETDATA_RRDDISKPROTOCOL_H +#include <stdint.h> + #define RRDENG_BLOCK_SIZE (4096) #define RRDFILE_ALIGNMENT RRDENG_BLOCK_SIZE @@ -36,7 +38,8 @@ struct rrdeng_df_sb { */ #define PAGE_METRICS (0) #define PAGE_TIER (1) -#define PAGE_TYPE_MAX 1 // Maximum page type (inclusive) +#define PAGE_GORILLA_METRICS (2) +#define PAGE_TYPE_MAX 2 // Maximum page type (inclusive) /* * Data file page descriptor @@ -47,7 +50,14 @@ struct rrdeng_extent_page_descr { uint8_t uuid[UUID_SZ]; uint32_t page_length; uint64_t start_time_ut; - uint64_t end_time_ut; + union { + struct { + uint32_t entries; + uint32_t delta_time_s; + } gorilla __attribute__((packed)); + + uint64_t end_time_ut; + }; } __attribute__ ((packed)); /* diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index 99257b79d..b82cc1ad1 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -40,6 +40,7 @@ struct rrdeng_main { uv_async_t async; uv_timer_t timer; pid_t tid; + bool shutdown; size_t flushes_running; size_t evictions_running; @@ -577,55 +578,6 @@ static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker) { // ---------------------------------------------------------------------------- -struct { - ARAL *aral[RRD_STORAGE_TIERS]; -} dbengine_page_alloc_globals = {}; - -static inline ARAL *page_size_lookup(size_t size) { - for(size_t tier = 0; tier < storage_tiers ;tier++) - if(size == tier_page_size[tier]) - return dbengine_page_alloc_globals.aral[tier]; - - return NULL; -} - -static void dbengine_page_alloc_init(void) { - for(size_t i = storage_tiers; i > 0 ;i--) { - size_t tier = storage_tiers - i; - - char buf[20 + 1]; - snprintfz(buf, 20, "tier%zu-pages", tier); - - dbengine_page_alloc_globals.aral[tier] = aral_create( - buf, - tier_page_size[tier], - 64, - 512 * tier_page_size[tier], - pgc_aral_statistics(), - NULL, NULL, false, false); - } -} - -void *dbengine_page_alloc(size_t size) { - ARAL *ar = page_size_lookup(size); - if(ar) return aral_mallocz(ar); - - return mallocz(size); -} - -void dbengine_page_free(void *page, size_t size __maybe_unused) { - if(unlikely(!page || page == DBENGINE_EMPTY_PAGE)) - return; - - ARAL *ar = page_size_lookup(size); - if(ar) - aral_freez(ar, page); - else - freez(page); -} - -// ---------------------------------------------------------------------------- - void *dbengine_extent_alloc(size_t size) { void *extent = mallocz(size); return extent; @@ -890,12 +842,25 @@ static struct extent_io_descriptor *datafile_extent_build(struct rrdengine_insta uuid_copy(*(uuid_t *)header->descr[i].uuid, *descr->id); header->descr[i].page_length = descr->page_length; header->descr[i].start_time_ut = descr->start_time_ut; - header->descr[i].end_time_ut = descr->end_time_ut; + + switch (descr->type) { + case PAGE_METRICS: + case PAGE_TIER: + header->descr[i].end_time_ut = descr->end_time_ut; + break; + case PAGE_GORILLA_METRICS: + header->descr[i].gorilla.delta_time_s = (uint32_t) ((descr->end_time_ut - descr->start_time_ut) / USEC_PER_SEC); + header->descr[i].gorilla.entries = pgd_slots_used(descr->pgd); + break; + default: + fatal("Unknown page type: %uc", descr->type); + } + pos += sizeof(header->descr[i]); } for (i = 0 ; i < count ; ++i) { descr = xt_io_descr->descr_array[i]; - (void) memcpy(xt_io_descr->buf + pos, descr->page, descr->page_length); + pgd_copy_to_extent(descr->pgd, xt_io_descr->buf + pos, descr->page_length); pos += descr->page_length; } @@ -1381,9 +1346,6 @@ static void after_ctx_shutdown(struct rrdengine_instance *ctx __maybe_unused, vo static void *ctx_shutdown_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { worker_is_busy(UV_EVENT_DBENGINE_SHUTDOWN); - completion_wait_for(&ctx->quiesce.completion); - completion_destroy(&ctx->quiesce.completion); - bool logged = false; while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED) || __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED)) { @@ -1436,6 +1398,14 @@ uint64_t rrdeng_target_data_file_size(struct rrdengine_instance *ctx) { bool rrdeng_ctx_exceeded_disk_quota(struct rrdengine_instance *ctx) { + if(!ctx->datafiles.first) + // no datafiles available + return false; + + if(!ctx->datafiles.first->next) + // only 1 datafile available + return false; + uint64_t estimated_disk_space = ctx_current_disk_space_get(ctx) + rrdeng_target_data_file_size(ctx) - (ctx->datafiles.first->prev ? ctx->datafiles.first->prev->pos : 0); @@ -1514,12 +1484,19 @@ static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __mayb spinlock_unlock(&datafile->writers.spinlock); if(!available) { - netdata_log_info("DBENGINE: journal file %u needs to be indexed, but it has writers working on it - skipping it for now", datafile->fileno); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "DBENGINE: journal file %u needs to be indexed, but it has writers working on it - " + "skipping it for now", + datafile->fileno); + datafile = datafile->next; continue; } - netdata_log_info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "DBENGINE: journal file %u is ready to be indexed", + datafile->fileno); + pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); @@ -1532,7 +1509,10 @@ static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __mayb } errno = 0; - internal_error(count, "DBENGINE: journal indexing done; %u files processed", count); + if(count) + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "DBENGINE: journal indexing done; %u files processed", + count); worker_is_idle(); @@ -1628,7 +1608,7 @@ static void dbengine_initialize_structures(void) { rrdeng_query_handle_init(); page_descriptors_init(); extent_buffer_init(); - dbengine_page_alloc_init(); + pgd_init_arals(); extent_io_descriptor_init(); } @@ -1715,6 +1695,7 @@ void dbengine_event_loop(void* arg) { worker_register_job_name(RRDENG_OPCODE_EVICT_INIT, "evict init"); worker_register_job_name(RRDENG_OPCODE_CTX_SHUTDOWN, "ctx shutdown"); worker_register_job_name(RRDENG_OPCODE_CTX_QUIESCE, "ctx quiesce"); + worker_register_job_name(RRDENG_OPCODE_SHUTDOWN_EVLOOP, "dbengine shutdown"); worker_register_job_name(RRDENG_OPCODE_MAX, "get opcode"); @@ -1856,6 +1837,13 @@ void dbengine_event_loop(void* arg) { break; } + case RRDENG_OPCODE_SHUTDOWN_EVLOOP: { + uv_close((uv_handle_t *)&main->async, NULL); + (void) uv_timer_stop(&main->timer); + uv_close((uv_handle_t *)&main->timer, NULL); + shutdown = true; + } + case RRDENG_OPCODE_NOOP: { /* the command queue was empty, do nothing */ break; @@ -1872,18 +1860,7 @@ void dbengine_event_loop(void* arg) { } while (opcode != RRDENG_OPCODE_NOOP); } - /* cleanup operations of the event loop */ - netdata_log_info("DBENGINE: shutting down dbengine thread"); - - /* - * uv_async_send after uv_close does not seem to crash in linux at the moment, - * it is however undocumented behaviour and we need to be aware if this becomes - * an issue in the future. - */ - uv_close((uv_handle_t *)&main->async, NULL); - uv_timer_stop(&main->timer); - uv_close((uv_handle_t *)&main->timer, NULL); - uv_run(&main->loop, UV_RUN_DEFAULT); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Shutting down dbengine thread"); uv_loop_close(&main->loop); worker_unregister(); } diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 08eaf4128..cd3352f12 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -22,6 +22,7 @@ #include "metric.h" #include "cache.h" #include "pdc.h" +#include "page.h" extern unsigned rrdeng_pages_per_extent; @@ -119,7 +120,6 @@ struct page_details { time_t first_time_s; time_t last_time_s; uint32_t update_every_s; - uint16_t page_length; PDC_PAGE_STATUS status; struct { @@ -190,10 +190,11 @@ struct rrdeng_collect_handle { RRDENG_COLLECT_HANDLE_OPTIONS options; uint8_t type; + struct rrdengine_instance *ctx; struct metric *metric; - struct pgc_page *page; - void *data; - size_t data_size; + struct pgc_page *pgc_page; + struct pgd *page_data; + size_t page_data_size; struct pg_alignment *alignment; uint32_t page_entries_max; uint32_t page_position; // keep track of the current page size, to make sure we don't exceed it @@ -206,7 +207,7 @@ struct rrdeng_query_handle { struct metric *metric; struct pgc_page *page; struct rrdengine_instance *ctx; - storage_number *metric_data; + struct pgd_cursor pgdc; struct page_details_control *pdc; // the request @@ -246,6 +247,7 @@ enum rrdeng_opcode { RRDENG_OPCODE_CTX_SHUTDOWN, RRDENG_OPCODE_CTX_QUIESCE, RRDENG_OPCODE_CTX_POPULATE_MRG, + RRDENG_OPCODE_SHUTDOWN_EVLOOP, RRDENG_OPCODE_CLEANUP, RRDENG_OPCODE_MAX @@ -445,9 +447,6 @@ static inline void ctx_last_flush_fileno_set(struct rrdengine_instance *ctx, uns #define ctx_is_available_for_queries(ctx) (__atomic_load_n(&(ctx)->quiesce.enabled, __ATOMIC_RELAXED) == false && __atomic_load_n(&(ctx)->quiesce.exit_mode, __ATOMIC_RELAXED) == false) -void *dbengine_page_alloc(size_t size); -void dbengine_page_free(void *page, size_t size); - void *dbengine_extent_alloc(size_t size); void dbengine_extent_free(void *extent, size_t size); @@ -491,8 +490,6 @@ typedef struct validated_page_descriptor { bool is_valid; } VALIDATED_PAGE_DESCRIPTOR; -#define DBENGINE_EMPTY_PAGE (void *)(-1) - #define page_entries_by_time(start_time_s, end_time_s, update_every_s) \ ((update_every_s) ? (((end_time_s) - ((start_time_s) - (update_every_s))) / (update_every_s)) : 1) diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index 318a933f1..1ddce5243 100755 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-3.0-or-later + +#include "database/engine/rrddiskprotocol.h" #include "rrdengine.h" /* Default global database instance */ @@ -22,10 +24,15 @@ size_t tier_page_size[RRD_STORAGE_TIERS] = {2048, 1024, 192, 192, 192}; size_t tier_page_size[RRD_STORAGE_TIERS] = {4096, 2048, 384, 384, 384}; #endif -#if PAGE_TYPE_MAX != 1 -#error PAGE_TYPE_MAX is not 1 - you need to add allocations here +#if PAGE_TYPE_MAX != 2 +#error PAGE_TYPE_MAX is not 2 - you need to add allocations here #endif -size_t page_type_size[256] = {sizeof(storage_number), sizeof(storage_number_tier1_t)}; + +size_t page_type_size[256] = { + [PAGE_METRICS] = sizeof(storage_number), + [PAGE_TIER] = sizeof(storage_number_tier1_t), + [PAGE_GORILLA_METRICS] = sizeof(storage_number) +}; __attribute__((constructor)) void initialize_multidb_ctx(void) { multidb_ctx[0] = &multidb_ctx_storage_tier0; @@ -198,15 +205,15 @@ static inline void check_and_fix_mrg_update_every(struct rrdeng_collect_handle * static inline bool check_completed_page_consistency(struct rrdeng_collect_handle *handle __maybe_unused) { #ifdef NETDATA_INTERNAL_CHECKS - if (unlikely(!handle->page || !handle->page_entries_max || !handle->page_position || !handle->page_end_time_ut)) + if (unlikely(!handle->pgc_page || !handle->page_entries_max || !handle->page_position || !handle->page_end_time_ut)) return false; struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); uuid_t *uuid = mrg_metric_uuid(main_mrg, handle->metric); - time_t start_time_s = pgc_page_start_time_s(handle->page); - time_t end_time_s = pgc_page_end_time_s(handle->page); - time_t update_every_s = pgc_page_update_every_s(handle->page); + time_t start_time_s = pgc_page_start_time_s(handle->pgc_page); + time_t end_time_s = pgc_page_end_time_s(handle->pgc_page); + time_t update_every_s = pgc_page_update_every_s(handle->pgc_page); size_t page_length = handle->page_position * CTX_POINT_SIZE_BYTES(ctx); size_t entries = handle->page_position; time_t overwrite_zero_update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); @@ -257,9 +264,11 @@ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metri handle = callocz(1, sizeof(struct rrdeng_collect_handle)); handle->common.backend = STORAGE_ENGINE_BACKEND_DBENGINE; handle->metric = metric; - handle->page = NULL; - handle->data = NULL; - handle->data_size = 0; + + handle->pgc_page = NULL; + handle->page_data = NULL; + handle->page_data_size = 0; + handle->page_position = 0; handle->page_entries_max = 0; handle->update_every_ut = (usec_t)update_every * USEC_PER_SEC; @@ -286,65 +295,29 @@ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metri return (STORAGE_COLLECT_HANDLE *)handle; } -/* The page must be populated and referenced */ -static bool page_has_only_empty_metrics(struct rrdeng_collect_handle *handle) { - switch(handle->type) { - case PAGE_METRICS: { - size_t slots = handle->page_position; - storage_number *array = (storage_number *)pgc_page_data(handle->page); - for (size_t i = 0 ; i < slots; ++i) { - if(does_storage_number_exist(array[i])) - return false; - } - } - break; - - case PAGE_TIER: { - size_t slots = handle->page_position; - storage_number_tier1_t *array = (storage_number_tier1_t *)pgc_page_data(handle->page); - for (size_t i = 0 ; i < slots; ++i) { - if(fpclassify(array[i].sum_value) != FP_NAN) - return false; - } - } - break; - - default: { - static bool logged = false; - if(!logged) { - netdata_log_error("DBENGINE: cannot check page for nulls on unknown page type id %d", (mrg_metric_ctx(handle->metric))->config.page_type); - logged = true; - } - return false; - } - } - - return true; -} - void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle) { struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; - if (unlikely(!handle->page)) + if (unlikely(!handle->pgc_page)) return; - if(!handle->page_position || page_has_only_empty_metrics(handle)) - pgc_page_to_clean_evict_or_release(main_cache, handle->page); + if(pgd_is_empty(handle->page_data)) + pgc_page_to_clean_evict_or_release(main_cache, handle->pgc_page); else { check_completed_page_consistency(handle); - mrg_metric_set_clean_latest_time_s(main_mrg, handle->metric, pgc_page_end_time_s(handle->page)); - pgc_page_hot_to_dirty_and_release(main_cache, handle->page); + mrg_metric_set_clean_latest_time_s(main_mrg, handle->metric, pgc_page_end_time_s(handle->pgc_page)); + pgc_page_hot_to_dirty_and_release(main_cache, handle->pgc_page); } mrg_metric_set_hot_latest_time_s(main_mrg, handle->metric, 0); - handle->page = NULL; + handle->pgc_page = NULL; handle->page_flags = 0; handle->page_position = 0; handle->page_entries_max = 0; - handle->data = NULL; - handle->data_size = 0; + handle->page_data = NULL; + handle->page_data_size = 0; // important! // we should never zero page end time ut, because this will allow @@ -358,10 +331,10 @@ void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_h } static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *handle, - struct rrdengine_instance *ctx, - usec_t point_in_time_ut, - void *data, - size_t data_size) { + struct rrdengine_instance *ctx, + usec_t point_in_time_ut, + PGD *data, + size_t data_size) { time_t point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); const time_t update_every_s = (time_t)(handle->update_every_ut / USEC_PER_SEC); @@ -378,7 +351,7 @@ static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *ha size_t conflicts = 0; bool added = true; - PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + PGC_PAGE *pgc_page = pgc_page_add_and_acquire(main_cache, page_entry, &added); while (unlikely(!added)) { conflicts++; @@ -388,33 +361,33 @@ static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *ha #ifdef NETDATA_INTERNAL_CHECKS internal_error(true, #else - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, #endif - "DBENGINE: metric '%s' new page from %ld to %ld, update every %ld, has a conflict in main cache " - "with existing %s%s page from %ld to %ld, update every %ld - " - "is it collected more than once?", - uuid, - page_entry.start_time_s, page_entry.end_time_s, (time_t)page_entry.update_every_s, - pgc_is_page_hot(page) ? "hot" : "not-hot", - pgc_page_data(page) == DBENGINE_EMPTY_PAGE ? " gap" : "", - pgc_page_start_time_s(page), pgc_page_end_time_s(page), pgc_page_update_every_s(page) + "DBENGINE: metric '%s' new page from %ld to %ld, update every %ld, has a conflict in main cache " + "with existing %s%s page from %ld to %ld, update every %ld - " + "is it collected more than once?", + uuid, + page_entry.start_time_s, page_entry.end_time_s, (time_t)page_entry.update_every_s, + pgc_is_page_hot(pgc_page) ? "hot" : "not-hot", + pgc_page_data(pgc_page) == PGD_EMPTY ? " gap" : "", + pgc_page_start_time_s(pgc_page), pgc_page_end_time_s(pgc_page), pgc_page_update_every_s(pgc_page) ); - pgc_page_release(main_cache, page); + pgc_page_release(main_cache, pgc_page); point_in_time_ut -= handle->update_every_ut; point_in_time_s = (time_t)(point_in_time_ut / USEC_PER_SEC); page_entry.start_time_s = point_in_time_s; page_entry.end_time_s = point_in_time_s; - page = pgc_page_add_and_acquire(main_cache, page_entry, &added); + pgc_page = pgc_page_add_and_acquire(main_cache, page_entry, &added); } handle->page_entries_max = data_size / CTX_POINT_SIZE_BYTES(ctx); handle->page_start_time_ut = point_in_time_ut; handle->page_end_time_ut = point_in_time_ut; handle->page_position = 1; // zero is already in our data - handle->page = page; + handle->pgc_page = pgc_page; handle->page_flags = conflicts? RRDENG_PAGE_CONFLICT : 0; if(point_in_time_s > max_acceptable_collected_time()) @@ -441,9 +414,11 @@ static size_t aligned_allocation_entries(size_t max_slots, size_t target_slot, t return slots; } -static void *rrdeng_alloc_new_metric_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { +static PGD *rrdeng_alloc_new_page_data(struct rrdeng_collect_handle *handle, size_t *data_size, usec_t point_in_time_ut) { struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); + PGD *d = NULL; + size_t max_size = tier_page_size[ctx->config.tier]; size_t max_slots = max_size / CTX_POINT_SIZE_BYTES(ctx); @@ -467,10 +442,22 @@ static void *rrdeng_alloc_new_metric_data(struct rrdeng_collect_handle *handle, internal_fatal(size > tier_page_size[ctx->config.tier] || size < CTX_POINT_SIZE_BYTES(ctx) * 2, "ooops! wrong page size"); *data_size = size; - void *d = dbengine_page_alloc(size); - timing_step(TIMING_STEP_DBENGINE_PAGE_ALLOC); + switch (ctx->config.page_type) { + case PAGE_METRICS: + case PAGE_TIER: + d = pgd_create(ctx->config.page_type, slots); + break; + case PAGE_GORILLA_METRICS: + // ignore slots, and use the fixed number of slots per gorilla buffer. + // gorilla will automatically add more buffers if needed. + d = pgd_create(ctx->config.page_type, GORILLA_BUFFER_SLOTS); + break; + default: + fatal("Unknown page type: %uc\n", ctx->config.page_type); + } + timing_step(TIMING_STEP_DBENGINE_PAGE_ALLOC); return d; } @@ -486,37 +473,25 @@ static void rrdeng_store_metric_append_point(STORAGE_COLLECT_HANDLE *collection_ struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)collection_handle; struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); - if(unlikely(!handle->data)) - handle->data = rrdeng_alloc_new_metric_data(handle, &handle->data_size, point_in_time_ut); + if(unlikely(!handle->page_data)) + handle->page_data = rrdeng_alloc_new_page_data(handle, &handle->page_data_size, point_in_time_ut); timing_step(TIMING_STEP_DBENGINE_CHECK_DATA); - if(likely(ctx->config.page_type == PAGE_METRICS)) { - storage_number *tier0_metric_data = handle->data; - tier0_metric_data[handle->page_position] = pack_storage_number(n, flags); - } - else if(likely(ctx->config.page_type == PAGE_TIER)) { - storage_number_tier1_t *tier12_metric_data = handle->data; - storage_number_tier1_t number_tier1; - number_tier1.sum_value = (float) n; - number_tier1.min_value = (float) min_value; - number_tier1.max_value = (float) max_value; - number_tier1.anomaly_count = anomaly_count; - number_tier1.count = count; - tier12_metric_data[handle->page_position] = number_tier1; - } - else - fatal("DBENGINE: cannot store metric on unknown page type id %d", ctx->config.page_type); + pgd_append_point(handle->page_data, + point_in_time_ut, + n, min_value, max_value, count, anomaly_count, flags, + handle->page_position); timing_step(TIMING_STEP_DBENGINE_PACK); - if(unlikely(!handle->page)){ - rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, handle->data, handle->data_size); + if(unlikely(!handle->pgc_page)) { + rrdeng_store_metric_create_new_page(handle, ctx, point_in_time_ut, handle->page_data, handle->page_data_size); // handle->position is set to 1 already } else { // update an existing page - pgc_page_hot_set_end_time_s(main_cache, handle->page, (time_t) (point_in_time_ut / USEC_PER_SEC)); + pgc_page_hot_set_end_time_s(main_cache, handle->pgc_page, (time_t) (point_in_time_ut / USEC_PER_SEC)); handle->page_end_time_ut = point_in_time_ut; if(unlikely(++handle->page_position >= handle->page_entries_max)) { @@ -541,13 +516,13 @@ static void store_metric_next_error_log(struct rrdeng_collect_handle *handle __m uuid_unparse(*mrg_metric_uuid(main_mrg, handle->metric), uuid); BUFFER *wb = NULL; - if(handle->page && handle->page_flags) { + if(handle->pgc_page && handle->page_flags) { wb = buffer_create(0, NULL); collect_page_flags_to_buffer(wb, handle->page_flags); } - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, + nd_log_limit_static_global_var(erl, 1, 0); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE, "DBENGINE: metric '%s' collected point at %ld, %s last collection at %ld, " "update every %ld, %s page from %ld to %ld, position %u (of %u), flags: %s", uuid, @@ -555,12 +530,12 @@ static void store_metric_next_error_log(struct rrdeng_collect_handle *handle __m msg, (time_t)(handle->page_end_time_ut / USEC_PER_SEC), (time_t)(handle->update_every_ut / USEC_PER_SEC), - handle->page ? "current" : "*LAST*", + handle->pgc_page ? "current" : "*LAST*", (time_t)(handle->page_start_time_ut / USEC_PER_SEC), (time_t)(handle->page_end_time_ut / USEC_PER_SEC), handle->page_position, handle->page_entries_max, wb ? buffer_tostring(wb) : "" - ); + ); buffer_free(wb); #else @@ -593,7 +568,7 @@ void rrdeng_store_metric_next(STORAGE_COLLECT_HANDLE *collection_handle, ; } else if(unlikely(point_in_time_ut > handle->page_end_time_ut)) { - if(handle->page) { + if(handle->pgc_page) { if (unlikely(delta_ut < handle->update_every_ut)) { handle->page_flags |= RRDENG_PAGE_STEP_TOO_SMALL; rrdeng_store_metric_flush_current_page(collection_handle); @@ -801,12 +776,13 @@ void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_handle, bool debug_this __maybe_unused) { struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - struct rrdengine_instance *ctx = handle->ctx; + struct rrdengine_instance *ctx = mrg_metric_ctx(handle->metric); if (likely(handle->page)) { // we have a page to release pgc_page_release(main_cache, handle->page); handle->page = NULL; + pgdc_reset(&handle->pgdc, NULL, UINT32_MAX); } if (unlikely(handle->now_s > rrddim_handle->end_time_s)) @@ -815,10 +791,10 @@ static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_han size_t entries = 0; handle->page = pg_cache_lookup_next(ctx, handle->pdc, handle->now_s, handle->dt_s, &entries); - internal_fatal(handle->page && (pgc_page_data(handle->page) == DBENGINE_EMPTY_PAGE || !entries), + internal_fatal(handle->page && (pgc_page_data(handle->page) == PGD_EMPTY || !entries), "A page was returned, but it is empty - pg_cache_lookup_next() should be handling this case"); - if (unlikely(!handle->page || pgc_page_data(handle->page) == DBENGINE_EMPTY_PAGE || !entries)) + if (unlikely(!handle->page || pgc_page_data(handle->page) == PGD_EMPTY || !entries)) return false; time_t page_start_time_s = pgc_page_start_time_s(handle->page); @@ -859,8 +835,10 @@ static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_han handle->entries = entries; handle->position = position; - handle->metric_data = pgc_page_data((PGC_PAGE *)handle->page); handle->dt_s = page_update_every_s; + + pgdc_reset(&handle->pgdc, pgc_page_data(handle->page), handle->position); + return true; } @@ -889,38 +867,7 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim sp.start_time_s = handle->now_s - handle->dt_s; sp.end_time_s = handle->now_s; - switch(handle->ctx->config.page_type) { - case PAGE_METRICS: { - storage_number n = handle->metric_data[handle->position]; - sp.min = sp.max = sp.sum = unpack_storage_number(n); - sp.flags = n & SN_USER_FLAGS; - sp.count = 1; - sp.anomaly_count = is_storage_number_anomalous(n) ? 1 : 0; - } - break; - - case PAGE_TIER: { - storage_number_tier1_t tier1_value = ((storage_number_tier1_t *)handle->metric_data)[handle->position]; - sp.flags = tier1_value.anomaly_count ? SN_FLAG_NONE : SN_FLAG_NOT_ANOMALOUS; - sp.count = tier1_value.count; - sp.anomaly_count = tier1_value.anomaly_count; - sp.min = tier1_value.min_value; - sp.max = tier1_value.max_value; - sp.sum = tier1_value.sum_value; - } - break; - - // we don't know this page type - default: { - static bool logged = false; - if(!logged) { - netdata_log_error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", handle->ctx->config.page_type); - logged = true; - } - storage_point_empty(sp, sp.start_time_s, sp.end_time_s); - } - break; - } + pgdc_get_next_point(&handle->pgdc, handle->position, &sp); prepare_for_next_iteration: internal_fatal(sp.end_time_s < rrddim_handle->start_time_s, "DBENGINE: this point is too old for this query"); @@ -944,8 +891,10 @@ void rrdeng_load_metric_finalize(struct storage_engine_query_handle *rrddim_hand { struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrddim_handle->handle; - if (handle->page) + if (handle->page) { pgc_page_release(main_cache, handle->page); + pgdc_reset(&handle->pgdc, NULL, UINT32_MAX); + } if(!pdc_release_and_destroy_if_unreferenced(handle->pdc, false, false)) __atomic_store_n(&handle->pdc->workers_should_stop, true, __ATOMIC_RELAXED); @@ -1240,12 +1189,14 @@ int rrdeng_exit(struct rrdengine_instance *ctx) { // 4. then wait for completion bool logged = false; - while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && !unittest_running) { + size_t count = 10; + while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && count && !unittest_running) { if(!logged) { netdata_log_info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier); logged = true; } sleep_usec(100 * USEC_PER_MS); + count--; } netdata_log_info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 61449426f..7ae0e7079 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -20,6 +20,7 @@ extern int default_multidb_disk_quota_mb; extern struct rrdengine_instance *multidb_ctx[RRD_STORAGE_TIERS]; extern size_t page_type_size[]; extern size_t tier_page_size[]; +extern uint8_t tier_page_type[]; #define CTX_POINT_SIZE_BYTES(ctx) page_type_size[(ctx)->config.page_type] diff --git a/database/engine/rrdenginelib.h b/database/engine/rrdenginelib.h index 831e48531..a0febd4f4 100644 --- a/database/engine/rrdenginelib.h +++ b/database/engine/rrdenginelib.h @@ -8,16 +8,9 @@ /* Forward declarations */ struct rrdengine_instance; -#define STR_HELPER(x) #x -#define STR(x) STR_HELPER(x) - -#define BITS_PER_ULONG (sizeof(unsigned long) * 8) - #define ALIGN_BYTES_FLOOR(x) (((x) / RRDENG_BLOCK_SIZE) * RRDENG_BLOCK_SIZE) #define ALIGN_BYTES_CEILING(x) ((((x) + RRDENG_BLOCK_SIZE - 1) / RRDENG_BLOCK_SIZE) * RRDENG_BLOCK_SIZE) -#define ROUND_USEC_TO_SEC(x) (((x) + USEC_PER_SEC / 2 - 1) / USEC_PER_SEC) - typedef uintptr_t rrdeng_stats_t; #ifdef __ATOMIC_RELAXED @@ -58,7 +51,7 @@ static inline void modify_bit(unsigned *x, unsigned pos, uint8_t val) } } -#define RRDENG_PATH_MAX (4096) +#define RRDENG_PATH_MAX (FILENAME_MAX + 1) /* returns old *ptr value */ static inline unsigned long ulong_compare_and_swap(volatile unsigned long *ptr, @@ -74,12 +67,15 @@ static inline unsigned long ulong_compare_and_swap(volatile unsigned long *ptr, static inline int crc32cmp(void *crcp, uLong crc) { - return (*(uint32_t *)crcp != crc); + uint32_t loaded_crc; + memcpy(&loaded_crc, crcp, sizeof(loaded_crc)); + return (loaded_crc != crc); } static inline void crc32set(void *crcp, uLong crc) { - *(uint32_t *)crcp = crc; + uint32_t store_crc = (uint32_t) crc; + memcpy(crcp, &store_crc, sizeof(store_crc)); } int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size); diff --git a/database/rrd.h b/database/rrd.h index 9c5ad6f2f..5f4bee037 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -239,7 +239,6 @@ typedef enum __attribute__ ((__packed__)) rrddim_options { RRDDIM_OPTION_DONT_DETECT_RESETS_OR_OVERFLOWS = (1 << 1), // do not offer RESET or OVERFLOW info to callers RRDDIM_OPTION_BACKFILLED_HIGH_TIERS = (1 << 2), // when set, we have backfilled higher tiers RRDDIM_OPTION_UPDATED = (1 << 3), // single-threaded collector updated flag - RRDDIM_OPTION_EXPOSED = (1 << 4), // single-threaded collector exposed flag // this is 8-bit } RRDDIM_OPTIONS; @@ -253,20 +252,22 @@ typedef enum __attribute__ ((__packed__)) rrddim_flags { RRDDIM_FLAG_NONE = 0, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 0), - RRDDIM_FLAG_OBSOLETE = (1 << 2), // this is marked by the collector/module as obsolete + RRDDIM_FLAG_OBSOLETE = (1 << 1), // this is marked by the collector/module as obsolete // No new values have been collected for this dimension since agent start, or it was marked RRDDIM_FLAG_OBSOLETE at // least rrdset_free_obsolete_time seconds ago. - RRDDIM_FLAG_ARCHIVED = (1 << 3), - RRDDIM_FLAG_METADATA_UPDATE = (1 << 4), // Metadata needs to go to the database + RRDDIM_FLAG_ARCHIVED = (1 << 2), + RRDDIM_FLAG_METADATA_UPDATE = (1 << 3), // Metadata needs to go to the database - RRDDIM_FLAG_META_HIDDEN = (1 << 6), // Status of hidden option in the metadata database + RRDDIM_FLAG_META_HIDDEN = (1 << 4), // Status of hidden option in the metadata database + RRDDIM_FLAG_ML_MODEL_LOAD = (1 << 5), // Do ML LOAD for this dimension // this is 8 bit } RRDDIM_FLAGS; -#define rrddim_flag_check(rd, flag) (__atomic_load_n(&((rd)->flags), __ATOMIC_SEQ_CST) & (flag)) -#define rrddim_flag_set(rd, flag) __atomic_or_fetch(&((rd)->flags), (flag), __ATOMIC_SEQ_CST) -#define rrddim_flag_clear(rd, flag) __atomic_and_fetch(&((rd)->flags), ~(flag), __ATOMIC_SEQ_CST) +#define rrddim_flag_get(rd) __atomic_load_n(&((rd)->flags), __ATOMIC_ACQUIRE) +#define rrddim_flag_check(rd, flag) (__atomic_load_n(&((rd)->flags), __ATOMIC_ACQUIRE) & (flag)) +#define rrddim_flag_set(rd, flag) __atomic_or_fetch(&((rd)->flags), (flag), __ATOMIC_RELEASE) +#define rrddim_flag_clear(rd, flag) __atomic_and_fetch(&((rd)->flags), ~(flag), __ATOMIC_RELEASE) // ---------------------------------------------------------------------------- // engine-specific iterator state for dimension data collection @@ -312,7 +313,11 @@ struct rrddim { struct rrdset *rrdset; rrd_ml_dimension_t *ml_dimension; // machine learning data about this dimension - RRDMETRIC_ACQUIRED *rrdmetric; // the rrdmetric of this dimension + + struct { + RRDMETRIC_ACQUIRED *rrdmetric; // the rrdmetric of this dimension + bool collected; + } rrdcontexts; #ifdef NETDATA_LOG_COLLECTION_ERRORS usec_t rrddim_store_metric_last_ut; // the timestamp we last called rrddim_store_metric() @@ -332,6 +337,16 @@ struct rrddim { } db; // ------------------------------------------------------------------------ + // streaming + + struct { + struct { + uint32_t sent_version; + uint32_t dim_slot; + } sender; + } rrdpush; + + // ------------------------------------------------------------------------ // data collection members struct { @@ -367,10 +382,6 @@ size_t rrddim_size(void); #define rrddim_set_updated(rd) (rd)->collector.options |= RRDDIM_OPTION_UPDATED #define rrddim_clear_updated(rd) (rd)->collector.options &= ~RRDDIM_OPTION_UPDATED -#define rrddim_check_exposed(rd) ((rd)->collector.options & RRDDIM_OPTION_EXPOSED) -#define rrddim_set_exposed(rd) (rd)->collector.options |= RRDDIM_OPTION_EXPOSED -#define rrddim_clear_exposed(rd) (rd)->collector.options &= ~RRDDIM_OPTION_EXPOSED - // returns the RRDDIM cache filename, or NULL if it does not exist const char *rrddim_cache_filename(RRDDIM *rd); @@ -685,41 +696,47 @@ typedef enum __attribute__ ((__packed__)) rrdset_flags { RRDSET_FLAG_UPSTREAM_SEND = (1 << 6), // if set, this chart should be sent upstream (streaming) RRDSET_FLAG_UPSTREAM_IGNORE = (1 << 7), // if set, this chart should not be sent upstream (streaming) - RRDSET_FLAG_UPSTREAM_EXPOSED = (1 << 8), // if set, we have sent this chart definition to netdata parent (streaming) - RRDSET_FLAG_STORE_FIRST = (1 << 9), // if set, do not eliminate the first collection during interpolation - RRDSET_FLAG_HETEROGENEOUS = (1 << 10), // if set, the chart is not homogeneous (dimensions in it have multiple algorithms, multipliers or dividers) - RRDSET_FLAG_HOMOGENEOUS_CHECK = (1 << 11), // if set, the chart should be checked to determine if the dimensions are homogeneous - RRDSET_FLAG_HIDDEN = (1 << 12), // if set, do not show this chart on the dashboard, but use it for exporting - RRDSET_FLAG_SYNC_CLOCK = (1 << 13), // if set, microseconds on next data collection will be ignored (the chart will be synced to now) - RRDSET_FLAG_OBSOLETE_DIMENSIONS = (1 << 14), // this is marked by the collector/module when a chart has obsolete dimensions + RRDSET_FLAG_STORE_FIRST = (1 << 8), // if set, do not eliminate the first collection during interpolation + RRDSET_FLAG_HETEROGENEOUS = (1 << 9), // if set, the chart is not homogeneous (dimensions in it have multiple algorithms, multipliers or dividers) + RRDSET_FLAG_HOMOGENEOUS_CHECK = (1 << 10), // if set, the chart should be checked to determine if the dimensions are homogeneous + RRDSET_FLAG_HIDDEN = (1 << 11), // if set, do not show this chart on the dashboard, but use it for exporting + RRDSET_FLAG_SYNC_CLOCK = (1 << 12), // if set, microseconds on next data collection will be ignored (the chart will be synced to now) + RRDSET_FLAG_OBSOLETE_DIMENSIONS = (1 << 13), // this is marked by the collector/module when a chart has obsolete dimensions - RRDSET_FLAG_METADATA_UPDATE = (1 << 16), // Mark that metadata needs to be stored - RRDSET_FLAG_ANOMALY_DETECTION = (1 << 18), // flag to identify anomaly detection charts. - RRDSET_FLAG_INDEXED_ID = (1 << 19), // the rrdset is indexed by its id - RRDSET_FLAG_INDEXED_NAME = (1 << 20), // the rrdset is indexed by its name + RRDSET_FLAG_METADATA_UPDATE = (1 << 14), // Mark that metadata needs to be stored + RRDSET_FLAG_ANOMALY_DETECTION = (1 << 15), // flag to identify anomaly detection charts. + RRDSET_FLAG_INDEXED_ID = (1 << 16), // the rrdset is indexed by its id + RRDSET_FLAG_INDEXED_NAME = (1 << 17), // the rrdset is indexed by its name - RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 21), + RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION = (1 << 18), - RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS = (1 << 22), // the sending side has replication in progress - RRDSET_FLAG_SENDER_REPLICATION_FINISHED = (1 << 23), // the sending side has completed replication - RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS = (1 << 24), // the receiving side has replication in progress - RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED = (1 << 25), // the receiving side has completed replication + RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS = (1 << 19), // the sending side has replication in progress + RRDSET_FLAG_SENDER_REPLICATION_FINISHED = (1 << 20), // the sending side has completed replication + RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS = (1 << 21), // the receiving side has replication in progress + RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED = (1 << 22), // the receiving side has completed replication - RRDSET_FLAG_UPSTREAM_SEND_VARIABLES = (1 << 26), // a custom variable has been updated and needs to be exposed to parent + RRDSET_FLAG_UPSTREAM_SEND_VARIABLES = (1 << 23), // a custom variable has been updated and needs to be exposed to parent - RRDSET_FLAG_COLLECTION_FINISHED = (1 << 27), // when set, data collection is not available for this chart + RRDSET_FLAG_COLLECTION_FINISHED = (1 << 24), // when set, data collection is not available for this chart - RRDSET_FLAG_HAS_RRDCALC_LINKED = (1 << 28), // this chart has at least one rrdcal linked + RRDSET_FLAG_HAS_RRDCALC_LINKED = (1 << 25), // this chart has at least one rrdcal linked } RRDSET_FLAGS; -#define rrdset_flag_check(st, flag) (__atomic_load_n(&((st)->flags), __ATOMIC_SEQ_CST) & (flag)) -#define rrdset_flag_set(st, flag) __atomic_or_fetch(&((st)->flags), flag, __ATOMIC_SEQ_CST) -#define rrdset_flag_clear(st, flag) __atomic_and_fetch(&((st)->flags), ~(flag), __ATOMIC_SEQ_CST) +#define rrdset_flag_get(st) __atomic_load_n(&((st)->flags), __ATOMIC_ACQUIRE) +#define rrdset_flag_check(st, flag) (__atomic_load_n(&((st)->flags), __ATOMIC_ACQUIRE) & (flag)) +#define rrdset_flag_set(st, flag) __atomic_or_fetch(&((st)->flags), flag, __ATOMIC_RELEASE) +#define rrdset_flag_clear(st, flag) __atomic_and_fetch(&((st)->flags), ~(flag), __ATOMIC_RELEASE) #define rrdset_is_replicating(st) (rrdset_flag_check(st, RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS|RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS) \ && !rrdset_flag_check(st, RRDSET_FLAG_SENDER_REPLICATION_FINISHED|RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED)) +struct pluginsd_rrddim { + RRDDIM_ACQUIRED *rda; + RRDDIM *rd; + const char *id; +}; + struct rrdset { uuid_t chart_uuid; // the global UUID for this chart @@ -749,6 +766,8 @@ struct rrdset { DICTIONARY *rrddimvar_root_index; // dimension variables // we use this dictionary to manage their allocation + uint32_t version; // the metadata version (auto-increment) + RRDSET_TYPE chart_type; // line, area, stacked // ------------------------------------------------------------------------ @@ -768,8 +787,11 @@ struct rrdset { RRDHOST *rrdhost; // pointer to RRDHOST this chart belongs to - RRDINSTANCE_ACQUIRED *rrdinstance; // the rrdinstance of this chart - RRDCONTEXT_ACQUIRED *rrdcontext; // the rrdcontext this chart belongs to + struct { + RRDINSTANCE_ACQUIRED *rrdinstance; // the rrdinstance of this chart + RRDCONTEXT_ACQUIRED *rrdcontext; // the rrdcontext this chart belongs to + bool collected; + } rrdcontexts; // ------------------------------------------------------------------------ // data collection members @@ -792,7 +814,15 @@ struct rrdset { // ------------------------------------------------------------------------ // data collection - streaming to parents, temp variables - time_t upstream_resync_time_s; // the timestamp up to which we should resync clock upstream + struct { + struct { + uint32_t sent_version; + uint32_t chart_slot; + uint32_t dim_last_slot_used; + + time_t resync_time_s; // the timestamp up to which we should resync clock upstream + } sender; + } rrdpush; // ------------------------------------------------------------------------ // db mode SAVE, MAP specifics @@ -835,10 +865,12 @@ struct rrdset { struct { SPINLOCK spinlock; // used only for cleanup pid_t collector_tid; + bool dims_with_slots; bool set; uint32_t pos; + int32_t last_slot; uint32_t size; - RRDDIM_ACQUIRED **rda; + struct pluginsd_rrddim *prd_array; } pluginsd; #ifdef NETDATA_LOG_REPLICATION_REQUESTS @@ -861,6 +893,54 @@ struct rrdset { #define rrdset_name(st) string2str((st)->name) #define rrdset_id(st) string2str((st)->id) +static inline uint32_t rrdset_metadata_version(RRDSET *st) { + return __atomic_load_n(&st->version, __ATOMIC_RELAXED); +} + +static inline uint32_t rrdset_metadata_upstream_version(RRDSET *st) { + return __atomic_load_n(&st->rrdpush.sender.sent_version, __ATOMIC_RELAXED); +} + +void rrdset_metadata_updated(RRDSET *st); + +static inline void rrdset_metadata_exposed_upstream(RRDSET *st, uint32_t version) { + __atomic_store_n(&st->rrdpush.sender.sent_version, version, __ATOMIC_RELAXED); +} + +static inline bool rrdset_check_upstream_exposed(RRDSET *st) { + return rrdset_metadata_version(st) == rrdset_metadata_upstream_version(st); +} + +static inline uint32_t rrddim_metadata_version(RRDDIM *rd) { + // the metadata version of the dimension, is the version of the chart + return rrdset_metadata_version(rd->rrdset); +} + +static inline uint32_t rrddim_metadata_upstream_version(RRDDIM *rd) { + return __atomic_load_n(&rd->rrdpush.sender.sent_version, __ATOMIC_RELAXED); +} + +void rrddim_metadata_updated(RRDDIM *rd); + +static inline void rrddim_metadata_exposed_upstream(RRDDIM *rd, uint32_t version) { + __atomic_store_n(&rd->rrdpush.sender.sent_version, version, __ATOMIC_RELAXED); +} + +static inline void rrddim_metadata_exposed_upstream_clear(RRDDIM *rd) { + __atomic_store_n(&rd->rrdpush.sender.sent_version, 0, __ATOMIC_RELAXED); +} + +static inline bool rrddim_check_upstream_exposed(RRDDIM *rd) { + return rrddim_metadata_upstream_version(rd) != 0; +} + +// the collector sets the exposed flag, but anyone can remove it +// still, it can be removed, after the collector has finished +// so, it is safe to check it without atomics +static inline bool rrddim_check_upstream_exposed_collector(RRDDIM *rd) { + return rd->rrdset->version == rd->rrdpush.sender.sent_version; +} + STRING *rrd_string_strdupz(const char *s); // ---------------------------------------------------------------------------- @@ -943,7 +1023,7 @@ typedef enum __attribute__ ((__packed__)) rrdhost_flags { #ifdef NETDATA_INTERNAL_CHECKS #define rrdset_debug(st, fmt, args...) do { if(unlikely(debug_flags & D_RRD_STATS && rrdset_flag_check(st, RRDSET_FLAG_DEBUG))) \ - debug_int(__FILE__, __FUNCTION__, __LINE__, "%s: " fmt, rrdset_name(st), ##args); } while(0) + netdata_logger(NDLS_DEBUG, NDLP_DEBUG, __FILE__, __FUNCTION__, __LINE__, "%s: " fmt, rrdset_name(st), ##args); } while(0) #else #define rrdset_debug(st, fmt, args...) debug_dummy() #endif @@ -963,6 +1043,7 @@ typedef enum __attribute__ ((__packed__)) { RRDHOST_OPTION_REPLICATION = (1 << 5), // when set, we support replication for this host RRDHOST_OPTION_VIRTUAL_HOST = (1 << 6), // when set, this host is a virtual one + RRDHOST_OPTION_EPHEMERAL_HOST = (1 << 7), // when set, this host is an ephemeral one } RRDHOST_OPTIONS; #define rrdhost_option_check(host, flag) ((host)->options & (flag)) @@ -1060,7 +1141,6 @@ typedef struct health { time_t health_delay_up_to; // a timestamp to delay alarms processing up to STRING *health_default_exec; // the full path of the alarms notifications program STRING *health_default_recipient; // the default recipient for all alarms - int health_log_entries_written; // the number of alarm events written to the alarms event log uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications unsigned int health_enabled; // 1 when this host has health enabled @@ -1151,6 +1231,31 @@ struct rrdhost { // ------------------------------------------------------------------------ // streaming of data to remote hosts - rrdpush sender + struct { + struct { + struct { + struct { + SPINLOCK spinlock; + + bool ignore; // when set, freeing slots will not put them in the available + uint32_t used; + uint32_t size; + uint32_t *array; + } available; // keep track of the available chart slots per host + + uint32_t last_used; // the last slot we used for a chart (increments only) + } pluginsd_chart_slots; + } send; + + struct { + struct { + SPINLOCK spinlock; // lock for the management of the allocation + uint32_t size; + RRDSET **array; + } pluginsd_chart_slots; + } receive; + } rrdpush; + char *rrdpush_send_destination; // where to send metrics to char *rrdpush_send_api_key; // the api key at the receiving netdata struct rrdpush_destinations *destinations; // a linked list of possible destinations @@ -1168,7 +1273,7 @@ struct rrdhost { struct sender_state *sender; netdata_thread_t rrdpush_sender_thread; // the sender thread size_t rrdpush_sender_replicating_charts; // the number of charts currently being replicated to a parent - void *aclk_sync_host_config; + struct aclk_sync_cfg_t *aclk_config; uint32_t rrdpush_receiver_connection_counter; // the number of times this receiver has connected uint32_t rrdpush_sender_connection_counter; // the number of times this sender has connected @@ -1321,6 +1426,7 @@ void rrddim_index_destroy(RRDSET *st); // ---------------------------------------------------------------------------- extern time_t rrdhost_free_orphan_time_s; +extern time_t rrdhost_free_ephemeral_time_s; int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unittest); @@ -1329,30 +1435,29 @@ RRDHOST *rrdhost_find_by_guid(const char *guid); RRDHOST *find_host_by_node_id(char *node_id); RRDHOST *rrdhost_find_or_create( - const char *hostname - , const char *registry_hostname - , const char *guid - , const char *os - , const char *timezone - , const char *abbrev_timezone - , int32_t utc_offset - , const char *tags - , const char *program_name - , const char *program_version - , int update_every - , long history - , RRD_MEMORY_MODE mode - , unsigned int health_enabled - , unsigned int rrdpush_enabled - , char *rrdpush_destination - , char *rrdpush_api_key - , char *rrdpush_send_charts_matching - , bool rrdpush_enable_replication - , time_t rrdpush_seconds_to_replicate - , time_t rrdpush_replication_step - , struct rrdhost_system_info *system_info - , bool is_archived -); + const char *hostname, + const char *registry_hostname, + const char *guid, + const char *os, + const char *timezone, + const char *abbrev_timezone, + int32_t utc_offset, + const char *tags, + const char *program_name, + const char *program_version, + int update_every, + long history, + RRD_MEMORY_MODE mode, + unsigned int health_enabled, + unsigned int rrdpush_enabled, + char *rrdpush_destination, + char *rrdpush_api_key, + char *rrdpush_send_charts_matching, + bool rrdpush_enable_replication, + time_t rrdpush_seconds_to_replicate, + time_t rrdpush_replication_step, + struct rrdhost_system_info *system_info, + bool is_archived); int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, char *name, char *value); @@ -1438,8 +1543,8 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t microseconds); void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next); void rrdset_done(RRDSET *st); -void rrdset_is_obsolete(RRDSET *st); -void rrdset_isnot_obsolete(RRDSET *st); +void rrdset_is_obsolete___safe_from_collector_thread(RRDSET *st); +void rrdset_isnot_obsolete___safe_from_collector_thread(RRDSET *st); // checks if the RRDSET should be offered to viewers #define rrdset_is_available_for_viewers(st) (!rrdset_flag_check(st, RRDSET_FLAG_HIDDEN) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && rrdset_number_of_dimensions(st) && (st)->rrd_memory_mode != RRD_MEMORY_MODE_NONE) @@ -1488,8 +1593,8 @@ RRDDIM *rrddim_find_active(RRDSET *st, const char *id); int rrddim_hide(RRDSET *st, const char *id); int rrddim_unhide(RRDSET *st, const char *id); -void rrddim_is_obsolete(RRDSET *st, RRDDIM *rd); -void rrddim_isnot_obsolete(RRDSET *st, RRDDIM *rd); +void rrddim_is_obsolete___safe_from_collector_thread(RRDSET *st, RRDDIM *rd); +void rrddim_isnot_obsolete___safe_from_collector_thread(RRDSET *st, RRDDIM *rd); collected_number rrddim_timed_set_by_pointer(RRDSET *st, RRDDIM *rd, struct timeval collected_time, collected_number value); collected_number rrddim_set_by_pointer(RRDSET *st, RRDDIM *rd, collected_number value); @@ -1557,7 +1662,28 @@ static inline void rrdhost_retention(RRDHOST *host, time_t now, bool online, tim *to = online ? now : last_time_s; } +void rrdhost_pluginsd_send_chart_slots_free(RRDHOST *host); +void rrdhost_pluginsd_receive_chart_slots_free(RRDHOST *host); +void rrdset_pluginsd_receive_unslot_and_cleanup(RRDSET *st); +void rrdset_pluginsd_receive_unslot(RRDSET *st); + // ---------------------------------------------------------------------------- +static inline double rrddim_get_last_stored_value(RRDDIM *rd_dim, double *max_value, double div) { + if (!rd_dim) + return NAN; + + if (isnan(div) || div == 0.0) + div = 1.0; + + double value = rd_dim->collector.last_stored_value / div; + value = ABS(value); + + *max_value = MAX(*max_value, value); + + return value; +} + +// // RRD DB engine declarations #ifdef ENABLE_DBENGINE diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 620883ec2..199d90803 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -230,7 +230,7 @@ static void rrdcalc_link_to_rrdset(RRDSET *st, RRDCALC *rc) { rw_spinlock_write_unlock(&st->alerts.spinlock); if(rc->update_every < rc->rrdset->update_every) { - netdata_log_error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rrdset_id(rc->rrdset), rrdcalc_name(rc), rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every); + netdata_log_info("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rrdset_id(rc->rrdset), rrdcalc_name(rc), rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every); rc->update_every = rc->rrdset->update_every; } @@ -809,10 +809,10 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host continue; if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=', NULL)) { - netdata_log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", - rrdcalc_name(rc), - rrdhost_hostname(host), - rrdcalc_host_labels(rc)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Health configuration for alarm '%s' cannot be applied, " + "because the host %s does not have the label(s) '%s'", + rrdcalc_name(rc), rrdhost_hostname(host), rrdcalc_host_labels(rc)); rrdcalc_unlink_and_delete(host, rc, false); } diff --git a/database/rrddim.c b/database/rrddim.c index cae88674d..46226a548 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -4,6 +4,11 @@ #include "rrd.h" #include "storage_engine.h" +void rrddim_metadata_updated(RRDDIM *rd) { + rrdcontext_updated_rrddim(rd); + rrdset_metadata_updated(rd->rrdset); +} + // ---------------------------------------------------------------------------- // RRDDIM index @@ -48,6 +53,8 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v rd->rrdset = st; + rd->rrdpush.sender.dim_slot = __atomic_add_fetch(&st->rrdpush.sender.dim_last_slot_used, 1, __ATOMIC_RELAXED); + if(rrdset_flag_check(st, RRDSET_FLAG_STORE_FIRST)) rd->collector.counter = 1; @@ -155,7 +162,6 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v // let the chart resync rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); ml_dimension_new(rd); @@ -283,10 +289,9 @@ static void rrddim_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo if(ctr->react_action == RRDDIM_REACT_UPDATED) { // the chart needs to be updated to the parent rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); } - rrdcontext_updated_rrddim(rd); + rrddim_metadata_updated(rd); } size_t rrddim_size(void) { @@ -369,8 +374,7 @@ inline int rrddim_reset_name(RRDSET *st, RRDDIM *rd, const char *name) { rrddimvar_rename_all(rd); - rrddim_clear_exposed(rd); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrddim_metadata_updated(rd); return 1; } @@ -381,8 +385,7 @@ inline int rrddim_set_algorithm(RRDSET *st, RRDDIM *rd, RRD_ALGORITHM algorithm) netdata_log_debug(D_RRD_CALLS, "Updating algorithm of dimension '%s/%s' from %s to %s", rrdset_id(st), rrddim_name(rd), rrd_algorithm_name(rd->algorithm), rrd_algorithm_name(algorithm)); rd->algorithm = algorithm; - rrddim_clear_exposed(rd); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrddim_metadata_updated(rd); rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdcontext_updated_rrddim_algorithm(rd); return 1; @@ -395,8 +398,7 @@ inline int rrddim_set_multiplier(RRDSET *st, RRDDIM *rd, int32_t multiplier) { netdata_log_debug(D_RRD_CALLS, "Updating multiplier of dimension '%s/%s' from %d to %d", rrdset_id(st), rrddim_name(rd), rd->multiplier, multiplier); rd->multiplier = multiplier; - rrddim_clear_exposed(rd); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrddim_metadata_updated(rd); rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdcontext_updated_rrddim_multiplier(rd); return 1; @@ -409,8 +411,7 @@ inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, int32_t divisor) { netdata_log_debug(D_RRD_CALLS, "Updating divisor of dimension '%s/%s' from %d to %d", rrdset_id(st), rrddim_name(rd), rd->divisor, divisor); rd->divisor = divisor; - rrddim_clear_exposed(rd); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrddim_metadata_updated(rd); rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdcontext_updated_rrddim_divisor(rd); return 1; @@ -532,8 +533,8 @@ int rrddim_unhide(RRDSET *st, const char *id) { return 0; } -inline void rrddim_is_obsolete(RRDSET *st, RRDDIM *rd) { - netdata_log_debug(D_RRD_CALLS, "rrddim_is_obsolete() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); +inline void rrddim_is_obsolete___safe_from_collector_thread(RRDSET *st, RRDDIM *rd) { + netdata_log_debug(D_RRD_CALLS, "rrddim_is_obsolete___safe_from_collector_thread() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); if(unlikely(rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED))) { netdata_log_info("Cannot obsolete already archived dimension %s from chart %s", rrddim_name(rd), rrdset_name(st)); @@ -545,8 +546,8 @@ inline void rrddim_is_obsolete(RRDSET *st, RRDDIM *rd) { rrdcontext_updated_rrddim_flags(rd); } -inline void rrddim_isnot_obsolete(RRDSET *st __maybe_unused, RRDDIM *rd) { - netdata_log_debug(D_RRD_CALLS, "rrddim_isnot_obsolete() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); +inline void rrddim_isnot_obsolete___safe_from_collector_thread(RRDSET *st __maybe_unused, RRDDIM *rd) { + netdata_log_debug(D_RRD_CALLS, "rrddim_isnot_obsolete___safe_from_collector_thread() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); rrddim_flag_clear(rd, RRDDIM_FLAG_OBSOLETE); rrdcontext_updated_rrddim_flags(rd); diff --git a/database/rrdfunctions.c b/database/rrdfunctions.c index 6c5baf346..2659130f0 100644 --- a/database/rrdfunctions.c +++ b/database/rrdfunctions.c @@ -991,7 +991,7 @@ int rrd_function_run(RRDHOST *host, BUFFER *result_wb, int timeout, const char * // the caller has to wait code = rdcf->execute_cb(result_wb, timeout, sanitized_cmd, rdcf->execute_cb_data, - NULL, NULL, // no callback needed, it is synchronous + result_cb, result_cb_data, is_cancelled_cb, is_cancelled_cb_data, // it is ok to pass these, we block the caller NULL, NULL); // no need to pass, we will wait @@ -1007,11 +1007,11 @@ int rrd_function_run(RRDHOST *host, BUFFER *result_wb, int timeout, const char * // the function can only be executed in async mode // put the function into the inflight requests - char uuid_str[UUID_STR_LEN]; + char uuid_str[UUID_COMPACT_STR_LEN]; if(!transaction) { uuid_t uuid; uuid_generate_random(uuid); - uuid_unparse_lower(uuid, uuid_str); + uuid_unparse_lower_compact(uuid, uuid_str); transaction = uuid_str; } @@ -1097,35 +1097,36 @@ cleanup: // ---------------------------------------------------------------------------- -static void functions2json(DICTIONARY *functions, BUFFER *wb, const char *ident, const char *kq, const char *sq) { +static void functions2json(DICTIONARY *functions, BUFFER *wb) +{ struct rrd_host_function *t; - dfe_start_read(functions, t) { - if(!rrd_collector_running(t->collector)) continue; - - if(t_dfe.counter) - buffer_strcat(wb, ",\n"); + dfe_start_read(functions, t) + { + if (!rrd_collector_running(t->collector)) + continue; - buffer_sprintf(wb, "%s%s%s%s: {", ident, kq, t_dfe.name, kq); - buffer_sprintf(wb, "\n\t%s%shelp%s: %s%s%s", ident, kq, kq, sq, string2str(t->help), sq); - buffer_sprintf(wb, ",\n\t%s%stimeout%s: %d", ident, kq, kq, t->timeout); - buffer_sprintf(wb, ",\n\t%s%soptions%s: \"%s%s\"", ident, kq, kq - , (t->options & RRD_FUNCTION_LOCAL)?"LOCAL ":"" - , (t->options & RRD_FUNCTION_GLOBAL)?"GLOBAL ":"" - ); - buffer_sprintf(wb, "\n%s}", ident); + buffer_json_member_add_object(wb, t_dfe.name); + buffer_json_member_add_string_or_empty(wb, "help", string2str(t->help)); + buffer_json_member_add_int64(wb, "timeout", (int64_t)t->timeout); + + char options[65]; + snprintfz( + options, + 64, + "%s%s", + (t->options & RRD_FUNCTION_LOCAL) ? "LOCAL " : "", + (t->options & RRD_FUNCTION_GLOBAL) ? "GLOBAL" : ""); + + buffer_json_member_add_string_or_empty(wb, "options", options); + buffer_json_object_close(wb); } dfe_done(t); - buffer_strcat(wb, "\n"); } -void chart_functions2json(RRDSET *st, BUFFER *wb, int tabs, const char *kq, const char *sq) { +void chart_functions2json(RRDSET *st, BUFFER *wb) { if(!st || !st->functions_view) return; - char ident[tabs + 1]; - ident[tabs] = '\0'; - while(tabs) ident[--tabs] = '\t'; - - functions2json(st->functions_view, wb, ident, kq, sq); + functions2json(st->functions_view, wb); } void host_functions2json(RRDHOST *host, BUFFER *wb) { @@ -1372,19 +1373,19 @@ int rrdhost_function_streaming(BUFFER *wb, int timeout __maybe_unused, const cha RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME_MS, 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, - RRDF_FIELD_OPTS_VISIBLE, NULL); + RRDF_FIELD_OPTS_NONE, NULL); buffer_rrdf_table_add_field(wb, field_id++, "dbTo", "DB Data Retention To", RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME_MS, 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, - RRDF_FIELD_OPTS_VISIBLE, NULL); + RRDF_FIELD_OPTS_NONE, NULL); buffer_rrdf_table_add_field(wb, field_id++, "dbDuration", "DB Data Retention Duration", RRDF_FIELD_TYPE_DURATION, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DURATION_S, 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, - RRDF_FIELD_OPTS_NONE, NULL); + RRDF_FIELD_OPTS_VISIBLE, NULL); buffer_rrdf_table_add_field(wb, field_id++, "dbMetrics", "Time-series Metrics in the DB", RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, diff --git a/database/rrdfunctions.h b/database/rrdfunctions.h index 96aa3965e..21ca5c734 100644 --- a/database/rrdfunctions.h +++ b/database/rrdfunctions.h @@ -40,7 +40,7 @@ void rrd_function_cancel(const char *transaction); void rrd_functions_expose_rrdpush(RRDSET *st, BUFFER *wb); void rrd_functions_expose_global_rrdpush(RRDHOST *host, BUFFER *wb); -void chart_functions2json(RRDSET *st, BUFFER *wb, int tabs, const char *kq, const char *sq); +void chart_functions2json(RRDSET *st, BUFFER *wb); void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst, void *value, size_t value_size); void host_functions_to_dict(RRDHOST *host, DICTIONARY *dst, void *value, size_t value_size, STRING **help); void host_functions2json(RRDHOST *host, BUFFER *wb); diff --git a/database/rrdhost.c b/database/rrdhost.c index 6abd3b816..a3c272153 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -31,6 +31,7 @@ netdata_rwlock_t rrd_rwlock = NETDATA_RWLOCK_INITIALIZER; time_t rrdset_free_obsolete_time_s = 3600; time_t rrdhost_free_orphan_time_s = 3600; +time_t rrdhost_free_ephemeral_time_s = 86400; bool is_storage_engine_shared(STORAGE_INSTANCE *engine __maybe_unused) { #ifdef ENABLE_DBENGINE @@ -80,8 +81,6 @@ static inline void rrdhost_init() { } RRDHOST_ACQUIRED *rrdhost_find_and_acquire(const char *machine_guid) { - netdata_log_debug(D_RRD_CALLS, "rrdhost_find_and_acquire() host %s", machine_guid); - return (RRDHOST_ACQUIRED *)dictionary_get_and_acquire_item(rrdhost_root_index, machine_guid); } @@ -116,8 +115,9 @@ static inline RRDHOST *rrdhost_index_add_by_guid(RRDHOST *host) { rrdhost_option_set(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID); else { rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID); - netdata_log_error("RRDHOST: %s() host with machine guid '%s' is already indexed", - __FUNCTION__, host->machine_guid); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "RRDHOST: host with machine guid '%s' is already indexed. Not adding it again.", + host->machine_guid); } return host; @@ -126,8 +126,9 @@ static inline RRDHOST *rrdhost_index_add_by_guid(RRDHOST *host) { static void rrdhost_index_del_by_guid(RRDHOST *host) { if(rrdhost_option_check(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID)) { if(!dictionary_del(rrdhost_root_index, host->machine_guid)) - netdata_log_error("RRDHOST: %s() failed to delete machine guid '%s' from index", - __FUNCTION__, host->machine_guid); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "RRDHOST: failed to delete machine guid '%s' from index", + host->machine_guid); rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID); } @@ -148,8 +149,9 @@ static inline void rrdhost_index_del_hostname(RRDHOST *host) { if(rrdhost_option_check(host, RRDHOST_OPTION_INDEXED_HOSTNAME)) { if(!dictionary_del(rrdhost_root_index_hostname, rrdhost_hostname(host))) - netdata_log_error("RRDHOST: %s() failed to delete hostname '%s' from index", - __FUNCTION__, rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "RRDHOST: failed to delete hostname '%s' from index", + rrdhost_hostname(host)); rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_HOSTNAME); } @@ -303,11 +305,11 @@ static RRDHOST *rrdhost_create( int is_localhost, bool archived ) { - netdata_log_debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid); - if(memory_mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled) { - netdata_log_error("memory mode 'dbengine' is not enabled, but host '%s' is configured for it. Falling back to 'alloc'", - hostname); + nd_log(NDLS_DAEMON, NDLP_ERR, + "memory mode 'dbengine' is not enabled, but host '%s' is configured for it. Falling back to 'alloc'", + hostname); + memory_mode = RRD_MEMORY_MODE_ALLOC; } @@ -392,7 +394,9 @@ int is_legacy = 1; (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && is_legacy))) { int r = mkdir(host->cache_dir, 0775); if(r != 0 && errno != EEXIST) - netdata_log_error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->cache_dir); + nd_log(NDLS_DAEMON, NDLP_CRIT, + "Host '%s': cannot create directory '%s'", + rrdhost_hostname(host), host->cache_dir); } } @@ -418,7 +422,9 @@ int is_legacy = 1; ret = mkdir(dbenginepath, 0775); if (ret != 0 && errno != EEXIST) - netdata_log_error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), dbenginepath); + nd_log(NDLS_DAEMON, NDLP_CRIT, + "Host '%s': cannot create directory '%s'", + rrdhost_hostname(host), dbenginepath); else ret = 0; // succeed @@ -459,8 +465,9 @@ int is_legacy = 1; } if (ret) { // check legacy or multihost initialization success - netdata_log_error("Host '%s': cannot initialize host with machine guid '%s'. Failed to initialize DB engine at '%s'.", - rrdhost_hostname(host), host->machine_guid, host->cache_dir); + nd_log(NDLS_DAEMON, NDLP_CRIT, + "Host '%s': cannot initialize host with machine guid '%s'. Failed to initialize DB engine at '%s'.", + rrdhost_hostname(host), host->machine_guid, host->cache_dir); rrd_wrlock(); rrdhost_free___while_having_rrd_wrlock(host, true); @@ -508,10 +515,13 @@ int is_legacy = 1; RRDHOST *t = rrdhost_index_add_by_guid(host); if(t != host) { - netdata_log_error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", - rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", + rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid); + if (!is_localhost) rrdhost_free___while_having_rrd_wrlock(host, true); + rrd_unlock(); return NULL; } @@ -527,21 +537,22 @@ int is_legacy = 1; // ------------------------------------------------------------------------ - netdata_log_info("Host '%s' (at registry as '%s') with guid '%s' initialized" - ", os '%s'" - ", timezone '%s'" - ", tags '%s'" - ", program_name '%s'" - ", program_version '%s'" - ", update every %d" - ", memory mode %s" - ", history entries %d" - ", streaming %s" - " (to '%s' with api key '%s')" - ", health %s" - ", cache_dir '%s'" - ", alarms default handler '%s'" - ", alarms default recipient '%s'" + nd_log(NDLS_DAEMON, NDLP_INFO, + "Host '%s' (at registry as '%s') with guid '%s' initialized" + ", os '%s'" + ", timezone '%s'" + ", tags '%s'" + ", program_name '%s'" + ", program_version '%s'" + ", update every %d" + ", memory mode %s" + ", history entries %d" + ", streaming %s" + " (to '%s' with api key '%s')" + ", health %s" + ", cache_dir '%s'" + ", alarms default handler '%s'" + ", alarms default recipient '%s'" , rrdhost_hostname(host) , rrdhost_registry_hostname(host) , host->machine_guid @@ -621,44 +632,56 @@ static void rrdhost_update(RRDHOST *host host->registry_hostname = string_strdupz((registry_hostname && *registry_hostname)?registry_hostname:hostname); if(strcmp(rrdhost_hostname(host), hostname) != 0) { - netdata_log_info("Host '%s' has been renamed to '%s'. If this is not intentional it may mean multiple hosts are using the same machine_guid.", rrdhost_hostname(host), hostname); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Host '%s' has been renamed to '%s'. If this is not intentional it may mean multiple hosts are using the same machine_guid.", + rrdhost_hostname(host), hostname); + rrdhost_init_hostname(host, hostname, true); } else { rrdhost_index_add_hostname(host); } if(strcmp(rrdhost_program_name(host), program_name) != 0) { - netdata_log_info("Host '%s' switched program name from '%s' to '%s'", rrdhost_hostname(host), rrdhost_program_name(host), program_name); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "Host '%s' switched program name from '%s' to '%s'", + rrdhost_hostname(host), rrdhost_program_name(host), program_name); + STRING *t = host->program_name; host->program_name = string_strdupz(program_name); string_freez(t); } if(strcmp(rrdhost_program_version(host), program_version) != 0) { - netdata_log_info("Host '%s' switched program version from '%s' to '%s'", rrdhost_hostname(host), rrdhost_program_version(host), program_version); + nd_log(NDLS_DAEMON, NDLP_NOTICE, + "Host '%s' switched program version from '%s' to '%s'", + rrdhost_hostname(host), rrdhost_program_version(host), program_version); + STRING *t = host->program_version; host->program_version = string_strdupz(program_version); string_freez(t); } if(host->rrd_update_every != update_every) - netdata_log_error("Host '%s' has an update frequency of %d seconds, but the wanted one is %d seconds. " - "Restart netdata here to apply the new settings.", - rrdhost_hostname(host), host->rrd_update_every, update_every); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Host '%s' has an update frequency of %d seconds, but the wanted one is %d seconds. " + "Restart netdata here to apply the new settings.", + rrdhost_hostname(host), host->rrd_update_every, update_every); if(host->rrd_memory_mode != mode) - netdata_log_error("Host '%s' has memory mode '%s', but the wanted one is '%s'. " - "Restart netdata here to apply the new settings.", - rrdhost_hostname(host), - rrd_memory_mode_name(host->rrd_memory_mode), - rrd_memory_mode_name(mode)); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Host '%s' has memory mode '%s', but the wanted one is '%s'. " + "Restart netdata here to apply the new settings.", + rrdhost_hostname(host), + rrd_memory_mode_name(host->rrd_memory_mode), + rrd_memory_mode_name(mode)); else if(host->rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE && host->rrd_history_entries < history) - netdata_log_error("Host '%s' has history of %d entries, but the wanted one is %ld entries. " - "Restart netdata here to apply the new settings.", - rrdhost_hostname(host), - host->rrd_history_entries, - history); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Host '%s' has history of %d entries, but the wanted one is %ld entries. " + "Restart netdata here to apply the new settings.", + rrdhost_hostname(host), + host->rrd_history_entries, + history); // update host tags rrdhost_init_tags(host, tags); @@ -700,7 +723,9 @@ static void rrdhost_update(RRDHOST *host ml_host_new(host); rrdhost_load_rrdcontext_data(host); - netdata_log_info("Host %s is not in archived mode anymore", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Host %s is not in archived mode anymore", + rrdhost_hostname(host)); } spinlock_unlock(&host->rrdhost_update_lock); @@ -731,8 +756,6 @@ RRDHOST *rrdhost_find_or_create( , struct rrdhost_system_info *system_info , bool archived ) { - netdata_log_debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid); - RRDHOST *host = rrdhost_find_by_guid(guid); if (unlikely(host && host->rrd_memory_mode != mode && rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED))) { @@ -740,10 +763,11 @@ RRDHOST *rrdhost_find_or_create( return host; /* If a legacy memory mode instantiates all dbengine state must be discarded to avoid inconsistencies */ - netdata_log_error("Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.", - rrdhost_hostname(host), - rrd_memory_mode_name(host->rrd_memory_mode), - rrd_memory_mode_name(mode)); + nd_log(NDLS_DAEMON, NDLP_INFO, + "Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.", + rrdhost_hostname(host), + rrd_memory_mode_name(host->rrd_memory_mode), + rrd_memory_mode_name(mode)); rrd_wrlock(); rrdhost_free___while_having_rrd_wrlock(host, true); @@ -815,7 +839,7 @@ inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, tim && rrdhost_receiver_replicating_charts(host) == 0 && rrdhost_sender_replicating_charts(host) == 0 && rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN) - && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED) + && !rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD) && !host->receiver && host->child_disconnected_time && host->child_disconnected_time + rrdhost_free_orphan_time_s < now_s) @@ -851,18 +875,26 @@ void dbengine_init(char *hostname) { if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) rrdeng_pages_per_extent = read_num; else { - netdata_log_error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Invalid dbengine pages per extent %u given. Using %u.", + read_num, rrdeng_pages_per_extent); + config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent); } storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); if(storage_tiers < 1) { - netdata_log_error("At least 1 storage tier is required. Assuming 1."); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "At least 1 storage tier is required. Assuming 1."); + storage_tiers = 1; config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } if(storage_tiers > RRD_STORAGE_TIERS) { - netdata_log_error("Up to %d storage tier are supported. Assuming %d.", RRD_STORAGE_TIERS, RRD_STORAGE_TIERS); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Up to %d storage tier are supported. Assuming %d.", + RRD_STORAGE_TIERS, RRD_STORAGE_TIERS); + storage_tiers = RRD_STORAGE_TIERS; config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } @@ -884,7 +916,9 @@ void dbengine_init(char *hostname) { int ret = mkdir(dbenginepath, 0775); if (ret != 0 && errno != EEXIST) { - netdata_log_error("DBENGINE on '%s': cannot create directory '%s'", hostname, dbenginepath); + nd_log(NDLS_DAEMON, NDLP_CRIT, + "DBENGINE on '%s': cannot create directory '%s'", + hostname, dbenginepath); break; } @@ -896,26 +930,29 @@ void dbengine_init(char *hostname) { RRD_BACKFILL backfill = storage_tiers_backfill[tier]; if(tier > 0) { - snprintfz(dbengineconfig, 200, "dbengine tier %zu multihost disk space MB", tier); + snprintfz(dbengineconfig, sizeof(dbengineconfig) - 1, "dbengine tier %zu multihost disk space MB", tier); disk_space_mb = config_get_number(CONFIG_SECTION_DB, dbengineconfig, disk_space_mb); - snprintfz(dbengineconfig, 200, "dbengine tier %zu update every iterations", tier); + snprintfz(dbengineconfig, sizeof(dbengineconfig) - 1, "dbengine tier %zu update every iterations", tier); grouping_iterations = config_get_number(CONFIG_SECTION_DB, dbengineconfig, grouping_iterations); if(grouping_iterations < 2) { grouping_iterations = 2; config_set_number(CONFIG_SECTION_DB, dbengineconfig, grouping_iterations); - netdata_log_error("DBENGINE on '%s': 'dbegnine tier %zu update every iterations' cannot be less than 2. Assuming 2.", - hostname, - tier); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "DBENGINE on '%s': 'dbegnine tier %zu update every iterations' cannot be less than 2. Assuming 2.", + hostname, tier); } - snprintfz(dbengineconfig, 200, "dbengine tier %zu backfill", tier); + snprintfz(dbengineconfig, sizeof(dbengineconfig) - 1, "dbengine tier %zu backfill", tier); const char *bf = config_get(CONFIG_SECTION_DB, dbengineconfig, backfill == RRD_BACKFILL_NEW ? "new" : backfill == RRD_BACKFILL_FULL ? "full" : "none"); if(strcmp(bf, "new") == 0) backfill = RRD_BACKFILL_NEW; else if(strcmp(bf, "full") == 0) backfill = RRD_BACKFILL_FULL; else if(strcmp(bf, "none") == 0) backfill = RRD_BACKFILL_NONE; else { - netdata_log_error("DBENGINE: unknown backfill value '%s', assuming 'new'", bf); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "DBENGINE: unknown backfill value '%s', assuming 'new'", + bf); + config_set(CONFIG_SECTION_DB, dbengineconfig, "new"); backfill = RRD_BACKFILL_NEW; } @@ -926,10 +963,10 @@ void dbengine_init(char *hostname) { if(tier > 0 && get_tier_grouping(tier) > 65535) { storage_tiers_grouping_iterations[tier] = 1; - netdata_log_error("DBENGINE on '%s': dbengine tier %zu gives aggregation of more than 65535 points of tier 0. Disabling tiers above %zu", - hostname, - tier, - tier); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "DBENGINE on '%s': dbengine tier %zu gives aggregation of more than 65535 points of tier 0. " + "Disabling tiers above %zu", + hostname, tier, tier); break; } @@ -957,21 +994,19 @@ void dbengine_init(char *hostname) { netdata_thread_join(tiers_init[tier].thread, &ptr); if(tiers_init[tier].ret != 0) { - netdata_log_error("DBENGINE on '%s': Failed to initialize multi-host database tier %zu on path '%s'", - hostname, - tiers_init[tier].tier, - tiers_init[tier].path); + nd_log(NDLS_DAEMON, NDLP_ERR, + "DBENGINE on '%s': Failed to initialize multi-host database tier %zu on path '%s'", + hostname, tiers_init[tier].tier, tiers_init[tier].path); } else if(created_tiers == tier) created_tiers++; } if(created_tiers && created_tiers < storage_tiers) { - netdata_log_error("DBENGINE on '%s': Managed to create %zu tiers instead of %zu. Continuing with %zu available.", - hostname, - created_tiers, - storage_tiers, - created_tiers); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "DBENGINE on '%s': Managed to create %zu tiers instead of %zu. Continuing with %zu available.", + hostname, created_tiers, storage_tiers, created_tiers); + storage_tiers = created_tiers; } else if(!created_tiers) @@ -984,7 +1019,10 @@ void dbengine_init(char *hostname) { #else storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", 1); if(storage_tiers != 1) { - netdata_log_error("DBENGINE is not available on '%s', so only 1 database tier can be supported.", hostname); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "DBENGINE is not available on '%s', so only 1 database tier can be supported.", + hostname); + storage_tiers = 1; config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } @@ -1000,7 +1038,9 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt set_late_global_environment(system_info); fatal("Failed to initialize SQLite"); } - netdata_log_info("Skipping SQLITE metadata initialization since memory mode is not dbengine"); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Skipping SQLITE metadata initialization since memory mode is not dbengine"); } if (unlikely(sql_init_context_database(system_info ? 0 : 1))) { @@ -1015,23 +1055,28 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt rrdpush_init(); if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE || rrdpush_receiver_needs_dbengine()) { - netdata_log_info("DBENGINE: Initializing ..."); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "DBENGINE: Initializing ..."); + dbengine_init(hostname); } - else { - netdata_log_info("DBENGINE: Not initializing ..."); + else storage_tiers = 1; - } if (!dbengine_enabled) { if (storage_tiers > 1) { - netdata_log_error("dbengine is not enabled, but %zu tiers have been requested. Resetting tiers to 1", - storage_tiers); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "dbengine is not enabled, but %zu tiers have been requested. Resetting tiers to 1", + storage_tiers); + storage_tiers = 1; } if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - netdata_log_error("dbengine is not enabled, but it has been given as the default db mode. Resetting db mode to alloc"); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "dbengine is not enabled, but it has been given as the default db mode. " + "Resetting db mode to alloc"); + default_rrd_memory_mode = RRD_MEMORY_MODE_ALLOC; } } @@ -1040,7 +1085,6 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt if(!unittest) metadata_sync_init(); - netdata_log_debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname); localhost = rrdhost_create( hostname , registry_get_this_machine_hostname() @@ -1072,14 +1116,12 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt return 1; } -#ifdef NETDATA_DEV_MODE // we register this only on localhost // for the other nodes, the origin server should register it rrd_collector_started(); // this creates a collector that runs for as long as netdata runs rrd_function_add(localhost, NULL, "streaming", 10, RRDFUNCTIONS_STREAMING_HELP, true, rrdhost_function_streaming, NULL); -#endif if (likely(system_info)) { migrate_localhost(&localhost->host_uuid); @@ -1145,13 +1187,10 @@ static void rrdhost_streaming_sender_structures_init(RRDHOST *host) host->sender->rrdpush_sender_pipe[PIPE_READ] = -1; host->sender->rrdpush_sender_pipe[PIPE_WRITE] = -1; host->sender->rrdpush_sender_socket = -1; + host->sender->disabled_capabilities = STREAM_CAP_NONE; -#ifdef ENABLE_RRDPUSH_COMPRESSION - if(default_rrdpush_compression_enabled) - host->sender->flags |= SENDER_FLAG_COMPRESSION; - else - host->sender->flags &= ~SENDER_FLAG_COMPRESSION; -#endif + if(!default_rrdpush_compression_enabled) + host->sender->disabled_capabilities |= STREAM_CAP_COMPRESSIONS_AVAILABLE; spinlock_init(&host->sender->spinlock); replication_init_sender(host->sender); @@ -1167,9 +1206,7 @@ static void rrdhost_streaming_sender_structures_free(RRDHOST *host) rrdpush_sender_thread_stop(host, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP, true); // stop a possibly running thread cbuffer_free(host->sender->buffer); -#ifdef ENABLE_RRDPUSH_COMPRESSION rrdpush_compressor_destroy(&host->sender->compressor); -#endif replication_cleanup_sender(host->sender); @@ -1184,7 +1221,9 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { if(!host) return; if (netdata_exit || force) { - netdata_log_info("RRD: 'host:%s' freeing memory...", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: 'host:%s' freeing memory...", + rrdhost_hostname(host)); // ------------------------------------------------------------------------ // first remove it from the indexes, so that it will not be discoverable @@ -1197,6 +1236,12 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { } // ------------------------------------------------------------------------ + // clean up streaming chart slots + + rrdhost_pluginsd_send_chart_slots_free(host); + rrdhost_pluginsd_receive_chart_slots_free(host); + + // ------------------------------------------------------------------------ // clean up streaming rrdhost_streaming_sender_structures_free(host); @@ -1244,7 +1289,10 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { #endif if (!netdata_exit && !force) { - netdata_log_info("RRD: 'host:%s' is now in archive mode...", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: 'host:%s' is now in archive mode...", + rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN); return; } @@ -1283,6 +1331,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { string_freez(host->hostname); __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED); + freez(host); } @@ -1313,7 +1362,9 @@ void rrd_finalize_collection_for_all_hosts(void) { void rrdhost_save_charts(RRDHOST *host) { if(!host) return; - netdata_log_info("RRD: 'host:%s' saving / closing database...", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: 'host:%s' saving / closing database...", + rrdhost_hostname(host)); RRDSET *st; @@ -1414,7 +1465,14 @@ static void rrdhost_load_auto_labels(void) { add_aclk_host_labels(); - health_add_host_labels(); + // The source should be CONF, but when it is set, these labels are exported by default ('send configured labels' in exporting.conf). + // Their export seems to break exporting to Graphite, see https://github.com/netdata/netdata/issues/14084. + + int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_GLOBAL, "is ephemeral node", CONFIG_BOOLEAN_NO); + rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_AUTO); + + int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_GLOBAL, "has unstable connection", CONFIG_BOOLEAN_NO); + rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_AUTO); rrdlabels_add(labels, "_is_parent", (localhost->connected_children_count > 0) ? "true" : "false", RRDLABEL_SRC_AUTO); @@ -1442,7 +1500,9 @@ static void rrdhost_load_config_labels(void) { int status = config_load(NULL, 1, CONFIG_SECTION_HOST_LABEL); if(!status) { char *filename = CONFIG_DIR "/" CONFIG_FILENAME; - netdata_log_error("RRDLABEL: Cannot reload the configuration file '%s', using labels in memory", filename); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "RRDLABEL: Cannot reload the configuration file '%s', using labels in memory", + filename); } struct section *co = appconfig_get_section(&netdata_config, CONFIG_SECTION_HOST_LABEL); @@ -1462,12 +1522,13 @@ static void rrdhost_load_kubernetes_labels(void) { sprintf(label_script, "%s/%s", netdata_configured_primary_plugins_dir, "get-kubernetes-labels.sh"); if (unlikely(access(label_script, R_OK) != 0)) { - netdata_log_error("Kubernetes pod label fetching script %s not found.",label_script); + nd_log(NDLS_DAEMON, NDLP_ERR, + "Kubernetes pod label fetching script %s not found.", + label_script); + return; } - netdata_log_debug(D_RRDHOST, "Attempting to fetch external labels via %s", label_script); - pid_t pid; FILE *fp_child_input; FILE *fp_child_output = netdata_popen(label_script, &pid, &fp_child_input); @@ -1481,7 +1542,9 @@ static void rrdhost_load_kubernetes_labels(void) { // Here we'll inform with an ERROR that the script failed, show whatever (if anything) was added to the list of labels, free the memory and set the return to null int rc = netdata_pclose(fp_child_input, fp_child_output, pid); if(rc) - netdata_log_error("%s exited abnormally. Failed to get kubernetes labels.", label_script); + nd_log(NDLS_DAEMON, NDLP_ERR, + "%s exited abnormally. Failed to get kubernetes labels.", + label_script); } void reload_host_labels(void) { @@ -1501,7 +1564,9 @@ void reload_host_labels(void) { } void rrdhost_finalize_collection(RRDHOST *host) { - netdata_log_info("RRD: 'host:%s' stopping data collection...", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: 'host:%s' stopping data collection...", + rrdhost_hostname(host)); RRDSET *st; rrdset_foreach_read(st, host) @@ -1515,7 +1580,9 @@ void rrdhost_finalize_collection(RRDHOST *host) { void rrdhost_delete_charts(RRDHOST *host) { if(!host) return; - netdata_log_info("RRD: 'host:%s' deleting disk files...", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: 'host:%s' deleting disk files...", + rrdhost_hostname(host)); RRDSET *st; @@ -1523,8 +1590,8 @@ void rrdhost_delete_charts(RRDHOST *host) { // we get a write lock // to ensure only one thread is saving the database rrdset_foreach_write(st, host){ - rrdset_delete_files(st); - } + rrdset_delete_files(st); + } rrdset_foreach_done(st); } @@ -1537,7 +1604,9 @@ void rrdhost_delete_charts(RRDHOST *host) { void rrdhost_cleanup_charts(RRDHOST *host) { if(!host) return; - netdata_log_info("RRD: 'host:%s' cleaning up disk files...", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: 'host:%s' cleaning up disk files...", + rrdhost_hostname(host)); RRDSET *st; uint32_t rrdhost_delete_obsolete_charts = rrdhost_option_check(host, RRDHOST_OPTION_DELETE_OBSOLETE_CHARTS); @@ -1564,7 +1633,9 @@ void rrdhost_cleanup_charts(RRDHOST *host) { // RRDHOST - save all hosts to disk void rrdhost_save_all(void) { - netdata_log_info("RRD: saving databases [%zu hosts(s)]...", rrdhost_hosts_available()); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: saving databases [%zu hosts(s)]...", + rrdhost_hosts_available()); rrd_rdlock(); @@ -1579,7 +1650,9 @@ void rrdhost_save_all(void) { // RRDHOST - save or delete all hosts from disk void rrdhost_cleanup_all(void) { - netdata_log_info("RRD: cleaning up database [%zu hosts(s)]...", rrdhost_hosts_available()); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "RRD: cleaning up database [%zu hosts(s)]...", + rrdhost_hosts_available()); rrd_rdlock(); @@ -1885,9 +1958,7 @@ void rrdhost_status(RRDHOST *host, time_t now, RRDHOST_STATUS *s) { else s->stream.status = RRDHOST_STREAM_STATUS_ONLINE; -#ifdef ENABLE_RRDPUSH_COMPRESSION - s->stream.compression = (stream_has_capability(host->sender, STREAM_CAP_COMPRESSION) && host->sender->compressor.initialized); -#endif + s->stream.compression = host->sender->compressor.initialized; } else { s->stream.status = RRDHOST_STREAM_STATUS_OFFLINE; diff --git a/database/rrdlabels.c b/database/rrdlabels.c index 6505b4b2d..69ee55526 100644 --- a/database/rrdlabels.c +++ b/database/rrdlabels.c @@ -710,10 +710,13 @@ static void labels_add_already_sanitized(RRDLABELS *labels, const char *key, con if(*PValue) { new_ls |= RRDLABEL_FLAG_OLD; + *((RRDLABEL_SRC *)PValue) = new_ls; + delete_label(new_label); } else { new_ls |= RRDLABEL_FLAG_NEW; + *((RRDLABEL_SRC *)PValue) = new_ls; RRDLABEL *old_label_with_same_key = rrdlabels_find_label_with_key_unsafe(labels, new_label); if (old_label_with_same_key) { @@ -723,7 +726,6 @@ static void labels_add_already_sanitized(RRDLABELS *labels, const char *key, con } labels->version++; - *((RRDLABEL_SRC *)PValue) = new_ls; size_t mem_after_judyl = JudyLMemUsed(labels->JudyL); STATS_PLUS_MEMORY(&dictionary_stats_category_rrdlabels, 0, mem_after_judyl - mem_before_judyl, 0); @@ -934,7 +936,7 @@ static void rrdlabels_remove_all_unmarked_unsafe(RRDLABELS *labels) bool first_then_next = true; while ((PValue = JudyLFirstThenNext(labels->JudyL, &Index, &first_then_next))) { - if (!((*((RRDLABEL_SRC *)PValue)) & (RRDLABEL_FLAG_OLD | RRDLABEL_FLAG_NEW | RRDLABEL_FLAG_PERMANENT))) { + if (!((*((RRDLABEL_SRC *)PValue)) & (RRDLABEL_FLAG_INTERNAL))) { size_t mem_before_judyl = JudyLMemUsed(labels->JudyL); (void)JudyLDel(&labels->JudyL, Index, PJE0); @@ -1004,8 +1006,9 @@ void rrdlabels_migrate_to_these(RRDLABELS *dst, RRDLABELS *src) { if(unlikely(!PValue || PValue == PJERR)) fatal("RRDLABELS migrate: corrupted labels array"); - RRDLABEL_SRC flag = RRDLABEL_FLAG_NEW; + RRDLABEL_SRC flag; if (!*PValue) { + flag = (ls & ~(RRDLABEL_FLAG_OLD | RRDLABEL_FLAG_NEW)) | RRDLABEL_FLAG_NEW; dup_label(label); size_t mem_after_judyl = JudyLMemUsed(dst->JudyL); STATS_PLUS_MEMORY(&dictionary_stats_category_rrdlabels, 0, mem_after_judyl - mem_before_judyl, 0); @@ -1040,14 +1043,13 @@ void rrdlabels_copy(RRDLABELS *dst, RRDLABELS *src) lfe_start_nolock(src, label, ls) { RRDLABEL *old_label_with_key = rrdlabels_find_label_with_key_unsafe(dst, label); - Pvoid_t *PValue = JudyLIns(&dst->JudyL, (Word_t)label, PJE0); if(unlikely(!PValue || PValue == PJERR)) fatal("RRDLABELS: corrupted labels array"); if (!*PValue) { dup_label(label); - *((RRDLABEL_SRC *)PValue) = ls; + ls = (ls & ~(RRDLABEL_FLAG_OLD)) | RRDLABEL_FLAG_NEW; dst->version++; update_statistics = true; if (old_label_with_key) { @@ -1055,6 +1057,10 @@ void rrdlabels_copy(RRDLABELS *dst, RRDLABELS *src) delete_label((RRDLABEL *)old_label_with_key); } } + else + ls = (ls & ~(RRDLABEL_FLAG_NEW)) | RRDLABEL_FLAG_OLD; + + *((RRDLABEL_SRC *)PValue) = ls; } lfe_done_nolock(); if (update_statistics) { @@ -1302,7 +1308,7 @@ void rrdset_update_rrdlabels(RRDSET *st, RRDLABELS *new_rrdlabels) { rrdset_flag_set(st, RRDSET_FLAG_METADATA_UPDATE); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrdset_metadata_updated(st); } @@ -1318,7 +1324,30 @@ struct rrdlabels_unittest_add_a_pair { int errors; }; -int rrdlabels_unittest_add_a_pair_callback(const char *name, const char *value, RRDLABEL_SRC ls __maybe_unused, void *data) { +RRDLABEL *rrdlabels_find_label_with_key(RRDLABELS *labels, const char *key, RRDLABEL_SRC *source) +{ + if (!labels || !key) + return NULL; + + STRING *this_key = string_strdupz(key); + + RRDLABEL *lb = NULL; + RRDLABEL_SRC ls; + + lfe_start_read(labels, lb, ls) + { + if (lb->index.key == this_key) { + if (source) + *source = ls; + break; + } + } + lfe_done(labels); + string_freez(this_key); + return lb; +} + +static int rrdlabels_unittest_add_a_pair_callback(const char *name, const char *value, RRDLABEL_SRC ls __maybe_unused, void *data) { struct rrdlabels_unittest_add_a_pair *t = (struct rrdlabels_unittest_add_a_pair *)data; t->name = name; @@ -1344,7 +1373,7 @@ int rrdlabels_unittest_add_a_pair_callback(const char *name, const char *value, return 1; } -int rrdlabels_unittest_add_a_pair(const char *pair, const char *name, const char *value) { +static int rrdlabels_unittest_add_a_pair(const char *pair, const char *name, const char *value) { RRDLABELS *labels = rrdlabels_create(); int errors; @@ -1374,7 +1403,7 @@ int rrdlabels_unittest_add_a_pair(const char *pair, const char *name, const char return errors; } -int rrdlabels_unittest_add_pairs() { +static int rrdlabels_unittest_add_pairs() { fprintf(stderr, "\n%s() tests\n", __FUNCTION__); int errors = 0; @@ -1422,69 +1451,70 @@ int rrdlabels_unittest_add_pairs() { return errors; } -int rrdlabels_unittest_double_check() { +static int rrdlabels_unittest_expect_value(RRDLABELS *labels, const char *key, const char *value, RRDLABEL_SRC required_source) +{ + RRDLABEL_SRC source; + RRDLABEL *label = rrdlabels_find_label_with_key(labels, key, &source); + return (!label || strcmp(string2str(label->index.value), value) != 0 || (source != required_source)); +} + +static int rrdlabels_unittest_double_check() +{ fprintf(stderr, "\n%s() tests\n", __FUNCTION__); - int errors = 1; int ret = 0; RRDLABELS *labels = rrdlabels_create(); - const char *pair = "key1=value1"; - - struct rrdlabels_unittest_add_a_pair tmp = { - .pair = pair, - .expected_name = "key1", - .expected_value = NULL, - .errors = 0 - }; + rrdlabels_add(labels, "key1", "value1", RRDLABEL_SRC_CONFIG); + ret += rrdlabels_unittest_expect_value(labels, "key1", "value1", RRDLABEL_FLAG_NEW | RRDLABEL_SRC_CONFIG); - fprintf(stderr, "rrdlabels_add_pair(labels, %s) ...\n ", pair); + rrdlabels_add(labels, "key1", "value2", RRDLABEL_SRC_CONFIG); + ret += !rrdlabels_unittest_expect_value(labels, "key1", "value2", RRDLABEL_FLAG_OLD | RRDLABEL_SRC_CONFIG); - rrdlabels_add_pair(labels, pair, RRDLABEL_SRC_CONFIG); - size_t count = rrdlabels_entries(labels); - fprintf(stderr, "Added one key with \"value1\", entries found %zu\n", count); - tmp.expected_value = "value1"; - ret = rrdlabels_walkthrough_read(labels, rrdlabels_unittest_add_a_pair_callback, &tmp); + rrdlabels_add(labels, "key2", "value1", RRDLABEL_SRC_ACLK|RRDLABEL_SRC_AUTO); + ret += !rrdlabels_unittest_expect_value(labels, "key1", "value3", RRDLABEL_FLAG_NEW | RRDLABEL_SRC_ACLK); - fprintf(stderr, "Adding key with same value \"value1\" (collision check)\n"); - rrdlabels_add_pair(labels, pair, RRDLABEL_SRC_CONFIG); - count = rrdlabels_entries(labels); - fprintf(stderr, "Added same key again \"value1\", entries found %zu\n", count); + ret += (rrdlabels_entries(labels) != 2); - ret = rrdlabels_walkthrough_read(labels, rrdlabels_unittest_add_a_pair_callback, &tmp); + rrdlabels_destroy(labels); - // Add same key with different value - pair = "key1=value2"; - rrdlabels_add_pair(labels, pair, RRDLABEL_SRC_CONFIG); - count = rrdlabels_entries(labels); - fprintf(stderr, "Added same key again with \"value2\", entries found %zu\n", count); + if (ret) + fprintf(stderr, "\n%s() tests failed\n", __FUNCTION__); + return ret; +} - tmp.expected_value = "value2"; - ret = rrdlabels_walkthrough_read(labels, rrdlabels_unittest_add_a_pair_callback, &tmp); +static int rrdlabels_walkthrough_index_read(RRDLABELS *labels, int (*callback)(const char *name, const char *value, RRDLABEL_SRC ls, size_t index, void *data), void *data) +{ + int ret = 0; - fprintf(stderr, "Adding key with same value \"value2\" (collision check)\n"); - rrdlabels_add_pair(labels, pair, RRDLABEL_SRC_CONFIG); - count = rrdlabels_entries(labels); - fprintf(stderr, "Added same key again with \"value2\", entries found %zu\n", count); + if(unlikely(!labels || !callback)) return 0; - ret = rrdlabels_walkthrough_read(labels, rrdlabels_unittest_add_a_pair_callback, &tmp); - errors = tmp.errors; - if(ret != 1) { - fprintf(stderr, "failed to get \"%s\" label", "key1"); - errors++; + RRDLABEL *lb; + RRDLABEL_SRC ls; + size_t index = 0; + lfe_start_read(labels, lb, ls) + { + ret = callback(string2str(lb->index.key), string2str(lb->index.value), ls, index, data); + if (ret < 0) + break; + index++; } + lfe_done(labels); - if(!errors) - fprintf(stderr, " OK, name='%s' and value='%s'\n", tmp.name, tmp.value?tmp.value:"(null)"); - else - fprintf(stderr, " FAILED\n"); - - rrdlabels_destroy(labels); + return ret; +} - return errors; +static int unittest_dump_labels(const char *name, const char *value, RRDLABEL_SRC ls, size_t index, void *data __maybe_unused) +{ + if (!index && data) { + fprintf(stderr, "%s\n", (char *) data); + } + fprintf(stderr, "LABEL \"%s\" = %d \"%s\"\n", name, ls & (~RRDLABEL_FLAG_INTERNAL), value); + return 1; } -int rrdlabels_unittest_migrate_check() { +static int rrdlabels_unittest_migrate_check() +{ fprintf(stderr, "\n%s() tests\n", __FUNCTION__); RRDLABELS *labels1 = NULL; @@ -1504,6 +1534,12 @@ int rrdlabels_unittest_migrate_check() { fprintf(stderr, "Labels2 entries found %zu (should be 3)\n", rrdlabels_entries(labels2)); rrdlabels_migrate_to_these(labels1, labels2); + + int rc = 0; + rc = rrdlabels_unittest_expect_value(labels1, "key1", "value2", RRDLABEL_FLAG_OLD | RRDLABEL_SRC_CONFIG); + if (rc) + return rc; + fprintf(stderr, "labels1 (migrated) entries found %zu (should be 3)\n", rrdlabels_entries(labels1)); size_t entries = rrdlabels_entries(labels1); @@ -1518,34 +1554,47 @@ int rrdlabels_unittest_migrate_check() { labels2 = rrdlabels_create(); rrdlabels_add(labels1, "key1", "value1", RRDLABEL_SRC_CONFIG); - rrdlabels_add(labels1, "key2", "value1", RRDLABEL_SRC_CONFIG); - rrdlabels_add(labels1, "key3", "value1", RRDLABEL_SRC_CONFIG); - rrdlabels_add(labels1, "key4", "value1", RRDLABEL_SRC_CONFIG); // 4 keys + rrdlabels_add(labels1, "key2", "value2", RRDLABEL_SRC_CONFIG); + rrdlabels_add(labels1, "key3", "value3", RRDLABEL_SRC_CONFIG); + rrdlabels_add(labels1, "key4", "value4", RRDLABEL_SRC_CONFIG); // 4 keys + rrdlabels_walkthrough_index_read(labels1, unittest_dump_labels, "\nlabels1"); + + rrdlabels_add(labels2, "key0", "value0", RRDLABEL_SRC_CONFIG); + rrdlabels_add(labels2, "key1", "value1", RRDLABEL_SRC_CONFIG); + rrdlabels_add(labels2, "key2", "value2", RRDLABEL_SRC_CONFIG); + + rc = rrdlabels_unittest_expect_value(labels1, "key1", "value1", RRDLABEL_FLAG_NEW | RRDLABEL_SRC_CONFIG); + if (rc) + return rc; - rrdlabels_add(labels2, "key1", "value10", RRDLABEL_SRC_CONFIG); - rrdlabels_add(labels2, "key2", "value1", RRDLABEL_SRC_CONFIG); - rrdlabels_add(labels2, "key0", "value1", RRDLABEL_SRC_CONFIG); + rrdlabels_walkthrough_index_read(labels2, unittest_dump_labels, "\nlabels2"); rrdlabels_copy(labels1, labels2); // labels1 should have 5 keys + rc = rrdlabels_unittest_expect_value(labels1, "key1", "value1", RRDLABEL_FLAG_OLD | RRDLABEL_SRC_CONFIG); + if (rc) + return rc; + rc = rrdlabels_unittest_expect_value(labels1, "key0", "value0", RRDLABEL_FLAG_NEW | RRDLABEL_SRC_CONFIG); + if (rc) + return rc; + + rrdlabels_walkthrough_index_read(labels1, unittest_dump_labels, "\nlabels1 after copy from labels2"); entries = rrdlabels_entries(labels1); + fprintf(stderr, "labels1 (copied) entries found %zu (should be 5)\n", rrdlabels_entries(labels1)); if (entries != 5) return 1; - rrdlabels_add(labels1, "key100", "value1", RRDLABEL_SRC_CONFIG); - rrdlabels_copy(labels2, labels1); // labels2 should have 6 keys - entries = rrdlabels_entries(labels1); - - fprintf(stderr, "labels2 (copied) entries found %zu (should be 6)\n", rrdlabels_entries(labels1)); + rrdlabels_add(labels1, "key0", "value0", RRDLABEL_SRC_CONFIG); + rc = rrdlabels_unittest_expect_value(labels1, "key0", "value0", RRDLABEL_FLAG_OLD | RRDLABEL_SRC_CONFIG); rrdlabels_destroy(labels1); rrdlabels_destroy(labels2); - return entries != 6; + return rc; } -int rrdlabels_unittest_check_simple_pattern(RRDLABELS *labels, const char *pattern, bool expected) { +static int rrdlabels_unittest_check_simple_pattern(RRDLABELS *labels, const char *pattern, bool expected) { fprintf(stderr, "rrdlabels_match_simple_pattern(labels, \"%s\") ... ", pattern); bool ret = rrdlabels_match_simple_pattern(labels, pattern); @@ -1554,7 +1603,7 @@ int rrdlabels_unittest_check_simple_pattern(RRDLABELS *labels, const char *patte return (ret == expected)?0:1; } -int rrdlabels_unittest_simple_pattern() { +static int rrdlabels_unittest_simple_pattern() { fprintf(stderr, "\n%s() tests\n", __FUNCTION__); int errors = 0; diff --git a/database/rrdlabels.h b/database/rrdlabels.h index c65fbb2c3..64a0e2384 100644 --- a/database/rrdlabels.h +++ b/database/rrdlabels.h @@ -13,12 +13,12 @@ typedef enum __attribute__ ((__packed__)) rrdlabel_source { // more sources can be added here - RRDLABEL_FLAG_PERMANENT = (1 << 29), // set when this label should never be removed (can be overwritten though) - RRDLABEL_FLAG_OLD = (1 << 30), // marks for rrdlabels internal use - they are not exposed outside rrdlabels - RRDLABEL_FLAG_NEW = (1 << 31) // marks for rrdlabels internal use - they are not exposed outside rrdlabels + RRDLABEL_FLAG_DONT_DELETE = (1 << 29), // set when this label should never be removed (can be overwritten though) + RRDLABEL_FLAG_OLD = (1 << 30), // marks for rrdlabels internal use - they are not exposed outside rrdlabels + RRDLABEL_FLAG_NEW = (1 << 31) // marks for rrdlabels internal use - they are not exposed outside rrdlabels } RRDLABEL_SRC; -#define RRDLABEL_FLAG_INTERNAL (RRDLABEL_FLAG_OLD | RRDLABEL_FLAG_NEW | RRDLABEL_FLAG_PERMANENT) +#define RRDLABEL_FLAG_INTERNAL (RRDLABEL_FLAG_OLD | RRDLABEL_FLAG_NEW | RRDLABEL_FLAG_DONT_DELETE) size_t text_sanitize(unsigned char *dst, const unsigned char *src, size_t dst_size, unsigned char *char_map, bool utf, const char *empty, size_t *multibyte_length); diff --git a/database/rrdset.c b/database/rrdset.c index 92386f45e..f4bb48aa7 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -5,6 +5,129 @@ #include <sched.h> #include "storage_engine.h" + +void rrdset_metadata_updated(RRDSET *st) { + __atomic_add_fetch(&st->version, 1, __ATOMIC_RELAXED); + rrdcontext_updated_rrdset(st); +} + +// ---------------------------------------------------------------------------- +// RRDSET rrdpush send chart_slots + +static void rrdset_rrdpush_send_chart_slot_assign(RRDSET *st) { + RRDHOST *host = st->rrdhost; + spinlock_lock(&host->rrdpush.send.pluginsd_chart_slots.available.spinlock); + + if(host->rrdpush.send.pluginsd_chart_slots.available.used > 0) + st->rrdpush.sender.chart_slot = + host->rrdpush.send.pluginsd_chart_slots.available.array[--host->rrdpush.send.pluginsd_chart_slots.available.used]; + else + st->rrdpush.sender.chart_slot = ++host->rrdpush.send.pluginsd_chart_slots.last_used; + + spinlock_unlock(&host->rrdpush.send.pluginsd_chart_slots.available.spinlock); +} + +static void rrdset_rrdpush_send_chart_slot_release(RRDSET *st) { + if(!st->rrdpush.sender.chart_slot || st->rrdhost->rrdpush.send.pluginsd_chart_slots.available.ignore) + return; + + RRDHOST *host = st->rrdhost; + spinlock_lock(&host->rrdpush.send.pluginsd_chart_slots.available.spinlock); + + if(host->rrdpush.send.pluginsd_chart_slots.available.used >= host->rrdpush.send.pluginsd_chart_slots.available.size) { + uint32_t old_size = host->rrdpush.send.pluginsd_chart_slots.available.size; + uint32_t new_size = (old_size > 0) ? (old_size * 2) : 1024; + + host->rrdpush.send.pluginsd_chart_slots.available.array = + reallocz(host->rrdpush.send.pluginsd_chart_slots.available.array, new_size * sizeof(uint32_t)); + + host->rrdpush.send.pluginsd_chart_slots.available.size = new_size; + } + + host->rrdpush.send.pluginsd_chart_slots.available.array[host->rrdpush.send.pluginsd_chart_slots.available.used++] = + st->rrdpush.sender.chart_slot; + + st->rrdpush.sender.chart_slot = 0; + spinlock_unlock(&host->rrdpush.send.pluginsd_chart_slots.available.spinlock); +} + +void rrdhost_pluginsd_send_chart_slots_free(RRDHOST *host) { + spinlock_lock(&host->rrdpush.send.pluginsd_chart_slots.available.spinlock); + host->rrdpush.send.pluginsd_chart_slots.available.ignore = true; + freez(host->rrdpush.send.pluginsd_chart_slots.available.array); + host->rrdpush.send.pluginsd_chart_slots.available.array = NULL; + host->rrdpush.send.pluginsd_chart_slots.available.used = 0; + host->rrdpush.send.pluginsd_chart_slots.available.size = 0; + spinlock_unlock(&host->rrdpush.send.pluginsd_chart_slots.available.spinlock); + + // zero all the slots on all charts, so that they will not attempt to access the array + RRDSET *st; + rrdset_foreach_read(st, host) { + st->rrdpush.sender.chart_slot = 0; + } + rrdset_foreach_done(st); +} + +void rrdset_pluginsd_receive_unslot(RRDSET *st) { + for(size_t i = 0; i < st->pluginsd.size ;i++) { + rrddim_acquired_release(st->pluginsd.prd_array[i].rda); // can be NULL + st->pluginsd.prd_array[i].rda = NULL; + st->pluginsd.prd_array[i].rd = NULL; + st->pluginsd.prd_array[i].id = NULL; + } + + RRDHOST *host = st->rrdhost; + + if(st->pluginsd.last_slot >= 0 && + (uint32_t)st->pluginsd.last_slot < host->rrdpush.receive.pluginsd_chart_slots.size && + host->rrdpush.receive.pluginsd_chart_slots.array[st->pluginsd.last_slot] == st) { + host->rrdpush.receive.pluginsd_chart_slots.array[st->pluginsd.last_slot] = NULL; + } + + st->pluginsd.last_slot = -1; + st->pluginsd.dims_with_slots = false; +} + +void rrdset_pluginsd_receive_unslot_and_cleanup(RRDSET *st) { + if(!st) + return; + + spinlock_lock(&st->pluginsd.spinlock); + + rrdset_pluginsd_receive_unslot(st); + + freez(st->pluginsd.prd_array); + st->pluginsd.prd_array = NULL; + st->pluginsd.size = 0; + st->pluginsd.pos = 0; + st->pluginsd.set = false; + st->pluginsd.last_slot = -1; + st->pluginsd.dims_with_slots = false; + st->pluginsd.collector_tid = 0; + + spinlock_unlock(&st->pluginsd.spinlock); +} + +static void rrdset_pluginsd_receive_slots_initialize(RRDSET *st) { + spinlock_init(&st->pluginsd.spinlock); + st->pluginsd.last_slot = -1; +} + +void rrdhost_pluginsd_receive_chart_slots_free(RRDHOST *host) { + spinlock_lock(&host->rrdpush.receive.pluginsd_chart_slots.spinlock); + + if(host->rrdpush.receive.pluginsd_chart_slots.array) { + for (size_t s = 0; s < host->rrdpush.receive.pluginsd_chart_slots.size; s++) + rrdset_pluginsd_receive_unslot_and_cleanup(host->rrdpush.receive.pluginsd_chart_slots.array[s]); + + freez(host->rrdpush.receive.pluginsd_chart_slots.array); + host->rrdpush.receive.pluginsd_chart_slots.array = NULL; + host->rrdpush.receive.pluginsd_chart_slots.size = 0; + } + + spinlock_unlock(&host->rrdpush.receive.pluginsd_chart_slots.spinlock); +} + // ---------------------------------------------------------------------------- // RRDSET name index @@ -39,8 +162,8 @@ static inline RRDSET *rrdset_index_find_name(RRDHOST *host, const char *name) { static inline void rrdset_update_permanent_labels(RRDSET *st) { if(!st->rrdlabels) return; - rrdlabels_add(st->rrdlabels, "_collect_plugin", rrdset_plugin_name(st), RRDLABEL_SRC_AUTO| RRDLABEL_FLAG_PERMANENT); - rrdlabels_add(st->rrdlabels, "_collect_module", rrdset_module_name(st), RRDLABEL_SRC_AUTO| RRDLABEL_FLAG_PERMANENT); + rrdlabels_add(st->rrdlabels, "_collect_plugin", rrdset_plugin_name(st), RRDLABEL_SRC_AUTO | RRDLABEL_FLAG_DONT_DELETE); + rrdlabels_add(st->rrdlabels, "_collect_module", rrdset_module_name(st), RRDLABEL_SRC_AUTO | RRDLABEL_FLAG_DONT_DELETE); } static STRING *rrdset_fix_name(RRDHOST *host, const char *chart_full_id, const char *type, const char *current_name, const char *name) { @@ -64,7 +187,7 @@ static STRING *rrdset_fix_name(RRDHOST *host, const char *chart_full_id, const c i++; } while (rrdset_index_find_name(host, new_name)); - netdata_log_info("RRDSET: using name '%s' for chart '%s' on host '%s'.", new_name, full_name, rrdhost_hostname(host)); +// netdata_log_info("RRDSET: using name '%s' for chart '%s' on host '%s'.", new_name, full_name, rrdhost_hostname(host)); } else return NULL; @@ -135,6 +258,8 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v st->chart_type = ctr->chart_type; st->rrdhost = host; + rrdset_rrdpush_send_chart_slot_assign(st); + spinlock_init(&st->data_collection_lock); st->flags = RRDSET_FLAG_SYNC_CLOCK @@ -179,13 +304,13 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v st->green = NAN; st->red = NAN; + rrdset_pluginsd_receive_slots_initialize(st); + ctr->react_action = RRDSET_REACT_NEW; ml_chart_new(st); } -void pluginsd_rrdset_cleanup(RRDSET *st); - void rrdset_finalize_collection(RRDSET *st, bool dimensions_too) { RRDHOST *host = st->rrdhost; @@ -208,7 +333,7 @@ void rrdset_finalize_collection(RRDSET *st, bool dimensions_too) { } } - pluginsd_rrdset_cleanup(st); + rrdset_pluginsd_receive_unslot_and_cleanup(st); } // the destructor - the dictionary is write locked while this runs @@ -220,6 +345,8 @@ static void rrdset_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v rrdset_finalize_collection(st, false); + rrdset_rrdpush_send_chart_slot_release(st); + // remove it from the name index rrdset_index_del_name(host, st); @@ -288,7 +415,7 @@ static bool rrdset_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, struct rrdset_constructor *ctr = constructor_data; RRDSET *st = rrdset; - rrdset_isnot_obsolete(st); + rrdset_isnot_obsolete___safe_from_collector_thread(st); ctr->react_action = RRDSET_REACT_NONE; @@ -363,7 +490,6 @@ static bool rrdset_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, rrdset_update_permanent_labels(st); rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); return ctr->react_action != RRDSET_REACT_NONE; } @@ -389,10 +515,9 @@ static void rrdset_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo } rrdset_flag_set(st, RRDSET_FLAG_METADATA_UPDATE); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_METADATA_UPDATE); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); } - rrdcontext_updated_rrdset(st); + rrdset_metadata_updated(st); } void rrdset_index_init(RRDHOST *host) { @@ -543,7 +668,7 @@ int rrdset_reset_name(RRDSET *st, const char *name) { rrdset_flag_clear(st, RRDSET_FLAG_EXPORTING_IGNORE); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_SEND); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_IGNORE); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrdset_metadata_updated(st); rrdcontext_updated_rrdset_name(st); return 2; @@ -652,14 +777,19 @@ void rrdset_get_retention_of_tier_for_collected_chart(RRDSET *st, time_t *first_ *last_time_s = db_last_entry_s; } -inline void rrdset_is_obsolete(RRDSET *st) { +inline void rrdset_is_obsolete___safe_from_collector_thread(RRDSET *st) { + rrdset_pluginsd_receive_unslot(st); + if(unlikely(!(rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE)))) { +// netdata_log_info("Setting obsolete flag on chart 'host:%s/chart:%s'", +// rrdhost_hostname(st->rrdhost), rrdset_id(st)); + rrdset_flag_set(st, RRDSET_FLAG_OBSOLETE); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_PENDING_OBSOLETE_CHARTS); st->last_accessed_time_s = now_realtime_sec(); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrdset_metadata_updated(st); // the chart will not get more updates (data collection) // so, we have to push its definition now @@ -668,12 +798,16 @@ inline void rrdset_is_obsolete(RRDSET *st) { } } -inline void rrdset_isnot_obsolete(RRDSET *st) { +inline void rrdset_isnot_obsolete___safe_from_collector_thread(RRDSET *st) { if(unlikely((rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE)))) { + +// netdata_log_info("Clearing obsolete flag on chart 'host:%s/chart:%s'", +// rrdhost_hostname(st->rrdhost), rrdset_id(st)); + rrdset_flag_clear(st, RRDSET_FLAG_OBSOLETE); st->last_accessed_time_s = now_realtime_sec(); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + rrdset_metadata_updated(st); // the chart will be pushed upstream automatically // due to data collection @@ -1189,6 +1323,14 @@ void rrddim_store_metric_with_trace(RRDDIM *rd, usec_t point_end_time_ut, NETDAT #else // !NETDATA_LOG_COLLECTION_ERRORS void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, SN_FLAGS flags) { #endif // !NETDATA_LOG_COLLECTION_ERRORS + + static __thread struct log_stack_entry lgs[] = { + [0] = ND_LOG_FIELD_STR(NDF_NIDL_DIMENSION, NULL), + [1] = ND_LOG_FIELD_END(), + }; + lgs[0].str = rd->id; + log_stack_push(lgs); + #ifdef NETDATA_LOG_COLLECTION_ERRORS rd->rrddim_store_metric_count++; @@ -1250,6 +1392,7 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, } rrdcontext_collected_rrddim(rd); + log_stack_pop(&lgs); } void store_metric_collection_completed() { @@ -1528,12 +1671,12 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) if (unlikely(rrdset_flags & RRDSET_FLAG_OBSOLETE)) { netdata_log_error("Chart '%s' has the OBSOLETE flag set, but it is collected.", rrdset_id(st)); - rrdset_isnot_obsolete(st); + rrdset_isnot_obsolete___safe_from_collector_thread(st); } // check if the chart has a long time to be updated if(unlikely(st->usec_since_last_update > MAX(st->db.entries, 60) * update_every_ut)) { - netdata_log_info("host '%s', chart '%s': took too long to be updated (counter #%u, update #%u, %0.3" NETDATA_DOUBLE_MODIFIER + nd_log_daemon(NDLP_DEBUG, "host '%s', chart '%s': took too long to be updated (counter #%u, update #%u, %0.3" NETDATA_DOUBLE_MODIFIER " secs). Resetting it.", rrdhost_hostname(st->rrdhost), rrdset_id(st), st->counter, st->counter_done, (NETDATA_DOUBLE)st->usec_since_last_update / USEC_PER_SEC); rrdset_reset(st); @@ -1675,7 +1818,7 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) if(unlikely(rrddim_flag_check(rd, RRDDIM_FLAG_OBSOLETE))) { netdata_log_error("Dimension %s in chart '%s' has the OBSOLETE flag set, but it is collected.", rrddim_name(rd), rrdset_id(st)); - rrddim_isnot_obsolete(st, rd); + rrddim_isnot_obsolete___safe_from_collector_thread(st, rd); } } } diff --git a/database/rrdvar.c b/database/rrdvar.c index 09c4d404d..68d22abb9 100644 --- a/database/rrdvar.c +++ b/database/rrdvar.c @@ -315,7 +315,6 @@ int health_variable_lookup(STRING *variable, RRDCALC *rc, NETDATA_DOUBLE *result struct variable2json_helper { BUFFER *buf; - size_t counter; RRDVAR_FLAGS options; }; @@ -326,47 +325,54 @@ static int single_variable2json_callback(const DICTIONARY_ITEM *item __maybe_unu if (helper->options == RRDVAR_FLAG_NONE || rrdvar_flags(rva) & helper->options) { if(unlikely(isnan(value) || isinf(value))) - buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": null", helper->counter?",":"", rrdvar_name(rva)); + buffer_json_member_add_string(helper->buf, rrdvar_name(rva), NULL); else - buffer_sprintf(helper->buf, "%s\n\t\t\"%s\": %0.5" NETDATA_DOUBLE_MODIFIER, helper->counter?",":"", rrdvar_name(rva), (NETDATA_DOUBLE)value); - - helper->counter++; + buffer_json_member_add_double(helper->buf, rrdvar_name(rva), (NETDATA_DOUBLE)value); } return 0; } void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf) { - struct variable2json_helper helper = { - .buf = buf, - .counter = 0, - .options = RRDVAR_FLAG_CUSTOM_CHART_VAR}; + struct variable2json_helper helper = {.buf = buf, .options = RRDVAR_FLAG_CUSTOM_CHART_VAR}; - buffer_sprintf(buf, "{"); rrdvar_walkthrough_read(st->rrdvars, single_variable2json_callback, &helper); - buffer_strcat(buf, "\n\t\t\t}"); } void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf) { RRDHOST *host = st->rrdhost; - struct variable2json_helper helper = { - .buf = buf, - .counter = 0, - .options = RRDVAR_FLAG_NONE}; + struct variable2json_helper helper = {.buf = buf, .options = RRDVAR_FLAG_NONE}; - buffer_sprintf(buf, "{\n\t\"chart\": \"%s\",\n\t\"chart_name\": \"%s\",\n\t\"chart_context\": \"%s\",\n\t\"chart_variables\": {", rrdset_id(st), rrdset_name(st), rrdset_context(st)); - rrdvar_walkthrough_read(st->rrdvars, single_variable2json_callback, &helper); + buffer_json_initialize(buf, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + + buffer_json_member_add_string(buf, "chart", rrdset_id(st)); + buffer_json_member_add_string(buf, "chart_name", rrdset_name(st)); + buffer_json_member_add_string(buf, "chart_context", rrdset_context(st)); + + { + buffer_json_member_add_object(buf, "chart_variables"); + rrdvar_walkthrough_read(st->rrdvars, single_variable2json_callback, &helper); + buffer_json_object_close(buf); + } - buffer_sprintf(buf, "\n\t},\n\t\"family\": \"%s\",\n\t\"family_variables\": {", rrdset_family(st)); - helper.counter = 0; - rrdvar_walkthrough_read(rrdfamily_rrdvars_dict(st->rrdfamily), single_variable2json_callback, &helper); + buffer_json_member_add_string(buf, "family", rrdset_family(st)); - buffer_sprintf(buf, "\n\t},\n\t\"host\": \"%s\",\n\t\"host_variables\": {", rrdhost_hostname(host)); - helper.counter = 0; - rrdvar_walkthrough_read(host->rrdvars, single_variable2json_callback, &helper); + { + buffer_json_member_add_object(buf, "family_variables"); + rrdvar_walkthrough_read(rrdfamily_rrdvars_dict(st->rrdfamily), single_variable2json_callback, &helper); + buffer_json_object_close(buf); + } + + buffer_json_member_add_string(buf, "host", rrdhost_hostname(host)); + + { + buffer_json_member_add_object(buf, "host_variables"); + rrdvar_walkthrough_read(host->rrdvars, single_variable2json_callback, &helper); + buffer_json_object_close(buf); + } - buffer_strcat(buf, "\n\t}\n}\n"); + buffer_json_finalize(buf); } // ---------------------------------------------------------------------------- diff --git a/database/sqlite/sqlite_aclk.c b/database/sqlite/sqlite_aclk.c index 1298045c2..ac574879c 100644 --- a/database/sqlite/sqlite_aclk.c +++ b/database/sqlite/sqlite_aclk.c @@ -11,60 +11,46 @@ struct aclk_sync_config_s { uv_timer_t timer_req; time_t cleanup_after; // Start a cleanup after this timestamp uv_async_t async; - /* FIFO command queue */ - uv_mutex_t cmd_mutex; - uv_cond_t cmd_cond; bool initialized; - volatile unsigned queue_size; - struct aclk_database_cmdqueue cmd_queue; + SPINLOCK cmd_queue_lock; + struct aclk_database_cmd *cmd_base; } aclk_sync_config = { 0 }; - void sanity_check(void) { // make sure the compiler will stop on misconfigurations BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < ACLK_MAX_ENUMERATIONS_DEFINED); } - -int aclk_database_enq_cmd_noblock(struct aclk_database_cmd *cmd) +static struct aclk_database_cmd aclk_database_deq_cmd(void) { - unsigned queue_size; + struct aclk_database_cmd ret; - /* wait for free space in queue */ - uv_mutex_lock(&aclk_sync_config.cmd_mutex); - if ((queue_size = aclk_sync_config.queue_size) == ACLK_DATABASE_CMD_Q_MAX_SIZE) { - uv_mutex_unlock(&aclk_sync_config.cmd_mutex); - return 1; + spinlock_lock(&aclk_sync_config.cmd_queue_lock); + if(aclk_sync_config.cmd_base) { + struct aclk_database_cmd *t = aclk_sync_config.cmd_base; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(aclk_sync_config.cmd_base, t, prev, next); + ret = *t; + freez(t); } + else { + ret.opcode = ACLK_DATABASE_NOOP; + ret.completion = NULL; + } + spinlock_unlock(&aclk_sync_config.cmd_queue_lock); - fatal_assert(queue_size < ACLK_DATABASE_CMD_Q_MAX_SIZE); - /* enqueue command */ - aclk_sync_config.cmd_queue.cmd_array[aclk_sync_config.cmd_queue.tail] = *cmd; - aclk_sync_config.cmd_queue.tail = aclk_sync_config.cmd_queue.tail != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? - aclk_sync_config.cmd_queue.tail + 1 : 0; - aclk_sync_config.queue_size = queue_size + 1; - uv_mutex_unlock(&aclk_sync_config.cmd_mutex); - return 0; + return ret; } static void aclk_database_enq_cmd(struct aclk_database_cmd *cmd) { - unsigned queue_size; + struct aclk_database_cmd *t = mallocz(sizeof(*t)); + *t = *cmd; + t->prev = t->next = NULL; + + spinlock_lock(&aclk_sync_config.cmd_queue_lock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(aclk_sync_config.cmd_base, t, prev, next); + spinlock_unlock(&aclk_sync_config.cmd_queue_lock); - /* wait for free space in queue */ - uv_mutex_lock(&aclk_sync_config.cmd_mutex); - while ((queue_size = aclk_sync_config.queue_size) == ACLK_DATABASE_CMD_Q_MAX_SIZE) { - uv_cond_wait(&aclk_sync_config.cmd_cond, &aclk_sync_config.cmd_mutex); - } - fatal_assert(queue_size < ACLK_DATABASE_CMD_Q_MAX_SIZE); - /* enqueue command */ - aclk_sync_config.cmd_queue.cmd_array[aclk_sync_config.cmd_queue.tail] = *cmd; - aclk_sync_config.cmd_queue.tail = aclk_sync_config.cmd_queue.tail != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? - aclk_sync_config.cmd_queue.tail + 1 : 0; - aclk_sync_config.queue_size = queue_size + 1; - uv_mutex_unlock(&aclk_sync_config.cmd_mutex); - - /* wake up event loop */ (void) uv_async_send(&aclk_sync_config.async); } @@ -84,6 +70,8 @@ enum { IDX_PROGRAM_VERSION, IDX_ENTRIES, IDX_HEALTH_ENABLED, + IDX_LAST_CONNECTED, + IDX_IS_EPHEMERAL, }; static int create_host_callback(void *data, int argc, char **argv, char **column) @@ -92,9 +80,31 @@ static int create_host_callback(void *data, int argc, char **argv, char **column UNUSED(argc); UNUSED(column); + time_t last_connected = + (time_t)(argv[IDX_LAST_CONNECTED] ? str2uint64_t(argv[IDX_LAST_CONNECTED], NULL) : 0); + + if (!last_connected) + last_connected = now_realtime_sec(); + + time_t age = now_realtime_sec() - last_connected; + int is_ephemeral = 0; + + if (argv[IDX_IS_EPHEMERAL]) + is_ephemeral = str2i(argv[IDX_IS_EPHEMERAL]); + char guid[UUID_STR_LEN]; uuid_unparse_lower(*(uuid_t *)argv[IDX_HOST_ID], guid); + if (is_ephemeral && age > rrdhost_free_ephemeral_time_s) { + netdata_log_info( + "Skipping ephemeral hostname \"%s\" with GUID \"%s\", age = %ld seconds (limit %ld seconds)", + (const char *)argv[IDX_HOSTNAME], + guid, + age, + rrdhost_free_ephemeral_time_s); + return 0; + } + struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info)); __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED); @@ -103,32 +113,48 @@ static int create_host_callback(void *data, int argc, char **argv, char **column sql_build_host_system_info((uuid_t *)argv[IDX_HOST_ID], system_info); RRDHOST *host = rrdhost_find_or_create( - (const char *) argv[IDX_HOSTNAME] - , (const char *) argv[IDX_REGISTRY] - , guid - , (const char *) argv[IDX_OS] - , (const char *) argv[IDX_TIMEZONE] - , (const char *) argv[IDX_ABBREV_TIMEZONE] - , (int32_t) (argv[IDX_UTC_OFFSET] ? str2uint32_t(argv[IDX_UTC_OFFSET], NULL) : 0) - , (const char *) argv[IDX_TAGS] - , (const char *) (argv[IDX_PROGRAM_NAME] ? argv[IDX_PROGRAM_NAME] : "unknown") - , (const char *) (argv[IDX_PROGRAM_VERSION] ? argv[IDX_PROGRAM_VERSION] : "unknown") - , argv[IDX_UPDATE_EVERY] ? str2i(argv[IDX_UPDATE_EVERY]) : 1 - , argv[IDX_ENTRIES] ? str2i(argv[IDX_ENTRIES]) : 0 - , default_rrd_memory_mode - , 0 // health - , 0 // rrdpush enabled - , NULL //destination - , NULL // api key - , NULL // send charts matching - , false // rrdpush_enable_replication - , 0 // rrdpush_seconds_to_replicate - , 0 // rrdpush_replication_step - , system_info - , 1 - ); - if (likely(host)) + (const char *)argv[IDX_HOSTNAME], + (const char *)argv[IDX_REGISTRY], + guid, + (const char *)argv[IDX_OS], + (const char *)argv[IDX_TIMEZONE], + (const char *)argv[IDX_ABBREV_TIMEZONE], + (int32_t)(argv[IDX_UTC_OFFSET] ? str2uint32_t(argv[IDX_UTC_OFFSET], NULL) : 0), + (const char *)argv[IDX_TAGS], + (const char *)(argv[IDX_PROGRAM_NAME] ? argv[IDX_PROGRAM_NAME] : "unknown"), + (const char *)(argv[IDX_PROGRAM_VERSION] ? argv[IDX_PROGRAM_VERSION] : "unknown"), + argv[IDX_UPDATE_EVERY] ? str2i(argv[IDX_UPDATE_EVERY]) : 1, + argv[IDX_ENTRIES] ? str2i(argv[IDX_ENTRIES]) : 0, + default_rrd_memory_mode, + 0 // health + , + 0 // rrdpush enabled + , + NULL //destination + , + NULL // api key + , + NULL // send charts matching + , + false // rrdpush_enable_replication + , + 0 // rrdpush_seconds_to_replicate + , + 0 // rrdpush_replication_step + , + system_info, + 1); + + if (likely(host)) { + if (is_ephemeral) + rrdhost_option_set(host, RRDHOST_OPTION_EPHEMERAL_HOST); + + if (is_ephemeral) + host->child_disconnected_time = now_realtime_sec(); + host->rrdlabels = sql_load_host_labels((uuid_t *)argv[IDX_HOST_ID]); + host->last_connected = last_connected; + } (*number_of_chidren)++; @@ -136,43 +162,14 @@ static int create_host_callback(void *data, int argc, char **argv, char **column char node_str[UUID_STR_LEN] = "<none>"; if (likely(host->node_id)) uuid_unparse_lower(*host->node_id, node_str); - internal_error(true, "Adding archived host \"%s\" with GUID \"%s\" node id = \"%s\"", rrdhost_hostname(host), host->machine_guid, node_str); + internal_error(true, "Adding archived host \"%s\" with GUID \"%s\" node id = \"%s\" ephemeral=%d", rrdhost_hostname(host), host->machine_guid, node_str, is_ephemeral); #endif return 0; } #ifdef ENABLE_ACLK -static struct aclk_database_cmd aclk_database_deq_cmd(void) -{ - struct aclk_database_cmd ret; - unsigned queue_size; - - uv_mutex_lock(&aclk_sync_config.cmd_mutex); - queue_size = aclk_sync_config.queue_size; - if (queue_size == 0) { - memset(&ret, 0, sizeof(ret)); - ret.opcode = ACLK_DATABASE_NOOP; - ret.completion = NULL; - - } else { - /* dequeue command */ - ret = aclk_sync_config.cmd_queue.cmd_array[aclk_sync_config.cmd_queue.head]; - if (queue_size == 1) { - aclk_sync_config.cmd_queue.head = aclk_sync_config.cmd_queue.tail = 0; - } else { - aclk_sync_config.cmd_queue.head = aclk_sync_config.cmd_queue.head != ACLK_DATABASE_CMD_Q_MAX_SIZE - 1 ? - aclk_sync_config.cmd_queue.head + 1 : 0; - } - aclk_sync_config.queue_size = queue_size - 1; - /* wake up producers */ - uv_cond_signal(&aclk_sync_config.cmd_cond); - } - uv_mutex_unlock(&aclk_sync_config.cmd_mutex); - return ret; -} - -#define SQL_SELECT_HOST_BY_UUID "SELECT host_id FROM host WHERE host_id = @host_id;" +#define SQL_SELECT_HOST_BY_UUID "SELECT host_id FROM host WHERE host_id = @host_id" static int is_host_available(uuid_t *host_id) { sqlite3_stmt *res = NULL; @@ -231,7 +228,7 @@ static void sql_delete_aclk_table_list(char *host_guid) BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite); buffer_sprintf(sql,"SELECT 'drop '||type||' IF EXISTS '||name||';' FROM sqlite_schema " \ - "WHERE name LIKE 'aclk_%%_%s' AND type IN ('table', 'trigger', 'index');", uuid_str); + "WHERE name LIKE 'aclk_%%_%s' AND type IN ('table', 'trigger', 'index')", uuid_str); rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); if (rc != SQLITE_OK) { @@ -255,18 +252,63 @@ fail: buffer_free(sql); } +// OPCODE: ACLK_DATABASE_NODE_UNREGISTER +static void sql_unregister_node(char *machine_guid) +{ + int rc; + uuid_t host_uuid; + + if (unlikely(!machine_guid)) + return; + + rc = uuid_parse(machine_guid, host_uuid); + if (rc) { + freez(machine_guid); + return; + } + + sqlite3_stmt *res = NULL; + + rc = sqlite3_prepare_v2(db_meta, "UPDATE node_instance SET node_id = NULL WHERE host_id = @host_id", -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to remove the host node id"); + freez(machine_guid); + return; + } + + rc = sqlite3_bind_blob(res, 1, &host_uuid, sizeof(host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter to remove host node id"); + goto skip; + } + rc = sqlite3_step_monitored(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("Failed to execute command to remove host node id"); + } else { + // node: machine guid will be freed after processing + metadata_delete_host_chart_labels(machine_guid); + machine_guid = NULL; + } + +skip: + if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) + error_report("Failed to finalize statement to remove host node id"); + freez(machine_guid); +} + + static int sql_check_aclk_table(void *data __maybe_unused, int argc __maybe_unused, char **argv __maybe_unused, char **column __maybe_unused) { struct aclk_database_cmd cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = ACLK_DATABASE_DELETE_HOST; cmd.param[0] = strdupz((char *) argv[0]); - aclk_database_enq_cmd_noblock(&cmd); + aclk_database_enq_cmd(&cmd); return 0; } #define SQL_SELECT_ACLK_ACTIVE_LIST "SELECT REPLACE(SUBSTR(name,19),'_','-') FROM sqlite_schema " \ - "WHERE name LIKE 'aclk_chart_latest_%' AND type IN ('table');" + "WHERE name LIKE 'aclk_chart_latest_%' AND type IN ('table')" static void sql_check_aclk_table_list(void) { @@ -278,19 +320,18 @@ static void sql_check_aclk_table_list(void) } } -#define SQL_ALERT_CLEANUP "DELETE FROM aclk_alert_%s WHERE date_submitted IS NOT NULL AND CAST(date_cloud_ack AS INT) < unixepoch()-%d;" +#define SQL_ALERT_CLEANUP "DELETE FROM aclk_alert_%s WHERE date_submitted IS NOT NULL AND CAST(date_cloud_ack AS INT) < unixepoch()-%d" static int sql_maint_aclk_sync_database(void *data __maybe_unused, int argc __maybe_unused, char **argv, char **column __maybe_unused) { - char sql[512]; - snprintfz(sql,511, SQL_ALERT_CLEANUP, (char *) argv[0], ACLK_DELETE_ACK_ALERTS_INTERNAL); + char sql[ACLK_SYNC_QUERY_SIZE]; + snprintfz(sql,sizeof(sql) - 1, SQL_ALERT_CLEANUP, (char *) argv[0], ACLK_DELETE_ACK_ALERTS_INTERNAL); if (unlikely(db_execute(db_meta, sql))) error_report("Failed to clean stale ACLK alert entries"); return 0; } - -#define SQL_SELECT_ACLK_ALERT_LIST "SELECT SUBSTR(name,12) FROM sqlite_schema WHERE name LIKE 'aclk_alert_%' AND type IN ('table');" +#define SQL_SELECT_ACLK_ALERT_LIST "SELECT SUBSTR(name,12) FROM sqlite_schema WHERE name LIKE 'aclk_alert_%' AND type IN ('table')" static void sql_maint_aclk_sync_database_all(void) { @@ -304,7 +345,7 @@ static void sql_maint_aclk_sync_database_all(void) static int aclk_config_parameters(void *data __maybe_unused, int argc __maybe_unused, char **argv, char **column __maybe_unused) { - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower(*((uuid_t *) argv[0]), uuid_str); RRDHOST *host = rrdhost_find_by_guid(uuid_str); @@ -332,18 +373,15 @@ static void timer_cb(uv_timer_t *handle) struct aclk_database_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - time_t now = now_realtime_sec(); - - if (config->cleanup_after && config->cleanup_after < now) { + if (config->cleanup_after < now_realtime_sec()) { cmd.opcode = ACLK_DATABASE_CLEANUP; - if (!aclk_database_enq_cmd_noblock(&cmd)) - config->cleanup_after += ACLK_DATABASE_CLEANUP_INTERVAL; + aclk_database_enq_cmd(&cmd); + config->cleanup_after += ACLK_DATABASE_CLEANUP_INTERVAL; } if (aclk_connected) { cmd.opcode = ACLK_DATABASE_PUSH_ALERT; - aclk_database_enq_cmd_noblock(&cmd); - + aclk_database_enq_cmd(&cmd); aclk_check_node_info_and_collectors(); } } @@ -414,12 +452,16 @@ static void aclk_synchronization(void *arg __maybe_unused) case ACLK_DATABASE_NODE_STATE:; RRDHOST *host = cmd.param[0]; int live = (host == localhost || host->receiver || !(rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN))) ? 1 : 0; - struct aclk_sync_host_config *ahc = host->aclk_sync_host_config; + struct aclk_sync_cfg_t *ahc = host->aclk_config; if (unlikely(!ahc)) sql_create_aclk_table(host, &host->host_uuid, host->node_id); - aclk_host_state_update(host, live); + aclk_host_state_update(host, live, 1); break; -// ALERTS + case ACLK_DATABASE_NODE_UNREGISTER: + sql_unregister_node(cmd.param[0]); + + break; + // ALERTS case ACLK_DATABASE_PUSH_ALERT_CONFIG: aclk_push_alert_config_event(cmd.param[0], cmd.param[1]); break; @@ -444,8 +486,6 @@ static void aclk_synchronization(void *arg __maybe_unused) uv_close((uv_handle_t *)&config->timer_req, NULL); uv_close((uv_handle_t *)&config->async, NULL); -// uv_close((uv_handle_t *)&config->async_exit, NULL); - uv_cond_destroy(&config->cmd_cond); (void) uv_loop_close(loop); worker_unregister(); @@ -455,11 +495,7 @@ static void aclk_synchronization(void *arg __maybe_unused) static void aclk_synchronization_init(void) { - aclk_sync_config.cmd_queue.head = aclk_sync_config.cmd_queue.tail = 0; - aclk_sync_config.queue_size = 0; - fatal_assert(0 == uv_cond_init(&aclk_sync_config.cmd_cond)); - fatal_assert(0 == uv_mutex_init(&aclk_sync_config.cmd_mutex)); - + memset(&aclk_sync_config, 0, sizeof(aclk_sync_config)); fatal_assert(0 == uv_thread_create(&aclk_sync_config.thread, aclk_synchronization, &aclk_sync_config)); } #endif @@ -469,8 +505,8 @@ static void aclk_synchronization_init(void) void sql_create_aclk_table(RRDHOST *host __maybe_unused, uuid_t *host_uuid __maybe_unused, uuid_t *node_id __maybe_unused) { #ifdef ENABLE_ACLK - char uuid_str[GUID_LEN + 1]; - char host_guid[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; + char host_guid[UUID_STR_LEN]; int rc; uuid_unparse_lower_fix(host_uuid, uuid_str); @@ -478,37 +514,34 @@ void sql_create_aclk_table(RRDHOST *host __maybe_unused, uuid_t *host_uuid __may char sql[ACLK_SYNC_QUERY_SIZE]; - snprintfz(sql, ACLK_SYNC_QUERY_SIZE-1, TABLE_ACLK_ALERT, uuid_str); + snprintfz(sql, sizeof(sql) - 1, TABLE_ACLK_ALERT, uuid_str); rc = db_execute(db_meta, sql); if (unlikely(rc)) error_report("Failed to create ACLK alert table for host %s", host ? rrdhost_hostname(host) : host_guid); else { - snprintfz(sql, ACLK_SYNC_QUERY_SIZE -1, INDEX_ACLK_ALERT, uuid_str, uuid_str); + snprintfz(sql, sizeof(sql) - 1, INDEX_ACLK_ALERT1, uuid_str, uuid_str); rc = db_execute(db_meta, sql); if (unlikely(rc)) - error_report("Failed to create ACLK alert table index for host %s", host ? string2str(host->hostname) : host_guid); + error_report( + "Failed to create ACLK alert table index 1 for host %s", host ? string2str(host->hostname) : host_guid); - snprintfz(sql, ACLK_SYNC_QUERY_SIZE -1, INDEX_ACLK_ALERT1, uuid_str, uuid_str); + snprintfz(sql, sizeof(sql) - 1, INDEX_ACLK_ALERT2, uuid_str, uuid_str); rc = db_execute(db_meta, sql); if (unlikely(rc)) - error_report("Failed to create ACLK alert table index 1 for host %s", host ? string2str(host->hostname) : host_guid); - - snprintfz(sql, ACLK_SYNC_QUERY_SIZE -1, INDEX_ACLK_ALERT2, uuid_str, uuid_str); - rc = db_execute(db_meta, sql); - if (unlikely(rc)) - error_report("Failed to create ACLK alert table index 2 for host %s", host ? string2str(host->hostname) : host_guid); + error_report( + "Failed to create ACLK alert table index 2 for host %s", host ? string2str(host->hostname) : host_guid); } - if (likely(host) && unlikely(host->aclk_sync_host_config)) + if (likely(host) && unlikely(host->aclk_config)) return; if (unlikely(!host)) return; - struct aclk_sync_host_config *wc = callocz(1, sizeof(struct aclk_sync_host_config)); + struct aclk_sync_cfg_t *wc = callocz(1, sizeof(struct aclk_sync_cfg_t)); if (node_id && !uuid_is_null(*node_id)) uuid_unparse_lower(*node_id, wc->node_id); - host->aclk_sync_host_config = (void *)wc; + host->aclk_config = wc; if (node_id && !host->node_id) { host->node_id = mallocz(sizeof(*host->node_id)); uuid_copy(*host->node_id, *node_id); @@ -522,12 +555,18 @@ void sql_create_aclk_table(RRDHOST *host __maybe_unused, uuid_t *host_uuid __may #endif } -#define SQL_FETCH_ALL_HOSTS "SELECT host_id, hostname, registry_hostname, update_every, os, " \ - "timezone, tags, hops, memory_mode, abbrev_timezone, utc_offset, program_name, " \ - "program_version, entries, health_enabled FROM host WHERE hops >0;" +#define SQL_FETCH_ALL_HOSTS \ + "SELECT host_id, hostname, registry_hostname, update_every, os, " \ + "timezone, tags, hops, memory_mode, abbrev_timezone, utc_offset, program_name, " \ + "program_version, entries, health_enabled, last_connected, " \ + "(SELECT CASE WHEN hl.label_value = 'true' THEN 1 ELSE 0 END FROM " \ + "host_label hl WHERE hl.host_id = h.host_id AND hl.label_key = '_is_ephemeral') " \ + "FROM host h WHERE hops > 0" + +#define SQL_FETCH_ALL_INSTANCES \ + "SELECT ni.host_id, ni.node_id FROM host h, node_instance ni " \ + "WHERE h.host_id = ni.host_id AND ni.node_id IS NOT NULL" -#define SQL_FETCH_ALL_INSTANCES "SELECT ni.host_id, ni.node_id FROM host h, node_instance ni " \ - "WHERE h.host_id = ni.host_id AND ni.node_id IS NOT NULL; " void sql_aclk_sync_init(void) { char *err_msg = NULL; @@ -622,3 +661,18 @@ void schedule_node_info_update(RRDHOST *host __maybe_unused) aclk_database_enq_cmd(&cmd); #endif } + +#ifdef ENABLE_ACLK +void unregister_node(const char *machine_guid) +{ + if (unlikely(!machine_guid)) + return; + + struct aclk_database_cmd cmd; + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = ACLK_DATABASE_NODE_UNREGISTER; + cmd.param[0] = strdupz(machine_guid); + cmd.completion = NULL; + aclk_database_enq_cmd(&cmd); +} +#endif
\ No newline at end of file diff --git a/database/sqlite/sqlite_aclk.h b/database/sqlite/sqlite_aclk.h index 850ca434e..0db2647bf 100644 --- a/database/sqlite/sqlite_aclk.h +++ b/database/sqlite/sqlite_aclk.h @@ -5,14 +5,13 @@ #include "sqlite3.h" - #ifndef ACLK_MAX_CHART_BATCH #define ACLK_MAX_CHART_BATCH (200) #endif #ifndef ACLK_MAX_CHART_BATCH_COUNT #define ACLK_MAX_CHART_BATCH_COUNT (10) #endif -#define ACLK_MAX_ALERT_UPDATES (5) +#define ACLK_MAX_ALERT_UPDATES "5" #define ACLK_DATABASE_CLEANUP_FIRST (1200) #define ACLK_DATABASE_CLEANUP_INTERVAL (3600) #define ACLK_DELETE_ACK_ALERTS_INTERNAL (86400) @@ -41,13 +40,13 @@ static inline int claimed() return localhost->aclk_state.claimed_id != NULL; } -#define TABLE_ACLK_ALERT "CREATE TABLE IF NOT EXISTS aclk_alert_%s (sequence_id INTEGER PRIMARY KEY, " \ - "alert_unique_id, date_created, date_submitted, date_cloud_ack, filtered_alert_unique_id NOT NULL, " \ - "unique(alert_unique_id));" +#define TABLE_ACLK_ALERT \ + "CREATE TABLE IF NOT EXISTS aclk_alert_%s (sequence_id INTEGER PRIMARY KEY, " \ + "alert_unique_id, date_created, date_submitted, date_cloud_ack, filtered_alert_unique_id NOT NULL, " \ + "UNIQUE(alert_unique_id))" -#define INDEX_ACLK_ALERT "CREATE INDEX IF NOT EXISTS aclk_alert_index_%s ON aclk_alert_%s (alert_unique_id);" -#define INDEX_ACLK_ALERT1 "CREATE INDEX IF NOT EXISTS aclk_alert_index1_%s ON aclk_alert_%s (filtered_alert_unique_id);" -#define INDEX_ACLK_ALERT2 "CREATE INDEX IF NOT EXISTS aclk_alert_index2_%s ON aclk_alert_%s (date_submitted);" +#define INDEX_ACLK_ALERT1 "CREATE INDEX IF NOT EXISTS aclk_alert_index1_%s ON aclk_alert_%s (filtered_alert_unique_id)" +#define INDEX_ACLK_ALERT2 "CREATE INDEX IF NOT EXISTS aclk_alert_index2_%s ON aclk_alert_%s (date_submitted)" enum aclk_database_opcode { ACLK_DATABASE_NOOP = 0, @@ -60,6 +59,7 @@ enum aclk_database_opcode { ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, ACLK_DATABASE_PUSH_ALERT_CHECKPOINT, ACLK_DATABASE_QUEUE_REMOVED_ALERTS, + ACLK_DATABASE_NODE_UNREGISTER, ACLK_DATABASE_TIMER, // leave this last @@ -71,16 +71,10 @@ struct aclk_database_cmd { enum aclk_database_opcode opcode; void *param[2]; struct completion *completion; + struct aclk_database_cmd *prev, *next; }; -#define ACLK_DATABASE_CMD_Q_MAX_SIZE (1024) - -struct aclk_database_cmdqueue { - unsigned head, tail; - struct aclk_database_cmd cmd_array[ACLK_DATABASE_CMD_Q_MAX_SIZE]; -}; - -struct aclk_sync_host_config { +typedef struct aclk_sync_cfg_t { RRDHOST *host; int alert_updates; int alert_checkpoint_req; @@ -92,17 +86,16 @@ struct aclk_sync_host_config { char *alerts_snapshot_uuid; // will contain the snapshot_uuid value if snapshot was requested uint64_t alerts_log_first_sequence_id; uint64_t alerts_log_last_sequence_id; -}; - -extern sqlite3 *db_meta; +} aclk_sync_cfg_t; -int aclk_database_enq_cmd_noblock(struct aclk_database_cmd *cmd); void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id); void sql_aclk_sync_init(void); void aclk_push_alert_config(const char *node_id, const char *config_hash); void aclk_push_node_alert_snapshot(const char *node_id); -void aclk_push_node_health_log(const char *node_id); void aclk_push_node_removed_alerts(const char *node_id); void schedule_node_info_update(RRDHOST *host); +#ifdef ENABLE_ACLK +void unregister_node(const char *machine_guid); +#endif #endif //NETDATA_SQLITE_ACLK_H diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c index 60bf5dbdc..9bd060f96 100644 --- a/database/sqlite/sqlite_aclk_alert.c +++ b/database/sqlite/sqlite_aclk_alert.c @@ -13,16 +13,42 @@ sqlite3_column_bytes((res), (_param)) ? strdupz((char *)sqlite3_column_text((res), (_param))) : NULL; \ }) - #define SQL_UPDATE_FILTERED_ALERT \ - "UPDATE aclk_alert_%s SET filtered_alert_unique_id = %u, date_created = unixepoch() where filtered_alert_unique_id = %u" + "UPDATE aclk_alert_%s SET filtered_alert_unique_id = @new_alert, date_created = UNIXEPOCH() " \ + "WHERE filtered_alert_unique_id = @old_alert" -static void update_filtered(ALARM_ENTRY *ae, uint32_t unique_id, char *uuid_str) +static void update_filtered(ALARM_ENTRY *ae, int64_t unique_id, char *uuid_str) { + sqlite3_stmt *res = NULL; + char sql[ACLK_SYNC_QUERY_SIZE]; - snprintfz(sql, ACLK_SYNC_QUERY_SIZE-1, SQL_UPDATE_FILTERED_ALERT, uuid_str, ae->unique_id, unique_id); - sqlite3_exec_monitored(db_meta, sql, 0, 0, NULL); - ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; + snprintfz(sql, sizeof(sql) - 1, SQL_UPDATE_FILTERED_ALERT, uuid_str); + int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to update_filtered"); + return; + } + + rc = sqlite3_bind_int64(res, 1, ae->unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind ae unique_id for update_filtered"); + goto done; + } + + rc = sqlite3_bind_int64(res, 2, unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind unique_id for update_filtered"); + goto done; + } + + rc = sqlite3_step_monitored(res); + if (likely(rc == SQLITE_DONE)) + ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; + +done: + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when trying to update_filtered, rc = %d", rc); } #define SQL_SELECT_VARIABLE_ALERT_BY_UNIQUE_ID \ @@ -30,35 +56,35 @@ static void update_filtered(ALARM_ENTRY *ae, uint32_t unique_id, char *uuid_str) "WHERE hld.unique_id = @unique_id AND hl.config_hash_id = ah.hash_id AND hld.health_log_id = hl.health_log_id " \ "AND hl.host_id = @host_id AND ah.warn IS NULL AND ah.crit IS NULL" -static inline bool is_event_from_alert_variable_config(uint32_t unique_id, uuid_t *host_id) +static inline bool is_event_from_alert_variable_config(int64_t unique_id, uuid_t *host_id) { sqlite3_stmt *res = NULL; - int rc = 0; - bool ret = false; - rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_VARIABLE_ALERT_BY_UNIQUE_ID, -1, &res, 0); + int rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_VARIABLE_ALERT_BY_UNIQUE_ID, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to check for alert variables."); return false; } - rc = sqlite3_bind_int(res, 1, (int) unique_id); + bool ret = false; + + rc = sqlite3_bind_int64(res, 1, unique_id); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind unique_id for checking alert variable."); - goto fail; + goto done; } rc = sqlite3_bind_blob(res, 2, host_id, sizeof(*host_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind host_id for checking alert variable."); - goto fail; + goto done; } rc = sqlite3_step_monitored(res); if (likely(rc == SQLITE_ROW)) ret = true; -fail: +done: rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize statement when trying to check for alert variables, rc = %d", rc); @@ -71,32 +97,25 @@ fail: //decide if some events should be sent or not #define SQL_SELECT_ALERT_BY_ID \ "SELECT hld.new_status, hl.config_hash_id, hld.unique_id FROM health_log hl, aclk_alert_%s aa, health_log_detail hld " \ - "WHERE hl.host_id = @host_id AND +hld.unique_id = aa.filtered_alert_unique_id " \ + "WHERE hl.host_id = @host_id AND hld.unique_id = aa.filtered_alert_unique_id " \ "AND hld.alarm_id = @alarm_id AND hl.health_log_id = hld.health_log_id " \ - "ORDER BY hld.rowid DESC LIMIT 1;" + "ORDER BY hld.rowid DESC LIMIT 1" static bool should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) { sqlite3_stmt *res = NULL; - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - bool send = false; if (ae->new_status == RRDCALC_STATUS_REMOVED || ae->new_status == RRDCALC_STATUS_UNINITIALIZED) return 0; - if (unlikely(uuid_is_null(ae->config_hash_id))) + if (unlikely(uuid_is_null(ae->config_hash_id) || !host->aclk_config)) return 0; char sql[ACLK_SYNC_QUERY_SIZE]; - uuid_t config_hash_id; - RRDCALC_STATUS status; - uint32_t unique_id; //get the previous sent event of this alarm_id //base the search on the last filtered event - snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_SELECT_ALERT_BY_ID, uuid_str); + snprintfz(sql, sizeof(sql) - 1, SQL_SELECT_ALERT_BY_ID, host->aclk_config->uuid_str); int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { @@ -104,6 +123,8 @@ static bool should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) return true; } + bool send = false; + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind host_id for checking should_send_to_cloud"); @@ -119,17 +140,18 @@ static bool should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) rc = sqlite3_step_monitored(res); if (likely(rc == SQLITE_ROW)) { - status = (RRDCALC_STATUS)sqlite3_column_int(res, 0); + uuid_t config_hash_id; + RRDCALC_STATUS status = (RRDCALC_STATUS)sqlite3_column_int(res, 0); if (sqlite3_column_type(res, 1) != SQLITE_NULL) uuid_copy(config_hash_id, *((uuid_t *)sqlite3_column_blob(res, 1))); - unique_id = (uint32_t)sqlite3_column_int64(res, 2); + int64_t unique_id = sqlite3_column_int64(res, 2); if (ae->new_status != (RRDCALC_STATUS)status || uuid_memcmp(&ae->config_hash_id, &config_hash_id)) send = true; else - update_filtered(ae, unique_id, uuid_str); + update_filtered(ae, unique_id, host->aclk_config->uuid_str); } else send = true; @@ -143,13 +165,12 @@ done: #define SQL_QUEUE_ALERT_TO_CLOUD \ "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ - "VALUES (@alert_unique_id, UNIXEPOCH(), @alert_unique_id) ON CONFLICT (alert_unique_id) DO NOTHING;" + "VALUES (@alert_unique_id, UNIXEPOCH(), @alert_unique_id) ON CONFLICT (alert_unique_id) DO NOTHING" void sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, bool skip_filter) { sqlite3_stmt *res_alert = NULL; char sql[ACLK_SYNC_QUERY_SIZE]; - char uuid_str[UUID_STR_LEN]; if (!service_running(SERVICE_ACLK)) return; @@ -163,8 +184,7 @@ void sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, bool skip_filter) if (is_event_from_alert_variable_config(ae->unique_id, &host->host_uuid)) return; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_QUEUE_ALERT_TO_CLOUD, uuid_str); + snprintfz(sql, sizeof(sql) - 1, SQL_QUEUE_ALERT_TO_CLOUD, host->aclk_config->uuid_str); int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res_alert, 0); if (unlikely(rc != SQLITE_OK)) { @@ -172,18 +192,18 @@ void sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, bool skip_filter) return; } - rc = sqlite3_bind_int(res_alert, 1, (int) ae->unique_id); + rc = sqlite3_bind_int64(res_alert, 1, ae->unique_id); if (unlikely(rc != SQLITE_OK)) - goto bind_fail; + goto done; rc = execute_insert(res_alert); if (unlikely(rc == SQLITE_DONE)) { ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); } else - error_report("Failed to store alert event %u, rc = %d", ae->unique_id, rc); + error_report("Failed to store alert event %"PRId64", rc = %d", ae->unique_id, rc); -bind_fail: +done: if (unlikely(sqlite3_finalize(res_alert) != SQLITE_OK)) error_report("Failed to reset statement in store alert event, rc = %d", rc); } @@ -239,15 +259,13 @@ static inline char *sqlite3_text_strdupz_empty(sqlite3_stmt *res, int iCol) { } -void aclk_push_alert_event(struct aclk_sync_host_config *wc) +static void aclk_push_alert_event(struct aclk_sync_cfg_t *wc __maybe_unused) { -#ifndef ENABLE_ACLK - UNUSED(wc); -#else +#ifdef ENABLE_ACLK int rc; if (unlikely(!wc->alert_updates)) { - netdata_log_access( + nd_log(NDLS_ACCESS, NDLP_NOTICE, "ACLK STA [%s (%s)]: Ignoring alert push event, updates have been turned off for this node.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); @@ -265,23 +283,20 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); - int limit = ACLK_MAX_ALERT_UPDATES; - sqlite3_stmt *res = NULL; buffer_sprintf( sql, - "select aa.sequence_id, hld.unique_id, hld.alarm_id, hl.config_hash_id, hld.updated_by_id, hld.when_key, " + "SELECT aa.sequence_id, hld.unique_id, hld.alarm_id, hl.config_hash_id, hld.updated_by_id, hld.when_key, " " hld.duration, hld.non_clear_duration, hld.flags, hld.exec_run_timestamp, hld.delay_up_to_timestamp, hl.name, " " hl.chart, hl.exec, hl.recipient, ha.source, hl.units, hld.info, hld.exec_code, hld.new_status, " " hld.old_status, hld.delay, hld.new_value, hld.old_value, hld.last_repeat, hl.chart_context, hld.transition_id, " " hld.alarm_event_id, hl.chart_name, hld.summary " - " from health_log hl, aclk_alert_%s aa, alert_hash ha, health_log_detail hld " - " where hld.unique_id = aa.alert_unique_id and hl.config_hash_id = ha.hash_id and aa.date_submitted is null " - " and hl.host_id = @host_id and hl.health_log_id = hld.health_log_id " - " order by aa.sequence_id asc limit %d;", - wc->uuid_str, - limit); + " FROM health_log hl, aclk_alert_%s aa, alert_hash ha, health_log_detail hld " + " WHERE hld.unique_id = aa.alert_unique_id AND hl.config_hash_id = ha.hash_id AND aa.date_submitted IS NULL " + " AND hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id " + " ORDER BY aa.sequence_id ASC LIMIT "ACLK_MAX_ALERT_UPDATES, + wc->uuid_str); rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); if (rc != SQLITE_OK) { @@ -292,13 +307,6 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) rc = db_execute(db_meta, buffer_tostring(sql_fix)); if (unlikely(rc)) error_report("Failed to create ACLK alert table for host %s", rrdhost_hostname(wc->host)); - - else { - buffer_flush(sql_fix); - buffer_sprintf(sql_fix, INDEX_ACLK_ALERT, wc->uuid_str, wc->uuid_str); - if (unlikely(db_execute(db_meta, buffer_tostring(sql_fix)))) - error_report("Failed to create ACLK alert table for host %s", rrdhost_hostname(wc->host)); - } buffer_free(sql_fix); // Try again @@ -315,10 +323,7 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) rc = sqlite3_bind_blob(res, 1, &wc->host->host_uuid, sizeof(wc->host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind host_id for pushing alert event."); - sqlite3_finalize(res); - buffer_free(sql); - freez(claim_id); - return; + goto done; } uint64_t first_sequence_id = 0; @@ -395,9 +400,13 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) if (first_sequence_id) { buffer_flush(sql); - buffer_sprintf(sql, "UPDATE aclk_alert_%s SET date_submitted=unixepoch() " - "WHERE +date_submitted IS NULL AND sequence_id BETWEEN %" PRIu64 " AND %" PRIu64 ";", - wc->uuid_str, first_sequence_id, last_sequence_id); + buffer_sprintf( + sql, + "UPDATE aclk_alert_%s SET date_submitted=unixepoch() " + "WHERE +date_submitted IS NULL AND sequence_id BETWEEN %" PRIu64 " AND %" PRIu64, + wc->uuid_str, + first_sequence_id, + last_sequence_id); if (unlikely(db_execute(db_meta, buffer_tostring(sql)))) error_report("Failed to mark ACLK alert entries as submitted for host %s", rrdhost_hostname(wc->host)); @@ -407,7 +416,7 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) } else { if (wc->alerts_log_first_sequence_id) - netdata_log_access( + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK RES [%s (%s)]: ALERTS SENT from %" PRIu64 " to %" PRIu64 "", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", @@ -417,6 +426,7 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) wc->alerts_log_last_sequence_id = 0; } +done: rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize statement to send alert entries from the database, rc = %d", rc); @@ -437,7 +447,7 @@ void aclk_push_alert_events_for_all_hosts(void) rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); - struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + struct aclk_sync_cfg_t *wc = host->aclk_config; if (likely(wc)) aclk_push_alert_event(wc); } @@ -446,59 +456,54 @@ void aclk_push_alert_events_for_all_hosts(void) void sql_queue_existing_alerts_to_aclk(RRDHOST *host) { - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); sqlite3_stmt *res = NULL; int rc; + struct aclk_sync_cfg_t *wc = host->aclk_config; + + BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); + rw_spinlock_write_lock(&host->health_log.spinlock); - buffer_sprintf(sql, "delete from aclk_alert_%s; ", uuid_str); - if (unlikely(db_execute(db_meta, buffer_tostring(sql)))) { - rw_spinlock_write_unlock(&host->health_log.spinlock); - buffer_free(sql); - return; - } + buffer_sprintf(sql, "DELETE FROM aclk_alert_%s", wc->uuid_str); + if (unlikely(db_execute(db_meta, buffer_tostring(sql)))) + goto skip; buffer_flush(sql); + buffer_sprintf( sql, - "insert into aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " - "select hld.unique_id alert_unique_id, unixepoch(), hld.unique_id alert_unique_id from health_log_detail hld, health_log hl " - "where hld.new_status <> 0 and hld.new_status <> -2 and hl.health_log_id = hld.health_log_id and hl.config_hash_id is not null " - "and hld.updated_by_id = 0 and hl.host_id = @host_id order by hld.unique_id asc on conflict (alert_unique_id) do nothing;", - uuid_str); + "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " + "SELECT hld.unique_id alert_unique_id, unixepoch(), hld.unique_id alert_unique_id FROM health_log_detail hld, health_log hl " + "WHERE hld.new_status <> 0 AND hld.new_status <> -2 AND hl.health_log_id = hld.health_log_id AND hl.config_hash_id IS NOT NULL " + "AND hld.updated_by_id = 0 AND hl.host_id = @host_id ORDER BY hld.unique_id ASC ON CONFLICT (alert_unique_id) DO NOTHING", + wc->uuid_str); rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to queue existing alerts."); - rw_spinlock_write_unlock(&host->health_log.spinlock); - buffer_free(sql); - return; + goto skip; } rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind host_id for when trying to queue existing alerts."); - sqlite3_finalize(res); - rw_spinlock_write_unlock(&host->health_log.spinlock); - buffer_free(sql); - return; + goto done; } rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) error_report("Failed to queue existing alerts, rc = %d", rc); - + else + rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); +done: rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize statement to queue existing alerts, rc = %d", rc); +skip: rw_spinlock_write_unlock(&host->health_log.spinlock); - buffer_free(sql); - rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); } void aclk_send_alarm_configuration(char *config_hash) @@ -506,12 +511,12 @@ void aclk_send_alarm_configuration(char *config_hash) if (unlikely(!config_hash)) return; - struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *) localhost->aclk_sync_host_config; + struct aclk_sync_cfg_t *wc = localhost->aclk_config; if (unlikely(!wc)) return; - netdata_log_access( + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK REQ [%s (%s)]: Request to send alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", @@ -524,43 +529,33 @@ void aclk_send_alarm_configuration(char *config_hash) "SELECT alarm, template, on_key, class, type, component, os, hosts, plugin," \ "module, charts, lookup, every, units, green, red, calc, warn, crit, to_key, exec, delay, repeat, info," \ "options, host_labels, p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after," \ - "p_db_lookup_before, p_update_every, chart_labels, summary FROM alert_hash WHERE hash_id = @hash_id;" + "p_db_lookup_before, p_update_every, chart_labels, summary FROM alert_hash WHERE hash_id = @hash_id" -int aclk_push_alert_config_event(char *node_id __maybe_unused, char *config_hash __maybe_unused) +void aclk_push_alert_config_event(char *node_id __maybe_unused, char *config_hash __maybe_unused) { - int rc = 0; - #ifdef ENABLE_ACLK - - CHECK_SQLITE_CONNECTION(db_meta); + int rc; sqlite3_stmt *res = NULL; + struct aclk_sync_cfg_t *wc; - struct aclk_sync_host_config *wc = NULL; RRDHOST *host = find_host_by_node_id(node_id); - if (unlikely(!host)) { + if (unlikely(!host || !(wc = host->aclk_config))) { freez(config_hash); freez(node_id); - return 1; - } - - wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; - if (unlikely(!wc)) { - freez(config_hash); - freez(node_id); - return 1; + return; } rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_ALERT_CONFIG, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to fetch an alarm hash configuration"); - return 1; + return; } uuid_t hash_uuid; if (uuid_parse(config_hash, hash_uuid)) - return 1; + return; rc = sqlite3_bind_blob(res, 1, &hash_uuid , sizeof(hash_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) @@ -632,13 +627,13 @@ int aclk_push_alert_config_event(char *node_id __maybe_unused, char *config_hash } if (likely(p_alarm_config.cfg_hash)) { - netdata_log_access("ACLK RES [%s (%s)]: Sent alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash); + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK RES [%s (%s)]: Sent alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash); aclk_send_provide_alarm_cfg(&p_alarm_config); freez(p_alarm_config.cfg_hash); destroy_aclk_alarm_configuration(&alarm_config); } else - netdata_log_access("ACLK STA [%s (%s)]: Alert config for %s not found.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash); + nd_log(NDLS_ACCESS, NDLP_WARNING, "ACLK STA [%s (%s)]: Alert config for %s not found.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash); bind_fail: rc = sqlite3_finalize(res); @@ -648,7 +643,6 @@ bind_fail: freez(config_hash); freez(node_id); #endif - return rc; } @@ -660,51 +654,50 @@ void aclk_start_alert_streaming(char *node_id, bool resets) if (unlikely(!node_id || uuid_parse(node_id, node_uuid))) return; - RRDHOST *host = find_host_by_node_id(node_id); - - if (unlikely(!host)) - return; - - struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + struct aclk_sync_cfg_t *wc; - if (unlikely(!wc)) + RRDHOST *host = find_host_by_node_id(node_id); + if (unlikely(!host || !(wc = host->aclk_config))) return; if (unlikely(!host->health.health_enabled)) { - netdata_log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); + nd_log(NDLS_ACCESS, NDLP_NOTICE, "ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); return; } if (resets) { - netdata_log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED (RESET REQUESTED)", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED (RESET REQUESTED)", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); sql_queue_existing_alerts_to_aclk(host); } else - netdata_log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); wc->alert_updates = 1; wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; } -#define SQL_QUEUE_REMOVE_ALERTS "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ +#define SQL_QUEUE_REMOVE_ALERTS \ + "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \ "SELECT hld.unique_id alert_unique_id, UNIXEPOCH(), hld.unique_id alert_unique_id FROM health_log hl, health_log_detail hld " \ - "WHERE hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id AND hld.new_status = -2 AND hld.updated_by_id = 0 " \ - "AND hld.unique_id NOT IN (SELECT alert_unique_id FROM aclk_alert_%s) " \ - "AND hl.config_hash_id NOT IN (select hash_id from alert_hash where warn is null and crit is null) " \ - "AND hl.name || hl.chart NOT IN (select name || chart from health_log where name = hl.name and chart = hl.chart and alarm_id > hl.alarm_id and host_id = hl.host_id) " \ - "ORDER BY hld.unique_id ASC ON CONFLICT (alert_unique_id) DO NOTHING;" + "WHERE hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id AND hld.new_status = -2 AND hld.updated_by_id = 0 " \ + "AND hld.unique_id NOT IN (SELECT alert_unique_id FROM aclk_alert_%s) " \ + "AND hl.config_hash_id NOT IN (SELECT hash_id FROM alert_hash WHERE warn IS NULL AND crit IS NULL) " \ + "AND hl.name || hl.chart NOT IN (select name || chart FROM health_log WHERE name = hl.name AND " \ + "chart = hl.chart AND alarm_id > hl.alarm_id AND host_id = hl.host_id) " \ + "ORDER BY hld.unique_id ASC ON CONFLICT (alert_unique_id) DO NOTHING" + void sql_process_queue_removed_alerts_to_aclk(char *node_id) { - struct aclk_sync_host_config *wc; + struct aclk_sync_cfg_t *wc; RRDHOST *host = find_host_by_node_id(node_id); freez(node_id); - if (unlikely(!host || !(wc = host->aclk_sync_host_config))) + if (unlikely(!host || !(wc = host->aclk_config))) return; char sql[ACLK_SYNC_QUERY_SIZE * 2]; sqlite3_stmt *res = NULL; - snprintfz(sql, ACLK_SYNC_QUERY_SIZE * 2 - 1, SQL_QUEUE_REMOVE_ALERTS, wc->uuid_str, wc->uuid_str); + snprintfz(sql, sizeof(sql) - 1, SQL_QUEUE_REMOVE_ALERTS, wc->uuid_str, wc->uuid_str); int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { @@ -715,33 +708,25 @@ void sql_process_queue_removed_alerts_to_aclk(char *node_id) rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to bind host_id for when trying to queue remvoed alerts."); - sqlite3_finalize(res); - return; + goto skip; } rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) { - sqlite3_finalize(res); - error_report("Failed to queue removed alerts, rc = %d", rc); - return; + if (likely(rc == SQLITE_DONE)) { + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, rrdhost_hostname(wc->host)); + rrdhost_flag_set(wc->host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); + wc->alert_queue_removed = 0; } +skip: rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize statement to queue removed alerts, rc = %d", rc); - - netdata_log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, rrdhost_hostname(wc->host)); - - rrdhost_flag_set(wc->host, RRDHOST_FLAG_ACLK_STREAM_ALERTS); - wc->alert_queue_removed = 0; } void sql_queue_removed_alerts_to_aclk(RRDHOST *host) { - if (unlikely(!host->aclk_sync_host_config)) - return; - - if (!claimed() || !host->node_id) + if (unlikely(!host->aclk_config || !claimed() || !host->node_id)) return; char node_id[UUID_STR_LEN]; @@ -753,32 +738,28 @@ void sql_queue_removed_alerts_to_aclk(RRDHOST *host) void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id __maybe_unused, char *snapshot_uuid) { uuid_t node_uuid; + if (unlikely(!node_id || uuid_parse(node_id, node_uuid))) return; + struct aclk_sync_cfg_t *wc; + RRDHOST *host = find_host_by_node_id(node_id); - if (unlikely(!host)) { - netdata_log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id); + if (unlikely(!host || !(wc = host->aclk_config))) { + nd_log(NDLS_ACCESS, NDLP_WARNING, "ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id); return; } - struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; - - if (unlikely(!wc)) { - netdata_log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id); - return; - } - - netdata_log_access( + nd_log(NDLS_ACCESS, NDLP_DEBUG, "IN [%s (%s)]: Request to send alerts snapshot, snapshot_uuid %s", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", snapshot_uuid); + if (wc->alerts_snapshot_uuid && !strcmp(wc->alerts_snapshot_uuid,snapshot_uuid)) return; - __sync_synchronize(); + wc->alerts_snapshot_uuid = strdupz(snapshot_uuid); - __sync_synchronize(); aclk_push_node_alert_snapshot(node_id); } @@ -795,9 +776,7 @@ void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_EN alarm_log->chart = strdupz(ae_chart_id(ae)); alarm_log->name = strdupz(ae_name(ae)); - alarm_log->batch_id = 0; - alarm_log->sequence_id = 0; - alarm_log->when = (time_t)ae->when; + alarm_log->when = ae->when; alarm_log->config_hash = strdupz((char *)config_hash_id); @@ -812,7 +791,7 @@ void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_EN alarm_log->non_clear_duration = (time_t)ae->non_clear_duration; alarm_log->status = rrdcalc_status_to_proto_enum((RRDCALC_STATUS)ae->new_status); alarm_log->old_status = rrdcalc_status_to_proto_enum((RRDCALC_STATUS)ae->old_status); - alarm_log->delay = (int)ae->delay; + alarm_log->delay = ae->delay; alarm_log->delay_up_to_timestamp = (time_t)ae->delay_up_to_timestamp; alarm_log->last_repeat = (time_t)ae->last_repeat; @@ -842,18 +821,18 @@ void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_EN #endif #ifdef ENABLE_ACLK -static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark) +static bool have_recent_alarm(RRDHOST *host, int64_t alarm_id, int64_t mark) { ALARM_ENTRY *ae = host->health_log.alarms; while (ae) { if (ae->alarm_id == alarm_id && ae->unique_id >mark && (ae->new_status != RRDCALC_STATUS_WARNING && ae->new_status != RRDCALC_STATUS_CRITICAL)) - return 1; + return true; ae = ae->next; } - return 0; + return false; } #endif @@ -864,17 +843,17 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) RRDHOST *host = find_host_by_node_id(node_id); if (unlikely(!host)) { - netdata_log_access("AC [%s (N/A)]: Node id not found", node_id); + nd_log(NDLS_ACCESS, NDLP_WARNING, "AC [%s (N/A)]: Node id not found", node_id); freez(node_id); return; } freez(node_id); - struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + struct aclk_sync_cfg_t *wc = host->aclk_config; // we perhaps we don't need this for snapshots if (unlikely(!wc->alert_updates)) { - netdata_log_access( + nd_log(NDLS_ACCESS, NDLP_NOTICE, "ACLK STA [%s (%s)]: Ignoring alert snapshot event, updates have been turned off for this node.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A"); @@ -888,11 +867,9 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) if (unlikely(!claim_id)) return; - netdata_log_access("ACLK REQ [%s (%s)]: Sending alerts snapshot, snapshot_uuid %s", wc->node_id, rrdhost_hostname(wc->host), wc->alerts_snapshot_uuid); + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK REQ [%s (%s)]: Sending alerts snapshot, snapshot_uuid %s", wc->node_id, rrdhost_hostname(wc->host), wc->alerts_snapshot_uuid); uint32_t cnt = 0; - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); rw_spinlock_read_lock(&host->health_log.spinlock); @@ -915,7 +892,7 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) } if (cnt) { - uint32_t chunk = 1, chunks = 0; + uint32_t chunks; chunks = (cnt / ALARM_EVENTS_PER_CHUNK) + (cnt % ALARM_EVENTS_PER_CHUNK != 0); ae = host->health_log.alarms; @@ -926,15 +903,12 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) alarm_snap.claim_id = claim_id; alarm_snap.snapshot_uuid = wc->alerts_snapshot_uuid; alarm_snap.chunks = chunks; - alarm_snap.chunk = chunk; + alarm_snap.chunk = 1; alarm_snapshot_proto_ptr_t snapshot_proto = NULL; for (; ae; ae = ae->next) { - if (likely(ae->updated_by_id)) - continue; - - if (unlikely(ae->new_status == RRDCALC_STATUS_UNINITIALIZED)) + if (likely(ae->updated_by_id) || unlikely(ae->new_status == RRDCALC_STATUS_UNINITIALIZED)) continue; if (have_recent_alarm(host, ae->alarm_id, ae->unique_id)) @@ -957,19 +931,9 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) if (cnt == ALARM_EVENTS_PER_CHUNK) { aclk_send_alarm_snapshot(snapshot_proto); - cnt = 0; - - if (chunk < chunks) { - chunk++; - - struct alarm_snapshot alarm_snap; - alarm_snap.node_id = wc->node_id; - alarm_snap.claim_id = claim_id; - alarm_snap.snapshot_uuid = wc->alerts_snapshot_uuid; - alarm_snap.chunks = chunks; - alarm_snap.chunk = chunk; - + if (alarm_snap.chunk < chunks) { + alarm_snap.chunk++; snapshot_proto = generate_alarm_snapshot_proto(&alarm_snap); } } @@ -986,51 +950,70 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) #endif } -#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE date_created + %d < UNIXEPOCH();" +#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE date_created < UNIXEPOCH() - @period" + void sql_aclk_alert_clean_dead_entries(RRDHOST *host) { - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + struct aclk_sync_cfg_t *wc = host->aclk_config; + if (unlikely(!wc)) + return; char sql[ACLK_SYNC_QUERY_SIZE]; - snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_DELETE_ALERT_ENTRIES, uuid_str, MAX_REMOVED_PERIOD); + snprintfz(sql, sizeof(sql) - 1, SQL_DELETE_ALERT_ENTRIES, wc->uuid_str); - char *err_msg = NULL; - int rc = sqlite3_exec_monitored(db_meta, sql, NULL, NULL, &err_msg); + sqlite3_stmt *res = NULL; + int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { - error_report("Failed when trying to clean stale ACLK alert entries from aclk_alert_%s, error message \"%s\"", uuid_str, err_msg); - sqlite3_free(err_msg); + error_report("Failed to prepare statement for cleaning stale ACLK alert entries."); + return; } + + rc = sqlite3_bind_int64(res, 1, MAX_REMOVED_PERIOD); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind MAX_REMOVED_PERIOD parameter."); + goto skip; + } + + rc = sqlite3_step_monitored(res); + if (rc != SQLITE_DONE) + error_report("Failed to execute DELETE query for cleaning stale ACLK alert entries."); + +skip: + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement for cleaning stale ACLK alert entries."); } #define SQL_GET_MIN_MAX_ALERT_SEQ "SELECT MIN(sequence_id), MAX(sequence_id), " \ "(SELECT MAX(sequence_id) FROM aclk_alert_%s WHERE date_submitted IS NOT NULL) " \ - "FROM aclk_alert_%s WHERE date_submitted IS NULL;" + "FROM aclk_alert_%s WHERE date_submitted IS NULL" int get_proto_alert_status(RRDHOST *host, struct proto_alert_status *proto_alert_status) { - int rc; - struct aclk_sync_host_config *wc = NULL; - wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + + struct aclk_sync_cfg_t *wc = host->aclk_config; if (!wc) return 1; proto_alert_status->alert_updates = wc->alert_updates; char sql[ACLK_SYNC_QUERY_SIZE]; - sqlite3_stmt *res = NULL; - snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_GET_MIN_MAX_ALERT_SEQ, wc->uuid_str, wc->uuid_str); + sqlite3_stmt *res = NULL; + snprintfz(sql, sizeof(sql) - 1, SQL_GET_MIN_MAX_ALERT_SEQ, wc->uuid_str, wc->uuid_str); - rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to get alert log status from the database."); return 1; } while (sqlite3_step_monitored(res) == SQLITE_ROW) { - proto_alert_status->pending_min_sequence_id = sqlite3_column_bytes(res, 0) > 0 ? (uint64_t) sqlite3_column_int64(res, 0) : 0; - proto_alert_status->pending_max_sequence_id = sqlite3_column_bytes(res, 1) > 0 ? (uint64_t) sqlite3_column_int64(res, 1) : 0; - proto_alert_status->last_submitted_sequence_id = sqlite3_column_bytes(res, 2) > 0 ? (uint64_t) sqlite3_column_int64(res, 2) : 0; + proto_alert_status->pending_min_sequence_id = + sqlite3_column_bytes(res, 0) > 0 ? (uint64_t)sqlite3_column_int64(res, 0) : 0; + proto_alert_status->pending_max_sequence_id = + sqlite3_column_bytes(res, 1) > 0 ? (uint64_t)sqlite3_column_int64(res, 1) : 0; + proto_alert_status->last_submitted_sequence_id = + sqlite3_column_bytes(res, 2) > 0 ? (uint64_t)sqlite3_column_int64(res, 2) : 0; } rc = sqlite3_finalize(res); @@ -1045,21 +1028,15 @@ void aclk_send_alarm_checkpoint(char *node_id, char *claim_id __maybe_unused) if (unlikely(!node_id)) return; - struct aclk_sync_host_config *wc = NULL; + struct aclk_sync_cfg_t *wc; RRDHOST *host = find_host_by_node_id(node_id); - if (unlikely(!host)) - return; - - wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; - if (unlikely(!wc)) { - netdata_log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", node_id); - return; + if (unlikely(!host || !(wc = host->aclk_config))) + nd_log(NDLS_ACCESS, NDLP_WARNING, "ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", node_id); + else { + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK REQ [%s (%s)]: ALERTS CHECKPOINT REQUEST RECEIVED", node_id, rrdhost_hostname(host)); + wc->alert_checkpoint_req = SEND_CHECKPOINT_AFTER_HEALTH_LOOPS; } - - netdata_log_access("ACLK REQ [%s (%s)]: ALERTS CHECKPOINT REQUEST RECEIVED", node_id, rrdhost_hostname(host)); - - wc->alert_checkpoint_req = SEND_CHECKPOINT_AFTER_HEALTH_LOOPS; } typedef struct active_alerts { @@ -1068,15 +1045,14 @@ typedef struct active_alerts { RRDCALC_STATUS status; } active_alerts_t; -static inline int compare_active_alerts(const void * a, const void * b) { +static inline int compare_active_alerts(const void *a, const void *b) +{ active_alerts_t *active_alerts_a = (active_alerts_t *)a; active_alerts_t *active_alerts_b = (active_alerts_t *)b; - if( !(strcmp(active_alerts_a->name, active_alerts_b->name)) ) - { - return strcmp(active_alerts_a->chart, active_alerts_b->chart); - } - else + if (!(strcmp(active_alerts_a->name, active_alerts_b->name))) { + return strcmp(active_alerts_a->chart, active_alerts_b->chart); + } else return strcmp(active_alerts_a->name, active_alerts_b->name); } @@ -1084,16 +1060,16 @@ static inline int compare_active_alerts(const void * a, const void * b) { void aclk_push_alarm_checkpoint(RRDHOST *host __maybe_unused) { #ifdef ENABLE_ACLK - struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + struct aclk_sync_cfg_t *wc = host->aclk_config; if (unlikely(!wc)) { - netdata_log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", rrdhost_hostname(host)); + nd_log(NDLS_ACCESS, NDLP_WARNING, "ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", rrdhost_hostname(host)); return; } if (rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS)) { //postpone checkpoint send - wc->alert_checkpoint_req+=3; - netdata_log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT POSTPONED", rrdhost_hostname(host)); + wc->alert_checkpoint_req += 3; + nd_log(NDLS_ACCESS, NDLP_NOTICE, "ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT POSTPONED", rrdhost_hostname(host)); return; } @@ -1126,16 +1102,16 @@ void aclk_push_alarm_checkpoint(RRDHOST *host __maybe_unused) BUFFER *alarms_to_hash; if (cnt) { - qsort (active_alerts, cnt, sizeof(active_alerts_t), compare_active_alerts); + qsort(active_alerts, cnt, sizeof(active_alerts_t), compare_active_alerts); alarms_to_hash = buffer_create(len, NULL); - for (uint32_t i=0;i<cnt;i++) { + for (uint32_t i = 0; i < cnt; i++) { buffer_strcat(alarms_to_hash, active_alerts[i].name); buffer_strcat(alarms_to_hash, active_alerts[i].chart); if (active_alerts[i].status == RRDCALC_STATUS_WARNING) - buffer_strcat(alarms_to_hash, "W"); + buffer_fast_strcat(alarms_to_hash, "W", 1); else if (active_alerts[i].status == RRDCALC_STATUS_CRITICAL) - buffer_strcat(alarms_to_hash, "C"); + buffer_fast_strcat(alarms_to_hash, "C", 1); } } else { alarms_to_hash = buffer_create(1, NULL); @@ -1156,10 +1132,10 @@ void aclk_push_alarm_checkpoint(RRDHOST *host __maybe_unused) aclk_send_provide_alarm_checkpoint(&alarm_checkpoint); freez(claim_id); - netdata_log_access("ACLK RES [%s (%s)]: ALERTS CHECKPOINT SENT", wc->node_id, rrdhost_hostname(host)); - } else { - netdata_log_access("ACLK RES [%s (%s)]: FAILED TO CREATE ALERTS CHECKPOINT HASH", wc->node_id, rrdhost_hostname(host)); - } + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK RES [%s (%s)]: ALERTS CHECKPOINT SENT", wc->node_id, rrdhost_hostname(host)); + } else + nd_log(NDLS_ACCESS, NDLP_ERR, "ACLK RES [%s (%s)]: FAILED TO CREATE ALERTS CHECKPOINT HASH", wc->node_id, rrdhost_hostname(host)); + wc->alert_checkpoint_req = 0; buffer_free(alarms_to_hash); #endif diff --git a/database/sqlite/sqlite_aclk_alert.h b/database/sqlite/sqlite_aclk_alert.h index c92aef083..cfb3468b9 100644 --- a/database/sqlite/sqlite_aclk_alert.h +++ b/database/sqlite/sqlite_aclk_alert.h @@ -15,9 +15,8 @@ struct proto_alert_status { uint64_t last_submitted_sequence_id; }; -void aclk_push_alert_event(struct aclk_sync_host_config *wc); void aclk_send_alarm_configuration (char *config_hash); -int aclk_push_alert_config_event(char *node_id, char *config_hash); +void aclk_push_alert_config_event(char *node_id, char *config_hash); void aclk_start_alert_streaming(char *node_id, bool resets); void sql_queue_removed_alerts_to_aclk(RRDHOST *host); void sql_process_queue_removed_alerts_to_aclk(char *node_id); diff --git a/database/sqlite/sqlite_aclk_node.c b/database/sqlite/sqlite_aclk_node.c index 82927854a..dcc8c375c 100644 --- a/database/sqlite/sqlite_aclk_node.c +++ b/database/sqlite/sqlite_aclk_node.c @@ -7,17 +7,16 @@ #include "../../aclk/aclk_capas.h" #ifdef ENABLE_ACLK + DICTIONARY *collectors_from_charts(RRDHOST *host, DICTIONARY *dict) { RRDSET *st; char name[500]; - rrdset_foreach_read(st, host) { + rrdset_foreach_read(st, host) + { if (rrdset_is_available_for_viewers(st)) { - struct collector_info col = { - .plugin = rrdset_plugin_name(st), - .module = rrdset_module_name(st) - }; - snprintfz(name, 499, "%s:%s", col.plugin, col.module); + struct collector_info col = {.plugin = rrdset_plugin_name(st), .module = rrdset_module_name(st)}; + snprintfz(name, sizeof(name) - 1, "%s:%s", col.plugin, col.module); dictionary_set(dict, name, &col, sizeof(struct collector_info)); } } @@ -26,17 +25,9 @@ DICTIONARY *collectors_from_charts(RRDHOST *host, DICTIONARY *dict) { return dict; } -static void build_node_collectors(char *node_id __maybe_unused) +static void build_node_collectors(RRDHOST *host) { - - RRDHOST *host = find_host_by_node_id(node_id); - - if (unlikely(!host)) - return; - - struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *) host->aclk_sync_host_config; - if (unlikely(!wc)) - return; + struct aclk_sync_cfg_t *wc = host->aclk_config; struct update_node_collectors upd_node_collectors; DICTIONARY *dict = dictionary_create(DICT_OPTION_SINGLE_THREADED); @@ -50,45 +41,33 @@ static void build_node_collectors(char *node_id __maybe_unused) dictionary_destroy(dict); freez(upd_node_collectors.claim_id); - netdata_log_access("ACLK RES [%s (%s)]: NODE COLLECTORS SENT", node_id, rrdhost_hostname(host)); - - freez(node_id); + nd_log(NDLS_ACCESS, NDLP_DEBUG, "ACLK RES [%s (%s)]: NODE COLLECTORS SENT", wc->node_id, rrdhost_hostname(host)); } -static void build_node_info(char *node_id __maybe_unused) +static void build_node_info(RRDHOST *host) { struct update_node_info node_info; - RRDHOST *host = find_host_by_node_id(node_id); - - if (unlikely((!host))) { - freez(node_id); - return; - } - - struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *) host->aclk_sync_host_config; - - if (unlikely(!wc)) { - freez(node_id); - return; - } + struct aclk_sync_cfg_t *wc = host->aclk_config; rrd_rdlock(); node_info.node_id = wc->node_id; node_info.claim_id = get_agent_claimid(); node_info.machine_guid = host->machine_guid; - node_info.child = (wc->host != localhost); + node_info.child = (host != localhost); node_info.ml_info.ml_capable = ml_capable(); - node_info.ml_info.ml_enabled = ml_enabled(wc->host); + node_info.ml_info.ml_enabled = ml_enabled(host); - node_info.node_instance_capabilities = aclk_get_node_instance_capas(wc->host); + node_info.node_instance_capabilities = aclk_get_node_instance_capas(host); now_realtime_timeval(&node_info.updated_at); char *host_version = NULL; if (host != localhost) { netdata_mutex_lock(&host->receiver_lock); - host_version = strdupz(host->receiver && host->receiver->program_version ? host->receiver->program_version : rrdhost_program_version(host)); + host_version = strdupz( + host->receiver && host->receiver->program_version ? host->receiver->program_version : + rrdhost_program_version(host)); netdata_mutex_unlock(&host->receiver_lock); } @@ -112,10 +91,11 @@ static void build_node_info(char *node_id __maybe_unused) node_info.data.machine_guid = host->machine_guid; struct capability node_caps[] = { - { .name = "ml", .version = host->system_info->ml_capable, .enabled = host->system_info->ml_enabled }, - { .name = "mc", .version = host->system_info->mc_version ? host->system_info->mc_version : 0, .enabled = host->system_info->mc_version ? 1 : 0 }, - { .name = NULL, .version = 0, .enabled = 0 } - }; + {.name = "ml", .version = host->system_info->ml_capable, .enabled = host->system_info->ml_enabled}, + {.name = "mc", + .version = host->system_info->mc_version ? host->system_info->mc_version : 0, + .enabled = host->system_info->mc_version ? 1 : 0}, + {.name = NULL, .version = 0, .enabled = 0}}; node_info.node_capabilities = node_caps; node_info.data.ml_info.ml_capable = host->system_info->ml_capable; @@ -124,7 +104,14 @@ static void build_node_info(char *node_id __maybe_unused) node_info.data.host_labels_ptr = host->rrdlabels; aclk_update_node_info(&node_info); - netdata_log_access("ACLK RES [%s (%s)]: NODE INFO SENT for guid [%s] (%s)", wc->node_id, rrdhost_hostname(wc->host), host->machine_guid, wc->host == localhost ? "parent" : "child"); + nd_log( + NDLS_ACCESS, + NDLP_DEBUG, + "ACLK RES [%s (%s)]: NODE INFO SENT for guid [%s] (%s)", + wc->node_id, + rrdhost_hostname(host), + host->machine_guid, + host == localhost ? "parent" : "child"); rrd_unlock(); freez(node_info.claim_id); @@ -132,10 +119,21 @@ static void build_node_info(char *node_id __maybe_unused) freez(host_version); wc->node_collectors_send = now_realtime_sec(); - freez(node_id); - } +static bool host_is_replicating(RRDHOST *host) +{ + bool replicating = false; + RRDSET *st; + rrdset_foreach_reentrant(st, host) { + if (rrdset_is_replicating(st)) { + replicating = true; + break; + } + } + rrdset_foreach_done(st); + return replicating; +} void aclk_check_node_info_and_collectors(void) { @@ -144,35 +142,59 @@ void aclk_check_node_info_and_collectors(void) if (unlikely(!aclk_connected)) return; - size_t pending = 0; - dfe_start_reentrant(rrdhost_root_index, host) { + size_t context_loading = 0; + size_t replicating = 0; + size_t context_pp = 0; - struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + time_t now = now_realtime_sec(); + dfe_start_reentrant(rrdhost_root_index, host) + { + struct aclk_sync_cfg_t *wc = host->aclk_config; if (unlikely(!wc)) continue; if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD))) { internal_error(true, "ACLK SYNC: Context still pending for %s", rrdhost_hostname(host)); - pending++; + context_loading++; continue; } - if (wc->node_info_send_time && wc->node_info_send_time + 30 < now_realtime_sec()) { + if (unlikely(host_is_replicating(host))) { + internal_error(true, "ACLK SYNC: Host %s is still replicating", rrdhost_hostname(host)); + replicating++; + continue; + } + + bool pp_queue_empty = !(host->rrdctx.pp_queue && dictionary_entries(host->rrdctx.pp_queue)); + + if (!pp_queue_empty && (wc->node_info_send_time || wc->node_collectors_send)) + context_pp++; + + if (pp_queue_empty && wc->node_info_send_time && wc->node_info_send_time + 30 < now) { wc->node_info_send_time = 0; - build_node_info(strdupz(wc->node_id)); + build_node_info(host); internal_error(true, "ACLK SYNC: Sending node info for %s", rrdhost_hostname(host)); } - if (wc->node_collectors_send && wc->node_collectors_send + 30 < now_realtime_sec()) { - build_node_collectors(strdupz(wc->node_id)); + if (pp_queue_empty && wc->node_collectors_send && wc->node_collectors_send + 30 < now) { + build_node_collectors(host); internal_error(true, "ACLK SYNC: Sending collectors for %s", rrdhost_hostname(host)); wc->node_collectors_send = 0; } } dfe_done(host); - if(pending) - netdata_log_info("ACLK: %zu nodes are pending for contexts to load, skipped sending node info for them", pending); + if (context_loading || replicating || context_pp) { + nd_log_limit_static_thread_var(erl, 10, 100 * USEC_PER_MS); + nd_log_limit( + &erl, + NDLS_DAEMON, + NDLP_INFO, + "%zu nodes loading contexts, %zu replicating data, %zu pending context post processing", + context_loading, + replicating, + context_pp); + } } #endif diff --git a/database/sqlite/sqlite_context.c b/database/sqlite/sqlite_context.c index d4b15e99d..26ed8a96a 100644 --- a/database/sqlite/sqlite_context.c +++ b/database/sqlite/sqlite_context.c @@ -7,16 +7,16 @@ #define DB_CONTEXT_METADATA_VERSION 1 const char *database_context_config[] = { - "CREATE TABLE IF NOT EXISTS context (host_id BLOB, id TEXT NOT NULL, version INT NOT NULL, title TEXT NOT NULL, " \ + "CREATE TABLE IF NOT EXISTS context (host_id BLOB, id TEXT NOT NULL, version INT NOT NULL, title TEXT NOT NULL, " "chart_type TEXT NOT NULL, unit TEXT NOT NULL, priority INT NOT NULL, first_time_t INT NOT NULL, " "last_time_t INT NOT NULL, deleted INT NOT NULL, " - "family TEXT, PRIMARY KEY (host_id, id));", + "family TEXT, PRIMARY KEY (host_id, id))", NULL }; const char *database_context_cleanup[] = { - "VACUUM;", + "VACUUM", NULL }; @@ -31,7 +31,7 @@ int sql_init_context_database(int memory) int rc; if (likely(!memory)) - snprintfz(sqlite_database, FILENAME_MAX, "%s/context-meta.db", netdata_configured_cache_dir); + snprintfz(sqlite_database, sizeof(sqlite_database) - 1, "%s/context-meta.db", netdata_configured_cache_dir); else strcpy(sqlite_database, ":memory:"); @@ -56,9 +56,9 @@ int sql_init_context_database(int memory) return 1; if (likely(!memory)) - snprintfz(buf, 1024, "ATTACH DATABASE \"%s/netdata-meta.db\" as meta;", netdata_configured_cache_dir); + snprintfz(buf, sizeof(buf) - 1, "ATTACH DATABASE \"%s/netdata-meta.db\" as meta", netdata_configured_cache_dir); else - snprintfz(buf, 1024, "ATTACH DATABASE ':memory:' as meta;"); + snprintfz(buf, sizeof(buf) - 1, "ATTACH DATABASE ':memory:' as meta"); if(init_database_batch(db_context_meta, list)) return 1; @@ -92,7 +92,7 @@ void sql_close_context_database(void) // Fetching data // #define CTX_GET_CHART_LIST "SELECT c.chart_id, c.type||'.'||c.id, c.name, c.context, c.title, c.unit, c.priority, " \ - "c.update_every, c.chart_type, c.family FROM meta.chart c WHERE c.host_id = @host_id and c.chart_id is not null; " + "c.update_every, c.chart_type, c.family FROM chart c WHERE c.host_id = @host_id AND c.chart_id IS NOT NULL" void ctx_get_chart_list(uuid_t *host_uuid, void (*dict_cb)(SQL_CHART_DATA *, void *), void *data) { @@ -105,7 +105,7 @@ void ctx_get_chart_list(uuid_t *host_uuid, void (*dict_cb)(SQL_CHART_DATA *, voi } if (unlikely(!res)) { - rc = prepare_statement(db_context_meta, CTX_GET_CHART_LIST, &res); + rc = prepare_statement(db_meta, CTX_GET_CHART_LIST, &res); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to fetch chart list"); return; @@ -141,14 +141,14 @@ skip_load: // Dimension list #define CTX_GET_DIMENSION_LIST "SELECT d.dim_id, d.id, d.name, CASE WHEN INSTR(d.options,\"hidden\") > 0 THEN 1 ELSE 0 END " \ - "FROM meta.dimension d WHERE d.chart_id = @id and d.dim_id is not null ORDER BY d.rowid ASC;" + "FROM dimension d WHERE d.chart_id = @id AND d.dim_id IS NOT NULL ORDER BY d.rowid ASC" void ctx_get_dimension_list(uuid_t *chart_uuid, void (*dict_cb)(SQL_DIMENSION_DATA *, void *), void *data) { int rc; static __thread sqlite3_stmt *res = NULL; if (unlikely(!res)) { - rc = prepare_statement(db_context_meta, CTX_GET_DIMENSION_LIST, &res); + rc = prepare_statement(db_meta, CTX_GET_DIMENSION_LIST, &res); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to fetch chart dimension data"); return; @@ -178,7 +178,8 @@ failed: } // LABEL LIST -#define CTX_GET_LABEL_LIST "SELECT l.label_key, l.label_value, l.source_type FROM meta.chart_label l WHERE l.chart_id = @id;" +#define CTX_GET_LABEL_LIST "SELECT l.label_key, l.label_value, l.source_type FROM meta.chart_label l WHERE l.chart_id = @id" + void ctx_get_label_list(uuid_t *chart_uuid, void (*dict_cb)(SQL_CLABEL_DATA *, void *), void *data) { int rc; @@ -215,7 +216,8 @@ failed: // CONTEXT LIST #define CTX_GET_CONTEXT_LIST "SELECT id, version, title, chart_type, unit, priority, first_time_t, " \ - "last_time_t, deleted, family FROM context c WHERE c.host_id = @host_id;" + "last_time_t, deleted, family FROM context c WHERE c.host_id = @host_id" + void ctx_get_context_list(uuid_t *host_uuid, void (*dict_cb)(VERSIONED_CONTEXT_DATA *, void *), void *data) { @@ -266,9 +268,10 @@ failed: // // Storing Data // -#define CTX_STORE_CONTEXT "INSERT OR REPLACE INTO context " \ - "(host_id, id, version, title, chart_type, unit, priority, first_time_t, last_time_t, deleted, family) " \ - "VALUES (@host_id, @context, @version, @title, @chart_type, @unit, @priority, @first_time_t, @last_time_t, @deleted, @family);" +#define CTX_STORE_CONTEXT \ + "INSERT OR REPLACE INTO context " \ + "(host_id, id, version, title, chart_type, unit, priority, first_time_t, last_time_t, deleted, family) " \ + "VALUES (@host_id, @context, @version, @title, @chart_type, @unit, @priority, @first_t, @last_t, @delete, @family)" int ctx_store_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) { @@ -292,7 +295,7 @@ int ctx_store_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) rc = bind_text_null(res, 2, context_data->id, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind context to store details"); + error_report("Failed to bind context to store context details"); goto skip_store; } @@ -304,19 +307,19 @@ int ctx_store_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) rc = bind_text_null(res, 4, context_data->title, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind context to store details"); + error_report("Failed to bind context to store context details"); goto skip_store; } rc = bind_text_null(res, 5, context_data->chart_type, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind context to store details"); + error_report("Failed to bind context to store context details"); goto skip_store; } rc = bind_text_null(res, 6, context_data->units, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind context to store details"); + error_report("Failed to bind context to store context details"); goto skip_store; } @@ -365,7 +368,7 @@ skip_store: // Delete a context -#define CTX_DELETE_CONTEXT "DELETE FROM context WHERE host_id = @host_id AND id = @context;" +#define CTX_DELETE_CONTEXT "DELETE FROM context WHERE host_id = @host_id AND id = @context" int ctx_delete_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) { int rc, rc_stored = 1; @@ -382,13 +385,13 @@ int ctx_delete_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) rc = sqlite3_bind_blob(res, 1, host_uuid, sizeof(*host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id to delete context data"); + error_report("Failed to bind host_id for context data deletion"); goto skip_delete; } rc = sqlite3_bind_text(res, 2, context_data->id, -1, SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind context id for data deletion"); + error_report("Failed to bind context id for context data deletion"); goto skip_delete; } @@ -396,13 +399,6 @@ int ctx_delete_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data) if (rc_stored != SQLITE_DONE) error_report("Failed to delete context %s, rc = %d", context_data->id, rc_stored); -#ifdef NETDATA_INTERNAL_CHECKS - else { - char host_uuid_str[UUID_STR_LEN]; - uuid_unparse_lower(*host_uuid, host_uuid_str); - netdata_log_info("%s: Deleted context %s under host %s", __FUNCTION__, context_data->id, host_uuid_str); - } -#endif skip_delete: rc = sqlite3_finalize(res); diff --git a/database/sqlite/sqlite_db_migration.c b/database/sqlite/sqlite_db_migration.c index a011d0fef..29da6c249 100644 --- a/database/sqlite/sqlite_db_migration.c +++ b/database/sqlite/sqlite_db_migration.c @@ -7,7 +7,7 @@ static int return_int_cb(void *data, int argc, char **argv, char **column) int *status = data; UNUSED(argc); UNUSED(column); - *status = str2uint32_t(argv[0], NULL); + *status = (int) str2uint32_t(argv[0], NULL); return 0; } @@ -18,7 +18,7 @@ static int get_auto_vaccum(sqlite3 *database) int exists = 0; - snprintf(sql, 127, "PRAGMA auto_vacuum"); + snprintf(sql, sizeof(sql) - 1, "PRAGMA auto_vacuum"); int rc = sqlite3_exec_monitored(database, sql, return_int_cb, (void *) &exists, &err_msg); if (rc != SQLITE_OK) { @@ -35,7 +35,7 @@ int db_table_count(sqlite3 *database) char sql[128]; int count = 0; - snprintf(sql, 127, "select count(1) from sqlite_schema where type = 'table'"); + snprintf(sql, sizeof(sql) - 1, "select count(1) from sqlite_schema where type = 'table'"); int rc = sqlite3_exec_monitored(database, sql, return_int_cb, (void *) &count, &err_msg); if (rc != SQLITE_OK) { netdata_log_info("Error checking database table count; %s", err_msg); @@ -51,7 +51,7 @@ int table_exists_in_database(sqlite3 *database, const char *table) int exists = 0; - snprintf(sql, 127, "select 1 from sqlite_schema where type = 'table' and name = '%s';", table); + snprintf(sql, sizeof(sql) - 1, "select 1 from sqlite_schema where type = 'table' and name = '%s'", table); int rc = sqlite3_exec_monitored(database, sql, return_int_cb, (void *) &exists, &err_msg); if (rc != SQLITE_OK) { @@ -69,7 +69,7 @@ static int column_exists_in_table(sqlite3 *database, const char *table, const ch int exists = 0; - snprintf(sql, 127, "SELECT 1 FROM pragma_table_info('%s') where name = '%s';", table, column); + snprintf(sql, sizeof(sql) - 1, "SELECT 1 FROM pragma_table_info('%s') where name = '%s'", table, column); int rc = sqlite3_exec_monitored(database, sql, return_int_cb, (void *) &exists, &err_msg); if (rc != SQLITE_OK) { @@ -92,64 +92,64 @@ static int get_database_user_version(sqlite3 *database) } const char *database_migrate_v1_v2[] = { - "ALTER TABLE host ADD hops INTEGER NOT NULL DEFAULT 0;", + "ALTER TABLE host ADD hops INTEGER NOT NULL DEFAULT 0", NULL }; const char *database_migrate_v2_v3[] = { - "ALTER TABLE host ADD memory_mode INT NOT NULL DEFAULT 0;", - "ALTER TABLE host ADD abbrev_timezone TEXT NOT NULL DEFAULT '';", - "ALTER TABLE host ADD utc_offset INT NOT NULL DEFAULT 0;", - "ALTER TABLE host ADD program_name TEXT NOT NULL DEFAULT 'unknown';", - "ALTER TABLE host ADD program_version TEXT NOT NULL DEFAULT 'unknown';", - "ALTER TABLE host ADD entries INT NOT NULL DEFAULT 0;", - "ALTER TABLE host ADD health_enabled INT NOT NULL DEFAULT 0;", + "ALTER TABLE host ADD memory_mode INT NOT NULL DEFAULT 0", + "ALTER TABLE host ADD abbrev_timezone TEXT NOT NULL DEFAULT ''", + "ALTER TABLE host ADD utc_offset INT NOT NULL DEFAULT 0", + "ALTER TABLE host ADD program_name TEXT NOT NULL DEFAULT 'unknown'", + "ALTER TABLE host ADD program_version TEXT NOT NULL DEFAULT 'unknown'", + "ALTER TABLE host ADD entries INT NOT NULL DEFAULT 0", + "ALTER TABLE host ADD health_enabled INT NOT NULL DEFAULT 0", NULL }; const char *database_migrate_v4_v5[] = { - "DROP TABLE IF EXISTS chart_active;", - "DROP TABLE IF EXISTS dimension_active;", - "DROP TABLE IF EXISTS chart_hash;", - "DROP TABLE IF EXISTS chart_hash_map;", - "DROP VIEW IF EXISTS v_chart_hash;", + "DROP TABLE IF EXISTS chart_active", + "DROP TABLE IF EXISTS dimension_active", + "DROP TABLE IF EXISTS chart_hash", + "DROP TABLE IF EXISTS chart_hash_map", + "DROP VIEW IF EXISTS v_chart_hash", NULL }; const char *database_migrate_v5_v6[] = { - "DROP TRIGGER IF EXISTS tr_dim_del;", - "DROP TABLE IF EXISTS dimension_delete;", + "DROP TRIGGER IF EXISTS tr_dim_del", + "DROP TABLE IF EXISTS dimension_delete", NULL }; const char *database_migrate_v9_v10[] = { - "ALTER TABLE alert_hash ADD chart_labels TEXT;", + "ALTER TABLE alert_hash ADD chart_labels TEXT", NULL }; const char *database_migrate_v10_v11[] = { - "ALTER TABLE health_log ADD chart_name TEXT;", + "ALTER TABLE health_log ADD chart_name TEXT", NULL }; const char *database_migrate_v11_v12[] = { - "ALTER TABLE health_log_detail ADD summary TEXT;", - "ALTER TABLE alert_hash ADD summary TEXT;", + "ALTER TABLE health_log_detail ADD summary TEXT", + "ALTER TABLE alert_hash ADD summary TEXT", NULL }; const char *database_migrate_v12_v13_detail[] = { - "ALTER TABLE health_log_detail ADD summary TEXT;", + "ALTER TABLE health_log_detail ADD summary TEXT", NULL }; const char *database_migrate_v12_v13_hash[] = { - "ALTER TABLE alert_hash ADD summary TEXT;", + "ALTER TABLE alert_hash ADD summary TEXT", NULL }; const char *database_migrate_v13_v14[] = { - "ALTER TABLE host ADD last_connected INT NOT NULL DEFAULT 0;", + "ALTER TABLE host ADD last_connected INT NOT NULL DEFAULT 0", NULL }; @@ -173,7 +173,7 @@ static int do_migration_v3_v4(sqlite3 *database) int rc; sqlite3_stmt *res = NULL; - snprintfz(sql, 255, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%';"); + snprintfz(sql, sizeof(sql) - 1, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%'"); rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to alter health_log tables"); @@ -183,7 +183,7 @@ static int do_migration_v3_v4(sqlite3 *database) while (sqlite3_step_monitored(res) == SQLITE_ROW) { char *table = strdupz((char *) sqlite3_column_text(res, 0)); if (!column_exists_in_table(database, table, "chart_context")) { - snprintfz(sql, 255, "ALTER TABLE %s ADD chart_context text", table); + snprintfz(sql, sizeof(sql) - 1, "ALTER TABLE %s ADD chart_context text", table); sqlite3_exec_monitored(database, sql, 0, 0, NULL); } freez(table); @@ -212,7 +212,7 @@ static int do_migration_v6_v7(sqlite3 *database) int rc; sqlite3_stmt *res = NULL; - snprintfz(sql, 255, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'aclk_alert_%%';"); + snprintfz(sql, sizeof(sql) - 1, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'aclk_alert_%%'"); rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to alter aclk_alert tables"); @@ -222,9 +222,9 @@ static int do_migration_v6_v7(sqlite3 *database) while (sqlite3_step_monitored(res) == SQLITE_ROW) { char *table = strdupz((char *) sqlite3_column_text(res, 0)); if (!column_exists_in_table(database, table, "filtered_alert_unique_id")) { - snprintfz(sql, 255, "ALTER TABLE %s ADD filtered_alert_unique_id", table); + snprintfz(sql, sizeof(sql) - 1, "ALTER TABLE %s ADD filtered_alert_unique_id", table); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 255, "UPDATE %s SET filtered_alert_unique_id = alert_unique_id", table); + snprintfz(sql, sizeof(sql) - 1, "UPDATE %s SET filtered_alert_unique_id = alert_unique_id", table); sqlite3_exec_monitored(database, sql, 0, 0, NULL); } freez(table); @@ -243,7 +243,7 @@ static int do_migration_v7_v8(sqlite3 *database) int rc; sqlite3_stmt *res = NULL; - snprintfz(sql, 255, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%';"); + snprintfz(sql, sizeof(sql) - 1, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%'"); rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to alter health_log tables"); @@ -253,7 +253,7 @@ static int do_migration_v7_v8(sqlite3 *database) while (sqlite3_step_monitored(res) == SQLITE_ROW) { char *table = strdupz((char *) sqlite3_column_text(res, 0)); if (!column_exists_in_table(database, table, "transition_id")) { - snprintfz(sql, 255, "ALTER TABLE %s ADD transition_id blob", table); + snprintfz(sql, sizeof(sql) - 1, "ALTER TABLE %s ADD transition_id blob", table); sqlite3_exec_monitored(database, sql, 0, 0, NULL); } freez(table); @@ -273,38 +273,38 @@ static int do_migration_v8_v9(sqlite3 *database) sqlite3_stmt *res = NULL; //create the health_log table and it's index - snprintfz(sql, 2047, "CREATE TABLE IF NOT EXISTS health_log (health_log_id INTEGER PRIMARY KEY, host_id blob, alarm_id int, " \ + snprintfz(sql, sizeof(sql) - 1, "CREATE TABLE IF NOT EXISTS health_log (health_log_id INTEGER PRIMARY KEY, host_id blob, alarm_id int, " \ "config_hash_id blob, name text, chart text, family text, recipient text, units text, exec text, " \ - "chart_context text, last_transition_id blob, UNIQUE (host_id, alarm_id)) ;"); + "chart_context text, last_transition_id blob, UNIQUE (host_id, alarm_id))"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); //TODO indexes - snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id);"); + snprintfz(sql, sizeof(sql) - 1, "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, " \ + snprintfz(sql, sizeof(sql) - 1, "CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, " \ "updated_by_id int, updates_id int, when_key int, duration int, non_clear_duration int, " \ "flags int, exec_run_timestamp int, delay_up_to_timestamp int, " \ "info text, exec_code int, new_status real, old_status real, delay int, " \ - "new_value double, old_value double, last_repeat int, transition_id blob, global_id int, summary text, host_id blob);"); + "new_value double, old_value double, last_repeat int, transition_id blob, global_id int, summary text, host_id blob)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_1 ON health_log_detail (unique_id);"); + snprintfz(sql, sizeof(sql) - 1, "CREATE INDEX IF NOT EXISTS health_log_d_ind_1 ON health_log_detail (unique_id)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id);"); + snprintfz(sql, sizeof(sql) - 1, "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id);"); + snprintfz(sql, sizeof(sql) - 1, "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_4 ON health_log_detail (health_log_id);"); + snprintfz(sql, sizeof(sql) - 1, "CREATE INDEX IF NOT EXISTS health_log_d_ind_4 ON health_log_detail (health_log_id)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "ALTER TABLE alert_hash ADD source text;"); + snprintfz(sql, sizeof(sql) - 1, "ALTER TABLE alert_hash ADD source text"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS alert_hash_index ON alert_hash (hash_id);"); + snprintfz(sql, sizeof(sql) - 1, "CREATE INDEX IF NOT EXISTS alert_hash_index ON alert_hash (hash_id)"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); - snprintfz(sql, 2047, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%' AND name <> 'health_log_detail';"); + snprintfz(sql, sizeof(sql) - 1, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%' AND name <> 'health_log_detail'"); rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to alter health_log tables"); @@ -332,7 +332,7 @@ static int do_migration_v8_v9(sqlite3 *database) dfe_done(table); dictionary_destroy(dict_tables); - snprintfz(sql, 2047, "ALTER TABLE health_log_detail DROP COLUMN host_id;"); + snprintfz(sql, sizeof(sql) - 1, "ALTER TABLE health_log_detail DROP COLUMN host_id"); sqlite3_exec_monitored(database, sql, 0, 0, NULL); return 0; @@ -353,7 +353,7 @@ static int do_migration_v10_v11(sqlite3 *database) return 0; } -#define MIGR_11_12_UPD_HEALTH_LOG_DETAIL "UPDATE health_log_detail SET summary = (select name from health_log where health_log_id = health_log_detail.health_log_id);" +#define MIGR_11_12_UPD_HEALTH_LOG_DETAIL "UPDATE health_log_detail SET summary = (select name from health_log where health_log_id = health_log_detail.health_log_id)" static int do_migration_v11_v12(sqlite3 *database) { int rc = 0; @@ -368,6 +368,68 @@ static int do_migration_v11_v12(sqlite3 *database) return rc; } +static int do_migration_v14_v15(sqlite3 *database) +{ + char sql[256]; + + int rc; + sqlite3_stmt *res = NULL; + snprintfz(sql, sizeof(sql) - 1, "SELECT name FROM sqlite_schema WHERE type = \"index\" AND name LIKE \"aclk_alert_index@_%%\" ESCAPE \"@\""); + rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement to drop unused indices"); + return 1; + } + + BUFFER *wb = buffer_create(128, NULL); + size_t count = 0; + while (sqlite3_step_monitored(res) == SQLITE_ROW) { + buffer_sprintf(wb, "DROP INDEX IF EXISTS %s; ", (char *)sqlite3_column_text(res, 0)); + count++; + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when dropping unused indices, rc = %d", rc); + + if (count) + (void) db_execute(database, buffer_tostring(wb)); + + buffer_free(wb); + return 0; +} + +static int do_migration_v15_v16(sqlite3 *database) +{ + char sql[256]; + + int rc; + sqlite3_stmt *res = NULL; + snprintfz(sql, sizeof(sql) - 1, "SELECT name FROM sqlite_schema WHERE type = \"table\" AND name LIKE \"aclk_alert_%%\""); + rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement to drop unused indices"); + return 1; + } + + BUFFER *wb = buffer_create(128, NULL); + size_t count = 0; + while (sqlite3_step_monitored(res) == SQLITE_ROW) { + buffer_sprintf(wb, "ANALYZE %s ; ", (char *)sqlite3_column_text(res, 0)); + count++; + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when running ANALYZE on aclk_alert_tables, rc = %d", rc); + + if (count) + (void) db_execute(database, buffer_tostring(wb)); + + buffer_free(wb); + return 0; +} + static int do_migration_v12_v13(sqlite3 *database) { int rc = 0; @@ -425,7 +487,7 @@ static int migrate_database(sqlite3 *database, int target_version, char *db_name int user_version = 0; char *err_msg = NULL; - int rc = sqlite3_exec_monitored(database, "PRAGMA user_version;", return_int_cb, (void *) &user_version, &err_msg); + int rc = sqlite3_exec_monitored(database, "PRAGMA user_version", return_int_cb, (void *) &user_version, &err_msg); if (rc != SQLITE_OK) { netdata_log_info("Error checking the %s database version; %s", db_name, err_msg); sqlite3_free(err_msg); @@ -446,7 +508,6 @@ static int migrate_database(sqlite3 *database, int target_version, char *db_name } } return target_version; - } DATABASE_FUNC_MIGRATION_LIST migration_action[] = { @@ -464,6 +525,8 @@ DATABASE_FUNC_MIGRATION_LIST migration_action[] = { {.name = "v11 to v12", .func = do_migration_v11_v12}, {.name = "v12 to v13", .func = do_migration_v12_v13}, {.name = "v13 to v14", .func = do_migration_v13_v14}, + {.name = "v14 to v15", .func = do_migration_v14_v15}, + {.name = "v15 to v16", .func = do_migration_v15_v16}, // the terminator of this array {.name = NULL, .func = NULL} }; diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index 393d6a238..e3537bf5a 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -4,7 +4,7 @@ #include "sqlite3recover.h" #include "sqlite_db_migration.h" -#define DB_METADATA_VERSION 14 +#define DB_METADATA_VERSION 16 const char *database_config[] = { "CREATE TABLE IF NOT EXISTS host(host_id BLOB PRIMARY KEY, hostname TEXT NOT NULL, " @@ -14,70 +14,77 @@ const char *database_config[] = { "memory_mode INT DEFAULT 0, abbrev_timezone TEXT DEFAULT '', utc_offset INT NOT NULL DEFAULT 0," "program_name TEXT NOT NULL DEFAULT 'unknown', program_version TEXT NOT NULL DEFAULT 'unknown', " "entries INT NOT NULL DEFAULT 0," - "health_enabled INT NOT NULL DEFAULT 0, last_connected INT NOT NULL DEFAULT 0);", + "health_enabled INT NOT NULL DEFAULT 0, last_connected INT NOT NULL DEFAULT 0)", "CREATE TABLE IF NOT EXISTS chart(chart_id blob PRIMARY KEY, host_id blob, type text, id text, name text, " "family text, context text, title text, unit text, plugin text, module text, priority int, update_every int, " - "chart_type int, memory_mode int, history_entries);", + "chart_type int, memory_mode int, history_entries)", + "CREATE TABLE IF NOT EXISTS dimension(dim_id blob PRIMARY KEY, chart_id blob, id text, name text, " - "multiplier int, divisor int , algorithm int, options text);", + "multiplier int, divisor int , algorithm int, options text)", + + "CREATE TABLE IF NOT EXISTS metadata_migration(filename text, file_size, date_created int)", + + "CREATE INDEX IF NOT EXISTS ind_d2 on dimension (chart_id)", + + "CREATE INDEX IF NOT EXISTS ind_c3 on chart (host_id)", - "CREATE TABLE IF NOT EXISTS metadata_migration(filename text, file_size, date_created int);", - "CREATE INDEX IF NOT EXISTS ind_d2 on dimension (chart_id);", - "CREATE INDEX IF NOT EXISTS ind_c3 on chart (host_id);", "CREATE TABLE IF NOT EXISTS chart_label(chart_id blob, source_type int, label_key text, " - "label_value text, date_created int, PRIMARY KEY (chart_id, label_key));", - "CREATE TABLE IF NOT EXISTS node_instance (host_id blob PRIMARY KEY, claim_id, node_id, date_created);", + "label_value text, date_created int, PRIMARY KEY (chart_id, label_key))", + + "CREATE TABLE IF NOT EXISTS node_instance (host_id blob PRIMARY KEY, claim_id, node_id, date_created)", + "CREATE TABLE IF NOT EXISTS alert_hash(hash_id blob PRIMARY KEY, date_updated int, alarm text, template text, " "on_key text, class text, component text, type text, os text, hosts text, lookup text, " "every text, units text, calc text, families text, plugin text, module text, charts text, green text, " "red text, warn text, crit text, exec text, to_key text, info text, delay text, options text, " "repeat text, host_labels text, p_db_lookup_dimensions text, p_db_lookup_method text, p_db_lookup_options int, " - "p_db_lookup_after int, p_db_lookup_before int, p_update_every int, source text, chart_labels text, summary text);", + "p_db_lookup_after int, p_db_lookup_before int, p_update_every int, source text, chart_labels text, summary text)", "CREATE TABLE IF NOT EXISTS host_info(host_id blob, system_key text NOT NULL, system_value text NOT NULL, " - "date_created INT, PRIMARY KEY(host_id, system_key));", + "date_created INT, PRIMARY KEY(host_id, system_key))", "CREATE TABLE IF NOT EXISTS host_label(host_id blob, source_type int, label_key text NOT NULL, " - "label_value text NOT NULL, date_created INT, PRIMARY KEY (host_id, label_key));", + "label_value text NOT NULL, date_created INT, PRIMARY KEY (host_id, label_key))", "CREATE TRIGGER IF NOT EXISTS ins_host AFTER INSERT ON host BEGIN INSERT INTO node_instance (host_id, date_created)" - " SELECT new.host_id, unixepoch() WHERE new.host_id NOT IN (SELECT host_id FROM node_instance); END;", + " SELECT new.host_id, unixepoch() WHERE new.host_id NOT IN (SELECT host_id FROM node_instance); END", "CREATE TABLE IF NOT EXISTS health_log (health_log_id INTEGER PRIMARY KEY, host_id blob, alarm_id int, " "config_hash_id blob, name text, chart text, family text, recipient text, units text, exec text, " - "chart_context text, last_transition_id blob, chart_name text, UNIQUE (host_id, alarm_id)) ;", + "chart_context text, last_transition_id blob, chart_name text, UNIQUE (host_id, alarm_id))", - "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id);", + "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id)", "CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, " "updated_by_id int, updates_id int, when_key int, duration int, non_clear_duration int, " "flags int, exec_run_timestamp int, delay_up_to_timestamp int, " "info text, exec_code int, new_status real, old_status real, delay int, " - "new_value double, old_value double, last_repeat int, transition_id blob, global_id int, summary text);", + "new_value double, old_value double, last_repeat int, transition_id blob, global_id int, summary text)", - "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id);", - "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id);", - "CREATE INDEX IF NOT EXISTS health_log_d_ind_5 ON health_log_detail (health_log_id, unique_id DESC);", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id)", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id)", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_9 ON health_log_detail (unique_id DESC, health_log_id)", "CREATE INDEX IF NOT EXISTS health_log_d_ind_6 on health_log_detail (health_log_id, when_key)", - "CREATE INDEX IF NOT EXISTS health_log_d_ind_7 on health_log_detail (alarm_id);", - "CREATE INDEX IF NOT EXISTS health_log_d_ind_8 on health_log_detail (new_status, updated_by_id);", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_7 on health_log_detail (alarm_id)", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_8 on health_log_detail (new_status, updated_by_id)", NULL }; const char *database_cleanup[] = { - "DELETE FROM host WHERE host_id NOT IN (SELECT host_id FROM chart);", - "DELETE FROM node_instance WHERE host_id NOT IN (SELECT host_id FROM host);", - "DELETE FROM host_info WHERE host_id NOT IN (SELECT host_id FROM host);", - "DELETE FROM host_label WHERE host_id NOT IN (SELECT host_id FROM host);", - "DROP TRIGGER IF EXISTS tr_dim_del;", - "DROP INDEX IF EXISTS ind_d1;", - "DROP INDEX IF EXISTS ind_c1;", - "DROP INDEX IF EXISTS ind_c2;", - "DROP INDEX IF EXISTS alert_hash_index;", - "DROP INDEX IF EXISTS health_log_d_ind_4;", - "DROP INDEX IF EXISTS health_log_d_ind_1;", + "DELETE FROM host WHERE host_id NOT IN (SELECT host_id FROM chart)", + "DELETE FROM node_instance WHERE host_id NOT IN (SELECT host_id FROM host)", + "DELETE FROM host_info WHERE host_id NOT IN (SELECT host_id FROM host)", + "DELETE FROM host_label WHERE host_id NOT IN (SELECT host_id FROM host)", + "DROP TRIGGER IF EXISTS tr_dim_del", + "DROP INDEX IF EXISTS ind_d1", + "DROP INDEX IF EXISTS ind_c1", + "DROP INDEX IF EXISTS ind_c2", + "DROP INDEX IF EXISTS alert_hash_index", + "DROP INDEX IF EXISTS health_log_d_ind_4", + "DROP INDEX IF EXISTS health_log_d_ind_1", + "DROP INDEX IF EXISTS health_log_d_ind_5", NULL }; @@ -202,42 +209,42 @@ int configure_sqlite_database(sqlite3 *database, int target_version) // https://www.sqlite.org/pragma.html#pragma_auto_vacuum // PRAGMA schema.auto_vacuum = 0 | NONE | 1 | FULL | 2 | INCREMENTAL; - snprintfz(buf, 1024, "PRAGMA auto_vacuum=%s;", config_get(CONFIG_SECTION_SQLITE, "auto vacuum", "INCREMENTAL")); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA auto_vacuum=%s", config_get(CONFIG_SECTION_SQLITE, "auto vacuum", "INCREMENTAL")); if (init_database_batch(database, list)) return 1; // https://www.sqlite.org/pragma.html#pragma_synchronous // PRAGMA schema.synchronous = 0 | OFF | 1 | NORMAL | 2 | FULL | 3 | EXTRA; - snprintfz(buf, 1024, "PRAGMA synchronous=%s;", config_get(CONFIG_SECTION_SQLITE, "synchronous", "NORMAL")); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA synchronous=%s", config_get(CONFIG_SECTION_SQLITE, "synchronous", "NORMAL")); if (init_database_batch(database, list)) return 1; // https://www.sqlite.org/pragma.html#pragma_journal_mode // PRAGMA schema.journal_mode = DELETE | TRUNCATE | PERSIST | MEMORY | WAL | OFF - snprintfz(buf, 1024, "PRAGMA journal_mode=%s;", config_get(CONFIG_SECTION_SQLITE, "journal mode", "WAL")); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA journal_mode=%s", config_get(CONFIG_SECTION_SQLITE, "journal mode", "WAL")); if (init_database_batch(database, list)) return 1; // https://www.sqlite.org/pragma.html#pragma_temp_store // PRAGMA temp_store = 0 | DEFAULT | 1 | FILE | 2 | MEMORY; - snprintfz(buf, 1024, "PRAGMA temp_store=%s;", config_get(CONFIG_SECTION_SQLITE, "temp store", "MEMORY")); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA temp_store=%s", config_get(CONFIG_SECTION_SQLITE, "temp store", "MEMORY")); if (init_database_batch(database, list)) return 1; // https://www.sqlite.org/pragma.html#pragma_journal_size_limit // PRAGMA schema.journal_size_limit = N ; - snprintfz(buf, 1024, "PRAGMA journal_size_limit=%lld;", config_get_number(CONFIG_SECTION_SQLITE, "journal size limit", 16777216)); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA journal_size_limit=%lld", config_get_number(CONFIG_SECTION_SQLITE, "journal size limit", 16777216)); if (init_database_batch(database, list)) return 1; // https://www.sqlite.org/pragma.html#pragma_cache_size // PRAGMA schema.cache_size = pages; // PRAGMA schema.cache_size = -kibibytes; - snprintfz(buf, 1024, "PRAGMA cache_size=%lld;", config_get_number(CONFIG_SECTION_SQLITE, "cache size", -2000)); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA cache_size=%lld", config_get_number(CONFIG_SECTION_SQLITE, "cache size", -2000)); if (init_database_batch(database, list)) return 1; - snprintfz(buf, 1024, "PRAGMA user_version=%d;", target_version); + snprintfz(buf, sizeof(buf) - 1, "PRAGMA user_version=%d", target_version); if (init_database_batch(database, list)) return 1; @@ -384,13 +391,13 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) int rc; if (likely(!memory)) { - snprintfz(sqlite_database, FILENAME_MAX, "%s/.netdata-meta.db.recover", netdata_configured_cache_dir); + snprintfz(sqlite_database, sizeof(sqlite_database) - 1, "%s/.netdata-meta.db.recover", netdata_configured_cache_dir); rc = unlink(sqlite_database); snprintfz(sqlite_database, FILENAME_MAX, "%s/netdata-meta.db", netdata_configured_cache_dir); if (rc == 0 || (rebuild & DB_CHECK_RECOVER)) { char new_sqlite_database[FILENAME_MAX + 1]; - snprintfz(new_sqlite_database, FILENAME_MAX, "%s/netdata-meta-recover.db", netdata_configured_cache_dir); + snprintfz(new_sqlite_database, sizeof(new_sqlite_database) - 1, "%s/netdata-meta-recover.db", netdata_configured_cache_dir); recover_database(sqlite_database, new_sqlite_database); if (rebuild & DB_CHECK_RECOVER) return 0; @@ -410,7 +417,7 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) if (rebuild & DB_CHECK_RECLAIM_SPACE) { netdata_log_info("Reclaiming space of %s", sqlite_database); - rc = sqlite3_exec_monitored(db_meta, "VACUUM;", 0, 0, &err_msg); + rc = sqlite3_exec_monitored(db_meta, "VACUUM", 0, 0, &err_msg); if (rc != SQLITE_OK) { error_report("Failed to execute VACUUM rc = %d (%s)", rc, err_msg); sqlite3_free(err_msg); @@ -422,6 +429,20 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) return 1; } + if (rebuild & DB_CHECK_ANALYZE) { + netdata_log_info("Running ANALYZE on %s", sqlite_database); + rc = sqlite3_exec_monitored(db_meta, "ANALYZE", 0, 0, &err_msg); + if (rc != SQLITE_OK) { + error_report("Failed to execute ANALYZE rc = %d (%s)", rc, err_msg); + sqlite3_free(err_msg); + } + else { + (void) db_execute(db_meta, "select count(*) from sqlite_master limit 0"); + (void) sqlite3_close(db_meta); + } + return 1; + } + netdata_log_info("SQLite database %s initialization", sqlite_database); rc = sqlite3_create_function(db_meta, "u2h", 1, SQLITE_ANY | SQLITE_DETERMINISTIC, 0, sqlite_uuid_parse, 0, 0); @@ -471,6 +492,9 @@ void sql_close_database(void) add_stmt_to_list(NULL); + (void) db_execute(db_meta, "PRAGMA analysis_limit=10000"); + (void) db_execute(db_meta, "PRAGMA optimize"); + rc = sqlite3_close_v2(db_meta); if (unlikely(rc != SQLITE_OK)) error_report("Error %d while closing the SQLite database, %s", rc, sqlite3_errstr(rc)); @@ -511,22 +535,25 @@ int db_execute(sqlite3 *db, const char *cmd) { int rc; int cnt = 0; + while (cnt < SQL_MAX_RETRY) { char *err_msg; rc = sqlite3_exec_monitored(db, cmd, 0, 0, &err_msg); - if (rc != SQLITE_OK) { - error_report("Failed to execute '%s', rc = %d (%s) -- attempt %d", cmd, rc, err_msg, cnt); - sqlite3_free(err_msg); - if (likely(rc == SQLITE_BUSY || rc == SQLITE_LOCKED)) { - usleep(SQLITE_INSERT_DELAY * USEC_PER_MS); - } - else - break; - } - else + if (likely(rc == SQLITE_OK)) break; ++cnt; + error_report("Failed to execute '%s', rc = %d (%s) -- attempt %d", cmd, rc, err_msg, cnt); + sqlite3_free(err_msg); + + if (likely(rc == SQLITE_BUSY || rc == SQLITE_LOCKED)) { + usleep(SQLITE_INSERT_DELAY * USEC_PER_MS); + continue; + } + + if (rc == SQLITE_CORRUPT) + mark_database_to_recover(NULL, db); + break; } return (rc != SQLITE_OK); } @@ -542,7 +569,7 @@ static inline void set_host_node_id(RRDHOST *host, uuid_t *node_id) return; } - struct aclk_sync_host_config *wc = host->aclk_sync_host_config; + struct aclk_sync_cfg_t *wc = host->aclk_config; if (unlikely(!host->node_id)) { uuid_t *t = mallocz(sizeof(*host->node_id)); @@ -559,7 +586,7 @@ static inline void set_host_node_id(RRDHOST *host, uuid_t *node_id) uuid_unparse_lower(*node_id, wc->node_id); } -#define SQL_UPDATE_NODE_ID "update node_instance set node_id = @node_id where host_id = @host_id;" +#define SQL_UPDATE_NODE_ID "UPDATE node_instance SET node_id = @node_id WHERE host_id = @host_id" int update_node_id(uuid_t *host_id, uuid_t *node_id) { @@ -611,7 +638,7 @@ failed: return rc - 1; } -#define SQL_SELECT_NODE_ID "SELECT node_id FROM node_instance WHERE host_id = @host_id AND node_id IS NOT NULL;" +#define SQL_SELECT_NODE_ID "SELECT node_id FROM node_instance WHERE host_id = @host_id AND node_id IS NOT NULL" int get_node_id(uuid_t *host_id, uuid_t *node_id) { @@ -647,8 +674,9 @@ failed: return (rc == SQLITE_ROW) ? 0 : -1; } -#define SQL_INVALIDATE_NODE_INSTANCES "UPDATE node_instance SET node_id = NULL WHERE EXISTS " \ - "(SELECT host_id FROM node_instance WHERE host_id = @host_id AND (@claim_id IS NULL OR claim_id <> @claim_id));" +#define SQL_INVALIDATE_NODE_INSTANCES \ + "UPDATE node_instance SET node_id = NULL WHERE EXISTS " \ + "(SELECT host_id FROM node_instance WHERE host_id = @host_id AND (@claim_id IS NULL OR claim_id <> @claim_id))" void invalidate_node_instances(uuid_t *host_id, uuid_t *claim_id) { @@ -692,8 +720,9 @@ failed: error_report("Failed to finalize the prepared statement when invalidating node instance information"); } -#define SQL_GET_NODE_INSTANCE_LIST "SELECT ni.node_id, ni.host_id, h.hostname " \ - "FROM node_instance ni, host h WHERE ni.host_id = h.host_id AND h.hops >=0;" +#define SQL_GET_NODE_INSTANCE_LIST \ + "SELECT ni.node_id, ni.host_id, h.hostname " \ + "FROM node_instance ni, host h WHERE ni.host_id = h.host_id AND h.hops >=0" struct node_instance_list *get_node_list(void) { @@ -762,7 +791,7 @@ failed: return node_list; }; -#define SQL_GET_HOST_NODE_ID "select node_id from node_instance where host_id = @host_id;" +#define SQL_GET_HOST_NODE_ID "SELECT node_id FROM node_instance WHERE host_id = @host_id" void sql_load_node_id(RRDHOST *host) { @@ -801,7 +830,7 @@ failed: }; -#define SELECT_HOST_INFO "SELECT system_key, system_value FROM host_info WHERE host_id = @host_id;" +#define SELECT_HOST_INFO "SELECT system_key, system_value FROM host_info WHERE host_id = @host_id" void sql_build_host_system_info(uuid_t *host_id, struct rrdhost_system_info *system_info) { @@ -832,7 +861,7 @@ skip: } #define SELECT_HOST_LABELS "SELECT label_key, label_value, source_type FROM host_label WHERE host_id = @host_id " \ - "AND label_key IS NOT NULL AND label_value IS NOT NULL;" + "AND label_key IS NOT NULL AND label_value IS NOT NULL" RRDLABELS *sql_load_host_labels(uuid_t *host_id) { @@ -888,7 +917,7 @@ int sql_metadata_cache_stats(int op) return count; } -#define SQL_DROP_TABLE "DROP table %s;" +#define SQL_DROP_TABLE "DROP table %s" void sql_drop_table(const char *table) { @@ -896,7 +925,7 @@ void sql_drop_table(const char *table) return; char wstr[255]; - snprintfz(wstr, 254, SQL_DROP_TABLE, table); + snprintfz(wstr, sizeof(wstr) - 1, SQL_DROP_TABLE, table); int rc = sqlite3_exec_monitored(db_meta, wstr, 0, 0, NULL); if (rc != SQLITE_OK) { diff --git a/database/sqlite/sqlite_functions.h b/database/sqlite/sqlite_functions.h index 9cd1f7ad4..40b5af464 100644 --- a/database/sqlite/sqlite_functions.h +++ b/database/sqlite/sqlite_functions.h @@ -21,8 +21,9 @@ struct node_instance_list { typedef enum db_check_action_type { DB_CHECK_NONE = (1 << 0), DB_CHECK_RECLAIM_SPACE = (1 << 1), - DB_CHECK_CONT = (1 << 2), - DB_CHECK_RECOVER = (1 << 3), + DB_CHECK_ANALYZE = (1 << 2), + DB_CHECK_CONT = (1 << 3), + DB_CHECK_RECOVER = (1 << 4), } db_check_action_type_t; #define SQL_MAX_RETRY (100) diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index 6fc6a2e64..7d79ff70b 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -95,18 +95,22 @@ failed: /* Health related SQL queries Inserts an entry in the table */ + #define SQL_INSERT_HEALTH_LOG \ "INSERT INTO health_log (host_id, alarm_id, " \ - "config_hash_id, name, chart, exec, recipient, units, chart_context, last_transition_id, chart_name) " \ - "VALUES (?,?,?,?,?,?,?,?,?,?,?) " \ - "ON CONFLICT (host_id, alarm_id) DO UPDATE SET last_transition_id = excluded.last_transition_id, " \ - "chart_name = excluded.chart_name RETURNING health_log_id; " - -#define SQL_INSERT_HEALTH_LOG_DETAIL \ - "INSERT INTO health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, " \ - "updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, " \ + "config_hash_id, name, chart, exec, recipient, units, chart_context, last_transition_id, chart_name) " \ + "VALUES (@host_id,@alarm_id, @config_hash_id,@name,@chart,@exec,@recipient,@units,@chart_context," \ + "@last_transition_id,@chart_name) ON CONFLICT (host_id, alarm_id) DO UPDATE " \ + "SET last_transition_id = excluded.last_transition_id, chart_name = excluded.chart_name RETURNING health_log_id" + +#define SQL_INSERT_HEALTH_LOG_DETAIL \ + "INSERT INTO health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, " \ + "updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, " \ "info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id, summary) " \ - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,@global_id,?); " + "VALUES (@health_log_id,@unique_id,@alarm_id,@alarm_event_id,@updated_by_id,@updates_id,@when_key,@duration," \ + "@non_clear_duration,@flags,@exec_run_timestamp,@delay_up_to_timestamp, @info,@exec_code,@new_status,@old_status," \ + "@delay,@new_value,@old_value,@last_repeat,@transition_id,@global_id,@summary)" + static void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { sqlite3_stmt *res = NULL; int rc; @@ -353,7 +357,6 @@ static void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { } ae->flags |= HEALTH_ENTRY_FLAG_SAVED; - host->health.health_log_entries_written++; failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) @@ -374,48 +377,6 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) } } -/* Health related SQL queries - Get a count of rows from health log table -*/ -#define SQL_COUNT_HEALTH_LOG_DETAIL "SELECT count(1) FROM health_log_detail hld, health_log hl " \ - "where hl.host_id = @host_id and hl.health_log_id = hld.health_log_id" - -static int sql_health_alarm_log_count(RRDHOST *host) { - sqlite3_stmt *res = NULL; - int rc; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - error_report("Database has not been initialized"); - return -1; - } - - int entries_in_db = -1; - - rc = sqlite3_prepare_v2(db_meta, SQL_COUNT_HEALTH_LOG_DETAIL, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to count health log entries from db"); - goto done; - } - - rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id for SQL_COUNT_HEALTH_LOG."); - goto done; - } - - rc = sqlite3_step_monitored(res); - if (likely(rc == SQLITE_ROW)) - entries_in_db = (int) sqlite3_column_int64(res, 0); - -done: - rc = sqlite3_finalize(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize the prepared statement to count health log entries from db"); - - return entries_in_db; -} - /* * * Health related SQL queries @@ -423,19 +384,22 @@ done: * */ -#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED "DELETE FROM health_log_detail WHERE health_log_id IN " \ - "(SELECT health_log_id FROM health_log WHERE host_id = @host_id) AND when_key < unixepoch() - @history " \ - "AND updated_by_id <> 0 AND transition_id NOT IN " \ - "(SELECT last_transition_id FROM health_log hl WHERE hl.host_id = @host_id);" - -#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid) "DELETE from health_log_detail WHERE unique_id NOT IN " \ - "(SELECT filtered_alert_unique_id FROM aclk_alert_%s) " \ - "AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE " \ - "hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id) " \ - "AND health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = @host_id) " \ - "AND when_key < unixepoch() - @history " \ - "AND updated_by_id <> 0 AND transition_id NOT IN " \ - "(SELECT last_transition_id FROM health_log hl WHERE hl.host_id = @host_id);", guid +#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED \ + "DELETE FROM health_log_detail WHERE health_log_id IN " \ + "(SELECT health_log_id FROM health_log WHERE host_id = @host_id) AND when_key < UNIXEPOCH() - @history " \ + "AND updated_by_id <> 0 AND transition_id NOT IN " \ + "(SELECT last_transition_id FROM health_log hl WHERE hl.host_id = @host_id)" + +#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid) \ + "DELETE from health_log_detail WHERE unique_id NOT IN " \ + "(SELECT filtered_alert_unique_id FROM aclk_alert_%s) " \ + "AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE " \ + "hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id) " \ + "AND health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = @host_id) " \ + "AND when_key < unixepoch() - @history " \ + "AND updated_by_id <> 0 AND transition_id NOT IN " \ + "(SELECT last_transition_id FROM health_log hl WHERE hl.host_id = @host_id)", \ + guid void sql_health_alarm_log_cleanup(RRDHOST *host, bool claimed) { sqlite3_stmt *res = NULL; @@ -450,14 +414,14 @@ void sql_health_alarm_log_cleanup(RRDHOST *host, bool claimed) { char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); + snprintfz(command, sizeof(command) - 1, "aclk_alert_%s", uuid_str); bool aclk_table_exists = table_exists_in_database(db_meta, command); char *sql = SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED; if (claimed && aclk_table_exists) { - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str)); + snprintfz(command, sizeof(command) - 1, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str)); sql = command; } @@ -483,10 +447,6 @@ void sql_health_alarm_log_cleanup(RRDHOST *host, bool claimed) { if (unlikely(rc != SQLITE_DONE)) error_report("Failed to cleanup health log detail table, rc = %d", rc); - int rows = sql_health_alarm_log_count(host); - if (rows >= 0) - host->health.health_log_entries_written = rows; - if (aclk_table_exists) sql_aclk_alert_clean_dead_entries(host); @@ -497,17 +457,25 @@ done: } #define SQL_INJECT_REMOVED \ - "insert into health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, " \ + "INSERT INTO health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, " \ "duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, " \ "delay, new_value, old_value, last_repeat, transition_id, global_id, summary) " \ - "select health_log_id, ?1, ?2, ?3, 0, ?4, unixepoch(), 0, 0, flags, exec_run_timestamp, unixepoch(), info, exec_code, -2, " \ - "new_status, delay, NULL, new_value, 0, ?5, now_usec(0), summary from health_log_detail where unique_id = ?6 and transition_id = ?7;" - -#define SQL_INJECT_REMOVED_UPDATE_DETAIL "update health_log_detail set flags = flags | ?1, updated_by_id = ?2 where unique_id = ?3 and transition_id = ?4;" - -#define SQL_INJECT_REMOVED_UPDATE_LOG "update health_log set last_transition_id = ?1 where alarm_id = ?2 and last_transition_id = ?3 and host_id = ?4;" - -void sql_inject_removed_status(RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, uint32_t unique_id, uint32_t max_unique_id, uuid_t *prev_transition_id) + "SELECT health_log_id, ?1, ?2, ?3, 0, ?4, UNIXEPOCH(), 0, 0, flags, exec_run_timestamp, UNIXEPOCH(), info, exec_code, -2, " \ + "new_status, delay, NULL, new_value, 0, ?5, NOW_USEC(0), summary FROM health_log_detail WHERE unique_id = ?6 AND transition_id = ?7" + +#define SQL_INJECT_REMOVED_UPDATE_DETAIL \ + "UPDATE health_log_detail SET flags = flags | ?1, updated_by_id = ?2 WHERE unique_id = ?3 AND transition_id = ?4" + +#define SQL_INJECT_REMOVED_UPDATE_LOG \ + "UPDATE health_log SET last_transition_id = ?1 WHERE alarm_id = ?2 AND last_transition_id = ?3 AND host_id = ?4" + +void sql_inject_removed_status( + RRDHOST *host, + uint32_t alarm_id, + uint32_t alarm_event_id, + uint32_t unique_id, + uint32_t max_unique_id, + uuid_t *prev_transition_id) { int rc; @@ -737,13 +705,14 @@ void sql_check_removed_alerts_state(RRDHOST *host) /* Health related SQL queries Load from the health log table */ -#define SQL_LOAD_HEALTH_LOG "SELECT hld.unique_id, hld.alarm_id, hld.alarm_event_id, hl.config_hash_id, hld.updated_by_id, " \ - "hld.updates_id, hld.when_key, hld.duration, hld.non_clear_duration, hld.flags, hld.exec_run_timestamp, " \ - "hld.delay_up_to_timestamp, hl.name, hl.chart, hl.exec, hl.recipient, ah.source, hl.units, " \ - "hld.info, hld.exec_code, hld.new_status, hld.old_status, hld.delay, hld.new_value, hld.old_value, " \ - "hld.last_repeat, ah.class, ah.component, ah.type, hl.chart_context, hld.transition_id, hld.global_id, hl.chart_name, hld.summary " \ - "FROM health_log hl, alert_hash ah, health_log_detail hld " \ - "WHERE hl.config_hash_id = ah.hash_id and hl.host_id = @host_id and hl.last_transition_id = hld.transition_id;" +#define SQL_LOAD_HEALTH_LOG \ + "SELECT hld.unique_id, hld.alarm_id, hld.alarm_event_id, hl.config_hash_id, hld.updated_by_id, " \ + "hld.updates_id, hld.when_key, hld.duration, hld.non_clear_duration, hld.flags, hld.exec_run_timestamp, " \ + "hld.delay_up_to_timestamp, hl.name, hl.chart, hl.exec, hl.recipient, ah.source, hl.units, " \ + "hld.info, hld.exec_code, hld.new_status, hld.old_status, hld.delay, hld.new_value, hld.old_value, " \ + "hld.last_repeat, ah.class, ah.component, ah.type, hl.chart_context, hld.transition_id, hld.global_id, " \ + "hl.chart_name, hld.summary FROM health_log hl, alert_hash ah, health_log_detail hld " \ + "WHERE hl.config_hash_id = ah.hash_id and hl.host_id = @host_id and hl.last_transition_id = hld.transition_id" void sql_health_alarm_log_load(RRDHOST *host) { @@ -751,8 +720,6 @@ void sql_health_alarm_log_load(RRDHOST *host) int ret; ssize_t errored = 0, loaded = 0; - host->health.health_log_entries_written = 0; - if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) error_report("HEALTH [%s]: Database has not been initialized", rrdhost_hostname(host)); @@ -914,27 +881,28 @@ void sql_health_alarm_log_load(RRDHOST *host) if (unlikely(!host->health_log.next_alarm_id || host->health_log.next_alarm_id <= host->health_max_alarm_id)) host->health_log.next_alarm_id = host->health_max_alarm_id + 1; - netdata_log_health("[%s]: Table health_log, loaded %zd alarm entries, errors in %zd entries.", rrdhost_hostname(host), loaded, errored); + nd_log(NDLS_DAEMON, errored ? NDLP_WARNING : NDLP_DEBUG, + "[%s]: Table health_log, loaded %zd alarm entries, errors in %zd entries.", + rrdhost_hostname(host), loaded, errored); ret = sqlite3_finalize(res); if (unlikely(ret != SQLITE_OK)) error_report("Failed to finalize the health log read statement"); - - int rows = sql_health_alarm_log_count(host); - - if (rows >= 0) - host->health.health_log_entries_written = rows; } /* * Store an alert config hash in the database */ -#define SQL_STORE_ALERT_CONFIG_HASH "insert or replace into alert_hash (hash_id, date_updated, alarm, template, " \ - "on_key, class, component, type, os, hosts, lookup, every, units, calc, plugin, module, " \ - "charts, green, red, warn, crit, exec, to_key, info, delay, options, repeat, host_labels, " \ - "p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after, " \ - "p_db_lookup_before, p_update_every, source, chart_labels, summary) values (?1,unixepoch(),?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12," \ - "?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24,?25,?26,?27,?28,?29,?30,?31,?32,?33,?34,?35,?36);" +#define SQL_STORE_ALERT_CONFIG_HASH \ + "insert or replace into alert_hash (hash_id, date_updated, alarm, template, " \ + "on_key, class, component, type, os, hosts, lookup, every, units, calc, plugin, module, " \ + "charts, green, red, warn, crit, exec, to_key, info, delay, options, repeat, host_labels, " \ + "p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after, " \ + "p_db_lookup_before, p_update_every, source, chart_labels, summary) values (@hash_id,UNIXEPOCH(),@alarm,@template," \ + "@on_key,@class,@component,@type,@os,@hosts,@lookup,@every,@units,@calc,@plugin,@module," \ + "@charts,@green,@red,@warn,@crit,@exec,@to_key,@info,@delay,@options,@repeat,@host_labels," \ + "@p_db_lookup_dimensions,@p_db_lookup_method,@p_db_lookup_options,@p_db_lookup_after," \ + "@p_db_lookup_before,@p_update_every,@source,@chart_labels,@summary)" int sql_store_alert_config_hash(uuid_t *hash_id, struct alert_config *cfg) { @@ -1212,7 +1180,7 @@ int alert_hash_and_store_config( #define SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT \ "SELECT hld.new_status FROM health_log hl, health_log_detail hld " \ "WHERE hl.host_id = @host_id AND hl.alarm_id = @alarm_id AND hld.unique_id != @unique_id AND hld.flags & @flags " \ - "AND hl.health_log_id = hld.health_log_id ORDER BY hld.unique_id DESC LIMIT 1;" + "AND hl.health_log_id = hld.health_log_id ORDER BY hld.unique_id DESC LIMIT 1" int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status) { @@ -1270,191 +1238,162 @@ done: "hl.units, hld.info, hld.exec_code, hld.new_status, hld.old_status, hld.delay, hld.new_value, hld.old_value, " \ "hld.last_repeat, ah.class, ah.component, ah.type, hl.chart_context, hld.transition_id, hld.summary " \ "FROM health_log hl, alert_hash ah, health_log_detail hld WHERE hl.config_hash_id = ah.hash_id and " \ - "hl.health_log_id = hld.health_log_id and hl.host_id = @host_id " + "hl.health_log_id = hld.health_log_id and hl.host_id = @host_id AND hld.unique_id > @after " -void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { +void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, time_t after, const char *chart) +{ + unsigned int max = host->health_log.max; - buffer_strcat(wb, "["); + static __thread sqlite3_stmt *stmt_no_chart = NULL; + static __thread sqlite3_stmt *stmt_with_chart = NULL; - unsigned int max = host->health_log.max; - unsigned int count = 0; + sqlite3_stmt **active_stmt; + sqlite3_stmt *stmt_query; - sqlite3_stmt *res = NULL; - int rc; + int rc; - BUFFER *command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL); - buffer_sprintf(command, SQL_SELECT_HEALTH_LOG); + active_stmt = chart ? &stmt_with_chart : &stmt_no_chart; - if (chart) { - char chart_sql[MAX_HEALTH_SQL_SIZE + 1]; - snprintfz(chart_sql, MAX_HEALTH_SQL_SIZE, "AND hl.chart = '%s' ", chart); - buffer_strcat(command, chart_sql); - } + if (!*active_stmt) { - if (after) { - char after_sql[MAX_HEALTH_SQL_SIZE + 1]; - snprintfz(after_sql, MAX_HEALTH_SQL_SIZE, "AND hld.unique_id > %u ", after); - buffer_strcat(command, after_sql); - } + BUFFER *command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL); + buffer_sprintf(command, SQL_SELECT_HEALTH_LOG); - { - char limit_sql[MAX_HEALTH_SQL_SIZE + 1]; - snprintfz(limit_sql, MAX_HEALTH_SQL_SIZE, "ORDER BY hld.unique_id DESC LIMIT %u ", max); - buffer_strcat(command, limit_sql); - } + if (chart) + buffer_strcat(command, " AND hl.chart = @chart "); - rc = sqlite3_prepare_v2(db_meta, buffer_tostring(command), -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement SQL_SELECT_HEALTH_LOG"); - buffer_free(command); - return; - } + buffer_strcat(command, " ORDER BY hld.unique_id DESC LIMIT @limit"); - rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); - if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id for SQL_SELECT_HEALTH_LOG."); - sqlite3_finalize(res); - buffer_free(command); - return; - } + rc = prepare_statement(db_meta, buffer_tostring(command), active_stmt); + buffer_free(command); - while (sqlite3_step(res) == SQLITE_ROW) { + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement SQL_SELECT_HEALTH_LOG"); + return; + } + } - char old_value_string[100 + 1]; - char new_value_string[100 + 1]; + stmt_query = *active_stmt; - char config_hash_id[UUID_STR_LEN]; - uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 3)), config_hash_id); + int param = 0; + rc = sqlite3_bind_blob(stmt_query, ++param, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id for SQL_SELECT_HEALTH_LOG."); + goto finish; + } - char transition_id[UUID_STR_LEN] = {0}; - if (sqlite3_column_type(res, 30) != SQLITE_NULL) - uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 30)), transition_id); - - char *edit_command = sqlite3_column_bytes(res, 16) > 0 ? health_edit_command_from_source((char *)sqlite3_column_text(res, 16)) : strdupz("UNKNOWN=0=UNKNOWN"); - - if (count) - buffer_sprintf(wb, ","); - - count++; - - buffer_sprintf( - wb, - "\n\t{\n" - "\t\t\"hostname\": \"%s\",\n" - "\t\t\"utc_offset\": %d,\n" - "\t\t\"timezone\": \"%s\",\n" - "\t\t\"unique_id\": %u,\n" - "\t\t\"alarm_id\": %u,\n" - "\t\t\"alarm_event_id\": %u,\n" - "\t\t\"config_hash_id\": \"%s\",\n" - "\t\t\"transition_id\": \"%s\",\n" - "\t\t\"name\": \"%s\",\n" - "\t\t\"chart\": \"%s\",\n" - "\t\t\"context\": \"%s\",\n" - "\t\t\"class\": \"%s\",\n" - "\t\t\"component\": \"%s\",\n" - "\t\t\"type\": \"%s\",\n" - "\t\t\"processed\": %s,\n" - "\t\t\"updated\": %s,\n" - "\t\t\"exec_run\": %lu,\n" - "\t\t\"exec_failed\": %s,\n" - "\t\t\"exec\": \"%s\",\n" - "\t\t\"recipient\": \"%s\",\n" - "\t\t\"exec_code\": %d,\n" - "\t\t\"source\": \"%s\",\n" - "\t\t\"command\": \"%s\",\n" - "\t\t\"units\": \"%s\",\n" - "\t\t\"when\": %lu,\n" - "\t\t\"duration\": %lu,\n" - "\t\t\"non_clear_duration\": %lu,\n" - "\t\t\"status\": \"%s\",\n" - "\t\t\"old_status\": \"%s\",\n" - "\t\t\"delay\": %d,\n" - "\t\t\"delay_up_to_timestamp\": %lu,\n" - "\t\t\"updated_by_id\": %u,\n" - "\t\t\"updates_id\": %u,\n" - "\t\t\"value_string\": \"%s\",\n" - "\t\t\"old_value_string\": \"%s\",\n" - "\t\t\"last_repeat\": %lu,\n" - "\t\t\"silenced\": \"%s\",\n", - rrdhost_hostname(host), - host->utc_offset, - rrdhost_abbrev_timezone(host), - (unsigned int) sqlite3_column_int64(res, 0), - (unsigned int) sqlite3_column_int64(res, 1), - (unsigned int) sqlite3_column_int64(res, 2), - config_hash_id, - transition_id, - sqlite3_column_text(res, 12), - sqlite3_column_text(res, 13), - sqlite3_column_text(res, 29), - sqlite3_column_text(res, 26) ? (const char *) sqlite3_column_text(res, 26) : (char *) "Unknown", - sqlite3_column_text(res, 27) ? (const char *) sqlite3_column_text(res, 27) : (char *) "Unknown", - sqlite3_column_text(res, 28) ? (const char *) sqlite3_column_text(res, 28) : (char *) "Unknown", - (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false", - (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false", - (long unsigned int)sqlite3_column_int64(res, 10), - (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false", - sqlite3_column_text(res, 14) ? (const char *) sqlite3_column_text(res, 14) : string2str(host->health.health_default_exec), - sqlite3_column_text(res, 15) ? (const char *) sqlite3_column_text(res, 15) : string2str(host->health.health_default_recipient), - sqlite3_column_int(res, 19), - sqlite3_column_text(res, 16) ? (const char *) sqlite3_column_text(res, 16) : (char *) "Unknown", - edit_command, - sqlite3_column_text(res, 17), - (long unsigned int)sqlite3_column_int64(res, 6), - (long unsigned int)sqlite3_column_int64(res, 7), - (long unsigned int)sqlite3_column_int64(res, 8), - rrdcalc_status2string(sqlite3_column_int(res, 20)), - rrdcalc_status2string(sqlite3_column_int(res, 21)), - sqlite3_column_int(res, 22), - (long unsigned int)sqlite3_column_int64(res, 11), - (unsigned int)sqlite3_column_int64(res, 4), - (unsigned int)sqlite3_column_int64(res, 5), - sqlite3_column_type(res, 23) == SQLITE_NULL ? "-" : format_value_and_unit(new_value_string, 100, sqlite3_column_double(res, 23), (char *) sqlite3_column_text(res, 17), -1), - sqlite3_column_type(res, 24) == SQLITE_NULL ? "-" : format_value_and_unit(old_value_string, 100, sqlite3_column_double(res, 24), (char *) sqlite3_column_text(res, 17), -1), - (long unsigned int)sqlite3_column_int64(res, 25), - (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"); - - health_string2json(wb, "\t\t", "summary", (char *) sqlite3_column_text(res, 31), ",\n"); - health_string2json(wb, "\t\t", "info", (char *) sqlite3_column_text(res, 18), ",\n"); - - if(unlikely(sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { - buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); - } + rc = sqlite3_bind_int64(stmt_query, ++param, after); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind after for SQL_SELECT_HEALTH_LOG."); + goto finish; + } - buffer_strcat(wb, "\t\t\"value\":"); - if (sqlite3_column_type(res, 23) == SQLITE_NULL) - buffer_strcat(wb, "null"); - else - buffer_print_netdata_double(wb, sqlite3_column_double(res, 23)); - buffer_strcat(wb, ",\n"); + if (chart) { + rc = sqlite3_bind_text(stmt_query, ++param, chart, -1, SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind after for SQL_SELECT_HEALTH_LOG."); + goto finish; + } + } - buffer_strcat(wb, "\t\t\"old_value\":"); - if (sqlite3_column_type(res, 24) == SQLITE_NULL) - buffer_strcat(wb, "null"); + rc = sqlite3_bind_int64(stmt_query, ++param, max); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind max lines for SQL_SELECT_HEALTH_LOG."); + goto finish; + } + + buffer_json_initialize(wb, "\"", "\"", 0, false, BUFFER_JSON_OPTIONS_DEFAULT); + buffer_json_member_add_array(wb, NULL); + + while (sqlite3_step(stmt_query) == SQLITE_ROW) { + char old_value_string[100 + 1]; + char new_value_string[100 + 1]; + + char config_hash_id[UUID_STR_LEN]; + uuid_unparse_lower(*((uuid_t *)sqlite3_column_blob(stmt_query, 3)), config_hash_id); + + char transition_id[UUID_STR_LEN] = {0}; + if (sqlite3_column_type(stmt_query, 30) != SQLITE_NULL) + uuid_unparse_lower(*((uuid_t *)sqlite3_column_blob(stmt_query, 30)), transition_id); + + char *edit_command = sqlite3_column_bytes(stmt_query, 16) > 0 ? + health_edit_command_from_source((char *)sqlite3_column_text(stmt_query, 16)) : + strdupz("UNKNOWN=0=UNKNOWN"); + + buffer_json_add_array_item_object(wb); // this node + + buffer_json_member_add_string_or_empty(wb, "hostname", rrdhost_hostname(host)); + buffer_json_member_add_int64(wb, "utc_offset", (int64_t)host->utc_offset); + buffer_json_member_add_string_or_empty(wb, "timezone", rrdhost_abbrev_timezone(host)); + buffer_json_member_add_int64(wb, "unique_id", (int64_t) sqlite3_column_int64(stmt_query, 0)); + buffer_json_member_add_int64(wb, "alarm_id", (int64_t) sqlite3_column_int64(stmt_query, 1)); + buffer_json_member_add_int64(wb, "alarm_event_id", (int64_t) sqlite3_column_int64(stmt_query, 2)); + buffer_json_member_add_string_or_empty(wb, "config_hash_id", config_hash_id); + buffer_json_member_add_string_or_empty(wb, "transition_id", transition_id); + buffer_json_member_add_string_or_empty(wb, "name", (const char *) sqlite3_column_text(stmt_query, 12)); + buffer_json_member_add_string_or_empty(wb, "chart", (const char *) sqlite3_column_text(stmt_query, 13)); + buffer_json_member_add_string_or_empty(wb, "context", (const char *) sqlite3_column_text(stmt_query, 29)); + buffer_json_member_add_string_or_empty(wb, "class", sqlite3_column_text(stmt_query, 26) ? (const char *) sqlite3_column_text(stmt_query, 26) : (char *) "Unknown"); + buffer_json_member_add_string_or_empty(wb, "component", sqlite3_column_text(stmt_query, 27) ? (const char *) sqlite3_column_text(stmt_query, 27) : (char *) "Unknown"); + buffer_json_member_add_string_or_empty(wb, "type", sqlite3_column_text(stmt_query, 28) ? (const char *) sqlite3_column_text(stmt_query, 28) : (char *) "Unknown"); + buffer_json_member_add_boolean(wb, "processed", (sqlite3_column_int64(stmt_query, 9) & HEALTH_ENTRY_FLAG_PROCESSED)); + buffer_json_member_add_boolean(wb, "updated", (sqlite3_column_int64(stmt_query, 9) & HEALTH_ENTRY_FLAG_UPDATED)); + buffer_json_member_add_int64(wb, "exec_run", (int64_t)sqlite3_column_int64(stmt_query, 10)); + buffer_json_member_add_boolean(wb, "exec_failed", (sqlite3_column_int64(stmt_query, 9) & HEALTH_ENTRY_FLAG_EXEC_FAILED)); + buffer_json_member_add_string_or_empty(wb, "exec", sqlite3_column_text(stmt_query, 14) ? (const char *) sqlite3_column_text(stmt_query, 14) : string2str(host->health.health_default_exec)); + buffer_json_member_add_string_or_empty(wb, "recipient", sqlite3_column_text(stmt_query, 15) ? (const char *) sqlite3_column_text(stmt_query, 15) : string2str(host->health.health_default_recipient)); + buffer_json_member_add_int64(wb, "exec_code", sqlite3_column_int(stmt_query, 19)); + buffer_json_member_add_string_or_empty(wb, "source", sqlite3_column_text(stmt_query, 16) ? (const char *) sqlite3_column_text(stmt_query, 16) : (char *) "Unknown"); + buffer_json_member_add_string_or_empty(wb, "command", edit_command); + buffer_json_member_add_string_or_empty(wb, "units", (const char *) sqlite3_column_text(stmt_query, 17)); + buffer_json_member_add_int64(wb, "when", (int64_t)sqlite3_column_int64(stmt_query, 6)); + buffer_json_member_add_int64(wb, "duration", (int64_t)sqlite3_column_int64(stmt_query, 7)); + buffer_json_member_add_int64(wb, "non_clear_duration", (int64_t)sqlite3_column_int64(stmt_query, 8)); + buffer_json_member_add_string_or_empty(wb, "status", rrdcalc_status2string(sqlite3_column_int(stmt_query, 20))); + buffer_json_member_add_string_or_empty(wb, "old_status", rrdcalc_status2string(sqlite3_column_int(stmt_query, 21))); + buffer_json_member_add_int64(wb, "delay", sqlite3_column_int(stmt_query, 22)); + buffer_json_member_add_int64(wb, "delay_up_to_timestamp",(int64_t)sqlite3_column_int64(stmt_query, 11)); + buffer_json_member_add_int64(wb, "updated_by_id", (unsigned int)sqlite3_column_int64(stmt_query, 4)); + buffer_json_member_add_int64(wb, "updates_id", (unsigned int)sqlite3_column_int64(stmt_query, 5)); + buffer_json_member_add_string_or_empty(wb, "value_string", sqlite3_column_type(stmt_query, 23) == SQLITE_NULL ? "-" : + format_value_and_unit(new_value_string, 100, sqlite3_column_double(stmt_query, 23), (char *) sqlite3_column_text(stmt_query, 17), -1)); + buffer_json_member_add_string_or_empty(wb, "old_value_string", sqlite3_column_type(stmt_query, 24) == SQLITE_NULL ? "-" : + format_value_and_unit(old_value_string, 100, sqlite3_column_double(stmt_query, 24), (char *) sqlite3_column_text(stmt_query, 17), -1)); + buffer_json_member_add_int64(wb, "last_repeat", (int64_t)sqlite3_column_int64(stmt_query, 25)); + buffer_json_member_add_boolean(wb, "silenced", (sqlite3_column_int64(stmt_query, 9) & HEALTH_ENTRY_FLAG_SILENCED)); + buffer_json_member_add_string_or_empty(wb, "summary", (const char *) sqlite3_column_text(stmt_query, 31)); + buffer_json_member_add_string_or_empty(wb, "info", (const char *) sqlite3_column_text(stmt_query, 18)); + buffer_json_member_add_boolean(wb, "no_clear_notification",(sqlite3_column_int64(stmt_query, 9) & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)); + + if (sqlite3_column_type(stmt_query, 23) == SQLITE_NULL) + buffer_json_member_add_string(wb, "value", NULL); else - buffer_print_netdata_double(wb, sqlite3_column_double(res, 24)); - buffer_strcat(wb, "\n"); + buffer_json_member_add_double(wb, "value", sqlite3_column_double(stmt_query, 23)); - buffer_strcat(wb, "\t}"); + if (sqlite3_column_type(stmt_query, 24) == SQLITE_NULL) + buffer_json_member_add_string(wb, "old_value", NULL); + else + buffer_json_member_add_double(wb, "old_value", sqlite3_column_double(stmt_query, 23)); freez(edit_command); + + buffer_json_object_close(wb); } - buffer_strcat(wb, "\n]"); + buffer_json_array_close(wb); + buffer_json_finalize(wb); - rc = sqlite3_finalize(res); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize statement for SQL_SELECT_HEALTH_LOG"); - - buffer_free(command); +finish: + rc = sqlite3_reset(stmt_query); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement for SQL_SELECT_HEALTH_LOG"); } -#define SQL_COPY_HEALTH_LOG(table) "INSERT OR IGNORE INTO health_log (host_id, alarm_id, config_hash_id, name, chart, family, exec, recipient, units, chart_context) SELECT ?1, alarm_id, config_hash_id, name, chart, family, exec, recipient, units, chart_context from %s;", table -#define SQL_COPY_HEALTH_LOG_DETAIL(table) "INSERT INTO health_log_detail (unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id, host_id) SELECT unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, now_usec(1), ?1 from %s;", table -#define SQL_UPDATE_HEALTH_LOG_DETAIL_TRANSITION_ID "update health_log_detail set transition_id = uuid_random() where transition_id is null;" -#define SQL_UPDATE_HEALTH_LOG_DETAIL_HEALTH_LOG_ID "update health_log_detail set health_log_id = (select health_log_id from health_log where host_id = ?1 and alarm_id = health_log_detail.alarm_id) where health_log_id is null and host_id = ?2;" -#define SQL_UPDATE_HEALTH_LOG_LAST_TRANSITION_ID "update health_log set last_transition_id = (select transition_id from health_log_detail where health_log_id = health_log.health_log_id and alarm_id = health_log.alarm_id group by (alarm_id) having max(alarm_event_id)) where host_id = ?1;" +#define SQL_COPY_HEALTH_LOG(table) "INSERT OR IGNORE INTO health_log (host_id, alarm_id, config_hash_id, name, chart, family, exec, recipient, units, chart_context) SELECT ?1, alarm_id, config_hash_id, name, chart, family, exec, recipient, units, chart_context from %s", table +#define SQL_COPY_HEALTH_LOG_DETAIL(table) "INSERT INTO health_log_detail (unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id, host_id) SELECT unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, now_usec(1), ?1 from %s", table +#define SQL_UPDATE_HEALTH_LOG_DETAIL_TRANSITION_ID "update health_log_detail set transition_id = uuid_random() where transition_id is null" +#define SQL_UPDATE_HEALTH_LOG_DETAIL_HEALTH_LOG_ID "update health_log_detail set health_log_id = (select health_log_id from health_log where host_id = ?1 and alarm_id = health_log_detail.alarm_id) where health_log_id is null and host_id = ?2" +#define SQL_UPDATE_HEALTH_LOG_LAST_TRANSITION_ID "update health_log set last_transition_id = (select transition_id from health_log_detail where health_log_id = health_log.health_log_id and alarm_id = health_log.alarm_id group by (alarm_id) having max(alarm_event_id)) where host_id = ?1" int health_migrate_old_health_log_table(char *table) { if (!table) return 0; @@ -1476,7 +1415,7 @@ int health_migrate_old_health_log_table(char *table) { int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; sqlite3_stmt *res = NULL; - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COPY_HEALTH_LOG(table)); + snprintfz(command, sizeof(command) - 1, SQL_COPY_HEALTH_LOG(table)); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to copy health log, rc = %d", rc); @@ -1503,7 +1442,7 @@ int health_migrate_old_health_log_table(char *table) { } //detail - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COPY_HEALTH_LOG_DETAIL(table)); + snprintfz(command, sizeof(command) - 1, SQL_COPY_HEALTH_LOG_DETAIL(table)); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to copy health log detail, rc = %d", rc); @@ -1913,12 +1852,12 @@ void sql_alert_transitions( goto run_query; } - snprintfz(sql, 511, SQL_BUILD_ALERT_TRANSITION, nodes); + snprintfz(sql, sizeof(sql) - 1, SQL_BUILD_ALERT_TRANSITION, nodes); rc = db_execute(db_meta, sql); if (rc) return; - snprintfz(sql, 511, SQL_POPULATE_TEMP_ALERT_TRANSITION_TABLE, nodes); + snprintfz(sql, sizeof(sql) - 1, SQL_POPULATE_TEMP_ALERT_TRANSITION_TABLE, nodes); // Prepare statement to add things rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); @@ -2046,7 +1985,7 @@ done: done_only_drop: if (likely(!transition)) { - (void)snprintfz(sql, 511, "DROP TABLE IF EXISTS v_%p", nodes); + (void)snprintfz(sql, sizeof(sql) - 1, "DROP TABLE IF EXISTS v_%p", nodes); (void)db_execute(db_meta, sql); buffer_free(command); } @@ -2078,12 +2017,12 @@ int sql_get_alert_configuration( if (unlikely(!configs)) return added; - snprintfz(sql, 511, SQL_BUILD_CONFIG_TARGET_LIST, configs); + snprintfz(sql, sizeof(sql) - 1, SQL_BUILD_CONFIG_TARGET_LIST, configs); rc = db_execute(db_meta, sql); if (rc) return added; - snprintfz(sql, 511, SQL_POPULATE_TEMP_CONFIG_TARGET_TABLE, configs); + snprintfz(sql, sizeof(sql) - 1, SQL_POPULATE_TEMP_CONFIG_TARGET_TABLE, configs); // Prepare statement to add things rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); @@ -2180,7 +2119,7 @@ int sql_get_alert_configuration( error_report("Failed to finalize statement for sql_get_alert_configuration"); fail_only_drop: - (void)snprintfz(sql, 511, "DROP TABLE IF EXISTS c_%p", configs); + (void)snprintfz(sql, sizeof(sql) - 1, "DROP TABLE IF EXISTS c_%p", configs); (void)db_execute(db_meta, sql); buffer_free(command); return added; diff --git a/database/sqlite/sqlite_health.h b/database/sqlite/sqlite_health.h index e21912368..5549b7525 100644 --- a/database/sqlite/sqlite_health.h +++ b/database/sqlite/sqlite_health.h @@ -13,7 +13,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host, bool claimed); int alert_hash_and_store_config(uuid_t hash_id, struct alert_config *cfg, int store_hash); void sql_aclk_alert_clean_dead_entries(RRDHOST *host); int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status); -void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); +void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, time_t after, const char *chart); int health_migrate_old_health_log_table(char *table); uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id); uint32_t sql_get_alarm_id_check_zero_hash(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id); diff --git a/database/sqlite/sqlite_metadata.c b/database/sqlite/sqlite_metadata.c index 143783163..636f51966 100644 --- a/database/sqlite/sqlite_metadata.c +++ b/database/sqlite/sqlite_metadata.c @@ -4,11 +4,12 @@ // SQL statements -#define SQL_STORE_CLAIM_ID "INSERT INTO node_instance " \ - "(host_id, claim_id, date_created) VALUES (@host_id, @claim_id, unixepoch()) " \ - "ON CONFLICT(host_id) DO UPDATE SET claim_id = excluded.claim_id;" +#define SQL_STORE_CLAIM_ID \ + "INSERT INTO node_instance " \ + "(host_id, claim_id, date_created) VALUES (@host_id, @claim_id, UNIXEPOCH()) " \ + "ON CONFLICT(host_id) DO UPDATE SET claim_id = excluded.claim_id" -#define SQL_DELETE_HOST_LABELS "DELETE FROM host_label WHERE host_id = @uuid;" +#define SQL_DELETE_HOST_LABELS "DELETE FROM host_label WHERE host_id = @uuid" #define STORE_HOST_LABEL \ "INSERT INTO host_label (host_id, source_type, label_key, label_value, date_created) VALUES " @@ -18,13 +19,13 @@ #define STORE_HOST_OR_CHART_LABEL_VALUE "(u2h('%s'), %d,'%s','%s', unixepoch())" -#define DELETE_DIMENSION_UUID "DELETE FROM dimension WHERE dim_id = @uuid;" +#define DELETE_DIMENSION_UUID "DELETE FROM dimension WHERE dim_id = @uuid" #define SQL_STORE_HOST_INFO \ "INSERT OR REPLACE INTO host (host_id, hostname, registry_hostname, update_every, os, timezone, tags, hops, " \ "memory_mode, abbrev_timezone, utc_offset, program_name, program_version, entries, health_enabled, last_connected) " \ "VALUES (@host_id, @hostname, @registry_hostname, @update_every, @os, @timezone, @tags, @hops, " \ - "@memory_mode, @abbrev_tz, @utc_offset, @prog_name, @prog_version, @entries, @health_enabled, @last_connected);" + "@memory_mode, @abbrev_tz, @utc_offset, @prog_name, @prog_version, @entries, @health_enabled, @last_connected)" #define SQL_STORE_CHART \ "INSERT INTO chart (chart_id, host_id, type, id, name, family, context, title, unit, plugin, module, priority, " \ @@ -51,11 +52,10 @@ "(@uuid, @name, @value, UNIXEPOCH())" #define MIGRATE_LOCALHOST_TO_NEW_MACHINE_GUID \ - "UPDATE chart SET host_id = @host_id WHERE host_id in (SELECT host_id FROM host where host_id <> @host_id and hops = 0);" -#define DELETE_NON_EXISTING_LOCALHOST "DELETE FROM host WHERE hops = 0 AND host_id <> @host_id;" -#define DELETE_MISSING_NODE_INSTANCES "DELETE FROM node_instance WHERE host_id NOT IN (SELECT host_id FROM host);" + "UPDATE chart SET host_id = @host_id WHERE host_id in (SELECT host_id FROM host where host_id <> @host_id and hops = 0)" +#define DELETE_NON_EXISTING_LOCALHOST "DELETE FROM host WHERE hops = 0 AND host_id <> @host_id" +#define DELETE_MISSING_NODE_INSTANCES "DELETE FROM node_instance WHERE host_id NOT IN (SELECT host_id FROM host)" -#define METADATA_CMD_Q_MAX_SIZE (2048) // Max queue size; callers will block until there is room #define METADATA_MAINTENANCE_FIRST_CHECK (1800) // Maintenance first run after agent startup in seconds #define METADATA_MAINTENANCE_REPEAT (60) // Repeat if last run for dimensions, charts, labels needs more work #define METADATA_HEALTH_LOG_INTERVAL (3600) // Repeat maintenance for health @@ -81,10 +81,10 @@ enum metadata_opcode { METADATA_ADD_HOST_INFO, METADATA_SCAN_HOSTS, METADATA_LOAD_HOST_CONTEXT, + METADATA_DELETE_HOST_CHART_LABELS, METADATA_MAINTENANCE, METADATA_SYNC_SHUTDOWN, METADATA_UNITTEST, - METADATA_ML_LOAD_MODELS, // leave this last // we need it to check for worker utilization METADATA_MAX_ENUMERATIONS_DEFINED @@ -98,14 +98,9 @@ struct metadata_cmd { struct metadata_cmd *prev, *next; }; -struct metadata_database_cmdqueue { - struct metadata_cmd *cmd_base; -}; - typedef enum { METADATA_FLAG_PROCESSING = (1 << 0), // store or cleanup METADATA_FLAG_SHUTDOWN = (1 << 1), // Shutting down - METADATA_FLAG_ML_LOADING = (1 << 2), // ML model load in progress } METADATA_FLAG; struct metadata_wc { @@ -114,19 +109,20 @@ struct metadata_wc { uv_async_t async; uv_timer_t timer_req; time_t metadata_check_after; - volatile unsigned queue_size; METADATA_FLAG flags; - struct completion init_complete; + struct completion start_stop_complete; struct completion *scan_complete; /* FIFO command queue */ - uv_mutex_t cmd_mutex; - struct metadata_database_cmdqueue cmd_queue; + SPINLOCK cmd_queue_lock; + struct metadata_cmd *cmd_base; }; #define metadata_flag_check(target_flags, flag) (__atomic_load_n(&((target_flags)->flags), __ATOMIC_SEQ_CST) & (flag)) #define metadata_flag_set(target_flags, flag) __atomic_or_fetch(&((target_flags)->flags), (flag), __ATOMIC_SEQ_CST) #define metadata_flag_clear(target_flags, flag) __atomic_and_fetch(&((target_flags)->flags), ~(flag), __ATOMIC_SEQ_CST) +struct metadata_wc metasync_worker = {.loop = NULL}; + // // For unittest // @@ -146,6 +142,33 @@ struct query_build { char uuid_str[UUID_STR_LEN]; }; +#define SQL_DELETE_CHART_LABELS_BY_HOST \ + "DELETE FROM chart_label WHERE chart_id in (SELECT chart_id FROM chart WHERE host_id = @host_id)" + +static void delete_host_chart_labels(uuid_t *host_uuid) +{ + sqlite3_stmt *res = NULL; + + int rc = sqlite3_prepare_v2(db_meta, SQL_DELETE_CHART_LABELS_BY_HOST, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to delete chart labels by host"); + return; + } + + rc = sqlite3_bind_blob(res, 1, host_uuid, sizeof(*host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter to host chart labels"); + goto failed; + } + rc = sqlite3_step_monitored(res); + if (unlikely(rc != SQLITE_DONE)) + error_report("Failed to execute command to remove host chart labels"); + +failed: + if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) + error_report("Failed to finalize statement to remove host chart labels"); +} + static int host_label_store_to_sql_callback(const char *name, const char *value, RRDLABEL_SRC ls, void *data) { struct query_build *lb = data; if (unlikely(!lb->count)) @@ -168,8 +191,8 @@ static int chart_label_store_to_sql_callback(const char *name, const char *value return 1; } -#define SQL_DELETE_CHART_LABEL "DELETE FROM chart_label WHERE chart_id = @chart_id;" -#define SQL_DELETE_CHART_LABEL_HISTORY "DELETE FROM chart_label WHERE date_created < %ld AND chart_id = @chart_id;" +#define SQL_DELETE_CHART_LABEL "DELETE FROM chart_label WHERE chart_id = @chart_id" +#define SQL_DELETE_CHART_LABEL_HISTORY "DELETE FROM chart_label WHERE date_created < %ld AND chart_id = @chart_id" static void clean_old_chart_labels(RRDSET *st) { @@ -177,9 +200,9 @@ static void clean_old_chart_labels(RRDSET *st) time_t first_time_s = rrdset_first_entry_s(st); if (unlikely(!first_time_s)) - snprintfz(sql, 511,SQL_DELETE_CHART_LABEL); + snprintfz(sql, sizeof(sql) - 1, SQL_DELETE_CHART_LABEL); else - snprintfz(sql, 511,SQL_DELETE_CHART_LABEL_HISTORY, first_time_s); + snprintfz(sql, sizeof(sql) - 1, SQL_DELETE_CHART_LABEL_HISTORY, first_time_s); int rc = exec_statement_with_uuid(sql, &st->chart_uuid); if (unlikely(rc)) @@ -873,7 +896,7 @@ static void check_dimension_metadata(struct metadata_wc *wc) next_execution_t = now + METADATA_DIM_CHECK_INTERVAL; } - netdata_log_info( + internal_error(true, "METADATA: Dimensions checked %u, deleted %u. Checks will %s in %lld seconds", total_checked, total_deleted, @@ -940,7 +963,7 @@ static void check_chart_metadata(struct metadata_wc *wc) next_execution_t = now + METADATA_CHART_CHECK_INTERVAL; } - netdata_log_info( + internal_error(true, "METADATA: Charts checked %u, deleted %u. Checks will %s in %lld seconds", total_checked, total_deleted, @@ -1009,7 +1032,7 @@ static void check_label_metadata(struct metadata_wc *wc) next_execution_t = now + METADATA_LABEL_CHECK_INTERVAL; } - netdata_log_info( + internal_error(true, "METADATA: Chart labels checked %u, deleted %u. Checks will %s in %lld seconds", total_checked, total_deleted, @@ -1059,21 +1082,15 @@ static void cleanup_health_log(struct metadata_wc *wc) // EVENT LOOP STARTS HERE // -static void metadata_init_cmd_queue(struct metadata_wc *wc) -{ - wc->cmd_queue.cmd_base = NULL; - fatal_assert(0 == uv_mutex_init(&wc->cmd_mutex)); -} - static void metadata_free_cmd_queue(struct metadata_wc *wc) { - uv_mutex_lock(&wc->cmd_mutex); - while(wc->cmd_queue.cmd_base) { - struct metadata_cmd *t = wc->cmd_queue.cmd_base; - DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wc->cmd_queue.cmd_base, t, prev, next); + spinlock_lock(&wc->cmd_queue_lock); + while(wc->cmd_base) { + struct metadata_cmd *t = wc->cmd_base; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wc->cmd_base, t, prev, next); freez(t); } - uv_mutex_unlock(&wc->cmd_mutex); + spinlock_unlock(&wc->cmd_queue_lock); } static void metadata_enq_cmd(struct metadata_wc *wc, struct metadata_cmd *cmd) @@ -1090,9 +1107,9 @@ static void metadata_enq_cmd(struct metadata_wc *wc, struct metadata_cmd *cmd) *t = *cmd; t->prev = t->next = NULL; - uv_mutex_lock(&wc->cmd_mutex); - DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wc->cmd_queue.cmd_base, t, prev, next); - uv_mutex_unlock(&wc->cmd_mutex); + spinlock_lock(&wc->cmd_queue_lock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wc->cmd_base, t, prev, next); + spinlock_unlock(&wc->cmd_queue_lock); wakeup_event_loop: (void) uv_async_send(&wc->async); @@ -1102,10 +1119,10 @@ static struct metadata_cmd metadata_deq_cmd(struct metadata_wc *wc) { struct metadata_cmd ret; - uv_mutex_lock(&wc->cmd_mutex); - if(wc->cmd_queue.cmd_base) { - struct metadata_cmd *t = wc->cmd_queue.cmd_base; - DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wc->cmd_queue.cmd_base, t, prev, next); + spinlock_lock(&wc->cmd_queue_lock); + if(wc->cmd_base) { + struct metadata_cmd *t = wc->cmd_base; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(wc->cmd_base, t, prev, next); ret = *t; freez(t); } @@ -1113,7 +1130,7 @@ static struct metadata_cmd metadata_deq_cmd(struct metadata_wc *wc) ret.opcode = METADATA_DATABASE_NOOP; ret.completion = NULL; } - uv_mutex_unlock(&wc->cmd_mutex); + spinlock_unlock(&wc->cmd_queue_lock); return ret; } @@ -1136,9 +1153,7 @@ static void timer_cb(uv_timer_t* handle) struct metadata_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - time_t now = now_realtime_sec(); - - if (wc->metadata_check_after && wc->metadata_check_after < now) { + if (wc->metadata_check_after < now_realtime_sec()) { cmd.opcode = METADATA_SCAN_HOSTS; metadata_enq_cmd(wc, &cmd); } @@ -1158,10 +1173,10 @@ void vacuum_database(sqlite3 *database, const char *db_alias, int threshold, int if (free_pages > (total_pages * threshold / 100)) { int do_free_pages = (int) (free_pages * vacuum_pc / 100); - netdata_log_info("%s: Freeing %d database pages", db_alias, do_free_pages); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "%s: Freeing %d database pages", db_alias, do_free_pages); char sql[128]; - snprintfz(sql, 127, "PRAGMA incremental_vacuum(%d)", do_free_pages); + snprintfz(sql, sizeof(sql) - 1, "PRAGMA incremental_vacuum(%d)", do_free_pages); (void) db_execute(database, sql); } } @@ -1184,16 +1199,10 @@ void run_metadata_cleanup(struct metadata_wc *wc) (void) sqlite3_wal_checkpoint(db_meta, NULL); } -struct ml_model_payload { - uv_work_t request; - struct metadata_wc *wc; - Pvoid_t JudyL; - size_t count; -}; - struct scan_metadata_payload { uv_work_t request; struct metadata_wc *wc; + void *data; BUFFER *work_buffer; uint32_t max_count; }; @@ -1271,7 +1280,7 @@ static void start_all_host_load_context(uv_work_t *req __maybe_unused) register_libuv_worker_jobs(); struct scan_metadata_payload *data = req->data; - UNUSED(data); + struct metadata_wc *wc = data->wc; worker_is_busy(UV_EVENT_HOST_CONTEXT_LOAD); usec_t started_ut = now_monotonic_usec(); (void)started_ut; @@ -1279,6 +1288,9 @@ static void start_all_host_load_context(uv_work_t *req __maybe_unused) RRDHOST *host; size_t max_threads = MIN(get_netdata_cpus() / 2, 6); + if (max_threads < 1) + max_threads = 1; + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Using %zu threads for context loading", max_threads); struct host_context_load_thread *hclt = callocz(max_threads, sizeof(*hclt)); size_t thread_index; @@ -1290,25 +1302,28 @@ static void start_all_host_load_context(uv_work_t *req __maybe_unused) rrdhost_flag_set(host, RRDHOST_FLAG_CONTEXT_LOAD_IN_PROGRESS); internal_error(true, "METADATA: 'host:%s' loading context", rrdhost_hostname(host)); - cleanup_finished_threads(hclt, max_threads, false); - bool found_slot = find_available_thread_slot(hclt, max_threads, &thread_index); + bool found_slot = false; + do { + if (metadata_flag_check(wc, METADATA_FLAG_SHUTDOWN)) + break; - if (unlikely(!found_slot)) { - struct host_context_load_thread hclt_sync = {.host = host}; - restore_host_context(&hclt_sync); - } - else { - __atomic_store_n(&hclt[thread_index].busy, true, __ATOMIC_RELAXED); - hclt[thread_index].host = host; - assert(0 == uv_thread_create(&hclt[thread_index].thread, restore_host_context, &hclt[thread_index])); - } + cleanup_finished_threads(hclt, max_threads, false); + found_slot = find_available_thread_slot(hclt, max_threads, &thread_index); + } while (!found_slot); + + if (metadata_flag_check(wc, METADATA_FLAG_SHUTDOWN)) + break; + + __atomic_store_n(&hclt[thread_index].busy, true, __ATOMIC_RELAXED); + hclt[thread_index].host = host; + fatal_assert(0 == uv_thread_create(&hclt[thread_index].thread, restore_host_context, &hclt[thread_index])); } dfe_done(host); cleanup_finished_threads(hclt, max_threads, true); freez(hclt); usec_t ended_ut = now_monotonic_usec(); (void)ended_ut; - internal_error(true, "METADATA: 'host:ALL' contexts loaded in %0.2f ms", (double)(ended_ut - started_ut) / USEC_PER_MS); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: host contexts loaded in %0.2f ms", (double)(ended_ut - started_ut) / USEC_PER_MS); worker_is_idle(); } @@ -1335,6 +1350,10 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, bool use_trans bool more_to_do = false; uint32_t scan_count = 1; + sqlite3_stmt *ml_load_stmt = NULL; + + bool load_ml_models = max_count; + if (use_transaction) (void)db_execute(db_meta, "BEGIN TRANSACTION"); @@ -1379,6 +1398,14 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, bool use_trans rrdhost_hostname(host), rrdset_name(st), rrddim_name(rd)); } + + if(rrddim_flag_check(rd, RRDDIM_FLAG_ML_MODEL_LOAD)) { + rrddim_flag_clear(rd, RRDDIM_FLAG_ML_MODEL_LOAD); + if (likely(load_ml_models)) + (void) ml_dimension_load_models(rd, &ml_load_stmt); + } + + worker_is_idle(); } rrddim_foreach_done(rd); } @@ -1387,6 +1414,11 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, bool use_trans if (use_transaction) (void)db_execute(db_meta, "COMMIT TRANSACTION"); + if (ml_load_stmt) { + sqlite3_finalize(ml_load_stmt); + ml_load_stmt = NULL; + } + return more_to_do; } @@ -1411,6 +1443,11 @@ static void store_host_and_system_info(RRDHOST *host, size_t *query_counter) } } +struct host_chart_label_cleanup { + Pvoid_t JudyL; + Word_t count; +}; + // Worker thread to scan hosts for pending metadata to store static void start_metadata_hosts(uv_work_t *req __maybe_unused) { @@ -1427,11 +1464,33 @@ static void start_metadata_hosts(uv_work_t *req __maybe_unused) internal_error(true, "METADATA: checking all hosts..."); usec_t started_ut = now_monotonic_usec(); (void)started_ut; + struct host_chart_label_cleanup *cl_cleanup_data = data->data; + + if (cl_cleanup_data) { + Word_t Index = 0; + bool first = true; + Pvoid_t *PValue; + while ((PValue = JudyLFirstThenNext(cl_cleanup_data->JudyL, &Index, &first))) { + char *machine_guid = *PValue; + + host = rrdhost_find_by_guid(machine_guid); + if (likely(!host)) { + uuid_t host_uuid; + if (!uuid_parse(machine_guid, host_uuid)) + delete_host_chart_labels(&host_uuid); + } + + freez(machine_guid); + } + JudyLFreeArray(&cl_cleanup_data->JudyL, PJE0); + freez(cl_cleanup_data); + } + bool run_again = false; worker_is_busy(UV_EVENT_METADATA_STORE); if (!data->max_count) - transaction_started = !db_execute(db_meta, "BEGIN TRANSACTION;"); + transaction_started = !db_execute(db_meta, "BEGIN TRANSACTION"); dfe_start_reentrant(rrdhost_root_index, host) { if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED) || !rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_UPDATE)) @@ -1501,7 +1560,7 @@ static void start_metadata_hosts(uv_work_t *req __maybe_unused) dfe_done(host); if (!data->max_count && transaction_started) - transaction_started = db_execute(db_meta, "COMMIT TRANSACTION;"); + transaction_started = db_execute(db_meta, "COMMIT TRANSACTION"); usec_t all_ended_ut = now_monotonic_usec(); (void)all_ended_ut; internal_error(true, "METADATA: checking all hosts completed in %0.2f ms", @@ -1516,42 +1575,6 @@ static void start_metadata_hosts(uv_work_t *req __maybe_unused) worker_is_idle(); } -// Callback after scan of hosts is done -static void after_start_ml_model_load(uv_work_t *req, int status __maybe_unused) -{ - struct ml_model_payload *ml_data = req->data; - struct metadata_wc *wc = ml_data->wc; - metadata_flag_clear(wc, METADATA_FLAG_ML_LOADING); - JudyLFreeArray(&ml_data->JudyL, PJE0); - freez(ml_data); -} - -static void start_ml_model_load(uv_work_t *req __maybe_unused) -{ - register_libuv_worker_jobs(); - - struct ml_model_payload *ml_data = req->data; - - worker_is_busy(UV_EVENT_METADATA_ML_LOAD); - - Pvoid_t *PValue; - Word_t Index = 0; - bool first = true; - RRDDIM *rd; - RRDDIM_ACQUIRED *rda; - internal_error(true, "Batch ML load loader, %zu items", ml_data->count); - while((PValue = JudyLFirstThenNext(ml_data->JudyL, &Index, &first))) { - UNUSED(PValue); - rda = (RRDDIM_ACQUIRED *) Index; - rd = rrddim_acquired_to_rrddim(rda); - ml_dimension_load_models(rd); - rrddim_acquired_release(rda); - } - worker_is_idle(); -} - - - static void metadata_event_loop(void *arg) { worker_register("METASYNC"); @@ -1561,7 +1584,6 @@ static void metadata_event_loop(void *arg) worker_register_job_name(METADATA_STORE_CLAIM_ID, "add claim id"); worker_register_job_name(METADATA_ADD_HOST_INFO, "add host info"); worker_register_job_name(METADATA_MAINTENANCE, "maintenance"); - worker_register_job_name(METADATA_ML_LOAD_MODELS, "ml load models"); int ret; uv_loop_t *loop; @@ -1593,7 +1615,7 @@ static void metadata_event_loop(void *arg) wc->timer_req.data = wc; fatal_assert(0 == uv_timer_start(&wc->timer_req, timer_cb, TIMER_INITIAL_PERIOD_MS, TIMER_REPEAT_PERIOD_MS)); - netdata_log_info("Starting metadata sync thread with %d entries command queue", METADATA_CMD_Q_MAX_SIZE); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Starting metadata sync thread"); struct metadata_cmd cmd; memset(&cmd, 0, sizeof(cmd)); @@ -1602,11 +1624,11 @@ static void metadata_event_loop(void *arg) wc->metadata_check_after = now_realtime_sec() + METADATA_HOST_CHECK_FIRST_CHECK; int shutdown = 0; - completion_mark_complete(&wc->init_complete); + completion_mark_complete(&wc->start_stop_complete); BUFFER *work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); struct scan_metadata_payload *data; + struct host_chart_label_cleanup *cl_cleanup_data = NULL; - struct ml_model_payload *ml_data = NULL; while (shutdown == 0 || (wc->flags & METADATA_FLAG_PROCESSING)) { uuid_t *uuid; RRDHOST *host = NULL; @@ -1633,43 +1655,10 @@ static void metadata_event_loop(void *arg) if (likely(opcode != METADATA_DATABASE_NOOP)) worker_is_busy(opcode); - // Have pending ML models to load? - if (opcode != METADATA_ML_LOAD_MODELS && ml_data && ml_data->count) { - static usec_t ml_submit_last = 0; - usec_t now = now_monotonic_usec(); - if (!ml_submit_last) - ml_submit_last = now; - - if (!metadata_flag_check(wc, METADATA_FLAG_ML_LOADING) && (now - ml_submit_last > 150 * USEC_PER_MS)) { - metadata_flag_set(wc, METADATA_FLAG_ML_LOADING); - if (unlikely(uv_queue_work(loop, &ml_data->request, start_ml_model_load, after_start_ml_model_load))) - metadata_flag_clear(wc, METADATA_FLAG_ML_LOADING); - else { - ml_submit_last = now; - ml_data = NULL; - } - } - } - switch (opcode) { case METADATA_DATABASE_NOOP: case METADATA_DATABASE_TIMER: break; - - case METADATA_ML_LOAD_MODELS: { - RRDDIM *rd = (RRDDIM *) cmd.param[0]; - RRDDIM_ACQUIRED *rda = rrddim_find_and_acquire(rd->rrdset, rrddim_id(rd)); - if (likely(rda)) { - if (!ml_data) { - ml_data = callocz(1,sizeof(*ml_data)); - ml_data->request.data = ml_data; - ml_data->wc = wc; - } - JudyLIns(&ml_data->JudyL, (Word_t)rda, PJE0); - ml_data->count++; - } - break; - } case METADATA_DEL_DIMENSION: uuid = (uuid_t *) cmd.param[0]; if (likely(dimension_can_be_deleted(uuid, NULL, false))) @@ -1695,7 +1684,9 @@ static void metadata_event_loop(void *arg) data = mallocz(sizeof(*data)); data->request.data = data; data->wc = wc; + data->data = cl_cleanup_data; data->work_buffer = work_buffer; + cl_cleanup_data = NULL; if (unlikely(cmd.completion)) { data->max_count = 0; // 0 will process all pending updates @@ -1711,6 +1702,7 @@ static void metadata_event_loop(void *arg) after_metadata_hosts))) { // Failed to launch worker -- let the event loop handle completion cmd.completion = wc->scan_complete; + cl_cleanup_data = data->data; freez(data); metadata_flag_clear(wc, METADATA_FLAG_PROCESSING); } @@ -1728,6 +1720,15 @@ static void metadata_event_loop(void *arg) freez(data); } break; + case METADATA_DELETE_HOST_CHART_LABELS:; + if (!cl_cleanup_data) + cl_cleanup_data = callocz(1,sizeof(*cl_cleanup_data)); + + Pvoid_t *PValue = JudyLIns(&cl_cleanup_data->JudyL, (Word_t) ++cl_cleanup_data->count, PJE0); + if (PValue) + *PValue = (void *) cmd.param[0]; + + break; case METADATA_UNITTEST:; struct thread_unittest *tu = (struct thread_unittest *) cmd.param[0]; sleep_usec(1000); // processing takes 1ms @@ -1755,10 +1756,12 @@ static void metadata_event_loop(void *arg) freez(loop); worker_unregister(); - netdata_log_info("METADATA: Shutting down event loop"); - completion_mark_complete(&wc->init_complete); - completion_destroy(wc->scan_complete); - freez(wc->scan_complete); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Shutting down metadata thread"); + completion_mark_complete(&wc->start_stop_complete); + if (wc->scan_complete) { + completion_destroy(wc->scan_complete); + freez(wc->scan_complete); + } metadata_free_cmd_queue(wc); return; @@ -1771,23 +1774,21 @@ error_after_loop_init: worker_unregister(); } -struct metadata_wc metasync_worker = {.loop = NULL}; - void metadata_sync_shutdown(void) { - completion_init(&metasync_worker.init_complete); + completion_init(&metasync_worker.start_stop_complete); struct metadata_cmd cmd; memset(&cmd, 0, sizeof(cmd)); - netdata_log_info("METADATA: Sending a shutdown command"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Sending a shutdown command"); cmd.opcode = METADATA_SYNC_SHUTDOWN; metadata_enq_cmd(&metasync_worker, &cmd); /* wait for metadata thread to shut down */ - netdata_log_info("METADATA: Waiting for shutdown ACK"); - completion_wait_for(&metasync_worker.init_complete); - completion_destroy(&metasync_worker.init_complete); - netdata_log_info("METADATA: Shutdown complete"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Waiting for shutdown ACK"); + completion_wait_for(&metasync_worker.start_stop_complete); + completion_destroy(&metasync_worker.start_stop_complete); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Shutdown complete"); } void metadata_sync_shutdown_prepare(void) @@ -1804,20 +1805,20 @@ void metadata_sync_shutdown_prepare(void) completion_init(compl); __atomic_store_n(&wc->scan_complete, compl, __ATOMIC_RELAXED); - netdata_log_info("METADATA: Sending a scan host command"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Sending a scan host command"); uint32_t max_wait_iterations = 2000; while (unlikely(metadata_flag_check(&metasync_worker, METADATA_FLAG_PROCESSING)) && max_wait_iterations--) { if (max_wait_iterations == 1999) - netdata_log_info("METADATA: Current worker is running; waiting to finish"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Current worker is running; waiting to finish"); sleep_usec(1000); } cmd.opcode = METADATA_SCAN_HOSTS; metadata_enq_cmd(&metasync_worker, &cmd); - netdata_log_info("METADATA: Waiting for host scan completion"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Waiting for host scan completion"); completion_wait_for(wc->scan_complete); - netdata_log_info("METADATA: Host scan complete; can continue with shutdown"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "METADATA: Host scan complete; can continue with shutdown"); } // ------------------------------------------------------------- @@ -1828,15 +1829,14 @@ void metadata_sync_init(void) struct metadata_wc *wc = &metasync_worker; memset(wc, 0, sizeof(*wc)); - metadata_init_cmd_queue(wc); - completion_init(&wc->init_complete); + completion_init(&wc->start_stop_complete); fatal_assert(0 == uv_thread_create(&(wc->thread), metadata_event_loop, wc)); - completion_wait_for(&wc->init_complete); - completion_destroy(&wc->init_complete); + completion_wait_for(&wc->start_stop_complete); + completion_destroy(&wc->start_stop_complete); - netdata_log_info("SQLite metadata sync initialization complete"); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "SQLite metadata sync initialization complete"); } @@ -1887,9 +1887,7 @@ void metaqueue_host_update_info(RRDHOST *host) void metaqueue_ml_load_models(RRDDIM *rd) { - if (unlikely(!metasync_worker.loop)) - return; - queue_metadata_cmd(METADATA_ML_LOAD_MODELS, rd, NULL); + rrddim_flag_set(rd, RRDDIM_FLAG_ML_MODEL_LOAD); } void metadata_queue_load_host_context(RRDHOST *host) @@ -1897,8 +1895,22 @@ void metadata_queue_load_host_context(RRDHOST *host) if (unlikely(!metasync_worker.loop)) return; queue_metadata_cmd(METADATA_LOAD_HOST_CONTEXT, host, NULL); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Queued command to load host contexts"); } +void metadata_delete_host_chart_labels(char *machine_guid) +{ + if (unlikely(!metasync_worker.loop)) { + freez(machine_guid); + return; + } + + // Node machine guid is already strdup-ed + queue_metadata_cmd(METADATA_DELETE_HOST_CHART_LABELS, machine_guid, NULL); + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Queued command delete chart labels for host %s", machine_guid); +} + + // // unitests // @@ -1946,7 +1958,7 @@ static void *metadata_unittest_threads(void) tu.join = 0; for (int i = 0; i < threads_to_create; i++) { char buf[100 + 1]; - snprintf(buf, 100, "META[%d]", i); + snprintf(buf, sizeof(buf) - 1, "META[%d]", i); netdata_thread_create( &threads[i], buf, diff --git a/database/sqlite/sqlite_metadata.h b/database/sqlite/sqlite_metadata.h index f75a9ab00..6860cfedf 100644 --- a/database/sqlite/sqlite_metadata.h +++ b/database/sqlite/sqlite_metadata.h @@ -17,6 +17,7 @@ void metaqueue_host_update_info(RRDHOST *host); void metaqueue_ml_load_models(RRDDIM *rd); void migrate_localhost(uuid_t *host_uuid); void metadata_queue_load_host_context(RRDHOST *host); +void metadata_delete_host_chart_labels(char *machine_guid); void vacuum_database(sqlite3 *database, const char *db_alias, int threshold, int vacuum_pc); // UNIT TEST |