diff options
Diffstat (limited to 'src/database/contexts/worker.c')
-rw-r--r-- | src/database/contexts/worker.c | 1153 |
1 files changed, 1153 insertions, 0 deletions
diff --git a/src/database/contexts/worker.c b/src/database/contexts/worker.c new file mode 100644 index 000000000..2aae8363d --- /dev/null +++ b/src/database/contexts/worker.c @@ -0,0 +1,1153 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "internal.h" + +static uint64_t rrdcontext_get_next_version(RRDCONTEXT *rc); + +static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused); + +static void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc); + +static void rrdcontext_dequeue_from_post_processing(RRDCONTEXT *rc); +static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAGS reason, bool worker_jobs); + +static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs); +static void rrdcontext_garbage_collect_for_all_hosts(void); + +extern usec_t rrdcontext_next_db_rotation_ut; + +// ---------------------------------------------------------------------------- +// load from SQL + +static void rrdinstance_load_clabel(SQL_CLABEL_DATA *sld, void *data) { + RRDINSTANCE *ri = data; + rrdlabels_add(ri->rrdlabels, sld->label_key, sld->label_value, sld->label_source); +} + +static void rrdinstance_load_dimension(SQL_DIMENSION_DATA *sd, void *data) { + RRDINSTANCE *ri = data; + + RRDMETRIC trm = { + .id = string_strdupz(sd->id), + .name = string_strdupz(sd->name), + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomic + }; + if(sd->hidden) trm.flags |= RRD_FLAG_HIDDEN; + + uuid_copy(trm.uuid, sd->dim_id); + + dictionary_set(ri->rrdmetrics, string2str(trm.id), &trm, sizeof(trm)); +} + +static void rrdinstance_load_chart_callback(SQL_CHART_DATA *sc, void *data) { + RRDHOST *host = data; + + RRDCONTEXT tc = { + .id = string_strdupz(sc->context), + .title = string_strdupz(sc->title), + .units = string_strdupz(sc->units), + .family = string_strdupz(sc->family), + .priority = sc->priority, + .chart_type = sc->chart_type, + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics + .rrdhost = host, + }; + + RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_set_and_acquire_item(host->rrdctx.contexts, string2str(tc.id), &tc, sizeof(tc)); + RRDCONTEXT *rc = rrdcontext_acquired_value(rca); + + RRDINSTANCE tri = { + .id = string_strdupz(sc->id), + .name = string_strdupz(sc->name), + .title = string_strdupz(sc->title), + .units = string_strdupz(sc->units), + .family = string_strdupz(sc->family), + .chart_type = sc->chart_type, + .priority = sc->priority, + .update_every_s = sc->update_every, + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics + }; + uuid_copy(tri.uuid, sc->chart_id); + + RRDINSTANCE_ACQUIRED *ria = (RRDINSTANCE_ACQUIRED *)dictionary_set_and_acquire_item(rc->rrdinstances, sc->id, &tri, sizeof(tri)); + RRDINSTANCE *ri = rrdinstance_acquired_value(ria); + + ctx_get_dimension_list(&ri->uuid, rrdinstance_load_dimension, ri); + ctx_get_label_list(&ri->uuid, rrdinstance_load_clabel, ri); + rrdinstance_trigger_updates(ri, __FUNCTION__ ); + rrdinstance_release(ria); + rrdcontext_release(rca); +} + +static void rrdcontext_load_context_callback(VERSIONED_CONTEXT_DATA *ctx_data, void *data) { + RRDHOST *host = data; + (void)host; + + RRDCONTEXT trc = { + .id = string_strdupz(ctx_data->id), + .flags = RRD_FLAG_ARCHIVED | RRD_FLAG_UPDATE_REASON_LOAD_SQL, // no need for atomics + + // no need to set more data here + // we only need the hub data + + .hub = *ctx_data, + }; + dictionary_set(host->rrdctx.contexts, string2str(trc.id), &trc, sizeof(trc)); +} + +void rrdhost_load_rrdcontext_data(RRDHOST *host) { + if(host->rrdctx.contexts) return; + + rrdhost_create_rrdcontexts(host); + ctx_get_context_list(&host->host_uuid, rrdcontext_load_context_callback, host); + ctx_get_chart_list(&host->host_uuid, rrdinstance_load_chart_callback, host); + + RRDCONTEXT *rc; + dfe_start_read(host->rrdctx.contexts, rc) { + rrdcontext_trigger_updates(rc, __FUNCTION__ ); + } + dfe_done(rc); + + rrdcontext_garbage_collect_single_host(host, false); +} + +// ---------------------------------------------------------------------------- +// version hash calculation + +uint64_t rrdcontext_version_hash_with_callback( + RRDHOST *host, + void (*callback)(RRDCONTEXT *, bool, void *), + bool snapshot, + void *bundle) { + + if(unlikely(!host || !host->rrdctx.contexts)) return 0; + + RRDCONTEXT *rc; + uint64_t hash = 0; + + // loop through all contexts of the host + dfe_start_read(host->rrdctx.contexts, rc) { + + rrdcontext_lock(rc); + + if(unlikely(rrd_flag_check(rc, RRD_FLAG_HIDDEN))) { + rrdcontext_unlock(rc); + continue; + } + + if(unlikely(callback)) + callback(rc, snapshot, bundle); + + // skip any deleted contexts + if(unlikely(rrd_flag_is_deleted(rc))) { + rrdcontext_unlock(rc); + continue; + } + + // we use rc->hub.* which has the latest + // metadata we have sent to the hub + + // if a context is currently queued, rc->hub.* does NOT + // reflect the queued changes. rc->hub.* is updated with + // their metadata, after messages are dispatched to hub. + + // when the context is being collected, + // rc->hub.last_time_t is already zero + + hash += rc->hub.version + rc->hub.last_time_s - rc->hub.first_time_s; + + rrdcontext_unlock(rc); + + } + dfe_done(rc); + + return hash; +} + +// ---------------------------------------------------------------------------- +// retention recalculation + +static void rrdhost_update_cached_retention(RRDHOST *host, time_t first_time_s, time_t last_time_s, bool global) { + if(unlikely(!host)) + return; + + spinlock_lock(&host->retention.spinlock); + + if(global) { + host->retention.first_time_s = first_time_s; + host->retention.last_time_s = last_time_s; + } + else { + if(!host->retention.first_time_s || first_time_s < host->retention.first_time_s) + host->retention.first_time_s = first_time_s; + + if(!host->retention.last_time_s || last_time_s > host->retention.last_time_s) + host->retention.last_time_s = last_time_s; + } + + spinlock_unlock(&host->retention.spinlock); +} + +void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs) { + rrdcontext_post_process_updates(rc, true, reason, worker_jobs); +} + +void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs) { + if(unlikely(!host || !host->rrdctx.contexts)) return; + + time_t first_time_s = 0; + time_t last_time_s = 0; + + RRDCONTEXT *rc; + dfe_start_read(host->rrdctx.contexts, rc) { + rrdcontext_recalculate_context_retention(rc, reason, worker_jobs); + + if(!first_time_s || rc->first_time_s < first_time_s) + first_time_s = rc->first_time_s; + + if(!last_time_s || rc->last_time_s > last_time_s) + last_time_s = rc->last_time_s; + } + dfe_done(rc); + + rrdhost_update_cached_retention(host, first_time_s, last_time_s, true); +} + +static void rrdcontext_recalculate_retention_all_hosts(void) { + rrdcontext_next_db_rotation_ut = 0; + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + worker_is_busy(WORKER_JOB_RETENTION); + rrdcontext_recalculate_host_retention(host, RRD_FLAG_UPDATE_REASON_DB_ROTATION, true); + } + dfe_done(host); +} + +// ---------------------------------------------------------------------------- +// garbage collector + +bool rrdmetric_update_retention(RRDMETRIC *rm) { + time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; + + if(rm->rrddim) { + min_first_time_t = rrddim_first_entry_s(rm->rrddim); + max_last_time_t = rrddim_last_entry_s(rm->rrddim); + } + else { + RRDHOST *rrdhost = rm->ri->rc->rrdhost; + for (size_t tier = 0; tier < storage_tiers; tier++) { + STORAGE_ENGINE *eng = rrdhost->db[tier].eng; + + time_t first_time_t, last_time_t; + if (eng->api.metric_retention_by_uuid(rrdhost->db[tier].si, &rm->uuid, &first_time_t, &last_time_t)) { + if (first_time_t < min_first_time_t) + min_first_time_t = first_time_t; + + if (last_time_t > max_last_time_t) + max_last_time_t = last_time_t; + } + } + } + + if((min_first_time_t == LONG_MAX || min_first_time_t == 0) && max_last_time_t == 0) + return false; + + if(min_first_time_t == LONG_MAX) + min_first_time_t = 0; + + if(min_first_time_t > max_last_time_t) { + internal_error(true, "RRDMETRIC: retention of '%s' is flipped, first_time_t = %ld, last_time_t = %ld", string2str(rm->id), min_first_time_t, max_last_time_t); + time_t tmp = min_first_time_t; + min_first_time_t = max_last_time_t; + max_last_time_t = tmp; + } + + // check if retention changed + + if (min_first_time_t != rm->first_time_s) { + rm->first_time_s = min_first_time_t; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if (max_last_time_t != rm->last_time_s) { + rm->last_time_s = max_last_time_t; + rrd_flag_set_updated(rm, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(unlikely(!rm->first_time_s && !rm->last_time_s)) + rrd_flag_set_deleted(rm, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + + rrd_flag_set(rm, RRD_FLAG_LIVE_RETENTION); + + return true; +} + +static inline bool rrdmetric_should_be_deleted(RRDMETRIC *rm) { + if(likely(!rrd_flag_check(rm, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) + return false; + + if(likely(rrd_flag_check(rm, RRD_FLAGS_PREVENTING_DELETIONS))) + return false; + + if(likely(rm->rrddim)) + return false; + + rrdmetric_update_retention(rm); + if(rm->first_time_s || rm->last_time_s) + return false; + + return true; +} + +static inline bool rrdinstance_should_be_deleted(RRDINSTANCE *ri) { + if(likely(!rrd_flag_check(ri, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) + return false; + + if(likely(rrd_flag_check(ri, RRD_FLAGS_PREVENTING_DELETIONS))) + return false; + + if(likely(ri->rrdset)) + return false; + + if(unlikely(dictionary_referenced_items(ri->rrdmetrics) != 0)) + return false; + + if(unlikely(dictionary_entries(ri->rrdmetrics) != 0)) + return false; + + if(ri->first_time_s || ri->last_time_s) + return false; + + return true; +} + +static inline bool rrdcontext_should_be_deleted(RRDCONTEXT *rc) { + if(likely(!rrd_flag_check(rc, RRD_FLAGS_REQUIRED_FOR_DELETIONS))) + return false; + + if(likely(rrd_flag_check(rc, RRD_FLAGS_PREVENTING_DELETIONS))) + return false; + + if(unlikely(dictionary_referenced_items(rc->rrdinstances) != 0)) + return false; + + if(unlikely(dictionary_entries(rc->rrdinstances) != 0)) + return false; + + if(unlikely(rc->first_time_s || rc->last_time_s)) + return false; + + return true; +} + +void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc) { + // we need to refresh the string pointers in rc->hub + // in case the context changed values + rc->hub.id = string2str(rc->id); + rc->hub.title = string2str(rc->title); + rc->hub.units = string2str(rc->units); + rc->hub.family = string2str(rc->family); + + // delete it from SQL + if(ctx_delete_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) + netdata_log_error("RRDCONTEXT: failed to delete context '%s' version %"PRIu64" from SQL.", + rc->hub.id, rc->hub.version); +} + +static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs) { + + internal_error(true, "RRDCONTEXT: garbage collecting context structures of host '%s'", rrdhost_hostname(host)); + + RRDCONTEXT *rc; + dfe_start_reentrant(host->rrdctx.contexts, rc) { + if(unlikely(worker_jobs && !service_running(SERVICE_CONTEXT))) break; + + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP); + + rrdcontext_lock(rc); + + RRDINSTANCE *ri; + dfe_start_reentrant(rc->rrdinstances, ri) { + if(unlikely(worker_jobs && !service_running(SERVICE_CONTEXT))) break; + + RRDMETRIC *rm; + dfe_start_write(ri->rrdmetrics, rm) { + if(rrdmetric_should_be_deleted(rm)) { + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + if(!dictionary_del(ri->rrdmetrics, string2str(rm->id))) + netdata_log_error("RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(rm->id), + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + else + internal_error( + true, + "RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', deleted from rrdmetrics dictionary.", + string2str(rm->id), + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + } + } + dfe_done(rm); + + if(rrdinstance_should_be_deleted(ri)) { + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + if(!dictionary_del(rc->rrdinstances, string2str(ri->id))) + netdata_log_error("RRDCONTEXT: instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + else + internal_error( + true, + "RRDCONTEXT: instance '%s' of context '%s' of host '%s', deleted from rrdmetrics dictionary.", + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); + } + } + dfe_done(ri); + + if(unlikely(rrdcontext_should_be_deleted(rc))) { + if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + rrdcontext_dequeue_from_post_processing(rc); + rrdcontext_delete_from_sql_unsafe(rc); + + if(!dictionary_del(host->rrdctx.contexts, string2str(rc->id))) + netdata_log_error("RRDCONTEXT: context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(rc->id), + rrdhost_hostname(host)); + else + internal_error( + true, + "RRDCONTEXT: context '%s' of host '%s', deleted from rrdmetrics dictionary.", + string2str(rc->id), + rrdhost_hostname(host)); + } + + // the item is referenced in the dictionary + // so, it is still here to unlock, even if we have deleted it + rrdcontext_unlock(rc); + } + dfe_done(rc); +} + +static void rrdcontext_garbage_collect_for_all_hosts(void) { + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + rrdcontext_garbage_collect_single_host(host, true); + } + dfe_done(host); +} + +// ---------------------------------------------------------------------------- +// post processing + +static void rrdmetric_process_updates(RRDMETRIC *rm, bool force, RRD_FLAGS reason, bool worker_jobs) { + if(reason != RRD_FLAG_NONE) + rrd_flag_set_updated(rm, reason); + + if(!force && !rrd_flag_is_updated(rm) && rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION) && !rrd_flag_check(rm, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + return; + + if(worker_jobs) + worker_is_busy(WORKER_JOB_PP_METRIC); + + if(reason & RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD) { + rrd_flag_set_archived(rm); + rrd_flag_set(rm, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD); + } + if(rrd_flag_is_deleted(rm) && (reason & RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + rrd_flag_set_archived(rm); + + rrdmetric_update_retention(rm); + + rrd_flag_unset_updated(rm); +} + +static void rrdinstance_post_process_updates(RRDINSTANCE *ri, bool force, RRD_FLAGS reason, bool worker_jobs) { + if(reason != RRD_FLAG_NONE) + rrd_flag_set_updated(ri, reason); + + if(!force && !rrd_flag_is_updated(ri) && rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION)) + return; + + if(worker_jobs) + worker_is_busy(WORKER_JOB_PP_INSTANCE); + + time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; + size_t metrics_active = 0, metrics_deleted = 0; + bool live_retention = true, currently_collected = false; + if(dictionary_entries(ri->rrdmetrics) > 0) { + RRDMETRIC *rm; + dfe_start_read((DICTIONARY *)ri->rrdmetrics, rm) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + RRD_FLAGS reason_to_pass = reason; + if(rrd_flag_check(ri, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + reason_to_pass |= RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION; + + rrdmetric_process_updates(rm, force, reason_to_pass, worker_jobs); + + if(unlikely(!rrd_flag_check(rm, RRD_FLAG_LIVE_RETENTION))) + live_retention = false; + + if (unlikely((rrdmetric_should_be_deleted(rm)))) { + metrics_deleted++; + continue; + } + + if(!currently_collected && rrd_flag_check(rm, RRD_FLAG_COLLECTED) && rm->first_time_s) + currently_collected = true; + + metrics_active++; + + if (rm->first_time_s && rm->first_time_s < min_first_time_t) + min_first_time_t = rm->first_time_s; + + if (rm->last_time_s && rm->last_time_s > max_last_time_t) + max_last_time_t = rm->last_time_s; + } + dfe_done(rm); + } + + if(unlikely(live_retention && !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) + rrd_flag_set(ri, RRD_FLAG_LIVE_RETENTION); + else if(unlikely(!live_retention && rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) + rrd_flag_clear(ri, RRD_FLAG_LIVE_RETENTION); + + if(unlikely(!metrics_active)) { + // no metrics available + + if(ri->first_time_s) { + ri->first_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(ri->last_time_s) { + ri->last_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set_deleted(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + // we have active metrics... + + if (unlikely(min_first_time_t == LONG_MAX)) + min_first_time_t = 0; + + if (unlikely(min_first_time_t == 0 || max_last_time_t == 0)) { + if(ri->first_time_s) { + ri->first_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(ri->last_time_s) { + ri->last_time_s = 0; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(likely(live_retention)) + rrd_flag_set_deleted(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + rrd_flag_clear(ri, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + + if (unlikely(ri->first_time_s != min_first_time_t)) { + ri->first_time_s = min_first_time_t; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if (unlikely(ri->last_time_s != max_last_time_t)) { + ri->last_time_s = max_last_time_t; + rrd_flag_set_updated(ri, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(likely(currently_collected)) + rrd_flag_set_collected(ri); + else + rrd_flag_set_archived(ri); + } + } + + rrd_flag_unset_updated(ri); +} + +static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAGS reason, bool worker_jobs) { + if(reason != RRD_FLAG_NONE) + rrd_flag_set_updated(rc, reason); + + if(worker_jobs) + worker_is_busy(WORKER_JOB_PP_CONTEXT); + + size_t min_priority_collected = LONG_MAX; + size_t min_priority_not_collected = LONG_MAX; + size_t min_priority = LONG_MAX; + time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; + size_t instances_active = 0, instances_deleted = 0, metrics = 0; + bool live_retention = true, currently_collected = false, hidden = true; + if(dictionary_entries(rc->rrdinstances) > 0) { + RRDINSTANCE *ri; + dfe_start_reentrant(rc->rrdinstances, ri) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + RRD_FLAGS reason_to_pass = reason; + if(rrd_flag_check(rc, RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION)) + reason_to_pass |= RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION; + + rrdinstance_post_process_updates(ri, force, reason_to_pass, worker_jobs); + + if(unlikely(hidden && !rrd_flag_check(ri, RRD_FLAG_HIDDEN))) + hidden = false; + + if(unlikely(live_retention && !rrd_flag_check(ri, RRD_FLAG_LIVE_RETENTION))) + live_retention = false; + + if (unlikely(rrdinstance_should_be_deleted(ri))) { + instances_deleted++; + continue; + } + + bool ri_collected = rrd_flag_is_collected(ri); + + if(ri_collected && !rrd_flag_check(ri, RRD_FLAG_MERGED_COLLECTED_RI_TO_RC)) { + rrdcontext_update_from_collected_rrdinstance(ri); + rrd_flag_set(ri, RRD_FLAG_MERGED_COLLECTED_RI_TO_RC); + } + + if(unlikely(!currently_collected && rrd_flag_is_collected(ri) && ri->first_time_s)) + currently_collected = true; + + internal_error(rc->units != ri->units, + "RRDCONTEXT: '%s' rrdinstance '%s' has different units, context '%s', instance '%s'", + string2str(rc->id), string2str(ri->id), + string2str(rc->units), string2str(ri->units)); + + instances_active++; + metrics += dictionary_entries(ri->rrdmetrics); + + if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY) { + if(rrd_flag_check(ri, RRD_FLAG_COLLECTED)) { + if(ri->priority < min_priority_collected) + min_priority_collected = ri->priority; + } + else { + if(ri->priority < min_priority_not_collected) + min_priority_not_collected = ri->priority; + } + } + + if (ri->first_time_s && ri->first_time_s < min_first_time_t) + min_first_time_t = ri->first_time_s; + + if (ri->last_time_s && ri->last_time_s > max_last_time_t) + max_last_time_t = ri->last_time_s; + } + dfe_done(ri); + + rc->stats.metrics = metrics; + + if(min_priority_collected != LONG_MAX) + // use the collected priority + min_priority = min_priority_collected; + else + // use the non-collected priority + min_priority = min_priority_not_collected; + } + + { + bool previous_hidden = rrd_flag_check(rc, RRD_FLAG_HIDDEN); + if (hidden != previous_hidden) { + if (hidden && !rrd_flag_check(rc, RRD_FLAG_HIDDEN)) + rrd_flag_set(rc, RRD_FLAG_HIDDEN); + else if (!hidden && rrd_flag_check(rc, RRD_FLAG_HIDDEN)) + rrd_flag_clear(rc, RRD_FLAG_HIDDEN); + } + + bool previous_live_retention = rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION); + if (live_retention != previous_live_retention) { + if (live_retention && !rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) + rrd_flag_set(rc, RRD_FLAG_LIVE_RETENTION); + else if (!live_retention && rrd_flag_check(rc, RRD_FLAG_LIVE_RETENTION)) + rrd_flag_clear(rc, RRD_FLAG_LIVE_RETENTION); + } + } + + rrdcontext_lock(rc); + rc->pp.executions++; + + if(unlikely(!instances_active)) { + // we had some instances, but they are gone now... + + if(rc->first_time_s) { + rc->first_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(rc->last_time_s) { + rc->last_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set_deleted(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + // we have some active instances... + + if (unlikely(min_first_time_t == LONG_MAX)) + min_first_time_t = 0; + + if (unlikely(min_first_time_t == 0 && max_last_time_t == 0)) { + if(rc->first_time_s) { + rc->first_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if(rc->last_time_s) { + rc->last_time_s = 0; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + rrd_flag_set_deleted(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + } + else { + rrd_flag_clear(rc, RRD_FLAG_UPDATE_REASON_ZERO_RETENTION); + + if (unlikely(rc->first_time_s != min_first_time_t)) { + rc->first_time_s = min_first_time_t; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T); + } + + if (rc->last_time_s != max_last_time_t) { + rc->last_time_s = max_last_time_t; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T); + } + + if(likely(currently_collected)) + rrd_flag_set_collected(rc); + else + rrd_flag_set_archived(rc); + } + + if (min_priority != LONG_MAX && rc->priority != min_priority) { + rc->priority = min_priority; + rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_CHANGED_METADATA); + } + } + + if(unlikely(rrd_flag_is_updated(rc) && rc->rrdhost->rrdctx.hub_queue)) { + if(check_if_cloud_version_changed_unsafe(rc, false)) { + rc->version = rrdcontext_get_next_version(rc); + dictionary_set((DICTIONARY *)rc->rrdhost->rrdctx.hub_queue, + string2str(rc->id), rc, sizeof(*rc)); + } + } + + rrd_flag_unset_updated(rc); + rrdcontext_unlock(rc); +} + +void rrdcontext_queue_for_post_processing(RRDCONTEXT *rc, const char *function __maybe_unused, RRD_FLAGS flags __maybe_unused) { + if(unlikely(!rc->rrdhost->rrdctx.pp_queue)) return; + + if(!rrd_flag_check(rc, RRD_FLAG_QUEUED_FOR_PP)) { + dictionary_set((DICTIONARY *)rc->rrdhost->rrdctx.pp_queue, + string2str(rc->id), + rc, + sizeof(*rc)); + +#if(defined(NETDATA_INTERNAL_CHECKS) && defined(LOG_POST_PROCESSING_QUEUE_INSERTIONS)) + { + BUFFER *wb_flags = buffer_create(1000); + rrd_flags_to_buffer(flags, wb_flags); + + BUFFER *wb_reasons = buffer_create(1000); + rrd_reasons_to_buffer(flags, wb_reasons); + + internal_error(true, "RRDCONTEXT: '%s' update triggered by function %s(), due to flags: %s, reasons: %s", + string2str(rc->id), function, + buffer_tostring(wb_flags), + buffer_tostring(wb_reasons)); + + buffer_free(wb_reasons); + buffer_free(wb_flags); + } +#endif + } +} + +static void rrdcontext_dequeue_from_post_processing(RRDCONTEXT *rc) { + if(unlikely(!rc->rrdhost->rrdctx.pp_queue)) return; + dictionary_del(rc->rrdhost->rrdctx.pp_queue, string2str(rc->id)); +} + +static void rrdcontext_post_process_queued_contexts(RRDHOST *host) { + if(unlikely(!host->rrdctx.pp_queue)) return; + + RRDCONTEXT *rc; + dfe_start_reentrant(host->rrdctx.pp_queue, rc) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + rrdcontext_dequeue_from_post_processing(rc); + rrdcontext_post_process_updates(rc, false, RRD_FLAG_NONE, true); + } + dfe_done(rc); +} + +// ---------------------------------------------------------------------------- +// dispatching contexts to cloud + +static uint64_t rrdcontext_get_next_version(RRDCONTEXT *rc) { + time_t now = now_realtime_sec(); + uint64_t version = MAX(rc->version, rc->hub.version); + version = MAX((uint64_t)now, version); + version++; + return version; +} + +void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe_unused, void *bundle __maybe_unused) { + + // save it, so that we know the last version we sent to hub + rc->version = rc->hub.version = rrdcontext_get_next_version(rc); + rc->hub.id = string2str(rc->id); + rc->hub.title = string2str(rc->title); + rc->hub.units = string2str(rc->units); + rc->hub.family = string2str(rc->family); + rc->hub.chart_type = rrdset_type_name(rc->chart_type); + rc->hub.priority = rc->priority; + rc->hub.first_time_s = rc->first_time_s; + rc->hub.last_time_s = rrd_flag_is_collected(rc) ? 0 : rc->last_time_s; + rc->hub.deleted = rrd_flag_is_deleted(rc) ? true : false; + +#ifdef ENABLE_ACLK + struct context_updated message = { + .id = rc->hub.id, + .version = rc->hub.version, + .title = rc->hub.title, + .units = rc->hub.units, + .family = rc->hub.family, + .chart_type = rc->hub.chart_type, + .priority = rc->hub.priority, + .first_entry = rc->hub.first_time_s, + .last_entry = rc->hub.last_time_s, + .deleted = rc->hub.deleted, + }; + + if(likely(!rrd_flag_check(rc, RRD_FLAG_HIDDEN))) { + if (snapshot) { + if (!rc->hub.deleted) + contexts_snapshot_add_ctx_update(bundle, &message); + } + else + contexts_updated_add_ctx_update(bundle, &message); + } +#endif + + // store it to SQL + + if(rrd_flag_is_deleted(rc)) + rrdcontext_delete_from_sql_unsafe(rc); + + else if (ctx_store_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) + netdata_log_error("RRDCONTEXT: failed to save context '%s' version %"PRIu64" to SQL.", rc->hub.id, rc->hub.version); +} + +static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused) { + bool id_changed = false, + title_changed = false, + units_changed = false, + family_changed = false, + chart_type_changed = false, + priority_changed = false, + first_time_changed = false, + last_time_changed = false, + deleted_changed = false; + + RRD_FLAGS flags = rrd_flags_get(rc); + + if(unlikely(string2str(rc->id) != rc->hub.id)) + id_changed = true; + + if(unlikely(string2str(rc->title) != rc->hub.title)) + title_changed = true; + + if(unlikely(string2str(rc->units) != rc->hub.units)) + units_changed = true; + + if(unlikely(string2str(rc->family) != rc->hub.family)) + family_changed = true; + + if(unlikely(rrdset_type_name(rc->chart_type) != rc->hub.chart_type)) + chart_type_changed = true; + + if(unlikely(rc->priority != rc->hub.priority)) + priority_changed = true; + + if(unlikely((uint64_t)rc->first_time_s != rc->hub.first_time_s)) + first_time_changed = true; + + if(unlikely((uint64_t)((flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s) != rc->hub.last_time_s)) + last_time_changed = true; + + if(unlikely(((flags & RRD_FLAG_DELETED) ? true : false) != rc->hub.deleted)) + deleted_changed = true; + + if(unlikely(id_changed || title_changed || units_changed || family_changed || chart_type_changed || priority_changed || first_time_changed || last_time_changed || deleted_changed)) { + + internal_error(LOG_TRANSITIONS, + "RRDCONTEXT: %s NEW VERSION '%s'%s of host '%s', version %"PRIu64", title '%s'%s, units '%s'%s, family '%s'%s, chart type '%s'%s, priority %u%s, first_time_t %ld%s, last_time_t %ld%s, deleted '%s'%s, (queued for %llu ms, expected %llu ms)", + sending?"SENDING":"QUEUE", + string2str(rc->id), id_changed ? " (CHANGED)" : "", + rrdhost_hostname(rc->rrdhost), + rc->version, + string2str(rc->title), title_changed ? " (CHANGED)" : "", + string2str(rc->units), units_changed ? " (CHANGED)" : "", + string2str(rc->family), family_changed ? " (CHANGED)" : "", + rrdset_type_name(rc->chart_type), chart_type_changed ? " (CHANGED)" : "", + rc->priority, priority_changed ? " (CHANGED)" : "", + rc->first_time_s, first_time_changed ? " (CHANGED)" : "", + (flags & RRD_FLAG_COLLECTED) ? 0 : rc->last_time_s, last_time_changed ? " (CHANGED)" : "", + (flags & RRD_FLAG_DELETED) ? "true" : "false", deleted_changed ? " (CHANGED)" : "", + sending ? (now_realtime_usec() - rc->queue.queued_ut) / USEC_PER_MS : 0, + sending ? (rc->queue.scheduled_dispatch_ut - rc->queue.queued_ut) / USEC_PER_MS : 0 + ); + + rrdhost_update_cached_retention(rc->rrdhost, rc->first_time_s, rc->last_time_s, false); + + return true; + } + + if(!(flags & RRD_FLAG_COLLECTED)) + rrdhost_update_cached_retention(rc->rrdhost, rc->first_time_s, rc->last_time_s, false); + + return false; +} + +static inline usec_t rrdcontext_calculate_queued_dispatch_time_ut(RRDCONTEXT *rc, usec_t now_ut) { + + if(likely(rc->queue.delay_calc_ut >= rc->queue.queued_ut)) + return rc->queue.scheduled_dispatch_ut; + + RRD_FLAGS flags = rc->queue.queued_flags; + + usec_t delay = LONG_MAX; + int i; + struct rrdcontext_reason *reason; + for(i = 0, reason = &rrdcontext_reasons[i]; reason->name ; reason = &rrdcontext_reasons[++i]) { + if(unlikely(flags & reason->flag)) { + if(reason->delay_ut < delay) + delay = reason->delay_ut; + } + } + + if(unlikely(delay == LONG_MAX)) { + internal_error(true, "RRDCONTEXT: '%s', cannot find minimum delay of flags %x", string2str(rc->id), (unsigned int)flags); + delay = 60 * USEC_PER_SEC; + } + + rc->queue.delay_calc_ut = now_ut; + usec_t dispatch_ut = rc->queue.scheduled_dispatch_ut = rc->queue.queued_ut + delay; + return dispatch_ut; +} + +static void rrdcontext_dequeue_from_hub_queue(RRDCONTEXT *rc) { + dictionary_del(rc->rrdhost->rrdctx.hub_queue, string2str(rc->id)); +} + +static void rrdcontext_dispatch_queued_contexts_to_hub(RRDHOST *host, usec_t now_ut) { + + // check if we have received a streaming command for this host + if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS) || !aclk_connected || !host->rrdctx.hub_queue) + return; + + // check if there are queued items to send + if(!dictionary_entries(host->rrdctx.hub_queue)) + return; + + if(!host->node_id) + return; + + size_t messages_added = 0; + contexts_updated_t bundle = NULL; + + RRDCONTEXT *rc; + dfe_start_reentrant(host->rrdctx.hub_queue, rc) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + if(unlikely(messages_added >= MESSAGES_PER_BUNDLE_TO_SEND_TO_HUB_PER_HOST)) + break; + + worker_is_busy(WORKER_JOB_QUEUED); + usec_t dispatch_ut = rrdcontext_calculate_queued_dispatch_time_ut(rc, now_ut); + char *claim_id = get_agent_claimid(); + + if(unlikely(now_ut >= dispatch_ut) && claim_id) { + worker_is_busy(WORKER_JOB_CHECK); + + rrdcontext_lock(rc); + + if(check_if_cloud_version_changed_unsafe(rc, true)) { + worker_is_busy(WORKER_JOB_SEND); + +#ifdef ENABLE_ACLK + if(!bundle) { + // prepare the bundle to send the messages + char uuid[UUID_STR_LEN]; + uuid_unparse_lower(*host->node_id, uuid); + + bundle = contexts_updated_new(claim_id, uuid, 0, now_ut); + } +#endif + // update the hub data of the context, give a new version, pack the message + // and save an update to SQL + rrdcontext_message_send_unsafe(rc, false, bundle); + messages_added++; + + rc->queue.dispatches++; + rc->queue.dequeued_ut = now_ut; + } + else + rc->version = rc->hub.version; + + // remove it from the queue + worker_is_busy(WORKER_JOB_DEQUEUE); + rrdcontext_dequeue_from_hub_queue(rc); + + if(unlikely(rrdcontext_should_be_deleted(rc))) { + // this is a deleted context - delete it forever... + + worker_is_busy(WORKER_JOB_CLEANUP_DELETE); + + rrdcontext_dequeue_from_post_processing(rc); + rrdcontext_delete_from_sql_unsafe(rc); + + STRING *id = string_dup(rc->id); + rrdcontext_unlock(rc); + + // delete it from the master dictionary + if(!dictionary_del(host->rrdctx.contexts, string2str(rc->id))) + netdata_log_error("RRDCONTEXT: '%s' of host '%s' failed to be deleted from rrdcontext dictionary.", + string2str(id), rrdhost_hostname(host)); + + string_freez(id); + } + else + rrdcontext_unlock(rc); + } + freez(claim_id); + } + dfe_done(rc); + +#ifdef ENABLE_ACLK + if(service_running(SERVICE_CONTEXT) && bundle) { + // we have a bundle to send messages + + // update the version hash + contexts_updated_update_version_hash(bundle, rrdcontext_version_hash(host)); + + // send it + aclk_send_contexts_updated(bundle); + } + else if(bundle) + contexts_updated_delete(bundle); +#endif + +} + +// ---------------------------------------------------------------------------- +// worker thread + +static void rrdcontext_main_cleanup(void *ptr) { + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; + static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; + + // custom code + worker_unregister(); + + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; +} + +void *rrdcontext_main(void *ptr) { + netdata_thread_cleanup_push(rrdcontext_main_cleanup, ptr); + + worker_register("RRDCONTEXT"); + worker_register_job_name(WORKER_JOB_HOSTS, "hosts"); + worker_register_job_name(WORKER_JOB_CHECK, "dedup checks"); + worker_register_job_name(WORKER_JOB_SEND, "sent contexts"); + worker_register_job_name(WORKER_JOB_DEQUEUE, "deduplicated contexts"); + worker_register_job_name(WORKER_JOB_RETENTION, "metrics retention"); + worker_register_job_name(WORKER_JOB_QUEUED, "queued contexts"); + worker_register_job_name(WORKER_JOB_CLEANUP, "cleanups"); + worker_register_job_name(WORKER_JOB_CLEANUP_DELETE, "deletes"); + worker_register_job_name(WORKER_JOB_PP_METRIC, "check metrics"); + worker_register_job_name(WORKER_JOB_PP_INSTANCE, "check instances"); + worker_register_job_name(WORKER_JOB_PP_CONTEXT, "check contexts"); + + worker_register_job_custom_metric(WORKER_JOB_HUB_QUEUE_SIZE, "hub queue size", "contexts", WORKER_METRIC_ABSOLUTE); + worker_register_job_custom_metric(WORKER_JOB_PP_QUEUE_SIZE, "post processing queue size", "contexts", WORKER_METRIC_ABSOLUTE); + + heartbeat_t hb; + heartbeat_init(&hb); + usec_t step = RRDCONTEXT_WORKER_THREAD_HEARTBEAT_USEC; + + while (service_running(SERVICE_CONTEXT)) { + worker_is_idle(); + heartbeat_next(&hb, step); + + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + usec_t now_ut = now_realtime_usec(); + + if(rrdcontext_next_db_rotation_ut && now_ut > rrdcontext_next_db_rotation_ut) { + rrdcontext_recalculate_retention_all_hosts(); + rrdcontext_garbage_collect_for_all_hosts(); + rrdcontext_next_db_rotation_ut = 0; + } + + size_t hub_queued_contexts_for_all_hosts = 0; + size_t pp_queued_contexts_for_all_hosts = 0; + + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + if(unlikely(!service_running(SERVICE_CONTEXT))) break; + + worker_is_busy(WORKER_JOB_HOSTS); + + if(host->rrdctx.pp_queue) { + pp_queued_contexts_for_all_hosts += dictionary_entries(host->rrdctx.pp_queue); + rrdcontext_post_process_queued_contexts(host); + dictionary_garbage_collect(host->rrdctx.pp_queue); + } + + if(host->rrdctx.hub_queue) { + hub_queued_contexts_for_all_hosts += dictionary_entries(host->rrdctx.hub_queue); + rrdcontext_dispatch_queued_contexts_to_hub(host, now_ut); + dictionary_garbage_collect(host->rrdctx.hub_queue); + } + + if (host->rrdctx.contexts) + dictionary_garbage_collect(host->rrdctx.contexts); + + // calculate the number of metrics and instances in the host + RRDCONTEXT *rc; + uint32_t metrics = 0, instances = 0; + dfe_start_read(host->rrdctx.contexts, rc) { + metrics += rc->stats.metrics; + instances += dictionary_entries(rc->rrdinstances); + } + dfe_done(rc); + host->rrdctx.metrics = metrics; + host->rrdctx.instances = instances; + } + dfe_done(host); + + worker_set_metric(WORKER_JOB_HUB_QUEUE_SIZE, (NETDATA_DOUBLE)hub_queued_contexts_for_all_hosts); + worker_set_metric(WORKER_JOB_PP_QUEUE_SIZE, (NETDATA_DOUBLE)pp_queued_contexts_for_all_hosts); + } + + netdata_thread_cleanup_pop(1); + return NULL; +} |