author     Daniel Baumann <daniel.baumann@progress-linux.org>   2023-07-20 04:50:01 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2023-07-20 04:50:01 +0000
commit     cd4377fab21e0f500bef7f06543fa848a039c1e0 (patch)
tree       ba00a55e430c052d6bed0b61c0f8bbe8ebedd313 /database
parent     Releasing debian version 1.40.1-1. (diff)
Merging upstream version 1.41.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'database')
52 files changed, 6063 insertions(+), 2023 deletions(-)
diff --git a/database/contexts/api_v1.c b/database/contexts/api_v1.c
index daf945eeb..b4bcfe4ae 100644
--- a/database/contexts/api_v1.c
+++ b/database/contexts/api_v1.c
@@ -356,7 +356,7 @@ static inline int rrdcontext_to_json_callback(const DICTIONARY_ITEM *item, void
 int rrdcontext_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, const char *context, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions) {
     if(!host->rrdctx.contexts) {
-        error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host));
+        netdata_log_error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host));
         return HTTP_RESP_NOT_FOUND;
     }
 
@@ -393,7 +393,7 @@ int rrdcontext_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, R
 int rrdcontexts_to_json(RRDHOST *host, BUFFER *wb, time_t after, time_t before, RRDCONTEXT_TO_JSON_OPTIONS options, SIMPLE_PATTERN *chart_label_key, SIMPLE_PATTERN *chart_labels_filter, SIMPLE_PATTERN *chart_dimensions) {
     if(!host->rrdctx.contexts) {
-        error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host));
+        netdata_log_error("%s(): request for host '%s' that does not have rrdcontexts initialized.", __FUNCTION__, rrdhost_hostname(host));
         return HTTP_RESP_NOT_FOUND;
     }
diff --git a/database/contexts/api_v2.c b/database/contexts/api_v2.c
index d83a9e9e3..ed7f955ad 100644
--- a/database/contexts/api_v2.c
+++ b/database/contexts/api_v2.c
@@ -7,6 +7,110 @@
 // ----------------------------------------------------------------------------
 // /api/v2/contexts API
 
+struct alert_transitions_facets alert_transition_facets[] = {
+    [ATF_STATUS] = {
+        .id = "f_status",
+        .name = "Alert Status",
+        .query_param = "f_status",
+        .order = 1,
+    },
+    [ATF_TYPE] = {
+        .id = "f_type",
+        .name = "Alert Type",
+        .query_param = "f_type",
+        .order = 2,
+    },
+    [ATF_ROLE] = {
+        .id = "f_role",
+        .name = "Recipient Role",
+        .query_param = "f_role",
+        .order = 3,
+    },
+    [ATF_CLASS] = {
+        .id = "f_class",
+        .name = "Alert Class",
+        .query_param = "f_class",
+        .order = 4,
+    },
+    [ATF_COMPONENT] = {
+        .id = "f_component",
+        .name = "Alert Component",
+        .query_param = "f_component",
+        .order = 5,
+    },
+    [ATF_NODE] = {
+        .id = "f_node",
+        .name = "Alert Node",
+        .query_param = "f_node",
+        .order = 6,
+    },
+    [ATF_ALERT_NAME] = {
+        .id = "f_alert",
+        .name = "Alert Name",
+        .query_param = "f_alert",
+        .order = 7,
+    },
+    [ATF_CHART_NAME] = {
+        .id = "f_instance",
+        .name = "Instance Name",
+        .query_param = "f_instance",
+        .order = 8,
+    },
+    [ATF_CONTEXT] = {
+        .id = "f_context",
+        .name = "Context",
+        .query_param = "f_context",
+        .order = 9,
+    },
+
+    // terminator
+    [ATF_TOTAL_ENTRIES] = {
+        .id = NULL,
+        .name = NULL,
+        .query_param = NULL,
+        .order = 9999,
+    }
+};
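The table above is indexed by the ATF_* enum and ends with a NULL-id sentinel, so it can be walked either by index up to ATF_TOTAL_ENTRIES or until the terminator. A minimal sketch of the sentinel style, assuming only the struct layout implied above; print_facets() is a hypothetical helper, not part of this diff:

    #include <stdio.h>

    static void print_facets(void) {
        // stop at the terminator entry, whose .id is NULL
        for(const struct alert_transitions_facets *f = alert_transition_facets; f->id; f++)
            printf("facet %s -> query param '%s', order %d\n", f->name, f->query_param, (int)f->order);
    }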
+
+struct facet_entry {
+    uint32_t count;
+};
+
+struct alert_transitions_callback_data {
+    struct rrdcontext_to_json_v2_data *ctl;
+    BUFFER *wb;
+    bool debug;
+    bool only_one_config;
+
+    struct {
+        SIMPLE_PATTERN *pattern;
+        DICTIONARY *dict;
+    } facets[ATF_TOTAL_ENTRIES];
+
+    uint32_t max_items_to_return;
+    uint32_t items_to_return;
+
+    uint32_t items_evaluated;
+    uint32_t items_matched;
+
+    struct sql_alert_transition_fixed_size *base;       // double linked list - last item is base->prev
+    struct sql_alert_transition_fixed_size *last_added; // the last item added, not the last of the list
+
+    struct {
+        size_t first;
+        size_t skips_before;
+        size_t skips_after;
+        size_t backwards;
+        size_t forwards;
+        size_t prepend;
+        size_t append;
+        size_t shifts;
+    } operations;
+
+    uint32_t configs_added;
+};
+
 typedef enum __attribute__ ((__packed__)) {
     FTS_MATCHED_NONE = 0,
     FTS_MATCHED_HOST,
@@ -58,7 +162,14 @@ static const char *fts_match_to_string(FTS_MATCH match) {
     }
 }
 
-struct rrdcontext_to_json_v2_entry {
+struct function_v2_entry {
+    size_t size;
+    size_t used;
+    size_t *node_ids;
+    STRING *help;
+};
+
+struct context_v2_entry {
     size_t count;
     STRING *id;
     STRING *family;
@@ -69,6 +180,23 @@ struct rrdcontext_to_json_v2_entry {
     FTS_MATCH match;
 };
 
+struct alert_v2_entry {
+    RRDCALC *tmp;
+
+    STRING *name;
+
+    size_t ati;
+
+    size_t critical;
+    size_t warning;
+    size_t clear;
+    size_t error;
+
+    size_t instances;
+    DICTIONARY *nodes;
+    DICTIONARY *configs;
+};
+
 typedef struct full_text_search_index {
     size_t searches;
     size_t string_searches;
@@ -87,11 +215,18 @@ static inline bool full_text_search_char(FTS_INDEX *fts, SIMPLE_PATTERN *q, char
     return simple_pattern_matches(q, ptr);
 }
 
+struct contexts_v2_node {
+    size_t ni;
+    RRDHOST *host;
+};
+
 struct rrdcontext_to_json_v2_data {
+    time_t now;
+
     BUFFER *wb;
     struct api_v2_contexts_request *request;
-    DICTIONARY *ctx;
 
+    CONTEXTS_V2_MODE mode;
     CONTEXTS_V2_OPTIONS options;
     struct query_versions versions;
 
@@ -99,23 +234,151 @@ struct rrdcontext_to_json_v2_data {
         SIMPLE_PATTERN *scope_pattern;
         SIMPLE_PATTERN *pattern;
         size_t ni;
+        DICTIONARY *dict; // the result set
     } nodes;
 
     struct {
         SIMPLE_PATTERN *scope_pattern;
         SIMPLE_PATTERN *pattern;
+        size_t ci;
+        DICTIONARY *dict; // the result set
     } contexts;
 
     struct {
+        SIMPLE_PATTERN *alert_name_pattern;
+        time_t alarm_id_filter;
+
+        size_t ati;
+
+        DICTIONARY *alerts;
+        DICTIONARY *alert_instances;
+    } alerts;
+
+    struct {
         FTS_MATCH host_match;
         char host_node_id_str[UUID_STR_LEN];
         SIMPLE_PATTERN *pattern;
         FTS_INDEX fts;
     } q;
 
+    struct {
+        DICTIONARY *dict; // the result set
+    } functions;
+
+    struct {
+        bool enabled;
+        bool relative;
+        time_t after;
+        time_t before;
+    } window;
+
     struct query_timings timings;
 };
 
+static void alerts_v2_add(struct alert_v2_entry *t, RRDCALC *rc) {
+    t->instances++;
+
+    switch(rc->status) {
+        case RRDCALC_STATUS_CRITICAL:
+            t->critical++;
+            break;
+
+        case RRDCALC_STATUS_WARNING:
+            t->warning++;
+            break;
+
+        case RRDCALC_STATUS_CLEAR:
+            t->clear++;
+            break;
+
+        case RRDCALC_STATUS_REMOVED:
+        case RRDCALC_STATUS_UNINITIALIZED:
+            break;
+
+        case RRDCALC_STATUS_UNDEFINED:
+        default:
+            if(!netdata_double_isnumber(rc->value))
+                t->error++;
+
+            break;
+    }
+
+    dictionary_set(t->nodes, rc->rrdset->rrdhost->machine_guid, NULL, 0);
+
+    char key[UUID_STR_LEN + 1];
+    uuid_unparse_lower(rc->config_hash_id, key);
+    dictionary_set(t->configs, key, NULL, 0);
+}
+
+static void alerts_v2_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data) {
+    struct rrdcontext_to_json_v2_data *ctl = data;
+    struct alert_v2_entry *t = value;
+    RRDCALC *rc = t->tmp;
+    t->name = rc->name;
+    t->ati = ctl->alerts.ati++;
+
+    t->nodes = dictionary_create(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_VALUE_LINK_DONT_CLONE|DICT_OPTION_NAME_LINK_DONT_CLONE);
+    t->configs = dictionary_create(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_VALUE_LINK_DONT_CLONE|DICT_OPTION_NAME_LINK_DONT_CLONE);
+
+    alerts_v2_add(t, rc);
+}
+
+static bool alerts_v2_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) {
+    struct alert_v2_entry *t = old_value, *n = new_value;
+    RRDCALC *rc = n->tmp;
+    alerts_v2_add(t, rc);
+    return true;
+}
+
+static void alerts_v2_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
+    struct alert_v2_entry *t = value;
+    dictionary_destroy(t->nodes);
+    dictionary_destroy(t->configs);
+}
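These three callbacks follow the usual libnetdata dictionary aggregation pattern: the insert callback initializes a value the first time a key is seen, the conflict callback folds a repeated dictionary_set() of the same key into the existing value, and the delete callback releases whatever the value owns. A minimal sketch of the wiring, assuming the dictionary_register_* helpers behave as elsewhere in libnetdata; the per-key counter is hypothetical:

    struct hits { size_t count; };

    static void hits_insert(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
        ((struct hits *)value)->count = 1;   // first time this key is seen
    }

    static bool hits_conflict(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value __maybe_unused, void *data __maybe_unused) {
        ((struct hits *)old_value)->count++; // same key set again - merge into the existing value
        return true;
    }

    // usage: every dictionary_set() with the same key increments the counter
    DICTIONARY *d = dictionary_create(DICT_OPTION_SINGLE_THREADED);
    dictionary_register_insert_callback(d, hits_insert, NULL);
    dictionary_register_conflict_callback(d, hits_conflict, NULL);
    struct hits h = { 0 };
    dictionary_set(d, "cpu.load", &h, sizeof(h));
    dictionary_set(d, "cpu.load", &h, sizeof(h)); // conflict -> count becomes 2
    dictionary_destroy(d);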
+
+static void alert_instances_v2_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data) {
+    struct rrdcontext_to_json_v2_data *ctl = data;
+    struct sql_alert_instance_v2_entry *t = value;
+    RRDCALC *rc = t->tmp;
+
+    t->context = rc->rrdset->context;
+    t->chart_id = rc->rrdset->id;
+    t->chart_name = rc->rrdset->name;
+    t->family = rc->rrdset->family;
+    t->units = rc->units;
+    t->classification = rc->classification;
+    t->type = rc->type;
+    t->recipient = rc->recipient;
+    t->component = rc->component;
+    t->name = rc->name;
+    t->source = rc->source;
+    t->status = rc->status;
+    t->flags = rc->run_flags;
+    t->info = rc->info;
+    t->value = rc->value;
+    t->last_updated = rc->last_updated;
+    t->last_status_change = rc->last_status_change;
+    t->last_status_change_value = rc->last_status_change_value;
+    t->host = rc->rrdset->rrdhost;
+    t->alarm_id = rc->id;
+    t->ni = ctl->nodes.ni;
+    t->global_id = rc->ae ? rc->ae->global_id : 0;
+    t->name = rc->name;
+
+    uuid_copy(t->config_hash_id, rc->config_hash_id);
+    if(rc->ae)
+        uuid_copy(t->last_transition_id, rc->ae->transition_id);
+}
+
+static bool alert_instances_v2_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value __maybe_unused, void *new_value __maybe_unused, void *data __maybe_unused) {
+    internal_fatal(true, "This should never happen!");
+    return true;
+}
+
+static void alert_instances_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value __maybe_unused, void *data __maybe_unused) {
+    ;
+}
+
 static FTS_MATCH rrdcontext_to_json_v2_full_text_search(struct rrdcontext_to_json_v2_data *ctl, RRDCONTEXT *rc, SIMPLE_PATTERN *q) {
     if(unlikely(full_text_search_string(&ctl->q.fts, q, rc->id) ||
                 full_text_search_string(&ctl->q.fts, q, rc->family)))
@@ -132,6 +395,9 @@ static FTS_MATCH rrdcontext_to_json_v2_full_text_search(struct rrdcontext_to_jso
     dfe_start_read(rc->rrdinstances, ri) {
         if(matched) break;
 
+        if(ctl->window.enabled && !query_matches_retention(ctl->window.after, ctl->window.before, ri->first_time_s, (ri->flags & RRD_FLAG_COLLECTED) ? ctl->now : ri->last_time_s, 0))
+            continue;
+
         if(unlikely(full_text_search_string(&ctl->q.fts, q, ri->id)) ||
            (ri->name != ri->id && full_text_search_string(&ctl->q.fts, q, ri->name))) {
             matched = FTS_MATCHED_INSTANCE;
@@ -140,6 +406,9 @@ static FTS_MATCH rrdcontext_to_json_v2_full_text_search(struct rrdcontext_to_jso
 
         RRDMETRIC *rm;
         dfe_start_read(ri->rrdmetrics, rm) {
+            if(ctl->window.enabled && !query_matches_retention(ctl->window.after, ctl->window.before, rm->first_time_s, (rm->flags & RRD_FLAG_COLLECTED) ? ctl->now : rm->last_time_s, 0))
+                continue;
+
             if(unlikely(full_text_search_string(&ctl->q.fts, q, rm->id)) ||
                (rm->name != rm->id && full_text_search_string(&ctl->q.fts, q, rm->name))) {
                 matched = FTS_MATCHED_DIMENSION;
@@ -161,7 +430,7 @@ static FTS_MATCH rrdcontext_to_json_v2_full_text_search(struct rrdcontext_to_jso
 
         if(ri->rrdset) {
             RRDSET *st = ri->rrdset;
-            netdata_rwlock_rdlock(&st->alerts.rwlock);
+            rw_spinlock_read_lock(&st->alerts.spinlock);
             for (RRDCALC *rcl = st->alerts.base; rcl; rcl = rcl->next) {
                 if(unlikely(full_text_search_string(&ctl->q.fts, q, rcl->name))) {
                     matched = FTS_MATCHED_ALERT;
@@ -173,100 +442,431 @@ static FTS_MATCH rrdcontext_to_json_v2_full_text_search(struct rrdcontext_to_jso
                     break;
                 }
             }
-            netdata_rwlock_unlock(&st->alerts.rwlock);
+            rw_spinlock_read_unlock(&st->alerts.spinlock);
         }
     }
     dfe_done(ri);
 
     return matched;
 }
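The new window checks skip instances and metrics whose retention does not overlap the requested time window, with ctl->now standing in for last_time_s while the metric is still being collected, so live metrics always extend to "now". A hedged sketch of the interval test presumably implemented by query_matches_retention() (the real helper also takes a tier margin as its fifth argument, ignored here):

    #include <stdbool.h>
    #include <time.h>

    // does [first_time_s, last_time_s] intersect the requested [after, before]?
    static bool window_overlaps(time_t after, time_t before, time_t first_time_s, time_t last_time_s) {
        return first_time_s <= before && last_time_s >= after;
    }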
+
+static bool rrdcontext_matches_alert(struct rrdcontext_to_json_v2_data *ctl, RRDCONTEXT *rc) {
+    size_t matches = 0;
+    RRDINSTANCE *ri;
+    dfe_start_read(rc->rrdinstances, ri) {
+        if(ri->rrdset) {
+            RRDSET *st = ri->rrdset;
+            rw_spinlock_read_lock(&st->alerts.spinlock);
+            for (RRDCALC *rcl = st->alerts.base; rcl; rcl = rcl->next) {
+                if(ctl->alerts.alert_name_pattern && !simple_pattern_matches_string(ctl->alerts.alert_name_pattern, rcl->name))
+                    continue;
+
+                if(ctl->alerts.alarm_id_filter && ctl->alerts.alarm_id_filter != rcl->id)
+                    continue;
+
+                size_t m = ctl->request->alerts.status & CONTEXTS_V2_ALERT_STATUSES ? 0 : 1;
+
+                if (!m) {
+                    if ((ctl->request->alerts.status & CONTEXT_V2_ALERT_UNINITIALIZED) &&
+                        rcl->status == RRDCALC_STATUS_UNINITIALIZED)
+                        m++;
+
+                    if ((ctl->request->alerts.status & CONTEXT_V2_ALERT_UNDEFINED) &&
+                        rcl->status == RRDCALC_STATUS_UNDEFINED)
+                        m++;
+
+                    if ((ctl->request->alerts.status & CONTEXT_V2_ALERT_CLEAR) &&
+                        rcl->status == RRDCALC_STATUS_CLEAR)
+                        m++;
+
+                    if ((ctl->request->alerts.status & CONTEXT_V2_ALERT_RAISED) &&
+                        rcl->status >= RRDCALC_STATUS_RAISED)
+                        m++;
+
+                    if ((ctl->request->alerts.status & CONTEXT_V2_ALERT_WARNING) &&
+                        rcl->status == RRDCALC_STATUS_WARNING)
+                        m++;
+
+                    if ((ctl->request->alerts.status & CONTEXT_V2_ALERT_CRITICAL) &&
+                        rcl->status == RRDCALC_STATUS_CRITICAL)
+                        m++;
+
+                    if(!m)
+                        continue;
+                }
+
+                struct alert_v2_entry t = {
+                    .tmp = rcl,
+                };
+                struct alert_v2_entry *a2e = dictionary_set(ctl->alerts.alerts, string2str(rcl->name), &t,
+                                                            sizeof(struct alert_v2_entry));
+                size_t ati = a2e->ati;
+                matches++;
+
+                if (ctl->options & (CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES | CONTEXT_V2_OPTION_ALERTS_WITH_VALUES)) {
+                    char key[20 + 1];
+                    snprintfz(key, 20, "%p", rcl);
+
+                    struct sql_alert_instance_v2_entry z = {
+                        .ati = ati,
+                        .tmp = rcl,
+                    };
+                    dictionary_set(ctl->alerts.alert_instances, key, &z, sizeof(z));
+                }
+            }
+            rw_spinlock_read_unlock(&st->alerts.spinlock);
+        }
+    }
+    dfe_done(ri);
+
+    return matches != 0;
+}
+
 static ssize_t rrdcontext_to_json_v2_add_context(void *data, RRDCONTEXT_ACQUIRED *rca, bool queryable_context __maybe_unused) {
     struct rrdcontext_to_json_v2_data *ctl = data;
 
     RRDCONTEXT *rc = rrdcontext_acquired_value(rca);
 
+    if(ctl->window.enabled && !query_matches_retention(ctl->window.after, ctl->window.before, rc->first_time_s, (rc->flags & RRD_FLAG_COLLECTED) ? ctl->now : rc->last_time_s, 0))
+        return 0; // continue to next context
+
     FTS_MATCH match = ctl->q.host_match;
-    if((ctl->options & CONTEXTS_V2_SEARCH) && ctl->q.pattern) {
+    if((ctl->mode & CONTEXTS_V2_SEARCH) && ctl->q.pattern) {
         match = rrdcontext_to_json_v2_full_text_search(ctl, rc, ctl->q.pattern);
 
         if(match == FTS_MATCHED_NONE)
-            return 0;
+            return 0; // continue to next context
     }
 
-    struct rrdcontext_to_json_v2_entry t = {
-        .count = 0,
-        .id = rc->id,
-        .family = string_dup(rc->family),
-        .priority = rc->priority,
-        .first_time_s = rc->first_time_s,
-        .last_time_s = rc->last_time_s,
-        .flags = rc->flags,
-        .match = match,
-    }, *z = dictionary_set(ctl->ctx, string2str(rc->id), &t, sizeof(t));
-
-    if(!z->count) {
-        // we just added this
-        z->count = 1;
+    if(ctl->mode & CONTEXTS_V2_ALERTS) {
+        if(!rrdcontext_matches_alert(ctl, rc))
+            return 0; // continue to next context
     }
-    else {
-        // it is already in there
-        z->count++;
-        z->flags |= rc->flags;
-
-        if(z->priority > rc->priority)
-            z->priority = rc->priority;
-
-        if(z->first_time_s > rc->first_time_s)
-            z->first_time_s = rc->first_time_s;
-
-        if(z->last_time_s < rc->last_time_s)
-            z->last_time_s = rc->last_time_s;
 
-        if(z->family != rc->family) {
-            z->family = string_2way_merge(z->family, rc->family);
-        }
+    if(ctl->contexts.dict) {
+        struct context_v2_entry t = {
+            .count = 1,
+            .id = rc->id,
+            .family = string_dup(rc->family),
+            .priority = rc->priority,
+            .first_time_s = rc->first_time_s,
+            .last_time_s = rc->last_time_s,
+            .flags = rc->flags,
+            .match = match,
+        };
+
+        dictionary_set(ctl->contexts.dict, string2str(rc->id), &t, sizeof(struct context_v2_entry));
     }
 
     return 1;
 }
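The status filter treats the request's alert status field as a bitmask: when none of the status bits are set, m starts at 1 and every alert passes; otherwise the alert's status must match at least one requested bit. A compact restatement of that rule as a standalone predicate (names here are illustrative, not from the diff):

    #include <stdbool.h>
    #include <stdint.h>

    // pass when no status filter was requested, otherwise require a bit match
    static bool status_selected(uint32_t requested_mask, uint32_t all_status_bits, uint32_t alert_status_bit) {
        if(!(requested_mask & all_status_bits))
            return true; // no status filter requested - everything matches
        return (requested_mask & alert_status_bit) != 0;
    }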
 
-void buffer_json_node_add_v2(BUFFER *wb, RRDHOST *host, size_t ni, usec_t duration_ut) {
+void buffer_json_agent_status_id(BUFFER *wb, size_t ai, usec_t duration_ut) {
+    buffer_json_member_add_object(wb, "st");
+    {
+        buffer_json_member_add_uint64(wb, "ai", ai);
+        buffer_json_member_add_uint64(wb, "code", 200);
+        buffer_json_member_add_string(wb, "msg", "");
+        if (duration_ut)
+            buffer_json_member_add_double(wb, "ms", (NETDATA_DOUBLE) duration_ut / 1000.0);
+    }
+    buffer_json_object_close(wb);
+}
+
+void buffer_json_node_add_v2(BUFFER *wb, RRDHOST *host, size_t ni, usec_t duration_ut, bool status) {
     buffer_json_member_add_string(wb, "mg", host->machine_guid);
+
     if(host->node_id)
         buffer_json_member_add_uuid(wb, "nd", host->node_id);
     buffer_json_member_add_string(wb, "nm", rrdhost_hostname(host));
     buffer_json_member_add_uint64(wb, "ni", ni);
-    buffer_json_member_add_object(wb, "st");
-    buffer_json_member_add_uint64(wb, "ai", 0);
-    buffer_json_member_add_uint64(wb, "code", 200);
-    buffer_json_member_add_string(wb, "msg", "");
-    if(duration_ut)
-        buffer_json_member_add_double(wb, "ms", (NETDATA_DOUBLE)duration_ut / 1000.0);
-    buffer_json_object_close(wb);
+
+    if(status)
+        buffer_json_agent_status_id(wb, 0, duration_ut);
+}
+
+static void rrdhost_receiver_to_json(BUFFER *wb, RRDHOST_STATUS *s, const char *key) {
+    buffer_json_member_add_object(wb, key);
+    {
+        buffer_json_member_add_uint64(wb, "id", s->ingest.id);
+        buffer_json_member_add_uint64(wb, "hops", s->ingest.hops);
+        buffer_json_member_add_string(wb, "type", rrdhost_ingest_type_to_string(s->ingest.type));
+        buffer_json_member_add_string(wb, "status", rrdhost_ingest_status_to_string(s->ingest.status));
+        buffer_json_member_add_time_t(wb, "since", s->ingest.since);
+        buffer_json_member_add_time_t(wb, "age", s->now - s->ingest.since);
+
+        if(s->ingest.type == RRDHOST_INGEST_TYPE_CHILD) {
+            if(s->ingest.status == RRDHOST_INGEST_STATUS_OFFLINE)
+                buffer_json_member_add_string(wb, "reason", stream_handshake_error_to_string(s->ingest.reason));
+
+            if(s->ingest.status == RRDHOST_INGEST_STATUS_REPLICATING) {
+                buffer_json_member_add_object(wb, "replication");
+                {
+                    buffer_json_member_add_boolean(wb, "in_progress", s->ingest.replication.in_progress);
+                    buffer_json_member_add_double(wb, "completion", s->ingest.replication.completion);
+                    buffer_json_member_add_uint64(wb, "instances", s->ingest.replication.instances);
+                }
+                buffer_json_object_close(wb); // replication
+            }
+
+            if(s->ingest.status == RRDHOST_INGEST_STATUS_REPLICATING || s->ingest.status == RRDHOST_INGEST_STATUS_ONLINE) {
+                buffer_json_member_add_object(wb, "source");
+                {
+                    char buf[1024 + 1];
+                    snprintfz(buf, 1024, "[%s]:%d%s", s->ingest.peers.local.ip, s->ingest.peers.local.port, s->ingest.ssl ? ":SSL" : "");
+                    buffer_json_member_add_string(wb, "local", buf);
+
+                    snprintfz(buf, 1024, "[%s]:%d%s", s->ingest.peers.peer.ip, s->ingest.peers.peer.port, s->ingest.ssl ? ":SSL" : "");
+                    buffer_json_member_add_string(wb, "remote", buf);
+
+                    stream_capabilities_to_json_array(wb, s->ingest.capabilities, "capabilities");
+                }
+                buffer_json_object_close(wb); // source
+            }
+        }
+    }
+    buffer_json_object_close(wb); // collection
+}
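All of the JSON in this file is emitted through the buffer_json_* builder; the bare { } blocks are only visual scoping, since nesting is tracked by the builder itself and every member_add_object/array must be paired with a close call. A minimal sketch of the call pattern, using only functions that appear in this diff (the member names are made up for illustration):

    static void example_to_json(BUFFER *wb) {
        buffer_json_initialize(wb, "\"", "\"", 0, true, false); // as used in contexts_v2_alert_config_to_json below
        buffer_json_member_add_object(wb, "node");              // "node": {
        {
            buffer_json_member_add_string(wb, "nm", "myhost");  //   "nm": "myhost",
            buffer_json_member_add_uint64(wb, "ni", 1);         //   "ni": 1
        }
        buffer_json_object_close(wb);                           // }
        buffer_json_finalize(wb);                               // closes the root object
    }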
+
+static void rrdhost_sender_to_json(BUFFER *wb, RRDHOST_STATUS *s, const char *key) {
+    if(s->stream.status == RRDHOST_STREAM_STATUS_DISABLED)
+        return;
+
+    buffer_json_member_add_object(wb, key);
+    {
+        buffer_json_member_add_uint64(wb, "id", s->stream.id);
+        buffer_json_member_add_uint64(wb, "hops", s->stream.hops);
+        buffer_json_member_add_string(wb, "status", rrdhost_streaming_status_to_string(s->stream.status));
+        buffer_json_member_add_time_t(wb, "since", s->stream.since);
+        buffer_json_member_add_time_t(wb, "age", s->now - s->stream.since);
+
+        if (s->stream.status == RRDHOST_STREAM_STATUS_OFFLINE)
+            buffer_json_member_add_string(wb, "reason", stream_handshake_error_to_string(s->stream.reason));
+
+        if (s->stream.status == RRDHOST_STREAM_STATUS_REPLICATING) {
+            buffer_json_member_add_object(wb, "replication");
+            {
+                buffer_json_member_add_boolean(wb, "in_progress", s->stream.replication.in_progress);
+                buffer_json_member_add_double(wb, "completion", s->stream.replication.completion);
+                buffer_json_member_add_uint64(wb, "instances", s->stream.replication.instances);
+            }
+            buffer_json_object_close(wb);
+        }
+
+        buffer_json_member_add_object(wb, "destination");
+        {
+            char buf[1024 + 1];
+            snprintfz(buf, 1024, "[%s]:%d%s", s->stream.peers.local.ip, s->stream.peers.local.port, s->stream.ssl ? ":SSL" : "");
+            buffer_json_member_add_string(wb, "local", buf);
+
+            snprintfz(buf, 1024, "[%s]:%d%s", s->stream.peers.peer.ip, s->stream.peers.peer.port, s->stream.ssl ? ":SSL" : "");
+            buffer_json_member_add_string(wb, "remote", buf);
+
+            stream_capabilities_to_json_array(wb, s->stream.capabilities, "capabilities");
+
+            buffer_json_member_add_object(wb, "traffic");
+            {
+                buffer_json_member_add_boolean(wb, "compression", s->stream.compression);
+                buffer_json_member_add_uint64(wb, "data", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_DATA]);
+                buffer_json_member_add_uint64(wb, "metadata", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_METADATA]);
+                buffer_json_member_add_uint64(wb, "functions", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_FUNCTIONS]);
+                buffer_json_member_add_uint64(wb, "replication", s->stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_REPLICATION]);
+            }
+            buffer_json_object_close(wb); // traffic
+
+            buffer_json_member_add_array(wb, "candidates");
+            struct rrdpush_destinations *d;
+            for (d = s->host->destinations; d; d = d->next) {
+                buffer_json_add_array_item_object(wb);
+                buffer_json_member_add_uint64(wb, "attempts", d->attempts);
+                {
+                    if (d->ssl) {
+                        snprintfz(buf, 1024, "%s:SSL", string2str(d->destination));
+                        buffer_json_member_add_string(wb, "destination", buf);
+                    }
+                    else
+                        buffer_json_member_add_string(wb, "destination", string2str(d->destination));
+
+                    buffer_json_member_add_time_t(wb, "since", d->since);
+                    buffer_json_member_add_time_t(wb, "age", s->now - d->since);
+                    buffer_json_member_add_string(wb, "last_handshake", stream_handshake_error_to_string(d->reason));
+                    if(d->postpone_reconnection_until > s->now) {
+                        buffer_json_member_add_time_t(wb, "next_check", d->postpone_reconnection_until);
+                        buffer_json_member_add_time_t(wb, "next_in", d->postpone_reconnection_until - s->now);
+                    }
+                }
+                buffer_json_object_close(wb); // each candidate
+            }
+            buffer_json_array_close(wb); // candidates
+        }
+        buffer_json_object_close(wb); // destination
+    }
+    buffer_json_object_close(wb); // streaming
+}
+
+static void agent_capabilities_to_json(BUFFER *wb, RRDHOST *host, const char *key) {
+    buffer_json_member_add_array(wb, key);
+
+    struct capability *capas = aclk_get_node_instance_capas(host);
+    for(struct capability *capa = capas; capa->name ; capa++) {
+        buffer_json_add_array_item_object(wb);
+        {
+            buffer_json_member_add_string(wb, "name", capa->name);
+            buffer_json_member_add_uint64(wb, "version", capa->version);
+            buffer_json_member_add_boolean(wb, "enabled", capa->enabled);
+        }
+        buffer_json_object_close(wb);
+    }
+    buffer_json_array_close(wb);
+    freez(capas);
+}
+
+static inline void rrdhost_health_to_json_v2(BUFFER *wb, const char *key, RRDHOST_STATUS *s) {
+    buffer_json_member_add_object(wb, key);
+    {
+        buffer_json_member_add_string(wb, "status", rrdhost_health_status_to_string(s->health.status));
+        if (s->health.status == RRDHOST_HEALTH_STATUS_RUNNING) {
+            buffer_json_member_add_object(wb, "alerts");
+            {
+                buffer_json_member_add_uint64(wb, "critical", s->health.alerts.critical);
+                buffer_json_member_add_uint64(wb, "warning", s->health.alerts.warning);
+                buffer_json_member_add_uint64(wb, "clear", s->health.alerts.clear);
+                buffer_json_member_add_uint64(wb, "undefined", s->health.alerts.undefined);
+                buffer_json_member_add_uint64(wb, "uninitialized", s->health.alerts.uninitialized);
+            }
+            buffer_json_object_close(wb); // alerts
+        }
+    }
+    buffer_json_object_close(wb); // health
+}
+
+static void rrdcontext_to_json_v2_rrdhost(BUFFER *wb, RRDHOST *host, struct rrdcontext_to_json_v2_data *ctl, size_t node_id) {
+    buffer_json_add_array_item_object(wb); // this node
+    buffer_json_node_add_v2(wb, host, node_id, 0,
+                            (ctl->mode & CONTEXTS_V2_AGENTS) && !(ctl->mode & CONTEXTS_V2_NODE_INSTANCES));
+
+    if(ctl->mode & (CONTEXTS_V2_NODES_INFO | CONTEXTS_V2_NODE_INSTANCES)) {
+        RRDHOST_STATUS s;
+        rrdhost_status(host, ctl->now, &s);
+
+        if (ctl->mode & (CONTEXTS_V2_NODES_INFO)) {
+            buffer_json_member_add_string(wb, "v", rrdhost_program_version(host));
+
+            host_labels2json(host, wb, "labels");
+
+            if (host->system_info) {
+                buffer_json_member_add_object(wb, "hw");
+                {
+                    buffer_json_member_add_string_or_empty(wb, "architecture", host->system_info->architecture);
+                    buffer_json_member_add_string_or_empty(wb, "cpu_frequency", host->system_info->host_cpu_freq);
+                    buffer_json_member_add_string_or_empty(wb, "cpus", host->system_info->host_cores);
+                    buffer_json_member_add_string_or_empty(wb, "memory", host->system_info->host_ram_total);
+                    buffer_json_member_add_string_or_empty(wb, "disk_space", host->system_info->host_disk_space);
+                    buffer_json_member_add_string_or_empty(wb, "virtualization", host->system_info->virtualization);
+                    buffer_json_member_add_string_or_empty(wb, "container", host->system_info->container);
+                }
+                buffer_json_object_close(wb);
+
+                buffer_json_member_add_object(wb, "os");
+                {
+                    buffer_json_member_add_string_or_empty(wb, "id", host->system_info->host_os_id);
+                    buffer_json_member_add_string_or_empty(wb, "nm", host->system_info->host_os_name);
+                    buffer_json_member_add_string_or_empty(wb, "v", host->system_info->host_os_version);
+                    buffer_json_member_add_object(wb, "kernel");
+                    buffer_json_member_add_string_or_empty(wb, "nm", host->system_info->kernel_name);
+                    buffer_json_member_add_string_or_empty(wb, "v", host->system_info->kernel_version);
+                    buffer_json_object_close(wb);
+                }
+                buffer_json_object_close(wb);
+            }
+
+            // created     - the node is created but never connected to cloud
+            // unreachable - not currently connected
+            // stale       - connected but not having live data
+            // reachable   - connected with live data
+            // pruned      - not connected for some time and has been removed
+            buffer_json_member_add_string(wb, "state", rrdhost_state_cloud_emulation(host) ? "reachable" : "stale");
+
+            rrdhost_health_to_json_v2(wb, "health", &s);
+            agent_capabilities_to_json(wb, host, "capabilities");
+        }
+
+        if (ctl->mode & (CONTEXTS_V2_NODE_INSTANCES)) {
+            buffer_json_member_add_array(wb, "instances");
+            buffer_json_add_array_item_object(wb); // this instance
+            {
+                buffer_json_agent_status_id(wb, 0, 0);
+
+                buffer_json_member_add_object(wb, "db");
+                {
+                    buffer_json_member_add_string(wb, "status", rrdhost_db_status_to_string(s.db.status));
+                    buffer_json_member_add_string(wb, "liveness", rrdhost_db_liveness_to_string(s.db.liveness));
+                    buffer_json_member_add_string(wb, "mode", rrd_memory_mode_name(s.db.mode));
+                    buffer_json_member_add_time_t(wb, "first_time", s.db.first_time_s);
+                    buffer_json_member_add_time_t(wb, "last_time", s.db.last_time_s);
+                    buffer_json_member_add_uint64(wb, "metrics", s.db.metrics);
+                    buffer_json_member_add_uint64(wb, "instances", s.db.instances);
+                    buffer_json_member_add_uint64(wb, "contexts", s.db.contexts);
+                }
+                buffer_json_object_close(wb);
+
+                rrdhost_receiver_to_json(wb, &s, "ingest");
+                rrdhost_sender_to_json(wb, &s, "stream");
+
+                buffer_json_member_add_object(wb, "ml");
+                buffer_json_member_add_string(wb, "status", rrdhost_ml_status_to_string(s.ml.status));
+                buffer_json_member_add_string(wb, "type", rrdhost_ml_type_to_string(s.ml.type));
+                if (s.ml.status == RRDHOST_ML_STATUS_RUNNING) {
+                    buffer_json_member_add_object(wb, "metrics");
+                    {
+                        buffer_json_member_add_uint64(wb, "anomalous", s.ml.metrics.anomalous);
+                        buffer_json_member_add_uint64(wb, "normal", s.ml.metrics.normal);
+                        buffer_json_member_add_uint64(wb, "trained", s.ml.metrics.trained);
+                        buffer_json_member_add_uint64(wb, "pending", s.ml.metrics.pending);
+                        buffer_json_member_add_uint64(wb, "silenced", s.ml.metrics.silenced);
+                    }
+                    buffer_json_object_close(wb); // metrics
+                }
+                buffer_json_object_close(wb); // ml
+
+                rrdhost_health_to_json_v2(wb, "health", &s);
+
+                host_functions2json(host, wb); // functions
+                agent_capabilities_to_json(wb, host, "capabilities");
+            }
+            buffer_json_object_close(wb); // this instance
+            buffer_json_array_close(wb); // instances
+        }
+    }
+    buffer_json_object_close(wb); // this node
+}
+
 static ssize_t rrdcontext_to_json_v2_add_host(void *data, RRDHOST *host, bool queryable_host) {
     if(!queryable_host || !host->rrdctx.contexts)
         // the host matches the 'scope_host' but does not match the 'host' patterns
         // or the host does not have any contexts
-        return 0;
+        return 0; // continue to next host
 
     struct rrdcontext_to_json_v2_data *ctl = data;
-    BUFFER *wb = ctl->wb;
+
+    if(ctl->window.enabled && !rrdhost_matches_window(host, ctl->window.after, ctl->window.before, ctl->now))
+        // the host does not have data in the requested window
+        return 0; // continue to next host
 
     if(ctl->request->timeout_ms && now_monotonic_usec() > ctl->timings.received_ut + ctl->request->timeout_ms * USEC_PER_MS)
         // timed out
-        return -2;
+        return -2; // stop the query
 
     if(ctl->request->interrupt_callback && ctl->request->interrupt_callback(ctl->request->interrupt_callback_data))
         // interrupted
-        return -1;
+        return -1; // stop the query
 
-    bool host_matched = (ctl->options & CONTEXTS_V2_NODES);
-    bool do_contexts = (ctl->options & (CONTEXTS_V2_CONTEXTS | CONTEXTS_V2_SEARCH));
+    bool host_matched = (ctl->mode & CONTEXTS_V2_NODES);
+    bool do_contexts = (ctl->mode & (CONTEXTS_V2_CONTEXTS | CONTEXTS_V2_ALERTS));
 
     ctl->q.host_match = FTS_MATCHED_NONE;
-    if((ctl->options & CONTEXTS_V2_SEARCH)) {
+    if((ctl->mode & CONTEXTS_V2_SEARCH)) {
         // check if we match the host itself
         if(ctl->q.pattern && (
             full_text_search_string(&ctl->q.fts, ctl->q.pattern, host->hostname) ||
@@ -293,92 +893,72 @@ static ssize_t rrdcontext_to_json_v2_add_host(void *data, RRDHOST *host, bool qu
         // restore it
         ctl->q.pattern = old_q;
 
-        if(added == -1)
-            return -1;
+        if(unlikely(added < 0))
+            return -1; // stop the query
 
         if(added)
            host_matched = true;
     }
 
-    if(host_matched && (ctl->options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED | CONTEXTS_V2_DEBUG))) {
-        buffer_json_add_array_item_object(wb);
-        buffer_json_node_add_v2(wb, host, ctl->nodes.ni++, 0);
-
-        if(ctl->options & CONTEXTS_V2_NODES_DETAILED) {
-            buffer_json_member_add_string(wb, "version", rrdhost_program_version(host));
-            buffer_json_member_add_uint64(wb, "hops", host->system_info ? host->system_info->hops : (host == localhost) ? 0 : 1);
-            buffer_json_member_add_string(wb, "state", (host == localhost || !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN)) ? "reachable" : "stale");
-            buffer_json_member_add_boolean(wb, "isDeleted", false);
-
-            buffer_json_member_add_array(wb, "services");
-            buffer_json_array_close(wb);
-
-            buffer_json_member_add_array(wb, "nodeInstanceCapabilities");
-
-            struct capability *capas = aclk_get_node_instance_capas(host);
-            struct capability *capa = capas;
-            while(capa->name != NULL) {
-                buffer_json_add_array_item_object(wb);
-                buffer_json_member_add_string(wb, "name", capa->name);
-                buffer_json_member_add_uint64(wb, "version", capa->version);
-                buffer_json_member_add_boolean(wb, "enabled", capa->enabled);
-                buffer_json_object_close(wb);
-                capa++;
-            }
-            buffer_json_array_close(wb);
-            freez(capas);
-
-            web_client_api_request_v1_info_summary_alarm_statuses(host, wb, "alarmCounters");
-
-            host_labels2json(host, wb, "hostLabels");
-
-            buffer_json_member_add_object(wb, "mlInfo");
-            buffer_json_member_add_boolean(wb, "mlCapable", ml_capable(host));
-            buffer_json_member_add_boolean(wb, "mlEnabled", ml_enabled(host));
-            buffer_json_object_close(wb);
-
-            if(host->system_info) {
-                buffer_json_member_add_string_or_empty(wb, "architecture", host->system_info->architecture);
-                buffer_json_member_add_string_or_empty(wb, "kernelName", host->system_info->kernel_name);
-                buffer_json_member_add_string_or_empty(wb, "kernelVersion", host->system_info->kernel_version);
-                buffer_json_member_add_string_or_empty(wb, "cpuFrequency", host->system_info->host_cpu_freq);
-                buffer_json_member_add_string_or_empty(wb, "cpus", host->system_info->host_cores);
-                buffer_json_member_add_string_or_empty(wb, "memory", host->system_info->host_ram_total);
-                buffer_json_member_add_string_or_empty(wb, "diskSpace", host->system_info->host_disk_space);
-                buffer_json_member_add_string_or_empty(wb, "container", host->system_info->container);
-                buffer_json_member_add_string_or_empty(wb, "virtualization", host->system_info->virtualization);
-                buffer_json_member_add_string_or_empty(wb, "os", host->system_info->host_os_id);
-                buffer_json_member_add_string_or_empty(wb, "osName", host->system_info->host_os_name);
-                buffer_json_member_add_string_or_empty(wb, "osVersion", host->system_info->host_os_version);
-            }
-
-            time_t now = now_realtime_sec();
-            buffer_json_member_add_object(wb, "status");
-            {
-                rrdhost_receiver_to_json(wb, host, "collection", now);
-                rrdhost_sender_to_json(wb, host, "streaming", now);
-            }
-            buffer_json_object_close(wb); // status
-        }
-
-        buffer_json_object_close(wb);
-    }
-
-    return host_matched ? 1 : 0;
-}
+    if(!host_matched)
+        return 0;
+
+    if(ctl->mode & CONTEXTS_V2_FUNCTIONS) {
+        struct function_v2_entry t = {
+            .used = 1,
+            .size = 1,
+            .node_ids = &ctl->nodes.ni,
+            .help = NULL,
+        };
+        host_functions_to_dict(host, ctl->functions.dict, &t, sizeof(t), &t.help);
+    }
+
+    if(ctl->mode & CONTEXTS_V2_NODES) {
+        struct contexts_v2_node t = {
+            .ni = ctl->nodes.ni++,
+            .host = host,
+        };
+
+        dictionary_set(ctl->nodes.dict, host->machine_guid, &t, sizeof(struct contexts_v2_node));
+    }
+
+    return 1;
+}
 
-static void buffer_json_contexts_v2_options_to_array(BUFFER *wb, CONTEXTS_V2_OPTIONS options) {
-    if(options & CONTEXTS_V2_DEBUG)
-        buffer_json_add_array_item_string(wb, "debug");
+static void buffer_json_contexts_v2_mode_to_array(BUFFER *wb, const char *key, CONTEXTS_V2_MODE mode) {
+    buffer_json_member_add_array(wb, key);
+
+    if(mode & CONTEXTS_V2_VERSIONS)
+        buffer_json_add_array_item_string(wb, "versions");
+
+    if(mode & CONTEXTS_V2_AGENTS)
+        buffer_json_add_array_item_string(wb, "agents");
+
+    if(mode & CONTEXTS_V2_AGENTS_INFO)
+        buffer_json_add_array_item_string(wb, "agents-info");
 
-    if(options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED))
-        buffer_json_add_array_item_string(wb, "nodes");
+    if(mode & CONTEXTS_V2_NODES)
+        buffer_json_add_array_item_string(wb, "nodes");
+
+    if(mode & CONTEXTS_V2_NODES_INFO)
+        buffer_json_add_array_item_string(wb, "nodes-info");
+
+    if(mode & CONTEXTS_V2_NODE_INSTANCES)
+        buffer_json_add_array_item_string(wb, "nodes-instances");
 
-    if(options & CONTEXTS_V2_CONTEXTS)
+    if(mode & CONTEXTS_V2_CONTEXTS)
         buffer_json_add_array_item_string(wb, "contexts");
 
-    if(options & CONTEXTS_V2_SEARCH)
+    if(mode & CONTEXTS_V2_SEARCH)
         buffer_json_add_array_item_string(wb, "search");
+
+    if(mode & CONTEXTS_V2_ALERTS)
+        buffer_json_add_array_item_string(wb, "alerts");
+
+    if(mode & CONTEXTS_V2_ALERT_TRANSITIONS)
+        buffer_json_add_array_item_string(wb, "alert_transitions");
+
+    buffer_json_array_close(wb);
+}
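The v2 "mode" is a bitmask, so a single request can combine several modes and this helper echoes back which ones were active. A hedged illustration of how such flags are typically declared; the actual values live in the corresponding header, which this diff does not show, so these bit positions are hypothetical:

    typedef enum {
        CONTEXTS_V2_VERSIONS          = (1 << 0),
        CONTEXTS_V2_AGENTS            = (1 << 1),
        CONTEXTS_V2_AGENTS_INFO       = (1 << 2),
        CONTEXTS_V2_NODES             = (1 << 3),
        CONTEXTS_V2_NODES_INFO        = (1 << 4),
        CONTEXTS_V2_NODE_INSTANCES    = (1 << 5),
        CONTEXTS_V2_CONTEXTS          = (1 << 6),
        CONTEXTS_V2_SEARCH            = (1 << 7),
        CONTEXTS_V2_ALERTS            = (1 << 8),
        CONTEXTS_V2_ALERT_TRANSITIONS = (1 << 9),
    } CONTEXTS_V2_MODE;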
 
 void buffer_json_query_timings(BUFFER *wb, const char *key, struct query_timings *timings) {
@@ -396,26 +976,90 @@ void buffer_json_query_timings(BUFFER *wb, const char *key, struct query_timings
     buffer_json_object_close(wb);
 }
 
-void buffer_json_agents_array_v2(BUFFER *wb, struct query_timings *timings, time_t now_s) {
+void build_info_to_json_object(BUFFER *b);
+
+void buffer_json_agents_v2(BUFFER *wb, struct query_timings *timings, time_t now_s, bool info, bool array) {
     if(!now_s)
         now_s = now_realtime_sec();
 
-    buffer_json_member_add_array(wb, "agents");
-    buffer_json_add_array_item_object(wb);
+    if(array) {
+        buffer_json_member_add_array(wb, "agents");
+        buffer_json_add_array_item_object(wb);
+    }
+    else
+        buffer_json_member_add_object(wb, "agent");
+
     buffer_json_member_add_string(wb, "mg", localhost->machine_guid);
     buffer_json_member_add_uuid(wb, "nd", localhost->node_id);
     buffer_json_member_add_string(wb, "nm", rrdhost_hostname(localhost));
     buffer_json_member_add_time_t(wb, "now", now_s);
-    buffer_json_member_add_uint64(wb, "ai", 0);
+
+    if(array)
+        buffer_json_member_add_uint64(wb, "ai", 0);
+
+    if(info) {
+        buffer_json_member_add_object(wb, "application");
+        build_info_to_json_object(wb);
+        buffer_json_object_close(wb); // netdata
+
+        buffer_json_cloud_status(wb, now_s);
+
+        buffer_json_member_add_array(wb, "db_size");
+        for (size_t tier = 0; tier < storage_tiers; tier++) {
+            STORAGE_ENGINE *eng = localhost->db[tier].eng;
+            if (!eng) continue;
+
+            size_t max = storage_engine_disk_space_max(eng->backend, localhost->db[tier].instance);
+            size_t used = storage_engine_disk_space_used(eng->backend, localhost->db[tier].instance);
+            time_t first_time_s = storage_engine_global_first_time_s(eng->backend, localhost->db[tier].instance);
+            size_t currently_collected_metrics = storage_engine_collected_metrics(eng->backend, localhost->db[tier].instance);
+
+            NETDATA_DOUBLE percent;
+            if (used && max)
+                percent = (NETDATA_DOUBLE) used * 100.0 / (NETDATA_DOUBLE) max;
+            else
+                percent = 0.0;
+
+            buffer_json_add_array_item_object(wb);
+            buffer_json_member_add_uint64(wb, "tier", tier);
+
+            if(used || max) {
+                buffer_json_member_add_uint64(wb, "disk_used", used);
+                buffer_json_member_add_uint64(wb, "disk_max", max);
+                buffer_json_member_add_double(wb, "disk_percent", percent);
+            }
+
+            if(first_time_s) {
+                buffer_json_member_add_time_t(wb, "from", first_time_s);
+                buffer_json_member_add_time_t(wb, "to", now_s);
+                buffer_json_member_add_time_t(wb, "retention", now_s - first_time_s);
+
+                if(used || max) // we have disk space information
+                    buffer_json_member_add_time_t(wb, "expected_retention",
+                                                  (time_t) ((NETDATA_DOUBLE) (now_s - first_time_s) * 100.0 / percent));
+            }
+
+            if(currently_collected_metrics)
+                buffer_json_member_add_uint64(wb, "currently_collected_metrics", currently_collected_metrics);
+
+            buffer_json_object_close(wb);
+        }
+        buffer_json_array_close(wb); // db_size
+    }
 
     if(timings)
         buffer_json_query_timings(wb, "timings", timings);
 
     buffer_json_object_close(wb);
-    buffer_json_array_close(wb);
+
+    if(array)
+        buffer_json_array_close(wb);
 }
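The expected_retention member extrapolates linearly: if the tier currently covers (now_s - first_time_s) seconds using disk_percent of its space, filling 100% should cover that span scaled by 100/percent. A worked check of the formula with illustrative numbers:

    // 10 days of data stored so far, tier disk 25% full
    time_t now_s = 1700000000;
    time_t first_time_s = now_s - 10 * 86400;
    NETDATA_DOUBLE percent = 25.0;
    time_t expected = (time_t)((NETDATA_DOUBLE)(now_s - first_time_s) * 100.0 / percent);
    // expected == 40 * 86400, i.e. about 40 days of retention at the current ingest rate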
 
 void buffer_json_cloud_timings(BUFFER *wb, const char *key, struct query_timings *timings) {
+    if(!timings->finished_ut)
+        timings->finished_ut = now_monotonic_usec();
+
     buffer_json_member_add_object(wb, key);
     buffer_json_member_add_double(wb, "routing_ms", 0.0);
     buffer_json_member_add_double(wb, "node_max_ms", 0.0);
@@ -423,70 +1067,939 @@ void buffer_json_cloud_timings(BUFFER *wb, const char *key, struct query_timings
     buffer_json_object_close(wb);
 }
 
-void contexts_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
-    struct rrdcontext_to_json_v2_entry *z = value;
+static void functions_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
+    struct function_v2_entry *t = value;
+
+    // it is initialized with a static reference - we need to mallocz() the array
+    size_t *v = t->node_ids;
+    t->node_ids = mallocz(sizeof(size_t));
+    *t->node_ids = *v;
+    t->size = 1;
+    t->used = 1;
+}
+
+static bool functions_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) {
+    struct function_v2_entry *t = old_value, *n = new_value;
+    size_t *v = n->node_ids;
+
+    if(t->used >= t->size) {
+        t->node_ids = reallocz(t->node_ids, t->size * 2 * sizeof(size_t));
+        t->size *= 2;
+    }
+
+    t->node_ids[t->used++] = *v;
+
+    return true;
+}
+
+static void functions_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
+    struct function_v2_entry *t = value;
+    freez(t->node_ids);
+}
+
+static bool contexts_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) {
+    struct context_v2_entry *o = old_value;
+    struct context_v2_entry *n = new_value;
+
+    o->count++;
+
+    if(o->family != n->family) {
+        STRING *m = string_2way_merge(o->family, n->family);
+        string_freez(o->family);
+        o->family = m;
+    }
+
+    if(o->priority != n->priority) {
+        if((o->flags & RRD_FLAG_COLLECTED) && !(n->flags & RRD_FLAG_COLLECTED))
+            // keep o
+            ;
+        else if(!(o->flags & RRD_FLAG_COLLECTED) && (n->flags & RRD_FLAG_COLLECTED))
+            // keep n
+            o->priority = n->priority;
+        else
+            // keep the min
+            o->priority = MIN(o->priority, n->priority);
+    }
+
+    if(o->first_time_s && n->first_time_s)
+        o->first_time_s = MIN(o->first_time_s, n->first_time_s);
+    else if(!o->first_time_s)
+        o->first_time_s = n->first_time_s;
+
+    if(o->last_time_s && n->last_time_s)
+        o->last_time_s = MAX(o->last_time_s, n->last_time_s);
+    else if(!o->last_time_s)
+        o->last_time_s = n->last_time_s;
+
+    o->flags |= n->flags;
+    o->match = MIN(o->match, n->match);
+
+    string_freez(n->family);
+
+    return true;
+}
+
+static void contexts_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
+    struct context_v2_entry *z = value;
     string_freez(z->family);
 }
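functions_conflict_callback grows node_ids geometrically (doubling the capacity), so appending one node id per host is amortized O(1). A minimal standalone sketch of the same pattern with plain realloc, since mallocz/reallocz are netdata's aborting wrappers:

    #include <stdlib.h>

    // hedged sketch: amortized O(1) append via capacity doubling
    typedef struct { size_t *items, used, size; } vec_t;

    static void vec_push(vec_t *v, size_t x) {
        if(v->used >= v->size) {
            v->size = v->size ? v->size * 2 : 1;
            v->items = realloc(v->items, v->size * sizeof(size_t));
            // note: plain realloc can fail; the netdata wrappers abort instead
        }
        v->items[v->used++] = x;
    }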
 
-int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTEXTS_V2_OPTIONS options) {
+static void rrdcontext_v2_set_transition_filter(const char *machine_guid, const char *context, time_t alarm_id, void *data) {
+    struct rrdcontext_to_json_v2_data *ctl = data;
+
+    if(machine_guid && *machine_guid) {
+        if(ctl->nodes.scope_pattern)
+            simple_pattern_free(ctl->nodes.scope_pattern);
+
+        if(ctl->nodes.pattern)
+            simple_pattern_free(ctl->nodes.pattern);
+
+        ctl->nodes.scope_pattern = string_to_simple_pattern(machine_guid);
+        ctl->nodes.pattern = NULL;
+    }
+
+    if(context && *context) {
+        if(ctl->contexts.scope_pattern)
+            simple_pattern_free(ctl->contexts.scope_pattern);
+
+        if(ctl->contexts.pattern)
+            simple_pattern_free(ctl->contexts.pattern);
+
+        ctl->contexts.scope_pattern = string_to_simple_pattern(context);
+        ctl->contexts.pattern = NULL;
+    }
+
+    ctl->alerts.alarm_id_filter = alarm_id;
+}
+
+struct alert_instances_callback_data {
+    BUFFER *wb;
+    struct rrdcontext_to_json_v2_data *ctl;
+    bool debug;
+};
+
+static void contexts_v2_alert_config_to_json_from_sql_alert_config_data(struct sql_alert_config_data *t, void *data) {
+    struct alert_transitions_callback_data *d = data;
+    BUFFER *wb = d->wb;
+    bool debug = d->debug;
+    d->configs_added++;
+
+    if(d->only_one_config)
+        buffer_json_add_array_item_object(wb); // alert config
+
+    {
+        buffer_json_member_add_string(wb, "name", t->name);
+        buffer_json_member_add_uuid(wb, "config_hash_id", t->config_hash_id);
+
+        buffer_json_member_add_object(wb, "selectors");
+        {
+            bool is_template = t->selectors.on_template && *t->selectors.on_template ? true : false;
+            buffer_json_member_add_string(wb, "type", is_template ? "template" : "alarm");
+            buffer_json_member_add_string(wb, "on", is_template ? t->selectors.on_template : t->selectors.on_key);
+
+            buffer_json_member_add_string(wb, "os", t->selectors.os);
+            buffer_json_member_add_string(wb, "hosts", t->selectors.hosts);
+            buffer_json_member_add_string(wb, "families", t->selectors.families);
+            buffer_json_member_add_string(wb, "plugin", t->selectors.plugin);
+            buffer_json_member_add_string(wb, "module", t->selectors.module);
+            buffer_json_member_add_string(wb, "host_labels", t->selectors.host_labels);
+            buffer_json_member_add_string(wb, "chart_labels", t->selectors.chart_labels);
+            buffer_json_member_add_string(wb, "charts", t->selectors.charts);
+        }
+        buffer_json_object_close(wb); // selectors
+
+        buffer_json_member_add_object(wb, "value"); // value
+        {
+            // buffer_json_member_add_string(wb, "every", t->value.every); // does not exist in Netdata Cloud
+            buffer_json_member_add_string(wb, "units", t->value.units);
+            buffer_json_member_add_uint64(wb, "update_every", t->value.update_every);
+
+            if (t->value.db.after || debug) {
+                buffer_json_member_add_object(wb, "db");
+                {
+                    // buffer_json_member_add_string(wb, "lookup", t->value.db.lookup); // does not exist in Netdata Cloud
+
+                    buffer_json_member_add_time_t(wb, "after", t->value.db.after);
+                    buffer_json_member_add_time_t(wb, "before", t->value.db.before);
+                    buffer_json_member_add_string(wb, "method", t->value.db.method);
+                    buffer_json_member_add_string(wb, "dimensions", t->value.db.dimensions);
+                    web_client_api_request_v1_data_options_to_buffer_json_array(wb, "options", (RRDR_OPTIONS) t->value.db.options);
+                }
+                buffer_json_object_close(wb); // db
+            }
+
+            if (t->value.calc || debug)
+                buffer_json_member_add_string(wb, "calc", t->value.calc);
+        }
+        buffer_json_object_close(wb); // value
+
+        if (t->status.warn || t->status.crit || debug) {
+            buffer_json_member_add_object(wb, "status"); // status
+            {
+                NETDATA_DOUBLE green = t->status.green ? str2ndd(t->status.green, NULL) : NAN;
+                NETDATA_DOUBLE red = t->status.red ? str2ndd(t->status.red, NULL) : NAN;
+
+                if (!isnan(green) || debug)
+                    buffer_json_member_add_double(wb, "green", green);
+
+                if (!isnan(red) || debug)
+                    buffer_json_member_add_double(wb, "red", red);
+
+                if (t->status.warn || debug)
+                    buffer_json_member_add_string(wb, "warn", t->status.warn);
+
+                if (t->status.crit || debug)
+                    buffer_json_member_add_string(wb, "crit", t->status.crit);
+            }
+            buffer_json_object_close(wb); // status
+        }
+
+        buffer_json_member_add_object(wb, "notification");
+        {
+            buffer_json_member_add_string(wb, "type", "agent");
+            buffer_json_member_add_string(wb, "exec", t->notification.exec ? t->notification.exec : NULL);
+            buffer_json_member_add_string(wb, "to", t->notification.to_key ? t->notification.to_key : string2str(localhost->health.health_default_recipient));
+            buffer_json_member_add_string(wb, "delay", t->notification.delay);
+            buffer_json_member_add_string(wb, "repeat", t->notification.repeat);
+            buffer_json_member_add_string(wb, "options", t->notification.options);
+        }
+        buffer_json_object_close(wb); // notification
+
+        buffer_json_member_add_string(wb, "class", t->classification);
+        buffer_json_member_add_string(wb, "component", t->component);
+        buffer_json_member_add_string(wb, "type", t->type);
+        buffer_json_member_add_string(wb, "info", t->info);
+        // buffer_json_member_add_string(wb, "source", t->source); // moved to alert instance
+    }
+
+    if(d->only_one_config)
+        buffer_json_object_close(wb);
+}
+
+int contexts_v2_alert_config_to_json(struct web_client *w, const char *config_hash_id) {
+    struct alert_transitions_callback_data data = {
+        .wb = w->response.data,
+        .debug = false,
+        .only_one_config = false,
+    };
+    DICTIONARY *configs = dictionary_create(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE);
+    dictionary_set(configs, config_hash_id, NULL, 0);
+
+    buffer_flush(w->response.data);
+
+    buffer_json_initialize(w->response.data, "\"", "\"", 0, true, false);
+
+    int added = sql_get_alert_configuration(configs, contexts_v2_alert_config_to_json_from_sql_alert_config_data, &data, false);
+    buffer_json_finalize(w->response.data);
+
+    int ret = HTTP_RESP_OK;
+
+    if(added <= 0) {
+        buffer_flush(w->response.data);
+        w->response.data->content_type = CT_TEXT_PLAIN;
+        if(added < 0) {
+            buffer_strcat(w->response.data, "Failed to execute SQL query.");
+            ret = HTTP_RESP_INTERNAL_SERVER_ERROR;
+        }
+        else {
+            buffer_strcat(w->response.data, "Config is not found.");
+            ret = HTTP_RESP_NOT_FOUND;
+        }
+    }
+
+    return ret;
+}
+
+static int contexts_v2_alert_instance_to_json_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data) {
+    struct sql_alert_instance_v2_entry *t = value;
+    struct alert_instances_callback_data *d = data;
+    struct rrdcontext_to_json_v2_data *ctl = d->ctl; (void)ctl;
+    bool debug = d->debug; (void)debug;
+    BUFFER *wb = d->wb;
+
+    buffer_json_add_array_item_object(wb);
+    {
+        buffer_json_member_add_uint64(wb, "ni", t->ni);
+
+        buffer_json_member_add_string(wb, "nm", string2str(t->name));
+        buffer_json_member_add_string(wb, "ch", string2str(t->chart_id));
+        buffer_json_member_add_string(wb, "ch_n", string2str(t->chart_name));
+
+        if(ctl->request->options & CONTEXT_V2_OPTION_ALERTS_WITH_SUMMARY)
+            buffer_json_member_add_uint64(wb, "ati", t->ati);
+
+        if(ctl->request->options & CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES) {
+            buffer_json_member_add_string(wb, "units", string2str(t->units));
+            buffer_json_member_add_string(wb, "fami", string2str(t->family));
+            buffer_json_member_add_string(wb, "info", string2str(t->info));
+            buffer_json_member_add_string(wb, "ctx", string2str(t->context));
+            buffer_json_member_add_string(wb, "st", rrdcalc_status2string(t->status));
+            buffer_json_member_add_uuid(wb, "tr_i", &t->last_transition_id);
+            buffer_json_member_add_double(wb, "tr_v", t->last_status_change_value);
+            buffer_json_member_add_time_t(wb, "tr_t", t->last_status_change);
+            buffer_json_member_add_uuid(wb, "cfg", &t->config_hash_id);
+            buffer_json_member_add_string(wb, "src", string2str(t->source));
+
+            buffer_json_member_add_string(wb, "to", string2str(t->recipient));
+            buffer_json_member_add_string(wb, "tp", string2str(t->type));
+            buffer_json_member_add_string(wb, "cm", string2str(t->component));
+            buffer_json_member_add_string(wb, "cl", string2str(t->classification));
+
+            // Agent specific fields
+            buffer_json_member_add_uint64(wb, "gi", t->global_id);
+            // rrdcalc_flags_to_json_array(wb, "flags", t->flags);
+        }
+
+        if(ctl->request->options & CONTEXT_V2_OPTION_ALERTS_WITH_VALUES) {
+            // Netdata Cloud fetched these by querying the agents
+            buffer_json_member_add_double(wb, "v", t->value);
+            buffer_json_member_add_time_t(wb, "t", t->last_updated);
+        }
+    }
+    buffer_json_object_close(wb); // alert instance
+
+    return 1;
+}
+
+static void contexts_v2_alert_instances_to_json(BUFFER *wb, const char *key, struct rrdcontext_to_json_v2_data *ctl, bool debug) {
+    buffer_json_member_add_array(wb, key);
+    {
+        struct alert_instances_callback_data data = {
+            .wb = wb,
+            .ctl = ctl,
+            .debug = debug,
+        };
+        dictionary_walkthrough_rw(ctl->alerts.alert_instances, DICTIONARY_LOCK_READ,
+                                  contexts_v2_alert_instance_to_json_callback, &data);
+    }
+    buffer_json_array_close(wb); // alerts_instances
+}
+
+static void contexts_v2_alerts_to_json(BUFFER *wb, struct rrdcontext_to_json_v2_data *ctl, bool debug) {
+    if(ctl->request->options & CONTEXT_V2_OPTION_ALERTS_WITH_SUMMARY) {
+        buffer_json_member_add_array(wb, "alerts");
+        {
+            struct alert_v2_entry *t;
+            dfe_start_read(ctl->alerts.alerts, t) {
+                buffer_json_add_array_item_object(wb);
+                {
+                    buffer_json_member_add_uint64(wb, "ati", t->ati);
+                    buffer_json_member_add_string(wb, "nm", string2str(t->name));
+
+                    buffer_json_member_add_uint64(wb, "cr", t->critical);
+                    buffer_json_member_add_uint64(wb, "wr", t->warning);
+                    buffer_json_member_add_uint64(wb, "cl", t->clear);
+                    buffer_json_member_add_uint64(wb, "er", t->error);
+
+                    buffer_json_member_add_uint64(wb, "in", t->instances);
+                    buffer_json_member_add_uint64(wb, "nd", dictionary_entries(t->nodes));
+                    buffer_json_member_add_uint64(wb, "cfg", dictionary_entries(t->configs));
+                }
+                buffer_json_object_close(wb); // alert name
+            }
+            dfe_done(t);
+        }
+        buffer_json_array_close(wb); // alerts
+    }
+
+    if(ctl->request->options & (CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES|CONTEXT_V2_OPTION_ALERTS_WITH_VALUES)) {
+        contexts_v2_alert_instances_to_json(wb, "alert_instances", ctl, debug);
+    }
+}
+
+#define SQL_TRANSITION_DATA_SMALL_STRING  (6 * 8)
+#define SQL_TRANSITION_DATA_MEDIUM_STRING (12 * 8)
+#define SQL_TRANSITION_DATA_BIG_STRING    512
+
+struct sql_alert_transition_fixed_size {
+    usec_t global_id;
+    uuid_t transition_id;
+    uuid_t host_id;
+    uuid_t config_hash_id;
+    uint32_t alarm_id;
+    char alert_name[SQL_TRANSITION_DATA_SMALL_STRING];
+    char chart[RRD_ID_LENGTH_MAX];
+    char chart_name[RRD_ID_LENGTH_MAX];
+    char chart_context[SQL_TRANSITION_DATA_MEDIUM_STRING];
+    char family[SQL_TRANSITION_DATA_SMALL_STRING];
+    char recipient[SQL_TRANSITION_DATA_MEDIUM_STRING];
+    char units[SQL_TRANSITION_DATA_SMALL_STRING];
+    char exec[SQL_TRANSITION_DATA_BIG_STRING];
+    char info[SQL_TRANSITION_DATA_BIG_STRING];
+    char classification[SQL_TRANSITION_DATA_SMALL_STRING];
+    char type[SQL_TRANSITION_DATA_SMALL_STRING];
+    char component[SQL_TRANSITION_DATA_SMALL_STRING];
+    time_t when_key;
+    time_t duration;
+    time_t non_clear_duration;
+    uint64_t flags;
+    time_t delay_up_to_timestamp;
+    time_t exec_run_timestamp;
+    int exec_code;
+    int new_status;
+    int old_status;
+    int delay;
+    time_t last_repeat;
+    NETDATA_DOUBLE new_value;
+    NETDATA_DOUBLE old_value;
+
+    char machine_guid[UUID_STR_LEN];
+    struct sql_alert_transition_fixed_size *next;
+    struct sql_alert_transition_fixed_size *prev;
+};
+
+static struct sql_alert_transition_fixed_size *contexts_v2_alert_transition_dup(struct sql_alert_transition_data *t, const char *machine_guid, struct sql_alert_transition_fixed_size *dst) {
+    struct sql_alert_transition_fixed_size *n = dst ? dst : mallocz(sizeof(*n));
+
+    n->global_id = t->global_id;
+    uuid_copy(n->transition_id, *t->transition_id);
+    uuid_copy(n->host_id, *t->host_id);
+    uuid_copy(n->config_hash_id, *t->config_hash_id);
+    n->alarm_id = t->alarm_id;
+    strncpyz(n->alert_name, t->alert_name ? t->alert_name : "", sizeof(n->alert_name) - 1);
+    strncpyz(n->chart, t->chart ? t->chart : "", sizeof(n->chart) - 1);
+    strncpyz(n->chart_name, t->chart_name ? t->chart_name : n->chart, sizeof(n->chart_name) - 1);
+    strncpyz(n->chart_context, t->chart_context ? t->chart_context : "", sizeof(n->chart_context) - 1);
+    strncpyz(n->family, t->family ? t->family : "", sizeof(n->family) - 1);
+    strncpyz(n->recipient, t->recipient ? t->recipient : "", sizeof(n->recipient) - 1);
+    strncpyz(n->units, t->units ? t->units : "", sizeof(n->units) - 1);
+    strncpyz(n->exec, t->exec ? t->exec : "", sizeof(n->exec) - 1);
+    strncpyz(n->info, t->info ? t->info : "", sizeof(n->info) - 1);
+    strncpyz(n->classification, t->classification ? t->classification : "", sizeof(n->classification) - 1);
+    strncpyz(n->type, t->type ? t->type : "", sizeof(n->type) - 1);
+    strncpyz(n->component, t->component ? t->component : "", sizeof(n->component) - 1);
+    n->when_key = t->when_key;
+    n->duration = t->duration;
+    n->non_clear_duration = t->non_clear_duration;
+    n->flags = t->flags;
+    n->delay_up_to_timestamp = t->delay_up_to_timestamp;
+    n->exec_run_timestamp = t->exec_run_timestamp;
+    n->exec_code = t->exec_code;
+    n->new_status = t->new_status;
+    n->old_status = t->old_status;
+    n->delay = t->delay;
+    n->last_repeat = t->last_repeat;
+    n->new_value = t->new_value;
+    n->old_value = t->old_value;
+
+    memcpy(n->machine_guid, machine_guid, sizeof(n->machine_guid));
+    n->next = n->prev = NULL;
+
+    return n;
+}
+
+static void contexts_v2_alert_transition_free(struct sql_alert_transition_fixed_size *t) {
+    freez(t);
+}
+
+static inline void contexts_v2_alert_transition_keep(struct alert_transitions_callback_data *d, struct sql_alert_transition_data *t, const char *machine_guid) {
+    d->items_matched++;
+
+    if(unlikely(t->global_id <= d->ctl->request->alerts.global_id_anchor)) {
+        // this is in our past, we are not interested
+        d->operations.skips_before++;
+        return;
+    }
+
+    if(unlikely(!d->base)) {
+        d->last_added = contexts_v2_alert_transition_dup(t, machine_guid, NULL);
+        DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(d->base, d->last_added, prev, next);
+        d->items_to_return++;
+        d->operations.first++;
+        return;
+    }
+
+    struct sql_alert_transition_fixed_size *last = d->last_added;
+    while(last->prev != d->base->prev && t->global_id > last->prev->global_id) {
+        last = last->prev;
+        d->operations.backwards++;
+    }
+
+    while(last->next && t->global_id < last->next->global_id) {
+        last = last->next;
+        d->operations.forwards++;
+    }
+
+    if(d->items_to_return >= d->max_items_to_return) {
+        if(last == d->base->prev && t->global_id < last->global_id) {
+            d->operations.skips_after++;
+            return;
+        }
+    }
+
+    d->items_to_return++;
+
+    if(t->global_id > last->global_id) {
+        if(d->items_to_return > d->max_items_to_return) {
+            d->items_to_return--;
+            d->operations.shifts++;
+            d->last_added = d->base->prev;
+            DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(d->base, d->last_added, prev, next);
+            d->last_added = contexts_v2_alert_transition_dup(t, machine_guid, d->last_added);
+        }
+        DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(d->base, d->last_added, prev, next);
+        d->operations.prepend++;
+    }
+    else {
+        d->last_added = contexts_v2_alert_transition_dup(t, machine_guid, NULL);
+        DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(d->base, d->last_added, prev, next);
+        d->operations.append++;
+    }
+
+    while(d->items_to_return > d->max_items_to_return) {
+        // we have to remove something
+
+        struct sql_alert_transition_fixed_size *tmp = d->base->prev;
+        DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(d->base, tmp, prev, next);
+        d->items_to_return--;
+
+        if(unlikely(d->last_added == tmp))
+            d->last_added = d->base;
+
+        contexts_v2_alert_transition_free(tmp);
+
+        d->operations.shifts++;
+    }
+}
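contexts_v2_alert_transition_keep maintains at most max_items_to_return transitions sorted by global_id descending, resuming the scan from the previously inserted node (last_added) so that mostly-sorted input is placed in near-constant time, and evicting from the tail when over the limit. A reduced sketch of the same keep-top-N idea, using a plain array instead of the doubly linked list for clarity:

    #include <stdint.h>
    #include <string.h>

    // keep the N largest global_ids, sorted descending, in a small array
    static void keep_top_n(uint64_t *top, size_t *used, size_t max, uint64_t id) {
        size_t i = *used;
        while(i > 0 && top[i - 1] < id) i--;          // find insert position (descending order)
        if(i == max) return;                           // smaller than everything we already keep
        size_t move = (*used < max ? *used : max - 1) - i;
        memmove(&top[i + 1], &top[i], move * sizeof(uint64_t)); // shift smaller ids down, dropping the tail if full
        top[i] = id;
        if(*used < max) (*used)++;
    }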
+
+static void contexts_v2_alert_transition_callback(struct sql_alert_transition_data *t, void *data) {
+    struct alert_transitions_callback_data *d = data;
+    d->items_evaluated++;
+
+    char machine_guid[UUID_STR_LEN] = "";
+    uuid_unparse_lower(*t->host_id, machine_guid);
+
+    const char *facets[ATF_TOTAL_ENTRIES] = {
+        [ATF_STATUS] = rrdcalc_status2string(t->new_status),
+        [ATF_CLASS] = t->classification,
+        [ATF_TYPE] = t->type,
+        [ATF_COMPONENT] = t->component,
+        [ATF_ROLE] = t->recipient && *t->recipient ? t->recipient : string2str(localhost->health.health_default_recipient),
+        [ATF_NODE] = machine_guid,
+        [ATF_ALERT_NAME] = t->alert_name,
+        [ATF_CHART_NAME] = t->chart_name,
+        [ATF_CONTEXT] = t->chart_context,
+    };
+
+    for(size_t i = 0; i < ATF_TOTAL_ENTRIES ;i++) {
+        if (!facets[i] || !*facets[i]) facets[i] = "unknown";
+
+        struct facet_entry tmp = {
+            .count = 0,
+        };
+        dictionary_set(d->facets[i].dict, facets[i], &tmp, sizeof(tmp));
+    }
+
+    bool selected[ATF_TOTAL_ENTRIES] = { 0 };
+
+    uint32_t selected_by = 0;
+    for(size_t i = 0; i < ATF_TOTAL_ENTRIES ;i++) {
+        selected[i] = !d->facets[i].pattern || simple_pattern_matches(d->facets[i].pattern, facets[i]);
+        if(selected[i])
+            selected_by++;
+    }
+
+    if(selected_by == ATF_TOTAL_ENTRIES) {
+        // this item is selected by all facets
+        // put it in our result (if it fits)
+        contexts_v2_alert_transition_keep(d, t, machine_guid);
+    }
+
+    if(selected_by >= ATF_TOTAL_ENTRIES - 1) {
+        // this item is selected by all, or all except one facet
+        // in both cases we need to add it to our counters
+
+        for (size_t i = 0; i < ATF_TOTAL_ENTRIES; i++) {
+            uint32_t counted_by = selected_by;
+
+            if (counted_by != ATF_TOTAL_ENTRIES) {
+                counted_by = 0;
+                for (size_t j = 0; j < ATF_TOTAL_ENTRIES; j++) {
+                    if (i == j || selected[j])
+                        counted_by++;
+                }
+            }
+
+            if (counted_by == ATF_TOTAL_ENTRIES) {
+                // we need to count it on this facet
+                struct facet_entry *x = dictionary_get(d->facets[i].dict, facets[i]);
+                internal_fatal(!x, "facet is not found");
+                if(x)
+                    x->count++;
+            }
+        }
+    }
+}
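The counting rule above is the standard faceted-search behavior: an item counts toward facet i's options when it is selected by every facet except possibly i itself, so each facet's counts reflect the query as if that facet's own filter were removed. Restated as a standalone predicate:

    #include <stdbool.h>
    #include <stddef.h>

    // item counts toward facet i  <=>  for all j != i, selected[j] is true
    static bool counts_on_facet(const bool *selected, size_t n, size_t i) {
        for(size_t j = 0; j < n; j++)
            if(j != i && !selected[j])
                return false;
        return true;
    }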
+            ctl->request->contexts,
+            ctl->request->alerts.alert,
+            ctl->request->alerts.transition,
+            contexts_v2_alert_transition_callback,
+            &data,
+            debug);
+
+    buffer_json_member_add_array(wb, "facets");
+    for (size_t i = 0; i < ATF_TOTAL_ENTRIES; i++) {
+        buffer_json_add_array_item_object(wb);
+        {
+            buffer_json_member_add_string(wb, "id", alert_transition_facets[i].id);
+            buffer_json_member_add_string(wb, "name", alert_transition_facets[i].name);
+            buffer_json_member_add_uint64(wb, "order", alert_transition_facets[i].order);
+            buffer_json_member_add_array(wb, "options");
+            {
+                struct facet_entry *x;
+                dfe_start_read(data.facets[i].dict, x) {
+                    buffer_json_add_array_item_object(wb);
+                    {
+                        buffer_json_member_add_string(wb, "id", x_dfe.name);
+                        if (i == ATF_NODE) {
+                            RRDHOST *host = rrdhost_find_by_guid(x_dfe.name);
+                            if (host)
+                                buffer_json_member_add_string(wb, "name", rrdhost_hostname(host));
+                            else
+                                buffer_json_member_add_string(wb, "name", x_dfe.name);
+                        } else
+                            buffer_json_member_add_string(wb, "name", x_dfe.name);
+                        buffer_json_member_add_uint64(wb, "count", x->count);
+                    }
+                    buffer_json_object_close(wb);
+                }
+                dfe_done(x);
+            }
+            buffer_json_array_close(wb); // options
+        }
+        buffer_json_object_close(wb); // facet
+    }
+    buffer_json_array_close(wb); // facets
+
+    buffer_json_member_add_array(wb, "transitions");
+    for(struct sql_alert_transition_fixed_size *t = data.base; t ; t = t->next) {
+        buffer_json_add_array_item_object(wb);
+        {
+            RRDHOST *host = rrdhost_find_by_guid(t->machine_guid);
+
+            buffer_json_member_add_uint64(wb, "gi", t->global_id);
+            buffer_json_member_add_uuid(wb, "transition_id", &t->transition_id);
+            buffer_json_member_add_uuid(wb, "config_hash_id", &t->config_hash_id);
+            buffer_json_member_add_string(wb, "machine_guid", t->machine_guid);
+
+            if(host) {
+                buffer_json_member_add_string(wb, "hostname", rrdhost_hostname(host));
+
+                if(host->node_id)
+                    buffer_json_member_add_uuid(wb, "node_id", host->node_id);
+            }
+
+            buffer_json_member_add_string(wb, "alert", *t->alert_name ? t->alert_name : NULL);
+            buffer_json_member_add_string(wb, "instance", *t->chart ? t->chart : NULL);
+            buffer_json_member_add_string(wb, "instance_n", *t->chart_name ? t->chart_name : NULL);
+            buffer_json_member_add_string(wb, "context", *t->chart_context ? t->chart_context : NULL);
+            // buffer_json_member_add_string(wb, "family", *t->family ? t->family : NULL);
+            buffer_json_member_add_string(wb, "component", *t->component ? t->component : NULL);
+            buffer_json_member_add_string(wb, "classification", *t->classification ? t->classification : NULL);
+            buffer_json_member_add_string(wb, "type", *t->type ? t->type : NULL);
+
+            buffer_json_member_add_time_t(wb, "when", t->when_key);
+            buffer_json_member_add_string(wb, "info", *t->info ? t->info : "");
+            buffer_json_member_add_string(wb, "units", *t->units ? t->units : NULL);
+            buffer_json_member_add_object(wb, "new");
+            {
+                buffer_json_member_add_string(wb, "status", rrdcalc_status2string(t->new_status));
+                buffer_json_member_add_double(wb, "value", t->new_value);
+            }
+            buffer_json_object_close(wb); // new
+            buffer_json_member_add_object(wb, "old");
+            {
+                buffer_json_member_add_string(wb, "status", rrdcalc_status2string(t->old_status));
+                buffer_json_member_add_double(wb, "value", t->old_value);
+                buffer_json_member_add_time_t(wb, "duration", t->duration);
+                buffer_json_member_add_time_t(wb, "raised_duration", t->non_clear_duration);
+            }
+            buffer_json_object_close(wb); // old
+
+            buffer_json_member_add_object(wb, "notification");
+            {
+                buffer_json_member_add_time_t(wb, "when", t->exec_run_timestamp);
+                buffer_json_member_add_time_t(wb, "delay", t->delay);
+                buffer_json_member_add_time_t(wb, "delay_up_to_time", t->delay_up_to_timestamp);
+                health_entry_flags_to_json_array(wb, "flags", t->flags);
+                buffer_json_member_add_string(wb, "exec", *t->exec ? t->exec : string2str(localhost->health.health_default_exec));
+                buffer_json_member_add_uint64(wb, "exec_code", t->exec_code);
+                buffer_json_member_add_string(wb, "to", *t->recipient ? t->recipient : string2str(localhost->health.health_default_recipient));
+            }
+            buffer_json_object_close(wb); // notification
+        }
+        buffer_json_object_close(wb); // a transition
+    }
+    buffer_json_array_close(wb); // all transitions
+
+    if(ctl->options & CONTEXT_V2_OPTION_ALERTS_WITH_CONFIGURATIONS) {
+        DICTIONARY *configs = dictionary_create(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE);
+
+        for(struct sql_alert_transition_fixed_size *t = data.base; t ; t = t->next) {
+            char guid[UUID_STR_LEN];
+            uuid_unparse_lower(t->config_hash_id, guid);
+            dictionary_set(configs, guid, NULL, 0);
+        }
+
+        buffer_json_member_add_array(wb, "configurations");
+        sql_get_alert_configuration(configs, contexts_v2_alert_config_to_json_from_sql_alert_config_data, &data, debug);
+        buffer_json_array_close(wb);
+
+        dictionary_destroy(configs);
+    }
+
+    while(data.base) {
+        struct sql_alert_transition_fixed_size *t = data.base;
+        DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(data.base, t, prev, next);
+        contexts_v2_alert_transition_free(t);
+    }
+
+    for(size_t i = 0; i < ATF_TOTAL_ENTRIES ;i++) {
+        dictionary_destroy(data.facets[i].dict);
+        simple_pattern_free(data.facets[i].pattern);
+    }
+
+    buffer_json_member_add_object(wb, "items");
+    {
+        // all the items in the window, under the scope_nodes, ignoring the facets (filters)
+        buffer_json_member_add_uint64(wb, "evaluated", data.items_evaluated);
+
+        // all the items matching the query (if you didn't put anchor_gi and last, these are all the items you would get back)
+        buffer_json_member_add_uint64(wb, "matched", data.items_matched);
+
+        // the items included in this response
+        buffer_json_member_add_uint64(wb, "returned", data.items_to_return);
+
+        // same as last=X parameter
+        buffer_json_member_add_uint64(wb, "max_to_return", data.max_items_to_return);
+
+        // items before the first returned, this should be 0 if anchor_gi is not set
+        buffer_json_member_add_uint64(wb, "before", data.operations.skips_before);
+
+        // items after the last returned, when this is zero there aren't any items after the current list
+        buffer_json_member_add_uint64(wb, "after", data.operations.skips_after + data.operations.shifts);
+    }
+    buffer_json_object_close(wb); // items
+
+    if(debug) {
+        buffer_json_member_add_object(wb, "stats");
+        {
+            buffer_json_member_add_uint64(wb, "first", data.operations.first);
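// Illustrative sketch, not part of this diff: the facet accounting in
// contexts_v2_alert_transition_callback above follows the usual faceted-search
// rule, an item contributes to facet i's option counts when it matches every
// facet except possibly i itself, so each filter's counts show what the user
// would get if only that one filter were relaxed. A standalone version of just
// that counting rule, with a hypothetical fixed number of facets instead of
// the upstream dictionaries:
#include <stdbool.h>
#include <stdio.h>

#define FACETS 3

// selected[i] says whether the item matches facet i's filter
static void count_item(const bool selected[FACETS], unsigned counts[FACETS]) {
    unsigned selected_by = 0;
    for(int i = 0; i < FACETS; i++) if(selected[i]) selected_by++;

    // only items matching all facets, or all but one, can contribute
    if(selected_by < FACETS - 1) return;
    for(int i = 0; i < FACETS; i++) {
        unsigned counted_by = selected_by;
        if(counted_by != FACETS) {           // re-check, ignoring facet i itself
            counted_by = 0;
            for(int j = 0; j < FACETS; j++)
                if(i == j || selected[j]) counted_by++;
        }
        if(counted_by == FACETS) counts[i]++;
    }
}

int main(void) {
    unsigned counts[FACETS] = { 0 };
    bool full_match[FACETS] = { true, true, true };
    bool near_miss[FACETS]  = { true, false, true }; // fails only facet 1
    count_item(full_match, counts);
    count_item(near_miss, counts);
    printf("%u %u %u\n", counts[0], counts[1], counts[2]); // prints: 1 2 1
    return 0;
}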
buffer_json_member_add_uint64(wb, "prepend", data.operations.prepend); + buffer_json_member_add_uint64(wb, "append", data.operations.append); + buffer_json_member_add_uint64(wb, "backwards", data.operations.backwards); + buffer_json_member_add_uint64(wb, "forwards", data.operations.forwards); + buffer_json_member_add_uint64(wb, "shifts", data.operations.shifts); + buffer_json_member_add_uint64(wb, "skips_before", data.operations.skips_before); + buffer_json_member_add_uint64(wb, "skips_after", data.operations.skips_after); + } + buffer_json_object_close(wb); + } +} + +int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTEXTS_V2_MODE mode) { int resp = HTTP_RESP_OK; + bool run = true; + + if(mode & CONTEXTS_V2_SEARCH) + mode |= CONTEXTS_V2_CONTEXTS; + + if(mode & (CONTEXTS_V2_AGENTS_INFO)) + mode |= CONTEXTS_V2_AGENTS; - if(options & CONTEXTS_V2_SEARCH) - options |= CONTEXTS_V2_CONTEXTS; + if(mode & (CONTEXTS_V2_FUNCTIONS | CONTEXTS_V2_CONTEXTS | CONTEXTS_V2_SEARCH | CONTEXTS_V2_NODES_INFO | CONTEXTS_V2_NODE_INSTANCES)) + mode |= CONTEXTS_V2_NODES; + + if(mode & CONTEXTS_V2_ALERTS) { + mode |= CONTEXTS_V2_NODES; + req->options &= ~CONTEXT_V2_OPTION_ALERTS_WITH_CONFIGURATIONS; + + if(!(req->options & (CONTEXT_V2_OPTION_ALERTS_WITH_SUMMARY|CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES|CONTEXT_V2_OPTION_ALERTS_WITH_VALUES))) + req->options |= CONTEXT_V2_OPTION_ALERTS_WITH_SUMMARY; + } + + if(mode & CONTEXTS_V2_ALERT_TRANSITIONS) { + mode |= CONTEXTS_V2_NODES; + req->options &= ~CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES; + } struct rrdcontext_to_json_v2_data ctl = { .wb = wb, .request = req, - .ctx = NULL, - .options = options, + .mode = mode, + .options = req->options, .versions = { 0 }, .nodes.scope_pattern = string_to_simple_pattern(req->scope_nodes), .nodes.pattern = string_to_simple_pattern(req->nodes), .contexts.pattern = string_to_simple_pattern(req->contexts), .contexts.scope_pattern = string_to_simple_pattern(req->scope_contexts), .q.pattern = string_to_simple_pattern_nocase(req->q), + .alerts.alert_name_pattern = string_to_simple_pattern(req->alerts.alert), + .window = { + .enabled = false, + .relative = false, + .after = req->after, + .before = req->before, + }, .timings = { .received_ut = now_monotonic_usec(), } }; - if(options & CONTEXTS_V2_CONTEXTS) { - ctl.ctx = dictionary_create_advanced( + bool debug = ctl.options & CONTEXT_V2_OPTION_DEBUG; + + if(mode & CONTEXTS_V2_NODES) { + ctl.nodes.dict = dictionary_create_advanced(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + NULL, sizeof(struct contexts_v2_node)); + } + + if(mode & CONTEXTS_V2_CONTEXTS) { + ctl.contexts.dict = dictionary_create_advanced( + DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, NULL, + sizeof(struct context_v2_entry)); + + dictionary_register_conflict_callback(ctl.contexts.dict, contexts_conflict_callback, &ctl); + dictionary_register_delete_callback(ctl.contexts.dict, contexts_delete_callback, &ctl); + } + + if(mode & CONTEXTS_V2_FUNCTIONS) { + ctl.functions.dict = dictionary_create_advanced( DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, NULL, - sizeof(struct rrdcontext_to_json_v2_entry)); + sizeof(struct function_v2_entry)); + + dictionary_register_insert_callback(ctl.functions.dict, functions_insert_callback, &ctl); + dictionary_register_conflict_callback(ctl.functions.dict, functions_conflict_callback, &ctl); + 
dictionary_register_delete_callback(ctl.functions.dict, functions_delete_callback, &ctl); + } + + if(mode & CONTEXTS_V2_ALERTS) { + if(req->alerts.transition) { + ctl.options |= CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES|CONTEXT_V2_OPTION_ALERTS_WITH_VALUES; + run = sql_find_alert_transition(req->alerts.transition, rrdcontext_v2_set_transition_filter, &ctl); + if(!run) { + resp = HTTP_RESP_NOT_FOUND; + goto cleanup; + } + } + + ctl.alerts.alerts = dictionary_create_advanced(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + NULL, sizeof(struct alert_v2_entry)); + + dictionary_register_insert_callback(ctl.alerts.alerts, alerts_v2_insert_callback, &ctl); + dictionary_register_conflict_callback(ctl.alerts.alerts, alerts_v2_conflict_callback, &ctl); + dictionary_register_delete_callback(ctl.alerts.alerts, alerts_v2_delete_callback, &ctl); + + if(ctl.options & (CONTEXT_V2_OPTION_ALERTS_WITH_INSTANCES | CONTEXT_V2_OPTION_ALERTS_WITH_VALUES)) { + ctl.alerts.alert_instances = dictionary_create_advanced(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, + NULL, sizeof(struct sql_alert_instance_v2_entry)); + + dictionary_register_insert_callback(ctl.alerts.alert_instances, alert_instances_v2_insert_callback, &ctl); + dictionary_register_conflict_callback(ctl.alerts.alert_instances, alert_instances_v2_conflict_callback, &ctl); + dictionary_register_delete_callback(ctl.alerts.alert_instances, alert_instances_delete_callback, &ctl); + } + } - dictionary_register_delete_callback(ctl.ctx, contexts_delete_callback, NULL); + if(req->after || req->before) { + ctl.window.relative = rrdr_relative_window_to_absolute(&ctl.window.after, &ctl.window.before, &ctl.now); + ctl.window.enabled = !(mode & CONTEXTS_V2_ALERT_TRANSITIONS); } + else + ctl.now = now_realtime_sec(); + + buffer_json_initialize(wb, "\"", "\"", 0, + true, (req->options & CONTEXT_V2_OPTION_MINIFY) && !(req->options & CONTEXT_V2_OPTION_DEBUG)); - time_t now_s = now_realtime_sec(); - buffer_json_initialize(wb, "\"", "\"", 0, true, false); buffer_json_member_add_uint64(wb, "api", 2); - if(options & CONTEXTS_V2_DEBUG) { + if(req->options & CONTEXT_V2_OPTION_DEBUG) { buffer_json_member_add_object(wb, "request"); + { + buffer_json_contexts_v2_mode_to_array(wb, "mode", mode); + web_client_api_request_v2_contexts_options_to_buffer_json_array(wb, "options", req->options); - buffer_json_member_add_object(wb, "scope"); - buffer_json_member_add_string(wb, "scope_nodes", req->scope_nodes); - buffer_json_member_add_string(wb, "scope_contexts", req->scope_contexts); - buffer_json_object_close(wb); + buffer_json_member_add_object(wb, "scope"); + { + buffer_json_member_add_string(wb, "scope_nodes", req->scope_nodes); + if (mode & (CONTEXTS_V2_CONTEXTS | CONTEXTS_V2_SEARCH | CONTEXTS_V2_ALERTS)) + buffer_json_member_add_string(wb, "scope_contexts", req->scope_contexts); + } + buffer_json_object_close(wb); - buffer_json_member_add_object(wb, "selectors"); - buffer_json_member_add_string(wb, "nodes", req->nodes); - buffer_json_member_add_string(wb, "contexts", req->contexts); - buffer_json_object_close(wb); + buffer_json_member_add_object(wb, "selectors"); + { + buffer_json_member_add_string(wb, "nodes", req->nodes); - buffer_json_member_add_string(wb, "q", req->q); - buffer_json_member_add_array(wb, "options"); - buffer_json_contexts_v2_options_to_array(wb, options); - buffer_json_array_close(wb); + if (mode & (CONTEXTS_V2_CONTEXTS | CONTEXTS_V2_SEARCH | CONTEXTS_V2_ALERTS)) + 
buffer_json_member_add_string(wb, "contexts", req->contexts); + + if(mode & (CONTEXTS_V2_ALERTS | CONTEXTS_V2_ALERT_TRANSITIONS)) { + buffer_json_member_add_object(wb, "alerts"); + + if(mode & CONTEXTS_V2_ALERTS) + web_client_api_request_v2_contexts_alerts_status_to_buffer_json_array(wb, "status", req->alerts.status); + + if(mode & CONTEXTS_V2_ALERT_TRANSITIONS) { + buffer_json_member_add_string(wb, "context", req->contexts); + buffer_json_member_add_uint64(wb, "anchor_gi", req->alerts.global_id_anchor); + buffer_json_member_add_uint64(wb, "last", req->alerts.last); + } + + buffer_json_member_add_string(wb, "alert", req->alerts.alert); + buffer_json_member_add_string(wb, "transition", req->alerts.transition); + buffer_json_object_close(wb); // alerts + } + } + buffer_json_object_close(wb); // selectors + buffer_json_member_add_object(wb, "filters"); + { + if (mode & CONTEXTS_V2_SEARCH) + buffer_json_member_add_string(wb, "q", req->q); + + buffer_json_member_add_time_t(wb, "after", req->after); + buffer_json_member_add_time_t(wb, "before", req->before); + } + buffer_json_object_close(wb); // filters + + if(mode & CONTEXTS_V2_ALERT_TRANSITIONS) { + buffer_json_member_add_object(wb, "facets"); + { + for (int i = 0; i < ATF_TOTAL_ENTRIES; i++) { + buffer_json_member_add_string(wb, alert_transition_facets[i].query_param, req->alerts.facets[i]); + } + } + buffer_json_object_close(wb); // facets + } + } buffer_json_object_close(wb); } - if(options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED | CONTEXTS_V2_DEBUG)) - buffer_json_member_add_array(wb, "nodes"); - - ssize_t ret = query_scope_foreach_host(ctl.nodes.scope_pattern, ctl.nodes.pattern, + ssize_t ret = 0; + if(run) + ret = query_scope_foreach_host(ctl.nodes.scope_pattern, ctl.nodes.pattern, rrdcontext_to_json_v2_add_host, &ctl, &ctl.versions, ctl.q.host_node_id_str); @@ -504,54 +2017,101 @@ int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTE goto cleanup; } - if(options & (CONTEXTS_V2_NODES | CONTEXTS_V2_NODES_DETAILED | CONTEXTS_V2_DEBUG)) - buffer_json_array_close(wb); - ctl.timings.executed_ut = now_monotonic_usec(); - version_hashes_api_v2(wb, &ctl.versions); - if(options & CONTEXTS_V2_CONTEXTS) { - buffer_json_member_add_object(wb, "contexts"); - struct rrdcontext_to_json_v2_entry *z; - dfe_start_read(ctl.ctx, z){ - bool collected = z->flags & RRD_FLAG_COLLECTED; + if(mode & CONTEXTS_V2_ALERT_TRANSITIONS) { + contexts_v2_alert_transitions_to_json(wb, &ctl, debug); + } + else { + if (mode & CONTEXTS_V2_NODES) { + buffer_json_member_add_array(wb, "nodes"); + struct contexts_v2_node *t; + dfe_start_read(ctl.nodes.dict, t) { + rrdcontext_to_json_v2_rrdhost(wb, t->host, &ctl, t->ni); + } + dfe_done(t); + buffer_json_array_close(wb); + } - buffer_json_member_add_object(wb, string2str(z->id)); + if (mode & CONTEXTS_V2_FUNCTIONS) { + buffer_json_member_add_array(wb, "functions"); { - buffer_json_member_add_string(wb, "family", string2str(z->family)); - buffer_json_member_add_uint64(wb, "priority", z->priority); - buffer_json_member_add_time_t(wb, "first_entry", z->first_time_s); - buffer_json_member_add_time_t(wb, "last_entry", collected ? 
now_s : z->last_time_s); - buffer_json_member_add_boolean(wb, "live", collected); - if (options & CONTEXTS_V2_SEARCH) - buffer_json_member_add_string(wb, "match", fts_match_to_string(z->match)); + struct function_v2_entry *t; + dfe_start_read(ctl.functions.dict, t) { + buffer_json_add_array_item_object(wb); + buffer_json_member_add_string(wb, "name", t_dfe.name); + buffer_json_member_add_string(wb, "help", string2str(t->help)); + buffer_json_member_add_array(wb, "ni"); + for (size_t i = 0; i < t->used; i++) + buffer_json_add_array_item_uint64(wb, t->node_ids[i]); + buffer_json_array_close(wb); + buffer_json_object_close(wb); + } + dfe_done(t); + } + buffer_json_array_close(wb); + } + + if (mode & CONTEXTS_V2_CONTEXTS) { + buffer_json_member_add_object(wb, "contexts"); + { + struct context_v2_entry *z; + dfe_start_read(ctl.contexts.dict, z) { + bool collected = z->flags & RRD_FLAG_COLLECTED; + + buffer_json_member_add_object(wb, string2str(z->id)); + { + buffer_json_member_add_string(wb, "family", string2str(z->family)); + buffer_json_member_add_uint64(wb, "priority", z->priority); + buffer_json_member_add_time_t(wb, "first_entry", z->first_time_s); + buffer_json_member_add_time_t(wb, "last_entry", collected ? ctl.now : z->last_time_s); + buffer_json_member_add_boolean(wb, "live", collected); + if (mode & CONTEXTS_V2_SEARCH) + buffer_json_member_add_string(wb, "match", fts_match_to_string(z->match)); + } + buffer_json_object_close(wb); + } + dfe_done(z); + } + buffer_json_object_close(wb); // contexts + } + + if (mode & CONTEXTS_V2_ALERTS) + contexts_v2_alerts_to_json(wb, &ctl, debug); + + if (mode & CONTEXTS_V2_SEARCH) { + buffer_json_member_add_object(wb, "searches"); + { + buffer_json_member_add_uint64(wb, "strings", ctl.q.fts.string_searches); + buffer_json_member_add_uint64(wb, "char", ctl.q.fts.char_searches); + buffer_json_member_add_uint64(wb, "total", ctl.q.fts.searches); } buffer_json_object_close(wb); } - dfe_done(z); - buffer_json_object_close(wb); // contexts - } - if(options & CONTEXTS_V2_SEARCH) { - buffer_json_member_add_object(wb, "searches"); - buffer_json_member_add_uint64(wb, "strings", ctl.q.fts.string_searches); - buffer_json_member_add_uint64(wb, "char", ctl.q.fts.char_searches); - buffer_json_member_add_uint64(wb, "total", ctl.q.fts.searches); - buffer_json_object_close(wb); + if (mode & (CONTEXTS_V2_VERSIONS)) + version_hashes_api_v2(wb, &ctl.versions); + + if (mode & CONTEXTS_V2_AGENTS) + buffer_json_agents_v2(wb, &ctl.timings, ctl.now, mode & (CONTEXTS_V2_AGENTS_INFO), true); } - buffer_json_agents_array_v2(wb, &ctl.timings, now_s); buffer_json_cloud_timings(wb, "timings", &ctl.timings); + buffer_json_finalize(wb); cleanup: - dictionary_destroy(ctl.ctx); + dictionary_destroy(ctl.nodes.dict); + dictionary_destroy(ctl.contexts.dict); + dictionary_destroy(ctl.functions.dict); + dictionary_destroy(ctl.alerts.alerts); + dictionary_destroy(ctl.alerts.alert_instances); simple_pattern_free(ctl.nodes.scope_pattern); simple_pattern_free(ctl.nodes.pattern); simple_pattern_free(ctl.contexts.pattern); simple_pattern_free(ctl.contexts.scope_pattern); simple_pattern_free(ctl.q.pattern); + simple_pattern_free(ctl.alerts.alert_name_pattern); return resp; } - diff --git a/database/contexts/context.c b/database/contexts/context.c index f941050d9..47946f1e0 100644 --- a/database/contexts/context.c +++ b/database/contexts/context.c @@ -33,7 +33,7 @@ static void rrdcontext_insert_callback(const DICTIONARY_ITEM *item __maybe_unuse // we are loading data from the SQL database 
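// Illustrative sketch, not part of this diff: the context.c and internal.h
// hunks that follow replace the per-context netdata_mutex with netdata's
// SPINLOCK (spinlock_init / spinlock_lock / spinlock_unlock) and drop the
// destroy call, since a spinlock needs no teardown. A minimal C11 version of
// the primitive; my_spinlock is hypothetical, not netdata's implementation:
#include <stdatomic.h>
#include <stdio.h>

typedef struct { atomic_flag locked; } my_spinlock;
#define MY_SPINLOCK_INIT { ATOMIC_FLAG_INIT }

static void my_spinlock_lock(my_spinlock *s) {
    // busy-wait until the flag is acquired; cheap when critical sections are short,
    // which is the usual reason to prefer a spinlock over a mutex here
    while(atomic_flag_test_and_set_explicit(&s->locked, memory_order_acquire))
        ;
}

static void my_spinlock_unlock(my_spinlock *s) {
    atomic_flag_clear_explicit(&s->locked, memory_order_release);
}

int main(void) {
    my_spinlock lock = MY_SPINLOCK_INIT;
    my_spinlock_lock(&lock);
    puts("in critical section");
    my_spinlock_unlock(&lock);
    return 0;
}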
if(rc->version) - error("RRDCONTEXT: context '%s' is already initialized with version %"PRIu64", but it is loaded again from SQL with version %"PRIu64"", string2str(rc->id), rc->version, rc->hub.version); + netdata_log_error("RRDCONTEXT: context '%s' is already initialized with version %"PRIu64", but it is loaded again from SQL with version %"PRIu64"", string2str(rc->id), rc->version, rc->hub.version); // IMPORTANT // replace all string pointers in rc->hub with our own versions @@ -80,7 +80,7 @@ static void rrdcontext_insert_callback(const DICTIONARY_ITEM *item __maybe_unuse } rrdinstances_create_in_rrdcontext(rc); - netdata_mutex_init(&rc->mutex); + spinlock_init(&rc->spinlock); // signal the react callback to do the job rrd_flag_set_updated(rc, RRD_FLAG_UPDATE_REASON_NEW_OBJECT); @@ -91,7 +91,6 @@ static void rrdcontext_delete_callback(const DICTIONARY_ITEM *item __maybe_unuse RRDCONTEXT *rc = (RRDCONTEXT *)value; rrdinstances_destroy_from_rrdcontext(rc); - netdata_mutex_destroy(&rc->mutex); rrdcontext_freez(rc); } diff --git a/database/contexts/instance.c b/database/contexts/instance.c index 665022afd..7e572fb80 100644 --- a/database/contexts/instance.c +++ b/database/contexts/instance.c @@ -407,13 +407,13 @@ inline void rrdinstance_from_rrdset(RRDSET *st) { #define rrdset_get_rrdinstance(st) rrdset_get_rrdinstance_with_trace(st, __FUNCTION__); static inline RRDINSTANCE *rrdset_get_rrdinstance_with_trace(RRDSET *st, const char *function) { if(unlikely(!st->rrdinstance)) { - error("RRDINSTANCE: RRDSET '%s' is not linked to an RRDINSTANCE at %s()", rrdset_id(st), function); + netdata_log_error("RRDINSTANCE: RRDSET '%s' is not linked to an RRDINSTANCE at %s()", rrdset_id(st), function); return NULL; } RRDINSTANCE *ri = rrdinstance_acquired_value(st->rrdinstance); if(unlikely(!ri)) { - error("RRDINSTANCE: RRDSET '%s' lost its link to an RRDINSTANCE at %s()", rrdset_id(st), function); + netdata_log_error("RRDINSTANCE: RRDSET '%s' lost its link to an RRDINSTANCE at %s()", rrdset_id(st), function); return NULL; } diff --git a/database/contexts/internal.h b/database/contexts/internal.h index 9917d58e4..c5663dd24 100644 --- a/database/contexts/internal.h +++ b/database/contexts/internal.h @@ -250,6 +250,8 @@ typedef struct rrdcontext { uint32_t priority; RRDSET_TYPE chart_type; + SPINLOCK spinlock; + RRD_FLAGS flags; time_t first_time_s; time_t last_time_s; @@ -275,7 +277,9 @@ typedef struct rrdcontext { size_t dispatches; // the number of times this has been dispatched to hub } queue; - netdata_mutex_t mutex; + struct { + uint32_t metrics; // the number of metrics in this context + } stats; } RRDCONTEXT; @@ -352,8 +356,8 @@ static inline void rrdcontext_release(RRDCONTEXT_ACQUIRED *rca) { void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs); void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs); -#define rrdcontext_lock(rc) netdata_mutex_lock(&((rc)->mutex)) -#define rrdcontext_unlock(rc) netdata_mutex_unlock(&((rc)->mutex)) +#define rrdcontext_lock(rc) spinlock_lock(&((rc)->spinlock)) +#define rrdcontext_unlock(rc) spinlock_unlock(&((rc)->spinlock)) void rrdinstance_trigger_updates(RRDINSTANCE *ri, const char *function); void rrdcontext_trigger_updates(RRDCONTEXT *rc, const char *function); diff --git a/database/contexts/metric.c b/database/contexts/metric.c index 80756b54c..55efde4e9 100644 --- a/database/contexts/metric.c +++ b/database/contexts/metric.c @@ -33,7 +33,7 @@ inline NETDATA_DOUBLE 
rrdmetric_acquired_last_stored_value(RRDMETRIC_ACQUIRED *r RRDMETRIC *rm = rrdmetric_acquired_value(rma); if(rm->rrddim) - return rm->rrddim->last_stored_value; + return rm->rrddim->collector.last_stored_value; return NAN; } @@ -263,13 +263,13 @@ void rrdmetric_from_rrddim(RRDDIM *rd) { #define rrddim_get_rrdmetric(rd) rrddim_get_rrdmetric_with_trace(rd, __FUNCTION__) static inline RRDMETRIC *rrddim_get_rrdmetric_with_trace(RRDDIM *rd, const char *function) { if(unlikely(!rd->rrdmetric)) { - error("RRDMETRIC: RRDDIM '%s' is not linked to an RRDMETRIC at %s()", rrddim_id(rd), function); + netdata_log_error("RRDMETRIC: RRDDIM '%s' is not linked to an RRDMETRIC at %s()", rrddim_id(rd), function); return NULL; } RRDMETRIC *rm = rrdmetric_acquired_value(rd->rrdmetric); if(unlikely(!rm)) { - error("RRDMETRIC: RRDDIM '%s' lost the link to its RRDMETRIC at %s()", rrddim_id(rd), function); + netdata_log_error("RRDMETRIC: RRDDIM '%s' lost the link to its RRDMETRIC at %s()", rrddim_id(rd), function); return NULL; } diff --git a/database/contexts/query_target.c b/database/contexts/query_target.c index 7759f85e8..508977ce7 100644 --- a/database/contexts/query_target.c +++ b/database/contexts/query_target.c @@ -131,10 +131,10 @@ void query_target_release(QUERY_TARGET *qt) { qt->id[0] = '\0'; - netdata_spinlock_lock(&query_target_base.used.spinlock); + spinlock_lock(&query_target_base.used.spinlock); DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(query_target_base.used.base, qt, internal.prev, internal.next); query_target_base.used.count--; - netdata_spinlock_unlock(&query_target_base.used.spinlock); + spinlock_unlock(&query_target_base.used.spinlock); qt->internal.used = false; thread_qt = NULL; @@ -143,29 +143,29 @@ void query_target_release(QUERY_TARGET *qt) { query_target_destroy(qt); } else { - netdata_spinlock_lock(&query_target_base.available.spinlock); + spinlock_lock(&query_target_base.available.spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(query_target_base.available.base, qt, internal.prev, internal.next); query_target_base.available.count++; - netdata_spinlock_unlock(&query_target_base.available.spinlock); + spinlock_unlock(&query_target_base.available.spinlock); } } static QUERY_TARGET *query_target_get(void) { - netdata_spinlock_lock(&query_target_base.available.spinlock); + spinlock_lock(&query_target_base.available.spinlock); QUERY_TARGET *qt = query_target_base.available.base; if (qt) { DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(query_target_base.available.base, qt, internal.prev, internal.next); query_target_base.available.count--; } - netdata_spinlock_unlock(&query_target_base.available.spinlock); + spinlock_unlock(&query_target_base.available.spinlock); if(unlikely(!qt)) qt = callocz(1, sizeof(*qt)); - netdata_spinlock_lock(&query_target_base.used.spinlock); + spinlock_lock(&query_target_base.used.spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(query_target_base.used.base, qt, internal.prev, internal.next); query_target_base.used.count++; - netdata_spinlock_unlock(&query_target_base.used.spinlock); + spinlock_unlock(&query_target_base.used.spinlock); qt->internal.used = true; qt->internal.queries++; @@ -579,7 +579,7 @@ static void query_target_eval_instance_rrdcalc(QUERY_TARGET_LOCALS *qtl __maybe_ QUERY_NODE *qn, QUERY_CONTEXT *qc, QUERY_INSTANCE *qi) { RRDSET *st = rrdinstance_acquired_rrdset(qi->ria); if (st) { - netdata_rwlock_rdlock(&st->alerts.rwlock); + rw_spinlock_read_lock(&st->alerts.spinlock); for (RRDCALC *rc = st->alerts.base; rc; rc = rc->next) { switch(rc->status) { case 
RRDCALC_STATUS_CLEAR: @@ -610,7 +610,7 @@ static void query_target_eval_instance_rrdcalc(QUERY_TARGET_LOCALS *qtl __maybe_ break; } } - netdata_rwlock_unlock(&st->alerts.rwlock); + rw_spinlock_read_unlock(&st->alerts.spinlock); } } @@ -624,7 +624,7 @@ static bool query_target_match_alert_pattern(RRDINSTANCE_ACQUIRED *ria, SIMPLE_P BUFFER *wb = NULL; bool matched = false; - netdata_rwlock_rdlock(&st->alerts.rwlock); + rw_spinlock_read_lock(&st->alerts.spinlock); if (st->alerts.base) { for (RRDCALC *rc = st->alerts.base; rc; rc = rc->next) { SIMPLE_PATTERN_RESULT ret = simple_pattern_matches_string_extract(pattern, rc->name, NULL, 0); @@ -655,7 +655,7 @@ static bool query_target_match_alert_pattern(RRDINSTANCE_ACQUIRED *ria, SIMPLE_P break; } } - netdata_rwlock_unlock(&st->alerts.rwlock); + rw_spinlock_read_unlock(&st->alerts.spinlock); buffer_free(wb); return matched; @@ -895,12 +895,12 @@ static ssize_t query_node_add(void *data, RRDHOST *host, bool queryable_host) { // is the chart given valid? if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { - error("QUERY TARGET: RRDSET '%s' given, but it is not linked to rrdcontext structures. Linking it now.", rrdset_name(qtl->st)); + netdata_log_error("QUERY TARGET: RRDSET '%s' given, but it is not linked to rrdcontext structures. Linking it now.", rrdset_name(qtl->st)); rrdinstance_from_rrdset(qtl->st); if(unlikely(qtl->st && (!qtl->st->rrdinstance || !qtl->st->rrdcontext))) { - error("QUERY TARGET: RRDSET '%s' given, but failed to be linked to rrdcontext structures. Switching to context query.", - rrdset_name(qtl->st)); + netdata_log_error("QUERY TARGET: RRDSET '%s' given, but failed to be linked to rrdcontext structures. Switching to context query.", + rrdset_name(qtl->st)); if (!is_valid_sp(qtl->instances)) qtl->instances = rrdset_name(qtl->st); @@ -1098,7 +1098,7 @@ QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr) { } else if (unlikely(host != qtl.st->rrdhost)) { // Oops! A different host! - error("QUERY TARGET: RRDSET '%s' given does not belong to host '%s'. Switching query host to '%s'", + netdata_log_error("QUERY TARGET: RRDSET '%s' given does not belong to host '%s'. Switching query host to '%s'", rrdset_name(qtl.st), rrdhost_hostname(host), rrdhost_hostname(qtl.st->rrdhost)); host = qtl.st->rrdhost; } diff --git a/database/contexts/rrdcontext.c b/database/contexts/rrdcontext.c index 40a7e420b..8538d17f2 100644 --- a/database/contexts/rrdcontext.c +++ b/database/contexts/rrdcontext.c @@ -224,25 +224,26 @@ void rrdcontext_hub_checkpoint_command(void *ptr) { struct ctxs_checkpoint *cmd = ptr; if(!rrdhost_check_our_claim_id(cmd->claim_id)) { - error("RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", - cmd->claim_id, cmd->node_id, - localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", - cmd->claim_id); + netdata_log_error("RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", + cmd->claim_id, cmd->node_id, + localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", + cmd->claim_id); return; } RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); if(!host) { - error("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', but there is no node with such node id here. 
Ignoring command.", - cmd->claim_id, cmd->node_id); + netdata_log_error("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", + cmd->claim_id, + cmd->node_id); return; } if(rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { - info("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', while node '%s' has an active context streaming.", - cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); + netdata_log_info("RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', while node '%s' has an active context streaming.", + cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); // disable it temporarily, so that our worker will not attempt to send messages in parallel rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); @@ -251,8 +252,8 @@ void rrdcontext_hub_checkpoint_command(void *ptr) { uint64_t our_version_hash = rrdcontext_version_hash(host); if(cmd->version_hash != our_version_hash) { - error("RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". Sending snapshot of all contexts.", - cmd->version_hash, rrdhost_hostname(host), our_version_hash); + netdata_log_error("RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". Sending snapshot of all contexts.", + cmd->version_hash, rrdhost_hostname(host), our_version_hash); #ifdef ENABLE_ACLK // prepare the snapshot @@ -278,32 +279,32 @@ void rrdcontext_hub_checkpoint_command(void *ptr) { rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS); char node_str[UUID_STR_LEN]; uuid_unparse_lower(*host->node_id, node_str); - log_access("ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED", node_str, rrdhost_hostname(host)); + netdata_log_access("ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED", node_str, rrdhost_hostname(host)); } void rrdcontext_hub_stop_streaming_command(void *ptr) { struct stop_streaming_ctxs *cmd = ptr; if(!rrdhost_check_our_claim_id(cmd->claim_id)) { - error("RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", - cmd->claim_id, cmd->node_id, - localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", - cmd->claim_id); + netdata_log_error("RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', but this is not our claim id. Ours '%s', received '%s'. Ignoring command.", + cmd->claim_id, cmd->node_id, + localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET", + cmd->claim_id); return; } RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id); if(!host) { - error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", - cmd->claim_id, cmd->node_id); + netdata_log_error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but there is no node with such node id here. Ignoring command.", + cmd->claim_id, cmd->node_id); return; } if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) { - error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but node '%s' does not have active context streaming. 
Ignoring command.", - cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); + netdata_log_error("RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', but node '%s' does not have active context streaming. Ignoring command.", + cmd->claim_id, cmd->node_id, rrdhost_hostname(host)); return; } @@ -321,4 +322,4 @@ bool rrdcontext_retention_match(RRDCONTEXT_ACQUIRED *rca, time_t after, time_t b return query_matches_retention(after, before, rc->first_time_s, before > rc->last_time_s ? before : rc->last_time_s, 1); else return query_matches_retention(after, before, rc->first_time_s, rc->last_time_s, 1); -}
\ No newline at end of file +} diff --git a/database/contexts/rrdcontext.h b/database/contexts/rrdcontext.h index 0f0f90d32..e3a1ab9af 100644 --- a/database/contexts/rrdcontext.h +++ b/database/contexts/rrdcontext.h @@ -415,6 +415,133 @@ typedef struct query_target { } internal; } QUERY_TARGET; + +struct sql_alert_transition_data { + usec_t global_id; + uuid_t *transition_id; + uuid_t *host_id; + uuid_t *config_hash_id; + uint32_t alarm_id; + const char *alert_name; + const char *chart; + const char *chart_name; + const char *chart_context; + const char *family; + const char *recipient; + const char *units; + const char *exec; + const char *info; + const char *classification; + const char *type; + const char *component; + time_t when_key; + time_t duration; + time_t non_clear_duration; + uint64_t flags; + time_t delay_up_to_timestamp; + time_t exec_run_timestamp; + int exec_code; + int new_status; + int old_status; + int delay; + time_t last_repeat; + NETDATA_DOUBLE new_value; + NETDATA_DOUBLE old_value; +}; + +struct sql_alert_config_data { + uuid_t *config_hash_id; + const char *name; + + struct { + const char *on_template; + const char *on_key; + + const char *os; + const char *hosts; + const char *families; + const char *plugin; + const char *module; + const char *host_labels; + const char *chart_labels; + const char *charts; + } selectors; + + const char *info; + const char *classification; + const char *component; + const char *type; + + struct { + struct { + const char *dimensions; + const char *method; + uint32_t options; + + int32_t after; + int32_t before; + + const char *lookup; // the lookup line, unparsed + } db; + + const char *calc; // the calculation expression, unparsed + const char *units; + + int32_t update_every; // the update frequency of the alert, in seconds + const char *every; // the every line, unparsed + } value; + + struct { + const char *green; // the green threshold, unparsed + const char *red; // the red threshold, unparsed + const char *warn; // the warning expression, unparsed + const char *crit; // the critical expression, unparsed + } status; + + struct { + const char *exec; // the script to execute, or NULL to execute the default script + const char *to_key; // the recipient, or NULL for the default recipient + const char *delay; // the delay line, unparsed + const char *repeat; // the repeat line, unparsed + const char *options; // FIXME what is this? 
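+        // presumably the raw "options:" line of the health.d entry
+        // (e.g. "no-clear-notification"), kept unparsed like its siblings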
+    } notification;
+
+    const char *source; // the configuration file and line this alert came from
+};
+
+int contexts_v2_alert_config_to_json(struct web_client *w, const char *config_hash_id);
+
+struct sql_alert_instance_v2_entry {
+    RRDCALC *tmp;
+
+    size_t ati;
+
+    STRING *context;
+    STRING *chart_id;
+    STRING *chart_name;
+    STRING *name;
+    STRING *family;
+    STRING *units;
+    STRING *source;
+    STRING *classification;
+    STRING *type;
+    STRING *component;
+    STRING *recipient;
+    RRDCALC_STATUS status;
+    RRDCALC_FLAGS flags;
+    STRING *info;
+    NETDATA_DOUBLE value;
+    time_t last_updated;
+    time_t last_status_change;
+    NETDATA_DOUBLE last_status_change_value;
+    uuid_t config_hash_id;
+    usec_t global_id;
+    uuid_t last_transition_id;
+    uint32_t alarm_id;
+    RRDHOST *host;
+    size_t ni;
+};
+
static inline NEVERNULL QUERY_NODE *query_node(QUERY_TARGET *qt, size_t id) {
    internal_fatal(id >= qt->nodes.used, "QUERY: invalid query host id");
    return &qt->nodes.array[id];
@@ -460,6 +587,30 @@ void query_target_release(QUERY_TARGET *qt);

QUERY_TARGET *query_target_create(QUERY_TARGET_REQUEST *qtr);

+typedef enum __attribute__((packed)) {
+    ATF_STATUS = 0,
+    ATF_CLASS,
+    ATF_TYPE,
+    ATF_COMPONENT,
+    ATF_ROLE,
+    ATF_NODE,
+    ATF_ALERT_NAME,
+    ATF_CHART_NAME,
+    ATF_CONTEXT,
+
+    // total
+    ATF_TOTAL_ENTRIES,
+} ALERT_TRANSITION_FACET;
+
+struct alert_transitions_facets {
+    const char *name;
+    const char *query_param;
+    const char *id;
+    size_t order;
+};
+
+extern struct alert_transitions_facets alert_transition_facets[];
+
struct api_v2_contexts_request {
    char *scope_nodes;
    char *scope_contexts;
@@ -467,6 +618,20 @@ struct api_v2_contexts_request {
    char *contexts;
    char *q;

+    CONTEXTS_V2_OPTIONS options;
+
+    struct {
+        CONTEXTS_V2_ALERT_STATUS status;
+        char *alert;
+        char *transition;
+        uint32_t last;
+
+        const char *facets[ATF_TOTAL_ENTRIES];
+        usec_t global_id_anchor;
+    } alerts;
+
+    time_t after;
+    time_t before;
    time_t timeout_ms;

    qt_interrupt_callback_t interrupt_callback;
@@ -474,18 +639,24 @@ struct api_v2_contexts_request {
};

typedef enum __attribute__ ((__packed__)) {
-    CONTEXTS_V2_DEBUG = (1 << 0),
-    CONTEXTS_V2_SEARCH = (1 << 1),
-    CONTEXTS_V2_NODES = (1 << 2),
-    CONTEXTS_V2_NODES_DETAILED = (1 << 3),
-    CONTEXTS_V2_CONTEXTS = (1 << 4),
-} CONTEXTS_V2_OPTIONS;
-
-int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTEXTS_V2_OPTIONS options);
+    CONTEXTS_V2_SEARCH = (1 << 1),
+    CONTEXTS_V2_NODES = (1 << 2),
+    CONTEXTS_V2_NODES_INFO = (1 << 3),
+    CONTEXTS_V2_NODE_INSTANCES = (1 << 4),
+    CONTEXTS_V2_CONTEXTS = (1 << 5),
+    CONTEXTS_V2_AGENTS = (1 << 6),
+    CONTEXTS_V2_AGENTS_INFO = (1 << 7),
+    CONTEXTS_V2_VERSIONS = (1 << 8),
+    CONTEXTS_V2_FUNCTIONS = (1 << 9),
+    CONTEXTS_V2_ALERTS = (1 << 10),
+    CONTEXTS_V2_ALERT_TRANSITIONS = (1 << 11),
+} CONTEXTS_V2_MODE;
+
+int rrdcontext_to_json_v2(BUFFER *wb, struct api_v2_contexts_request *req, CONTEXTS_V2_MODE mode);

RRDCONTEXT_TO_JSON_OPTIONS rrdcontext_to_json_parse_options(char *o);
-void buffer_json_agents_array_v2(BUFFER *wb, struct query_timings *timings, time_t now_s);
-void buffer_json_node_add_v2(BUFFER *wb, RRDHOST *host, size_t ni, usec_t duration_ut);
+void buffer_json_agents_v2(BUFFER *wb, struct query_timings *timings, time_t now_s, bool info, bool array);
+void buffer_json_node_add_v2(BUFFER *wb, RRDHOST *host, size_t ni, usec_t duration_ut, bool status);
void buffer_json_query_timings(BUFFER *wb, const char *key, struct query_timings *timings);
void buffer_json_cloud_timings(BUFFER *wb, const char *key, struct
query_timings *timings); diff --git a/database/contexts/worker.c b/database/contexts/worker.c index 22e28b2ad..e6c3ff3df 100644 --- a/database/contexts/worker.c +++ b/database/contexts/worker.c @@ -167,6 +167,27 @@ uint64_t rrdcontext_version_hash_with_callback( // ---------------------------------------------------------------------------- // retention recalculation +static void rrdhost_update_cached_retention(RRDHOST *host, time_t first_time_s, time_t last_time_s, bool global) { + if(unlikely(!host)) + return; + + spinlock_lock(&host->retention.spinlock); + + if(global) { + host->retention.first_time_s = first_time_s; + host->retention.last_time_s = last_time_s; + } + else { + if(!host->retention.first_time_s || first_time_s < host->retention.first_time_s) + host->retention.first_time_s = first_time_s; + + if(!host->retention.last_time_s || last_time_s > host->retention.last_time_s) + host->retention.last_time_s = last_time_s; + } + + spinlock_unlock(&host->retention.spinlock); +} + void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, bool worker_jobs) { rrdcontext_post_process_updates(rc, true, reason, worker_jobs); } @@ -174,11 +195,22 @@ void rrdcontext_recalculate_context_retention(RRDCONTEXT *rc, RRD_FLAGS reason, void rrdcontext_recalculate_host_retention(RRDHOST *host, RRD_FLAGS reason, bool worker_jobs) { if(unlikely(!host || !host->rrdctx.contexts)) return; + time_t first_time_s = 0; + time_t last_time_s = 0; + RRDCONTEXT *rc; dfe_start_read(host->rrdctx.contexts, rc) { - rrdcontext_recalculate_context_retention(rc, reason, worker_jobs); - } + rrdcontext_recalculate_context_retention(rc, reason, worker_jobs); + + if(!first_time_s || rc->first_time_s < first_time_s) + first_time_s = rc->first_time_s; + + if(!last_time_s || rc->last_time_s > last_time_s) + last_time_s = rc->last_time_s; + } dfe_done(rc); + + rrdhost_update_cached_retention(host, first_time_s, last_time_s, true); } static void rrdcontext_recalculate_retention_all_hosts(void) { @@ -318,7 +350,8 @@ void rrdcontext_delete_from_sql_unsafe(RRDCONTEXT *rc) { // delete it from SQL if(ctx_delete_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) - error("RRDCONTEXT: failed to delete context '%s' version %"PRIu64" from SQL.", rc->hub.id, rc->hub.version); + netdata_log_error("RRDCONTEXT: failed to delete context '%s' version %"PRIu64" from SQL.", + rc->hub.id, rc->hub.version); } static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jobs) { @@ -342,11 +375,11 @@ static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jo if(rrdmetric_should_be_deleted(rm)) { if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); if(!dictionary_del(ri->rrdmetrics, string2str(rm->id))) - error("RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", - string2str(rm->id), - string2str(ri->id), - string2str(rc->id), - rrdhost_hostname(host)); + netdata_log_error("RRDCONTEXT: metric '%s' of instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(rm->id), + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); else internal_error( true, @@ -362,10 +395,10 @@ static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jo if(rrdinstance_should_be_deleted(ri)) { if(worker_jobs) worker_is_busy(WORKER_JOB_CLEANUP_DELETE); if(!dictionary_del(rc->rrdinstances, string2str(ri->id))) - error("RRDCONTEXT: instance '%s' of context '%s' 
of host '%s', failed to be deleted from rrdmetrics dictionary.", - string2str(ri->id), - string2str(rc->id), - rrdhost_hostname(host)); + netdata_log_error("RRDCONTEXT: instance '%s' of context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + string2str(ri->id), + string2str(rc->id), + rrdhost_hostname(host)); else internal_error( true, @@ -383,7 +416,7 @@ static void rrdcontext_garbage_collect_single_host(RRDHOST *host, bool worker_jo rrdcontext_delete_from_sql_unsafe(rc); if(!dictionary_del(host->rrdctx.contexts, string2str(rc->id))) - error("RRDCONTEXT: context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", + netdata_log_error("RRDCONTEXT: context '%s' of host '%s', failed to be deleted from rrdmetrics dictionary.", string2str(rc->id), rrdhost_hostname(host)); else @@ -554,7 +587,7 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG size_t min_priority_not_collected = LONG_MAX; size_t min_priority = LONG_MAX; time_t min_first_time_t = LONG_MAX, max_last_time_t = 0; - size_t instances_active = 0, instances_deleted = 0; + size_t instances_active = 0, instances_deleted = 0, metrics = 0; bool live_retention = true, currently_collected = false, hidden = true; if(dictionary_entries(rc->rrdinstances) > 0) { RRDINSTANCE *ri; @@ -587,6 +620,7 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG string2str(rc->units), string2str(ri->units)); instances_active++; + metrics += dictionary_entries(ri->rrdmetrics); if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY) { if(rrd_flag_check(ri, RRD_FLAG_COLLECTED)) { @@ -607,6 +641,8 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG } dfe_done(ri); + rc->stats.metrics = metrics; + if(min_priority_collected != LONG_MAX) // use the collected priority min_priority = min_priority_collected; @@ -809,7 +845,7 @@ void rrdcontext_message_send_unsafe(RRDCONTEXT *rc, bool snapshot __maybe_unused rrdcontext_delete_from_sql_unsafe(rc); else if (ctx_store_context(&rc->rrdhost->host_uuid, &rc->hub) != 0) - error("RRDCONTEXT: failed to save context '%s' version %"PRIu64" to SQL.", rc->hub.id, rc->hub.version); + netdata_log_error("RRDCONTEXT: failed to save context '%s' version %"PRIu64" to SQL.", rc->hub.id, rc->hub.version); } static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending __maybe_unused) { @@ -872,9 +908,14 @@ static bool check_if_cloud_version_changed_unsafe(RRDCONTEXT *rc, bool sending _ sending ? 
(rc->queue.scheduled_dispatch_ut - rc->queue.queued_ut) / USEC_PER_MS : 0 ); + rrdhost_update_cached_retention(rc->rrdhost, rc->first_time_s, rc->last_time_s, false); + return true; } + if(!(flags & RRD_FLAG_COLLECTED)) + rrdhost_update_cached_retention(rc->rrdhost, rc->first_time_s, rc->last_time_s, false); + return false; } @@ -981,8 +1022,8 @@ static void rrdcontext_dispatch_queued_contexts_to_hub(RRDHOST *host, usec_t now // delete it from the master dictionary if(!dictionary_del(host->rrdctx.contexts, string2str(rc->id))) - error("RRDCONTEXT: '%s' of host '%s' failed to be deleted from rrdcontext dictionary.", - string2str(id), rrdhost_hostname(host)); + netdata_log_error("RRDCONTEXT: '%s' of host '%s' failed to be deleted from rrdcontext dictionary.", + string2str(id), rrdhost_hostname(host)); string_freez(id); } @@ -1082,6 +1123,17 @@ void *rrdcontext_main(void *ptr) { if (host->rrdctx.contexts) dictionary_garbage_collect(host->rrdctx.contexts); + + // calculate the number of metrics and instances in the host + RRDCONTEXT *rc; + uint32_t metrics = 0, instances = 0; + dfe_start_read(host->rrdctx.contexts, rc) { + metrics += rc->stats.metrics; + instances += dictionary_entries(rc->rrdinstances); + } + dfe_done(rc); + host->rrdctx.metrics = metrics; + host->rrdctx.instances = instances; } dfe_done(host); diff --git a/database/engine/cache.c b/database/engine/cache.c index bc3ba6b6a..7a9ccf8d1 100644 --- a/database/engine/cache.c +++ b/database/engine/cache.c @@ -112,8 +112,9 @@ struct pgc { PGC_CACHE_LINE_PADDING(0); struct pgc_index { - netdata_rwlock_t rwlock; + RW_SPINLOCK rw_spinlock; Pvoid_t sections_judy; + PGC_CACHE_LINE_PADDING(0); } *index; PGC_CACHE_LINE_PADDING(1); @@ -222,43 +223,40 @@ static inline size_t pgc_indexing_partition(PGC *cache, Word_t metric_id) { } static inline void pgc_index_read_lock(PGC *cache, size_t partition) { - netdata_rwlock_rdlock(&cache->index[partition].rwlock); + rw_spinlock_read_lock(&cache->index[partition].rw_spinlock); } static inline void pgc_index_read_unlock(PGC *cache, size_t partition) { - netdata_rwlock_unlock(&cache->index[partition].rwlock); + rw_spinlock_read_unlock(&cache->index[partition].rw_spinlock); } -//static inline bool pgc_index_write_trylock(PGC *cache, size_t partition) { -// return !netdata_rwlock_trywrlock(&cache->index[partition].rwlock); -//} static inline void pgc_index_write_lock(PGC *cache, size_t partition) { - netdata_rwlock_wrlock(&cache->index[partition].rwlock); + rw_spinlock_write_lock(&cache->index[partition].rw_spinlock); } static inline void pgc_index_write_unlock(PGC *cache, size_t partition) { - netdata_rwlock_unlock(&cache->index[partition].rwlock); + rw_spinlock_write_unlock(&cache->index[partition].rw_spinlock); } static inline bool pgc_ll_trylock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { - return netdata_spinlock_trylock(&ll->spinlock); + return spinlock_trylock(&ll->spinlock); } static inline void pgc_ll_lock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { - netdata_spinlock_lock(&ll->spinlock); + spinlock_lock(&ll->spinlock); } static inline void pgc_ll_unlock(PGC *cache __maybe_unused, struct pgc_linked_list *ll) { - netdata_spinlock_unlock(&ll->spinlock); + spinlock_unlock(&ll->spinlock); } static inline bool page_transition_trylock(PGC *cache __maybe_unused, PGC_PAGE *page) { - return netdata_spinlock_trylock(&page->transition_spinlock); + return spinlock_trylock(&page->transition_spinlock); } static inline void page_transition_lock(PGC *cache __maybe_unused, PGC_PAGE 
*page) { - netdata_spinlock_lock(&page->transition_spinlock); + spinlock_lock(&page->transition_spinlock); } static inline void page_transition_unlock(PGC *cache __maybe_unused, PGC_PAGE *page) { - netdata_spinlock_unlock(&page->transition_spinlock); + spinlock_unlock(&page->transition_spinlock); } // ---------------------------------------------------------------------------- @@ -267,9 +265,9 @@ static inline void page_transition_unlock(PGC *cache __maybe_unused, PGC_PAGE *p static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { if(size_to_evict) - netdata_spinlock_lock(&cache->usage.spinlock); + spinlock_lock(&cache->usage.spinlock); - else if(!netdata_spinlock_trylock(&cache->usage.spinlock)) + else if(!spinlock_trylock(&cache->usage.spinlock)) return __atomic_load_n(&cache->usage.per1000, __ATOMIC_RELAXED); size_t current_cache_size; @@ -319,7 +317,7 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) { __atomic_store_n(&cache->stats.wanted_cache_size, wanted_cache_size, __ATOMIC_RELAXED); __atomic_store_n(&cache->stats.current_cache_size, current_cache_size, __ATOMIC_RELAXED); - netdata_spinlock_unlock(&cache->usage.spinlock); + spinlock_unlock(&cache->usage.spinlock); if(size_to_evict) { size_t target = (size_t)((unsigned long long)wanted_cache_size * (unsigned long long)cache->config.evict_low_threshold_per1000 / 1000ULL); @@ -422,7 +420,7 @@ static void pgc_section_pages_static_aral_init(void) { static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; if(unlikely(!pgc_section_pages_aral)) { - netdata_spinlock_lock(&spinlock); + spinlock_lock(&spinlock); // we have to check again if(!pgc_section_pages_aral) @@ -433,7 +431,7 @@ static void pgc_section_pages_static_aral_init(void) { 65536, NULL, NULL, NULL, false, false); - netdata_spinlock_unlock(&spinlock); + spinlock_unlock(&spinlock); } } @@ -1255,7 +1253,7 @@ static PGC_PAGE *page_add(PGC *cache, PGC_ENTRY *entry, bool *added) { page->update_every_s = entry->update_every_s, page->data = entry->data; page->assumed_size = page_assumed_size(cache, entry->size); - netdata_spinlock_init(&page->transition_spinlock); + spinlock_init(&page->transition_spinlock); page->link.prev = NULL; page->link.next = NULL; @@ -1378,7 +1376,7 @@ static PGC_PAGE *page_find_and_acquire(PGC *cache, Word_t section, Word_t metric Word_t time = start_time_s; // find the previous page - page_ptr = JudyLLast(*pages_judy_pptr, &time, PJE0); + page_ptr = JudyLPrev(*pages_judy_pptr, &time, PJE0); if(unlikely(page_ptr == PJERR)) fatal("DBENGINE CACHE: corrupted page in pages judy array #2"); @@ -1779,11 +1777,11 @@ PGC *pgc_create(const char *name, cache->index = callocz(cache->config.partitions, sizeof(struct pgc_index)); for(size_t part = 0; part < cache->config.partitions ; part++) - netdata_rwlock_init(&cache->index[part].rwlock); + rw_spinlock_init(&cache->index[part].rw_spinlock); - netdata_spinlock_init(&cache->hot.spinlock); - netdata_spinlock_init(&cache->dirty.spinlock); - netdata_spinlock_init(&cache->clean.spinlock); + spinlock_init(&cache->hot.spinlock); + spinlock_init(&cache->dirty.spinlock); + spinlock_init(&cache->clean.spinlock); cache->hot.flags = PGC_PAGE_HOT; cache->hot.linked_list_in_sections_judy = true; @@ -1849,12 +1847,12 @@ void pgc_destroy(PGC *cache) { free_all_unreferenced_clean_pages(cache); if(PGC_REFERENCED_PAGES(cache)) - error("DBENGINE CACHE: there are %zu referenced cache pages - leaving the cache allocated", PGC_REFERENCED_PAGES(cache)); + netdata_log_error("DBENGINE CACHE: 
there are %zu referenced cache pages - leaving the cache allocated", PGC_REFERENCED_PAGES(cache)); else { pointer_destroy_index(cache); - for(size_t part = 0; part < cache->config.partitions ; part++) - netdata_rwlock_destroy(&cache->index[part].rwlock); +// for(size_t part = 0; part < cache->config.partitions ; part++) +// netdata_rwlock_destroy(&cache->index[part].rw_spinlock); #ifdef PGC_WITH_ARAL for(size_t part = 0; part < cache->config.partitions ; part++) @@ -2091,8 +2089,8 @@ void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_ } struct section_pages *sp = *section_pages_pptr; - if(!netdata_spinlock_trylock(&sp->migration_to_v2_spinlock)) { - info("DBENGINE: migration to journal v2 for datafile %u is postponed, another jv2 indexer is already running for this section", datafile_fileno); + if(!spinlock_trylock(&sp->migration_to_v2_spinlock)) { + netdata_log_info("DBENGINE: migration to journal v2 for datafile %u is postponed, another jv2 indexer is already running for this section", datafile_fileno); pgc_ll_unlock(cache, &cache->hot); return; } @@ -2205,7 +2203,7 @@ void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_ pgc_ll_lock(cache, &cache->hot); } - netdata_spinlock_unlock(&sp->migration_to_v2_spinlock); + spinlock_unlock(&sp->migration_to_v2_spinlock); pgc_ll_unlock(cache, &cache->hot); // callback @@ -2355,7 +2353,7 @@ void *unittest_stress_test_collector(void *ptr) { heartbeat_init(&hb); while(!__atomic_load_n(&pgc_uts.stop, __ATOMIC_RELAXED)) { - // info("COLLECTOR %zu: collecting metrics %zu to %zu, from %ld to %lu", id, metric_start, metric_end, start_time_t, start_time_t + pgc_uts.points_per_page); + // netdata_log_info("COLLECTOR %zu: collecting metrics %zu to %zu, from %ld to %lu", id, metric_start, metric_end, start_time_t, start_time_t + pgc_uts.points_per_page); netdata_thread_disable_cancelability(); @@ -2485,7 +2483,7 @@ void *unittest_stress_test_service(void *ptr) { } static void unittest_stress_test_save_dirty_page_callback(PGC *cache __maybe_unused, PGC_ENTRY *entries_array __maybe_unused, PGC_PAGE **pages_array __maybe_unused, size_t entries __maybe_unused) { - // info("SAVE %zu pages", entries); + // netdata_log_info("SAVE %zu pages", entries); if(!pgc_uts.stop) { usec_t t = pgc_uts.time_per_flush_ut; @@ -2625,7 +2623,7 @@ void unittest_stress_test(void) { if(stats.events_flush_critical > old_stats.events_flush_critical) flushing_status = "F"; - info("PGS %5zuk +%4zuk/-%4zuk " + netdata_log_info("PGS %5zuk +%4zuk/-%4zuk " "| RF %5zuk " "| HOT %5zuk +%4zuk -%4zuk " "| DRT %s %5zuk +%4zuk -%4zuk " @@ -2651,7 +2649,7 @@ void unittest_stress_test(void) { #endif ); } - info("Waiting for threads to stop..."); + netdata_log_info("Waiting for threads to stop..."); __atomic_store_n(&pgc_uts.stop, true, __ATOMIC_RELAXED); netdata_thread_join(service_thread, NULL); diff --git a/database/engine/cache.h b/database/engine/cache.h index 65e6a6137..1486fdc16 100644 --- a/database/engine/cache.h +++ b/database/engine/cache.h @@ -31,7 +31,7 @@ typedef struct pgc_entry { uint8_t *custom_data; } PGC_ENTRY; -#define PGC_CACHE_LINE_PADDING(x) uint8_t padding##x[128] +#define PGC_CACHE_LINE_PADDING(x) uint8_t padding##x[64] struct pgc_queue_statistics { size_t entries; diff --git a/database/engine/datafile.c b/database/engine/datafile.c index 8c413d8dc..d5c1285be 100644 --- a/database/engine/datafile.c +++ b/database/engine/datafile.c @@ -1,11 +1,15 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -void 
datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) +void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool having_lock) { - uv_rwlock_wrlock(&ctx->datafiles.rwlock); + if(!having_lock) + uv_rwlock_wrlock(&ctx->datafiles.rwlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ctx->datafiles.first, datafile, prev, next); - uv_rwlock_wrunlock(&ctx->datafiles.rwlock); + + if(!having_lock) + uv_rwlock_wrunlock(&ctx->datafiles.rwlock); } void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) @@ -27,9 +31,9 @@ static struct rrdengine_datafile *datafile_alloc_and_init(struct rrdengine_insta datafile->users.available = true; - netdata_spinlock_init(&datafile->users.spinlock); - netdata_spinlock_init(&datafile->writers.spinlock); - netdata_spinlock_init(&datafile->extent_queries.spinlock); + spinlock_init(&datafile->users.spinlock); + spinlock_init(&datafile->writers.spinlock); + spinlock_init(&datafile->extent_queries.spinlock); return datafile; } @@ -37,7 +41,7 @@ static struct rrdengine_datafile *datafile_alloc_and_init(struct rrdengine_insta bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { bool ret; - netdata_spinlock_lock(&df->users.spinlock); + spinlock_lock(&df->users.spinlock); if(df->users.available) { ret = true; @@ -47,25 +51,25 @@ bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS re else ret = false; - netdata_spinlock_unlock(&df->users.spinlock); + spinlock_unlock(&df->users.spinlock); return ret; } void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason) { - netdata_spinlock_lock(&df->users.spinlock); + spinlock_lock(&df->users.spinlock); if(!df->users.lockers) fatal("DBENGINE DATAFILE: cannot release a datafile that is not acquired"); df->users.lockers--; df->users.lockers_by_reason[reason]--; - netdata_spinlock_unlock(&df->users.spinlock); + spinlock_unlock(&df->users.spinlock); } bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { bool can_be_deleted = false; - netdata_spinlock_lock(&df->users.spinlock); + spinlock_lock(&df->users.spinlock); df->users.available = false; if(!df->users.lockers) @@ -75,9 +79,9 @@ bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { // there are lockers // evict any pages referencing this in the open cache - netdata_spinlock_unlock(&df->users.spinlock); + spinlock_unlock(&df->users.spinlock); pgc_open_evict_clean_pages_of_datafile(open_cache, df); - netdata_spinlock_lock(&df->users.spinlock); + spinlock_lock(&df->users.spinlock); if(!df->users.lockers) can_be_deleted = true; @@ -86,12 +90,12 @@ bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { // there are lockers still // count the number of pages referencing this in the open cache - netdata_spinlock_unlock(&df->users.spinlock); + spinlock_unlock(&df->users.spinlock); usec_t time_to_scan_ut = now_monotonic_usec(); size_t clean_pages_in_open_cache = pgc_count_clean_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df); size_t hot_pages_in_open_cache = pgc_count_hot_pages_having_data_ptr(open_cache, (Word_t)df->ctx, df); time_to_scan_ut = now_monotonic_usec() - time_to_scan_ut; - netdata_spinlock_lock(&df->users.spinlock); + spinlock_lock(&df->users.spinlock); if(!df->users.lockers) can_be_deleted = true; @@ -149,7 +153,7 @@ bool datafile_acquire_for_deletion(struct rrdengine_datafile *df) { time_to_scan_ut); } } - 
netdata_spinlock_unlock(&df->users.spinlock); + spinlock_unlock(&df->users.spinlock); return can_be_deleted; } @@ -171,7 +175,7 @@ int close_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_close(NULL, &req, datafile->file, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -190,7 +194,7 @@ int unlink_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -211,21 +215,21 @@ int destroy_data_file_unsafe(struct rrdengine_datafile *datafile) ret = uv_fs_ftruncate(NULL, &req, datafile->file, 0, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_close(NULL, &req, datafile->file, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -268,7 +272,7 @@ int create_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); - error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); ctx_io_error(ctx); } uv_fs_req_cleanup(&req); @@ -299,7 +303,7 @@ static int check_data_file_superblock(uv_file file) ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); uv_fs_req_cleanup(&req); goto error; } @@ -309,7 +313,7 @@ static int check_data_file_superblock(uv_file file) if (strncmp(superblock->magic_number, RRDENG_DF_MAGIC, RRDENG_MAGIC_SZ) || strncmp(superblock->version, RRDENG_DF_VER, RRDENG_VER_SZ) || superblock->tier != 1) { - error("DBENGINE: file has invalid superblock."); + netdata_log_error("DBENGINE: file has invalid superblock."); ret = UV_EINVAL; } else { ret = 0; @@ -334,7 +338,7 @@ static int load_data_file(struct rrdengine_datafile *datafile) ctx_fs_error(ctx); return fd; } - info("DBENGINE: initializing data file \"%s\".", path); + netdata_log_info("DBENGINE: initializing data file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); if (ret) @@ -350,14 +354,14 @@ static int load_data_file(struct rrdengine_datafile *datafile) datafile->file = file; datafile->pos = file_size; - info("DBENGINE: data file \"%s\" initialized (size:%"PRIu64").", path, file_size); + netdata_log_info("DBENGINE: data file \"%s\" initialized (size:%"PRIu64").", path, file_size); return 0; error: error = ret; ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, 
uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -390,11 +394,11 @@ static int scan_data_files(struct rrdengine_instance *ctx) if (ret < 0) { fatal_assert(req.result < 0); uv_fs_req_cleanup(&req); - error("DBENGINE: uv_fs_scandir(%s): %s", ctx->config.dbfiles_path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_scandir(%s): %s", ctx->config.dbfiles_path, uv_strerror(ret)); ctx_fs_error(ctx); return ret; } - info("DBENGINE: found %d files in path %s", ret, ctx->config.dbfiles_path); + netdata_log_info("DBENGINE: found %d files in path %s", ret, ctx->config.dbfiles_path); datafiles = callocz(MIN(ret, MAX_DATAFILES), sizeof(*datafiles)); for (matched_files = 0 ; UV_EOF != uv_fs_scandir_next(&req, &dent) && matched_files < MAX_DATAFILES ; ) { @@ -410,11 +414,12 @@ static int scan_data_files(struct rrdengine_instance *ctx) freez(datafiles); return 0; } - if (matched_files == MAX_DATAFILES) { - error("DBENGINE: warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); - } + + if (matched_files == MAX_DATAFILES) + netdata_log_error("DBENGINE: warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); + qsort(datafiles, matched_files, sizeof(*datafiles), scan_data_files_cmp); - /* TODO: change this when tiering is implemented */ + ctx->atomic.last_fileno = datafiles[matched_files - 1]->fileno; for (failed_to_load = 0, i = 0 ; i < matched_files ; ++i) { @@ -422,9 +427,9 @@ static int scan_data_files(struct rrdengine_instance *ctx) datafile = datafiles[i]; ret = load_data_file(datafile); - if (0 != ret) { + if (0 != ret) must_delete_pair = 1; - } + journalfile = journalfile_alloc_and_init(datafile); ret = journalfile_load(ctx, journalfile, datafile); if (0 != ret) { @@ -432,19 +437,20 @@ static int scan_data_files(struct rrdengine_instance *ctx) close_data_file(datafile); must_delete_pair = 1; } + if (must_delete_pair) { char path[RRDENG_PATH_MAX]; - error("DBENGINE: deleting invalid data and journal file pair."); + netdata_log_error("DBENGINE: deleting invalid data and journal file pair."); ret = journalfile_unlink(journalfile); if (!ret) { journalfile_v1_generate_path(datafile, path, sizeof(path)); - info("DBENGINE: deleted journal file \"%s\".", path); + netdata_log_info("DBENGINE: deleted journal file \"%s\".", path); } ret = unlink_data_file(datafile); if (!ret) { generate_datafilepath(datafile, path, sizeof(path)); - info("DBENGINE: deleted data file \"%s\".", path); + netdata_log_info("DBENGINE: deleted data file \"%s\".", path); } freez(journalfile); freez(datafile); @@ -453,8 +459,9 @@ static int scan_data_files(struct rrdengine_instance *ctx) } ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); - datafile_list_insert(ctx, datafile); + datafile_list_insert(ctx, datafile, false); } + matched_files -= failed_to_load; freez(datafiles); @@ -462,7 +469,7 @@ static int scan_data_files(struct rrdengine_instance *ctx) } /* Creates a datafile and a journalfile pair */ -int create_new_datafile_pair(struct rrdengine_instance *ctx) +int create_new_datafile_pair(struct rrdengine_instance *ctx, bool having_lock) { __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_creation_started, 1, __ATOMIC_RELAXED); @@ -472,14 +479,14 @@ int create_new_datafile_pair(struct rrdengine_instance *ctx) int ret; char path[RRDENG_PATH_MAX]; - info("DBENGINE: creating new data and journal files in path %s", ctx->config.dbfiles_path); + netdata_log_info("DBENGINE: creating new data and journal files in path %s", 
ctx->config.dbfiles_path); datafile = datafile_alloc_and_init(ctx, 1, fileno); ret = create_data_file(datafile); if(ret) goto error_after_datafile; generate_datafilepath(datafile, path, sizeof(path)); - info("DBENGINE: created data file \"%s\".", path); + netdata_log_info("DBENGINE: created data file \"%s\".", path); journalfile = journalfile_alloc_and_init(datafile); ret = journalfile_create(journalfile, datafile); @@ -487,10 +494,10 @@ int create_new_datafile_pair(struct rrdengine_instance *ctx) goto error_after_journalfile; journalfile_v1_generate_path(datafile, path, sizeof(path)); - info("DBENGINE: created journal file \"%s\".", path); + netdata_log_info("DBENGINE: created journal file \"%s\".", path); ctx_current_disk_space_increase(ctx, datafile->pos + journalfile->unsafe.pos); - datafile_list_insert(ctx, datafile); + datafile_list_insert(ctx, datafile, having_lock); ctx_last_fileno_increment(ctx); return 0; @@ -514,20 +521,20 @@ int init_data_files(struct rrdengine_instance *ctx) fatal_assert(0 == uv_rwlock_init(&ctx->datafiles.rwlock)); ret = scan_data_files(ctx); if (ret < 0) { - error("DBENGINE: failed to scan path \"%s\".", ctx->config.dbfiles_path); + netdata_log_error("DBENGINE: failed to scan path \"%s\".", ctx->config.dbfiles_path); return ret; } else if (0 == ret) { - info("DBENGINE: data files not found, creating in path \"%s\".", ctx->config.dbfiles_path); + netdata_log_info("DBENGINE: data files not found, creating in path \"%s\".", ctx->config.dbfiles_path); ctx->atomic.last_fileno = 0; - ret = create_new_datafile_pair(ctx); + ret = create_new_datafile_pair(ctx, false); if (ret) { - error("DBENGINE: failed to create data and journal files in path \"%s\".", ctx->config.dbfiles_path); + netdata_log_error("DBENGINE: failed to create data and journal files in path \"%s\".", ctx->config.dbfiles_path); return ret; } } else { if (ctx->loading.create_new_datafile_pair) - create_new_datafile_pair(ctx); + create_new_datafile_pair(ctx, false); while(rrdeng_ctx_exceeded_disk_quota(ctx)) datafile_delete(ctx, ctx->datafiles.first, false, false); @@ -545,7 +552,7 @@ void finalize_data_files(struct rrdengine_instance *ctx) logged = false; while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED)) { if(!logged) { - info("Waiting for inflight flush to finish on tier %d...", ctx->config.tier); + netdata_log_info("Waiting for inflight flush to finish on tier %d...", ctx->config.tier); logged = true; } sleep_usec(100 * USEC_PER_MS); @@ -559,7 +566,7 @@ void finalize_data_files(struct rrdengine_instance *ctx) size_t iterations = 100; while(!datafile_acquire_for_deletion(datafile) && datafile != ctx->datafiles.first->prev && --iterations > 0) { if(!logged) { - info("Waiting to acquire data file %u of tier %d to close it...", datafile->fileno, ctx->config.tier); + netdata_log_info("Waiting to acquire data file %u of tier %d to close it...", datafile->fileno, ctx->config.tier); logged = true; } sleep_usec(100 * USEC_PER_MS); @@ -569,14 +576,14 @@ void finalize_data_files(struct rrdengine_instance *ctx) bool available = false; do { uv_rwlock_wrlock(&ctx->datafiles.rwlock); - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? 
false : true; if(!available) { - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); uv_rwlock_wrunlock(&ctx->datafiles.rwlock); if(!logged) { - info("Waiting for writers to data file %u of tier %d to finish...", datafile->fileno, ctx->config.tier); + netdata_log_info("Waiting for writers to data file %u of tier %d to finish...", datafile->fileno, ctx->config.tier); logged = true; } sleep_usec(100 * USEC_PER_MS); @@ -586,7 +593,7 @@ void finalize_data_files(struct rrdengine_instance *ctx) journalfile_close(journalfile, datafile); close_data_file(datafile); datafile_list_delete_unsafe(ctx, datafile); - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); uv_rwlock_wrunlock(&ctx->datafiles.rwlock); freez(journalfile); diff --git a/database/engine/datafile.h b/database/engine/datafile.h index a08f3ae04..569f1b0a2 100644 --- a/database/engine/datafile.h +++ b/database/engine/datafile.h @@ -21,7 +21,7 @@ struct rrdengine_instance; #endif #define MIN_DATAFILE_SIZE (4LU * 1024LU * 1024LU) -#define MAX_DATAFILES (65536) /* Supports up to 64TiB for now */ +#define MAX_DATAFILES (65536 * 4) /* Supports up to 64TiB for now */ #define TARGET_DATAFILES (50) typedef enum __attribute__ ((__packed__)) { @@ -74,14 +74,14 @@ bool datafile_acquire(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS re void datafile_release(struct rrdengine_datafile *df, DATAFILE_ACQUIRE_REASONS reason); bool datafile_acquire_for_deletion(struct rrdengine_datafile *df); -void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); +void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, bool having_lock); void datafile_list_delete_unsafe(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); int close_data_file(struct rrdengine_datafile *datafile); int unlink_data_file(struct rrdengine_datafile *datafile); int destroy_data_file_unsafe(struct rrdengine_datafile *datafile); int create_data_file(struct rrdengine_datafile *datafile); -int create_new_datafile_pair(struct rrdengine_instance *ctx); +int create_new_datafile_pair(struct rrdengine_instance *ctx, bool having_lock); int init_data_files(struct rrdengine_instance *ctx); void finalize_data_files(struct rrdengine_instance *ctx); diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index 9998ee540..24d3c1c6d 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -1,57 +1,6 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" - -// DBENGINE2: Helper - -static void update_metric_retention_and_granularity_by_uuid( - struct rrdengine_instance *ctx, uuid_t *uuid, - time_t first_time_s, time_t last_time_s, - time_t update_every_s, time_t now_s) -{ - if(unlikely(last_time_s > now_s)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), " - "fixing last time to now", - first_time_s, last_time_s, now_s); - last_time_s = now_s; - } - - if (unlikely(first_time_s > last_time_s)) { - error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), " - "fixing first time to last time", - first_time_s, last_time_s, now_s); - - first_time_s = last_time_s; - } - - if (unlikely(first_time_s == 0 || last_time_s == 0)) { 
- error_limit_static_global_var(erl, 1, 0); - error_limit(&erl, "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), " - "using them as-is", - first_time_s, last_time_s, now_s); - } - - bool added = false; - METRIC *metric = mrg_metric_get_and_acquire(main_mrg, uuid, (Word_t) ctx); - if (!metric) { - MRG_ENTRY entry = { - .section = (Word_t) ctx, - .first_time_s = first_time_s, - .last_time_s = last_time_s, - .latest_update_every_s = (uint32_t) update_every_s - }; - uuid_copy(entry.uuid, *uuid); - metric = mrg_metric_add_and_acquire(main_mrg, entry, &added); - } - - if (likely(!added)) - mrg_metric_expand_retention(main_mrg, metric, first_time_s, last_time_s, update_every_s); - - mrg_metric_release(main_mrg, metric); -} - static void after_extent_write_journalfile_v1_io(uv_fs_t* req) { worker_is_busy(RRDENG_FLUSH_TRANSACTION_BUFFER_CB); @@ -60,12 +9,12 @@ static void after_extent_write_journalfile_v1_io(uv_fs_t* req) struct generic_io_descriptor *io_descr = &wal->io_descr; struct rrdengine_instance *ctx = io_descr->ctx; - debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); + netdata_log_debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); if (req->result < 0) { ctx_io_error(ctx); - error("DBENGINE: %s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); + netdata_log_error("DBENGINE: %s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); } else { - debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); + netdata_log_debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__); } uv_fs_req_cleanup(req); @@ -92,10 +41,10 @@ void journalfile_v1_extent_write(struct rrdengine_instance *ctx, struct rrdengin io_descr->buf = wal->buf; io_descr->bytes = wal->buf_size; - netdata_spinlock_lock(&journalfile->unsafe.spinlock); + spinlock_lock(&journalfile->unsafe.spinlock); io_descr->pos = journalfile->unsafe.pos; journalfile->unsafe.pos += wal->buf_size; - netdata_spinlock_unlock(&journalfile->unsafe.spinlock); + spinlock_unlock(&journalfile->unsafe.spinlock); io_descr->req.data = wal; io_descr->data = journalfile; @@ -122,10 +71,129 @@ void journalfile_v1_generate_path(struct rrdengine_datafile *datafile, char *str datafile->ctx->config.dbfiles_path, datafile->tier, datafile->fileno); } +// ---------------------------------------------------------------------------- + +struct rrdengine_datafile *njfv2idx_find_and_acquire_j2_header(NJFV2IDX_FIND_STATE *s) { + struct rrdengine_datafile *datafile = NULL; + + rw_spinlock_read_lock(&s->ctx->njfv2idx.spinlock); + + Pvoid_t *PValue = NULL; + + if(unlikely(!s->init)) { + s->init = true; + s->last = s->wanted_start_time_s; + + PValue = JudyLPrev(s->ctx->njfv2idx.JudyL, &s->last, PJE0); + if (unlikely(PValue == PJERR)) + fatal("DBENGINE: NJFV2IDX corrupted judy array"); + + if(!PValue) { + s->last = 0; + PValue = JudyLFirst(s->ctx->njfv2idx.JudyL, &s->last, PJE0); + if (unlikely(PValue == PJERR)) + fatal("DBENGINE: NJFV2IDX corrupted judy array"); + + if(!PValue) + s->last = s->wanted_start_time_s; + } + } + + while(1) { + if (likely(!PValue)) { + PValue = JudyLNext(s->ctx->njfv2idx.JudyL, &s->last, PJE0); + if (unlikely(PValue == PJERR)) + fatal("DBENGINE: NJFV2IDX corrupted judy array"); + + if(!PValue) { + // cannot find anything after that point + datafile = NULL; + break; + } + } + + datafile = *PValue; + TIME_RANGE_COMPARE rc = is_page_in_time_range(datafile->journalfile->v2.first_time_s, + datafile->journalfile->v2.last_time_s, + s->wanted_start_time_s, + 
s->wanted_end_time_s); + + if(rc == PAGE_IS_IN_RANGE) { + // this is good to return + break; + } + else if(rc == PAGE_IS_IN_THE_PAST) { + // continue to get the next + datafile = NULL; + PValue = NULL; + continue; + } + else /* PAGE_IS_IN_THE_FUTURE */ { + // we finished - no more datafiles + datafile = NULL; + PValue = NULL; + break; + } + } + + if(datafile) + s->j2_header_acquired = journalfile_v2_data_acquire(datafile->journalfile, NULL, + s->wanted_start_time_s, + s->wanted_end_time_s); + else + s->j2_header_acquired = NULL; + + rw_spinlock_read_unlock(&s->ctx->njfv2idx.spinlock); + + return datafile; +} + +static void njfv2idx_add(struct rrdengine_datafile *datafile) { + internal_fatal(datafile->journalfile->v2.last_time_s <= 0, "DBENGINE: NJFV2IDX trying to index a journal file with invalid first_time_s"); + + rw_spinlock_write_lock(&datafile->ctx->njfv2idx.spinlock); + datafile->journalfile->njfv2idx.indexed_as = datafile->journalfile->v2.last_time_s; + + do { + internal_fatal(datafile->journalfile->njfv2idx.indexed_as <= 0, "DBENGINE: NJFV2IDX journalfile is already indexed"); + + Pvoid_t *PValue = JudyLIns(&datafile->ctx->njfv2idx.JudyL, datafile->journalfile->njfv2idx.indexed_as, PJE0); + if (!PValue || PValue == PJERR) + fatal("DBENGINE: NJFV2IDX corrupted judy array"); + + if (unlikely(*PValue)) { + // already there + datafile->journalfile->njfv2idx.indexed_as++; + } + else { + *PValue = datafile; + break; + } + } while(0); + + rw_spinlock_write_unlock(&datafile->ctx->njfv2idx.spinlock); +} + +static void njfv2idx_remove(struct rrdengine_datafile *datafile) { + internal_fatal(!datafile->journalfile->njfv2idx.indexed_as, "DBENGINE: NJFV2IDX journalfile to remove is not indexed"); + + rw_spinlock_write_lock(&datafile->ctx->njfv2idx.spinlock); + + int rc = JudyLDel(&datafile->ctx->njfv2idx.JudyL, datafile->journalfile->njfv2idx.indexed_as, PJE0); + (void)rc; + internal_fatal(!rc, "DBENGINE: NJFV2IDX cannot remove entry"); + + datafile->journalfile->njfv2idx.indexed_as = 0; + + rw_spinlock_write_unlock(&datafile->ctx->njfv2idx.spinlock); +} + +// ---------------------------------------------------------------------------- + static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengine_journalfile *journalfile, size_t *data_size) { struct journal_v2_header *j2_header = NULL; - netdata_spinlock_lock(&journalfile->mmap.spinlock); + spinlock_lock(&journalfile->mmap.spinlock); if(!journalfile->mmap.data) { journalfile->mmap.data = mmap(NULL, journalfile->mmap.size, PROT_READ, MAP_SHARED, journalfile->mmap.fd, 0); @@ -136,9 +204,9 @@ static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengin journalfile->mmap.data = NULL; journalfile->mmap.size = 0; - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->v2.spinlock); journalfile->v2.flags &= ~(JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED); - netdata_spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); ctx_fs_error(journalfile->datafile->ctx); } @@ -147,12 +215,21 @@ static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengin madvise_dontfork(journalfile->mmap.data, journalfile->mmap.size); madvise_dontdump(journalfile->mmap.data, journalfile->mmap.size); - madvise_random(journalfile->mmap.data, journalfile->mmap.size); - madvise_dontneed(journalfile->mmap.data, journalfile->mmap.size); - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->v2.spinlock); 
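/*
 * Annotation (not part of the commit): the mount path below becomes
 * intent-aware. The old code always applied madvise_random() plus
 * madvise_dontneed() to a freshly mmap()ed journal; now a mount flagged
 * JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION asks the kernel to prefetch the
 * whole metric directory (v2.size_of_directory bytes) via madvise_willneed(),
 * while query mounts keep random-access advice only. A minimal sketch of the
 * same decision with plain madvise(2); mounted_for_retention is an
 * illustrative parameter name, not upstream's:
 *
 *     #include <sys/mman.h>
 *
 *     static void advise_mapping(void *map, size_t map_size,
 *                                size_t directory_size,
 *                                int mounted_for_retention) {
 *         if (mounted_for_retention)
 *             madvise(map, directory_size, MADV_WILLNEED); // prefetch the metric directory
 *         else
 *             madvise(map, map_size, MADV_RANDOM);         // no read-ahead for point queries
 *     }
 */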
journalfile->v2.flags |= JOURNALFILE_FLAG_IS_AVAILABLE | JOURNALFILE_FLAG_IS_MOUNTED; - netdata_spinlock_unlock(&journalfile->v2.spinlock); + JOURNALFILE_FLAGS flags = journalfile->v2.flags; + spinlock_unlock(&journalfile->v2.spinlock); + + if(flags & JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION) { + // we need the entire metrics directory into memory to process it + madvise_willneed(journalfile->mmap.data, journalfile->v2.size_of_directory); + } + else { + // let the kernel know that we don't want read-ahead on this file + madvise_random(journalfile->mmap.data, journalfile->mmap.size); + // madvise_dontneed(journalfile->mmap.data, journalfile->mmap.size); + } } } @@ -163,7 +240,7 @@ static struct journal_v2_header *journalfile_v2_mounted_data_get(struct rrdengin *data_size = journalfile->mmap.size; } - netdata_spinlock_unlock(&journalfile->mmap.spinlock); + spinlock_unlock(&journalfile->mmap.spinlock); return j2_header; } @@ -173,20 +250,20 @@ static bool journalfile_v2_mounted_data_unmount(struct rrdengine_journalfile *jo if(!have_locks) { if(!wait) { - if (!netdata_spinlock_trylock(&journalfile->mmap.spinlock)) + if (!spinlock_trylock(&journalfile->mmap.spinlock)) return false; } else - netdata_spinlock_lock(&journalfile->mmap.spinlock); + spinlock_lock(&journalfile->mmap.spinlock); if(!wait) { - if(!netdata_spinlock_trylock(&journalfile->v2.spinlock)) { - netdata_spinlock_unlock(&journalfile->mmap.spinlock); + if(!spinlock_trylock(&journalfile->v2.spinlock)) { + spinlock_unlock(&journalfile->mmap.spinlock); return false; } } else - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->v2.spinlock); } if(!journalfile->v2.refcount) { @@ -194,7 +271,7 @@ static bool journalfile_v2_mounted_data_unmount(struct rrdengine_journalfile *jo if (munmap(journalfile->mmap.data, journalfile->mmap.size)) { char path[RRDENG_PATH_MAX]; journalfile_v2_generate_path(journalfile->datafile, path, sizeof(path)); - error("DBENGINE: failed to unmap index file '%s'", path); + netdata_log_error("DBENGINE: failed to unmap index file '%s'", path); internal_fatal(true, "DBENGINE: failed to unmap file '%s'", path); ctx_fs_error(journalfile->datafile->ctx); } @@ -209,8 +286,8 @@ static bool journalfile_v2_mounted_data_unmount(struct rrdengine_journalfile *jo } if(!have_locks) { - netdata_spinlock_unlock(&journalfile->v2.spinlock); - netdata_spinlock_unlock(&journalfile->mmap.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->mmap.spinlock); } return unmounted; @@ -230,7 +307,7 @@ void journalfile_v2_data_unmount_cleanup(time_t now_s) { for (datafile = ctx->datafiles.first; datafile; datafile = datafile->next) { struct rrdengine_journalfile *journalfile = datafile->journalfile; - if(!netdata_spinlock_trylock(&journalfile->v2.spinlock)) + if(!spinlock_trylock(&journalfile->v2.spinlock)) continue; bool unmount = false; @@ -244,7 +321,7 @@ void journalfile_v2_data_unmount_cleanup(time_t now_s) { // 2 minutes have passed since last use unmount = true; } - netdata_spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); if (unmount) journalfile_v2_mounted_data_unmount(journalfile, false, false); @@ -254,7 +331,7 @@ void journalfile_v2_data_unmount_cleanup(time_t now_s) { } struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfile *journalfile, size_t *data_size, time_t wanted_first_time_s, time_t wanted_last_time_s) { - netdata_spinlock_lock(&journalfile->v2.spinlock); + 
spinlock_lock(&journalfile->v2.spinlock); bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE); bool is_mounted = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_MOUNTED); @@ -276,7 +353,7 @@ struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfi } } - netdata_spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); if(do_we_need_it) return journalfile_v2_mounted_data_get(journalfile, data_size); @@ -285,7 +362,7 @@ struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfi } void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile) { - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->v2.spinlock); internal_fatal(!journalfile->mmap.data, "trying to release a journalfile without data"); internal_fatal(journalfile->v2.refcount < 1, "trying to release a non-acquired journalfile"); @@ -300,7 +377,7 @@ void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile) { if(journalfile->v2.flags & JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION) unmount = true; } - netdata_spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); if(unmount) journalfile_v2_mounted_data_unmount(journalfile, false, true); @@ -308,25 +385,25 @@ void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile) { bool journalfile_v2_data_available(struct rrdengine_journalfile *journalfile) { - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->v2.spinlock); bool has_data = (journalfile->v2.flags & JOURNALFILE_FLAG_IS_AVAILABLE); - netdata_spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); return has_data; } size_t journalfile_v2_data_size_get(struct rrdengine_journalfile *journalfile) { - netdata_spinlock_lock(&journalfile->mmap.spinlock); + spinlock_lock(&journalfile->mmap.spinlock); size_t data_size = journalfile->mmap.size; - netdata_spinlock_unlock(&journalfile->mmap.spinlock); + spinlock_unlock(&journalfile->mmap.spinlock); return data_size; } void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, void *journal_data, uint32_t journal_data_size) { - netdata_spinlock_lock(&journalfile->mmap.spinlock); - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->mmap.spinlock); + spinlock_lock(&journalfile->v2.spinlock); internal_fatal(journalfile->mmap.fd != -1, "DBENGINE JOURNALFILE: trying to re-set journal fd"); internal_fatal(journalfile->mmap.data, "DBENGINE JOURNALFILE: trying to re-set journal_data"); @@ -341,22 +418,27 @@ void journalfile_v2_data_set(struct rrdengine_journalfile *journalfile, int fd, struct journal_v2_header *j2_header = journalfile->mmap.data; journalfile->v2.first_time_s = (time_t)(j2_header->start_time_ut / USEC_PER_SEC); journalfile->v2.last_time_s = (time_t)(j2_header->end_time_ut / USEC_PER_SEC); + journalfile->v2.size_of_directory = j2_header->metric_offset + j2_header->metric_count * sizeof(struct journal_metric_list); journalfile_v2_mounted_data_unmount(journalfile, true, true); - netdata_spinlock_unlock(&journalfile->v2.spinlock); - netdata_spinlock_unlock(&journalfile->mmap.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->mmap.spinlock); + + njfv2idx_add(journalfile->datafile); } static void journalfile_v2_data_unmap_permanently(struct rrdengine_journalfile *journalfile) { + njfv2idx_remove(journalfile->datafile); + bool has_references = 
false; do { if (has_references) sleep_usec(10 * USEC_PER_MS); - netdata_spinlock_lock(&journalfile->mmap.spinlock); - netdata_spinlock_lock(&journalfile->v2.spinlock); + spinlock_lock(&journalfile->mmap.spinlock); + spinlock_lock(&journalfile->v2.spinlock); if(journalfile_v2_mounted_data_unmount(journalfile, true, true)) { if(journalfile->mmap.fd != -1) @@ -374,8 +456,8 @@ static void journalfile_v2_data_unmap_permanently(struct rrdengine_journalfile * internal_error(true, "DBENGINE JOURNALFILE: waiting for journalfile to be available to unmap..."); } - netdata_spinlock_unlock(&journalfile->v2.spinlock); - netdata_spinlock_unlock(&journalfile->mmap.spinlock); + spinlock_unlock(&journalfile->v2.spinlock); + spinlock_unlock(&journalfile->mmap.spinlock); } while(has_references); } @@ -384,9 +466,9 @@ struct rrdengine_journalfile *journalfile_alloc_and_init(struct rrdengine_datafi { struct rrdengine_journalfile *journalfile = callocz(1, sizeof(struct rrdengine_journalfile)); journalfile->datafile = datafile; - netdata_spinlock_init(&journalfile->mmap.spinlock); - netdata_spinlock_init(&journalfile->v2.spinlock); - netdata_spinlock_init(&journalfile->unsafe.spinlock); + spinlock_init(&journalfile->mmap.spinlock); + spinlock_init(&journalfile->v2.spinlock); + spinlock_init(&journalfile->unsafe.spinlock); journalfile->mmap.fd = -1; datafile->journalfile = journalfile; return journalfile; @@ -401,7 +483,7 @@ static int close_uv_file(struct rrdengine_datafile *datafile, uv_file file) ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { journalfile_v1_generate_path(datafile, path, sizeof(path)); - error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); ctx_fs_error(datafile->ctx); } uv_fs_req_cleanup(&req); @@ -430,7 +512,7 @@ int journalfile_unlink(struct rrdengine_journalfile *journalfile) ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -454,7 +536,7 @@ int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct if (journalfile->file) { ret = uv_fs_ftruncate(NULL, &req, journalfile->file, 0, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -464,14 +546,14 @@ int journalfile_destroy_unsafe(struct rrdengine_journalfile *journalfile, struct // This is the new journal v2 index file ret = uv_fs_unlink(NULL, &req, path_v2, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); ret = uv_fs_unlink(NULL, &req, path, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); @@ -516,7 +598,7 @@ int journalfile_create(struct rrdengine_journalfile *journalfile, struct rrdengi ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { fatal_assert(req.result < 0); - error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_write: %s", uv_strerror(ret)); 
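/*
 * Annotation (not part of the commit): beyond the mechanical renames that
 * dominate these hunks (error() -> netdata_log_error(), info() ->
 * netdata_log_info(), netdata_spinlock_* -> spinlock_*), the functional
 * change above is in journalfile_v2_data_unmap_permanently(): it now calls
 * njfv2idx_remove() first, so queries stop finding the datafile before the
 * function loops waiting for v2.refcount to drain, sleeping 10ms per round.
 * The wait-for-zero-references idiom, as a standalone sketch with POSIX
 * threads (the resource names are illustrative):
 *
 *     #include <pthread.h>
 *     #include <stdbool.h>
 *     #include <unistd.h>
 *
 *     struct resource {
 *         pthread_mutex_t lock;
 *         int refcount;
 *         bool mapped;
 *     };
 *
 *     static void teardown_when_unreferenced(struct resource *r) {
 *         bool busy;
 *         do {
 *             pthread_mutex_lock(&r->lock);
 *             busy = (r->refcount > 0);      // readers still hold it
 *             if (!busy)
 *                 r->mapped = false;         // safe point to unmap/close
 *             pthread_mutex_unlock(&r->lock);
 *             if (busy)
 *                 usleep(10 * 1000);         // 10ms, like sleep_usec(10 * USEC_PER_MS)
 *         } while (busy);
 *     }
 */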
ctx_io_error(ctx); } uv_fs_req_cleanup(&req); @@ -548,7 +630,7 @@ static int journalfile_check_superblock(uv_file file) ret = uv_fs_read(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_read: %s", uv_strerror(ret)); uv_fs_req_cleanup(&req); goto error; } @@ -557,7 +639,7 @@ static int journalfile_check_superblock(uv_file file) if (strncmp(superblock->magic_number, RRDENG_JF_MAGIC, RRDENG_MAGIC_SZ) || strncmp(superblock->version, RRDENG_JF_VER, RRDENG_VER_SZ)) { - error("DBENGINE: File has invalid superblock."); + netdata_log_error("DBENGINE: File has invalid superblock."); ret = UV_EINVAL; } else { ret = 0; @@ -569,7 +651,7 @@ static int journalfile_check_superblock(uv_file file) static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, void *buf, unsigned max_size) { - static BITMAP256 page_error_map; + static BITMAP256 page_error_map = BITMAP256_INITIALIZER; unsigned i, count, payload_length, descr_size; struct rrdeng_jf_store_data *jf_metric_data; @@ -578,7 +660,7 @@ static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, descr_size = sizeof(*jf_metric_data->descr) * count; payload_length = sizeof(*jf_metric_data) + descr_size; if (payload_length > max_size) { - error("DBENGINE: corrupted transaction payload."); + netdata_log_error("DBENGINE: corrupted transaction payload."); return; } @@ -589,7 +671,7 @@ static void journalfile_restore_extent_metadata(struct rrdengine_instance *ctx, if (page_type > PAGE_TYPE_MAX) { if (!bitmap256_get_bit(&page_error_map, page_type)) { - error("DBENGINE: unknown page type %d encountered.", page_type); + netdata_log_error("DBENGINE: unknown page type %d encountered.", page_type); bitmap256_set_bit(&page_error_map, page_type, 1); } continue; @@ -658,36 +740,36 @@ static unsigned journalfile_replay_transaction(struct rrdengine_instance *ctx, s *id = 0; jf_header = buf; if (STORE_PADDING == jf_header->type) { - debug(D_RRDENGINE, "Skipping padding."); + netdata_log_debug(D_RRDENGINE, "Skipping padding."); return 0; } if (sizeof(*jf_header) > max_size) { - error("DBENGINE: corrupted transaction record, skipping."); + netdata_log_error("DBENGINE: corrupted transaction record, skipping."); return 0; } *id = jf_header->id; payload_length = jf_header->payload_length; size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer); if (size_bytes > max_size) { - error("DBENGINE: corrupted transaction record, skipping."); + netdata_log_error("DBENGINE: corrupted transaction record, skipping."); return 0; } jf_trailer = buf + sizeof(*jf_header) + payload_length; crc = crc32(0L, Z_NULL, 0); crc = crc32(crc, buf, sizeof(*jf_header) + payload_length); ret = crc32cmp(jf_trailer->checksum, crc); - debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED"); + netdata_log_debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED"); if (unlikely(ret)) { - error("DBENGINE: transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id); + netdata_log_error("DBENGINE: transaction %"PRIu64" was read from disk. 
CRC32 check: FAILED", *id); return size_bytes; } switch (jf_header->type) { case STORE_DATA: - debug(D_RRDENGINE, "Replaying transaction %"PRIu64"", jf_header->id); + netdata_log_debug(D_RRDENGINE, "Replaying transaction %"PRIu64"", jf_header->id); journalfile_restore_extent_metadata(ctx, journalfile, buf + sizeof(*jf_header), payload_length); break; default: - error("DBENGINE: unknown transaction type, skipping record."); + netdata_log_error("DBENGINE: unknown transaction type, skipping record."); break; } @@ -725,7 +807,7 @@ static uint64_t journalfile_iterate_transactions(struct rrdengine_instance *ctx, iov = uv_buf_init(buf, size_bytes); ret = uv_fs_read(NULL, &req, file, &iov, 1, pos, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_read: pos=%" PRIu64 ", %s", pos, uv_strerror(ret)); uv_fs_req_cleanup(&req); goto skip_file; } @@ -764,7 +846,7 @@ static int journalfile_check_v2_extent_list (void *data_start, size_t file_size) crc = crc32(0L, Z_NULL, 0); crc = crc32(crc, (uint8_t *) data_start + j2_header->extent_offset, j2_header->extent_count * sizeof(struct journal_extent_list)); if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) { - error("DBENGINE: extent list CRC32 check: FAILED"); + netdata_log_error("DBENGINE: extent list CRC32 check: FAILED"); return 1; } @@ -784,7 +866,7 @@ static int journalfile_check_v2_metric_list(void *data_start, size_t file_size) crc = crc32(0L, Z_NULL, 0); crc = crc32(crc, (uint8_t *) data_start + j2_header->metric_offset, j2_header->metric_count * sizeof(struct journal_metric_list)); if (unlikely(crc32cmp(journal_v2_trailer->checksum, crc))) { - error("DBENGINE: metric list CRC32 check: FAILED"); + netdata_log_error("DBENGINE: metric list CRC32 check: FAILED"); return 1; } return 0; @@ -828,19 +910,19 @@ static int journalfile_v2_validate(void *data_start, size_t journal_v2_file_size rc = crc32cmp(journal_v2_trailer->checksum, crc); if (unlikely(rc)) { - error("DBENGINE: file CRC32 check: FAILED"); + netdata_log_error("DBENGINE: file CRC32 check: FAILED"); return 1; } rc = journalfile_check_v2_extent_list(data_start, journal_v2_file_size); if (rc) return 1; - rc = journalfile_check_v2_metric_list(data_start, journal_v2_file_size); - if (rc) return 1; - if (!db_engine_journal_check) return 0; + rc = journalfile_check_v2_metric_list(data_start, journal_v2_file_size); + if (rc) return 1; + // Verify complete UUID chain struct journal_metric_list *metric = (void *) (data_start + j2_header->metric_offset); @@ -849,7 +931,7 @@ static int journalfile_v2_validate(void *data_start, size_t journal_v2_file_size unsigned entries; unsigned total_pages = 0; - info("DBENGINE: checking %u metrics that exist in the journal", j2_header->metric_count); + netdata_log_info("DBENGINE: checking %u metrics that exist in the journal", j2_header->metric_count); for (entries = 0; entries < j2_header->metric_count; entries++) { char uuid_str[UUID_STR_LEN]; @@ -880,16 +962,16 @@ static int journalfile_v2_validate(void *data_start, size_t journal_v2_file_size metric++; if ((uint32_t)((uint8_t *) metric - (uint8_t *) data_start) > (uint32_t) journal_v2_file_size) { - info("DBENGINE: verification failed EOF reached -- total entries %u, verified %u", entries, verified); + netdata_log_info("DBENGINE: verification failed EOF reached -- total entries %u, verified %u", entries, verified); return 1; } } if (entries != verified) { - info("DBENGINE: verification failed -- total entries %u, verified 
%u", entries, verified); + netdata_log_info("DBENGINE: verification failed -- total entries %u, verified %u", entries, verified); return 1; } - info("DBENGINE: verification succeeded -- total entries %u, verified %u (%u total pages)", entries, verified, total_pages); + netdata_log_info("DBENGINE: verification succeeded -- total entries %u, verified %u (%u total pages)", entries, verified, total_pages); return 0; } @@ -905,15 +987,25 @@ void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, st uint8_t *data_start = (uint8_t *)j2_header; uint32_t entries = j2_header->metric_count; + if (journalfile->v2.flags & JOURNALFILE_FLAG_METRIC_CRC_CHECK) { + journalfile->v2.flags &= ~JOURNALFILE_FLAG_METRIC_CRC_CHECK; + if (journalfile_check_v2_metric_list(data_start, j2_header->journal_v2_file_size)) { + journalfile->v2.flags &= ~JOURNALFILE_FLAG_IS_AVAILABLE; + // needs rebuild + return; + } + } + struct journal_metric_list *metric = (struct journal_metric_list *) (data_start + j2_header->metric_offset); time_t header_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + time_t global_first_time_s = header_start_time_s; time_t now_s = max_acceptable_collected_time(); for (size_t i=0; i < entries; i++) { time_t start_time_s = header_start_time_s + metric->delta_start_s; time_t end_time_s = header_start_time_s + metric->delta_end_s; - update_metric_retention_and_granularity_by_uuid( - ctx, &metric->uuid, start_time_s, end_time_s, (time_t) metric->update_every_s, now_s); + mrg_update_metric_retention_and_granularity_by_uuid( + main_mrg, (Word_t)ctx, &metric->uuid, start_time_s, end_time_s, (time_t) metric->update_every_s, now_s); metric++; } @@ -921,12 +1013,18 @@ void journalfile_v2_populate_retention_to_mrg(struct rrdengine_instance *ctx, st journalfile_v2_data_release(journalfile); usec_t ended_ut = now_monotonic_usec(); - info("DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms" + netdata_log_info("DBENGINE: journal v2 of tier %d, datafile %u populated, size: %0.2f MiB, metrics: %0.2f k, %0.2f ms" , ctx->config.tier, journalfile->datafile->fileno , (double)data_size / 1024 / 1024 , (double)entries / 1000 , ((double)(ended_ut - started_ut) / USEC_PER_MS) ); + + time_t old = __atomic_load_n(&ctx->atomic.first_time_s, __ATOMIC_RELAXED);; + do { + if(old <= global_first_time_s) + break; + } while(!__atomic_compare_exchange_n(&ctx->atomic.first_time_s, &old, global_first_time_s, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); } int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) @@ -949,13 +1047,13 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal if (errno == ENOENT) return 1; ctx_fs_error(ctx); - error("DBENGINE: failed to open '%s'", path_v2); + netdata_log_error("DBENGINE: failed to open '%s'", path_v2); return 1; } ret = fstat(fd, &statbuf); if (ret) { - error("DBENGINE: failed to get file information for '%s'", path_v2); + netdata_log_error("DBENGINE: failed to get file information for '%s'", path_v2); close(fd); return 1; } @@ -975,7 +1073,7 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal return 1; } - info("DBENGINE: checking integrity of '%s'", path_v2); + netdata_log_info("DBENGINE: checking integrity of '%s'", path_v2); usec_t validation_start_ut = now_monotonic_usec(); int rc = journalfile_v2_validate(data_start, journal_v2_file_size, journal_v1_file_size); 
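/*
 * Annotation (not part of the commit): journalfile_v2_populate_retention_to_mrg()
 * above now feeds retention through mrg_update_metric_retention_and_granularity_by_uuid()
 * (replacing the local helper deleted at the top of this file) and records the
 * file's earliest timestamp, publishing it into ctx->atomic.first_time_s with
 * a lock-free compare-and-swap loop: reload on failure, bail out once the
 * stored value is already <= the candidate. The same idiom as a standalone
 * sketch with GCC __atomic builtins (atomic_store_min is an illustrative name):
 *
 *     #include <stdint.h>
 *
 *     static void atomic_store_min(int64_t *target, int64_t candidate) {
 *         int64_t old = __atomic_load_n(target, __ATOMIC_RELAXED);
 *         do {
 *             if (old <= candidate)
 *                 return;  // an equal or earlier time is already published
 *         } while (!__atomic_compare_exchange_n(target, &old, candidate, false,
 *                                               __ATOMIC_RELAXED, __ATOMIC_RELAXED));
 *     }
 */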
if (unlikely(rc)) { @@ -987,7 +1085,7 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal error_report("File %s is invalid and it will be rebuilt", path_v2); if (unlikely(munmap(data_start, journal_v2_file_size))) - error("DBENGINE: failed to unmap '%s'", path_v2); + netdata_log_error("DBENGINE: failed to unmap '%s'", path_v2); close(fd); return rc; @@ -998,7 +1096,7 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal if (unlikely(!entries)) { if (unlikely(munmap(data_start, journal_v2_file_size))) - error("DBENGINE: failed to unmap '%s'", path_v2); + netdata_log_error("DBENGINE: failed to unmap '%s'", path_v2); close(fd); return 1; @@ -1006,7 +1104,7 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal usec_t finished_ut = now_monotonic_usec(); - info("DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, " + netdata_log_info("DBENGINE: journal v2 '%s' loaded, size: %0.2f MiB, metrics: %0.2f k, " "mmap: %0.2f ms, validate: %0.2f ms" , path_v2 , (double)journal_v2_file_size / 1024 / 1024 @@ -1016,6 +1114,9 @@ int journalfile_v2_load(struct rrdengine_instance *ctx, struct rrdengine_journal ); // Initialize the journal file to be able to access the data + + if (!db_engine_journal_check) + journalfile->v2.flags |= JOURNALFILE_FLAG_METRIC_CRC_CHECK; journalfile_v2_data_set(journalfile, fd, data_start, journal_v2_file_size); ctx_current_disk_space_increase(ctx, journal_v2_file_size); @@ -1179,7 +1280,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno journalfile_v2_generate_path(datafile, path, sizeof(path)); - info("DBENGINE: indexing file '%s': extents %zu, metrics %zu, pages %zu", + netdata_log_info("DBENGINE: indexing file '%s': extents %zu, metrics %zu, pages %zu", path, number_of_extents, number_of_metrics, @@ -1350,7 +1451,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno internal_error(true, "DBENGINE: FILE COMPLETED --------> %llu", (now_monotonic_usec() - start_loading) / USEC_PER_MS); - info("DBENGINE: migrated journal file '%s', file size %zu", path, total_file_size); + netdata_log_info("DBENGINE: migrated journal file '%s', file size %zu", path, total_file_size); // msync(data_start, total_file_size, MS_SYNC); journalfile_v2_data_set(journalfile, fd_v2, data_start, total_file_size); @@ -1361,7 +1462,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno return; } else { - info("DBENGINE: failed to build index '%s', file will be skipped", path); + netdata_log_info("DBENGINE: failed to build index '%s', file will be skipped", path); j2_header.data = NULL; j2_header.magic = JOURVAL_V2_SKIP_MAGIC; memcpy(data_start, &j2_header, sizeof(j2_header)); @@ -1378,7 +1479,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno if (ret < 0) { ctx_current_disk_space_increase(ctx, total_file_size); ctx_fs_error(ctx); - error("DBENGINE: failed to resize file '%s'", path); + netdata_log_error("DBENGINE: failed to resize file '%s'", path); } else ctx_current_disk_space_increase(ctx, resize_file_to); @@ -1428,19 +1529,19 @@ int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfil ret = journalfile_check_superblock(file); if (ret) { - info("DBENGINE: invalid journal file '%s' ; superblock check failed.", path); + netdata_log_info("DBENGINE: invalid journal file '%s' ; superblock check failed.", path); error = ret; goto cleanup; } 
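/*
 * Annotation (not part of the commit): taken together, three hunks above move
 * the journal v2 metric-list CRC off the startup path. With
 * db_engine_journal_check disabled, journalfile_v2_validate() now returns
 * before the metric-list pass, journalfile_v2_load() raises
 * JOURNALFILE_FLAG_METRIC_CRC_CHECK, and
 * journalfile_v2_populate_retention_to_mrg() runs the check once on first
 * use, clearing JOURNALFILE_FLAG_IS_AVAILABLE (so the file gets rebuilt) if
 * it fails. The deferred-check flag idiom, sketched with illustrative names:
 *
 *     #define RES_CRC_PENDING (1u << 0)
 *
 *     // returns 0 when usable, -1 when the deferred check failed
 *     static int resource_first_use(unsigned *flags, int (*crc_check)(void)) {
 *         if (*flags & RES_CRC_PENDING) {
 *             *flags &= ~RES_CRC_PENDING;   // run the check exactly once
 *             if (crc_check())
 *                 return -1;                // corrupted: caller must rebuild
 *         }
 *         return 0;
 *     }
 */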
ctx_io_read_op_bytes(ctx, sizeof(struct rrdeng_jf_sb)); - info("DBENGINE: loading journal file '%s'", path); + netdata_log_info("DBENGINE: loading journal file '%s'", path); max_id = journalfile_iterate_transactions(ctx, journalfile); __atomic_store_n(&ctx->atomic.transaction_id, MAX(__atomic_load_n(&ctx->atomic.transaction_id, __ATOMIC_RELAXED), max_id + 1), __ATOMIC_RELAXED); - info("DBENGINE: journal file '%s' loaded (size:%"PRIu64").", path, file_size); + netdata_log_info("DBENGINE: journal file '%s' loaded (size:%"PRIu64").", path, file_size); bool is_last_file = (ctx_last_fileno_get(ctx) == journalfile->datafile->fileno); if (is_last_file && journalfile->datafile->pos <= rrdeng_target_data_file_size(ctx) / 3) { @@ -1459,7 +1560,7 @@ int journalfile_load(struct rrdengine_instance *ctx, struct rrdengine_journalfil cleanup: ret = uv_fs_close(NULL, &req, file, NULL); if (ret < 0) { - error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_fs_close(%s): %s", path, uv_strerror(ret)); ctx_fs_error(ctx); } uv_fs_req_cleanup(&req); diff --git a/database/engine/journalfile.h b/database/engine/journalfile.h index f6be6bcd9..5cdf72b9d 100644 --- a/database/engine/journalfile.h +++ b/database/engine/journalfile.h @@ -21,6 +21,7 @@ typedef enum __attribute__ ((__packed__)) { JOURNALFILE_FLAG_IS_AVAILABLE = (1 << 0), JOURNALFILE_FLAG_IS_MOUNTED = (1 << 1), JOURNALFILE_FLAG_MOUNTED_FOR_RETENTION = (1 << 2), + JOURNALFILE_FLAG_METRIC_CRC_CHECK = (1 << 3), } JOURNALFILE_FLAGS; /* only one event loop is supported for now */ @@ -39,9 +40,14 @@ struct rrdengine_journalfile { time_t first_time_s; time_t last_time_s; time_t not_needed_since_s; + uint32_t size_of_directory; } v2; struct { + Word_t indexed_as; + } njfv2idx; + + struct { SPINLOCK spinlock; uint64_t pos; } unsafe; @@ -51,9 +57,9 @@ struct rrdengine_journalfile { }; static inline uint64_t journalfile_current_size(struct rrdengine_journalfile *journalfile) { - netdata_spinlock_lock(&journalfile->unsafe.spinlock); + spinlock_lock(&journalfile->unsafe.spinlock); uint64_t size = journalfile->unsafe.pos; - netdata_spinlock_unlock(&journalfile->unsafe.spinlock); + spinlock_unlock(&journalfile->unsafe.spinlock); return size; } @@ -157,4 +163,15 @@ struct journal_v2_header *journalfile_v2_data_acquire(struct rrdengine_journalfi void journalfile_v2_data_release(struct rrdengine_journalfile *journalfile); void journalfile_v2_data_unmount_cleanup(time_t now_s); +typedef struct { + bool init; + Word_t last; + time_t wanted_start_time_s; + time_t wanted_end_time_s; + struct rrdengine_instance *ctx; + struct journal_v2_header *j2_header_acquired; +} NJFV2IDX_FIND_STATE; + +struct rrdengine_datafile *njfv2idx_find_and_acquire_j2_header(NJFV2IDX_FIND_STATE *s); + #endif /* NETDATA_JOURNALFILE_H */
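The NJFV2IDX_FIND_STATE just added to journalfile.h is a resumable cursor over the per-context JudyL that njfv2idx_add() maintains, keyed by each journal's v2.last_time_s with the key bumped by one second on collision (note that the enclosing do { } while(0) ends the loop right after such a bump, without retrying the insert, so a same-second journal appears to be left out of the index). Callers zero the state, fill in ctx and the wanted time window, then call njfv2idx_find_and_acquire_j2_header() until it returns NULL; each hit arrives with j2_header_acquired already holding a reference that must be dropped via journalfile_v2_data_release(). A hedged sketch of the presumed call pattern (the actual consumers live in the page-discovery code, outside this diff):

    NJFV2IDX_FIND_STATE s = {
        .ctx = ctx,                       /* the tier being queried */
        .wanted_start_time_s = after,
        .wanted_end_time_s = before,
    };

    struct rrdengine_datafile *df;
    while ((df = njfv2idx_find_and_acquire_j2_header(&s)) != NULL) {
        if (!s.j2_header_acquired)
            continue;                     /* journal not mounted/available */
        /* ... walk the journal v2 metric directory for [after, before] ... */
        journalfile_v2_data_release(df->journalfile);
    }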
\ No newline at end of file diff --git a/database/engine/metric.c b/database/engine/metric.c index 6b65df9bb..1370f9d7a 100644 --- a/database/engine/metric.c +++ b/database/engine/metric.c @@ -16,6 +16,7 @@ struct metric { time_t latest_time_s_hot; // latest time of the currently collected page uint32_t latest_update_every_s; // pid_t writer; + uint8_t partition; METRIC_FLAGS flags; REFCOUNT refcount; SPINLOCK spinlock; // protects all variable members @@ -27,103 +28,98 @@ struct metric { static struct aral_statistics mrg_aral_statistics; struct mrg { - ARAL *aral[MRG_PARTITIONS]; + size_t partitions; - struct pgc_index { - netdata_rwlock_t rwlock; - Pvoid_t uuid_judy; // each UUID has a JudyL of sections (tiers) - } index[MRG_PARTITIONS]; + struct mrg_partition { + ARAL *aral; // not protected by our spinlock - it has its own - struct mrg_statistics stats; + RW_SPINLOCK rw_spinlock; + Pvoid_t uuid_judy; // JudyHS: each UUID has a JudyL of sections (tiers) - size_t entries_per_partition[MRG_PARTITIONS]; + struct mrg_statistics stats; + } index[]; }; -static inline void MRG_STATS_DUPLICATE_ADD(MRG *mrg) { - __atomic_add_fetch(&mrg->stats.additions_duplicate, 1, __ATOMIC_RELAXED); +static inline void MRG_STATS_DUPLICATE_ADD(MRG *mrg, size_t partition) { + mrg->index[partition].stats.additions_duplicate++; } static inline void MRG_STATS_ADDED_METRIC(MRG *mrg, size_t partition) { - __atomic_add_fetch(&mrg->stats.entries, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&mrg->stats.additions, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&mrg->stats.size, sizeof(METRIC), __ATOMIC_RELAXED); - - __atomic_add_fetch(&mrg->entries_per_partition[partition], 1, __ATOMIC_RELAXED); + mrg->index[partition].stats.entries++; + mrg->index[partition].stats.additions++; + mrg->index[partition].stats.size += sizeof(METRIC); } static inline void MRG_STATS_DELETED_METRIC(MRG *mrg, size_t partition) { - __atomic_sub_fetch(&mrg->stats.entries, 1, __ATOMIC_RELAXED); - __atomic_sub_fetch(&mrg->stats.size, sizeof(METRIC), __ATOMIC_RELAXED); - __atomic_add_fetch(&mrg->stats.deletions, 1, __ATOMIC_RELAXED); - - __atomic_sub_fetch(&mrg->entries_per_partition[partition], 1, __ATOMIC_RELAXED); + mrg->index[partition].stats.entries--; + mrg->index[partition].stats.size -= sizeof(METRIC); + mrg->index[partition].stats.deletions++; } -static inline void MRG_STATS_SEARCH_HIT(MRG *mrg) { - __atomic_add_fetch(&mrg->stats.search_hits, 1, __ATOMIC_RELAXED); +static inline void MRG_STATS_SEARCH_HIT(MRG *mrg, size_t partition) { + __atomic_add_fetch(&mrg->index[partition].stats.search_hits, 1, __ATOMIC_RELAXED); } -static inline void MRG_STATS_SEARCH_MISS(MRG *mrg) { - __atomic_add_fetch(&mrg->stats.search_misses, 1, __ATOMIC_RELAXED); +static inline void MRG_STATS_SEARCH_MISS(MRG *mrg, size_t partition) { + __atomic_add_fetch(&mrg->index[partition].stats.search_misses, 1, __ATOMIC_RELAXED); } -static inline void MRG_STATS_DELETE_MISS(MRG *mrg) { - __atomic_add_fetch(&mrg->stats.delete_misses, 1, __ATOMIC_RELAXED); +static inline void MRG_STATS_DELETE_MISS(MRG *mrg, size_t partition) { + mrg->index[partition].stats.delete_misses++; } -static inline void mrg_index_read_lock(MRG *mrg, size_t partition) { - netdata_rwlock_rdlock(&mrg->index[partition].rwlock); -} -static inline void mrg_index_read_unlock(MRG *mrg, size_t partition) { - netdata_rwlock_unlock(&mrg->index[partition].rwlock); -} -static inline void mrg_index_write_lock(MRG *mrg, size_t partition) { - netdata_rwlock_wrlock(&mrg->index[partition].rwlock); -} -static inline void 
mrg_index_write_unlock(MRG *mrg, size_t partition) { - netdata_rwlock_unlock(&mrg->index[partition].rwlock); -} +#define mrg_index_read_lock(mrg, partition) rw_spinlock_read_lock(&(mrg)->index[partition].rw_spinlock) +#define mrg_index_read_unlock(mrg, partition) rw_spinlock_read_unlock(&(mrg)->index[partition].rw_spinlock) +#define mrg_index_write_lock(mrg, partition) rw_spinlock_write_lock(&(mrg)->index[partition].rw_spinlock) +#define mrg_index_write_unlock(mrg, partition) rw_spinlock_write_unlock(&(mrg)->index[partition].rw_spinlock) + +#define metric_lock(metric) spinlock_lock(&(metric)->spinlock) +#define metric_unlock(metric) spinlock_unlock(&(metric)->spinlock) -static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl) { +static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl, size_t partition) { if(mem_after_judyl > mem_before_judyl) - __atomic_add_fetch(&mrg->stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->index[partition].stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED); else if(mem_after_judyl < mem_before_judyl) - __atomic_sub_fetch(&mrg->stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->index[partition].stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED); } -static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg) { - __atomic_add_fetch(&mrg->stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); +static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg, size_t partition) { + __atomic_add_fetch(&mrg->index[partition].stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); } -static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg) { - __atomic_sub_fetch(&mrg->stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); +static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg, size_t partition) { + __atomic_sub_fetch(&mrg->index[partition].stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(uuid_t)), __ATOMIC_RELAXED); } static inline size_t uuid_partition(MRG *mrg __maybe_unused, uuid_t *uuid) { uint8_t *u = (uint8_t *)uuid; - return u[UUID_SZ - 1] % MRG_PARTITIONS; + size_t *n = (size_t *)&u[UUID_SZ - sizeof(size_t)]; + return *n % mrg->partitions; } static inline bool metric_has_retention_unsafe(MRG *mrg __maybe_unused, METRIC *metric) { + size_t partition = metric->partition; + bool has_retention = (metric->first_time_s > 0 || metric->latest_time_s_clean > 0 || metric->latest_time_s_hot > 0); if(has_retention && !(metric->flags & METRIC_FLAG_HAS_RETENTION)) { metric->flags |= METRIC_FLAG_HAS_RETENTION; - __atomic_add_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->index[partition].stats.entries_with_retention, 1, __ATOMIC_RELAXED); } else if(!has_retention && (metric->flags & METRIC_FLAG_HAS_RETENTION)) { metric->flags &= ~METRIC_FLAG_HAS_RETENTION; - __atomic_sub_fetch(&mrg->stats.entries_with_retention, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->index[partition].stats.entries_with_retention, 1, __ATOMIC_RELAXED); } return has_retention; } static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, bool having_spinlock) { + size_t partition = metric->partition; REFCOUNT refcount; if(!having_spinlock) - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(unlikely(metric->refcount < 0)) fatal("METRIC: 
refcount is %d (negative) during acquire", metric->refcount); @@ -134,21 +130,22 @@ static inline REFCOUNT metric_acquire(MRG *mrg __maybe_unused, METRIC *metric, b metric_has_retention_unsafe(mrg, metric); if(!having_spinlock) - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); if(refcount == 1) - __atomic_add_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->index[partition].stats.entries_referenced, 1, __ATOMIC_RELAXED); - __atomic_add_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->index[partition].stats.current_references, 1, __ATOMIC_RELAXED); return refcount; } static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, METRIC *metric) { bool ret = true; + size_t partition = metric->partition; REFCOUNT refcount; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(unlikely(metric->refcount <= 0)) fatal("METRIC: refcount is %d (zero or negative) during release", metric->refcount); @@ -158,20 +155,20 @@ static inline bool metric_release_and_can_be_deleted(MRG *mrg __maybe_unused, ME if(likely(metric_has_retention_unsafe(mrg, metric) || refcount != 0)) ret = false; - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); if(unlikely(!refcount)) - __atomic_sub_fetch(&mrg->stats.entries_referenced, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->index[partition].stats.entries_referenced, 1, __ATOMIC_RELAXED); - __atomic_sub_fetch(&mrg->stats.current_references, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->index[partition].stats.current_references, 1, __ATOMIC_RELAXED); return ret; } -static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { +static inline METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { size_t partition = uuid_partition(mrg, &entry->uuid); - METRIC *allocation = aral_mallocz(mrg->aral[partition]); + METRIC *allocation = aral_mallocz(mrg->index[partition].aral); mrg_index_write_lock(mrg, partition); @@ -182,12 +179,12 @@ static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { fatal("DBENGINE METRIC: corrupted UUIDs JudyHS array"); if(unlikely(!*sections_judy_pptr)) - mrg_stats_size_judyhs_added_uuid(mrg); + mrg_stats_size_judyhs_added_uuid(mrg, partition); mem_before_judyl = JudyLMemUsed(*sections_judy_pptr); Pvoid_t *PValue = JudyLIns(sections_judy_pptr, entry->section, PJE0); mem_after_judyl = JudyLMemUsed(*sections_judy_pptr); - mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl); + mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl, partition); if(unlikely(!PValue || PValue == PJERR)) fatal("DBENGINE METRIC: corrupted section JudyL array"); @@ -196,18 +193,21 @@ static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { METRIC *metric = *PValue; metric_acquire(mrg, metric, false); + + MRG_STATS_DUPLICATE_ADD(mrg, partition); + mrg_index_write_unlock(mrg, partition); if(ret) *ret = false; - aral_freez(mrg->aral[partition], allocation); + aral_freez(mrg->index[partition].aral, allocation); - MRG_STATS_DUPLICATE_ADD(mrg); return metric; } METRIC *metric = allocation; + // memcpy(metric->uuid, entry->uuid, sizeof(uuid_t)); uuid_copy(metric->uuid, entry->uuid); metric->section = entry->section; metric->first_time_s = MAX(0, entry->first_time_s); @@ -217,21 +217,22 @@ static METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *ret) { metric->writer = 0; metric->refcount = 0; metric->flags = 0; - 
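/*
 * The new uint8_t partition member (set just below) caches the index that
 * uuid_partition() computed at insert time, so release, delete and the
 * stats helpers can use metric->partition instead of re-hashing the UUID.
 * The hash itself reads the UUID's trailing bytes as one machine word; a
 * portable standalone equivalent (a sketch with hypothetical sketch_ names,
 * using memcpy where the patch casts the pointer directly):
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t sketch_uuid_t[16];                 /* stand-in for uuid_t */

static size_t sketch_uuid_partition(const sketch_uuid_t uuid, size_t partitions) {
    size_t n;
    memcpy(&n, uuid + sizeof(sketch_uuid_t) - sizeof(size_t), sizeof(n));
    return n % partitions;                         /* same distribution as the patch */
}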
netdata_spinlock_init(&metric->spinlock); + metric->partition = partition; + spinlock_init(&metric->spinlock); metric_acquire(mrg, metric, true); // no spinlock use required here *PValue = metric; + MRG_STATS_ADDED_METRIC(mrg, partition); + mrg_index_write_unlock(mrg, partition); if(ret) *ret = true; - MRG_STATS_ADDED_METRIC(mrg, partition); - return metric; } -static METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { +static inline METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { size_t partition = uuid_partition(mrg, uuid); mrg_index_read_lock(mrg, partition); @@ -239,14 +240,14 @@ static METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, uuid, sizeof(uuid_t)); if(unlikely(!sections_judy_pptr)) { mrg_index_read_unlock(mrg, partition); - MRG_STATS_SEARCH_MISS(mrg); + MRG_STATS_SEARCH_MISS(mrg, partition); return NULL; } Pvoid_t *PValue = JudyLGet(*sections_judy_pptr, section, PJE0); if(unlikely(!PValue)) { mrg_index_read_unlock(mrg, partition); - MRG_STATS_SEARCH_MISS(mrg); + MRG_STATS_SEARCH_MISS(mrg, partition); return NULL; } @@ -256,38 +257,38 @@ static METRIC *metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { mrg_index_read_unlock(mrg, partition); - MRG_STATS_SEARCH_HIT(mrg); + MRG_STATS_SEARCH_HIT(mrg, partition); return metric; } -static bool acquired_metric_del(MRG *mrg, METRIC *metric) { - size_t partition = uuid_partition(mrg, &metric->uuid); +static inline bool acquired_metric_del(MRG *mrg, METRIC *metric) { + size_t partition = metric->partition; size_t mem_before_judyl, mem_after_judyl; mrg_index_write_lock(mrg, partition); if(!metric_release_and_can_be_deleted(mrg, metric)) { + mrg->index[partition].stats.delete_having_retention_or_referenced++; mrg_index_write_unlock(mrg, partition); - __atomic_add_fetch(&mrg->stats.delete_having_retention_or_referenced, 1, __ATOMIC_RELAXED); return false; } Pvoid_t *sections_judy_pptr = JudyHSGet(mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t)); if(unlikely(!sections_judy_pptr || !*sections_judy_pptr)) { + MRG_STATS_DELETE_MISS(mrg, partition); mrg_index_write_unlock(mrg, partition); - MRG_STATS_DELETE_MISS(mrg); return false; } mem_before_judyl = JudyLMemUsed(*sections_judy_pptr); int rc = JudyLDel(sections_judy_pptr, metric->section, PJE0); mem_after_judyl = JudyLMemUsed(*sections_judy_pptr); - mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl); + mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl, partition); if(unlikely(!rc)) { + MRG_STATS_DELETE_MISS(mrg, partition); mrg_index_write_unlock(mrg, partition); - MRG_STATS_DELETE_MISS(mrg); return false; } @@ -295,14 +296,14 @@ static bool acquired_metric_del(MRG *mrg, METRIC *metric) { rc = JudyHSDel(&mrg->index[partition].uuid_judy, &metric->uuid, sizeof(uuid_t), PJE0); if(unlikely(!rc)) fatal("DBENGINE METRIC: cannot delete UUID from JudyHS"); - mrg_stats_size_judyhs_removed_uuid(mrg); + mrg_stats_size_judyhs_removed_uuid(mrg, partition); } - mrg_index_write_unlock(mrg, partition); + MRG_STATS_DELETED_METRIC(mrg, partition); - aral_freez(mrg->aral[partition], metric); + mrg_index_write_unlock(mrg, partition); - MRG_STATS_DELETED_METRIC(mrg, partition); + aral_freez(mrg->index[partition].aral, metric); return true; } @@ -310,38 +311,34 @@ static bool acquired_metric_del(MRG *mrg, METRIC *metric) { // ---------------------------------------------------------------------------- // public 
API -MRG *mrg_create(void) { - MRG *mrg = callocz(1, sizeof(MRG)); +inline MRG *mrg_create(ssize_t partitions) { + if(partitions < 1) + partitions = get_netdata_cpus(); + + MRG *mrg = callocz(1, sizeof(MRG) + sizeof(struct mrg_partition) * partitions); + mrg->partitions = partitions; - for(size_t i = 0; i < MRG_PARTITIONS ; i++) { - netdata_rwlock_init(&mrg->index[i].rwlock); + for(size_t i = 0; i < mrg->partitions ; i++) { + rw_spinlock_init(&mrg->index[i].rw_spinlock); char buf[ARAL_MAX_NAME + 1]; snprintfz(buf, ARAL_MAX_NAME, "mrg[%zu]", i); - mrg->aral[i] = aral_create(buf, - sizeof(METRIC), - 0, - 16384, - &mrg_aral_statistics, - NULL, NULL, false, - false); + mrg->index[i].aral = aral_create(buf, sizeof(METRIC), 0, 16384, &mrg_aral_statistics, NULL, NULL, false, false); } - mrg->stats.size = sizeof(MRG); - return mrg; } -size_t mrg_aral_structures(void) { +inline size_t mrg_aral_structures(void) { return aral_structures_from_stats(&mrg_aral_statistics); } -size_t mrg_aral_overhead(void) { +inline size_t mrg_aral_overhead(void) { return aral_overhead_from_stats(&mrg_aral_statistics); } -void mrg_destroy(MRG *mrg __maybe_unused) { +inline void mrg_destroy(MRG *mrg __maybe_unused) { // no destruction possible // we can't traverse the metrics list @@ -351,57 +348,57 @@ void mrg_destroy(MRG *mrg __maybe_unused) { ; } -METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) { +inline METRIC *mrg_metric_add_and_acquire(MRG *mrg, MRG_ENTRY entry, bool *ret) { // internal_fatal(entry.latest_time_s > max_acceptable_collected_time(), // "DBENGINE METRIC: metric latest time is in the future"); return metric_add_and_acquire(mrg, &entry, ret); } -METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { +inline METRIC *mrg_metric_get_and_acquire(MRG *mrg, uuid_t *uuid, Word_t section) { return metric_get_and_acquire(mrg, uuid, section); } -bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) { +inline bool mrg_metric_release_and_delete(MRG *mrg, METRIC *metric) { return acquired_metric_del(mrg, metric); } -METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) { +inline METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric) { metric_acquire(mrg, metric, false); return metric; } -bool mrg_metric_release(MRG *mrg, METRIC *metric) { +inline bool mrg_metric_release(MRG *mrg, METRIC *metric) { return metric_release_and_can_be_deleted(mrg, metric); } -Word_t mrg_metric_id(MRG *mrg __maybe_unused, METRIC *metric) { +inline Word_t mrg_metric_id(MRG *mrg __maybe_unused, METRIC *metric) { return (Word_t)metric; } -uuid_t *mrg_metric_uuid(MRG *mrg __maybe_unused, METRIC *metric) { +inline uuid_t *mrg_metric_uuid(MRG *mrg __maybe_unused, METRIC *metric) { return &metric->uuid; } -Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) { +inline Word_t mrg_metric_section(MRG *mrg __maybe_unused, METRIC *metric) { return metric->section; } -bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { +inline bool mrg_metric_set_first_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative"); if(unlikely(first_time_s < 0)) return false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); metric->first_time_s = first_time_s; metric_has_retention_unsafe(mrg, metric); - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return true; } -void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t 
first_time_s, time_t last_time_s, time_t update_every_s) { +inline void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s) { internal_fatal(first_time_s < 0 || last_time_s < 0 || update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); internal_fatal(first_time_s > max_acceptable_collected_time(), @@ -421,7 +418,7 @@ void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t if(unlikely(!first_time_s && !last_time_s && !update_every_s)) return; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(unlikely(first_time_s && (!metric->first_time_s || first_time_s < metric->first_time_s))) metric->first_time_s = first_time_s; @@ -436,29 +433,29 @@ void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t metric->latest_update_every_s = (uint32_t) update_every_s; metric_has_retention_unsafe(mrg, metric); - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); } -bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { +inline bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) { internal_fatal(first_time_s < 0, "DBENGINE METRIC: timestamp is negative"); bool ret = false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(first_time_s > metric->first_time_s) { metric->first_time_s = first_time_s; ret = true; } metric_has_retention_unsafe(mrg, metric); - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return ret; } -time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) { +inline time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) { time_t first_time_s; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(unlikely(!metric->first_time_s)) { if(metric->latest_time_s_clean) @@ -470,13 +467,13 @@ time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) { first_time_s = metric->first_time_s; - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return first_time_s; } -void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) { - netdata_spinlock_lock(&metric->spinlock); +inline void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) { + metric_lock(metric); if(unlikely(!metric->first_time_s)) { if(metric->latest_time_s_clean) @@ -490,16 +487,16 @@ void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *f *last_time_s = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); *update_every_s = metric->latest_update_every_s; - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); } -bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { +inline bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative"); if(unlikely(latest_time_s < 0)) return false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); // internal_fatal(latest_time_s > max_acceptable_collected_time(), // "DBENGINE METRIC: metric latest time is in the future"); @@ -513,12 +510,12 @@ bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC 
*metric, metric->first_time_s = latest_time_s; metric_has_retention_unsafe(mrg, metric); - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return true; } // returns true when metric still has retention -bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { +inline bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { Word_t section = mrg_metric_section(mrg, metric); bool do_again = false; size_t countdown = 5; @@ -551,7 +548,7 @@ bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { if (min_first_time_s == LONG_MAX) min_first_time_s = 0; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if (--countdown && !min_first_time_s && metric->latest_time_s_hot) do_again = true; else { @@ -563,13 +560,13 @@ bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric) { ret = metric_has_retention_unsafe(mrg, metric); } - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); } while(do_again); return ret; } -bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { +inline bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) { internal_fatal(latest_time_s < 0, "DBENGINE METRIC: timestamp is negative"); // internal_fatal(latest_time_s > max_acceptable_collected_time(), @@ -578,204 +575,215 @@ bool mrg_metric_set_hot_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, t if(unlikely(latest_time_s < 0)) return false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); metric->latest_time_s_hot = latest_time_s; if(unlikely(!metric->first_time_s)) metric->first_time_s = latest_time_s; metric_has_retention_unsafe(mrg, metric); - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return true; } -time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) { +inline time_t mrg_metric_get_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric) { time_t max; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); max = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot); - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return max; } -bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { +inline bool mrg_metric_set_update_every(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); if(update_every_s <= 0) return false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); metric->latest_update_every_s = (uint32_t) update_every_s; - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return true; } -bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { +inline bool mrg_metric_set_update_every_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t update_every_s) { internal_fatal(update_every_s < 0, "DBENGINE METRIC: timestamp is negative"); if(update_every_s <= 0) return false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(!metric->latest_update_every_s) metric->latest_update_every_s = (uint32_t) update_every_s; - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return true; } -time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, METRIC *metric) { +inline time_t mrg_metric_get_update_every_s(MRG *mrg __maybe_unused, 
METRIC *metric) { time_t update_every_s; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); update_every_s = metric->latest_update_every_s; - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return update_every_s; } -bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) { +inline bool mrg_metric_set_writer(MRG *mrg, METRIC *metric) { bool done = false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(!metric->writer) { metric->writer = gettid(); - __atomic_add_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&mrg->index[metric->partition].stats.writers, 1, __ATOMIC_RELAXED); done = true; } else - __atomic_add_fetch(&mrg->stats.writers_conflicts, 1, __ATOMIC_RELAXED); - netdata_spinlock_unlock(&metric->spinlock); + __atomic_add_fetch(&mrg->index[metric->partition].stats.writers_conflicts, 1, __ATOMIC_RELAXED); + metric_unlock(metric); return done; } -bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) { +inline bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric) { bool done = false; - netdata_spinlock_lock(&metric->spinlock); + metric_lock(metric); if(metric->writer) { metric->writer = 0; - __atomic_sub_fetch(&mrg->stats.writers, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&mrg->index[metric->partition].stats.writers, 1, __ATOMIC_RELAXED); done = true; } - netdata_spinlock_unlock(&metric->spinlock); + metric_unlock(metric); return done; } -struct mrg_statistics mrg_get_statistics(MRG *mrg) { - // FIXME - use atomics - return mrg->stats; -} - -// ---------------------------------------------------------------------------- -// unit test +inline void mrg_update_metric_retention_and_granularity_by_uuid( + MRG *mrg, Word_t section, uuid_t *uuid, + time_t first_time_s, time_t last_time_s, + time_t update_every_s, time_t now_s) +{ + if(unlikely(last_time_s > now_s)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: wrong last time on-disk (%ld - %ld, now %ld), " + "fixing last time to now", + first_time_s, last_time_s, now_s); + last_time_s = now_s; + } -#ifdef MRG_STRESS_TEST + if (unlikely(first_time_s > last_time_s)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: wrong first time on-disk (%ld - %ld, now %ld), " + "fixing first time to last time", + first_time_s, last_time_s, now_s); -static void mrg_stress(MRG *mrg, size_t entries, size_t sections) { - bool ret; + first_time_s = last_time_s; + } - info("DBENGINE METRIC: stress testing %zu entries on %zu sections...", entries, sections); + if (unlikely(first_time_s == 0 || last_time_s == 0)) { + error_limit_static_global_var(erl, 1, 0); + error_limit(&erl, "DBENGINE JV2: zero on-disk timestamps (%ld - %ld, now %ld), " + "using them as-is", + first_time_s, last_time_s, now_s); + } - METRIC *array[entries][sections]; - for(size_t i = 0; i < entries ; i++) { - MRG_ENTRY e = { - .first_time_s = (time_t)(i + 1), - .latest_time_s = (time_t)(i + 2), - .latest_update_every_s = (time_t)(i + 3), + bool added = false; + METRIC *metric = mrg_metric_get_and_acquire(mrg, uuid, section); + if (!metric) { + MRG_ENTRY entry = { + .section = section, + .first_time_s = first_time_s, + .last_time_s = last_time_s, + .latest_update_every_s = (uint32_t) update_every_s }; - uuid_generate_random(e.uuid); - - for(size_t section = 0; section < sections ;section++) { - e.section = section; - array[i][section] = mrg_metric_add_and_acquire(mrg, e, &ret); - if(!ret) - fatal("DBENGINE METRIC: failed to add metric %zu, section %zu", 
i, section); - - if(mrg_metric_add_and_acquire(mrg, e, &ret) != array[i][section]) - fatal("DBENGINE METRIC: adding the same metric twice, returns a different metric"); - - if(ret) - fatal("DBENGINE METRIC: adding the same metric twice, returns success"); - - if(mrg_metric_get_and_acquire(mrg, &e.uuid, e.section) != array[i][section]) - fatal("DBENGINE METRIC: cannot get back the same metric"); - - if(uuid_compare(*mrg_metric_uuid(mrg, array[i][section]), e.uuid) != 0) - fatal("DBENGINE METRIC: uuids do not match"); - } + // memcpy(entry.uuid, *uuid, sizeof(uuid_t)); + uuid_copy(entry.uuid, *uuid); + metric = mrg_metric_add_and_acquire(mrg, entry, &added); } - for(size_t i = 0; i < entries ; i++) { - for (size_t section = 0; section < sections; section++) { - uuid_t uuid; - uuid_generate_random(uuid); - - if(mrg_metric_get_and_acquire(mrg, &uuid, section)) - fatal("DBENGINE METRIC: found non-existing uuid"); - - if(mrg_metric_id(mrg, array[i][section]) != (Word_t)array[i][section]) - fatal("DBENGINE METRIC: metric id does not match"); - - if(mrg_metric_get_first_time_s(mrg, array[i][section]) != (time_t)(i + 1)) - fatal("DBENGINE METRIC: wrong first time returned"); - if(mrg_metric_get_latest_time_s(mrg, array[i][section]) != (time_t)(i + 2)) - fatal("DBENGINE METRIC: wrong latest time returned"); - if(mrg_metric_get_update_every_s(mrg, array[i][section]) != (time_t)(i + 3)) - fatal("DBENGINE METRIC: wrong latest time returned"); - - if(!mrg_metric_set_first_time_s(mrg, array[i][section], (time_t)((i + 1) * 2))) - fatal("DBENGINE METRIC: cannot set first time"); - if(!mrg_metric_set_clean_latest_time_s(mrg, array[i][section], (time_t) ((i + 1) * 3))) - fatal("DBENGINE METRIC: cannot set latest time"); - if(!mrg_metric_set_update_every(mrg, array[i][section], (time_t)((i + 1) * 4))) - fatal("DBENGINE METRIC: cannot set update every"); - - if(mrg_metric_get_first_time_s(mrg, array[i][section]) != (time_t)((i + 1) * 2)) - fatal("DBENGINE METRIC: wrong first time returned"); - if(mrg_metric_get_latest_time_s(mrg, array[i][section]) != (time_t)((i + 1) * 3)) - fatal("DBENGINE METRIC: wrong latest time returned"); - if(mrg_metric_get_update_every_s(mrg, array[i][section]) != (time_t)((i + 1) * 4)) - fatal("DBENGINE METRIC: wrong latest time returned"); - } + if (likely(!added)) + mrg_metric_expand_retention(mrg, metric, first_time_s, last_time_s, update_every_s); + + mrg_metric_release(mrg, metric); +} + +inline void mrg_get_statistics(MRG *mrg, struct mrg_statistics *s) { + memset(s, 0, sizeof(struct mrg_statistics)); + + for(size_t i = 0; i < mrg->partitions ;i++) { + s->entries += __atomic_load_n(&mrg->index[i].stats.entries, __ATOMIC_RELAXED); + s->entries_referenced += __atomic_load_n(&mrg->index[i].stats.entries_referenced, __ATOMIC_RELAXED); + s->entries_with_retention += __atomic_load_n(&mrg->index[i].stats.entries_with_retention, __ATOMIC_RELAXED); + s->size += __atomic_load_n(&mrg->index[i].stats.size, __ATOMIC_RELAXED); + s->current_references += __atomic_load_n(&mrg->index[i].stats.current_references, __ATOMIC_RELAXED); + s->additions += __atomic_load_n(&mrg->index[i].stats.additions, __ATOMIC_RELAXED); + s->additions_duplicate += __atomic_load_n(&mrg->index[i].stats.additions_duplicate, __ATOMIC_RELAXED); + s->deletions += __atomic_load_n(&mrg->index[i].stats.deletions, __ATOMIC_RELAXED); + s->delete_having_retention_or_referenced += __atomic_load_n(&mrg->index[i].stats.delete_having_retention_or_referenced, __ATOMIC_RELAXED); + s->delete_misses += 
__atomic_load_n(&mrg->index[i].stats.delete_misses, __ATOMIC_RELAXED); + s->search_hits += __atomic_load_n(&mrg->index[i].stats.search_hits, __ATOMIC_RELAXED); + s->search_misses += __atomic_load_n(&mrg->index[i].stats.search_misses, __ATOMIC_RELAXED); + s->writers += __atomic_load_n(&mrg->index[i].stats.writers, __ATOMIC_RELAXED); + s->writers_conflicts += __atomic_load_n(&mrg->index[i].stats.writers_conflicts, __ATOMIC_RELAXED); } - for(size_t i = 0; i < entries ; i++) { - for (size_t section = 0; section < sections; section++) { - if(!mrg_metric_release_and_delete(mrg, array[i][section])) - fatal("DBENGINE METRIC: failed to delete metric"); - } - } + s->size += sizeof(MRG) + sizeof(struct mrg_partition) * mrg->partitions; } -static void *mrg_stress_test_thread1(void *ptr) { - MRG *mrg = ptr; +// ---------------------------------------------------------------------------- +// unit test - for(int i = 0; i < 5 ; i++) - mrg_stress(mrg, 10000, 5); +struct mrg_stress_entry { + uuid_t uuid; + time_t after; + time_t before; +}; - return ptr; -} +struct mrg_stress { + MRG *mrg; + bool stop; + size_t entries; + struct mrg_stress_entry *array; + size_t updates; +}; -static void *mrg_stress_test_thread2(void *ptr) { - MRG *mrg = ptr; +static void *mrg_stress(void *ptr) { + struct mrg_stress *t = ptr; + MRG *mrg = t->mrg; - for(int i = 0; i < 10 ; i++) - mrg_stress(mrg, 500, 50); + ssize_t start = 0; + ssize_t end = (ssize_t)t->entries; + ssize_t step = 1; - return ptr; -} + if(gettid() % 2) { + start = (ssize_t)t->entries - 1; + end = -1; + step = -1; + } + + while(!__atomic_load_n(&t->stop, __ATOMIC_RELAXED)) { + for (ssize_t i = start; i != end; i += step) { + struct mrg_stress_entry *e = &t->array[i]; -static void *mrg_stress_test_thread3(void *ptr) { - MRG *mrg = ptr; + time_t after = __atomic_sub_fetch(&e->after, 1, __ATOMIC_RELAXED); + time_t before = __atomic_add_fetch(&e->before, 1, __ATOMIC_RELAXED); - for(int i = 0; i < 50 ; i++) - mrg_stress(mrg, 5000, 1); + mrg_update_metric_retention_and_granularity_by_uuid( + mrg, 0x01, + &e->uuid, + after, + before, + 1, + before); + + __atomic_add_fetch(&t->updates, 1, __ATOMIC_RELAXED); + } + } return ptr; } -#endif int mrg_unittest(void) { - MRG *mrg = mrg_create(); + MRG *mrg = mrg_create(0); METRIC *m1_t0, *m2_t0, *m3_t0, *m4_t0; METRIC *m1_t1, *m2_t1, *m3_t1, *m4_t1; bool ret; @@ -850,54 +858,84 @@ int mrg_unittest(void) { if(!mrg_metric_release_and_delete(mrg, m1_t1)) fatal("DBENGINE METRIC: cannot delete the second metric"); - if(mrg->stats.entries != 0) + struct mrg_statistics s; + mrg_get_statistics(mrg, &s); + if(s.entries != 0) fatal("DBENGINE METRIC: invalid entries counter"); -#ifdef MRG_STRESS_TEST - usec_t started_ut = now_monotonic_usec(); - pthread_t thread1; - netdata_thread_create(&thread1, "TH1", - NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, - mrg_stress_test_thread1, mrg); + size_t entries = 1000000; + size_t threads = mrg->partitions / 3 + 1; + size_t tiers = 3; + size_t run_for_secs = 5; + netdata_log_info("preparing stress test of %zu entries...", entries); + struct mrg_stress t = { + .mrg = mrg, + .entries = entries, + .array = callocz(entries, sizeof(struct mrg_stress_entry)), + }; - pthread_t thread2; - netdata_thread_create(&thread2, "TH2", - NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, - mrg_stress_test_thread2, mrg); + time_t now = max_acceptable_collected_time(); + for(size_t i = 0; i < entries ;i++) { + uuid_generate_random(t.array[i].uuid); + t.array[i].after = now / 3; + 
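/*
 * Context for mrg_get_statistics() above: the patch replaces one global set
 * of atomic counters with a per-partition struct mrg_statistics, and struct
 * mrg itself now ends in a flexible array member sized at mrg_create() time.
 * In metric.h (further below) each group of counters is padded to its own
 * cache lines, so threads hammering different partitions do not false-share.
 * A standalone miniature of that layout (hypothetical names; 64 bytes
 * assumed as the cache line size, as in the patch):
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_CACHE_LINE_PADDING(x) uint8_t padding##x[64]

struct sketch_stats {
    size_t entries;                     /* written only under the write lock */
    size_t additions;
    SKETCH_CACHE_LINE_PADDING(0);
    size_t search_hits;                 /* relaxed atomics, many threads */
    size_t search_misses;
    SKETCH_CACHE_LINE_PADDING(1);
};

struct sketch_mrg {
    size_t partitions;
    struct sketch_partition {
        struct sketch_stats stats;
    } index[];                          /* flexible array member, one per partition */
};
/* allocated as: calloc(1, sizeof(struct sketch_mrg) + n * sizeof(struct sketch_partition)) */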
t.array[i].before = now / 2; + } + netdata_log_info("stress test is populating MRG with 3 tiers..."); + for(size_t i = 0; i < entries ;i++) { + struct mrg_stress_entry *e = &t.array[i]; + for(size_t tier = 1; tier <= tiers ;tier++) { + mrg_update_metric_retention_and_granularity_by_uuid( + mrg, tier, + &e->uuid, + e->after, + e->before, + 1, + e->before); + } + } + netdata_log_info("stress test ready to run..."); - pthread_t thread3; - netdata_thread_create(&thread3, "TH3", - NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, - mrg_stress_test_thread3, mrg); + usec_t started_ut = now_monotonic_usec(); + pthread_t th[threads]; + for(size_t i = 0; i < threads ; i++) { + char buf[15 + 1]; + snprintfz(buf, 15, "TH[%zu]", i); + netdata_thread_create(&th[i], buf, + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + mrg_stress, &t); + } - sleep_usec(5 * USEC_PER_SEC); + sleep_usec(run_for_secs * USEC_PER_SEC); + __atomic_store_n(&t.stop, true, __ATOMIC_RELAXED); - netdata_thread_cancel(thread1); - netdata_thread_cancel(thread2); - netdata_thread_cancel(thread3); + for(size_t i = 0; i < threads ; i++) + netdata_thread_cancel(th[i]); + + for(size_t i = 0; i < threads ; i++) + netdata_thread_join(th[i], NULL); - netdata_thread_join(thread1, NULL); - netdata_thread_join(thread2, NULL); - netdata_thread_join(thread3, NULL); usec_t ended_ut = now_monotonic_usec(); - info("DBENGINE METRIC: did %zu additions, %zu duplicate additions, " + struct mrg_statistics stats; + mrg_get_statistics(mrg, &stats); + + netdata_log_info("DBENGINE METRIC: did %zu additions, %zu duplicate additions, " "%zu deletions, %zu wrong deletions, " "%zu successful searches, %zu wrong searches, " - "%zu successful pointer validations, %zu wrong pointer validations " "in %llu usecs", - mrg->stats.additions, mrg->stats.additions_duplicate, - mrg->stats.deletions, mrg->stats.delete_misses, - mrg->stats.search_hits, mrg->stats.search_misses, - mrg->stats.pointer_validation_hits, mrg->stats.pointer_validation_misses, + stats.additions, stats.additions_duplicate, + stats.deletions, stats.delete_misses, + stats.search_hits, stats.search_misses, ended_ut - started_ut); -#endif + netdata_log_info("DBENGINE METRIC: updates performance: %0.2fk/sec total, %0.2fk/sec/thread", + (double)t.updates / (double)((ended_ut - started_ut) / USEC_PER_SEC) / 1000.0, + (double)t.updates / (double)((ended_ut - started_ut) / USEC_PER_SEC) / 1000.0 / threads); mrg_destroy(mrg); - info("DBENGINE METRIC: all tests passed!"); + netdata_log_info("DBENGINE METRIC: all tests passed!"); return 0; } diff --git a/database/engine/metric.h b/database/engine/metric.h index 82aff903a..5cb5b045e 100644 --- a/database/engine/metric.h +++ b/database/engine/metric.h @@ -3,7 +3,7 @@ #include "../rrd.h" -#define MRG_PARTITIONS 10 +#define MRG_CACHE_LINE_PADDING(x) uint8_t padding##x[64] typedef struct metric METRIC; typedef struct mrg MRG; @@ -17,13 +17,10 @@ typedef struct mrg_entry { } MRG_ENTRY; struct mrg_statistics { - size_t entries; - size_t entries_referenced; - size_t entries_with_retention; - - size_t size; // total memory used, with indexing + // --- non-atomic --- under a write lock - size_t current_references; + size_t entries; + size_t size; // total memory used, with indexing size_t additions; size_t additions_duplicate; @@ -32,14 +29,28 @@ struct mrg_statistics { size_t delete_having_retention_or_referenced; size_t delete_misses; + MRG_CACHE_LINE_PADDING(0); + + // --- atomic --- multiple readers / writers + + size_t 
entries_referenced; + + MRG_CACHE_LINE_PADDING(1); + size_t entries_with_retention; + + MRG_CACHE_LINE_PADDING(2); + size_t current_references; + + MRG_CACHE_LINE_PADDING(3); size_t search_hits; size_t search_misses; + MRG_CACHE_LINE_PADDING(4); size_t writers; size_t writers_conflicts; }; -MRG *mrg_create(void); +MRG *mrg_create(ssize_t partitions); void mrg_destroy(MRG *mrg); METRIC *mrg_metric_dup(MRG *mrg, METRIC *metric); @@ -72,8 +83,14 @@ bool mrg_metric_zero_disk_retention(MRG *mrg __maybe_unused, METRIC *metric); bool mrg_metric_set_writer(MRG *mrg, METRIC *metric); bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric); -struct mrg_statistics mrg_get_statistics(MRG *mrg); +void mrg_get_statistics(MRG *mrg, struct mrg_statistics *s); size_t mrg_aral_structures(void); size_t mrg_aral_overhead(void); + +void mrg_update_metric_retention_and_granularity_by_uuid( + MRG *mrg, Word_t section, uuid_t *uuid, + time_t first_time_s, time_t last_time_s, + time_t update_every_s, time_t now_s); + #endif // DBENGINE_METRIC_H diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index 02d08a164..c608c3270 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -387,15 +387,17 @@ static size_t list_has_time_gaps( time_t wanted_end_time_s, size_t *pages_total, size_t *pages_found_pass4, - size_t *pages_pending, + size_t *pages_to_load_from_disk, size_t *pages_overlapping, time_t *optimal_end_time_s, - bool populate_gaps + bool populate_gaps, + PDC_PAGE_STATUS *common_status ) { // we will recalculate these, so zero them - *pages_pending = 0; + *pages_to_load_from_disk = 0; *pages_overlapping = 0; *optimal_end_time_s = 0; + *common_status = 0; bool first; Pvoid_t *PValue; @@ -461,6 +463,7 @@ static size_t list_has_time_gaps( (*pages_overlapping)++; pd->status |= PDC_PAGE_SKIP; pd->status &= ~(PDC_PAGE_READY | PDC_PAGE_DISK_PENDING); + *common_status |= pd->status; continue; } @@ -480,7 +483,7 @@ static size_t list_has_time_gaps( } else if(!(pd->status & PDC_PAGE_FAILED) && (pd->status & PDC_PAGE_DATAFILE_ACQUIRED)) { - (*pages_pending)++; + (*pages_to_load_from_disk)++; pd->status |= PDC_PAGE_DISK_PENDING; @@ -495,6 +498,8 @@ static size_t list_has_time_gaps( pd->status &= ~PDC_PAGE_DISK_PENDING; pd->status |= (PDC_PAGE_READY | PDC_PAGE_PRELOADED); } + + *common_status |= pd->status; } internal_fatal(pages_pass2 != pages_pass3, @@ -505,6 +510,8 @@ static size_t list_has_time_gaps( return gaps; } +// ---------------------------------------------------------------------------- + typedef void (*page_found_callback_t)(PGC_PAGE *page, void *data); static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METRIC *metric, usec_t start_time_ut, usec_t end_time_ut, page_found_callback_t callback, void *callback_data) { uuid_t *uuid = mrg_metric_uuid(main_mrg, metric); @@ -515,12 +522,19 @@ static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METR size_t pages_found = 0; - uv_rwlock_rdlock(&ctx->datafiles.rwlock); + NJFV2IDX_FIND_STATE state = { + .init = false, + .last = 0, + .ctx = ctx, + .wanted_start_time_s = wanted_start_time_s, + .wanted_end_time_s = wanted_end_time_s, + .j2_header_acquired = NULL, + }; + struct rrdengine_datafile *datafile; - for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) { - struct journal_v2_header *j2_header = journalfile_v2_data_acquire(datafile->journalfile, NULL, - wanted_start_time_s, - wanted_end_time_s); + while((datafile = njfv2idx_find_and_acquire_j2_header(&state))) { 
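/*
 * Iterator contract, as set up by the NJFV2IDX_FIND_STATE initialization
 * above: each call to njfv2idx_find_and_acquire_j2_header() apparently
 * advances the cursor to the next datafile whose journal v2 header covers
 * the wanted time window and returns it with that header already acquired
 * in state.j2_header_acquired; the loop body releases it with
 * journalfile_v2_data_release() before the next call. This replaces the
 * removed pattern of holding ctx->datafiles.rwlock across the whole scan.
 */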
+ struct journal_v2_header *j2_header = state.j2_header_acquired; + if (unlikely(!j2_header)) continue; @@ -595,7 +609,6 @@ static size_t get_page_list_from_journal_v2(struct rrdengine_instance *ctx, METR journalfile_v2_data_release(datafile->journalfile); } - uv_rwlock_rdunlock(&ctx->datafiles.rwlock); return pages_found; } @@ -644,10 +657,13 @@ static Pvoid_t get_page_list( METRIC *metric, usec_t start_time_ut, usec_t end_time_ut, - size_t *pages_to_load, - time_t *optimal_end_time_s + time_t *optimal_end_time_s, + size_t *pages_to_load_from_disk, + PDC_PAGE_STATUS *common_status ) { *optimal_end_time_s = 0; + *pages_to_load_from_disk = 0; + *common_status = 0; Pvoid_t JudyL_page_array = (Pvoid_t) NULL; @@ -658,14 +674,13 @@ static Pvoid_t get_page_list( pages_found_in_open_cache = 0, pages_found_in_journals_v2 = 0, pages_found_pass4 = 0, - pages_pending = 0, pages_overlapping = 0, pages_total = 0; size_t cache_gaps = 0, query_gaps = 0; bool done_v2 = false, done_open = false; - usec_t pass1_ut = 0, pass2_ut = 0, pass3_ut = 0, pass4_ut = 0; + usec_t pass1_ut = 0, pass2_ut = 0, pass3_ut = 0, pass4_ut = 0, finish_ut = 0; // -------------------------------------------------------------- // PASS 1: Check what the main page cache has available @@ -680,8 +695,8 @@ static Pvoid_t get_page_list( if(pages_found_in_main_cache && !cache_gaps) { query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, - &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, - optimal_end_time_s, false); + &pages_total, &pages_found_pass4, pages_to_load_from_disk, &pages_overlapping, + optimal_end_time_s, false, common_status); if (pages_total && !query_gaps) goto we_are_done; @@ -702,8 +717,8 @@ static Pvoid_t get_page_list( if(pages_found_in_open_cache) { query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, - &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, - optimal_end_time_s, false); + &pages_total, &pages_found_pass4, pages_to_load_from_disk, &pages_overlapping, + optimal_end_time_s, false, common_status); if (pages_total && !query_gaps) goto we_are_done; @@ -726,15 +741,11 @@ static Pvoid_t get_page_list( pass4_ut = now_monotonic_usec(); query_gaps = list_has_time_gaps(ctx, metric, JudyL_page_array, wanted_start_time_s, wanted_end_time_s, - &pages_total, &pages_found_pass4, &pages_pending, &pages_overlapping, - optimal_end_time_s, true); + &pages_total, &pages_found_pass4, pages_to_load_from_disk, &pages_overlapping, + optimal_end_time_s, true, common_status); we_are_done: - - if(pages_to_load) - *pages_to_load = pages_pending; - - usec_t finish_ut = now_monotonic_usec(); + finish_ut = now_monotonic_usec(); time_delta(finish_ut, pass4_ut); time_delta(finish_ut, pass3_ut); time_delta(finish_ut, pass2_ut); @@ -754,7 +765,7 @@ we_are_done: __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_meta_source_journal_v2, pages_found_in_journals_v2, __ATOMIC_RELAXED); __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache, pages_found_in_main_cache, __ATOMIC_RELAXED); __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_data_source_main_cache_at_pass4, pages_found_pass4, __ATOMIC_RELAXED); - __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_to_load_from_disk, pages_pending, __ATOMIC_RELAXED); + __atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_to_load_from_disk, *pages_to_load_from_disk, __ATOMIC_RELAXED); 
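/*
 * list_has_time_gaps() now also ORs every page's PDC_PAGE_STATUS bits into
 * *common_status, and rrdeng_prep_query() (below) cross-checks the
 * aggregate against pages_to_load_from_disk: the counter is non-zero if and
 * only if at least one page carries PDC_PAGE_DISK_PENDING. The invariant in
 * standalone form (hypothetical flag value):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_DISK_PENDING (1u << 3)

static void sketch_check_invariant(uint32_t common_status, size_t pages_to_load_from_disk) {
    /* both sides collapse to 0 or 1, so == encodes "if and only if" */
    assert((pages_to_load_from_disk != 0) ==
           ((common_status & SKETCH_PAGE_DISK_PENDING) != 0));
}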
__atomic_add_fetch(&rrdeng_cache_efficiency_stats.pages_overlapping_skipped, pages_overlapping, __ATOMIC_RELAXED); return JudyL_page_array; @@ -773,14 +784,23 @@ void rrdeng_prep_query(struct page_details_control *pdc, bool worker) { if(worker) worker_is_busy(UV_EVENT_DBENGINE_QUERY); - size_t pages_to_load = 0; pdc->page_list_JudyL = get_page_list(pdc->ctx, pdc->metric, pdc->start_time_s * USEC_PER_SEC, pdc->end_time_s * USEC_PER_SEC, - &pages_to_load, - &pdc->optimal_end_time_s); + &pdc->optimal_end_time_s, + &pdc->pages_to_load_from_disk, + &pdc->common_status); + + internal_fatal(pdc->pages_to_load_from_disk && !(pdc->common_status & PDC_PAGE_DISK_PENDING), + "DBENGINE: PDC reports there are %zu pages to load from disk, " + "but none of the pages has the PDC_PAGE_DISK_PENDING flag", + pdc->pages_to_load_from_disk); + + internal_fatal(!pdc->pages_to_load_from_disk && (pdc->common_status & PDC_PAGE_DISK_PENDING), + "DBENGINE: PDC reports there are no pages to load from disk, " + "but one or more pages have the PDC_PAGE_DISK_PENDING flag"); - if (pages_to_load && pdc->page_list_JudyL) { + if (pdc->pages_to_load_from_disk && pdc->page_list_JudyL) { pdc_acquire(pdc); // we get 1 for the 1st worker in the chain: do_read_page_list_work() usec_t start_ut = now_monotonic_usec(); if(likely(pdc->priority == STORAGE_PRIORITY_SYNCHRONOUS)) @@ -822,7 +842,7 @@ void pg_cache_preload(struct rrdeng_query_handle *handle) { handle->pdc->optimal_end_time_s = handle->end_time_s; handle->pdc->ctx = handle->ctx; handle->pdc->refcount = 1; - netdata_spinlock_init(&handle->pdc->refcount_spinlock); + spinlock_init(&handle->pdc->refcount_spinlock); completion_init(&handle->pdc->prep_completion); completion_init(&handle->pdc->page_completion); @@ -1063,7 +1083,7 @@ size_t dynamic_extent_cache_size(void) { void pgc_and_mrg_initialize(void) { - main_mrg = mrg_create(); + main_mrg = mrg_create(0); size_t target_cache_size = (size_t)default_rrdeng_page_cache_mb * 1024ULL * 1024ULL; size_t main_cache_size = (target_cache_size / 100) * 95; diff --git a/database/engine/pdc.c b/database/engine/pdc.c index 42fb2f6de..7da568787 100644 --- a/database/engine/pdc.c +++ b/database/engine/pdc.c @@ -198,7 +198,7 @@ void extent_buffer_init(void) { void extent_buffer_cleanup1(void) { struct extent_buffer *item = NULL; - if(!netdata_spinlock_trylock(&extent_buffer_globals.protected.spinlock)) + if(!spinlock_trylock(&extent_buffer_globals.protected.spinlock)) return; if(extent_buffer_globals.protected.available_items && extent_buffer_globals.protected.available > 1) { @@ -207,7 +207,7 @@ void extent_buffer_cleanup1(void) { extent_buffer_globals.protected.available--; } - netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + spinlock_unlock(&extent_buffer_globals.protected.spinlock); if(item) { size_t bytes = sizeof(struct extent_buffer) + item->bytes; @@ -225,13 +225,13 @@ struct extent_buffer *extent_buffer_get(size_t size) { if(size < extent_buffer_globals.max_size) size = extent_buffer_globals.max_size; - netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock); + spinlock_lock(&extent_buffer_globals.protected.spinlock); if(likely(extent_buffer_globals.protected.available_items)) { eb = extent_buffer_globals.protected.available_items; DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next); extent_buffer_globals.protected.available--; } - netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + 
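/*
 * The extent buffer cache around here is a spinlock-guarded LIFO free list:
 * get pops a cached buffer if one fits (the caller allocates on NULL),
 * release pushes it back, and a cleanup pass trims the list. The same
 * pattern standalone, with a pthread spinlock standing in for netdata's
 * SPINLOCK and the sizing logic omitted (hypothetical sketch_ names):
 */
#include <pthread.h>
#include <stddef.h>

struct sketch_buf { struct sketch_buf *next; };

static struct {
    pthread_spinlock_t lock;   /* init once: pthread_spin_init(..., PTHREAD_PROCESS_PRIVATE) */
    struct sketch_buf *free_list;
} sketch_pool;

static struct sketch_buf *sketch_pool_get(void) {
    pthread_spin_lock(&sketch_pool.lock);
    struct sketch_buf *b = sketch_pool.free_list;
    if(b) sketch_pool.free_list = b->next;
    pthread_spin_unlock(&sketch_pool.lock);
    return b;                           /* NULL: caller allocates a fresh buffer */
}

static void sketch_pool_put(struct sketch_buf *b) {
    pthread_spin_lock(&sketch_pool.lock);
    b->next = sketch_pool.free_list;
    sketch_pool.free_list = b;
    pthread_spin_unlock(&sketch_pool.lock);
}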
spinlock_unlock(&extent_buffer_globals.protected.spinlock); if(unlikely(eb && eb->bytes < size)) { size_t bytes = sizeof(struct extent_buffer) + eb->bytes; @@ -255,10 +255,10 @@ struct extent_buffer *extent_buffer_get(size_t size) { void extent_buffer_release(struct extent_buffer *eb) { if(unlikely(!eb)) return; - netdata_spinlock_lock(&extent_buffer_globals.protected.spinlock); + spinlock_lock(&extent_buffer_globals.protected.spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(extent_buffer_globals.protected.available_items, eb, cache.prev, cache.next); extent_buffer_globals.protected.available++; - netdata_spinlock_unlock(&extent_buffer_globals.protected.spinlock); + spinlock_unlock(&extent_buffer_globals.protected.spinlock); } size_t extent_buffer_cache_size(void) { @@ -400,20 +400,20 @@ static void pdc_destroy(PDC *pdc) { } void pdc_acquire(PDC *pdc) { - netdata_spinlock_lock(&pdc->refcount_spinlock); + spinlock_lock(&pdc->refcount_spinlock); if(pdc->refcount < 1) fatal("DBENGINE: pdc is not referenced and cannot be acquired"); pdc->refcount++; - netdata_spinlock_unlock(&pdc->refcount_spinlock); + spinlock_unlock(&pdc->refcount_spinlock); } bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router __maybe_unused) { if(unlikely(!pdc)) return true; - netdata_spinlock_lock(&pdc->refcount_spinlock); + spinlock_lock(&pdc->refcount_spinlock); if(pdc->refcount <= 0) fatal("DBENGINE: pdc is not referenced and cannot be released"); @@ -429,12 +429,12 @@ bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router } if (pdc->refcount == 0) { - netdata_spinlock_unlock(&pdc->refcount_spinlock); + spinlock_unlock(&pdc->refcount_spinlock); pdc_destroy(pdc); return true; } - netdata_spinlock_unlock(&pdc->refcount_spinlock); + spinlock_unlock(&pdc->refcount_spinlock); return false; } @@ -456,7 +456,7 @@ static struct rrdeng_cmd *epdl_get_cmd(void *epdl_ptr) { static bool epdl_pending_add(EPDL *epdl) { bool added_new; - netdata_spinlock_lock(&epdl->datafile->extent_queries.spinlock); + spinlock_lock(&epdl->datafile->extent_queries.spinlock); Pvoid_t *PValue = JudyLIns(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0); internal_fatal(!PValue || PValue == PJERR, "DBENGINE: corrupted pending extent judy"); @@ -478,20 +478,20 @@ static bool epdl_pending_add(EPDL *epdl) { DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(base, epdl, query.prev, query.next); *PValue = base; - netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock); + spinlock_unlock(&epdl->datafile->extent_queries.spinlock); return added_new; } static void epdl_pending_del(EPDL *epdl) { - netdata_spinlock_lock(&epdl->datafile->extent_queries.spinlock); + spinlock_lock(&epdl->datafile->extent_queries.spinlock); if(epdl->head_to_datafile_extent_queries_pending_for_extent) { epdl->head_to_datafile_extent_queries_pending_for_extent = false; int rc = JudyLDel(&epdl->datafile->extent_queries.pending_epdl_by_extent_offset_judyL, epdl->extent_offset, PJE0); (void) rc; internal_fatal(!rc, "DBENGINE: epdl not found in pending list"); } - netdata_spinlock_unlock(&epdl->datafile->extent_queries.spinlock); + spinlock_unlock(&epdl->datafile->extent_queries.spinlock); } void pdc_to_epdl_router(struct rrdengine_instance *ctx, PDC *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list) diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index 7811a5eaa..ce363183d 100644 --- 
a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -351,7 +351,7 @@ static struct { static void wal_cleanup1(void) { WAL *wal = NULL; - if(!netdata_spinlock_trylock(&wal_globals.protected.spinlock)) + if(!spinlock_trylock(&wal_globals.protected.spinlock)) return; if(wal_globals.protected.available_items && wal_globals.protected.available > storage_tiers) { @@ -360,7 +360,7 @@ static void wal_cleanup1(void) { wal_globals.protected.available--; } - netdata_spinlock_unlock(&wal_globals.protected.spinlock); + spinlock_unlock(&wal_globals.protected.spinlock); if(wal) { posix_memfree(wal->buf); @@ -375,7 +375,7 @@ WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) { WAL *wal = NULL; - netdata_spinlock_lock(&wal_globals.protected.spinlock); + spinlock_lock(&wal_globals.protected.spinlock); if(likely(wal_globals.protected.available_items)) { wal = wal_globals.protected.available_items; @@ -384,7 +384,7 @@ WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) { } uint64_t transaction_id = __atomic_fetch_add(&ctx->atomic.transaction_id, 1, __ATOMIC_RELAXED); - netdata_spinlock_unlock(&wal_globals.protected.spinlock); + spinlock_unlock(&wal_globals.protected.spinlock); if(unlikely(!wal)) { wal = mallocz(sizeof(WAL)); @@ -416,10 +416,10 @@ WAL *wal_get(struct rrdengine_instance *ctx, unsigned size) { void wal_release(WAL *wal) { if(unlikely(!wal)) return; - netdata_spinlock_lock(&wal_globals.protected.spinlock); + spinlock_lock(&wal_globals.protected.spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wal_globals.protected.available_items, wal, cache.prev, cache.next); wal_globals.protected.available++; - netdata_spinlock_unlock(&wal_globals.protected.spinlock); + spinlock_unlock(&wal_globals.protected.spinlock); } // ---------------------------------------------------------------------------- @@ -459,7 +459,7 @@ void rrdeng_dequeue_epdl_cmd(struct rrdeng_cmd *cmd) { } void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY priority) { - netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); struct rrdeng_cmd *cmd = get_cmd_cb(data); if(cmd) { @@ -472,7 +472,7 @@ void rrdeng_req_cmd(requeue_callback_t get_cmd_cb, void *data, STORAGE_PRIORITY } } - netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); } void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, void *data, struct completion *completion, @@ -489,12 +489,12 @@ void rrdeng_enq_cmd(struct rrdengine_instance *ctx, enum rrdeng_opcode opcode, v cmd->priority = priority; cmd->dequeue_cb = dequeue_cb; - netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority], cmd, queue.prev, queue.next); rrdeng_main.cmd_queue.unsafe.waiting++; if(enqueue_cb) enqueue_cb(cmd); - netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); fatal_assert(0 == uv_async_send(&rrdeng_main.async)); } @@ -532,7 +532,7 @@ static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker) { } // find an opcode to execute from the queue - netdata_spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); + spinlock_lock(&rrdeng_main.cmd_queue.unsafe.spinlock); for(STORAGE_PRIORITY priority = min_priority; priority <= max_priority ; priority++) { cmd 
= rrdeng_main.cmd_queue.unsafe.waiting_items_by_priority[priority]; if(cmd) { @@ -559,7 +559,7 @@ static inline struct rrdeng_cmd rrdeng_deq_cmd(bool from_worker) { cmd->dequeue_cb = NULL; } - netdata_spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); + spinlock_unlock(&rrdeng_main.cmd_queue.unsafe.spinlock); struct rrdeng_cmd ret; if(cmd) { @@ -712,9 +712,9 @@ static void *extent_flushed_to_open_tp_worker(struct rrdengine_instance *ctx __m posix_memfree(xt_io_descr->buf); extent_io_descriptor_release(xt_io_descr); - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); datafile->writers.flushed_to_open_running--; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); if(datafile->fileno != ctx_last_fileno_get(ctx) && still_running) // we just finished a flushing on a datafile that is not the active one @@ -733,15 +733,15 @@ static void after_extent_write_datafile_io(uv_fs_t *uv_fs_request) { if (uv_fs_request->result < 0) { ctx_io_error(ctx); - error("DBENGINE: %s: uv_fs_write(): %s", __func__, uv_strerror((int)uv_fs_request->result)); + netdata_log_error("DBENGINE: %s: uv_fs_write(): %s", __func__, uv_strerror((int)uv_fs_request->result)); } journalfile_v1_extent_write(ctx, xt_io_descr->datafile, xt_io_descr->wal, &rrdeng_main.loop); - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); datafile->writers.running--; datafile->writers.flushed_to_open_running++; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); rrdeng_enq_cmd(xt_io_descr->ctx, RRDENG_OPCODE_FLUSHED_TO_OPEN, @@ -756,12 +756,12 @@ static void after_extent_write_datafile_io(uv_fs_t *uv_fs_request) { static bool datafile_is_full(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile) { bool ret = false; - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); if(ctx_is_available_for_queries(ctx) && datafile->pos > rrdeng_target_data_file_size(ctx)) ret = true; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); return ret; } @@ -773,9 +773,9 @@ static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_ uv_rwlock_rdlock(&ctx->datafiles.rwlock); datafile = ctx->datafiles.first->prev; // become a writer on this datafile, to prevent it from vanishing - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); datafile->writers.running++; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); uv_rwlock_rdunlock(&ctx->datafiles.rwlock); if(datafile_is_full(ctx, datafile)) { @@ -791,7 +791,7 @@ static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_ datafile = ctx->datafiles.first->prev; uv_rwlock_rdunlock(&ctx->datafiles.rwlock); - if(datafile_is_full(ctx, datafile) && create_new_datafile_pair(ctx) == 0) + if(datafile_is_full(ctx, datafile) && create_new_datafile_pair(ctx, true) == 0) rrdeng_enq_cmd(ctx, RRDENG_OPCODE_JOURNAL_INDEX, datafile, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL); @@ -801,15 +801,15 @@ static struct rrdengine_datafile *get_datafile_to_write_extent(struct rrdengine_ uv_rwlock_rdlock(&ctx->datafiles.rwlock); datafile = ctx->datafiles.first->prev; // become a writer on this datafile, to prevent it from vanishing - netdata_spinlock_lock(&datafile->writers.spinlock); + 
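/*
 * Per the "become a writer" comment above, incrementing writers.running is
 * effectively a pin: while it is non-zero the datafile cannot vanish
 * underneath the extent writer, and the journal v2 indexer (further below)
 * skips datafiles that still have writers. The pin/unpin pair factored out
 * for illustration (a sketch, not a netdata API):
 */
static inline void sketch_datafile_pin(struct rrdengine_datafile *df) {
    spinlock_lock(&df->writers.spinlock);
    df->writers.running++;
    spinlock_unlock(&df->writers.spinlock);
}

static inline void sketch_datafile_unpin(struct rrdengine_datafile *df) {
    spinlock_lock(&df->writers.spinlock);
    df->writers.running--;
    spinlock_unlock(&df->writers.spinlock);
}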
spinlock_lock(&datafile->writers.spinlock); datafile->writers.running++; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); uv_rwlock_rdunlock(&ctx->datafiles.rwlock); // release the writers on the old datafile - netdata_spinlock_lock(&old_datafile->writers.spinlock); + spinlock_lock(&old_datafile->writers.spinlock); old_datafile->writers.running--; - netdata_spinlock_unlock(&old_datafile->writers.spinlock); + spinlock_unlock(&old_datafile->writers.spinlock); } return datafile; @@ -921,11 +921,11 @@ static struct extent_io_descriptor *datafile_extent_build(struct rrdengine_insta real_io_size = ALIGN_BYTES_CEILING(size_bytes); datafile = get_datafile_to_write_extent(ctx); - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); xt_io_descr->datafile = datafile; xt_io_descr->pos = datafile->pos; datafile->pos += real_io_size; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); xt_io_descr->bytes = size_bytes; xt_io_descr->uv_fs_request.data = xt_io_descr; @@ -998,12 +998,14 @@ struct rrdengine_datafile *datafile_release_and_acquire_next_for_retention(struc return next_datafile; } -void find_uuid_first_time( +time_t find_uuid_first_time( struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile, struct uuid_first_time_s *uuid_first_entry_list, size_t count) { + time_t global_first_time_s = LONG_MAX; + // acquire the datafile to work with it uv_rwlock_rdlock(&ctx->datafiles.rwlock); while(datafile && !datafile_acquire(datafile, DATAFILE_ACQUIRE_RETENTION)) @@ -1011,7 +1013,7 @@ void find_uuid_first_time( uv_rwlock_rdunlock(&ctx->datafiles.rwlock); if (unlikely(!datafile)) - return; + return global_first_time_s; unsigned journalfile_count = 0; size_t binary_match = 0; @@ -1025,6 +1027,10 @@ void find_uuid_first_time( } time_t journal_start_time_s = (time_t) (j2_header->start_time_ut / USEC_PER_SEC); + + if(journal_start_time_s < global_first_time_s) + global_first_time_s = journal_start_time_s; + struct journal_metric_list *uuid_list = (struct journal_metric_list *)((uint8_t *) j2_header + j2_header->metric_offset); struct uuid_first_time_s *uuid_original_entry; @@ -1137,9 +1143,13 @@ void find_uuid_first_time( without_retention, without_metric ); + + return global_first_time_s; } static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile_to_delete, struct rrdengine_datafile *first_datafile_remaining, bool worker) { + time_t global_first_time_s = LONG_MAX; + if(worker) worker_is_busy(UV_EVENT_DBENGINE_FIND_ROTATED_METRICS); @@ -1174,7 +1184,7 @@ static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct r added++; } - info("DBENGINE: recalculating tier %d retention for %zu metrics starting with datafile %u", + netdata_log_info("DBENGINE: recalculating tier %d retention for %zu metrics starting with datafile %u", ctx->config.tier, count, first_datafile_remaining->fileno); journalfile_v2_data_release(journalfile); @@ -1184,12 +1194,12 @@ static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct r if(worker) worker_is_busy(UV_EVENT_DBENGINE_FIND_REMAINING_RETENTION); - find_uuid_first_time(ctx, first_datafile_remaining, uuid_first_entry_list, added); + global_first_time_s = find_uuid_first_time(ctx, first_datafile_remaining, uuid_first_entry_list, added); if(worker) worker_is_busy(UV_EVENT_DBENGINE_POPULATE_MRG); - info("DBENGINE: updating tier %d 
metrics registry retention for %zu metrics", + netdata_log_info("DBENGINE: updating tier %d metrics registry retention for %zu metrics", ctx->config.tier, added); size_t deleted_metrics = 0, zero_retention_referenced = 0, zero_disk_retention = 0, zero_disk_but_live = 0; @@ -1223,6 +1233,9 @@ static void update_metrics_first_time_s(struct rrdengine_instance *ctx, struct r "DBENGINE: deleted %zu metrics, zero retention but referenced %zu (out of %zu total, of which %zu have main cache retention) zero on-disk retention tier %d metrics from metrics registry", deleted_metrics, zero_retention_referenced, zero_disk_retention, zero_disk_but_live, ctx->config.tier); + if(global_first_time_s != LONG_MAX) + __atomic_store_n(&ctx->atomic.first_time_s, global_first_time_s, __ATOMIC_RELAXED); + if(worker) worker_is_idle(); } @@ -1243,7 +1256,7 @@ void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile * datafile_got_for_deletion = datafile_acquire_for_deletion(datafile); if (!datafile_got_for_deletion) { - info("DBENGINE: waiting for data file '%s/" + netdata_log_info("DBENGINE: waiting for data file '%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION "' to be available for deletion, " "it is in use currently by %u users.", @@ -1255,7 +1268,7 @@ void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile * } __atomic_add_fetch(&rrdeng_cache_efficiency_stats.datafile_deletion_started, 1, __ATOMIC_RELAXED); - info("DBENGINE: deleting data file '%s/" + netdata_log_info("DBENGINE: deleting data file '%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION "'.", ctx->config.dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); @@ -1277,26 +1290,26 @@ void datafile_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile * journal_file_bytes = journalfile_current_size(journal_file); deleted_bytes = journalfile_v2_data_size_get(journal_file); - info("DBENGINE: deleting data and journal files to maintain disk quota"); + netdata_log_info("DBENGINE: deleting data and journal files to maintain disk quota"); ret = journalfile_destroy_unsafe(journal_file, datafile); if (!ret) { journalfile_v1_generate_path(datafile, path, sizeof(path)); - info("DBENGINE: deleted journal file \"%s\".", path); + netdata_log_info("DBENGINE: deleted journal file \"%s\".", path); journalfile_v2_generate_path(datafile, path, sizeof(path)); - info("DBENGINE: deleted journal file \"%s\".", path); + netdata_log_info("DBENGINE: deleted journal file \"%s\".", path); deleted_bytes += journal_file_bytes; } ret = destroy_data_file_unsafe(datafile); if (!ret) { generate_datafilepath(datafile, path, sizeof(path)); - info("DBENGINE: deleted data file \"%s\".", path); + netdata_log_info("DBENGINE: deleted data file \"%s\".", path); deleted_bytes += datafile_bytes; } freez(journal_file); freez(datafile); ctx_current_disk_space_decrease(ctx, deleted_bytes); - info("DBENGINE: reclaimed %u bytes of disk space.", deleted_bytes); + netdata_log_info("DBENGINE: reclaimed %u bytes of disk space.", deleted_bytes); } static void *database_rotate_tp_worker(struct rrdengine_instance *ctx __maybe_unused, void *data __maybe_unused, struct completion *completion __maybe_unused, uv_work_t *uv_work_req __maybe_unused) { @@ -1334,11 +1347,11 @@ static void *populate_mrg_tp_worker(struct rrdengine_instance *ctx __maybe_unuse // find a datafile to work uv_rwlock_rdlock(&ctx->datafiles.rwlock); for(datafile = ctx->datafiles.first; datafile ; datafile = datafile->next) { - 
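/*
 * Work claiming below is trylock-based: several workers scan the same
 * datafile list, a datafile is processed by whichever worker wins its
 * populate_mrg.spinlock, and the populated flag marks it done for everyone
 * else. The pattern in isolation (a sketch with hypothetical sketch_ names;
 * sketch_process() stands in for the actual payload):
 */
struct sketch_item {
    SPINLOCK lock;
    bool done;
    struct sketch_item *next;
};

void sketch_process(struct sketch_item *it);       /* hypothetical payload */

static void sketch_claim_all(struct sketch_item *sketch_list_first) {
    for(struct sketch_item *it = sketch_list_first; it; it = it->next) {
        if(!spinlock_trylock(&it->lock))
            continue;                              /* another worker owns it */
        if(it->done) {
            spinlock_unlock(&it->lock);
            continue;                              /* already processed */
        }
        sketch_process(it);                        /* work while holding the lock */
        it->done = true;
        spinlock_unlock(&it->lock);
    }
}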
if(!netdata_spinlock_trylock(&datafile->populate_mrg.spinlock)) + if(!spinlock_trylock(&datafile->populate_mrg.spinlock)) continue; if(datafile->populate_mrg.populated) { - netdata_spinlock_unlock(&datafile->populate_mrg.spinlock); + spinlock_unlock(&datafile->populate_mrg.spinlock); continue; } @@ -1352,7 +1365,7 @@ static void *populate_mrg_tp_worker(struct rrdengine_instance *ctx __maybe_unuse journalfile_v2_populate_retention_to_mrg(ctx, datafile->journalfile); datafile->populate_mrg.populated = true; - netdata_spinlock_unlock(&datafile->populate_mrg.spinlock); + spinlock_unlock(&datafile->populate_mrg.spinlock); } while(1); @@ -1376,7 +1389,7 @@ static void *ctx_shutdown_tp_worker(struct rrdengine_instance *ctx __maybe_unuse __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED)) { if(!logged) { logged = true; - info("DBENGINE: waiting for %zu inflight queries to finish to shutdown tier %d...", + netdata_log_info("DBENGINE: waiting for %zu inflight queries to finish to shutdown tier %d...", __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED), (ctx->config.legacy) ? -1 : ctx->config.tier); } @@ -1444,7 +1457,7 @@ void async_cb(uv_async_t *handle) { uv_stop(handle->loop); uv_update_time(handle->loop); - debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle)); + netdata_log_debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle)); } #define TIMER_PERIOD_MS (1000) @@ -1496,17 +1509,17 @@ static void *journal_v2_indexing_tp_worker(struct rrdengine_instance *ctx __mayb continue; } - netdata_spinlock_lock(&datafile->writers.spinlock); + spinlock_lock(&datafile->writers.spinlock); bool available = (datafile->writers.running || datafile->writers.flushed_to_open_running) ? 
false : true; - netdata_spinlock_unlock(&datafile->writers.spinlock); + spinlock_unlock(&datafile->writers.spinlock); if(!available) { - info("DBENGINE: journal file %u needs to be indexed, but it has writers working on it - skipping it for now", datafile->fileno); + netdata_log_info("DBENGINE: journal file %u needs to be indexed, but it has writers working on it - skipping it for now", datafile->fileno); datafile = datafile->next; continue; } - info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); + netdata_log_info("DBENGINE: journal file %u is ready to be indexed", datafile->fileno); pgc_open_cache_to_journal_v2(open_cache, (Word_t) ctx, (int) datafile->fileno, ctx->config.page_type, journalfile_migrate_to_v2_callback, (void *) datafile->journalfile); @@ -1623,21 +1636,21 @@ bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { static bool spawned = false; static SPINLOCK spinlock = NETDATA_SPINLOCK_INITIALIZER; - netdata_spinlock_lock(&spinlock); + spinlock_lock(&spinlock); if(!spawned) { int ret; ret = uv_loop_init(&rrdeng_main.loop); if (ret) { - error("DBENGINE: uv_loop_init(): %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_loop_init(): %s", uv_strerror(ret)); return false; } rrdeng_main.loop.data = &rrdeng_main; ret = uv_async_init(&rrdeng_main.loop, &rrdeng_main.async, async_cb); if (ret) { - error("DBENGINE: uv_async_init(): %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_async_init(): %s", uv_strerror(ret)); fatal_assert(0 == uv_loop_close(&rrdeng_main.loop)); return false; } @@ -1645,7 +1658,7 @@ bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { ret = uv_timer_init(&rrdeng_main.loop, &rrdeng_main.timer); if (ret) { - error("DBENGINE: uv_timer_init(): %s", uv_strerror(ret)); + netdata_log_error("DBENGINE: uv_timer_init(): %s", uv_strerror(ret)); uv_close((uv_handle_t *)&rrdeng_main.async, NULL); fatal_assert(0 == uv_loop_close(&rrdeng_main.loop)); return false; @@ -1658,7 +1671,7 @@ bool rrdeng_dbengine_spawn(struct rrdengine_instance *ctx __maybe_unused) { spawned = true; } - netdata_spinlock_unlock(&spinlock); + spinlock_unlock(&spinlock); return true; } @@ -1860,7 +1873,7 @@ void dbengine_event_loop(void* arg) { } /* cleanup operations of the event loop */ - info("DBENGINE: shutting down dbengine thread"); + netdata_log_info("DBENGINE: shutting down dbengine thread"); /* * uv_async_send after uv_close does not seem to crash in linux at the moment, diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 69e412354..b5476930a 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -34,31 +34,6 @@ struct rrdeng_cmd; #define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u" #define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u" -typedef struct page_details_control { - struct rrdengine_instance *ctx; - struct metric *metric; - - struct completion prep_completion; - struct completion page_completion; // sync between the query thread and the workers - - Pvoid_t page_list_JudyL; // the list of page details - unsigned completed_jobs; // the number of jobs completed last time the query thread checked - bool workers_should_stop; // true when the query thread left and the workers should stop - bool prep_done; - - SPINLOCK refcount_spinlock; // spinlock to protect refcount - int32_t refcount; // the number of workers currently working on this request + 1 for the query thread - size_t executed_with_gaps; - - time_t start_time_s; - time_t end_time_s; - STORAGE_PRIORITY 
priority; - - time_t optimal_end_time_s; -} PDC; - -PDC *pdc_get(void); - typedef enum __attribute__ ((__packed__)) { // final status for all pages // if a page does not have one of these, it is considered unroutable @@ -99,6 +74,34 @@ typedef enum __attribute__ ((__packed__)) { #define PDC_PAGE_QUERY_GLOBAL_SKIP_LIST (PDC_PAGE_FAILED | PDC_PAGE_SKIP | PDC_PAGE_INVALID | PDC_PAGE_RELEASED) +typedef struct page_details_control { + struct rrdengine_instance *ctx; + struct metric *metric; + + struct completion prep_completion; + struct completion page_completion; // sync between the query thread and the workers + + Pvoid_t page_list_JudyL; // the list of page details + unsigned completed_jobs; // the number of jobs completed last time the query thread checked + bool workers_should_stop; // true when the query thread left and the workers should stop + bool prep_done; + + PDC_PAGE_STATUS common_status; + size_t pages_to_load_from_disk; + + SPINLOCK refcount_spinlock; // spinlock to protect refcount + int32_t refcount; // the number of workers currently working on this request + 1 for the query thread + size_t executed_with_gaps; + + time_t start_time_s; + time_t end_time_s; + STORAGE_PRIORITY priority; + + time_t optimal_end_time_s; +} PDC; + +PDC *pdc_get(void); + struct page_details { struct { struct rrdengine_datafile *ptr; @@ -362,6 +365,11 @@ struct rrdengine_instance { } datafiles; struct { + RW_SPINLOCK spinlock; + Pvoid_t JudyL; + } njfv2idx; + + struct { unsigned last_fileno; // newest index of datafile and journalfile unsigned last_flush_fileno; // newest index of datafile received data @@ -375,6 +383,8 @@ struct rrdengine_instance { bool migration_to_v2_running; bool now_deleting_files; unsigned extents_currently_being_flushed; // non-zero until we commit data to disk (both datafile and journal file) + + time_t first_time_s; } atomic; struct { diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index ddc306ed7..49df5c814 100755 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -247,7 +247,7 @@ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metri is_1st_metric_writer = false; char uuid[UUID_STR_LEN + 1]; uuid_unparse(*mrg_metric_uuid(main_mrg, metric), uuid); - error("DBENGINE: metric '%s' is already collected and should not be collected twice - expect gaps on the charts", uuid); + netdata_log_error("DBENGINE: metric '%s' is already collected and should not be collected twice - expect gaps on the charts", uuid); } metric = mrg_metric_dup(main_mrg, metric); @@ -312,7 +312,7 @@ static bool page_has_only_empty_metrics(struct rrdeng_collect_handle *handle) { default: { static bool logged = false; if(!logged) { - error("DBENGINE: cannot check page for nulls on unknown page type id %d", (mrg_metric_ctx(handle->metric))->config.page_type); + netdata_log_error("DBENGINE: cannot check page for nulls on unknown page type id %d", (mrg_metric_ctx(handle->metric))->config.page_type); logged = true; } return false; @@ -703,14 +703,14 @@ static void register_query_handle(struct rrdeng_query_handle *handle) { handle->query_pid = gettid(); handle->started_time_s = now_realtime_sec(); - netdata_spinlock_lock(&global_query_handle_spinlock); + spinlock_lock(&global_query_handle_spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); - netdata_spinlock_unlock(&global_query_handle_spinlock); + spinlock_unlock(&global_query_handle_spinlock); } static void unregister_query_handle(struct 
rrdeng_query_handle *handle) { - netdata_spinlock_lock(&global_query_handle_spinlock); + spinlock_lock(&global_query_handle_spinlock); DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(global_query_handle_ll, handle, prev, next); - netdata_spinlock_unlock(&global_query_handle_spinlock); + spinlock_unlock(&global_query_handle_spinlock); } #else static void register_query_handle(struct rrdeng_query_handle *handle __maybe_unused) { @@ -908,7 +908,7 @@ STORAGE_POINT rrdeng_load_metric_next(struct storage_engine_query_handle *rrddim default: { static bool logged = false; if(!logged) { - error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", handle->ctx->config.page_type); + netdata_log_error("DBENGINE: unknown page type %d found. Cannot decode it. Ignoring its metrics.", handle->ctx->config.page_type); logged = true; } storage_point_empty(sp, sp.start_time_s, sp.end_time_s); @@ -986,7 +986,7 @@ bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_ { struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; if (unlikely(!ctx)) { - error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); + netdata_log_error("DBENGINE: invalid STORAGE INSTANCE to %s()", __FUNCTION__); return false; } @@ -1002,6 +1002,26 @@ bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_ return true; } +size_t rrdeng_disk_space_max(STORAGE_INSTANCE *db_instance) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + return ctx->config.max_disk_space; +} + +size_t rrdeng_disk_space_used(STORAGE_INSTANCE *db_instance) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + return __atomic_load_n(&ctx->atomic.current_disk_space, __ATOMIC_RELAXED); +} + +time_t rrdeng_global_first_time_s(STORAGE_INSTANCE *db_instance) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + return __atomic_load_n(&ctx->atomic.first_time_s, __ATOMIC_RELAXED); +} + +size_t rrdeng_currently_collected_metrics(STORAGE_INSTANCE *db_instance) { + struct rrdengine_instance *ctx = (struct rrdengine_instance *)db_instance; + return __atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED); +} + /* * Gathers Database Engine statistics. * Careful when modifying this function. 
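Three of the four getters added above share one pattern: collector threads publish into the ctx->atomic members with __atomic_store_n() / __atomic_add_fetch(), the getters take point-in-time snapshots with __atomic_load_n(..., __ATOMIC_RELAXED), and LONG_MAX doubles as the "retention not known yet" sentinel for first_time_s (rrdeng_init() further down initializes it to that). A minimal self-contained sketch of the same pattern; the struct and function names are illustrative, not netdata's:

    /* sketch: relaxed atomic statistics with a LONG_MAX "unset" sentinel */
    #include <limits.h>
    #include <stdio.h>
    #include <time.h>

    struct instance_atomics {
        time_t first_time_s;        /* LONG_MAX until retention is known */
        size_t collectors_running;  /* maintained by collector threads   */
    };

    static void instance_init(struct instance_atomics *a) {
        __atomic_store_n(&a->first_time_s, LONG_MAX, __ATOMIC_RELAXED);
        __atomic_store_n(&a->collectors_running, 0, __ATOMIC_RELAXED);
    }

    /* writer side: only overwrite the sentinel with a real timestamp */
    static void publish_first_time(struct instance_atomics *a, time_t t) {
        if (t != LONG_MAX)
            __atomic_store_n(&a->first_time_s, t, __ATOMIC_RELAXED);
    }

    static void collector_started(struct instance_atomics *a) {
        __atomic_add_fetch(&a->collectors_running, 1, __ATOMIC_RELAXED);
    }

    /* reader side: lock-free snapshot, like the getters above */
    static time_t read_first_time(struct instance_atomics *a) {
        return __atomic_load_n(&a->first_time_s, __ATOMIC_RELAXED);
    }

    int main(void) {
        struct instance_atomics a;
        instance_init(&a);
        collector_started(&a);
        publish_first_time(&a, time(NULL));
        printf("first_time_s=%ld collectors=%zu\n",
               (long)read_first_time(&a),
               __atomic_load_n(&a.collectors_running, __ATOMIC_RELAXED));
        return 0;
    }

Relaxed ordering suffices here because each field is an independent statistic read for reporting; nothing needs to synchronize-with these loads.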
@@ -1062,20 +1082,20 @@ static void rrdeng_populate_mrg(struct rrdengine_instance *ctx) { datafiles++; uv_rwlock_rdunlock(&ctx->datafiles.rwlock); - size_t cpus = get_netdata_cpus() / storage_tiers; - if(cpus > datafiles) - cpus = datafiles; + ssize_t cpus = (ssize_t)get_netdata_cpus() / (ssize_t)storage_tiers; + if(cpus > (ssize_t)datafiles) + cpus = (ssize_t)datafiles; - if(cpus < 1) - cpus = 1; + if(cpus > (ssize_t)libuv_worker_threads) + cpus = (ssize_t)libuv_worker_threads; - if(cpus > (size_t)libuv_worker_threads) - cpus = (size_t)libuv_worker_threads; + if(cpus >= (ssize_t)get_netdata_cpus() / 2) + cpus = (ssize_t)(get_netdata_cpus() / 2 - 1); - if(cpus > MRG_PARTITIONS) - cpus = MRG_PARTITIONS; + if(cpus < 1) + cpus = 1; - info("DBENGINE: populating retention to MRG from %zu journal files of tier %d, using %zu threads...", datafiles, ctx->config.tier, cpus); + netdata_log_info("DBENGINE: populating retention to MRG from %zu journal files of tier %d, using %zd threads...", datafiles, ctx->config.tier, cpus); if(datafiles > 2) { struct rrdengine_datafile *datafile; @@ -1116,7 +1136,7 @@ void rrdeng_readiness_wait(struct rrdengine_instance *ctx) { ctx->loading.populate_mrg.array = NULL; ctx->loading.populate_mrg.size = 0; - info("DBENGINE: tier %d is ready for data collection and queries", ctx->config.tier); + netdata_log_info("DBENGINE: tier %d is ready for data collection and queries", ctx->config.tier); } bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance) { @@ -1140,7 +1160,7 @@ int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, /* reserve RRDENG_FD_BUDGET_PER_INSTANCE file descriptors for this instance */ rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, RRDENG_FD_BUDGET_PER_INSTANCE); if (rrdeng_reserved_file_descriptors > max_open_files) { - error( + netdata_log_error( "Exceeded the budget of available file descriptors (%u/%u), cannot create new dbengine instance.", (unsigned)rrdeng_reserved_file_descriptors, (unsigned)max_open_files); @@ -1172,6 +1192,9 @@ int rrdeng_init(struct rrdengine_instance **ctxp, const char *dbfiles_path, ctx->atomic.transaction_id = 1; ctx->quiesce.enabled = false; + rw_spinlock_init(&ctx->njfv2idx.spinlock); + ctx->atomic.first_time_s = LONG_MAX; + if (rrdeng_dbengine_spawn(ctx) && !init_rrd_files(ctx)) { // success - we run this ctx too rrdeng_populate_mrg(ctx); @@ -1208,16 +1231,16 @@ int rrdeng_exit(struct rrdengine_instance *ctx) { bool logged = false; while(__atomic_load_n(&ctx->atomic.collectors_running, __ATOMIC_RELAXED) && !unittest_running) { if(!logged) { - info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier); + netdata_log_info("DBENGINE: waiting for collectors to finish on tier %d...", (ctx->config.legacy) ? -1 : ctx->config.tier); logged = true; } sleep_usec(100 * USEC_PER_MS); } - info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + netdata_log_info("DBENGINE: flushing main cache for tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); pgc_flush_all_hot_and_dirty_pages(main_cache, (Word_t)ctx); - info("DBENGINE: shutting down tier %d", (ctx->config.legacy) ? -1 : ctx->config.tier); + netdata_log_info("DBENGINE: shutting down tier %d", (ctx->config.legacy) ? 
-1 : ctx->config.tier); struct completion completion = {}; completion_init(&completion); rrdeng_enq_cmd(ctx, RRDENG_OPCODE_CTX_SHUTDOWN, NULL, &completion, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL); diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 514954af7..12f1becd1 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -222,4 +222,7 @@ RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx); size_t rrdeng_collectors_running(struct rrdengine_instance *ctx); bool rrdeng_is_legacy(STORAGE_INSTANCE *db_instance); +size_t rrdeng_disk_space_max(STORAGE_INSTANCE *db_instance); +size_t rrdeng_disk_space_used(STORAGE_INSTANCE *db_instance); + #endif /* NETDATA_RRDENGINEAPI_H */ diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c index 984a591e8..dc581d98d 100644 --- a/database/engine/rrdenginelib.c +++ b/database/engine/rrdenginelib.c @@ -14,12 +14,12 @@ int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size) fatal_assert(req.result == 0); s = req.ptr; if (!(s->st_mode & S_IFREG)) { - error("Not a regular file.\n"); + netdata_log_error("Not a regular file.\n"); uv_fs_req_cleanup(&req); return UV_EINVAL; } if (s->st_size < min_size) { - error("File length is too short.\n"); + netdata_log_error("File length is too short.\n"); uv_fs_req_cleanup(&req); return UV_EINVAL; } @@ -56,16 +56,16 @@ int open_file_for_io(char *path, int flags, uv_file *file, int direct) fd = uv_fs_open(NULL, &req, path, current_flags, S_IRUSR | S_IWUSR, NULL); if (fd < 0) { if ((direct) && (UV_EINVAL == fd)) { - error("File \"%s\" does not support direct I/O, falling back to buffered I/O.", path); + netdata_log_error("File \"%s\" does not support direct I/O, falling back to buffered I/O.", path); } else { - error("Failed to open file \"%s\".", path); + netdata_log_error("Failed to open file \"%s\".", path); --direct; /* break the loop */ } } else { fatal_assert(req.result >= 0); *file = req.result; #ifdef __APPLE__ - info("Disabling OS X caching for file \"%s\".", path); + netdata_log_info("Disabling OS X caching for file \"%s\".", path); fcntl(fd, F_NOCACHE, 1); #endif --direct; /* break the loop */ @@ -90,7 +90,7 @@ int is_legacy_child(const char *machine_guid) snprintfz(dbengine_file, FILENAME_MAX, "%s/%s/dbengine", netdata_configured_cache_dir, machine_guid); int rc = uv_fs_stat(NULL, &stat_req, dbengine_file, NULL); if (likely(rc == 0 && ((stat_req.statbuf.st_mode & S_IFMT) == S_IFDIR))) { - //info("Found legacy engine folder \"%s\"", dbengine_file); + //netdata_log_info("Found legacy engine folder \"%s\"", dbengine_file); return 1; } } @@ -107,7 +107,7 @@ int count_legacy_children(char *dbfiles_path) ret = uv_fs_scandir(NULL, &req, dbfiles_path, 0, NULL); if (ret < 0) { uv_fs_req_cleanup(&req); - error("uv_fs_scandir(%s): %s", dbfiles_path, uv_strerror(ret)); + netdata_log_error("uv_fs_scandir(%s): %s", dbfiles_path, uv_strerror(ret)); return ret; } @@ -134,7 +134,7 @@ int compute_multidb_diskspace() fclose(fp); if (unlikely(rc != 1 || computed_multidb_disk_quota_mb < RRDENG_MIN_DISK_SPACE_MB)) { errno = 0; - error("File '%s' contains invalid input, it will be rebuild", multidb_disk_space_file); + netdata_log_error("File '%s' contains invalid input, it will be rebuild", multidb_disk_space_file); computed_multidb_disk_quota_mb = -1; } } @@ -143,15 +143,15 @@ int compute_multidb_diskspace() int rc = count_legacy_children(netdata_configured_cache_dir); if (likely(rc >= 0)) { 
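 /* Worked example of the assignment below, with illustrative figures: if
  * count_legacy_children() reported rc == 3 and the per-instance default
  * were 256 MiB, computed_multidb_disk_quota_mb = (3 + 1) * 256 = 1024 MiB.
  * The result is then persisted to the state file, so subsequent startups
  * reuse the same figure instead of rescanning the cache directory. */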
computed_multidb_disk_quota_mb = (rc + 1) * default_rrdeng_disk_quota_mb; - info("Found %d legacy dbengines, setting multidb diskspace to %dMB", rc, computed_multidb_disk_quota_mb); + netdata_log_info("Found %d legacy dbengines, setting multidb diskspace to %dMB", rc, computed_multidb_disk_quota_mb); fp = fopen(multidb_disk_space_file, "w"); if (likely(fp)) { fprintf(fp, "%d", computed_multidb_disk_quota_mb); - info("Created file '%s' to store the computed value", multidb_disk_space_file); + netdata_log_info("Created file '%s' to store the computed value", multidb_disk_space_file); fclose(fp); } else - error("Failed to store the default multidb disk quota size on '%s'", multidb_disk_space_file); + netdata_log_error("Failed to store the default multidb disk quota size on '%s'", multidb_disk_space_file); } else computed_multidb_disk_quota_mb = default_rrdeng_disk_quota_mb; diff --git a/database/engine/rrdenginelib.h b/database/engine/rrdenginelib.h index ca8eacae4..831e48531 100644 --- a/database/engine/rrdenginelib.h +++ b/database/engine/rrdenginelib.h @@ -53,7 +53,7 @@ static inline void modify_bit(unsigned *x, unsigned pos, uint8_t val) *x |= 1U << pos; break; default: - error("modify_bit() called with invalid argument."); + netdata_log_error("modify_bit() called with invalid argument."); break; } } diff --git a/database/ram/rrddim_mem.c b/database/ram/rrddim_mem.c index a417c5ae3..a434f57d1 100644 --- a/database/ram/rrddim_mem.c +++ b/database/ram/rrddim_mem.c @@ -35,15 +35,15 @@ struct mem_metric_handle { static void update_metric_handle_from_rrddim(struct mem_metric_handle *mh, RRDDIM *rd) { mh->counter = rd->rrdset->counter; - mh->entries = rd->rrdset->entries; - mh->current_entry = rd->rrdset->current_entry; + mh->entries = rd->rrdset->db.entries; + mh->current_entry = rd->rrdset->db.current_entry; mh->last_updated_s = rd->rrdset->last_updated.tv_sec; mh->update_every_s = rd->rrdset->update_every; } static void check_metric_handle_from_rrddim(struct mem_metric_handle *mh) { RRDDIM *rd = mh->rd; (void)rd; - internal_fatal(mh->entries != (size_t)rd->rrdset->entries, "RRDDIM: entries do not match"); + internal_fatal(mh->entries != (size_t)rd->rrdset->db.entries, "RRDDIM: entries do not match"); internal_fatal(mh->update_every_s != rd->rrdset->update_every, "RRDDIM: update every does not match"); } @@ -161,7 +161,7 @@ void rrddim_store_metric_flush(STORAGE_COLLECT_HANDLE *collection_handle) { storage_number empty = pack_storage_number(NAN, SN_FLAG_NONE); for(size_t i = 0; i < entries ;i++) - rd->db[i] = empty; + rd->db.data[i] = empty; mh->counter = 0; mh->last_updated_s = 0; @@ -192,7 +192,7 @@ static inline void rrddim_fill_the_gap(STORAGE_COLLECT_HANDLE *collection_handle // fill the dimension size_t c; for(c = 0; c < entries && now_store_s <= now_collect_s ; now_store_s += update_every_s, c++) { - rd->db[current_entry++] = empty; + rd->db.data[current_entry++] = empty; if(unlikely(current_entry >= entries)) current_entry = 0; @@ -227,7 +227,7 @@ void rrddim_collect_store_metric(STORAGE_COLLECT_HANDLE *collection_handle, if(unlikely(mh->last_updated_s && point_in_time_s - mh->update_every_s > mh->last_updated_s)) rrddim_fill_the_gap(collection_handle, point_in_time_s); - rd->db[mh->current_entry] = pack_storage_number(n, flags); + rd->db.data[mh->current_entry] = pack_storage_number(n, flags); mh->counter++; mh->current_entry = (mh->current_entry + 1) >= mh->entries ? 
0 : mh->current_entry + 1; mh->last_updated_s = point_in_time_s; @@ -283,7 +283,7 @@ static inline size_t rrddim_time2slot(STORAGE_METRIC_HANDLE *db_metric_handle, t } if(unlikely(ret >= entries)) { - error("INTERNAL ERROR: rrddim_time2slot() on %s returns values outside entries", rrddim_name(rd)); + netdata_log_error("INTERNAL ERROR: rrddim_time2slot() on %s returns values outside entries", rrddim_name(rd)); ret = entries - 1; } @@ -304,7 +304,7 @@ static inline time_t rrddim_slot2time(STORAGE_METRIC_HANDLE *db_metric_handle, s size_t update_every = mh->update_every_s; if(slot >= entries) { - error("INTERNAL ERROR: caller of rrddim_slot2time() gives invalid slot %zu", slot); + netdata_log_error("INTERNAL ERROR: caller of rrddim_slot2time() gives invalid slot %zu", slot); slot = entries - 1; } @@ -314,14 +314,14 @@ static inline time_t rrddim_slot2time(STORAGE_METRIC_HANDLE *db_metric_handle, s ret = last_entry_s - (time_t)(update_every * (last_slot - slot)); if(unlikely(ret < first_entry_s)) { - error("INTERNAL ERROR: rrddim_slot2time() on dimension '%s' of chart '%s' returned time (%ld) too far in the past (before first_entry_s %ld) for slot %zu", + netdata_log_error("INTERNAL ERROR: rrddim_slot2time() on dimension '%s' of chart '%s' returned time (%ld) too far in the past (before first_entry_s %ld) for slot %zu", rrddim_name(rd), rrdset_id(rd->rrdset), ret, first_entry_s, slot); ret = first_entry_s; } if(unlikely(ret > last_entry_s)) { - error("INTERNAL ERROR: rrddim_slot2time() on dimension '%s' of chart '%s' returned time (%ld) too far into the future (after last_entry_s %ld) for slot %zu", + netdata_log_error("INTERNAL ERROR: rrddim_slot2time() on dimension '%s' of chart '%s' returned time (%ld) too far into the future (after last_entry_s %ld) for slot %zu", rrddim_name(rd), rrdset_id(rd->rrdset), ret, last_entry_s, slot); ret = last_entry_s; @@ -353,7 +353,7 @@ void rrddim_query_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_e h->slot_timestamp = rrddim_slot2time(db_metric_handle, h->slot); h->last_timestamp = rrddim_slot2time(db_metric_handle, h->last_slot); - // info("RRDDIM QUERY INIT: start %ld, end %ld, next %ld, first %ld, last %ld, dt %ld", start_time, end_time, h->next_timestamp, h->slot_timestamp, h->last_timestamp, h->dt); + // netdata_log_info("RRDDIM QUERY INIT: start %ld, end %ld, next %ld, first %ld, last %ld, dt %ld", start_time, end_time, h->next_timestamp, h->slot_timestamp, h->last_timestamp, h->dt); __atomic_add_fetch(&rrddim_db_memory_size, sizeof(struct mem_query_handle), __ATOMIC_RELAXED); handle->handle = (STORAGE_QUERY_HANDLE *)h; @@ -390,7 +390,7 @@ STORAGE_POINT rrddim_query_next_metric(struct storage_engine_query_handle *handl return sp; } - storage_number n = rd->db[slot++]; + storage_number n = rd->db.data[slot++]; if(unlikely(slot >= entries)) slot = 0; h->slot = slot; diff --git a/database/rrd.c b/database/rrd.c index d489ddb8b..5b7752a5e 100644 --- a/database/rrd.c +++ b/database/rrd.c @@ -148,7 +148,7 @@ char *rrdhost_cache_dir_for_rrdset_alloc(RRDHOST *host, const char *id) { if(host->rrd_memory_mode == RRD_MEMORY_MODE_MAP || host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE) { int r = mkdir(ret, 0775); if(r != 0 && errno != EEXIST) - error("Cannot create directory '%s'", ret); + netdata_log_error("Cannot create directory '%s'", ret); } return ret; diff --git a/database/rrd.h b/database/rrd.h index 3f125c5a7..95da17c82 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -7,6 +7,8 @@ extern "C" { #endif +#include "libnetdata/libnetdata.h" + 
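/* The context lines that follow declare deliberately undefined struct types
 * (STORAGE_INSTANCE and friends), so every storage handle gets a distinct
 * pointer type. A hedged sketch of what that buys over void* (the typedef
 * names are from the header; the calls are illustrative only):
 *
 *     typedef struct storage_instance STORAGE_INSTANCE;           // no body
 *     typedef struct storage_metric_handle STORAGE_METRIC_HANDLE; // no body
 *
 *     void storage_query(STORAGE_INSTANCE *si);
 *
 *     void f(STORAGE_METRIC_HANDLE *mh) {
 *         storage_query(mh);   // compile-time error, not a runtime bug
 *     }
 *
 * Had both handles been void*, the mismatched call would compile silently. */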
// non-existing structs instead of voids // to enable type checking at compile time typedef struct storage_instance STORAGE_INSTANCE; @@ -95,6 +97,14 @@ extern RRD_MEMORY_MODE default_rrd_memory_mode; const char *rrd_memory_mode_name(RRD_MEMORY_MODE id); RRD_MEMORY_MODE rrd_memory_mode_id(const char *name); +struct ml_metrics_statistics { + size_t anomalous; + size_t normal; + size_t trained; + size_t pending; + size_t silenced; +}; + #include "daemon/common.h" #include "web/api/queries/query.h" #include "web/api/queries/rrdr.h" @@ -225,13 +235,15 @@ typedef enum __attribute__ ((__packed__)) rrddim_options { RRDDIM_OPTION_HIDDEN = (1 << 0), // this dimension will not be offered to callers RRDDIM_OPTION_DONT_DETECT_RESETS_OR_OVERFLOWS = (1 << 1), // do not offer RESET or OVERFLOW info to callers RRDDIM_OPTION_BACKFILLED_HIGH_TIERS = (1 << 2), // when set, we have backfilled higher tiers + RRDDIM_OPTION_UPDATED = (1 << 3), // single-threaded collector updated flag + RRDDIM_OPTION_EXPOSED = (1 << 4), // single-threaded collector exposed flag // this is 8-bit } RRDDIM_OPTIONS; -#define rrddim_option_check(rd, option) ((rd)->options & (option)) -#define rrddim_option_set(rd, option) (rd)->options |= (option) -#define rrddim_option_clear(rd, option) (rd)->options &= ~(option) +#define rrddim_option_check(rd, option) ((rd)->collector.options & (option)) +#define rrddim_option_set(rd, option) (rd)->collector.options |= (option) +#define rrddim_option_clear(rd, option) (rd)->collector.options &= ~(option) // flags are runtime changing status flags (atomics are required to alter/access them) typedef enum __attribute__ ((__packed__)) rrddim_flags { @@ -301,7 +313,7 @@ void rrdlabels_copy(DICTIONARY *dst, DICTIONARY *src); void reload_host_labels(void); void rrdset_update_rrdlabels(RRDSET *st, DICTIONARY *new_rrdlabels); void rrdset_save_rrdlabels_to_sql(RRDSET *st); -void rrdhost_set_is_parent_label(int count); +void rrdhost_set_is_parent_label(void); int rrdlabels_unittest(void); // unfortunately this break when defined in exporting_engine.h @@ -340,70 +352,76 @@ struct rrddim { STRING *name; // the name of this dimension (as presented to user) RRD_ALGORITHM algorithm; // the algorithm that is applied to add new collected values - RRDDIM_OPTIONS options; // permanent configuration options RRD_MEMORY_MODE rrd_memory_mode; // the memory mode for this dimension RRDDIM_FLAGS flags; // run time changing status flags - bool updated; // 1 when the dimension has been updated since the last processing - bool exposed; // 1 when set what have sent this dimension to the central netdata - - collected_number multiplier; // the multiplier of the collected values - collected_number divisor; // the divider of the collected values - - int update_every; // every how many seconds is this updated - // TODO - remove update_every from rrddim - // it is always the same in rrdset + int32_t multiplier; // the multiplier of the collected values + int32_t divisor; // the divider of the collected values // ------------------------------------------------------------------------ // operational state members + struct rrdset *rrdset; rrd_ml_dimension_t *ml_dimension; // machine learning data about this dimension + RRDMETRIC_ACQUIRED *rrdmetric; // the rrdmetric of this dimension - // ------------------------------------------------------------------------ - // linking to siblings and parents +#ifdef NETDATA_LOG_COLLECTION_ERRORS + usec_t rrddim_store_metric_last_ut; // the timestamp we last called rrddim_store_metric() + size_t 
rrddim_store_metric_count; // the rrddim_store_metric() counter + const char *rrddim_store_metric_last_caller; // the name of the function that last called rrddim_store_metric() +#endif - struct rrdset *rrdset; + // ------------------------------------------------------------------------ + // db mode RAM, SAVE, MAP, ALLOC, NONE specifics + // TODO - they should be managed by storage engine + // (RRDDIM_DB_STATE ptr to an undefined structure, and a call to clean this up during destruction) - RRDMETRIC_ACQUIRED *rrdmetric; // the rrdmetric of this dimension + struct { + size_t memsize; // the memory allocated for this dimension (without RRDDIM) + void *rd_on_file; // pointer to the header written on disk + storage_number *data; // the array of values + } db; // ------------------------------------------------------------------------ // data collection members - struct rrddim_tier tiers[RRD_STORAGE_TIERS]; // our tiers of databases + struct { + RRDDIM_OPTIONS options; // permanent configuration options - struct timeval last_collected_time; // when was this dimension last updated - // this is actual date time we updated the last_collected_value - // THIS IS DIFFERENT FROM THE SAME MEMBER OF RRDSET + uint32_t counter; // the number of times we added values to this rrddim - size_t collections_counter; // the number of times we added values to this rrddim - collected_number collected_value_max; // the absolute maximum of the collected value + collected_number collected_value; // the current value, as collected - resets to 0 after being used + collected_number collected_value_max; // the absolute maximum of the collected value + collected_number last_collected_value; // the last value that was collected, after being processed - NETDATA_DOUBLE calculated_value; // the current calculated value, after applying the algorithm - resets to zero after being used - NETDATA_DOUBLE last_calculated_value; // the last calculated value processed - NETDATA_DOUBLE last_stored_value; // the last value as stored in the database (after interpolation) + struct timeval last_collected_time; // when was this dimension last updated + // this is actual date time we updated the last_collected_value + // THIS IS DIFFERENT FROM THE SAME MEMBER OF RRDSET - collected_number collected_value; // the current value, as collected - resets to 0 after being used - collected_number last_collected_value; // the last value that was collected, after being processed + NETDATA_DOUBLE calculated_value; // the current calculated value, after applying the algorithm - resets to zero after being used + NETDATA_DOUBLE last_calculated_value; // the last calculated value processed -#ifdef NETDATA_LOG_COLLECTION_ERRORS - usec_t rrddim_store_metric_last_ut; // the timestamp we last called rrddim_store_metric() - size_t rrddim_store_metric_count; // the rrddim_store_metric() counter - const char *rrddim_store_metric_last_caller; // the name of the function that last called rrddim_store_metric() -#endif + NETDATA_DOUBLE last_stored_value; // the last value as stored in the database (after interpolation) + } collector; // ------------------------------------------------------------------------ - // db mode RAM, SAVE, MAP, ALLOC, NONE specifics - // TODO - they should be managed by storage engine - // (RRDDIM_DB_STATE ptr to an undefined structure, and a call to clean this up during destruction) - size_t memsize; // the memory allocated for this dimension (without RRDDIM) - void *rd_on_file; // pointer to the header written on disk - storage_number *db; 
// the array of values + struct rrddim_tier tiers[]; // our tiers of databases }; +size_t rrddim_size(void); + #define rrddim_id(rd) string2str((rd)->id) #define rrddim_name(rd) string2str((rd) ->name) +#define rrddim_check_updated(rd) ((rd)->collector.options & RRDDIM_OPTION_UPDATED) +#define rrddim_set_updated(rd) (rd)->collector.options |= RRDDIM_OPTION_UPDATED +#define rrddim_clear_updated(rd) (rd)->collector.options &= ~RRDDIM_OPTION_UPDATED + +#define rrddim_check_exposed(rd) ((rd)->collector.options & RRDDIM_OPTION_EXPOSED) +#define rrddim_set_exposed(rd) (rd)->collector.options |= RRDDIM_OPTION_EXPOSED +#define rrddim_clear_exposed(rd) (rd)->collector.options &= ~RRDDIM_OPTION_EXPOSED + // returns the RRDDIM cache filename, or NULL if it does not exist const char *rrddim_cache_filename(RRDDIM *rd); @@ -487,6 +505,48 @@ static inline void storage_engine_store_metric( count, anomaly_count, flags); } +size_t rrdeng_disk_space_max(STORAGE_INSTANCE *db_instance); +static inline size_t storage_engine_disk_space_max(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_INSTANCE *db_instance) { +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_disk_space_max(db_instance); +#endif + + return 0; +} + +size_t rrdeng_disk_space_used(STORAGE_INSTANCE *db_instance); +static inline size_t storage_engine_disk_space_used(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_INSTANCE *db_instance) { +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_disk_space_used(db_instance); +#endif + + // TODO - calculate the total host disk space for memory mode save and map + return 0; +} + +time_t rrdeng_global_first_time_s(STORAGE_INSTANCE *db_instance); +static inline time_t storage_engine_global_first_time_s(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_INSTANCE *db_instance) { +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_global_first_time_s(db_instance); +#endif + + return now_realtime_sec() - (time_t)(default_rrd_history_entries * default_rrd_update_every); +} + +size_t rrdeng_currently_collected_metrics(STORAGE_INSTANCE *db_instance); +static inline size_t storage_engine_collected_metrics(STORAGE_ENGINE_BACKEND backend __maybe_unused, STORAGE_INSTANCE *db_instance) { +#ifdef ENABLE_DBENGINE + if(likely(backend == STORAGE_ENGINE_BACKEND_DBENGINE)) + return rrdeng_currently_collected_metrics(db_instance); +#endif + + // TODO - calculate the total host disk space for memory mode save and map + return 0; +} + void rrdeng_store_metric_flush_current_page(STORAGE_COLLECT_HANDLE *collection_handle); void rrddim_store_metric_flush(STORAGE_COLLECT_HANDLE *collection_handle); static inline void storage_engine_store_flush(STORAGE_COLLECT_HANDLE *collection_handle) { @@ -739,18 +799,15 @@ struct rrdset { STRING *plugin_name; // the name of the plugin that generated this STRING *module_name; // the name of the plugin module that generated this - RRDSET_TYPE chart_type; // line, area, stacked - - long priority; // the sorting priority of this chart - - int update_every; // data collection frequency + int32_t priority; // the sorting priority of this chart + int32_t update_every; // data collection frequency DICTIONARY *rrdlabels; // chart labels DICTIONARY *rrdsetvar_root_index; // chart variables DICTIONARY *rrddimvar_root_index; // dimension variables // we use this dictionary to manage their allocation - rrd_ml_chart_t *ml_chart; + RRDSET_TYPE chart_type; // 
line, area, stacked // ------------------------------------------------------------------------ // operational state members @@ -760,6 +817,8 @@ struct rrdset { DICTIONARY *rrddim_root_index; // dimensions index + rrd_ml_chart_t *ml_chart; + STORAGE_METRICS_GROUP *storage_metrics_groups[RRD_STORAGE_TIERS]; // ------------------------------------------------------------------------ @@ -775,11 +834,10 @@ struct rrdset { SPINLOCK data_collection_lock; - size_t counter; // the number of times we added values to this database - size_t counter_done; // the number of times rrdset_done() has been called + uint32_t counter; // the number of times we added values to this database + uint32_t counter_done; // the number of times rrdset_done() has been called time_t last_accessed_time_s; // the last time this RRDSET has been accessed - usec_t usec_since_last_update; // the time in microseconds since the last collection of data struct timeval last_updated; // when this data set was last updated (updated every time the rrd_stats_done() function) @@ -799,18 +857,15 @@ struct rrdset { // TODO - they should be managed by storage engine // (RRDSET_DB_STATE ptr to an undefined structure, and a call to clean this up during destruction) - char *cache_dir; // the directory to store dimensions - void *st_on_file; // compatibility with V019 RRDSET files - - // ------------------------------------------------------------------------ - // db mode RAM, SAVE, MAP, ALLOC, NONE specifics - // TODO - they should be managed by storage engine - // (RRDSET_DB_STATE ptr to an undefined structure, and a call to clean this up during destruction) + struct { + char *cache_dir; // the directory to store dimensions + void *st_on_file; // compatibility with V019 RRDSET files - long entries; // total number of entries in the data set + int32_t entries; // total number of entries in the data set - long current_entry; // the entry that is currently being updated - // it goes around in a round-robin fashion + int32_t current_entry; // the entry that is currently being updated + // it goes around in a round-robin fashion + } db; // ------------------------------------------------------------------------ // exporting to 3rd party time-series members @@ -831,14 +886,14 @@ struct rrdset { const RRDFAMILY_ACQUIRED *rrdfamily; // pointer to RRDFAMILY dictionary item, this chart belongs to struct { - netdata_rwlock_t rwlock; // protection for RRDCALC *base + RW_SPINLOCK spinlock; // protection for RRDCALC *base RRDCALC *base; // double linked list of RRDCALC related to this RRDSET } alerts; struct { - size_t pos; - size_t size; - size_t used; + uint32_t pos; + uint32_t size; + uint32_t used; RRDDIM_ACQUIRED **rda; } pluginsd; @@ -924,6 +979,7 @@ typedef enum __attribute__ ((__packed__)) rrdhost_flags { // ACLK RRDHOST_FLAG_ACLK_STREAM_CONTEXTS = (1 << 21), // when set, we should send ACLK stream context updates RRDHOST_FLAG_ACLK_STREAM_ALERTS = (1 << 22), // set when the receiver part is disconnected + // Metadata RRDHOST_FLAG_METADATA_UPDATE = (1 << 23), // metadata needs to be stored in the database RRDHOST_FLAG_METADATA_LABELS = (1 << 24), // metadata needs to be stored in the database @@ -959,6 +1015,8 @@ typedef enum __attribute__ ((__packed__)) { RRDHOST_OPTION_DELETE_ORPHAN_HOST = (1 << 4), // delete the entire host when orphan RRDHOST_OPTION_REPLICATION = (1 << 5), // when set, we support replication for this host + + RRDHOST_OPTION_VIRTUAL_HOST = (1 << 6), // when set, this host is a virtual one } RRDHOST_OPTIONS; #define 
rrdhost_option_check(host, flag) ((host)->options & (flag)) @@ -976,6 +1034,7 @@ struct alarm_entry { uint32_t unique_id; uint32_t alarm_id; uint32_t alarm_event_id; + usec_t global_id; uuid_t config_hash_id; uuid_t transition_id; @@ -1046,8 +1105,9 @@ typedef struct alarm_log { uint32_t next_alarm_id; unsigned int count; unsigned int max; + uint32_t health_log_history; // the health log history in seconds to be kept in db ALARM_ENTRY *alarms; - netdata_rwlock_t alarm_log_rwlock; + RW_SPINLOCK spinlock; } ALARM_LOG; typedef struct health { @@ -1124,8 +1184,8 @@ struct rrdhost { RRDHOST_FLAGS flags; // runtime flags about this RRDHOST (atomics on this) RRDHOST_FLAGS *exporting_flags; // array of flags for exporting connector instances - int rrd_update_every; // the update frequency of the host - long rrd_history_entries; // the number of history entries for the host's charts + int32_t rrd_update_every; // the update frequency of the host + int32_t rrd_history_entries; // the number of history entries for the host's charts RRD_MEMORY_MODE rrd_memory_mode; // the configured memory more for the charts of this host // the actual per tier is at .db[tier].mode @@ -1150,7 +1210,7 @@ struct rrdhost { struct rrdpush_destinations *destination; // the current destination from the above list SIMPLE_PATTERN *rrdpush_send_charts_matching; // pattern to match the charts to be sent - const char *rrdpush_last_receiver_exit_reason; + int32_t rrdpush_last_receiver_exit_reason; time_t rrdpush_seconds_to_replicate; // max time we want to replicate from the child time_t rrdpush_replication_step; // seconds per replication step size_t rrdpush_receiver_replicating_charts; // the number of charts currently being replicated from a child @@ -1163,6 +1223,9 @@ struct rrdhost { size_t rrdpush_sender_replicating_charts; // the number of charts currently being replicated to a parent void *aclk_sync_host_config; + uint32_t rrdpush_receiver_connection_counter; // the number of times this receiver has connected + uint32_t rrdpush_sender_connection_counter; // the number of times this sender has connected + // ------------------------------------------------------------------------ // streaming of data from remote hosts - rrdpush receiver @@ -1225,8 +1288,16 @@ struct rrdhost { DICTIONARY *contexts; DICTIONARY *hub_queue; DICTIONARY *pp_queue; + uint32_t metrics; + uint32_t instances; } rrdctx; + struct { + SPINLOCK spinlock; + time_t first_time_s; + time_t last_time_s; + } retention; + uuid_t host_uuid; // Global GUID for this host uuid_t *node_id; // Cloud node_id @@ -1260,6 +1331,9 @@ extern RRDHOST *localhost; #define rrdhost_sender_replicating_charts_minus_one(host) (__atomic_sub_fetch(&((host)->rrdpush_sender_replicating_charts), 1, __ATOMIC_RELAXED)) #define rrdhost_sender_replicating_charts_zero(host) (__atomic_store_n(&((host)->rrdpush_sender_replicating_charts), 0, __ATOMIC_RELAXED)) +#define rrdhost_is_online(host) ((host) == localhost || rrdhost_option_check(host, RRDHOST_OPTION_VIRTUAL_HOST) || !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN | RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED)) +bool rrdhost_matches_window(RRDHOST *host, time_t after, time_t before, time_t now); + extern DICTIONARY *rrdhost_root_index; size_t rrdhost_hosts_available(void); @@ -1457,8 +1531,8 @@ RRDDIM *rrddim_add_custom(RRDSET *st int rrddim_reset_name(RRDSET *st, RRDDIM *rd, const char *name); int rrddim_set_algorithm(RRDSET *st, RRDDIM *rd, RRD_ALGORITHM algorithm); -int rrddim_set_multiplier(RRDSET *st, RRDDIM *rd, collected_number 
multiplier); -int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, collected_number divisor); +int rrddim_set_multiplier(RRDSET *st, RRDDIM *rd, int32_t multiplier); +int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, int32_t divisor); RRDDIM *rrddim_find(RRDSET *st, const char *id); RRDDIM_ACQUIRED *rrddim_find_and_acquire(RRDSET *st, const char *id); @@ -1522,6 +1596,20 @@ void set_host_properties( size_t get_tier_grouping(size_t tier); void store_metric_collection_completed(void); +static inline void rrdhost_retention(RRDHOST *host, time_t now, bool online, time_t *from, time_t *to) { + time_t first_time_s = 0, last_time_s = 0; + spinlock_lock(&host->retention.spinlock); + first_time_s = host->retention.first_time_s; + last_time_s = host->retention.last_time_s; + spinlock_unlock(&host->retention.spinlock); + + if(from) + *from = first_time_s; + + if(to) + *to = online ? now : last_time_s; +} + // ---------------------------------------------------------------------------- // RRD DB engine declarations diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 948ebe8a5..d85825c50 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -5,6 +5,33 @@ // ---------------------------------------------------------------------------- // RRDCALC helpers +void rrdcalc_flags_to_json_array(BUFFER *wb, const char *key, RRDCALC_FLAGS flags) { + buffer_json_member_add_array(wb, key); + + if(flags & RRDCALC_FLAG_DB_ERROR) + buffer_json_add_array_item_string(wb, "DB_ERROR"); + if(flags & RRDCALC_FLAG_DB_NAN) + buffer_json_add_array_item_string(wb, "DB_NAN"); + if(flags & RRDCALC_FLAG_CALC_ERROR) + buffer_json_add_array_item_string(wb, "CALC_ERROR"); + if(flags & RRDCALC_FLAG_WARN_ERROR) + buffer_json_add_array_item_string(wb, "WARN_ERROR"); + if(flags & RRDCALC_FLAG_CRIT_ERROR) + buffer_json_add_array_item_string(wb, "CRIT_ERROR"); + if(flags & RRDCALC_FLAG_RUNNABLE) + buffer_json_add_array_item_string(wb, "RUNNABLE"); + if(flags & RRDCALC_FLAG_DISABLED) + buffer_json_add_array_item_string(wb, "DISABLED"); + if(flags & RRDCALC_FLAG_SILENCED) + buffer_json_add_array_item_string(wb, "SILENCED"); + if(flags & RRDCALC_FLAG_RUN_ONCE) + buffer_json_add_array_item_string(wb, "RUN_ONCE"); + if(flags & RRDCALC_FLAG_FROM_TEMPLATE) + buffer_json_add_array_item_string(wb, "FROM_TEMPLATE"); + + buffer_json_array_close(wb); +} + inline const char *rrdcalc_status2string(RRDCALC_STATUS status) { switch(status) { case RRDCALC_STATUS_REMOVED: @@ -29,18 +56,18 @@ inline const char *rrdcalc_status2string(RRDCALC_STATUS status) { return "CRITICAL"; default: - error("Unknown alarm status %d", status); + netdata_log_error("Unknown alarm status %d", status); return "UNKNOWN"; } } -uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id) { - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); +uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id) { + rw_spinlock_read_lock(&host->health_log.spinlock); // re-use old IDs, by looking them up in the alarm log ALARM_ENTRY *ae = NULL; for(ae = host->health_log.alarms; ae ;ae = ae->next) { - if(unlikely(name == ae->name && chart == ae->chart)) { + if(unlikely(name == ae->name && chart == ae->chart && !uuid_memcmp(&ae->config_hash_id, config_hash_id))) { if(next_event_id) *next_event_id = ae->alarm_event_id + 1; break; } @@ -52,13 +79,17 @@ uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint3 alarm_id = ae->alarm_id; else { - if 
(unlikely(!host->health_log.next_alarm_id)) - host->health_log.next_alarm_id = (uint32_t)now_realtime_sec(); + alarm_id = sql_get_alarm_id(host, chart, name, next_event_id, config_hash_id); - alarm_id = host->health_log.next_alarm_id++; + if (!alarm_id) { + if (unlikely(!host->health_log.next_alarm_id)) + host->health_log.next_alarm_id = (uint32_t)now_realtime_sec(); + + alarm_id = host->health_log.next_alarm_id++; + } } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); return alarm_id; } @@ -175,28 +206,29 @@ RRDCALC *rrdcalc_acquired_to_rrdcalc(const RRDCALC_ACQUIRED *rca) { static void rrdcalc_link_to_rrdset(RRDSET *st, RRDCALC *rc) { RRDHOST *host = st->rrdhost; - debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rrdcalc_chart_name(rc), rrdcalc_name(rc), rrdset_id(st), rrdhost_hostname(host)); + netdata_log_debug(D_HEALTH, "Health linking alarm '%s.%s' to chart '%s' of host '%s'", rrdcalc_chart_name(rc), rrdcalc_name(rc), rrdset_id(st), rrdhost_hostname(host)); + rc->last_status_change_value = rc->value; rc->last_status_change = now_realtime_sec(); rc->rrdset = st; - netdata_rwlock_wrlock(&st->alerts.rwlock); + rw_spinlock_write_lock(&st->alerts.spinlock); DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(st->alerts.base, rc, prev, next); - netdata_rwlock_unlock(&st->alerts.rwlock); + rw_spinlock_write_unlock(&st->alerts.spinlock); if(rc->update_every < rc->rrdset->update_every) { - error("Health alarm '%s.%s' has update every %d, less than chart update every %d. Setting alarm update frequency to %d.", rrdset_id(rc->rrdset), rrdcalc_name(rc), rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every); + netdata_log_error("Health alarm '%s.%s' has update every %d, less than chart update every %d. 
Setting alarm update frequency to %d.", rrdset_id(rc->rrdset), rrdcalc_name(rc), rc->update_every, rc->rrdset->update_every, rc->rrdset->update_every); rc->update_every = rc->rrdset->update_every; } if(!isnan(rc->green) && isnan(st->green)) { - debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from " NETDATA_DOUBLE_FORMAT_AUTO + netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' green threshold set from " NETDATA_DOUBLE_FORMAT_AUTO " to " NETDATA_DOUBLE_FORMAT_AUTO ".", rrdset_id(rc->rrdset), rrdcalc_name(rc), rc->rrdset->green, rc->green); st->green = rc->green; } if(!isnan(rc->red) && isnan(st->red)) { - debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from " NETDATA_DOUBLE_FORMAT_AUTO " to " NETDATA_DOUBLE_FORMAT_AUTO + netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' red threshold set from " NETDATA_DOUBLE_FORMAT_AUTO " to " NETDATA_DOUBLE_FORMAT_AUTO ".", rrdset_id(rc->rrdset), rrdcalc_name(rc), rc->rrdset->red, rc->red); st->red = rc->red; } @@ -269,14 +301,15 @@ static void rrdcalc_link_to_rrdset(RRDSET *st, RRDCALC *rc) { now - rc->last_status_change, rc->old_value, rc->value, + RRDCALC_STATUS_REMOVED, rc->status, - RRDCALC_STATUS_UNINITIALIZED, rc->source, rc->units, rc->info, 0, rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); + rc->ae = ae; health_alarm_log_add_entry(host, ae); } @@ -284,8 +317,8 @@ static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) { RRDSET *st = rc->rrdset; if(!st) { - debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rrdcalc_chart_name(rc), rrdcalc_name(rc)); - error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_error("Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return; } @@ -320,20 +353,21 @@ static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) { 0, 0); + rc->ae = ae; health_alarm_log_add_entry(host, ae); } - debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rrdcalc_chart_name(rc), rrdcalc_name(rc), rrdset_id(st), rrdhost_hostname(host)); + netdata_log_debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rrdcalc_chart_name(rc), rrdcalc_name(rc), rrdset_id(st), rrdhost_hostname(host)); // unlink it if(!having_ll_wrlock) - netdata_rwlock_wrlock(&st->alerts.rwlock); + rw_spinlock_write_lock(&st->alerts.spinlock); DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(st->alerts.base, rc, prev, next); if(!having_ll_wrlock) - netdata_rwlock_unlock(&st->alerts.rwlock); + rw_spinlock_write_unlock(&st->alerts.spinlock); rc->rrdset = NULL; @@ -378,7 +412,7 @@ static inline bool rrdcalc_check_if_it_matches_rrdset(RRDCALC *rc, RRDSET *st) { void rrdcalc_link_matching_alerts_to_rrdset(RRDSET *st) { RRDHOST *host = st->rrdhost; - // debug(D_HEALTH, "find matching alarms for chart '%s'", st->id); + // netdata_log_debug(D_HEALTH, "find matching alarms for chart '%s'", st->id); RRDCALC *rc; foreach_rrdcalc_in_rrdhost_read(host, rc) { @@ -478,17 +512,17 @@ static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_ if(rt->calculation) { rc->calculation = expression_parse(rt->calculation->source, NULL, NULL); if(!rc->calculation) - error("Health alarm '%s.%s': failed to parse calculation expression '%s'", 
rrdset_id(st), rrdcalctemplate_name(rt), rt->calculation->source); + netdata_log_error("Health alarm '%s.%s': failed to parse calculation expression '%s'", rrdset_id(st), rrdcalctemplate_name(rt), rt->calculation->source); } if(rt->warning) { rc->warning = expression_parse(rt->warning->source, NULL, NULL); if(!rc->warning) - error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", rrdset_id(st), rrdcalctemplate_name(rt), rt->warning->source); + netdata_log_error("Health alarm '%s.%s': failed to re-parse warning expression '%s'", rrdset_id(st), rrdcalctemplate_name(rt), rt->warning->source); } if(rt->critical) { rc->critical = expression_parse(rt->critical->source, NULL, NULL); if(!rc->critical) - error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", rrdset_id(st), rrdcalctemplate_name(rt), rt->critical->source); + netdata_log_error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", rrdset_id(st), rrdcalctemplate_name(rt), rt->critical->source); } } else if(ctr->from_config) { @@ -497,7 +531,7 @@ static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_ ; } - rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); + rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id, &rc->config_hash_id); if(rc->calculation) { rc->calculation->status = &rc->status; @@ -523,7 +557,7 @@ static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_ rc->critical->rrdcalc = rc; } - debug(D_HEALTH, "Health added alarm '%s.%s': exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO + netdata_log_debug(D_HEALTH, "Health added alarm '%s.%s': exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO ", red " NETDATA_DOUBLE_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rrdcalc_chart_name(rc), @@ -669,23 +703,23 @@ void rrdcalc_add_from_rrdcalctemplate(RRDHOST *host, RRDCALCTEMPLATE *rt, RRDSET dictionary_set_advanced(host->rrdcalc_root_index, key, (ssize_t)(key_len + 1), NULL, sizeof(RRDCALC), &tmp); if(tmp.react_action != RRDCALC_REACT_NEW && tmp.existing_from_template == false) - error("RRDCALC: from template '%s' on chart '%s' with key '%s', failed to be added to host '%s'. It is manually configured.", + netdata_log_error("RRDCALC: from template '%s' on chart '%s' with key '%s', failed to be added to host '%s'. It is manually configured.", string2str(rt->name), rrdset_id(st), key, rrdhost_hostname(host)); } int rrdcalc_add_from_config(RRDHOST *host, RRDCALC *rc) { if(!rc->chart) { - error("Health configuration for alarm '%s' does not have a chart", rrdcalc_name(rc)); + netdata_log_error("Health configuration for alarm '%s' does not have a chart", rrdcalc_name(rc)); return 0; } if(!rc->update_every) { - error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). 
Ignoring it.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->calculation && !rc->warning && !rc->critical) { - error("Health configuration for alarm '%s.%s' is useless (no db lookup, no calculation, no warning and no critical expressions)", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_error("Health configuration for alarm '%s.%s' is useless (no db lookup, no calculation, no warning and no critical expressions)", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -716,7 +750,7 @@ int rrdcalc_add_from_config(RRDHOST *host, RRDCALC *rc) { rrdset_foreach_done(st); } else { - error( + netdata_log_error( "RRDCALC: from config '%s' on chart '%s' failed to be added to host '%s'. It already exists.", string2str(rc->name), string2str(rc->chart), @@ -749,7 +783,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host continue; if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=', NULL)) { - log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", + netdata_log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", rrdcalc_name(rc), rrdhost_hostname(host), rrdcalc_host_labels(rc)); @@ -774,10 +808,10 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts() { void rrdcalc_unlink_all_rrdset_alerts(RRDSET *st) { RRDCALC *rc, *last = NULL; - netdata_rwlock_wrlock(&st->alerts.rwlock); + rw_spinlock_write_lock(&st->alerts.spinlock); while((rc = st->alerts.base)) { if(last == rc) { - error("RRDCALC: malformed list of alerts linked to chart - cannot cleanup - giving up."); + netdata_log_error("RRDCALC: malformed list of alerts linked to chart - cannot cleanup - giving up."); break; } last = rc; @@ -793,7 +827,7 @@ void rrdcalc_unlink_all_rrdset_alerts(RRDSET *st) { } } - netdata_rwlock_unlock(&st->alerts.rwlock); + rw_spinlock_write_unlock(&st->alerts.spinlock); } void rrdcalc_delete_all(RRDHOST *host) { diff --git a/database/rrdcalc.h b/database/rrdcalc.h index 3b48d74ec..2081452c7 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -28,6 +28,8 @@ typedef enum { RRDCALC_FLAG_FROM_TEMPLATE = (1 << 10), // the rrdcalc has been created from a template } RRDCALC_FLAGS; +void rrdcalc_flags_to_json_array(BUFFER *wb, const char *key, RRDCALC_FLAGS flags); + typedef enum { // This list uses several other options from RRDR_OPTIONS for db lookups. // To add an item here, you need to reserve a bit in RRDR_OPTIONS. 
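The rrdcalc_flags_to_json_array() helper added in rrdcalc.c above walks the RRDCALC_FLAGS bits and emits one string per set flag. A standalone sketch of the same bit-to-name serialization, using plain stdio and illustrative flag values instead of netdata's BUFFER / buffer_json API:

    #include <stdio.h>

    /* illustrative subset of RRDCALC_FLAGS; the real bit positions differ */
    typedef enum {
        DEMO_FLAG_DB_ERROR = (1 << 0),
        DEMO_FLAG_DB_NAN   = (1 << 1),
        DEMO_FLAG_DISABLED = (1 << 2),
        DEMO_FLAG_SILENCED = (1 << 3)
    } DEMO_FLAGS;

    static const struct {
        DEMO_FLAGS flag;
        const char *name;
    } demo_flag_names[] = {
        { DEMO_FLAG_DB_ERROR, "DB_ERROR" },
        { DEMO_FLAG_DB_NAN,   "DB_NAN"   },
        { DEMO_FLAG_DISABLED, "DISABLED" },
        { DEMO_FLAG_SILENCED, "SILENCED" }
    };

    /* emit "key":["NAME",...] for every bit set in flags */
    static void flags_to_json_array(FILE *out, const char *key, DEMO_FLAGS flags) {
        fprintf(out, "\"%s\":[", key);
        const char *sep = "";
        for (size_t i = 0; i < sizeof(demo_flag_names) / sizeof(demo_flag_names[0]); i++) {
            if (flags & demo_flag_names[i].flag) {
                fprintf(out, "%s\"%s\"", sep, demo_flag_names[i].name);
                sep = ",";
            }
        }
        fprintf(out, "]\n");
    }

    int main(void) {
        /* prints: "flags":["DB_NAN","SILENCED"] */
        flags_to_json_array(stdout, "flags", DEMO_FLAG_DB_NAN | DEMO_FLAG_SILENCED);
        return 0;
    }

The real function brackets the loop with buffer_json_member_add_array() and buffer_json_array_close(), but the flag-table walk is the same idea.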
@@ -120,6 +122,7 @@ struct rrdcalc { NETDATA_DOUBLE value; // the current value of the alarm NETDATA_DOUBLE old_value; // the previous value of the alarm + NETDATA_DOUBLE last_status_change_value; // the value at the last status change RRDCALC_FLAGS run_flags; // check RRDCALC_FLAG_* @@ -136,6 +139,7 @@ struct rrdcalc { int delay_up_current; // the current up notification delay duration int delay_down_current; // the current down notification delay duration int delay_last; // the last delay we used + ALARM_ENTRY *ae; // last alarm entry // ------------------------------------------------------------------------ // variables this alarm exposes to the rest of the alarms @@ -211,6 +215,7 @@ struct alert_config { STRING *repeat; STRING *host_labels; STRING *chart_labels; + STRING *source; STRING *p_db_lookup_dimensions; STRING *p_db_lookup_method; @@ -235,7 +240,7 @@ const char *rrdcalc_status2string(RRDCALC_STATUS status); void rrdcalc_free_unused_rrdcalc_loaded_from_config(RRDCALC *rc); -uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id); +uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id); void rrdcalc_add_from_rrdcalctemplate(RRDHOST *host, RRDCALCTEMPLATE *rt, RRDSET *st, const char *overwrite_alert_name, const char *overwrite_dimensions); int rrdcalc_add_from_config(RRDHOST *host, RRDCALC *rc); diff --git a/database/rrdcalctemplate.c b/database/rrdcalctemplate.c index 53630f99c..a87403963 100644 --- a/database/rrdcalctemplate.c +++ b/database/rrdcalctemplate.c @@ -143,7 +143,7 @@ static void rrdcalctemplate_insert_callback(const DICTIONARY_ITEM *item __maybe_ bool *added = added_bool; *added = true; - debug(D_HEALTH, "Health configuration adding template '%s'" + netdata_log_debug(D_HEALTH, "Health configuration adding template '%s'" ": context '%s'" ", exec '%s'" ", recipient '%s'" @@ -223,17 +223,17 @@ static size_t rrdcalctemplate_key(char *dst, size_t dst_len, const char *name, c void rrdcalctemplate_add_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) { if(unlikely(!rt->context)) { - error("Health configuration for template '%s' does not have a context", rrdcalctemplate_name(rt)); + netdata_log_error("Health configuration for template '%s' does not have a context", rrdcalctemplate_name(rt)); return; } if(unlikely(!rt->update_every)) { - error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rrdcalctemplate_name(rt)); + netdata_log_error("Health configuration for template '%s' has no frequency (parameter 'every'). 
Ignoring it.", rrdcalctemplate_name(rt)); return; } if(unlikely(!RRDCALCTEMPLATE_HAS_DB_LOOKUP(rt) && !rt->calculation && !rt->warning && !rt->critical)) { - error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rrdcalctemplate_name(rt)); + netdata_log_error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rrdcalctemplate_name(rt)); return; } @@ -246,7 +246,7 @@ void rrdcalctemplate_add_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) { if(added) freez(rt); else { - info("Health configuration template '%s' already exists for host '%s'.", rrdcalctemplate_name(rt), rrdhost_hostname(host)); + netdata_log_info("Health configuration template '%s' already exists for host '%s'.", rrdcalctemplate_name(rt), rrdhost_hostname(host)); rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt); } } diff --git a/database/rrddim.c b/database/rrddim.c index 496fdc61e..0f99f98df 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -46,42 +46,40 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v rd->divisor = ctr->divisor; if(!rd->divisor) rd->divisor = 1; - rd->update_every = st->update_every; - rd->rrdset = st; if(rrdset_flag_check(st, RRDSET_FLAG_STORE_FIRST)) - rd->collections_counter = 1; + rd->collector.counter = 1; if(ctr->memory_mode == RRD_MEMORY_MODE_MAP || ctr->memory_mode == RRD_MEMORY_MODE_SAVE) { if(!rrddim_memory_load_or_create_map_save(st, rd, ctr->memory_mode)) { - info("Failed to use memory mode %s for chart '%s', dimension '%s', falling back to ram", (ctr->memory_mode == RRD_MEMORY_MODE_MAP)?"map":"save", rrdset_name(st), rrddim_name(rd)); + netdata_log_info("Failed to use memory mode %s for chart '%s', dimension '%s', falling back to ram", (ctr->memory_mode == RRD_MEMORY_MODE_MAP)?"map":"save", rrdset_name(st), rrddim_name(rd)); ctr->memory_mode = RRD_MEMORY_MODE_RAM; } } if(ctr->memory_mode == RRD_MEMORY_MODE_RAM) { - size_t entries = st->entries; + size_t entries = st->db.entries; if(!entries) entries = 5; - rd->db = netdata_mmap(NULL, entries * sizeof(storage_number), MAP_PRIVATE, 1, false, NULL); - if(!rd->db) { - info("Failed to use memory mode ram for chart '%s', dimension '%s', falling back to alloc", rrdset_name(st), rrddim_name(rd)); + rd->db.data = netdata_mmap(NULL, entries * sizeof(storage_number), MAP_PRIVATE, 1, false, NULL); + if(!rd->db.data) { + netdata_log_info("Failed to use memory mode ram for chart '%s', dimension '%s', falling back to alloc", rrdset_name(st), rrddim_name(rd)); ctr->memory_mode = RRD_MEMORY_MODE_ALLOC; } else { - rd->memsize = entries * sizeof(storage_number); - __atomic_add_fetch(&rrddim_db_memory_size, rd->memsize, __ATOMIC_RELAXED); + rd->db.memsize = entries * sizeof(storage_number); + __atomic_add_fetch(&rrddim_db_memory_size, rd->db.memsize, __ATOMIC_RELAXED); } } if(ctr->memory_mode == RRD_MEMORY_MODE_ALLOC || ctr->memory_mode == RRD_MEMORY_MODE_NONE) { - size_t entries = st->entries; + size_t entries = st->db.entries; if(entries < 5) entries = 5; - rd->db = rrddim_alloc_db(entries); - rd->memsize = entries * sizeof(storage_number); - __atomic_add_fetch(&rrddim_db_memory_size, rd->memsize, __ATOMIC_RELAXED); + rd->db.data = rrddim_alloc_db(entries); + rd->db.memsize = entries * sizeof(storage_number); + __atomic_add_fetch(&rrddim_db_memory_size, rd->db.memsize, __ATOMIC_RELAXED); } rd->rrd_memory_mode = ctr->memory_mode; @@ -104,10 +102,10 @@ static void 
rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v } if(!initialized) - error("Failed to initialize all db tiers for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); + netdata_log_error("Failed to initialize all db tiers for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); if(!rd->tiers[0].db_metric_handle) - error("Failed to initialize the first db tier for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); + netdata_log_error("Failed to initialize the first db tier for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); } // initialize data collection for all tiers @@ -122,7 +120,7 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v } if(!initialized) - error("Failed to initialize data collection for all db tiers for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); + netdata_log_error("Failed to initialize data collection for all db tiers for chart '%s', dimension '%s", rrdset_name(st), rrddim_name(rd)); } if(rrdset_number_of_dimensions(st) != 0) { @@ -135,7 +133,9 @@ static void rrddim_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v if(td && (td->algorithm != rd->algorithm || ABS(td->multiplier) != ABS(rd->multiplier) || ABS(td->divisor) != ABS(rd->divisor))) { if(!rrdset_flag_check(st, RRDSET_FLAG_HETEROGENEOUS)) { #ifdef NETDATA_INTERNAL_CHECKS - info("Dimension '%s' added on chart '%s' of host '%s' is not homogeneous to other dimensions already present (algorithm is '%s' vs '%s', multiplier is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ", divisor is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ").", + netdata_log_info("Dimension '%s' added on chart '%s' of host '%s' is not homogeneous to other dimensions already " + "present (algorithm is '%s' vs '%s', multiplier is %d vs %d, " + "divisor is %d vs %d).", rrddim_name(rd), rrdset_name(st), rrdhost_hostname(host), @@ -197,7 +197,7 @@ static void rrddim_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v ml_dimension_delete(rd); - debug(D_RRD_CALLS, "rrddim_free() %s.%s", rrdset_name(st), rrddim_name(rd)); + netdata_log_debug(D_RRD_CALLS, "rrddim_free() %s.%s", rrdset_name(st), rrddim_name(rd)); if (!rrddim_finalize_collection_and_check_retention(rd) && rd->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { /* This metric has no data and no references */ @@ -223,13 +223,13 @@ static void rrddim_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v rd->tiers[tier].db_metric_handle = NULL; } - if(rd->db) { - __atomic_sub_fetch(&rrddim_db_memory_size, rd->memsize, __ATOMIC_RELAXED); + if(rd->db.data) { + __atomic_sub_fetch(&rrddim_db_memory_size, rd->db.memsize, __ATOMIC_RELAXED); if(rd->rrd_memory_mode == RRD_MEMORY_MODE_RAM) - netdata_munmap(rd->db, rd->memsize); + netdata_munmap(rd->db.data, rd->db.memsize); else - freez(rd->db); + freez(rd->db.data); } string_freez(rd->id); @@ -289,10 +289,14 @@ static void rrddim_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo rrdcontext_updated_rrddim(rd); } +size_t rrddim_size(void) { + return sizeof(RRDDIM) + storage_tiers * sizeof(struct rrddim_tier); +} + void rrddim_index_init(RRDSET *st) { if(!st->rrddim_root_index) { st->rrddim_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE, - &dictionary_stats_category_rrdset_rrddim, sizeof(RRDDIM)); + &dictionary_stats_category_rrdset_rrddim, rrddim_size()); dictionary_register_insert_callback(st->rrddim_root_index, 
rrddim_insert_callback, NULL); dictionary_register_conflict_callback(st->rrddim_root_index, rrddim_conflict_callback, NULL); @@ -314,13 +318,13 @@ static inline RRDDIM *rrddim_index_find(RRDSET *st, const char *id) { // RRDDIM - find a dimension inline RRDDIM *rrddim_find(RRDSET *st, const char *id) { - debug(D_RRD_CALLS, "rrddim_find() for chart %s, dimension %s", rrdset_name(st), id); + netdata_log_debug(D_RRD_CALLS, "rrddim_find() for chart %s, dimension %s", rrdset_name(st), id); return rrddim_index_find(st, id); } inline RRDDIM_ACQUIRED *rrddim_find_and_acquire(RRDSET *st, const char *id) { - debug(D_RRD_CALLS, "rrddim_find_and_acquire() for chart %s, dimension %s", rrdset_name(st), id); + netdata_log_debug(D_RRD_CALLS, "rrddim_find_and_acquire() for chart %s, dimension %s", rrdset_name(st), id); return (RRDDIM_ACQUIRED *)dictionary_get_and_acquire_item(st->rrddim_root_index, id); } @@ -357,7 +361,7 @@ inline int rrddim_reset_name(RRDSET *st, RRDDIM *rd, const char *name) { if(unlikely(!name || !*name || !strcmp(rrddim_name(rd), name))) return 0; - debug(D_RRD_CALLS, "rrddim_reset_name() from %s.%s to %s.%s", rrdset_name(st), rrddim_name(rd), rrdset_name(st), name); + netdata_log_debug(D_RRD_CALLS, "rrddim_reset_name() from %s.%s to %s.%s", rrdset_name(st), rrddim_name(rd), rrdset_name(st), name); STRING *old = rd->name; rd->name = rrd_string_strdupz(name); @@ -365,7 +369,7 @@ inline int rrddim_reset_name(RRDSET *st, RRDDIM *rd, const char *name) { rrddimvar_rename_all(rd); - rd->exposed = 0; + rrddim_clear_exposed(rd); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); return 1; @@ -375,35 +379,37 @@ inline int rrddim_set_algorithm(RRDSET *st, RRDDIM *rd, RRD_ALGORITHM algorithm) if(unlikely(rd->algorithm == algorithm)) return 0; - debug(D_RRD_CALLS, "Updating algorithm of dimension '%s/%s' from %s to %s", rrdset_id(st), rrddim_name(rd), rrd_algorithm_name(rd->algorithm), rrd_algorithm_name(algorithm)); + netdata_log_debug(D_RRD_CALLS, "Updating algorithm of dimension '%s/%s' from %s to %s", rrdset_id(st), rrddim_name(rd), rrd_algorithm_name(rd->algorithm), rrd_algorithm_name(algorithm)); rd->algorithm = algorithm; - rd->exposed = 0; + rrddim_clear_exposed(rd); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdcontext_updated_rrddim_algorithm(rd); return 1; } -inline int rrddim_set_multiplier(RRDSET *st, RRDDIM *rd, collected_number multiplier) { +inline int rrddim_set_multiplier(RRDSET *st, RRDDIM *rd, int32_t multiplier) { if(unlikely(rd->multiplier == multiplier)) return 0; - debug(D_RRD_CALLS, "Updating multiplier of dimension '%s/%s' from " COLLECTED_NUMBER_FORMAT " to " COLLECTED_NUMBER_FORMAT, rrdset_id(st), rrddim_name(rd), rd->multiplier, multiplier); + netdata_log_debug(D_RRD_CALLS, "Updating multiplier of dimension '%s/%s' from %d to %d", + rrdset_id(st), rrddim_name(rd), rd->multiplier, multiplier); rd->multiplier = multiplier; - rd->exposed = 0; + rrddim_clear_exposed(rd); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdcontext_updated_rrddim_multiplier(rd); return 1; } -inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, collected_number divisor) { +inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, int32_t divisor) { if(unlikely(rd->divisor == divisor)) return 0; - debug(D_RRD_CALLS, "Updating divisor of dimension '%s/%s' from " COLLECTED_NUMBER_FORMAT " to " COLLECTED_NUMBER_FORMAT, rrdset_id(st), rrddim_name(rd), rd->divisor, divisor); + 
netdata_log_debug(D_RRD_CALLS, "Updating divisor of dimension '%s/%s' from %d to %d", + rrdset_id(st), rrddim_name(rd), rd->divisor, divisor); rd->divisor = divisor; - rd->exposed = 0; + rrddim_clear_exposed(rd); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdcontext_updated_rrddim_divisor(rd); @@ -471,7 +477,7 @@ RRDDIM *rrddim_add_custom(RRDSET *st .memory_mode = memory_mode, }; - RRDDIM *rd = dictionary_set_advanced(st->rrddim_root_index, tmp.id, -1, NULL, sizeof(RRDDIM), &tmp); + RRDDIM *rd = dictionary_set_advanced(st->rrddim_root_index, tmp.id, -1, NULL, rrddim_size(), &tmp); return(rd); } @@ -487,13 +493,13 @@ void rrddim_free(RRDSET *st, RRDDIM *rd) { // RRDDIM - set dimension options int rrddim_hide(RRDSET *st, const char *id) { - debug(D_RRD_CALLS, "rrddim_hide() for chart %s, dimension %s", rrdset_name(st), id); + netdata_log_debug(D_RRD_CALLS, "rrddim_hide() for chart %s, dimension %s", rrdset_name(st), id); RRDHOST *host = st->rrdhost; RRDDIM *rd = rrddim_find(st, id); if(unlikely(!rd)) { - error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, rrdset_name(st), rrdset_id(st), rrdhost_hostname(host)); + netdata_log_error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, rrdset_name(st), rrdset_id(st), rrdhost_hostname(host)); return 1; } if (!rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN)) { @@ -507,12 +513,12 @@ int rrddim_hide(RRDSET *st, const char *id) { } int rrddim_unhide(RRDSET *st, const char *id) { - debug(D_RRD_CALLS, "rrddim_unhide() for chart %s, dimension %s", rrdset_name(st), id); + netdata_log_debug(D_RRD_CALLS, "rrddim_unhide() for chart %s, dimension %s", rrdset_name(st), id); RRDHOST *host = st->rrdhost; RRDDIM *rd = rrddim_find(st, id); if(unlikely(!rd)) { - error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, rrdset_name(st), rrdset_id(st), rrdhost_hostname(host)); + netdata_log_error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, rrdset_name(st), rrdset_id(st), rrdhost_hostname(host)); return 1; } if (rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN)) { @@ -527,10 +533,10 @@ int rrddim_unhide(RRDSET *st, const char *id) { } inline void rrddim_is_obsolete(RRDSET *st, RRDDIM *rd) { - debug(D_RRD_CALLS, "rrddim_is_obsolete() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); + netdata_log_debug(D_RRD_CALLS, "rrddim_is_obsolete() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); if(unlikely(rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED))) { - info("Cannot obsolete already archived dimension %s from chart %s", rrddim_name(rd), rrdset_name(st)); + netdata_log_info("Cannot obsolete already archived dimension %s from chart %s", rrddim_name(rd), rrdset_name(st)); return; } rrddim_flag_set(rd, RRDDIM_FLAG_OBSOLETE); @@ -540,7 +546,7 @@ inline void rrddim_is_obsolete(RRDSET *st, RRDDIM *rd) { } inline void rrddim_isnot_obsolete(RRDSET *st __maybe_unused, RRDDIM *rd) { - debug(D_RRD_CALLS, "rrddim_isnot_obsolete() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); + netdata_log_debug(D_RRD_CALLS, "rrddim_isnot_obsolete() for chart %s, dimension %s", rrdset_name(st), rrddim_name(rd)); rrddim_flag_clear(rd, RRDDIM_FLAG_OBSOLETE); rrdcontext_updated_rrddim_flags(rd); @@ -557,18 +563,18 @@ inline collected_number rrddim_set_by_pointer(RRDSET *st, RRDDIM *rd, collected_ } collected_number rrddim_timed_set_by_pointer(RRDSET *st __maybe_unused, RRDDIM *rd, struct timeval 
collected_time, collected_number value) { - debug(D_RRD_CALLS, "rrddim_set_by_pointer() for chart %s, dimension %s, value " COLLECTED_NUMBER_FORMAT, rrdset_name(st), rrddim_name(rd), value); + netdata_log_debug(D_RRD_CALLS, "rrddim_set_by_pointer() for chart %s, dimension %s, value " COLLECTED_NUMBER_FORMAT, rrdset_name(st), rrddim_name(rd), value); - rd->last_collected_time = collected_time; - rd->collected_value = value; - rd->updated = 1; - rd->collections_counter++; + rd->collector.last_collected_time = collected_time; + rd->collector.collected_value = value; + rrddim_set_updated(rd); + rd->collector.counter++; collected_number v = (value >= 0) ? value : -value; - if (unlikely(v > rd->collected_value_max)) - rd->collected_value_max = v; + if (unlikely(v > rd->collector.collected_value_max)) + rd->collector.collected_value_max = v; - return rd->last_collected_value; + return rd->collector.last_collected_value; } @@ -576,7 +582,7 @@ collected_number rrddim_set(RRDSET *st, const char *id, collected_number value) RRDHOST *host = st->rrdhost; RRDDIM *rd = rrddim_find(st, id); if(unlikely(!rd)) { - error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, rrdset_name(st), rrdset_id(st), rrdhost_hostname(host)); + netdata_log_error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, rrdset_name(st), rrdset_id(st), rrdhost_hostname(host)); return 0; } @@ -635,42 +641,42 @@ size_t rrddim_memory_file_header_size(void) { } void rrddim_memory_file_update(RRDDIM *rd) { - if(!rd || !rd->rd_on_file) return; - struct rrddim_map_save_v019 *rd_on_file = rd->rd_on_file; + if(!rd || !rd->db.rd_on_file) return; + struct rrddim_map_save_v019 *rd_on_file = rd->db.rd_on_file; - rd_on_file->last_collected_time.tv_sec = rd->last_collected_time.tv_sec; - rd_on_file->last_collected_time.tv_usec = rd->last_collected_time.tv_usec; - rd_on_file->last_collected_value = rd->last_collected_value; + rd_on_file->last_collected_time.tv_sec = rd->collector.last_collected_time.tv_sec; + rd_on_file->last_collected_time.tv_usec = rd->collector.last_collected_time.tv_usec; + rd_on_file->last_collected_value = rd->collector.last_collected_value; } void rrddim_memory_file_free(RRDDIM *rd) { - if(!rd || !rd->rd_on_file) return; + if(!rd || !rd->db.rd_on_file) return; // needed for memory mode map, to save the latest state rrddim_memory_file_update(rd); - struct rrddim_map_save_v019 *rd_on_file = rd->rd_on_file; + struct rrddim_map_save_v019 *rd_on_file = rd->db.rd_on_file; __atomic_sub_fetch(&rrddim_db_memory_size, rd_on_file->memsize + strlen(rd_on_file->cache_filename), __ATOMIC_RELAXED); freez(rd_on_file->cache_filename); netdata_munmap(rd_on_file, rd_on_file->memsize); // remove the pointers from the RRDDIM - rd->rd_on_file = NULL; - rd->db = NULL; + rd->db.rd_on_file = NULL; + rd->db.data = NULL; } const char *rrddim_cache_filename(RRDDIM *rd) { - if(!rd || !rd->rd_on_file) return NULL; - struct rrddim_map_save_v019 *rd_on_file = rd->rd_on_file; + if(!rd || !rd->db.rd_on_file) return NULL; + struct rrddim_map_save_v019 *rd_on_file = rd->db.rd_on_file; return rd_on_file->cache_filename; } void rrddim_memory_file_save(RRDDIM *rd) { - if(!rd || !rd->rd_on_file) return; + if(!rd || !rd->db.rd_on_file) return; rrddim_memory_file_update(rd); - struct rrddim_map_save_v019 *rd_on_file = rd->rd_on_file; + struct rrddim_map_save_v019 *rd_on_file = rd->db.rd_on_file; if(rd_on_file->rrd_memory_mode != RRD_MEMORY_MODE_SAVE) return; memory_file_save(rd_on_file->cache_filename, rd_on_file, 
rd_on_file->memsize); @@ -682,7 +688,7 @@ bool rrddim_memory_load_or_create_map_save(RRDSET *st, RRDDIM *rd, RRD_MEMORY_MO struct rrddim_map_save_v019 *rd_on_file = NULL; - unsigned long size = sizeof(struct rrddim_map_save_v019) + (st->entries * sizeof(storage_number)); + unsigned long size = sizeof(struct rrddim_map_save_v019) + (st->db.entries * sizeof(storage_number)); char filename[FILENAME_MAX + 1]; char fullfilename[FILENAME_MAX + 1]; @@ -700,38 +706,40 @@ bool rrddim_memory_load_or_create_map_save(RRDSET *st, RRDDIM *rd, RRD_MEMORY_MO int reset = 0; rd_on_file->magic[sizeof(RRDDIMENSION_MAGIC_V019)] = '\0'; if(strcmp(rd_on_file->magic, RRDDIMENSION_MAGIC_V019) != 0) { - info("Initializing file %s.", fullfilename); + netdata_log_info("Initializing file %s.", fullfilename); memset(rd_on_file, 0, size); reset = 1; } else if(rd_on_file->memsize != size) { - error("File %s does not have the desired size, expected %lu but found %lu. Clearing it.", fullfilename, size, (unsigned long int) rd_on_file->memsize); + netdata_log_error("File %s does not have the desired size, expected %lu but found %lu. Clearing it.", fullfilename, size, (unsigned long int) rd_on_file->memsize); memset(rd_on_file, 0, size); reset = 1; } else if(rd_on_file->update_every != st->update_every) { - error("File %s does not have the same update frequency, expected %d but found %d. Clearing it.", fullfilename, st->update_every, rd_on_file->update_every); + netdata_log_error("File %s does not have the same update frequency, expected %d but found %d. Clearing it.", fullfilename, st->update_every, rd_on_file->update_every); memset(rd_on_file, 0, size); reset = 1; } else if(dt_usec(&now, &rd_on_file->last_collected_time) > (rd_on_file->entries * rd_on_file->update_every * USEC_PER_SEC)) { - info("File %s is too old (last collected %llu seconds ago, but the database is %ld seconds). Clearing it.", fullfilename, dt_usec(&now, &rd_on_file->last_collected_time) / USEC_PER_SEC, rd_on_file->entries * rd_on_file->update_every); + netdata_log_info("File %s is too old (last collected %llu seconds ago, but the database is %ld seconds). Clearing it.", fullfilename, dt_usec(&now, &rd_on_file->last_collected_time) / USEC_PER_SEC, rd_on_file->entries * rd_on_file->update_every); memset(rd_on_file, 0, size); reset = 1; } if(!reset) { - rd->last_collected_value = rd_on_file->last_collected_value; + rd->collector.last_collected_value = rd_on_file->last_collected_value; if(rd_on_file->algorithm != rd->algorithm) - info("File %s does not have the expected algorithm (expected %u '%s', found %u '%s'). Previous values may be wrong.", + netdata_log_info("File %s does not have the expected algorithm (expected %u '%s', found %u '%s'). Previous values may be wrong.", fullfilename, rd->algorithm, rrd_algorithm_name(rd->algorithm), rd_on_file->algorithm, rrd_algorithm_name(rd_on_file->algorithm)); if(rd_on_file->multiplier != rd->multiplier) - info("File %s does not have the expected multiplier (expected " COLLECTED_NUMBER_FORMAT ", found " COLLECTED_NUMBER_FORMAT "). Previous values may be wrong.", fullfilename, rd->multiplier, rd_on_file->multiplier); + netdata_log_info("File %s does not have the expected multiplier (expected %d, found %ld). " + "Previous values may be wrong.", fullfilename, rd->multiplier, (long)rd_on_file->multiplier); if(rd_on_file->divisor != rd->divisor) - info("File %s does not have the expected divisor (expected " COLLECTED_NUMBER_FORMAT ", found " COLLECTED_NUMBER_FORMAT "). 
Previous values may be wrong.", fullfilename, rd->divisor, rd_on_file->divisor); + netdata_log_info("File %s does not have the expected divisor (expected %d, found %ld). " + "Previous values may be wrong.", fullfilename, rd->divisor, (long)rd_on_file->divisor); } // zero the entire header @@ -742,17 +750,17 @@ bool rrddim_memory_load_or_create_map_save(RRDSET *st, RRDDIM *rd, RRD_MEMORY_MO rd_on_file->algorithm = rd->algorithm; rd_on_file->multiplier = rd->multiplier; rd_on_file->divisor = rd->divisor; - rd_on_file->entries = st->entries; - rd_on_file->update_every = rd->update_every; + rd_on_file->entries = st->db.entries; + rd_on_file->update_every = rd->rrdset->update_every; rd_on_file->memsize = size; rd_on_file->rrd_memory_mode = memory_mode; rd_on_file->cache_filename = strdupz(fullfilename); __atomic_add_fetch(&rrddim_db_memory_size, rd_on_file->memsize + strlen(rd_on_file->cache_filename), __ATOMIC_RELAXED); - rd->db = &rd_on_file->values[0]; - rd->rd_on_file = rd_on_file; - rd->memsize = size; + rd->db.data = &rd_on_file->values[0]; + rd->db.rd_on_file = rd_on_file; + rd->db.memsize = size; rrddim_memory_file_update(rd); return true; diff --git a/database/rrddimvar.c b/database/rrddimvar.c index da8b939ce..5035d70a5 100644 --- a/database/rrddimvar.c +++ b/database/rrddimvar.c @@ -249,7 +249,7 @@ void rrddimvar_add_and_leave_released(RRDDIM *rd, RRDVAR_TYPE type, const char * void rrddimvar_rename_all(RRDDIM *rd) { RRDSET *st = rd->rrdset; - debug(D_VARIABLES, "RRDDIMVAR rename for chart id '%s' name '%s', dimension id '%s', name '%s'", rrdset_id(st), rrdset_name(st), rrddim_id(rd), rrddim_name(rd)); + netdata_log_debug(D_VARIABLES, "RRDDIMVAR rename for chart id '%s' name '%s', dimension id '%s', name '%s'", rrdset_id(st), rrdset_name(st), rrddim_id(rd), rrddim_name(rd)); RRDDIMVAR *rs; dfe_start_write(st->rrddimvar_root_index, rs) { @@ -262,7 +262,7 @@ void rrddimvar_rename_all(RRDDIM *rd) { void rrddimvar_delete_all(RRDDIM *rd) { RRDSET *st = rd->rrdset; - debug(D_VARIABLES, "RRDDIMVAR delete for chart id '%s' name '%s', dimension id '%s', name '%s'", rrdset_id(st), rrdset_name(st), rrddim_id(rd), rrddim_name(rd)); + netdata_log_debug(D_VARIABLES, "RRDDIMVAR delete for chart id '%s' name '%s', dimension id '%s', name '%s'", rrdset_id(st), rrdset_name(st), rrddim_id(rd), rrddim_name(rd)); RRDDIMVAR *rs; dfe_start_write(st->rrddimvar_root_index, rs) { diff --git a/database/rrdfunctions.c b/database/rrdfunctions.c index cdba221a5..cee1ac89e 100644 --- a/database/rrdfunctions.c +++ b/database/rrdfunctions.c @@ -480,6 +480,22 @@ void rrd_functions_expose_rrdpush(RRDSET *st, BUFFER *wb) { dfe_done(tmp); } +void rrd_functions_expose_global_rrdpush(RRDHOST *host, BUFFER *wb) { + struct rrd_collector_function *tmp; + dfe_start_read(host->functions, tmp) { + if(!(tmp->options & RRD_FUNCTION_GLOBAL)) + continue; + + buffer_sprintf(wb + , PLUGINSD_KEYWORD_FUNCTION " GLOBAL \"%s\" %d \"%s\"\n" + , tmp_dfe.name + , tmp->timeout + , string2str(tmp->help) + ); + } + dfe_done(tmp); +} + struct rrd_function_call_wait { bool free_with_signal; bool data_are_ready; @@ -763,14 +779,657 @@ void host_functions2json(RRDHOST *host, BUFFER *wb) { buffer_json_object_close(wb); } -void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst) { +void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst, void *value, size_t value_size) { if(!rrdset_functions_view || !dst) return; struct rrd_collector_function *t; dfe_start_read(rrdset_functions_view, t) { 
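        // dfe_start_read()/dfe_done() iterate the functions dictionary under a
        // read lock, and t_dfe.name exposes the key of the current item; each
        // running collector function is copied into dst, now seeded with the
        // caller-supplied value/value_size pair instead of the previous NULL/0.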
        if(!t->collector->running) continue;
-        dictionary_set(dst, t_dfe.name, NULL, 0);
+        dictionary_set(dst, t_dfe.name, value, value_size);
+    }
+    dfe_done(t);
+}
+
+void host_functions_to_dict(RRDHOST *host, DICTIONARY *dst, void *value, size_t value_size, STRING **help) {
+    if(!host || !host->functions || !dictionary_entries(host->functions) || !dst) return;
+
+    struct rrd_collector_function *t;
+    dfe_start_read(host->functions, t) {
+        if(!t->collector->running) continue;
+
+        if(help)
+            *help = t->help;
+
+        dictionary_set(dst, t_dfe.name, value, value_size);
+    }
+    dfe_done(t);
+}
+
+
+int rrdhost_function_streaming(BUFFER *wb, int timeout __maybe_unused, const char *function __maybe_unused,
+                               void *collector_data __maybe_unused,
+                               function_data_ready_callback callback __maybe_unused, void *callback_data __maybe_unused) {
+    time_t now = now_realtime_sec();
+
+    buffer_flush(wb);
+    wb->content_type = CT_APPLICATION_JSON;
+    buffer_json_initialize(wb, "\"", "\"", 0, true, false);
+
+    buffer_json_member_add_string(wb, "hostname", rrdhost_hostname(localhost));
+    buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK);
+    buffer_json_member_add_string(wb, "type", "table");
+    buffer_json_member_add_time_t(wb, "update_every", 1);
+    buffer_json_member_add_string(wb, "help", RRDFUNCTIONS_STREAMING_HELP);
+    buffer_json_member_add_array(wb, "data");
+
+    size_t max_sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_MAX] = { 0 };
+    size_t max_db_metrics = 0, max_db_instances = 0, max_db_contexts = 0;
+    size_t max_collection_replication_instances = 0, max_streaming_replication_instances = 0;
+    size_t max_ml_anomalous = 0, max_ml_normal = 0, max_ml_trained = 0, max_ml_pending = 0, max_ml_silenced = 0;
+    {
+        RRDHOST *host;
+        dfe_start_read(rrdhost_root_index, host) {
+            RRDHOST_STATUS s;
+            rrdhost_status(host, now, &s);
+            buffer_json_add_array_item_array(wb);
+
+            if(s.db.metrics > max_db_metrics)
+                max_db_metrics = s.db.metrics;
+
+            if(s.db.instances > max_db_instances)
+                max_db_instances = s.db.instances;
+
+            if(s.db.contexts > max_db_contexts)
+                max_db_contexts = s.db.contexts;
+
+            if(s.ingest.replication.instances > max_collection_replication_instances)
+                max_collection_replication_instances = s.ingest.replication.instances;
+
+            if(s.stream.replication.instances > max_streaming_replication_instances)
+                max_streaming_replication_instances = s.stream.replication.instances;
+
+            for(int i = 0; i < STREAM_TRAFFIC_TYPE_MAX ;i++) {
+                if (s.stream.sent_bytes_on_this_connection_per_type[i] >
+                    max_sent_bytes_on_this_connection_per_type[i])
+                    max_sent_bytes_on_this_connection_per_type[i] =
+                        s.stream.sent_bytes_on_this_connection_per_type[i];
+            }
+
+            // retention
+            buffer_json_add_array_item_string(wb, rrdhost_hostname(s.host)); // Node
+            buffer_json_add_array_item_uint64(wb, s.db.first_time_s * 1000); // dbFrom
+            buffer_json_add_array_item_uint64(wb, s.db.last_time_s * 1000); // dbTo
+
+            if(s.db.first_time_s && s.db.last_time_s && s.db.last_time_s > s.db.first_time_s)
+                buffer_json_add_array_item_uint64(wb, s.db.last_time_s - s.db.first_time_s); // dbDuration
+            else
+                buffer_json_add_array_item_string(wb, NULL); // dbDuration
+
+            buffer_json_add_array_item_uint64(wb, s.db.metrics); // dbMetrics
+            buffer_json_add_array_item_uint64(wb, s.db.instances); // dbInstances
+            buffer_json_add_array_item_uint64(wb, s.db.contexts); // dbContexts
+
+            // statuses
+            buffer_json_add_array_item_string(wb, rrdhost_ingest_status_to_string(s.ingest.status)); // InStatus
+            buffer_json_add_array_item_string(wb,
rrdhost_streaming_status_to_string(s.stream.status)); // OutStatus + buffer_json_add_array_item_string(wb, rrdhost_ml_status_to_string(s.ml.status)); // MLStatus + + // collection + if(s.ingest.since) { + buffer_json_add_array_item_uint64(wb, s.ingest.since * 1000); // InSince + buffer_json_add_array_item_time_t(wb, s.now - s.ingest.since); // InAge + } + else { + buffer_json_add_array_item_string(wb, NULL); // InSince + buffer_json_add_array_item_string(wb, NULL); // InAge + } + buffer_json_add_array_item_string(wb, stream_handshake_error_to_string(s.ingest.reason)); // InReason + buffer_json_add_array_item_uint64(wb, s.ingest.hops); // InHops + buffer_json_add_array_item_double(wb, s.ingest.replication.completion); // InReplCompletion + buffer_json_add_array_item_uint64(wb, s.ingest.replication.instances); // InReplInstances + buffer_json_add_array_item_string(wb, s.ingest.peers.local.ip); // InLocalIP + buffer_json_add_array_item_uint64(wb, s.ingest.peers.local.port); // InLocalPort + buffer_json_add_array_item_string(wb, s.ingest.peers.peer.ip); // InRemoteIP + buffer_json_add_array_item_uint64(wb, s.ingest.peers.peer.port); // InRemotePort + buffer_json_add_array_item_string(wb, s.ingest.ssl ? "SSL" : "PLAIN"); // InSSL + stream_capabilities_to_json_array(wb, s.ingest.capabilities, NULL); // InCapabilities + + // streaming + if(s.stream.since) { + buffer_json_add_array_item_uint64(wb, s.stream.since * 1000); // OutSince + buffer_json_add_array_item_time_t(wb, s.now - s.stream.since); // OutAge + } + else { + buffer_json_add_array_item_string(wb, NULL); // OutSince + buffer_json_add_array_item_string(wb, NULL); // OutAge + } + buffer_json_add_array_item_string(wb, stream_handshake_error_to_string(s.stream.reason)); // OutReason + buffer_json_add_array_item_uint64(wb, s.stream.hops); // OutHops + buffer_json_add_array_item_double(wb, s.stream.replication.completion); // OutReplCompletion + buffer_json_add_array_item_uint64(wb, s.stream.replication.instances); // OutReplInstances + buffer_json_add_array_item_string(wb, s.stream.peers.local.ip); // OutLocalIP + buffer_json_add_array_item_uint64(wb, s.stream.peers.local.port); // OutLocalPort + buffer_json_add_array_item_string(wb, s.stream.peers.peer.ip); // OutRemoteIP + buffer_json_add_array_item_uint64(wb, s.stream.peers.peer.port); // OutRemotePort + buffer_json_add_array_item_string(wb, s.stream.ssl ? "SSL" : "PLAIN"); // OutSSL + buffer_json_add_array_item_string(wb, s.stream.compression ? 
"COMPRESSED" : "UNCOMPRESSED"); // OutCompression + stream_capabilities_to_json_array(wb, s.stream.capabilities, NULL); // OutCapabilities + buffer_json_add_array_item_uint64(wb, s.stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_DATA]); + buffer_json_add_array_item_uint64(wb, s.stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_METADATA]); + buffer_json_add_array_item_uint64(wb, s.stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_REPLICATION]); + buffer_json_add_array_item_uint64(wb, s.stream.sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_FUNCTIONS]); + + buffer_json_add_array_item_array(wb); // OutAttemptHandshake + time_t last_attempt = 0; + for(struct rrdpush_destinations *d = host->destinations; d ; d = d->next) { + if(d->since > last_attempt) + last_attempt = d->since; + + buffer_json_add_array_item_string(wb, stream_handshake_error_to_string(d->reason)); + } + buffer_json_array_close(wb); // // OutAttemptHandshake + + if(!last_attempt) { + buffer_json_add_array_item_string(wb, NULL); // OutAttemptSince + buffer_json_add_array_item_string(wb, NULL); // OutAttemptAge + } + else { + buffer_json_add_array_item_uint64(wb, last_attempt * 1000); // OutAttemptSince + buffer_json_add_array_item_time_t(wb, s.now - last_attempt); // OutAttemptAge + } + + // ML + if(s.ml.status == RRDHOST_ML_STATUS_RUNNING) { + buffer_json_add_array_item_uint64(wb, s.ml.metrics.anomalous); // MlAnomalous + buffer_json_add_array_item_uint64(wb, s.ml.metrics.normal); // MlNormal + buffer_json_add_array_item_uint64(wb, s.ml.metrics.trained); // MlTrained + buffer_json_add_array_item_uint64(wb, s.ml.metrics.pending); // MlPending + buffer_json_add_array_item_uint64(wb, s.ml.metrics.silenced); // MlSilenced + + if(s.ml.metrics.anomalous > max_ml_anomalous) + max_ml_anomalous = s.ml.metrics.anomalous; + + if(s.ml.metrics.normal > max_ml_normal) + max_ml_normal = s.ml.metrics.normal; + + if(s.ml.metrics.trained > max_ml_trained) + max_ml_trained = s.ml.metrics.trained; + + if(s.ml.metrics.pending > max_ml_pending) + max_ml_pending = s.ml.metrics.pending; + + if(s.ml.metrics.silenced > max_ml_silenced) + max_ml_silenced = s.ml.metrics.silenced; + + } + else { + buffer_json_add_array_item_string(wb, NULL); // MlAnomalous + buffer_json_add_array_item_string(wb, NULL); // MlNormal + buffer_json_add_array_item_string(wb, NULL); // MlTrained + buffer_json_add_array_item_string(wb, NULL); // MlPending + buffer_json_add_array_item_string(wb, NULL); // MlSilenced + } + + // close + buffer_json_array_close(wb); + } + dfe_done(host); + } + buffer_json_array_close(wb); // data + buffer_json_member_add_object(wb, "columns"); + { + size_t field_id = 0; + + // Node + buffer_rrdf_table_add_field(wb, field_id++, "Node", "Node's Hostname", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE | RRDF_FIELD_OPTS_UNIQUE_KEY | RRDF_FIELD_OPTS_STICKY, + NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "dbFrom", "DB Data Retention From", + RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "dbTo", "DB Data Retention To", + RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME, + 0, 
NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "dbDuration", "DB Data Retention Duration", + RRDF_FIELD_TYPE_DURATION, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DURATION, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "dbMetrics", "Time-series Metrics in the DB", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, max_db_metrics, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "dbInstances", "Instances in the DB", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, max_db_instances, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "dbContexts", "Contexts in the DB", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, max_db_contexts, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // --- statuses --- + + buffer_rrdf_table_add_field(wb, field_id++, "InStatus", "Data Collection Online Status", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + + buffer_rrdf_table_add_field(wb, field_id++, "OutStatus", "Streaming Online Status", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "MlStatus", "ML Status", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // --- collection --- + + buffer_rrdf_table_add_field(wb, field_id++, "InSince", "Last Data Collection Status Change", + RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME, + 0, NULL, NAN, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InAge", "Last Data Collection Online Status Change Age", + RRDF_FIELD_TYPE_DURATION, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DURATION, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InReason", "Data Collection Online Status Reason", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InHops", "Data Collection Distance Hops from Origin Node", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + 
RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InReplCompletion", "Inbound Replication Completion", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_BAR, RRDF_FIELD_TRANSFORM_NUMBER, + 1, "%", 100.0, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InReplInstances", "Inbound Replicating Instances", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "instances", max_collection_replication_instances, RRDF_FIELD_SORT_DESCENDING, + NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InLocalIP", "Inbound Local IP", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InLocalPort", "Inbound Local Port", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InRemoteIP", "Inbound Remote IP", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InRemotePort", "Inbound Remote Port", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InSSL", "Inbound SSL Connection", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "InCapabilities", "Inbound Connection Capabilities", + RRDF_FIELD_TYPE_ARRAY, RRDF_FIELD_VISUAL_PILL, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + // --- streaming --- + + buffer_rrdf_table_add_field(wb, field_id++, "OutSince", "Last Streaming Status Change", + RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME, + 0, NULL, NAN, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutAge", "Last Streaming Status Change Age", + RRDF_FIELD_TYPE_DURATION, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DURATION, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutReason", "Streaming Status Reason", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + 
buffer_rrdf_table_add_field(wb, field_id++, "OutHops", "Streaming Distance Hops from Origin Node", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutReplCompletion", "Outbound Replication Completion", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_BAR, RRDF_FIELD_TRANSFORM_NUMBER, + 1, "%", 100.0, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutReplInstances", "Outbound Replicating Instances", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "instances", max_streaming_replication_instances, RRDF_FIELD_SORT_DESCENDING, + NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutLocalIP", "Outbound Local IP", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutLocalPort", "Outbound Local Port", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutRemoteIP", "Outbound Remote IP", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutRemotePort", "Outbound Remote Port", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutSSL", "Outbound SSL Connection", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutCompression", "Outbound Compressed Connection", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutCapabilities", "Outbound Connection Capabilities", + RRDF_FIELD_TYPE_ARRAY, RRDF_FIELD_VISUAL_PILL, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutTrafficData", "Outbound Metric Data Traffic", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "bytes", max_sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_DATA], + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, 
field_id++, "OutTrafficMetadata", "Outbound Metric Metadata Traffic", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "bytes", + max_sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_METADATA], + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutTrafficReplication", "Outbound Metric Replication Traffic", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "bytes", + max_sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_REPLICATION], + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutTrafficFunctions", "Outbound Metric Functions Traffic", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "bytes", + max_sent_bytes_on_this_connection_per_type[STREAM_TRAFFIC_TYPE_FUNCTIONS], + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutAttemptHandshake", + "Outbound Connection Attempt Handshake Status", + RRDF_FIELD_TYPE_ARRAY, RRDF_FIELD_VISUAL_PILL, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutAttemptSince", + "Last Outbound Connection Attempt Status Change Time", + RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME, + 0, NULL, NAN, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "OutAttemptAge", + "Last Outbound Connection Attempt Status Change Age", + RRDF_FIELD_TYPE_DURATION, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DURATION, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_MIN, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // --- ML --- + + buffer_rrdf_table_add_field(wb, field_id++, "MlAnomalous", "Number of Anomalous Metrics", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "metrics", + max_ml_anomalous, + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "MlNormal", "Number of Not Anomalous Metrics", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "metrics", + max_ml_normal, + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "MlTrained", "Number of Trained Metrics", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "metrics", + max_ml_trained, + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, "MlPending", "Number of Pending Metrics", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "metrics", + max_ml_pending, + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + buffer_rrdf_table_add_field(wb, field_id++, 
"MlSilenced", "Number of Silenced Metrics", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 0, "metrics", + max_ml_silenced, + RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + } + buffer_json_object_close(wb); // columns + buffer_json_member_add_string(wb, "default_sort_column", "Node"); + buffer_json_member_add_object(wb, "charts"); + { + // Data Collection Age chart + buffer_json_member_add_object(wb, "InAge"); + { + buffer_json_member_add_string(wb, "name", "Data Collection Age"); + buffer_json_member_add_string(wb, "type", "stacked-bar"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "InAge"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + // Streaming Age chart + buffer_json_member_add_object(wb, "OutAge"); + { + buffer_json_member_add_string(wb, "name", "Streaming Age"); + buffer_json_member_add_string(wb, "type", "stacked-bar"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "OutAge"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + // DB Duration + buffer_json_member_add_object(wb, "dbDuration"); + { + buffer_json_member_add_string(wb, "name", "Retention Duration"); + buffer_json_member_add_string(wb, "type", "stacked-bar"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "dbDuration"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + } + buffer_json_object_close(wb); // charts + + buffer_json_member_add_array(wb, "default_charts"); + { + buffer_json_add_array_item_array(wb); + buffer_json_add_array_item_string(wb, "InAge"); + buffer_json_add_array_item_string(wb, "Node"); + buffer_json_array_close(wb); + + buffer_json_add_array_item_array(wb); + buffer_json_add_array_item_string(wb, "OutAge"); + buffer_json_add_array_item_string(wb, "Node"); + buffer_json_array_close(wb); + } + buffer_json_array_close(wb); + + buffer_json_member_add_object(wb, "group_by"); + { + buffer_json_member_add_object(wb, "Node"); + { + buffer_json_member_add_string(wb, "name", "Node"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "Node"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "InStatus"); + { + buffer_json_member_add_string(wb, "name", "Nodes by Collection Status"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "InStatus"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "OutStatus"); + { + buffer_json_member_add_string(wb, "name", "Nodes by Streaming Status"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "OutStatus"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "MlStatus"); + { + buffer_json_member_add_string(wb, "name", "Nodes by ML Status"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "MlStatus"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "InRemoteIP"); + { + buffer_json_member_add_string(wb, "name", "Nodes by Inbound IP"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "InRemoteIP"); + } + 
buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + + buffer_json_member_add_object(wb, "OutRemoteIP"); + { + buffer_json_member_add_string(wb, "name", "Nodes by Outbound IP"); + buffer_json_member_add_array(wb, "columns"); + { + buffer_json_add_array_item_string(wb, "OutRemoteIP"); + } + buffer_json_array_close(wb); + } + buffer_json_object_close(wb); + } + buffer_json_object_close(wb); // group_by + + buffer_json_member_add_time_t(wb, "expires", now_realtime_sec() + 1); + buffer_json_finalize(wb); + + if(callback) + callback(wb, HTTP_RESP_OK, callback_data); + + return HTTP_RESP_OK; +} diff --git a/database/rrdfunctions.h b/database/rrdfunctions.h index 920ada8d3..71ad96507 100644 --- a/database/rrdfunctions.h +++ b/database/rrdfunctions.h @@ -23,13 +23,20 @@ typedef void (*rrd_call_function_async_callback)(BUFFER *wb, int code, void *cal int rrd_call_function_async(RRDHOST *host, BUFFER *wb, int timeout, const char *name, rrd_call_function_async_callback, void *callback_data); void rrd_functions_expose_rrdpush(RRDSET *st, BUFFER *wb); +void rrd_functions_expose_global_rrdpush(RRDHOST *host, BUFFER *wb); void chart_functions2json(RRDSET *st, BUFFER *wb, int tabs, const char *kq, const char *sq); -void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst); +void chart_functions_to_dict(DICTIONARY *rrdset_functions_view, DICTIONARY *dst, void *value, size_t value_size); +void host_functions_to_dict(RRDHOST *host, DICTIONARY *dst, void *value, size_t value_size, STRING **help); void host_functions2json(RRDHOST *host, BUFFER *wb); uint8_t functions_format_to_content_type(const char *format); const char *functions_content_type_to_format(HTTP_CONTENT_TYPE content_type); int rrd_call_function_error(BUFFER *wb, const char *msg, int code); +int rrdhost_function_streaming(BUFFER *wb, int timeout, const char *function, void *collector_data, + function_data_ready_callback callback, void *callback_data); + +#define RRDFUNCTIONS_STREAMING_HELP "Streaming status for parents and children." 
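Because the implementation above only invokes callback when it is non-NULL, the new global function can also be exercised synchronously: the handler fills wb and returns HTTP_RESP_OK. A hypothetical caller follows; buffer_create(), buffer_tostring() and buffer_free() are assumed from the agent's BUFFER API and do not appear in this patch:

    BUFFER *wb = buffer_create(0, NULL);                        // assumed BUFFER constructor
    int code = rrdhost_function_streaming(wb, 10, "streaming",  // 10s timeout; name unused by the handler
                                          NULL, NULL, NULL);    // no collector data, no async callback
    if(code == HTTP_RESP_OK)
        puts(buffer_tostring(wb));                              // the JSON table built above
    buffer_free(wb);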
+ #endif // NETDATA_RRDFUNCTIONS_H diff --git a/database/rrdhost.c b/database/rrdhost.c index 69e4beabf..235d64b1e 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -42,6 +42,7 @@ bool is_storage_engine_shared(STORAGE_INSTANCE *engine __maybe_unused) { } RRDHOST *find_host_by_node_id(char *node_id) { + uuid_t node_uuid; if (unlikely(!node_id || uuid_parse(node_id, node_uuid))) return NULL; @@ -79,7 +80,7 @@ static inline void rrdhost_init() { } RRDHOST_ACQUIRED *rrdhost_find_and_acquire(const char *machine_guid) { - debug(D_RRD_CALLS, "rrdhost_find_and_acquire() host %s", machine_guid); + netdata_log_debug(D_RRD_CALLS, "rrdhost_find_and_acquire() host %s", machine_guid); return (RRDHOST_ACQUIRED *)dictionary_get_and_acquire_item(rrdhost_root_index, machine_guid); } @@ -115,7 +116,8 @@ static inline RRDHOST *rrdhost_index_add_by_guid(RRDHOST *host) { rrdhost_option_set(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID); else { rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID); - error("RRDHOST: %s() host with machine guid '%s' is already indexed", __FUNCTION__, host->machine_guid); + netdata_log_error("RRDHOST: %s() host with machine guid '%s' is already indexed", + __FUNCTION__, host->machine_guid); } return host; @@ -124,7 +126,8 @@ static inline RRDHOST *rrdhost_index_add_by_guid(RRDHOST *host) { static void rrdhost_index_del_by_guid(RRDHOST *host) { if(rrdhost_option_check(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID)) { if(!dictionary_del(rrdhost_root_index, host->machine_guid)) - error("RRDHOST: %s() failed to delete machine guid '%s' from index", __FUNCTION__, host->machine_guid); + netdata_log_error("RRDHOST: %s() failed to delete machine guid '%s' from index", + __FUNCTION__, host->machine_guid); rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_MACHINE_GUID); } @@ -145,7 +148,8 @@ static inline void rrdhost_index_del_hostname(RRDHOST *host) { if(rrdhost_option_check(host, RRDHOST_OPTION_INDEXED_HOSTNAME)) { if(!dictionary_del(rrdhost_root_index_hostname, rrdhost_hostname(host))) - error("RRDHOST: %s() failed to delete hostname '%s' from index", __FUNCTION__, rrdhost_hostname(host)); + netdata_log_error("RRDHOST: %s() failed to delete hostname '%s' from index", + __FUNCTION__, rrdhost_hostname(host)); rrdhost_option_clear(host, RRDHOST_OPTION_INDEXED_HOSTNAME); } @@ -299,10 +303,11 @@ static RRDHOST *rrdhost_create( int is_localhost, bool archived ) { - debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid); + netdata_log_debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid); if(memory_mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled) { - error("memory mode 'dbengine' is not enabled, but host '%s' is configured for it. Falling back to 'alloc'", hostname); + netdata_log_error("memory mode 'dbengine' is not enabled, but host '%s' is configured for it. 
Falling back to 'alloc'", + hostname); memory_mode = RRD_MEMORY_MODE_ALLOC; } @@ -348,8 +353,8 @@ int is_legacy = 1; case RRD_MEMORY_MODE_MAP: case RRD_MEMORY_MODE_SAVE: case RRD_MEMORY_MODE_RAM: - if(host->rrdpush_seconds_to_replicate > host->rrd_history_entries * host->rrd_update_every) - host->rrdpush_seconds_to_replicate = host->rrd_history_entries * host->rrd_update_every; + if(host->rrdpush_seconds_to_replicate > (time_t) host->rrd_history_entries * (time_t) host->rrd_update_every) + host->rrdpush_seconds_to_replicate = (time_t) host->rrd_history_entries * (time_t) host->rrd_update_every; break; case RRD_MEMORY_MODE_DBENGINE: @@ -386,7 +391,7 @@ int is_legacy = 1; (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && is_legacy))) { int r = mkdir(host->cache_dir, 0775); if(r != 0 && errno != EEXIST) - error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->cache_dir); + netdata_log_error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->cache_dir); } } @@ -412,7 +417,7 @@ int is_legacy = 1; ret = mkdir(dbenginepath, 0775); if (ret != 0 && errno != EEXIST) - error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), dbenginepath); + netdata_log_error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), dbenginepath); else ret = 0; // succeed @@ -453,9 +458,8 @@ int is_legacy = 1; } if (ret) { // check legacy or multihost initialization success - error( - "Host '%s': cannot initialize host with machine guid '%s'. Failed to initialize DB engine at '%s'.", - rrdhost_hostname(host), host->machine_guid, host->cache_dir); + netdata_log_error("Host '%s': cannot initialize host with machine guid '%s'. Failed to initialize DB engine at '%s'.", + rrdhost_hostname(host), host->machine_guid, host->cache_dir); rrd_wrlock(); rrdhost_free___while_having_rrd_wrlock(host, true); @@ -503,7 +507,8 @@ int is_legacy = 1; RRDHOST *t = rrdhost_index_add_by_guid(host); if(t != host) { - error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid); + netdata_log_error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", + rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid); rrdhost_free___while_having_rrd_wrlock(host, true); rrd_unlock(); return NULL; @@ -520,7 +525,7 @@ int is_legacy = 1; // ------------------------------------------------------------------------ - info("Host '%s' (at registry as '%s') with guid '%s' initialized" + netdata_log_info("Host '%s' (at registry as '%s') with guid '%s' initialized" ", os '%s'" ", timezone '%s'" ", tags '%s'" @@ -528,7 +533,7 @@ int is_legacy = 1; ", program_version '%s'" ", update every %d" ", memory mode %s" - ", history entries %ld" + ", history entries %d" ", streaming %s" " (to '%s' with api key '%s')" ", health %s" @@ -593,7 +598,7 @@ static void rrdhost_update(RRDHOST *host { UNUSED(guid); - netdata_spinlock_lock(&host->rrdhost_update_lock); + spinlock_lock(&host->rrdhost_update_lock); host->health.health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled; @@ -611,34 +616,44 @@ static void rrdhost_update(RRDHOST *host host->registry_hostname = string_strdupz((registry_hostname && *registry_hostname)?registry_hostname:hostname); if(strcmp(rrdhost_hostname(host), hostname) != 0) { - info("Host '%s' has been renamed to '%s'. 
If this is not intentional it may mean multiple hosts are using the same machine_guid.", rrdhost_hostname(host), hostname); + netdata_log_info("Host '%s' has been renamed to '%s'. If this is not intentional it may mean multiple hosts are using the same machine_guid.", rrdhost_hostname(host), hostname); rrdhost_init_hostname(host, hostname, true); } else { rrdhost_index_add_hostname(host); } if(strcmp(rrdhost_program_name(host), program_name) != 0) { - info("Host '%s' switched program name from '%s' to '%s'", rrdhost_hostname(host), rrdhost_program_name(host), program_name); + netdata_log_info("Host '%s' switched program name from '%s' to '%s'", rrdhost_hostname(host), rrdhost_program_name(host), program_name); STRING *t = host->program_name; host->program_name = string_strdupz(program_name); string_freez(t); } if(strcmp(rrdhost_program_version(host), program_version) != 0) { - info("Host '%s' switched program version from '%s' to '%s'", rrdhost_hostname(host), rrdhost_program_version(host), program_version); + netdata_log_info("Host '%s' switched program version from '%s' to '%s'", rrdhost_hostname(host), rrdhost_program_version(host), program_version); STRING *t = host->program_version; host->program_version = string_strdupz(program_version); string_freez(t); } if(host->rrd_update_every != update_every) - error("Host '%s' has an update frequency of %d seconds, but the wanted one is %d seconds. Restart netdata here to apply the new settings.", rrdhost_hostname(host), host->rrd_update_every, update_every); + netdata_log_error("Host '%s' has an update frequency of %d seconds, but the wanted one is %d seconds. " + "Restart netdata here to apply the new settings.", + rrdhost_hostname(host), host->rrd_update_every, update_every); if(host->rrd_memory_mode != mode) - error("Host '%s' has memory mode '%s', but the wanted one is '%s'. Restart netdata here to apply the new settings.", rrdhost_hostname(host), rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode)); + netdata_log_error("Host '%s' has memory mode '%s', but the wanted one is '%s'. " + "Restart netdata here to apply the new settings.", + rrdhost_hostname(host), + rrd_memory_mode_name(host->rrd_memory_mode), + rrd_memory_mode_name(mode)); else if(host->rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE && host->rrd_history_entries < history) - error("Host '%s' has history of %ld entries, but the wanted one is %ld entries. Restart netdata here to apply the new settings.", rrdhost_hostname(host), host->rrd_history_entries, history); + netdata_log_error("Host '%s' has history of %d entries, but the wanted one is %ld entries. 
" + "Restart netdata here to apply the new settings.", + rrdhost_hostname(host), + host->rrd_history_entries, + history); // update host tags rrdhost_init_tags(host, tags); @@ -678,10 +693,10 @@ static void rrdhost_update(RRDHOST *host ml_host_new(host); rrdhost_load_rrdcontext_data(host); - info("Host %s is not in archived mode anymore", rrdhost_hostname(host)); + netdata_log_info("Host %s is not in archived mode anymore", rrdhost_hostname(host)); } - netdata_spinlock_unlock(&host->rrdhost_update_lock); + spinlock_unlock(&host->rrdhost_update_lock); } RRDHOST *rrdhost_find_or_create( @@ -709,7 +724,7 @@ RRDHOST *rrdhost_find_or_create( , struct rrdhost_system_info *system_info , bool archived ) { - debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid); + netdata_log_debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid); RRDHOST *host = rrdhost_find_by_guid(guid); if (unlikely(host && host->rrd_memory_mode != mode && rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED))) { @@ -718,8 +733,10 @@ RRDHOST *rrdhost_find_or_create( return host; /* If a legacy memory mode instantiates all dbengine state must be discarded to avoid inconsistencies */ - error("Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.", - rrdhost_hostname(host), rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode)); + netdata_log_error("Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.", + rrdhost_hostname(host), + rrd_memory_mode_name(host->rrd_memory_mode), + rrd_memory_mode_name(mode)); rrd_wrlock(); rrdhost_free___while_having_rrd_wrlock(host, true); @@ -827,18 +844,18 @@ void dbengine_init(char *hostname) { if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) rrdeng_pages_per_extent = read_num; else { - error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent); + netdata_log_error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent); config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent); } storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); if(storage_tiers < 1) { - error("At least 1 storage tier is required. Assuming 1."); + netdata_log_error("At least 1 storage tier is required. Assuming 1."); storage_tiers = 1; config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } if(storage_tiers > RRD_STORAGE_TIERS) { - error("Up to %d storage tier are supported. Assuming %d.", RRD_STORAGE_TIERS, RRD_STORAGE_TIERS); + netdata_log_error("Up to %d storage tier are supported. Assuming %d.", RRD_STORAGE_TIERS, RRD_STORAGE_TIERS); storage_tiers = RRD_STORAGE_TIERS; config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } @@ -860,7 +877,7 @@ void dbengine_init(char *hostname) { int ret = mkdir(dbenginepath, 0775); if (ret != 0 && errno != EEXIST) { - error("DBENGINE on '%s': cannot create directory '%s'", hostname, dbenginepath); + netdata_log_error("DBENGINE on '%s': cannot create directory '%s'", hostname, dbenginepath); break; } @@ -880,7 +897,9 @@ void dbengine_init(char *hostname) { if(grouping_iterations < 2) { grouping_iterations = 2; config_set_number(CONFIG_SECTION_DB, dbengineconfig, grouping_iterations); - error("DBENGINE on '%s': 'dbegnine tier %zu update every iterations' cannot be less than 2. 
Assuming 2.", hostname, tier); + netdata_log_error("DBENGINE on '%s': 'dbegnine tier %zu update every iterations' cannot be less than 2. Assuming 2.", + hostname, + tier); } snprintfz(dbengineconfig, 200, "dbengine tier %zu backfill", tier); @@ -889,7 +908,7 @@ void dbengine_init(char *hostname) { else if(strcmp(bf, "full") == 0) backfill = RRD_BACKFILL_FULL; else if(strcmp(bf, "none") == 0) backfill = RRD_BACKFILL_NONE; else { - error("DBENGINE: unknown backfill value '%s', assuming 'new'", bf); + netdata_log_error("DBENGINE: unknown backfill value '%s', assuming 'new'", bf); config_set(CONFIG_SECTION_DB, dbengineconfig, "new"); backfill = RRD_BACKFILL_NEW; } @@ -900,7 +919,10 @@ void dbengine_init(char *hostname) { if(tier > 0 && get_tier_grouping(tier) > 65535) { storage_tiers_grouping_iterations[tier] = 1; - error("DBENGINE on '%s': dbengine tier %zu gives aggregation of more than 65535 points of tier 0. Disabling tiers above %zu", hostname, tier, tier); + netdata_log_error("DBENGINE on '%s': dbengine tier %zu gives aggregation of more than 65535 points of tier 0. Disabling tiers above %zu", + hostname, + tier, + tier); break; } @@ -911,9 +933,12 @@ void dbengine_init(char *hostname) { strncpyz(tiers_init[tier].path, dbenginepath, FILENAME_MAX); tiers_init[tier].ret = 0; - if(parallel_initialization) - netdata_thread_create(&tiers_init[tier].thread, "DBENGINE_INIT", NETDATA_THREAD_OPTION_JOINABLE, + if(parallel_initialization) { + char tag[NETDATA_THREAD_TAG_MAX + 1]; + snprintfz(tag, NETDATA_THREAD_TAG_MAX, "DBENGINIT[%zu]", tier); + netdata_thread_create(&tiers_init[tier].thread, tag, NETDATA_THREAD_OPTION_JOINABLE, dbengine_tier_init, &tiers_init[tier]); + } else dbengine_tier_init(&tiers_init[tier]); } @@ -925,16 +950,21 @@ void dbengine_init(char *hostname) { netdata_thread_join(tiers_init[tier].thread, &ptr); if(tiers_init[tier].ret != 0) { - error("DBENGINE on '%s': Failed to initialize multi-host database tier %zu on path '%s'", - hostname, tiers_init[tier].tier, tiers_init[tier].path); + netdata_log_error("DBENGINE on '%s': Failed to initialize multi-host database tier %zu on path '%s'", + hostname, + tiers_init[tier].tier, + tiers_init[tier].path); } else if(created_tiers == tier) created_tiers++; } if(created_tiers && created_tiers < storage_tiers) { - error("DBENGINE on '%s': Managed to create %zu tiers instead of %zu. Continuing with %zu available.", - hostname, created_tiers, storage_tiers, created_tiers); + netdata_log_error("DBENGINE on '%s': Managed to create %zu tiers instead of %zu. 
Continuing with %zu available.", + hostname, + created_tiers, + storage_tiers, + created_tiers); storage_tiers = created_tiers; } else if(!created_tiers) @@ -947,7 +977,7 @@ void dbengine_init(char *hostname) { #else storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", 1); if(storage_tiers != 1) { - error("DBENGINE is not available on '%s', so only 1 database tier can be supported.", hostname); + netdata_log_error("DBENGINE is not available on '%s', so only 1 database tier can be supported.", hostname); storage_tiers = 1; config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers); } @@ -963,7 +993,7 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt set_late_global_environment(system_info); fatal("Failed to initialize SQLite"); } - info("Skipping SQLITE metadata initialization since memory mode is not dbengine"); + netdata_log_info("Skipping SQLITE metadata initialization since memory mode is not dbengine"); } if (unlikely(sql_init_context_database(system_info ? 0 : 1))) { @@ -978,23 +1008,23 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt rrdpush_init(); if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE || rrdpush_receiver_needs_dbengine()) { - info("DBENGINE: Initializing ..."); + netdata_log_info("DBENGINE: Initializing ..."); dbengine_init(hostname); } else { - info("DBENGINE: Not initializing ..."); + netdata_log_info("DBENGINE: Not initializing ..."); storage_tiers = 1; } if (!dbengine_enabled) { if (storage_tiers > 1) { - error("dbengine is not enabled, but %zu tiers have been requested. Resetting tiers to 1", - storage_tiers); + netdata_log_error("dbengine is not enabled, but %zu tiers have been requested. Resetting tiers to 1", + storage_tiers); storage_tiers = 1; } if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - error("dbengine is not enabled, but it has been given as the default db mode. Resetting db mode to alloc"); + netdata_log_error("dbengine is not enabled, but it has been given as the default db mode. 
Resetting db mode to alloc"); default_rrd_memory_mode = RRD_MEMORY_MODE_ALLOC; } } @@ -1003,7 +1033,7 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt if(!unittest) metadata_sync_init(); - debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname); + netdata_log_debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname); localhost = rrdhost_create( hostname , registry_get_this_machine_hostname() @@ -1035,6 +1065,15 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unitt return 1; } +#ifdef NETDATA_DEV_MODE + // we register this only on localhost + // for the other nodes, the origin server should register it + rrd_collector_started(); // this creates a collector that runs for as long as netdata runs + rrd_collector_add_function(localhost, NULL, "streaming", 10, + RRDFUNCTIONS_STREAMING_HELP, true, + rrdhost_function_streaming, NULL); +#endif + if (likely(system_info)) { migrate_localhost(&localhost->host_uuid); sql_aclk_sync_init(); @@ -1094,22 +1133,20 @@ static void rrdhost_streaming_sender_structures_init(RRDHOST *host) host->sender->host = host; host->sender->buffer = cbuffer_new(CBUFFER_INITIAL_SIZE, 1024 * 1024, &netdata_buffers_statistics.cbuffers_streaming); - host->sender->capabilities = stream_our_capabilities(); + host->sender->capabilities = stream_our_capabilities(host, true); host->sender->rrdpush_sender_pipe[PIPE_READ] = -1; host->sender->rrdpush_sender_pipe[PIPE_WRITE] = -1; host->sender->rrdpush_sender_socket = -1; -#ifdef ENABLE_COMPRESSION - if(default_compression_enabled) { +#ifdef ENABLE_RRDPUSH_COMPRESSION + if(default_rrdpush_compression_enabled) host->sender->flags |= SENDER_FLAG_COMPRESSION; - host->sender->compressor = create_compressor(); - } else host->sender->flags &= ~SENDER_FLAG_COMPRESSION; #endif - netdata_mutex_init(&host->sender->mutex); + spinlock_init(&host->sender->spinlock); replication_init_sender(host->sender); } @@ -1120,11 +1157,10 @@ static void rrdhost_streaming_sender_structures_free(RRDHOST *host) if (unlikely(!host->sender)) return; - rrdpush_sender_thread_stop(host, "HOST CLEANUP", true); // stop a possibly running thread + rrdpush_sender_thread_stop(host, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP, true); // stop a possibly running thread cbuffer_free(host->sender->buffer); -#ifdef ENABLE_COMPRESSION - if (host->sender->compressor) - host->sender->compressor->destroy(&host->sender->compressor); +#ifdef ENABLE_RRDPUSH_COMPRESSION + rrdpush_compressor_destroy(&host->sender->compressor); #endif replication_cleanup_sender(host->sender); @@ -1139,7 +1175,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { if(!host) return; if (netdata_exit || force) { - info("RRD: 'host:%s' freeing memory...", rrdhost_hostname(host)); + netdata_log_info("RRD: 'host:%s' freeing memory...", rrdhost_hostname(host)); // ------------------------------------------------------------------------ // first remove it from the indexes, so that it will not be discoverable @@ -1157,7 +1193,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { rrdhost_streaming_sender_structures_free(host); if (netdata_exit || force) - stop_streaming_receiver(host, "HOST CLEANUP"); + stop_streaming_receiver(host, STREAM_HANDSHAKE_DISCONNECT_HOST_CLEANUP); // ------------------------------------------------------------------------ @@ -1199,7 +1235,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { #endif if (!netdata_exit && !force) { 
- info("RRD: 'host:%s' is now in archive mode...", rrdhost_hostname(host)); + netdata_log_info("RRD: 'host:%s' is now in archive mode...", rrdhost_hostname(host)); rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN); return; } @@ -1226,7 +1262,6 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { string_freez(host->health.health_default_recipient); string_freez(host->registry_hostname); simple_pattern_free(host->rrdpush_send_charts_matching); - netdata_rwlock_destroy(&host->health_log.alarm_log_rwlock); freez(host->node_id); rrdfamily_index_destroy(host); @@ -1269,7 +1304,7 @@ void rrd_finalize_collection_for_all_hosts(void) { void rrdhost_save_charts(RRDHOST *host) { if(!host) return; - info("RRD: 'host:%s' saving / closing database...", rrdhost_hostname(host)); + netdata_log_info("RRD: 'host:%s' saving / closing database...", rrdhost_hostname(host)); RRDSET *st; @@ -1378,15 +1413,16 @@ static void rrdhost_load_auto_labels(void) { rrdlabels_add(labels, "_streams_to", localhost->rrdpush_send_destination, RRDLABEL_SRC_AUTO); } -void rrdhost_set_is_parent_label(int count) { - DICTIONARY *labels = localhost->rrdlabels; +void rrdhost_set_is_parent_label(void) { + int count = __atomic_load_n(&localhost->connected_children_count, __ATOMIC_RELAXED); if (count == 0 || count == 1) { + DICTIONARY *labels = localhost->rrdlabels; rrdlabels_add(labels, "_is_parent", (count) ? "true" : "false", RRDLABEL_SRC_AUTO); //queue a node info #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { aclk_queue_node_info(localhost, false); } #endif @@ -1397,7 +1433,7 @@ static void rrdhost_load_config_labels(void) { int status = config_load(NULL, 1, CONFIG_SECTION_HOST_LABEL); if(!status) { char *filename = CONFIG_DIR "/" CONFIG_FILENAME; - error("RRDLABEL: Cannot reload the configuration file '%s', using labels in memory", filename); + netdata_log_error("RRDLABEL: Cannot reload the configuration file '%s', using labels in memory", filename); } struct section *co = appconfig_get_section(&netdata_config, CONFIG_SECTION_HOST_LABEL); @@ -1417,11 +1453,11 @@ static void rrdhost_load_kubernetes_labels(void) { sprintf(label_script, "%s/%s", netdata_configured_primary_plugins_dir, "get-kubernetes-labels.sh"); if (unlikely(access(label_script, R_OK) != 0)) { - error("Kubernetes pod label fetching script %s not found.",label_script); + netdata_log_error("Kubernetes pod label fetching script %s not found.",label_script); return; } - debug(D_RRDHOST, "Attempting to fetch external labels via %s", label_script); + netdata_log_debug(D_RRDHOST, "Attempting to fetch external labels via %s", label_script); pid_t pid; FILE *fp_child_input; @@ -1435,7 +1471,8 @@ static void rrdhost_load_kubernetes_labels(void) { // Non-zero exit code means that all the script output is error messages. We've shown already any message that didn't include a ':' // Here we'll inform with an ERROR that the script failed, show whatever (if anything) was added to the list of labels, free the memory and set the return to null int rc = netdata_pclose(fp_child_input, fp_child_output, pid); - if(rc) error("%s exited abnormally. Failed to get kubernetes labels.", label_script); + if(rc) + netdata_log_error("%s exited abnormally. 
Failed to get kubernetes labels.", label_script); } void reload_host_labels(void) { @@ -1455,7 +1492,7 @@ void reload_host_labels(void) { } void rrdhost_finalize_collection(RRDHOST *host) { - info("RRD: 'host:%s' stopping data collection...", rrdhost_hostname(host)); + netdata_log_info("RRD: 'host:%s' stopping data collection...", rrdhost_hostname(host)); RRDSET *st; rrdset_foreach_read(st, host) @@ -1469,7 +1506,7 @@ void rrdhost_finalize_collection(RRDHOST *host) { void rrdhost_delete_charts(RRDHOST *host) { if(!host) return; - info("RRD: 'host:%s' deleting disk files...", rrdhost_hostname(host)); + netdata_log_info("RRD: 'host:%s' deleting disk files...", rrdhost_hostname(host)); RRDSET *st; @@ -1491,7 +1528,7 @@ void rrdhost_delete_charts(RRDHOST *host) { void rrdhost_cleanup_charts(RRDHOST *host) { if(!host) return; - info("RRD: 'host:%s' cleaning up disk files...", rrdhost_hostname(host)); + netdata_log_info("RRD: 'host:%s' cleaning up disk files...", rrdhost_hostname(host)); RRDSET *st; uint32_t rrdhost_delete_obsolete_charts = rrdhost_option_check(host, RRDHOST_OPTION_DELETE_OBSOLETE_CHARTS); @@ -1518,7 +1555,7 @@ void rrdhost_cleanup_charts(RRDHOST *host) { // RRDHOST - save all hosts to disk void rrdhost_save_all(void) { - info("RRD: saving databases [%zu hosts(s)]...", rrdhost_hosts_available()); + netdata_log_info("RRD: saving databases [%zu hosts(s)]...", rrdhost_hosts_available()); rrd_rdlock(); @@ -1533,7 +1570,7 @@ void rrdhost_save_all(void) { // RRDHOST - save or delete all hosts from disk void rrdhost_cleanup_all(void) { - info("RRD: cleaning up database [%zu hosts(s)]...", rrdhost_hosts_available()); + netdata_log_info("RRD: cleaning up database [%zu hosts(s)]...", rrdhost_hosts_available()); rrd_rdlock(); @@ -1687,3 +1724,236 @@ int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, ch return res; } + +static NETDATA_DOUBLE rrdhost_sender_replication_completion_unsafe(RRDHOST *host, time_t now, size_t *instances) { + size_t charts = rrdhost_sender_replicating_charts(host); + NETDATA_DOUBLE completion; + if(!charts || !host->sender || !host->sender->replication.oldest_request_after_t) + completion = 100.0; + else if(!host->sender->replication.latest_completed_before_t || host->sender->replication.latest_completed_before_t < host->sender->replication.oldest_request_after_t) + completion = 0.0; + else { + time_t total = now - host->sender->replication.oldest_request_after_t; + time_t current = host->sender->replication.latest_completed_before_t - host->sender->replication.oldest_request_after_t; + completion = (NETDATA_DOUBLE) current * 100.0 / (NETDATA_DOUBLE) total; + } + + *instances = charts; + + return completion; +} + +bool rrdhost_matches_window(RRDHOST *host, time_t after, time_t before, time_t now) { + time_t first_time_s, last_time_s; + rrdhost_retention(host, now, rrdhost_is_online(host), &first_time_s, &last_time_s); + return query_matches_retention(after, before, first_time_s, last_time_s, 0); +} + +bool rrdhost_state_cloud_emulation(RRDHOST *host) { + return rrdhost_is_online(host); +} + +void rrdhost_status(RRDHOST *host, time_t now, RRDHOST_STATUS *s) { + memset(s, 0, sizeof(*s)); + + s->host = host; + s->now = now; + + RRDHOST_FLAGS flags = __atomic_load_n(&host->flags, __ATOMIC_RELAXED); + + // --- db --- + + bool online = rrdhost_is_online(host); + + rrdhost_retention(host, now, online, &s->db.first_time_s, &s->db.last_time_s); + s->db.metrics = host->rrdctx.metrics; + s->db.instances = host->rrdctx.instances; + s->db.contexts = 
dictionary_entries(host->rrdctx.contexts); + if(!s->db.first_time_s || !s->db.last_time_s || !s->db.metrics || !s->db.instances || !s->db.contexts || + (flags & (RRDHOST_FLAG_PENDING_CONTEXT_LOAD|RRDHOST_FLAG_CONTEXT_LOAD_IN_PROGRESS))) + s->db.status = RRDHOST_DB_STATUS_INITIALIZING; + else + s->db.status = RRDHOST_DB_STATUS_QUERYABLE; + + s->db.mode = host->rrd_memory_mode; + + // --- ingest --- + + s->ingest.since = MAX(host->child_connect_time, host->child_disconnected_time); + s->ingest.reason = (online) ? STREAM_HANDSHAKE_NEVER : host->rrdpush_last_receiver_exit_reason; + + netdata_mutex_lock(&host->receiver_lock); + s->ingest.hops = (host->system_info ? host->system_info->hops : (host == localhost) ? 0 : 1); + bool has_receiver = false; + if (host->receiver) { + has_receiver = true; + s->ingest.replication.instances = rrdhost_receiver_replicating_charts(host); + s->ingest.replication.completion = host->rrdpush_receiver_replication_percent; + s->ingest.replication.in_progress = s->ingest.replication.instances > 0; + + s->ingest.capabilities = host->receiver->capabilities; + s->ingest.peers = socket_peers(host->receiver->fd); +#ifdef ENABLE_HTTPS + s->ingest.ssl = SSL_connection(&host->receiver->ssl); +#endif + } + netdata_mutex_unlock(&host->receiver_lock); + + if (online) { + if(s->db.status == RRDHOST_DB_STATUS_INITIALIZING) + s->ingest.status = RRDHOST_INGEST_STATUS_INITIALIZING; + + else if (host == localhost || rrdhost_option_check(host, RRDHOST_OPTION_VIRTUAL_HOST)) { + s->ingest.status = RRDHOST_INGEST_STATUS_ONLINE; + s->ingest.since = netdata_start_time; + } + + else if (s->ingest.replication.in_progress) + s->ingest.status = RRDHOST_INGEST_STATUS_REPLICATING; + + else + s->ingest.status = RRDHOST_INGEST_STATUS_ONLINE; + } + else { + if (!s->ingest.since) { + s->ingest.status = RRDHOST_INGEST_STATUS_ARCHIVED; + s->ingest.since = s->db.last_time_s; + } + + else + s->ingest.status = RRDHOST_INGEST_STATUS_OFFLINE; + } + + if(host == localhost) + s->ingest.type = RRDHOST_INGEST_TYPE_LOCALHOST; + else if(has_receiver || rrdhost_flag_set(host, RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED)) + s->ingest.type = RRDHOST_INGEST_TYPE_CHILD; + else if(rrdhost_option_check(host, RRDHOST_OPTION_VIRTUAL_HOST)) + s->ingest.type = RRDHOST_INGEST_TYPE_VIRTUAL; + else + s->ingest.type = RRDHOST_INGEST_TYPE_ARCHIVED; + + s->ingest.id = host->rrdpush_receiver_connection_counter; + + if(!s->ingest.since) + s->ingest.since = netdata_start_time; + + if(s->ingest.status == RRDHOST_INGEST_STATUS_ONLINE) + s->db.liveness = RRDHOST_DB_LIVENESS_LIVE; + else + s->db.liveness = RRDHOST_DB_LIVENESS_STALE; + + // --- stream --- + + if (!host->sender) { + s->stream.status = RRDHOST_STREAM_STATUS_DISABLED; + s->stream.hops = s->ingest.hops + 1; + } + else { + sender_lock(host->sender); + + s->stream.since = host->sender->last_state_since_t; + s->stream.peers = socket_peers(host->sender->rrdpush_sender_socket); + s->stream.ssl = SSL_connection(&host->sender->ssl); + + memcpy(s->stream.sent_bytes_on_this_connection_per_type, + host->sender->sent_bytes_on_this_connection_per_type, + MIN(sizeof(s->stream.sent_bytes_on_this_connection_per_type), + sizeof(host->sender->sent_bytes_on_this_connection_per_type))); + + if (rrdhost_flag_check(host, RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED)) { + s->stream.hops = host->sender->hops; + s->stream.reason = STREAM_HANDSHAKE_NEVER; + s->stream.capabilities = host->sender->capabilities; + + s->stream.replication.completion = rrdhost_sender_replication_completion_unsafe(host, now, 
&s->stream.replication.instances); + s->stream.replication.in_progress = s->stream.replication.instances > 0; + + if(s->stream.replication.in_progress) + s->stream.status = RRDHOST_STREAM_STATUS_REPLICATING; + else + s->stream.status = RRDHOST_STREAM_STATUS_ONLINE; + +#ifdef ENABLE_RRDPUSH_COMPRESSION + s->stream.compression = (stream_has_capability(host->sender, STREAM_CAP_COMPRESSION) && host->sender->compressor.initialized); +#endif + } + else { + s->stream.status = RRDHOST_STREAM_STATUS_OFFLINE; + s->stream.hops = s->ingest.hops + 1; + s->stream.reason = host->sender->exit.reason; + } + + sender_unlock(host->sender); + } + + s->stream.id = host->rrdpush_sender_connection_counter; + + if(!s->stream.since) + s->stream.since = netdata_start_time; + + // --- ml --- + + if(ml_host_get_host_status(host, &s->ml.metrics)) { + s->ml.type = RRDHOST_ML_TYPE_SELF; + + if(s->ingest.status == RRDHOST_INGEST_STATUS_OFFLINE || s->ingest.status == RRDHOST_INGEST_STATUS_ARCHIVED) + s->ml.status = RRDHOST_ML_STATUS_OFFLINE; + else + s->ml.status = RRDHOST_ML_STATUS_RUNNING; + } + else if(stream_has_capability(&s->ingest, STREAM_CAP_DATA_WITH_ML)) { + s->ml.type = RRDHOST_ML_TYPE_RECEIVED; + s->ml.status = RRDHOST_ML_STATUS_RUNNING; + } + else { + // does not receive ML, does not run ML + s->ml.type = RRDHOST_ML_TYPE_DISABLED; + s->ml.status = RRDHOST_ML_STATUS_DISABLED; + } + + // --- health --- + + if(host->health.health_enabled) { + if(flags & RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION) + s->health.status = RRDHOST_HEALTH_STATUS_INITIALIZING; + else { + s->health.status = RRDHOST_HEALTH_STATUS_RUNNING; + + RRDCALC *rc; + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if (unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + + switch (rc->status) { + default: + case RRDCALC_STATUS_REMOVED: + break; + + case RRDCALC_STATUS_CLEAR: + s->health.alerts.clear++; + break; + + case RRDCALC_STATUS_WARNING: + s->health.alerts.warning++; + break; + + case RRDCALC_STATUS_CRITICAL: + s->health.alerts.critical++; + break; + + case RRDCALC_STATUS_UNDEFINED: + s->health.alerts.undefined++; + break; + + case RRDCALC_STATUS_UNINITIALIZED: + s->health.alerts.uninitialized++; + break; + } + } + foreach_rrdcalc_in_rrdhost_done(rc); + } + } + else + s->health.status = RRDHOST_HEALTH_STATUS_DISABLED; +} diff --git a/database/rrdlabels.c b/database/rrdlabels.c index 051222109..77d9a91f0 100644 --- a/database/rrdlabels.c +++ b/database/rrdlabels.c @@ -571,7 +571,7 @@ static void labels_add_already_sanitized(DICTIONARY *dict, const char *key, cons void rrdlabels_add(DICTIONARY *dict, const char *name, const char *value, RRDLABEL_SRC ls) { if(!dict) { - error("%s(): called with NULL dictionary.", __FUNCTION__ ); + netdata_log_error("%s(): called with NULL dictionary.", __FUNCTION__ ); return; } @@ -580,7 +580,7 @@ void rrdlabels_add(DICTIONARY *dict, const char *name, const char *value, RRDLAB rrdlabels_sanitize_value(v, value, RRDLABELS_MAX_VALUE_LENGTH); if(!*n) { - error("%s: cannot add name '%s' (value '%s') which is sanitized as empty string", __FUNCTION__, name, value); + netdata_log_error("%s: cannot add name '%s' (value '%s') which is sanitized as empty string", __FUNCTION__, name, value); return; } @@ -621,7 +621,7 @@ static const char *get_quoted_string_up_to(char *dst, size_t dst_size, const cha void rrdlabels_add_pair(DICTIONARY *dict, const char *string, RRDLABEL_SRC ls) { if(!dict) { - error("%s(): called with NULL dictionary.", __FUNCTION__ ); + netdata_log_error("%s(): called with NULL 
dictionary.", __FUNCTION__ ); return; } diff --git a/database/rrdset.c b/database/rrdset.c index 3177f43ff..1e00d5c8a 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -55,7 +55,7 @@ static STRING *rrdset_fix_name(RRDHOST *host, const char *chart_full_id, const c strncpyz(new_name, sanitized_name, CONFIG_MAX_VALUE); if(rrdset_index_find_name(host, new_name)) { - debug(D_RRD_CALLS, "RRDSET: chart name '%s' on host '%s' already exists.", new_name, rrdhost_hostname(host)); + netdata_log_debug(D_RRD_CALLS, "RRDSET: chart name '%s' on host '%s' already exists.", new_name, rrdhost_hostname(host)); if(!strcmp(chart_full_id, full_name) && (!current_name || !*current_name)) { unsigned i = 1; @@ -64,7 +64,7 @@ static STRING *rrdset_fix_name(RRDHOST *host, const char *chart_full_id, const c i++; } while (rrdset_index_find_name(host, new_name)); - info("RRDSET: using name '%s' for chart '%s' on host '%s'.", new_name, full_name, rrdhost_hostname(host)); + netdata_log_info("RRDSET: using name '%s' for chart '%s' on host '%s'.", new_name, full_name, rrdhost_hostname(host)); } else return NULL; @@ -128,14 +128,14 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v st->module_name = rrd_string_strdupz(ctr->module); st->priority = ctr->priority; - st->entries = (ctr->memory_mode != RRD_MEMORY_MODE_DBENGINE) ? align_entries_to_pagesize(ctr->memory_mode, ctr->history_entries) : 5; + st->db.entries = (ctr->memory_mode != RRD_MEMORY_MODE_DBENGINE) ? align_entries_to_pagesize(ctr->memory_mode, ctr->history_entries) : 5; st->update_every = ctr->update_every; st->rrd_memory_mode = ctr->memory_mode; st->chart_type = ctr->chart_type; st->rrdhost = host; - netdata_spinlock_init(&st->data_collection_lock); + spinlock_init(&st->data_collection_lock); st->flags = RRDSET_FLAG_SYNC_CLOCK | RRDSET_FLAG_INDEXED_ID @@ -143,11 +143,11 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v | RRDSET_FLAG_SENDER_REPLICATION_FINISHED ; - netdata_rwlock_init(&st->alerts.rwlock); + rw_spinlock_init(&st->alerts.spinlock); if(st->rrd_memory_mode == RRD_MEMORY_MODE_SAVE || st->rrd_memory_mode == RRD_MEMORY_MODE_MAP) { if(!rrdset_memory_load_or_create_map_save(st, st->rrd_memory_mode)) { - info("Failed to use db mode %s for chart '%s', falling back to ram mode.", (st->rrd_memory_mode == RRD_MEMORY_MODE_MAP)?"map":"save", rrdset_name(st)); + netdata_log_info("Failed to use db mode %s for chart '%s', falling back to ram mode.", (st->rrd_memory_mode == RRD_MEMORY_MODE_MAP)?"map":"save", rrdset_name(st)); st->rrd_memory_mode = RRD_MEMORY_MODE_RAM; } } @@ -263,8 +263,6 @@ static void rrdset_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v // ------------------------------------------------------------------------ // free it - netdata_rwlock_destroy(&st->alerts.rwlock); - string_freez(st->id); string_freez(st->name); string_freez(st->parts.id); @@ -278,7 +276,7 @@ static void rrdset_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, v string_freez(st->module_name); freez(st->exporting_flags); - freez(st->cache_dir); + freez(st->db.cache_dir); } // the item to be inserted, is already in the dictionary @@ -452,7 +450,7 @@ static RRDSET *rrdset_index_find(RRDHOST *host, const char *id) { // RRDSET - find charts inline RRDSET *rrdset_find(RRDHOST *host, const char *id) { - debug(D_RRD_CALLS, "rrdset_find() for chart '%s' in host '%s'", id, rrdhost_hostname(host)); + netdata_log_debug(D_RRD_CALLS, "rrdset_find() for chart '%s' in host '%s'", id, 
rrdhost_hostname(host)); RRDSET *st = rrdset_index_find(host, id); if(st) @@ -462,7 +460,7 @@ inline RRDSET *rrdset_find(RRDHOST *host, const char *id) { } inline RRDSET *rrdset_find_bytype(RRDHOST *host, const char *type, const char *id) { - debug(D_RRD_CALLS, "rrdset_find_bytype() for chart '%s.%s' in host '%s'", type, id, rrdhost_hostname(host)); + netdata_log_debug(D_RRD_CALLS, "rrdset_find_bytype() for chart '%s.%s' in host '%s'", type, id, rrdhost_hostname(host)); char buf[RRD_ID_LENGTH_MAX + 1]; strncpyz(buf, type, RRD_ID_LENGTH_MAX - 1); @@ -474,13 +472,13 @@ inline RRDSET *rrdset_find_bytype(RRDHOST *host, const char *type, const char *i } inline RRDSET *rrdset_find_byname(RRDHOST *host, const char *name) { - debug(D_RRD_CALLS, "rrdset_find_byname() for chart '%s' in host '%s'", name, rrdhost_hostname(host)); + netdata_log_debug(D_RRD_CALLS, "rrdset_find_byname() for chart '%s' in host '%s'", name, rrdhost_hostname(host)); RRDSET *st = rrdset_index_find_name(host, name); return(st); } RRDSET_ACQUIRED *rrdset_find_and_acquire(RRDHOST *host, const char *id) { - debug(D_RRD_CALLS, "rrdset_find_and_acquire() for host %s, chart %s", rrdhost_hostname(host), id); + netdata_log_debug(D_RRD_CALLS, "rrdset_find_and_acquire() for host %s, chart %s", rrdhost_hostname(host), id); return (RRDSET_ACQUIRED *)dictionary_get_and_acquire_item(host->rrdset_root_index, id); } @@ -524,7 +522,7 @@ int rrdset_reset_name(RRDSET *st, const char *name) { RRDHOST *host = st->rrdhost; - debug(D_RRD_CALLS, "rrdset_reset_name() old: '%s', new: '%s'", rrdset_name(st), name); + netdata_log_debug(D_RRD_CALLS, "rrdset_reset_name() old: '%s', new: '%s'", rrdset_name(st), name); STRING *name_string = rrdset_fix_name(host, rrdset_id(st), rrdset_parts_type(st), string2str(st->name), name); if(!name_string) return 0; @@ -660,7 +658,7 @@ void rrdset_get_retention_of_tier_for_collected_chart(RRDSET *st, time_t *first_ inline void rrdset_is_obsolete(RRDSET *st) { if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))) { - info("Cannot obsolete already archived chart %s", rrdset_name(st)); + netdata_log_info("Cannot obsolete already archived chart %s", rrdset_name(st)); return; } @@ -702,8 +700,8 @@ inline void rrdset_update_heterogeneous_flag(RRDSET *st) { bool init = false, is_heterogeneous = false; RRD_ALGORITHM algorithm; - collected_number multiplier; - collected_number divisor; + int32_t multiplier; + int32_t divisor; rrddim_foreach_read(rd, st) { if(!init) { @@ -717,7 +715,9 @@ inline void rrdset_update_heterogeneous_flag(RRDSET *st) { if(algorithm != rd->algorithm || multiplier != ABS(rd->multiplier) || divisor != ABS(rd->divisor)) { if(!rrdset_flag_check(st, RRDSET_FLAG_HETEROGENEOUS)) { #ifdef NETDATA_INTERNAL_CHECKS - info("Dimension '%s' added on chart '%s' of host '%s' is not homogeneous to other dimensions already present (algorithm is '%s' vs '%s', multiplier is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ", divisor is " COLLECTED_NUMBER_FORMAT " vs " COLLECTED_NUMBER_FORMAT ").", + netdata_log_info("Dimension '%s' added on chart '%s' of host '%s' is not homogeneous to other dimensions already present " + "(algorithm is '%s' vs '%s', multiplier is %d vs %d, " + "divisor is %d vs %d).", rrddim_name(rd), rrdset_name(st), rrdhost_hostname(host), @@ -745,21 +745,21 @@ inline void rrdset_update_heterogeneous_flag(RRDSET *st) { // RRDSET - reset a chart void rrdset_reset(RRDSET *st) { - debug(D_RRD_CALLS, "rrdset_reset() %s", rrdset_name(st)); + netdata_log_debug(D_RRD_CALLS, "rrdset_reset() %s", 
rrdset_name(st)); st->last_collected_time.tv_sec = 0; st->last_collected_time.tv_usec = 0; st->last_updated.tv_sec = 0; st->last_updated.tv_usec = 0; - st->current_entry = 0; + st->db.current_entry = 0; st->counter = 0; st->counter_done = 0; RRDDIM *rd; rrddim_foreach_read(rd, st) { - rd->last_collected_time.tv_sec = 0; - rd->last_collected_time.tv_usec = 0; - rd->collections_counter = 0; + rd->collector.last_collected_time.tv_sec = 0; + rd->collector.last_collected_time.tv_usec = 0; + rd->collector.counter = 0; if(!rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { for(size_t tier = 0; tier < storage_tiers ;tier++) @@ -833,45 +833,45 @@ void rrdset_save(RRDSET *st) { void rrdset_delete_files(RRDSET *st) { RRDDIM *rd; - info("Deleting chart '%s' ('%s') from disk...", rrdset_id(st), rrdset_name(st)); + netdata_log_info("Deleting chart '%s' ('%s') from disk...", rrdset_id(st), rrdset_name(st)); if(st->rrd_memory_mode == RRD_MEMORY_MODE_SAVE || st->rrd_memory_mode == RRD_MEMORY_MODE_MAP) { const char *cache_filename = rrdset_cache_filename(st); if(cache_filename) { - info("Deleting chart header file '%s'.", cache_filename); + netdata_log_info("Deleting chart header file '%s'.", cache_filename); if (unlikely(unlink(cache_filename) == -1)) - error("Cannot delete chart header file '%s'", cache_filename); + netdata_log_error("Cannot delete chart header file '%s'", cache_filename); } else - error("Cannot find the cache filename of chart '%s'", rrdset_id(st)); + netdata_log_error("Cannot find the cache filename of chart '%s'", rrdset_id(st)); } rrddim_foreach_read(rd, st) { const char *cache_filename = rrddim_cache_filename(rd); if(!cache_filename) continue; - info("Deleting dimension file '%s'.", cache_filename); + netdata_log_info("Deleting dimension file '%s'.", cache_filename); if(unlikely(unlink(cache_filename) == -1)) - error("Cannot delete dimension file '%s'", cache_filename); + netdata_log_error("Cannot delete dimension file '%s'", cache_filename); } rrddim_foreach_done(rd); - if(st->cache_dir) - recursively_delete_dir(st->cache_dir, "left-over chart"); + if(st->db.cache_dir) + recursively_delete_dir(st->db.cache_dir, "left-over chart"); } void rrdset_delete_obsolete_dimensions(RRDSET *st) { RRDDIM *rd; - info("Deleting dimensions of chart '%s' ('%s') from disk...", rrdset_id(st), rrdset_name(st)); + netdata_log_info("Deleting dimensions of chart '%s' ('%s') from disk...", rrdset_id(st), rrdset_name(st)); rrddim_foreach_read(rd, st) { if(rrddim_flag_check(rd, RRDDIM_FLAG_OBSOLETE)) { const char *cache_filename = rrddim_cache_filename(rd); if(!cache_filename) continue; - info("Deleting dimension file '%s'.", cache_filename); + netdata_log_info("Deleting dimension file '%s'.", cache_filename); if(unlikely(unlink(cache_filename) == -1)) - error("Cannot delete dimension file '%s'", cache_filename); + netdata_log_error("Cannot delete dimension file '%s'", cache_filename); } } rrddim_foreach_done(rd); @@ -933,7 +933,7 @@ RRDSET *rrdset_create_custom( // ------------------------------------------------------------------------ // allocate it - debug(D_RRD_CALLS, "Creating RRD_STATS for '%s.%s'.", type, id); + netdata_log_debug(D_RRD_CALLS, "Creating RRD_STATS for '%s.%s'.", type, id); struct rrdset_constructor tmp = { .host = host, @@ -999,8 +999,8 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las if(unlikely(since_last_usec < 0)) { // oops! 
the database is in the future #ifdef NETDATA_INTERNAL_CHECKS - info("RRD database for chart '%s' on host '%s' is %0.5" NETDATA_DOUBLE_MODIFIER - " secs in the future (counter #%zu, update #%zu). Adjusting it to current time." + netdata_log_info("RRD database for chart '%s' on host '%s' is %0.5" NETDATA_DOUBLE_MODIFIER + " secs in the future (counter #%u, update #%u). Adjusting it to current time." , rrdset_id(st) , rrdhost_hostname(st->rrdhost) , (NETDATA_DOUBLE)-since_last_usec / USEC_PER_SEC @@ -1018,8 +1018,10 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las else if(unlikely((usec_t)since_last_usec > (usec_t)(st->update_every * 5 * USEC_PER_SEC))) { // oops! the database is too far behind #ifdef NETDATA_INTERNAL_CHECKS - info("RRD database for chart '%s' on host '%s' is %0.5" NETDATA_DOUBLE_MODIFIER - " secs in the past (counter #%zu, update #%zu). Adjusting it to current time.", rrdset_id(st), rrdhost_hostname(st->rrdhost), (NETDATA_DOUBLE)since_last_usec / USEC_PER_SEC, st->counter, st->counter_done); + netdata_log_info("RRD database for chart '%s' on host '%s' is %0.5" NETDATA_DOUBLE_MODIFIER + " secs in the past (counter #%u, update #%u). Adjusting it to current time.", + rrdset_id(st), rrdhost_hostname(st->rrdhost), (NETDATA_DOUBLE)since_last_usec / USEC_PER_SEC, + st->counter, st->counter_done); #endif duration_since_last_update = (usec_t)since_last_usec; @@ -1044,7 +1046,7 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las last_time_s = now.tv_sec; if(min_delta > permanent_min_delta) { - info("MINIMUM MICROSECONDS DELTA of thread %d increased from %lld to %lld (+%lld)", gettid(), permanent_min_delta, min_delta, min_delta - permanent_min_delta); + netdata_log_info("MINIMUM MICROSECONDS DELTA of thread %d increased from %lld to %lld (+%lld)", gettid(), permanent_min_delta, min_delta, min_delta - permanent_min_delta); permanent_min_delta = min_delta; } @@ -1054,7 +1056,7 @@ void rrdset_timed_next(RRDSET *st, struct timeval now, usec_t duration_since_las #endif } - debug(D_RRD_CALLS, "rrdset_timed_next() for chart %s with duration since last update %llu usec", rrdset_name(st), duration_since_last_update); + netdata_log_debug(D_RRD_CALLS, "rrdset_timed_next() for chart %s with duration since last update %llu usec", rrdset_name(st), duration_since_last_update); rrdset_debug(st, "NEXT: %llu microseconds", duration_since_last_update); internal_error(discarded && discarded != duration_since_last_update, @@ -1126,7 +1128,7 @@ static inline void rrdset_init_last_updated_time(RRDSET *st) { static __thread size_t rrdset_done_statistics_points_stored_per_tier[RRD_STORAGE_TIERS]; static inline time_t tier_next_point_time_s(RRDDIM *rd, struct rrddim_tier *t, time_t now_s) { - time_t loop = (time_t)rd->update_every * (time_t)t->tier_grouping; + time_t loop = (time_t)rd->rrdset->update_every * (time_t)t->tier_grouping; return now_s + loop - ((now_s + loop) % loop); } @@ -1231,7 +1233,7 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n, time_t now_s = (time_t)(point_end_time_ut / USEC_PER_SEC); STORAGE_POINT sp = { - .start_time_s = now_s - rd->update_every, + .start_time_s = now_s - rd->rrdset->update_every, .end_time_s = now_s, .min = n, .max = n, @@ -1320,7 +1322,7 @@ static inline size_t rrdset_done_interpolate( if((now_collect_ut % (update_every_ut)) == 0) iterations++; size_t counter = st->counter; - long current_entry = st->current_entry; + long current_entry = st->db.current_entry; SN_FLAGS 
storage_flags = SN_DEFAULT_FLAGS; @@ -1357,7 +1359,7 @@ static inline size_t rrdset_done_interpolate( switch(rd->algorithm) { case RRD_ALGORITHM_INCREMENTAL: new_value = (NETDATA_DOUBLE) - ( rd->calculated_value + ( rd->collector.calculated_value * (NETDATA_DOUBLE)(next_store_ut - last_collect_ut) / (NETDATA_DOUBLE)(now_collect_ut - last_collect_ut) ); @@ -1368,14 +1370,14 @@ static inline size_t rrdset_done_interpolate( " / (%llu - %llu)" , rrddim_name(rd) , new_value - , rd->calculated_value + , rd->collector.calculated_value , next_store_ut, last_collect_ut , now_collect_ut, last_collect_ut ); - rd->calculated_value -= new_value; - new_value += rd->last_calculated_value; - rd->last_calculated_value = 0; + rd->collector.calculated_value -= new_value; + new_value += rd->collector.last_calculated_value; + rd->collector.last_calculated_value = 0; new_value /= (NETDATA_DOUBLE)st->update_every; if(unlikely(next_store_ut - last_stored_ut < update_every_ut)) { @@ -1398,18 +1400,18 @@ static inline size_t rrdset_done_interpolate( // do not interpolate // just show the calculated value - new_value = rd->calculated_value; + new_value = rd->collector.calculated_value; } else { // we have missed an update // interpolate in the middle values new_value = (NETDATA_DOUBLE) - ( ( (rd->calculated_value - rd->last_calculated_value) + ( ( (rd->collector.calculated_value - rd->collector.last_calculated_value) * (NETDATA_DOUBLE)(next_store_ut - last_collect_ut) / (NETDATA_DOUBLE)(now_collect_ut - last_collect_ut) ) - + rd->last_calculated_value + + rd->collector.last_calculated_value ); rrdset_debug(st, "%s: CALC2 DEF " NETDATA_DOUBLE_FORMAT " = (((" @@ -1417,9 +1419,9 @@ static inline size_t rrdset_done_interpolate( " * %llu" " / %llu) + " NETDATA_DOUBLE_FORMAT, rrddim_name(rd) , new_value - , rd->calculated_value, rd->last_calculated_value + , rd->collector.calculated_value, rd->collector.last_calculated_value , (next_store_ut - first_ut) - , (now_collect_ut - first_ut), rd->last_calculated_value + , (now_collect_ut - first_ut), rd->collector.last_calculated_value ); } break; @@ -1437,7 +1439,7 @@ static inline size_t rrdset_done_interpolate( continue; } - if(likely(rd->updated && rd->collections_counter > 1 && iterations < gap_when_lost_iterations_above)) { + if(likely(rrddim_check_updated(rd) && rd->collector.counter > 1 && iterations < gap_when_lost_iterations_above)) { uint32_t dim_storage_flags = storage_flags; if (ml_dimension_is_anomalous(rd, current_time_s, new_value, true)) { @@ -1449,7 +1451,7 @@ static inline size_t rrdset_done_interpolate( rrddim_push_metrics_v2(rsb, rd, next_store_ut, new_value, dim_storage_flags); rrddim_store_metric(rd, next_store_ut, new_value, dim_storage_flags); - rd->last_stored_value = new_value; + rd->collector.last_stored_value = new_value; } else { (void) ml_dimension_is_anomalous(rd, current_time_s, 0, false); @@ -1460,7 +1462,7 @@ static inline size_t rrdset_done_interpolate( rrddim_push_metrics_v2(rsb, rd, next_store_ut, NAN, SN_FLAG_NONE); rrddim_store_metric(rd, next_store_ut, NAN, SN_FLAG_NONE); - rd->last_stored_value = NAN; + rd->collector.last_stored_value = NAN; } stored_entries++; @@ -1472,7 +1474,7 @@ static inline size_t rrdset_done_interpolate( storage_flags = SN_DEFAULT_FLAGS; st->counter = ++counter; - st->current_entry = current_entry = ((current_entry + 1) >= st->entries) ? 0 : current_entry + 1; + st->db.current_entry = current_entry = ((current_entry + 1) >= st->db.entries) ? 
0 : current_entry + 1; st->last_updated.tv_sec = (time_t) (last_ut / USEC_PER_SEC); st->last_updated.tv_usec = 0; @@ -1507,12 +1509,12 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) if(unlikely(rrdhost_has_rrdpush_sender_enabled(st->rrdhost))) stream_buffer = rrdset_push_metric_initialize(st, now.tv_sec); - netdata_spinlock_lock(&st->data_collection_lock); + spinlock_lock(&st->data_collection_lock); if (pending_rrdset_next) rrdset_timed_next(st, now, 0ULL); - debug(D_RRD_CALLS, "rrdset_done() for chart '%s'", rrdset_name(st)); + netdata_log_debug(D_RRD_CALLS, "rrdset_done() for chart '%s'", rrdset_name(st)); RRDDIM *rd; @@ -1529,19 +1531,20 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) RRDSET_FLAGS rrdset_flags = rrdset_flag_check(st, ~0); if(unlikely(rrdset_flags & RRDSET_FLAG_COLLECTION_FINISHED)) { - netdata_spinlock_unlock(&st->data_collection_lock); + spinlock_unlock(&st->data_collection_lock); return; } if (unlikely(rrdset_flags & RRDSET_FLAG_OBSOLETE)) { - error("Chart '%s' has the OBSOLETE flag set, but it is collected.", rrdset_id(st)); + netdata_log_error("Chart '%s' has the OBSOLETE flag set, but it is collected.", rrdset_id(st)); rrdset_isnot_obsolete(st); } // check if the chart has a long time to be updated - if(unlikely(st->usec_since_last_update > MAX(st->entries, 60) * update_every_ut)) { - info("host '%s', chart '%s': took too long to be updated (counter #%zu, update #%zu, %0.3" NETDATA_DOUBLE_MODIFIER - " secs). Resetting it.", rrdhost_hostname(st->rrdhost), rrdset_id(st), st->counter, st->counter_done, (NETDATA_DOUBLE)st->usec_since_last_update / USEC_PER_SEC); + if(unlikely(st->usec_since_last_update > MAX(st->db.entries, 60) * update_every_ut)) { + netdata_log_info("host '%s', chart '%s': took too long to be updated (counter #%u, update #%u, %0.3" NETDATA_DOUBLE_MODIFIER + " secs). Resetting it.", rrdhost_hostname(st->rrdhost), rrdset_id(st), st->counter, st->counter_done, + (NETDATA_DOUBLE)st->usec_since_last_update / USEC_PER_SEC); rrdset_reset(st); st->usec_since_last_update = update_every_ut; store_this_entry = 0; @@ -1579,9 +1582,9 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) } // check if we will re-write the entire data set - if(unlikely(dt_usec(&st->last_collected_time, &st->last_updated) > st->entries * update_every_ut && + if(unlikely(dt_usec(&st->last_collected_time, &st->last_updated) > st->db.entries * update_every_ut && st->rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE)) { - info( + netdata_log_info( "'%s': too old data (last updated at %"PRId64".%"PRId64", last collected at %"PRId64".%"PRId64"). " "Resetting it. Will not store the next entry.", rrdset_id(st), @@ -1659,28 +1662,28 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) rda->rd = dictionary_acquired_item_value(rda->item); // calculate totals - if(likely(rd->updated)) { + if(likely(rrddim_check_updated(rd))) { // if the new is smaller than the old (an overflow, or reset), set the old equal to the new // to reset the calculation (it will give zero as the calculation for this second) - if(unlikely(rd->algorithm == RRD_ALGORITHM_PCENT_OVER_DIFF_TOTAL && rd->last_collected_value > rd->collected_value)) { - debug(D_RRD_STATS, "'%s' / '%s': RESET or OVERFLOW. 
Last collected value = " COLLECTED_NUMBER_FORMAT ", current = " COLLECTED_NUMBER_FORMAT + if(unlikely(rd->algorithm == RRD_ALGORITHM_PCENT_OVER_DIFF_TOTAL && rd->collector.last_collected_value > rd->collector.collected_value)) { + netdata_log_debug(D_RRD_STATS, "'%s' / '%s': RESET or OVERFLOW. Last collected value = " COLLECTED_NUMBER_FORMAT ", current = " COLLECTED_NUMBER_FORMAT , rrdset_id(st) , rrddim_name(rd) - , rd->last_collected_value - , rd->collected_value + , rd->collector.last_collected_value + , rd->collector.collected_value ); if(!(rrddim_option_check(rd, RRDDIM_OPTION_DONT_DETECT_RESETS_OR_OVERFLOWS))) has_reset_value = 1; - rd->last_collected_value = rd->collected_value; + rd->collector.last_collected_value = rd->collector.collected_value; } - last_collected_total += rd->last_collected_value; - collected_total += rd->collected_value; + last_collected_total += rd->collector.last_collected_value; + collected_total += rd->collector.collected_value; if(unlikely(rrddim_flag_check(rd, RRDDIM_FLAG_OBSOLETE))) { - error("Dimension %s in chart '%s' has the OBSOLETE flag set, but it is collected.", rrddim_name(rd), rrdset_id(st)); + netdata_log_error("Dimension %s in chart '%s' has the OBSOLETE flag set, but it is collected.", rrddim_name(rd), rrdset_id(st)); rrddim_isnot_obsolete(st, rd); } } @@ -1700,8 +1703,8 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) rd = rda->rd; if(unlikely(!rd)) continue; - if(unlikely(!rd->updated)) { - rd->calculated_value = 0; + if(unlikely(!rrddim_check_updated(rd))) { + rd->collector.calculated_value = 0; continue; } @@ -1711,25 +1714,25 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) " last_calculated_value = " NETDATA_DOUBLE_FORMAT " calculated_value = " NETDATA_DOUBLE_FORMAT , rrddim_name(rd) - , rd->last_collected_value - , rd->collected_value - , rd->last_calculated_value - , rd->calculated_value + , rd->collector.last_collected_value + , rd->collector.collected_value + , rd->collector.last_calculated_value + , rd->collector.calculated_value ); switch(rd->algorithm) { case RRD_ALGORITHM_ABSOLUTE: - rd->calculated_value = (NETDATA_DOUBLE)rd->collected_value - * (NETDATA_DOUBLE)rd->multiplier - / (NETDATA_DOUBLE)rd->divisor; + rd->collector.calculated_value = (NETDATA_DOUBLE)rd->collector.collected_value + * (NETDATA_DOUBLE)rd->multiplier + / (NETDATA_DOUBLE)rd->divisor; rrdset_debug(st, "%s: CALC ABS/ABS-NO-IN " NETDATA_DOUBLE_FORMAT " = " COLLECTED_NUMBER_FORMAT " * " NETDATA_DOUBLE_FORMAT " / " NETDATA_DOUBLE_FORMAT , rrddim_name(rd) - , rd->calculated_value - , rd->collected_value + , rd->collector.calculated_value + , rd->collector.collected_value , (NETDATA_DOUBLE)rd->multiplier , (NETDATA_DOUBLE)rd->divisor ); @@ -1737,28 +1740,28 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) case RRD_ALGORITHM_PCENT_OVER_ROW_TOTAL: if(unlikely(!collected_total)) - rd->calculated_value = 0; + rd->collector.calculated_value = 0; else // the percentage of the current value // over the total of all dimensions - rd->calculated_value = + rd->collector.calculated_value = (NETDATA_DOUBLE)100 - * (NETDATA_DOUBLE)rd->collected_value + * (NETDATA_DOUBLE)rd->collector.collected_value / (NETDATA_DOUBLE)collected_total; rrdset_debug(st, "%s: CALC PCENT-ROW " NETDATA_DOUBLE_FORMAT " = 100" " * " COLLECTED_NUMBER_FORMAT " / " COLLECTED_NUMBER_FORMAT , rrddim_name(rd) - , rd->calculated_value - , rd->collected_value + , rd->collector.calculated_value + , 
rd->collector.collected_value , collected_total ); break; case RRD_ALGORITHM_INCREMENTAL: - if(unlikely(rd->collections_counter <= 1)) { - rd->calculated_value = 0; + if(unlikely(rd->collector.counter <= 1)) { + rd->collector.calculated_value = 0; continue; } @@ -1766,19 +1769,19 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) // to reset the calculation (it will give zero as the calculation for this second). // It is imperative to set the comparison to uint64_t since type collected_number is signed and // produces wrong results as far as incremental counters are concerned. - if(unlikely((uint64_t)rd->last_collected_value > (uint64_t)rd->collected_value)) { - debug(D_RRD_STATS, "'%s' / '%s': RESET or OVERFLOW. Last collected value = " COLLECTED_NUMBER_FORMAT ", current = " COLLECTED_NUMBER_FORMAT + if(unlikely((uint64_t)rd->collector.last_collected_value > (uint64_t)rd->collector.collected_value)) { + netdata_log_debug(D_RRD_STATS, "'%s' / '%s': RESET or OVERFLOW. Last collected value = " COLLECTED_NUMBER_FORMAT ", current = " COLLECTED_NUMBER_FORMAT , rrdset_id(st) , rrddim_name(rd) - , rd->last_collected_value - , rd->collected_value); + , rd->collector.last_collected_value + , rd->collector.collected_value); if(!(rrddim_option_check(rd, RRDDIM_OPTION_DONT_DETECT_RESETS_OR_OVERFLOWS))) has_reset_value = 1; - uint64_t last = (uint64_t)rd->last_collected_value; - uint64_t new = (uint64_t)rd->collected_value; - uint64_t max = (uint64_t)rd->collected_value_max; + uint64_t last = (uint64_t)rd->collector.last_collected_value; + uint64_t new = (uint64_t)rd->collector.collected_value; + uint64_t max = (uint64_t)rd->collector.collected_value_max; uint64_t cap = 0; // Signed values are handled by exploiting two's complement which will produce positive deltas @@ -1795,19 +1798,19 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next) // overflow. // TODO: remember recent history of rates and compare with current rate to reduce this chance. if (delta < max_acceptable_rate) { - rd->calculated_value += + rd->collector.calculated_value += (NETDATA_DOUBLE) delta * (NETDATA_DOUBLE) rd->multiplier / (NETDATA_DOUBLE) rd->divisor; } else { // This is a reset. Any overflow with a rate greater than MAX_INCREMENTAL_PERCENT_RATE will also // be detected as a reset instead. 
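The RRD_ALGORITHM_INCREMENTAL hunk above (and its comments) distinguishes counter overflows from resets: unsigned subtraction wraps modulo the counter's capacity, so a wrapped counter still yields the correct positive delta, while an implausibly fast delta is demoted to a reset. A standalone sketch of that logic for a 64-bit counter; the 10% bound is illustrative, in the spirit of MAX_INCREMENTAL_PERCENT_RATE, not the actual constant:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_MAX_PERCENT_RATE 10          /* hypothetical plausibility bound */

static uint64_t incremental_delta_sketch(uint64_t last, uint64_t now) {
    if (now >= last)
        return now - last;                  /* normal monotonic growth */

    /* the counter went backwards: either it wrapped (overflow) or it was reset.
       unsigned subtraction wraps mod 2^64, giving exactly the overflow delta */
    uint64_t delta = now - last;

    /* a genuine overflow produces a small delta relative to capacity;
       a faster apparent rate is treated as a reset and contributes nothing */
    if (delta <= UINT64_MAX / 100 * SKETCH_MAX_PERCENT_RATE)
        return delta;

    return 0;
}

int main(void) {
    /* wraparound near UINT64_MAX: delta is 16, accepted as an overflow */
    printf("%llu\n", (unsigned long long)incremental_delta_sketch(UINT64_MAX - 5, 10));
    /* reset: the wrapped delta is absurdly large, so it is ignored */
    printf("%llu\n", (unsigned long long)incremental_delta_sketch(1000000, 10));
    return 0;
}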
-                        rd->calculated_value += (NETDATA_DOUBLE)0;
+                        rd->collector.calculated_value += (NETDATA_DOUBLE)0;
                    }
                }
                else {
-                    rd->calculated_value +=
-                        (NETDATA_DOUBLE) (rd->collected_value - rd->last_collected_value)
+                    rd->collector.calculated_value +=
+                        (NETDATA_DOUBLE) (rd->collector.collected_value - rd->collector.last_collected_value)
                        * (NETDATA_DOUBLE) rd->multiplier
                        / (NETDATA_DOUBLE) rd->divisor;
                }
@@ -1818,35 +1821,35 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
                            " * " NETDATA_DOUBLE_FORMAT
                            " / " NETDATA_DOUBLE_FORMAT
                            , rrddim_name(rd)
-                            , rd->calculated_value
-                            , rd->collected_value, rd->last_collected_value
+                            , rd->collector.calculated_value
+                            , rd->collector.collected_value, rd->collector.last_collected_value
                            , (NETDATA_DOUBLE)rd->multiplier
                            , (NETDATA_DOUBLE)rd->divisor
                );
                break;

            case RRD_ALGORITHM_PCENT_OVER_DIFF_TOTAL:
-                if(unlikely(rd->collections_counter <= 1)) {
-                    rd->calculated_value = 0;
+                if(unlikely(rd->collector.counter <= 1)) {
+                    rd->collector.calculated_value = 0;
                    continue;
                }

                // the percentage of the current increment
                // over the increment of all dimensions together
                if(unlikely(collected_total == last_collected_total))
-                    rd->calculated_value = 0;
+                    rd->collector.calculated_value = 0;
                else
-                    rd->calculated_value =
+                    rd->collector.calculated_value =
                        (NETDATA_DOUBLE)100
-                        * (NETDATA_DOUBLE)(rd->collected_value - rd->last_collected_value)
+                        * (NETDATA_DOUBLE)(rd->collector.collected_value - rd->collector.last_collected_value)
                        / (NETDATA_DOUBLE)(collected_total - last_collected_total);

                rrdset_debug(st, "%s: CALC PCENT-DIFF " NETDATA_DOUBLE_FORMAT
                            " = 100"
                            " * (" COLLECTED_NUMBER_FORMAT " - " COLLECTED_NUMBER_FORMAT ")"
                            " / (" COLLECTED_NUMBER_FORMAT " - " COLLECTED_NUMBER_FORMAT ")"
                            , rrddim_name(rd)
-                            , rd->calculated_value
-                            , rd->collected_value, rd->last_collected_value
+                            , rd->collector.calculated_value
+                            , rd->collector.collected_value, rd->collector.last_collected_value
                            , collected_total, last_collected_total
                );
                break;
@@ -1854,11 +1857,11 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
            default:
                // make the default zero, to make sure
                // it gets noticed when we add new types
-                rd->calculated_value = 0;
+                rd->collector.calculated_value = 0;

                rrdset_debug(st, "%s: CALC " NETDATA_DOUBLE_FORMAT " = 0"
                            , rrddim_name(rd)
-                            , rd->calculated_value
+                            , rd->collector.calculated_value
                );
                break;
        }
@@ -1869,10 +1872,10 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
                    " last_calculated_value = " NETDATA_DOUBLE_FORMAT
                    " calculated_value = " NETDATA_DOUBLE_FORMAT
                    , rrddim_name(rd)
-                    , rd->last_collected_value
-                    , rd->collected_value
-                    , rd->last_calculated_value
-                    , rd->calculated_value
+                    , rd->collector.last_collected_value
+                    , rd->collector.collected_value
+                    , rd->collector.last_calculated_value
+                    , rd->collector.calculated_value
        );
    }
@@ -1883,7 +1886,7 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
// if(unlikely(now_collect_ut < next_store_ut && st->counter_done > 1)) {
//     // this is collected in the same interpolation point
//     rrdset_debug(st, "THIS IS IN THE SAME INTERPOLATION POINT");
-//     info("INTERNAL CHECK: host '%s', chart '%s' collection %zu is in the same interpolation point: short by %llu microseconds", st->rrdhost->hostname, rrdset_name(st), st->counter_done, next_store_ut - now_collect_ut);
+//     netdata_log_info("INTERNAL CHECK: host '%s', chart '%s' collection %zu is in the same interpolation point: short by %llu microseconds", st->rrdhost->hostname, rrdset_name(st), st->counter_done, next_store_ut - now_collect_ut);
// }
// #endif
@@ -1905,12 +1908,12 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
        rd = rda->rd;
        if(unlikely(!rd)) continue;

-        if(unlikely(!rd->updated))
+        if(unlikely(!rrddim_check_updated(rd)))
            continue;

-        rrdset_debug(st, "%s: setting last_collected_value (old: " COLLECTED_NUMBER_FORMAT ") to last_collected_value (new: " COLLECTED_NUMBER_FORMAT ")", rrddim_name(rd), rd->last_collected_value, rd->collected_value);
+        rrdset_debug(st, "%s: setting last_collected_value (old: " COLLECTED_NUMBER_FORMAT ") to last_collected_value (new: " COLLECTED_NUMBER_FORMAT ")", rrddim_name(rd), rd->collector.last_collected_value, rd->collector.collected_value);

-        rd->last_collected_value = rd->collected_value;
+        rd->collector.last_collected_value = rd->collector.collected_value;

        switch(rd->algorithm) {
            case RRD_ALGORITHM_INCREMENTAL:
@@ -1918,10 +1921,10 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
                    rrdset_debug(st, "%s: setting last_calculated_value (old: " NETDATA_DOUBLE_FORMAT ") to "
                                "last_calculated_value (new: " NETDATA_DOUBLE_FORMAT ")"
                                , rrddim_name(rd)
-                                , rd->last_calculated_value + rd->calculated_value
-                                , rd->calculated_value);
+                                , rd->collector.last_calculated_value + rd->collector.calculated_value
+                                , rd->collector.calculated_value);

-                    rd->last_calculated_value += rd->calculated_value;
+                    rd->collector.last_calculated_value += rd->collector.calculated_value;
                }
                else {
                    rrdset_debug(st, "THIS IS THE FIRST POINT");
@@ -1934,16 +1937,16 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
                rrdset_debug(st, "%s: setting last_calculated_value (old: " NETDATA_DOUBLE_FORMAT ") to "
                            "last_calculated_value (new: " NETDATA_DOUBLE_FORMAT ")"
                            , rrddim_name(rd)
-                            , rd->last_calculated_value
-                            , rd->calculated_value);
+                            , rd->collector.last_calculated_value
+                            , rd->collector.calculated_value);

-                rd->last_calculated_value = rd->calculated_value;
+                rd->collector.last_calculated_value = rd->collector.calculated_value;
                break;
        }

-        rd->calculated_value = 0;
-        rd->collected_value = 0;
-        rd->updated = 0;
+        rd->collector.calculated_value = 0;
+        rd->collector.collected_value = 0;
+        rrddim_clear_updated(rd);

        rrdset_debug(st, "%s: END "
                    " last_collected_value = " COLLECTED_NUMBER_FORMAT
@@ -1951,14 +1954,14 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
                    " last_calculated_value = " NETDATA_DOUBLE_FORMAT
                    " calculated_value = " NETDATA_DOUBLE_FORMAT
                    , rrddim_name(rd)
-                    , rd->last_collected_value
-                    , rd->collected_value
-                    , rd->last_calculated_value
-                    , rd->calculated_value
+                    , rd->collector.last_collected_value
+                    , rd->collector.collected_value
+                    , rd->collector.last_calculated_value
+                    , rd->collector.calculated_value
        );
    }

-    netdata_spinlock_unlock(&st->data_collection_lock);
+    spinlock_unlock(&st->data_collection_lock);

    rrdset_push_metrics_finished(&stream_buffer, st);

    // ALL DONE ABOUT THE DATA UPDATE
@@ -1991,6 +1994,8 @@ void rrdset_timed_done(RRDSET *st, struct timeval now, bool pending_rrdset_next)
}

time_t rrdset_set_update_every_s(RRDSET *st, time_t update_every_s) {
+    if(unlikely(update_every_s == st->update_every))
+        return st->update_every;

    internal_error(true, "RRDSET '%s' switching update every from %d to %d",
                   rrdset_id(st), (int)st->update_every, (int)update_every_s);
@@ -2007,10 +2012,6 @@ time_t rrdset_set_update_every_s(RRDSET *st, time_t update_every_s) {
                    rd->tiers[tier].db_collection_handle,
                    (int)(st->rrdhost->db[tier].tier_grouping * st->update_every));
        }
-
-        assert(rd->update_every == (int) prev_update_every_s &&
-               "chart's update every differs from the update every of its dimensions");
-
        rd->update_every = st->update_every;
    }
    rrddim_foreach_done(rd);
@@ -2094,10 +2095,10 @@ struct rrdset_map_save_v019 {
};

void rrdset_memory_file_update(RRDSET *st) {
-    if(!st->st_on_file) return;
-    struct rrdset_map_save_v019 *st_on_file = st->st_on_file;
+    if(!st->db.st_on_file) return;
+    struct rrdset_map_save_v019 *st_on_file = st->db.st_on_file;

-    st_on_file->current_entry = st->current_entry;
+    st_on_file->current_entry = st->db.current_entry;
    st_on_file->counter = st->counter;
    st_on_file->usec_since_last_update = st->usec_since_last_update;
    st_on_file->last_updated.tv_sec = st->last_updated.tv_sec;
@@ -2105,41 +2106,41 @@ void rrdset_memory_file_update(RRDSET *st) {
}

const char *rrdset_cache_filename(RRDSET *st) {
-    if(!st->st_on_file) return NULL;
-    struct rrdset_map_save_v019 *st_on_file = st->st_on_file;
+    if(!st->db.st_on_file) return NULL;
+    struct rrdset_map_save_v019 *st_on_file = st->db.st_on_file;
    return st_on_file->cache_filename;
}

const char *rrdset_cache_dir(RRDSET *st) {
-    if(!st->cache_dir)
-        st->cache_dir = rrdhost_cache_dir_for_rrdset_alloc(st->rrdhost, rrdset_id(st));
+    if(!st->db.cache_dir)
+        st->db.cache_dir = rrdhost_cache_dir_for_rrdset_alloc(st->rrdhost, rrdset_id(st));

-    return st->cache_dir;
+    return st->db.cache_dir;
}

void rrdset_memory_file_free(RRDSET *st) {
-    if(!st->st_on_file) return;
+    if(!st->db.st_on_file) return;

    // needed for memory mode map, to save the latest state
    rrdset_memory_file_update(st);

-    struct rrdset_map_save_v019 *st_on_file = st->st_on_file;
+    struct rrdset_map_save_v019 *st_on_file = st->db.st_on_file;
    __atomic_sub_fetch(&rrddim_db_memory_size, st_on_file->memsize, __ATOMIC_RELAXED);
    netdata_munmap(st_on_file, st_on_file->memsize);

    // remove the pointers from the RRDDIM
-    st->st_on_file = NULL;
+    st->db.st_on_file = NULL;
}

void rrdset_memory_file_save(RRDSET *st) {
-    if(!st->st_on_file) return;
+    if(!st->db.st_on_file) return;

    rrdset_memory_file_update(st);

-    struct rrdset_map_save_v019 *st_on_file = st->st_on_file;
+    struct rrdset_map_save_v019 *st_on_file = st->db.st_on_file;
    if(st_on_file->rrd_memory_mode != RRD_MEMORY_MODE_SAVE) return;

-    memory_file_save(st_on_file->cache_filename, st->st_on_file, st_on_file->memsize);
+    memory_file_save(st_on_file->cache_filename, st->db.st_on_file, st_on_file->memsize);
}

bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mode) {
@@ -2159,27 +2160,27 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mo
    st_on_file->magic[sizeof(RRDSET_MAGIC_V019)] = '\0';

    if(strcmp(st_on_file->magic, RRDSET_MAGIC_V019) != 0) {
-        info("Initializing file '%s'.", fullfilename);
+        netdata_log_info("Initializing file '%s'.", fullfilename);
        memset(st_on_file, 0, size);
    }
    else if(strncmp(st_on_file->id, rrdset_id(st), RRD_ID_LENGTH_MAX_V019) != 0) {
-        error("File '%s' contents are not for chart '%s'. Clearing it.", fullfilename, rrdset_id(st));
+        netdata_log_error("File '%s' contents are not for chart '%s'. Clearing it.", fullfilename, rrdset_id(st));
        memset(st_on_file, 0, size);
    }
-    else if(st_on_file->memsize != size || st_on_file->entries != st->entries) {
-        error("File '%s' does not have the desired size. Clearing it.", fullfilename);
+    else if(st_on_file->memsize != size || st_on_file->entries != st->db.entries) {
+        netdata_log_error("File '%s' does not have the desired size. Clearing it.", fullfilename);
        memset(st_on_file, 0, size);
    }
    else if(st_on_file->update_every != st->update_every) {
-        error("File '%s' does not have the desired granularity. Clearing it.", fullfilename);
+        netdata_log_error("File '%s' does not have the desired granularity. Clearing it.", fullfilename);
        memset(st_on_file, 0, size);
    }
-    else if((now_s - st_on_file->last_updated.tv_sec) > st->update_every * st->entries) {
-        info("File '%s' is too old. Clearing it.", fullfilename);
+    else if((now_s - st_on_file->last_updated.tv_sec) > (long)st->update_every * (long)st->db.entries) {
+        netdata_log_info("File '%s' is too old. Clearing it.", fullfilename);
        memset(st_on_file, 0, size);
    }
    else if(st_on_file->last_updated.tv_sec > now_s + st->update_every) {
-        error("File '%s' refers to the future by %zd secs. Resetting it to now.", fullfilename, (ssize_t)(st_on_file->last_updated.tv_sec - now_s));
+        netdata_log_error("File '%s' refers to the future by %zd secs. Resetting it to now.", fullfilename, (ssize_t)(st_on_file->last_updated.tv_sec - now_s));
        st_on_file->last_updated.tv_sec = now_s;
    }
@@ -2194,14 +2195,14 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mo
    }

    // copy the useful values to st
-    st->current_entry = st_on_file->current_entry;
+    st->db.current_entry = st_on_file->current_entry;
    st->counter = st_on_file->counter;
    st->usec_since_last_update = st_on_file->usec_since_last_update;
    st->last_updated.tv_sec = st_on_file->last_updated.tv_sec;
    st->last_updated.tv_usec = st_on_file->last_updated.tv_usec;

    // link it to st
-    st->st_on_file = st_on_file;
+    st->db.st_on_file = st_on_file;

    // clear everything
    memset(st_on_file, 0, size);
@@ -2211,7 +2212,7 @@ bool rrdset_memory_load_or_create_map_save(RRDSET *st, RRD_MEMORY_MODE memory_mo
    strcpy(st_on_file->cache_filename, fullfilename);
    strcpy(st_on_file->magic, RRDSET_MAGIC_V019);
    st_on_file->memsize = size;
-    st_on_file->entries = st->entries;
+    st_on_file->entries = st->db.entries;
    st_on_file->update_every = st->update_every;
    st_on_file->rrd_memory_mode = memory_mode;
diff --git a/database/rrdsetvar.c b/database/rrdsetvar.c
index 15377ddb2..379f92eec 100644
--- a/database/rrdsetvar.c
+++ b/database/rrdsetvar.c
@@ -222,7 +222,7 @@ void rrdsetvar_add_and_leave_released(RRDSET *st, const char *name, RRDVAR_TYPE
}

void rrdsetvar_rename_all(RRDSET *st) {
-    debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", rrdset_id(st), rrdset_name(st));
+    netdata_log_debug(D_VARIABLES, "RRDSETVAR rename for chart id '%s' name '%s'", rrdset_id(st), rrdset_name(st));

    RRDSETVAR *rs;
    dfe_start_write(st->rrdsetvar_root_index, rs) {
@@ -262,8 +262,13 @@ void rrdsetvar_custom_chart_variable_set(RRDSET *st, const RRDSETVAR_ACQUIRED *r
    RRDSETVAR *rs = dictionary_acquired_item_value((const DICTIONARY_ITEM *)rsa);

    if(rs->type != RRDVAR_TYPE_CALCULATED || !(rs->flags & RRDVAR_FLAG_CUSTOM_CHART_VAR) || !(rs->flags & RRDVAR_FLAG_ALLOCATED)) {
-        error("RRDSETVAR: requested to set variable '%s' of chart '%s' on host '%s' to value " NETDATA_DOUBLE_FORMAT
-              " but the variable is not a custom chart one (it has options 0x%x, value pointer %p). Ignoring request.", string2str(rs->name), rrdset_id(st), rrdhost_hostname(st->rrdhost), value, (uint32_t)rs->flags, rs->value);
+        netdata_log_error("RRDSETVAR: requested to set variable '%s' of chart '%s' on host '%s' to value " NETDATA_DOUBLE_FORMAT
+                          " but the variable is not a custom chart one (it has options 0x%x, value pointer %p). Ignoring request.",
+                          string2str(rs->name),
+                          rrdset_id(st),
+                          rrdhost_hostname(st->rrdhost),
+                          value,
+                          (uint32_t)rs->flags, rs->value);
    }
    else {
        NETDATA_DOUBLE *v = rs->value;
diff --git a/database/rrdvar.c b/database/rrdvar.c
index 914a5d6ed..09c4d404d 100644
--- a/database/rrdvar.c
+++ b/database/rrdvar.c
@@ -175,7 +175,7 @@ void rrdvar_custom_host_variable_set(RRDHOST *host, const RRDVAR_ACQUIRED *rva,
    if(unlikely(!host->rrdvars || !rva)) return; // when health is not enabled

    if(rrdvar_type(rva) != RRDVAR_TYPE_CALCULATED || !(rrdvar_flags(rva) & (RRDVAR_FLAG_CUSTOM_HOST_VAR | RRDVAR_FLAG_ALLOCATED)))
-        error("requested to set variable '%s' to value " NETDATA_DOUBLE_FORMAT " but the variable is not a custom one.", rrdvar_name(rva), value);
+        netdata_log_error("requested to set variable '%s' to value " NETDATA_DOUBLE_FORMAT " but the variable is not a custom one.", rrdvar_name(rva), value);
    else {
        RRDVAR *rv = dictionary_acquired_item_value((const DICTIONARY_ITEM *)rva);
        NETDATA_DOUBLE *v = rv->value;
@@ -228,7 +228,7 @@ NETDATA_DOUBLE rrdvar2number(const RRDVAR_ACQUIRED *rva) {
        }

        default:
-            error("I don't know how to convert RRDVAR type %u to NETDATA_DOUBLE", rv->type);
+            netdata_log_error("I don't know how to convert RRDVAR type %u to NETDATA_DOUBLE", rv->type);
            return NAN;
    }
}
@@ -272,9 +272,9 @@ void rrdvar_store_for_chart(RRDHOST *host, RRDSET *st) {
    RRDDIM *rd;
    rrddim_foreach_read(rd, st) {
-        rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE);
-        rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
-        rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
+        rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->collector.last_stored_value, RRDVAR_FLAG_NONE);
+        rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->collector.last_collected_value, RRDVAR_FLAG_NONE);
+        rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->collector.last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
    }
    rrddim_foreach_done(rd);
}
diff --git a/database/sqlite/sqlite_aclk.c b/database/sqlite/sqlite_aclk.c
index a33e09f5d..fedce50eb 100644
--- a/database/sqlite/sqlite_aclk.c
+++ b/database/sqlite/sqlite_aclk.c
@@ -67,7 +67,7 @@ static void aclk_database_enq_cmd(struct aclk_database_cmd *cmd)
    /* wake up event loop */
    int rc = uv_async_send(&aclk_sync_config.async);
    if (unlikely(rc))
-        debug(D_ACLK_SYNC, "Failed to wake up event loop");
+        netdata_log_debug(D_ACLK_SYNC, "Failed to wake up event loop");
}

enum {
@@ -226,14 +226,14 @@ static void sql_delete_aclk_table_list(char *host_guid)
    uuid_unparse_lower(host_uuid, host_str);
    uuid_unparse_lower_fix(&host_uuid, uuid_str);

-    debug(D_ACLK_SYNC, "Checking if I should delete aclk tables for node %s", host_str);
+    netdata_log_debug(D_ACLK_SYNC, "Checking if I should delete aclk tables for node %s", host_str);

    if (is_host_available(&host_uuid)) {
-        debug(D_ACLK_SYNC, "Host %s exists, not deleting aclk sync tables", host_str);
+        netdata_log_debug(D_ACLK_SYNC, "Host %s exists, not deleting aclk sync tables", host_str);
        return;
    }

-    debug(D_ACLK_SYNC, "Host %s does NOT exist, can delete aclk sync tables", host_str);
+    netdata_log_debug(D_ACLK_SYNC, "Host %s does NOT exist, can delete aclk sync tables", host_str);

    sqlite3_stmt *res = NULL;
    BUFFER *sql = buffer_create(ACLK_SYNC_QUERY_SIZE, &netdata_buffers_statistics.buffers_sqlite);
@@ -257,7 +257,7 @@ static void sql_delete_aclk_table_list(char *host_guid)
    rc = db_execute(db_meta, buffer_tostring(sql));
    if (unlikely(rc))
-        error("Failed to drop unused ACLK tables");
+        netdata_log_error("Failed to drop unused ACLK tables");

fail:
    buffer_free(sql);
@@ -265,7 +265,7 @@ fail:

static int sql_check_aclk_table(void *data __maybe_unused, int argc __maybe_unused, char **argv __maybe_unused, char **column __maybe_unused)
{
-    debug(D_ACLK_SYNC,"Scheduling aclk sync table check for node %s", (char *) argv[0]);
+    netdata_log_debug(D_ACLK_SYNC,"Scheduling aclk sync table check for node %s", (char *) argv[0]);
    struct aclk_database_cmd cmd;
    memset(&cmd, 0, sizeof(cmd));
    cmd.opcode = ACLK_DATABASE_DELETE_HOST;
@@ -280,7 +280,7 @@ static int sql_check_aclk_table(void *data __maybe_unus
static void sql_check_aclk_table_list(void)
{
    char *err_msg = NULL;
-    debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist");
+    netdata_log_debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist");
    int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ACTIVE_LIST, sql_check_aclk_table, NULL, &err_msg);
    if (rc != SQLITE_OK) {
        error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg);
@@ -305,7 +305,7 @@ static int sql_maint_aclk_sync_database(void *data __maybe_unused, int argc __ma
static void sql_maint_aclk_sync_database_all(void)
{
    char *err_msg = NULL;
-    debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist");
+    netdata_log_debug(D_ACLK_SYNC,"Cleaning tables for nodes that do not exist");
    int rc = sqlite3_exec_monitored(db_meta, SQL_SELECT_ACLK_ALERT_LIST, sql_maint_aclk_sync_database, NULL, &err_msg);
    if (rc != SQLITE_OK) {
        error_report("Query failed when trying to check for obsolete ACLK sync tables, %s", err_msg);
@@ -385,7 +385,7 @@ static void aclk_synchronization(void *arg __maybe_unused)
    config->timer_req.data = config;
    fatal_assert(0 == uv_timer_start(&config->timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS));

-    info("Starting ACLK synchronization thread");
+    netdata_log_info("Starting ACLK synchronization thread");

    config->cleanup_after = now_realtime_sec() + ACLK_DATABASE_CLEANUP_FIRST;
    config->initialized = true;
@@ -444,7 +444,7 @@ static void aclk_synchronization(void *arg __maybe_unused)
                    sql_process_queue_removed_alerts_to_aclk(cmd.param[0]);
                    break;
                default:
-                    debug(D_ACLK_SYNC, "%s: default.", __func__);
+                    netdata_log_debug(D_ACLK_SYNC, "%s: default.", __func__);
                    break;
            }
            if (cmd.completion)
@@ -462,7 +462,7 @@ static void aclk_synchronization(void *arg __maybe_unused)
    worker_unregister();
    service_exits();

-    info("ACLK SYNC: Shutting down ACLK synchronization event loop");
+    netdata_log_info("ACLK SYNC: Shutting down ACLK synchronization event loop");
}

static void aclk_synchronization_init(void)
@@ -543,7 +543,7 @@ void sql_aclk_sync_init(void)
        return;
    }

-    info("Creating archived hosts");
+    netdata_log_info("Creating archived hosts");
    int number_of_children = 0;
    rc = sqlite3_exec_monitored(db_meta, SQL_FETCH_ALL_HOSTS, create_host_callback, &number_of_children, &err_msg);

@@ -552,7 +552,7 @@ void sql_aclk_sync_init(void)
        sqlite3_free(err_msg);
    }

-    info("Created %d archived hosts", number_of_children);
+    netdata_log_info("Created %d archived hosts", number_of_children);

    // Trigger host context load for hosts that have been created
    metadata_queue_load_host_context(NULL);
@@ -568,7 +568,7 @@ void sql_aclk_sync_init(void)
    }

    aclk_synchronization_init();

-    info("ACLK sync initialization completed");
+    netdata_log_info("ACLK sync initialization completed");
#endif
}
diff --git a/database/sqlite/sqlite_aclk.h b/database/sqlite/sqlite_aclk.h
index d555a0cef..705102d74 100644
--- a/database/sqlite/sqlite_aclk.h
+++ b/database/sqlite/sqlite_aclk.h
@@ -27,12 +27,20 @@ static inline void uuid_unparse_lower_fix(uuid_t *uuid, char *out)
    out[23] = '_';
}

+static inline int uuid_parse_fix(char *in, uuid_t uuid)
+{
+    in[8] = '-';
+    in[13] = '-';
+    in[18] = '-';
+    in[23] = '-';
+    return uuid_parse(in, uuid);
+}
+
static inline int claimed()
{
    return localhost->aclk_state.claimed_id != NULL;
}

-
#define TABLE_ACLK_ALERT "CREATE TABLE IF NOT EXISTS aclk_alert_%s (sequence_id INTEGER PRIMARY KEY, " \
    "alert_unique_id, date_created, date_submitted, date_cloud_ack, filtered_alert_unique_id NOT NULL, " \
    "unique(alert_unique_id));"
@@ -79,6 +87,8 @@ struct aclk_sync_host_config {
    char uuid_str[UUID_STR_LEN];
    char node_id[UUID_STR_LEN];
    char *alerts_snapshot_uuid; // will contain the snapshot_uuid value if snapshot was requested
+    uint64_t alerts_log_first_sequence_id;
+    uint64_t alerts_log_last_sequence_id;
};

extern sqlite3 *db_meta;
diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c
index 52d343acb..d57ae043f 100644
--- a/database/sqlite/sqlite_aclk_alert.c
+++ b/database/sqlite/sqlite_aclk_alert.c
@@ -7,37 +7,7 @@
#include "../../aclk/aclk_alarm_api.h"
#endif

-#define SQL_GET_ALERT_REMOVE_TIME "SELECT when_key FROM health_log_%s WHERE alarm_id = %u " \
-    "AND unique_id > %u AND unique_id < %u " \
-    "AND new_status = -2;"
-
-time_t removed_when(uint32_t alarm_id, uint32_t before_unique_id, uint32_t after_unique_id, char *uuid_str) {
-    sqlite3_stmt *res = NULL;
-    time_t when = 0;
-    char sql[ACLK_SYNC_QUERY_SIZE];
-
-    snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_GET_ALERT_REMOVE_TIME, uuid_str, alarm_id, after_unique_id, before_unique_id);
-
-    int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0);
-    if (rc != SQLITE_OK) {
-        error_report("Failed to prepare statement when trying to find removed gap.");
-        return 0;
-    }
-
-    rc = sqlite3_step_monitored(res);
-    if (likely(rc == SQLITE_ROW)) {
-        when = (time_t) sqlite3_column_int64(res, 0);
-    }
-
-    rc = sqlite3_finalize(res);
-    if (unlikely(rc != SQLITE_OK))
-        error_report("Failed to finalize statement when trying to find removed gap, rc = %d", rc);
-
-    return when;
-}
-
#define SQL_UPDATE_FILTERED_ALERT "UPDATE aclk_alert_%s SET filtered_alert_unique_id = %u where filtered_alert_unique_id = %u"
-
void update_filtered(ALARM_ENTRY *ae, uint32_t unique_id, char *uuid_str) {
    char sql[ACLK_SYNC_QUERY_SIZE];
    snprintfz(sql, ACLK_SYNC_QUERY_SIZE-1, SQL_UPDATE_FILTERED_ALERT, uuid_str, ae->unique_id, unique_id);
@@ -45,17 +15,16 @@ void update_filtered(ALARM_ENTRY *ae, uint32_t unique_id, char *uuid_str) {
    ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED;
}

-#define SQL_SELECT_ALERT_BY_UNIQUE_ID "SELECT hl.unique_id FROM health_log_%s hl, alert_hash ah WHERE hl.unique_id = %u " \
-    "AND hl.config_hash_id = ah.hash_id " \
+#define SQL_SELECT_VARIABLE_ALERT_BY_UNIQUE_ID "SELECT hld.unique_id FROM health_log hl, alert_hash ah, health_log_detail hld WHERE hld.unique_id = %u " \
+    "AND hl.config_hash_id = ah.hash_id AND hld.health_log_id = hl.health_log_id AND host_id = @host_id " \
    "AND ah.warn IS NULL AND ah.crit IS NULL;"
-
-static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char *uuid_str) {
+static inline bool is_event_from_alert_variable_config(uint32_t unique_id, uuid_t *host_id) {
    sqlite3_stmt *res = NULL;
    int rc = 0;
    bool ret = false;

    char sql[ACLK_SYNC_QUERY_SIZE];
-    snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_SELECT_ALERT_BY_UNIQUE_ID, uuid_str, unique_id);
+    snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_SELECT_VARIABLE_ALERT_BY_UNIQUE_ID, unique_id);

    rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0);
    if (rc != SQLITE_OK) {
@@ -63,6 +32,13 @@ static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char
        return false;
    }

+    rc = sqlite3_bind_blob(res, 1, host_id, sizeof(*host_id), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind host_id for checking alert variable.");
+        sqlite3_finalize(res);
+        return false;
+    }
+
    rc = sqlite3_step_monitored(res);
    if (likely(rc == SQLITE_ROW)) {
        ret = true;
@@ -76,13 +52,12 @@ static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char
}

#define MAX_REMOVED_PERIOD 604800 //a week
-//decide if some events should be sent or not
-
-#define SQL_SELECT_ALERT_BY_ID "SELECT hl.new_status, hl.config_hash_id, hl.unique_id FROM health_log_%s hl, aclk_alert_%s aa " \
-    "WHERE hl.unique_id = aa.filtered_alert_unique_id " \
-    "AND hl.alarm_id = %u " \
-    "ORDER BY alarm_event_id DESC LIMIT 1;"
+//decide if some events should be sent or not
+#define SQL_SELECT_ALERT_BY_ID "SELECT hld.new_status, hl.config_hash_id, hld.unique_id FROM health_log hl, aclk_alert_%s aa, health_log_detail hld " \
+    "WHERE hld.unique_id = aa.filtered_alert_unique_id " \
+    "AND hld.alarm_id = %u AND hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id " \
+    "ORDER BY hld.alarm_event_id DESC LIMIT 1;"
int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
{
    sqlite3_stmt *res = NULL;
@@ -94,7 +69,7 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
        return 0;
    }

-    if (unlikely(uuid_is_null(ae->config_hash_id))) 
+    if (unlikely(uuid_is_null(ae->config_hash_id)))
        return 0;

    char sql[ACLK_SYNC_QUERY_SIZE];
@@ -104,7 +79,7 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
    //get the previous sent event of this alarm_id
    //base the search on the last filtered event
-    snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_SELECT_ALERT_BY_ID, uuid_str, uuid_str, ae->alarm_id);
+    snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, SQL_SELECT_ALERT_BY_ID, uuid_str, ae->alarm_id);

    int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0);
    if (rc != SQLITE_OK) {
@@ -113,13 +88,19 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
        return send;
    }

+    rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind host_id when checking the previous alert event.");
+        sqlite3_finalize(res);
+        return 0;
+    }
+
    rc = sqlite3_step_monitored(res);
    if (likely(rc == SQLITE_ROW)) {
        status = (RRDCALC_STATUS) sqlite3_column_int(res, 0);
        if (sqlite3_column_type(res, 1) != SQLITE_NULL)
            uuid_copy(config_hash_id, *((uuid_t *) sqlite3_column_blob(res, 1)));
        unique_id = (uint32_t) sqlite3_column_int64(res, 2);
-
    } else {
        send = 1;
        goto done;
@@ -136,26 +117,9 @@ int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae)
    }

    //same status, same config
-    if (ae->new_status == RRDCALC_STATUS_CLEAR || ae->new_status == RRDCALC_STATUS_UNDEFINED) {
-        send = 0;
-        update_filtered(ae, unique_id, uuid_str);
-        goto done;
-    }
+    send = 0;
+    update_filtered(ae, unique_id, uuid_str);

-    //detect a long off period of the agent, TODO make global
-    if (ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) {
-        time_t when = removed_when(ae->alarm_id, ae->unique_id, unique_id, uuid_str);
-
-        if (when && (when + (time_t)MAX_REMOVED_PERIOD) < ae->when) {
-            send = 1;
-            goto done;
-        } else {
-            send = 0;
-            update_filtered(ae, unique_id, uuid_str);
-            goto done;
-        }
-    }
-
done:
    rc = sqlite3_finalize(res);
    if (unlikely(rc != SQLITE_OK))
@@ -164,12 +128,8 @@ done:
    return send;
}

-// will replace call to aclk_update_alarm in health/health_log.c
-// and handle both cases
-
#define SQL_QUEUE_ALERT_TO_CLOUD "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \
    "VALUES (@alert_unique_id, unixepoch(), @alert_unique_id) ON CONFLICT (alert_unique_id) do nothing;"
-
int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter)
{
    if(!service_running(SERVICE_ACLK))
@@ -193,7 +153,7 @@ int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter)
    char uuid_str[UUID_STR_LEN];
    uuid_unparse_lower_fix(&host->host_uuid, uuid_str);

-    if (is_event_from_alert_variable_config(ae->unique_id, uuid_str))
+    if (is_event_from_alert_variable_config(ae->unique_id, &host->host_uuid))
        return 0;

    sqlite3_stmt *res_alert = NULL;
@@ -286,7 +246,7 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)
    int rc;

    if (unlikely(!wc->alert_updates)) {
-        log_access("ACLK STA [%s (%s)]: Ignoring alert push event, updates have been turned off for this node.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A");
+        netdata_log_access("ACLK STA [%s (%s)]: Ignoring alert push event, updates have been turned off for this node.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A");
        return;
    }
@@ -305,21 +265,18 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)

    sqlite3_stmt *res = NULL;

-    buffer_sprintf(sql, "select aa.sequence_id, hl.unique_id, hl.alarm_id, hl.config_hash_id, hl.updated_by_id, hl.when_key, " \
-        " hl.duration, hl.non_clear_duration, hl.flags, hl.exec_run_timestamp, hl.delay_up_to_timestamp, hl.name, " \
-        " hl.chart, hl.family, hl.exec, hl.recipient, hl.source, hl.units, hl.info, hl.exec_code, hl.new_status, " \
-        " hl.old_status, hl.delay, hl.new_value, hl.old_value, hl.last_repeat, hl.chart_context, hl.transition_id, hl.alarm_event_id " \
-        " from health_log_%s hl, aclk_alert_%s aa " \
-        " where hl.unique_id = aa.alert_unique_id and aa.date_submitted is null " \
-        " order by aa.sequence_id asc limit %d;", wc->uuid_str, wc->uuid_str, limit);
+    buffer_sprintf(sql, "select aa.sequence_id, hld.unique_id, hld.alarm_id, hl.config_hash_id, hld.updated_by_id, hld.when_key, " \
+        " hld.duration, hld.non_clear_duration, hld.flags, hld.exec_run_timestamp, hld.delay_up_to_timestamp, hl.name, " \
+        " hl.chart, hl.family, hl.exec, hl.recipient, ha.source, hl.units, hld.info, hld.exec_code, hld.new_status, " \
+        " hld.old_status, hld.delay, hld.new_value, hld.old_value, hld.last_repeat, hl.chart_context, hld.transition_id, hld.alarm_event_id " \
+        " from health_log hl, aclk_alert_%s aa, alert_hash ha, health_log_detail hld " \
+        " where hld.unique_id = aa.alert_unique_id and hl.config_hash_id = ha.hash_id and aa.date_submitted is null " \
+        " and hl.host_id = @host_id and hl.health_log_id = hld.health_log_id " \
+        " order by aa.sequence_id asc limit %d;", wc->uuid_str, limit);

    rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0);
    if (rc != SQLITE_OK) {
-        // Try to create tables
-        if (wc->host)
-            sql_create_health_log_table(wc->host);
-
        BUFFER *sql_fix = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite);
        buffer_sprintf(sql_fix, TABLE_ACLK_ALERT, wc->uuid_str);
        rc = db_execute(db_meta, buffer_tostring(sql_fix));
@@ -344,10 +301,17 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)
        }
    }

-    uint64_t first_sequence_id = 0;
-    uint64_t last_sequence_id = 0;
-    static __thread uint64_t log_first_sequence_id = 0;
-    static __thread uint64_t log_last_sequence_id = 0;
+    rc = sqlite3_bind_blob(res, 1, &wc->host->host_uuid, sizeof(wc->host->host_uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind host_id for pushing alert event.");
+        sqlite3_finalize(res);
+        buffer_free(sql);
+        freez(claim_id);
+        return;
+    }
+
+    uint64_t first_sequence_id = 0;
+    uint64_t last_sequence_id = 0;

    while (sqlite3_step_monitored(res) == SQLITE_ROW) {
        struct alarm_log_entry alarm_log;
@@ -371,7 +335,8 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)
        alarm_log.timezone = strdupz(rrdhost_abbrev_timezone(wc->host));
        alarm_log.exec_path = sqlite3_column_bytes(res, 14) > 0 ? strdupz((char *)sqlite3_column_text(res, 14)) :
                                                                  strdupz((char *)string2str(wc->host->health.health_default_exec));
-        alarm_log.conf_source = strdupz((char *)sqlite3_column_text(res, 16));
+
+        alarm_log.conf_source = sqlite3_column_bytes(res, 16) > 0 ? strdupz((char *)sqlite3_column_text(res, 16)) : strdupz("");

        char *edit_command = sqlite3_column_bytes(res, 16) > 0 ?
                                 health_edit_command_from_source((char *)sqlite3_column_text(res, 16)) :
@@ -420,11 +385,11 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)
        if (first_sequence_id == 0)
            first_sequence_id = (uint64_t) sqlite3_column_int64(res, 0);

-        if (log_first_sequence_id == 0)
-            log_first_sequence_id = (uint64_t) sqlite3_column_int64(res, 0);
+        if (wc->alerts_log_first_sequence_id == 0)
+            wc->alerts_log_first_sequence_id = (uint64_t) sqlite3_column_int64(res, 0);

        last_sequence_id = (uint64_t) sqlite3_column_int64(res, 0);
-        log_last_sequence_id = (uint64_t) sqlite3_column_int64(res, 0);
+        wc->alerts_log_last_sequence_id = (uint64_t) sqlite3_column_int64(res, 0);

        destroy_alarm_log_entry(&alarm_log);
        freez(edit_command);
@@ -443,15 +408,15 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)
        rrdhost_flag_set(wc->host, RRDHOST_FLAG_ACLK_STREAM_ALERTS);
    }
    else {
-        if (log_first_sequence_id)
-            log_access(
+        if (wc->alerts_log_first_sequence_id)
+            netdata_log_access(
                "ACLK RES [%s (%s)]: ALERTS SENT from %" PRIu64 " to %" PRIu64 "",
                wc->node_id,
                wc->host ? rrdhost_hostname(wc->host) : "N/A",
-                log_first_sequence_id,
-                log_last_sequence_id);
-        log_first_sequence_id = 0;
-        log_last_sequence_id = 0;
+                wc->alerts_log_first_sequence_id,
+                wc->alerts_log_last_sequence_id);
+        wc->alerts_log_first_sequence_id = 0;
+        wc->alerts_log_last_sequence_id = 0;
    }

    rc = sqlite3_finalize(res);
@@ -486,19 +451,51 @@ void sql_queue_existing_alerts_to_aclk(RRDHOST *host)
    char uuid_str[UUID_STR_LEN];
    uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
    BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite);
+    sqlite3_stmt *res = NULL;
+    int rc;

-    buffer_sprintf(sql,"delete from aclk_alert_%s; " \
-        "insert into aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \
-        "select unique_id alert_unique_id, unixepoch(), unique_id alert_unique_id from health_log_%s " \
-        "where new_status <> 0 and new_status <> -2 and config_hash_id is not null and updated_by_id = 0 " \
-        "order by unique_id asc on conflict (alert_unique_id) do nothing;", uuid_str, uuid_str, uuid_str);
+    rw_spinlock_write_lock(&host->health_log.spinlock);

-    netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+    buffer_sprintf(sql, "delete from aclk_alert_%s; ", uuid_str);
+    if (unlikely(db_execute(db_meta, buffer_tostring(sql)))) {
+        rw_spinlock_write_unlock(&host->health_log.spinlock);
+        buffer_free(sql);
+        return;
+    }

-    if (unlikely(db_execute(db_meta, buffer_tostring(sql))))
-        error_report("Failed to queue existing ACLK alert events for host %s", rrdhost_hostname(host));
+    buffer_flush(sql);
+    buffer_sprintf(sql, "insert into aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \
+        "select hld.unique_id alert_unique_id, unixepoch(), hld.unique_id alert_unique_id from health_log_detail hld, health_log hl " \
+        "where hld.new_status <> 0 and hld.new_status <> -2 and hl.health_log_id = hld.health_log_id and hl.config_hash_id is not null " \
+        "and hld.updated_by_id = 0 and hl.host_id = @host_id order by hld.unique_id asc on conflict (alert_unique_id) do nothing;", uuid_str);
+
+    rc = sqlite3_prepare_v2(db_meta, buffer_tostring(sql), -1, &res, 0);
+    if (rc != SQLITE_OK) {
+        error_report("Failed to prepare statement when trying to queue existing alerts.");
+        rw_spinlock_write_unlock(&host->health_log.spinlock);
+        buffer_free(sql);
+        return;
+    }
+
+    rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind host_id when trying to queue existing alerts.");
+        sqlite3_finalize(res);
+        rw_spinlock_write_unlock(&host->health_log.spinlock);
+        buffer_free(sql);
+        return;
+    }
+
+    rc = execute_insert(res);
+    if (unlikely(rc != SQLITE_DONE)) {
+        error_report("Failed to queue existing alerts, rc = %d", rc);
+    }
+
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to finalize statement to queue existing alerts, rc = %d", rc);

-    netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+    rw_spinlock_write_unlock(&host->health_log.spinlock);
    buffer_free(sql);

    rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS);
@@ -514,7 +511,7 @@ void aclk_send_alarm_configuration(char *config_hash)
    if (unlikely(!wc))
        return;

-    log_access("ACLK REQ [%s (%s)]: Request to send alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash);
+    netdata_log_access("ACLK REQ [%s (%s)]: Request to send alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash);

    aclk_push_alert_config(wc->node_id, config_hash);
}
@@ -522,8 +519,7 @@ void aclk_send_alarm_configuration(char *config_hash)
#define SQL_SELECT_ALERT_CONFIG "SELECT alarm, template, on_key, class, type, component, os, hosts, plugin," \
    "module, charts, families, lookup, every, units, green, red, calc, warn, crit, to_key, exec, delay, repeat, info," \
    "options, host_labels, p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after," \
-    "p_db_lookup_before, p_update_every FROM alert_hash WHERE hash_id = @hash_id;"
-
+    "p_db_lookup_before, p_update_every, chart_labels FROM alert_hash WHERE hash_id = @hash_id;"
int aclk_push_alert_config_event(char *node_id __maybe_unused, char *config_hash __maybe_unused)
{
    int rc = 0;
@@ -624,18 +620,20 @@ int aclk_push_alert_config_event(char *node_id __maybe_unused, char *config_hash

        alarm_config.p_update_every = sqlite3_column_int(res, 32);

+        alarm_config.chart_labels = sqlite3_column_bytes(res, 33) > 0 ? strdupz((char *)sqlite3_column_text(res, 33)) : NULL;
+
        p_alarm_config.cfg_hash = strdupz((char *) config_hash);
        p_alarm_config.cfg = alarm_config;
    }

    if (likely(p_alarm_config.cfg_hash)) {
-        log_access("ACLK RES [%s (%s)]: Sent alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash);
+        netdata_log_access("ACLK RES [%s (%s)]: Sent alert config %s.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash);
        aclk_send_provide_alarm_cfg(&p_alarm_config);
        freez(p_alarm_config.cfg_hash);
        destroy_aclk_alarm_configuration(&alarm_config);
    }
    else
-        log_access("ACLK STA [%s (%s)]: Alert config for %s not found.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash);
+        netdata_log_access("ACLK STA [%s (%s)]: Alert config for %s not found.", wc->node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A", config_hash);

bind_fail:
    rc = sqlite3_finalize(res);
@@ -670,28 +668,26 @@ void aclk_start_alert_streaming(char *node_id, bool resets)
        return;

    if (unlikely(!host->health.health_enabled)) {
-        log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id);
+        netdata_log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id);
        return;
    }

    if (resets) {
-        log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED (RESET REQUESTED)", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A");
+        netdata_log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED (RESET REQUESTED)", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A");
        sql_queue_existing_alerts_to_aclk(host);
    }
    else
-        log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A");
+        netdata_log_access("ACLK REQ [%s (%s)]: STREAM ALERTS ENABLED", node_id, wc->host ? rrdhost_hostname(wc->host) : "N/A");

    wc->alert_updates = 1;
    wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
}

#define SQL_QUEUE_REMOVE_ALERTS "INSERT INTO aclk_alert_%s (alert_unique_id, date_created, filtered_alert_unique_id) " \
-    "SELECT unique_id alert_unique_id, UNIXEPOCH(), unique_id alert_unique_id FROM health_log_%s " \
-    "WHERE new_status = -2 AND updated_by_id = 0 AND unique_id NOT IN " \
-    "(SELECT alert_unique_id FROM aclk_alert_%s) " \
-    "AND config_hash_id NOT IN (select hash_id from alert_hash where warn is null and crit is null) " \
-    "ORDER BY unique_id ASC " \
-    "ON CONFLICT (alert_unique_id) DO NOTHING;"
-
+    "SELECT hld.unique_id alert_unique_id, UNIXEPOCH(), hld.unique_id alert_unique_id FROM health_log hl, health_log_detail hld " \
+    "WHERE hl.host_id = @host_id AND hl.health_log_id = hld.health_log_id AND hld.new_status = -2 AND hld.updated_by_id = 0 " \
+    "AND hld.unique_id NOT IN (SELECT alert_unique_id FROM aclk_alert_%s) " \
+    "AND hl.config_hash_id NOT IN (select hash_id from alert_hash where warn is null and crit is null) " \
+    "ORDER BY hld.unique_id ASC ON CONFLICT (alert_unique_id) DO NOTHING;"
void sql_process_queue_removed_alerts_to_aclk(char *node_id)
{
    struct aclk_sync_host_config *wc;
@@ -702,15 +698,35 @@ void sql_process_queue_removed_alerts_to_aclk(char *node_id)
        return;

    char sql[ACLK_SYNC_QUERY_SIZE * 2];
+    sqlite3_stmt *res = NULL;

-    snprintfz(sql,ACLK_SYNC_QUERY_SIZE * 2 - 1, SQL_QUEUE_REMOVE_ALERTS, wc->uuid_str, wc->uuid_str, wc->uuid_str);
+    snprintfz(sql, ACLK_SYNC_QUERY_SIZE * 2 - 1, SQL_QUEUE_REMOVE_ALERTS, wc->uuid_str, wc->uuid_str);

-    if (unlikely(db_execute(db_meta, sql))) {
-        log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS FAILED", wc->node_id, rrdhost_hostname(wc->host));
-        error_report("Failed to queue ACLK alert removed entries for host %s", rrdhost_hostname(wc->host));
+    int rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0);
+    if (rc != SQLITE_OK) {
+        error_report("Failed to prepare statement when trying to queue removed alerts.");
+        return;
    }
-    else
-        log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, rrdhost_hostname(wc->host));
+
+    rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind host_id when trying to queue removed alerts.");
+        sqlite3_finalize(res);
+        return;
+    }
+
+    rc = execute_insert(res);
+    if (unlikely(rc != SQLITE_DONE)) {
+        sqlite3_finalize(res);
+        error_report("Failed to queue removed alerts, rc = %d", rc);
+        return;
+    }
+
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to finalize statement to queue removed alerts, rc = %d", rc);
+
+    netdata_log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, rrdhost_hostname(wc->host));

    rrdhost_flag_set(wc->host, RRDHOST_FLAG_ACLK_STREAM_ALERTS);
    wc->alert_queue_removed = 0;
@@ -738,18 +754,18 @@ void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id __maybe_unus

    RRDHOST *host = find_host_by_node_id(node_id);
    if (unlikely(!host)) {
-        log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id);
+        netdata_log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id);
        return;
    }

    struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
    if (unlikely(!wc)) {
-        log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id);
+        netdata_log_access("ACLK STA [%s (N/A)]: ACLK node id does not exist", node_id);
        return;
    }

-    log_access(
+    netdata_log_access(
        "IN [%s (%s)]: Request to send alerts snapshot, snapshot_uuid %s",
        node_id,
        wc->host ? rrdhost_hostname(wc->host) : "N/A",
@@ -842,7 +858,7 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)
    RRDHOST *host = find_host_by_node_id(node_id);

    if (unlikely(!host)) {
-        log_access("AC [%s (N/A)]: Node id not found", node_id);
+        netdata_log_access("AC [%s (N/A)]: Node id not found", node_id);
        freez(node_id);
        return;
    }
@@ -852,7 +868,7 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)

    // we perhaps we don't need this for snapshots
    if (unlikely(!wc->alert_updates)) {
-        log_access(
+        netdata_log_access(
            "ACLK STA [%s (%s)]: Ignoring alert snapshot event, updates have been turned off for this node.",
            wc->node_id,
            wc->host ? rrdhost_hostname(wc->host) : "N/A");
@@ -866,13 +882,13 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)
    if (unlikely(!claim_id))
        return;

-    log_access("ACLK REQ [%s (%s)]: Sending alerts snapshot, snapshot_uuid %s", wc->node_id, rrdhost_hostname(wc->host), wc->alerts_snapshot_uuid);
+    netdata_log_access("ACLK REQ [%s (%s)]: Sending alerts snapshot, snapshot_uuid %s", wc->node_id, rrdhost_hostname(wc->host), wc->alerts_snapshot_uuid);

    uint32_t cnt = 0;
    char uuid_str[UUID_STR_LEN];
    uuid_unparse_lower_fix(&host->host_uuid, uuid_str);

-    netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+    rw_spinlock_read_lock(&host->health_log.spinlock);

    ALARM_ENTRY *ae = host->health_log.alarms;
@@ -886,7 +902,7 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)
        if (have_recent_alarm(host, ae->alarm_id, ae->unique_id))
            continue;

-        if (is_event_from_alert_variable_config(ae->unique_id, uuid_str))
+        if (is_event_from_alert_variable_config(ae->unique_id, &host->host_uuid))
            continue;

        cnt++;
@@ -918,7 +934,7 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)
            if (have_recent_alarm(host, ae->alarm_id, ae->unique_id))
                continue;

-            if (is_event_from_alert_variable_config(ae->unique_id, uuid_str))
+            if (is_event_from_alert_variable_config(ae->unique_id, &host->host_uuid))
                continue;

            cnt++;
@@ -957,7 +973,7 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)
            aclk_send_alarm_snapshot(snapshot_proto);
    }

-    netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+    rw_spinlock_read_unlock(&host->health_log.spinlock);

    wc->alerts_snapshot_uuid = NULL;

    freez(claim_id);
@@ -984,7 +1000,6 @@ void sql_aclk_alert_clean_dead_entries(RRDHOST *host)
#define SQL_GET_MIN_MAX_ALERT_SEQ "SELECT MIN(sequence_id), MAX(sequence_id), " \
    "(SELECT MAX(sequence_id) FROM aclk_alert_%s WHERE date_submitted IS NOT NULL) " \
    "FROM aclk_alert_%s WHERE date_submitted IS NULL;"
-
int get_proto_alert_status(RRDHOST *host, struct proto_alert_status *proto_alert_status)
{
    int rc;
@@ -1032,11 +1047,11 @@ void aclk_send_alarm_checkpoint(char *node_id, char *claim_id __maybe_unused)

    wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
    if (unlikely(!wc)) {
-        log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", node_id);
+        netdata_log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", node_id);
        return;
    }

-    log_access("ACLK REQ [%s (%s)]: ALERTS CHECKPOINT REQUEST RECEIVED", node_id, rrdhost_hostname(host));
+    netdata_log_access("ACLK REQ [%s (%s)]: ALERTS CHECKPOINT REQUEST RECEIVED", node_id, rrdhost_hostname(host));
    wc->alert_checkpoint_req = SEND_CHECKPOINT_AFTER_HEALTH_LOOPS;
}
@@ -1065,14 +1080,14 @@ void aclk_push_alarm_checkpoint(RRDHOST *host __maybe_unused)
#ifdef ENABLE_ACLK
    struct aclk_sync_host_config *wc = host->aclk_sync_host_config;
    if (unlikely(!wc)) {
-        log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", rrdhost_hostname(host));
+        netdata_log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT REQUEST RECEIVED FOR INVALID NODE", rrdhost_hostname(host));
        return;
    }

    if (rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_ALERTS)) {
        //postpone checkpoint send
        wc->alert_checkpoint_req+=3;
-        log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT POSTPONED", rrdhost_hostname(host));
+        netdata_log_access("ACLK REQ [%s (N/A)]: ALERTS CHECKPOINT POSTPONED", rrdhost_hostname(host));
        return;
    }
@@ -1135,9 +1150,9 @@ void aclk_push_alarm_checkpoint(RRDHOST *host __maybe_unused)
        aclk_send_provide_alarm_checkpoint(&alarm_checkpoint);
        freez(claim_id);
-        log_access("ACLK RES [%s (%s)]: ALERTS CHECKPOINT SENT", wc->node_id, rrdhost_hostname(host));
+        netdata_log_access("ACLK RES [%s (%s)]: ALERTS CHECKPOINT SENT", wc->node_id, rrdhost_hostname(host));
    } else {
-        log_access("ACLK RES [%s (%s)]: FAILED TO CREATE ALERTS CHECKPOINT HASH", wc->node_id, rrdhost_hostname(host));
+        netdata_log_access("ACLK RES [%s (%s)]: FAILED TO CREATE ALERTS CHECKPOINT HASH", wc->node_id, rrdhost_hostname(host));
    }
    wc->alert_checkpoint_req = 0;
    buffer_free(alarms_to_hash);
diff --git a/database/sqlite/sqlite_aclk_node.c b/database/sqlite/sqlite_aclk_node.c
index 3817296da..82927854a 100644
--- a/database/sqlite/sqlite_aclk_node.c
+++ b/database/sqlite/sqlite_aclk_node.c
@@ -50,7 +50,7 @@ static void build_node_collectors(char *node_id __maybe_unused)
    dictionary_destroy(dict);
    freez(upd_node_collectors.claim_id);

-    log_access("ACLK RES [%s (%s)]: NODE COLLECTORS SENT", node_id, rrdhost_hostname(host));
+    netdata_log_access("ACLK RES [%s (%s)]: NODE COLLECTORS SENT", node_id, rrdhost_hostname(host));

    freez(node_id);
}
@@ -124,7 +124,7 @@ static void build_node_info(char *node_id __maybe_unused)
    node_info.data.host_labels_ptr = host->rrdlabels;

    aclk_update_node_info(&node_info);
-    log_access("ACLK RES [%s (%s)]: NODE INFO SENT for guid [%s] (%s)", wc->node_id, rrdhost_hostname(wc->host), host->machine_guid, wc->host == localhost ? "parent" : "child");
+    netdata_log_access("ACLK RES [%s (%s)]: NODE INFO SENT for guid [%s] (%s)", wc->node_id, rrdhost_hostname(wc->host), host->machine_guid, wc->host == localhost ? "parent" : "child");

    rrd_unlock();
    freez(node_info.claim_id);
@@ -172,7 +172,7 @@ void aclk_check_node_info_and_collectors(void)
    dfe_done(host);

    if(pending)
-        info("ACLK: %zu nodes are pending for contexts to load, skipped sending node info for them", pending);
+        netdata_log_info("ACLK: %zu nodes are pending for contexts to load, skipped sending node info for them", pending);
}
#endif
diff --git a/database/sqlite/sqlite_context.c b/database/sqlite/sqlite_context.c
index b72726dc2..f29fe51e3 100644
--- a/database/sqlite/sqlite_context.c
+++ b/database/sqlite/sqlite_context.c
@@ -43,7 +43,7 @@ int sql_init_context_database(int memory)
        return 1;
    }

-    info("SQLite database %s initialization", sqlite_database);
+    netdata_log_info("SQLite database %s initialization", sqlite_database);

    char buf[1024 + 1] = "";
    const char *list[2] = { buf, NULL };
@@ -112,7 +112,7 @@ void sql_close_context_database(void)
    if (unlikely(!db_context_meta))
        return;

-    info("Closing context SQLite database");
+    netdata_log_info("Closing context SQLite database");

    rc = sqlite3_close_v2(db_context_meta);
    if (unlikely(rc != SQLITE_OK))
@@ -431,7 +431,7 @@ int ctx_delete_context(uuid_t *host_uuid, VERSIONED_CONTEXT_DATA *context_data)
    else {
        char host_uuid_str[UUID_STR_LEN];
        uuid_unparse_lower(*host_uuid, host_uuid_str);
-        info("%s: Deleted context %s under host %s", __FUNCTION__, context_data->id, host_uuid_str);
+        netdata_log_info("%s: Deleted context %s under host %s", __FUNCTION__, context_data->id, host_uuid_str);
    }
#endif
@@ -463,7 +463,7 @@ int sql_context_cache_stats(int op)
static void dict_ctx_get_context_list_cb(VERSIONED_CONTEXT_DATA *context_data, void *data)
{
    (void)data;
-    info("   Context id = %s "
+    netdata_log_info("   Context id = %s "
        "version = %"PRIu64" "
        "title = %s "
        "chart_type = %s "
@@ -512,48 +512,48 @@ int ctx_unittest(void)
    context_data.version = now_realtime_usec();

    if (likely(!ctx_store_context(&host_uuid, &context_data)))
-        info("Entry %s inserted", context_data.id);
+        netdata_log_info("Entry %s inserted", context_data.id);
    else
-        info("Entry %s not inserted", context_data.id);
+        netdata_log_info("Entry %s not inserted", context_data.id);

    if (likely(!ctx_store_context(&host_uuid, &context_data)))
-        info("Entry %s inserted", context_data.id);
+        netdata_log_info("Entry %s inserted", context_data.id);
    else
-        info("Entry %s not inserted", context_data.id);
+        netdata_log_info("Entry %s not inserted", context_data.id);

    // This will change end time
    context_data.first_time_s = 1657781000;
    context_data.last_time_s = 1657782001;
    if (likely(!ctx_update_context(&host_uuid, &context_data)))
-        info("Entry %s updated", context_data.id);
+        netdata_log_info("Entry %s updated", context_data.id);
    else
-        info("Entry %s not updated", context_data.id);
-    info("List context start after insert");
+        netdata_log_info("Entry %s not updated", context_data.id);
+    netdata_log_info("List context start after insert");
    ctx_get_context_list(&host_uuid, dict_ctx_get_context_list_cb, NULL);
-    info("List context end after insert");
+    netdata_log_info("List context end after insert");

    // This will change start time
    context_data.first_time_s = 1657782000;
    context_data.last_time_s = 1657782001;
    if (likely(!ctx_update_context(&host_uuid, &context_data)))
-        info("Entry %s updated", context_data.id);
+        netdata_log_info("Entry %s updated", context_data.id);
    else
-        info("Entry %s not updated", context_data.id);
+        netdata_log_info("Entry %s not updated", context_data.id);

    // This will list one entry
-    info("List context start after insert");
+ netdata_log_info("List context start after insert"); ctx_get_context_list(&host_uuid, dict_ctx_get_context_list_cb, NULL); - info("List context end after insert"); + netdata_log_info("List context end after insert"); - info("List context start after insert"); + netdata_log_info("List context start after insert"); ctx_get_context_list(&host_uuid, dict_ctx_get_context_list_cb, NULL); - info("List context end after insert"); + netdata_log_info("List context end after insert"); // This will delete the entry if (likely(!ctx_delete_context(&host_uuid, &context_data))) - info("Entry %s deleted", context_data.id); + netdata_log_info("Entry %s deleted", context_data.id); else - info("Entry %s not deleted", context_data.id); + netdata_log_info("Entry %s not deleted", context_data.id); freez((void *)context_data.id); freez((void *)context_data.title); @@ -562,9 +562,9 @@ int ctx_unittest(void) freez((void *)context_data.units); // The list should be empty - info("List context start after delete"); + netdata_log_info("List context start after delete"); ctx_get_context_list(&host_uuid, dict_ctx_get_context_list_cb, NULL); - info("List context end after delete"); + netdata_log_info("List context end after delete"); sql_close_context_database(); diff --git a/database/sqlite/sqlite_db_migration.c b/database/sqlite/sqlite_db_migration.c index 9c7235fdb..1a6233fce 100644 --- a/database/sqlite/sqlite_db_migration.c +++ b/database/sqlite/sqlite_db_migration.c @@ -11,7 +11,6 @@ static int return_int_cb(void *data, int argc, char **argv, char **column) return 0; } - int table_exists_in_database(const char *table) { char *err_msg = NULL; @@ -23,7 +22,7 @@ int table_exists_in_database(const char *table) int rc = sqlite3_exec_monitored(db_meta, sql, return_int_cb, (void *) &exists, &err_msg); if (rc != SQLITE_OK) { - info("Error checking table existence; %s", err_msg); + netdata_log_info("Error checking table existence; %s", err_msg); sqlite3_free(err_msg); } @@ -41,7 +40,7 @@ static int column_exists_in_table(const char *table, const char *column) int rc = sqlite3_exec_monitored(db_meta, sql, return_int_cb, (void *) &exists, &err_msg); if (rc != SQLITE_OK) { - info("Error checking column existence; %s", err_msg); + netdata_log_info("Error checking column existence; %s", err_msg); sqlite3_free(err_msg); } @@ -79,11 +78,15 @@ const char *database_migrate_v5_v6[] = { NULL }; +const char *database_migrate_v9_v10[] = { + "ALTER TABLE alert_hash ADD chart_labels TEXT;", + NULL +}; static int do_migration_v1_v2(sqlite3 *database, const char *name) { UNUSED(name); - info("Running \"%s\" database migration", name); + netdata_log_info("Running \"%s\" database migration", name); if (table_exists_in_database("host") && !column_exists_in_table("host", "hops")) return init_database_batch(database, DB_CHECK_NONE, 0, &database_migrate_v1_v2[0]); @@ -93,7 +96,7 @@ static int do_migration_v1_v2(sqlite3 *database, const char *name) static int do_migration_v2_v3(sqlite3 *database, const char *name) { UNUSED(name); - info("Running \"%s\" database migration", name); + netdata_log_info("Running \"%s\" database migration", name); if (table_exists_in_database("host") && !column_exists_in_table("host", "memory_mode")) return init_database_batch(database, DB_CHECK_NONE, 0, &database_migrate_v2_v3[0]); @@ -103,7 +106,7 @@ static int do_migration_v2_v3(sqlite3 *database, const char *name) static int do_migration_v3_v4(sqlite3 *database, const char *name) { UNUSED(name); - info("Running database migration %s", name); + 
netdata_log_info("Running database migration %s", name); char sql[256]; @@ -135,7 +138,7 @@ static int do_migration_v3_v4(sqlite3 *database, const char *name) static int do_migration_v4_v5(sqlite3 *database, const char *name) { UNUSED(name); - info("Running \"%s\" database migration", name); + netdata_log_info("Running \"%s\" database migration", name); return init_database_batch(database, DB_CHECK_NONE, 0, &database_migrate_v4_v5[0]); } @@ -143,7 +146,7 @@ static int do_migration_v4_v5(sqlite3 *database, const char *name) static int do_migration_v5_v6(sqlite3 *database, const char *name) { UNUSED(name); - info("Running \"%s\" database migration", name); + netdata_log_info("Running \"%s\" database migration", name); return init_database_batch(database, DB_CHECK_NONE, 0, &database_migrate_v5_v6[0]); } @@ -151,7 +154,7 @@ static int do_migration_v5_v6(sqlite3 *database, const char *name) static int do_migration_v6_v7(sqlite3 *database, const char *name) { UNUSED(name); - info("Running \"%s\" database migration", name); + netdata_log_info("Running \"%s\" database migration", name); char sql[256]; @@ -185,7 +188,7 @@ static int do_migration_v6_v7(sqlite3 *database, const char *name) static int do_migration_v7_v8(sqlite3 *database, const char *name) { UNUSED(name); - info("Running database migration %s", name); + netdata_log_info("Running database migration %s", name); char sql[256]; @@ -214,12 +217,95 @@ static int do_migration_v7_v8(sqlite3 *database, const char *name) return 0; } +static int do_migration_v8_v9(sqlite3 *database, const char *name) +{ + netdata_log_info("Running database migration %s", name); + + char sql[2048]; + int rc; + sqlite3_stmt *res = NULL; + + //create the health_log table and it's index + snprintfz(sql, 2047, "CREATE TABLE IF NOT EXISTS health_log (health_log_id INTEGER PRIMARY KEY, host_id blob, alarm_id int, " \ + "config_hash_id blob, name text, chart text, family text, recipient text, units text, exec text, " \ + "chart_context text, last_transition_id blob, UNIQUE (host_id, alarm_id)) ;"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + //TODO indexes + snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + snprintfz(sql, 2047, "CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, " \ + "updated_by_id int, updates_id int, when_key int, duration int, non_clear_duration int, " \ + "flags int, exec_run_timestamp int, delay_up_to_timestamp int, " \ + "info text, exec_code int, new_status real, old_status real, delay int, " \ + "new_value double, old_value double, last_repeat int, transition_id blob, global_id int, host_id blob);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_1 ON health_log_detail (unique_id);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS health_log_d_ind_4 ON health_log_detail (health_log_id);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + snprintfz(sql, 2047, "ALTER TABLE alert_hash ADD source text;"); + 
sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + snprintfz(sql, 2047, "CREATE INDEX IF NOT EXISTS alert_hash_index ON alert_hash (hash_id);"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + snprintfz(sql, 2047, "SELECT name FROM sqlite_schema WHERE type ='table' AND name LIKE 'health_log_%%' AND name <> 'health_log_detail';"); + rc = sqlite3_prepare_v2(database, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement to alter health_log tables"); + return 1; + } + + DICTIONARY *dict_tables = dictionary_create(DICT_OPTION_NONE); + + while (sqlite3_step_monitored(res) == SQLITE_ROW) { + char *table = strdupz((char *) sqlite3_column_text(res, 0)); + if (health_migrate_old_health_log_table(table)) { + dictionary_set(dict_tables, table, NULL, 0); + } + freez(table); + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when copying health_log tables, rc = %d", rc); + + char *table = NULL; + dfe_start_read(dict_tables, table) { + sql_drop_table(table_dfe.name); + } + dfe_done(table); + dictionary_destroy(dict_tables); + + snprintfz(sql, 2047, "ALTER TABLE health_log_detail DROP COLUMN host_id;"); + sqlite3_exec_monitored(database, sql, 0, 0, NULL); + + return 0; +} + +static int do_migration_v9_v10(sqlite3 *database, const char *name) +{ + UNUSED(name); + netdata_log_info("Running \"%s\" database migration", name); + + if (table_exists_in_database("alert_hash") && !column_exists_in_table("alert_hash", "chart_labels")) + return init_database_batch(database, DB_CHECK_NONE, 0, &database_migrate_v9_v10[0]); + return 0; +} static int do_migration_noop(sqlite3 *database, const char *name) { UNUSED(database); UNUSED(name); - info("Running database migration %s", name); + netdata_log_info("Running database migration %s", name); return 0; } @@ -236,16 +322,16 @@ static int migrate_database(sqlite3 *database, int target_version, char *db_name int rc = sqlite3_exec_monitored(database, "PRAGMA user_version;", return_int_cb, (void *) &user_version, &err_msg); if (rc != SQLITE_OK) { - info("Error checking the %s database version; %s", db_name, err_msg); + netdata_log_info("Error checking the %s database version; %s", db_name, err_msg); sqlite3_free(err_msg); } if (likely(user_version == target_version)) { - info("%s database version is %d (no migration needed)", db_name, target_version); + netdata_log_info("%s database version is %d (no migration needed)", db_name, target_version); return target_version; } - info("Database version is %d, current version is %d. Running migration for %s ...", user_version, target_version, db_name); + netdata_log_info("Database version is %d, current version is %d. 
Running migration for %s ...", user_version, target_version, db_name); for (int i = user_version; i < target_version && migration_list[i].func; i++) { rc = (migration_list[i].func)(database, migration_list[i].name); if (unlikely(rc)) { @@ -266,6 +352,8 @@ DATABASE_FUNC_MIGRATION_LIST migration_action[] = { {.name = "v5 to v6", .func = do_migration_v5_v6}, {.name = "v6 to v7", .func = do_migration_v6_v7}, {.name = "v7 to v8", .func = do_migration_v7_v8}, + {.name = "v8 to v9", .func = do_migration_v8_v9}, + {.name = "v9 to v10", .func = do_migration_v9_v10}, // the terminator of this array {.name = NULL, .func = NULL} }; diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index 555db1011..4200c1590 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -3,7 +3,7 @@ #include "sqlite_functions.h" #include "sqlite_db_migration.h" -#define DB_METADATA_VERSION 8 +#define DB_METADATA_VERSION 10 const char *database_config[] = { "CREATE TABLE IF NOT EXISTS host(host_id BLOB PRIMARY KEY, hostname TEXT NOT NULL, " @@ -32,7 +32,9 @@ const char *database_config[] = { "every text, units text, calc text, families text, plugin text, module text, charts text, green text, " "red text, warn text, crit text, exec text, to_key text, info text, delay text, options text, " "repeat text, host_labels text, p_db_lookup_dimensions text, p_db_lookup_method text, p_db_lookup_options int, " - "p_db_lookup_after int, p_db_lookup_before int, p_update_every int);", + "p_db_lookup_after int, p_db_lookup_before int, p_update_every int, source text, chart_labels text);", + + "CREATE INDEX IF NOT EXISTS alert_hash_index ON alert_hash (hash_id);", "CREATE TABLE IF NOT EXISTS host_info(host_id blob, system_key text NOT NULL, system_value text NOT NULL, " "date_created INT, PRIMARY KEY(host_id, system_key));", @@ -43,6 +45,23 @@ const char *database_config[] = { "CREATE TRIGGER IF NOT EXISTS ins_host AFTER INSERT ON host BEGIN INSERT INTO node_instance (host_id, date_created)" " SELECT new.host_id, unixepoch() WHERE new.host_id NOT IN (SELECT host_id FROM node_instance); END;", + "CREATE TABLE IF NOT EXISTS health_log (health_log_id INTEGER PRIMARY KEY, host_id blob, alarm_id int, " + "config_hash_id blob, name text, chart text, family text, recipient text, units text, exec text, " + "chart_context text, last_transition_id blob, UNIQUE (host_id, alarm_id)) ;", + + "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id);", + + "CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, " + "updated_by_id int, updates_id int, when_key int, duration int, non_clear_duration int, " + "flags int, exec_run_timestamp int, delay_up_to_timestamp int, " + "info text, exec_code int, new_status real, old_status real, delay int, " + "new_value double, old_value double, last_repeat int, transition_id blob, global_id int);", + + "CREATE INDEX IF NOT EXISTS health_log_d_ind_1 ON health_log_detail (unique_id);", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id);", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id);", + "CREATE INDEX IF NOT EXISTS health_log_d_ind_4 ON health_log_detail (health_log_id);", + NULL }; @@ -128,9 +147,9 @@ static void add_stmt_to_list(sqlite3_stmt *res) if (unlikely(!res)) { if (idx) - info("Finilizing %d statements", idx); + netdata_log_info("Finilizing %d statements", idx); else - info("No statements pending to 
finalize"); + netdata_log_info("No statements pending to finalize"); while (idx > 0) { int rc; rc = sqlite3_finalize(statements[--idx]); @@ -148,7 +167,7 @@ static void release_statement(void *statement) { int rc; #ifdef NETDATA_DEV_MODE - info("Thread %d: Cleaning prepared statement on %p", gettid(), statement); + netdata_log_info("Thread %d: Cleaning prepared statement on %p", gettid(), statement); #endif if (unlikely(rc = sqlite3_finalize((sqlite3_stmt *) statement) != SQLITE_OK)) error_report("Failed to finalize statement, rc = %d", rc); @@ -175,7 +194,7 @@ int prepare_statement(sqlite3 *database, const char *query, sqlite3_stmt **state if (likely(key)) { ret = pthread_setspecific(*key, *statement); #ifdef NETDATA_DEV_MODE - info("Thread %d: Using key %u on statement %p", gettid(), keys_used, *statement); + netdata_log_info("Thread %d: Using key %u on statement %p", gettid(), keys_used, *statement); #endif } if (ret) @@ -189,7 +208,7 @@ static int check_table_integrity_cb(void *data, int argc, char **argv, char **co int *status = data; UNUSED(argc); UNUSED(column); - info("---> %s", argv[0]); + netdata_log_info("---> %s", argv[0]); *status = (strcmp(argv[0], "ok") != 0); return 0; } @@ -202,11 +221,11 @@ static int check_table_integrity(char *table) char wstr[255]; if (table) { - info("Checking table %s", table); + netdata_log_info("Checking table %s", table); snprintfz(wstr, 254, "PRAGMA integrity_check(%s);", table); } else { - info("Checking entire database"); + netdata_log_info("Checking entire database"); strcpy(wstr,"PRAGMA integrity_check;"); } @@ -240,9 +259,9 @@ static void rebuild_chart() { int rc; char *err_msg = NULL; - info("Rebuilding chart table"); + netdata_log_info("Rebuilding chart table"); for (int i = 0; rebuild_chart_commands[i]; i++) { - info("Executing %s", rebuild_chart_commands[i]); + netdata_log_info("Executing %s", rebuild_chart_commands[i]); rc = sqlite3_exec_monitored(db_meta, rebuild_chart_commands[i], 0, 0, &err_msg); if (rc != SQLITE_OK) { error_report("SQLite error during database setup, rc = %d (%s)", rc, err_msg); @@ -272,9 +291,9 @@ void rebuild_dimension() int rc; char *err_msg = NULL; - info("Rebuilding dimension table"); + netdata_log_info("Rebuilding dimension table"); for (int i = 0; rebuild_dimension_commands[i]; i++) { - info("Executing %s", rebuild_dimension_commands[i]); + netdata_log_info("Executing %s", rebuild_dimension_commands[i]); rc = sqlite3_exec_monitored(db_meta, rebuild_dimension_commands[i], 0, 0, &err_msg); if (rc != SQLITE_OK) { error_report("SQLite error during database setup, rc = %d (%s)", rc, err_msg); @@ -286,11 +305,11 @@ void rebuild_dimension() static int attempt_database_fix() { - info("Closing database and attempting to fix it"); + netdata_log_info("Closing database and attempting to fix it"); int rc = sqlite3_close(db_meta); if (rc != SQLITE_OK) error_report("Failed to close database, rc = %d", rc); - info("Attempting to fix database"); + netdata_log_info("Attempting to fix database"); db_meta = NULL; return sql_init_database(DB_CHECK_FIX_DB | DB_CHECK_CONT, 0); } @@ -300,7 +319,7 @@ int init_database_batch(sqlite3 *database, int rebuild, int init_type, const cha int rc; char *err_msg = NULL; for (int i = 0; batch[i]; i++) { - debug(D_METADATALOG, "Executing %s", batch[i]); + netdata_log_debug(D_METADATALOG, "Executing %s", batch[i]); rc = sqlite3_exec_monitored(database, batch[i], 0, 0, &err_msg); if (rc != SQLITE_OK) { error_report("SQLite error during database %s, rc = %d (%s)", init_type ? 
"cleanup" : "setup", rc, err_msg); @@ -336,6 +355,30 @@ static void sqlite_uuid_parse(sqlite3_context *context, int argc, sqlite3_value sqlite3_result_blob(context, &uuid, sizeof(uuid_t), SQLITE_TRANSIENT); } +void sqlite_now_usec(sqlite3_context *context, int argc, sqlite3_value **argv) +{ + if (argc != 1 ){ + sqlite3_result_null(context); + return ; + } + + if (sqlite3_value_int(argv[0]) != 0) { + struct timespec req = {.tv_sec = 0, .tv_nsec = 1}; + nanosleep(&req, NULL); + } + + sqlite3_result_int64(context, (sqlite_int64) now_realtime_usec()); +} + +void sqlite_uuid_random(sqlite3_context *context, int argc, sqlite3_value **argv) +{ + (void)argc; + (void)argv; + + uuid_t uuid; + uuid_generate_random(uuid); + sqlite3_result_blob(context, &uuid, sizeof(uuid_t), SQLITE_TRANSIENT); +} /* * Initialize the SQLite database @@ -363,7 +406,7 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) if (rebuild & (DB_CHECK_INTEGRITY | DB_CHECK_FIX_DB)) { int errors_detected = 0; if (!(rebuild & DB_CHECK_CONT)) - info("Running database check on %s", sqlite_database); + netdata_log_info("Running database check on %s", sqlite_database); if (check_table_integrity("chart")) { errors_detected++; @@ -389,7 +432,7 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) if (rebuild & DB_CHECK_RECLAIM_SPACE) { if (!(rebuild & DB_CHECK_CONT)) - info("Reclaiming space of %s", sqlite_database); + netdata_log_info("Reclaiming space of %s", sqlite_database); rc = sqlite3_exec_monitored(db_meta, "VACUUM;", 0, 0, &err_msg); if (rc != SQLITE_OK) { error_report("Failed to execute VACUUM rc = %d (%s)", rc, err_msg); @@ -400,11 +443,23 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) if (rebuild && !(rebuild & DB_CHECK_CONT)) return 1; - info("SQLite database %s initialization", sqlite_database); + netdata_log_info("SQLite database %s initialization", sqlite_database); char buf[1024 + 1] = ""; const char *list[2] = { buf, NULL }; + rc = sqlite3_create_function(db_meta, "u2h", 1, SQLITE_ANY | SQLITE_DETERMINISTIC, 0, sqlite_uuid_parse, 0, 0); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to register internal u2h function"); + + rc = sqlite3_create_function(db_meta, "now_usec", 1, SQLITE_ANY, 0, sqlite_now_usec, 0, 0); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to register internal now_usec function"); + + rc = sqlite3_create_function(db_meta, "uuid_random", 0, SQLITE_ANY, 0, sqlite_uuid_random, 0, 0); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to register internal uuid_random function"); + int target_version = DB_METADATA_VERSION; if (likely(!memory)) @@ -450,13 +505,10 @@ int sql_init_database(db_check_action_type_t rebuild, int memory) if (init_database_batch(db_meta, rebuild, 0, &database_cleanup[0])) return 1; - info("SQLite database initialization completed"); + netdata_log_info("SQLite database initialization completed"); initialize_thread_key_pool(); - rc = sqlite3_create_function(db_meta, "u2h", 1, SQLITE_ANY | SQLITE_DETERMINISTIC, 0, sqlite_uuid_parse, 0, 0); - if (unlikely(rc != SQLITE_OK)) - error_report("Failed to register internal u2h function"); return 0; } @@ -470,7 +522,7 @@ void sql_close_database(void) if (unlikely(!db_meta)) return; - info("Closing SQLite database"); + netdata_log_info("Closing SQLite database"); add_stmt_to_list(NULL); @@ -771,7 +823,7 @@ struct node_instance_list *get_node_list(void) uuid_unparse_lower(*host_id, host_guid); RRDHOST *host = rrdhost_find_by_guid(host_guid); if (rrdhost_flag_check(host, 
@@ -470,7 +522,7 @@ void sql_close_database(void) if (unlikely(!db_meta)) return; - info("Closing SQLite database"); + netdata_log_info("Closing SQLite database"); add_stmt_to_list(NULL); @@ -771,7 +823,7 @@ struct node_instance_list *get_node_list(void) uuid_unparse_lower(*host_id, host_guid); RRDHOST *host = rrdhost_find_by_guid(host_guid); if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD)) { - info("ACLK: 'host:%s' skipping get node list because context is initializing", rrdhost_hostname(host)); + netdata_log_info("ACLK: 'host:%s' skipping get node list because context is initializing", rrdhost_hostname(host)); continue; } uuid_copy(node_list[row].host_id, *host_id); @@ -927,3 +979,19 @@ int sql_metadata_cache_stats(int op) netdata_thread_enable_cancelability(); return count; } + +#define SQL_DROP_TABLE "DROP table %s;" + +void sql_drop_table(const char *table) +{ + if (!table) + return; + + char wstr[255]; + snprintfz(wstr, 254, SQL_DROP_TABLE, table); + + int rc = sqlite3_exec_monitored(db_meta, wstr, 0, 0, NULL); + if (rc != SQLITE_OK) { + error_report("SQLite error during drop table operation for %s, rc = %d", table, rc); + } +} diff --git a/database/sqlite/sqlite_functions.h b/database/sqlite/sqlite_functions.h index ee63a397c..407ed1eff 100644 --- a/database/sqlite/sqlite_functions.h +++ b/database/sqlite/sqlite_functions.h @@ -77,4 +77,6 @@ void invalidate_node_instances(uuid_t *host_id, uuid_t *claim_id); // Provide statistics int sql_metadata_cache_stats(int op); +void sql_drop_table(const char *table); +void sqlite_now_usec(sqlite3_context *context, int argc, sqlite3_value **argv); #endif //NETDATA_SQLITE_FUNCTIONS_H diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index 5c4cdbbd3..3ecd783dc 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -8,45 +8,12 @@ #define sqlite3_bind_string_or_null(res,key,param) ((key) ? sqlite3_bind_text(res, param, string2str(key), -1, SQLITE_STATIC) : sqlite3_bind_null(res, param)) /* Health related SQL queries - Creates a health log table in sqlite, one per host guid -*/ -#define SQL_CREATE_HEALTH_LOG_TABLE(guid) "CREATE TABLE IF NOT EXISTS health_log_%s(hostname text, unique_id int, alarm_id int, alarm_event_id int, config_hash_id blob, updated_by_id int, updates_id int, when_key int, duration int, non_clear_duration int, flags int, exec_run_timestamp int, delay_up_to_timestamp int, name text, chart text, family text, exec text, recipient text, source text, units text, info text, exec_code int, new_status real, old_status real, delay int, new_value double, old_value double, last_repeat int, class text, component text, type text, chart_context text, transition_id blob);", guid -int sql_create_health_log_table(RRDHOST *host) { - int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; - - if (unlikely(!db_meta)) { - if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) - error_report("HEALTH [%s]: Database has not been initialized", rrdhost_hostname(host)); - return 1; - } - - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CREATE_HEALTH_LOG_TABLE(uuid_str)); - - rc = db_execute(db_meta, command); - if (unlikely(rc)) - error_report("HEALTH [%s]: SQLite error during creation of health log table", rrdhost_hostname(host)); - else { - snprintfz(command, MAX_HEALTH_SQL_SIZE, "CREATE INDEX IF NOT EXISTS health_log_index_%s ON health_log_%s (unique_id); ", uuid_str, uuid_str); - rc = db_execute(db_meta, command); - if (unlikely(unlikely(rc))) - error_report("HEALTH [%s]: SQLite error during creation of health log table index", rrdhost_hostname(host)); - } - - return rc; -} - -/* Health related SQL queries Updates an entry in the table */ -#define SQL_UPDATE_HEALTH_LOG(guid) "UPDATE health_log_%s set updated_by_id = ?, flags = ?, exec_run_timestamp = ?, exec_code
= ? where unique_id = ?;", guid +#define SQL_UPDATE_HEALTH_LOG "UPDATE health_log_detail set updated_by_id = ?, flags = ?, exec_run_timestamp = ?, exec_code = ? where unique_id = ? AND alarm_id = ? and transition_id = ?;" void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { sqlite3_stmt *res = NULL; int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) @@ -54,19 +21,10 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { return; } - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_UPDATE_HEALTH_LOG(uuid_str)); - - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_UPDATE_HEALTH_LOG, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - sql_create_health_log_table(host); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); - return; - } + error_report("HEALTH [%s]: Failed to prepare statement for SQL_UPDATE_HEALTH_LOG", rrdhost_hostname(host)); + return; } rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) ae->updated_by_id); @@ -99,6 +57,18 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { goto failed; } + rc = sqlite3_bind_int64(res, 6, (sqlite3_int64) ae->alarm_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind alarm_id parameter for SQL_UPDATE_HEALTH_LOG"); + goto failed; + } + + rc = sqlite3_bind_blob(res, 7, &ae->transition_id, sizeof(ae->transition_id), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind transition_id for SQL_UPDATE_HEALTH_LOG."); + goto failed; + } + rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) { error_report("HEALTH [%s]: Failed to update health log, rc = %d", rrdhost_hostname(host), rc); @@ -112,16 +82,19 @@ failed: /* Health related SQL queries Inserts an entry in the table */ -#define SQL_INSERT_HEALTH_LOG(guid) "INSERT INTO health_log_%s(hostname, unique_id, alarm_id, alarm_event_id, " \ - "config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, " \ - "exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, " \ - "units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, " \ - "class, component, type, chart_context, transition_id) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);", guid - +#define SQL_INSERT_HEALTH_LOG "INSERT INTO health_log (host_id, alarm_id, " \ + "config_hash_id, name, chart, family, exec, recipient, units, chart_context, last_transition_id) " \ + "VALUES (?,?,?,?,?,?,?,?,?,?,?)
" \ + "ON CONFLICT (host_id, alarm_id) DO UPDATE SET last_transition_id = excluded.last_transition_id RETURNING health_log_id; " + +#define SQL_INSERT_HEALTH_LOG_DETAIL "INSERT INTO health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, " \ + "updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, " \ + "info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id) " \ + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,@global_id); " void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { sqlite3_stmt *res = NULL; int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; + uint64_t health_log_id = 0; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) @@ -129,222 +102,231 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { return; } - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INSERT_HEALTH_LOG(uuid_str)); - - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_INSERT_HEALTH_LOG, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - sql_create_health_log_table(host); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); - return; - } + error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); + return; } - rc = sqlite3_bind_text(res, 1, rrdhost_hostname(host), -1, SQLITE_STATIC); + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind hostname parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind host_id for SQL_INSERT_HEALTH_LOG."); goto failed; } - rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) ae->unique_id); + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) ae->alarm_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind unique_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind alarm_id parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 3, (sqlite3_int64) ae->alarm_id); + rc = sqlite3_bind_blob(res, 3, &ae->config_hash_id, sizeof(ae->config_hash_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind alarm_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind config_hash_id parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 4, (sqlite3_int64) ae->alarm_event_id); + rc = sqlite3_bind_string_or_null(res, ae->name, 4); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind alarm_event_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind name parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_blob(res, 5, &ae->config_hash_id, sizeof(ae->config_hash_id), SQLITE_STATIC); + rc = sqlite3_bind_string_or_null(res, ae->chart, 5); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind config_hash_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind chart parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 6, (sqlite3_int64) ae->updated_by_id); + rc = sqlite3_bind_string_or_null(res, ae->family, 6); if (unlikely(rc != SQLITE_OK)) 
void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { sqlite3_stmt *res = NULL; int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; + uint64_t health_log_id = 0; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) @@ -129,222 +102,231 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { return; } - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INSERT_HEALTH_LOG(uuid_str)); - - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_INSERT_HEALTH_LOG, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - sql_create_health_log_table(host); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); - if (unlikely(rc != SQLITE_OK)) { - error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); - return; - } + error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG", rrdhost_hostname(host)); + return; } - rc = sqlite3_bind_text(res, 1, rrdhost_hostname(host), -1, SQLITE_STATIC); + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind hostname parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind host_id for SQL_INSERT_HEALTH_LOG."); goto failed; } - rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) ae->unique_id); + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) ae->alarm_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind unique_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind alarm_id parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 3, (sqlite3_int64) ae->alarm_id); + rc = sqlite3_bind_blob(res, 3, &ae->config_hash_id, sizeof(ae->config_hash_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind alarm_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind config_hash_id parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 4, (sqlite3_int64) ae->alarm_event_id); + rc = sqlite3_bind_string_or_null(res, ae->name, 4); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind alarm_event_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind name parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_blob(res, 5, &ae->config_hash_id, sizeof(ae->config_hash_id), SQLITE_STATIC); + rc = sqlite3_bind_string_or_null(res, ae->chart, 5); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind config_hash_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind chart parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 6, (sqlite3_int64) ae->updated_by_id); + rc = sqlite3_bind_string_or_null(res, ae->family, 6); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind updated_by_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind family parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 7, (sqlite3_int64) ae->updates_id); + rc = sqlite3_bind_string_or_null(res, ae->exec, 7); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind updates_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind exec parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 8, (sqlite3_int64) ae->when); + rc = sqlite3_bind_string_or_null(res, ae->recipient, 8); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind when parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind recipient parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 9, (sqlite3_int64) ae->duration); + rc = sqlite3_bind_string_or_null(res, ae->units, 9); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind duration parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind units parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 10, (sqlite3_int64) ae->non_clear_duration); + rc = sqlite3_bind_string_or_null(res, ae->chart_context, 10); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind non_clear_duration parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind chart_context parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 11, (sqlite3_int64) ae->flags); + rc = sqlite3_bind_blob(res, 11, &ae->transition_id, sizeof(ae->transition_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind flags parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind transition_id parameter for SQL_INSERT_HEALTH_LOG"); goto failed; } - rc = sqlite3_bind_int64(res, 12, (sqlite3_int64) ae->exec_run_timestamp); + rc = sqlite3_step_monitored(res); + if (likely(rc == SQLITE_ROW)) + health_log_id = (size_t) sqlite3_column_int64(res, 0); + else { + error_report("HEALTH [%s]: Failed to execute SQL_INSERT_HEALTH_LOG, rc = %d", rrdhost_hostname(host), rc); + goto failed; + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("HEALTH [%s]: Failed to finalize the prepared statement for inserting to health log.", rrdhost_hostname(host)); + + rc = sqlite3_prepare_v2(db_meta, SQL_INSERT_HEALTH_LOG_DETAIL, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("HEALTH [%s]: Failed to prepare statement for SQL_INSERT_HEALTH_LOG_DETAIL", rrdhost_hostname(host)); + return; + } + + rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) health_log_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind exec_run_timestamp parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind health_log_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_int64(res, 13, (sqlite3_int64) ae->delay_up_to_timestamp); + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) ae->unique_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind delay_up_to_timestamp parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind unique_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->name, 14); + rc = sqlite3_bind_int64(res, 3, (sqlite3_int64) ae->alarm_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind name parameter for
SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind unique_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->chart, 15); + rc = sqlite3_bind_int64(res, 4, (sqlite3_int64) ae->alarm_event_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind chart parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind alarm_event_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->family, 16); + rc = sqlite3_bind_int64(res, 5, (sqlite3_int64) ae->updated_by_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind family parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind updated_by_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->exec, 17); + rc = sqlite3_bind_int64(res, 6, (sqlite3_int64) ae->updates_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind exec parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind updates_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->recipient, 18); + rc = sqlite3_bind_int64(res, 7, (sqlite3_int64) ae->when); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind recipient parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind when parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->source, 19); + rc = sqlite3_bind_int64(res, 8, (sqlite3_int64) ae->duration); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind source parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind duration parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->units, 20); + rc = sqlite3_bind_int64(res, 9, (sqlite3_int64) ae->non_clear_duration); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind host_id parameter to store node instance information"); + error_report("Failed to bind non_clear_duration parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->info, 21); + rc = sqlite3_bind_int64(res, 10, (sqlite3_int64) ae->flags); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind info parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind flags parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_int(res, 22, ae->exec_code); + rc = sqlite3_bind_int64(res, 11, (sqlite3_int64) ae->exec_run_timestamp); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind exec_code parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind exec_run_timestamp parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_int(res, 23, ae->new_status); + rc = sqlite3_bind_int64(res, 12, (sqlite3_int64) ae->delay_up_to_timestamp); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind new_status parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind delay_up_to_timestamp parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_int(res, 24, ae->old_status); + rc = sqlite3_bind_string_or_null(res, ae->info, 13); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind old_status parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind info parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_int(res, 25, 
ae->delay); + rc = sqlite3_bind_int(res, 14, ae->exec_code); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind delay parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind exec_code parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_double(res, 26, ae->new_value); + rc = sqlite3_bind_int(res, 15, ae->new_status); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind new_value parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind new_status parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_double(res, 27, ae->old_value); + rc = sqlite3_bind_int(res, 16, ae->old_status); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind old_value parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind old_status parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_int64(res, 28, (sqlite3_int64) ae->last_repeat); + rc = sqlite3_bind_int(res, 17, ae->delay); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind last_repeat parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind delay parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->classification, 29); + rc = sqlite3_bind_double(res, 18, ae->new_value); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind classification parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind new_value parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->component, 30); + rc = sqlite3_bind_double(res, 19, ae->old_value); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind component parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind old_value parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->type, 31); + rc = sqlite3_bind_int64(res, 20, (sqlite3_int64) ae->last_repeat); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind type parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind last_repeat parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_string_or_null(res, ae->chart_context, 32); + rc = sqlite3_bind_blob(res, 21, &ae->transition_id, sizeof(ae->transition_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind chart_context parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind transition_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } - rc = sqlite3_bind_blob(res, 33, &ae->transition_id, sizeof(ae->transition_id), SQLITE_STATIC); + rc = sqlite3_bind_int64(res, 22, (sqlite3_int64) ae->global_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind transition_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind global_id parameter for SQL_INSERT_HEALTH_LOG_DETAIL"); goto failed; } rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) { - error_report("HEALTH [%s]: Failed to execute SQL_INSERT_HEALTH_LOG, rc = %d", rrdhost_hostname(host), rc); + error_report("HEALTH [%s]: Failed to execute SQL_INSERT_HEALTH_LOG_DETAIL, rc = %d", rrdhost_hostname(host), rc); goto failed; } @@ -363,7 +345,7 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) else { sql_health_alarm_log_insert(host, ae); #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { sql_queue_alarm_to_aclk(host, ae, 0); } 
#endif @@ -373,11 +355,10 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) /* Health related SQL queries Get a count of rows from health log table */ -#define SQL_COUNT_HEALTH_LOG(guid) "SELECT count(1) FROM health_log_%s;", guid +#define SQL_COUNT_HEALTH_LOG_DETAIL "SELECT count(1) FROM health_log_detail hld, health_log hl where hl.host_id = @host_id and hl.health_log_id = hld.health_log_id;" void sql_health_alarm_log_count(RRDHOST *host) { sqlite3_stmt *res = NULL; int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) @@ -385,17 +366,19 @@ void sql_health_alarm_log_count(RRDHOST *host) { return; } - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COUNT_HEALTH_LOG(uuid_str)); - - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_COUNT_HEALTH_LOG_DETAIL, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to count health log entries from db"); return; } + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id for SQL_COUNT_HEALTH_LOG."); + sqlite3_finalize(res); + return; + } + rc = sqlite3_step_monitored(res); if (likely(rc == SQLITE_ROW)) host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0); @@ -404,14 +387,14 @@ void sql_health_alarm_log_count(RRDHOST *host) { if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize the prepared statement to count health log entries from db"); - info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written); + netdata_log_info("HEALTH [%s]: Table health_log_detail contains %lu entries.", rrdhost_hostname(host), (unsigned long int) host->health.health_log_entries_written); } /* Health related SQL queries - Cleans up the health_log table on a non-claimed host + Cleans up the health_log_detail table on a non-claimed host */ -#define SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED(guid,limit) "DELETE FROM health_log_%s ORDER BY unique_id ASC LIMIT %lu;", guid, limit -void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every) { +#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED "DELETE FROM health_log_detail WHERE health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = ?1) AND when_key + ?2 < unixepoch() AND updated_by_id <> 0 AND transition_id NOT IN (SELECT last_transition_id FROM health_log hl WHERE hl.host_id = ?3);" +void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host) { sqlite3_stmt *res = NULL; int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; @@ -425,23 +408,42 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED(uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); + rc = sqlite3_prepare_v2(db_meta, SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to cleanup health log detail table (un-claimed)"); + return; + } + + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != 
SQLITE_OK)) { + error_report("Failed to bind host_id for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED."); + sqlite3_finalize(res); + return; + } - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64)host->health_log.health_log_history); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind health log history for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED."); + sqlite3_finalize(res); + return; + } + + rc = sqlite3_bind_blob(res, 3, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to cleanup health log table"); + error_report("Failed to bind host_id for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED."); + sqlite3_finalize(res); return; } rc = sqlite3_step_monitored(res); if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to cleanup health log table, rc = %d", rc); + error_report("Failed to cleanup health log detail table, rc = %d", rc); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize the prepared statement to cleanup health log table"); + error_report("Failed to finalize the prepared statement to cleanup health log detail table (un-claimed)"); - host->health.health_log_entries_written = rotate_every; + sql_health_alarm_log_count(host); snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); if (unlikely(table_exists_in_database(command))) { @@ -450,10 +452,10 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every } /* Health related SQL queries - Cleans up the health_log table on a claimed host + Cleans up the health_log_detail table on a claimed host */ -#define SQL_CLEANUP_HEALTH_LOG_CLAIMED(guid, guid2, guid3, limit) "DELETE from health_log_%s WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT unique_id FROM health_log_%s ORDER BY unique_id asc LIMIT %lu);", guid, guid2, guid3, limit -void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { +#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid) "DELETE from health_log_detail WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE hl.host_id = ?1 AND hl.health_log_id = hld.health_log_id) AND health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = ?2) AND when_key + ?3 < unixepoch() AND updated_by_id <> 0 AND transition_id NOT IN (SELECT last_transition_id FROM health_log hl WHERE hl.host_id = ?4);", guid +void sql_health_alarm_log_cleanup_claimed(RRDHOST *host) { sqlite3_stmt *res = NULL; int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; @@ -469,70 +471,83 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); if (!table_exists_in_database(command)) { - sql_health_alarm_log_cleanup_not_claimed(host, rotate_every); + sql_health_alarm_log_cleanup_not_claimed(host); return; } - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_CLAIMED(uuid_str, uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str)); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to cleanup health log table"); + error_report("Failed to prepare statement to 
cleanup health log detail table (claimed)"); + return; + } + + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind first host_id for SQL_CLEANUP_HEALTH_LOG_CLAIMED."); + sqlite3_finalize(res); + return; + } + + rc = sqlite3_bind_blob(res, 2, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind second host_id for SQL_CLEANUP_HEALTH_LOG_CLAIMED."); + sqlite3_finalize(res); + return; + } + + rc = sqlite3_bind_int64(res, 3, (sqlite3_int64)host->health_log.health_log_history); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind health log history for SQL_CLEANUP_HEALTH_LOG_CLAIMED."); + sqlite3_finalize(res); + return; + } + + rc = sqlite3_bind_blob(res, 4, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind third host_id for SQL_CLEANUP_HEALTH_LOG_CLAIMED."); + sqlite3_finalize(res); + return; + } rc = sqlite3_step_monitored(res); if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to cleanup health log table, rc = %d", rc); + error_report("Failed to cleanup health log detail table, rc = %d", rc); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize the prepared statement to cleanup health log table"); + error_report("Failed to finalize the prepared statement to cleanup health log detail table (claimed)"); sql_health_alarm_log_count(host); sql_aclk_alert_clean_dead_entries(host); + } /* Health related SQL queries Cleans up the health_log table. */ void sql_health_alarm_log_cleanup(RRDHOST *host) { - static size_t rotate_every = 0; - - if(unlikely(rotate_every == 0)) { - rotate_every = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000); - if(rotate_every < 100) rotate_every = 100; - } - - if(likely(host->health.health_log_entries_written < rotate_every)) { - return; - } - if (!claimed()) { - sql_health_alarm_log_cleanup_not_claimed(host, rotate_every); + sql_health_alarm_log_cleanup_not_claimed(host); } else - sql_health_alarm_log_cleanup_claimed(host, rotate_every); + sql_health_alarm_log_cleanup_claimed(host); } -#define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \ -"delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context, transition_id) " \ -"select hostname, ?1, ?2, ?3, config_hash_id, 0, ?4, unixepoch(), 0, 0, flags, exec_run_timestamp, " \ -"unixepoch(), name, chart, family, exec, recipient, source, units, info, exec_code, -2, new_status, delay, NULL, new_value, 0, class, component, type, chart_context, ?5 " \ -"from health_log_%s where unique_id = ?6", guid, guid2 -#define SQL_INJECT_REMOVED_UPDATE(guid) "update health_log_%s set flags = flags | ?1, updated_by_id = ?2 where unique_id = ?3; ", guid -void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm_event_id, uint32_t unique_id, uint32_t max_unique_id) +#define SQL_INJECT_REMOVED "insert into health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp,
delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id) select health_log_id, ?1, ?2, ?3, 0, ?4, unixepoch(), 0, 0, flags, exec_run_timestamp, unixepoch(), info, exec_code, -2, new_status, delay, NULL, new_value, 0, ?5, now_usec(0) from health_log_detail where unique_id = ?6 and transition_id = ?7;" +#define SQL_INJECT_REMOVED_UPDATE_DETAIL "update health_log_detail set flags = flags | ?1, updated_by_id = ?2 where unique_id = ?3 and transition_id = ?4;" +#define SQL_INJECT_REMOVED_UPDATE_LOG "update health_log set last_transition_id = ?1 where alarm_id = ?2 and last_transition_id = ?3 and host_id = ?4;" +void sql_inject_removed_status(RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, uint32_t unique_id, uint32_t max_unique_id, uuid_t *prev_transition_id) { int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; if (!alarm_id || !alarm_event_id || !unique_id || !max_unique_id) return; sqlite3_stmt *res = NULL; - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INJECT_REMOVED(uuid_str, uuid_str)); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_INJECT_REMOVED, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to inject removed event"); return; @@ -566,7 +581,7 @@ void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm uuid_generate_random(transition_id); rc = sqlite3_bind_blob(res, 5, &transition_id, sizeof(transition_id), SQLITE_STATIC); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind config_hash_id parameter for SQL_INSERT_HEALTH_LOG"); + error_report("Failed to bind transition_id parameter for SQL_INJECT_REMOVED"); goto failed; } @@ -576,6 +591,12 @@ void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm goto failed; } + rc = sqlite3_bind_blob(res, 7, prev_transition_id, sizeof(*prev_transition_id), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind prev_transition_id parameter for SQL_INJECT_REMOVED."); + goto failed; + } + rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) { error_report("HEALTH [N/A]: Failed to execute SQL_INJECT_REMOVED, rc = %d", rc); @@ -585,35 +606,77 @@ void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) error_report("HEALTH [N/A]: Failed to finalize the prepared statement for injecting removed event."); - //update the old entry - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INJECT_REMOVED_UPDATE(uuid_str)); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + //update the old entry in health_log_detail + rc = sqlite3_prepare_v2(db_meta, SQL_INJECT_REMOVED_UPDATE_DETAIL, -1, &res, 0); if (rc != SQLITE_OK) { - error_report("Failed to prepare statement when trying to update during inject removed event"); + error_report("Failed to prepare statement when trying to update health_log_detail during inject removed event"); return; } rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) HEALTH_ENTRY_FLAG_UPDATED); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind flags parameter for SQL_INJECT_REMOVED (update)"); + error_report("Failed to bind flags parameter for SQL_INJECT_REMOVED_UPDATE_DETAIL"); goto failed; } rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) max_unique_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind
max_unique_id parameter for SQL_INJECT_REMOVED_UPDATE_DETAIL"); goto failed; } rc = sqlite3_bind_int64(res, 3, (sqlite3_int64) unique_id); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to bind unique_id parameter for SQL_INJECT_REMOVED (update)"); + error_report("Failed to bind unique_id parameter for SQL_INJECT_REMOVED_UPDATE_DETAIL"); + goto failed; + } + + rc = sqlite3_bind_blob(res, 4, prev_transition_id, sizeof(*prev_transition_id), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind prev_transition_id parameter for SQL_INJECT_REMOVED_UPDATE_DETAIL"); goto failed; } rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) { - error_report("HEALTH [N/A]: Failed to execute SQL_INJECT_REMOVED_UPDATE, rc = %d", rc); + error_report("HEALTH [N/A]: Failed to execute SQL_INJECT_REMOVED_UPDATE_DETAIL, rc = %d", rc); + goto failed; + } + + //update the health_log_table + rc = sqlite3_prepare_v2(db_meta, SQL_INJECT_REMOVED_UPDATE_LOG, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to update health_log during inject removed event"); + return; + } + + rc = sqlite3_bind_blob(res, 1, &transition_id, sizeof(transition_id), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind transition_id parameter for SQL_INJECT_REMOVED_UPDATE_LOG"); + goto failed; + } + + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) alarm_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind alarm_id parameter for SQL_INJECT_REMOVED_UPDATE_LOG"); + goto failed; + } + + rc = sqlite3_bind_blob(res, 3, prev_transition_id, sizeof(*prev_transition_id), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind prev_transition_id parameter for SQL_INJECT_REMOVED_UPDATE_LOG"); + goto failed; + } + + rc = sqlite3_bind_blob(res, 4, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter for SQL_INJECT_REMOVED_UPDATE_LOG"); + goto failed; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("HEALTH [N/A]: Failed to execute SQL_INJECT_REMOVED_UPDATE_LOG, rc = %d", rc); + goto failed; + } @@ -622,22 +685,27 @@ failed: error_report("HEALTH [N/A]: Failed to finalize the prepared statement for injecting removed event."); } -#define SQL_SELECT_MAX_UNIQUE_ID(guid) "SELECT MAX(unique_id) from health_log_%s", guid +#define SQL_SELECT_MAX_UNIQUE_ID "SELECT MAX(hld.unique_id) from health_log_detail hld, health_log hl where hl.host_id = @host_id and hl.health_log_id = hld.health_log_id;" +uint32_t sql_get_max_unique_id (RRDHOST *host) { int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; uint32_t max_unique_id = 0; sqlite3_stmt *res = NULL; - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_MAX_UNIQUE_ID(uuid_str)); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_MAX_UNIQUE_ID, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to get max unique id"); return 0; } + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter for SQL_SELECT_MAX_UNIQUE_ID."); + sqlite3_finalize(res); + return 0; + } + while (sqlite3_step_monitored(res) == SQLITE_ROW) { max_unique_id = (uint32_t) sqlite3_column_int64(res, 0); }
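
Note that these queries mix positional (?1 ... ?7) and named (@host_id, @global_id) parameter syntax while the code binds everything by numeric index; that works because SQLite assigns a named parameter the next unused index in order of appearance. Resolving the index by name is more robust if the SQL is later reordered; a small sketch using sqlite3_bind_parameter_index():

#include <sqlite3.h>

/* Bind a 16-byte host UUID to the @host_id parameter of any prepared statement. */
static int bind_host_uuid(sqlite3_stmt *st, const void *host_uuid) {
    int idx = sqlite3_bind_parameter_index(st, "@host_id");
    if (idx == 0)   /* statement has no parameter with that name */
        return SQLITE_ERROR;
    return sqlite3_bind_blob(st, idx, host_uuid, 16, SQLITE_STATIC);
}
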
@@ -649,36 +717,42 @@ uint32_t sql_get_max_unique_id (char *uuid_str) return max_unique_id; } -#define SQL_SELECT_LAST_STATUSES(guid) "SELECT new_status, unique_id, alarm_id, alarm_event_id from health_log_%s group by alarm_id having max(alarm_event_id)", guid -void sql_check_removed_alerts_state(char *uuid_str) +#define SQL_SELECT_LAST_STATUSES "SELECT hld.new_status, hld.unique_id, hld.alarm_id, hld.alarm_event_id, hld.transition_id from health_log hl, health_log_detail hld where hl.host_id = @host_id and hl.last_transition_id = hld.transition_id;" +void sql_check_removed_alerts_state(RRDHOST *host) { int rc; - char command[MAX_HEALTH_SQL_SIZE + 1]; uint32_t max_unique_id = 0; - sqlite3_stmt *res = NULL; + uuid_t transition_id; - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_LAST_STATUSES(uuid_str)); - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_SELECT_LAST_STATUSES, -1, &res, 0); if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to check removed statuses"); return; } - while (sqlite3_step_monitored(res) == SQLITE_ROW) { - uint32_t alarm_id, alarm_event_id, unique_id; - RRDCALC_STATUS status; - - status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); - unique_id = (uint32_t) sqlite3_column_int64(res, 1); - alarm_id = (uint32_t) sqlite3_column_int64(res, 2); - alarm_event_id = (uint32_t) sqlite3_column_int64(res, 3); - if (unlikely(status != RRDCALC_STATUS_REMOVED)) { - if (unlikely(!max_unique_id)) - max_unique_id = sql_get_max_unique_id (uuid_str); - sql_inject_removed_status (uuid_str, alarm_id, alarm_event_id, unique_id, ++max_unique_id); - } - } + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter for SQL_SELECT_LAST_STATUSES."); + sqlite3_finalize(res); + return; + } + + while (sqlite3_step_monitored(res) == SQLITE_ROW) { + uint32_t alarm_id, alarm_event_id, unique_id; + RRDCALC_STATUS status; + + status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); + unique_id = (uint32_t) sqlite3_column_int64(res, 1); + alarm_id = (uint32_t) sqlite3_column_int64(res, 2); + alarm_event_id = (uint32_t) sqlite3_column_int64(res, 3); + uuid_copy(transition_id, *((uuid_t *) sqlite3_column_blob(res, 4))); + if (unlikely(status != RRDCALC_STATUS_REMOVED)) { + if (unlikely(!max_unique_id)) + max_unique_id = sql_get_max_unique_id (host); + sql_inject_removed_status (host, alarm_id, alarm_event_id, unique_id, ++max_unique_id, &transition_id); + } + } rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) @@ -688,12 +762,17 @@ void sql_check_removed_alerts_state(char *uuid_str) /* Health related SQL queries Load from the health log table */ -#define SQL_LOAD_HEALTH_LOG(guid) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context, transition_id FROM health_log_%s group by alarm_id having max(alarm_event_id);", guid +#define SQL_LOAD_HEALTH_LOG "SELECT hld.unique_id, hld.alarm_id, hld.alarm_event_id, hl.config_hash_id, hld.updated_by_id, " \ + "hld.updates_id, hld.when_key, hld.duration, hld.non_clear_duration, hld.flags, hld.exec_run_timestamp, " \ + "hld.delay_up_to_timestamp, hl.name, hl.chart, hl.family, hl.exec, hl.recipient,
ah.source, hl.units, " \ + "hld.info, hld.exec_code, hld.new_status, hld.old_status, hld.delay, hld.new_value, hld.old_value, " \ + "hld.last_repeat, ah.class, ah.component, ah.type, hl.chart_context, hld.transition_id, hld.global_id " \ + "FROM health_log hl, alert_hash ah, health_log_detail hld " \ + "WHERE hl.config_hash_id = ah.hash_id and hl.host_id = @host_id and hl.last_transition_id = hld.transition_id;" void sql_health_alarm_log_load(RRDHOST *host) { sqlite3_stmt *res = NULL; int ret; ssize_t errored = 0, loaded = 0; - char command[MAX_HEALTH_SQL_SIZE + 1]; host->health.health_log_entries_written = 0; @@ -703,19 +782,21 @@ void sql_health_alarm_log_load(RRDHOST *host) { return; } - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - sql_check_removed_alerts_state(uuid_str); + sql_check_removed_alerts_state(host); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_LOAD_HEALTH_LOG(uuid_str)); - - ret = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + ret = sqlite3_prepare_v2(db_meta, SQL_LOAD_HEALTH_LOG, -1, &res, 0); if (unlikely(ret != SQLITE_OK)) { error_report("HEALTH [%s]: Failed to prepare sql statement to load health log.", rrdhost_hostname(host)); return; } + ret = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(ret != SQLITE_OK)) { + error_report("Failed to bind host_id parameter for SQL_LOAD_HEALTH_LOG."); + sqlite3_finalize(res); + return; + } + DICTIONARY *all_rrdcalcs = dictionary_create( DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); RRDCALC *rc; @@ -724,20 +805,20 @@ void sql_health_alarm_log_load(RRDHOST *host) { } foreach_rrdcalc_in_rrdhost_done(rc); - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_lock(&host->health_log.spinlock); while (sqlite3_step_monitored(res) == SQLITE_ROW) { ALARM_ENTRY *ae = NULL; // check that we have valid ids - uint32_t unique_id = (uint32_t) sqlite3_column_int64(res, 1); + uint32_t unique_id = (uint32_t) sqlite3_column_int64(res, 0); if(!unique_id) { error_report("HEALTH [%s]: Got invalid unique id. Ignoring it.", rrdhost_hostname(host)); errored++; continue; } - uint32_t alarm_id = (uint32_t) sqlite3_column_int64(res, 2); + uint32_t alarm_id = (uint32_t) sqlite3_column_int64(res, 1); if(!alarm_id) { error_report("HEALTH [%s]: Got invalid alarm id. Ignoring it.", rrdhost_hostname(host)); errored++; @@ -745,28 +826,28 @@ void sql_health_alarm_log_load(RRDHOST *host) { } //need name, chart and family - if (sqlite3_column_type(res, 13) == SQLITE_NULL) { + if (sqlite3_column_type(res, 12) == SQLITE_NULL) { error_report("HEALTH [%s]: Got null name field. Ignoring it.", rrdhost_hostname(host)); errored++; continue; } - if (sqlite3_column_type(res, 14) == SQLITE_NULL) { + if (sqlite3_column_type(res, 13) == SQLITE_NULL) { error_report("HEALTH [%s]: Got null chart field. Ignoring it.", rrdhost_hostname(host)); errored++; continue; } - if (sqlite3_column_type(res, 15) == SQLITE_NULL) { + if (sqlite3_column_type(res, 14) == SQLITE_NULL) { error_report("HEALTH [%s]: Got null family field. 
Ignoring it.", rrdhost_hostname(host)); errored++; continue; } // Check if we got last_repeat field - time_t last_repeat = (time_t)sqlite3_column_int64(res, 27); + time_t last_repeat = (time_t)sqlite3_column_int64(res, 26); - rc = dictionary_get(all_rrdcalcs, (char *) sqlite3_column_text(res, 14)); + rc = dictionary_get(all_rrdcalcs, (char *) sqlite3_column_text(res, 13)); if(unlikely(rc)) { if (rrdcalc_isrepeating(rc)) { rc->last_repeat = last_repeat; @@ -782,84 +863,87 @@ void sql_health_alarm_log_load(RRDHOST *host) { ae->unique_id = unique_id; ae->alarm_id = alarm_id; - if (sqlite3_column_type(res, 4) != SQLITE_NULL) - uuid_copy(ae->config_hash_id, *((uuid_t *) sqlite3_column_blob(res, 4))); + if (sqlite3_column_type(res, 3) != SQLITE_NULL) + uuid_copy(ae->config_hash_id, *((uuid_t *) sqlite3_column_blob(res, 3))); - ae->alarm_event_id = (uint32_t) sqlite3_column_int64(res, 3); - ae->updated_by_id = (uint32_t) sqlite3_column_int64(res, 5); - ae->updates_id = (uint32_t) sqlite3_column_int64(res, 6); + ae->alarm_event_id = (uint32_t) sqlite3_column_int64(res, 2); + ae->updated_by_id = (uint32_t) sqlite3_column_int64(res, 4); + ae->updates_id = (uint32_t) sqlite3_column_int64(res, 5); - ae->when = (time_t) sqlite3_column_int64(res, 7); - ae->duration = (time_t) sqlite3_column_int64(res, 8); - ae->non_clear_duration = (time_t) sqlite3_column_int64(res, 9); + ae->when = (time_t) sqlite3_column_int64(res, 6); + ae->duration = (time_t) sqlite3_column_int64(res, 7); + ae->non_clear_duration = (time_t) sqlite3_column_int64(res, 8); - ae->flags = (uint32_t) sqlite3_column_int64(res, 10); + ae->flags = (uint32_t) sqlite3_column_int64(res, 9); ae->flags |= HEALTH_ENTRY_FLAG_SAVED; - ae->exec_run_timestamp = (time_t) sqlite3_column_int64(res, 11); - ae->delay_up_to_timestamp = (time_t) sqlite3_column_int64(res, 12); + ae->exec_run_timestamp = (time_t) sqlite3_column_int64(res, 10); + ae->delay_up_to_timestamp = (time_t) sqlite3_column_int64(res, 11); - ae->name = string_strdupz((char *) sqlite3_column_text(res, 13)); - ae->chart = string_strdupz((char *) sqlite3_column_text(res, 14)); - ae->family = string_strdupz((char *) sqlite3_column_text(res, 15)); + ae->name = string_strdupz((char *) sqlite3_column_text(res, 12)); + ae->chart = string_strdupz((char *) sqlite3_column_text(res, 13)); + ae->family = string_strdupz((char *) sqlite3_column_text(res, 14)); - if (sqlite3_column_type(res, 16) != SQLITE_NULL) - ae->exec = string_strdupz((char *) sqlite3_column_text(res, 16)); + if (sqlite3_column_type(res, 15) != SQLITE_NULL) + ae->exec = string_strdupz((char *) sqlite3_column_text(res, 15)); else ae->exec = NULL; - if (sqlite3_column_type(res, 17) != SQLITE_NULL) - ae->recipient = string_strdupz((char *) sqlite3_column_text(res, 17)); + if (sqlite3_column_type(res, 16) != SQLITE_NULL) + ae->recipient = string_strdupz((char *) sqlite3_column_text(res, 16)); else ae->recipient = NULL; - if (sqlite3_column_type(res, 18) != SQLITE_NULL) - ae->source = string_strdupz((char *) sqlite3_column_text(res, 18)); + if (sqlite3_column_type(res, 17) != SQLITE_NULL) + ae->source = string_strdupz((char *) sqlite3_column_text(res, 17)); else ae->source = NULL; - if (sqlite3_column_type(res, 19) != SQLITE_NULL) - ae->units = string_strdupz((char *) sqlite3_column_text(res, 19)); + if (sqlite3_column_type(res, 18) != SQLITE_NULL) + ae->units = string_strdupz((char *) sqlite3_column_text(res, 18)); else ae->units = NULL; - if (sqlite3_column_type(res, 20) != SQLITE_NULL) - ae->info = string_strdupz((char *) 
sqlite3_column_text(res, 20)); + if (sqlite3_column_type(res, 19) != SQLITE_NULL) + ae->info = string_strdupz((char *) sqlite3_column_text(res, 19)); else ae->info = NULL; - ae->exec_code = (int) sqlite3_column_int(res, 21); - ae->new_status = (RRDCALC_STATUS) sqlite3_column_int(res, 22); - ae->old_status = (RRDCALC_STATUS)sqlite3_column_int(res, 23); - ae->delay = (int) sqlite3_column_int(res, 24); + ae->exec_code = (int) sqlite3_column_int(res, 20); + ae->new_status = (RRDCALC_STATUS) sqlite3_column_int(res, 21); + ae->old_status = (RRDCALC_STATUS)sqlite3_column_int(res, 22); + ae->delay = (int) sqlite3_column_int(res, 23); - ae->new_value = (NETDATA_DOUBLE) sqlite3_column_double(res, 25); - ae->old_value = (NETDATA_DOUBLE) sqlite3_column_double(res, 26); + ae->new_value = (NETDATA_DOUBLE) sqlite3_column_double(res, 24); + ae->old_value = (NETDATA_DOUBLE) sqlite3_column_double(res, 25); ae->last_repeat = last_repeat; - if (sqlite3_column_type(res, 28) != SQLITE_NULL) - ae->classification = string_strdupz((char *) sqlite3_column_text(res, 28)); + if (sqlite3_column_type(res, 27) != SQLITE_NULL) + ae->classification = string_strdupz((char *) sqlite3_column_text(res, 27)); else ae->classification = NULL; - if (sqlite3_column_type(res, 29) != SQLITE_NULL) - ae->component = string_strdupz((char *) sqlite3_column_text(res, 29)); + if (sqlite3_column_type(res, 28) != SQLITE_NULL) + ae->component = string_strdupz((char *) sqlite3_column_text(res, 28)); else ae->component = NULL; - if (sqlite3_column_type(res, 30) != SQLITE_NULL) - ae->type = string_strdupz((char *) sqlite3_column_text(res, 30)); + if (sqlite3_column_type(res, 29) != SQLITE_NULL) + ae->type = string_strdupz((char *) sqlite3_column_text(res, 29)); else ae->type = NULL; - if (sqlite3_column_type(res, 31) != SQLITE_NULL) - ae->chart_context = string_strdupz((char *) sqlite3_column_text(res, 31)); + if (sqlite3_column_type(res, 30) != SQLITE_NULL) + ae->chart_context = string_strdupz((char *) sqlite3_column_text(res, 30)); else ae->chart_context = NULL; - if (sqlite3_column_type(res, 32) != SQLITE_NULL) - uuid_copy(ae->transition_id, *((uuid_t *) sqlite3_column_blob(res, 32))); + if (sqlite3_column_type(res, 31) != SQLITE_NULL) + uuid_copy(ae->transition_id, *((uuid_t *)sqlite3_column_blob(res, 31))); + + if (sqlite3_column_type(res, 32) != SQLITE_NULL) + ae->global_id = sqlite3_column_int64(res, 32); char value_string[100 + 1]; string_freez(ae->old_value_string); @@ -879,7 +963,7 @@ void sql_health_alarm_log_load(RRDHOST *host) { loaded++; } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); dictionary_destroy(all_rrdcalcs); all_rrdcalcs = NULL; @@ -891,7 +975,7 @@ void sql_health_alarm_log_load(RRDHOST *host) { if (unlikely(!host->health_log.next_alarm_id || host->health_log.next_alarm_id <= host->health_max_alarm_id)) host->health_log.next_alarm_id = host->health_max_alarm_id + 1; - log_health("[%s]: Table health_log_%s, loaded %zd alarm entries, errors in %zd entries.", rrdhost_hostname(host), uuid_str, loaded, errored); + netdata_log_health("[%s]: Table health_log, loaded %zd alarm entries, errors in %zd entries.", rrdhost_hostname(host), loaded, errored); ret = sqlite3_finalize(res); if (unlikely(ret != SQLITE_OK)) @@ -907,8 +991,8 @@ void sql_health_alarm_log_load(RRDHOST *host) { "on_key, class, component, type, os, hosts, lookup, every, units, calc, families, plugin, module, " \ "charts, green, red, warn, crit, exec, to_key, info, delay, options, repeat, 
host_labels, " \ "p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after, " \ - "p_db_lookup_before, p_update_every) values (?1,unixepoch(),?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12," \ - "?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24,?25,?26,?27,?28,?29,?30,?31,?32,?33,?34);" + "p_db_lookup_before, p_update_every, source, chart_labels) values (?1,unixepoch(),?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12," \ + "?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24,?25,?26,?27,?28,?29,?30,?31,?32,?33,?34,?35,?36);" int sql_store_alert_config_hash(uuid_t *hash_id, struct alert_config *cfg) { @@ -1088,6 +1172,14 @@ int sql_store_alert_config_hash(uuid_t *hash_id, struct alert_config *cfg) if (unlikely(rc != SQLITE_OK)) goto bind_fail; + rc = sqlite3_bind_string_or_null(res, cfg->source, ++param); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + + rc = sqlite3_bind_string_or_null(res, cfg->chart_labels, ++param); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + rc = execute_insert(res); if (unlikely(rc != SQLITE_DONE)) error_report("Failed to store alert config, rc = %d", rc); @@ -1175,18 +1267,14 @@ int alert_hash_and_store_config( return 1; } -#define SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT "SELECT new_status FROM health_log_%s WHERE alarm_id = %u AND unique_id != %u AND flags & %d ORDER BY unique_id DESC LIMIT 1" +#define SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT "SELECT hld.new_status FROM health_log hl, health_log_detail hld WHERE hl.alarm_id = %u AND hld.unique_id != %u AND hld.flags & %u AND hl.host_id = @host_id and hl.health_log_id = hld.health_log_id ORDER BY hld.unique_id DESC LIMIT 1;" int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status) { int rc = 0, ret = -1; char command[MAX_HEALTH_SQL_SIZE + 1]; - - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - sqlite3_stmt *res = NULL; - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT, uuid_str, ae->alarm_id, ae->unique_id, HEALTH_ENTRY_FLAG_EXEC_RUN); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT, ae->alarm_id, ae->unique_id, (uint32_t) HEALTH_ENTRY_FLAG_EXEC_RUN); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (rc != SQLITE_OK) { @@ -1194,6 +1282,13 @@ int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_S return ret; } + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id parameter for SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT."); + sqlite3_finalize(res); + return ret; + } + ret = 0; while (sqlite3_step_monitored(res) == SQLITE_ROW) { *last_executed_status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); @@ -1207,7 +1302,7 @@ int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_S return ret; } -#define SQL_SELECT_HEALTH_LOG(guid) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context, transition_id FROM health_log_%s WHERE 1=1 ", guid +#define SQL_SELECT_HEALTH_LOG "SELECT hld.unique_id, hld.alarm_id, hld.alarm_event_id, hl.config_hash_id, hld.updated_by_id, hld.updates_id, hld.when_key, hld.duration, 
hld.non_clear_duration, hld.flags, hld.exec_run_timestamp, hld.delay_up_to_timestamp, hl.name, hl.chart, hl.family, hl.exec, hl.recipient, ah.source, hl.units, hld.info, hld.exec_code, hld.new_status, hld.old_status, hld.delay, hld.new_value, hld.old_value, hld.last_repeat, ah.class, ah.component, ah.type, hl.chart_context, hld.transition_id FROM health_log hl, alert_hash ah, health_log_detail hld WHERE hl.config_hash_id = ah.hash_id and hl.health_log_id = hld.health_log_id and hl.host_id = @host_id " void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { buffer_strcat(wb, "["); @@ -1219,26 +1314,23 @@ void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char * int rc; BUFFER *command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL); - char uuid_str[UUID_STR_LEN]; - uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - - buffer_sprintf(command, SQL_SELECT_HEALTH_LOG(uuid_str)); + buffer_sprintf(command, SQL_SELECT_HEALTH_LOG); if (chart) { char chart_sql[MAX_HEALTH_SQL_SIZE + 1]; - snprintfz(chart_sql, MAX_HEALTH_SQL_SIZE, "AND chart = '%s' ", chart); + snprintfz(chart_sql, MAX_HEALTH_SQL_SIZE, "AND hl.chart = '%s' ", chart); buffer_strcat(command, chart_sql); } if (after) { char after_sql[MAX_HEALTH_SQL_SIZE + 1]; - snprintfz(after_sql, MAX_HEALTH_SQL_SIZE, "AND unique_id > %u ", after); + snprintfz(after_sql, MAX_HEALTH_SQL_SIZE, "AND hld.unique_id > %u ", after); buffer_strcat(command, after_sql); } { char limit_sql[MAX_HEALTH_SQL_SIZE + 1]; - snprintfz(limit_sql, MAX_HEALTH_SQL_SIZE, "ORDER BY unique_id DESC LIMIT %u ", max); + snprintfz(limit_sql, MAX_HEALTH_SQL_SIZE, "ORDER BY hld.unique_id DESC LIMIT %u ", max); buffer_strcat(command, limit_sql); } @@ -1249,19 +1341,27 @@ void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char * return; } + rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id for SQL_SELECT_HEALTH_LOG."); + sqlite3_finalize(res); + buffer_free(command); + return; + } + while (sqlite3_step(res) == SQLITE_ROW) { char old_value_string[100 + 1]; char new_value_string[100 + 1]; char config_hash_id[UUID_STR_LEN]; - uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 4)), config_hash_id); + uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 3)), config_hash_id); char transition_id[UUID_STR_LEN] = {0}; - if (sqlite3_column_type(res, 32) != SQLITE_NULL) - uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 32)), transition_id); + if (sqlite3_column_type(res, 31) != SQLITE_NULL) + uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 31)), transition_id); - char *edit_command = health_edit_command_from_source((char *)sqlite3_column_text(res, 18)); + char *edit_command = sqlite3_column_bytes(res, 17) > 0 ? 
health_edit_command_from_source((char *)sqlite3_column_text(res, 17)) : strdupz("UNKNOWN=0=UNKNOWN"); if (count) buffer_sprintf(wb, ","); @@ -1309,63 +1409,63 @@ void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char * "\t\t\"old_value_string\": \"%s\",\n" "\t\t\"last_repeat\": \"%lu\",\n" "\t\t\"silenced\": \"%s\",\n", - sqlite3_column_text(res, 0), + rrdhost_hostname(host), host->utc_offset, rrdhost_abbrev_timezone(host), + (unsigned int) sqlite3_column_int64(res, 0), (unsigned int) sqlite3_column_int64(res, 1), (unsigned int) sqlite3_column_int64(res, 2), - (unsigned int) sqlite3_column_int64(res, 3), config_hash_id, transition_id, + sqlite3_column_text(res, 12), sqlite3_column_text(res, 13), + sqlite3_column_text(res, 30), sqlite3_column_text(res, 14), - sqlite3_column_text(res, 31), - sqlite3_column_text(res, 15), + sqlite3_column_text(res, 27) ? (const char *) sqlite3_column_text(res, 27) : (char *) "Unknown", sqlite3_column_text(res, 28) ? (const char *) sqlite3_column_text(res, 28) : (char *) "Unknown", sqlite3_column_text(res, 29) ? (const char *) sqlite3_column_text(res, 29) : (char *) "Unknown", - sqlite3_column_text(res, 30) ? (const char *) sqlite3_column_text(res, 30) : (char *) "Unknown", - (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false", - (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false", - (long unsigned int)sqlite3_column_int64(res, 11), - (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false", - sqlite3_column_text(res, 16) ? (const char *) sqlite3_column_text(res, 16) : string2str(host->health.health_default_exec), - sqlite3_column_text(res, 17) ? (const char *) sqlite3_column_text(res, 17) : string2str(host->health.health_default_recipient), - sqlite3_column_int(res, 21), - sqlite3_column_text(res, 18), + (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false", + (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false", + (long unsigned int)sqlite3_column_int64(res, 10), + (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false", + sqlite3_column_text(res, 15) ? (const char *) sqlite3_column_text(res, 15) : string2str(host->health.health_default_exec), + sqlite3_column_text(res, 16) ? (const char *) sqlite3_column_text(res, 16) : string2str(host->health.health_default_recipient), + sqlite3_column_int(res, 20), + sqlite3_column_text(res, 17) ? (const char *) sqlite3_column_text(res, 17) : (char *) "Unknown", edit_command, - sqlite3_column_text(res, 19), + sqlite3_column_text(res, 18), + (long unsigned int)sqlite3_column_int64(res, 6), (long unsigned int)sqlite3_column_int64(res, 7), (long unsigned int)sqlite3_column_int64(res, 8), - (long unsigned int)sqlite3_column_int64(res, 9), + rrdcalc_status2string(sqlite3_column_int(res, 21)), rrdcalc_status2string(sqlite3_column_int(res, 22)), - rrdcalc_status2string(sqlite3_column_int(res, 23)), - sqlite3_column_int(res, 24), - (long unsigned int)sqlite3_column_int64(res, 12), + sqlite3_column_int(res, 23), + (long unsigned int)sqlite3_column_int64(res, 11), + (unsigned int)sqlite3_column_int64(res, 4), (unsigned int)sqlite3_column_int64(res, 5), - (unsigned int)sqlite3_column_int64(res, 6), - sqlite3_column_type(res, 25) == SQLITE_NULL ? "-" : format_value_and_unit(new_value_string, 100, sqlite3_column_double(res, 25), (char *) sqlite3_column_text(res, 19), -1), - sqlite3_column_type(res, 26) == SQLITE_NULL ? 
"-" : format_value_and_unit(old_value_string, 100, sqlite3_column_double(res, 26), (char *) sqlite3_column_text(res, 19), -1), - (long unsigned int)sqlite3_column_int64(res, 27), - (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"); + sqlite3_column_type(res, 24) == SQLITE_NULL ? "-" : format_value_and_unit(new_value_string, 100, sqlite3_column_double(res, 24), (char *) sqlite3_column_text(res, 18), -1), + sqlite3_column_type(res, 25) == SQLITE_NULL ? "-" : format_value_and_unit(old_value_string, 100, sqlite3_column_double(res, 25), (char *) sqlite3_column_text(res, 18), -1), + (long unsigned int)sqlite3_column_int64(res, 26), + (sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"); - health_string2json(wb, "\t\t", "info", (char *) sqlite3_column_text(res, 20), ",\n"); + health_string2json(wb, "\t\t", "info", (char *) sqlite3_column_text(res, 19), ",\n"); - if(unlikely(sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { + if(unlikely(sqlite3_column_int64(res, 9) & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); } buffer_strcat(wb, "\t\t\"value\":"); - if (sqlite3_column_type(res, 25) == SQLITE_NULL) + if (sqlite3_column_type(res, 24) == SQLITE_NULL) buffer_strcat(wb, "null"); else - buffer_print_netdata_double(wb, sqlite3_column_double(res, 25)); + buffer_print_netdata_double(wb, sqlite3_column_double(res, 24)); buffer_strcat(wb, ",\n"); buffer_strcat(wb, "\t\t\"old_value\":"); - if (sqlite3_column_type(res, 26) == SQLITE_NULL) + if (sqlite3_column_type(res, 25) == SQLITE_NULL) buffer_strcat(wb, "null"); else - buffer_print_netdata_double(wb, sqlite3_column_double(res, 26)); + buffer_print_netdata_double(wb, sqlite3_column_double(res, 25)); buffer_strcat(wb, "\n"); buffer_strcat(wb, "\t}"); @@ -1381,3 +1481,609 @@ void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char * buffer_free(command); } + +#define SQL_COPY_HEALTH_LOG(table) "INSERT OR IGNORE INTO health_log (host_id, alarm_id, config_hash_id, name, chart, family, exec, recipient, units, chart_context) SELECT ?1, alarm_id, config_hash_id, name, chart, family, exec, recipient, units, chart_context from %s;", table +#define SQL_COPY_HEALTH_LOG_DETAIL(table) "INSERT INTO health_log_detail (unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id, host_id) SELECT unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, now_usec(1), ?1 from %s;", table +#define SQL_UPDATE_HEALTH_LOG_DETAIL_TRANSITION_ID "update health_log_detail set transition_id = uuid_random() where transition_id is null;" +#define SQL_UPDATE_HEALTH_LOG_DETAIL_HEALTH_LOG_ID "update health_log_detail set health_log_id = (select health_log_id from health_log where host_id = ?1 and alarm_id = health_log_detail.alarm_id) where health_log_id is null and host_id = ?2;" +#define SQL_UPDATE_HEALTH_LOG_LAST_TRANSITION_ID "update health_log set last_transition_id = (select transition_id from health_log_detail where health_log_id = health_log.health_log_id and alarm_id = health_log.alarm_id group by (alarm_id) having max(alarm_event_id)) 
where host_id = ?1;" +int health_migrate_old_health_log_table(char *table) { + if (!table) + return 0; + + //table should contain guid. We need to + //keep it in the new table along with it's data + //health_log_XXXXXXXX_XXXX_XXXX_XXXX_XXXXXXXXXXXX + if (strnlen(table, 46) != 46) { + return 0; + } + + char *uuid_from_table = strdupz(table + 11); + uuid_t uuid; + if (uuid_parse_fix(uuid_from_table, uuid)) { + freez(uuid_from_table); + return 0; + } + + int rc; + char command[MAX_HEALTH_SQL_SIZE + 1]; + sqlite3_stmt *res = NULL; + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COPY_HEALTH_LOG(table)); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to copy health log, rc = %d", rc); + freez(uuid_from_table); + return 0; + } + + rc = sqlite3_bind_blob(res, 1, &uuid, sizeof(uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to copy health log table, rc = %d", rc); + freez(uuid_from_table); + return 0; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("Failed to execute SQL_COPY_HEALTH_LOG, rc = %d", rc); + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to copy health log table, rc = %d", rc); + freez(uuid_from_table); + } + + //detail + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COPY_HEALTH_LOG_DETAIL(table)); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to copy health log detail, rc = %d", rc); + return 0; + } + + rc = sqlite3_bind_blob(res, 1, &uuid, sizeof(uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to copy health log detail, rc = %d", rc); + return 0; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("Failed to execute SQL_COPY_HEALTH_LOG_DETAIL, rc = %d", rc); + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to copy health log detail table, rc = %d", rc); + return 0; + } + + //update transition ids + rc = sqlite3_prepare_v2(db_meta, SQL_UPDATE_HEALTH_LOG_DETAIL_TRANSITION_ID, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to update health log detail with transition ids, rc = %d", rc); + return 0; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("Failed to execute SQL_UPDATE_HEALTH_LOG_DETAIL_TRANSITION_ID, rc = %d", rc); + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to update health log detail table with transition ids, rc = %d", rc); + return 0; + } + + //update health_log_id + rc = sqlite3_prepare_v2(db_meta, SQL_UPDATE_HEALTH_LOG_DETAIL_HEALTH_LOG_ID, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to update health log detail with health log ids, rc = %d", rc); + return 0; + } + + rc = sqlite3_bind_blob(res, 1, &uuid, sizeof(uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to update health log detail with health log ids, rc = %d", rc); + return 0; + } + + rc = sqlite3_bind_blob(res, 2, &uuid, sizeof(uuid), 
+    if (unlikely(rc != SQLITE_OK)) {
+        rc = sqlite3_finalize(res);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to finalize statement to update health log detail with health log ids, rc = %d", rc);
+        return 0;
+    }
+
+    rc = execute_insert(res);
+    if (unlikely(rc != SQLITE_DONE)) {
+        error_report("Failed to execute SQL_UPDATE_HEALTH_LOG_DETAIL_HEALTH_LOG_ID, rc = %d", rc);
+        rc = sqlite3_finalize(res);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to finalize statement to update health log detail table with health log ids, rc = %d", rc);
+    }
+
+    // update last transition id
+    rc = sqlite3_prepare_v2(db_meta, SQL_UPDATE_HEALTH_LOG_LAST_TRANSITION_ID, -1, &res, 0);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to prepare statement to update health log with last transition id, rc = %d", rc);
+        return 0;
+    }
+
+    rc = sqlite3_bind_blob(res, 1, &uuid, sizeof(uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        rc = sqlite3_finalize(res);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to finalize statement to update health log with last transition id, rc = %d", rc);
+        return 0;
+    }
+
+    rc = execute_insert(res);
+    if (unlikely(rc != SQLITE_DONE)) {
+        error_report("Failed to execute SQL_UPDATE_HEALTH_LOG_LAST_TRANSITION_ID, rc = %d", rc);
+        rc = sqlite3_finalize(res);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to finalize statement to update health log table with last transition id, rc = %d", rc);
+    }
+
+    return 1;
+}
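
health_migrate_old_health_log_table() handles one legacy table at a time; discovering the health_log_<uuid> tables is left to its caller. A hypothetical driver, for illustration only (the actual call site lives elsewhere in this commit):

static void example_migrate_all_legacy_tables(void)
{
    sqlite3_stmt *res = NULL;

    // legacy per-host tables are named health_log_<uuid with underscores>;
    // the 46-character length check inside the migration routine filters
    // out other matches of this pattern, such as health_log_detail
    if (sqlite3_prepare_v2(db_meta,
            "SELECT name FROM sqlite_master WHERE type = 'table' AND name LIKE 'health_log_%';",
            -1, &res, 0) != SQLITE_OK)
        return;

    while (sqlite3_step(res) == SQLITE_ROW) {
        const char *t = (const char *) sqlite3_column_text(res, 0);
        if (!t)
            continue;
        char *table = strdupz(t); // copy: the column text is only valid until the next step
        (void) health_migrate_old_health_log_table(table);
        freez(table);
    }

    sqlite3_finalize(res);
}

+
+#define SQL_GET_ALARM_ID "select alarm_id, health_log_id from health_log where host_id = @host_id and chart = @chart and name = @name and config_hash_id = @config_hash_id"
+#define SQL_GET_EVENT_ID "select max(alarm_event_id) + 1 from health_log_detail where health_log_id = @health_log_id and alarm_id = @alarm_id"
+uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id)
+{
+    int rc = 0;
+    sqlite3_stmt *res = NULL;
+    uint32_t alarm_id = 0;
+    uint64_t health_log_id = 0;
+
+    rc = sqlite3_prepare_v2(db_meta, SQL_GET_ALARM_ID, -1, &res, 0);
+    if (rc != SQLITE_OK) {
+        error_report("Failed to prepare statement when trying to get an alarm id");
+        return alarm_id;
+    }
+
+    rc = sqlite3_bind_blob(res, 1, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind host_id parameter for SQL_GET_ALARM_ID.");
+        sqlite3_finalize(res);
+        return alarm_id;
+    }
+
+    rc = sqlite3_bind_string_or_null(res, chart, 2);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind chart parameter for SQL_GET_ALARM_ID.");
+        sqlite3_finalize(res);
+        return alarm_id;
+    }
+
+    rc = sqlite3_bind_string_or_null(res, name, 3);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind name parameter for SQL_GET_ALARM_ID.");
+        sqlite3_finalize(res);
+        return alarm_id;
+    }
+
+    rc = sqlite3_bind_blob(res, 4, config_hash_id, sizeof(*config_hash_id), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind config_hash_id parameter for SQL_GET_ALARM_ID.");
+        sqlite3_finalize(res);
+        return alarm_id;
+    }
+
+    while (sqlite3_step_monitored(res) == SQLITE_ROW) {
+        alarm_id = (uint32_t) sqlite3_column_int64(res, 0);
+        health_log_id = (uint64_t) sqlite3_column_int64(res, 1);
+    }
+
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to finalize the statement while getting an alarm id.");
+
+    if (alarm_id) {
+        rc = sqlite3_prepare_v2(db_meta, SQL_GET_EVENT_ID, -1, &res, 0);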
+        if (rc != SQLITE_OK) {
+            error_report("Failed to prepare statement when trying to get an event id");
+            return alarm_id;
+        }
+
+        rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) health_log_id);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to bind health_log_id parameter for SQL_GET_EVENT_ID.");
+            sqlite3_finalize(res);
+            return alarm_id;
+        }
+
+        rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) alarm_id);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to bind alarm_id parameter for SQL_GET_EVENT_ID.");
+            sqlite3_finalize(res);
+            return alarm_id;
+        }
+
+        while (sqlite3_step_monitored(res) == SQLITE_ROW) {
+            *next_event_id = (uint32_t) sqlite3_column_int64(res, 0);
+        }
+
+        rc = sqlite3_finalize(res);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to finalize the statement while getting an event id.");
+    }
+
+    return alarm_id;
+}
+
+#define SQL_GET_ALARM_ID_FROM_TRANSITION_ID "SELECT hld.alarm_id, hl.host_id, hl.chart_context FROM " \
+    "health_log_detail hld, health_log hl WHERE hld.transition_id = @transition_id " \
+    "and hld.health_log_id = hl.health_log_id"
+
+bool sql_find_alert_transition(const char *transition, void (*cb)(const char *machine_guid, const char *context, time_t alert_id, void *data), void *data)
+{
+    static __thread sqlite3_stmt *res = NULL;
+
+    char machine_guid[UUID_STR_LEN];
+
+    int rc;
+    uuid_t transition_uuid;
+    if (uuid_parse(transition, transition_uuid))
+        return false;
+
+    if (unlikely(!res)) {
+        rc = prepare_statement(db_meta, SQL_GET_ALARM_ID_FROM_TRANSITION_ID, &res);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to prepare statement when trying to get transition id");
+            return false;
+        }
+    }
+
+    bool ok = false;
+
+    rc = sqlite3_bind_blob(res, 1, &transition_uuid, sizeof(transition_uuid), SQLITE_STATIC);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind transition");
+        goto fail;
+    }
+
+    while (sqlite3_step_monitored(res) == SQLITE_ROW) {
+        ok = true;
+        uuid_unparse_lower(*(uuid_t *) sqlite3_column_blob(res, 1), machine_guid);
+        cb(machine_guid, (const char *) sqlite3_column_text(res, 2), sqlite3_column_int(res, 0), data);
+    }
+
+fail:
+    rc = sqlite3_reset(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to reset the statement when trying to find transition");
+
+    return ok;
+}
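
sql_find_alert_transition() resolves a transition UUID back to the node and chart context that produced it, delivering each match through the callback. A hypothetical caller (the callback name and log format are illustrative):

static void example_transition_cb(const char *machine_guid, const char *context, time_t alert_id, void *data)
{
    (void) data;
    netdata_log_info("transition belongs to node %s, context %s, alarm id %lld",
                     machine_guid, context, (long long) alert_id);
}

// given a transition id in string form, e.g. taken from an /api/v2 query parameter:
// bool found = sql_find_alert_transition(transition_str, example_transition_cb, NULL);

+
+#define SQL_BUILD_ALERT_TRANSITION "CREATE TEMP TABLE IF NOT EXISTS v_%p (host_id blob)"
+
+#define SQL_POPULATE_TEMP_ALERT_TRANSITION_TABLE "INSERT INTO v_%p (host_id) VALUES (@host_id)"
+
+#define SQL_SEARCH_ALERT_TRANSITION_SELECT "SELECT " \
+    "h.host_id, h.alarm_id, h.config_hash_id, h.name, h.chart, h.family, h.recipient, h.units, h.exec, " \
+    "h.chart_context, d.when_key, d.duration, d.non_clear_duration, d.flags, d.delay_up_to_timestamp, " \
+    "d.info, d.exec_code, d.new_status, d.old_status, d.delay, d.new_value, d.old_value, d.last_repeat, " \
+    "d.transition_id, d.global_id, ah.class, ah.type, ah.component, d.exec_run_timestamp"
+
+#define SQL_SEARCH_ALERT_TRANSITION_COMMON_WHERE \
+    "h.config_hash_id = ah.hash_id AND h.health_log_id = d.health_log_id"
+
+#define SQL_SEARCH_ALERT_TRANSITION SQL_SEARCH_ALERT_TRANSITION_SELECT " FROM health_log h, health_log_detail d, v_%p t, alert_hash ah " \
+    " WHERE h.host_id = t.host_id AND " SQL_SEARCH_ALERT_TRANSITION_COMMON_WHERE " AND ( d.new_status > 2 OR d.old_status > 2 ) AND d.global_id BETWEEN @after AND @before "
+
+#define SQL_SEARCH_ALERT_TRANSITION_DIRECT SQL_SEARCH_ALERT_TRANSITION_SELECT " FROM health_log h, health_log_detail d, alert_hash ah " \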
+    " WHERE " SQL_SEARCH_ALERT_TRANSITION_COMMON_WHERE " AND transition_id = @transition "
+
+void sql_alert_transitions(
+    DICTIONARY *nodes,
+    time_t after,
+    time_t before,
+    const char *context,
+    const char *alert_name,
+    const char *transition,
+    void (*cb)(struct sql_alert_transition_data *, void *),
+    void *data,
+    bool debug __maybe_unused)
+{
+    uuid_t transition_uuid;
+    char sql[512];
+    int rc;
+    sqlite3_stmt *res = NULL;
+    BUFFER *command = NULL;
+
+    if (unlikely(!nodes))
+        return;
+
+    if (transition) {
+        if (uuid_parse(transition, transition_uuid)) {
+            error_report("Invalid transition given %s", transition);
+            return;
+        }
+
+        rc = sqlite3_prepare_v2(db_meta, SQL_SEARCH_ALERT_TRANSITION_DIRECT, -1, &res, 0);
+        if (unlikely(rc != SQLITE_OK)) {
+            // check the prepare result before binding
+            error_report("Failed to prepare statement for SQL_SEARCH_ALERT_TRANSITION_DIRECT");
+            return;
+        }
+
+        rc = sqlite3_bind_blob(res, 1, &transition_uuid, sizeof(transition_uuid), SQLITE_STATIC);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to bind transition_id parameter");
+            goto fail;
+        }
+        goto run_query;
+    }
+
+    snprintfz(sql, 511, SQL_BUILD_ALERT_TRANSITION, nodes);
+    rc = db_execute(db_meta, sql);
+    if (rc)
+        return;
+
+    snprintfz(sql, 511, SQL_POPULATE_TEMP_ALERT_TRANSITION_TABLE, nodes);
+
+    // Prepare statement to add things
+    rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to prepare statement to INSERT into v_%p", nodes);
+        goto fail_only_drop;
+    }
+
+    void *t;
+    dfe_start_read(nodes, t) {
+        uuid_t host_uuid;
+        uuid_parse(t_dfe.name, host_uuid);
+
+        rc = sqlite3_bind_blob(res, 1, &host_uuid, sizeof(host_uuid), SQLITE_STATIC);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to bind host_id parameter.");
+
+        rc = sqlite3_step_monitored(res);
+        if (rc != SQLITE_DONE)
+            error_report("Error while populating temp table");
+
+        rc = sqlite3_reset(res);
+        if (rc != SQLITE_OK)
+            error_report("Error while resetting parameters");
+    }
+    dfe_done(t);
+
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK)) {
+        // log error but continue
+        error_report("Failed to finalize statement for sql_alert_transitions temp table population");
+    }
+
+    command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL);
+
+    buffer_sprintf(command, SQL_SEARCH_ALERT_TRANSITION, nodes);
+
+    if (context)
+        buffer_sprintf(command, " AND h.chart_context = @context");
+
+    if (alert_name)
+        buffer_sprintf(command, " AND h.name = @alert_name");
+
+    buffer_strcat(command, " ORDER BY d.global_id DESC");
+
+    rc = sqlite3_prepare_v2(db_meta, buffer_tostring(command), -1, &res, 0);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to prepare statement sql_alert_transitions");
+        goto fail_only_drop;
+    }
+
+    int param = 1;
+    rc = sqlite3_bind_int64(res, param++, (sqlite3_int64)(after * USEC_PER_SEC));
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind after parameter");
+        goto fail;
+    }
+
+    rc = sqlite3_bind_int64(res, param++, (sqlite3_int64)(before * USEC_PER_SEC));
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to bind before parameter");
+        goto fail;
+    }
+
+    if (context) {
+        rc = sqlite3_bind_text(res, param++, context, -1, SQLITE_STATIC);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to bind context parameter");
+            goto fail;
+        }
+    }
+
+    if (alert_name) {
+        rc = sqlite3_bind_text(res, param++, alert_name, -1, SQLITE_STATIC);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to bind alert_name parameter");
+            goto fail;
+        }
+    }
+
+run_query:;
+
+    struct sql_alert_transition_data atd = { 0 };
+
+    while (sqlite3_step(res) == SQLITE_ROW) {
+        atd.host_id = (uuid_t *) sqlite3_column_blob(res, 0);
+        atd.alarm_id = sqlite3_column_int64(res, 1);
+        atd.config_hash_id = (uuid_t *)sqlite3_column_blob(res, 2);
+        atd.alert_name = (const char *) sqlite3_column_text(res, 3);
+        atd.chart = (const char *) sqlite3_column_text(res, 4);
+        atd.chart_name = (const char *) sqlite3_column_text(res, 4); // FIXME don't copy the id, find the name
+        atd.family = (const char *) sqlite3_column_text(res, 5);
+        atd.recipient = (const char *) sqlite3_column_text(res, 6);
+        atd.units = (const char *) sqlite3_column_text(res, 7);
+        atd.exec = (const char *) sqlite3_column_text(res, 8);
+        atd.chart_context = (const char *) sqlite3_column_text(res, 9);
+        atd.when_key = sqlite3_column_int64(res, 10);
+        atd.duration = sqlite3_column_int64(res, 11);
+        atd.non_clear_duration = sqlite3_column_int64(res, 12);
+        atd.flags = sqlite3_column_int64(res, 13);
+        atd.delay_up_to_timestamp = sqlite3_column_int64(res, 14);
+        atd.info = (const char *) sqlite3_column_text(res, 15);
+        atd.exec_code = sqlite3_column_int(res, 16);
+        atd.new_status = sqlite3_column_int(res, 17);
+        atd.old_status = sqlite3_column_int(res, 18);
+        atd.delay = (int) sqlite3_column_int(res, 19);
+        atd.new_value = (NETDATA_DOUBLE) sqlite3_column_double(res, 20);
+        atd.old_value = (NETDATA_DOUBLE) sqlite3_column_double(res, 21);
+        atd.last_repeat = sqlite3_column_int64(res, 22);
+        atd.transition_id = (uuid_t *) sqlite3_column_blob(res, 23);
+        atd.global_id = sqlite3_column_int64(res, 24);
+        atd.classification = (const char *) sqlite3_column_text(res, 25);
+        atd.type = (const char *) sqlite3_column_text(res, 26);
+        atd.component = (const char *) sqlite3_column_text(res, 27);
+        atd.exec_run_timestamp = sqlite3_column_int64(res, 28);
+
+        cb(&atd, data);
+    }
+
+fail:
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to finalize statement for sql_alert_transitions");
+
+fail_only_drop:
+    if (likely(!transition)) {
+        (void)snprintfz(sql, 511, "DROP TABLE IF EXISTS v_%p", nodes);
+        (void)db_execute(db_meta, sql);
+        buffer_free(command);
+    }
+}
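
The temp-table dance above (CREATE TEMP TABLE v_%p, one INSERT per node, join, DROP) lets a single query cover an arbitrary set of hosts. A hypothetical caller querying the last hour of transitions for one node; the dictionary is keyed by machine GUID strings, as the dfe_start_read() loop expects, and every name in this snippet is illustrative:

static void example_print_transition(struct sql_alert_transition_data *t, void *data)
{
    (void) data;
    netdata_log_info("alert '%s': status %d -> %d",
                     t->alert_name, (int) t->old_status, (int) t->new_status);
}

static void example_query_recent_transitions(RRDHOST *host)
{
    DICTIONARY *nodes = dictionary_create(DICT_OPTION_SINGLE_THREADED);
    char guid[UUID_STR_LEN];
    uuid_unparse_lower(host->host_uuid, guid);
    dictionary_set(nodes, guid, NULL, 0);

    time_t now = now_realtime_sec();
    sql_alert_transitions(nodes, now - 3600, now, NULL, NULL, NULL,
                          example_print_transition, NULL, false);

    dictionary_destroy(nodes);
}

+
+#define SQL_BUILD_CONFIG_TARGET_LIST "CREATE TEMP TABLE IF NOT EXISTS c_%p (hash_id blob)"
+
+#define SQL_POPULATE_TEMP_CONFIG_TARGET_TABLE "INSERT INTO c_%p (hash_id) VALUES (@hash_id)"
+
+#define SQL_SEARCH_CONFIG_LIST "SELECT ah.hash_id, alarm, template, on_key, class, component, type, os, hosts, lookup, every, " \
+    " units, calc, families, plugin, module, charts, green, red, warn, crit, " \
+    " exec, to_key, info, delay, options, repeat, host_labels, p_db_lookup_dimensions, p_db_lookup_method, " \
+    " p_db_lookup_options, p_db_lookup_after, p_db_lookup_before, p_update_every, source, chart_labels " \
+    " FROM alert_hash ah, c_%p t where ah.hash_id = t.hash_id"
+
+int sql_get_alert_configuration(
+    DICTIONARY *configs,
+    void (*cb)(struct sql_alert_config_data *, void *),
+    void *data,
+    bool debug __maybe_unused)
+{
+    int added = -1;
+    char sql[512];
+    int rc;
+    sqlite3_stmt *res = NULL;
+    BUFFER *command = NULL;
+
+    if (unlikely(!configs))
+        return added;
+
+    snprintfz(sql, 511, SQL_BUILD_CONFIG_TARGET_LIST, configs);
+    rc = db_execute(db_meta, sql);
+    if (rc)
+        return added;
+
+    snprintfz(sql, 511, SQL_POPULATE_TEMP_CONFIG_TARGET_TABLE, configs);
+
+    // Prepare statement to add things
+    rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to prepare statement to INSERT into c_%p", configs);
+        goto fail_only_drop;
+    }
+
+    void *t;
+    dfe_start_read(configs, t) {
+        uuid_t hash_id;
+        uuid_parse(t_dfe.name, hash_id);
+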
+        rc = sqlite3_bind_blob(res, 1, &hash_id, sizeof(hash_id), SQLITE_STATIC);
+        if (unlikely(rc != SQLITE_OK))
+            error_report("Failed to bind hash_id parameter.");
+
+        rc = sqlite3_step_monitored(res);
+        if (rc != SQLITE_DONE)
+            error_report("Error while populating temp table");
+
+        rc = sqlite3_reset(res);
+        if (rc != SQLITE_OK)
+            error_report("Error while resetting parameters");
+    }
+    dfe_done(t);
+
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK)) {
+        // log error but continue
+        error_report("Failed to finalize statement for sql_get_alert_configuration temp table population");
+    }
+
+    command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL);
+
+    buffer_sprintf(command, SQL_SEARCH_CONFIG_LIST, configs);
+
+    rc = sqlite3_prepare_v2(db_meta, buffer_tostring(command), -1, &res, 0);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to prepare statement sql_get_alert_configuration");
+        goto fail_only_drop;
+    }
+
+    struct sql_alert_config_data acd = { 0 };
+
+    added = 0;
+    int param;
+    while (sqlite3_step(res) == SQLITE_ROW) {
+        param = 0;
+        acd.config_hash_id = (uuid_t *) sqlite3_column_blob(res, param++);
+        acd.name = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.on_template = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.on_key = (const char *) sqlite3_column_text(res, param++);
+        acd.classification = (const char *) sqlite3_column_text(res, param++);
+        acd.component = (const char *) sqlite3_column_text(res, param++);
+        acd.type = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.os = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.hosts = (const char *) sqlite3_column_text(res, param++);
+        acd.value.db.lookup = (const char *) sqlite3_column_text(res, param++);
+        acd.value.every = (const char *) sqlite3_column_text(res, param++);
+        acd.value.units = (const char *) sqlite3_column_text(res, param++);
+        acd.value.calc = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.families = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.plugin = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.module = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.charts = (const char *) sqlite3_column_text(res, param++);
+        acd.status.green = (const char *) sqlite3_column_text(res, param++);
+        acd.status.red = (const char *) sqlite3_column_text(res, param++);
+        acd.status.warn = (const char *) sqlite3_column_text(res, param++);
+        acd.status.crit = (const char *) sqlite3_column_text(res, param++);
+        acd.notification.exec = (const char *) sqlite3_column_text(res, param++);
+        acd.notification.to_key = (const char *) sqlite3_column_text(res, param++);
+        acd.info = (const char *) sqlite3_column_text(res, param++);
+        acd.notification.delay = (const char *) sqlite3_column_text(res, param++);
+        acd.notification.options = (const char *) sqlite3_column_text(res, param++);
+        acd.notification.repeat = (const char *) sqlite3_column_text(res, param++);
+        acd.selectors.host_labels = (const char *) sqlite3_column_text(res, param++);
+        acd.value.db.dimensions = (const char *) sqlite3_column_text(res, param++);
+        acd.value.db.method = (const char *) sqlite3_column_text(res, param++);
+        acd.value.db.options = (uint32_t) sqlite3_column_int(res, param++);
+        acd.value.db.after = (int32_t) sqlite3_column_int(res, param++);
+        acd.value.db.before = (int32_t) sqlite3_column_int(res, param++);
+        acd.value.update_every = (int32_t) sqlite3_column_int(res, param++);
+        acd.source = (const char *)
sqlite3_column_text(res, param++); + acd.selectors.chart_labels = (const char *) sqlite3_column_text(res, param++); + + cb(&acd, data); + added++; + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement for sql_get_alert_configuration"); + +fail_only_drop: + (void)snprintfz(sql, 511, "DROP TABLE IF EXISTS c_%p", configs); + (void)db_execute(db_meta, sql); + buffer_free(command); + return added; +} + diff --git a/database/sqlite/sqlite_health.h b/database/sqlite/sqlite_health.h index 96d090b54..55e523d2f 100644 --- a/database/sqlite/sqlite_health.h +++ b/database/sqlite/sqlite_health.h @@ -5,9 +5,10 @@ #include "../../daemon/common.h" #include "sqlite3.h" +struct sql_alert_transition_data; +struct sql_alert_config_data; extern sqlite3 *db_meta; void sql_health_alarm_log_load(RRDHOST *host); -int sql_create_health_log_table(RRDHOST *host); void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae); void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae); void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); @@ -16,4 +17,24 @@ int alert_hash_and_store_config(uuid_t hash_id, struct alert_config *cfg, int st void sql_aclk_alert_clean_dead_entries(RRDHOST *host); int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status); void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); +int health_migrate_old_health_log_table(char *table); +uint32_t sql_get_alarm_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id); +void sql_alert_transitions( + DICTIONARY *nodes, + time_t after, + time_t before, + const char *context, + const char *alert_name, + const char *transition, + void (*cb)(struct sql_alert_transition_data *, void *), + void *data, + bool debug); + +int sql_get_alert_configuration( + DICTIONARY *configs, + void (*cb)(struct sql_alert_config_data *, void *), + void *data, + bool debug __maybe_unused); + +bool sql_find_alert_transition(const char *transition, void (*cb)(const char *machine_guid, const char *context, time_t alert_id, void *data), void *data); #endif //NETDATA_SQLITE_HEALTH_H diff --git a/database/sqlite/sqlite_metadata.c b/database/sqlite/sqlite_metadata.c index 607d789a5..697772bf5 100644 --- a/database/sqlite/sqlite_metadata.c +++ b/database/sqlite/sqlite_metadata.c @@ -37,8 +37,8 @@ #define SELECT_DIMENSION_LIST "SELECT dim_id, rowid FROM dimension WHERE rowid > @row_id" -#define STORE_HOST_INFO "INSERT OR REPLACE INTO host_info (host_id, system_key, system_value, date_created) VALUES " -#define STORE_HOST_INFO_VALUES "(u2h('%s'), '%s','%s', unixepoch())" +#define SQL_STORE_HOST_SYSTEM_INFO_VALUES "INSERT OR REPLACE INTO host_info (host_id, system_key, system_value, date_created) VALUES " \ + "(@uuid, @name, @value, unixepoch())" #define MIGRATE_LOCALHOST_TO_NEW_MACHINE_GUID \ "UPDATE chart SET host_id = @host_id WHERE host_id in (SELECT host_id FROM host where host_id <> @host_id and hops = 0);" @@ -378,59 +378,90 @@ bind_fail: return 1; } -static void add_host_sysinfo_key_value(const char *name, const char *value, void *data) +static int add_host_sysinfo_key_value(const char *name, const char *value, uuid_t *uuid) { - struct query_build *lb = data; + static __thread sqlite3_stmt *res = NULL; + int rc, param = 0; - if (unlikely(!value)) - return; + if (unlikely(!db_meta)) { + if (default_rrd_memory_mode != RRD_MEMORY_MODE_DBENGINE) + return 0; + error_report("Database has 
not been initialized"); + return 0; + } - if (unlikely(!lb->count)) - buffer_sprintf( - lb->sql, STORE_HOST_INFO); - else - buffer_strcat(lb->sql, ", "); - buffer_sprintf(lb->sql, STORE_HOST_INFO_VALUES, lb->uuid_str, name, value); - lb->count++; + if (unlikely((!res))) { + rc = prepare_statement(db_meta, SQL_STORE_HOST_SYSTEM_INFO_VALUES, &res); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to store host info values, rc = %d", rc); + return 0; + } + } + + rc = sqlite3_bind_blob(res, ++param, uuid, sizeof(*uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + + rc = bind_text_null(res, ++param, name, 0); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + + rc = bind_text_null(res, ++param, value ? value : "unknown", 0); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + + int store_rc = sqlite3_step_monitored(res); + if (unlikely(store_rc != SQLITE_DONE)) + error_report("Failed to store host info value %s, rc = %d", name, rc); + + rc = sqlite3_reset(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to store host info value %s, rc = %d", name, rc); + + return store_rc == SQLITE_DONE; +bind_fail: + error_report("Failed to bind %d parameter to store host info values %s, rc = %d", param, name, rc); + rc = sqlite3_reset(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to store host info values %s, rc = %d", name, rc); + return 0; } -static bool build_host_system_info_statements(RRDHOST *host, BUFFER *work_buffer) +static bool store_host_systeminfo(RRDHOST *host) { struct rrdhost_system_info *system_info = host->system_info; if (unlikely(!system_info)) return false; - buffer_flush(work_buffer); - struct query_build key_data = {.sql = work_buffer, .count = 0}; - uuid_unparse_lower(host->host_uuid, key_data.uuid_str); - - add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_NAME", system_info->container_os_name, &key_data); - add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_ID", system_info->container_os_id, &key_data); - add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_ID_LIKE", system_info->container_os_id_like, &key_data); - add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_VERSION", system_info->container_os_version, &key_data); - add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_VERSION_ID", system_info->container_os_version_id, &key_data); - add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_DETECTION", system_info->host_os_detection, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_OS_NAME", system_info->host_os_name, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_OS_ID", system_info->host_os_id, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_OS_ID_LIKE", system_info->host_os_id_like, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_OS_VERSION", system_info->host_os_version, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_OS_VERSION_ID", system_info->host_os_version_id, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_OS_DETECTION", system_info->host_os_detection, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_KERNEL_NAME", system_info->kernel_name, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT", system_info->host_cores, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_CPU_FREQ", system_info->host_cpu_freq, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_TOTAL_RAM", system_info->host_ram_total, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_TOTAL_DISK_SIZE", 
system_info->host_disk_space, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_KERNEL_VERSION", system_info->kernel_version, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_ARCHITECTURE", system_info->architecture, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_VIRTUALIZATION", system_info->virtualization, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_VIRT_DETECTION", system_info->virt_detection, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_CONTAINER", system_info->container, &key_data); - add_host_sysinfo_key_value("NETDATA_SYSTEM_CONTAINER_DETECTION", system_info->container_detection, &key_data); - add_host_sysinfo_key_value("NETDATA_HOST_IS_K8S_NODE", system_info->is_k8s_node, &key_data); - - return true; + int ret = 0; + + ret += add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_NAME", system_info->container_os_name, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_ID", system_info->container_os_id, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_ID_LIKE", system_info->container_os_id_like, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_VERSION", system_info->container_os_version, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_VERSION_ID", system_info->container_os_version_id, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_CONTAINER_OS_DETECTION", system_info->host_os_detection, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_OS_NAME", system_info->host_os_name, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_OS_ID", system_info->host_os_id, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_OS_ID_LIKE", system_info->host_os_id_like, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_OS_VERSION", system_info->host_os_version, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_OS_VERSION_ID", system_info->host_os_version_id, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_OS_DETECTION", system_info->host_os_detection, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_KERNEL_NAME", system_info->kernel_name, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT", system_info->host_cores, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_CPU_FREQ", system_info->host_cpu_freq, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_TOTAL_RAM", system_info->host_ram_total, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_TOTAL_DISK_SIZE", system_info->host_disk_space, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_KERNEL_VERSION", system_info->kernel_version, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_ARCHITECTURE", system_info->architecture, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_VIRTUALIZATION", system_info->virtualization, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_VIRT_DETECTION", system_info->virt_detection, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_CONTAINER", system_info->container, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_SYSTEM_CONTAINER_DETECTION", system_info->container_detection, &host->host_uuid); + ret += add_host_sysinfo_key_value("NETDATA_HOST_IS_K8S_NODE", system_info->is_k8s_node, &host->host_uuid); + + 
return !(24 == ret); } @@ -522,7 +553,7 @@ static int store_chart_metadata(RRDSET *st) if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_int(res, ++param, (int) st->entries); + rc = sqlite3_bind_int(res, ++param, (int) st->db.entries); if (unlikely(rc != SQLITE_OK)) goto bind_fail; @@ -665,7 +696,7 @@ static void check_dimension_metadata(struct metadata_wc *wc) uint32_t total_deleted= 0; uint64_t last_row_id = wc->row_id; - info("METADATA: Checking dimensions starting after row %"PRIu64, wc->row_id); + netdata_log_info("METADATA: Checking dimensions starting after row %"PRIu64, wc->row_id); while (sqlite3_step_monitored(res) == SQLITE_ROW && total_deleted < MAX_METADATA_CLEANUP) { if (unlikely(metadata_flag_check(wc, METADATA_FLAG_SHUTDOWN))) @@ -685,7 +716,7 @@ static void check_dimension_metadata(struct metadata_wc *wc) wc->check_metadata_after = now + METADATA_MAINTENANCE_RETRY; } else wc->row_id = 0; - info("METADATA: Checked %u, deleted %u -- will resume after row %"PRIu64" in %lld seconds", total_checked, total_deleted, wc->row_id, + netdata_log_info("METADATA: Checked %u, deleted %u -- will resume after row %"PRIu64" in %lld seconds", total_checked, total_deleted, wc->row_id, (long long)(wc->check_metadata_after - now)); skip_run: @@ -919,7 +950,7 @@ static void cleanup_finished_threads(struct host_context_load_thread *hclt, size || (wait && __atomic_load_n(&(hclt[index].busy), __ATOMIC_ACQUIRE))) { int rc = uv_thread_join(&(hclt[index].thread)); if (rc) - error("Failed to join thread, rc = %d",rc); + netdata_log_error("Failed to join thread, rc = %d",rc); __atomic_store_n(&(hclt[index].busy), false, __ATOMIC_RELEASE); __atomic_store_n(&(hclt[index].finished), false, __ATOMIC_RELEASE); } @@ -1067,23 +1098,15 @@ static bool metadata_scan_host(RRDHOST *host, uint32_t max_count, bool use_trans return more_to_do; } -static void store_host_and_system_info(RRDHOST *host, BUFFER *work_buffer, size_t *query_counter) +static void store_host_and_system_info(RRDHOST *host, size_t *query_counter) { - bool free_work_buffer = (NULL == work_buffer); - - if (unlikely(free_work_buffer)) - work_buffer = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); - - if (build_host_system_info_statements(host, work_buffer)) { - int rc = db_execute(db_meta, buffer_tostring(work_buffer)); - if (unlikely(rc)) { - error_report("METADATA: 'host:%s': Failed to store host updated information in the database", rrdhost_hostname(host)); - rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); - } - else { - if (likely(query_counter)) - (*query_counter)++; - } + if (unlikely(store_host_systeminfo(host))) { + error_report("METADATA: 'host:%s': Failed to store host updated system information in the database", rrdhost_hostname(host)); + rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE); + } + else { + if (likely(query_counter)) + (*query_counter)++; } if (unlikely(store_host_metadata(host))) { @@ -1094,9 +1117,6 @@ static void store_host_and_system_info(RRDHOST *host, BUFFER *work_buffer, size_ if (likely(query_counter)) (*query_counter)++; } - - if (unlikely(free_work_buffer)) - buffer_free(work_buffer); } // Worker thread to scan hosts for pending metadata to store @@ -1170,7 +1190,7 @@ static void start_metadata_hosts(uv_work_t *req __maybe_unused) } if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_INFO))) { rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_INFO); - store_host_and_system_info(host, work_buffer, 
 
 // Worker thread to scan hosts for pending metadata to store
@@ -1170,7 +1190,7 @@ static void start_metadata_hosts(uv_work_t *req __maybe_unused)
         }
         if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_METADATA_INFO))) {
             rrdhost_flag_clear(host, RRDHOST_FLAG_METADATA_INFO);
-            store_host_and_system_info(host, work_buffer, &query_counter);
+            store_host_and_system_info(host, &query_counter);
         }
 
         // For clarity
@@ -1224,27 +1244,27 @@ static void metadata_event_loop(void *arg)
     loop = wc->loop = mallocz(sizeof(uv_loop_t));
     ret = uv_loop_init(loop);
     if (ret) {
-        error("uv_loop_init(): %s", uv_strerror(ret));
+        netdata_log_error("uv_loop_init(): %s", uv_strerror(ret));
         goto error_after_loop_init;
     }
     loop->data = wc;
 
     ret = uv_async_init(wc->loop, &wc->async, async_cb);
     if (ret) {
-        error("uv_async_init(): %s", uv_strerror(ret));
+        netdata_log_error("uv_async_init(): %s", uv_strerror(ret));
         goto error_after_async_init;
     }
     wc->async.data = wc;
 
     ret = uv_timer_init(loop, &wc->timer_req);
     if (ret) {
-        error("uv_timer_init(): %s", uv_strerror(ret));
+        netdata_log_error("uv_timer_init(): %s", uv_strerror(ret));
        goto error_after_timer_init;
     }
     wc->timer_req.data = wc;
 
     fatal_assert(0 == uv_timer_start(&wc->timer_req, timer_cb, TIMER_INITIAL_PERIOD_MS, TIMER_REPEAT_PERIOD_MS));
 
-    info("Starting metadata sync thread with %d entries command queue", METADATA_CMD_Q_MAX_SIZE);
+    netdata_log_info("Starting metadata sync thread with %d entries command queue", METADATA_CMD_Q_MAX_SIZE);
 
     struct metadata_cmd cmd;
     memset(&cmd, 0, sizeof(cmd));
@@ -1309,7 +1329,7 @@ static void metadata_event_loop(void *arg)
                     break;
                 case METADATA_ADD_HOST_INFO:
                     host = (RRDHOST *) cmd.param[0];
-                    store_host_and_system_info(host, NULL, NULL);
+                    store_host_and_system_info(host, NULL);
                     break;
                 case METADATA_SCAN_HOSTS:
                     if (unlikely(metadata_flag_check(wc, METADATA_FLAG_SCANNING_HOSTS)))
@@ -1394,7 +1414,7 @@ static void metadata_event_loop(void *arg)
     freez(loop);
     worker_unregister();
 
-    info("METADATA: Shutting down event loop");
+    netdata_log_info("METADATA: Shutting down event loop");
     completion_mark_complete(&wc->init_complete);
     return;
 
@@ -1415,15 +1435,15 @@ void metadata_sync_shutdown(void)
     struct metadata_cmd cmd;
     memset(&cmd, 0, sizeof(cmd));
 
-    info("METADATA: Sending a shutdown command");
+    netdata_log_info("METADATA: Sending a shutdown command");
     cmd.opcode = METADATA_SYNC_SHUTDOWN;
     metadata_enq_cmd(&metasync_worker, &cmd);
 
     /* wait for metadata thread to shut down */
-    info("METADATA: Waiting for shutdown ACK");
+    netdata_log_info("METADATA: Waiting for shutdown ACK");
     completion_wait_for(&metasync_worker.init_complete);
     completion_destroy(&metasync_worker.init_complete);
-    info("METADATA: Shutdown complete");
+    netdata_log_info("METADATA: Shutdown complete");
 }
 
 void metadata_sync_shutdown_prepare(void)
@@ -1437,11 +1457,11 @@ void metadata_sync_shutdown_prepare(void)
     struct completion compl;
     completion_init(&compl);
 
-    info("METADATA: Sending a scan host command");
+    netdata_log_info("METADATA: Sending a scan host command");
     uint32_t max_wait_iterations = 2000;
     while (unlikely(metadata_flag_check(&metasync_worker, METADATA_FLAG_SCANNING_HOSTS)) && max_wait_iterations--) {
         if (max_wait_iterations == 1999)
-            info("METADATA: Current worker is running; waiting to finish");
+            netdata_log_info("METADATA: Current worker is running; waiting to finish");
         sleep_usec(1000);
     }
 
     cmd.opcode = METADATA_SCAN_HOSTS;
     cmd.completion = &compl;
     metadata_enq_cmd(&metasync_worker, &cmd);
 
-    info("METADATA: Waiting for host scan completion");
+    netdata_log_info("METADATA: Waiting for host scan completion");
     completion_wait_for(&compl);
     completion_destroy(&compl);
-    info("METADATA: Host scan complete; can continue with shutdown");
+    netdata_log_info("METADATA: Host scan complete; can continue with shutdown");
 }
 
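/*
 * A minimal standalone sketch (generic names, not the netdata sources) of the
 * staged-initialization pattern metadata_event_loop() follows above: each
 * libuv setup step checks its return code and, on failure, jumps to a label
 * that unwinds only the stages already completed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <uv.h>

static void async_cb(uv_async_t *handle) { (void)handle; }
static void timer_cb(uv_timer_t *handle) { (void)handle; }

int main(void) {
    uv_loop_t loop;
    uv_async_t async;
    uv_timer_t timer;
    int ret;

    ret = uv_loop_init(&loop);
    if (ret) {
        fprintf(stderr, "uv_loop_init(): %s\n", uv_strerror(ret));
        goto error_after_nothing;       /* nothing to unwind yet */
    }

    ret = uv_async_init(&loop, &async, async_cb);
    if (ret) {
        fprintf(stderr, "uv_async_init(): %s\n", uv_strerror(ret));
        goto error_after_loop_init;     /* only the loop needs cleanup */
    }

    ret = uv_timer_init(&loop, &timer);
    if (ret) {
        fprintf(stderr, "uv_timer_init(): %s\n", uv_strerror(ret));
        goto error_after_async_init;    /* close the async handle too */
    }

    /* fire timer_cb after 100 ms, then every 1000 ms */
    uv_timer_start(&timer, timer_cb, 100, 1000);
    /* ... a real worker would now uv_run(&loop, UV_RUN_DEFAULT) ... */

    uv_close((uv_handle_t *)&timer, NULL);
error_after_async_init:
    uv_close((uv_handle_t *)&async, NULL);
error_after_loop_init:
    uv_run(&loop, UV_RUN_DEFAULT);      /* let pending close callbacks run */
    uv_loop_close(&loop);
error_after_nothing:
    return ret ? EXIT_FAILURE : EXIT_SUCCESS;
}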
 // -------------------------------------------------------------
@@ -1471,7 +1491,7 @@ void metadata_sync_init(void)
     completion_wait_for(&wc->init_complete);
     completion_destroy(&wc->init_complete);
 
-    info("SQLite metadata sync initialization complete");
+    netdata_log_info("SQLite metadata sync initialization complete");
 }
 
@@ -1485,7 +1505,6 @@ static inline void queue_metadata_cmd(enum metadata_opcode opcode, const void *p
     cmd.param[1] = param1;
     cmd.completion = NULL;
     metadata_enq_cmd(&metasync_worker, &cmd);
-
 }
 
 // Public
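
/*
 * A minimal sketch (assumed types and names, not the netdata sources) of the
 * command-queue handshake that queue_metadata_cmd() and the shutdown paths
 * above rely on: callers fill a small command struct, optionally attach a
 * completion object, and enqueue it; the worker executes the command and
 * signals the completion so the caller can block until it has been processed.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct completion {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool done;
};

static void completion_init(struct completion *c) {
    pthread_mutex_init(&c->lock, NULL);
    pthread_cond_init(&c->cond, NULL);
    c->done = false;
}

static void completion_mark_complete(struct completion *c) {
    pthread_mutex_lock(&c->lock);
    c->done = true;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void completion_wait_for(struct completion *c) {
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

enum opcode { CMD_SCAN_HOSTS, CMD_SHUTDOWN };

struct cmd {
    enum opcode opcode;
    struct completion *completion;      /* optional: NULL = fire-and-forget */
};

static void *worker(void *arg) {
    struct cmd *c = arg;                /* a real worker would drain a queue */
    printf("worker: executing opcode %d\n", c->opcode);
    if (c->completion)
        completion_mark_complete(c->completion);
    return NULL;
}

int main(void) {
    struct completion work_done;
    completion_init(&work_done);

    struct cmd c = { .opcode = CMD_SCAN_HOSTS, .completion = &work_done };

    pthread_t tid;
    pthread_create(&tid, NULL, worker, &c);

    completion_wait_for(&work_done);    /* block until the worker signals */
    pthread_join(tid, NULL);
    return 0;
}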