summaryrefslogtreecommitdiffstats
path: root/database/rrdhost.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2023-02-06 16:11:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2023-02-06 16:11:30 +0000
commitaa2fe8ccbfcb117efa207d10229eeeac5d0f97c7 (patch)
tree941cbdd387b41c1a81587c20a6df9f0e5e0ff7ab /database/rrdhost.c
parentAdding upstream version 1.37.1. (diff)
downloadnetdata-aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7.tar.xz
netdata-aa2fe8ccbfcb117efa207d10229eeeac5d0f97c7.zip
Adding upstream version 1.38.0.upstream/1.38.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'database/rrdhost.c')
-rw-r--r--database/rrdhost.c577
1 files changed, 299 insertions, 278 deletions
diff --git a/database/rrdhost.c b/database/rrdhost.c
index 5ba13d47b..60b14c13c 100644
--- a/database/rrdhost.c
+++ b/database/rrdhost.c
@@ -3,8 +3,11 @@
#define NETDATA_RRD_INTERNALS
#include "rrd.h"
+static void rrdhost_streaming_sender_structures_init(RRDHOST *host);
+
bool dbengine_enabled = false; // will become true if and when dbengine is initialized
size_t storage_tiers = 3;
+bool use_direct_io = true;
size_t storage_tiers_grouping_iterations[RRD_STORAGE_TIERS] = { 1, 60, 60, 60, 60 };
RRD_BACKFILL storage_tiers_backfill[RRD_STORAGE_TIERS] = { RRD_BACKFILL_NEW, RRD_BACKFILL_NEW, RRD_BACKFILL_NEW, RRD_BACKFILL_NEW, RRD_BACKFILL_NEW };
@@ -24,18 +27,15 @@ size_t get_tier_grouping(size_t tier) {
}
RRDHOST *localhost = NULL;
-size_t rrd_hosts_available = 0;
netdata_rwlock_t rrd_rwlock = NETDATA_RWLOCK_INITIALIZER;
-time_t rrdset_free_obsolete_time = 3600;
-time_t rrdhost_free_orphan_time = 3600;
+time_t rrdset_free_obsolete_time_s = 3600;
+time_t rrdhost_free_orphan_time_s = 3600;
-bool is_storage_engine_shared(STORAGE_INSTANCE *engine) {
+bool is_storage_engine_shared(STORAGE_INSTANCE *engine __maybe_unused) {
#ifdef ENABLE_DBENGINE
- for(size_t tier = 0; tier < storage_tiers ;tier++) {
- if (engine == (STORAGE_INSTANCE *)multidb_ctx[tier])
- return true;
- }
+ if(!rrdeng_is_legacy(engine))
+ return true;
#endif
return false;
@@ -50,20 +50,22 @@ static DICTIONARY *rrdhost_root_index_hostname = NULL;
static inline void rrdhost_init() {
if(unlikely(!rrdhost_root_index)) {
- rrdhost_root_index = dictionary_create(
- DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE);
+ rrdhost_root_index = dictionary_create_advanced(
+ DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE,
+ &dictionary_stats_category_rrdhost, 0);
}
if(unlikely(!rrdhost_root_index_hostname)) {
- rrdhost_root_index_hostname = dictionary_create(
- DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE);
+ rrdhost_root_index_hostname = dictionary_create_advanced(
+ DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE,
+ &dictionary_stats_category_rrdhost, 0);
}
}
// ----------------------------------------------------------------------------
// RRDHOST index by UUID
-inline long rrdhost_hosts_available(void) {
+inline size_t rrdhost_hosts_available(void) {
return dictionary_entries(rrdhost_root_index);
}
@@ -139,7 +141,7 @@ static inline void rrdhost_init_tags(RRDHOST *host, const char *tags) {
string_freez(old);
}
-static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname) {
+static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname, bool add_to_index) {
if(unlikely(hostname && !*hostname)) hostname = NULL;
if(host->hostname && hostname && !strcmp(rrdhost_hostname(host), hostname))
@@ -151,7 +153,8 @@ static inline void rrdhost_init_hostname(RRDHOST *host, const char *hostname) {
host->hostname = string_strdupz(hostname?hostname:"localhost");
string_freez(old);
- rrdhost_index_add_hostname(host);
+ if(add_to_index)
+ rrdhost_index_add_hostname(host);
}
static inline void rrdhost_init_os(RRDHOST *host, const char *os) {
@@ -211,7 +214,7 @@ static void rrdhost_initialize_rrdpush_sender(RRDHOST *host,
if(rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key) {
rrdhost_flag_set(host, RRDHOST_FLAG_RRDPUSH_SENDER_INITIALIZED);
- sender_init(host);
+ rrdhost_streaming_sender_structures_init(host);
#ifdef ENABLE_HTTPS
host->sender->ssl.conn = NULL;
@@ -230,35 +233,34 @@ static void rrdhost_initialize_rrdpush_sender(RRDHOST *host,
rrdhost_option_clear(host, RRDHOST_OPTION_SENDER_ENABLED);
}
-RRDHOST *rrdhost_create(const char *hostname,
- const char *registry_hostname,
- const char *guid,
- const char *os,
- const char *timezone,
- const char *abbrev_timezone,
- int32_t utc_offset,
- const char *tags,
- const char *program_name,
- const char *program_version,
- int update_every,
- long entries,
- RRD_MEMORY_MODE memory_mode,
- unsigned int health_enabled,
- unsigned int rrdpush_enabled,
- char *rrdpush_destination,
- char *rrdpush_api_key,
- char *rrdpush_send_charts_matching,
- bool rrdpush_enable_replication,
- time_t rrdpush_seconds_to_replicate,
- time_t rrdpush_replication_step,
- struct rrdhost_system_info *system_info,
- int is_localhost,
- bool archived
+static RRDHOST *rrdhost_create(
+ const char *hostname,
+ const char *registry_hostname,
+ const char *guid,
+ const char *os,
+ const char *timezone,
+ const char *abbrev_timezone,
+ int32_t utc_offset,
+ const char *tags,
+ const char *program_name,
+ const char *program_version,
+ int update_every,
+ long entries,
+ RRD_MEMORY_MODE memory_mode,
+ unsigned int health_enabled,
+ unsigned int rrdpush_enabled,
+ char *rrdpush_destination,
+ char *rrdpush_api_key,
+ char *rrdpush_send_charts_matching,
+ bool rrdpush_enable_replication,
+ time_t rrdpush_seconds_to_replicate,
+ time_t rrdpush_replication_step,
+ struct rrdhost_system_info *system_info,
+ int is_localhost,
+ bool archived
) {
debug(D_RRDHOST, "Host '%s': adding with guid '%s'", hostname, guid);
- rrd_check_wrlock();
-
if(memory_mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled) {
error("memory mode 'dbengine' is not enabled, but host '%s' is configured for it. Falling back to 'alloc'", hostname);
memory_mode = RRD_MEMORY_MODE_ALLOC;
@@ -272,16 +274,17 @@ int is_legacy = 1;
int is_in_multihost = (memory_mode == RRD_MEMORY_MODE_DBENGINE && !is_legacy);
RRDHOST *host = callocz(1, sizeof(RRDHOST));
+ __atomic_add_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED);
strncpyz(host->machine_guid, guid, GUID_LEN + 1);
set_host_properties(host, (update_every > 0)?update_every:1, memory_mode, registry_hostname, os,
tags, timezone, abbrev_timezone, utc_offset, program_name, program_version);
- rrdhost_init_hostname(host, hostname);
+ rrdhost_init_hostname(host, hostname, false);
- host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries);
- host->health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled;
+ host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries);
+ host->health.health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled;
if (likely(!archived)) {
rrdfunctions_init(host);
@@ -312,7 +315,6 @@ int is_legacy = 1;
break;
}
- netdata_rwlock_init(&host->rrdhost_rwlock);
netdata_mutex_init(&host->aclk_state_lock);
netdata_mutex_init(&host->receiver_lock);
@@ -356,27 +358,15 @@ int is_legacy = 1;
if(!host->rrdvars)
host->rrdvars = rrdvariables_create();
- RRDHOST *t = rrdhost_index_add_by_guid(host);
- if(t != host) {
- error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid);
- rrdhost_free(host, 1);
- return NULL;
- }
-
- if (likely(!uuid_parse(host->machine_guid, host->host_uuid))) {
- if(!archived)
- metaqueue_host_update_info(host->machine_guid);
+ if (likely(!uuid_parse(host->machine_guid, host->host_uuid)))
sql_load_node_id(host);
- }
else
error_report("Host machine GUID %s is not valid", host->machine_guid);
rrdfamily_index_init(host);
rrdcalctemplate_index_init(host);
rrdcalc_rrdhost_index_init(host);
-
- if (health_enabled)
- health_thread_spawn(host);
+ metaqueue_host_update_info(host);
if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) {
#ifdef ENABLE_DBENGINE
@@ -385,9 +375,12 @@ int is_legacy = 1;
snprintfz(dbenginepath, FILENAME_MAX, "%s/dbengine", host->cache_dir);
ret = mkdir(dbenginepath, 0775);
+
if (ret != 0 && errno != EEXIST)
error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), dbenginepath);
- else ret = 0; // succeed
+ else
+ ret = 0; // succeed
+
if (is_legacy) {
// initialize legacy dbengine instance as needed
@@ -396,16 +389,17 @@ int is_legacy = 1;
host->db[0].tier_grouping = get_tier_grouping(0);
ret = rrdeng_init(
- host,
(struct rrdengine_instance **)&host->db[0].instance,
dbenginepath,
- default_rrdeng_page_cache_mb,
default_rrdeng_disk_quota_mb,
0); // may fail here for legacy dbengine initialization
if(ret == 0) {
+ rrdeng_readiness_wait((struct rrdengine_instance *)host->db[0].instance);
+
// assign the rest of the shared storage instances to it
// to allow them collect its metrics too
+
for(size_t tier = 1; tier < storage_tiers ; tier++) {
host->db[tier].mode = RRD_MEMORY_MODE_DBENGINE;
host->db[tier].eng = storage_engine_get(host->db[tier].mode);
@@ -422,15 +416,17 @@ int is_legacy = 1;
host->db[tier].tier_grouping = get_tier_grouping(tier);
}
}
+
if (ret) { // check legacy or multihost initialization success
error(
"Host '%s': cannot initialize host with machine guid '%s'. Failed to initialize DB engine at '%s'.",
rrdhost_hostname(host), host->machine_guid, host->cache_dir);
- rrdhost_free(host, 1);
- host = NULL;
- //rrd_hosts_available++; //TODO: maybe we want this?
- return host;
+ rrd_wrlock();
+ rrdhost_free___while_having_rrd_wrlock(host, true);
+ rrd_unlock();
+
+ return NULL;
}
#else
@@ -455,14 +451,6 @@ int is_legacy = 1;
}
// ------------------------------------------------------------------------
- // link it and add it to the index
-
- if(is_localhost)
- DOUBLE_LINKED_LIST_PREPEND_UNSAFE(localhost, host, prev, next);
- else
- DOUBLE_LINKED_LIST_APPEND_UNSAFE(localhost, host, prev, next);
-
- // ------------------------------------------------------------------------
// init new ML host and update system_info to let upstreams know
// about ML functionality
//
@@ -473,6 +461,30 @@ int is_legacy = 1;
host->system_info->mc_version = enable_metric_correlations ? metric_correlations_version : 0;
}
+ // ------------------------------------------------------------------------
+ // link it and add it to the index
+
+ rrd_wrlock();
+
+ RRDHOST *t = rrdhost_index_add_by_guid(host);
+ if(t != host) {
+ error("Host '%s': cannot add host with machine guid '%s' to index. It already exists as host '%s' with machine guid '%s'.", rrdhost_hostname(host), host->machine_guid, rrdhost_hostname(t), t->machine_guid);
+ rrdhost_free___while_having_rrd_wrlock(host, true);
+ rrd_unlock();
+ return NULL;
+ }
+
+ rrdhost_index_add_hostname(host);
+
+ if(is_localhost)
+ DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(localhost, host, prev, next);
+ else
+ DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(localhost, host, prev, next);
+
+ rrd_unlock();
+
+ // ------------------------------------------------------------------------
+
info("Host '%s' (at registry as '%s') with guid '%s' initialized"
", os '%s'"
", timezone '%s'"
@@ -487,7 +499,6 @@ int is_legacy = 1;
", health %s"
", cache_dir '%s'"
", varlib_dir '%s'"
- ", health_log '%s'"
", alarms default handler '%s'"
", alarms default recipient '%s'"
, rrdhost_hostname(host)
@@ -504,29 +515,27 @@ int is_legacy = 1;
, rrdhost_has_rrdpush_sender_enabled(host)?"enabled":"disabled"
, host->rrdpush_send_destination?host->rrdpush_send_destination:""
, host->rrdpush_send_api_key?host->rrdpush_send_api_key:""
- , host->health_enabled?"enabled":"disabled"
+ , host->health.health_enabled?"enabled":"disabled"
, host->cache_dir
, host->varlib_dir
- , host->health_log_filename
- , string2str(host->health_default_exec)
- , string2str(host->health_default_recipient)
+ , string2str(host->health.health_default_exec)
+ , string2str(host->health.health_default_recipient)
);
- if(!archived)
- metaqueue_host_update_system_info(host);
- rrd_hosts_available++;
+ if(!archived)
+ rrdhost_flag_set(host,RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_UPDATE);
rrdhost_load_rrdcontext_data(host);
- if (!archived)
- ml_new_host(host);
- else
- rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED);
-
+ if (!archived) {
+ ml_host_new(host);
+ ml_start_anomaly_detection_threads(host);
+ } else
+ rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN);
return host;
}
-void rrdhost_update(RRDHOST *host
+static void rrdhost_update(RRDHOST *host
, const char *hostname
, const char *registry_hostname
, const char *guid
@@ -553,11 +562,16 @@ void rrdhost_update(RRDHOST *host
{
UNUSED(guid);
- host->health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled;
+ netdata_spinlock_lock(&host->rrdhost_update_lock);
- rrdhost_system_info_free(host->system_info);
- host->system_info = system_info;
- metaqueue_host_update_system_info(host);
+ host->health.health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled;
+
+ {
+ struct rrdhost_system_info *old = host->system_info;
+ host->system_info = system_info;
+ rrdhost_flag_set(host, RRDHOST_FLAG_METADATA_INFO | RRDHOST_FLAG_METADATA_CLAIMID | RRDHOST_FLAG_METADATA_UPDATE);
+ rrdhost_system_info_free(old);
+ }
rrdhost_init_os(host, os);
rrdhost_init_timezone(host, timezone, abbrev_timezone, utc_offset);
@@ -567,7 +581,7 @@ void rrdhost_update(RRDHOST *host
if(strcmp(rrdhost_hostname(host), hostname) != 0) {
info("Host '%s' has been renamed to '%s'. If this is not intentional it may mean multiple hosts are using the same machine_guid.", rrdhost_hostname(host), hostname);
- rrdhost_init_hostname(host, hostname);
+ rrdhost_init_hostname(host, hostname, true);
}
if(strcmp(rrdhost_program_name(host), program_name) != 0) {
@@ -628,14 +642,14 @@ void rrdhost_update(RRDHOST *host
host->rrdpush_seconds_to_replicate = rrdpush_seconds_to_replicate;
host->rrdpush_replication_step = rrdpush_replication_step;
- rrd_hosts_available++;
- ml_new_host(host);
+ ml_host_new(host);
+ ml_start_anomaly_detection_threads(host);
+
rrdhost_load_rrdcontext_data(host);
info("Host %s is not in archived mode anymore", rrdhost_hostname(host));
}
- if (health_enabled)
- health_thread_spawn(host);
+ netdata_spinlock_unlock(&host->rrdhost_update_lock);
}
RRDHOST *rrdhost_find_or_create(
@@ -665,15 +679,18 @@ RRDHOST *rrdhost_find_or_create(
) {
debug(D_RRDHOST, "Searching for host '%s' with guid '%s'", hostname, guid);
- rrd_wrlock();
RRDHOST *host = rrdhost_find_by_guid(guid);
if (unlikely(host && host->rrd_memory_mode != mode && rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED))) {
/* If a legacy memory mode instantiates all dbengine state must be discarded to avoid inconsistencies */
error("Archived host '%s' has memory mode '%s', but the wanted one is '%s'. Discarding archived state.",
rrdhost_hostname(host), rrd_memory_mode_name(host->rrd_memory_mode), rrd_memory_mode_name(mode));
- rrdhost_free(host, 1);
+
+ rrd_wrlock();
+ rrdhost_free___while_having_rrd_wrlock(host, true);
host = NULL;
+ rrd_unlock();
}
+
if(!host) {
host = rrdhost_create(
hostname
@@ -703,6 +720,7 @@ RRDHOST *rrdhost_find_or_create(
);
}
else {
+
rrdhost_update(host
, hostname
, registry_hostname
@@ -726,19 +744,13 @@ RRDHOST *rrdhost_find_or_create(
, rrdpush_seconds_to_replicate
, rrdpush_replication_step
, system_info);
- }
- if (host) {
- rrdhost_wrlock(host);
- rrdhost_flag_clear(host, RRDHOST_FLAG_ORPHAN);
- host->senders_disconnected_time = 0;
- rrdhost_unlock(host);
- }
- rrd_unlock();
+ }
return host;
}
-inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now) {
+
+inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, time_t now_s) {
if(host != protected_host
&& host != localhost
&& rrdhost_receiver_replicating_charts(host) == 0
@@ -746,8 +758,8 @@ inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, tim
&& rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN)
&& !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)
&& !host->receiver
- && host->senders_disconnected_time
- && host->senders_disconnected_time + rrdhost_free_orphan_time < now)
+ && host->child_disconnected_time
+ && host->child_disconnected_time + rrdhost_free_orphan_time_s < now_s)
return 1;
return 0;
@@ -756,8 +768,34 @@ inline int rrdhost_should_be_removed(RRDHOST *host, RRDHOST *protected_host, tim
// ----------------------------------------------------------------------------
// RRDHOST global / startup initialization
+#ifdef ENABLE_DBENGINE
+struct dbengine_initialization {
+ netdata_thread_t thread;
+ char path[FILENAME_MAX + 1];
+ int disk_space_mb;
+ size_t tier;
+ int ret;
+};
+
+void *dbengine_tier_init(void *ptr) {
+ struct dbengine_initialization *dbi = ptr;
+ dbi->ret = rrdeng_init(NULL, dbi->path, dbi->disk_space_mb, dbi->tier);
+ return ptr;
+}
+#endif
+
void dbengine_init(char *hostname) {
#ifdef ENABLE_DBENGINE
+ use_direct_io = config_get_boolean(CONFIG_SECTION_DB, "dbengine use direct io", use_direct_io);
+
+ unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", MAX_PAGES_PER_EXTENT);
+ if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT)
+ rrdeng_pages_per_extent = read_num;
+ else {
+ error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent);
+ config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent);
+ }
+
storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers);
if(storage_tiers < 1) {
error("At least 1 storage tier is required. Assuming 1.");
@@ -770,6 +808,9 @@ void dbengine_init(char *hostname) {
config_set_number(CONFIG_SECTION_DB, "storage tiers", storage_tiers);
}
+ bool parallel_initialization = (storage_tiers <= (size_t)get_netdata_cpus()) ? true : false;
+ parallel_initialization = config_get_boolean(CONFIG_SECTION_DB, "dbengine parallel initialization", parallel_initialization);
+
default_rrdeng_page_fetch_timeout = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page fetch timeout secs", PAGE_CACHE_FETCH_WAIT_TIMEOUT);
if (default_rrdeng_page_fetch_timeout < 1) {
info("'dbengine page fetch timeout secs' cannot be %d, using 1", default_rrdeng_page_fetch_timeout);
@@ -784,10 +825,7 @@ void dbengine_init(char *hostname) {
config_set_number(CONFIG_SECTION_DB, "dbengine page fetch retries", default_rrdeng_page_fetch_retries);
}
- if(config_get_boolean(CONFIG_SECTION_DB, "dbengine page descriptors in file mapped memory", rrdeng_page_descr_is_mmap()) == CONFIG_BOOLEAN_YES)
- rrdeng_page_descr_use_mmap();
- else
- rrdeng_page_descr_use_malloc();
+ struct dbengine_initialization tiers_init[RRD_STORAGE_TIERS] = {};
size_t created_tiers = 0;
char dbenginepath[FILENAME_MAX + 1];
@@ -808,15 +846,11 @@ void dbengine_init(char *hostname) {
if(tier > 0)
divisor *= 2;
- int page_cache_mb = default_rrdeng_page_cache_mb / divisor;
int disk_space_mb = default_multidb_disk_quota_mb / divisor;
size_t grouping_iterations = storage_tiers_grouping_iterations[tier];
RRD_BACKFILL backfill = storage_tiers_backfill[tier];
if(tier > 0) {
- snprintfz(dbengineconfig, 200, "dbengine tier %zu page cache size MB", tier);
- page_cache_mb = config_get_number(CONFIG_SECTION_DB, dbengineconfig, page_cache_mb);
-
snprintfz(dbengineconfig, 200, "dbengine tier %zu multihost disk space MB", tier);
disk_space_mb = config_get_number(CONFIG_SECTION_DB, dbengineconfig, disk_space_mb);
@@ -850,13 +884,30 @@ void dbengine_init(char *hostname) {
}
internal_error(true, "DBENGINE tier %zu grouping iterations is set to %zu", tier, storage_tiers_grouping_iterations[tier]);
- ret = rrdeng_init(NULL, NULL, dbenginepath, page_cache_mb, disk_space_mb, tier);
- if(ret != 0) {
+
+ tiers_init[tier].disk_space_mb = disk_space_mb;
+ tiers_init[tier].tier = tier;
+ strncpyz(tiers_init[tier].path, dbenginepath, FILENAME_MAX);
+ tiers_init[tier].ret = 0;
+
+ if(parallel_initialization)
+ netdata_thread_create(&tiers_init[tier].thread, "DBENGINE_INIT", NETDATA_THREAD_OPTION_JOINABLE,
+ dbengine_tier_init, &tiers_init[tier]);
+ else
+ dbengine_tier_init(&tiers_init[tier]);
+ }
+
+ for(size_t tier = 0; tier < storage_tiers ;tier++) {
+ void *ptr;
+
+ if(parallel_initialization)
+ netdata_thread_join(tiers_init[tier].thread, &ptr);
+
+ if(tiers_init[tier].ret != 0) {
error("DBENGINE on '%s': Failed to initialize multi-host database tier %zu on path '%s'",
- hostname, tier, dbenginepath);
- break;
+ hostname, tiers_init[tier].tier, tiers_init[tier].path);
}
- else
+ else if(created_tiers == tier)
created_tiers++;
}
@@ -868,6 +919,9 @@ void dbengine_init(char *hostname) {
else if(!created_tiers)
fatal("DBENGINE on '%s', failed to initialize databases at '%s'.", hostname, netdata_configured_cache_dir);
+ for(size_t tier = 0; tier < storage_tiers ;tier++)
+ rrdeng_readiness_wait(multidb_ctx[tier]);
+
dbengine_enabled = true;
#else
storage_tiers = config_get_number(CONFIG_SECTION_DB, "storage tiers", 1);
@@ -880,7 +934,7 @@ void dbengine_init(char *hostname) {
#endif
}
-int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
+int rrd_init(char *hostname, struct rrdhost_system_info *system_info, bool unittest) {
rrdhost_init();
if (unlikely(sql_init_database(DB_CHECK_NONE, system_info ? 0 : 1))) {
@@ -893,7 +947,7 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
error_report("Failed to initialize context metadata database");
}
- if (unlikely(strcmp(hostname, "unittest") == 0)) {
+ if (unlikely(unittest)) {
dbengine_enabled = true;
}
else {
@@ -901,11 +955,11 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
rrdpush_init();
if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE || rrdpush_receiver_needs_dbengine()) {
- info("Initializing dbengine...");
+ info("DBENGINE: Initializing ...");
dbengine_init(hostname);
}
else {
- info("Not initializing dbengine...");
+ info("DBENGINE: Not initializing ...");
storage_tiers = 1;
}
@@ -923,42 +977,41 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
}
}
- metadata_sync_init();
+ if(!unittest)
+ metadata_sync_init();
+
debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname);
- rrd_wrlock();
localhost = rrdhost_create(
- hostname
- , registry_get_this_machine_hostname()
+ hostname
+ , registry_get_this_machine_hostname()
, registry_get_this_machine_guid()
, os_type
- , netdata_configured_timezone
- , netdata_configured_abbrev_timezone
- , netdata_configured_utc_offset
- , ""
- , program_name
- , program_version
- , default_rrd_update_every
- , default_rrd_history_entries
- , default_rrd_memory_mode
- , default_health_enabled
- , default_rrdpush_enabled
- , default_rrdpush_destination
- , default_rrdpush_api_key
- , default_rrdpush_send_charts_matching
- , default_rrdpush_enable_replication
- , default_rrdpush_seconds_to_replicate
- , default_rrdpush_replication_step
- , system_info
- , 1
- , 0
+ , netdata_configured_timezone
+ , netdata_configured_abbrev_timezone
+ , netdata_configured_utc_offset
+ , ""
+ , program_name
+ , program_version
+ , default_rrd_update_every
+ , default_rrd_history_entries
+ , default_rrd_memory_mode
+ , default_health_enabled
+ , default_rrdpush_enabled
+ , default_rrdpush_destination
+ , default_rrdpush_api_key
+ , default_rrdpush_send_charts_matching
+ , default_rrdpush_enable_replication
+ , default_rrdpush_seconds_to_replicate
+ , default_rrdpush_replication_step
+ , system_info
+ , 1
+ , 0
);
+
if (unlikely(!localhost)) {
- rrd_unlock();
return 1;
}
- rrd_unlock();
-
if (likely(system_info)) {
migrate_localhost(&localhost->host_uuid);
sql_aclk_sync_init();
@@ -968,46 +1021,12 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
}
// ----------------------------------------------------------------------------
-// RRDHOST - lock validations
-// there are only used when NETDATA_INTERNAL_CHECKS is set
-
-void __rrdhost_check_rdlock(RRDHOST *host, const char *file, const char *function, const unsigned long line) {
- debug(D_RRDHOST, "Checking read lock on host '%s'", rrdhost_hostname(host));
-
- int ret = netdata_rwlock_trywrlock(&host->rrdhost_rwlock);
- if(ret == 0)
- fatal("RRDHOST '%s' should be read-locked, but it is not, at function %s() at line %lu of file '%s'", rrdhost_hostname(host), function, line, file);
-}
-
-void __rrdhost_check_wrlock(RRDHOST *host, const char *file, const char *function, const unsigned long line) {
- debug(D_RRDHOST, "Checking write lock on host '%s'", rrdhost_hostname(host));
-
- int ret = netdata_rwlock_tryrdlock(&host->rrdhost_rwlock);
- if(ret == 0)
- fatal("RRDHOST '%s' should be write-locked, but it is not, at function %s() at line %lu of file '%s'", rrdhost_hostname(host), function, line, file);
-}
-
-void __rrd_check_rdlock(const char *file, const char *function, const unsigned long line) {
- debug(D_RRDHOST, "Checking read lock on all RRDs");
-
- int ret = netdata_rwlock_trywrlock(&rrd_rwlock);
- if(ret == 0)
- fatal("RRDs should be read-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file);
-}
-
-void __rrd_check_wrlock(const char *file, const char *function, const unsigned long line) {
- debug(D_RRDHOST, "Checking write lock on all RRDs");
-
- int ret = netdata_rwlock_tryrdlock(&rrd_rwlock);
- if(ret == 0)
- fatal("RRDs should be write-locked, but it are not, at function %s() at line %lu of file '%s'", function, line, file);
-}
-
-// ----------------------------------------------------------------------------
// RRDHOST - free
void rrdhost_system_info_free(struct rrdhost_system_info *system_info) {
if(likely(system_info)) {
+ __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED);
+
freez(system_info->cloud_provider_type);
freez(system_info->cloud_instance_type);
freez(system_info->cloud_instance_region);
@@ -1042,63 +1061,80 @@ void rrdhost_system_info_free(struct rrdhost_system_info *system_info) {
}
}
-void destroy_receiver_state(struct receiver_state *rpt);
+static void rrdhost_streaming_sender_structures_init(RRDHOST *host)
+{
+ if (host->sender)
+ return;
+
+ host->sender = callocz(1, sizeof(*host->sender));
+ __atomic_add_fetch(&netdata_buffers_statistics.rrdhost_senders, sizeof(*host->sender), __ATOMIC_RELAXED);
+
+ host->sender->host = host;
+ host->sender->buffer = cbuffer_new(CBUFFER_INITIAL_SIZE, 1024 * 1024, &netdata_buffers_statistics.cbuffers_streaming);
+ host->sender->capabilities = STREAM_OUR_CAPABILITIES;
-void stop_streaming_sender(RRDHOST *host)
+ host->sender->rrdpush_sender_pipe[PIPE_READ] = -1;
+ host->sender->rrdpush_sender_pipe[PIPE_WRITE] = -1;
+ host->sender->rrdpush_sender_socket = -1;
+
+#ifdef ENABLE_COMPRESSION
+ if(default_compression_enabled) {
+ host->sender->flags |= SENDER_FLAG_COMPRESSION;
+ host->sender->compressor = create_compressor();
+ }
+ else
+ host->sender->flags &= ~SENDER_FLAG_COMPRESSION;
+#endif
+
+ netdata_mutex_init(&host->sender->mutex);
+ replication_init_sender(host->sender);
+}
+
+static void rrdhost_streaming_sender_structures_free(RRDHOST *host)
{
rrdhost_option_clear(host, RRDHOST_OPTION_SENDER_ENABLED);
if (unlikely(!host->sender))
return;
- rrdpush_sender_thread_stop(host); // stop a possibly running thread
+ rrdpush_sender_thread_stop(host, "HOST CLEANUP", true); // stop a possibly running thread
cbuffer_free(host->sender->buffer);
#ifdef ENABLE_COMPRESSION
if (host->sender->compressor)
host->sender->compressor->destroy(&host->sender->compressor);
#endif
replication_cleanup_sender(host->sender);
+
+ __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_senders, sizeof(*host->sender), __ATOMIC_RELAXED);
+
freez(host->sender);
host->sender = NULL;
rrdhost_flag_clear(host, RRDHOST_FLAG_RRDPUSH_SENDER_INITIALIZED);
}
-void stop_streaming_receiver(RRDHOST *host)
-{
- netdata_mutex_lock(&host->receiver_lock);
- if (host->receiver) {
- if (!host->receiver->exited)
- netdata_thread_cancel(host->receiver->thread);
- netdata_mutex_unlock(&host->receiver_lock);
- struct receiver_state *rpt = host->receiver;
- while (host->receiver && !rpt->exited)
- sleep_usec(50 * USEC_PER_MS);
- // If the receiver detached from the host then its thread will destroy the state
- if (host->receiver == rpt)
- destroy_receiver_state(host->receiver);
- } else
- netdata_mutex_unlock(&host->receiver_lock);
-}
-
-void rrdhost_free(RRDHOST *host, bool force) {
+void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) {
if(!host) return;
- if (netdata_exit || force)
- info("Freeing all memory for host '%s'...", rrdhost_hostname(host));
+ if (netdata_exit || force) {
+ info("RRD: 'host:%s' freeing memory...", rrdhost_hostname(host));
- rrd_check_wrlock(); // make sure the RRDs are write locked
+ // ------------------------------------------------------------------------
+ // first remove it from the indexes, so that it will not be discoverable
- rrdhost_wrlock(host);
- ml_delete_host(host);
- rrdhost_unlock(host);
+ rrdhost_index_del_hostname(host);
+ rrdhost_index_del_by_guid(host);
+
+ if (host->prev)
+ DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(localhost, host, prev, next);
+ }
// ------------------------------------------------------------------------
// clean up streaming
- stop_streaming_sender(host);
+ rrdhost_streaming_sender_structures_free(host);
if (netdata_exit || force)
- stop_streaming_receiver(host);
+ stop_streaming_receiver(host, "HOST CLEANUP");
// ------------------------------------------------------------------------
@@ -1106,9 +1142,6 @@ void rrdhost_free(RRDHOST *host, bool force) {
rrdcalc_delete_all(host);
-
- rrdhost_wrlock(host); // lock this RRDHOST
-
// ------------------------------------------------------------------------
// release its children resources
@@ -1126,6 +1159,10 @@ void rrdhost_free(RRDHOST *host, bool force) {
rrdcalc_rrdhost_index_destroy(host);
rrdcalctemplate_index_destroy(host);
+ // cleanup ML resources
+ ml_stop_anomaly_detection_threads(host);
+ ml_host_delete(host);
+
freez(host->exporting_flags);
health_alarm_log_free(host);
@@ -1140,9 +1177,8 @@ void rrdhost_free(RRDHOST *host, bool force) {
#endif
if (!netdata_exit && !force) {
- info("Setting archive mode for host '%s'...", rrdhost_hostname(host));
- rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED);
- rrdhost_unlock(host);
+ info("RRD: 'host:%s' is now in archive mode...", rrdhost_hostname(host));
+ rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN);
return;
}
@@ -1162,17 +1198,6 @@ void rrdhost_free(RRDHOST *host, bool force) {
#endif
// ------------------------------------------------------------------------
- // remove it from the indexes
-
- rrdhost_index_del_hostname(host);
- rrdhost_index_del_by_guid(host);
-
- // ------------------------------------------------------------------------
- // unlink it from the host
-
- DOUBLE_LINKED_LIST_REMOVE_UNSAFE(localhost, host, prev, next);
-
- // ------------------------------------------------------------------------
// free it
pthread_mutex_destroy(&host->aclk_state_lock);
@@ -1191,14 +1216,11 @@ void rrdhost_free(RRDHOST *host, bool force) {
freez(host->rrdpush_send_api_key);
freez(host->rrdpush_send_destination);
rrdpush_destinations_free(host);
- string_freez(host->health_default_exec);
- string_freez(host->health_default_recipient);
- freez(host->health_log_filename);
+ string_freez(host->health.health_default_exec);
+ string_freez(host->health.health_default_recipient);
string_freez(host->registry_hostname);
simple_pattern_free(host->rrdpush_send_charts_matching);
- rrdhost_unlock(host);
netdata_rwlock_destroy(&host->health_log.alarm_log_rwlock);
- netdata_rwlock_destroy(&host->rrdhost_rwlock);
freez(host->node_id);
rrdfamily_index_destroy(host);
@@ -1208,12 +1230,12 @@ void rrdhost_free(RRDHOST *host, bool force) {
rrdhost_destroy_rrdcontexts(host);
string_freez(host->hostname);
+ __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(RRDHOST), __ATOMIC_RELAXED);
freez(host);
#ifdef ENABLE_ACLK
if (wc)
wc->is_orphan = 0;
#endif
- rrd_hosts_available--;
}
void rrdhost_free_all(void) {
@@ -1221,21 +1243,30 @@ void rrdhost_free_all(void) {
/* Make sure child-hosts are released before the localhost. */
while(localhost && localhost->next)
- rrdhost_free(localhost->next, 1);
+ rrdhost_free___while_having_rrd_wrlock(localhost->next, true);
if(localhost)
- rrdhost_free(localhost, 1);
+ rrdhost_free___while_having_rrd_wrlock(localhost, true);
rrd_unlock();
}
+void rrd_finalize_collection_for_all_hosts(void) {
+ RRDHOST *host;
+ rrd_wrlock();
+ rrdhost_foreach_read(host) {
+ rrdhost_finalize_collection(host);
+ }
+ rrd_unlock();
+}
+
// ----------------------------------------------------------------------------
// RRDHOST - save host files
void rrdhost_save_charts(RRDHOST *host) {
if(!host) return;
- info("Saving/Closing database of host '%s'...", rrdhost_hostname(host));
+ info("RRD: 'host:%s' saving / closing database...", rrdhost_hostname(host));
RRDSET *st;
@@ -1312,8 +1343,7 @@ static void rrdhost_load_auto_labels(void) {
health_add_host_labels();
- rrdlabels_add(
- labels, "_is_parent", (localhost->senders_count > 0) ? "true" : "false", RRDLABEL_SRC_AUTO);
+ rrdlabels_add(labels, "_is_parent", (localhost->connected_children_count > 0) ? "true" : "false", RRDLABEL_SRC_AUTO);
if (localhost->rrdpush_send_destination)
rrdlabels_add(labels, "_streams_to", localhost->rrdpush_send_destination, RRDLABEL_SRC_AUTO);
@@ -1391,13 +1421,18 @@ void reload_host_labels(void) {
rrdhost_load_kubernetes_labels();
rrdhost_load_auto_labels();
- rrdlabels_remove_all_unmarked(localhost->rrdlabels);
- metaqueue_store_host_labels(localhost->machine_guid);
-
- health_label_log_save(localhost);
+ rrdhost_flag_set(localhost,RRDHOST_FLAG_METADATA_LABELS | RRDHOST_FLAG_METADATA_UPDATE);
rrdpush_send_host_labels(localhost);
- health_reload();
+}
+
+void rrdhost_finalize_collection(RRDHOST *host) {
+ info("RRD: 'host:%s' stopping data collection...", rrdhost_hostname(host));
+
+ RRDSET *st;
+ rrdset_foreach_write(st, host)
+ rrdset_finalize_collection(st, true);
+ rrdset_foreach_done(st);
}
// ----------------------------------------------------------------------------
@@ -1406,16 +1441,18 @@ void reload_host_labels(void) {
void rrdhost_delete_charts(RRDHOST *host) {
if(!host) return;
- info("Deleting database of host '%s'...", rrdhost_hostname(host));
+ info("RRD: 'host:%s' deleting disk files...", rrdhost_hostname(host));
RRDSET *st;
- // we get a write lock
- // to ensure only one thread is saving the database
- rrdset_foreach_write(st, host) {
- rrdset_delete_files(st);
+ if(host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE || host->rrd_memory_mode == RRD_MEMORY_MODE_MAP) {
+ // we get a write lock
+ // to ensure only one thread is saving the database
+ rrdset_foreach_write(st, host){
+ rrdset_delete_files(st);
+ }
+ rrdset_foreach_done(st);
}
- rrdset_foreach_done(st);
recursively_delete_dir(host->cache_dir, "left over host");
}
@@ -1426,7 +1463,7 @@ void rrdhost_delete_charts(RRDHOST *host) {
void rrdhost_cleanup_charts(RRDHOST *host) {
if(!host) return;
- info("Cleaning up database of host '%s'...", rrdhost_hostname(host));
+ info("RRD: 'host:%s' cleaning up disk files...", rrdhost_hostname(host));
RRDSET *st;
uint32_t rrdhost_delete_obsolete_charts = rrdhost_option_check(host, RRDHOST_OPTION_DELETE_OBSOLETE_CHARTS);
@@ -1453,7 +1490,7 @@ void rrdhost_cleanup_charts(RRDHOST *host) {
// RRDHOST - save all hosts to disk
void rrdhost_save_all(void) {
- info("Saving database [%zu hosts(s)]...", rrd_hosts_available);
+ info("RRD: saving databases [%zu hosts(s)]...", rrdhost_hosts_available());
rrd_rdlock();
@@ -1468,7 +1505,7 @@ void rrdhost_save_all(void) {
// RRDHOST - save or delete all hosts from disk
void rrdhost_cleanup_all(void) {
- info("Cleaning up database [%zu hosts(s)]...", rrd_hosts_available);
+ info("RRD: cleaning up database [%zu hosts(s)]...", rrdhost_hosts_available());
rrd_rdlock();
@@ -1622,19 +1659,3 @@ int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, ch
return res;
}
-
-// Added for gap-filling, if this proves to be a bottleneck in large-scale systems then we will need to cache
-// the last entry times as the metric updates, but let's see if it is a problem first.
-time_t rrdhost_last_entry_t(RRDHOST *h) {
- RRDSET *st;
- time_t result = 0;
-
- rrdset_foreach_read(st, h) {
- time_t st_last = rrdset_last_entry_t(st);
-
- if (st_last > result)
- result = st_last;
- }
- rrdset_foreach_done(st);
- return result;
-}