summaryrefslogtreecommitdiffstats
path: root/daemon/main.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--daemon/main.c608
1 files changed, 552 insertions, 56 deletions
diff --git a/daemon/main.c b/daemon/main.c
index 6b591385d..7b2076f3f 100644
--- a/daemon/main.c
+++ b/daemon/main.c
@@ -4,10 +4,16 @@
#include "buildinfo.h"
#include "static_threads.h"
+#if defined(ENV32BIT)
+#warning COMPILING 32BIT NETDATA
+#endif
+
bool unittest_running = false;
int netdata_zero_metrics_enabled;
int netdata_anonymous_statistics_enabled;
+int libuv_worker_threads = MIN_LIBUV_WORKER_THREADS;
+
struct netdata_static_thread *static_threads;
struct config netdata_config = {
@@ -23,69 +29,461 @@ struct config netdata_config = {
}
};
+typedef struct service_thread {
+ pid_t tid;
+ SERVICE_THREAD_TYPE type;
+ SERVICE_TYPE services;
+ char name[NETDATA_THREAD_NAME_MAX + 1];
+ bool cancelled;
+
+ union {
+ netdata_thread_t netdata_thread;
+ uv_thread_t uv_thread;
+ };
+
+ force_quit_t force_quit_callback;
+ request_quit_t request_quit_callback;
+ void *data;
+} SERVICE_THREAD;
+
+struct service_globals {
+ SERVICE_TYPE running;
+ SPINLOCK lock;
+ Pvoid_t pid_judy;
+} service_globals = {
+ .running = ~0,
+ .pid_judy = NULL,
+};
+
+SERVICE_THREAD *service_register(SERVICE_THREAD_TYPE thread_type, request_quit_t request_quit_callback, force_quit_t force_quit_callback, void *data, bool update __maybe_unused) {
+ SERVICE_THREAD *sth = NULL;
+ pid_t tid = gettid();
+
+ netdata_spinlock_lock(&service_globals.lock);
+ Pvoid_t *PValue = JudyLIns(&service_globals.pid_judy, tid, PJE0);
+ if(!*PValue) {
+ sth = callocz(1, sizeof(SERVICE_THREAD));
+ sth->tid = tid;
+ sth->type = thread_type;
+ sth->request_quit_callback = request_quit_callback;
+ sth->force_quit_callback = force_quit_callback;
+ sth->data = data;
+ os_thread_get_current_name_np(sth->name);
+ *PValue = sth;
+
+ switch(thread_type) {
+ default:
+ case SERVICE_THREAD_TYPE_NETDATA:
+ sth->netdata_thread = netdata_thread_self();
+ break;
+
+ case SERVICE_THREAD_TYPE_EVENT_LOOP:
+ case SERVICE_THREAD_TYPE_LIBUV:
+ sth->uv_thread = uv_thread_self();
+ break;
+ }
+ }
+ else {
+ sth = *PValue;
+ }
+ netdata_spinlock_unlock(&service_globals.lock);
+
+ return sth;
+}
+
+void service_exits(void) {
+ pid_t tid = gettid();
+
+ netdata_spinlock_lock(&service_globals.lock);
+ Pvoid_t *PValue = JudyLGet(service_globals.pid_judy, tid, PJE0);
+ if(PValue) {
+ freez(*PValue);
+ JudyLDel(&service_globals.pid_judy, tid, PJE0);
+ }
+ netdata_spinlock_unlock(&service_globals.lock);
+}
+
+bool service_running(SERVICE_TYPE service) {
+ static __thread SERVICE_THREAD *sth = NULL;
+
+ if(unlikely(!sth))
+ sth = service_register(SERVICE_THREAD_TYPE_NETDATA, NULL, NULL, NULL, false);
+
+ if(netdata_exit)
+ __atomic_store_n(&service_globals.running, 0, __ATOMIC_RELAXED);
+
+ if(service == 0)
+ service = sth->services;
+
+ sth->services |= service;
+
+ return ((__atomic_load_n(&service_globals.running, __ATOMIC_RELAXED) & service) == service);
+}
+
+void service_signal_exit(SERVICE_TYPE service) {
+ __atomic_and_fetch(&service_globals.running, ~(service), __ATOMIC_RELAXED);
+
+ netdata_spinlock_lock(&service_globals.lock);
+
+ Pvoid_t *PValue;
+ Word_t tid = 0;
+ bool first = true;
+ while((PValue = JudyLFirstThenNext(service_globals.pid_judy, &tid, &first))) {
+ SERVICE_THREAD *sth = *PValue;
+
+ if((sth->services & service) && sth->request_quit_callback) {
+ netdata_spinlock_unlock(&service_globals.lock);
+ sth->request_quit_callback(sth->data);
+ netdata_spinlock_lock(&service_globals.lock);
+ continue;
+ }
+ }
+
+ netdata_spinlock_unlock(&service_globals.lock);
+}
+
+static void service_to_buffer(BUFFER *wb, SERVICE_TYPE service) {
+ if(service & SERVICE_MAINTENANCE)
+ buffer_strcat(wb, "MAINTENANCE ");
+ if(service & SERVICE_COLLECTORS)
+ buffer_strcat(wb, "COLLECTORS ");
+ if(service & SERVICE_ML_TRAINING)
+ buffer_strcat(wb, "ML_TRAINING ");
+ if(service & SERVICE_ML_PREDICTION)
+ buffer_strcat(wb, "ML_PREDICTION ");
+ if(service & SERVICE_REPLICATION)
+ buffer_strcat(wb, "REPLICATION ");
+ if(service & ABILITY_DATA_QUERIES)
+ buffer_strcat(wb, "DATA_QUERIES ");
+ if(service & ABILITY_WEB_REQUESTS)
+ buffer_strcat(wb, "WEB_REQUESTS ");
+ if(service & SERVICE_WEB_SERVER)
+ buffer_strcat(wb, "WEB_SERVER ");
+ if(service & SERVICE_ACLK)
+ buffer_strcat(wb, "ACLK ");
+ if(service & SERVICE_HEALTH)
+ buffer_strcat(wb, "HEALTH ");
+ if(service & SERVICE_STREAMING)
+ buffer_strcat(wb, "STREAMING ");
+ if(service & ABILITY_STREAMING_CONNECTIONS)
+ buffer_strcat(wb, "STREAMING_CONNECTIONS ");
+ if(service & SERVICE_CONTEXT)
+ buffer_strcat(wb, "CONTEXT ");
+ if(service & SERVICE_ANALYTICS)
+ buffer_strcat(wb, "ANALYTICS ");
+ if(service & SERVICE_EXPORTERS)
+ buffer_strcat(wb, "EXPORTERS ");
+}
+
+static bool service_wait_exit(SERVICE_TYPE service, usec_t timeout_ut) {
+ BUFFER *service_list = buffer_create(1024, NULL);
+ BUFFER *thread_list = buffer_create(1024, NULL);
+ usec_t started_ut = now_monotonic_usec(), ended_ut;
+ size_t running;
+ SERVICE_TYPE running_services = 0;
+
+ // cancel the threads
+ running = 0;
+ running_services = 0;
+ {
+ buffer_flush(thread_list);
+
+ netdata_spinlock_lock(&service_globals.lock);
+
+ Pvoid_t *PValue;
+ Word_t tid = 0;
+ bool first = true;
+ while((PValue = JudyLFirstThenNext(service_globals.pid_judy, &tid, &first))) {
+ SERVICE_THREAD *sth = *PValue;
+ if(sth->services & service && sth->tid != gettid() && !sth->cancelled) {
+ sth->cancelled = true;
+
+ switch(sth->type) {
+ default:
+ case SERVICE_THREAD_TYPE_NETDATA:
+ netdata_thread_cancel(sth->netdata_thread);
+ break;
+
+ case SERVICE_THREAD_TYPE_EVENT_LOOP:
+ case SERVICE_THREAD_TYPE_LIBUV:
+ break;
+ }
+
+ if(running)
+ buffer_strcat(thread_list, ", ");
+
+ buffer_sprintf(thread_list, "'%s' (%d)", sth->name, sth->tid);
+
+ running++;
+ running_services |= sth->services & service;
+
+ if(sth->force_quit_callback) {
+ netdata_spinlock_unlock(&service_globals.lock);
+ sth->force_quit_callback(sth->data);
+ netdata_spinlock_lock(&service_globals.lock);
+ continue;
+ }
+ }
+ }
+
+ netdata_spinlock_unlock(&service_globals.lock);
+ }
+
+ service_signal_exit(service);
+
+ // signal them to stop
+ size_t last_running = 0;
+ size_t stale_time_ut = 0;
+ usec_t sleep_ut = 50 * USEC_PER_MS;
+ size_t log_countdown_ut = sleep_ut;
+ do {
+ if(running != last_running)
+ stale_time_ut = 0;
+
+ last_running = running;
+ running = 0;
+ running_services = 0;
+ buffer_flush(thread_list);
+
+ netdata_spinlock_lock(&service_globals.lock);
+
+ Pvoid_t *PValue;
+ Word_t tid = 0;
+ bool first = true;
+ while((PValue = JudyLFirstThenNext(service_globals.pid_judy, &tid, &first))) {
+ SERVICE_THREAD *sth = *PValue;
+ if(sth->services & service && sth->tid != gettid()) {
+ if(running)
+ buffer_strcat(thread_list, ", ");
+
+ buffer_sprintf(thread_list, "'%s' (%d)", sth->name, sth->tid);
+
+ running_services |= sth->services & service;
+ running++;
+ }
+ }
+
+ netdata_spinlock_unlock(&service_globals.lock);
+
+ if(running) {
+ log_countdown_ut -= (log_countdown_ut >= sleep_ut) ? sleep_ut : log_countdown_ut;
+ if(log_countdown_ut == 0 || running != last_running) {
+ log_countdown_ut = 20 * sleep_ut;
+
+ buffer_flush(service_list);
+ service_to_buffer(service_list, running_services);
+ info("SERVICE CONTROL: waiting for the following %zu services [ %s] to exit: %s",
+ running, buffer_tostring(service_list),
+ running <= 10 ? buffer_tostring(thread_list) : "");
+ }
+
+ sleep_usec(sleep_ut);
+ stale_time_ut += sleep_ut;
+ }
+
+ ended_ut = now_monotonic_usec();
+ } while(running && (ended_ut - started_ut < timeout_ut || stale_time_ut < timeout_ut));
+
+ if(running) {
+ buffer_flush(service_list);
+ service_to_buffer(service_list, running_services);
+ info("SERVICE CONTROL: "
+ "the following %zu service(s) [ %s] take too long to exit: %s; "
+ "giving up on them...",
+ running, buffer_tostring(service_list),
+ buffer_tostring(thread_list));
+ }
+
+ buffer_free(thread_list);
+ buffer_free(service_list);
+
+ return (running == 0);
+}
+
+#define delta_shutdown_time(msg) \
+ { \
+ usec_t now_ut = now_monotonic_usec(); \
+ if(prev_msg) \
+ info("NETDATA SHUTDOWN: in %7llu ms, %s%s - next: %s", (now_ut - last_ut) / USEC_PER_MS, (timeout)?"(TIMEOUT) ":"", prev_msg, msg); \
+ else \
+ info("NETDATA SHUTDOWN: next: %s", msg); \
+ last_ut = now_ut; \
+ prev_msg = msg; \
+ timeout = false; \
+ }
+
void netdata_cleanup_and_exit(int ret) {
- // enabling this, is wrong
- // because the threads will be cancelled while cleaning up
- // netdata_exit = 1;
+ usec_t started_ut = now_monotonic_usec();
+ usec_t last_ut = started_ut;
+ const char *prev_msg = NULL;
+ bool timeout = false;
error_log_limit_unlimited();
- info("EXIT: netdata prepares to exit with code %d...", ret);
+ info("NETDATA SHUTDOWN: initializing shutdown with code %d...", ret);
send_statistics("EXIT", ret?"ERROR":"OK","-");
+ delta_shutdown_time("create shutdown file");
+
char agent_crash_file[FILENAME_MAX + 1];
char agent_incomplete_shutdown_file[FILENAME_MAX + 1];
snprintfz(agent_crash_file, FILENAME_MAX, "%s/.agent_crash", netdata_configured_varlib_dir);
snprintfz(agent_incomplete_shutdown_file, FILENAME_MAX, "%s/.agent_incomplete_shutdown", netdata_configured_varlib_dir);
(void) rename(agent_crash_file, agent_incomplete_shutdown_file);
- // cleanup/save the database and exit
- info("EXIT: cleaning up the database...");
+#ifdef ENABLE_DBENGINE
+ if(dbengine_enabled) {
+ delta_shutdown_time("dbengine exit mode");
+ for (size_t tier = 0; tier < storage_tiers; tier++)
+ rrdeng_exit_mode(multidb_ctx[tier]);
+ }
+#endif
+
+ delta_shutdown_time("disable maintenance, new queries, new web requests, new streaming connections and aclk");
+
+ service_signal_exit(
+ SERVICE_MAINTENANCE
+ | ABILITY_DATA_QUERIES
+ | ABILITY_WEB_REQUESTS
+ | ABILITY_STREAMING_CONNECTIONS
+ | SERVICE_ACLK
+ );
+
+ delta_shutdown_time("stop replication, exporters, ML training, health and web servers threads");
+
+ timeout = !service_wait_exit(
+ SERVICE_REPLICATION
+ | SERVICE_EXPORTERS
+ | SERVICE_ML_TRAINING
+ | SERVICE_HEALTH
+ | SERVICE_WEB_SERVER
+ , 3 * USEC_PER_SEC);
+
+ delta_shutdown_time("stop collectors and streaming threads");
+
+ timeout = !service_wait_exit(
+ SERVICE_COLLECTORS
+ | SERVICE_STREAMING
+ , 3 * USEC_PER_SEC);
+
+ delta_shutdown_time("stop ML prediction and context threads");
+
+ timeout = !service_wait_exit(
+ SERVICE_ML_PREDICTION
+ | SERVICE_CONTEXT
+ , 3 * USEC_PER_SEC);
+
+ delta_shutdown_time("stop maintenance thread");
+
+ timeout = !service_wait_exit(
+ SERVICE_MAINTENANCE
+ , 3 * USEC_PER_SEC);
+
+ delta_shutdown_time("clean rrdhost database");
+
rrdhost_cleanup_all();
- if(!ret) {
- // exit cleanly
+ delta_shutdown_time("prepare metasync shutdown");
+
+ metadata_sync_shutdown_prepare();
- // stop everything
- info("EXIT: stopping static threads...");
#ifdef ENABLE_ACLK
- aclk_sync_exit_all();
+ delta_shutdown_time("signal aclk sync to stop");
+ aclk_sync_exit_all();
#endif
- cancel_main_threads();
- // free the database
- info("EXIT: freeing database memory...");
+ delta_shutdown_time("stop aclk threads");
+
+ timeout = !service_wait_exit(
+ SERVICE_ACLK
+ , 3 * USEC_PER_SEC);
+
+ delta_shutdown_time("stop all remaining worker threads");
+
+ timeout = !service_wait_exit(~0, 10 * USEC_PER_SEC);
+
+ delta_shutdown_time("cancel main threads");
+
+ cancel_main_threads();
+
+ if(!ret) {
+ // exit cleanly
+
#ifdef ENABLE_DBENGINE
if(dbengine_enabled) {
+ delta_shutdown_time("flush dbengine tiers");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_prepare_exit(multidb_ctx[tier]);
}
#endif
- metadata_sync_shutdown_prepare();
- rrdhost_free_all();
+
+ // free the database
+ delta_shutdown_time("stop collection for all hosts");
+
+ // rrdhost_free_all();
+ rrd_finalize_collection_for_all_hosts();
+
+ delta_shutdown_time("stop metasync threads");
+
metadata_sync_shutdown();
+
#ifdef ENABLE_DBENGINE
if(dbengine_enabled) {
+ delta_shutdown_time("wait for dbengine collectors to finish");
+
+ size_t running = 1;
+ while(running) {
+ running = 0;
+ for (size_t tier = 0; tier < storage_tiers; tier++)
+ running += rrdeng_collectors_running(multidb_ctx[tier]);
+
+ if(running)
+ sleep_usec(100 * USEC_PER_MS);
+ }
+
+ delta_shutdown_time("wait for dbengine main cache to finish flushing");
+
+ while (pgc_hot_and_dirty_entries(main_cache)) {
+ pgc_flush_all_hot_and_dirty_pages(main_cache, PGC_SECTION_ALL);
+ sleep_usec(100 * USEC_PER_MS);
+ }
+
+ delta_shutdown_time("stop dbengine tiers");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_exit(multidb_ctx[tier]);
}
#endif
}
+
+ delta_shutdown_time("close SQL context db");
+
sql_close_context_database();
+
+ delta_shutdown_time("closed SQL main db");
+
sql_close_database();
// unlink the pid
if(pidfile[0]) {
- info("EXIT: removing netdata PID file '%s'...", pidfile);
+ delta_shutdown_time("remove pid file");
+
if(unlink(pidfile) != 0)
error("EXIT: cannot unlink pidfile '%s'.", pidfile);
}
#ifdef ENABLE_HTTPS
+ delta_shutdown_time("free openssl structures");
security_clean_openssl();
#endif
- info("EXIT: all done - netdata is now exiting - bye bye...");
+
+ delta_shutdown_time("remove incomplete shutdown file");
+
(void) unlink(agent_incomplete_shutdown_file);
+
+ delta_shutdown_time("exit");
+
+ usec_t ended_ut = now_monotonic_usec();
+ info("NETDATA SHUTDOWN: completed in %llu ms - netdata is now exiting - bye bye...", (ended_ut - started_ut) / USEC_PER_MS);
exit(ret);
}
@@ -225,6 +623,32 @@ int killpid(pid_t pid) {
return ret;
}
+static void set_nofile_limit(struct rlimit *rl) {
+ // get the num files allowed
+ if(getrlimit(RLIMIT_NOFILE, rl) != 0) {
+ error("getrlimit(RLIMIT_NOFILE) failed");
+ return;
+ }
+
+ info("resources control: allowed file descriptors: soft = %zu, max = %zu",
+ (size_t) rl->rlim_cur, (size_t) rl->rlim_max);
+
+ // make the soft/hard limits equal
+ rl->rlim_cur = rl->rlim_max;
+ if (setrlimit(RLIMIT_NOFILE, rl) != 0) {
+ error("setrlimit(RLIMIT_NOFILE, { %zu, %zu }) failed", (size_t)rl->rlim_cur, (size_t)rl->rlim_max);
+ }
+
+ // sanity check to make sure we have enough file descriptors available to open
+ if (getrlimit(RLIMIT_NOFILE, rl) != 0) {
+ error("getrlimit(RLIMIT_NOFILE) failed");
+ return;
+ }
+
+ if (rl->rlim_cur < 1024)
+ error("Number of open file descriptors allowed for this process is too low (RLIMIT_NOFILE=%zu)", (size_t)rl->rlim_cur);
+}
+
void cancel_main_threads() {
error_log_limit_unlimited();
@@ -408,6 +832,9 @@ static void log_init(void) {
snprintfz(filename, FILENAME_MAX, "%s/error.log", netdata_configured_log_dir);
stderr_filename = config_get(CONFIG_SECTION_LOGS, "error", filename);
+ snprintfz(filename, FILENAME_MAX, "%s/collector.log", netdata_configured_log_dir);
+ stdcollector_filename = config_get(CONFIG_SECTION_LOGS, "collector", filename);
+
snprintfz(filename, FILENAME_MAX, "%s/access.log", netdata_configured_log_dir);
stdaccess_filename = config_get(CONFIG_SECTION_LOGS, "access", filename);
@@ -679,8 +1106,9 @@ static void get_netdata_configured_variables() {
// ------------------------------------------------------------------------
// get default Database Engine page cache size in MiB
- db_engine_use_malloc = config_get_boolean(CONFIG_SECTION_DB, "dbengine page cache with malloc", CONFIG_BOOLEAN_YES);
default_rrdeng_page_cache_mb = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page cache size MB", default_rrdeng_page_cache_mb);
+ db_engine_journal_check = config_get_boolean(CONFIG_SECTION_DB, "dbengine enable journal integrity check", CONFIG_BOOLEAN_NO);
+
if(default_rrdeng_page_cache_mb < RRDENG_MIN_PAGE_CACHE_SIZE_MB) {
error("Invalid page cache size %d given. Defaulting to %d.", default_rrdeng_page_cache_mb, RRDENG_MIN_PAGE_CACHE_SIZE_MB);
default_rrdeng_page_cache_mb = RRDENG_MIN_PAGE_CACHE_SIZE_MB;
@@ -731,14 +1159,14 @@ static void get_netdata_configured_variables() {
// --------------------------------------------------------------------
- rrdset_free_obsolete_time = config_get_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time);
+ rrdset_free_obsolete_time_s = config_get_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time_s);
// Current chart locking and invalidation scheme doesn't prevent Netdata from segmentation faults if a short
// cleanup delay is set. Extensive stress tests showed that 10 seconds is quite a safe delay. Look at
// https://github.com/netdata/netdata/pull/11222#issuecomment-868367920 for more information.
- if (rrdset_free_obsolete_time < 10) {
- rrdset_free_obsolete_time = 10;
+ if (rrdset_free_obsolete_time_s < 10) {
+ rrdset_free_obsolete_time_s = 10;
info("The \"cleanup obsolete charts after seconds\" option was set to 10 seconds.");
- config_set_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time);
+ config_set_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time_s);
}
gap_when_lost_iterations_above = (int)config_get_number(CONFIG_SECTION_DB, "gap when lost iterations above", gap_when_lost_iterations_above);
@@ -746,12 +1174,13 @@ static void get_netdata_configured_variables() {
gap_when_lost_iterations_above = 1;
config_set_number(CONFIG_SECTION_DB, "gap when lost iterations above", gap_when_lost_iterations_above);
}
+ gap_when_lost_iterations_above += 2;
// --------------------------------------------------------------------
// get various system parameters
get_system_HZ();
- get_system_cpus();
+ get_system_cpus_uncached();
get_system_pid_max();
@@ -874,7 +1303,30 @@ void post_conf_load(char **user)
appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", DEFAULT_CLOUD_BASE_URL);
}
+#define delta_startup_time(msg) \
+ { \
+ usec_t now_ut = now_monotonic_usec(); \
+ if(prev_msg) \
+ info("NETDATA STARTUP: in %7llu ms, %s - next: %s", (now_ut - last_ut) / USEC_PER_MS, prev_msg, msg); \
+ else \
+ info("NETDATA STARTUP: next: %s", msg); \
+ last_ut = now_ut; \
+ prev_msg = msg; \
+ }
+
+int pgc_unittest(void);
+int mrg_unittest(void);
+int julytest(void);
+
int main(int argc, char **argv) {
+ // initialize the system clocks
+ clocks_init();
+ usec_t started_ut = now_monotonic_usec();
+ usec_t last_ut = started_ut;
+ const char *prev_msg = NULL;
+ // Initialize stderror avoiding coredump when info() or error() is called
+ stderror = stderr;
+
int i;
int config_loaded = 0;
int dont_fork = 0;
@@ -1001,7 +1453,7 @@ int main(int argc, char **argv) {
default_health_enabled = 0;
storage_tiers = 1;
registry_init();
- if(rrd_init("unittest", NULL)) {
+ if(rrd_init("unittest", NULL, true)) {
fprintf(stderr, "rrd_init failed for unittest\n");
return 1;
}
@@ -1027,11 +1479,6 @@ int main(int argc, char **argv) {
else if(strcmp(optarg, "escapetest") == 0) {
return command_argument_sanitization_tests();
}
-#ifdef ENABLE_ML_TESTS
- else if(strcmp(optarg, "mltest") == 0) {
- return test_ml(argc, argv);
- }
-#endif
#ifdef ENABLE_DBENGINE
else if(strcmp(optarg, "mctest") == 0) {
unittest_running = true;
@@ -1061,6 +1508,18 @@ int main(int argc, char **argv) {
unittest_running = true;
return metadata_unittest();
}
+ else if(strcmp(optarg, "pgctest") == 0) {
+ unittest_running = true;
+ return pgc_unittest();
+ }
+ else if(strcmp(optarg, "mrgtest") == 0) {
+ unittest_running = true;
+ return mrg_unittest();
+ }
+ else if(strcmp(optarg, "julytest") == 0) {
+ unittest_running = true;
+ return julytest();
+ }
else if(strncmp(optarg, createdataset_string, strlen(createdataset_string)) == 0) {
optarg += strlen(createdataset_string);
unsigned history_seconds = strtoul(optarg, NULL, 0);
@@ -1304,19 +1763,14 @@ int main(int argc, char **argv) {
}
}
-#ifdef _SC_OPEN_MAX
if (close_open_fds == true) {
// close all open file descriptors, except the standard ones
// the caller may have left open files (lxc-attach has this issue)
- for(int fd = (int) (sysconf(_SC_OPEN_MAX) - 1); fd > 2; fd--)
- if(fd_is_valid(fd))
- close(fd);
+ for_each_open_fd(OPEN_FD_ACTION_CLOSE, OPEN_FD_EXCLUDE_STDIN | OPEN_FD_EXCLUDE_STDOUT | OPEN_FD_EXCLUDE_STDERR);
}
-#endif
- if(!config_loaded)
- {
+ if(!config_loaded) {
load_netdata_conf(NULL, 0);
post_conf_load(&user);
load_cloud_conf(0);
@@ -1327,7 +1781,6 @@ int main(int argc, char **argv) {
appconfig_set(&cloud_config, CONFIG_SECTION_GLOBAL, "enabled", "false");
}
-
// ------------------------------------------------------------------------
// initialize netdata
{
@@ -1347,12 +1800,29 @@ int main(int argc, char **argv) {
#endif
#endif
- // initialize the system clocks
- clocks_init();
+ // set libuv worker threads
+ libuv_worker_threads = (int)get_netdata_cpus() * 2;
- // prepare configuration environment variables for the plugins
+ if(libuv_worker_threads < MIN_LIBUV_WORKER_THREADS)
+ libuv_worker_threads = MIN_LIBUV_WORKER_THREADS;
- setenv("UV_THREADPOOL_SIZE", config_get(CONFIG_SECTION_GLOBAL, "libuv worker threads", "16"), 1);
+ if(libuv_worker_threads > MAX_LIBUV_WORKER_THREADS)
+ libuv_worker_threads = MAX_LIBUV_WORKER_THREADS;
+
+
+ libuv_worker_threads = config_get_number(CONFIG_SECTION_GLOBAL, "libuv worker threads", libuv_worker_threads);
+ if(libuv_worker_threads < MIN_LIBUV_WORKER_THREADS) {
+ libuv_worker_threads = MIN_LIBUV_WORKER_THREADS;
+ config_set_number(CONFIG_SECTION_GLOBAL, "libuv worker threads", libuv_worker_threads);
+ }
+
+ {
+ char buf[20 + 1];
+ snprintfz(buf, 20, "%d", libuv_worker_threads);
+ setenv("UV_THREADPOOL_SIZE", buf, 1);
+ }
+
+ // prepare configuration environment variables for the plugins
get_netdata_configured_variables();
set_global_environment();
@@ -1396,6 +1866,8 @@ int main(int argc, char **argv) {
// initialize the log files
open_all_log_files();
+ aral_judy_init();
+
get_system_timezone();
// --------------------------------------------------------------------
@@ -1414,6 +1886,7 @@ int main(int argc, char **argv) {
// --------------------------------------------------------------------
// Initialize ML configuration
+ delta_startup_time("initialize ML");
ml_init();
// --------------------------------------------------------------------
@@ -1422,19 +1895,18 @@ int main(int argc, char **argv) {
// block signals while initializing threads.
// this causes the threads to block signals.
+ delta_startup_time("initialize signals");
signals_block();
+ signals_init(); // setup the signals we want to use
- // setup the signals we want to use
+ // --------------------------------------------------------------------
+ // check which threads are enabled and initialize them
- signals_init();
+ delta_startup_time("initialize static threads");
// setup threads configs
default_stacksize = netdata_threads_init();
-
- // --------------------------------------------------------------------
- // check which threads are enabled and initialize them
-
for (i = 0; static_threads[i].name != NULL ; i++) {
struct netdata_static_thread *st = &static_threads[i];
@@ -1454,14 +1926,17 @@ int main(int argc, char **argv) {
// --------------------------------------------------------------------
// create the listening sockets
+ delta_startup_time("initialize web server");
+
web_client_api_v1_init();
web_server_threading_selection();
if(web_server_mode != WEB_SERVER_MODE_NONE)
api_listen_sockets_setup();
-
}
+ delta_startup_time("set resource limits");
+
#ifdef NETDATA_INTERNAL_CHECKS
if(debug_flags != 0) {
struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };
@@ -1473,11 +1948,9 @@ int main(int argc, char **argv) {
}
#endif /* NETDATA_INTERNAL_CHECKS */
- // get the max file limit
- if(getrlimit(RLIMIT_NOFILE, &rlimit_nofile) != 0)
- error("getrlimit(RLIMIT_NOFILE) failed");
- else
- info("resources control: allowed file descriptors: soft = %zu, max = %zu", (size_t)rlimit_nofile.rlim_cur, (size_t)rlimit_nofile.rlim_max);
+ set_nofile_limit(&rlimit_nofile);
+
+ delta_startup_time("become daemon");
// fork, switch user, create pid file, set process priority
if(become_daemon(dont_fork, user) == -1)
@@ -1485,12 +1958,18 @@ int main(int argc, char **argv) {
info("netdata started on pid %d.", getpid());
+ delta_startup_time("initialize threads after fork");
+
netdata_threads_init_after_fork((size_t)config_get_number(CONFIG_SECTION_GLOBAL, "pthread stack size", (long)default_stacksize));
// initialize internal registry
+ delta_startup_time("initialize registry");
registry_init();
+
// fork the spawn server
+ delta_startup_time("fork the spawn server");
spawn_init();
+
/*
* Libuv uv_spawn() uses SIGCHLD internally:
* https://github.com/libuv/libuv/blob/cc51217a317e96510fbb284721d5e6bc2af31e33/src/unix/process.c#L485
@@ -1503,15 +1982,22 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// initialize rrd, registry, health, rrdpush, etc.
+ delta_startup_time("collecting system info");
+
netdata_anonymous_statistics_enabled=-1;
struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info));
+ __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED);
get_system_info(system_info);
system_info->hops = 0;
get_install_type(&system_info->install_type, &system_info->prebuilt_arch, &system_info->prebuilt_dist);
- if(rrd_init(netdata_configured_hostname, system_info))
+ delta_startup_time("initialize RRD structures");
+
+ if(rrd_init(netdata_configured_hostname, system_info, false))
fatal("Cannot initialize localhost instance with name '%s'.", netdata_configured_hostname);
+ delta_startup_time("check for incomplete shutdown");
+
char agent_crash_file[FILENAME_MAX + 1];
char agent_incomplete_shutdown_file[FILENAME_MAX + 1];
snprintfz(agent_incomplete_shutdown_file, FILENAME_MAX, "%s/.agent_incomplete_shutdown", netdata_configured_varlib_dir);
@@ -1526,6 +2012,8 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// Claim netdata agent to a cloud endpoint
+ delta_startup_time("collect claiming info");
+
if (claiming_pending_arguments)
claim_agent(claiming_pending_arguments);
load_claiming_state();
@@ -1536,11 +2024,14 @@ int main(int argc, char **argv) {
error_log_limit_reset();
// Load host labels
+ delta_startup_time("collect host labels");
reload_host_labels();
// ------------------------------------------------------------------------
// spawn the threads
+ delta_startup_time("start the static threads");
+
web_server_config_options();
netdata_zero_metrics_enabled = config_get_boolean_ondemand(CONFIG_SECTION_DB, "enable zero metrics", CONFIG_BOOLEAN_NO);
@@ -1561,9 +2052,14 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// Initialize netdata agent command serving from cli and signals
+ delta_startup_time("initialize commands API");
+
commands_init();
- info("netdata initialization completed. Enjoy real-time performance monitoring!");
+ delta_startup_time("ready");
+
+ usec_t ready_ut = now_monotonic_usec();
+ info("NETDATA STARTUP: completed in %llu ms. Enjoy real-time performance monitoring!", (ready_ut - started_ut) / USEC_PER_MS);
netdata_ready = 1;
send_statistics("START", "-", "-");