summaryrefslogtreecommitdiffstats
path: root/daemon/main.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-03-09 13:19:22 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-03-09 13:19:22 +0000
commitc21c3b0befeb46a51b6bf3758ffa30813bea0ff0 (patch)
tree9754ff1ca740f6346cf8483ec915d4054bc5da2d /daemon/main.c
parentAdding upstream version 1.43.2. (diff)
downloadnetdata-upstream/1.44.3.tar.xz
netdata-upstream/1.44.3.zip
Adding upstream version 1.44.3.upstream/1.44.3
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'daemon/main.c')
-rw-r--r--daemon/main.c238
1 files changed, 175 insertions, 63 deletions
diff --git a/daemon/main.c b/daemon/main.c
index 5d25f88b5..3e1fda963 100644
--- a/daemon/main.c
+++ b/daemon/main.c
@@ -4,6 +4,8 @@
#include "buildinfo.h"
#include "static_threads.h"
+#include "database/engine/page_test.h"
+
#if defined(ENV32BIT)
#warning COMPILING 32BIT NETDATA
#endif
@@ -313,7 +315,7 @@ void netdata_cleanup_and_exit(int ret) {
const char *prev_msg = NULL;
bool timeout = false;
- error_log_limit_unlimited();
+ nd_log_limits_unlimited();
netdata_log_info("NETDATA SHUTDOWN: initializing shutdown with code %d...", ret);
send_statistics("EXIT", ret?"ERROR":"OK","-");
@@ -371,6 +373,10 @@ void netdata_cleanup_and_exit(int ret) {
SERVICE_REPLICATION // replication has to be stopped after STREAMING, because it cleans up ARAL
, 3 * USEC_PER_SEC);
+ delta_shutdown_time("prepare metasync shutdown");
+
+ metadata_sync_shutdown_prepare();
+
delta_shutdown_time("disable ML detection and training threads");
ml_stop_threads();
@@ -396,10 +402,6 @@ void netdata_cleanup_and_exit(int ret) {
rrdhost_cleanup_all();
- delta_shutdown_time("prepare metasync shutdown");
-
- metadata_sync_shutdown_prepare();
-
delta_shutdown_time("stop aclk threads");
timeout = !service_wait_exit(
@@ -422,6 +424,13 @@ void netdata_cleanup_and_exit(int ret) {
delta_shutdown_time("flush dbengine tiers");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_prepare_exit(multidb_ctx[tier]);
+
+ for (size_t tier = 0; tier < storage_tiers; tier++) {
+ if (!multidb_ctx[tier])
+ continue;
+ completion_wait_for(&multidb_ctx[tier]->quiesce.completion);
+ completion_destroy(&multidb_ctx[tier]->quiesce.completion);
+ }
}
#endif
@@ -440,17 +449,20 @@ void netdata_cleanup_and_exit(int ret) {
delta_shutdown_time("wait for dbengine collectors to finish");
size_t running = 1;
- while(running) {
+ size_t count = 10;
+ while(running && count) {
running = 0;
for (size_t tier = 0; tier < storage_tiers; tier++)
running += rrdeng_collectors_running(multidb_ctx[tier]);
if(running) {
- error_limit_static_thread_var(erl, 1, 100 * USEC_PER_MS);
- error_limit(&erl, "waiting for %zu collectors to finish", running);
+ nd_log_limit_static_thread_var(erl, 1, 100 * USEC_PER_MS);
+ nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
+ "waiting for %zu collectors to finish", running);
// sleep_usec(100 * USEC_PER_MS);
cleanup_destroyed_dictionaries();
}
+ count--;
}
delta_shutdown_time("wait for dbengine main cache to finish flushing");
@@ -463,6 +475,8 @@ void netdata_cleanup_and_exit(int ret) {
delta_shutdown_time("stop dbengine tiers");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_exit(multidb_ctx[tier]);
+
+ rrdeng_enq_cmd(NULL, RRDENG_OPCODE_SHUTDOWN_EVLOOP, NULL, NULL, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL);
}
#endif
}
@@ -614,8 +628,14 @@ int killpid(pid_t pid) {
int ret;
netdata_log_debug(D_EXIT, "Request to kill pid %d", pid);
+ int signal = SIGTERM;
+//#ifdef NETDATA_INTERNAL_CHECKS
+// if(service_running(SERVICE_COLLECTORS))
+// signal = SIGABRT;
+//#endif
+
errno = 0;
- ret = kill(pid, SIGTERM);
+ ret = kill(pid, signal);
if (ret == -1) {
switch(errno) {
case ESRCH:
@@ -662,7 +682,7 @@ static void set_nofile_limit(struct rlimit *rl) {
}
void cancel_main_threads() {
- error_log_limit_unlimited();
+ nd_log_limits_unlimited();
int i, found = 0;
usec_t max = 5 * USEC_PER_SEC, step = 100000;
@@ -752,7 +772,7 @@ int help(int exitcode) {
" | '-' '-' '-' '-' real-time performance monitoring, done right! \n"
" +----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--->\n"
"\n"
- " Copyright (C) 2016-2022, Netdata, Inc. <info@netdata.cloud>\n"
+ " Copyright (C) 2016-2023, Netdata, Inc. <info@netdata.cloud>\n"
" Released under GNU General Public License v3 or later.\n"
" All rights reserved.\n"
"\n"
@@ -790,6 +810,7 @@ int help(int exitcode) {
" -W unittest Run internal unittests and exit.\n\n"
" -W sqlite-meta-recover Run recovery on the metadata database and exit.\n\n"
" -W sqlite-compact Reclaim metadata database unused space and exit.\n\n"
+ " -W sqlite-analyze Run update statistics and exit.\n\n"
#ifdef ENABLE_DBENGINE
" -W createdataset=N Create a DB engine dataset of N seconds and exit.\n\n"
" -W stresstest=A,B,C,D,E,F,G\n"
@@ -841,44 +862,52 @@ static void security_init(){
#endif
static void log_init(void) {
+ nd_log_set_facility(config_get(CONFIG_SECTION_LOGS, "facility", "daemon"));
+
+ time_t period = ND_LOG_DEFAULT_THROTTLE_PERIOD;
+ size_t logs = ND_LOG_DEFAULT_THROTTLE_LOGS;
+ period = config_get_number(CONFIG_SECTION_LOGS, "logs flood protection period", period);
+ logs = (unsigned long)config_get_number(CONFIG_SECTION_LOGS, "logs to trigger flood protection", (long long int)logs);
+ nd_log_set_flood_protection(logs, period);
+
+ const char *netdata_log_level = getenv("NETDATA_LOG_LEVEL");
+ netdata_log_level = netdata_log_level ? nd_log_id2priority(nd_log_priority2id(netdata_log_level)) : NDLP_INFO_STR;
+
+ nd_log_set_priority_level(config_get(CONFIG_SECTION_LOGS, "level", netdata_log_level));
+
char filename[FILENAME_MAX + 1];
snprintfz(filename, FILENAME_MAX, "%s/debug.log", netdata_configured_log_dir);
- stdout_filename = config_get(CONFIG_SECTION_LOGS, "debug", filename);
+ nd_log_set_user_settings(NDLS_DEBUG, config_get(CONFIG_SECTION_LOGS, "debug", filename));
- snprintfz(filename, FILENAME_MAX, "%s/error.log", netdata_configured_log_dir);
- stderr_filename = config_get(CONFIG_SECTION_LOGS, "error", filename);
+ bool with_journal = is_stderr_connected_to_journal() /* || nd_log_journal_socket_available() */;
+ if(with_journal)
+ snprintfz(filename, FILENAME_MAX, "journal");
+ else
+ snprintfz(filename, FILENAME_MAX, "%s/daemon.log", netdata_configured_log_dir);
+ nd_log_set_user_settings(NDLS_DAEMON, config_get(CONFIG_SECTION_LOGS, "daemon", filename));
- snprintfz(filename, FILENAME_MAX, "%s/collector.log", netdata_configured_log_dir);
- stdcollector_filename = config_get(CONFIG_SECTION_LOGS, "collector", filename);
+ if(with_journal)
+ snprintfz(filename, FILENAME_MAX, "journal");
+ else
+ snprintfz(filename, FILENAME_MAX, "%s/collector.log", netdata_configured_log_dir);
+ nd_log_set_user_settings(NDLS_COLLECTORS, config_get(CONFIG_SECTION_LOGS, "collector", filename));
snprintfz(filename, FILENAME_MAX, "%s/access.log", netdata_configured_log_dir);
- stdaccess_filename = config_get(CONFIG_SECTION_LOGS, "access", filename);
+ nd_log_set_user_settings(NDLS_ACCESS, config_get(CONFIG_SECTION_LOGS, "access", filename));
- snprintfz(filename, FILENAME_MAX, "%s/health.log", netdata_configured_log_dir);
- stdhealth_filename = config_get(CONFIG_SECTION_LOGS, "health", filename);
+ if(with_journal)
+ snprintfz(filename, FILENAME_MAX, "journal");
+ else
+ snprintfz(filename, FILENAME_MAX, "%s/health.log", netdata_configured_log_dir);
+ nd_log_set_user_settings(NDLS_HEALTH, config_get(CONFIG_SECTION_LOGS, "health", filename));
#ifdef ENABLE_ACLK
aclklog_enabled = config_get_boolean(CONFIG_SECTION_CLOUD, "conversation log", CONFIG_BOOLEAN_NO);
if (aclklog_enabled) {
snprintfz(filename, FILENAME_MAX, "%s/aclk.log", netdata_configured_log_dir);
- aclklog_filename = config_get(CONFIG_SECTION_CLOUD, "conversation log file", filename);
+ nd_log_set_user_settings(NDLS_ACLK, config_get(CONFIG_SECTION_CLOUD, "conversation log file", filename));
}
#endif
-
- char deffacility[8];
- snprintfz(deffacility,7,"%s","daemon");
- facility_log = config_get(CONFIG_SECTION_LOGS, "facility", deffacility);
-
- error_log_throttle_period = config_get_number(CONFIG_SECTION_LOGS, "errors flood protection period", error_log_throttle_period);
- error_log_errors_per_period = (unsigned long)config_get_number(CONFIG_SECTION_LOGS, "errors to trigger flood protection", (long long int)error_log_errors_per_period);
- error_log_errors_per_period_backup = error_log_errors_per_period;
-
- setenv("NETDATA_ERRORS_THROTTLE_PERIOD", config_get(CONFIG_SECTION_LOGS, "errors flood protection period" , ""), 1);
- setenv("NETDATA_ERRORS_PER_PERIOD", config_get(CONFIG_SECTION_LOGS, "errors to trigger flood protection", ""), 1);
-
- char *selected_level = config_get(CONFIG_SECTION_LOGS, "severity level", NETDATA_LOG_LEVEL_INFO_STR);
- global_log_severity_level = log_severity_string_to_severity_level(selected_level);
- setenv("NETDATA_LOG_SEVERITY_LEVEL", selected_level , 1);
}
char *initialize_lock_directory_path(char *prefix)
@@ -1050,6 +1079,36 @@ static void backwards_compatible_config() {
config_move(CONFIG_SECTION_GLOBAL, "enable zero metrics",
CONFIG_SECTION_DB, "enable zero metrics");
+ config_move(CONFIG_SECTION_LOGS, "error",
+ CONFIG_SECTION_LOGS, "daemon");
+
+ config_move(CONFIG_SECTION_LOGS, "severity level",
+ CONFIG_SECTION_LOGS, "level");
+
+ config_move(CONFIG_SECTION_LOGS, "errors to trigger flood protection",
+ CONFIG_SECTION_LOGS, "logs to trigger flood protection");
+
+ config_move(CONFIG_SECTION_LOGS, "errors flood protection period",
+ CONFIG_SECTION_LOGS, "logs flood protection period");
+ config_move(CONFIG_SECTION_HEALTH, "is ephemeral",
+ CONFIG_SECTION_GLOBAL, "is ephemeral node");
+
+ config_move(CONFIG_SECTION_HEALTH, "has unstable connection",
+ CONFIG_SECTION_GLOBAL, "has unstable connection");
+}
+
+static int get_hostname(char *buf, size_t buf_size) {
+ if (netdata_configured_host_prefix && *netdata_configured_host_prefix) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/etc/hostname", netdata_configured_host_prefix);
+
+ if (!read_file(filename, buf, buf_size)) {
+ trim(buf);
+ return 0;
+ }
+ }
+
+ return gethostname(buf, buf_size);
}
static void get_netdata_configured_variables() {
@@ -1058,10 +1117,12 @@ static void get_netdata_configured_variables() {
// ------------------------------------------------------------------------
// get the hostname
+ netdata_configured_host_prefix = config_get(CONFIG_SECTION_GLOBAL, "host access prefix", "");
+ verify_netdata_host_prefix(true);
+
char buf[HOSTNAME_MAX + 1];
- if(gethostname(buf, HOSTNAME_MAX) == -1){
+ if (get_hostname(buf, HOSTNAME_MAX))
netdata_log_error("Cannot get machine hostname.");
- }
netdata_configured_hostname = config_get(CONFIG_SECTION_GLOBAL, "hostname", buf);
netdata_log_debug(D_OPTIONS, "hostname set to '%s'", netdata_configured_hostname);
@@ -1112,8 +1173,6 @@ static void get_netdata_configured_variables() {
netdata_configured_web_dir = config_get(CONFIG_SECTION_DIRECTORIES, "web", netdata_configured_web_dir);
netdata_configured_cache_dir = config_get(CONFIG_SECTION_DIRECTORIES, "cache", netdata_configured_cache_dir);
netdata_configured_varlib_dir = config_get(CONFIG_SECTION_DIRECTORIES, "lib", netdata_configured_varlib_dir);
- char *env_home=getenv("HOME");
- netdata_configured_home_dir = config_get(CONFIG_SECTION_DIRECTORIES, "home", env_home?env_home:netdata_configured_home_dir);
netdata_configured_lock_dir = initialize_lock_directory_path(netdata_configured_varlib_dir);
@@ -1124,6 +1183,16 @@ static void get_netdata_configured_variables() {
#ifdef ENABLE_DBENGINE
// ------------------------------------------------------------------------
+ // get default Database Engine page type
+
+ const char *page_type = config_get(CONFIG_SECTION_DB, "dbengine page type", "raw");
+ if (strcmp(page_type, "gorilla") == 0) {
+ tier_page_type[0] = PAGE_GORILLA_METRICS;
+ } else if (strcmp(page_type, "raw") != 0) {
+ netdata_log_error("Invalid dbengine page type ''%s' given. Defaulting to 'raw'.", page_type);
+ }
+
+ // ------------------------------------------------------------------------
// get default Database Engine page cache size in MiB
default_rrdeng_page_cache_mb = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page cache size MB", default_rrdeng_page_cache_mb);
@@ -1161,10 +1230,6 @@ static void get_netdata_configured_variables() {
default_rrd_memory_mode = RRD_MEMORY_MODE_SAVE;
}
#endif
- // ------------------------------------------------------------------------
-
- netdata_configured_host_prefix = config_get(CONFIG_SECTION_GLOBAL, "host access prefix", "");
- verify_netdata_host_prefix();
// --------------------------------------------------------------------
// get KSM settings
@@ -1184,6 +1249,7 @@ static void get_netdata_configured_variables() {
// --------------------------------------------------------------------
rrdset_free_obsolete_time_s = config_get_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time_s);
+ rrdhost_free_ephemeral_time_s = config_get_number(CONFIG_SECTION_DB, "cleanup ephemeral hosts after secs", rrdhost_free_ephemeral_time_s);
// Current chart locking and invalidation scheme doesn't prevent Netdata from segmentation faults if a short
// cleanup delay is set. Extensive stress tests showed that 10 seconds is quite a safe delay. Look at
// https://github.com/netdata/netdata/pull/11222#issuecomment-868367920 for more information.
@@ -1262,7 +1328,7 @@ static inline void coverity_remove_taint(char *s)
(void)s;
}
-int get_system_info(struct rrdhost_system_info *system_info, bool log) {
+int get_system_info(struct rrdhost_system_info *system_info) {
char *script;
script = mallocz(sizeof(char) * (strlen(netdata_configured_primary_plugins_dir) + strlen("system-info.sh") + 2));
sprintf(script, "%s/%s", netdata_configured_primary_plugins_dir, "system-info.sh");
@@ -1294,11 +1360,7 @@ int get_system_info(struct rrdhost_system_info *system_info, bool log) {
if(unlikely(rrdhost_set_system_info_variable(system_info, line, value))) {
netdata_log_error("Unexpected environment variable %s=%s", line, value);
- }
- else {
- if(log)
- netdata_log_info("%s=%s", line, value);
-
+ } else {
setenv(line, value, 1);
}
}
@@ -1337,6 +1399,8 @@ int julytest(void);
int pluginsd_parser_unittest(void);
void replication_initialize(void);
void bearer_tokens_init(void);
+int unittest_rrdpush_compressions(void);
+int uuid_unittest(void);
int main(int argc, char **argv) {
// initialize the system clocks
@@ -1346,8 +1410,6 @@ int main(int argc, char **argv) {
usec_t started_ut = now_monotonic_usec();
usec_t last_ut = started_ut;
const char *prev_msg = NULL;
- // Initialize stderror avoiding coredump when netdata_log_info() or netdata_log_error() is called
- stderror = stderr;
int i;
int config_loaded = 0;
@@ -1439,6 +1501,10 @@ int main(int argc, char **argv) {
#ifdef ENABLE_DBENGINE
char* createdataset_string = "createdataset=";
char* stresstest_string = "stresstest=";
+
+ if(strcmp(optarg, "pgd-tests") == 0) {
+ return pgd_test(argc, argv);
+ }
#endif
if(strcmp(optarg, "sqlite-meta-recover") == 0) {
@@ -1451,6 +1517,11 @@ int main(int argc, char **argv) {
return 0;
}
+ if(strcmp(optarg, "sqlite-analyze") == 0) {
+ sql_init_database(DB_CHECK_ANALYZE, 0);
+ return 0;
+ }
+
if(strcmp(optarg, "unittest") == 0) {
unittest_running = true;
@@ -1495,6 +1566,8 @@ int main(int argc, char **argv) {
return 1;
if (ctx_unittest())
return 1;
+ if (uuid_unittest())
+ return 1;
fprintf(stderr, "\n\nALL TESTS PASSED\n\n");
return 0;
}
@@ -1521,6 +1594,10 @@ int main(int argc, char **argv) {
unittest_running = true;
return buffer_unittest();
}
+ else if(strcmp(optarg, "uuidtest") == 0) {
+ unittest_running = true;
+ return uuid_unittest();
+ }
#ifdef ENABLE_DBENGINE
else if(strcmp(optarg, "mctest") == 0) {
unittest_running = true;
@@ -1550,6 +1627,10 @@ int main(int argc, char **argv) {
unittest_running = true;
return pluginsd_parser_unittest();
}
+ else if(strcmp(optarg, "rrdpush_compressions_test") == 0) {
+ unittest_running = true;
+ return unittest_rrdpush_compressions();
+ }
else if(strncmp(optarg, createdataset_string, strlen(createdataset_string)) == 0) {
optarg += strlen(createdataset_string);
unsigned history_seconds = strtoul(optarg, NULL, 0);
@@ -1851,7 +1932,7 @@ int main(int argc, char **argv) {
{
char buf[20 + 1];
- snprintfz(buf, 20, "%d", libuv_worker_threads);
+ snprintfz(buf, sizeof(buf) - 1, "%d", libuv_worker_threads);
setenv("UV_THREADPOOL_SIZE", buf, 1);
}
@@ -1894,13 +1975,15 @@ int main(int argc, char **argv) {
// get log filenames and settings
log_init();
- error_log_limit_unlimited();
+ nd_log_limits_unlimited();
// initialize the log files
- open_all_log_files();
+ nd_log_initialize();
netdata_log_info("Netdata agent version \""VERSION"\" is starting");
ieee754_doubles = is_system_ieee754_double();
+ if(!ieee754_doubles)
+ globally_disabled_capabilities |= STREAM_CAP_IEEE754;
aral_judy_init();
@@ -1925,11 +2008,11 @@ int main(int argc, char **argv) {
set_silencers_filename();
health_initialize_global_silencers();
- // --------------------------------------------------------------------
- // Initialize ML configuration
-
- delta_startup_time("initialize ML");
- ml_init();
+// // --------------------------------------------------------------------
+// // Initialize ML configuration
+//
+// delta_startup_time("initialize ML");
+// ml_init();
// --------------------------------------------------------------------
// setup process signals
@@ -1949,6 +2032,15 @@ int main(int argc, char **argv) {
// setup threads configs
default_stacksize = netdata_threads_init();
+#ifdef NETDATA_INTERNAL_CHECKS
+ config_set_boolean(CONFIG_SECTION_PLUGINS, "netdata monitoring", true);
+ config_set_boolean(CONFIG_SECTION_PLUGINS, "netdata monitoring extended", true);
+#endif
+
+ if(config_get_boolean(CONFIG_SECTION_PLUGINS, "netdata monitoring extended", false))
+ // this has to run before starting any other threads that use workers
+ workers_utilization_enable();
+
for (i = 0; static_threads[i].name != NULL ; i++) {
struct netdata_static_thread *st = &static_threads[i];
@@ -1973,8 +2065,18 @@ int main(int argc, char **argv) {
web_client_api_v1_init();
web_server_threading_selection();
- if(web_server_mode != WEB_SERVER_MODE_NONE)
- api_listen_sockets_setup();
+ if(web_server_mode != WEB_SERVER_MODE_NONE) {
+ if (!api_listen_sockets_setup()) {
+ netdata_log_error("Cannot setup listen port(s). Is Netdata already running?");
+ exit(1);
+ }
+ }
+
+ // --------------------------------------------------------------------
+ // Initialize ML configuration
+
+ delta_startup_time("initialize ML");
+ ml_init();
#ifdef ENABLE_H2O
delta_startup_time("initialize h2o server");
@@ -2006,6 +2108,16 @@ int main(int argc, char **argv) {
if(become_daemon(dont_fork, user) == -1)
fatal("Cannot daemonize myself.");
+ // The "HOME" env var points to the root's home dir because Netdata starts as root. Can't use "HOME".
+ struct passwd *pw = getpwuid(getuid());
+ if (config_exists(CONFIG_SECTION_DIRECTORIES, "home") || !pw || !pw->pw_dir) {
+ netdata_configured_home_dir = config_get(CONFIG_SECTION_DIRECTORIES, "home", netdata_configured_home_dir);
+ } else {
+ netdata_configured_home_dir = config_get(CONFIG_SECTION_DIRECTORIES, "home", pw->pw_dir);
+ }
+
+ setenv("HOME", netdata_configured_home_dir, 1);
+
dyn_conf_init();
netdata_log_info("netdata started on pid %d.", getpid());
@@ -2039,7 +2151,7 @@ int main(int argc, char **argv) {
netdata_anonymous_statistics_enabled=-1;
struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info));
__atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED);
- get_system_info(system_info, true);
+ get_system_info(system_info);
(void) registry_get_this_machine_guid();
system_info->hops = 0;
get_install_type(&system_info->install_type, &system_info->prebuilt_arch, &system_info->prebuilt_dist);
@@ -2076,7 +2188,7 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// enable log flood protection
- error_log_limit_reset();
+ nd_log_limits_reset();
// Load host labels
delta_startup_time("collect host labels");