diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:19:22 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:19:22 +0000 |
commit | c21c3b0befeb46a51b6bf3758ffa30813bea0ff0 (patch) | |
tree | 9754ff1ca740f6346cf8483ec915d4054bc5da2d /daemon/main.c | |
parent | Adding upstream version 1.43.2. (diff) | |
download | netdata-upstream/1.44.3.tar.xz netdata-upstream/1.44.3.zip |
Adding upstream version 1.44.3. (upstream/1.44.3)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'daemon/main.c')
-rw-r--r-- | daemon/main.c | 238 |
1 files changed, 175 insertions, 63 deletions
diff --git a/daemon/main.c b/daemon/main.c index 5d25f88b5..3e1fda963 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -4,6 +4,8 @@ #include "buildinfo.h" #include "static_threads.h" +#include "database/engine/page_test.h" + #if defined(ENV32BIT) #warning COMPILING 32BIT NETDATA #endif @@ -313,7 +315,7 @@ void netdata_cleanup_and_exit(int ret) { const char *prev_msg = NULL; bool timeout = false; - error_log_limit_unlimited(); + nd_log_limits_unlimited(); netdata_log_info("NETDATA SHUTDOWN: initializing shutdown with code %d...", ret); send_statistics("EXIT", ret?"ERROR":"OK","-"); @@ -371,6 +373,10 @@ void netdata_cleanup_and_exit(int ret) { SERVICE_REPLICATION // replication has to be stopped after STREAMING, because it cleans up ARAL , 3 * USEC_PER_SEC); + delta_shutdown_time("prepare metasync shutdown"); + + metadata_sync_shutdown_prepare(); + delta_shutdown_time("disable ML detection and training threads"); ml_stop_threads(); @@ -396,10 +402,6 @@ void netdata_cleanup_and_exit(int ret) { rrdhost_cleanup_all(); - delta_shutdown_time("prepare metasync shutdown"); - - metadata_sync_shutdown_prepare(); - delta_shutdown_time("stop aclk threads"); timeout = !service_wait_exit( @@ -422,6 +424,13 @@ void netdata_cleanup_and_exit(int ret) { delta_shutdown_time("flush dbengine tiers"); for (size_t tier = 0; tier < storage_tiers; tier++) rrdeng_prepare_exit(multidb_ctx[tier]); + + for (size_t tier = 0; tier < storage_tiers; tier++) { + if (!multidb_ctx[tier]) + continue; + completion_wait_for(&multidb_ctx[tier]->quiesce.completion); + completion_destroy(&multidb_ctx[tier]->quiesce.completion); + } } #endif @@ -440,17 +449,20 @@ void netdata_cleanup_and_exit(int ret) { delta_shutdown_time("wait for dbengine collectors to finish"); size_t running = 1; - while(running) { + size_t count = 10; + while(running && count) { running = 0; for (size_t tier = 0; tier < storage_tiers; tier++) running += rrdeng_collectors_running(multidb_ctx[tier]); if(running) { - 
error_limit_static_thread_var(erl, 1, 100 * USEC_PER_MS); - error_limit(&erl, "waiting for %zu collectors to finish", running); + nd_log_limit_static_thread_var(erl, 1, 100 * USEC_PER_MS); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE, + "waiting for %zu collectors to finish", running); // sleep_usec(100 * USEC_PER_MS); cleanup_destroyed_dictionaries(); } + count--; } delta_shutdown_time("wait for dbengine main cache to finish flushing"); @@ -463,6 +475,8 @@ void netdata_cleanup_and_exit(int ret) { delta_shutdown_time("stop dbengine tiers"); for (size_t tier = 0; tier < storage_tiers; tier++) rrdeng_exit(multidb_ctx[tier]); + + rrdeng_enq_cmd(NULL, RRDENG_OPCODE_SHUTDOWN_EVLOOP, NULL, NULL, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL); } #endif } @@ -614,8 +628,14 @@ int killpid(pid_t pid) { int ret; netdata_log_debug(D_EXIT, "Request to kill pid %d", pid); + int signal = SIGTERM; +//#ifdef NETDATA_INTERNAL_CHECKS +// if(service_running(SERVICE_COLLECTORS)) +// signal = SIGABRT; +//#endif + errno = 0; - ret = kill(pid, SIGTERM); + ret = kill(pid, signal); if (ret == -1) { switch(errno) { case ESRCH: @@ -662,7 +682,7 @@ static void set_nofile_limit(struct rlimit *rl) { } void cancel_main_threads() { - error_log_limit_unlimited(); + nd_log_limits_unlimited(); int i, found = 0; usec_t max = 5 * USEC_PER_SEC, step = 100000; @@ -752,7 +772,7 @@ int help(int exitcode) { " | '-' '-' '-' '-' real-time performance monitoring, done right! \n" " +----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--->\n" "\n" - " Copyright (C) 2016-2022, Netdata, Inc. <info@netdata.cloud>\n" + " Copyright (C) 2016-2023, Netdata, Inc. 
<info@netdata.cloud>\n" " Released under GNU General Public License v3 or later.\n" " All rights reserved.\n" "\n" @@ -790,6 +810,7 @@ int help(int exitcode) { " -W unittest Run internal unittests and exit.\n\n" " -W sqlite-meta-recover Run recovery on the metadata database and exit.\n\n" " -W sqlite-compact Reclaim metadata database unused space and exit.\n\n" + " -W sqlite-analyze Run update statistics and exit.\n\n" #ifdef ENABLE_DBENGINE " -W createdataset=N Create a DB engine dataset of N seconds and exit.\n\n" " -W stresstest=A,B,C,D,E,F,G\n" @@ -841,44 +862,52 @@ static void security_init(){ #endif static void log_init(void) { + nd_log_set_facility(config_get(CONFIG_SECTION_LOGS, "facility", "daemon")); + + time_t period = ND_LOG_DEFAULT_THROTTLE_PERIOD; + size_t logs = ND_LOG_DEFAULT_THROTTLE_LOGS; + period = config_get_number(CONFIG_SECTION_LOGS, "logs flood protection period", period); + logs = (unsigned long)config_get_number(CONFIG_SECTION_LOGS, "logs to trigger flood protection", (long long int)logs); + nd_log_set_flood_protection(logs, period); + + const char *netdata_log_level = getenv("NETDATA_LOG_LEVEL"); + netdata_log_level = netdata_log_level ? 
nd_log_id2priority(nd_log_priority2id(netdata_log_level)) : NDLP_INFO_STR; + + nd_log_set_priority_level(config_get(CONFIG_SECTION_LOGS, "level", netdata_log_level)); + char filename[FILENAME_MAX + 1]; snprintfz(filename, FILENAME_MAX, "%s/debug.log", netdata_configured_log_dir); - stdout_filename = config_get(CONFIG_SECTION_LOGS, "debug", filename); + nd_log_set_user_settings(NDLS_DEBUG, config_get(CONFIG_SECTION_LOGS, "debug", filename)); - snprintfz(filename, FILENAME_MAX, "%s/error.log", netdata_configured_log_dir); - stderr_filename = config_get(CONFIG_SECTION_LOGS, "error", filename); + bool with_journal = is_stderr_connected_to_journal() /* || nd_log_journal_socket_available() */; + if(with_journal) + snprintfz(filename, FILENAME_MAX, "journal"); + else + snprintfz(filename, FILENAME_MAX, "%s/daemon.log", netdata_configured_log_dir); + nd_log_set_user_settings(NDLS_DAEMON, config_get(CONFIG_SECTION_LOGS, "daemon", filename)); - snprintfz(filename, FILENAME_MAX, "%s/collector.log", netdata_configured_log_dir); - stdcollector_filename = config_get(CONFIG_SECTION_LOGS, "collector", filename); + if(with_journal) + snprintfz(filename, FILENAME_MAX, "journal"); + else + snprintfz(filename, FILENAME_MAX, "%s/collector.log", netdata_configured_log_dir); + nd_log_set_user_settings(NDLS_COLLECTORS, config_get(CONFIG_SECTION_LOGS, "collector", filename)); snprintfz(filename, FILENAME_MAX, "%s/access.log", netdata_configured_log_dir); - stdaccess_filename = config_get(CONFIG_SECTION_LOGS, "access", filename); + nd_log_set_user_settings(NDLS_ACCESS, config_get(CONFIG_SECTION_LOGS, "access", filename)); - snprintfz(filename, FILENAME_MAX, "%s/health.log", netdata_configured_log_dir); - stdhealth_filename = config_get(CONFIG_SECTION_LOGS, "health", filename); + if(with_journal) + snprintfz(filename, FILENAME_MAX, "journal"); + else + snprintfz(filename, FILENAME_MAX, "%s/health.log", netdata_configured_log_dir); + nd_log_set_user_settings(NDLS_HEALTH, 
config_get(CONFIG_SECTION_LOGS, "health", filename)); #ifdef ENABLE_ACLK aclklog_enabled = config_get_boolean(CONFIG_SECTION_CLOUD, "conversation log", CONFIG_BOOLEAN_NO); if (aclklog_enabled) { snprintfz(filename, FILENAME_MAX, "%s/aclk.log", netdata_configured_log_dir); - aclklog_filename = config_get(CONFIG_SECTION_CLOUD, "conversation log file", filename); + nd_log_set_user_settings(NDLS_ACLK, config_get(CONFIG_SECTION_CLOUD, "conversation log file", filename)); } #endif - - char deffacility[8]; - snprintfz(deffacility,7,"%s","daemon"); - facility_log = config_get(CONFIG_SECTION_LOGS, "facility", deffacility); - - error_log_throttle_period = config_get_number(CONFIG_SECTION_LOGS, "errors flood protection period", error_log_throttle_period); - error_log_errors_per_period = (unsigned long)config_get_number(CONFIG_SECTION_LOGS, "errors to trigger flood protection", (long long int)error_log_errors_per_period); - error_log_errors_per_period_backup = error_log_errors_per_period; - - setenv("NETDATA_ERRORS_THROTTLE_PERIOD", config_get(CONFIG_SECTION_LOGS, "errors flood protection period" , ""), 1); - setenv("NETDATA_ERRORS_PER_PERIOD", config_get(CONFIG_SECTION_LOGS, "errors to trigger flood protection", ""), 1); - - char *selected_level = config_get(CONFIG_SECTION_LOGS, "severity level", NETDATA_LOG_LEVEL_INFO_STR); - global_log_severity_level = log_severity_string_to_severity_level(selected_level); - setenv("NETDATA_LOG_SEVERITY_LEVEL", selected_level , 1); } char *initialize_lock_directory_path(char *prefix) @@ -1050,6 +1079,36 @@ static void backwards_compatible_config() { config_move(CONFIG_SECTION_GLOBAL, "enable zero metrics", CONFIG_SECTION_DB, "enable zero metrics"); + config_move(CONFIG_SECTION_LOGS, "error", + CONFIG_SECTION_LOGS, "daemon"); + + config_move(CONFIG_SECTION_LOGS, "severity level", + CONFIG_SECTION_LOGS, "level"); + + config_move(CONFIG_SECTION_LOGS, "errors to trigger flood protection", + CONFIG_SECTION_LOGS, "logs to trigger flood 
protection"); + + config_move(CONFIG_SECTION_LOGS, "errors flood protection period", + CONFIG_SECTION_LOGS, "logs flood protection period"); + config_move(CONFIG_SECTION_HEALTH, "is ephemeral", + CONFIG_SECTION_GLOBAL, "is ephemeral node"); + + config_move(CONFIG_SECTION_HEALTH, "has unstable connection", + CONFIG_SECTION_GLOBAL, "has unstable connection"); +} + +static int get_hostname(char *buf, size_t buf_size) { + if (netdata_configured_host_prefix && *netdata_configured_host_prefix) { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/etc/hostname", netdata_configured_host_prefix); + + if (!read_file(filename, buf, buf_size)) { + trim(buf); + return 0; + } + } + + return gethostname(buf, buf_size); } static void get_netdata_configured_variables() { @@ -1058,10 +1117,12 @@ static void get_netdata_configured_variables() { // ------------------------------------------------------------------------ // get the hostname + netdata_configured_host_prefix = config_get(CONFIG_SECTION_GLOBAL, "host access prefix", ""); + verify_netdata_host_prefix(true); + char buf[HOSTNAME_MAX + 1]; - if(gethostname(buf, HOSTNAME_MAX) == -1){ + if (get_hostname(buf, HOSTNAME_MAX)) netdata_log_error("Cannot get machine hostname."); - } netdata_configured_hostname = config_get(CONFIG_SECTION_GLOBAL, "hostname", buf); netdata_log_debug(D_OPTIONS, "hostname set to '%s'", netdata_configured_hostname); @@ -1112,8 +1173,6 @@ static void get_netdata_configured_variables() { netdata_configured_web_dir = config_get(CONFIG_SECTION_DIRECTORIES, "web", netdata_configured_web_dir); netdata_configured_cache_dir = config_get(CONFIG_SECTION_DIRECTORIES, "cache", netdata_configured_cache_dir); netdata_configured_varlib_dir = config_get(CONFIG_SECTION_DIRECTORIES, "lib", netdata_configured_varlib_dir); - char *env_home=getenv("HOME"); - netdata_configured_home_dir = config_get(CONFIG_SECTION_DIRECTORIES, "home", env_home?env_home:netdata_configured_home_dir); 
netdata_configured_lock_dir = initialize_lock_directory_path(netdata_configured_varlib_dir); @@ -1124,6 +1183,16 @@ static void get_netdata_configured_variables() { #ifdef ENABLE_DBENGINE // ------------------------------------------------------------------------ + // get default Database Engine page type + + const char *page_type = config_get(CONFIG_SECTION_DB, "dbengine page type", "raw"); + if (strcmp(page_type, "gorilla") == 0) { + tier_page_type[0] = PAGE_GORILLA_METRICS; + } else if (strcmp(page_type, "raw") != 0) { + netdata_log_error("Invalid dbengine page type ''%s' given. Defaulting to 'raw'.", page_type); + } + + // ------------------------------------------------------------------------ // get default Database Engine page cache size in MiB default_rrdeng_page_cache_mb = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page cache size MB", default_rrdeng_page_cache_mb); @@ -1161,10 +1230,6 @@ static void get_netdata_configured_variables() { default_rrd_memory_mode = RRD_MEMORY_MODE_SAVE; } #endif - // ------------------------------------------------------------------------ - - netdata_configured_host_prefix = config_get(CONFIG_SECTION_GLOBAL, "host access prefix", ""); - verify_netdata_host_prefix(); // -------------------------------------------------------------------- // get KSM settings @@ -1184,6 +1249,7 @@ static void get_netdata_configured_variables() { // -------------------------------------------------------------------- rrdset_free_obsolete_time_s = config_get_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time_s); + rrdhost_free_ephemeral_time_s = config_get_number(CONFIG_SECTION_DB, "cleanup ephemeral hosts after secs", rrdhost_free_ephemeral_time_s); // Current chart locking and invalidation scheme doesn't prevent Netdata from segmentation faults if a short // cleanup delay is set. Extensive stress tests showed that 10 seconds is quite a safe delay. 
Look at // https://github.com/netdata/netdata/pull/11222#issuecomment-868367920 for more information. @@ -1262,7 +1328,7 @@ static inline void coverity_remove_taint(char *s) (void)s; } -int get_system_info(struct rrdhost_system_info *system_info, bool log) { +int get_system_info(struct rrdhost_system_info *system_info) { char *script; script = mallocz(sizeof(char) * (strlen(netdata_configured_primary_plugins_dir) + strlen("system-info.sh") + 2)); sprintf(script, "%s/%s", netdata_configured_primary_plugins_dir, "system-info.sh"); @@ -1294,11 +1360,7 @@ int get_system_info(struct rrdhost_system_info *system_info, bool log) { if(unlikely(rrdhost_set_system_info_variable(system_info, line, value))) { netdata_log_error("Unexpected environment variable %s=%s", line, value); - } - else { - if(log) - netdata_log_info("%s=%s", line, value); - + } else { setenv(line, value, 1); } } @@ -1337,6 +1399,8 @@ int julytest(void); int pluginsd_parser_unittest(void); void replication_initialize(void); void bearer_tokens_init(void); +int unittest_rrdpush_compressions(void); +int uuid_unittest(void); int main(int argc, char **argv) { // initialize the system clocks @@ -1346,8 +1410,6 @@ int main(int argc, char **argv) { usec_t started_ut = now_monotonic_usec(); usec_t last_ut = started_ut; const char *prev_msg = NULL; - // Initialize stderror avoiding coredump when netdata_log_info() or netdata_log_error() is called - stderror = stderr; int i; int config_loaded = 0; @@ -1439,6 +1501,10 @@ int main(int argc, char **argv) { #ifdef ENABLE_DBENGINE char* createdataset_string = "createdataset="; char* stresstest_string = "stresstest="; + + if(strcmp(optarg, "pgd-tests") == 0) { + return pgd_test(argc, argv); + } #endif if(strcmp(optarg, "sqlite-meta-recover") == 0) { @@ -1451,6 +1517,11 @@ int main(int argc, char **argv) { return 0; } + if(strcmp(optarg, "sqlite-analyze") == 0) { + sql_init_database(DB_CHECK_ANALYZE, 0); + return 0; + } + if(strcmp(optarg, "unittest") == 0) { 
unittest_running = true; @@ -1495,6 +1566,8 @@ int main(int argc, char **argv) { return 1; if (ctx_unittest()) return 1; + if (uuid_unittest()) + return 1; fprintf(stderr, "\n\nALL TESTS PASSED\n\n"); return 0; } @@ -1521,6 +1594,10 @@ int main(int argc, char **argv) { unittest_running = true; return buffer_unittest(); } + else if(strcmp(optarg, "uuidtest") == 0) { + unittest_running = true; + return uuid_unittest(); + } #ifdef ENABLE_DBENGINE else if(strcmp(optarg, "mctest") == 0) { unittest_running = true; @@ -1550,6 +1627,10 @@ int main(int argc, char **argv) { unittest_running = true; return pluginsd_parser_unittest(); } + else if(strcmp(optarg, "rrdpush_compressions_test") == 0) { + unittest_running = true; + return unittest_rrdpush_compressions(); + } else if(strncmp(optarg, createdataset_string, strlen(createdataset_string)) == 0) { optarg += strlen(createdataset_string); unsigned history_seconds = strtoul(optarg, NULL, 0); @@ -1851,7 +1932,7 @@ int main(int argc, char **argv) { { char buf[20 + 1]; - snprintfz(buf, 20, "%d", libuv_worker_threads); + snprintfz(buf, sizeof(buf) - 1, "%d", libuv_worker_threads); setenv("UV_THREADPOOL_SIZE", buf, 1); } @@ -1894,13 +1975,15 @@ int main(int argc, char **argv) { // get log filenames and settings log_init(); - error_log_limit_unlimited(); + nd_log_limits_unlimited(); // initialize the log files - open_all_log_files(); + nd_log_initialize(); netdata_log_info("Netdata agent version \""VERSION"\" is starting"); ieee754_doubles = is_system_ieee754_double(); + if(!ieee754_doubles) + globally_disabled_capabilities |= STREAM_CAP_IEEE754; aral_judy_init(); @@ -1925,11 +2008,11 @@ int main(int argc, char **argv) { set_silencers_filename(); health_initialize_global_silencers(); - // -------------------------------------------------------------------- - // Initialize ML configuration - - delta_startup_time("initialize ML"); - ml_init(); +// // -------------------------------------------------------------------- +// // 
Initialize ML configuration +// +// delta_startup_time("initialize ML"); +// ml_init(); // -------------------------------------------------------------------- // setup process signals @@ -1949,6 +2032,15 @@ int main(int argc, char **argv) { // setup threads configs default_stacksize = netdata_threads_init(); +#ifdef NETDATA_INTERNAL_CHECKS + config_set_boolean(CONFIG_SECTION_PLUGINS, "netdata monitoring", true); + config_set_boolean(CONFIG_SECTION_PLUGINS, "netdata monitoring extended", true); +#endif + + if(config_get_boolean(CONFIG_SECTION_PLUGINS, "netdata monitoring extended", false)) + // this has to run before starting any other threads that use workers + workers_utilization_enable(); + for (i = 0; static_threads[i].name != NULL ; i++) { struct netdata_static_thread *st = &static_threads[i]; @@ -1973,8 +2065,18 @@ int main(int argc, char **argv) { web_client_api_v1_init(); web_server_threading_selection(); - if(web_server_mode != WEB_SERVER_MODE_NONE) - api_listen_sockets_setup(); + if(web_server_mode != WEB_SERVER_MODE_NONE) { + if (!api_listen_sockets_setup()) { + netdata_log_error("Cannot setup listen port(s). Is Netdata already running?"); + exit(1); + } + } + + // -------------------------------------------------------------------- + // Initialize ML configuration + + delta_startup_time("initialize ML"); + ml_init(); #ifdef ENABLE_H2O delta_startup_time("initialize h2o server"); @@ -2006,6 +2108,16 @@ int main(int argc, char **argv) { if(become_daemon(dont_fork, user) == -1) fatal("Cannot daemonize myself."); + // The "HOME" env var points to the root's home dir because Netdata starts as root. Can't use "HOME". 
+ struct passwd *pw = getpwuid(getuid()); + if (config_exists(CONFIG_SECTION_DIRECTORIES, "home") || !pw || !pw->pw_dir) { + netdata_configured_home_dir = config_get(CONFIG_SECTION_DIRECTORIES, "home", netdata_configured_home_dir); + } else { + netdata_configured_home_dir = config_get(CONFIG_SECTION_DIRECTORIES, "home", pw->pw_dir); + } + + setenv("HOME", netdata_configured_home_dir, 1); + dyn_conf_init(); netdata_log_info("netdata started on pid %d.", getpid()); @@ -2039,7 +2151,7 @@ int main(int argc, char **argv) { netdata_anonymous_statistics_enabled=-1; struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info)); __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_allocations_size, sizeof(struct rrdhost_system_info), __ATOMIC_RELAXED); - get_system_info(system_info, true); + get_system_info(system_info); (void) registry_get_this_machine_guid(); system_info->hops = 0; get_install_type(&system_info->install_type, &system_info->prebuilt_arch, &system_info->prebuilt_dist); @@ -2076,7 +2188,7 @@ int main(int argc, char **argv) { // ------------------------------------------------------------------------ // enable log flood protection - error_log_limit_reset(); + nd_log_limits_reset(); // Load host labels delta_startup_time("collect host labels"); |