author     Daniel Baumann <daniel.baumann@progress-linux.org>    2024-11-09 08:26:46 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>    2024-11-09 08:26:46 +0000
commit     d0b6d783aeb42b4972efba9f06e3a74e3a68295e (patch)
tree       0086fdd24053305007874b4806f252ecd6308379 /src
parent     Adding upstream version 1.47.2. (diff)
download   netdata-d0b6d783aeb42b4972efba9f06e3a74e3a68295e.tar.xz
           netdata-d0b6d783aeb42b4972efba9f06e3a74e3a68295e.zip
Adding upstream version 1.47.5. (upstream/1.47.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src')
31 files changed, 199 insertions, 146 deletions
diff --git a/src/collectors/common-contexts/mem.swap.h b/src/collectors/common-contexts/mem.swap.h
index 6d692ef3..1c1b053d 100644
--- a/src/collectors/common-contexts/mem.swap.h
+++ b/src/collectors/common-contexts/mem.swap.h
@@ -6,6 +6,14 @@ static inline void common_mem_swap(uint64_t free_bytes, uint64_t used_bytes, int
     static RRDSET *st_system_swap = NULL;
     static RRDDIM *rd_free = NULL, *rd_used = NULL;
 
+    if (free_bytes == 0 && used_bytes == 0 && st_system_swap) {
+        rrdset_is_obsolete___safe_from_collector_thread(st_system_swap);
+        st_system_swap = NULL;
+        rd_free = NULL;
+        rd_used = NULL;
+        return;
+    }
+
     if(unlikely(!st_system_swap)) {
         st_system_swap = rrdset_create_localhost(
             "mem"
diff --git a/src/collectors/freebsd.plugin/freebsd_kstat_zfs.c b/src/collectors/freebsd.plugin/freebsd_kstat_zfs.c
index fdece7de..ded6a8ee 100644
--- a/src/collectors/freebsd.plugin/freebsd_kstat_zfs.c
+++ b/src/collectors/freebsd.plugin/freebsd_kstat_zfs.c
@@ -43,6 +43,8 @@ int do_kstat_zfs_misc_arcstats(int update_every, usec_t dt) {
     int hash_chains[5];
     int hash_chain_max[5];
     int p[5];
+    int pd[5];
+    int pm[5];
     int c[5];
     int c_min[5];
     int c_max[5];
@@ -145,7 +147,14 @@ int do_kstat_zfs_misc_arcstats(int update_every, usec_t dt) {
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.hash_collisions", mibs.hash_collisions, arcstats.hash_collisions);
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.hash_chains", mibs.hash_chains, arcstats.hash_chains);
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.hash_chain_max", mibs.hash_chain_max, arcstats.hash_chain_max);
+
+#if __FreeBSD_version >= 1400000
+    GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.pd", mibs.pd, arcstats.pd);
+    GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.pm", mibs.pm, arcstats.pm);
+#else
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.p", mibs.p, arcstats.p);
+#endif
+
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.c", mibs.c, arcstats.c);
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.c_min", mibs.c_min, arcstats.c_min);
     GETSYSCTL_SIMPLE("kstat.zfs.misc.arcstats.c_max", mibs.c_max, arcstats.c_max);
diff --git a/src/collectors/proc.plugin/proc_meminfo.c b/src/collectors/proc.plugin/proc_meminfo.c
index db458b23..781329b5 100644
--- a/src/collectors/proc.plugin/proc_meminfo.c
+++ b/src/collectors/proc.plugin/proc_meminfo.c
@@ -12,6 +12,8 @@
 int do_proc_meminfo(int update_every, usec_t dt) {
     (void)dt;
 
+    static bool swap_configured = false;
+
     static procfile *ff = NULL;
     static int do_ram = -1
         , do_swap = -1
@@ -235,7 +237,7 @@ int do_proc_meminfo(int update_every, usec_t dt) {
     // http://calimeroteknik.free.fr/blag/?article20/really-used-memory-on-gnu-linux
     // KReclaimable includes SReclaimable, it was added in kernel v4.20
-    unsigned long long reclaimable = KReclaimable > 0 ? KReclaimable : SReclaimable;
+    unsigned long long reclaimable = inside_lxc_container ? 0 : (KReclaimable > 0 ? KReclaimable : SReclaimable);
     unsigned long long MemCached = Cached + reclaimable - Shmem;
     unsigned long long MemUsed = MemTotal - MemFree - MemCached - Buffers;
     // The Linux kernel doesn't report ZFS ARC usage as cache memory (the ARC is included in the total used system memory)
@@ -257,6 +259,7 @@ int do_proc_meminfo(int update_every, usec_t dt) {
     if (SwapTotal && (do_swap == CONFIG_BOOLEAN_YES || do_swap == CONFIG_BOOLEAN_AUTO)) {
         do_swap = CONFIG_BOOLEAN_YES;
         common_mem_swap(SwapFree * 1024, SwapUsed * 1024, update_every);
+        swap_configured = true;
 
         {
             static RRDSET *st_mem_swap_cached = NULL;
@@ -313,8 +316,14 @@ int do_proc_meminfo(int update_every, usec_t dt) {
             rrddim_set_by_pointer(st_mem_zswap, rd_zswapped, Zswapped);
             rrdset_done(st_mem_zswap);
         }
+    } else {
+        if (swap_configured) {
+            common_mem_swap(SwapFree * 1024, SwapUsed * 1024, update_every);
+            swap_configured = false;
+        }
     }
+
     if (arl_hwcorrupted->flags & ARL_ENTRY_FLAG_FOUND &&
         (do_hwcorrupt == CONFIG_BOOLEAN_YES || do_hwcorrupt == CONFIG_BOOLEAN_AUTO)) {
         do_hwcorrupt = CONFIG_BOOLEAN_YES;
diff --git a/src/collectors/proc.plugin/proc_net_dev.c b/src/collectors/proc.plugin/proc_net_dev.c
index 40702c38..41c10ddb 100644
--- a/src/collectors/proc.plugin/proc_net_dev.c
+++ b/src/collectors/proc.plugin/proc_net_dev.c
@@ -12,7 +12,7 @@
 #define READ_RETRY_PERIOD 60 // seconds
 
-time_t double_linked_device_collect_delay_secs = 120;
+time_t virtual_device_collect_delay_secs = 40;
 
 enum {
     NETDEV_DUPLEX_UNKNOWN,
@@ -92,7 +92,6 @@ static struct netdev {
     int enabled;
     bool updated;
     bool function_ready;
-    bool double_linked; // iflink != ifindex
 
     time_t discover_time;
 
@@ -809,7 +808,6 @@ static struct netdev *get_netdev(const char *name) {
     d->len = strlen(d->name);
     d->chart_labels = rrdlabels_create();
     d->function_ready = false;
-    d->double_linked = false;
 
     d->chart_type_net_bytes = strdupz("net");
     d->chart_type_net_compressed = strdupz("net_compressed");
@@ -858,25 +856,10 @@ static struct netdev *get_netdev(const char *name) {
     return d;
 }
 
-static bool is_iface_double_linked(struct netdev *d) {
-    char filename[FILENAME_MAX + 1];
-    unsigned long long iflink = 0;
-    unsigned long long ifindex = 0;
-
-    snprintfz(filename, FILENAME_MAX, "%s/sys/class/net/%s/iflink", netdata_configured_host_prefix, d->name);
-    if (read_single_number_file(filename, &iflink))
-        return false;
-
-    snprintfz(filename, FILENAME_MAX, "%s/sys/class/net/%s/ifindex", netdata_configured_host_prefix, d->name);
-    if (read_single_number_file(filename, &ifindex))
-        return false;
-
-    return iflink != ifindex;
-}
-
 int do_proc_net_dev(int update_every, usec_t dt) {
     (void)dt;
     static SIMPLE_PATTERN *disabled_list = NULL;
+    static SIMPLE_PATTERN *virtual_iface_no_delay = NULL;
     static procfile *ff = NULL;
     static int enable_new_interfaces = -1;
     static int do_bandwidth = -1, do_packets = -1, do_errors = -1, do_drops = -1, do_fifo = -1, do_compressed = -1,
@@ -921,8 +904,26 @@ int do_proc_net_dev(int update_every, usec_t dt) {
         do_compressed = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "compressed packets for all interfaces", CONFIG_BOOLEAN_NO);
 
         disabled_list = simple_pattern_create(
-            config_get(CONFIG_SECTION_PLUGIN_PROC_NETDEV, "disable by default interfaces matching",
-                       "lo fireqos* *-ifb fwpr* fwbr* fwln*"), NULL, SIMPLE_PATTERN_EXACT, true);
+            config_get(
+                CONFIG_SECTION_PLUGIN_PROC_NETDEV,
+                "disable by default interfaces matching",
+                "lo fireqos* *-ifb fwpr* fwbr* fwln*"),
+            NULL,
+            SIMPLE_PATTERN_EXACT,
+            true);
+
+        virtual_iface_no_delay = simple_pattern_create(
+            " bond* "
+            " vlan* "
+            " vmbr* "
+            " wg* "
+            " vpn* "
+            " tun* "
+            " gre* "
+            " docker* ",
+            NULL,
+            SIMPLE_PATTERN_EXACT,
+            true);
 
         netdev_renames_init();
     }
@@ -1009,8 +1010,6 @@ int do_proc_net_dev(int update_every, usec_t dt) {
             if(d->enabled == CONFIG_BOOLEAN_NO)
                 continue;
 
-            d->double_linked = is_iface_double_linked(d);
-
             d->do_bandwidth = do_bandwidth;
             d->do_packets = do_packets;
             d->do_errors = do_errors;
@@ -1060,8 +1059,10 @@ int do_proc_net_dev(int update_every, usec_t dt) {
         // This is necessary to prevent the creation of charts for virtual interfaces that will later be
         // recreated as container interfaces (create container) or
         // rediscovered and recreated only to be deleted almost immediately (stop/remove container)
-        if (d->double_linked && d->virtual && (now - d->discover_time < double_linked_device_collect_delay_secs))
+        if (d->virtual && !simple_pattern_matches(virtual_iface_no_delay, d->name) &&
+            (now - d->discover_time < virtual_device_collect_delay_secs)) {
             continue;
+        }
 
         if(likely(d->do_bandwidth != CONFIG_BOOLEAN_NO || !d->virtual)) {
             d->rbytes = str2kernel_uint_t(procfile_lineword(ff, l, 1));
@@ -1717,7 +1718,7 @@ void *netdev_main(void *ptr_is_null __maybe_unused)
     worker_register_job_name(0, "netdev");
 
     if (getenv("KUBERNETES_SERVICE_HOST") != NULL && getenv("KUBERNETES_SERVICE_PORT") != NULL)
-        double_linked_device_collect_delay_secs = 300;
+        virtual_device_collect_delay_secs = 300;
 
     rrd_function_add_inline(localhost, NULL, "network-interfaces", 10, RRDFUNCTIONS_PRIORITY_DEFAULT, RRDFUNCTIONS_NETDEV_HELP,
diff --git a/src/collectors/proc.plugin/proc_spl_kstat_zfs.c b/src/collectors/proc.plugin/proc_spl_kstat_zfs.c
index be96f444..5a0f9095 100644
--- a/src/collectors/proc.plugin/proc_spl_kstat_zfs.c
+++ b/src/collectors/proc.plugin/proc_spl_kstat_zfs.c
@@ -54,6 +54,8 @@ int do_proc_spl_kstat_zfs_arcstats(int update_every, usec_t dt) {
         arl_expect(arl_base, "hash_chains", &arcstats.hash_chains);
         arl_expect(arl_base, "hash_chain_max", &arcstats.hash_chain_max);
         arl_expect(arl_base, "p", &arcstats.p);
+        arl_expect(arl_base, "pd", &arcstats.pd);
+        arl_expect(arl_base, "pm", &arcstats.pm);
         arl_expect(arl_base, "c", &arcstats.c);
         arl_expect(arl_base, "c_min", &arcstats.c_min);
         arl_expect(arl_base, "c_max", &arcstats.c_max);
diff --git a/src/collectors/proc.plugin/zfs_common.c b/src/collectors/proc.plugin/zfs_common.c
index cb5bd20e..dadf2f84 100644
--- a/src/collectors/proc.plugin/zfs_common.c
+++ b/src/collectors/proc.plugin/zfs_common.c
@@ -560,7 +560,7 @@ void generate_charts_arc_summary(const char *plugin, const char *module, int upd
     //unsigned long long anon_hits = arcstats.hits - (arcstats.mfu_hits + arcstats.mru_hits + arcstats.mfu_ghost_hits + arcstats.mru_ghost_hits);
 
     unsigned long long arc_size = arcstats.size;
-    unsigned long long mru_size = arcstats.p;
+    unsigned long long mru_size = arcstats.p > 0 ? arcstats.p : arcstats.pd + arcstats.pm;
     //unsigned long long target_min_size = arcstats.c_min;
     //unsigned long long target_max_size = arcstats.c_max;
     unsigned long long target_size = arcstats.c;
diff --git a/src/collectors/proc.plugin/zfs_common.h b/src/collectors/proc.plugin/zfs_common.h
index c0f08817..7c2b3d33 100644
--- a/src/collectors/proc.plugin/zfs_common.h
+++ b/src/collectors/proc.plugin/zfs_common.h
@@ -41,6 +41,8 @@ struct arcstats {
     unsigned long long hash_chains;
     unsigned long long hash_chain_max;
    unsigned long long p;
+    unsigned long long pd;
+    unsigned long long pm;
     unsigned long long c;
     unsigned long long c_min;
     unsigned long long c_max;
diff --git a/src/daemon/system-info.sh b/src/daemon/system-info.sh
index aaca7fd4..5c8d70b7 100755
--- a/src/daemon/system-info.sh
+++ b/src/daemon/system-info.sh
@@ -361,6 +361,17 @@ fi
 # -------------------------------------------------------------------------------------------------
 # Detect the total system disk space
 
+is_inside_lxc_container() {
+  mounts_file="/proc/self/mounts"
+
+  [ ! -r "$mounts_file" ] && return 1
+
+  # Check if lxcfs is mounted on /proc
+  awk '$1 == "lxcfs" && $2 ~ "^/proc" { found=1; exit } END { exit !found }' "$mounts_file"
+
+  return $?
+}
+
 DISK_SIZE="unknown"
 DISK_DETECTION="none"
 
@@ -393,7 +404,7 @@ elif [ "${KERNEL_NAME}" = FreeBSD ]; then
   total="$(df -t ${types} -c -k | tail -n 1 | awk '{print $2}')"
   DISK_SIZE="$((total * 1024))"
 else
-  if [ -d /sys/block ] && [ -r /proc/devices ]; then
+  if [ -d /sys/block ] && [ -r /proc/devices ] && ! is_inside_lxc_container; then
     dev_major_whitelist=''
 
     # This is a list of device names used for block storage devices.
@@ -424,7 +435,7 @@ else
   else
     DISK_DETECTION="df"
     include_fs_types="ext*|btrfs|xfs|jfs|reiser*|zfs"
-    DISK_SIZE=$(($(df -T -P | tail -n +2 | sort -u -k 1 | grep "${include_fs_types}" | awk '{print $3}' | tr '\n' '+' | head -c -1) * 1024))
+    DISK_SIZE=$(($(df -T -P | tail -n +2 | sort -u -k 1 | grep -E "${include_fs_types}" | awk '{print $3}' | tr '\n' '+' | head -c -1) * 1024))
   fi
 fi
diff --git a/src/database/sqlite/sqlite_aclk_node.c b/src/database/sqlite/sqlite_aclk_node.c
index 70d1ebda..411b8bd7 100644
--- a/src/database/sqlite/sqlite_aclk_node.c
+++ b/src/database/sqlite/sqlite_aclk_node.c
@@ -167,6 +167,12 @@ void aclk_check_node_info_and_collectors(void)
         if (pp_queue_empty && wc->node_info_send_time && wc->node_info_send_time + 30 < now) {
             wc->node_info_send_time = 0;
             build_node_info(host);
+            if (netdata_cloud_enabled) {
+                netdata_mutex_lock(&host->receiver_lock);
+                int live = (host == localhost || host->receiver || !(rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN))) ? 1 : 0;
+                netdata_mutex_unlock(&host->receiver_lock);
+                aclk_host_state_update(host, live, 1);
+            }
             internal_error(true, "ACLK SYNC: Sending node info for %s", rrdhost_hostname(host));
         }
diff --git a/src/database/sqlite/sqlite_health.c b/src/database/sqlite/sqlite_health.c
index 9304c8c7..b3ad1285 100644
--- a/src/database/sqlite/sqlite_health.c
+++ b/src/database/sqlite/sqlite_health.c
@@ -755,20 +755,20 @@ done:
  * Store an alert config hash in the database
  */
 #define SQL_STORE_ALERT_CONFIG_HASH \
-    "insert or replace into alert_hash (hash_id, date_updated, alarm, template, " \
+    "INSERT OR REPLACE INTO alert_hash (hash_id, date_updated, alarm, template, " \
     "on_key, class, component, type, lookup, every, units, calc, " \
     "green, red, warn, crit, exec, to_key, info, delay, options, repeat, host_labels, " \
     "p_db_lookup_dimensions, p_db_lookup_method, p_db_lookup_options, p_db_lookup_after, " \
     "p_db_lookup_before, p_update_every, source, chart_labels, summary, time_group_condition, " \
     "time_group_value, dims_group, data_source) " \
-    "values (@hash_id,UNIXEPOCH(),@alarm,@template," \
+    "VALUES (@hash_id,UNIXEPOCH(),@alarm,@template," \
     "@on_key,@class,@component,@type,@lookup,@every,@units,@calc," \
     "@green,@red,@warn,@crit,@exec,@to_key,@info,@delay,@options,@repeat,@host_labels," \
     "@p_db_lookup_dimensions,@p_db_lookup_method,@p_db_lookup_options,@p_db_lookup_after," \
     "@p_db_lookup_before,@p_update_every,@source,@chart_labels,@summary, @time_group_condition, " \
     "@time_group_value, @dims_group, @data_source)"
 
-void sql_alert_store_config(RRD_ALERT_PROTOTYPE *ap __maybe_unused)
+void sql_alert_store_config(RRD_ALERT_PROTOTYPE *ap)
 {
     static __thread sqlite3_stmt *res = NULL;
     int param = 0;
@@ -776,7 +776,7 @@ void sql_alert_store_config(RRD_ALERT_PROTOTYPE *ap __maybe_unused)
     if (!PREPARE_COMPILED_STATEMENT(db_meta, SQL_STORE_ALERT_CONFIG_HASH, &res))
         return;
 
-    BUFFER *buf = buffer_create(128, NULL);
+    CLEAN_BUFFER *buf = buffer_create(128, NULL);
 
     SQLITE_BIND_FAIL(
         done, sqlite3_bind_blob(res, ++param, &ap->config.hash_id, sizeof(ap->config.hash_id), SQLITE_STATIC));
@@ -842,7 +842,14 @@ void sql_alert_store_config(RRD_ALERT_PROTOTYPE *ap __maybe_unused)
     else
         SQLITE_BIND_FAIL(done, sqlite3_bind_null(res, ++param));
 
-    SQLITE_BIND_FAIL(done, sqlite3_bind_int(res, ++param, ap->config.update_every));
+    char repeat[255];
+    if (!ap->config.has_custom_repeat_config)
+        SQLITE_BIND_FAIL(done, sqlite3_bind_null(res, ++param));
+    else {
+        snprintfz(repeat, sizeof(repeat) - 1, "warning %us critical %us", ap->config.warn_repeat_every, ap->config.crit_repeat_every);
+        SQLITE_BIND_FAIL(done, sqlite3_bind_text(res, ++param, repeat, -1, SQLITE_STATIC));
+    }
+
     SQLITE_BIND_FAIL(done, SQLITE3_BIND_STRING_OR_NULL(res, ++param, ap->match.host_labels));
 
     if (ap->config.after) {
@@ -875,7 +882,6 @@ void sql_alert_store_config(RRD_ALERT_PROTOTYPE *ap __maybe_unused)
         error_report("Failed to store alert config, rc = %d", rc);
 
 done:
-    buffer_free(buf);
     REPORT_BIND_FAIL(res, param);
     SQLITE_RESET(res);
 }
diff --git a/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go b/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go
index 6f536c49..60dd92cb 100644
--- a/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go
+++ b/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go
@@ -84,6 +84,9 @@ type (
         cache   map[uint64]*cacheItem // [target.Hash]
         started chan struct{}
+
+        successRuns int64
+        timeoutRuns int64
     }
     cacheItem struct {
         lastSeenTime time.Time
@@ -118,7 +121,7 @@ func (d *Discoverer) Discover(ctx context.Context, in chan<- []model.TargetGroup
             return
         case <-tk.C:
             if err := d.discoverLocalListeners(ctx, in); err != nil {
-                d.Warning(err)
+                d.Error(err)
                 return
             }
         }
@@ -128,12 +131,20 @@ func (d *Discoverer) Discover(ctx context.Context, in chan<- []model.TargetGroup
 func (d *Discoverer) discoverLocalListeners(ctx context.Context, in chan<- []model.TargetGroup) error {
     bs, err := d.ll.discover(ctx)
     if err != nil {
-        if errors.Is(err, context.Canceled) {
+        if errors.Is(err, context.DeadlineExceeded) {
+            // there is no point in continuing pointless attempts/use cpu
+            // https://github.com/netdata/netdata/discussions/18751#discussioncomment-10908472
+            if d.timeoutRuns++; d.timeoutRuns > 5 && d.successRuns == 0 {
+                return err
+            }
+            d.Warning(err)
             return nil
         }
         return err
     }
 
+    d.successRuns++
+
     tgts, err := d.parseLocalListeners(bs)
     if err != nil {
         return err
diff --git a/src/go/plugin/go.d/modules/nvidia_smi/charts.go b/src/go/plugin/go.d/modules/nvidia_smi/charts.go
index 746c8eed..d6b0af36 100644
--- a/src/go/plugin/go.d/modules/nvidia_smi/charts.go
+++ b/src/go/plugin/go.d/modules/nvidia_smi/charts.go
@@ -4,6 +4,7 @@ package nvidia_smi
 
 import (
     "fmt"
+    "strconv"
     "strings"
 
     "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module"
@@ -261,7 +262,7 @@ var (
     }
 )
 
-func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) {
+func (nv *NvidiaSmi) addGpuCharts(gpu gpuInfo, index int) {
     charts := gpuXMLCharts.Copy()
 
     if !isValidValue(gpu.Utilization.GpuUtil) {
@@ -294,7 +295,7 @@ func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) {
     for _, c := range *charts {
         c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID))
         c.Labels = []module.Label{
-            // csv output has no 'product_brand'
+            {Key: "index", Value: strconv.Itoa(index)},
             {Key: "uuid", Value: gpu.UUID},
             {Key: "product_name", Value: gpu.ProductName},
         }
diff --git a/src/go/plugin/go.d/modules/nvidia_smi/collect.go b/src/go/plugin/go.d/modules/nvidia_smi/collect.go
index f621d191..3548f90f 100644
--- a/src/go/plugin/go.d/modules/nvidia_smi/collect.go
+++ b/src/go/plugin/go.d/modules/nvidia_smi/collect.go
@@ -38,7 +38,7 @@ func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error {
     seenGPU := make(map[string]bool)
     seenMIG := make(map[string]bool)
 
-    for _, gpu := range info.GPUs {
+    for i, gpu := range info.GPUs {
         if !isValidValue(gpu.UUID) {
             continue
         }
@@ -49,7 +49,7 @@ func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error {
 
         if !nv.gpus[px] {
             nv.gpus[px] = true
-            nv.addGPUXMLCharts(gpu)
+            nv.addGpuCharts(gpu, i)
         }
 
         addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
+ "ui:help": "This setting applies differently based on the collector's mode. **Loop Mode:** In loop mode, the timeout primarily determines how long to wait for the initial `nvidia-smi` execution. If the initial query takes longer than the timeout, the collector may report an error. For systems with multiple GPUs, the initial load time can sometimes be significant (e.g., 5-10 seconds). **Regular Mode:** If the collector is in regular mode, the timeout specifies how long to wait for each individual `nvidia-smi` execution." }, "loop_mode": { "ui:help": "In loop mode, `nvidia-smi` will repeatedly query GPU data at specified intervals, defined by the `-l SEC` or `--loop=SEC` parameter, rather than just running the query once. This enables ongoing performance tracking by putting the application to sleep between queries." diff --git a/src/go/plugin/go.d/modules/nvidia_smi/exec.go b/src/go/plugin/go.d/modules/nvidia_smi/exec.go index 11a26131..497db87a 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/exec.go +++ b/src/go/plugin/go.d/modules/nvidia_smi/exec.go @@ -34,7 +34,7 @@ func newNvidiaSmiBinary(path string, cfg Config, log *logger.Logger) (nvidiaSmiB Logger: log, binPath: path, updateEvery: cfg.UpdateEvery, - firstSampleTimeout: time.Second * 3, + firstSampleTimeout: cfg.Timeout.Duration(), } if err := smi.run(); err != nil { diff --git a/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml b/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml index 2a79b5ac..f44d5753 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml +++ b/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml @@ -65,8 +65,8 @@ modules: default_value: nvidia_smi required: false - name: timeout - description: nvidia_smi binary execution timeout. - default_value: 2 + description: The maximum duration, in seconds, to wait for an `nvidia-smi` command to complete. This setting applies differently based on the collector's mode. **Loop Mode:** In loop mode, the timeout primarily determines how long to wait for the initial `nvidia-smi` execution. If the initial query takes longer than the timeout, the collector may report an error. For systems with multiple GPUs, the initial load time can sometimes be significant (e.g., 5-10 seconds). **Regular Mode:** If the collector is in regular mode, the timeout specifies how long to wait for each individual `nvidia-smi` execution. + default_value: 10 required: false - name: loop_mode description: "When enabled, `nvidia-smi` is executed continuously in a separate thread using the `-l` option." @@ -98,7 +98,9 @@ modules: description: These metrics refer to the GPU. labels: - name: uuid - description: GPU id (e.g. 00000000:00:04.0) + description: GPU uuid (e.g. GPU-27b94a00-ed54-5c24-b1fd-1054085de32a) + - name: index + description: GPU index (nvidia_smi typically orders GPUs by PCI bus ID) - name: product_name description: GPU product name (e.g. NVIDIA A100-SXM4-40GB) metrics: @@ -211,7 +213,7 @@ modules: description: These metrics refer to the Multi-Instance GPU (MIG). labels: - name: uuid - description: GPU id (e.g. 00000000:00:04.0) + description: GPU uuid (e.g. GPU-27b94a00-ed54-5c24-b1fd-1054085de32a) - name: product_name description: GPU product name (e.g. 
diff --git a/src/go/plugin/go.d/modules/postgres/collect.go b/src/go/plugin/go.d/modules/postgres/collect.go
index 6186932c..91f3b523 100644
--- a/src/go/plugin/go.d/modules/postgres/collect.go
+++ b/src/go/plugin/go.d/modules/postgres/collect.go
@@ -18,6 +18,7 @@ const (
     pgVersion94 = 9_04_00
     pgVersion10 = 10_00_00
     pgVersion11 = 11_00_00
+    pgVersion17 = 17_00_00
 )
 
 func (p *Postgres) collect() (map[string]int64, error) {
diff --git a/src/go/plugin/go.d/modules/postgres/do_query_global.go b/src/go/plugin/go.d/modules/postgres/do_query_global.go
index c70772a2..6f1ac1ed 100644
--- a/src/go/plugin/go.d/modules/postgres/do_query_global.go
+++ b/src/go/plugin/go.d/modules/postgres/do_query_global.go
@@ -97,7 +97,7 @@ func (p *Postgres) doQueryConnectionsState() error {
 }
 
 func (p *Postgres) doQueryCheckpoints() error {
-    q := queryCheckpoints()
+    q := queryCheckpoints(p.pgVersion)
 
     return p.doQuery(q, func(column, value string, _ bool) {
         switch column {
diff --git a/src/go/plugin/go.d/modules/postgres/postgres_test.go b/src/go/plugin/go.d/modules/postgres/postgres_test.go
index 7e91b288..95652458 100644
--- a/src/go/plugin/go.d/modules/postgres/postgres_test.go
+++ b/src/go/plugin/go.d/modules/postgres/postgres_test.go
@@ -155,7 +155,7 @@ func TestPostgres_Check(t *testing.T) {
 
                 mockExpect(t, m, queryServerCurrentConnectionsUsed(), dataVer140004ServerCurrentConnections)
                 mockExpect(t, m, queryServerConnectionsState(), dataVer140004ServerConnectionsState)
-                mockExpect(t, m, queryCheckpoints(), dataVer140004Checkpoints)
+                mockExpect(t, m, queryCheckpoints(140004), dataVer140004Checkpoints)
                 mockExpect(t, m, queryServerUptime(), dataVer140004ServerUptime)
                 mockExpect(t, m, queryTXIDWraparound(), dataVer140004TXIDWraparound)
                 mockExpect(t, m, queryWALWrites(140004), dataVer140004WALWrites)
@@ -258,7 +258,7 @@ func TestPostgres_Collect(t *testing.T) {
 
                 mockExpect(t, m, queryServerCurrentConnectionsUsed(), dataVer140004ServerCurrentConnections)
                 mockExpect(t, m, queryServerConnectionsState(), dataVer140004ServerConnectionsState)
-                mockExpect(t, m, queryCheckpoints(), dataVer140004Checkpoints)
+                mockExpect(t, m, queryCheckpoints(140004), dataVer140004Checkpoints)
                 mockExpect(t, m, queryServerUptime(), dataVer140004ServerUptime)
                 mockExpect(t, m, queryTXIDWraparound(), dataVer140004TXIDWraparound)
                 mockExpect(t, m, queryWALWrites(140004), dataVer140004WALWrites)
diff --git a/src/go/plugin/go.d/modules/postgres/queries.go b/src/go/plugin/go.d/modules/postgres/queries.go
index f6afc934..4f6262bb 100644
--- a/src/go/plugin/go.d/modules/postgres/queries.go
+++ b/src/go/plugin/go.d/modules/postgres/queries.go
@@ -51,12 +51,14 @@ GROUP BY state;
 `
 }
 
-func queryCheckpoints() string {
+func queryCheckpoints(version int) string {
     // definition by version: https://pgpedia.info/p/pg_stat_bgwriter.html
     // docs: https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-BGWRITER-VIEW
     // code: https://github.com/postgres/postgres/blob/366283961ac0ed6d89014444c6090f3fd02fce0a/src/backend/catalog/system_views.sql#L1104
 
-    return `
+    if version < pgVersion17 {
+
+        return `
 SELECT checkpoints_timed,
        checkpoints_req,
        checkpoint_write_time,
@@ -69,6 +71,21 @@ SELECT checkpoints_timed,
        buffers_alloc * current_setting('block_size')::numeric AS buffers_alloc_bytes
 FROM pg_stat_bgwriter;
 `
+    }
+    return `
+SELECT
+    chkpt.num_timed AS checkpoints_timed,
+    chkpt.num_requested AS checkpoints_req,
+    chkpt.write_time AS checkpoint_write_time,
+    chkpt.sync_time AS checkpoint_sync_time,
+    chkpt.buffers_written * current_setting('block_size')::numeric AS buffers_checkpoint_bytes,
+    bgwrtr.buffers_clean * current_setting('block_size')::numeric AS buffers_clean_bytes,
+    bgwrtr.maxwritten_clean,
+    bgwrtr.buffers_alloc * current_setting('block_size')::numeric AS buffers_alloc_bytes
+FROM
+    pg_stat_bgwriter AS bgwrtr,
+    pg_stat_checkpointer AS chkpt;
+`
 }
 
 func queryServerUptime() string {
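Note on the queries.go change above: PostgreSQL 17 moved the checkpointer counters out of pg_stat_bgwriter into the new pg_stat_checkpointer view, which is why queryCheckpoints() now takes the server version. A minimal, standalone sketch of the same gate; checkpointsSource() is illustrative and not part of the module, only the pgVersion17 constant and the version encoding come from this diff:

    package main

    import "fmt"

    // PostgreSQL reports server_version_num as major*10000 + minor
    // (e.g. 14.4 -> 140004, 17.0 -> 170000), which is what the collector
    // compares against the pgVersion17 constant added in collect.go.
    const pgVersion17 = 17_00_00

    // checkpointsSource mirrors the branch queryCheckpoints(version) takes.
    func checkpointsSource(version int) string {
        if version < pgVersion17 {
            return "pg_stat_bgwriter" // pre-17: checkpointer columns still live here
        }
        return "pg_stat_checkpointer (joined with pg_stat_bgwriter for the bgwriter columns)"
    }

    func main() {
        fmt.Println(checkpointsSource(140004)) // 14.4 -> pg_stat_bgwriter
        fmt.Println(checkpointsSource(170000)) // 17.0 -> pg_stat_checkpointer ...
    }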
diff --git a/src/go/plugin/go.d/modules/smartctl/collect.go b/src/go/plugin/go.d/modules/smartctl/collect.go
index 35585db6..b76d0998 100644
--- a/src/go/plugin/go.d/modules/smartctl/collect.go
+++ b/src/go/plugin/go.d/modules/smartctl/collect.go
@@ -181,7 +181,7 @@ func isSmartAttrValid(a *smartAttribute) bool {
 }
 
 func isDeviceInLowerPowerMode(r *gjson.Result) bool {
-    if !isExitStatusHasBit(r, 1) {
+    if !isExitStatusHasAnyBit(r, 1) {
         return false
     }
 
@@ -194,7 +194,7 @@ func isDeviceInLowerPowerMode(r *gjson.Result) bool {
 }
 
 func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool {
-    if !isExitStatusHasBit(r, 1) {
+    if !isExitStatusHasAnyBit(r, 1) {
         return false
     }
 
@@ -206,9 +206,16 @@ func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool {
     })
 }
 
-func isExitStatusHasBit(r *gjson.Result, bit int) bool {
+func isExitStatusHasAnyBit(r *gjson.Result, bit int, bits ...int) bool {
     // https://manpages.debian.org/bullseye/smartmontools/smartctl.8.en.html#EXIT_STATUS
     status := int(r.Get("smartctl.exit_status").Int())
-    mask := 1 << bit
-    return (status & mask) != 0
+
+    for _, b := range append([]int{bit}, bits...) {
+        mask := 1 << b
+        if (status & mask) != 0 {
+            return true
+        }
+    }
+
+    return false
 }
diff --git a/src/go/plugin/go.d/modules/smartctl/scan.go b/src/go/plugin/go.d/modules/smartctl/scan.go
index 5564897a..d904ca28 100644
--- a/src/go/plugin/go.d/modules/smartctl/scan.go
+++ b/src/go/plugin/go.d/modules/smartctl/scan.go
@@ -100,12 +100,12 @@ func (s *Smartctl) handleGuessedScsiScannedDevice(dev *scanDevice) {
     }
 
     resp, _ := s.exec.deviceInfo(dev.name, "sat", s.NoCheckPowerMode)
-    if resp == nil || resp.Get("smartctl.exit_status").Int() != 0 {
+    if resp == nil || isExitStatusHasAnyBit(resp, 0, 1, 2) {
         return
     }
 
-    atts, ok := newSmartDevice(resp).ataSmartAttributeTable()
-    if !ok || len(atts) == 0 {
+    attrs, ok := newSmartDevice(resp).ataSmartAttributeTable()
+    if !ok || len(attrs) == 0 {
         return
     }
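Note on the smartctl helper above: smartctl's exit status is a bit field (per the man page referenced in the code, bit 0 means the command line did not parse, bit 1 means the device open failed, bit 2 means a SMART/ATA command failed), and the renamed isExitStatusHasAnyBit() reports whether any of the requested bits is set, which is what lets scan.go reject a guessed SAT device on bits 0, 1 or 2. A standalone sketch of the same bit test on a plain status value; anyBitSet() is illustrative, not the module's internal name:

    package main

    import "fmt"

    // anyBitSet mirrors the check isExitStatusHasAnyBit() performs on
    // smartctl's exit status: true if any of the given bits is set.
    func anyBitSet(status int, bits ...int) bool {
        for _, b := range bits {
            if status&(1<<b) != 0 {
                return true
            }
        }
        return false
    }

    func main() {
        fmt.Println(anyBitSet(0, 0, 1, 2)) // false: clean exit, the device is kept
        fmt.Println(anyBitSet(4, 0, 1, 2)) // true: bit 2 set, scan.go now skips it
    }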
diff --git a/src/go/plugin/go.d/modules/uwsgi/client.go b/src/go/plugin/go.d/modules/uwsgi/client.go
index 40368074..487aeb93 100644
--- a/src/go/plugin/go.d/modules/uwsgi/client.go
+++ b/src/go/plugin/go.d/modules/uwsgi/client.go
@@ -5,35 +5,25 @@ package uwsgi
 import (
     "bytes"
     "fmt"
+    "time"
 
     "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/socket"
 )
 
 type uwsgiConn interface {
-    connect() error
-    disconnect()
     queryStats() ([]byte, error)
 }
 
 func newUwsgiConn(conf Config) uwsgiConn {
-    return &uwsgiClient{conn: socket.New(socket.Config{
-        Address:        conf.Address,
-        ConnectTimeout: conf.Timeout.Duration(),
-        ReadTimeout:    conf.Timeout.Duration(),
-        WriteTimeout:   conf.Timeout.Duration(),
-    })}
+    return &uwsgiClient{
+        address: conf.Address,
+        timeout: conf.Timeout.Duration(),
+    }
 }
 
 type uwsgiClient struct {
-    conn socket.Client
-}
-
-func (c *uwsgiClient) connect() error {
-    return c.conn.Connect()
-}
-
-func (c *uwsgiClient) disconnect() {
-    _ = c.conn.Disconnect()
+    address string
+    timeout time.Duration
 }
 
 func (c *uwsgiClient) queryStats() ([]byte, error) {
@@ -42,7 +32,14 @@ func (c *uwsgiClient) queryStats() ([]byte, error) {
     var err error
     const readLineLimit = 1000 * 10
 
-    clientErr := c.conn.Command("", func(bs []byte) bool {
+    cfg := socket.Config{
+        Address:        c.address,
+        ConnectTimeout: c.timeout,
+        ReadTimeout:    c.timeout,
+        WriteTimeout:   c.timeout,
+    }
+
+    clientErr := socket.ConnectAndRead(cfg, func(bs []byte) bool {
         b.Write(bs)
         b.WriteByte('\n')
diff --git a/src/go/plugin/go.d/modules/uwsgi/collect.go b/src/go/plugin/go.d/modules/uwsgi/collect.go
index 3f440535..a89704c8 100644
--- a/src/go/plugin/go.d/modules/uwsgi/collect.go
+++ b/src/go/plugin/go.d/modules/uwsgi/collect.go
@@ -27,14 +27,7 @@ type workerStats struct {
 }
 
 func (u *Uwsgi) collect() (map[string]int64, error) {
-    conn, err := u.establishConn()
-    if err != nil {
-        return nil, fmt.Errorf("failed to connect: %v", err)
-    }
-
-    defer conn.disconnect()
-
-    stats, err := conn.queryStats()
+    stats, err := u.conn.queryStats()
     if err != nil {
         return nil, fmt.Errorf("failed to query stats: %v", err)
     }
@@ -110,16 +103,6 @@ func (u *Uwsgi) collectStats(mx map[string]int64, stats []byte) error {
     return nil
 }
 
-func (u *Uwsgi) establishConn() (uwsgiConn, error) {
-    conn := u.newConn(u.Config)
-
-    if err := conn.connect(); err != nil {
-        return nil, err
-    }
-
-    return conn, nil
-}
-
 func boolToInt(b bool) int64 {
     if b {
         return 1
diff --git a/src/go/plugin/go.d/modules/uwsgi/uwsgi.go b/src/go/plugin/go.d/modules/uwsgi/uwsgi.go
index 7fe98503..c70dd656 100644
--- a/src/go/plugin/go.d/modules/uwsgi/uwsgi.go
+++ b/src/go/plugin/go.d/modules/uwsgi/uwsgi.go
@@ -28,7 +28,6 @@ func New() *Uwsgi {
             Address: "127.0.0.1:1717",
             Timeout: web.Duration(time.Second * 1),
         },
-        newConn:     newUwsgiConn,
         charts:      charts.Copy(),
         seenWorkers: make(map[int]bool),
     }
@@ -46,7 +45,7 @@ type Uwsgi struct {
 
     charts *module.Charts
 
-    newConn func(Config) uwsgiConn
+    conn uwsgiConn
 
     seenWorkers map[int]bool
 }
@@ -61,6 +60,8 @@ func (u *Uwsgi) Init() error {
         return errors.New("address not set")
     }
 
+    u.conn = newUwsgiConn(u.Config)
+
     return nil
 }
diff --git a/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go b/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go
index 900c4853..ce3f2125 100644
--- a/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go
+++ b/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go
@@ -81,7 +81,7 @@ func TestUwsgi_Cleanup(t *testing.T) {
         "after check": {
             prepare: func() *Uwsgi {
                 uw := New()
-                uw.newConn = func(config Config) uwsgiConn { return prepareMockOk() }
+                uw.conn = prepareMockOk()
                 _ = uw.Check()
                 return uw
             },
@@ -89,7 +89,7 @@ func TestUwsgi_Cleanup(t *testing.T) {
         "after collect": {
             prepare: func() *Uwsgi {
                 uw := New()
-                uw.newConn = func(config Config) uwsgiConn { return prepareMockOk() }
+                uw.conn = prepareMockOk()
                 _ = uw.Collect()
                 return uw
             },
@@ -122,10 +122,6 @@ func TestUwsgi_Check(t *testing.T) {
             wantFail:    false,
             prepareMock: prepareMockOkNoWorkers,
         },
-        "err on connect": {
-            wantFail:    true,
-            prepareMock: prepareMockErrOnConnect,
-        },
         "unexpected response": {
             wantFail:    true,
             prepareMock: prepareMockUnexpectedResponse,
@@ -140,7 +136,7 @@ func TestUwsgi_Check(t *testing.T) {
         t.Run(name, func(t *testing.T) {
             uw := New()
             mock := test.prepareMock()
-            uw.newConn = func(config Config) uwsgiConn { return mock }
+            uw.conn = mock
 
             if test.wantFail {
                 assert.Error(t, uw.Check())
@@ -229,12 +225,6 @@ func TestUwsgi_Collect(t *testing.T) {
             disconnectBeforeCleanup: true,
             disconnectAfterCleanup:  true,
         },
-        "err on connect": {
-            prepareMock:             prepareMockErrOnConnect,
-            wantCharts:              len(charts),
-            disconnectBeforeCleanup: false,
-            disconnectAfterCleanup:  false,
-        },
         "err on query stats": {
             prepareMock:             prepareMockErrOnQueryStats,
             wantCharts:              len(charts),
@@ -247,7 +237,7 @@ func TestUwsgi_Collect(t *testing.T) {
         t.Run(name, func(t *testing.T) {
             uw := New()
             mock := test.prepareMock()
-            uw.newConn = func(config Config) uwsgiConn { return mock }
+            uw.conn = mock
 
             mx := uw.Collect()
 
@@ -257,10 +247,6 @@ func TestUwsgi_Collect(t *testing.T) {
                 module.TestMetricsHasAllChartsDims(t, uw.Charts(), mx)
             }
             assert.Equal(t, test.wantCharts, len(*uw.Charts()), "want charts")
-
-            assert.Equal(t, test.disconnectBeforeCleanup, mock.disconnectCalled, "disconnect before cleanup")
-            uw.Cleanup()
-            assert.Equal(t, test.disconnectAfterCleanup, mock.disconnectCalled, "disconnect after cleanup")
         })
     }
 }
@@ -277,12 +263,6 @@ func prepareMockOkNoWorkers() *mockUwsgiConn {
     }
 }
 
-func prepareMockErrOnConnect() *mockUwsgiConn {
-    return &mockUwsgiConn{
-        errOnConnect: true,
-    }
-}
-
 func prepareMockErrOnQueryStats() *mockUwsgiConn {
     return &mockUwsgiConn{
         errOnQueryStats: true,
@@ -300,21 +280,8 @@ func prepareMockEmptyResponse() *mockUwsgiConn {
 }
 
 type mockUwsgiConn struct {
-    errOnConnect     bool
-    errOnQueryStats  bool
-    statsResponse    []byte
-    disconnectCalled bool
-}
-
-func (m *mockUwsgiConn) connect() error {
-    if m.errOnConnect {
-        return errors.New("mock.connect() error")
-    }
-    return nil
-}
-
-func (m *mockUwsgiConn) disconnect() {
-    m.disconnectCalled = true
+    errOnQueryStats bool
+    statsResponse   []byte
 }
 
 func (m *mockUwsgiConn) queryStats() ([]byte, error) {
diff --git a/src/go/plugin/go.d/pkg/socket/client.go b/src/go/plugin/go.d/pkg/socket/client.go
index 26ae1dfa..c2bcbd9e 100644
--- a/src/go/plugin/go.d/pkg/socket/client.go
+++ b/src/go/plugin/go.d/pkg/socket/client.go
@@ -21,6 +21,15 @@ func New(config Config) *Socket {
     }
 }
 
+func ConnectAndRead(config Config, process Processor) error {
+    s := New(config)
+    if err := s.Connect(); err != nil {
+        return err
+    }
+    defer func() { _ = s.Disconnect() }()
+    return read(s.conn, process, s.ReadTimeout)
+}
+
 // Socket is the implementation of a socket client.
 type Socket struct {
     Config
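Note on the new socket.ConnectAndRead() helper above: it wraps the connect / read / disconnect sequence in a single call, which is what allowed the uwsgi client to drop its connect()/disconnect() interface methods. A rough usage sketch, assuming the Processor callback type is func([]byte) bool as the uwsgi change suggests; the address and timeouts below are placeholders, and the callback is invoked per chunk/line read (the uwsgi client above appends a newline per call):

    package main

    import (
        "fmt"
        "time"

        "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/socket"
    )

    func main() {
        cfg := socket.Config{
            Address:        "127.0.0.1:1717", // placeholder uwsgi stats socket
            ConnectTimeout: time.Second,
            ReadTimeout:    time.Second,
            WriteTimeout:   time.Second,
        }

        // Returning true keeps reading; connection teardown happens inside the helper.
        var lines int
        err := socket.ConnectAndRead(cfg, func(bs []byte) bool {
            lines++
            return true
        })
        if err != nil {
            fmt.Println("read failed:", err)
            return
        }
        fmt.Println("read", lines, "lines")
    }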
diff --git a/src/health/health_event_loop.c b/src/health/health_event_loop.c
index b50812f2..04d70e11 100644
--- a/src/health/health_event_loop.c
+++ b/src/health/health_event_loop.c
@@ -75,10 +75,13 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
         time_t needed = now + rc->config.before + rc->config.after;
 
         if(needed + update_every < first || needed - update_every > last) {
-            netdata_log_debug(D_HEALTH
-                , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
-                , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
-                , (unsigned long) last);
+            netdata_log_debug(D_HEALTH,
+                "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu).",
+                rrdcalc_chart_name(rc),
+                rrdcalc_name(rc),
+                (unsigned long) needed,
+                (unsigned long) first,
+                (unsigned long) last);
             return 0;
         }
     }
diff --git a/src/health/rrdvar.c b/src/health/rrdvar.c
index 4e28e62a..75cb9739 100644
--- a/src/health/rrdvar.c
+++ b/src/health/rrdvar.c
@@ -107,7 +107,7 @@ void rrdvar_host_variable_set(RRDHOST *host, const RRDVAR_ACQUIRED *rva, NETDATA
 // CUSTOM CHART VARIABLES
 
 const RRDVAR_ACQUIRED *rrdvar_chart_variable_add_and_acquire(RRDSET *st, const char *name) {
-    if(unlikely(!st->rrdvars)) return NULL;
+    if(unlikely(!st || !st->rrdvars)) return NULL;
 
     STRING *name_string = rrdvar_name_to_string(name);
     const RRDVAR_ACQUIRED *rs = rrdvar_add_and_acquire(st->rrdvars, name_string, NAN);
@@ -116,7 +116,7 @@ const RRDVAR_ACQUIRED *rrdvar_chart_variable_add_and_acquire(RRDSET *st, const c
 }
 
 void rrdvar_chart_variable_set(RRDSET *st, const RRDVAR_ACQUIRED *rva, NETDATA_DOUBLE value) {
-    if(unlikely(!st->rrdvars || !rva)) return;
+    if(unlikely(!st || !st->rrdvars || !rva)) return;
 
     RRDVAR *rv = dictionary_acquired_item_value((const DICTIONARY_ITEM *)rva);
     if(rv->value != value) {
diff --git a/src/health/rrdvar.h b/src/health/rrdvar.h
index 31530589..3297984c 100644
--- a/src/health/rrdvar.h
+++ b/src/health/rrdvar.h
@@ -19,7 +19,7 @@ void rrdvar_host_variable_set(RRDHOST *host, const RRDVAR_ACQUIRED *rva, NETDATA
 int rrdvar_walkthrough_read(DICTIONARY *dict, int (*callback)(const DICTIONARY_ITEM *item, void *rrdvar, void *data), void *data);
 
 #define rrdvar_host_variable_release(host, rva) rrdvar_release((host)->rrdvars, rva)
-#define rrdvar_chart_variable_release(st, rva) rrdvar_release((st)->rrdvars, rva)
+#define rrdvar_chart_variable_release(st, rva) do { if(st) rrdvar_release((st)->rrdvars, rva); } while(0)
 
 void rrdvar_release(DICTIONARY *dict, const RRDVAR_ACQUIRED *rva);
 
 NETDATA_DOUBLE rrdvar2number(const RRDVAR_ACQUIRED *rva);
diff --git a/src/libnetdata/socket/socket.c b/src/libnetdata/socket/socket.c
index 85f67a2b..f907fefe 100644
--- a/src/libnetdata/socket/socket.c
+++ b/src/libnetdata/socket/socket.c
@@ -1196,7 +1196,7 @@ inline int wait_on_socket_or_cancel_with_timeout(
         .revents = 0,
     };
 
-    bool forever = (timeout_ms == 0);
+    bool forever = (timeout_ms <= 0);
 
     while (timeout_ms > 0 || forever) {
         if(nd_thread_signaled_to_cancel()) {