| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2021-05-19 12:33:38 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2021-05-19 12:33:59 +0000 |
| commit | 1ee0c09c5742557e037df5421ca62abddb90ae22 (patch) | |
| tree | 71c0fa48bb6d31d036c9badd7e038527f90d1a73 /collectors/proc.plugin | |
| parent | Releasing debian version 1.30.1-1. (diff) | |
| download | netdata-1ee0c09c5742557e037df5421ca62abddb90ae22.tar.xz, netdata-1ee0c09c5742557e037df5421ca62abddb90ae22.zip | |
Merging upstream version 1.31.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'collectors/proc.plugin')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | collectors/proc.plugin/README.md | 36 |
| -rw-r--r-- | collectors/proc.plugin/ipc.c | 2 |
| -rw-r--r-- | collectors/proc.plugin/plugin_proc.c | 261 |
| -rw-r--r-- | collectors/proc.plugin/plugin_proc.h | 1 |
| -rw-r--r-- | collectors/proc.plugin/proc_diskstats.c | 276 |
| -rw-r--r-- | collectors/proc.plugin/proc_mdstat.c | 19 |
| -rw-r--r-- | collectors/proc.plugin/proc_meminfo.c | 18 |
| -rw-r--r-- | collectors/proc.plugin/proc_net_dev.c | 12 |
| -rw-r--r-- | collectors/proc.plugin/proc_net_wireless.c | 6 |
| -rw-r--r-- | collectors/proc.plugin/proc_pagetypeinfo.c | 4 |
| -rw-r--r-- | collectors/proc.plugin/proc_spl_kstat_zfs.c | 220 |
| -rw-r--r-- | collectors/proc.plugin/proc_vmstat.c | 83 |
| -rw-r--r-- | collectors/proc.plugin/sys_class_infiniband.c | 4 |

13 files changed, 789 insertions, 153 deletions
diff --git a/collectors/proc.plugin/README.md b/collectors/proc.plugin/README.md
index 085afb4f..7fff1ec0 100644
--- a/collectors/proc.plugin/README.md
+++ b/collectors/proc.plugin/README.md
@@ -26,6 +26,8 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/proc.
 - `/proc/loadavg` (system load and total processes running)
 - `/proc/pressure/{cpu,memory,io}` (pressure stall information)
 - `/proc/sys/kernel/random/entropy_avail` (random numbers pool availability - used in cryptography)
+- `/proc/spl/kstat/zfs/arcstats` (status of ZFS adaptive replacement cache)
+- `/proc/spl/kstat/zfs/pool/state` (state of ZFS pools)
 - `/sys/class/power_supply` (power supply properties)
 - `/sys/class/infiniband` (infiniband interconnect)
 - `ipc` (IPC semaphores and message queues)
@@ -46,8 +48,11 @@ Hopefully, the Linux kernel provides many metrics that can provide deep insights
 - **I/O bandwidth/s (kb/s)**
   The amount of data transferred from and to the disk.
+- **Amount of discarded data (kb/s)**
 - **I/O operations/s**
   The number of I/O operations completed.
+- **Extended I/O operations/s**
+  The number of extended I/O operations completed.
 - **Queued I/O operations**
   The number of currently queued I/O operations. For traditional disks that execute commands one after another, one of them is being run by the disk and the rest are just waiting in a queue.
 - **Backlog size (time in ms)**
@@ -57,12 +62,19 @@ Hopefully, the Linux kernel provides many metrics that can provide deep insights
   Of course, for newer disk technologies (like fusion cards) that are capable to execute multiple commands in parallel, this metric is just meaningless.
 - **Average I/O operation time (ms)**
   The average time for I/O requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.
+- **Average I/O operation time for extended operations (ms)**
+  The average time for extended I/O requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.
 - **Average I/O operation size (kb)**
   The average amount of data of the completed I/O operations.
+- **Average amount of discarded data (kb)**
+  The average amount of data of the completed discard operations.
 - **Average Service Time (ms)**
   The average service time for completed I/O operations. This metric is calculated using the total busy time of the disk and the number of completed operations. If the disk is able to execute multiple parallel operations the reporting average service time will be misleading.
+- **Average Service Time for extended I/O operations (ms)**
+  The average service time for completed extended I/O operations.
 - **Merged I/O operations/s**
   The Linux kernel is capable of merging I/O operations. So, if two requests to read data from the disk are adjacent, the Linux kernel may merge them to one before giving them to disk. This metric measures the number of operations that have been merged by the Linux kernel.
+- **Merged discard operations/s**
 - **Total I/O time**
   The sum of the duration of all completed I/O operations. This number can exceed the interval if the disk is able to execute multiple I/O operations in parallel.
 - **Space usage**
@@ -116,6 +128,7 @@ Then edit `netdata.conf` and find the following section. This is the basic plugi
 	# i/o time for all disks = auto
 	# queued operations for all disks = auto
 	# utilization percentage for all disks = auto
+	# extended operations for all disks = auto
 	# backlog for all disks = auto
 	# bcache for all disks = auto
 	# bcache priority stats update every = 0
@@ -147,6 +160,7 @@ For each virtual disk, physical disk and partition you will have a section like
 	# i/o time = auto
 	# queued operations = auto
 	# utilization percentage = auto
+	# extended operations = auto
 	# backlog = auto
 ```
@@ -291,6 +305,28 @@ each state.
    `schedstat filename to monitor`, `cpuidle name filename to monitor`, and `cpuidle time filename to monitor` in the `[plugin:proc:/proc/stat]` configuration section
 
+## Monitoring memory
+
+### Monitored memory metrics
+
+- Amount of memory swapped in/out
+- Amount of memory paged from/to disk
+- Number of memory page faults
+- Number of out of memory kills
+- Number of NUMA events
+
+### Configuration
+
+```conf
+[plugin:proc:/proc/vmstat]
+	filename to monitor = /proc/vmstat
+	swap i/o = auto
+	disk i/o = yes
+	memory page faults = yes
+	out of memory kills = yes
+	system-wide numa metric summary = auto
+```
+
 ## Monitoring Network Interfaces
 
 ### Monitored network interface metrics
diff --git a/collectors/proc.plugin/ipc.c b/collectors/proc.plugin/ipc.c
index 048fe74a..b5c9ae5e 100644
--- a/collectors/proc.plugin/ipc.c
+++ b/collectors/proc.plugin/ipc.c
@@ -209,7 +209,7 @@ int ipc_msq_get_info(char *msg_filename, struct message_queue **message_queue_ro
             continue;
         }
 
-        // find the id in the linked list or create a new stucture
+        // find the id in the linked list or create a new structure
         int found = 0;
 
         unsigned long long id = str2ull(procfile_lineword(ff, l, 1));
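The new `[plugin:proc:/proc/vmstat]` section documented above reads counters from `/proc/vmstat`, which is a plain `name value` table. As a minimal stand-alone sketch of that file format (plain stdio here, whereas the plugin actually uses netdata's `procfile`/ARL helpers):

```c
/* Minimal sketch: read selected counters from /proc/vmstat, a plain
 * "name value" table. Netdata's collector uses its procfile/ARL helpers;
 * this stand-alone version only illustrates the file layout. */
#include <stdio.h>
#include <string.h>

int main(void) {
    FILE *fp = fopen("/proc/vmstat", "r");
    if (!fp) { perror("fopen"); return 1; }

    char name[128];
    unsigned long long value;
    unsigned long long pswpin = 0, pswpout = 0, oom_kill = 0;
    int have_oom_kill = 0; /* the oom_kill counter is optional (newer kernels only) */

    while (fscanf(fp, "%127s %llu", name, &value) == 2) {
        if (!strcmp(name, "pswpin"))        pswpin  = value;
        else if (!strcmp(name, "pswpout"))  pswpout = value;
        else if (!strcmp(name, "oom_kill")) { oom_kill = value; have_oom_kill = 1; }
    }
    fclose(fp);

    printf("pswpin=%llu pswpout=%llu\n", pswpin, pswpout);
    if (have_oom_kill)
        printf("oom_kill=%llu\n", oom_kill);
    return 0;
}
```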
"/proc/net/sockstat6", .dim = "sockstat6", .func = do_proc_net_sockstat6 }, - { .name = "/proc/net/netstat", .dim = "netstat", .func = do_proc_net_netstat }, // this has to be before /proc/net/snmp, because there is a shared metric - { .name = "/proc/net/snmp", .dim = "snmp", .func = do_proc_net_snmp }, - { .name = "/proc/net/snmp6", .dim = "snmp6", .func = do_proc_net_snmp6 }, - { .name = "/proc/net/sctp/snmp", .dim = "sctp", .func = do_proc_net_sctp_snmp }, - { .name = "/proc/net/softnet_stat", .dim = "softnet", .func = do_proc_net_softnet_stat }, - { .name = "/proc/net/ip_vs/stats", .dim = "ipvs", .func = do_proc_net_ip_vs_stats }, - { .name = "/sys/class/infiniband", .dim = "infiniband", .func = do_sys_class_infiniband }, - - // firewall metrics - { .name = "/proc/net/stat/conntrack", .dim = "conntrack", .func = do_proc_net_stat_conntrack }, - { .name = "/proc/net/stat/synproxy", .dim = "synproxy", .func = do_proc_net_stat_synproxy }, - - // disk metrics - { .name = "/proc/diskstats", .dim = "diskstats", .func = do_proc_diskstats }, - { .name = "/proc/mdstat", .dim = "mdstat", .func = do_proc_mdstat }, - - // NFS metrics - { .name = "/proc/net/rpc/nfsd", .dim = "nfsd", .func = do_proc_net_rpc_nfsd }, - { .name = "/proc/net/rpc/nfs", .dim = "nfs", .func = do_proc_net_rpc_nfs }, - - // ZFS metrics - { .name = "/proc/spl/kstat/zfs/arcstats", .dim = "zfs_arcstats", .func = do_proc_spl_kstat_zfs_arcstats }, - - // BTRFS metrics - { .name = "/sys/fs/btrfs", .dim = "btrfs", .func = do_sys_fs_btrfs }, - - // IPC metrics - { .name = "ipc", .dim = "ipc", .func = do_ipc }, - - // linux power supply metrics - { .name = "/sys/class/power_supply", .dim = "power_supply", .func = do_sys_class_power_supply }, - - // the terminator of this array - { .name = NULL, .dim = NULL, .func = NULL } + // system metrics + {.name = "/proc/stat", .dim = "stat", .func = do_proc_stat}, + {.name = "/proc/uptime", .dim = "uptime", .func = do_proc_uptime}, + {.name = "/proc/loadavg", .dim = "loadavg", .func = do_proc_loadavg}, + {.name = "/proc/sys/kernel/random/entropy_avail", .dim = "entropy", .func = do_proc_sys_kernel_random_entropy_avail}, + + // pressure metrics + {.name = "/proc/pressure", .dim = "pressure", .func = do_proc_pressure}, + + // CPU metrics + {.name = "/proc/interrupts", .dim = "interrupts", .func = do_proc_interrupts}, + {.name = "/proc/softirqs", .dim = "softirqs", .func = do_proc_softirqs}, + + // memory metrics + {.name = "/proc/vmstat", .dim = "vmstat", .func = do_proc_vmstat}, + {.name = "/proc/meminfo", .dim = "meminfo", .func = do_proc_meminfo}, + {.name = "/sys/kernel/mm/ksm", .dim = "ksm", .func = do_sys_kernel_mm_ksm}, + {.name = "/sys/block/zram", .dim = "zram", .func = do_sys_block_zram}, + {.name = "/sys/devices/system/edac/mc", .dim = "ecc", .func = do_proc_sys_devices_system_edac_mc}, + {.name = "/sys/devices/system/node", .dim = "numa", .func = do_proc_sys_devices_system_node}, + {.name = "/proc/pagetypeinfo", .dim = "pagetypeinfo", .func = do_proc_pagetypeinfo}, + + // network metrics + {.name = "/proc/net/dev", .dim = "netdev", .func = do_proc_net_dev}, + {.name = "/proc/net/wireless", .dim = "netwireless", .func = do_proc_net_wireless}, + {.name = "/proc/net/sockstat", .dim = "sockstat", .func = do_proc_net_sockstat}, + {.name = "/proc/net/sockstat6", .dim = "sockstat6", .func = do_proc_net_sockstat6}, + {.name = "/proc/net/netstat", + .dim = "netstat", + .func = do_proc_net_netstat}, // this has to be before /proc/net/snmp, because there is a shared metric + {.name = 
"/proc/net/snmp", .dim = "snmp", .func = do_proc_net_snmp}, + {.name = "/proc/net/snmp6", .dim = "snmp6", .func = do_proc_net_snmp6}, + {.name = "/proc/net/sctp/snmp", .dim = "sctp", .func = do_proc_net_sctp_snmp}, + {.name = "/proc/net/softnet_stat", .dim = "softnet", .func = do_proc_net_softnet_stat}, + {.name = "/proc/net/ip_vs/stats", .dim = "ipvs", .func = do_proc_net_ip_vs_stats}, + {.name = "/sys/class/infiniband", .dim = "infiniband", .func = do_sys_class_infiniband}, + + // firewall metrics + {.name = "/proc/net/stat/conntrack", .dim = "conntrack", .func = do_proc_net_stat_conntrack}, + {.name = "/proc/net/stat/synproxy", .dim = "synproxy", .func = do_proc_net_stat_synproxy}, + + // disk metrics + {.name = "/proc/diskstats", .dim = "diskstats", .func = do_proc_diskstats}, + {.name = "/proc/mdstat", .dim = "mdstat", .func = do_proc_mdstat}, + + // NFS metrics + {.name = "/proc/net/rpc/nfsd", .dim = "nfsd", .func = do_proc_net_rpc_nfsd}, + {.name = "/proc/net/rpc/nfs", .dim = "nfs", .func = do_proc_net_rpc_nfs}, + + // ZFS metrics + {.name = "/proc/spl/kstat/zfs/arcstats", .dim = "zfs_arcstats", .func = do_proc_spl_kstat_zfs_arcstats}, + {.name = "/proc/spl/kstat/zfs/pool/state", + .dim = "zfs_pool_state", + .func = do_proc_spl_kstat_zfs_pool_state}, + + // BTRFS metrics + {.name = "/sys/fs/btrfs", .dim = "btrfs", .func = do_sys_fs_btrfs}, + + // IPC metrics + {.name = "ipc", .dim = "ipc", .func = do_ipc}, + + {.name = "/sys/class/power_supply", .dim = "power_supply", .func = do_sys_class_power_supply}, + // linux power supply metrics + + // the terminator of this array + {.name = NULL, .dim = NULL, .func = NULL} }; -static void proc_main_cleanup(void *ptr) { +static void proc_main_cleanup(void *ptr) +{ struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -87,7 +93,8 @@ static void proc_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } -void *proc_main(void *ptr) { +void *proc_main(void *ptr) +{ netdata_thread_cleanup_push(proc_main_cleanup, ptr); int vdo_cpu_netdata = config_get_boolean("plugin:proc", "netdata server resources", CONFIG_BOOLEAN_YES); @@ -96,7 +103,7 @@ void *proc_main(void *ptr) { // check the enabled status for each module int i; - for(i = 0 ; proc_modules[i].name ;i++) { + for (i = 0; proc_modules[i].name; i++) { struct proc_module *pm = &proc_modules[i]; pm->enabled = config_get_boolean("plugin:proc", pm->name, CONFIG_BOOLEAN_YES); @@ -109,20 +116,22 @@ void *proc_main(void *ptr) { heartbeat_init(&hb); size_t iterations = 0; - while(!netdata_exit) { + while (!netdata_exit) { iterations++; (void)iterations; usec_t hb_dt = heartbeat_next(&hb, step); usec_t duration = 0ULL; - if(unlikely(netdata_exit)) break; + if (unlikely(netdata_exit)) + break; // BEGIN -- the job to be done - for(i = 0 ; proc_modules[i].name ;i++) { + for (i = 0; proc_modules[i].name; i++) { struct proc_module *pm = &proc_modules[i]; - if(unlikely(!pm->enabled)) continue; + if (unlikely(!pm->enabled)) + continue; debug(D_PROCNETDEV_LOOP, "PROC calling %s.", pm->name); @@ -139,55 +148,87 @@ void *proc_main(void *ptr) { // log_thread_memory_allocations = 0; //#endif - if(unlikely(netdata_exit)) break; + if (unlikely(netdata_exit)) + break; } // END -- the job is done - // -------------------------------------------------------------------- - - if(vdo_cpu_netdata) { - static RRDSET *st = NULL; - - if(unlikely(!st)) { - st = rrdset_find_active_bytype_localhost("netdata", 
"plugin_proc_modules"); - - if(!st) { - st = rrdset_create_localhost( - "netdata" - , "plugin_proc_modules" - , NULL - , "proc" - , NULL - , "NetData Proc Plugin Modules Durations" - , "milliseconds/run" - , "netdata" - , "stats" - , 132001 - , localhost->rrd_update_every - , RRDSET_TYPE_STACKED - ); - - for(i = 0 ; proc_modules[i].name ;i++) { + if (vdo_cpu_netdata) { + static RRDSET *st_cpu_thread = NULL, *st_duration = NULL; + static RRDDIM *rd_user = NULL, *rd_system = NULL; + + // ---------------------------------------------------------------- + + struct rusage thread; + getrusage(RUSAGE_THREAD, &thread); + + if (unlikely(!st_cpu_thread)) { + st_cpu_thread = rrdset_create_localhost( + "netdata", + "plugin_proc_cpu", + NULL, + "proc", + NULL, + "Netdata proc plugin CPU usage", + "milliseconds/s", + "proc", + "stats", + 132000, + localhost->rrd_update_every, + RRDSET_TYPE_STACKED); + + rd_user = rrddim_add(st_cpu_thread, "user", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); + rd_system = rrddim_add(st_cpu_thread, "system", NULL, 1, USEC_PER_MS, RRD_ALGORITHM_INCREMENTAL); + } else { + rrdset_next(st_cpu_thread); + } + + rrddim_set_by_pointer( + st_cpu_thread, rd_user, thread.ru_utime.tv_sec * USEC_PER_SEC + thread.ru_utime.tv_usec); + rrddim_set_by_pointer( + st_cpu_thread, rd_system, thread.ru_stime.tv_sec * USEC_PER_SEC + thread.ru_stime.tv_usec); + rrdset_done(st_cpu_thread); + + // ---------------------------------------------------------------- + + if (unlikely(!st_duration)) { + st_duration = rrdset_find_active_bytype_localhost("netdata", "plugin_proc_modules"); + + if (!st_duration) { + st_duration = rrdset_create_localhost( + "netdata", + "plugin_proc_modules", + NULL, + "proc", + NULL, + "Netdata proc plugin modules durations", + "milliseconds/run", + "proc", + "stats", + 132001, + localhost->rrd_update_every, + RRDSET_TYPE_STACKED); + + for (i = 0; proc_modules[i].name; i++) { struct proc_module *pm = &proc_modules[i]; - if(unlikely(!pm->enabled)) continue; + if (unlikely(!pm->enabled)) + continue; - pm->rd = rrddim_add(st, pm->dim, NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); + pm->rd = rrddim_add(st_duration, pm->dim, NULL, 1, USEC_PER_MS, RRD_ALGORITHM_ABSOLUTE); } } - } - else rrdset_next(st); + } else + rrdset_next(st_duration); - for(i = 0 ; proc_modules[i].name ;i++) { + for (i = 0; proc_modules[i].name; i++) { struct proc_module *pm = &proc_modules[i]; - if(unlikely(!pm->enabled)) continue; + if (unlikely(!pm->enabled)) + continue; - rrddim_set_by_pointer(st, pm->rd, pm->duration); + rrddim_set_by_pointer(st_duration, pm->rd, pm->duration); } - rrdset_done(st); - - global_statistics_charts(); - registry_statistics(); + rrdset_done(st_duration); } } @@ -209,16 +250,16 @@ int get_numa_node_count(void) char *dirname = config_get("plugin:proc:/sys/devices/system/node", "directory to monitor", name); DIR *dir = opendir(dirname); - if(dir) { + if (dir) { struct dirent *de = NULL; - while((de = readdir(dir))) { - if(de->d_type != DT_DIR) + while ((de = readdir(dir))) { + if (de->d_type != DT_DIR) continue; - if(strncmp(de->d_name, "node", 4) != 0) + if (strncmp(de->d_name, "node", 4) != 0) continue; - if(!isdigit(de->d_name[4])) + if (!isdigit(de->d_name[4])) continue; numa_node_count++; diff --git a/collectors/proc.plugin/plugin_proc.h b/collectors/proc.plugin/plugin_proc.h index 108c026a..b0d60cd8 100644 --- a/collectors/proc.plugin/plugin_proc.h +++ b/collectors/proc.plugin/plugin_proc.h @@ -51,6 +51,7 @@ extern int do_proc_uptime(int update_every, usec_t dt); extern int 
diff --git a/collectors/proc.plugin/proc_diskstats.c b/collectors/proc.plugin/proc_diskstats.c
index b5d02f32..cfaf2134 100644
--- a/collectors/proc.plugin/proc_diskstats.c
+++ b/collectors/proc.plugin/proc_diskstats.c
@@ -32,6 +32,7 @@ static struct disk {
     int do_iotime;
     int do_qops;
     int do_util;
+    int do_ext;
     int do_backlog;
     int do_bcache;
@@ -64,10 +65,17 @@ static struct disk {
     RRDDIM *rd_io_reads;
     RRDDIM *rd_io_writes;
 
+    RRDSET *st_ext_io;
+    RRDDIM *rd_io_discards;
+
     RRDSET *st_ops;
     RRDDIM *rd_ops_reads;
     RRDDIM *rd_ops_writes;
 
+    RRDSET *st_ext_ops;
+    RRDDIM *rd_ops_discards;
+    RRDDIM *rd_ops_flushes;
+
     RRDSET *st_qops;
     RRDDIM *rd_qops_operations;
 
@@ -84,18 +92,32 @@ static struct disk {
     RRDDIM *rd_mops_reads;
     RRDDIM *rd_mops_writes;
 
+    RRDSET *st_ext_mops;
+    RRDDIM *rd_mops_discards;
+
     RRDSET *st_iotime;
     RRDDIM *rd_iotime_reads;
     RRDDIM *rd_iotime_writes;
 
+    RRDSET *st_ext_iotime;
+    RRDDIM *rd_iotime_discards;
+    RRDDIM *rd_iotime_flushes;
+
     RRDSET *st_await;
     RRDDIM *rd_await_reads;
     RRDDIM *rd_await_writes;
 
+    RRDSET *st_ext_await;
+    RRDDIM *rd_await_discards;
+    RRDDIM *rd_await_flushes;
+
     RRDSET *st_avgsz;
     RRDDIM *rd_avgsz_reads;
     RRDDIM *rd_avgsz_writes;
 
+    RRDSET *st_ext_avgsz;
+    RRDDIM *rd_avgsz_discards;
+
     RRDSET *st_svctm;
     RRDDIM *rd_svctm_svctm;
 
@@ -164,6 +186,7 @@ static int global_enable_new_disks_detected_at_runtime = CONFIG_BOOLEAN_YES,
        global_do_iotime = CONFIG_BOOLEAN_AUTO,
        global_do_qops = CONFIG_BOOLEAN_AUTO,
        global_do_util = CONFIG_BOOLEAN_AUTO,
+       global_do_ext = CONFIG_BOOLEAN_AUTO,
        global_do_backlog = CONFIG_BOOLEAN_AUTO,
        global_do_bcache = CONFIG_BOOLEAN_AUTO,
        globals_initialized = 0,
@@ -463,6 +486,7 @@ static void get_disk_config(struct disk *d) {
         d->do_iotime = CONFIG_BOOLEAN_NO;
         d->do_qops = CONFIG_BOOLEAN_NO;
         d->do_util = CONFIG_BOOLEAN_NO;
+        d->do_ext = CONFIG_BOOLEAN_NO;
         d->do_backlog = CONFIG_BOOLEAN_NO;
         d->do_bcache = CONFIG_BOOLEAN_NO;
     }
@@ -513,6 +537,7 @@ static void get_disk_config(struct disk *d) {
             ddo_iotime = CONFIG_BOOLEAN_NO,
             ddo_qops = CONFIG_BOOLEAN_NO,
             ddo_util = CONFIG_BOOLEAN_NO,
+            ddo_ext = CONFIG_BOOLEAN_NO,
             ddo_backlog = CONFIG_BOOLEAN_NO,
             ddo_bcache = CONFIG_BOOLEAN_NO;
 
@@ -524,6 +549,7 @@ static void get_disk_config(struct disk *d) {
             ddo_iotime = global_do_iotime,
             ddo_qops = global_do_qops,
             ddo_util = global_do_util,
+            ddo_ext = global_do_ext,
             ddo_backlog = global_do_backlog,
             ddo_bcache = global_do_bcache;
         }
@@ -534,6 +560,7 @@ static void get_disk_config(struct disk *d) {
         d->do_iotime = config_get_boolean_ondemand(var_name, "i/o time", ddo_iotime);
         d->do_qops = config_get_boolean_ondemand(var_name, "queued operations", ddo_qops);
         d->do_util = config_get_boolean_ondemand(var_name, "utilization percentage", ddo_util);
+        d->do_ext = config_get_boolean_ondemand(var_name, "extended operations", ddo_ext);
         d->do_backlog = config_get_boolean_ondemand(var_name, "backlog", ddo_backlog);
 
         if(d->device_is_bcache)
@@ -820,6 +847,7 @@ int do_proc_diskstats(int update_every, usec_t dt) {
         global_do_iotime = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "i/o time for all disks", global_do_iotime);
         global_do_qops = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "queued operations for all disks", global_do_qops);
         global_do_util = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "utilization percentage for all disks", global_do_util);
+        global_do_ext = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "extended operations for all disks", global_do_ext);
         global_do_backlog = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "backlog for all disks", global_do_backlog);
         global_do_bcache = config_get_boolean_ondemand(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "bcache for all disks", global_do_bcache);
         global_bcache_priority_stats_update_every = (int)config_get_number(CONFIG_SECTION_PLUGIN_PROC_DISKSTATS, "bcache priority stats update every", global_bcache_priority_stats_update_every);
@@ -889,6 +917,8 @@ int do_proc_diskstats(int update_every, usec_t dt) {
 
     collected_number system_read_kb = 0, system_write_kb = 0;
 
+    int do_dc_stats = 0, do_fl_stats = 0;
+
     for(l = 0; l < lines ;l++) {
         // --------------------------------------------------------------------------
         // Read parameters
@@ -898,11 +928,16 @@ int do_proc_diskstats(int update_every, usec_t dt) {
         collected_number reads = 0, mreads = 0, readsectors = 0, readms = 0,
                          writes = 0, mwrites = 0, writesectors = 0, writems = 0,
-                         queued_ios = 0, busy_ms = 0, backlog_ms = 0;
+                         queued_ios = 0, busy_ms = 0, backlog_ms = 0,
+                         discards = 0, mdiscards = 0, discardsectors = 0, discardms = 0,
+                         flushes = 0, flushms = 0;
+
         collected_number last_reads = 0, last_readsectors = 0, last_readms = 0,
                          last_writes = 0, last_writesectors = 0, last_writems = 0,
-                         last_busy_ms = 0;
+                         last_busy_ms = 0,
+                         last_discards = 0, last_discardsectors = 0, last_discardms = 0,
+                         last_flushes = 0, last_flushms = 0;
 
         size_t words = procfile_linewords(ff, l);
         if(unlikely(words < 14)) continue;
@@ -951,6 +986,40 @@ int do_proc_diskstats(int update_every, usec_t dt) {
         // I/O completion time and the backlog that may be accumulating.
         backlog_ms = str2ull(procfile_lineword(ff, l, 13)); // rq_ticks
 
+        if (unlikely(words > 13)) {
+            do_dc_stats = 1;
+
+            // # of discards completed
+            // This is the total number of discards completed successfully.
+            discards = str2ull(procfile_lineword(ff, l, 14)); // dc_ios
+
+            // # of discards merged
+            // See the description of mreads/mwrites
+            mdiscards = str2ull(procfile_lineword(ff, l, 15)); // dc_merges
+
+            // # of sectors discarded
+            // This is the total number of sectors discarded successfully.
+            discardsectors = str2ull(procfile_lineword(ff, l, 16)); // dc_sec
+
+            // # of milliseconds spent discarding
+            // This is the total number of milliseconds spent by all discards (as
+            // measured from __make_request() to end_that_request_last()).
+            discardms = str2ull(procfile_lineword(ff, l, 17)); // dc_ticks
+        }
+
+        if (unlikely(words > 17)) {
+            do_fl_stats = 1;
+
+            // number of flush I/Os processed
+            // These values increment when an flush I/O request completes.
+            // Block layer combines flush requests and executes at most one at a time.
+            // This counts flush requests executed by disk. Not tracked for partitions.
+            flushes = str2ull(procfile_lineword(ff, l, 18)); // fl_ios
+
+            // total wait time for flush requests
+            flushms = str2ull(procfile_lineword(ff, l, 19)); // fl_ticks
+        }
+
         // --------------------------------------------------------------------------
         // get a disk structure for the disk
 
@@ -976,7 +1045,7 @@ int do_proc_diskstats(int update_every, usec_t dt) {
         // Do performance metrics
 
         if(d->do_io == CONFIG_BOOLEAN_YES || (d->do_io == CONFIG_BOOLEAN_AUTO &&
-                                              (readsectors || writesectors ||
+                                              (readsectors || writesectors || discardsectors ||
                                                netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
             d->do_io = CONFIG_BOOLEAN_YES;
 
@@ -1008,8 +1077,37 @@ int do_proc_diskstats(int update_every, usec_t dt) {
 
         // --------------------------------------------------------------------
 
+        if (do_dc_stats && d->do_io == CONFIG_BOOLEAN_YES && d->do_ext != CONFIG_BOOLEAN_NO) {
+            if (unlikely(!d->st_ext_io)) {
+                d->st_ext_io = rrdset_create_localhost(
+                        "disk_ext"
+                        , d->device
+                        , d->disk
+                        , family
+                        , "disk_ext.io"
+                        , "Amount of Discarded Data"
+                        , "KiB/s"
+                        , PLUGIN_PROC_NAME
+                        , PLUGIN_PROC_MODULE_DISKSTATS_NAME
+                        , NETDATA_CHART_PRIO_DISK_IO + 1
+                        , update_every
+                        , RRDSET_TYPE_AREA
+                );
+
+                d->rd_io_discards =
+                    rrddim_add(d->st_ext_io, "discards", NULL, d->sector_size, 1024, RRD_ALGORITHM_INCREMENTAL);
+            } else
+                rrdset_next(d->st_ext_io);
+
+            last_discardsectors = rrddim_set_by_pointer(d->st_ext_io, d->rd_io_discards, discardsectors);
+            rrdset_done(d->st_ext_io);
+        }
+
+        // --------------------------------------------------------------------
+
         if(d->do_ops == CONFIG_BOOLEAN_YES || (d->do_ops == CONFIG_BOOLEAN_AUTO &&
-                                               (reads || writes || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
+                                               (reads || writes || discards || flushes ||
+                                                netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
             d->do_ops = CONFIG_BOOLEAN_YES;
 
             if(unlikely(!d->st_ops)) {
@@ -1042,6 +1140,39 @@ int do_proc_diskstats(int update_every, usec_t dt) {
 
         // --------------------------------------------------------------------
 
+        if (do_dc_stats && d->do_ops == CONFIG_BOOLEAN_YES && d->do_ext != CONFIG_BOOLEAN_NO) {
+            if (unlikely(!d->st_ext_ops)) {
+                d->st_ext_ops = rrdset_create_localhost(
+                        "disk_ext_ops"
+                        , d->device
+                        , d->disk
+                        , family
+                        , "disk_ext.ops"
+                        , "Disk Completed Extended I/O Operations"
+                        , "operations/s"
+                        , PLUGIN_PROC_NAME
+                        , PLUGIN_PROC_MODULE_DISKSTATS_NAME
+                        , NETDATA_CHART_PRIO_DISK_OPS + 1
+                        , update_every
+                        , RRDSET_TYPE_LINE
+                );
+
+                rrdset_flag_set(d->st_ext_ops, RRDSET_FLAG_DETAIL);
+
+                d->rd_ops_discards = rrddim_add(d->st_ext_ops, "discards", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+                if (do_fl_stats)
+                    d->rd_ops_flushes = rrddim_add(d->st_ext_ops, "flushes", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            } else
+                rrdset_next(d->st_ext_ops);
+
+            last_discards = rrddim_set_by_pointer(d->st_ext_ops, d->rd_ops_discards, discards);
+            if (do_fl_stats)
+                last_flushes = rrddim_set_by_pointer(d->st_ext_ops, d->rd_ops_flushes, flushes);
+            rrdset_done(d->st_ext_ops);
+        }
+
+        // --------------------------------------------------------------------
+
         if(d->do_qops == CONFIG_BOOLEAN_YES || (d->do_qops == CONFIG_BOOLEAN_AUTO &&
                                                 (queued_ios || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
             d->do_qops = CONFIG_BOOLEAN_YES;
@@ -1171,7 +1302,8 @@ int do_proc_diskstats(int update_every, usec_t dt) {
         // --------------------------------------------------------------------
 
         if(d->do_mops == CONFIG_BOOLEAN_YES || (d->do_mops == CONFIG_BOOLEAN_AUTO &&
-                                                (mreads || mwrites || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
+                                                (mreads || mwrites || mdiscards ||
+                                                 netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
             d->do_mops = CONFIG_BOOLEAN_YES;
 
             if(unlikely(!d->st_mops)) {
@@ -1204,8 +1336,39 @@ int do_proc_diskstats(int update_every, usec_t dt) {
 
         // --------------------------------------------------------------------
 
+        if(do_dc_stats && d->do_mops == CONFIG_BOOLEAN_YES && d->do_ext != CONFIG_BOOLEAN_NO) {
+            d->do_mops = CONFIG_BOOLEAN_YES;
+
+            if(unlikely(!d->st_ext_mops)) {
+                d->st_ext_mops = rrdset_create_localhost(
+                        "disk_ext_mops"
+                        , d->device
+                        , d->disk
+                        , family
+                        , "disk_ext.mops"
+                        , "Disk Merged Discard Operations"
+                        , "merged operations/s"
+                        , PLUGIN_PROC_NAME
+                        , PLUGIN_PROC_MODULE_DISKSTATS_NAME
+                        , NETDATA_CHART_PRIO_DISK_MOPS + 1
+                        , update_every
+                        , RRDSET_TYPE_LINE
+                );
+
+                rrdset_flag_set(d->st_ext_mops, RRDSET_FLAG_DETAIL);
+
+                d->rd_mops_discards = rrddim_add(d->st_ext_mops, "discards", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            } else
+                rrdset_next(d->st_ext_mops);
+
+            rrddim_set_by_pointer(d->st_ext_mops, d->rd_mops_discards, mdiscards);
+            rrdset_done(d->st_ext_mops);
+        }
+
+        // --------------------------------------------------------------------
+
         if(d->do_iotime == CONFIG_BOOLEAN_YES || (d->do_iotime == CONFIG_BOOLEAN_AUTO &&
-                                                  (readms || writems || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
+                                                  (readms || writems || discardms || flushms ||
+                                                   netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
             d->do_iotime = CONFIG_BOOLEAN_YES;
 
             if(unlikely(!d->st_iotime)) {
@@ -1237,6 +1400,40 @@ int do_proc_diskstats(int update_every, usec_t dt) {
         }
 
         // --------------------------------------------------------------------
+
+        if(do_dc_stats && d->do_iotime == CONFIG_BOOLEAN_YES && d->do_ext != CONFIG_BOOLEAN_NO) {
+            if(unlikely(!d->st_ext_iotime)) {
+                d->st_ext_iotime = rrdset_create_localhost(
+                        "disk_ext_iotime"
+                        , d->device
+                        , d->disk
+                        , family
+                        , "disk_ext.iotime"
+                        , "Disk Total I/O Time for Extended Operations"
+                        , "milliseconds/s"
+                        , PLUGIN_PROC_NAME
+                        , PLUGIN_PROC_MODULE_DISKSTATS_NAME
+                        , NETDATA_CHART_PRIO_DISK_IOTIME + 1
+                        , update_every
+                        , RRDSET_TYPE_LINE
+                );
+
+                rrdset_flag_set(d->st_ext_iotime, RRDSET_FLAG_DETAIL);
+
+                d->rd_iotime_discards = rrddim_add(d->st_ext_iotime, "discards", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+                if (do_fl_stats)
+                    d->rd_iotime_flushes =
+                        rrddim_add(d->st_ext_iotime, "flushes", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+            } else
+                rrdset_next(d->st_ext_iotime);
+
+            last_discardms = rrddim_set_by_pointer(d->st_ext_iotime, d->rd_iotime_discards, discardms);
+            if (do_fl_stats)
+                last_flushms = rrddim_set_by_pointer(d->st_ext_iotime, d->rd_iotime_flushes, flushms);
+            rrdset_done(d->st_ext_iotime);
+        }
+
+        // --------------------------------------------------------------------
         // calculate differential charts
         // only if this is not the first time we run
 
@@ -1276,6 +1473,42 @@ int do_proc_diskstats(int update_every, usec_t dt) {
                 rrdset_done(d->st_await);
             }
 
+            if (do_dc_stats && d->do_iotime == CONFIG_BOOLEAN_YES && d->do_ops == CONFIG_BOOLEAN_YES && d->do_ext != CONFIG_BOOLEAN_NO) {
+                if(unlikely(!d->st_ext_await)) {
+                    d->st_ext_await = rrdset_create_localhost(
+                            "disk_ext_await"
+                            , d->device
+                            , d->disk
+                            , family
+                            , "disk_ext.await"
+                            , "Average Completed Extended I/O Operation Time"
+                            , "milliseconds/operation"
+                            , PLUGIN_PROC_NAME
+                            , PLUGIN_PROC_MODULE_DISKSTATS_NAME
+                            , NETDATA_CHART_PRIO_DISK_AWAIT + 1
+                            , update_every
+                            , RRDSET_TYPE_LINE
+                    );
+
+                    rrdset_flag_set(d->st_ext_await, RRDSET_FLAG_DETAIL);
+
+                    d->rd_await_discards = rrddim_add(d->st_ext_await, "discards", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                    if (do_fl_stats)
+                        d->rd_await_flushes =
+                            rrddim_add(d->st_ext_await, "flushes", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                } else
+                    rrdset_next(d->st_ext_await);
+
+                rrddim_set_by_pointer(
+                    d->st_ext_await, d->rd_await_discards,
+                    (discards - last_discards) ? (discardms - last_discardms) / (discards - last_discards) : 0);
+                if (do_fl_stats)
+                    rrddim_set_by_pointer(
+                        d->st_ext_await, d->rd_await_flushes,
+                        (flushes - last_flushes) ? (flushms - last_flushms) / (flushes - last_flushes) : 0);
+                rrdset_done(d->st_ext_await);
+            }
+
             if( (d->do_io  == CONFIG_BOOLEAN_YES || (d->do_io  == CONFIG_BOOLEAN_AUTO &&
                                                      (readsectors || writesectors || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) &&
                 (d->do_ops == CONFIG_BOOLEAN_YES || (d->do_ops == CONFIG_BOOLEAN_AUTO &&
@@ -1309,6 +1542,37 @@ int do_proc_diskstats(int update_every, usec_t dt) {
                 rrdset_done(d->st_avgsz);
             }
 
+            if(do_dc_stats && d->do_io == CONFIG_BOOLEAN_YES && d->do_ops == CONFIG_BOOLEAN_YES && d->do_ext != CONFIG_BOOLEAN_NO) {
+                if(unlikely(!d->st_ext_avgsz)) {
+                    d->st_ext_avgsz = rrdset_create_localhost(
+                            "disk_ext_avgsz"
+                            , d->device
+                            , d->disk
+                            , family
+                            , "disk_ext.avgsz"
+                            , "Average Amount of Discarded Data"
+                            , "KiB/operation"
+                            , PLUGIN_PROC_NAME
+                            , PLUGIN_PROC_MODULE_DISKSTATS_NAME
+                            , NETDATA_CHART_PRIO_DISK_AVGSZ
+                            , update_every
+                            , RRDSET_TYPE_AREA
+                    );
+
+                    rrdset_flag_set(d->st_ext_avgsz, RRDSET_FLAG_DETAIL);
+
+                    d->rd_avgsz_discards =
+                        rrddim_add(d->st_ext_avgsz, "discards", NULL, d->sector_size, 1024, RRD_ALGORITHM_ABSOLUTE);
+                } else
+                    rrdset_next(d->st_ext_avgsz);
+
+                rrddim_set_by_pointer(
+                    d->st_ext_avgsz, d->rd_avgsz_discards,
+                    (discards - last_discards) ? (discardsectors - last_discardsectors) / (discards - last_discards) :
+                                                 0);
+                rrdset_done(d->st_ext_avgsz);
+            }
+
             if( (d->do_util == CONFIG_BOOLEAN_YES || (d->do_util == CONFIG_BOOLEAN_AUTO &&
                                                       (busy_ms ||
                                                        netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) &&
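The diskstats hunks above read the optional columns newer kernels append to each `/proc/diskstats` row: four discard fields (added in Linux 4.18) and two flush fields (added in Linux 5.5). A stand-alone sketch of that column layout, using `sscanf` instead of netdata's `procfile` word indexing, with a made-up sample line:

```c
/* Sketch: parse one /proc/diskstats line, including the optional discard
 * (kernel >= 4.18) and flush (kernel >= 5.5) columns the diff above reads.
 * The sample line is illustrative, not real data. */
#include <stdio.h>

int main(void) {
    const char *line =
        "8 0 sda 1000 10 8000 500 2000 20 16000 900 0 700 1600 "
        "30 2 4096 15 5 25"; /* 17 stat fields => discards + flushes present */

    unsigned major, minor;
    char dev[32];
    unsigned long long reads, mreads, rsect, rms, writes, mwrites, wsect, wms;
    unsigned long long inflight, busy_ms, backlog_ms;
    unsigned long long dc_ios, dc_merges, dc_sec, dc_ms, fl_ios, fl_ms;

    int n = sscanf(line,
                   "%u %u %31s %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu"
                   " %llu %llu %llu %llu %llu %llu",
                   &major, &minor, dev,
                   &reads, &mreads, &rsect, &rms,
                   &writes, &mwrites, &wsect, &wms,
                   &inflight, &busy_ms, &backlog_ms,
                   &dc_ios, &dc_merges, &dc_sec, &dc_ms,
                   &fl_ios, &fl_ms);

    if (n >= 18) /* all four discard columns were present */
        printf("%s: %llu discards, %llu sectors discarded\n", dev, dc_ios, dc_sec);
    if (n >= 20) /* both flush columns were present */
        printf("%s: %llu flushes, %llu ms flushing\n", dev, fl_ios, fl_ms);
    return 0;
}
```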
diff --git a/collectors/proc.plugin/proc_mdstat.c b/collectors/proc.plugin/proc_mdstat.c
index e932453b..46f0134e 100644
--- a/collectors/proc.plugin/proc_mdstat.c
+++ b/collectors/proc.plugin/proc_mdstat.c
@@ -8,6 +8,7 @@ struct raid {
     int redundant;
     char *name;
     uint32_t hash;
+    char *level;
 
     RRDDIM *rd_health;
     unsigned long long failed_disks;
@@ -149,6 +150,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         for (raid_idx = 0; raid_idx < raids_allocated; raid_idx++) {
             struct raid *raid = &raids[raid_idx];
             freez(raid->name);
+            freez(raid->level);
             freez(raid->mismatch_cnt_filename);
         }
         if (raids_num) {
@@ -168,7 +170,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
 
         words = procfile_linewords(ff, l);
 
-        if (unlikely(words < 2))
+        if (unlikely(words < 3))
             continue;
 
         if (unlikely(procfile_lineword(ff, l, 1)[0] != 'a'))
@@ -177,12 +179,15 @@ int do_proc_mdstat(int update_every, usec_t dt)
         if (unlikely(!raid->name)) {
             raid->name = strdupz(procfile_lineword(ff, l, 0));
             raid->hash = simple_hash(raid->name);
+            raid->level = strdupz(procfile_lineword(ff, l, 2));
         } else if (unlikely(strcmp(raid->name, procfile_lineword(ff, l, 0)))) {
             freez(raid->name);
             freez(raid->mismatch_cnt_filename);
+            freez(raid->level);
             memset(raid, 0, sizeof(struct raid));
             raid->name = strdupz(procfile_lineword(ff, l, 0));
             raid->hash = simple_hash(raid->name);
+            raid->level = strdupz(procfile_lineword(ff, l, 2));
         }
 
         if (unlikely(!raid->name || !raid->name[0]))
@@ -436,7 +441,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         snprintfz(id, 50, "%s_disks", raid->name);
 
         if (unlikely(!raid->st_disks && !(raid->st_disks = rrdset_find_active_byname_localhost(id)))) {
-            snprintfz(family, 50, "%s", raid->name);
+            snprintfz(family, 50, "%s (%s)", raid->name, raid->level);
 
             raid->st_disks = rrdset_create_localhost(
                 "mdstat",
@@ -473,7 +478,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         snprintfz(id, 50, "%s_mismatch", raid->name);
 
         if (unlikely(!raid->st_mismatch_cnt && !(raid->st_mismatch_cnt = rrdset_find_active_byname_localhost(id)))) {
-            snprintfz(family, 50, "%s", raid->name);
+            snprintfz(family, 50, "%s (%s)", raid->name, raid->level);
 
             raid->st_mismatch_cnt = rrdset_create_localhost(
                 "mdstat",
@@ -507,7 +512,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         snprintfz(id, 50, "%s_operation", raid->name);
 
         if (unlikely(!raid->st_operation && !(raid->st_operation = rrdset_find_active_byname_localhost(id)))) {
-            snprintfz(family, 50, "%s", raid->name);
+            snprintfz(family, 50, "%s (%s)", raid->name, raid->level);
 
             raid->st_operation = rrdset_create_localhost(
                 "mdstat",
@@ -548,7 +553,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         snprintfz(id, 50, "%s_finish", raid->name);
 
         if (unlikely(!raid->st_finish && !(raid->st_finish = rrdset_find_active_byname_localhost(id)))) {
-            snprintfz(family, 50, "%s", raid->name);
+            snprintfz(family, 50, "%s (%s)", raid->name, raid->level);
 
             raid->st_finish = rrdset_create_localhost(
                 "mdstat",
@@ -579,7 +584,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         snprintfz(id, 50, "%s_speed", raid->name);
 
         if (unlikely(!raid->st_speed && !(raid->st_speed = rrdset_find_active_byname_localhost(id)))) {
-            snprintfz(family, 50, "%s", raid->name);
+            snprintfz(family, 50, "%s (%s)", raid->name, raid->level);
 
             raid->st_speed = rrdset_create_localhost(
                 "mdstat",
@@ -613,7 +618,7 @@ int do_proc_mdstat(int update_every, usec_t dt)
         snprintfz(id, 50, "%s_availability", raid->name);
 
         if (unlikely(!raid->st_nonredundant && !(raid->st_nonredundant = rrdset_find_active_localhost(id)))) {
-            snprintfz(family, 50, "%s", raid->name);
+            snprintfz(family, 50, "%s (%s)", raid->name, raid->level);
 
             raid->st_nonredundant = rrdset_create_localhost(
                 "mdstat",
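The mdstat change above keeps the RAID level (the third word of a `/proc/mdstat` device line) so chart families read like `md0 (raid1)` instead of just `md0`. A small sketch of that extraction, with a simplified, hypothetical sample line:

```c
/* Sketch: pull the RAID level out of an mdstat device line and build the
 * "name (level)" family label the diff above introduces. The sample line
 * is illustrative; netdata uses procfile word indexing instead of sscanf. */
#include <stdio.h>

int main(void) {
    const char *line = "md0 : active raid1 sdb1[1] sda1[0]";

    char name[32], state[32], level[32];
    if (sscanf(line, "%31s : %31s %31s", name, state, level) == 3) {
        char family[80];
        snprintf(family, sizeof(family), "%s (%s)", name, level);
        printf("family: %s\n", family); /* -> "md0 (raid1)" */
    }
    return 0;
}
```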
diff --git a/collectors/proc.plugin/proc_meminfo.c b/collectors/proc.plugin/proc_meminfo.c
index 51d77fe0..5b402caa 100644
--- a/collectors/proc.plugin/proc_meminfo.c
+++ b/collectors/proc.plugin/proc_meminfo.c
@@ -10,6 +10,7 @@ int do_proc_meminfo(int update_every, usec_t dt) {
     static procfile *ff = NULL;
     static int do_ram = -1, do_swap = -1, do_hwcorrupt = -1, do_committed = -1, do_writeback = -1, do_kernel = -1, do_slab = -1, do_hugepages = -1, do_transparent_hugepages = -1;
+    static int do_percpu = 0;
 
     static ARL_BASE *arl_base = NULL;
     static ARL_ENTRY *arl_hwcorrupted = NULL, *arl_memavailable = NULL;
@@ -49,6 +50,7 @@ int do_proc_meminfo(int update_every, usec_t dt) {
         //VmallocTotal = 0,
         VmallocUsed = 0,
         //VmallocChunk = 0,
+        Percpu = 0,
         AnonHugePages = 0,
         ShmemHugePages = 0,
         HugePages_Total = 0,
@@ -106,6 +108,7 @@ int do_proc_meminfo(int update_every, usec_t dt) {
         //arl_expect(arl_base, "VmallocTotal", &VmallocTotal);
         arl_expect(arl_base, "VmallocUsed", &VmallocUsed);
         //arl_expect(arl_base, "VmallocChunk", &VmallocChunk);
+        arl_expect(arl_base, "Percpu", &Percpu);
         arl_hwcorrupted = arl_expect(arl_base, "HardwareCorrupted", &HardwareCorrupted);
         arl_expect(arl_base, "AnonHugePages", &AnonHugePages);
         arl_expect(arl_base, "ShmemHugePages", &ShmemHugePages);
@@ -134,15 +137,23 @@ int do_proc_meminfo(int update_every, usec_t dt) {
 
     arl_begin(arl_base);
 
+    static int first_ff_read = 1;
+
     for(l = 0; l < lines ;l++) {
         size_t words = procfile_linewords(ff, l);
         if(unlikely(words < 2)) continue;
 
+        if (first_ff_read && !strcmp(procfile_lineword(ff, l, 0), "Percpu"))
+            do_percpu = 1;
+
         if(unlikely(arl_check(arl_base,
                               procfile_lineword(ff, l, 0),
                               procfile_lineword(ff, l, 1)))) break;
     }
 
+    if (first_ff_read)
+        first_ff_read = 0;
+
     // --------------------------------------------------------------------
     // http://calimeroteknik.free.fr/blag/?article20/really-used-memory-on-gnu-linux
 
@@ -371,7 +382,8 @@ int do_proc_meminfo(int update_every, usec_t dt) {
 
     if(do_kernel) {
         static RRDSET *st_mem_kernel = NULL;
-        static RRDDIM *rd_slab = NULL, *rd_kernelstack = NULL, *rd_pagetables = NULL, *rd_vmallocused = NULL;
+        static RRDDIM *rd_slab = NULL, *rd_kernelstack = NULL, *rd_pagetables = NULL, *rd_vmallocused = NULL,
+                      *rd_percpu = NULL;
 
         if(unlikely(!st_mem_kernel)) {
             st_mem_kernel = rrdset_create_localhost(
@@ -395,6 +407,8 @@ int do_proc_meminfo(int update_every, usec_t dt) {
             rd_kernelstack = rrddim_add(st_mem_kernel, "KernelStack", NULL, 1, 1024, RRD_ALGORITHM_ABSOLUTE);
             rd_pagetables = rrddim_add(st_mem_kernel, "PageTables", NULL, 1, 1024, RRD_ALGORITHM_ABSOLUTE);
             rd_vmallocused = rrddim_add(st_mem_kernel, "VmallocUsed", NULL, 1, 1024, RRD_ALGORITHM_ABSOLUTE);
+            if (do_percpu)
+                rd_percpu = rrddim_add(st_mem_kernel, "Percpu", NULL, 1, 1024, RRD_ALGORITHM_ABSOLUTE);
         }
         else rrdset_next(st_mem_kernel);
 
@@ -402,6 +416,8 @@ int do_proc_meminfo(int update_every, usec_t dt) {
         rrddim_set_by_pointer(st_mem_kernel, rd_kernelstack, KernelStack);
         rrddim_set_by_pointer(st_mem_kernel, rd_pagetables, PageTables);
         rrddim_set_by_pointer(st_mem_kernel, rd_vmallocused, VmallocUsed);
+        if (do_percpu)
+            rrddim_set_by_pointer(st_mem_kernel, rd_percpu, Percpu);
 
         rrdset_done(st_mem_kernel);
     }
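The meminfo hunk above probes for the optional `Percpu` row only on the first read, then remembers the answer in a static flag so the chart's dimension set stays stable. The same first-pass detection pattern as a stand-alone sketch:

```c
/* Sketch of the pattern used above: on the first read of /proc/meminfo,
 * detect whether the optional "Percpu" row exists (newer kernels only)
 * and latch the answer so later passes never re-probe. */
#include <stdio.h>
#include <string.h>

static int do_percpu = 0;   /* latched on the first pass */
static int first_read = 1;

static void scan_meminfo(void) {
    FILE *fp = fopen("/proc/meminfo", "r");
    if (!fp) return;

    char line[256];
    while (fgets(line, sizeof(line), fp)) {
        if (first_read && !strncmp(line, "Percpu:", 7))
            do_percpu = 1; /* the kernel chart would gain a "Percpu" dimension */
    }
    fclose(fp);
    first_read = 0;
}

int main(void) {
    scan_meminfo();
    printf("Percpu row present: %s\n", do_percpu ? "yes" : "no");
    return 0;
}
```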
diff --git a/collectors/proc.plugin/proc_net_dev.c b/collectors/proc.plugin/proc_net_dev.c
index 24715f29..bbf8a590 100644
--- a/collectors/proc.plugin/proc_net_dev.c
+++ b/collectors/proc.plugin/proc_net_dev.c
@@ -841,7 +841,7 @@ int do_proc_net_dev(int update_every, usec_t dt) {
             d->rd_tbytes = rrddim_add(d->st_bandwidth, "sent", NULL, -8, BITS_IN_A_KILOBIT, RRD_ALGORITHM_INCREMENTAL);
 
             if(d->flipped) {
-                // flip receive/trasmit
+                // flip receive/transmit
 
                 RRDDIM *td = d->rd_rbytes;
                 d->rd_rbytes = d->rd_tbytes;
@@ -1064,7 +1064,7 @@ int do_proc_net_dev(int update_every, usec_t dt) {
                 d->rd_rmulticast = rrddim_add(d->st_packets, "multicast", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
 
                 if(d->flipped) {
-                    // flip receive/trasmit
+                    // flip receive/transmit
 
                     RRDDIM *td = d->rd_rpackets;
                     d->rd_rpackets = d->rd_tpackets;
@@ -1111,7 +1111,7 @@ int do_proc_net_dev(int update_every, usec_t dt) {
                 d->rd_terrors = rrddim_add(d->st_errors, "outbound", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
 
                 if(d->flipped) {
-                    // flip receive/trasmit
+                    // flip receive/transmit
 
                     RRDDIM *td = d->rd_rerrors;
                     d->rd_rerrors = d->rd_terrors;
@@ -1157,7 +1157,7 @@ int do_proc_net_dev(int update_every, usec_t dt) {
                 d->rd_tdrops = rrddim_add(d->st_drops, "outbound", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
 
                 if(d->flipped) {
-                    // flip receive/trasmit
+                    // flip receive/transmit
 
                     RRDDIM *td = d->rd_rdrops;
                     d->rd_rdrops = d->rd_tdrops;
@@ -1203,7 +1203,7 @@ int do_proc_net_dev(int update_every, usec_t dt) {
                 d->rd_tfifo = rrddim_add(d->st_fifo, "transmit", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
 
                 if(d->flipped) {
-                    // flip receive/trasmit
+                    // flip receive/transmit
 
                     RRDDIM *td = d->rd_rfifo;
                     d->rd_rfifo = d->rd_tfifo;
@@ -1249,7 +1249,7 @@ int do_proc_net_dev(int update_every, usec_t dt) {
                 d->rd_tcompressed = rrddim_add(d->st_compressed, "sent", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
 
                 if(d->flipped) {
-                    // flip receive/trasmit
+                    // flip receive/transmit
 
                     RRDDIM *td = d->rd_rcompressed;
                     d->rd_rcompressed = d->rd_tcompressed;
diff --git a/collectors/proc.plugin/proc_net_wireless.c b/collectors/proc.plugin/proc_net_wireless.c
index 32a53c68..cb2443b1 100644
--- a/collectors/proc.plugin/proc_net_wireless.c
+++ b/collectors/proc.plugin/proc_net_wireless.c
@@ -48,7 +48,7 @@ static struct netwireless {
     const char *chart_family;
 
     // charts
-    // satus
+    // status
     RRDSET *st_status;
 
     // Quality
@@ -119,7 +119,7 @@ static void netwireless_cleanup(struct timeval *timestamp)
     struct netwireless *previous = NULL;
     struct netwireless *current;
 
-    // search it, from begining to the end
+    // search it, from beginning to the end
     for (current = netwireless_root; current;) {
 
         if (timercmp(&current->updated, timestamp, <)) {
@@ -145,7 +145,7 @@ static struct netwireless *find_or_create_wireless(const char *name)
     struct netwireless *wireless;
     uint32_t hash = simple_hash(name);
 
-    // search it, from begining to the end
+    // search it, from beginning to the end
     for (wireless = netwireless_root ; wireless ; wireless = wireless->next) {
         if (unlikely(hash == wireless->hash && !strcmp(name, wireless->name))) {
             return wireless;
diff --git a/collectors/proc.plugin/proc_pagetypeinfo.c b/collectors/proc.plugin/proc_pagetypeinfo.c
index 6b6c6c4e..3ce29222 100644
--- a/collectors/proc.plugin/proc_pagetypeinfo.c
+++ b/collectors/proc.plugin/proc_pagetypeinfo.c
@@ -226,7 +226,7 @@ int do_proc_pagetypeinfo(int update_every, usec_t dt) {
     for (p = 0; p < pagelines_cnt; p++) {
         pgl = &pagelines[p];
 
-        // Skip invalid, refused or empty pagelines if not explicitely requested
+        // Skip invalid, refused or empty pagelines if not explicitly requested
         if (!pgl || do_detail == CONFIG_BOOLEAN_NO ||
             (do_detail == CONFIG_BOOLEAN_AUTO && pageline_total_count(pgl) == 0 && netdata_zero_metrics_enabled != CONFIG_BOOLEAN_YES))
             continue;
@@ -236,7 +236,7 @@ int do_proc_pagetypeinfo(int update_every, usec_t dt) {
         char setid[13+1+2+1+MAX_ZONETYPE_NAME+1+MAX_PAGETYPE_NAME+1];
         snprintfz(setid, 13+1+2+1+MAX_ZONETYPE_NAME+1+MAX_PAGETYPE_NAME, "pagetype_Node%d_%s_%s", pgl->node, pgl->zone, pgl->type);
 
-        // Skip explicitely refused charts
+        // Skip explicitly refused charts
         if (simple_pattern_matches(filter_types, setid))
             continue;
diff --git a/collectors/proc.plugin/proc_spl_kstat_zfs.c b/collectors/proc.plugin/proc_spl_kstat_zfs.c
index 32ff36b7..ce95c2d3 100644
--- a/collectors/proc.plugin/proc_spl_kstat_zfs.c
+++ b/collectors/proc.plugin/proc_spl_kstat_zfs.c
@@ -4,6 +4,10 @@
 #include "zfs_common.h"
 
 #define ZFS_PROC_ARCSTATS "/proc/spl/kstat/zfs/arcstats"
+#define ZFS_PROC_POOLS "/proc/spl/kstat/zfs"
+
+#define STATE_SIZE 8
+#define MAX_CHART_ID 256
 
 extern struct arcstats arcstats;
 
@@ -194,3 +198,219 @@ int do_proc_spl_kstat_zfs_arcstats(int update_every, usec_t dt) {
 
     return 0;
 }
+
+struct zfs_pool {
+    RRDSET *st;
+
+    RRDDIM *rd_online;
+    RRDDIM *rd_degraded;
+    RRDDIM *rd_faulted;
+    RRDDIM *rd_offline;
+    RRDDIM *rd_removed;
+    RRDDIM *rd_unavail;
+
+    int updated;
+    int disabled;
+
+    int online;
+    int degraded;
+    int faulted;
+    int offline;
+    int removed;
+    int unavail;
+};
+
+struct deleted_zfs_pool {
+    char *name;
+    struct deleted_zfs_pool *next;
+} *deleted_zfs_pools = NULL;
+
+DICTIONARY *zfs_pools = NULL;
+
+void disable_zfs_pool_state(struct zfs_pool *pool)
+{
+    if (pool->st)
+        rrdset_is_obsolete(pool->st);
+
+    pool->st = NULL;
+
+    pool->rd_online = NULL;
+    pool->rd_degraded = NULL;
+    pool->rd_faulted = NULL;
+    pool->rd_offline = NULL;
+    pool->rd_removed = NULL;
+    pool->rd_unavail = NULL;
+
+    pool->disabled = 1;
+}
+
+int update_zfs_pool_state_chart(char *name, void *pool_p, void *update_every_p)
+{
+    struct zfs_pool *pool = (struct zfs_pool *)pool_p;
+    int update_every = *(int *)update_every_p;
+
+    if (pool->updated) {
+        pool->updated = 0;
+
+        if (!pool->disabled) {
+            if (unlikely(!pool->st)) {
+                char chart_id[MAX_CHART_ID + 1];
+                snprintf(chart_id, MAX_CHART_ID, "state_%s", name);
+
+                pool->st = rrdset_create_localhost(
+                    "zfspool",
+                    chart_id,
+                    NULL,
+                    name,
+                    "zfspool.state",
+                    "ZFS pool state",
+                    "boolean",
+                    PLUGIN_PROC_NAME,
+                    ZFS_PROC_POOLS,
+                    NETDATA_CHART_PRIO_ZFS_POOL_STATE,
+                    update_every,
+                    RRDSET_TYPE_LINE);
+
+                pool->rd_online = rrddim_add(pool->st, "online", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                pool->rd_degraded = rrddim_add(pool->st, "degraded", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                pool->rd_faulted = rrddim_add(pool->st, "faulted", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                pool->rd_offline = rrddim_add(pool->st, "offline", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                pool->rd_removed = rrddim_add(pool->st, "removed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+                pool->rd_unavail = rrddim_add(pool->st, "unavail", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            } else
+                rrdset_next(pool->st);
+
+            rrddim_set_by_pointer(pool->st, pool->rd_online, pool->online);
+            rrddim_set_by_pointer(pool->st, pool->rd_degraded, pool->degraded);
+            rrddim_set_by_pointer(pool->st, pool->rd_faulted, pool->faulted);
+            rrddim_set_by_pointer(pool->st, pool->rd_offline, pool->offline);
+            rrddim_set_by_pointer(pool->st, pool->rd_removed, pool->removed);
+            rrddim_set_by_pointer(pool->st, pool->rd_unavail, pool->unavail);
+            rrdset_done(pool->st);
+        }
+    } else {
+        disable_zfs_pool_state(pool);
+        struct deleted_zfs_pool *new = calloc(1, sizeof(struct deleted_zfs_pool));
+        new->name = strdupz(name);
+        new->next = deleted_zfs_pools;
+        deleted_zfs_pools = new;
+    }
+
+    return 0;
+}
+
+int do_proc_spl_kstat_zfs_pool_state(int update_every, usec_t dt)
+{
+    (void)dt;
+
+    static int do_zfs_pool_state = -1;
+    static char *dirname = NULL;
+
+    int pool_found = 0, state_file_found = 0;
+
+    if (unlikely(do_zfs_pool_state == -1)) {
+        char filename[FILENAME_MAX + 1];
+        snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/spl/kstat/zfs");
+        dirname = config_get("plugin:proc:" ZFS_PROC_POOLS, "directory to monitor", filename);
+
+        zfs_pools = dictionary_create(DICTIONARY_FLAG_SINGLE_THREADED);
+
+        do_zfs_pool_state = 1;
+    }
+
+    if (likely(do_zfs_pool_state)) {
+        DIR *dir = opendir(dirname);
+        if (unlikely(!dir)) {
+            error("Cannot read directory '%s'", dirname);
+            return 1;
+        }
+
+        struct dirent *de = NULL;
+        while (likely(de = readdir(dir))) {
+            if (likely(
+                    de->d_type == DT_DIR && ((de->d_name[0] == '.' && de->d_name[1] == '\0') ||
+                                             (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0'))))
+                continue;
+
+            if (unlikely(de->d_type == DT_LNK || de->d_type == DT_DIR)) {
+                pool_found = 1;
+
+                struct zfs_pool *pool = dictionary_get(zfs_pools, de->d_name);
+
+                if (unlikely(!pool)) {
+                    struct zfs_pool new_zfs_pool = {};
+                    pool = dictionary_set(zfs_pools, de->d_name, &new_zfs_pool, sizeof(struct zfs_pool));
+                };
+
+                pool->updated = 1;
+
+                if (pool->disabled) {
+                    state_file_found = 1;
+                    continue;
+                }
+
+                pool->online = 0;
+                pool->degraded = 0;
+                pool->faulted = 0;
+                pool->offline = 0;
+                pool->removed = 0;
+                pool->unavail = 0;
+
+                char filename[FILENAME_MAX + 1];
+                snprintfz(
+                    filename, FILENAME_MAX, "%s%s/%s/state", netdata_configured_host_prefix, dirname, de->d_name);
+
+                char state[STATE_SIZE + 1];
+                int ret = read_file(filename, state, STATE_SIZE);
+
+                if (!ret) {
+                    state_file_found = 1;
+
+                    // ZFS pool states are described at https://openzfs.github.io/openzfs-docs/man/8/zpoolconcepts.8.html?#Device_Failure_and_Recovery
+                    if (!strcmp(state, "ONLINE\n")) {
+                        pool->online = 1;
+                    } else if (!strcmp(state, "DEGRADED\n")) {
+                        pool->degraded = 1;
+                    } else if (!strcmp(state, "FAULTED\n")) {
+                        pool->faulted = 1;
+                    } else if (!strcmp(state, "OFFLINE\n")) {
+                        pool->offline = 1;
+                    } else if (!strcmp(state, "REMOVED\n")) {
+                        pool->removed = 1;
+                    } else if (!strcmp(state, "UNAVAIL\n")) {
+                        pool->unavail = 1;
+                    } else {
+                        disable_zfs_pool_state(pool);
+
+                        char *c = strchr(state, '\n');
+                        if (c)
+                            *c = '\0';
+                        error("ZFS POOLS: Undefined state %s for zpool %s, disabling the chart", state, de->d_name);
+                    }
+                }
+            }
+        }
+
+        closedir(dir);
+    }
+
+    if (do_zfs_pool_state && pool_found && !state_file_found) {
+        info("ZFS POOLS: State files not found. Disabling the module.");
+        do_zfs_pool_state = 0;
+    }
+
+    if (do_zfs_pool_state)
+        dictionary_get_all_name_value(zfs_pools, update_zfs_pool_state_chart, &update_every);
+
+    while (deleted_zfs_pools) {
+        struct deleted_zfs_pool *current_pool = deleted_zfs_pools;
+        dictionary_del(zfs_pools, current_pool->name);
+
+        deleted_zfs_pools = deleted_zfs_pools->next;
+
+        freez(current_pool->name);
+        freez(current_pool);
+    }
+
+    return 0;
+}
diff --git a/collectors/proc.plugin/proc_vmstat.c b/collectors/proc.plugin/proc_vmstat.c
index 7def02dd..c1a13716 100644
--- a/collectors/proc.plugin/proc_vmstat.c
+++ b/collectors/proc.plugin/proc_vmstat.c
@@ -4,11 +4,13 @@
 
 #define PLUGIN_PROC_MODULE_VMSTAT_NAME "/proc/vmstat"
 
+#define OOM_KILL_STRING "oom_kill"
+
 int do_proc_vmstat(int update_every, usec_t dt) {
     (void)dt;
 
     static procfile *ff = NULL;
-    static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_numa = -1;
+    static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_oom_kill = -1, do_numa = -1;
     static int has_numa = -1;
 
     static ARL_BASE *arl_base = NULL;
@@ -27,11 +29,25 @@ int do_proc_vmstat(int update_every, usec_t dt) {
     static unsigned long long pgpgout = 0ULL;
     static unsigned long long pswpin = 0ULL;
     static unsigned long long pswpout = 0ULL;
+    static unsigned long long oom_kill = 0ULL;
+
+    if(unlikely(!ff)) {
+        char filename[FILENAME_MAX + 1];
+        snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat");
+        ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT);
+        if(unlikely(!ff)) return 1;
+    }
+
+    ff = procfile_readall(ff);
+    if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time
+
+    size_t lines = procfile_lines(ff), l;
 
     if(unlikely(!arl_base)) {
         do_swapio = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "swap i/o", CONFIG_BOOLEAN_AUTO);
-        do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", 1);
-        do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", 1);
+        do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", CONFIG_BOOLEAN_YES);
+        do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", CONFIG_BOOLEAN_YES);
+        do_oom_kill = config_get_boolean("plugin:proc:/proc/vmstat", "out of memory kills", CONFIG_BOOLEAN_AUTO);
         do_numa = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "system-wide numa metric summary", CONFIG_BOOLEAN_AUTO);
 
@@ -43,6 +59,20 @@ int do_proc_vmstat(int update_every, usec_t dt) {
         arl_expect(arl_base, "pswpin", &pswpin);
         arl_expect(arl_base, "pswpout", &pswpout);
 
+        int has_oom_kill = 0;
+
+        for (l = 0; l < lines; l++) {
+            if (!strcmp(procfile_lineword(ff, l, 0), OOM_KILL_STRING)) {
+                has_oom_kill = 1;
+                break;
+            }
+        }
+
+        if (has_oom_kill)
+            arl_expect(arl_base, OOM_KILL_STRING, &oom_kill);
+        else
+            do_oom_kill = CONFIG_BOOLEAN_NO;
+
         if(do_numa == CONFIG_BOOLEAN_YES || (do_numa == CONFIG_BOOLEAN_AUTO &&
                                              (get_numa_node_count() >= 2 ||
                                               netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
@@ -66,18 +96,6 @@ int do_proc_vmstat(int update_every, usec_t dt) {
         }
     }
 
-    if(unlikely(!ff)) {
-        char filename[FILENAME_MAX + 1];
-        snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat");
-        ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT);
-        if(unlikely(!ff)) return 1;
-    }
-
-    ff = procfile_readall(ff);
-    if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time
-
-    size_t lines = procfile_lines(ff), l;
-
     arl_begin(arl_base);
     for(l = 0; l < lines ;l++) {
         size_t words = procfile_linewords(ff, l);
@@ -193,6 +211,41 @@ int do_proc_vmstat(int update_every, usec_t dt) {
         rrdset_done(st_pgfaults);
     }
 
+    // --------------------------------------------------------------------
+
+    if (do_oom_kill == CONFIG_BOOLEAN_YES ||
+        (do_oom_kill == CONFIG_BOOLEAN_AUTO && (oom_kill || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
+        static RRDSET *st_oom_kill = NULL;
+        static RRDDIM *rd_oom_kill = NULL;
+
+        do_oom_kill = CONFIG_BOOLEAN_YES;
+
+        if(unlikely(!st_oom_kill)) {
+            st_oom_kill = rrdset_create_localhost(
+                    "mem"
+                    , "oom_kill"
+                    , NULL
+                    , "system"
+                    , NULL
+                    , "Out of Memory Kills"
+                    , "kills/s"
+                    , PLUGIN_PROC_NAME
+                    , PLUGIN_PROC_MODULE_VMSTAT_NAME
+                    , NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL
+                    , update_every
+                    , RRDSET_TYPE_LINE
+            );
+
+            rrdset_flag_set(st_oom_kill, RRDSET_FLAG_DETAIL);
+
+            rd_oom_kill = rrddim_add(st_oom_kill, "kills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+        }
+        else rrdset_next(st_oom_kill);
+
+        rrddim_set_by_pointer(st_oom_kill, rd_oom_kill, oom_kill);
+        rrdset_done(st_oom_kill);
+    }
+
     // --------------------------------------------------------------------
 
     // Ondemand criteria for NUMA. Since this won't change at run time, we
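The `oom_kill` dimension added above uses `RRD_ALGORITHM_INCREMENTAL`, meaning netdata differentiates the raw counter into kills per second. A sketch of that differencing with illustrative names (netdata's rrd engine does this internally):

```c
/* Sketch: turning a monotonically increasing counter such as vmstat's
 * oom_kill into a per-second rate, as an incremental dimension does.
 * Function and variable names are illustrative only. */
#include <stdio.h>

static unsigned long long last_oom_kill;
static int have_last;

double oom_kills_per_second(unsigned long long oom_kill_total, double elapsed_s) {
    double rate = 0.0;
    if (have_last && oom_kill_total >= last_oom_kill && elapsed_s > 0)
        rate = (double)(oom_kill_total - last_oom_kill) / elapsed_s;
    last_oom_kill = oom_kill_total;
    have_last = 1;
    return rate;
}

int main(void) {
    printf("%.2f kills/s\n", oom_kills_per_second(3, 1.0)); /* first sample: 0.00 */
    printf("%.2f kills/s\n", oom_kills_per_second(5, 1.0)); /* -> 2.00 */
    return 0;
}
```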
diff --git a/collectors/proc.plugin/sys_class_infiniband.c b/collectors/proc.plugin/sys_class_infiniband.c
index 46f40f2c..69e27f81 100644
--- a/collectors/proc.plugin/sys_class_infiniband.c
+++ b/collectors/proc.plugin/sys_class_infiniband.c
@@ -367,7 +367,7 @@ int do_sys_class_infiniband(int update_every, usec_t dt)
 
             char buffer[FILENAME_MAX + 1];
 
-            // Check if counters are availablea (mandatory)
+            // Check if counters are available (mandatory)
             // /sys/class/infiniband/<device>/ports/<port>/counters
             char counters_dirname[FILENAME_MAX + 1];
             snprintfz(counters_dirname, FILENAME_MAX, "%s/%s/%s", ports_dirname, port_dent->d_name, "counters");
@@ -377,7 +377,7 @@ int do_sys_class_infiniband(int update_every, usec_t dt)
                 continue;
             closedir(counters_dir);
 
-            // Hardware Counters are optionnal, used later
+            // Hardware Counters are optional, used later
             char hwcounters_dirname[FILENAME_MAX + 1];
             snprintfz(
                 hwcounters_dirname, FILENAME_MAX, "%s/%s/%s", ports_dirname, port_dent->d_name, "hw_counters");