Diffstat
37 files changed, 2552 insertions, 157 deletions
diff --git a/collectors/Makefile.am b/collectors/Makefile.am index 87a037e76..fe62ba01d 100644 --- a/collectors/Makefile.am +++ b/collectors/Makefile.am @@ -18,6 +18,7 @@ SUBDIRS = \ macos.plugin \ nfacct.plugin \ xenstat.plugin \ + perf.plugin \ node.d.plugin \ proc.plugin \ python.d.plugin \ diff --git a/collectors/README.md b/collectors/README.md index 154d193e4..725213889 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -37,6 +37,7 @@ plugin|lang|O/S|runs as|modular|description [macos.plugin](macos.plugin/)|`C`|macos|internal|yes|collects resource usage and performance data on MacOS systems [nfacct.plugin](nfacct.plugin/)|`C`|linux|external|-|collects netfilter firewall, connection tracker and accounting metrics using `libmnl` and `libnetfilter_acct` [xenstat.plugin](xenstat.plugin/)|`C`|linux|external|-|collects XenServer and XCP-ng metrics using `libxenstat` +[perf.plugin](perf.plugin/)|`C`|linux|external|-|collects CPU performance metrics using performance monitoring units (PMU). [node.d.plugin](node.d.plugin/)|`node.js`|any|external|yes|a **plugin orchestrator** for data collection modules written in `node.js`. [plugins.d](plugins.d/)|`C`|any|internal|-|implements the **external plugins** API and serves external plugins [proc.plugin](proc.plugin/)|`C`|linux|internal|yes|collects resource usage and performance data on Linux systems diff --git a/collectors/apps.plugin/apps_groups.conf b/collectors/apps.plugin/apps_groups.conf index 7eba72f67..ab167ddbb 100644 --- a/collectors/apps.plugin/apps_groups.conf +++ b/collectors/apps.plugin/apps_groups.conf @@ -77,6 +77,7 @@ freeipmi.plugin: freeipmi.plugin nfacct.plugin: nfacct.plugin cups.plugin: cups.plugin xenstat.plugin: xenstat.plugin +perf.plugin: perf.plugin charts.d.plugin: *charts.d.plugin* node.d.plugin: *node.d.plugin* python.d.plugin: *python.d.plugin* @@ -88,7 +89,7 @@ go.d.plugin: *go.d.plugin* # ----------------------------------------------------------------------------- # authentication/authorization related servers -auth: radius* openldap* ldap* +auth: radius* openldap* ldap* slapd fail2ban: fail2ban* # ----------------------------------------------------------------------------- @@ -188,7 +189,7 @@ print: cups* lpd lpq # ----------------------------------------------------------------------------- # time servers and clients -time: ntp* systemd-timesyncd chronyd +time: ntp* systemd-timesyn* chronyd # ----------------------------------------------------------------------------- # dhcp servers and clients @@ -301,3 +302,5 @@ ipfs: ipfs node: node* factorio: factorio + +p4: p4* diff --git a/collectors/cgroups.plugin/README.md b/collectors/cgroups.plugin/README.md index 3fffbd17c..c01f9ec04 100644 --- a/collectors/cgroups.plugin/README.md +++ b/collectors/cgroups.plugin/README.md @@ -101,6 +101,15 @@ The whole point for the additional pattern list, is to limit the number of times The above pattern list is matched against the path of the cgroup. For matched cgroups, netdata calls the script [cgroup-name.sh](cgroup-name.sh.in) to get its name. This script queries `docker`, or applies heuristics to find give a name for the cgroup. +### charts with zero metrics + +By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). 
Set `yes` for a chart instead of `auto` to enable it permanently. For example: + +``` +[plugin:cgroups] + enable memory (used mem including cache) = yes +``` + ### alarms CPU and memory limits are watched and used to rise alarms. Memory usage for every cgroup is checked against `ram` and `ram+swap` limits. CPU usage for every cgroup is checked against `cpuset.cpus` and `cpu.cfs_period_us` + `cpu.cfs_quota_us` pair assigned for the cgroup. Configuration for the alarms is available in `health.d/cgroups.conf` file. diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/collectors/cgroups.plugin/sys_fs_cgroup.c index 40b485574..4300788d5 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.c +++ b/collectors/cgroups.plugin/sys_fs_cgroup.c @@ -2831,17 +2831,16 @@ void update_cgroup_charts(int update_every) { , RRDSET_TYPE_LINE ); - rrddim_add(cg->st_cpu_limit, "used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + if(!(cg->options & CGROUP_OPTIONS_IS_UNIFIED)) + rrddim_add(cg->st_cpu_limit, "used", NULL, 1, system_hz, RRD_ALGORITHM_ABSOLUTE); + else + rrddim_add(cg->st_cpu_limit, "used", NULL, 1, 1000000, RRD_ALGORITHM_ABSOLUTE); } else rrdset_next(cg->st_cpu_limit); calculated_number cpu_usage = 0; - if(!(cg->options & CGROUP_OPTIONS_IS_UNIFIED)) - cpu_usage = (calculated_number)(cg->cpuacct_stat.user + cg->cpuacct_stat.system) * 100 / system_hz; - else - cpu_usage = (calculated_number)(cg->cpuacct_stat.user + cg->cpuacct_stat.system) * 100 / 1000000; - + cpu_usage = (calculated_number)(cg->cpuacct_stat.user + cg->cpuacct_stat.system) * 100; calculated_number cpu_used = 100 * (cpu_usage - cg->prev_cpu_usage) / (value * update_every); rrdset_isnot_obsolete(cg->st_cpu_limit); diff --git a/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh b/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh index f5c4d7959..31ff93160 100644 --- a/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh +++ b/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh @@ -47,8 +47,8 @@ apcupsd_check() { error "cannot get information for apcupsd server ${host} on ${apcupsd_sources[${host}]}." failed=$((failed + 1)) else - apcupsd_status = "$(apcupsd_get ${apcupsd_sources[${host}]} | awk '/^STATUS.*/{ print $3 }')" - if [ ${apcupsd_status} != "ONLINE" ] && [ ${apcupsd_status} != "ONBATT" ]; then + apcupsd_status="$(apcupsd_get ${apcupsd_sources[${host}]} | awk '/^STATUS.*/{ print $3 }')" + if [ "${apcupsd_status}" != "ONLINE" ] && [ "${apcupsd_status}" != "ONBATT" ]; then error "APC UPS ${host} on ${apcupsd_sources[${host}]} is not online." 
failed=$((failed + 1)) else diff --git a/collectors/charts.d.plugin/charts.d.conf b/collectors/charts.d.plugin/charts.d.conf index acb2a6fae..94c40cf6f 100644 --- a/collectors/charts.d.plugin/charts.d.conf +++ b/collectors/charts.d.plugin/charts.d.conf @@ -34,6 +34,8 @@ # BY DEFAULT ENABLED MODULES # ap=yes +# apcupsd=yes +# libreswan=yes # nut=yes # opensips=yes diff --git a/collectors/charts.d.plugin/charts.d.plugin.in b/collectors/charts.d.plugin/charts.d.plugin.in index a3f0aa954..0df6c30c3 100755 --- a/collectors/charts.d.plugin/charts.d.plugin.in +++ b/collectors/charts.d.plugin/charts.d.plugin.in @@ -304,7 +304,7 @@ run() { printf " --- END TRACE ---\n" } >&2 fi - rm "${TMP_DIR}/run.${pid}" + rm -f "${TMP_DIR}/run.${pid}" return ${ret} } diff --git a/collectors/diskspace.plugin/README.md b/collectors/diskspace.plugin/README.md index d743312c8..8f859e350 100644 --- a/collectors/diskspace.plugin/README.md +++ b/collectors/diskspace.plugin/README.md @@ -10,6 +10,9 @@ Two charts are available for every mount: Simple patterns can be used to exclude mounts from showed statistics based on path or filesystem. By default read-only mounts are not displayed. To display them `yes` should be set for a chart instead of `auto`. +By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a chart instead of `auto` to enable it permanently. + + ``` [plugin:proc:diskspace] # remove charts of unmounted disks = yes diff --git a/collectors/freeipmi.plugin/freeipmi_plugin.c b/collectors/freeipmi.plugin/freeipmi_plugin.c index 82a04ecd4..ba1fbffae 100644 --- a/collectors/freeipmi.plugin/freeipmi_plugin.c +++ b/collectors/freeipmi.plugin/freeipmi_plugin.c @@ -338,7 +338,8 @@ static void netdata_mark_as_not_updated() { } static void send_chart_to_netdata_for_units(int units) { - struct sensor *sn; + struct sensor *sn, *sn_stored; + int dupfound, multiplier; switch(units) { case IPMI_MONITORING_SENSOR_UNITS_CELSIUS: @@ -398,29 +399,44 @@ static void send_chart_to_netdata_for_units(int units) { } for(sn = sensors_root; sn; sn = sn->next) { + dupfound = 0; if(sn->sensor_units == units && sn->updated && !sn->ignore) { sn->exposed = 1; + multiplier = 1; switch(sn->sensor_reading_type) { + case IPMI_MONITORING_SENSOR_READING_TYPE_DOUBLE: + multiplier = 1000; + // fallthrough case IPMI_MONITORING_SENSOR_READING_TYPE_UNSIGNED_INTEGER8_BOOL: case IPMI_MONITORING_SENSOR_READING_TYPE_UNSIGNED_INTEGER32: - printf("DIMENSION i%d_n%d_r%d '%s i%d' absolute 1 1\n" - , sn->sensor_number - , sn->record_id - , sn->sensor_reading_type - , sn->sensor_name - , sn->sensor_number - ); - break; - - case IPMI_MONITORING_SENSOR_READING_TYPE_DOUBLE: - printf("DIMENSION i%d_n%d_r%d '%s i%d' absolute 1 1000\n" - , sn->sensor_number - , sn->record_id - , sn->sensor_reading_type - , sn->sensor_name - , sn->sensor_number - ); + for (sn_stored = sensors_root; sn_stored; sn_stored = sn_stored->next) { + if (sn_stored == sn) continue; + // If the name is a duplicate, append the sensor number + if ( !strcmp(sn_stored->sensor_name, sn->sensor_name) ) { + dupfound = 1; + printf("DIMENSION i%d_n%d_r%d '%s i%d' absolute 1 %d\n" + , sn->sensor_number + , sn->record_id + , sn->sensor_reading_type + , sn->sensor_name + , sn->sensor_number + , multiplier + ); + break; + } 
+ } + // No duplicate name was found, display it just with Name + if (!dupfound) { + // display without ID + printf("DIMENSION i%d_n%d_r%d '%s' absolute 1 %d\n" + , sn->sensor_number + , sn->record_id + , sn->sensor_reading_type + , sn->sensor_name + , multiplier + ); + } break; default: diff --git a/collectors/nfacct.plugin/plugin_nfacct.c b/collectors/nfacct.plugin/plugin_nfacct.c index 545899824..feec34b85 100644 --- a/collectors/nfacct.plugin/plugin_nfacct.c +++ b/collectors/nfacct.plugin/plugin_nfacct.c @@ -835,12 +835,12 @@ int main(int argc, char **argv) { error("update frequency %d seconds is too small for NFACCT. Using %d.", freq, netdata_update_every); #ifdef DO_NFACCT - if(debug) fprintf(stderr, "freeipmi.plugin: calling nfacct_init()\n"); + if(debug) fprintf(stderr, "nfacct.plugin: calling nfacct_init()\n"); int nfacct = !nfacct_init(netdata_update_every); #endif #ifdef DO_NFSTAT - if(debug) fprintf(stderr, "freeipmi.plugin: calling nfstat_init()\n"); + if(debug) fprintf(stderr, "nfacct.plugin: calling nfstat_init()\n"); int nfstat = !nfstat_init(netdata_update_every); #endif diff --git a/collectors/perf.plugin/Makefile.am b/collectors/perf.plugin/Makefile.am new file mode 100644 index 000000000..19554bed8 --- /dev/null +++ b/collectors/perf.plugin/Makefile.am @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + +dist_noinst_DATA = \ + README.md \ + $(NULL) diff --git a/collectors/perf.plugin/README.md b/collectors/perf.plugin/README.md new file mode 100644 index 000000000..ce696b06d --- /dev/null +++ b/collectors/perf.plugin/README.md @@ -0,0 +1,72 @@ +# perf.plugin + +`perf.plugin` collects system-wide CPU performance statistics from Performance Monitoring Units (PMU) using +the `perf_event_open()` system call. + +## Important Notes + +Accessing hardware PMUs requires root permissions, so the plugin is setuid to root. + +Keep in mind that the number of PMUs in a system is usually quite limited and every hardware monitoring +event for every CPU core needs a separate file descriptor to be opened. + +## Charts + +The plugin provides statistics for general hardware and software performance monitoring events: + +Hardware events: +1. CPU cycles +2. Instructions +3. Branch instructions +4. Cache operations +5. BUS cycles +6. Stalled frontend and backend cycles + +Software events: +1. CPU migrations +2. Alignment faults +3. Emulation faults + +Hardware cache events: +1. L1D cache operations +2. L1D prefetch cache operations +3. L1I cache operations +4. LL cache operations +5. DTLB cache operations +6. ITLB cache operations +7. PBU cache operations + +## Configuration + +The plugin is disabled by default because the number of PMUs is usually quite limited and it is not desired to +allow Netdata to struggle silently for PMUs, interfering with other performance monitoring software. If you need to +enable the perf plugin, edit /etc/netdata/netdata.conf and set: + +```raw +[plugins] + perf = yes +``` + +```raw +[plugin:perf] + update every = 1 + command options = all +``` + +You can use the `command options` parameter to pick what data should be collected and which charts should be +displayed. If `all` is used, all general performance monitoring counters are probed and corresponding charts +are enabled for the available counters. 
You can also define a particular set of enabled charts using the +following keywords: `cycles`, `instructions`, `branch`, `cache`, `bus`, `stalled`, `migrations`, `alighnment`, +`emulation`, `L1D`, `L1D-prefetch`, `L1I`, `LL`, `DTLB`, `ITLB`, `PBU`. + +## Debugging + +You can run the plugin by hand: + +```raw +sudo /usr/libexec/netdata/plugins.d/perf.plugin 1 all debug +``` + +You will get verbose output on what the plugin does. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fperf.plugin%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/collectors/perf.plugin/perf_plugin.c b/collectors/perf.plugin/perf_plugin.c new file mode 100644 index 000000000..c645c2798 --- /dev/null +++ b/collectors/perf.plugin/perf_plugin.c @@ -0,0 +1,1348 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../../libnetdata/libnetdata.h" + +#include <linux/perf_event.h> + +#define PLUGIN_PERF_NAME "perf.plugin" + +// Hardware counters +#define NETDATA_CHART_PRIO_PERF_CPU_CYCLES 8800 +#define NETDATA_CHART_PRIO_PERF_INSTRUCTIONS 8801 +#define NETDATA_CHART_PRIO_PERF_BRANCH_INSTRUSTIONS 8802 +#define NETDATA_CHART_PRIO_PERF_CACHE 8803 +#define NETDATA_CHART_PRIO_PERF_BUS_CYCLES 8804 +#define NETDATA_CHART_PRIO_PERF_FRONT_BACK_CYCLES 8805 + +// Software counters +#define NETDATA_CHART_PRIO_PERF_MIGRATIONS 8810 +#define NETDATA_CHART_PRIO_PERF_ALIGNMENT 8811 +#define NETDATA_CHART_PRIO_PERF_EMULATION 8812 + +// Hardware cache counters +#define NETDATA_CHART_PRIO_PERF_L1D 8820 +#define NETDATA_CHART_PRIO_PERF_L1D_PREFETCH 8821 +#define NETDATA_CHART_PRIO_PERF_L1I 8822 +#define NETDATA_CHART_PRIO_PERF_LL 8823 +#define NETDATA_CHART_PRIO_PERF_DTLB 8824 +#define NETDATA_CHART_PRIO_PERF_ITLB 8825 +#define NETDATA_CHART_PRIO_PERF_PBU 8826 + +// callback required by fatal() +void netdata_cleanup_and_exit(int ret) { + exit(ret); +} + +void send_statistics( const char *action, const char *action_result, const char *action_data) { + (void) action; + (void) action_result; + (void) action_data; + return; +} + +// callbacks required by popen() +void signals_block(void) {}; +void signals_unblock(void) {}; +void signals_reset(void) {}; + +// callback required by eval() +int health_variable_lookup(const char *variable, uint32_t hash, struct rrdcalc *rc, calculated_number *result) { + (void)variable; + (void)hash; + (void)rc; + (void)result; + return 0; +}; + +// required by get_system_cpus() +char *netdata_configured_host_prefix = ""; + +// Variables + +#define RRD_TYPE_PERF "perf" +#define RRD_FAMILY_HW "hardware" +#define RRD_FAMILY_SW "software" +#define RRD_FAMILY_CACHE "cache" + +#define NO_FD -1 +#define ALL_PIDS -1 +#define RUNNING_THRESHOLD 100 + +static int debug = 0; + +static int update_every = 1; +static int freq = 0; + +typedef enum perf_event_id { + // Hardware counters + EV_ID_CPU_CYCLES, + EV_ID_INSTRUCTIONS, + EV_ID_CACHE_REFERENCES, + EV_ID_CACHE_MISSES, + EV_ID_BRANCH_INSTRUCTIONS, + EV_ID_BRANCH_MISSES, + EV_ID_BUS_CYCLES, + EV_ID_STALLED_CYCLES_FRONTEND, + EV_ID_STALLED_CYCLES_BACKEND, + EV_ID_REF_CPU_CYCLES, + + // Software counters + // EV_ID_CPU_CLOCK, + // EV_ID_TASK_CLOCK, + // EV_ID_PAGE_FAULTS, + // EV_ID_CONTEXT_SWITCHES, + EV_ID_CPU_MIGRATIONS, + // EV_ID_PAGE_FAULTS_MIN, + // EV_ID_PAGE_FAULTS_MAJ, + EV_ID_ALIGNMENT_FAULTS, + EV_ID_EMULATION_FAULTS, + + // Hardware cache counters + EV_ID_L1D_READ_ACCESS, + 
EV_ID_L1D_READ_MISS, + EV_ID_L1D_WRITE_ACCESS, + EV_ID_L1D_WRITE_MISS, + EV_ID_L1D_PREFETCH_ACCESS, + + EV_ID_L1I_READ_ACCESS, + EV_ID_L1I_READ_MISS, + + EV_ID_LL_READ_ACCESS, + EV_ID_LL_READ_MISS, + EV_ID_LL_WRITE_ACCESS, + EV_ID_LL_WRITE_MISS, + + EV_ID_DTLB_READ_ACCESS, + EV_ID_DTLB_READ_MISS, + EV_ID_DTLB_WRITE_ACCESS, + EV_ID_DTLB_WRITE_MISS, + + EV_ID_ITLB_READ_ACCESS, + EV_ID_ITLB_READ_MISS, + + EV_ID_PBU_READ_ACCESS, + + EV_ID_END +} perf_event_id_t; + +enum perf_event_group { + EV_GROUP_CYCLES, + EV_GROUP_INSTRUCTIONS_AND_CACHE, + EV_GROUP_SOFTWARE, + EV_GROUP_CACHE_L1D, + EV_GROUP_CACHE_L1I_LL_DTLB, + EV_GROUP_CACHE_ITLB_BPU, + + EV_GROUP_NUM +}; + +static int number_of_cpus; + +static int *group_leader_fds[EV_GROUP_NUM]; + +static struct perf_event { + perf_event_id_t id; + + int type; + int config; + + int **group_leader_fd; + int *fd; + + int disabled; + int updated; + + uint64_t value; + + uint64_t *prev_value; + uint64_t *prev_time_enabled; + uint64_t *prev_time_running; +} perf_events[] = { + // Hardware counters + {EV_ID_CPU_CYCLES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_INSTRUCTIONS, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_CACHE_REFERENCES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_CACHE_MISSES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_BRANCH_INSTRUCTIONS, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_BRANCH_MISSES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_BUS_CYCLES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_STALLED_CYCLES_FRONTEND, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_STALLED_CYCLES_BACKEND, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_REF_CPU_CYCLES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + + // Software counters + // {EV_ID_CPU_CLOCK, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_TASK_CLOCK, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_PAGE_FAULTS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_CONTEXT_SWITCHES, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_CPU_MIGRATIONS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_PAGE_FAULTS_MIN, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_PAGE_FAULTS_MAJ, PERF_TYPE_SOFTWARE, 
PERF_COUNT_SW_PAGE_FAULTS_MAJ, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_ALIGNMENT_FAULTS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_EMULATION_FAULTS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + + // Hardware cache counters + { + EV_ID_L1D_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_WRITE_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_WRITE_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_PREFETCH_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_L1I_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1I_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_LL_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_LL_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_LL_WRITE_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_LL_WRITE_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_DTLB_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_DTLB_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + 
}, { + EV_ID_DTLB_WRITE_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_DTLB_WRITE_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_ITLB_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_ITLB_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_PBU_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + {EV_ID_END, 0, 0, NULL, NULL, 0, 0, 0, NULL, NULL, NULL} +}; + +static int perf_init() { + int cpu, group; + struct perf_event_attr perf_event_attr; + struct perf_event *current_event = NULL; + unsigned long flags = 0; + + number_of_cpus = (int)get_system_cpus(); + + // initialize all perf event file descriptors + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + current_event->fd = mallocz(number_of_cpus * sizeof(int)); + memset(current_event->fd, NO_FD, number_of_cpus * sizeof(int)); + + current_event->prev_value = mallocz(number_of_cpus * sizeof(uint64_t)); + memset(current_event->prev_value, 0, number_of_cpus * sizeof(uint64_t)); + + current_event->prev_time_enabled = mallocz(number_of_cpus * sizeof(uint64_t)); + memset(current_event->prev_time_enabled, 0, number_of_cpus * sizeof(uint64_t)); + + current_event->prev_time_running = mallocz(number_of_cpus * sizeof(uint64_t)); + memset(current_event->prev_time_running, 0, number_of_cpus * sizeof(uint64_t)); + } + + for(group = 0; group < EV_GROUP_NUM; group++) { + group_leader_fds[group] = mallocz(number_of_cpus * sizeof(int)); + memset(group_leader_fds[group], NO_FD, number_of_cpus * sizeof(int)); + } + + memset(&perf_event_attr, 0, sizeof(perf_event_attr)); + + for(cpu = 0; cpu < number_of_cpus; cpu++) { + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + if(unlikely(current_event->disabled)) continue; + + perf_event_attr.type = current_event->type; + perf_event_attr.config = current_event->config; + perf_event_attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + + int fd, group_leader_fd = *(*current_event->group_leader_fd + cpu); + + fd = syscall( + __NR_perf_event_open, + &perf_event_attr, + ALL_PIDS, + cpu, + group_leader_fd, + flags + ); + + if(unlikely(group_leader_fd == NO_FD)) group_leader_fd = fd; + + if(unlikely(fd < 0)) { + switch errno { + case EACCES: + error("Cannot access to the PMU: Permission denied"); + break; + case EBUSY: + error("Another event already has exclusive access to the PMU"); + break; + default: + error("Cannot open perf event"); + } + error("Disabling event %u", current_event->id); + current_event->disabled = 1; + } + + *(current_event->fd + cpu) = fd; + *(*current_event->group_leader_fd + cpu) = 
group_leader_fd; + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: event id = %u, cpu = %d, fd = %d, leader_fd = %d\n", current_event->id, cpu, fd, group_leader_fd); + } + } + + return 0; +} + +static void perf_free(void) { + int cpu, group; + struct perf_event *current_event = NULL; + + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + for(cpu = 0; cpu < number_of_cpus; cpu++) + if(*(current_event->fd + cpu) != NO_FD) close(*(current_event->fd + cpu)); + + free(current_event->fd); + free(current_event->prev_value); + free(current_event->prev_time_enabled); + free(current_event->prev_time_running); + } + + for(group = 0; group < EV_GROUP_NUM; group++) + free(group_leader_fds[group]); +} + +static void reenable_events() { + int group, cpu; + + for(group = 0; group < EV_GROUP_NUM; group++) { + for(cpu = 0; cpu < number_of_cpus; cpu++) { + int current_fd = *(group_leader_fds[group] + cpu); + + if(unlikely(current_fd == NO_FD)) continue; + + if(ioctl(current_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1 + || ioctl(current_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) + { + error("Cannot reenable event group"); + } + } + } +} + +static int perf_collect() { + int cpu; + struct perf_event *current_event = NULL; + static uint64_t prev_cpu_cycles_value = 0; + struct { + uint64_t value; + uint64_t time_enabled; + uint64_t time_running; + } read_result; + + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + current_event->updated = 0; + current_event->value = 0; + + if(unlikely(current_event->disabled)) continue; + + for(cpu = 0; cpu < number_of_cpus; cpu++) { + + ssize_t read_size = read(current_event->fd[cpu], &read_result, sizeof(read_result)); + + if(likely(read_size == sizeof(read_result))) { + if (likely(read_result.time_running + && read_result.time_running != *(current_event->prev_time_running + cpu) + && (read_result.time_enabled / read_result.time_running < RUNNING_THRESHOLD))) { + current_event->value += (read_result.value - *(current_event->prev_value + cpu)) \ + * (read_result.time_enabled - *(current_event->prev_time_enabled + cpu)) \ + / (read_result.time_running - *(current_event->prev_time_running + cpu)); + } + + *(current_event->prev_value + cpu) = read_result.value; + *(current_event->prev_time_enabled + cpu) = read_result.time_enabled; + *(current_event->prev_time_running + cpu) = read_result.time_running; + + current_event->updated = 1; + } + else { + error("Cannot update value for event %u", current_event->id); + return 1; + } + } + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: successfully read event id = %u, value = %lu\n", current_event->id, current_event->value); + } + + if(unlikely(perf_events[EV_ID_CPU_CYCLES].value == prev_cpu_cycles_value)) + reenable_events(); + prev_cpu_cycles_value = perf_events[EV_ID_CPU_CYCLES].value; + + return 0; +} + +static void perf_send_metrics() { + static int // Hardware counters + cpu_cycles_chart_generated = 0, + instructions_chart_generated = 0, + branch_chart_generated = 0, + cache_chart_generated = 0, + bus_cycles_chart_generated = 0, + stalled_cycles_chart_generated = 0, + + // Software counters + migrations_chart_generated = 0, + alighnment_chart_generated = 0, + emulation_chart_generated = 0, + + // Hardware cache counters + L1D_chart_generated = 0, + L1D_prefetch_chart_generated = 0, + L1I_chart_generated = 0, + LL_chart_generated = 0, + DTLB_chart_generated = 0, + ITLB_chart_generated = 0, + PBU_chart_generated = 0; + + // 
------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_CPU_CYCLES].updated || perf_events[EV_ID_REF_CPU_CYCLES].updated)) { + if(unlikely(!cpu_cycles_chart_generated)) { + cpu_cycles_chart_generated = 1; + + printf("CHART %s.%s '' 'CPU cycles' 'cycles/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "cpu_cycles" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_CPU_CYCLES + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "cpu"); + printf("DIMENSION %s '' absolute 1 1\n", "ref_cpu"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "cpu_cycles" + ); + if(likely(perf_events[EV_ID_CPU_CYCLES].updated)) { + printf( + "SET %s = %lld\n" + , "cpu" + , (collected_number) perf_events[EV_ID_CPU_CYCLES].value + ); + } + if(likely(perf_events[EV_ID_REF_CPU_CYCLES].updated)) { + printf( + "SET %s = %lld\n" + , "ref_cpu" + , (collected_number) perf_events[EV_ID_REF_CPU_CYCLES].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_INSTRUCTIONS].updated)) { + if(unlikely(!instructions_chart_generated)) { + instructions_chart_generated = 1; + + printf("CHART %s.%s '' 'Instructions' 'instructions/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "instructions" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_INSTRUCTIONS + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "instructions"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "instructions" + ); + printf( + "SET %s = %lld\n" + , "instructions" + , (collected_number) perf_events[EV_ID_INSTRUCTIONS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_BRANCH_INSTRUCTIONS].updated || perf_events[EV_ID_BRANCH_MISSES].updated)) { + if(unlikely(!branch_chart_generated)) { + branch_chart_generated = 1; + + printf("CHART %s.%s '' 'Branch instructions' 'instructions/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "branch_instructions" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_BRANCH_INSTRUSTIONS + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "instructions"); + printf("DIMENSION %s '' absolute 1 1\n", "misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "branch_instructions" + ); + if(likely(perf_events[EV_ID_BRANCH_INSTRUCTIONS].updated)) { + printf( + "SET %s = %lld\n" + , "instructions" + , (collected_number) perf_events[EV_ID_BRANCH_INSTRUCTIONS].value + ); + } + if(likely(perf_events[EV_ID_BRANCH_MISSES].updated)) { + printf( + "SET %s = %lld\n" + , "misses" + , (collected_number) perf_events[EV_ID_BRANCH_MISSES].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_CACHE_REFERENCES].updated || perf_events[EV_ID_CACHE_MISSES].updated)) { + if(unlikely(!cache_chart_generated)) { + cache_chart_generated = 1; + + printf("CHART %s.%s '' 'Cache operations' 'operations/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "cache" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_CACHE + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "references"); + printf("DIMENSION %s '' absolute 1 1\n", "misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "cache" + ); + if(likely(perf_events[EV_ID_CACHE_REFERENCES].updated)) { + printf( + "SET %s = %lld\n" + , "references" 
+ , (collected_number) perf_events[EV_ID_CACHE_REFERENCES].value + ); + } + if(likely(perf_events[EV_ID_CACHE_MISSES].updated)) { + printf( + "SET %s = %lld\n" + , "misses" + , (collected_number) perf_events[EV_ID_CACHE_MISSES].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_BUS_CYCLES].updated)) { + if(unlikely(!bus_cycles_chart_generated)) { + bus_cycles_chart_generated = 1; + + printf("CHART %s.%s '' 'Bus cycles' 'cycles/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "bus_cycles" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_BUS_CYCLES + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "bus"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "bus_cycles" + ); + printf( + "SET %s = %lld\n" + , "bus" + , (collected_number) perf_events[EV_ID_BUS_CYCLES].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_STALLED_CYCLES_FRONTEND].updated || perf_events[EV_ID_STALLED_CYCLES_BACKEND].updated)) { + if(unlikely(!stalled_cycles_chart_generated)) { + stalled_cycles_chart_generated = 1; + + printf("CHART %s.%s '' 'Stalled frontend and backend cycles' 'cycles/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "stalled_cycles" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_FRONT_BACK_CYCLES + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "frontend"); + printf("DIMENSION %s '' absolute 1 1\n", "backend"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "stalled_cycles" + ); + if(likely(perf_events[EV_ID_STALLED_CYCLES_FRONTEND].updated)) { + printf( + "SET %s = %lld\n" + , "frontend" + , (collected_number) perf_events[EV_ID_STALLED_CYCLES_FRONTEND].value + ); + } + if(likely(perf_events[EV_ID_STALLED_CYCLES_BACKEND].updated)) { + printf( + "SET %s = %lld\n" + , "backend" + , (collected_number) perf_events[EV_ID_STALLED_CYCLES_BACKEND].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_CPU_MIGRATIONS].updated)) { + if(unlikely(!migrations_chart_generated)) { + migrations_chart_generated = 1; + + printf("CHART %s.%s '' 'CPU migrations' 'migrations' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "migrations" + , RRD_FAMILY_SW + , NETDATA_CHART_PRIO_PERF_MIGRATIONS + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "migrations"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "migrations" + ); + printf( + "SET %s = %lld\n" + , "migrations" + , (collected_number) perf_events[EV_ID_CPU_MIGRATIONS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_ALIGNMENT_FAULTS].updated)) { + if(unlikely(!alighnment_chart_generated)) { + alighnment_chart_generated = 1; + + printf("CHART %s.%s '' 'Alighnment faults' 'faults' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "alighnment_faults" + , RRD_FAMILY_SW + , NETDATA_CHART_PRIO_PERF_ALIGNMENT + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "faults"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "alighnment_faults" + ); + printf( + "SET %s = %lld\n" + , "faults" + , (collected_number) perf_events[EV_ID_ALIGNMENT_FAULTS].value + ); + printf("END\n"); + } + + // 
------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_EMULATION_FAULTS].updated)) { + if(unlikely(!emulation_chart_generated)) { + emulation_chart_generated = 1; + + printf("CHART %s.%s '' 'Emulation faults' 'faults' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "emulation_faults" + , RRD_FAMILY_SW + , NETDATA_CHART_PRIO_PERF_EMULATION + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "faults"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "emulation_faults" + ); + printf( + "SET %s = %lld\n" + , "faults" + , (collected_number) perf_events[EV_ID_EMULATION_FAULTS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_L1D_READ_ACCESS].updated || perf_events[EV_ID_L1D_READ_MISS].updated + || perf_events[EV_ID_L1D_WRITE_ACCESS].updated || perf_events[EV_ID_L1D_WRITE_MISS].updated)) { + if(unlikely(!L1D_chart_generated)) { + L1D_chart_generated = 1; + + printf("CHART %s.%s '' 'L1D cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "l1d_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_L1D + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + printf("DIMENSION %s '' absolute -1 1\n", "write_access"); + printf("DIMENSION %s '' absolute -1 1\n", "write_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "l1d_cache" + ); + if(likely(perf_events[EV_ID_L1D_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_L1D_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_L1D_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_L1D_READ_MISS].value + ); + } + if(likely(perf_events[EV_ID_L1D_WRITE_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "write_access" + , (collected_number) perf_events[EV_ID_L1D_WRITE_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_L1D_WRITE_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "write_misses" + , (collected_number) perf_events[EV_ID_L1D_WRITE_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_L1D_PREFETCH_ACCESS].updated)) { + if(unlikely(!L1D_prefetch_chart_generated)) { + L1D_prefetch_chart_generated = 1; + + printf("CHART %s.%s '' 'L1D prefetch cache operations' 'prefetches/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "l1d_cache_prefetch" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_L1D_PREFETCH + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "prefetches"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "l1d_cache_prefetch" + ); + printf( + "SET %s = %lld\n" + , "prefetches" + , (collected_number) perf_events[EV_ID_L1D_PREFETCH_ACCESS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_L1I_READ_ACCESS].updated || perf_events[EV_ID_L1I_READ_MISS].updated)) { + if(unlikely(!L1I_chart_generated)) { + L1I_chart_generated = 1; + + printf("CHART %s.%s '' 'L1I cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "l1i_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_L1I + , update_every + , PLUGIN_PERF_NAME + ); + 
printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "l1i_cache" + ); + if(likely(perf_events[EV_ID_L1I_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_L1I_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_L1I_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_L1I_READ_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_LL_READ_ACCESS].updated || perf_events[EV_ID_LL_READ_MISS].updated + || perf_events[EV_ID_LL_WRITE_ACCESS].updated || perf_events[EV_ID_LL_WRITE_MISS].updated)) { + if(unlikely(!LL_chart_generated)) { + LL_chart_generated = 1; + + printf("CHART %s.%s '' 'LL cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "ll_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_LL + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + printf("DIMENSION %s '' absolute -1 1\n", "write_access"); + printf("DIMENSION %s '' absolute -1 1\n", "write_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "ll_cache" + ); + if(likely(perf_events[EV_ID_LL_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_LL_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_LL_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_LL_READ_MISS].value + ); + } + if(likely(perf_events[EV_ID_LL_WRITE_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "write_access" + , (collected_number) perf_events[EV_ID_LL_WRITE_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_LL_WRITE_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "write_misses" + , (collected_number) perf_events[EV_ID_LL_WRITE_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_DTLB_READ_ACCESS].updated || perf_events[EV_ID_DTLB_READ_MISS].updated + || perf_events[EV_ID_DTLB_WRITE_ACCESS].updated || perf_events[EV_ID_DTLB_WRITE_MISS].updated)) { + if(unlikely(!DTLB_chart_generated)) { + DTLB_chart_generated = 1; + + printf("CHART %s.%s '' 'DTLB cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "dtlb_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_DTLB + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + printf("DIMENSION %s '' absolute -1 1\n", "write_access"); + printf("DIMENSION %s '' absolute -1 1\n", "write_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "dtlb_cache" + ); + if(likely(perf_events[EV_ID_DTLB_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_DTLB_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_DTLB_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_DTLB_READ_MISS].value + ); + } + if(likely(perf_events[EV_ID_DTLB_WRITE_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "write_access" + , (collected_number) 
perf_events[EV_ID_DTLB_WRITE_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_DTLB_WRITE_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "write_misses" + , (collected_number) perf_events[EV_ID_DTLB_WRITE_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_ITLB_READ_ACCESS].updated || perf_events[EV_ID_ITLB_READ_MISS].updated)) { + if(unlikely(!ITLB_chart_generated)) { + ITLB_chart_generated = 1; + + printf("CHART %s.%s '' 'ITLB cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "itlb_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_ITLB + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "itlb_cache" + ); + if(likely(perf_events[EV_ID_ITLB_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_ITLB_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_ITLB_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_ITLB_READ_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_PBU_READ_ACCESS].updated)) { + if(unlikely(!PBU_chart_generated)) { + PBU_chart_generated = 1; + + printf("CHART %s.%s '' 'PBU cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "pbu_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_PBU + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "pbu_cache" + ); + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_PBU_READ_ACCESS].value + ); + printf("END\n"); + } +} + +void parse_command_line(int argc, char **argv) { + int i, plugin_enabled = 0; + + for(i = 1; i < argc ; i++) { + if(isdigit(*argv[i]) && !freq) { + int n = str2i(argv[i]); + if(n > 0 && n < 86400) { + freq = n; + continue; + } + } + else if(strcmp("version", argv[i]) == 0 || strcmp("-version", argv[i]) == 0 || strcmp("--version", argv[i]) == 0 || strcmp("-v", argv[i]) == 0 || strcmp("-V", argv[i]) == 0) { + printf("perf.plugin %s\n", VERSION); + exit(0); + } + else if(strcmp("all", argv[i]) == 0) { + struct perf_event *current_event = NULL; + + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) + current_event->disabled = 0; + + plugin_enabled = 1; + continue; + } + else if(strcmp("cycles", argv[i]) == 0) { + perf_events[EV_ID_CPU_CYCLES].disabled = 0; + perf_events[EV_ID_REF_CPU_CYCLES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("instructions", argv[i]) == 0) { + perf_events[EV_ID_INSTRUCTIONS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("branch", argv[i]) == 0) { + perf_events[EV_ID_BRANCH_INSTRUCTIONS].disabled = 0; + perf_events[EV_ID_BRANCH_MISSES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("cache", argv[i]) == 0) { + perf_events[EV_ID_CACHE_REFERENCES].disabled = 0; + perf_events[EV_ID_CACHE_MISSES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("bus", argv[i]) == 0) { + perf_events[EV_ID_BUS_CYCLES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("stalled", argv[i]) 
== 0) { + perf_events[EV_ID_STALLED_CYCLES_FRONTEND].disabled = 0; + perf_events[EV_ID_STALLED_CYCLES_BACKEND].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("migrations", argv[i]) == 0) { + perf_events[EV_ID_CPU_MIGRATIONS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("alighnment", argv[i]) == 0) { + perf_events[EV_ID_ALIGNMENT_FAULTS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("emulation", argv[i]) == 0) { + perf_events[EV_ID_EMULATION_FAULTS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("L1D", argv[i]) == 0) { + perf_events[EV_ID_L1D_READ_ACCESS].disabled = 0; + perf_events[EV_ID_L1D_READ_MISS].disabled = 0; + perf_events[EV_ID_L1D_WRITE_ACCESS].disabled = 0; + perf_events[EV_ID_L1D_WRITE_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("L1D-prefetch", argv[i]) == 0) { + perf_events[EV_ID_L1D_PREFETCH_ACCESS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("L1I", argv[i]) == 0) { + perf_events[EV_ID_L1I_READ_ACCESS].disabled = 0; + perf_events[EV_ID_L1I_READ_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("LL", argv[i]) == 0) { + perf_events[EV_ID_LL_READ_ACCESS].disabled = 0; + perf_events[EV_ID_LL_READ_MISS].disabled = 0; + perf_events[EV_ID_LL_WRITE_ACCESS].disabled = 0; + perf_events[EV_ID_LL_WRITE_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("DTLB", argv[i]) == 0) { + perf_events[EV_ID_DTLB_READ_ACCESS].disabled = 0; + perf_events[EV_ID_DTLB_READ_MISS].disabled = 0; + perf_events[EV_ID_DTLB_WRITE_ACCESS].disabled = 0; + perf_events[EV_ID_DTLB_WRITE_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("ITLB", argv[i]) == 0) { + perf_events[EV_ID_ITLB_READ_ACCESS].disabled = 0; + perf_events[EV_ID_ITLB_READ_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("PBU", argv[i]) == 0) { + perf_events[EV_ID_PBU_READ_ACCESS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("debug", argv[i]) == 0) { + debug = 1; + continue; + } + else if(strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[i]) == 0) { + fprintf(stderr, + "\n" + " netdata perf.plugin %s\n" + " Copyright (C) 2019 Netdata Inc.\n" + " Released under GNU General Public License v3 or later.\n" + " All rights reserved.\n" + "\n" + " This program is a data collector plugin for netdata.\n" + "\n" + " Available command line options:\n" + "\n" + " COLLECTION_FREQUENCY data collection frequency in seconds\n" + " minimum: %d\n" + "\n" + " all enable all charts\n" + "\n" + " cycles enable CPU cycles chart\n" + "\n" + " instructions enable Instructions chart\n" + "\n" + " branch enable Branch instructions chart\n" + "\n" + " cache enable Cache operations chart\n" + "\n" + " bus enable Bus cycles chart\n" + "\n" + " stalled enable Stalled frontend and backend cycles chart\n" + "\n" + " migrations enable CPU migrations chart\n" + "\n" + " alighnment enable Alignment faults chart\n" + "\n" + " emulation enable Emulation faults chart\n" + "\n" + " L1D enable L1D cache operations chart\n" + "\n" + " L1D-prefetch enable L1D prefetch cache operations chart\n" + "\n" + " L1I enable L1I cache operations chart\n" + "\n" + " LL enable LL cache operations chart\n" + "\n" + " DTLB enable DTLB cache operations chart\n" + "\n" + " ITLB enable ITLB cache operations chart\n" + "\n" + " PBU enable PBU cache operations chart\n" + "\n" + " debug enable verbose output\n" + " default: disabled\n" + 
"\n" + " -v\n" + " -V\n" + " --version print version and exit\n" + "\n" + " -h\n" + " --help print this message and exit\n" + "\n" + " For more information:\n" + " https://github.com/netdata/netdata/tree/master/collectors/perf.plugin\n" + "\n" + , VERSION + , update_every + ); + exit(1); + } + + error("ignoring parameter '%s'", argv[i]); + } + + if(!plugin_enabled){ + info("no charts enabled - nothing to do."); + printf("DISABLE\n"); + exit(1); + } +} + +int main(int argc, char **argv) { + + // ------------------------------------------------------------------------ + // initialization of netdata plugin + + program_name = "perf.plugin"; + + // disable syslog + error_log_syslog = 0; + + // set errors flood protection to 100 logs per hour + error_log_errors_per_period = 100; + error_log_throttle_period = 3600; + + parse_command_line(argc, argv); + + errno = 0; + + if(freq >= update_every) + update_every = freq; + else if(freq) + error("update frequency %d seconds is too small for PERF. Using %d.", freq, update_every); + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: calling perf_init()\n"); + int perf = !perf_init(); + + // ------------------------------------------------------------------------ + // the main loop + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: starting data collection\n"); + + time_t started_t = now_monotonic_sec(); + + size_t iteration; + usec_t step = update_every * USEC_PER_SEC; + + heartbeat_t hb; + heartbeat_init(&hb); + for(iteration = 0; 1; iteration++) { + usec_t dt = heartbeat_next(&hb, step); + + if(unlikely(netdata_exit)) break; + + if(unlikely(debug && iteration)) + fprintf(stderr, "perf.plugin: iteration %zu, dt %llu usec\n" + , iteration + , dt + ); + + if(likely(perf)) { + if(unlikely(debug)) fprintf(stderr, "perf.plugin: calling perf_collect()\n"); + perf = !perf_collect(); + + if(likely(perf)) { + if(unlikely(debug)) fprintf(stderr, "perf.plugin: calling perf_send_metrics()\n"); + perf_send_metrics(); + } + } + + fflush(stdout); + + // restart check (14400 seconds) + if(now_monotonic_sec() - started_t > 14400) break; + } + + info("process exiting"); + perf_free(); +} diff --git a/collectors/plugins.d/README.md b/collectors/plugins.d/README.md index 105a60eb5..9134d5163 100644 --- a/collectors/plugins.d/README.md +++ b/collectors/plugins.d/README.md @@ -15,6 +15,7 @@ plugin|language|O/S|description [freeipmi.plugin](../freeipmi.plugin/)|`C`|linux|collects metrics from enterprise hardware sensors, on Linux servers. [nfacct.plugin](../nfacct.plugin/)|`C`|linux|collects netfilter firewall, connection tracker and accounting metrics using `libmnl` and `libnetfilter_acct`. [xenstat.plugin](../xenstat.plugin/)|`C`|linux|collects XenServer and XCP-ng metrics using `lxenstat`. +[perf.plugin](../perf.plugin/)|`C`|linux|collects CPU performance metrics using performance monitoring units (PMU). [node.d.plugin](../node.d.plugin/)|`node.js`|all|a **plugin orchestrator** for data collection modules written in `node.js`. [python.d.plugin](../python.d.plugin/)|`python`|all|a **plugin orchestrator** for data collection modules written in `python` v2 or v3 (both are supported). 
diff --git a/collectors/plugins.d/plugins_d.c b/collectors/plugins.d/plugins_d.c index 024dd292a..66ec5d0ea 100644 --- a/collectors/plugins.d/plugins_d.c +++ b/collectors/plugins.d/plugins_d.c @@ -154,7 +154,12 @@ inline size_t pluginsd_process(RRDHOST *host, struct plugind *cd, FILE *fp, int char *r = fgets(line, PLUGINSD_LINE_MAX, fp); if(unlikely(!r)) { - error("read failed"); + if(feof(fp)) + error("read failed: end of file"); + else if(ferror(fp)) + error("read failed: input error"); + else + error("read failed: unknown error"); break; } diff --git a/collectors/proc.plugin/README.md b/collectors/proc.plugin/README.md index 37dc19f86..cacde84f6 100644 --- a/collectors/proc.plugin/README.md +++ b/collectors/proc.plugin/README.md @@ -75,7 +75,7 @@ netdata will automatically set the name of disks on the dashboard, from the moun ### performance metrics -By default netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). +By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a chart instead of `auto` to enable it permanently. netdata categorizes all block devices in 3 categories: diff --git a/collectors/proc.plugin/proc_mdstat.c b/collectors/proc.plugin/proc_mdstat.c index d0925ec32..5c29d31c7 100644 --- a/collectors/proc.plugin/proc_mdstat.c +++ b/collectors/proc.plugin/proc_mdstat.c @@ -13,7 +13,7 @@ struct raid { unsigned long long failed_disks;
RRDSET *st_disks;
- RRDDIM *rd_total;
+ RRDDIM *rd_down;
RRDDIM *rd_inuse;
unsigned long long total_disks;
unsigned long long inuse_disks;
@@ -439,11 +439,11 @@ int do_proc_mdstat(int update_every, usec_t dt) {
if(unlikely(!raid->rd_inuse && !(raid->rd_inuse = rrddim_find(raid->st_disks, "inuse"))))
raid->rd_inuse = rrddim_add(raid->st_disks, "inuse", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- if(unlikely(!raid->rd_total && !(raid->rd_total = rrddim_find(raid->st_disks, "total"))))
- raid->rd_total = rrddim_add(raid->st_disks, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ if(unlikely(!raid->rd_down && !(raid->rd_down = rrddim_find(raid->st_disks, "down"))))
+ raid->rd_down = rrddim_add(raid->st_disks, "down", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rrddim_set_by_pointer(raid->st_disks, raid->rd_inuse, raid->inuse_disks);
- rrddim_set_by_pointer(raid->st_disks, raid->rd_total, raid->total_disks);
+ rrddim_set_by_pointer(raid->st_disks, raid->rd_down, raid->failed_disks);
rrdset_done(raid->st_disks);
}
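The hunk above uses the C-side find-or-create pattern for chart dimensions (`rrddim_find`, then `rrddim_add` only if missing, then set the value). The python.d module changes later in this diff (monit, mysql) express the same idea through the charts framework. A rough, hedged sketch — the `Chart` stand-in only emulates the two framework calls those modules actually use (`dim_id in chart` and `add_dimension()`), and all names here are hypothetical:

```python
# Hedged sketch (not part of this diff) of the find-or-create dimension pattern
# as python.d modules use it. Chart key and dimension ids are hypothetical.
class Chart(object):
    def __init__(self):
        self.dimensions = []

    def __contains__(self, dim_id):
        return dim_id in self.dimensions

    def add_dimension(self, dimension):
        # dimension = [id, name, algorithm], mirroring the real framework call
        self.dimensions.append(dimension[0])


def collect_disks(chart, raid_name, failed_disks):
    dim_id = 'md_{0}_down'.format(raid_name)
    if dim_id not in chart:                       # add the dimension only once
        chart.add_dimension([dim_id, raid_name, 'absolute'])
    return {dim_id: failed_disks}                 # afterwards just report the value


if __name__ == '__main__':
    disks_chart = Chart()
    print(collect_disks(disks_chart, 'md0', 1))   # {'md_md0_down': 1}
```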
diff --git a/collectors/python.d.plugin/Makefile.am b/collectors/python.d.plugin/Makefile.am index 652a35da4..ad72cfaef 100644 --- a/collectors/python.d.plugin/Makefile.am +++ b/collectors/python.d.plugin/Makefile.am @@ -87,6 +87,7 @@ include rabbitmq/Makefile.inc include redis/Makefile.inc include rethinkdbs/Makefile.inc include retroshare/Makefile.inc +include riakkv/Makefile.inc include samba/Makefile.inc include sensors/Makefile.inc include smartd_log/Makefile.inc diff --git a/collectors/python.d.plugin/README.md b/collectors/python.d.plugin/README.md index 8955197a7..32437c6db 100644 --- a/collectors/python.d.plugin/README.md +++ b/collectors/python.d.plugin/README.md @@ -150,7 +150,7 @@ Classes implement `_get_raw_data` which should be used to grab raw data. This me _This is last resort class, if a new module cannot be written by using other framework class this one can be used._ -_Example: `mysql`, `sensors`_ +_Example: `ceph`, `sensors`_ It is the lowest-level class which implements most of module logic, like: - threading diff --git a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py index 052c93144..3fcb5fda8 100644 --- a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py +++ b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py @@ -56,8 +56,8 @@ GOOD_PD_STATUS = ( ) RE_LD = re.compile( - r'Logical device number\s+([0-9]+).*?' - r'Status of logical device\s+: ([a-zA-Z]+)' + r'Logical [dD]evice number\s+([0-9]+).*?' + r'Status of [lL]ogical [dD]evice\s+: ([a-zA-Z]+)' ) diff --git a/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py b/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py index 47a7d23f6..7fe860314 100644 --- a/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py +++ b/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py @@ -8,11 +8,6 @@ from socket import getaddrinfo, gaierror from threading import Thread try: - from time import monotonic as time -except ImportError: - from time import time - -try: import dns.message import dns.query import dns.name @@ -89,13 +84,15 @@ def dns_request(server_list, timeout, domains): request = dns.message.make_query(domain, dns.rdatatype.A) try: - dns_start = time() - dns.query.udp(request, ns, timeout=t) - dns_end = time() - query_time = round((dns_end - dns_start) * 1000) - q.put({'_'.join(['ns', ns.replace('.', '_')]): query_time}) + resp = dns.query.udp(request, ns, timeout=t) + if (resp.rcode() == dns.rcode.NOERROR and resp.answer): + query_time = resp.time * 1000 + else: + query_time = -100 except dns.exception.Timeout: - q.put({'_'.join(['ns', ns.replace('.', '_')]): -100}) + query_time = -100 + finally: + q.put({'_'.join(['ns', ns.replace('.', '_')]): query_time}) for server in server_list: th = Thread(target=dns_req, args=(server, timeout, que)) diff --git a/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py b/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py index 9b3c1284d..20109c64f 100644 --- a/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py +++ b/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py @@ -10,9 +10,9 @@ from collections import namedtuple from socket import gethostbyname, gaierror try: - from queue import Queue + from queue import Queue except ImportError: - from Queue import Queue + from Queue import Queue from bases.FrameworkServices.UrlService import UrlService @@ -83,11 +83,11 @@ NODE_STATS = [ ] 
CLUSTER_STATS = [ - 'nodes.count.data_only', - 'nodes.count.master_data', + 'nodes.count.data', + 'nodes.count.master', 'nodes.count.total', - 'nodes.count.master_only', - 'nodes.count.client', + 'nodes.count.coordinating_only', + 'nodes.count.ingest', 'indices.docs.count', 'indices.query_cache.hit_count', 'indices.query_cache.miss_count', @@ -371,7 +371,7 @@ CHARTS = { }, 'cluster_health_nodes': { 'options': [None, 'Nodes Statistics', 'nodes', 'cluster health API', - 'elastic.cluster_health_nodes', 'stacked'], + 'elastic.cluster_health_nodes', 'area'], 'lines': [ ['number_of_nodes', 'nodes', 'absolute'], ['number_of_data_nodes', 'data_nodes', 'absolute'], @@ -417,13 +417,13 @@ CHARTS = { }, 'cluster_stats_nodes': { 'options': [None, 'Nodes Statistics', 'nodes', 'cluster stats API', - 'elastic.cluster_nodes', 'stacked'], + 'elastic.cluster_nodes', 'area'], 'lines': [ - ['nodes_count_data_only', 'data_only', 'absolute'], - ['nodes_count_master_data', 'master_data', 'absolute'], + ['nodes_count_data', 'data', 'absolute'], + ['nodes_count_master', 'master', 'absolute'], ['nodes_count_total', 'total', 'absolute'], - ['nodes_count_master_only', 'master_only', 'absolute'], - ['nodes_count_client', 'client', 'absolute'] + ['nodes_count_ingest', 'ingest', 'absolute'], + ['nodes_count_coordinating_only', 'coordinating_only', 'absolute'] ] }, 'cluster_stats_query_cache': { diff --git a/collectors/python.d.plugin/monit/monit.chart.py b/collectors/python.d.plugin/monit/monit.chart.py index 3ac0032c5..9f3270572 100644 --- a/collectors/python.d.plugin/monit/monit.chart.py +++ b/collectors/python.d.plugin/monit/monit.chart.py @@ -4,23 +4,49 @@ # SPDX-License-Identifier: GPL-3.0-or-later import xml.etree.ElementTree as ET + +from collections import namedtuple + from bases.FrameworkServices.UrlService import UrlService -# see enum State_Type from monit.h (https://bitbucket.org/tildeslash/monit/src/master/src/monit.h) -MONIT_SERVICE_NAMES = [ - 'Filesystem', - 'Directory', - 'File', - 'Process', - 'Host', - 'System', - 'Fifo', - 'Program', - 'Net', -] +MonitType = namedtuple('MonitType', ('index', 'name')) + +# see enum Service_Type from monit.h (https://bitbucket.org/tildeslash/monit/src/master/src/monit.h) +# typedef enum { +# Service_Filesystem = 0, +# Service_Directory, +# Service_File, +# Service_Process, +# Service_Host, +# Service_System, +# Service_Fifo, +# Service_Program, +# Service_Net, +# Service_Last = Service_Net +# } __attribute__((__packed__)) Service_Type; -DEFAULT_SERVICES_IDS = [0, 1, 2, 3, 4, 6, 7, 8] +TYPE_FILESYSTEM = MonitType(0, 'filesystem') +TYPE_DIRECTORY = MonitType(1, 'directory') +TYPE_FILE = MonitType(2, 'file') +TYPE_PROCESS = MonitType(3, 'process') +TYPE_HOST = MonitType(4, 'host') +TYPE_SYSTEM = MonitType(5, 'system') +TYPE_FIFO = MonitType(6, 'fifo') +TYPE_PROGRAM = MonitType(7, 'program') +TYPE_NET = MonitType(8, 'net') + +TYPES = ( + TYPE_FILESYSTEM, + TYPE_DIRECTORY, + TYPE_FILE, + TYPE_PROCESS, + TYPE_HOST, + TYPE_SYSTEM, + TYPE_FIFO, + TYPE_PROGRAM, + TYPE_NET, +) # charts order (can be overridden if you want less charts, or different order) ORDER = [ @@ -38,6 +64,7 @@ ORDER = [ 'program', 'net' ] + CHARTS = { 'filesystem': { 'options': ['filesystems', 'Filesystems', 'filesystems', 'filesystem', 'monit.filesystems', 'line'], @@ -83,7 +110,7 @@ CHARTS = { 'lines': [] }, 'host_latency': { - 'options': ['hosts latency', 'Hosts latency', 'milliseconds/s', 'network', 'monit.host_latency', 'line'], + 'options': ['hosts latency', 'Hosts latency', 'milliseconds', 'network', 
'monit.host_latency', 'line'], 'lines': [] }, 'net': { @@ -94,85 +121,224 @@ CHARTS = { } +class BaseMonitService(object): + def __init__(self, typ, name, status, monitor): + self.type = typ + self.name = name + self.status = status + self.monitor = monitor + + def __repr__(self): + return 'MonitService({0}:{1})'.format(self.type.name, self.name) + + def __eq__(self, other): + if not isinstance(other, BaseMonitService): + return False + return self.type == other.type and self.name == other.name + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(repr(self)) + + def is_running(self): + return self.status == '0' and self.monitor == '1' + + def key(self): + return '{0}_{1}'.format(self.type.name, self.name) + + def data(self): + return {self.key(): int(self.is_running())} + + +class ProcessMonitService(BaseMonitService): + def __init__(self, typ, name, status, monitor): + super(ProcessMonitService, self).__init__(typ, name, status, monitor) + self.uptime = None + self.threads = None + self.children = None + + def uptime_key(self): + return 'process_uptime_{0}'.format(self.name) + + def threads_key(self): + return 'process_threads_{0}'.format(self.name) + + def children_key(self): + return 'process_children_{0}'.format(self.name) + + def data(self): + base_data = super(ProcessMonitService, self).data() + # skipping bugged metrics with negative uptime (monit before v5.16) + uptime = self.uptime if self.uptime and int(self.uptime) >= 0 else None + data = { + self.uptime_key(): uptime, + self.threads_key(): self.threads, + self.children_key(): self.children, + } + data.update(base_data) + + return data + + +class HostMonitService(BaseMonitService): + def __init__(self, typ, name, status, monitor): + super(HostMonitService, self).__init__(typ, name, status, monitor) + self.latency = None + + def latency_key(self): + return 'host_latency_{0}'.format(self.name) + + def data(self): + base_data = super(HostMonitService, self).data() + latency = float(self.latency) * 1000000 if self.latency else None + data = {self.latency_key(): latency} + data.update(base_data) + + return data + + class Service(UrlService): def __init__(self, configuration=None, name=None): UrlService.__init__(self, configuration=configuration, name=name) self.order = ORDER self.definitions = CHARTS - base_url = self.configuration.get('url', 'http://localhost:2812') + base_url = self.configuration.get('url', "http://localhost:2812") self.url = '{0}/_status?format=xml&level=full'.format(base_url) + self.active_services = list() - def parse(self, data): + def parse(self, raw): try: - xml = ET.fromstring(data) + root = ET.fromstring(raw) except ET.ParseError: - self.error("URL {0} didn't return a vaild XML page. Please check your settings.".format(self.url)) + self.error("URL {0} didn't return a valid XML page. 
Please check your settings.".format(self.url)) + return None + return root + + def _get_data(self): + raw = self._get_raw_data() + if not raw: return None - return xml - def check(self): - self._manager = self._build_manager() + root = self.parse(raw) + if root is None: + return None - raw_data = self._get_raw_data() - if not raw_data: + services = self.get_services(root) + if not services: return None - return bool(self.parse(raw_data)) + if len(self.charts) > 0: + self.update_charts(services) - def _get_data(self): - raw_data = self._get_raw_data() + data = dict() - if not raw_data: - return None + for svc in services: + data.update(svc.data()) - xml = self.parse(raw_data) - if not xml: - return None + return data - data = {} - for service_id in DEFAULT_SERVICES_IDS: - service_category = MONIT_SERVICE_NAMES[service_id].lower() + def get_services(self, root): + services = list() - if service_category == 'system': - self.debug("Skipping service from 'System' category, because it's useless in graphs") + for typ in TYPES: + if typ == TYPE_SYSTEM: + self.debug("skipping service from '{0}' category, it's useless in graphs".format(TYPE_SYSTEM.name)) continue - xpath_query = "./service[@type='{0}']".format(service_id) - self.debug('Searching for {0} as {1}'.format(service_category, xpath_query)) - for service_node in xml.findall(xpath_query): - - service_name = service_node.find('name').text - service_status = service_node.find('status').text - service_monitoring = service_node.find('monitor').text - self.debug('=> found {0} with type={1}, status={2}, monitoring={3}'.format(service_name, - service_id, service_status, service_monitoring)) - - dimension_key = service_category + '_' + service_name - if dimension_key not in self.charts[service_category]: - self.charts[service_category].add_dimension([dimension_key, service_name, 'absolute']) - data[dimension_key] = 1 if service_status == '0' and service_monitoring == '1' else 0 - - if service_category == 'process': - for subnode in ('uptime', 'threads', 'children'): - subnode_value = service_node.find(subnode) - if subnode_value is None: - continue - if subnode == 'uptime' and int(subnode_value.text) < 0: - self.debug('Skipping bugged metrics with negative uptime (monit before v5.16') - continue - dimension_key = 'process_{0}_{1}'.format(subnode, service_name) - if dimension_key not in self.charts['process_' + subnode]: - self.charts['process_' + subnode].add_dimension([dimension_key, service_name, 'absolute']) - data[dimension_key] = int(subnode_value.text) - - if service_category == 'host': - subnode_value = service_node.find('./icmp/responsetime') - if subnode_value is None: - continue - dimension_key = 'host_latency_{0}'.format(service_name) - if dimension_key not in self.charts['host_latency']: - self.charts['host_latency'].add_dimension([dimension_key, service_name, - 'absolute', 1000, 1000000]) - data[dimension_key] = float(subnode_value.text) * 1000000 - - return data or None + xpath_query = "./service[@type='{0}']".format(typ.index) + self.debug('Searching for {0} as {1}'.format(typ.name, xpath_query)) + + for svc_root in root.findall(xpath_query): + svc = create_service(svc_root, typ) + self.debug('=> found {0} with type={1}, status={2}, monitoring={3}'.format( + svc.name, svc.type.name, svc.status, svc.monitor)) + + services.append(svc) + + return services + + def update_charts(self, services): + remove = [svc for svc in self.active_services if svc not in services] + add = [svc for svc in services if svc not in self.active_services] + + 
self.remove_services_from_charts(remove) + self.add_services_to_charts(add) + + self.active_services = services + + def add_services_to_charts(self, services): + for svc in services: + if svc.type == TYPE_HOST: + self.charts['host_latency'].add_dimension([svc.latency_key(), svc.name, 'absolute', 1000, 1000000]) + if svc.type == TYPE_PROCESS: + self.charts['process_uptime'].add_dimension([svc.uptime_key(), svc.name]) + self.charts['process_threads'].add_dimension([svc.threads_key(), svc.name]) + self.charts['process_children'].add_dimension([svc.children_key(), svc.name]) + self.charts[svc.type.name].add_dimension([svc.key(), svc.name]) + + def remove_services_from_charts(self, services): + for svc in services: + if svc.type == TYPE_HOST: + self.charts['host_latency'].del_dimension(svc.latency_key(), False) + if svc.type == TYPE_PROCESS: + self.charts['process_uptime'].del_dimension(svc.uptime_key(), False) + self.charts['process_threads'].del_dimension(svc.threads_key(), False) + self.charts['process_children'].del_dimension(svc.children_key(), False) + self.charts[svc.type.name].del_dimension(svc.key(), False) + + +def create_service(root, typ): + if typ == TYPE_HOST: + return create_host_service(root) + elif typ == TYPE_PROCESS: + return create_process_service(root) + return create_base_service(root, typ) + + +def create_host_service(root): + svc = HostMonitService( + TYPE_HOST, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) + + latency = root.find('./icmp/responsetime') + if latency is not None: + svc.latency = latency.text + + return svc + + +def create_process_service(root): + svc = ProcessMonitService( + TYPE_PROCESS, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) + + uptime = root.find('uptime') + if uptime is not None: + svc.uptime = uptime.text + + threads = root.find('threads') + if threads is not None: + svc.threads = threads.text + + children = root.find('children') + if children is not None: + svc.children = children.text + + return svc + + +def create_base_service(root, typ): + return BaseMonitService( + typ, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) diff --git a/collectors/python.d.plugin/mysql/README.md b/collectors/python.d.plugin/mysql/README.md index eba9d7a2e..f7028ab68 100644 --- a/collectors/python.d.plugin/mysql/README.md +++ b/collectors/python.d.plugin/mysql/README.md @@ -218,6 +218,24 @@ It will produce following charts (if data is available): 45. **Flow Control** in ms * paused +46. **Users CPU time** in percentage + * users + +**Per user statistics:** + +1. **Rows Operations** in operations/s + * read + * send + * updated + * inserted + * deleted + +2. **Commands** in commands/s + * select + * update + * other + + ### configuration You can provide, per server, the following: @@ -234,7 +252,7 @@ You can provide, per server, the following: - ca: the path name of the Certificate Authority (CA) certificate file. This option, if used, must specify the same certificate used by the server. - capath: the path name of the directory that contains trusted SSL CA certificate files. - cipher: the list of permitted ciphers for SSL encryption. 
- + Here is an example for 3 servers: ```yaml @@ -260,6 +278,8 @@ remote: If no configuration is given, module will attempt to connect to mysql server via unix socket at `/var/run/mysqld/mysqld.sock` without password and with username `root` +`userstats` graph works only if you enable such plugin in MariaDB server and set proper mysql priviliges (SUPER or PROCESS). For more detail please check [MariaDB User Statistics page](https://mariadb.com/kb/en/library/user-statistics/) + --- [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fmysql%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/collectors/python.d.plugin/mysql/mysql.chart.py b/collectors/python.d.plugin/mysql/mysql.chart.py index 139fac158..82bd90794 100644 --- a/collectors/python.d.plugin/mysql/mysql.chart.py +++ b/collectors/python.d.plugin/mysql/mysql.chart.py @@ -11,6 +11,7 @@ from bases.FrameworkServices.MySQLService import MySQLService QUERY_GLOBAL = 'SHOW GLOBAL STATUS;' QUERY_SLAVE = 'SHOW SLAVE STATUS;' QUERY_VARIABLES = 'SHOW GLOBAL VARIABLES LIKE \'max_connections\';' +QUERY_USER_STATISTICS = 'SHOW USER_STATISTICS;' GLOBAL_STATS = [ 'Bytes_received', @@ -90,6 +91,7 @@ GLOBAL_STATS = [ 'Innodb_buffer_pool_write_requests', 'Innodb_buffer_pool_reads', 'Innodb_buffer_pool_wait_free', + 'Innodb_deadlocks', 'Qcache_hits', 'Qcache_lowmem_prunes', 'Qcache_inserts', @@ -149,6 +151,18 @@ SLAVE_STATS = [ ('Slave_IO_Running', slave_running) ] +USER_STATISTICS = [ + 'Select_commands', + 'Update_commands', + 'Other_commands', + 'Cpu_time', + 'Rows_read', + 'Rows_sent', + 'Rows_deleted', + 'Rows_inserted', + 'Rows_updated' +] + VARIABLES = [ 'max_connections' ] @@ -178,6 +192,7 @@ ORDER = [ 'innodb_os_log_fsync_writes', 'innodb_os_log_io', 'innodb_cur_row_lock', + 'innodb_deadlocks', 'innodb_rows', 'innodb_buffer_pool_pages', 'innodb_buffer_pool_flush_pages_requests', @@ -200,7 +215,8 @@ ORDER = [ 'galera_bytes', 'galera_queue', 'galera_conflicts', - 'galera_flow_control' + 'galera_flow_control', + 'userstats_cpu' ] CHARTS = { @@ -382,6 +398,13 @@ CHARTS = { ['Innodb_row_lock_current_waits', 'current_waits', 'absolute'] ] }, + 'innodb_deadlocks': { + 'options': [None, 'InnoDB Deadlocks', 'operations/s', 'innodb', + 'mysql.innodb_deadlocks', 'area'], + 'lines': [ + ['Innodb_deadlocks', 'deadlocks', 'incremental'] + ] + }, 'innodb_rows': { 'options': [None, 'InnoDB Row Operations', 'operations/s', 'innodb', 'mysql.innodb_rows', 'area'], 'lines': [ @@ -570,10 +593,45 @@ CHARTS = { 'lines': [ ['wsrep_flow_control_paused_ns', 'paused', 'incremental', 1, 1000000], ] + }, + 'userstats_cpu': { + 'options': [None, 'Users CPU time', 'percentage', 'userstats', 'mysql.userstats_cpu', 'stacked'], + 'lines': [] } } +def userstats_chart_template(name): + order = [ + 'userstats_rows_{0}'.format(name), + 'userstats_commands_{0}'.format(name) + ] + family = 'userstats {0}'.format(name) + + charts = { + order[0]: { + 'options': [None, 'Rows Operations', 'operations/s', family, 'mysql.userstats_rows', 'stacked'], + 'lines': [ + ['userstats_{0}_Rows_read'.format(name), 'read', 'incremental'], + ['userstats_{0}_Rows_send'.format(name), 'send', 'incremental'], + ['userstats_{0}_Rows_updated'.format(name), 'updated', 'incremental'], + ['userstats_{0}_Rows_inserted'.format(name), 'inserted', 'incremental'], + ['userstats_{0}_Rows_deleted'.format(name), 
'deleted', 'incremental'] + ] + }, + order[1]: { + 'options': [None, 'Commands', 'commands/s', family, 'mysql.userstats_commands', 'stacked'], + 'lines': [ + ['userstats_{0}_Select_commands'.format(name), 'select', 'incremental'], + ['userstats_{0}_Update_commands'.format(name), 'update', 'incremental'], + ['userstats_{0}_Other_commands'.format(name), 'other', 'incremental'] + ] + } + } + + return order, charts + + class Service(MySQLService): def __init__(self, configuration=None, name=None): MySQLService.__init__(self, configuration=configuration, name=name) @@ -583,6 +641,7 @@ class Service(MySQLService): global_status=QUERY_GLOBAL, slave_status=QUERY_SLAVE, variables=QUERY_VARIABLES, + user_statistics=QUERY_USER_STATISTICS, ) def _get_data(self): @@ -612,6 +671,12 @@ class Service(MySQLService): else: self.queries.pop('slave_status') + if 'user_statistics' in raw_data: + if raw_data['user_statistics'][0]: + to_netdata.update(self.get_userstats(raw_data)) + else: + self.queries.pop('user_statistics') + if 'variables' in raw_data: variables = dict(raw_data['variables'][0]) for key in VARIABLES: @@ -619,3 +684,70 @@ class Service(MySQLService): to_netdata[key] = variables[key] return to_netdata or None + + # raw_data['user_statistics'] contains the following data structure: + # ( + # ( + # ('netdata', 42L, 0L, 1264L, 3.111252999999968, 2.968510299999994, 110267L, 19741424L, 0L, 0L, 1265L, 0L, + # 0L, 0L, 3L, 0L, 1301L, 0L, 0L, 7633L, 0L, 83L, 44L, 0L, 0L), + # ('root', 60L, 0L, 184L, 0.22856499999999966, 0.1601419999999998, 11605L, 1516513L, 0L, 9L, 220L, 0L, 2L, 1L, + # 6L, 4L,127L, 0L, 0L, 45L, 0L, 45L, 0L, 0L, 0L) + # ), + # ( + # ('User', 253, 9, 128, 128, 0, 0), + # ('Total_connections', 3, 2, 11, 11, 0, 0), + # ('Concurrent_connections', 3, 1, 11, 11, 0, 0), + # ('Connected_time', 3, 4, 11, 11, 0, 0), + # ('Busy_time', 5, 21, 21, 21, 31, 0), + # ('Cpu_time', 5, 18, 21, 21, 31, 0), + # ('Bytes_received', 8, 6, 21, 21, 0, 0), + # ('Bytes_sent', 8, 8, 21, 21, 0, 0), + # ('Binlog_bytes_written', 8, 1, 21, 21, 0, 0), + # ('Rows_read', 8, 1, 21, 21, 0, 0), + # ('Rows_sent', 8, 4, 21, 21, 0, 0), + # ('Rows_deleted', 8, 1, 21, 21, 0, 0), + # ('Rows_inserted', 8, 1, 21, 21, 0, 0), + # ('Rows_updated', 8, 1, 21, 21, 0, 0), + # ('Select_commands', 8, 1, 21, 21, 0, 0), + # ('Update_commands', 8, 1, 21, 21, 0, 0), + # ('Other_commands', 8, 4, 21, 21, 0, 0), + # ('Commit_transactions', 8, 1, 21, 21, 0, 0), + # ('Rollback_transactions', 8, 1, 21, 21, 0, 0), + # ('Denied_connections', 8, 4, 21, 21, 0, 0), + # ('Lost_connections', 8, 1, 21, 21, 0, 0), + # ('Access_denied', 8, 2, 21, 21, 0, 0), + # ('Empty_queries', 8, 2, 21, 21, 0, 0), + # ('Total_ssl_connections', 8, 1, 21, 21, 0, 0), + # ('Max_statement_time_exceeded', 8, 1, 21, 21, 0, 0)), + # ) + def get_userstats(self, raw_data): + data = dict() + userstats_vars = [e[0] for e in raw_data['user_statistics'][1]] + for i, _ in enumerate(raw_data['user_statistics'][0]): + user_name = raw_data['user_statistics'][0][i][0] + userstats = dict(zip(userstats_vars, raw_data['user_statistics'][0][i])) + + if len(self.charts) > 0: + if ('userstats_{0}_Cpu_time'.format(user_name)) not in self.charts['userstats_cpu']: + self.add_userstats_dimensions(user_name) + self.create_new_userstats_charts(user_name) + + for key in USER_STATISTICS: + if key in userstats: + data['userstats_{0}_{1}'.format(user_name, key)] = userstats[key] + + return data + + def add_userstats_dimensions(self, name): + 
self.charts['userstats_cpu'].add_dimension(['userstats_{0}_Cpu_time'.format(name), name, 'incremental', 100, 1]) + + def create_new_userstats_charts(self, tube): + order, charts = userstats_chart_template(tube) + + for chart_name in order: + params = [chart_name] + charts[chart_name]['options'] + dimensions = charts[chart_name]['lines'] + + new_chart = self.charts.add_chart(params) + for dimension in dimensions: + new_chart.add_dimension(dimension) diff --git a/collectors/python.d.plugin/python.d.conf b/collectors/python.d.plugin/python.d.conf index 63eecbba8..e2ee8eeec 100644 --- a/collectors/python.d.plugin/python.d.conf +++ b/collectors/python.d.plugin/python.d.conf @@ -41,7 +41,7 @@ chrony: no # dockerd: yes # dovecot: yes # elasticsearch: yes -# energi: yes +# energid: yes # this is just an example example: no @@ -88,6 +88,7 @@ nginx_log: no # redis: yes # rethinkdbs: yes # retroshare: yes +# riakkv: yes # samba: yes # sensors: yes # smartd_log: yes @@ -101,4 +102,4 @@ unbound: no # uwsgi: yes # varnish: yes # w1sensor: yes -# web_log: yes
\ No newline at end of file +# web_log: yes diff --git a/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py b/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py index 439456655..b6f75bd5c 100644 --- a/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py +++ b/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py @@ -6,6 +6,8 @@ import urllib3 +from distutils.version import StrictVersion as version + from bases.FrameworkServices.SimpleService import SimpleService try: @@ -14,9 +16,30 @@ except AttributeError: pass +# https://github.com/urllib3/urllib3/blob/master/CHANGES.rst#19-2014-07-04 +# New retry logic and urllib3.util.retry.Retry configuration object. (Issue https://github.com/urllib3/urllib3/pull/326) +URLLIB3_MIN_REQUIRED_VERSION = '1.9' +URLLIB3_VERSION = urllib3.__version__ +URLLIB3 = 'urllib3' + + +def version_check(): + if version(URLLIB3_VERSION) >= version(URLLIB3_MIN_REQUIRED_VERSION): + return + + err = '{0} version: {1}, minimum required version: {2}, please upgrade'.format( + URLLIB3, + URLLIB3_VERSION, + URLLIB3_MIN_REQUIRED_VERSION, + ) + raise Exception(err) + + class UrlService(SimpleService): def __init__(self, configuration=None, name=None): + version_check() SimpleService.__init__(self, configuration=configuration, name=name) + self.debug("{0} version: {1}".format(URLLIB3, URLLIB3_VERSION)) self.url = self.configuration.get('url') self.user = self.configuration.get('user') self.password = self.configuration.get('pass') diff --git a/collectors/python.d.plugin/riakkv/Makefile.inc b/collectors/python.d.plugin/riakkv/Makefile.inc new file mode 100644 index 000000000..87d29f82f --- /dev/null +++ b/collectors/python.d.plugin/riakkv/Makefile.inc @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_python_DATA += riakkv/riakkv.chart.py +dist_pythonconfig_DATA += riakkv/riakkv.conf + +# do not install these files, but include them in the distribution +dist_noinst_DATA += riakkv/README.md riakkv/Makefile.inc + diff --git a/collectors/python.d.plugin/riakkv/README.md b/collectors/python.d.plugin/riakkv/README.md new file mode 100644 index 000000000..0bcf22c5b --- /dev/null +++ b/collectors/python.d.plugin/riakkv/README.md @@ -0,0 +1,110 @@ +# riakkv + +Monitors one or more Riak KV servers. + +**Requirements:** + +* An accessible `/stats` endpoint. See [the Riak KV configuration reference] + documentation](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/#client-interfaces) + for how to enable this. + +The following charts are included, which are mostly derived from the metrics +listed +[here](https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#riak-metrics-to-graph). + +1. **Throughput** in operations/s + * **KV operations** + * gets + * puts + + * **Data type updates** + * counters + * sets + * maps + + * **Search queries** + * queries + + * **Search documents** + * indexed + + * **Strong consistency operations** + * gets + * puts + +2. 
**Latency** in milliseconds + * **KV latency** of the past minute + * get (mean, median, 95th / 99th / 100th percentile) + * put (mean, median, 95th / 99th / 100th percentile) + + * **Data type latency** of the past minute + * counter_merge (mean, median, 95th / 99th / 100th percentile) + * set_merge (mean, median, 95th / 99th / 100th percentile) + * map_merge (mean, median, 95th / 99th / 100th percentile) + + * **Search latency** of the past minute + * query (median, min, max, 95th / 99th percentile) + * index (median, min, max, 95th / 99th percentile) + + * **Strong consistency latency** of the past minute + * get (mean, median, 95th / 99th / 100th percentile) + * put (mean, median, 95th / 99th / 100th percentile) + +3. **Erlang VM metrics** + * **System counters** + * processes + + * **Memory allocation** in MB + * processes.allocated + * processes.used + +4. **General load / health metrics** + * **Siblings encountered in KV operations** during the past minute + * get (mean, median, 95th / 99th / 100th percentile) + + * **Object size in KV operations** during the past minute in KB + * get (mean, median, 95th / 99th / 100th percentile) + + * **Message queue length** in unprocessed messages + * vnodeq_size (mean, median, 95th / 99th / 100th percentile) + + * **Index operations** encountered by Search + * errors + + * **Protocol buffer connections** + * active + + * **Repair operations coordinated by this node** + * read + + * **Active finite state machines by kind** + * get + * put + * secondary_index + * list_keys + + * **Rejected finite state machines** + * get + * put + + * **Number of writes to Search failed due to bad data format by reason** + * bad_entry + * extract_fail + + +### configuration + +The module needs to be passed the full URL to Riak's stats endpoint. +For example: + +```yaml +myriak: + url: http://myriak.example.com:8098/stats +``` + +With no explicit configuration given, the module will attempt to connect to +`http://localhost:8098/stats`. + +The default update frequency for the plugin is set to 2 seconds as Riak +internally updates the metrics every second. If we were to update the metrics +every second, the resulting graph would contain odd jitter. diff --git a/collectors/python.d.plugin/riakkv/riakkv.chart.py b/collectors/python.d.plugin/riakkv/riakkv.chart.py new file mode 100644 index 000000000..f81e177a5 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.chart.py @@ -0,0 +1,315 @@ +# -*- coding: utf-8 -*- +# Description: riak netdata python.d module +# +# See also: +# https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html + +from json import loads + +from bases.FrameworkServices.UrlService import UrlService + +# Riak updates the metrics at the /stats endpoint every 1 second. +# If we use `update_every = 1` here, that means we might get weird jitter in the graph, +# so the default is set to 2 seconds to prevent it. +update_every = 2 + +# charts order (can be overridden if you want less charts, or different order) +ORDER = [ + # Throughput metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected in totals. + "kv.node_operations", # K/V node operations. + "dt.vnode_updates", # Data type vnode updates. + "search.queries", # Search queries on the node. + "search.documents", # Documents indexed by Search. + "consistent.operations", # Consistent node operations. 
+ + # Latency metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected for the past minute in milliseconds, + # returned from riak in microseconds. + "kv.latency.get", # K/V GET FSM traversal latency. + "kv.latency.put", # K/V PUT FSM traversal latency. + "dt.latency.counter", # Update Counter Data type latency. + "dt.latency.set", # Update Set Data type latency. + "dt.latency.map", # Update Map Data type latency. + "search.latency.query", # Search query latency. + "search.latency.index", # Time it takes for search to index a new document. + "consistent.latency.get", # Strong consistent read latency. + "consistent.latency.put", # Strong consistent write latency. + + # Erlang resource usage metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#erlang-resource-usage-metrics + # Processes collected as a gauge, + # memory collected as Megabytes, returned as bytes from Riak. + "vm.processes", # Number of processes currently running in the Erlang VM. + "vm.memory.processes", # Total amount of memory allocated & used for Erlang processes. + + # General Riak Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-load-health-metrics + # The following are collected by Riak over the past minute: + "kv.siblings_encountered.get", # Siblings encountered during GET operations by this node. + "kv.objsize.get", # Object size encountered by this node. + "search.vnodeq_size", # Number of unprocessed messages in the vnode message queues (Search). + # The following are calculated in total, or as gauges: + "search.index_errors", # Errors of the search subsystem while indexing documents. + "core.pbc", # Number of currently active protocol buffer connections. + "core.repairs", # Total read repair operations coordinated by this node. + "core.fsm_active", # Active finite state machines by kind. + "core.fsm_rejected", # Rejected finite state machines by kind. + + # General Riak Search Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-search-load-health-metrics + # Reported as counters. + "search.errors", # Write and read errors of the Search subsystem. 
+] + +CHARTS = { + # Throughput metrics + "kv.node_operations": { + "options": [None, "Reads & writes coordinated by this node", "operations/s", "throughput", "riak.kv.throughput", "line"], + "lines": [ + ["node_gets_total", "gets", "incremental"], + ["node_puts_total", "puts", "incremental"] + ] + }, + "dt.vnode_updates": { + "options": [None, "Update operations coordinated by local vnodes by data type", "operations/s", "throughput", "riak.dt.vnode_updates", "line"], + "lines": [ + ["vnode_counter_update_total", "counters", "incremental"], + ["vnode_set_update_total", "sets", "incremental"], + ["vnode_map_update_total", "maps", "incremental"], + ] + }, + "search.queries": { + "options": [None, "Search queries on the node", "queries/s", "throughput", "riak.search", "line"], + "lines": [ + ["search_query_throughput_count", "queries", "incremental"] + ] + }, + "search.documents": { + "options": [None, "Documents indexed by search", "documents/s", "throughput", "riak.search.documents", "line"], + "lines": [ + ["search_index_throughput_count", "indexed", "incremental"] + ] + }, + "consistent.operations": { + "options": [None, "Consistent node operations", "operations/s", "throughput", "riak.consistent.operations", "line"], + "lines": [ + ["consistent_gets_total", "gets", "incremental"], + ["consistent_puts_total", "puts", "incremental"], + ] + }, + + # Latency metrics + "kv.latency.get": { + "options": [None, "Time between reception of a client GET request and subsequent response to client", "ms", "latency", "riak.kv.latency.get", "line"], + "lines": [ + ["node_get_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_get_fsm_time_median", "median", "absolute", 1, 1000], + ["node_get_fsm_time_95", "95", "absolute", 1, 1000], + ["node_get_fsm_time_99", "99", "absolute", 1, 1000], + ["node_get_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "kv.latency.put": { + "options": [None, "Time between reception of a client PUT request and subsequent response to client", "ms", "latency", "riak.kv.latency.put", "line"], + "lines": [ + ["node_put_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_put_fsm_time_median", "median", "absolute", 1, 1000], + ["node_put_fsm_time_95", "95", "absolute", 1, 1000], + ["node_put_fsm_time_99", "99", "absolute", 1, 1000], + ["node_put_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.counter": { + "options": [None, "Time it takes to perform an Update Counter operation", "ms", "latency", "riak.dt.latency.counter_merge", "line"], + "lines": [ + ["object_counter_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_counter_merge_time_median", "median", "absolute", 1, 1000], + ["object_counter_merge_time_95", "95", "absolute", 1, 1000], + ["object_counter_merge_time_99", "99", "absolute", 1, 1000], + ["object_counter_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.set": { + "options": [None, "Time it takes to perform an Update Set operation", "ms", "latency", "riak.dt.latency.set_merge", "line"], + "lines": [ + ["object_set_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_set_merge_time_median", "median", "absolute", 1, 1000], + ["object_set_merge_time_95", "95", "absolute", 1, 1000], + ["object_set_merge_time_99", "99", "absolute", 1, 1000], + ["object_set_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.map": { + "options": [None, "Time it takes to perform an Update Map operation", "ms", "latency", "riak.dt.latency.map_merge", "line"], + "lines": [ + ["object_map_merge_time_mean", "mean", 
"absolute", 1, 1000], + ["object_map_merge_time_median", "median", "absolute", 1, 1000], + ["object_map_merge_time_95", "95", "absolute", 1, 1000], + ["object_map_merge_time_99", "99", "absolute", 1, 1000], + ["object_map_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "search.latency.query": { + "options": [None, "Search query latency", "ms", "latency", "riak.search.latency.query", "line"], + "lines": [ + ["search_query_latency_median", "median", "absolute", 1, 1000], + ["search_query_latency_min", "min", "absolute", 1, 1000], + ["search_query_latency_95", "95", "absolute", 1, 1000], + ["search_query_latency_99", "99", "absolute", 1, 1000], + ["search_query_latency_999", "999", "absolute", 1, 1000], + ["search_query_latency_max", "max", "absolute", 1, 1000], + ] + }, + "search.latency.index": { + "options": [None, "Time it takes Search to index a new document", "ms", "latency", "riak.search.latency.index", "line"], + "lines": [ + ["search_index_latency_median", "median", "absolute", 1, 1000], + ["search_index_latency_min", "min", "absolute", 1, 1000], + ["search_index_latency_95", "95", "absolute", 1, 1000], + ["search_index_latency_99", "99", "absolute", 1, 1000], + ["search_index_latency_999", "999", "absolute", 1, 1000], + ["search_index_latency_max", "max", "absolute", 1, 1000], + ] + }, + + # Riak Strong Consistency metrics + "consistent.latency.get": { + "options": [None, "Strongly consistent read latency", "ms", "latency", "riak.consistent.latency.get", "line"], + "lines": [ + ["consistent_get_time_mean", "mean", "absolute", 1, 1000], + ["consistent_get_time_median", "median", "absolute", 1, 1000], + ["consistent_get_time_95", "95", "absolute", 1, 1000], + ["consistent_get_time_99", "99", "absolute", 1, 1000], + ["consistent_get_time_100", "100", "absolute", 1, 1000], + ] + }, + "consistent.latency.put": { + "options": [None, "Strongly consistent write latency", "ms", "latency", "riak.consistent.latency.put", "line"], + "lines": [ + ["consistent_put_time_mean", "mean", "absolute", 1, 1000], + ["consistent_put_time_median", "median", "absolute", 1, 1000], + ["consistent_put_time_95", "95", "absolute", 1, 1000], + ["consistent_put_time_99", "99", "absolute", 1, 1000], + ["consistent_put_time_100", "100", "absolute", 1, 1000], + ] + }, + + # BEAM metrics + "vm.processes": { + "options": [None, "Total processes running in the Erlang VM", "total", "vm", "riak.vm", "line"], + "lines": [ + ["sys_process_count", "processes", "absolute"], + ] + }, + "vm.memory.processes": { + "options": [None, "Memory allocated & used by Erlang processes", "MB", "vm", "riak.vm.memory.processes", "line"], + "lines": [ + ["memory_processes", "allocated", "absolute", 1, 1024 * 1024], + ["memory_processes_used", "used", "absolute", 1, 1024 * 1024] + ] + }, + + # General Riak Load/Health metrics + "kv.siblings_encountered.get": { + "options": [None, "Number of siblings encountered during GET operations by this node during the past minute", "siblings", "load", "riak.kv.siblings_encountered.get", "line"], + "lines": [ + ["node_get_fsm_siblings_mean", "mean", "absolute"], + ["node_get_fsm_siblings_median", "median", "absolute"], + ["node_get_fsm_siblings_95", "95", "absolute"], + ["node_get_fsm_siblings_99", "99", "absolute"], + ["node_get_fsm_siblings_100", "100", "absolute"], + ] + }, + "kv.objsize.get": { + "options": [None, "Object size encountered by this node during the past minute", "KB", "load", "riak.kv.objsize.get", "line"], + "lines": [ + ["node_get_fsm_objsize_mean", "mean", "absolute", 1, 
1024], + ["node_get_fsm_objsize_median", "median", "absolute", 1, 1024], + ["node_get_fsm_objsize_95", "95", "absolute", 1, 1024], + ["node_get_fsm_objsize_99", "99", "absolute", 1, 1024], + ["node_get_fsm_objsize_100", "100", "absolute", 1, 1024], + ] + }, + "search.vnodeq_size": { + "options": [None, "Number of unprocessed messages in the vnode message queues of Search on this node in the past minute", "messages", "load", "riak.search.vnodeq_size", "line"], + "lines": [ + ["riak_search_vnodeq_mean", "mean", "absolute"], + ["riak_search_vnodeq_median", "median", "absolute"], + ["riak_search_vnodeq_95", "95", "absolute"], + ["riak_search_vnodeq_99", "99", "absolute"], + ["riak_search_vnodeq_100", "100", "absolute"], + ] + }, + "search.index_errors": { + "options": [None, "Number of document index errors encountered by Search", "errors", "load", "riak.search.index", "line"], + "lines": [ + ["search_index_fail_count", "errors", "absolute"] + ] + }, + "core.pbc": { + "options": [None, "Protocol buffer connections by status", "connections", "load", "riak.core.protobuf_connections", "line"], + "lines": [ + ["pbc_active", "active", "absolute"], + # ["pbc_connects", "established_pastmin", "absolute"] + ] + }, + "core.repairs": { + "options": [None, "Number of repair operations this node has coordinated", "repairs", "load", "riak.core.repairs", "line"], + "lines": [ + ["read_repairs", "read", "absolute"] + ] + }, + "core.fsm_active": { + "options": [None, "Active finite state machines by kind", "fsms", "load", "riak.core.fsm_active", "line"], + "lines": [ + ["node_get_fsm_active", "get", "absolute"], + ["node_put_fsm_active", "put", "absolute"], + ["index_fsm_active", "secondary index", "absolute"], + ["list_fsm_active", "list keys", "absolute"] + ] + }, + "core.fsm_rejected": { + # Writing "Sidejob's" here seems to cause some weird issues: it results in this chart being rendered in + # its own context and additionally, moves the entire Riak graph all the way up to the top of the Netdata + # dashboard for some reason. + "options": [None, "Finite state machines being rejected by Sidejobs overload protection", "fsms", "load", "riak.core.fsm_rejected", "line"], + "lines": [ + ["node_get_fsm_rejected", "get", "absolute"], + ["node_put_fsm_rejected", "put", "absolute"] + ] + }, + + # General Riak Search Load / Health metrics + "search.errors": { + "options": [None, "Number of writes to Search failed due to bad data format by reason", "writes", "load", "riak.search.index", "line"], + "lines": [ + ["search_index_bad_entry_count", "bad_entry", "absolute"], + ["search_index_extract_fail_count", "extract_fail", "absolute"], + ] + } +} + + +class Service(UrlService): + def __init__(self, configuration=None, name=None): + UrlService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = CHARTS + + def _get_data(self): + """ + Format data received from http request + :return: dict + """ + raw = self._get_raw_data() + if not raw: + return None + + try: + return loads(raw) + except (TypeError, ValueError) as err: + self.error(err) + return None diff --git a/collectors/python.d.plugin/riakkv/riakkv.conf b/collectors/python.d.plugin/riakkv/riakkv.conf new file mode 100644 index 000000000..be01c48ac --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.conf @@ -0,0 +1,68 @@ +# netdata python.d.plugin configuration for riak +# +# This file is in YaML format. 
Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# penalty indicates whether to apply penalty to update_every in case of failures. +# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes. +# penalty: yes + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# penalty: yes # the JOB's penalty +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# +# ---------------------------------------------------------------------- +# AUTO-DETECTION JOBS +# only one of them will run (they have the same name) + +local: + url : 'http://localhost:8098/stats' diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md index 3b0816fb8..f6584be70 100644 --- a/collectors/python.d.plugin/smartd_log/README.md +++ b/collectors/python.d.plugin/smartd_log/README.md @@ -85,7 +85,11 @@ For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ # dump smartd attrs info every 600 seconds smartd_opts="-A /var/log/smartd/ -i 600" ``` - +You may need to create the smartd directory before smartd will write to it: +``` +mkdir -p /var/log/smartd +``` +Otherwise, all the smartd `.csv` files may get written to `/var/lib/smartmontools` (default location). See also [https://linux.die.net/man/8/smartd](https://linux.die.net/man/8/smartd) for more info on the `-A --attributelog=PREFIX` command. `smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files. 
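Returning to the riakkv collector added above: its README and `riakkv.conf` point the module at Riak's `/stats` endpoint, and `riakkv.chart.py` simply runs `json.loads()` on the response, so the endpoint returns a flat JSON document whose keys are the dimension ids used in `CHARTS` (for example `node_gets_total` and `node_puts_total`). A short hedged sketch of what the module effectively consumes, using the same default URL as `riakkv.conf`:

```python
# Hedged sketch (not part of this diff): fetch Riak's /stats endpoint and pick out
# two of the throughput counters charted by the new riakkv module. The URL matches
# the default in riakkv.conf; the flat-JSON shape is what riakkv.chart.py relies on.
import json

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen         # Python 2

RIAK_STATS_URL = 'http://localhost:8098/stats'


def fetch_throughput_sample(url=RIAK_STATS_URL):
    raw = urlopen(url, timeout=5).read()
    stats = json.loads(raw.decode('utf-8'))
    return {
        'node_gets_total': stats.get('node_gets_total'),
        'node_puts_total': stats.get('node_puts_total'),
    }


if __name__ == '__main__':
    print(fetch_throughput_sample())
```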
diff --git a/collectors/python.d.plugin/tomcat/tomcat.chart.py b/collectors/python.d.plugin/tomcat/tomcat.chart.py index 01578c56e..ab3003304 100644 --- a/collectors/python.d.plugin/tomcat/tomcat.chart.py +++ b/collectors/python.d.plugin/tomcat/tomcat.chart.py @@ -5,11 +5,17 @@ # SPDX-License-Identifier: GPL-3.0-or-later import xml.etree.ElementTree as ET +import re from bases.FrameworkServices.UrlService import UrlService MiB = 1 << 20 +# Regex fix for Tomcat single quote XML attributes +# affecting Tomcat < 8.5.24 & 9.0.2 running with Java > 9 +# cf. https://bz.apache.org/bugzilla/show_bug.cgi?id=61603 +single_quote_regex = re.compile(r"='([^']+)'([^']+)''") + ORDER = [ 'accesses', 'bandwidth', @@ -95,6 +101,32 @@ class Service(UrlService): self.definitions = CHARTS self.url = self.configuration.get('url', 'http://127.0.0.1:8080/manager/status?XML=true') self.connector_name = self.configuration.get('connector_name', None) + self.parse = self.xml_parse + + def xml_parse(self, data): + try: + return ET.fromstring(data) + except ET.ParseError: + self.debug('%s is not a valid XML page. Please add "?XML=true" to tomcat status page.' % self.url) + return None + + def xml_single_quote_fix_parse(self, data): + data = single_quote_regex.sub(r"='\g<1>\g<2>'", data) + return self.xml_parse(data) + + def check(self): + self._manager = self._build_manager() + + raw_data = self._get_raw_data() + if not raw_data: + return False + + if single_quote_regex.search(raw_data): + self.warning('Tomcat status page is returning invalid single quote XML, please consider upgrading ' + 'your Tomcat installation. See https://bz.apache.org/bugzilla/show_bug.cgi?id=61603') + self.parse = self.xml_single_quote_fix_parse + + return self.parse(raw_data) is not None def _get_data(self): """ @@ -104,11 +136,10 @@ class Service(UrlService): data = None raw_data = self._get_raw_data() if raw_data: - try: - xml = ET.fromstring(raw_data) - except ET.ParseError: - self.debug('%s is not a vaild XML page. Please add "?XML=true" to tomcat status page.' 
% self.url) + xml = self.parse(raw_data) + if xml is None: return None + data = {} jvm = xml.find('jvm') @@ -153,7 +184,7 @@ class Service(UrlService): data['metaspace_committed'] = pool.get('usageCommitted') data['metaspace_max'] = pool.get('usageMax') - if connector: + if connector is not None: thread_info = connector.find('threadInfo') data['currentThreadsBusy'] = thread_info.get('currentThreadsBusy') data['currentThreadCount'] = thread_info.get('currentThreadCount') diff --git a/collectors/python.d.plugin/varnish/varnish.chart.py b/collectors/python.d.plugin/varnish/varnish.chart.py index 70af50ccb..58745e24d 100644 --- a/collectors/python.d.plugin/varnish/varnish.chart.py +++ b/collectors/python.d.plugin/varnish/varnish.chart.py @@ -5,9 +5,8 @@ import re -from bases.collection import find_binary from bases.FrameworkServices.ExecutableService import ExecutableService - +from bases.collection import find_binary ORDER = [ 'session_connections', @@ -138,6 +137,18 @@ CHARTS = { VARNISHSTAT = 'varnishstat' +re_version = re.compile(r'varnish-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)') + + +class VarnishVersion: + def __init__(self, major, minor, patch): + self.major = major + self.minor = minor + self.patch = patch + + def __str__(self): + return '{0}.{1}.{2}'.format(self.major, self.minor, self.patch) + class Parser: _backend_new = re.compile(r'VBE.([\d\w_.]+)\(.*?\).(beresp[\w_]+)\s+(\d+)') @@ -185,10 +196,32 @@ class Service(ExecutableService): self.error("can't locate '{0}' binary or binary is not executable by user netdata".format(VARNISHSTAT)) return False + command = [varnishstat, '-V'] + reply = self._get_raw_data(stderr=True, command=command) + if not reply: + self.error( + "no output from '{0}'. Is varnish running? Not enough privileges?".format(' '.join(self.command))) + return False + + ver = parse_varnish_version(reply) + if not ver: + self.error("failed to parse reply from '{0}', used regex :'{1}', reply : {2}".format( + ' '.join(command), + re_version.pattern, + reply, + )) + return False + if self.instance_name: - self.command = [varnishstat, '-1', '-n', self.instance_name, '-t', '1'] + self.command = [varnishstat, '-1', '-n', self.instance_name] else: - self.command = [varnishstat, '-1', '-t', '1'] + self.command = [varnishstat, '-1'] + + if ver.major > 4: + self.command.extend(['-t', '1']) + + self.info("varnish version: {0}, will use command: '{1}'".format(ver, ' '.join(self.command))) + return True def check(self): @@ -198,14 +231,14 @@ class Service(ExecutableService): # STDOUT is not empty reply = self._get_raw_data() if not reply: - self.error("No output from 'varnishstat'. Is it running? Not enough privileges?") + self.error("no output from '{0}'. Is it running? 
Not enough privileges?".format(' '.join(self.command))) return False self.parser.init(reply) # Output is parsable if not self.parser.re_default: - self.error('Cant parse the output...') + self.error('cant parse the output...') return False if self.parser.re_backend: @@ -260,3 +293,16 @@ class Service(ExecutableService): self.order.insert(0, chart_name) self.definitions.update(chart) + + +def parse_varnish_version(lines): + m = re_version.search(lines[0]) + if not m: + return None + + m = m.groupdict() + return VarnishVersion( + int(m['major']), + int(m['minor']), + int(m['patch']), + ) diff --git a/collectors/python.d.plugin/web_log/web_log.chart.py b/collectors/python.d.plugin/web_log/web_log.chart.py index 6d6a261c4..fa5a8bc3e 100644 --- a/collectors/python.d.plugin/web_log/web_log.chart.py +++ b/collectors/python.d.plugin/web_log/web_log.chart.py @@ -4,9 +4,8 @@ # SPDX-License-Identifier: GPL-3.0-or-later import bisect -import re import os - +import re from collections import namedtuple, defaultdict from copy import deepcopy @@ -660,7 +659,7 @@ class Web: r' (?P<bytes_sent>\d+)' r' (?P<resp_length>\d+)' r' (?P<resp_time>\d+\.\d+)' - r' (?P<resp_time_upstream>[\d.-]+) ') + r' (?P<resp_time_upstream>[\d.-]+)') nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)' r' -.*?"(?P<request>[^"]*)"' diff --git a/collectors/tc.plugin/README.md b/collectors/tc.plugin/README.md index b54e80858..e71944e3c 100644 --- a/collectors/tc.plugin/README.md +++ b/collectors/tc.plugin/README.md @@ -191,6 +191,7 @@ Add the following configuration option in `/etc/netdata.conf`: Finally, create `/etc/netdata/tc-qos-helper.conf` with this content: ```tc_show="class"``` +Please note, that by default Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a chart instead of `auto` to enable it permanently. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Ftc.plugin%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() |
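One more note on the varnish.chart.py change above: `check()` now runs `varnishstat -V`, parses the version out of the banner with `re_version`, and appends `-t 1` only for Varnish 5 and newer. A small hedged sketch of that decision in isolation — the regex is the one from the diff, but the sample banner string is an assumption about what `varnishstat -V` prints, not taken from the diff:

```python
# Hedged sketch (not part of this diff) of the version-dependent command building
# added to varnish.chart.py above. The sample banner line is hypothetical.
import re

re_version = re.compile(r'varnish-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)')

banner = 'varnishstat (varnish-6.2.0 revision 123abc)'  # assumed sample output

match = re_version.search(banner)
if match is None:
    raise SystemExit('failed to parse varnish version')

major = int(match.group('major'))

command = ['varnishstat', '-1']
if major > 4:
    # pass '-t 1' only to Varnish 5 and newer, as the new check() does
    command.extend(['-t', '1'])

print(command)  # ['varnishstat', '-1', '-t', '1']
```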