diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/adaptec_raid.conf | 32 | ||||
-rw-r--r-- | health/health.d/anomalies.conf | 23 | ||||
-rw-r--r-- | health/health.d/bind_rndc.conf | 12 | ||||
-rw-r--r-- | health/health.d/cgroups.conf | 72 | ||||
-rw-r--r-- | health/health.d/cpu.conf | 69 | ||||
-rw-r--r-- | health/health.d/disks.conf | 172 | ||||
-rw-r--r-- | health/health.d/entropy.conf | 20 | ||||
-rw-r--r-- | health/health.d/file_descriptors.conf | 33 | ||||
-rw-r--r-- | health/health.d/go.d.plugin.conf | 18 | ||||
-rw-r--r-- | health/health.d/httpcheck.conf | 73 | ||||
-rw-r--r-- | health/health.d/ipc.conf | 34 | ||||
-rw-r--r-- | health/health.d/isc_dhcpd.conf | 10 | ||||
-rw-r--r-- | health/health.d/load.conf | 72 | ||||
-rw-r--r-- | health/health.d/megacli.conf | 76 | ||||
-rw-r--r-- | health/health.d/memory.conf | 85 | ||||
-rw-r--r-- | health/health.d/net.conf | 258 | ||||
-rw-r--r-- | health/health.d/netfilter.conf | 20 | ||||
-rw-r--r-- | health/health.d/ping.conf | 50 | ||||
-rw-r--r-- | health/health.d/postgres.conf | 228 | ||||
-rw-r--r-- | health/health.d/python.d.plugin.conf | 18 | ||||
-rw-r--r-- | health/health.d/qos.conf | 18 | ||||
-rw-r--r-- | health/health.d/ram.conf | 82 | ||||
-rw-r--r-- | health/health.d/redis.conf | 57 | ||||
-rw-r--r-- | health/health.d/softnet.conf | 57 | ||||
-rw-r--r-- | health/health.d/swap.conf | 37 | ||||
-rw-r--r-- | health/health.d/systemdunits.conf | 161 | ||||
-rw-r--r-- | health/health.d/tcp_conn.conf | 23 | ||||
-rw-r--r-- | health/health.d/tcp_listen.conf | 100 | ||||
-rw-r--r-- | health/health.d/tcp_mem.conf | 24 | ||||
-rw-r--r-- | health/health.d/tcp_orphans.conf | 25 | ||||
-rw-r--r-- | health/health.d/tcp_resets.conf | 71 | ||||
-rw-r--r-- | health/health.d/timex.conf | 18 | ||||
-rw-r--r-- | health/health.d/udp_errors.conf | 40 | ||||
-rw-r--r-- | health/health.d/upsd.conf | 50 | ||||
-rw-r--r-- | health/health.d/vsphere.conf | 70 | ||||
-rw-r--r-- | health/health.d/whoisquery.conf | 14 | ||||
-rw-r--r-- | health/health.d/windows.conf | 126 | ||||
-rw-r--r-- | health/health.d/x509check.conf | 26 | ||||
-rw-r--r-- | health/health.d/zfs.conf | 44 | ||||
-rw-r--r-- | src/health/health.d/apcupsd.conf (renamed from health/health.d/apcupsd.conf) | 4 | ||||
-rw-r--r-- | src/health/health.d/bcache.conf (renamed from health/health.d/bcache.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/beanstalkd.conf (renamed from health/health.d/beanstalkd.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/boinc.conf (renamed from health/health.d/boinc.conf) | 10 | ||||
-rw-r--r-- | src/health/health.d/btrfs.conf (renamed from health/health.d/btrfs.conf) | 19 | ||||
-rw-r--r-- | src/health/health.d/ceph.conf (renamed from health/health.d/ceph.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/cockroachdb.conf (renamed from health/health.d/cockroachdb.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/consul.conf (renamed from health/health.d/consul.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/dbengine.conf (renamed from health/health.d/dbengine.conf) | 9 | ||||
-rw-r--r-- | src/health/health.d/dns_query.conf (renamed from health/health.d/dns_query.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/dnsmasq_dhcp.conf (renamed from health/health.d/dnsmasq_dhcp.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/docker.conf (renamed from health/health.d/docker.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/elasticsearch.conf (renamed from health/health.d/elasticsearch.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/exporting.conf (renamed from health/health.d/exporting.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/gearman.conf (renamed from health/health.d/gearman.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/geth.conf (renamed from health/health.d/geth.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/haproxy.conf (renamed from health/health.d/haproxy.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/hdfs.conf (renamed from health/health.d/hdfs.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ioping.conf (renamed from health/health.d/ioping.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ipfs.conf (renamed from health/health.d/ipfs.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ipmi.conf (renamed from health/health.d/ipmi.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/kubelet.conf (renamed from health/health.d/kubelet.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/linux_power_supply.conf (renamed from health/health.d/linux_power_supply.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/mdstat.conf (renamed from health/health.d/mdstat.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/memcached.conf (renamed from health/health.d/memcached.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ml.conf (renamed from health/health.d/ml.conf) | 7 | ||||
-rw-r--r-- | src/health/health.d/mysql.conf (renamed from health/health.d/mysql.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/nvme.conf (renamed from health/health.d/nvme.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/pihole.conf (renamed from health/health.d/pihole.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/plugin.conf (renamed from health/health.d/plugin.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/portcheck.conf (renamed from health/health.d/portcheck.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/processes.conf (renamed from health/health.d/processes.conf) | 1 | ||||
-rw-r--r-- | src/health/health.d/retroshare.conf (renamed from health/health.d/retroshare.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/riakkv.conf (renamed from health/health.d/riakkv.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/scaleio.conf (renamed from health/health.d/scaleio.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/synchronization.conf (renamed from health/health.d/synchronization.conf) | 1 | ||||
-rw-r--r-- | src/health/health.d/unbound.conf (renamed from health/health.d/unbound.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/vcsa.conf (renamed from health/health.d/vcsa.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/vernemq.conf (renamed from health/health.d/vernemq.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/web_log.conf (renamed from health/health.d/web_log.conf) | 0 |
79 files changed, 2 insertions, 2467 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf deleted file mode 100644 index 1f1840491..000000000 --- a/health/health.d/adaptec_raid.conf +++ /dev/null @@ -1,32 +0,0 @@ - -# logical device status check - - template: adaptec_raid_ld_status - on: adaptec_raid.ld_status - class: Errors - type: System -component: RAID - lookup: max -10s foreach * - units: bool - every: 10s - crit: $this > 0 - delay: down 5m multiplier 1.5 max 1h - summary: Adaptec raid logical device status - info: Logical device status is failed or degraded - to: sysadmin - -# physical device state check - - template: adaptec_raid_pd_state - on: adaptec_raid.pd_state - class: Errors - type: System -component: RAID - lookup: max -10s foreach * - units: bool - every: 10s - crit: $this > 0 - delay: down 5m multiplier 1.5 max 1h - summary: Adaptec raid physical device state - info: Physical device state is not online - to: sysadmin diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf deleted file mode 100644 index 269ae544b..000000000 --- a/health/health.d/anomalies.conf +++ /dev/null @@ -1,23 +0,0 @@ -# raise a warning alarm if an anomaly probability is consistently above 50% - - template: anomalies_anomaly_probabilities - on: anomalies.probability - class: Errors - type: Netdata -component: ML - lookup: average -2m foreach * - every: 1m - warn: $this > 50 - info: average anomaly probability over the last 2 minutes - -# raise a warning alarm if an anomaly flag is consistently firing - - template: anomalies_anomaly_flags - on: anomalies.anomaly - class: Errors - type: Netdata -component: ML - lookup: sum -2m foreach * - every: 1m - warn: $this > 10 - info: number of anomalies in the last 2 minutes diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf deleted file mode 100644 index b1c271df9..000000000 --- a/health/health.d/bind_rndc.conf +++ /dev/null @@ -1,12 +0,0 @@ - template: bind_rndc_stats_file_size - on: bind_rndc.stats_size - class: Utilization - type: DNS -component: BIND - units: megabytes - every: 60 - calc: $stats_size - warn: $this > 512 - summary: BIND statistics file size - info: BIND statistics-file size - to: sysadmin diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf deleted file mode 100644 index 9c55633ef..000000000 --- a/health/health.d/cgroups.conf +++ /dev/null @@ -1,72 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - template: cgroup_10min_cpu_usage - on: cgroup.cpu_limit - class: Utilization - type: Cgroups -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: Cgroup ${label:cgroup_name} CPU utilization - info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes - to: silent - - template: cgroup_ram_in_use - on: cgroup.mem_usage - class: Utilization - type: Cgroups -component: Memory - os: linux - hosts: * - calc: ($ram) * 100 / $memory_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Cgroup ${label:cgroup_name} memory utilization - info: Cgroup ${label:cgroup_name} memory utilization - to: silent - -# ---------------------------------K8s containers-------------------------------------------- - - template: k8s_cgroup_10min_cpu_usage - on: k8s.cgroup.cpu_limit - class: Utilization - type: Cgroups -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - delay: down 15m multiplier 1.5 max 1h - summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization - info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ - average CPU utilization over the last 10 minutes - to: silent - - template: k8s_cgroup_ram_in_use - on: k8s.cgroup.mem_usage - class: Utilization - type: Cgroups -component: Memory - os: linux - hosts: * - calc: ($ram) * 100 / $memory_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization - info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ - memory utilization - to: silent diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf deleted file mode 100644 index 0b007d6b4..000000000 --- a/health/health.d/cpu.conf +++ /dev/null @@ -1,69 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - template: 10min_cpu_usage - on: system.cpu - class: Utilization - type: System -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned of user,system,softirq,irq,guest - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: System CPU utilization - info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) - to: silent - - template: 10min_cpu_iowait - on: system.cpu - class: Utilization - type: System -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned of iowait - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (20) : (40)) - delay: up 30m down 30m multiplier 1.5 max 2h - summary: System CPU iowait time - info: Average CPU iowait time over the last 10 minutes - to: silent - - template: 20min_steal_cpu - on: system.cpu - class: Latency - type: System -component: CPU - os: linux - hosts: * - lookup: average -20m unaligned of steal - units: % - every: 5m - warn: $this > (($status >= $WARNING) ? (5) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System CPU steal time - info: Average CPU steal time over the last 20 minutes - to: silent - -## FreeBSD - template: 10min_cpu_usage - on: system.cpu - class: Utilization - type: System -component: CPU - os: freebsd - hosts: * - lookup: average -10m unaligned of user,system,interrupt - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: System CPU utilization - info: Average CPU utilization over the last 10 minutes (excluding nice) - to: silent diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf deleted file mode 100644 index 2e417fd4a..000000000 --- a/health/health.d/disks.conf +++ /dev/null @@ -1,172 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - -# ----------------------------------------------------------------------------- -# low disk space - -# checking the latest collected values -# raise an alarm if the disk is low on -# available disk space - - template: disk_space_usage - on: disk.space - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * -chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING ) ? (80) : (90)) - crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} space usage - info: Total space utilization of disk ${label:mount_point} - to: sysadmin - - template: disk_inode_usage - on: disk.inodes - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * -chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} inode usage - info: Total inode utilization of disk ${label:mount_point} - to: sysadmin - - -# ----------------------------------------------------------------------------- -# disk fill rate - -# calculate the rate the disk fills -# use as base, the available space change -# during the last hour - -# this is just a calculation - it has no alarm -# we will use it in the next template to find -# the hours remaining - -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour - -# calculate the hours remaining -# if the disk continues to fill -# in this rate - -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:mount_point} estimation of lack of space - info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour - to: silent - - -# ----------------------------------------------------------------------------- -# disk inode fill rate - -# calculate the rate the disk inodes are allocated -# use as base, the available inodes change -# during the last hour - -# this is just a calculation - it has no alarm -# we will use it in the next template to find -# the hours remaining - -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour - -# calculate the hours remaining -# if the disk inodes are allocated -# in this rate - -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:mount_point} estimation of lack of inodes - info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: silent - - -# ----------------------------------------------------------------------------- -# disk congestion - -# raise an alarm if the disk is congested -# by calculating the average disk utilization -# for the last 10 minutes - - template: 10min_disk_utilization - on: disk.util - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:device} utilization - info: Average percentage of time ${label:device} disk was busy over the last 10 minutes - to: silent - - -# raise an alarm if the disk backlog -# is above 1000ms (1s) per second -# for 10 minutes -# (i.e. the disk cannot catch up) - - template: 10min_disk_backlog - on: disk.backlog - class: Latency - type: System -component: Disk - os: linux - hosts: * - lookup: average -10m unaligned - units: ms - every: 1m - warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:device} backlog - info: Average backlog size of the ${label:device} disk over the last 10 minutes - to: silent diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf deleted file mode 100644 index be8b1fe4f..000000000 --- a/health/health.d/entropy.conf +++ /dev/null @@ -1,20 +0,0 @@ - -# check if entropy is too low -# the alarm is checked every 1 minute -# and examines the last hour of data - - alarm: lowest_entropy - on: system.entropy - class: Utilization - type: System -component: Cryptography - os: linux - hosts: * - lookup: min -5m unaligned - units: entries - every: 5m - warn: $this < (($status >= $WARNING) ? (200) : (100)) - delay: down 1h multiplier 1.5 max 2h - summary: System entropy pool number of entries - info: Minimum number of entries in the random numbers pool in the last 5 minutes - to: silent diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf deleted file mode 100644 index 20a592d6b..000000000 --- a/health/health.d/file_descriptors.conf +++ /dev/null @@ -1,33 +0,0 @@ - # you can disable an alarm notification by setting the 'to' line to: silent - - template: system_file_descriptors_utilization - on: system.file_nr_utilization - class: Utilization - type: System - component: Processes - hosts: * - lookup: max -1m unaligned - units: % - every: 1m - crit: $this > 90 - delay: down 15m multiplier 1.5 max 1h - summary: System open file descriptors utilization - info: System-wide utilization of open files - to: sysadmin - - template: apps_group_file_descriptors_utilization - on: app.fds_open_limit - class: Utilization - type: System -component: Process - os: linux - module: * - hosts: * - lookup: max -10s unaligned foreach * - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: App group ${label:app_group} file descriptors utilization - info: Open files percentage against the processes limits, among all PIDs in application group - to: sysadmin diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf deleted file mode 100644 index 7796a1bc8..000000000 --- a/health/health.d/go.d.plugin.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# make sure go.d.plugin data collection job is running - - template: go.d_job_last_collected_secs - on: netdata.go_plugin_execution_time - class: Errors - type: Netdata -component: go.d.plugin - module: !* * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: Go.d plugin last collection - info: Number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf deleted file mode 100644 index da5dec797..000000000 --- a/health/health.d/httpcheck.conf +++ /dev/null @@ -1,73 +0,0 @@ - -# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges - template: httpcheck_web_service_up - on: httpcheck.status - class: Utilization - type: Web Server -component: HTTP endpoint - lookup: average -1m unaligned percentage of success - calc: ($this < 75) ? (0) : ($this) - every: 5s - units: up/down - info: HTTP check endpoint ${label:url} liveness status - to: silent - - template: httpcheck_web_service_bad_content - on: httpcheck.status - class: Workload - type: Web Server -component: HTTP endpoint - lookup: average -5m unaligned percentage of bad_content - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} unexpected content - info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes - to: webmaster - - template: httpcheck_web_service_bad_status - on: httpcheck.status - class: Workload - type: Web Server -component: HTTP endpoint - lookup: average -5m unaligned percentage of bad_status - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} unexpected status - info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes - to: webmaster - - template: httpcheck_web_service_timeouts - on: httpcheck.status - class: Latency - type: Web Server -component: HTTP endpoint - lookup: average -5m unaligned percentage of timeout - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} timeouts - info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes - to: webmaster - - template: httpcheck_web_service_no_connection - on: httpcheck.status - class: Errors - type: Other -component: HTTP endpoint - lookup: average -5m unaligned percentage of no_connection - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} failed requests - info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes - to: webmaster diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf deleted file mode 100644 index f77f56065..000000000 --- a/health/health.d/ipc.conf +++ /dev/null @@ -1,34 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: semaphores_used - on: system.ipc_semaphores - class: Utilization - type: System -component: IPC - os: linux - hosts: * - calc: $semaphores * 100 / $ipc_semaphores_max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - delay: down 5m multiplier 1.5 max 1h - summary: IPC semaphores used - info: IPC semaphore utilization - to: sysadmin - - alarm: semaphore_arrays_used - on: system.ipc_semaphore_arrays - class: Utilization - type: System -component: IPC - os: linux - hosts: * - calc: $arrays * 100 / $ipc_semaphores_arrays_max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - delay: down 5m multiplier 1.5 max 1h - summary: IPC semaphore arrays used - info: IPC semaphore arrays utilization - to: sysadmin diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf deleted file mode 100644 index d1f93969a..000000000 --- a/health/health.d/isc_dhcpd.conf +++ /dev/null @@ -1,10 +0,0 @@ -# template: isc_dhcpd_leases_size -# on: isc_dhcpd.leases_total -# units: KB -# every: 60 -# calc: $leases_size -# warn: $this > 3072 -# crit: $this > 6144 -# delay: up 2m down 5m -# info: dhcpd.leases file too big! Module can slow down your server. -# to: sysadmin diff --git a/health/health.d/load.conf b/health/health.d/load.conf deleted file mode 100644 index fd8bf9396..000000000 --- a/health/health.d/load.conf +++ /dev/null @@ -1,72 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# Calculate the base trigger point for the load average alarms. -# This is the maximum number of CPU's in the system over the past 1 -# minute, with a special case for a single CPU of setting the trigger at 2. - alarm: load_cpu_number - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) ) - units: cpus - every: 1m - info: Number of active CPU cores in the system - -# Send alarms if the load average is unusually high. -# These intentionally _do not_ calculate the average over the sampled -# time period because the values being checked already are averages. - - alarm: load_average_15 - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - lookup: max -1m unaligned of load15 - calc: ($load_cpu_number == nan) ? (nan) : ($this) - units: load - every: 1m - warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) - delay: down 15m multiplier 1.5 max 1h - summary: Host load average (15 minutes) - info: System load average for the past 15 minutes - to: silent - - alarm: load_average_5 - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - lookup: max -1m unaligned of load5 - calc: ($load_cpu_number == nan) ? (nan) : ($this) - units: load - every: 1m - warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) - delay: down 15m multiplier 1.5 max 1h - summary: System load average (5 minutes) - info: System load average for the past 5 minutes - to: silent - - alarm: load_average_1 - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - lookup: max -1m unaligned of load1 - calc: ($load_cpu_number == nan) ? (nan) : ($this) - units: load - every: 1m - warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) - delay: down 15m multiplier 1.5 max 1h - summary: System load average (1 minute) - info: System load average for the past 1 minute - to: silent diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf deleted file mode 100644 index 118997a59..000000000 --- a/health/health.d/megacli.conf +++ /dev/null @@ -1,76 +0,0 @@ - -## Adapters (controllers) - - template: megacli_adapter_state - on: megacli.adapter_degraded - class: Errors - type: System -component: RAID - lookup: max -10s foreach * - units: boolean - every: 10s - crit: $this > 0 - delay: down 5m multiplier 2 max 10m - summary: MegaCLI adapter state - info: Adapter is in the degraded state (0: false, 1: true) - to: sysadmin - -## Physical Disks - - template: megacli_pd_predictive_failures - on: megacli.pd_predictive_failure - class: Errors - type: System -component: RAID - lookup: sum -10s foreach * - units: predictive failures - every: 10s - warn: $this > 0 - delay: up 1m down 5m multiplier 2 max 10m - summary: MegaCLI physical drive predictive failures - info: Number of physical drive predictive failures - to: sysadmin - - template: megacli_pd_media_errors - on: megacli.pd_media_error - class: Errors - type: System -component: RAID - lookup: sum -10s foreach * - units: media errors - every: 10s - warn: $this > 0 - delay: up 1m down 5m multiplier 2 max 10m - summary: MegaCLI physical drive errors - info: Number of physical drive media errors - to: sysadmin - -## Battery Backup Units (BBU) - - template: megacli_bbu_relative_charge - on: megacli.bbu_relative_charge - class: Workload - type: System -component: RAID - lookup: average -10s - units: percent - every: 10s - warn: $this <= (($status >= $WARNING) ? (85) : (80)) - crit: $this <= (($status == $CRITICAL) ? (50) : (40)) - summary: MegaCLI BBU charge state - info: Average battery backup unit (BBU) relative state of charge over the last 10 seconds - to: sysadmin - - template: megacli_bbu_cycle_count - on: megacli.bbu_cycle_count - class: Workload - type: System -component: RAID - lookup: average -10s - units: cycles - every: 10s - warn: $this >= 100 - crit: $this >= 500 - summary: MegaCLI BBU cycles count - info: Average battery backup unit (BBU) charge cycles count over the last 10 seconds - to: sysadmin diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf deleted file mode 100644 index 5ab3d2d92..000000000 --- a/health/health.d/memory.conf +++ /dev/null @@ -1,85 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 1hour_memory_hw_corrupted - on: mem.hwcorrupt - class: Errors - type: System -component: Memory - os: linux - hosts: * - calc: $HardwareCorrupted - units: MB - every: 10s - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System corrupted memory - info: Amount of memory corrupted due to a hardware failure - to: sysadmin - -## ECC Controller - - template: ecc_memory_mc_correctable - on: mem.edac_mc - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of correctable, correctable_noinfo - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory ${label:controller} correctable errors - info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes - to: sysadmin - - template: ecc_memory_mc_uncorrectable - on: mem.edac_mc - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo - units: errors - every: 1m - crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory ${label:controller} uncorrectable errors - info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes - to: sysadmin - -## ECC DIMM - - template: ecc_memory_dimm_correctable - on: mem.edac_mc_dimm - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of correctable - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory DIMM ${label:dimm} correctable errors - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes - to: sysadmin - - template: ecc_memory_dimm_uncorrectable - on: mem.edac_mc_dimm - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of uncorrectable - units: errors - every: 1m - crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory DIMM ${label:dimm} uncorrectable errors - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes - to: sysadmin diff --git a/health/health.d/net.conf b/health/health.d/net.conf deleted file mode 100644 index 2dfe6bbaf..000000000 --- a/health/health.d/net.conf +++ /dev/null @@ -1,258 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# net traffic overflow - - template: interface_speed - on: net.net - class: Latency - type: System -component: Network - os: * - hosts: * - calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan ) - units: Mbit - every: 10s - info: Network interface ${label:device} current speed - - template: 1m_received_traffic_overflow - on: net.net - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of received - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (85) : (90)) - delay: up 1m down 1m multiplier 1.5 max 1h - summary: System network interface ${label:device} inbound utilization - info: Average inbound utilization for the network interface ${label:device} over the last minute - to: silent - - template: 1m_sent_traffic_overflow - on: net.net - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of sent - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (85) : (90)) - delay: up 1m down 1m multiplier 1.5 max 1h - summary: System network interface ${label:device} outbound utilization - info: Average outbound utilization for the network interface ${label:device} over the last minute - to: silent - -# ----------------------------------------------------------------------------- -# dropped packets - -# check if an interface is dropping packets -# the alarm is checked every 1 minute -# and examines the last 10 minutes of data -# -# it is possible to have expected packet drops on an interface for some network configurations -# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information - - template: net_interface_inbound_packets - on: net.packets - class: Workload - type: System -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute of received - units: packets - every: 1m - summary: Network interface ${label:device} received packets - info: Received packets for the network interface ${label:device} in the last 10 minutes - - template: net_interface_outbound_packets - on: net.packets - class: Workload - type: System -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute of sent - units: packets - every: 1m - summary: Network interface ${label:device} sent packets - info: Sent packets for the network interface ${label:device} in the last 10 minutes - - template: inbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: * - hosts: * -chart labels: device=!wl* * - lookup: sum -10m unaligned absolute of inbound - calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 2 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} inbound drops - info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - - template: outbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: * - hosts: * -chart labels: device=!wl* * - lookup: sum -10m unaligned absolute of outbound - calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 2 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} outbound drops - info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - - template: wifi_inbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: linux - hosts: * -chart labels: device=wl* - lookup: sum -10m unaligned absolute of received - calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 10 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} inbound drops ratio - info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - - template: wifi_outbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: linux - hosts: * -chart labels: device=wl* - lookup: sum -10m unaligned absolute of sent - calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 10 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} outbound drops ratio - info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - -# ----------------------------------------------------------------------------- -# interface errors - - template: interface_inbound_errors - on: net.errors - class: Errors - type: System -component: Network - os: freebsd - hosts: * - lookup: sum -10m unaligned absolute of inbound - units: errors - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} inbound errors - info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes - to: silent - - template: interface_outbound_errors - on: net.errors - class: Errors - type: System -component: Network - os: freebsd - hosts: * - lookup: sum -10m unaligned absolute of outbound - units: errors - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} outbound errors - info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes - to: silent - -# ----------------------------------------------------------------------------- -# FIFO errors - -# check if an interface is having FIFO -# buffer errors -# the alarm is checked every 1 minute -# and examines the last 10 minutes of data - - template: 10min_fifo_errors - on: net.fifo - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: sum -10m unaligned absolute - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} FIFO errors - info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes - to: silent - -# ----------------------------------------------------------------------------- -# check for packet storms - -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: 1m_received_packets_rate - on: net.packets - class: Workload - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: Average number of packets received by the network interface ${label:device} over the last minute - - template: 10s_received_packets_storm - on: net.packets - class: Workload - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status == $CRITICAL)?(5000):(6000)) - options: no-clear-notification - summary: System network interface ${label:device} inbound packet storm - info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: silent diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf deleted file mode 100644 index 417105d43..000000000 --- a/health/health.d/netfilter.conf +++ /dev/null @@ -1,20 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: netfilter_conntrack_full - on: netfilter.conntrack_sockets - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: max -10s unaligned of connections - calc: $this * 100 / $netfilter_conntrack_max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (85) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (95)) - delay: down 5m multiplier 1.5 max 1h - summary: System Netfilter connection tracker utilization - info: Netfilter connection tracker table size utilization - to: sysadmin diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf deleted file mode 100644 index 0e434420d..000000000 --- a/health/health.d/ping.conf +++ /dev/null @@ -1,50 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: ping_host_reachable - on: ping.host_packet_loss - class: Errors - type: Other -component: Network - lookup: average -30s unaligned of loss - calc: $this != nan AND $this < 100 - units: up/down - every: 10s - crit: $this == 0 - delay: down 30m multiplier 1.5 max 2h - summary: Host ${label:host} ping status - info: Network host ${label:host} reachability status - to: sysadmin - - template: ping_packet_loss - on: ping.host_packet_loss - class: Errors - type: Other -component: Network - lookup: average -10m unaligned of loss - green: 5 - red: 10 - units: % - every: 10s - warn: $this > $green - crit: $this > $red - delay: down 30m multiplier 1.5 max 2h - summary: Host ${label:host} ping packet loss - info: Packet loss percentage to the network host ${label:host} over the last 10 minutes - to: sysadmin - - template: ping_host_latency - on: ping.host_rtt - class: Latency - type: Other -component: Network - lookup: average -10s unaligned of avg - units: ms - every: 10s - green: 500 - red: 1000 - warn: $this > $green OR $max > $red - crit: $this > $red - delay: down 30m multiplier 1.5 max 2h - summary: Host ${label:host} ping latency - info: Average latency to the network host ${label:host} over the last 10 seconds - to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf deleted file mode 100644 index de4c0078e..000000000 --- a/health/health.d/postgres.conf +++ /dev/null @@ -1,228 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: postgres_total_connection_utilization - on: postgres.connections_utilization - class: Utilization - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of used - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL connection utilization - info: Average total connection utilization over the last minute - to: dba - - template: postgres_acquired_locks_utilization - on: postgres.locks_utilization - class: Utilization - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of used - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (15) : (20)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL acquired locks utilization - info: Average acquired locks utilization over the last minute - to: dba - - template: postgres_txid_exhaustion_perc - on: postgres.txid_exhaustion_perc - class: Utilization - type: Database -component: PostgreSQL - hosts: * - calc: $txid_exhaustion - units: % - every: 1m - warn: $this > 90 - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL TXID exhaustion - info: Percent towards TXID wraparound - to: dba - -# Database alarms - - template: postgres_db_cache_io_ratio - on: postgres.db_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL DB ${label:database} cache hit ratio - info: Average cache hit ratio in db ${label:database} over the last minute - to: dba - - template: postgres_db_transactions_rollback_ratio - on: postgres.db_transactions_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -5m unaligned of rollback - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (2)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL DB ${label:database} aborted transactions - info: Average aborted transactions percentage in db ${label:database} over the last five minutes - to: dba - - template: postgres_db_deadlocks_rate - on: postgres.db_deadlocks_rate - class: Errors - type: Database -component: PostgreSQL - hosts: * - lookup: sum -1m unaligned of deadlocks - units: deadlocks - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL DB ${label:database} deadlocks rate - info: Number of deadlocks detected in db ${label:database} in the last minute - to: dba - -# Table alarms - - template: postgres_table_cache_io_ratio - on: postgres.table_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio - info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_index_cache_io_ratio - on: postgres.table_index_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio - info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_toast_cache_io_ratio - on: postgres.table_toast_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio - info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_toast_index_cache_io_ratio - on: postgres.table_toast_index_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio - info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_bloat_size_perc - on: postgres.table_bloat_size_perc - class: Errors - type: Database -component: PostgreSQL - hosts: * - calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (60) : (70)) - crit: $this > (($status == $CRITICAL) ? (70) : (80)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} bloat size - info: Bloat size percentage in db ${label:database} table ${label:table} - to: dba - - template: postgres_table_last_autovacuum_time - on: postgres.table_autovacuum_since_time - class: Errors - type: Database -component: PostgreSQL - hosts: !* - calc: $time - units: seconds - every: 1m - warn: $this != nan AND $this > (60 * 60 * 24 * 7) - summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum - info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon - to: dba - - template: postgres_table_last_autoanalyze_time - on: postgres.table_autoanalyze_since_time - class: Errors - type: Database -component: PostgreSQL - hosts: !* - calc: $time - units: seconds - every: 1m - warn: $this != nan AND $this > (60 * 60 * 24 * 7) - summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze - info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon - to: dba - -# Index alarms - - template: postgres_index_bloat_size_perc - on: postgres.index_bloat_size_perc - class: Errors - type: Database -component: PostgreSQL - hosts: * - calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (60) : (70)) - crit: $this > (($status == $CRITICAL) ? (70) : (80)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} index bloat size - info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index} - to: dba diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf deleted file mode 100644 index da27ad5b7..000000000 --- a/health/health.d/python.d.plugin.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# make sure python.d.plugin data collection job is running - - template: python.d_job_last_collected_secs - on: netdata.pythond_runtime - class: Errors - type: Netdata -component: python.d.plugin - module: !* * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: Python.d plugin last collection - info: Number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf deleted file mode 100644 index 970ea6363..000000000 --- a/health/health.d/qos.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# check if a QoS class is dropping packets -# the alarm is checked every 10 seconds -# and examines the last minute of data - -template: 10min_qos_packet_drops - on: tc.qos_dropped - os: linux - hosts: * - lookup: sum -5m unaligned absolute - every: 30s - warn: $this > 0 - units: packets - summary: QOS packet drops - info: Dropped packets in the last 5 minutes - to: silent diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf deleted file mode 100644 index 51f307ca6..000000000 --- a/health/health.d/ram.conf +++ /dev/null @@ -1,82 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: ram_in_use - on: system.ram - class: Utilization - type: System -component: Memory - os: linux - hosts: * - calc: $used * 100 / ($used + $cached + $free + $buffers) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: System memory utilization - info: System memory utilization - to: sysadmin - - alarm: ram_available - on: mem.available - class: Utilization - type: System -component: Memory - os: linux - hosts: * - calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? (15) : (10)) - delay: down 15m multiplier 1.5 max 1h - summary: System available memory - info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping - to: silent - - alarm: oom_kill - on: mem.oom_kill - os: linux - hosts: * - lookup: sum -30m unaligned - units: kills - every: 5m - warn: $this > 0 - delay: down 10m - summary: System OOM kills - info: Number of out of memory kills in the last 30 minutes - to: silent - -## FreeBSD - alarm: ram_in_use - on: system.ram - class: Utilization - type: System -component: Memory - os: freebsd - hosts: * - calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: System memory utilization - info: System memory utilization - to: sysadmin - - alarm: ram_available - on: mem.available - class: Utilization - type: System -component: Memory - os: freebsd - hosts: * - calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? (15) : (10)) - delay: down 15m multiplier 1.5 max 1h - summary: System available memory - info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping - to: silent diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf deleted file mode 100644 index 7c2945e68..000000000 --- a/health/health.d/redis.conf +++ /dev/null @@ -1,57 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: redis_connections_rejected - on: redis.connections - class: Errors - type: KV Storage -component: Redis - lookup: sum -1m unaligned of rejected - every: 10s - units: connections - warn: $this > 0 - summary: Redis rejected connections - info: Connections rejected because of maxclients limit in the last minute - delay: down 5m multiplier 1.5 max 1h - to: dba - - template: redis_bgsave_broken - on: redis.bgsave_health - class: Errors - type: KV Storage -component: Redis - every: 10s - crit: $last_bgsave != nan AND $last_bgsave != 0 - units: ok/failed - summary: Redis background save - info: Status of the last RDB save operation (0: ok, 1: error) - delay: down 5m multiplier 1.5 max 1h - to: dba - - template: redis_bgsave_slow - on: redis.bgsave_now - class: Latency - type: KV Storage -component: Redis - every: 10s - calc: $current_bgsave_time - warn: $this > 600 - crit: $this > 1200 - units: seconds - summary: Redis slow background save - info: Duration of the on-going RDB save operation - delay: down 5m multiplier 1.5 max 1h - to: dba - - template: redis_master_link_down - on: redis.master_link_down_since_time - class: Errors - type: KV Storage -component: Redis - every: 10s - calc: $time - units: seconds - crit: $this != nan AND $this > 0 - summary: Redis master link down - info: Time elapsed since the link between master and slave is down - delay: down 5m multiplier 1.5 max 1h - to: dba diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf deleted file mode 100644 index 8d7ba5661..000000000 --- a/health/health.d/softnet.conf +++ /dev/null @@ -1,57 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# check for common /proc/net/softnet_stat errors - - alarm: 1min_netdev_backlog_exceeded - on: system.softnet_stat - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of dropped - units: packets - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System netdev dropped packets - info: Average number of dropped packets in the last minute \ - due to exceeded net.core.netdev_max_backlog - to: silent - - alarm: 1min_netdev_budget_ran_outs - on: system.softnet_stat - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of squeezed - units: events - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System netdev budget run outs - info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ - net.core.netdev_budget_usecs with work remaining over the last minute \ - (this can be a cause for dropped packets) - to: silent - - alarm: 10min_netisr_backlog_exceeded - on: system.softnet_stat - class: Errors - type: System -component: Network - os: freebsd - hosts: * - lookup: average -1m unaligned absolute of qdrops - units: packets - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System netisr drops - info: Average number of drops in the last minute \ - due to exceeded sysctl net.route.netisr_maxqlen \ - (this can be a cause for dropped packets) - to: silent diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf deleted file mode 100644 index e39733996..000000000 --- a/health/health.d/swap.conf +++ /dev/null @@ -1,37 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 30min_ram_swapped_out - on: mem.swapio - class: Workload - type: System -component: Memory - os: linux freebsd - hosts: * - lookup: sum -30m unaligned absolute of out - # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 - calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - units: % of RAM - every: 1m - warn: $this > (($status >= $WARNING) ? (20) : (30)) - delay: down 15m multiplier 1.5 max 1h - summary: System memory swapped out - info: Percentage of the system RAM swapped in the last 30 minutes - to: silent - - alarm: used_swap - on: mem.swap - class: Utilization - type: System -component: Memory - os: linux freebsd - hosts: * - calc: (($used + $free) > 0) ? ($used * 100 / ($used + $free)) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: up 30s down 15m multiplier 1.5 max 1h - summary: System swap memory utilization - info: Swap memory utilization - to: sysadmin diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf deleted file mode 100644 index ad53a0e1c..000000000 --- a/health/health.d/systemdunits.conf +++ /dev/null @@ -1,161 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -## Service units - template: systemd_service_unit_failed_state - on: systemd.service_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd service unit in the failed state - to: sysadmin - -## Socket units - template: systemd_socket_unit_failed_state - on: systemd.socket_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd socket unit in the failed state - to: sysadmin - -## Target units - template: systemd_target_unit_failed_state - on: systemd.target_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd target unit in the failed state - to: sysadmin - -## Path units - template: systemd_path_unit_failed_state - on: systemd.path_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd path unit in the failed state - to: sysadmin - -## Device units - template: systemd_device_unit_failed_state - on: systemd.device_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd device unit in the failed state - to: sysadmin - -## Mount units - template: systemd_mount_unit_failed_state - on: systemd.mount_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd mount units in the failed state - to: sysadmin - -## Automount units - template: systemd_automount_unit_failed_state - on: systemd.automount_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd automount unit in the failed state - to: sysadmin - -## Swap units - template: systemd_swap_unit_failed_state - on: systemd.swap_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd swap units in the failed state - to: sysadmin - -## Scope units - template: systemd_scope_unit_failed_state - on: systemd.scope_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd scope units in the failed state - to: sysadmin - -## Slice units - template: systemd_slice_unit_failed_state - on: systemd.slice_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd slice units in the failed state - to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf deleted file mode 100644 index 2b2f97406..000000000 --- a/health/health.d/tcp_conn.conf +++ /dev/null @@ -1,23 +0,0 @@ - -# -# ${tcp_max_connections} may be nan or -1 if the system -# supports dynamic threshold for TCP connections. -# In this case, the alarm will always be zero. -# - - alarm: tcp_connections - on: ip.tcpsock - class: Workload - type: System -component: Network - os: linux - hosts: * - calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) - crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 )) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP connections utilization - info: IPv4 TCP connections utilization - to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf deleted file mode 100644 index 9d1104a51..000000000 --- a/health/health.d/tcp_listen.conf +++ /dev/null @@ -1,100 +0,0 @@ -# -# There are two queues involved when incoming TCP connections are handled -# (both at the kernel): -# -# SYN queue -# The SYN queue tracks TCP handshakes until connections are fully established. -# It overflows when too many incoming TCP connection requests hang in the -# half-open state and the server is not configured to fall back to SYN cookies. -# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends -# lots of SYN packets and never completes the handshakes). -# -# Accept queue -# The accept queue holds fully established TCP connections waiting to be handled -# by the listening application. It overflows when the server application fails -# to accept new connections at the rate they are coming in. -# -# -# ----------------------------------------------------------------------------- -# tcp accept queue (at the kernel) - - alarm: 1m_tcp_accept_queue_overflows - on: ip.tcp_accept_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of ListenOverflows - units: overflows - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (1) : (5)) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP accept queue overflows - info: Average number of overflows in the TCP accept queue over the last minute - to: silent - -# THIS IS TOO GENERIC -# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 - alarm: 1m_tcp_accept_queue_drops - on: ip.tcp_accept_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of ListenDrops - units: drops - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (1) : (5)) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP accept queue dropped packets - info: Average number of dropped packets in the TCP accept queue over the last minute - to: silent - - -# ----------------------------------------------------------------------------- -# tcp SYN queue (at the kernel) - -# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or -# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are -# enabled or not. In both cases this probably indicates a SYN flood attack, -# so i guess a notification should be sent. - - alarm: 1m_tcp_syn_queue_drops - on: ip.tcp_syn_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of TCPReqQFullDrop - units: drops - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (5)) - delay: up 10 down 5m multiplier 1.5 max 1h - summary: System TCP SYN queue drops - info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ - (SYN cookies were not enabled) - to: silent - - alarm: 1m_tcp_syn_queue_cookies - on: ip.tcp_syn_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of TCPReqQFullDoCookies - units: cookies - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (5)) - delay: up 10 down 5m multiplier 1.5 max 1h - summary: System TCP SYN queue cookies - info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute - to: silent - diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf deleted file mode 100644 index 4e422ec1c..000000000 --- a/health/health.d/tcp_mem.conf +++ /dev/null @@ -1,24 +0,0 @@ -# -# check -# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html -# -# We give a warning when TCP is under memory pressure -# and a critical when TCP is 90% of its upper memory limit -# - - alarm: tcp_memory - on: ipv4.sockstat_tcp_mem - class: Utilization - type: System -component: Network - os: linux - hosts: * - calc: ${mem} * 100 / ${tcp_mem_high} - units: % - every: 10s - warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) - crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP memory utilization - info: TCP memory utilization - to: silent diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf deleted file mode 100644 index 8f665d50e..000000000 --- a/health/health.d/tcp_orphans.conf +++ /dev/null @@ -1,25 +0,0 @@ - -# -# check -# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html -# -# The kernel may penalize orphans by 2x or even 4x -# so we alarm warning at 25% and critical at 50% -# - - alarm: tcp_orphans - on: ipv4.sockstat_tcp_sockets - class: Errors - type: System -component: Network - os: linux - hosts: * - calc: ${orphan} * 100 / ${tcp_max_orphans} - units: % - every: 10s - warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) - crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 )) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP orphan sockets utilization - info: Orphan IPv4 TCP sockets utilization - to: silent diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf deleted file mode 100644 index 7c39db2db..000000000 --- a/health/health.d/tcp_resets.conf +++ /dev/null @@ -1,71 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# tcp resets this host sends - - alarm: 1m_ip_tcp_resets_sent - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m at -10s unaligned absolute of OutRsts - units: tcp resets/s - every: 10s - info: average number of sent TCP RESETS over the last minute - - alarm: 10s_ip_tcp_resets_sent - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -10s unaligned absolute of OutRsts - units: tcp resets/s - every: 10s - warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) - delay: up 20s down 60m multiplier 1.2 max 2h - options: no-clear-notification - summary: System TCP outbound resets - info: Average number of sent TCP RESETS over the last 10 seconds. \ - This can indicate a port scan, \ - or that a service running on this host has crashed. \ - Netdata will not send a clear notification for this alarm. - to: silent - -# ----------------------------------------------------------------------------- -# tcp resets this host receives - - alarm: 1m_ip_tcp_resets_received - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -1m at -10s unaligned absolute of AttemptFails - units: tcp resets/s - every: 10s - info: average number of received TCP RESETS over the last minute - - alarm: 10s_ip_tcp_resets_received - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -10s unaligned absolute of AttemptFails - units: tcp resets/s - every: 10s - warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) - delay: up 20s down 60m multiplier 1.2 max 2h - options: no-clear-notification - summary: System TCP inbound resets - info: average number of received TCP RESETS over the last 10 seconds. \ - This can be an indication that a service this host needs has crashed. \ - Netdata will not send a clear notification for this alarm. - to: silent diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf deleted file mode 100644 index 65c9628b5..000000000 --- a/health/health.d/timex.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# It can take several minutes before ntpd selects a server to synchronize with; -# try checking after 17 minutes (1024 seconds). - - alarm: system_clock_sync_state - on: system.clock_sync_state - os: linux - class: Errors - type: System -component: Clock - calc: $state - units: synchronization state - every: 10s - warn: $system.uptime.uptime > 17 * 60 AND $this == 0 - delay: down 5m - summary: System clock sync state - info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server - to: silent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf deleted file mode 100644 index dc0948403..000000000 --- a/health/health.d/udp_errors.conf +++ /dev/null @@ -1,40 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# UDP receive buffer errors - - alarm: 1m_ipv4_udp_receive_buffer_errors - on: ipv4.udperrors - class: Errors - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -1m unaligned absolute of RcvbufErrors - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - summary: System UDP receive buffer errors - info: Average number of UDP receive buffer errors over the last minute - delay: up 1m down 60m multiplier 1.2 max 2h - to: silent - -# ----------------------------------------------------------------------------- -# UDP send buffer errors - - alarm: 1m_ipv4_udp_send_buffer_errors - on: ipv4.udperrors - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of SndbufErrors - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - summary: System UDP send buffer errors - info: Average number of UDP send buffer errors over the last minute - delay: up 1m down 60m multiplier 1.2 max 2h - to: silent diff --git a/health/health.d/upsd.conf b/health/health.d/upsd.conf deleted file mode 100644 index 703a64881..000000000 --- a/health/health.d/upsd.conf +++ /dev/null @@ -1,50 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: upsd_10min_ups_load - on: upsd.ups_load - class: Utilization - type: Power Supply -component: UPS - os: * - hosts: * - lookup: average -10m unaligned of load - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 10m multiplier 1.5 max 1h - summary: UPS ${label:ups_name} load - info: UPS ${label:ups_name} average load over the last 10 minutes - to: sitemgr - - template: upsd_ups_battery_charge - on: upsd.ups_battery_charge - class: Errors - type: Power Supply -component: UPS - os: * - hosts: * - lookup: average -60s unaligned of charge - units: % - every: 60s - warn: $this < 75 - crit: $this < 40 - delay: down 10m multiplier 1.5 max 1h - summary: UPS ${label:ups_name} battery charge - info: UPS ${label:ups_name} average battery charge over the last minute - to: sitemgr - - template: upsd_ups_last_collected_secs - on: upsd.ups_load - class: Latency - type: Power Supply -component: UPS device - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: UPS ${label:ups_name} last collected - info: UPS ${label:ups_name} number of seconds since the last successful data collection - to: sitemgr diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf deleted file mode 100644 index b8ad9aee4..000000000 --- a/health/health.d/vsphere.conf +++ /dev/null @@ -1,70 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# -----------------------------------------------Virtual Machine-------------------------------------------------------- - - template: vsphere_vm_cpu_utilization - on: vsphere.vm_cpu_utilization - class: Utilization - type: Virtual Machine -component: CPU - hosts: * - lookup: average -10m unaligned match-names of used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere CPU utilization for VM ${label:vm} - info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: silent - - template: vsphere_vm_mem_utilization - on: vsphere.vm_mem_utilization - class: Utilization - type: Virtual Machine -component: Memory - hosts: * - calc: $used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere memory utilization for VM ${label:vm} - info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: silent - -# -----------------------------------------------ESXI host-------------------------------------------------------------- - - template: vsphere_host_cpu_utilization - on: vsphere.host_cpu_utilization - class: Utilization - type: Virtual Machine -component: CPU - hosts: * - lookup: average -10m unaligned match-names of used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere ESXi CPU utilization for host ${label:host} - info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: sysadmin - - template: vsphere_host_mem_utilization - on: vsphere.host_mem_utilization - class: Utilization - type: Virtual Machine -component: Memory - hosts: * - calc: $used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere ESXi Ram utilization for host ${label:host} - info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: sysadmin diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf deleted file mode 100644 index 0a328b592..000000000 --- a/health/health.d/whoisquery.conf +++ /dev/null @@ -1,14 +0,0 @@ - - template: whoisquery_days_until_expiration - on: whoisquery.time_until_expiration - class: Utilization - type: Other -component: WHOIS - calc: $expiry - units: seconds - every: 60s - warn: $this < $days_until_expiration_warning*24*60*60 - crit: $this < $days_until_expiration_critical*24*60*60 - summary: Whois expiration time for domain ${label:domain} - info: Time until the domain name registration for ${label:domain} expires - to: webmaster diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf deleted file mode 100644 index 706fcbf22..000000000 --- a/health/health.d/windows.conf +++ /dev/null @@ -1,126 +0,0 @@ - -## CPU - - template: windows_10min_cpu_usage - on: windows.cpu_utilization_total - class: Utilization - type: Windows -component: CPU - os: * - hosts: * - lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: CPU utilization - info: Average CPU utilization over the last 10 minutes - to: silent - - -## Memory - - template: windows_ram_in_use - on: windows.memory_utilization - class: Utilization - type: Windows -component: Memory - os: * - hosts: * - calc: ($used) * 100 / ($used + $available) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Ram utilization - info: Memory utilization - to: sysadmin - - -## Network - - template: windows_inbound_packets_discarded - on: windows.net_nic_discarded - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of inbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Inbound network packets discarded - info: Number of inbound discarded packets for the network interface in the last 10 minutes - to: silent - - template: windows_outbound_packets_discarded - on: windows.net_nic_discarded - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of outbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Outbound network packets discarded - info: Number of outbound discarded packets for the network interface in the last 10 minutes - to: silent - - template: windows_inbound_packets_errors - on: windows.net_nic_errors - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of inbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Inbound network errors - info: Number of inbound errors for the network interface in the last 10 minutes - to: silent - - template: windows_outbound_packets_errors - on: windows.net_nic_errors - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of outbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Outbound network errors - info: Number of outbound errors for the network interface in the last 10 minutes - to: silent - - -## Disk - - template: windows_disk_in_use - on: windows.logical_disk_space_usage - class: Utilization - type: Windows -component: Disk - os: * - hosts: * - calc: ($used) * 100 / ($used + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Disk space usage - info: Disk space utilization - to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf deleted file mode 100644 index d05f3ef0f..000000000 --- a/health/health.d/x509check.conf +++ /dev/null @@ -1,26 +0,0 @@ - - template: x509check_days_until_expiration - on: x509check.time_until_expiration - class: Latency - type: Certificates -component: x509 certificates - calc: $expiry - units: seconds - every: 60s - warn: $this < $days_until_expiration_warning*24*60*60 - crit: $this < $days_until_expiration_critical*24*60*60 - summary: x509 certificate expiration for ${label:source} - info: Time until x509 certificate expires for ${label:source} - to: webmaster - - template: x509check_revocation_status - on: x509check.revocation_status - class: Errors - type: Certificates -component: x509 certificates - calc: $revoked - every: 60s - crit: $this != nan AND $this != 0 - summary: x509 certificate revocation status for ${label:source} - info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source} - to: webmaster diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf deleted file mode 100644 index d2a561000..000000000 --- a/health/health.d/zfs.conf +++ /dev/null @@ -1,44 +0,0 @@ - - alarm: zfs_memory_throttle - on: zfs.memory_ops - class: Utilization - type: System -component: File system - lookup: sum -10m unaligned absolute of throttled - units: events - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - summary: ZFS ARC growth throttling - info: number of times ZFS had to limit the ARC growth in the last 10 minutes - to: silent - -# ZFS pool state - - template: zfs_pool_state_warn - on: zfspool.state - class: Errors - type: System -component: File system - calc: $degraded - units: boolean - every: 10s - warn: $this > 0 - delay: down 1m multiplier 1.5 max 1h - summary: ZFS pool ${label:pool} state - info: ZFS pool ${label:pool} state is degraded - to: sysadmin - - template: zfs_pool_state_crit - on: zfspool.state - class: Errors - type: System -component: File system - calc: $faulted + $unavail - units: boolean - every: 10s - crit: $this > 0 - delay: down 1m multiplier 1.5 max 1h - summary: Critical ZFS pool ${label:pool} state - info: ZFS pool ${label:pool} state is faulted or unavail - to: sysadmin diff --git a/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf index 90a72af19..5fd7aa112 100644 --- a/health/health.d/apcupsd.conf +++ b/src/health/health.d/apcupsd.conf @@ -5,8 +5,6 @@ class: Utilization type: Power Supply component: UPS - os: * - hosts: * lookup: average -10m unaligned of percentage units: % every: 1m @@ -23,8 +21,6 @@ component: UPS class: Errors type: Power Supply component: UPS - os: * - hosts: * lookup: average -60s unaligned of charge units: % every: 60s diff --git a/health/health.d/bcache.conf b/src/health/health.d/bcache.conf index 446173428..446173428 100644 --- a/health/health.d/bcache.conf +++ b/src/health/health.d/bcache.conf diff --git a/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf index 0d37f28e0..0d37f28e0 100644 --- a/health/health.d/beanstalkd.conf +++ b/src/health/health.d/beanstalkd.conf diff --git a/health/health.d/boinc.conf b/src/health/health.d/boinc.conf index 092a56845..6fd987de1 100644 --- a/health/health.d/boinc.conf +++ b/src/health/health.d/boinc.conf @@ -1,4 +1,4 @@ -# Alarms for various BOINC issues. +# you can disable an alarm notification by setting the 'to' line to: silent # Warn on any compute errors encountered. template: boinc_compute_errors @@ -6,8 +6,6 @@ class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of comperror units: tasks every: 1m @@ -23,8 +21,6 @@ component: BOINC class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of upload_failed units: tasks every: 1m @@ -40,8 +36,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of total units: tasks every: 1m @@ -57,8 +51,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of active calc: ($boinc_total_tasks >= 1) ? ($this) : (inf) units: tasks diff --git a/health/health.d/btrfs.conf b/src/health/health.d/btrfs.conf index 1557a5941..f43f600c0 100644 --- a/health/health.d/btrfs.conf +++ b/src/health/health.d/btrfs.conf @@ -1,11 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent template: btrfs_allocated on: btrfs.disk class: Utilization type: System component: File system - os: * - hosts: * calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) units: % every: 10s @@ -20,8 +19,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -37,8 +34,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: ($used + $reserved) * 100 / ($used + $free + $reserved) units: % every: 10s @@ -54,8 +49,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -71,8 +64,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of read_errs warn: $this > 0 @@ -86,8 +77,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of write_errs crit: $this > 0 @@ -101,8 +90,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of flush_errs crit: $this > 0 @@ -116,8 +103,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of corruption_errs warn: $this > 0 @@ -131,8 +116,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of generation_errs warn: $this > 0 diff --git a/health/health.d/ceph.conf b/src/health/health.d/ceph.conf index 44d351338..44d351338 100644 --- a/health/health.d/ceph.conf +++ b/src/health/health.d/ceph.conf diff --git a/health/health.d/cockroachdb.conf b/src/health/health.d/cockroachdb.conf index 60f178354..60f178354 100644 --- a/health/health.d/cockroachdb.conf +++ b/src/health/health.d/cockroachdb.conf diff --git a/health/health.d/consul.conf b/src/health/health.d/consul.conf index 8b414a26d..8b414a26d 100644 --- a/health/health.d/consul.conf +++ b/src/health/health.d/consul.conf diff --git a/health/health.d/dbengine.conf b/src/health/health.d/dbengine.conf index 0a70d2e8f..5585a9533 100644 --- a/health/health.d/dbengine.conf +++ b/src/health/health.d/dbengine.conf @@ -1,4 +1,3 @@ - # you can disable an alarm notification by setting the 'to' line to: silent alarm: 10min_dbengine_global_fs_errors @@ -6,8 +5,6 @@ class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of fs_errors units: errors every: 10s @@ -22,8 +19,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of io_errors units: errors every: 10s @@ -38,8 +33,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events units: errors every: 10s @@ -55,8 +48,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions units: pages every: 10s diff --git a/health/health.d/dns_query.conf b/src/health/health.d/dns_query.conf index 756c6a1b6..756c6a1b6 100644 --- a/health/health.d/dns_query.conf +++ b/src/health/health.d/dns_query.conf diff --git a/health/health.d/dnsmasq_dhcp.conf b/src/health/health.d/dnsmasq_dhcp.conf index f6ef01940..f6ef01940 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/src/health/health.d/dnsmasq_dhcp.conf diff --git a/health/health.d/docker.conf b/src/health/health.d/docker.conf index 668614d4d..668614d4d 100644 --- a/health/health.d/docker.conf +++ b/src/health/health.d/docker.conf diff --git a/health/health.d/elasticsearch.conf b/src/health/health.d/elasticsearch.conf index 600840c58..600840c58 100644 --- a/health/health.d/elasticsearch.conf +++ b/src/health/health.d/elasticsearch.conf diff --git a/health/health.d/exporting.conf b/src/health/health.d/exporting.conf index c0320193c..c0320193c 100644 --- a/health/health.d/exporting.conf +++ b/src/health/health.d/exporting.conf diff --git a/health/health.d/gearman.conf b/src/health/health.d/gearman.conf index 78e1165d1..78e1165d1 100644 --- a/health/health.d/gearman.conf +++ b/src/health/health.d/gearman.conf diff --git a/health/health.d/geth.conf b/src/health/health.d/geth.conf index 361b6b41f..361b6b41f 100644 --- a/health/health.d/geth.conf +++ b/src/health/health.d/geth.conf diff --git a/health/health.d/haproxy.conf b/src/health/health.d/haproxy.conf index 66a488fa4..66a488fa4 100644 --- a/health/health.d/haproxy.conf +++ b/src/health/health.d/haproxy.conf diff --git a/health/health.d/hdfs.conf b/src/health/health.d/hdfs.conf index 566e815aa..566e815aa 100644 --- a/health/health.d/hdfs.conf +++ b/src/health/health.d/hdfs.conf diff --git a/health/health.d/ioping.conf b/src/health/health.d/ioping.conf index 6d832bf00..6d832bf00 100644 --- a/health/health.d/ioping.conf +++ b/src/health/health.d/ioping.conf diff --git a/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf index 4dfee3c7f..4dfee3c7f 100644 --- a/health/health.d/ipfs.conf +++ b/src/health/health.d/ipfs.conf diff --git a/health/health.d/ipmi.conf b/src/health/health.d/ipmi.conf index cec2320a9..cec2320a9 100644 --- a/health/health.d/ipmi.conf +++ b/src/health/health.d/ipmi.conf diff --git a/health/health.d/kubelet.conf b/src/health/health.d/kubelet.conf index 8adf5f7d4..8adf5f7d4 100644 --- a/health/health.d/kubelet.conf +++ b/src/health/health.d/kubelet.conf diff --git a/health/health.d/linux_power_supply.conf b/src/health/health.d/linux_power_supply.conf index b0d35e752..b0d35e752 100644 --- a/health/health.d/linux_power_supply.conf +++ b/src/health/health.d/linux_power_supply.conf diff --git a/health/health.d/mdstat.conf b/src/health/health.d/mdstat.conf index 90f97d851..90f97d851 100644 --- a/health/health.d/mdstat.conf +++ b/src/health/health.d/mdstat.conf diff --git a/health/health.d/memcached.conf b/src/health/health.d/memcached.conf index 77ca0afa9..77ca0afa9 100644 --- a/health/health.d/memcached.conf +++ b/src/health/health.d/memcached.conf diff --git a/health/health.d/ml.conf b/src/health/health.d/ml.conf index aef9b0368..b6a5df6dd 100644 --- a/health/health.d/ml.conf +++ b/src/health/health.d/ml.conf @@ -13,8 +13,6 @@ class: Workload type: System component: ML - os: * - hosts: * lookup: average -1m of anomaly_rate calc: $this units: % @@ -29,8 +27,6 @@ component: ML # if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). # template: ml_5min_cpu_dims # on: system.cpu -# os: linux -# hosts: * # lookup: average -5m anomaly-bit foreach * # calc: $this # units: % @@ -44,8 +40,6 @@ component: ML # if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). # template: ml_5min_cpu_chart # on: system.cpu -# os: linux -# hosts: * # lookup: average -5m anomaly-bit of * # calc: $this # units: % @@ -53,4 +47,3 @@ component: ML # warn: $this > (($status >= $WARNING) ? (5) : (20)) # crit: $this > (($status == $CRITICAL) ? (20) : (100)) # info: rolling 5min anomaly rate for system.cpu chart - diff --git a/health/health.d/mysql.conf b/src/health/health.d/mysql.conf index 572560b4e..572560b4e 100644 --- a/health/health.d/mysql.conf +++ b/src/health/health.d/mysql.conf diff --git a/health/health.d/nvme.conf b/src/health/health.d/nvme.conf index aea402e88..aea402e88 100644 --- a/health/health.d/nvme.conf +++ b/src/health/health.d/nvme.conf diff --git a/health/health.d/pihole.conf b/src/health/health.d/pihole.conf index c4db835ce..c4db835ce 100644 --- a/health/health.d/pihole.conf +++ b/src/health/health.d/pihole.conf diff --git a/health/health.d/plugin.conf b/src/health/health.d/plugin.conf index 8615a0213..8615a0213 100644 --- a/health/health.d/plugin.conf +++ b/src/health/health.d/plugin.conf diff --git a/health/health.d/portcheck.conf b/src/health/health.d/portcheck.conf index 281731c86..281731c86 100644 --- a/health/health.d/portcheck.conf +++ b/src/health/health.d/portcheck.conf diff --git a/health/health.d/processes.conf b/src/health/health.d/processes.conf index 8f2e0fda5..2029c76e4 100644 --- a/health/health.d/processes.conf +++ b/src/health/health.d/processes.conf @@ -5,7 +5,6 @@ class: Workload type: System component: Processes - hosts: * calc: $active * 100 / $pidmax units: % every: 5s diff --git a/health/health.d/retroshare.conf b/src/health/health.d/retroshare.conf index c665430fa..c665430fa 100644 --- a/health/health.d/retroshare.conf +++ b/src/health/health.d/retroshare.conf diff --git a/health/health.d/riakkv.conf b/src/health/health.d/riakkv.conf index 677e3cb4f..677e3cb4f 100644 --- a/health/health.d/riakkv.conf +++ b/src/health/health.d/riakkv.conf diff --git a/health/health.d/scaleio.conf b/src/health/health.d/scaleio.conf index b089cb85e..b089cb85e 100644 --- a/health/health.d/scaleio.conf +++ b/src/health/health.d/scaleio.conf diff --git a/health/health.d/synchronization.conf b/src/health/health.d/synchronization.conf index 6c947d90b..28b1817ac 100644 --- a/health/health.d/synchronization.conf +++ b/src/health/health.d/synchronization.conf @@ -2,7 +2,6 @@ on: mem.sync lookup: sum -1m of sync units: calls - plugin: ebpf.plugin every: 1m warn: $this > 6 delay: up 1m down 10m multiplier 1.5 max 1h diff --git a/health/health.d/unbound.conf b/src/health/health.d/unbound.conf index 3c898f1d5..3c898f1d5 100644 --- a/health/health.d/unbound.conf +++ b/src/health/health.d/unbound.conf diff --git a/health/health.d/vcsa.conf b/src/health/health.d/vcsa.conf index 3e20bfd1e..3e20bfd1e 100644 --- a/health/health.d/vcsa.conf +++ b/src/health/health.d/vcsa.conf diff --git a/health/health.d/vernemq.conf b/src/health/health.d/vernemq.conf index 6ea9f99dc..6ea9f99dc 100644 --- a/health/health.d/vernemq.conf +++ b/src/health/health.d/vernemq.conf diff --git a/health/health.d/web_log.conf b/src/health/health.d/web_log.conf index 78f1cc7f5..78f1cc7f5 100644 --- a/health/health.d/web_log.conf +++ b/src/health/health.d/web_log.conf |