summaryrefslogtreecommitdiffstats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/health.d/anomalies.conf23
-rw-r--r--health/health.d/cgroups.conf72
-rw-r--r--health/health.d/cpu.conf69
-rw-r--r--health/health.d/disks.conf172
-rw-r--r--health/health.d/entropy.conf20
-rw-r--r--health/health.d/file_descriptors.conf33
-rw-r--r--health/health.d/go.d.plugin.conf18
-rw-r--r--health/health.d/httpcheck.conf73
-rw-r--r--health/health.d/ipc.conf34
-rw-r--r--health/health.d/load.conf72
-rw-r--r--health/health.d/memory.conf85
-rw-r--r--health/health.d/net.conf258
-rw-r--r--health/health.d/netfilter.conf20
-rw-r--r--health/health.d/postgres.conf228
-rw-r--r--health/health.d/python.d.plugin.conf18
-rw-r--r--health/health.d/qos.conf18
-rw-r--r--health/health.d/ram.conf82
-rw-r--r--health/health.d/redis.conf57
-rw-r--r--health/health.d/softnet.conf57
-rw-r--r--health/health.d/swap.conf37
-rw-r--r--health/health.d/systemdunits.conf161
-rw-r--r--health/health.d/tcp_conn.conf23
-rw-r--r--health/health.d/tcp_listen.conf100
-rw-r--r--health/health.d/tcp_mem.conf24
-rw-r--r--health/health.d/tcp_orphans.conf25
-rw-r--r--health/health.d/tcp_resets.conf71
-rw-r--r--health/health.d/timex.conf18
-rw-r--r--health/health.d/udp_errors.conf40
-rw-r--r--health/health.d/upsd.conf50
-rw-r--r--health/health.d/vsphere.conf70
-rw-r--r--health/health.d/windows.conf126
-rw-r--r--src/health/health.d/adaptec_raid.conf (renamed from health/health.d/adaptec_raid.conf)4
-rw-r--r--src/health/health.d/apcupsd.conf (renamed from health/health.d/apcupsd.conf)4
-rw-r--r--src/health/health.d/bcache.conf (renamed from health/health.d/bcache.conf)0
-rw-r--r--src/health/health.d/beanstalkd.conf (renamed from health/health.d/beanstalkd.conf)0
-rw-r--r--src/health/health.d/bind_rndc.conf (renamed from health/health.d/bind_rndc.conf)0
-rw-r--r--src/health/health.d/boinc.conf (renamed from health/health.d/boinc.conf)10
-rw-r--r--src/health/health.d/btrfs.conf (renamed from health/health.d/btrfs.conf)19
-rw-r--r--src/health/health.d/ceph.conf (renamed from health/health.d/ceph.conf)0
-rw-r--r--src/health/health.d/cockroachdb.conf (renamed from health/health.d/cockroachdb.conf)0
-rw-r--r--src/health/health.d/consul.conf (renamed from health/health.d/consul.conf)0
-rw-r--r--src/health/health.d/dbengine.conf (renamed from health/health.d/dbengine.conf)9
-rw-r--r--src/health/health.d/dns_query.conf (renamed from health/health.d/dns_query.conf)0
-rw-r--r--src/health/health.d/dnsmasq_dhcp.conf (renamed from health/health.d/dnsmasq_dhcp.conf)0
-rw-r--r--src/health/health.d/docker.conf (renamed from health/health.d/docker.conf)0
-rw-r--r--src/health/health.d/elasticsearch.conf (renamed from health/health.d/elasticsearch.conf)0
-rw-r--r--src/health/health.d/exporting.conf (renamed from health/health.d/exporting.conf)0
-rw-r--r--src/health/health.d/gearman.conf (renamed from health/health.d/gearman.conf)0
-rw-r--r--src/health/health.d/geth.conf (renamed from health/health.d/geth.conf)0
-rw-r--r--src/health/health.d/haproxy.conf (renamed from health/health.d/haproxy.conf)0
-rw-r--r--src/health/health.d/hdfs.conf (renamed from health/health.d/hdfs.conf)0
-rw-r--r--src/health/health.d/ioping.conf (renamed from health/health.d/ioping.conf)0
-rw-r--r--src/health/health.d/ipfs.conf (renamed from health/health.d/ipfs.conf)0
-rw-r--r--src/health/health.d/ipmi.conf (renamed from health/health.d/ipmi.conf)0
-rw-r--r--src/health/health.d/isc_dhcpd.conf (renamed from health/health.d/isc_dhcpd.conf)0
-rw-r--r--src/health/health.d/kubelet.conf (renamed from health/health.d/kubelet.conf)0
-rw-r--r--src/health/health.d/linux_power_supply.conf (renamed from health/health.d/linux_power_supply.conf)0
-rw-r--r--src/health/health.d/mdstat.conf (renamed from health/health.d/mdstat.conf)0
-rw-r--r--src/health/health.d/megacli.conf (renamed from health/health.d/megacli.conf)6
-rw-r--r--src/health/health.d/memcached.conf (renamed from health/health.d/memcached.conf)0
-rw-r--r--src/health/health.d/ml.conf (renamed from health/health.d/ml.conf)7
-rw-r--r--src/health/health.d/mysql.conf (renamed from health/health.d/mysql.conf)0
-rw-r--r--src/health/health.d/nvme.conf (renamed from health/health.d/nvme.conf)0
-rw-r--r--src/health/health.d/pihole.conf (renamed from health/health.d/pihole.conf)0
-rw-r--r--src/health/health.d/ping.conf (renamed from health/health.d/ping.conf)0
-rw-r--r--src/health/health.d/plugin.conf (renamed from health/health.d/plugin.conf)0
-rw-r--r--src/health/health.d/portcheck.conf (renamed from health/health.d/portcheck.conf)0
-rw-r--r--src/health/health.d/processes.conf (renamed from health/health.d/processes.conf)1
-rw-r--r--src/health/health.d/retroshare.conf (renamed from health/health.d/retroshare.conf)0
-rw-r--r--src/health/health.d/riakkv.conf (renamed from health/health.d/riakkv.conf)0
-rw-r--r--src/health/health.d/scaleio.conf (renamed from health/health.d/scaleio.conf)0
-rw-r--r--src/health/health.d/synchronization.conf (renamed from health/health.d/synchronization.conf)1
-rw-r--r--src/health/health.d/unbound.conf (renamed from health/health.d/unbound.conf)0
-rw-r--r--src/health/health.d/vcsa.conf (renamed from health/health.d/vcsa.conf)0
-rw-r--r--src/health/health.d/vernemq.conf (renamed from health/health.d/vernemq.conf)0
-rw-r--r--src/health/health.d/web_log.conf (renamed from health/health.d/web_log.conf)0
-rw-r--r--src/health/health.d/whoisquery.conf (renamed from health/health.d/whoisquery.conf)0
-rw-r--r--src/health/health.d/x509check.conf (renamed from health/health.d/x509check.conf)0
-rw-r--r--src/health/health.d/zfs.conf (renamed from health/health.d/zfs.conf)0
79 files changed, 7 insertions, 2208 deletions
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
deleted file mode 100644
index 269ae544b..000000000
--- a/health/health.d/anomalies.conf
+++ /dev/null
@@ -1,23 +0,0 @@
-# raise a warning alarm if an anomaly probability is consistently above 50%
-
- template: anomalies_anomaly_probabilities
- on: anomalies.probability
- class: Errors
- type: Netdata
-component: ML
- lookup: average -2m foreach *
- every: 1m
- warn: $this > 50
- info: average anomaly probability over the last 2 minutes
-
-# raise a warning alarm if an anomaly flag is consistently firing
-
- template: anomalies_anomaly_flags
- on: anomalies.anomaly
- class: Errors
- type: Netdata
-component: ML
- lookup: sum -2m foreach *
- every: 1m
- warn: $this > 10
- info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
deleted file mode 100644
index 9c55633ef..000000000
--- a/health/health.d/cgroups.conf
+++ /dev/null
@@ -1,72 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: cgroup_10min_cpu_usage
- on: cgroup.cpu_limit
- class: Utilization
- type: Cgroups
-component: CPU
- os: linux
- hosts: *
- lookup: average -10m unaligned
- units: %
- every: 1m
- warn: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: Cgroup ${label:cgroup_name} CPU utilization
- info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes
- to: silent
-
- template: cgroup_ram_in_use
- on: cgroup.mem_usage
- class: Utilization
- type: Cgroups
-component: Memory
- os: linux
- hosts: *
- calc: ($ram) * 100 / $memory_limit
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: Cgroup ${label:cgroup_name} memory utilization
- info: Cgroup ${label:cgroup_name} memory utilization
- to: silent
-
-# ---------------------------------K8s containers--------------------------------------------
-
- template: k8s_cgroup_10min_cpu_usage
- on: k8s.cgroup.cpu_limit
- class: Utilization
- type: Cgroups
-component: CPU
- os: linux
- hosts: *
- lookup: average -10m unaligned
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- delay: down 15m multiplier 1.5 max 1h
- summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization
- info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
- average CPU utilization over the last 10 minutes
- to: silent
-
- template: k8s_cgroup_ram_in_use
- on: k8s.cgroup.mem_usage
- class: Utilization
- type: Cgroups
-component: Memory
- os: linux
- hosts: *
- calc: ($ram) * 100 / $memory_limit
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization
- info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
- memory utilization
- to: silent
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
deleted file mode 100644
index 0b007d6b4..000000000
--- a/health/health.d/cpu.conf
+++ /dev/null
@@ -1,69 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: 10min_cpu_usage
- on: system.cpu
- class: Utilization
- type: System
-component: CPU
- os: linux
- hosts: *
- lookup: average -10m unaligned of user,system,softirq,irq,guest
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: System CPU utilization
- info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
- to: silent
-
- template: 10min_cpu_iowait
- on: system.cpu
- class: Utilization
- type: System
-component: CPU
- os: linux
- hosts: *
- lookup: average -10m unaligned of iowait
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (20) : (40))
- delay: up 30m down 30m multiplier 1.5 max 2h
- summary: System CPU iowait time
- info: Average CPU iowait time over the last 10 minutes
- to: silent
-
- template: 20min_steal_cpu
- on: system.cpu
- class: Latency
- type: System
-component: CPU
- os: linux
- hosts: *
- lookup: average -20m unaligned of steal
- units: %
- every: 5m
- warn: $this > (($status >= $WARNING) ? (5) : (10))
- delay: down 1h multiplier 1.5 max 2h
- summary: System CPU steal time
- info: Average CPU steal time over the last 20 minutes
- to: silent
-
-## FreeBSD
- template: 10min_cpu_usage
- on: system.cpu
- class: Utilization
- type: System
-component: CPU
- os: freebsd
- hosts: *
- lookup: average -10m unaligned of user,system,interrupt
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: System CPU utilization
- info: Average CPU utilization over the last 10 minutes (excluding nice)
- to: silent
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
deleted file mode 100644
index 2e417fd4a..000000000
--- a/health/health.d/disks.conf
+++ /dev/null
@@ -1,172 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-
-# -----------------------------------------------------------------------------
-# low disk space
-
-# checking the latest collected values
-# raise an alarm if the disk is low on
-# available disk space
-
- template: disk_space_usage
- on: disk.space
- class: Utilization
- type: System
-component: Disk
- os: linux freebsd
- hosts: *
-chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
- calc: $used * 100 / ($avail + $used)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING ) ? (80) : (90))
- crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
- delay: up 1m down 15m multiplier 1.5 max 1h
- summary: Disk ${label:mount_point} space usage
- info: Total space utilization of disk ${label:mount_point}
- to: sysadmin
-
- template: disk_inode_usage
- on: disk.inodes
- class: Utilization
- type: System
-component: Disk
- os: linux freebsd
- hosts: *
-chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
- calc: $used * 100 / ($avail + $used)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: up 1m down 15m multiplier 1.5 max 1h
- summary: Disk ${label:mount_point} inode usage
- info: Total inode utilization of disk ${label:mount_point}
- to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk fill rate
-
-# calculate the rate the disk fills
-# use as base, the available space change
-# during the last hour
-
-# this is just a calculation - it has no alarm
-# we will use it in the next template to find
-# the hours remaining
-
-template: disk_fill_rate
- on: disk.space
- os: linux freebsd
- hosts: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: GB/hour
- info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
-
-# calculate the hours remaining
-# if the disk continues to fill
-# at this rate
-
-template: out_of_disk_space_time
- on: disk.space
- os: linux freebsd
- hosts: *
- calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- summary: Disk ${label:mount_point} estimation of lack of space
- info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour
- to: silent
-
-
-# -----------------------------------------------------------------------------
-# disk inode fill rate
-
-# calculate the rate the disk inodes are allocated
-# use as base, the available inodes change
-# during the last hour
-
-# this is just a calculation - it has no alarm
-# we will use it in the next template to find
-# the hours remaining
-
-template: disk_inode_rate
- on: disk.inodes
- os: linux freebsd
- hosts: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: inodes/hour
- info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
-
-# calculate the hours remaining
-# if the disk inodes are allocated
-# at this rate
-
-template: out_of_disk_inodes_time
- on: disk.inodes
- os: linux freebsd
- hosts: *
- calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- summary: Disk ${label:mount_point} estimation of lack of inodes
- info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
- to: silent
-
-
-# -----------------------------------------------------------------------------
-# disk congestion
-
-# raise an alarm if the disk is congested
-# by calculating the average disk utilization
-# for the last 10 minutes
-
- template: 10min_disk_utilization
- on: disk.util
- class: Utilization
- type: System
-component: Disk
- os: linux freebsd
- hosts: *
- lookup: average -10m unaligned
- units: %
- every: 1m
- warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
- delay: down 15m multiplier 1.2 max 1h
- summary: Disk ${label:device} utilization
- info: Average percentage of time ${label:device} disk was busy over the last 10 minutes
- to: silent
-
-
-# raise an alarm if the disk backlog
-# is above 5000ms (5s) per second
-# for 10 minutes
-# (i.e. the disk cannot catch up)
-
- template: 10min_disk_backlog
- on: disk.backlog
- class: Latency
- type: System
-component: Disk
- os: linux
- hosts: *
- lookup: average -10m unaligned
- units: ms
- every: 1m
- warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1))
- delay: down 15m multiplier 1.2 max 1h
- summary: Disk ${label:device} backlog
- info: Average backlog size of the ${label:device} disk over the last 10 minutes
- to: silent
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
deleted file mode 100644
index be8b1fe4f..000000000
--- a/health/health.d/entropy.conf
+++ /dev/null
@@ -1,20 +0,0 @@
-
-# check if entropy is too low
-# the alarm is checked every 5 minutes
-# and examines the last 5 minutes of data
-
- alarm: lowest_entropy
- on: system.entropy
- class: Utilization
- type: System
-component: Cryptography
- os: linux
- hosts: *
- lookup: min -5m unaligned
- units: entries
- every: 5m
- warn: $this < (($status >= $WARNING) ? (200) : (100))
- delay: down 1h multiplier 1.5 max 2h
- summary: System entropy pool number of entries
- info: Minimum number of entries in the random numbers pool in the last 5 minutes
- to: silent
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
deleted file mode 100644
index 20a592d6b..000000000
--- a/health/health.d/file_descriptors.conf
+++ /dev/null
@@ -1,33 +0,0 @@
- # you can disable an alarm notification by setting the 'to' line to: silent
-
- template: system_file_descriptors_utilization
- on: system.file_nr_utilization
- class: Utilization
- type: System
- component: Processes
- hosts: *
- lookup: max -1m unaligned
- units: %
- every: 1m
- crit: $this > 90
- delay: down 15m multiplier 1.5 max 1h
- summary: System open file descriptors utilization
- info: System-wide utilization of open files
- to: sysadmin
-
- template: apps_group_file_descriptors_utilization
- on: app.fds_open_limit
- class: Utilization
- type: System
-component: Process
- os: linux
- module: *
- hosts: *
- lookup: max -10s unaligned foreach *
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: App group ${label:app_group} file descriptors utilization
- info: Open files percentage against the processes limits, among all PIDs in application group
- to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
deleted file mode 100644
index 7796a1bc8..000000000
--- a/health/health.d/go.d.plugin.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# make sure go.d.plugin data collection job is running
-
- template: go.d_job_last_collected_secs
- on: netdata.go_plugin_execution_time
- class: Errors
- type: Netdata
-component: go.d.plugin
- module: !* *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- summary: Go.d plugin last collection
- info: Number of seconds since the last successful data collection
- to: webmaster
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
deleted file mode 100644
index da5dec797..000000000
--- a/health/health.d/httpcheck.conf
+++ /dev/null
@@ -1,73 +0,0 @@
-
-# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
- template: httpcheck_web_service_up
- on: httpcheck.status
- class: Utilization
- type: Web Server
-component: HTTP endpoint
- lookup: average -1m unaligned percentage of success
- calc: ($this < 75) ? (0) : ($this)
- every: 5s
- units: up/down
- info: HTTP check endpoint ${label:url} liveness status
- to: silent
-
- template: httpcheck_web_service_bad_content
- on: httpcheck.status
- class: Workload
- type: Web Server
-component: HTTP endpoint
- lookup: average -5m unaligned percentage of bad_content
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- summary: HTTP check for ${label:url} unexpected content
- info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes
- to: webmaster
-
- template: httpcheck_web_service_bad_status
- on: httpcheck.status
- class: Workload
- type: Web Server
-component: HTTP endpoint
- lookup: average -5m unaligned percentage of bad_status
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- summary: HTTP check for ${label:url} unexpected status
- info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes
- to: webmaster
-
- template: httpcheck_web_service_timeouts
- on: httpcheck.status
- class: Latency
- type: Web Server
-component: HTTP endpoint
- lookup: average -5m unaligned percentage of timeout
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- summary: HTTP check for ${label:url} timeouts
- info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes
- to: webmaster
-
- template: httpcheck_web_service_no_connection
- on: httpcheck.status
- class: Errors
- type: Other
-component: HTTP endpoint
- lookup: average -5m unaligned percentage of no_connection
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- summary: HTTP check for ${label:url} failed requests
- info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes
- to: webmaster
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
deleted file mode 100644
index f77f56065..000000000
--- a/health/health.d/ipc.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: semaphores_used
- on: system.ipc_semaphores
- class: Utilization
- type: System
-component: IPC
- os: linux
- hosts: *
- calc: $semaphores * 100 / $ipc_semaphores_max
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- delay: down 5m multiplier 1.5 max 1h
- summary: IPC semaphores used
- info: IPC semaphore utilization
- to: sysadmin
-
- alarm: semaphore_arrays_used
- on: system.ipc_semaphore_arrays
- class: Utilization
- type: System
-component: IPC
- os: linux
- hosts: *
- calc: $arrays * 100 / $ipc_semaphores_arrays_max
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- delay: down 5m multiplier 1.5 max 1h
- summary: IPC semaphore arrays used
- info: IPC semaphore arrays utilization
- to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
deleted file mode 100644
index fd8bf9396..000000000
--- a/health/health.d/load.conf
+++ /dev/null
@@ -1,72 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# Calculate the base trigger point for the load average alarms.
-# This is the maximum number of CPUs in the system over the past 1
-# minute, with a special case for a single CPU of setting the trigger at 2.
- alarm: load_cpu_number
- on: system.load
- class: Utilization
- type: System
-component: Load
- os: linux
- hosts: *
- calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) )
- units: cpus
- every: 1m
- info: Number of active CPU cores in the system
-
-# Send alarms if the load average is unusually high.
-# These intentionally _do not_ calculate the average over the sampled
-# time period because the values being checked already are averages.
-
- alarm: load_average_15
- on: system.load
- class: Utilization
- type: System
-component: Load
- os: linux
- hosts: *
- lookup: max -1m unaligned of load15
- calc: ($load_cpu_number == nan) ? (nan) : ($this)
- units: load
- every: 1m
- warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
- delay: down 15m multiplier 1.5 max 1h
- summary: Host load average (15 minutes)
- info: System load average for the past 15 minutes
- to: silent
-
- alarm: load_average_5
- on: system.load
- class: Utilization
- type: System
-component: Load
- os: linux
- hosts: *
- lookup: max -1m unaligned of load5
- calc: ($load_cpu_number == nan) ? (nan) : ($this)
- units: load
- every: 1m
- warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
- delay: down 15m multiplier 1.5 max 1h
- summary: System load average (5 minutes)
- info: System load average for the past 5 minutes
- to: silent
-
- alarm: load_average_1
- on: system.load
- class: Utilization
- type: System
-component: Load
- os: linux
- hosts: *
- lookup: max -1m unaligned of load1
- calc: ($load_cpu_number == nan) ? (nan) : ($this)
- units: load
- every: 1m
- warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
- delay: down 15m multiplier 1.5 max 1h
- summary: System load average (1 minute)
- info: System load average for the past 1 minute
- to: silent
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
deleted file mode 100644
index 5ab3d2d92..000000000
--- a/health/health.d/memory.conf
+++ /dev/null
@@ -1,85 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: 1hour_memory_hw_corrupted
- on: mem.hwcorrupt
- class: Errors
- type: System
-component: Memory
- os: linux
- hosts: *
- calc: $HardwareCorrupted
- units: MB
- every: 10s
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- summary: System corrupted memory
- info: Amount of memory corrupted due to a hardware failure
- to: sysadmin
-
-## ECC Controller
-
- template: ecc_memory_mc_correctable
- on: mem.edac_mc
- class: Errors
- type: System
-component: Memory
- os: linux
- hosts: *
- lookup: sum -10m unaligned of correctable, correctable_noinfo
- units: errors
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- summary: System ECC memory ${label:controller} correctable errors
- info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes
- to: sysadmin
-
- template: ecc_memory_mc_uncorrectable
- on: mem.edac_mc
- class: Errors
- type: System
-component: Memory
- os: linux
- hosts: *
- lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
- units: errors
- every: 1m
- crit: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- summary: System ECC memory ${label:controller} uncorrectable errors
- info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
- to: sysadmin
-
-## ECC DIMM
-
- template: ecc_memory_dimm_correctable
- on: mem.edac_mc_dimm
- class: Errors
- type: System
-component: Memory
- os: linux
- hosts: *
- lookup: sum -10m unaligned of correctable
- units: errors
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- summary: System ECC memory DIMM ${label:dimm} correctable errors
- info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
- to: sysadmin
-
- template: ecc_memory_dimm_uncorrectable
- on: mem.edac_mc_dimm
- class: Errors
- type: System
-component: Memory
- os: linux
- hosts: *
- lookup: sum -10m unaligned of uncorrectable
- units: errors
- every: 1m
- crit: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- summary: System ECC memory DIMM ${label:dimm} uncorrectable errors
- info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
- to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
deleted file mode 100644
index 2dfe6bbaf..000000000
--- a/health/health.d/net.conf
+++ /dev/null
@@ -1,258 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-# net traffic overflow
-
- template: interface_speed
- on: net.net
- class: Latency
- type: System
-component: Network
- os: *
- hosts: *
- calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan )
- units: Mbit
- every: 10s
- info: Network interface ${label:device} current speed
-
- template: 1m_received_traffic_overflow
- on: net.net
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -1m unaligned absolute of received
- calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (85) : (90))
- delay: up 1m down 1m multiplier 1.5 max 1h
- summary: System network interface ${label:device} inbound utilization
- info: Average inbound utilization for the network interface ${label:device} over the last minute
- to: silent
-
- template: 1m_sent_traffic_overflow
- on: net.net
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -1m unaligned absolute of sent
- calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (85) : (90))
- delay: up 1m down 1m multiplier 1.5 max 1h
- summary: System network interface ${label:device} outbound utilization
- info: Average outbound utilization for the network interface ${label:device} over the last minute
- to: silent
-
-# -----------------------------------------------------------------------------
-# dropped packets
-
-# check if an interface is dropping packets
-# the alarm is checked every 1 minute
-# and examines the last 10 minutes of data
-#
-# it is possible to have expected packet drops on an interface for some network configurations
-# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information
-
- template: net_interface_inbound_packets
- on: net.packets
- class: Workload
- type: System
-component: Network
- os: *
- hosts: *
- lookup: sum -10m unaligned absolute of received
- units: packets
- every: 1m
- summary: Network interface ${label:device} received packets
- info: Received packets for the network interface ${label:device} in the last 10 minutes
-
- template: net_interface_outbound_packets
- on: net.packets
- class: Workload
- type: System
-component: Network
- os: *
- hosts: *
- lookup: sum -10m unaligned absolute of sent
- units: packets
- every: 1m
- summary: Network interface ${label:device} sent packets
- info: Sent packets for the network interface ${label:device} in the last 10 minutes
-
- template: inbound_packets_dropped_ratio
- on: net.drops
- class: Errors
- type: System
-component: Network
- os: *
- hosts: *
-chart labels: device=!wl* *
- lookup: sum -10m unaligned absolute of inbound
- calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0))
- units: %
- every: 1m
- warn: $this >= 2
- delay: up 1m down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} inbound drops
- info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: silent
-
- template: outbound_packets_dropped_ratio
- on: net.drops
- class: Errors
- type: System
-component: Network
- os: *
- hosts: *
-chart labels: device=!wl* *
- lookup: sum -10m unaligned absolute of outbound
- calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0))
- units: %
- every: 1m
- warn: $this >= 2
- delay: up 1m down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} outbound drops
- info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: silent
-
- template: wifi_inbound_packets_dropped_ratio
- on: net.drops
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
-chart labels: device=wl*
- lookup: sum -10m unaligned absolute of received
- calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0))
- units: %
- every: 1m
- warn: $this >= 10
- delay: up 1m down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} inbound drops ratio
- info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: silent
-
- template: wifi_outbound_packets_dropped_ratio
- on: net.drops
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
-chart labels: device=wl*
- lookup: sum -10m unaligned absolute of sent
- calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0))
- units: %
- every: 1m
- warn: $this >= 10
- delay: up 1m down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} outbound drops ratio
- info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: silent
-
-# -----------------------------------------------------------------------------
-# interface errors
-
- template: interface_inbound_errors
- on: net.errors
- class: Errors
- type: System
-component: Network
- os: freebsd
- hosts: *
- lookup: sum -10m unaligned absolute of inbound
- units: errors
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} inbound errors
- info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes
- to: silent
-
- template: interface_outbound_errors
- on: net.errors
- class: Errors
- type: System
-component: Network
- os: freebsd
- hosts: *
- lookup: sum -10m unaligned absolute of outbound
- units: errors
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} outbound errors
- info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes
- to: silent
-
-# -----------------------------------------------------------------------------
-# FIFO errors
-
-# check if an interface is having FIFO
-# buffer errors
-# the alarm is checked every 1 minute
-# and examines the last 10 minutes of data
-
- template: 10min_fifo_errors
- on: net.fifo
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: sum -10m unaligned absolute
- units: errors
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 2h
- summary: System network interface ${label:device} FIFO errors
- info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes
- to: silent
-
-# -----------------------------------------------------------------------------
-# check for packet storms
-
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the latter is more than 50x (warning) or 60x (critical) the former
-# the 1m rate is floored at 1000 packets/s, so a storm needs an average of
-# more than 50000 packets/s over the last 10 seconds
-
- template: 1m_received_packets_rate
- on: net.packets
- class: Workload
- type: System
-component: Network
- os: linux freebsd
- hosts: *
- lookup: average -1m unaligned of received
- units: packets
- every: 10s
- info: Average number of packets received by the network interface ${label:device} over the last minute
-
- template: 10s_received_packets_storm
- on: net.packets
- class: Workload
- type: System
-component: Network
- os: linux freebsd
- hosts: *
- lookup: average -10s unaligned of received
- calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
- every: 10s
- units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
- options: no-clear-notification
- summary: System network interface ${label:device} inbound packet storm
- info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
- compared to the rate over the last minute
- to: silent
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
deleted file mode 100644
index 417105d43..000000000
--- a/health/health.d/netfilter.conf
+++ /dev/null
@@ -1,20 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: netfilter_conntrack_full
- on: netfilter.conntrack_sockets
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: max -10s unaligned of connections
- calc: $this * 100 / $netfilter_conntrack_max
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (85) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (95))
- delay: down 5m multiplier 1.5 max 1h
- summary: System Netfilter connection tracker utilization
- info: Netfilter connection tracker table size utilization
- to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
deleted file mode 100644
index de4c0078e..000000000
--- a/health/health.d/postgres.conf
+++ /dev/null
@@ -1,228 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: postgres_total_connection_utilization
- on: postgres.connections_utilization
- class: Utilization
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of used
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL connection utilization
- info: Average total connection utilization over the last minute
- to: dba
-
- template: postgres_acquired_locks_utilization
- on: postgres.locks_utilization
- class: Utilization
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of used
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (15) : (20))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL acquired locks utilization
- info: Average acquired locks utilization over the last minute
- to: dba
-
- template: postgres_txid_exhaustion_perc
- on: postgres.txid_exhaustion_perc
- class: Utilization
- type: Database
-component: PostgreSQL
- hosts: *
- calc: $txid_exhaustion
- units: %
- every: 1m
- warn: $this > 90
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL TXID exhaustion
- info: Percent towards TXID wraparound
- to: dba
-
-# Database alarms
-
- template: postgres_db_cache_io_ratio
- on: postgres.db_cache_io_ratio
- class: Workload
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of miss
- calc: 100 - $this
- units: %
- every: 1m
- warn: $this < (($status >= $WARNING) ? (70) : (60))
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL DB ${label:database} cache hit ratio
- info: Average cache hit ratio in db ${label:database} over the last minute
- to: dba
-
- template: postgres_db_transactions_rollback_ratio
- on: postgres.db_transactions_ratio
- class: Workload
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -5m unaligned of rollback
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (0) : (2))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL DB ${label:database} aborted transactions
- info: Average aborted transactions percentage in db ${label:database} over the last five minutes
- to: dba
-
- template: postgres_db_deadlocks_rate
- on: postgres.db_deadlocks_rate
- class: Errors
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: sum -1m unaligned of deadlocks
- units: deadlocks
- every: 1m
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL DB ${label:database} deadlocks rate
- info: Number of deadlocks detected in db ${label:database} in the last minute
- to: dba
-
-# Table alarms
-
- template: postgres_table_cache_io_ratio
- on: postgres.table_cache_io_ratio
- class: Workload
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of miss
- calc: 100 - $this
- units: %
- every: 1m
- warn: $this < (($status >= $WARNING) ? (70) : (60))
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio
- info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute
- to: dba
-
- template: postgres_table_index_cache_io_ratio
- on: postgres.table_index_cache_io_ratio
- class: Workload
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of miss
- calc: 100 - $this
- units: %
- every: 1m
- warn: $this < (($status >= $WARNING) ? (70) : (60))
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio
- info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute
- to: dba
-
- template: postgres_table_toast_cache_io_ratio
- on: postgres.table_toast_cache_io_ratio
- class: Workload
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of miss
- calc: 100 - $this
- units: %
- every: 1m
- warn: $this < (($status >= $WARNING) ? (70) : (60))
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio
- info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
- to: dba
-
- template: postgres_table_toast_index_cache_io_ratio
- on: postgres.table_toast_index_cache_io_ratio
- class: Workload
- type: Database
-component: PostgreSQL
- hosts: *
- lookup: average -1m unaligned of miss
- calc: 100 - $this
- units: %
- every: 1m
- warn: $this < (($status >= $WARNING) ? (70) : (60))
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio
- info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
- to: dba
-
- template: postgres_table_bloat_size_perc
- on: postgres.table_bloat_size_perc
- class: Errors
- type: Database
-component: PostgreSQL
- hosts: *
- calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (60) : (70))
- crit: $this > (($status == $CRITICAL) ? (70) : (80))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL table ${label:table} db ${label:database} bloat size
- info: Bloat size percentage in db ${label:database} table ${label:table}
- to: dba
-
- template: postgres_table_last_autovacuum_time
- on: postgres.table_autovacuum_since_time
- class: Errors
- type: Database
-component: PostgreSQL
- hosts: !*
- calc: $time
- units: seconds
- every: 1m
- warn: $this != nan AND $this > (60 * 60 * 24 * 7)
- summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum
- info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon
- to: dba
-
- template: postgres_table_last_autoanalyze_time
- on: postgres.table_autoanalyze_since_time
- class: Errors
- type: Database
-component: PostgreSQL
- hosts: !*
- calc: $time
- units: seconds
- every: 1m
- warn: $this != nan AND $this > (60 * 60 * 24 * 7)
- summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze
- info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon
- to: dba
-
-# Index alarms
-
- template: postgres_index_bloat_size_perc
- on: postgres.index_bloat_size_perc
- class: Errors
- type: Database
-component: PostgreSQL
- hosts: *
- calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (60) : (70))
- crit: $this > (($status == $CRITICAL) ? (70) : (80))
- delay: down 15m multiplier 1.5 max 1h
- summary: PostgreSQL table ${label:table} db ${label:database} index bloat size
- info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index}
- to: dba
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
deleted file mode 100644
index da27ad5b7..000000000
--- a/health/health.d/python.d.plugin.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# make sure python.d.plugin data collection job is running
-
- template: python.d_job_last_collected_secs
- on: netdata.pythond_runtime
- class: Errors
- type: Netdata
-component: python.d.plugin
- module: !* *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- summary: Python.d plugin last collection
- info: Number of seconds since the last successful data collection
- to: webmaster
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
deleted file mode 100644
index 970ea6363..000000000
--- a/health/health.d/qos.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# check if a QoS class is dropping packets
-# the alarm is checked every 30 seconds
-# and examines the last 5 minutes of data
-
-template: 10min_qos_packet_drops
- on: tc.qos_dropped
- os: linux
- hosts: *
- lookup: sum -5m unaligned absolute
- every: 30s
- warn: $this > 0
- units: packets
- summary: QOS packet drops
- info: Dropped packets in the last 5 minutes
- to: silent
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
deleted file mode 100644
index 51f307ca6..000000000
--- a/health/health.d/ram.conf
+++ /dev/null
@@ -1,82 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: ram_in_use
- on: system.ram
- class: Utilization
- type: System
-component: Memory
- os: linux
- hosts: *
- calc: $used * 100 / ($used + $cached + $free + $buffers)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: System memory utilization
- info: System memory utilization
- to: sysadmin
-
- alarm: ram_available
- on: mem.available
- class: Utilization
- type: System
-component: Memory
- os: linux
- hosts: *
- calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? (15) : (10))
- delay: down 15m multiplier 1.5 max 1h
- summary: System available memory
- info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping
- to: silent
-
- alarm: oom_kill
- on: mem.oom_kill
- os: linux
- hosts: *
- lookup: sum -30m unaligned
- units: kills
- every: 5m
- warn: $this > 0
- delay: down 10m
- summary: System OOM kills
- info: Number of out of memory kills in the last 30 minutes
- to: silent
-
-## FreeBSD
- alarm: ram_in_use
- on: system.ram
- class: Utilization
- type: System
-component: Memory
- os: freebsd
- hosts: *
- calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: System memory utilization
- info: System memory utilization
- to: sysadmin
-
- alarm: ram_available
- on: mem.available
- class: Utilization
- type: System
-component: Memory
- os: freebsd
- hosts: *
- calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? (15) : (10))
- delay: down 15m multiplier 1.5 max 1h
- summary: System available memory
- info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping
- to: silent
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
deleted file mode 100644
index 7c2945e68..000000000
--- a/health/health.d/redis.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: redis_connections_rejected
- on: redis.connections
- class: Errors
- type: KV Storage
-component: Redis
- lookup: sum -1m unaligned of rejected
- every: 10s
- units: connections
- warn: $this > 0
- summary: Redis rejected connections
- info: Connections rejected because of maxclients limit in the last minute
- delay: down 5m multiplier 1.5 max 1h
- to: dba
-
- template: redis_bgsave_broken
- on: redis.bgsave_health
- class: Errors
- type: KV Storage
-component: Redis
- every: 10s
- crit: $last_bgsave != nan AND $last_bgsave != 0
- units: ok/failed
- summary: Redis background save
- info: Status of the last RDB save operation (0: ok, 1: error)
- delay: down 5m multiplier 1.5 max 1h
- to: dba
-
- template: redis_bgsave_slow
- on: redis.bgsave_now
- class: Latency
- type: KV Storage
-component: Redis
- every: 10s
- calc: $current_bgsave_time
- warn: $this > 600
- crit: $this > 1200
- units: seconds
- summary: Redis slow background save
- info: Duration of the on-going RDB save operation
- delay: down 5m multiplier 1.5 max 1h
- to: dba
-
- template: redis_master_link_down
- on: redis.master_link_down_since_time
- class: Errors
- type: KV Storage
-component: Redis
- every: 10s
- calc: $time
- units: seconds
- crit: $this != nan AND $this > 0
- summary: Redis master link down
- info: Time elapsed since the link between master and slave is down
- delay: down 5m multiplier 1.5 max 1h
- to: dba
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
deleted file mode 100644
index 8d7ba5661..000000000
--- a/health/health.d/softnet.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# check for common /proc/net/softnet_stat errors
-
- alarm: 1min_netdev_backlog_exceeded
- on: system.softnet_stat
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -1m unaligned absolute of dropped
- units: packets
- every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- delay: down 1h multiplier 1.5 max 2h
- summary: System netdev dropped packets
- info: Average number of dropped packets in the last minute \
- due to exceeded net.core.netdev_max_backlog
- to: silent
-
- alarm: 1min_netdev_budget_ran_outs
- on: system.softnet_stat
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -1m unaligned absolute of squeezed
- units: events
- every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- delay: down 1h multiplier 1.5 max 2h
- summary: System netdev budget run outs
- info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
- net.core.netdev_budget_usecs with work remaining over the last minute \
- (this can be a cause for dropped packets)
- to: silent
-
- alarm: 10min_netisr_backlog_exceeded
- on: system.softnet_stat
- class: Errors
- type: System
-component: Network
- os: freebsd
- hosts: *
- lookup: average -1m unaligned absolute of qdrops
- units: packets
- every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- delay: down 1h multiplier 1.5 max 2h
- summary: System netisr drops
- info: Average number of drops in the last minute \
- due to exceeded sysctl net.route.netisr_maxqlen \
- (this can be a cause for dropped packets)
- to: silent
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
deleted file mode 100644
index e39733996..000000000
--- a/health/health.d/swap.conf
+++ /dev/null
@@ -1,37 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: 30min_ram_swapped_out
- on: mem.swapio
- class: Workload
- type: System
-component: Memory
- os: linux freebsd
- hosts: *
- lookup: sum -30m unaligned absolute of out
- # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
- calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- units: % of RAM
- every: 1m
- warn: $this > (($status >= $WARNING) ? (20) : (30))
- delay: down 15m multiplier 1.5 max 1h
- summary: System memory swapped out
- info: Percentage of the system RAM swapped in the last 30 minutes
- to: silent
-
- alarm: used_swap
- on: mem.swap
- class: Utilization
- type: System
-component: Memory
- os: linux freebsd
- hosts: *
- calc: (($used + $free) > 0) ? ($used * 100 / ($used + $free)) : 0
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: up 30s down 15m multiplier 1.5 max 1h
- summary: System swap memory utilization
- info: Swap memory utilization
- to: sysadmin
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
deleted file mode 100644
index ad53a0e1c..000000000
--- a/health/health.d/systemdunits.conf
+++ /dev/null
@@ -1,161 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-## Service units
- template: systemd_service_unit_failed_state
- on: systemd.service_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd service unit in the failed state
- to: sysadmin
-
-## Socket units
- template: systemd_socket_unit_failed_state
- on: systemd.socket_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd socket unit in the failed state
- to: sysadmin
-
-## Target units
- template: systemd_target_unit_failed_state
- on: systemd.target_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd target unit in the failed state
- to: sysadmin
-
-## Path units
- template: systemd_path_unit_failed_state
- on: systemd.path_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd path unit in the failed state
- to: sysadmin
-
-## Device units
- template: systemd_device_unit_failed_state
- on: systemd.device_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd device unit in the failed state
- to: sysadmin
-
-## Mount units
- template: systemd_mount_unit_failed_state
- on: systemd.mount_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd mount units in the failed state
- to: sysadmin
-
-## Automount units
- template: systemd_automount_unit_failed_state
- on: systemd.automount_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd automount unit in the failed state
- to: sysadmin
-
-## Swap units
- template: systemd_swap_unit_failed_state
- on: systemd.swap_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd swap units in the failed state
- to: sysadmin
-
-## Scope units
- template: systemd_scope_unit_failed_state
- on: systemd.scope_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd scope units in the failed state
- to: sysadmin
-
-## Slice units
- template: systemd_slice_unit_failed_state
- on: systemd.slice_unit_state
- class: Errors
- type: Linux
-component: Systemd units
- module: !* *
- calc: $failed
- units: state
- every: 10s
- warn: $this != nan AND $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: systemd unit ${label:unit_name} state
- info: systemd slice units in the failed state
- to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
deleted file mode 100644
index 2b2f97406..000000000
--- a/health/health.d/tcp_conn.conf
+++ /dev/null
@@ -1,23 +0,0 @@
-
-#
-# ${tcp_max_connections} may be nan or -1 if the system
-# supports dynamic threshold for TCP connections.
-# In this case, the alarm will always be zero.
-#
-
- alarm: tcp_connections
- on: ip.tcpsock
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
- crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
- delay: up 0 down 5m multiplier 1.5 max 1h
- summary: System TCP connections utilization
- info: IPv4 TCP connections utilization
- to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
deleted file mode 100644
index 9d1104a51..000000000
--- a/health/health.d/tcp_listen.conf
+++ /dev/null
@@ -1,100 +0,0 @@
-#
-# There are two queues involved when incoming TCP connections are handled
-# (both at the kernel):
-#
-# SYN queue
-# The SYN queue tracks TCP handshakes until connections are fully established.
-# It overflows when too many incoming TCP connection requests hang in the
-# half-open state and the server is not configured to fall back to SYN cookies.
-# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
-# lots of SYN packets and never completes the handshakes).
-#
-# Accept queue
-# The accept queue holds fully established TCP connections waiting to be handled
-# by the listening application. It overflows when the server application fails
-# to accept new connections at the rate they are coming in.
-#
-#
-# -----------------------------------------------------------------------------
-# tcp accept queue (at the kernel)
-
- alarm: 1m_tcp_accept_queue_overflows
- on: ip.tcp_accept_queue
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -60s unaligned absolute of ListenOverflows
- units: overflows
- every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (1) : (5))
- delay: up 0 down 5m multiplier 1.5 max 1h
- summary: System TCP accept queue overflows
- info: Average number of overflows in the TCP accept queue over the last minute
- to: silent
-
-# THIS IS TOO GENERIC
-# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
- alarm: 1m_tcp_accept_queue_drops
- on: ip.tcp_accept_queue
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -60s unaligned absolute of ListenDrops
- units: drops
- every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (1) : (5))
- delay: up 0 down 5m multiplier 1.5 max 1h
- summary: System TCP accept queue dropped packets
- info: Average number of dropped packets in the TCP accept queue over the last minute
- to: silent
-
-
-# -----------------------------------------------------------------------------
-# tcp SYN queue (at the kernel)
-
-# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
-# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
-# enabled or not. In both cases this probably indicates a SYN flood attack,
-# so a notification should probably be sent.
-
- alarm: 1m_tcp_syn_queue_drops
- on: ip.tcp_syn_queue
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -60s unaligned absolute of TCPReqQFullDrop
- units: drops
- every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (0) : (5))
- delay: up 10 down 5m multiplier 1.5 max 1h
- summary: System TCP SYN queue drops
- info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
- (SYN cookies were not enabled)
- to: silent
-
- alarm: 1m_tcp_syn_queue_cookies
- on: ip.tcp_syn_queue
- class: Workload
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
- units: cookies
- every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (0) : (5))
- delay: up 10 down 5m multiplier 1.5 max 1h
- summary: System TCP SYN queue cookies
- info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute
- to: silent
-
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
deleted file mode 100644
index 4e422ec1c..000000000
--- a/health/health.d/tcp_mem.conf
+++ /dev/null
@@ -1,24 +0,0 @@
-#
-# check
-# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
-#
-# We give a warning when TCP is under memory pressure
-# and a critical when TCP is 90% of its upper memory limit
-#
-
- alarm: tcp_memory
- on: ipv4.sockstat_tcp_mem
- class: Utilization
- type: System
-component: Network
- os: linux
- hosts: *
- calc: ${mem} * 100 / ${tcp_mem_high}
- units: %
- every: 10s
- warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
- crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
- delay: up 0 down 5m multiplier 1.5 max 1h
- summary: System TCP memory utilization
- info: TCP memory utilization
- to: silent
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
deleted file mode 100644
index 8f665d50e..000000000
--- a/health/health.d/tcp_orphans.conf
+++ /dev/null
@@ -1,25 +0,0 @@
-
-#
-# check
-# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
-#
-# The kernel may penalize orphans by 2x or even 4x
-# so we alarm warning at 25% and critical at 50%
-#
-
- alarm: tcp_orphans
- on: ipv4.sockstat_tcp_sockets
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- calc: ${orphan} * 100 / ${tcp_max_orphans}
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
- crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
- delay: up 0 down 5m multiplier 1.5 max 1h
- summary: System TCP orphan sockets utilization
- info: Orphan IPv4 TCP sockets utilization
- to: silent
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
deleted file mode 100644
index 7c39db2db..000000000
--- a/health/health.d/tcp_resets.conf
+++ /dev/null
@@ -1,71 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-# tcp resets this host sends
-
- alarm: 1m_ip_tcp_resets_sent
- on: ip.tcphandshake
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -1m at -10s unaligned absolute of OutRsts
- units: tcp resets/s
- every: 10s
- info: average number of sent TCP RESETS over the last minute
-
- alarm: 10s_ip_tcp_resets_sent
- on: ip.tcphandshake
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -10s unaligned absolute of OutRsts
- units: tcp resets/s
- every: 10s
- warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
- delay: up 20s down 60m multiplier 1.2 max 2h
- options: no-clear-notification
- summary: System TCP outbound resets
- info: Average number of sent TCP RESETS over the last 10 seconds. \
- This can indicate a port scan, \
- or that a service running on this host has crashed. \
- Netdata will not send a clear notification for this alarm.
- to: silent
-
-# -----------------------------------------------------------------------------
-# tcp resets this host receives
-
- alarm: 1m_ip_tcp_resets_received
- on: ip.tcphandshake
- class: Errors
- type: System
-component: Network
- os: linux freebsd
- hosts: *
- lookup: average -1m at -10s unaligned absolute of AttemptFails
- units: tcp resets/s
- every: 10s
- info: average number of received TCP RESETS over the last minute
-
- alarm: 10s_ip_tcp_resets_received
- on: ip.tcphandshake
- class: Errors
- type: System
-component: Network
- os: linux freebsd
- hosts: *
- lookup: average -10s unaligned absolute of AttemptFails
- units: tcp resets/s
- every: 10s
- warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
- delay: up 20s down 60m multiplier 1.2 max 2h
- options: no-clear-notification
- summary: System TCP inbound resets
- info: average number of received TCP RESETS over the last 10 seconds. \
- This can be an indication that a service this host needs has crashed. \
- Netdata will not send a clear notification for this alarm.
- to: silent
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
deleted file mode 100644
index 65c9628b5..000000000
--- a/health/health.d/timex.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# It can take several minutes before ntpd selects a server to synchronize with;
-# try checking after 17 minutes (1024 seconds).
-
- alarm: system_clock_sync_state
- on: system.clock_sync_state
- os: linux
- class: Errors
- type: System
-component: Clock
- calc: $state
- units: synchronization state
- every: 10s
- warn: $system.uptime.uptime > 17 * 60 AND $this == 0
- delay: down 5m
- summary: System clock sync state
- info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server
- to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
deleted file mode 100644
index dc0948403..000000000
--- a/health/health.d/udp_errors.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-# UDP receive buffer errors
-
- alarm: 1m_ipv4_udp_receive_buffer_errors
- on: ipv4.udperrors
- class: Errors
- type: System
-component: Network
- os: linux freebsd
- hosts: *
- lookup: average -1m unaligned absolute of RcvbufErrors
- units: errors
- every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- summary: System UDP receive buffer errors
- info: Average number of UDP receive buffer errors over the last minute
- delay: up 1m down 60m multiplier 1.2 max 2h
- to: silent
-
-# -----------------------------------------------------------------------------
-# UDP send buffer errors
-
- alarm: 1m_ipv4_udp_send_buffer_errors
- on: ipv4.udperrors
- class: Errors
- type: System
-component: Network
- os: linux
- hosts: *
- lookup: average -1m unaligned absolute of SndbufErrors
- units: errors
- every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- summary: System UDP send buffer errors
- info: Average number of UDP send buffer errors over the last minute
- delay: up 1m down 60m multiplier 1.2 max 2h
- to: silent
diff --git a/health/health.d/upsd.conf b/health/health.d/upsd.conf
deleted file mode 100644
index 703a64881..000000000
--- a/health/health.d/upsd.conf
+++ /dev/null
@@ -1,50 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: upsd_10min_ups_load
- on: upsd.ups_load
- class: Utilization
- type: Power Supply
-component: UPS
- os: *
- hosts: *
- lookup: average -10m unaligned of load
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 10m multiplier 1.5 max 1h
- summary: UPS ${label:ups_name} load
- info: UPS ${label:ups_name} average load over the last 10 minutes
- to: sitemgr
-
- template: upsd_ups_battery_charge
- on: upsd.ups_battery_charge
- class: Errors
- type: Power Supply
-component: UPS
- os: *
- hosts: *
- lookup: average -60s unaligned of charge
- units: %
- every: 60s
- warn: $this < 75
- crit: $this < 40
- delay: down 10m multiplier 1.5 max 1h
- summary: UPS ${label:ups_name} battery charge
- info: UPS ${label:ups_name} average battery charge over the last minute
- to: sitemgr
-
- template: upsd_ups_last_collected_secs
- on: upsd.ups_load
- class: Latency
- type: Power Supply
-component: UPS device
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- summary: UPS ${label:ups_name} last collected
- info: UPS ${label:ups_name} number of seconds since the last successful data collection
- to: sitemgr
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
deleted file mode 100644
index b8ad9aee4..000000000
--- a/health/health.d/vsphere.conf
+++ /dev/null
@@ -1,70 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------Virtual Machine--------------------------------------------------------
-
- template: vsphere_vm_cpu_utilization
- on: vsphere.vm_cpu_utilization
- class: Utilization
- type: Virtual Machine
-component: CPU
- hosts: *
- lookup: average -10m unaligned match-names of used
- units: %
- every: 20s
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: vSphere CPU utilization for VM ${label:vm}
- info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
- to: silent
-
- template: vsphere_vm_mem_utilization
- on: vsphere.vm_mem_utilization
- class: Utilization
- type: Virtual Machine
-component: Memory
- hosts: *
- calc: $used
- units: %
- every: 20s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: vSphere memory utilization for VM ${label:vm}
- info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
- to: silent
-
-# -----------------------------------------------ESXI host--------------------------------------------------------------
-
- template: vsphere_host_cpu_utilization
- on: vsphere.host_cpu_utilization
- class: Utilization
- type: Virtual Machine
-component: CPU
- hosts: *
- lookup: average -10m unaligned match-names of used
- units: %
- every: 20s
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: vSphere ESXi CPU utilization for host ${label:host}
- info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
- to: sysadmin
-
- template: vsphere_host_mem_utilization
- on: vsphere.host_mem_utilization
- class: Utilization
- type: Virtual Machine
-component: Memory
- hosts: *
- calc: $used
- units: %
- every: 20s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: vSphere ESXi Ram utilization for host ${label:host}
- info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
- to: sysadmin
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
deleted file mode 100644
index 706fcbf22..000000000
--- a/health/health.d/windows.conf
+++ /dev/null
@@ -1,126 +0,0 @@
-
-## CPU
-
- template: windows_10min_cpu_usage
- on: windows.cpu_utilization_total
- class: Utilization
- type: Windows
-component: CPU
- os: *
- hosts: *
- lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- summary: CPU utilization
- info: Average CPU utilization over the last 10 minutes
- to: silent
-
-
-## Memory
-
- template: windows_ram_in_use
- on: windows.memory_utilization
- class: Utilization
- type: Windows
-component: Memory
- os: *
- hosts: *
- calc: ($used) * 100 / ($used + $available)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: Ram utilization
- info: Memory utilization
- to: sysadmin
-
-
-## Network
-
- template: windows_inbound_packets_discarded
- on: windows.net_nic_discarded
- class: Errors
- type: Windows
-component: Network
- os: *
- hosts: *
- lookup: sum -10m unaligned absolute match-names of inbound
- units: packets
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- summary: Inbound network packets discarded
- info: Number of inbound discarded packets for the network interface in the last 10 minutes
- to: silent
-
- template: windows_outbound_packets_discarded
- on: windows.net_nic_discarded
- class: Errors
- type: Windows
-component: Network
- os: *
- hosts: *
- lookup: sum -10m unaligned absolute match-names of outbound
- units: packets
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- summary: Outbound network packets discarded
- info: Number of outbound discarded packets for the network interface in the last 10 minutes
- to: silent
-
- template: windows_inbound_packets_errors
- on: windows.net_nic_errors
- class: Errors
- type: Windows
-component: Network
- os: *
- hosts: *
- lookup: sum -10m unaligned absolute match-names of inbound
- units: packets
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- summary: Inbound network errors
- info: Number of inbound errors for the network interface in the last 10 minutes
- to: silent
-
- template: windows_outbound_packets_errors
- on: windows.net_nic_errors
- class: Errors
- type: Windows
-component: Network
- os: *
- hosts: *
- lookup: sum -10m unaligned absolute match-names of outbound
- units: packets
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- summary: Outbound network errors
- info: Number of outbound errors for the network interface in the last 10 minutes
- to: silent
-
-
-## Disk
-
- template: windows_disk_in_use
- on: windows.logical_disk_space_usage
- class: Utilization
- type: Windows
-component: Disk
- os: *
- hosts: *
- calc: ($used) * 100 / ($used + $free)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- summary: Disk space usage
- info: Disk space utilization
- to: sysadmin
diff --git a/health/health.d/adaptec_raid.conf b/src/health/health.d/adaptec_raid.conf
index 1f1840491..b6f265db3 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/src/health/health.d/adaptec_raid.conf
@@ -6,7 +6,7 @@
class: Errors
type: System
component: RAID
- lookup: max -10s foreach *
+ lookup: max -10s
units: bool
every: 10s
crit: $this > 0
@@ -22,7 +22,7 @@ component: RAID
class: Errors
type: System
component: RAID
- lookup: max -10s foreach *
+ lookup: max -10s
units: bool
every: 10s
crit: $this > 0
diff --git a/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf
index 90a72af19..5fd7aa112 100644
--- a/health/health.d/apcupsd.conf
+++ b/src/health/health.d/apcupsd.conf
@@ -5,8 +5,6 @@
class: Utilization
type: Power Supply
component: UPS
- os: *
- hosts: *
lookup: average -10m unaligned of percentage
units: %
every: 1m
@@ -23,8 +21,6 @@ component: UPS
class: Errors
type: Power Supply
component: UPS
- os: *
- hosts: *
lookup: average -60s unaligned of charge
units: %
every: 60s
diff --git a/health/health.d/bcache.conf b/src/health/health.d/bcache.conf
index 446173428..446173428 100644
--- a/health/health.d/bcache.conf
+++ b/src/health/health.d/bcache.conf
diff --git a/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf
index 0d37f28e0..0d37f28e0 100644
--- a/health/health.d/beanstalkd.conf
+++ b/src/health/health.d/beanstalkd.conf
diff --git a/health/health.d/bind_rndc.conf b/src/health/health.d/bind_rndc.conf
index b1c271df9..b1c271df9 100644
--- a/health/health.d/bind_rndc.conf
+++ b/src/health/health.d/bind_rndc.conf
diff --git a/health/health.d/boinc.conf b/src/health/health.d/boinc.conf
index 092a56845..6fd987de1 100644
--- a/health/health.d/boinc.conf
+++ b/src/health/health.d/boinc.conf
@@ -1,4 +1,4 @@
-# Alarms for various BOINC issues.
+# you can disable an alarm notification by setting the 'to' line to: silent
# Warn on any compute errors encountered.
template: boinc_compute_errors
@@ -6,8 +6,6 @@
class: Errors
type: Computing
component: BOINC
- os: *
- hosts: *
lookup: average -10m unaligned of comperror
units: tasks
every: 1m
@@ -23,8 +21,6 @@ component: BOINC
class: Errors
type: Computing
component: BOINC
- os: *
- hosts: *
lookup: average -10m unaligned of upload_failed
units: tasks
every: 1m
@@ -40,8 +36,6 @@ component: BOINC
class: Utilization
type: Computing
component: BOINC
- os: *
- hosts: *
lookup: average -10m unaligned of total
units: tasks
every: 1m
@@ -57,8 +51,6 @@ component: BOINC
class: Utilization
type: Computing
component: BOINC
- os: *
- hosts: *
lookup: average -10m unaligned of active
calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
units: tasks
diff --git a/health/health.d/btrfs.conf b/src/health/health.d/btrfs.conf
index 1557a5941..f43f600c0 100644
--- a/health/health.d/btrfs.conf
+++ b/src/health/health.d/btrfs.conf
@@ -1,11 +1,10 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
template: btrfs_allocated
on: btrfs.disk
class: Utilization
type: System
component: File system
- os: *
- hosts: *
calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
units: %
every: 10s
@@ -20,8 +19,6 @@ component: File system
class: Utilization
type: System
component: File system
- os: *
- hosts: *
calc: $used * 100 / ($used + $free)
units: %
every: 10s
@@ -37,8 +34,6 @@ component: File system
class: Utilization
type: System
component: File system
- os: *
- hosts: *
calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
units: %
every: 10s
@@ -54,8 +49,6 @@ component: File system
class: Utilization
type: System
component: File system
- os: *
- hosts: *
calc: $used * 100 / ($used + $free)
units: %
every: 10s
@@ -71,8 +64,6 @@ component: File system
class: Errors
type: System
component: File system
- os: *
- hosts: *
units: errors
lookup: max -10m every 1m of read_errs
warn: $this > 0
@@ -86,8 +77,6 @@ component: File system
class: Errors
type: System
component: File system
- os: *
- hosts: *
units: errors
lookup: max -10m every 1m of write_errs
crit: $this > 0
@@ -101,8 +90,6 @@ component: File system
class: Errors
type: System
component: File system
- os: *
- hosts: *
units: errors
lookup: max -10m every 1m of flush_errs
crit: $this > 0
@@ -116,8 +103,6 @@ component: File system
class: Errors
type: System
component: File system
- os: *
- hosts: *
units: errors
lookup: max -10m every 1m of corruption_errs
warn: $this > 0
@@ -131,8 +116,6 @@ component: File system
class: Errors
type: System
component: File system
- os: *
- hosts: *
units: errors
lookup: max -10m every 1m of generation_errs
warn: $this > 0
diff --git a/health/health.d/ceph.conf b/src/health/health.d/ceph.conf
index 44d351338..44d351338 100644
--- a/health/health.d/ceph.conf
+++ b/src/health/health.d/ceph.conf
diff --git a/health/health.d/cockroachdb.conf b/src/health/health.d/cockroachdb.conf
index 60f178354..60f178354 100644
--- a/health/health.d/cockroachdb.conf
+++ b/src/health/health.d/cockroachdb.conf
diff --git a/health/health.d/consul.conf b/src/health/health.d/consul.conf
index 8b414a26d..8b414a26d 100644
--- a/health/health.d/consul.conf
+++ b/src/health/health.d/consul.conf
diff --git a/health/health.d/dbengine.conf b/src/health/health.d/dbengine.conf
index 0a70d2e8f..5585a9533 100644
--- a/health/health.d/dbengine.conf
+++ b/src/health/health.d/dbengine.conf
@@ -1,4 +1,3 @@
-
# you can disable an alarm notification by setting the 'to' line to: silent
alarm: 10min_dbengine_global_fs_errors
@@ -6,8 +5,6 @@
class: Errors
type: Netdata
component: DB engine
- os: linux freebsd macos
- hosts: *
lookup: sum -10m unaligned of fs_errors
units: errors
every: 10s
@@ -22,8 +19,6 @@ component: DB engine
class: Errors
type: Netdata
component: DB engine
- os: linux freebsd macos
- hosts: *
lookup: sum -10m unaligned of io_errors
units: errors
every: 10s
@@ -38,8 +33,6 @@ component: DB engine
class: Errors
type: Netdata
component: DB engine
- os: linux freebsd macos
- hosts: *
lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
units: errors
every: 10s
@@ -55,8 +48,6 @@ component: DB engine
class: Errors
type: Netdata
component: DB engine
- os: linux freebsd macos
- hosts: *
lookup: sum -10m unaligned of flushing_pressure_deletions
units: pages
every: 10s
diff --git a/health/health.d/dns_query.conf b/src/health/health.d/dns_query.conf
index 756c6a1b6..756c6a1b6 100644
--- a/health/health.d/dns_query.conf
+++ b/src/health/health.d/dns_query.conf
diff --git a/health/health.d/dnsmasq_dhcp.conf b/src/health/health.d/dnsmasq_dhcp.conf
index f6ef01940..f6ef01940 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/src/health/health.d/dnsmasq_dhcp.conf
diff --git a/health/health.d/docker.conf b/src/health/health.d/docker.conf
index 668614d4d..668614d4d 100644
--- a/health/health.d/docker.conf
+++ b/src/health/health.d/docker.conf
diff --git a/health/health.d/elasticsearch.conf b/src/health/health.d/elasticsearch.conf
index 600840c58..600840c58 100644
--- a/health/health.d/elasticsearch.conf
+++ b/src/health/health.d/elasticsearch.conf
diff --git a/health/health.d/exporting.conf b/src/health/health.d/exporting.conf
index c0320193c..c0320193c 100644
--- a/health/health.d/exporting.conf
+++ b/src/health/health.d/exporting.conf
diff --git a/health/health.d/gearman.conf b/src/health/health.d/gearman.conf
index 78e1165d1..78e1165d1 100644
--- a/health/health.d/gearman.conf
+++ b/src/health/health.d/gearman.conf
diff --git a/health/health.d/geth.conf b/src/health/health.d/geth.conf
index 361b6b41f..361b6b41f 100644
--- a/health/health.d/geth.conf
+++ b/src/health/health.d/geth.conf
diff --git a/health/health.d/haproxy.conf b/src/health/health.d/haproxy.conf
index 66a488fa4..66a488fa4 100644
--- a/health/health.d/haproxy.conf
+++ b/src/health/health.d/haproxy.conf
diff --git a/health/health.d/hdfs.conf b/src/health/health.d/hdfs.conf
index 566e815aa..566e815aa 100644
--- a/health/health.d/hdfs.conf
+++ b/src/health/health.d/hdfs.conf
diff --git a/health/health.d/ioping.conf b/src/health/health.d/ioping.conf
index 6d832bf00..6d832bf00 100644
--- a/health/health.d/ioping.conf
+++ b/src/health/health.d/ioping.conf
diff --git a/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf
index 4dfee3c7f..4dfee3c7f 100644
--- a/health/health.d/ipfs.conf
+++ b/src/health/health.d/ipfs.conf
diff --git a/health/health.d/ipmi.conf b/src/health/health.d/ipmi.conf
index cec2320a9..cec2320a9 100644
--- a/health/health.d/ipmi.conf
+++ b/src/health/health.d/ipmi.conf
diff --git a/health/health.d/isc_dhcpd.conf b/src/health/health.d/isc_dhcpd.conf
index d1f93969a..d1f93969a 100644
--- a/health/health.d/isc_dhcpd.conf
+++ b/src/health/health.d/isc_dhcpd.conf
diff --git a/health/health.d/kubelet.conf b/src/health/health.d/kubelet.conf
index 8adf5f7d4..8adf5f7d4 100644
--- a/health/health.d/kubelet.conf
+++ b/src/health/health.d/kubelet.conf
diff --git a/health/health.d/linux_power_supply.conf b/src/health/health.d/linux_power_supply.conf
index b0d35e752..b0d35e752 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/src/health/health.d/linux_power_supply.conf
diff --git a/health/health.d/mdstat.conf b/src/health/health.d/mdstat.conf
index 90f97d851..90f97d851 100644
--- a/health/health.d/mdstat.conf
+++ b/src/health/health.d/mdstat.conf
diff --git a/health/health.d/megacli.conf b/src/health/health.d/megacli.conf
index 118997a59..d1e2e7acf 100644
--- a/health/health.d/megacli.conf
+++ b/src/health/health.d/megacli.conf
@@ -6,7 +6,7 @@
class: Errors
type: System
component: RAID
- lookup: max -10s foreach *
+ lookup: max -10s
units: boolean
every: 10s
crit: $this > 0
@@ -22,7 +22,7 @@ component: RAID
class: Errors
type: System
component: RAID
- lookup: sum -10s foreach *
+ lookup: sum -10s
units: predictive failures
every: 10s
warn: $this > 0
@@ -36,7 +36,7 @@ component: RAID
class: Errors
type: System
component: RAID
- lookup: sum -10s foreach *
+ lookup: sum -10s
units: media errors
every: 10s
warn: $this > 0
diff --git a/health/health.d/memcached.conf b/src/health/health.d/memcached.conf
index 77ca0afa9..77ca0afa9 100644
--- a/health/health.d/memcached.conf
+++ b/src/health/health.d/memcached.conf
diff --git a/health/health.d/ml.conf b/src/health/health.d/ml.conf
index aef9b0368..b6a5df6dd 100644
--- a/health/health.d/ml.conf
+++ b/src/health/health.d/ml.conf
@@ -13,8 +13,6 @@
class: Workload
type: System
component: ML
- os: *
- hosts: *
lookup: average -1m of anomaly_rate
calc: $this
units: %
@@ -29,8 +27,6 @@ component: ML
# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
# template: ml_5min_cpu_dims
# on: system.cpu
-# os: linux
-# hosts: *
# lookup: average -5m anomaly-bit foreach *
# calc: $this
# units: %
@@ -44,8 +40,6 @@ component: ML
# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
# template: ml_5min_cpu_chart
# on: system.cpu
-# os: linux
-# hosts: *
# lookup: average -5m anomaly-bit of *
# calc: $this
# units: %
@@ -53,4 +47,3 @@ component: ML
# warn: $this > (($status >= $WARNING) ? (5) : (20))
# crit: $this > (($status == $CRITICAL) ? (20) : (100))
# info: rolling 5min anomaly rate for system.cpu chart
-
diff --git a/health/health.d/mysql.conf b/src/health/health.d/mysql.conf
index 572560b4e..572560b4e 100644
--- a/health/health.d/mysql.conf
+++ b/src/health/health.d/mysql.conf
diff --git a/health/health.d/nvme.conf b/src/health/health.d/nvme.conf
index aea402e88..aea402e88 100644
--- a/health/health.d/nvme.conf
+++ b/src/health/health.d/nvme.conf
diff --git a/health/health.d/pihole.conf b/src/health/health.d/pihole.conf
index c4db835ce..c4db835ce 100644
--- a/health/health.d/pihole.conf
+++ b/src/health/health.d/pihole.conf
diff --git a/health/health.d/ping.conf b/src/health/health.d/ping.conf
index 0e434420d..0e434420d 100644
--- a/health/health.d/ping.conf
+++ b/src/health/health.d/ping.conf
diff --git a/health/health.d/plugin.conf b/src/health/health.d/plugin.conf
index 8615a0213..8615a0213 100644
--- a/health/health.d/plugin.conf
+++ b/src/health/health.d/plugin.conf
diff --git a/health/health.d/portcheck.conf b/src/health/health.d/portcheck.conf
index 281731c86..281731c86 100644
--- a/health/health.d/portcheck.conf
+++ b/src/health/health.d/portcheck.conf
diff --git a/health/health.d/processes.conf b/src/health/health.d/processes.conf
index 8f2e0fda5..2029c76e4 100644
--- a/health/health.d/processes.conf
+++ b/src/health/health.d/processes.conf
@@ -5,7 +5,6 @@
class: Workload
type: System
component: Processes
- hosts: *
calc: $active * 100 / $pidmax
units: %
every: 5s
diff --git a/health/health.d/retroshare.conf b/src/health/health.d/retroshare.conf
index c665430fa..c665430fa 100644
--- a/health/health.d/retroshare.conf
+++ b/src/health/health.d/retroshare.conf
diff --git a/health/health.d/riakkv.conf b/src/health/health.d/riakkv.conf
index 677e3cb4f..677e3cb4f 100644
--- a/health/health.d/riakkv.conf
+++ b/src/health/health.d/riakkv.conf
diff --git a/health/health.d/scaleio.conf b/src/health/health.d/scaleio.conf
index b089cb85e..b089cb85e 100644
--- a/health/health.d/scaleio.conf
+++ b/src/health/health.d/scaleio.conf
diff --git a/health/health.d/synchronization.conf b/src/health/health.d/synchronization.conf
index 6c947d90b..28b1817ac 100644
--- a/health/health.d/synchronization.conf
+++ b/src/health/health.d/synchronization.conf
@@ -2,7 +2,6 @@
on: mem.sync
lookup: sum -1m of sync
units: calls
- plugin: ebpf.plugin
every: 1m
warn: $this > 6
delay: up 1m down 10m multiplier 1.5 max 1h
diff --git a/health/health.d/unbound.conf b/src/health/health.d/unbound.conf
index 3c898f1d5..3c898f1d5 100644
--- a/health/health.d/unbound.conf
+++ b/src/health/health.d/unbound.conf
diff --git a/health/health.d/vcsa.conf b/src/health/health.d/vcsa.conf
index 3e20bfd1e..3e20bfd1e 100644
--- a/health/health.d/vcsa.conf
+++ b/src/health/health.d/vcsa.conf
diff --git a/health/health.d/vernemq.conf b/src/health/health.d/vernemq.conf
index 6ea9f99dc..6ea9f99dc 100644
--- a/health/health.d/vernemq.conf
+++ b/src/health/health.d/vernemq.conf
diff --git a/health/health.d/web_log.conf b/src/health/health.d/web_log.conf
index 78f1cc7f5..78f1cc7f5 100644
--- a/health/health.d/web_log.conf
+++ b/src/health/health.d/web_log.conf
diff --git a/health/health.d/whoisquery.conf b/src/health/health.d/whoisquery.conf
index 0a328b592..0a328b592 100644
--- a/health/health.d/whoisquery.conf
+++ b/src/health/health.d/whoisquery.conf
diff --git a/health/health.d/x509check.conf b/src/health/health.d/x509check.conf
index d05f3ef0f..d05f3ef0f 100644
--- a/health/health.d/x509check.conf
+++ b/src/health/health.d/x509check.conf
diff --git a/health/health.d/zfs.conf b/src/health/health.d/zfs.conf
index d2a561000..d2a561000 100644
--- a/health/health.d/zfs.conf
+++ b/src/health/health.d/zfs.conf