summaryrefslogtreecommitdiffstats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r--health/health.d/adaptec_raid.conf32
-rw-r--r--health/health.d/anomalies.conf23
-rw-r--r--health/health.d/apcupsd.conf125
-rw-r--r--health/health.d/bcache.conf31
-rw-r--r--health/health.d/beanstalkd.conf41
-rw-r--r--health/health.d/bind_rndc.conf12
-rw-r--r--health/health.d/boinc.conf70
-rw-r--r--health/health.d/btrfs.conf142
-rw-r--r--health/health.d/ceph.conf16
-rw-r--r--health/health.d/cgroups.conf72
-rw-r--r--health/health.d/cockroachdb.conf78
-rw-r--r--health/health.d/consul.conf171
-rw-r--r--health/health.d/cpu.conf69
-rw-r--r--health/health.d/dbengine.conf68
-rw-r--r--health/health.d/disks.conf172
-rw-r--r--health/health.d/dns_query.conf15
-rw-r--r--health/health.d/dnsmasq_dhcp.conf15
-rw-r--r--health/health.d/docker.conf12
-rw-r--r--health/health.d/elasticsearch.conf78
-rw-r--r--health/health.d/entropy.conf20
-rw-r--r--health/health.d/exporting.conf29
-rw-r--r--health/health.d/file_descriptors.conf33
-rw-r--r--health/health.d/gearman.conf14
-rw-r--r--health/health.d/geth.conf11
-rw-r--r--health/health.d/go.d.plugin.conf18
-rw-r--r--health/health.d/haproxy.conf25
-rw-r--r--health/health.d/hdfs.conf81
-rw-r--r--health/health.d/httpcheck.conf73
-rw-r--r--health/health.d/ioping.conf14
-rw-r--r--health/health.d/ipc.conf34
-rw-r--r--health/health.d/ipfs.conf15
-rw-r--r--health/health.d/ipmi.conf28
-rw-r--r--health/health.d/isc_dhcpd.conf10
-rw-r--r--health/health.d/kubelet.conf151
-rw-r--r--health/health.d/linux_power_supply.conf15
-rw-r--r--health/health.d/load.conf72
-rw-r--r--health/health.d/mdstat.conf43
-rw-r--r--health/health.d/megacli.conf76
-rw-r--r--health/health.d/memcached.conf50
-rw-r--r--health/health.d/memory.conf85
-rw-r--r--health/health.d/ml.conf56
-rw-r--r--health/health.d/mysql.conf187
-rw-r--r--health/health.d/net.conf258
-rw-r--r--health/health.d/netfilter.conf20
-rw-r--r--health/health.d/nvme.conf15
-rw-r--r--health/health.d/pihole.conf33
-rw-r--r--health/health.d/ping.conf50
-rw-r--r--health/health.d/plugin.conf12
-rw-r--r--health/health.d/portcheck.conf44
-rw-r--r--health/health.d/postgres.conf228
-rw-r--r--health/health.d/processes.conf17
-rw-r--r--health/health.d/python.d.plugin.conf18
-rw-r--r--health/health.d/qos.conf18
-rw-r--r--health/health.d/ram.conf82
-rw-r--r--health/health.d/redis.conf57
-rw-r--r--health/health.d/retroshare.conf17
-rw-r--r--health/health.d/riakkv.conf98
-rw-r--r--health/health.d/scaleio.conf33
-rw-r--r--health/health.d/softnet.conf57
-rw-r--r--health/health.d/swap.conf37
-rw-r--r--health/health.d/synchronization.conf13
-rw-r--r--health/health.d/systemdunits.conf161
-rw-r--r--health/health.d/tcp_conn.conf23
-rw-r--r--health/health.d/tcp_listen.conf100
-rw-r--r--health/health.d/tcp_mem.conf24
-rw-r--r--health/health.d/tcp_orphans.conf25
-rw-r--r--health/health.d/tcp_resets.conf71
-rw-r--r--health/health.d/timex.conf18
-rw-r--r--health/health.d/udp_errors.conf40
-rw-r--r--health/health.d/unbound.conf30
-rw-r--r--health/health.d/upsd.conf50
-rw-r--r--health/health.d/vcsa.conf230
-rw-r--r--health/health.d/vernemq.conf391
-rw-r--r--health/health.d/vsphere.conf70
-rw-r--r--health/health.d/web_log.conf205
-rw-r--r--health/health.d/whoisquery.conf14
-rw-r--r--health/health.d/windows.conf126
-rw-r--r--health/health.d/x509check.conf26
-rw-r--r--health/health.d/zfs.conf44
79 files changed, 5137 insertions, 0 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
new file mode 100644
index 00000000..1f184049
--- /dev/null
+++ b/health/health.d/adaptec_raid.conf
@@ -0,0 +1,32 @@
+
+# logical device status check
+
+ template: adaptec_raid_ld_status
+ on: adaptec_raid.ld_status
+ class: Errors
+ type: System
+component: RAID
+ lookup: max -10s foreach *
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Adaptec raid logical device status
+ info: Logical device status is failed or degraded
+ to: sysadmin
+
+# physical device state check
+
+ template: adaptec_raid_pd_state
+ on: adaptec_raid.pd_state
+ class: Errors
+ type: System
+component: RAID
+ lookup: max -10s foreach *
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Adaptec raid physical device state
+ info: Physical device state is not online
+ to: sysadmin
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
new file mode 100644
index 00000000..269ae544
--- /dev/null
+++ b/health/health.d/anomalies.conf
@@ -0,0 +1,23 @@
+# raise a warning alarm if an anomaly probability is consistently above 50%
+
+ template: anomalies_anomaly_probabilities
+ on: anomalies.probability
+ class: Errors
+ type: Netdata
+component: ML
+ lookup: average -2m foreach *
+ every: 1m
+ warn: $this > 50
+ info: average anomaly probability over the last 2 minutes
+
+# raise a warning alarm if an anomaly flag is consistently firing
+
+ template: anomalies_anomaly_flags
+ on: anomalies.anomaly
+ class: Errors
+ type: Netdata
+component: ML
+ lookup: sum -2m foreach *
+ every: 1m
+ warn: $this > 10
+ info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
new file mode 100644
index 00000000..90a72af1
--- /dev/null
+++ b/health/health.d/apcupsd.conf
@@ -0,0 +1,125 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: apcupsd_10min_ups_load
+ on: apcupsd.load
+ class: Utilization
+ type: Power Supply
+component: UPS
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of percentage
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ delay: down 10m multiplier 1.5 max 1h
+ summary: APC UPS load
+ info: APC UPS average load over the last 10 minutes
+ to: sitemgr
+
+# Discussion in https://github.com/netdata/netdata/pull/3928:
+# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
+ template: apcupsd_ups_charge
+ on: apcupsd.charge
+ class: Errors
+ type: Power Supply
+component: UPS
+ os: *
+ hosts: *
+ lookup: average -60s unaligned of charge
+ units: %
+ every: 60s
+ warn: $this < 100
+ crit: $this < 40
+ delay: down 10m multiplier 1.5 max 1h
+ summary: APC UPS battery charge
+ info: APC UPS average battery charge over the last minute
+ to: sitemgr
+
+ template: apcupsd_last_collected_secs
+ on: apcupsd.load
+ class: Latency
+ type: Power Supply
+component: UPS device
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: APC UPS last collection
+ info: APC UPS number of seconds since the last successful data collection
+ to: sitemgr
+
+#Send out a warning when SELFTEST code is BT or NG. Code descriptions can be found at:
+#http://www.apcupsd.org/manual/#:~:text=or%20N/A.-,SELFTEST,-The%20results%20of
+ template: apcupsd_selftest_warning
+ on: apcupsd.selftest
+ lookup: max -1s unaligned match-names of BT,NG
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS self-test failed due to insufficient battery capacity or due to overload.
+ to: sitemgr
+
+#Send out a warning when STATUS code is ONBATT,OVERLOAD,LOWBATT,REPLACEBATT,NOBATT,COMMLOST
+#https://man.archlinux.org/man/apcaccess.8.en#:~:text=apcupsd%20was%20started-,STATUS,-%3A%20UPS%20status.%20One
+
+ template: apcupsd_status_onbatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of ONBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: APC UPS has switched to battery power because the input power has failed
+ to: sitemgr
+
+ template: apcupsd_status_overload
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of OVERLOAD
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS is overloaded and cannot supply enough power to the load
+ to: sitemgr
+
+ template: apcupsd_status_lowbatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of LOWBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS battery is low and needs to be recharged
+ to: sitemgr
+
+ template: apcupsd_status_replacebatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of REPLACEBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS battery has reached the end of its lifespan and needs to be replaced
+ to: sitemgr
+
+ template: apcupsd_status_nobatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of NOBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS has no battery
+ to: sitemgr
+
+ template: apcupsd_status_commlost
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of COMMLOST
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS communication link is lost
+ to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
new file mode 100644
index 00000000..44617342
--- /dev/null
+++ b/health/health.d/bcache.conf
@@ -0,0 +1,31 @@
+
+ template: bcache_cache_errors
+ on: disk.bcache_cache_read_races
+ class: Errors
+ type: System
+component: Disk
+ lookup: sum -1m unaligned absolute
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: up 2m down 1h multiplier 1.5 max 2h
+ summary: Bcache cache read race errors
+ info: Number of times data was read from the cache, \
+ the bucket was reused and invalidated in the last minute \
+ (when this occurs the data is reread from the backing device)
+ to: silent
+
+ template: bcache_cache_dirty
+ on: disk.bcache_cache_alloc
+ class: Utilization
+ type: System
+component: Disk
+ calc: $dirty + $metadata + $undefined
+ units: %
+ every: 1m
+ warn: $this > 75
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ summary: Bcache cache used space
+ info: Percentage of cache space used for dirty data and metadata \
+ (this usually means your SSD cache is too small)
+ to: silent
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
new file mode 100644
index 00000000..0d37f28e
--- /dev/null
+++ b/health/health.d/beanstalkd.conf
@@ -0,0 +1,41 @@
+# get the number of buried jobs in all queues
+
+ template: beanstalk_server_buried_jobs
+ on: beanstalk.current_jobs
+ class: Workload
+ type: Messaging
+component: Beanstalk
+ calc: $buried
+ units: jobs
+ every: 10s
+ warn: $this > 3
+ delay: up 0 down 5m multiplier 1.2 max 1h
+ summary: Beanstalk buried jobs
+ info: Number of buried jobs across all tubes. \
+ You need to manually kick them so they can be processed. \
+ Presence of buried jobs in a tube does not affect new jobs.
+ to: sysadmin
+
+# get the number of buried jobs per queue
+
+#template: beanstalk_tube_buried_jobs
+# on: beanstalk.jobs
+# calc: $buried
+# units: jobs
+# every: 10s
+# warn: $this > 0
+# crit: $this > 10
+# delay: up 0 down 5m multiplier 1.2 max 1h
+# info: the number of jobs buried per tube
+# to: sysadmin
+
+# get the current number of tubes
+
+#template: beanstalk_number_of_tubes
+# on: beanstalk.current_tubes
+# calc: $tubes
+# every: 10s
+# warn: $this < 5
+# delay: up 0 down 5m multiplier 1.2 max 1h
+# info: the current number of tubes on the server
+# to: sysadmin
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
new file mode 100644
index 00000000..b1c271df
--- /dev/null
+++ b/health/health.d/bind_rndc.conf
@@ -0,0 +1,12 @@
+ template: bind_rndc_stats_file_size
+ on: bind_rndc.stats_size
+ class: Utilization
+ type: DNS
+component: BIND
+ units: megabytes
+ every: 60
+ calc: $stats_size
+ warn: $this > 512
+ summary: BIND statistics file size
+ info: BIND statistics-file size
+ to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
new file mode 100644
index 00000000..092a5684
--- /dev/null
+++ b/health/health.d/boinc.conf
@@ -0,0 +1,70 @@
+# Alarms for various BOINC issues.
+
+# Warn on any compute errors encountered.
+ template: boinc_compute_errors
+ on: boinc.states
+ class: Errors
+ type: Computing
+component: BOINC
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of comperror
+ units: tasks
+ every: 1m
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ summary: BOINC compute errors
+ info: Average number of compute errors over the last 10 minutes
+ to: sysadmin
+
+# Warn on lots of upload errors
+ template: boinc_upload_errors
+ on: boinc.states
+ class: Errors
+ type: Computing
+component: BOINC
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of upload_failed
+ units: tasks
+ every: 1m
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ summary: BOINC failed uploads
+ info: Average number of failed uploads over the last 10 minutes
+ to: sysadmin
+
+# Warn on the task queue being empty
+ template: boinc_total_tasks
+ on: boinc.tasks
+ class: Utilization
+ type: Computing
+component: BOINC
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of total
+ units: tasks
+ every: 1m
+ warn: $this < 1
+ delay: up 5m down 10m multiplier 1.5 max 1h
+ summary: BOINC total tasks
+ info: Average number of total tasks over the last 10 minutes
+ to: sysadmin
+
+# Warn on no active tasks with a non-empty queue
+ template: boinc_active_tasks
+ on: boinc.tasks
+ class: Utilization
+ type: Computing
+component: BOINC
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of active
+ calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+ units: tasks
+ every: 1m
+ warn: $this < 1
+ delay: up 5m down 10m multiplier 1.5 max 1h
+ summary: BOINC active tasks
+ info: Average number of active tasks over the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
new file mode 100644
index 00000000..1557a594
--- /dev/null
+++ b/health/health.d/btrfs.conf
@@ -0,0 +1,142 @@
+
+ template: btrfs_allocated
+ on: btrfs.disk
+ class: Utilization
+ type: System
+component: File system
+ os: *
+ hosts: *
+ calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (95) : (98))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS allocated space utilization
+ info: Percentage of allocated BTRFS physical disk space
+ to: silent
+
+ template: btrfs_data
+ on: btrfs.data
+ class: Utilization
+ type: System
+component: File system
+ os: *
+ hosts: *
+ calc: $used * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
+ crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS data space utilization
+ info: Utilization of BTRFS data space
+ to: sysadmin
+
+ template: btrfs_metadata
+ on: btrfs.metadata
+ class: Utilization
+ type: System
+component: File system
+ os: *
+ hosts: *
+ calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
+ crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS metadata space utilization
+ info: Utilization of BTRFS metadata space
+ to: sysadmin
+
+ template: btrfs_system
+ on: btrfs.system
+ class: Utilization
+ type: System
+component: File system
+ os: *
+ hosts: *
+ calc: $used * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
+ crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS system space utilization
+ info: Utilization of BTRFS system space
+ to: sysadmin
+
+ template: btrfs_device_read_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ units: errors
+ lookup: max -10m every 1m of read_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS device read errors
+ info: Number of encountered BTRFS read errors
+ to: sysadmin
+
+ template: btrfs_device_write_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ units: errors
+ lookup: max -10m every 1m of write_errs
+ crit: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS device write errors
+ info: Number of encountered BTRFS write errors
+ to: sysadmin
+
+ template: btrfs_device_flush_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ units: errors
+ lookup: max -10m every 1m of flush_errs
+ crit: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS device flush errors
+ info: Number of encountered BTRFS flush errors
+ to: sysadmin
+
+ template: btrfs_device_corruption_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ units: errors
+ lookup: max -10m every 1m of corruption_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS device corruption errors
+ info: Number of encountered BTRFS corruption errors
+ to: sysadmin
+
+ template: btrfs_device_generation_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ units: errors
+ lookup: max -10m every 1m of generation_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: BTRFS device generation errors
+ info: Number of encountered BTRFS generation errors
+ to: sysadmin
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
new file mode 100644
index 00000000..44d35133
--- /dev/null
+++ b/health/health.d/ceph.conf
@@ -0,0 +1,16 @@
+# warn when the Ceph cluster is running low on available disk space
+
+ template: ceph_cluster_space_usage
+ on: ceph.general_usage
+ class: Utilization
+ type: Storage
+component: Ceph
+ calc: $used * 100 / ($used + $avail)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 5m multiplier 1.2 max 1h
+ summary: Ceph cluster disk space utilization
+ info: Ceph cluster disk space utilization
+ to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
new file mode 100644
index 00000000..9c55633e
--- /dev/null
+++ b/health/health.d/cgroups.conf
@@ -0,0 +1,72 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: cgroup_10min_cpu_usage
+ on: cgroup.cpu_limit
+ class: Utilization
+ type: Cgroups
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Cgroup ${label:cgroup_name} CPU utilization
+ info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes
+ to: silent
+
+ template: cgroup_ram_in_use
+ on: cgroup.mem_usage
+ class: Utilization
+ type: Cgroups
+component: Memory
+ os: linux
+ hosts: *
+ calc: ($ram) * 100 / $memory_limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Cgroup ${label:cgroup_name} memory utilization
+ info: Cgroup ${label:cgroup_name} memory utilization
+ to: silent
+
+# ---------------------------------K8s containers--------------------------------------------
+
+ template: k8s_cgroup_10min_cpu_usage
+ on: k8s.cgroup.cpu_limit
+ class: Utilization
+ type: Cgroups
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization
+ info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+ average CPU utilization over the last 10 minutes
+ to: silent
+
+ template: k8s_cgroup_ram_in_use
+ on: k8s.cgroup.mem_usage
+ class: Utilization
+ type: Cgroups
+component: Memory
+ os: linux
+ hosts: *
+ calc: ($ram) * 100 / $memory_limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization
+ info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+ memory utilization
+ to: silent
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
new file mode 100644
index 00000000..60f17835
--- /dev/null
+++ b/health/health.d/cockroachdb.conf
@@ -0,0 +1,78 @@
+
+# Capacity
+
+ template: cockroachdb_used_storage_capacity
+ on: cockroachdb.storage_used_capacity_percentage
+ class: Utilization
+ type: Database
+component: CockroachDB
+ calc: $total
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: CockroachDB storage space utilization
+ info: Storage capacity utilization
+ to: dba
+
+ template: cockroachdb_used_usable_storage_capacity
+ on: cockroachdb.storage_used_capacity_percentage
+ class: Utilization
+ type: Database
+component: CockroachDB
+ calc: $usable
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: CockroachDB usable storage space utilization
+ info: Storage usable space utilization
+ to: dba
+
+# Replication
+
+ template: cockroachdb_unavailable_ranges
+ on: cockroachdb.ranges_replication_problem
+ class: Errors
+ type: Database
+component: CockroachDB
+ calc: $unavailable
+ units: num
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: CockroachDB unavailable replication
+ info: Number of ranges with fewer live replicas than needed for quorum
+ to: dba
+
+ template: cockroachdb_underreplicated_ranges
+ on: cockroachdb.ranges_replication_problem
+ class: Errors
+ type: Database
+component: CockroachDB
+ calc: $under_replicated
+ units: num
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: CockroachDB under-replicated
+ info: Number of ranges with fewer live replicas than the replication target
+ to: dba
+
+# FD
+
+ template: cockroachdb_open_file_descriptors_limit
+ on: cockroachdb.process_file_descriptors
+ class: Utilization
+ type: Database
+component: CockroachDB
+ calc: $open/$sys_fd_softlimit * 100
+ units: %
+ every: 10s
+ warn: $this > 80
+ delay: down 15m multiplier 1.5 max 1h
+ summary: CockroachDB file descriptors utilization
+ info: Open file descriptors utilization (against softlimit)
+ to: dba
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
new file mode 100644
index 00000000..8b414a26
--- /dev/null
+++ b/health/health.d/consul.conf
@@ -0,0 +1,171 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: consul_license_expiration_time
+ on: consul.license_expiration_time
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $license_expiration
+ every: 60m
+ units: seconds
+ warn: $this < 14*24*60*60
+ crit: $this < 7*24*60*60
+ summary: Consul license expiration on ${label:node_name}
+ info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_autopilot_health_status
+ on: consul.autopilot_health_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $unhealthy
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul datacenter ${label:datacenter} health
+ info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
+ to: sysadmin
+
+ template: consul_autopilot_server_health_status
+ on: consul.autopilot_server_health_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $unhealthy
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} health
+ info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
+ to: sysadmin
+
+ template: consul_raft_leader_last_contact_time
+ on: consul.raft_leader_last_contact_time
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: average -1m unaligned of quantile_0.5
+ every: 10s
+ units: milliseconds
+ warn: $this > (($status >= $WARNING) ? (150) : (200))
+ crit: $this > (($status == $CRITICAL) ? (200) : (500))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul leader server ${label:node_name} last contact time
+ info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
+ to: sysadmin
+
+ template: consul_raft_leadership_transitions
+ on: consul.raft_leadership_transitions_rate
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: transitions
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} leadership transitions
+ info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
+ to: sysadmin
+
+ template: consul_raft_thread_main_saturation
+ on: consul.raft_thread_main_saturation_perc
+ class: Utilization
+ type: ServiceMesh
+component: Consul
+ lookup: average -1m unaligned of quantile_0.9
+ every: 10s
+ units: percentage
+ warn: $this > (($status >= $WARNING) ? (40) : (50))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} main Raft saturation
+ info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_raft_thread_fsm_saturation
+ on: consul.raft_thread_fsm_saturation_perc
+ class: Utilization
+ type: ServiceMesh
+component: Consul
+ lookup: average -1m unaligned of quantile_0.9
+ every: 10s
+ units: percentage
+ warn: $this > (($status >= $WARNING) ? (40) : (50))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} FSM Raft saturation
+ info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_client_rpc_requests_exceeded
+ on: consul.client_rpc_requests_exceeded_rate
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: requests
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} RPC requests rate
+ info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_client_rpc_requests_failed
+ on: consul.client_rpc_requests_failed_rate
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: requests
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} failed RPC requests
+ info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_node_health_check_status
+ on: consul.node_health_check_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $warning + $critical
+ every: 10s
+ units: status
+ warn: $this != nan AND $this != 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul node health check ${label:check_name} on ${label:node_name}
+ info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_service_health_check_status
+ on: consul.service_health_check_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $warning + $critical
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
+ info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_gc_pause_time
+ on: consul.gc_pause_time
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: seconds
+ warn: $this > (($status >= $WARNING) ? (1) : (2))
+ crit: $this > (($status == $CRITICAL) ? (2) : (5))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} garbage collection pauses
+ info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
new file mode 100644
index 00000000..0b007d6b
--- /dev/null
+++ b/health/health.d/cpu.conf
@@ -0,0 +1,69 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: 10min_cpu_usage
+ on: system.cpu
+ class: Utilization
+ type: System
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned of user,system,softirq,irq,guest
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System CPU utilization
+ info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
+ to: silent
+
+ template: 10min_cpu_iowait
+ on: system.cpu
+ class: Utilization
+ type: System
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned of iowait
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (20) : (40))
+ delay: up 30m down 30m multiplier 1.5 max 2h
+ summary: System CPU iowait time
+ info: Average CPU iowait time over the last 10 minutes
+ to: silent
+
+ template: 20min_steal_cpu
+ on: system.cpu
+ class: Latency
+ type: System
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -20m unaligned of steal
+ units: %
+ every: 5m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System CPU steal time
+ info: Average CPU steal time over the last 20 minutes
+ to: silent
+
+## FreeBSD
+ template: 10min_cpu_usage
+ on: system.cpu
+ class: Utilization
+ type: System
+component: CPU
+ os: freebsd
+ hosts: *
+ lookup: average -10m unaligned of user,system,interrupt
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System CPU utilization
+ info: Average CPU utilization over the last 10 minutes (excluding nice)
+ to: silent
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
new file mode 100644
index 00000000..0a70d2e8
--- /dev/null
+++ b/health/health.d/dbengine.conf
@@ -0,0 +1,68 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 10min_dbengine_global_fs_errors
+ on: netdata.dbengine_global_errors
+ class: Errors
+ type: Netdata
+component: DB engine
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of fs_errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Netdata DBengine filesystem errors
+ info: Number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_io_errors
+ on: netdata.dbengine_global_errors
+ class: Errors
+ type: Netdata
+component: DB engine
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of io_errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ summary: Netdata DBengine IO errors
+ info: Number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_warnings
+ on: netdata.dbengine_global_errors
+ class: Errors
+ type: Netdata
+component: DB engine
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
+ units: errors
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ summary: Netdata DBengine global flushing warnings
+ info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+ Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+ on: netdata.dbengine_long_term_page_stats
+ class: Errors
+ type: Netdata
+component: DB engine
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of flushing_pressure_deletions
+ units: pages
+ every: 10s
+ crit: $this != 0
+ delay: down 1h multiplier 1.5 max 3h
+ summary: Netdata DBengine global flushing errors
+ info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+ Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
+ to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
new file mode 100644
index 00000000..2e417fd4
--- /dev/null
+++ b/health/health.d/disks.conf
@@ -0,0 +1,172 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+
+# -----------------------------------------------------------------------------
+# low disk space
+
+# checking the latest collected values
+# raise an alarm if the disk is low on
+# available disk space
+
+ template: disk_space_usage
+ on: disk.space
+ class: Utilization
+ type: System
+component: Disk
+ os: linux freebsd
+ hosts: *
+chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (80) : (90))
+ crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: Disk ${label:mount_point} space usage
+ info: Total space utilization of disk ${label:mount_point}
+ to: sysadmin
+
+ template: disk_inode_usage
+ on: disk.inodes
+ class: Utilization
+ type: System
+component: Disk
+ os: linux freebsd
+ hosts: *
+chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ summary: Disk ${label:mount_point} inode usage
+ info: Total inode utilization of disk ${label:mount_point}
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk fill rate
+
+# calculate the rate the disk fills
+# use as base, the available space change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+template: disk_fill_rate
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: GB/hour
+ info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+
+# calculate the hours remaining
+# if the disk continues to fill
+# at this rate
+
+template: out_of_disk_space_time
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+ calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ summary: Disk ${label:mount_point} estimation of lack of space
+ info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour
+ to: silent
+
+
+# -----------------------------------------------------------------------------
+# disk inode fill rate
+
+# calculate the rate the disk inodes are allocated
+# use as base, the available inodes change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+template: disk_inode_rate
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: inodes/hour
+ info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+
+# calculate the hours remaining
+# if the disk inodes are allocated
+# at this rate
+
+template: out_of_disk_inodes_time
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+ calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ summary: Disk ${label:mount_point} estimation of lack of inodes
+ info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+ to: silent
+
+
+# -----------------------------------------------------------------------------
+# disk congestion
+
+# raise an alarm if the disk is congested
+# by calculating the average disk utilization
+# for the last 10 minutes
+
+ template: 10min_disk_utilization
+ on: disk.util
+ class: Utilization
+ type: System
+component: Disk
+ os: linux freebsd
+ hosts: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
+ summary: Disk ${label:device} utilization
+ info: Average percentage of time ${label:device} disk was busy over the last 10 minutes
+ to: silent
+
+
+# raise an alarm if the disk backlog
+# is above 5000ms (5s) per second
+# for 10 minutes
+# (i.e. the disk cannot catch up)
+
+ template: 10min_disk_backlog
+ on: disk.backlog
+ class: Latency
+ type: System
+component: Disk
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned
+ units: ms
+ every: 1m
+ warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
+ summary: Disk ${label:device} backlog
+ info: Average backlog size of the ${label:device} disk over the last 10 minutes
+ to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
new file mode 100644
index 00000000..756c6a1b
--- /dev/null
+++ b/health/health.d/dns_query.conf
@@ -0,0 +1,15 @@
+# detect dns query failure
+
+ template: dns_query_query_status
+ on: dns_query.query_status
+ class: Errors
+ type: DNS
+component: DNS
+ calc: $success
+ units: status
+ every: 10s
+ warn: $this != nan && $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ summary: DNS query unsuccessful requests to ${label:server}
+ info: DNS request type ${label:record_type} to server ${label:server} is unsuccessful
+ to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
new file mode 100644
index 00000000..f6ef0194
--- /dev/null
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -0,0 +1,15 @@
+# dhcp-range utilization
+
+ template: dnsmasq_dhcp_dhcp_range_utilization
+ on: dnsmasq_dhcp.dhcp_range_utilization
+ class: Utilization
+ type: DHCP
+component: Dnsmasq
+ every: 10s
+ units: %
+ calc: $used
+ warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+ delay: down 5m
+ summary: Dnsmasq DHCP range ${label:dhcp_range} utilization
+ info: DHCP range ${label:dhcp_range} utilization
+ to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
new file mode 100644
index 00000000..668614d4
--- /dev/null
+++ b/health/health.d/docker.conf
@@ -0,0 +1,12 @@
+ template: docker_container_unhealthy
+ on: docker.container_health_status
+ class: Errors
+ type: Containers
+component: Docker
+ units: status
+ every: 10s
+ lookup: average -10s of unhealthy
+ warn: $this > 0
+ summary: Docker container ${label:container_name} health
+ info: ${label:container_name} docker container health status is unhealthy
+ to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
new file mode 100644
index 00000000..600840c5
--- /dev/null
+++ b/health/health.d/elasticsearch.conf
@@ -0,0 +1,78 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# 'red' is a threshold keyword in alert definitions, so the 'red' dimension can't be looked up by name - the simple pattern '*ed' is used as a workaround.
+
+ template: elasticsearch_cluster_health_status_red
+ on: elasticsearch.cluster_health_status
+ class: Errors
+ type: SearchEngine
+component: Elasticsearch
+ lookup: average -5s unaligned of *ed
+ every: 10s
+ units: status
+ crit: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Elasticsearch cluster ${label:cluster_name} status
+ info: Elasticsearch cluster ${label:cluster_name} health status is red.
+ to: sysadmin
+
+# the idea of '-10m' is to handle yellow status after node restart,
+# (usually) no action is required because Elasticsearch will automatically restore the green status.
+ template: elasticsearch_cluster_health_status_yellow
+ on: elasticsearch.cluster_health_status
+ class: Errors
+ type: SearchEngine
+component: Elasticsearch
+ lookup: average -10m unaligned of yellow
+ every: 1m
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Elasticsearch cluster ${label:cluster_name} status
+ info: Elasticsearch cluster ${label:cluster_name} health status is yellow.
+ to: sysadmin
+
+ template: elasticsearch_node_index_health_red
+ on: elasticsearch.node_index_health
+ class: Errors
+ type: SearchEngine
+component: Elasticsearch
+ lookup: average -5s unaligned of *ed
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Elasticsearch cluster ${label:cluster_name} index ${label:index} status
+ info: Elasticsearch cluster ${label:cluster_name} index ${label:index} health status is red.
+ to: sysadmin
+
+# don't convert 'lookup' value to seconds in 'calc' due to UI showing seconds as hh:mm:ss (0 as now).
+
+ template: elasticsearch_node_indices_search_time_query
+ on: elasticsearch.node_indices_search_time
+ class: Workload
+ type: SearchEngine
+component: Elasticsearch
+ lookup: average -10m unaligned of query
+ every: 10s
+ units: milliseconds
+ warn: $this > (($status >= $WARNING) ? (20 * 1000) : (30 * 1000))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} query performance
+ info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, queries run slowly.
+ to: sysadmin
+
+ template: elasticsearch_node_indices_search_time_fetch
+ on: elasticsearch.node_indices_search_time
+ class: Workload
+ type: SearchEngine
+component: Elasticsearch
+ lookup: average -10m unaligned of fetch
+ every: 10s
+ units: milliseconds
+ warn: $this > (($status >= $WARNING) ? (3 * 1000) : (5 * 1000))
+ crit: $this > (($status == $CRITICAL) ? (5 * 1000) : (30 * 1000))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} fetch performance
+ info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, fetches run slowly.
+ to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
new file mode 100644
index 00000000..be8b1fe4
--- /dev/null
+++ b/health/health.d/entropy.conf
@@ -0,0 +1,20 @@
+
+# check if entropy is too low
+# the alarm is checked every 5 minutes
+# and examines the last 5 minutes of data
+
+ alarm: lowest_entropy
+ on: system.entropy
+ class: Utilization
+ type: System
+component: Cryptography
+ os: linux
+ hosts: *
+ lookup: min -5m unaligned
+ units: entries
+ every: 5m
+ warn: $this < (($status >= $WARNING) ? (200) : (100))
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System entropy pool number of entries
+ info: Minimum number of entries in the random numbers pool in the last 5 minutes
+ to: silent
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
new file mode 100644
index 00000000..c0320193
--- /dev/null
+++ b/health/health.d/exporting.conf
@@ -0,0 +1,29 @@
+
+ template: exporting_last_buffering
+ on: netdata.exporting_data_size
+ class: Latency
+ type: Netdata
+component: Exporting engine
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Netdata exporting data last successful buffering
+ info: Number of seconds since the last successful buffering of exporting data
+ to: dba
+
+ template: exporting_metrics_sent
+ on: netdata.exporting_data_size
+ class: Workload
+ type: Netdata
+component: Exporting engine
+ units: %
+ calc: abs($sent) * 100 / abs($buffered)
+ every: 10s
+ warn: $this != 100
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Netdata exporting metrics sent
+ info: Percentage of metrics sent to the external database server
+ to: dba
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
new file mode 100644
index 00000000..20a592d6
--- /dev/null
+++ b/health/health.d/file_descriptors.conf
@@ -0,0 +1,33 @@
+ # you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: system_file_descriptors_utilization
+ on: system.file_nr_utilization
+ class: Utilization
+ type: System
+ component: Processes
+ hosts: *
+ lookup: max -1m unaligned
+ units: %
+ every: 1m
+ crit: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System open file descriptors utilization
+ info: System-wide utilization of open files
+ to: sysadmin
+
+ template: apps_group_file_descriptors_utilization
+ on: app.fds_open_limit
+ class: Utilization
+ type: System
+component: Process
+ os: linux
+ module: *
+ hosts: *
+ lookup: max -10s unaligned foreach *
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: App group ${label:app_group} file descriptors utilization
+ info: Open files percentage against the processes limits, among all PIDs in application group
+ to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
new file mode 100644
index 00000000..78e1165d
--- /dev/null
+++ b/health/health.d/gearman.conf
@@ -0,0 +1,14 @@
+
+ template: gearman_workers_queued
+ on: gearman.single_job
+ class: Latency
+ type: Computing
+component: Gearman
+ lookup: average -10m unaligned match-names of Pending
+ units: workers
+ every: 10s
+ warn: $this > 30000
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Gearman queued jobs
+ info: Average number of queued jobs over the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
new file mode 100644
index 00000000..361b6b41
--- /dev/null
+++ b/health/health.d/geth.conf
@@ -0,0 +1,11 @@
+# chainhead_header is expected to be momentarily ahead. If it's considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync.
+ template: geth_chainhead_diff_between_header_block
+ on: geth.chainhead
+ class: Workload
+ type: ethereum_node
+component: geth
+ every: 10s
+ calc: $chain_head_block - $chain_head_header
+ units: blocks
+ warn: $this != 0
+ delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
new file mode 100644
index 00000000..7796a1bc
--- /dev/null
+++ b/health/health.d/go.d.plugin.conf
@@ -0,0 +1,18 @@
+
+# make sure go.d.plugin data collection job is running
+
+ template: go.d_job_last_collected_secs
+ on: netdata.go_plugin_execution_time
+ class: Errors
+ type: Netdata
+component: go.d.plugin
+ module: !* *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Go.d plugin last collection
+ info: Number of seconds since the last successful data collection
+ to: webmaster
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
new file mode 100644
index 00000000..66a488fa
--- /dev/null
+++ b/health/health.d/haproxy.conf
@@ -0,0 +1,25 @@
+ template: haproxy_backend_server_status
+ on: haproxy_hs.down
+ class: Errors
+ type: Web Proxy
+component: HAProxy
+ units: failed servers
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ summary: HAProxy server status
+ info: Average number of failed haproxy backend servers over the last 10 seconds
+ to: sysadmin
+
+ template: haproxy_backend_status
+ on: haproxy_hb.down
+ class: Errors
+ type: Web Proxy
+component: HAProxy
+ units: failed backend
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ summary: HAProxy backend status
+ info: Average number of failed haproxy backends over the last 10 seconds
+ to: sysadmin
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
new file mode 100644
index 00000000..566e815a
--- /dev/null
+++ b/health/health.d/hdfs.conf
@@ -0,0 +1,81 @@
+
+# Common
+
+ template: hdfs_capacity_usage
+ on: hdfs.capacity
+ class: Utilization
+ type: Storage
+component: HDFS
+ calc: ($used) * 100 / ($used + $remaining)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS datanodes space utilization
+ info: summary datanodes space capacity utilization
+ to: sysadmin
+
+
+# NameNode
+
+ template: hdfs_missing_blocks
+ on: hdfs.blocks
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $missing
+ units: missing blocks
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS missing blocks
+ info: number of missing blocks
+ to: sysadmin
+
+
+ template: hdfs_stale_nodes
+ on: hdfs.data_nodes
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $stale
+ units: stale nodes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS stale datanodes
+ info: number of datanodes marked stale due to delayed heartbeat
+ to: sysadmin
+
+
+ template: hdfs_dead_nodes
+ on: hdfs.data_nodes
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $dead
+ units: dead nodes
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS dead datanodes
+ info: number of datanodes which are currently dead
+ to: sysadmin
+
+
+# DataNode
+
+ template: hdfs_num_failed_volumes
+ on: hdfs.num_failed_volumes
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $fsds_num_failed_volumes
+ units: failed volumes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS failed volumes
+ info: number of failed volumes
+ to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
new file mode 100644
index 00000000..da5dec79
--- /dev/null
+++ b/health/health.d/httpcheck.conf
@@ -0,0 +1,73 @@
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+ template: httpcheck_web_service_up
+ on: httpcheck.status
+ class: Utilization
+ type: Web Server
+component: HTTP endpoint
+ lookup: average -1m unaligned percentage of success
+ calc: ($this < 75) ? (0) : ($this)
+ every: 5s
+ units: up/down
+ info: HTTP check endpoint ${label:url} liveness status
+ to: silent
+
+ template: httpcheck_web_service_bad_content
+ on: httpcheck.status
+ class: Workload
+ type: Web Server
+component: HTTP endpoint
+ lookup: average -5m unaligned percentage of bad_content
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ summary: HTTP check for ${label:url} unexpected content
+ info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes
+ to: webmaster
+
+ template: httpcheck_web_service_bad_status
+ on: httpcheck.status
+ class: Workload
+ type: Web Server
+component: HTTP endpoint
+ lookup: average -5m unaligned percentage of bad_status
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ summary: HTTP check for ${label:url} unexpected status
+ info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes
+ to: webmaster
+
+ template: httpcheck_web_service_timeouts
+ on: httpcheck.status
+ class: Latency
+ type: Web Server
+component: HTTP endpoint
+ lookup: average -5m unaligned percentage of timeout
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ summary: HTTP check for ${label:url} timeouts
+ info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes
+ to: webmaster
+
+ template: httpcheck_web_service_no_connection
+ on: httpcheck.status
+ class: Errors
+ type: Other
+component: HTTP endpoint
+ lookup: average -5m unaligned percentage of no_connection
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ summary: HTTP check for ${label:url} failed requests
+ info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes
+ to: webmaster
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
new file mode 100644
index 00000000..6d832bf0
--- /dev/null
+++ b/health/health.d/ioping.conf
@@ -0,0 +1,14 @@
+ template: ioping_disk_latency
+ on: ioping.latency
+ class: Latency
+ type: System
+component: Disk
+ lookup: average -10s unaligned of latency
+ units: microseconds
+ every: 10s
+ green: 10000
+ warn: $this > $green
+ delay: down 30m multiplier 1.5 max 2h
+ summary: IO ping latency
+ info: Average I/O latency over the last 10 seconds
+ to: silent
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
new file mode 100644
index 00000000..f77f5606
--- /dev/null
+++ b/health/health.d/ipc.conf
@@ -0,0 +1,34 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: semaphores_used
+ on: system.ipc_semaphores
+ class: Utilization
+ type: System
+component: IPC
+ os: linux
+ hosts: *
+ calc: $semaphores * 100 / $ipc_semaphores_max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: IPC semaphores used
+ info: IPC semaphore utilization
+ to: sysadmin
+
+ alarm: semaphore_arrays_used
+ on: system.ipc_semaphore_arrays
+ class: Utilization
+ type: System
+component: IPC
+ os: linux
+ hosts: *
+ calc: $arrays * 100 / $ipc_semaphores_arrays_max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: IPC semaphore arrays used
+ info: IPC semaphore arrays utilization
+ to: sysadmin
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
new file mode 100644
index 00000000..4dfee3c7
--- /dev/null
+++ b/health/health.d/ipfs.conf
@@ -0,0 +1,15 @@
+
+ template: ipfs_datastore_usage
+ on: ipfs.repo_size
+ class: Utilization
+ type: Data Sharing
+component: IPFS
+ calc: $size * 100 / $avail
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: IPFS datastore utilization
+ info: IPFS datastore utilization
+ to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
new file mode 100644
index 00000000..cec2320a
--- /dev/null
+++ b/health/health.d/ipmi.conf
@@ -0,0 +1,28 @@
+ template: ipmi_sensor_state
+ on: ipmi.sensor_state
+ class: Errors
+ type: System
+component: IPMI
+ calc: $warning + $critical
+ units: state
+ every: 10s
+ warn: $warning > 0
+ crit: $critical > 0
+ delay: up 5m down 15m multiplier 1.5 max 1h
+ summary: IPMI sensor ${label:sensor} state
+ info: IPMI sensor ${label:sensor} (${label:component}) state
+ to: sysadmin
+
+ alarm: ipmi_events
+ on: ipmi.events
+ class: Utilization
+ type: System
+component: IPMI
+ calc: $events
+ units: events
+ every: 30s
+ warn: $this > 0
+ delay: up 5m down 15m multiplier 1.5 max 1h
+ summary: IPMI entries in System Event Log
+ info: number of events in the IPMI System Event Log (SEL)
+ to: silent
diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
new file mode 100644
index 00000000..d1f93969
--- /dev/null
+++ b/health/health.d/isc_dhcpd.conf
@@ -0,0 +1,10 @@
+# template: isc_dhcpd_leases_size
+# on: isc_dhcpd.leases_total
+# units: KB
+# every: 60
+# calc: $leases_size
+# warn: $this > 3072
+# crit: $this > 6144
+# delay: up 2m down 5m
+# info: dhcpd.leases file too big! Module can slow down your server.
+# to: sysadmin
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
new file mode 100644
index 00000000..8adf5f7d
--- /dev/null
+++ b/health/health.d/kubelet.conf
@@ -0,0 +1,151 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+
+# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
+
+ template: kubelet_node_config_error
+ on: k8s_kubelet.kubelet_node_config_error
+ class: Errors
+ type: Kubernetes
+component: Kubelet
+ calc: $experiencing_error
+ units: bool
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Kubelet node config error
+ info: The node is experiencing a configuration-related error (0: false, 1: true)
+ to: sysadmin
+
+# Failed Token() requests to the alternate token source
+
+ template: kubelet_token_requests
+ on: k8s_kubelet.kubelet_token_requests
+ class: Errors
+ type: Kubernetes
+component: Kubelet
+ lookup: sum -10s of failed
+ units: requests
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Kubelet failed token requests
+ info: Number of failed Token() requests to the alternate token source
+ to: sysadmin
+
+# Docker and runtime operation errors
+
+ template: kubelet_operations_error
+ on: k8s_kubelet.kubelet_operations_errors
+ class: Errors
+ type: Kubernetes
+component: Kubelet
+ lookup: sum -1m
+ units: errors
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (20))
+ delay: up 30s down 1m multiplier 1.5 max 2h
+ summary: Kubelet runtime errors
+ info: Number of Docker or runtime operation errors
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+
+# Pod Lifecycle Event Generator Relisting Latency
+
+# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is:
+# - 2x the first for quantile 0.5
+# - 4x the first for quantile 0.9
+# - 8x the first for quantile 0.99
+#
+# we assume the minimum latency is 1000 microseconds
+
+# quantile 0.5
+
+ template: kubelet_1m_pleg_relist_latency_quantile_05
+ on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+ class: Latency
+ type: Kubernetes
+component: Kubelet
+ lookup: average -1m unaligned of 0.5
+ units: microseconds
+ every: 10s
+ info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
+
+ template: kubelet_10s_pleg_relist_latency_quantile_05
+ on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+ class: Latency
+ type: Kubernetes
+component: Kubelet
+ lookup: average -10s unaligned of 0.5
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(100):(200))
+ crit: $this > (($status == $CRITICAL)?(200):(400))
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Kubelet relisting latency (quantile 0.5)
+ info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+ compared to the last minute (quantile 0.5)
+ to: sysadmin
+
+# quantile 0.9
+
+ template: kubelet_1m_pleg_relist_latency_quantile_09
+ on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+ class: Latency
+ type: Kubernetes
+component: Kubelet
+ lookup: average -1m unaligned of 0.9
+ units: microseconds
+ every: 10s
+ info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
+
+ template: kubelet_10s_pleg_relist_latency_quantile_09
+ on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+ class: Latency
+ type: Kubernetes
+component: Kubelet
+ lookup: average -10s unaligned of 0.9
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(200):(400))
+ crit: $this > (($status == $CRITICAL)?(400):(800))
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Kubelet relisting latency (quantile 0.9)
+ info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+ compared to the last minute (quantile 0.9)
+ to: sysadmin
+
+# quantile 0.99
+
+ template: kubelet_1m_pleg_relist_latency_quantile_099
+ on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+ class: Latency
+ type: Kubernetes
+component: Kubelet
+ lookup: average -1m unaligned of 0.99
+ units: microseconds
+ every: 10s
+ info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
+
+ template: kubelet_10s_pleg_relist_latency_quantile_099
+ on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+ class: Latency
+ type: Kubernetes
+component: Kubelet
+ lookup: average -10s unaligned of 0.99
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(400):(800))
+ crit: $this > (($status == $CRITICAL)?(800):(1200))
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Kubelet relisting latency (quantile 0.99)
+ info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+ compared to the last minute (quantile 0.99)
+ to: sysadmin
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
new file mode 100644
index 00000000..b0d35e75
--- /dev/null
+++ b/health/health.d/linux_power_supply.conf
@@ -0,0 +1,15 @@
+# Alert on low battery capacity.
+
+ template: linux_power_supply_capacity
+ on: powersupply.capacity
+ class: Utilization
+ type: Power Supply
+component: Battery
+ calc: $capacity
+ units: %
+ every: 10s
+ warn: $this < 10
+ delay: up 30s down 5m multiplier 1.2 max 1h
+ summary: Power supply capacity
+ info: Percentage of remaining power supply capacity
+ to: silent
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
new file mode 100644
index 00000000..fd8bf939
--- /dev/null
+++ b/health/health.d/load.conf
@@ -0,0 +1,72 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Calculate the base trigger point for the load average alarms.
+# This is the maximum number of CPUs in the system over the past 1
+# minute, with a special case for a single CPU of setting the trigger at 2.
+ alarm: load_cpu_number
+ on: system.load
+ class: Utilization
+ type: System
+component: Load
+ os: linux
+ hosts: *
+ calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) )
+ units: cpus
+ every: 1m
+ info: Number of active CPU cores in the system
+
+# Send alarms if the load average is unusually high.
+# These intentionally _do not_ calculate the average over the sampled
+# time period because the values being checked already are averages.
+
+ alarm: load_average_15
+ on: system.load
+ class: Utilization
+ type: System
+component: Load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load15
+ calc: ($load_cpu_number == nan) ? (nan) : ($this)
+ units: load
+ every: 1m
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System load average (15 minutes)
+ info: System load average for the past 15 minutes
+ to: silent
+
+ alarm: load_average_5
+ on: system.load
+ class: Utilization
+ type: System
+component: Load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load5
+ calc: ($load_cpu_number == nan) ? (nan) : ($this)
+ units: load
+ every: 1m
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System load average (5 minutes)
+ info: System load average for the past 5 minutes
+ to: silent
+
+ alarm: load_average_1
+ on: system.load
+ class: Utilization
+ type: System
+component: Load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load1
+ calc: ($load_cpu_number == nan) ? (nan) : ($this)
+ units: load
+ every: 1m
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System load average (1 minute)
+ info: System load average for the past 1 minute
+ to: silent
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
new file mode 100644
index 00000000..90f97d85
--- /dev/null
+++ b/health/health.d/mdstat.conf
@@ -0,0 +1,43 @@
+
+ template: mdstat_disks
+ on: md.disks
+ class: Errors
+ type: System
+component: RAID
+ units: failed devices
+ every: 10s
+ calc: $down
+ warn: $this > 0
+ summary: MD array device ${label:device} down
+ info: Number of devices in the down state for the ${label:device} ${label:raid_level} array. \
+ Any number > 0 indicates that the array is degraded.
+ to: sysadmin
+
+ template: mdstat_mismatch_cnt
+ on: md.mismatch_cnt
+ class: Errors
+ type: System
+component: RAID
+chart labels: raid_level=!raid1 !raid10 *
+ units: unsynchronized blocks
+ calc: $count
+ every: 60s
+ warn: $this > 1024
+ delay: up 30m
+ summary: MD array device ${label:device} unsynchronized blocks
+ info: Number of unsynchronized blocks for the ${label:device} ${label:raid_level} array
+ to: silent
+
+ template: mdstat_nonredundant_last_collected
+ on: md.nonredundant
+ class: Latency
+ type: System
+component: RAID
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ summary: MD array last collected
+ info: Number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
new file mode 100644
index 00000000..118997a5
--- /dev/null
+++ b/health/health.d/megacli.conf
@@ -0,0 +1,76 @@
+
+## Adapters (controllers)
+
+ template: megacli_adapter_state
+ on: megacli.adapter_degraded
+ class: Errors
+ type: System
+component: RAID
+ lookup: max -10s foreach *
+ units: boolean
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 2 max 10m
+ summary: MegaCLI adapter state
+ info: Adapter is in the degraded state (0: false, 1: true)
+ to: sysadmin
+
+## Physical Disks
+
+ template: megacli_pd_predictive_failures
+ on: megacli.pd_predictive_failure
+ class: Errors
+ type: System
+component: RAID
+ lookup: sum -10s foreach *
+ units: predictive failures
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ summary: MegaCLI physical drive predictive failures
+ info: Number of physical drive predictive failures
+ to: sysadmin
+
+ template: megacli_pd_media_errors
+ on: megacli.pd_media_error
+ class: Errors
+ type: System
+component: RAID
+ lookup: sum -10s foreach *
+ units: media errors
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ summary: MegaCLI physical drive errors
+ info: Number of physical drive media errors
+ to: sysadmin
+
+## Battery Backup Units (BBU)
+
+ template: megacli_bbu_relative_charge
+ on: megacli.bbu_relative_charge
+ class: Workload
+ type: System
+component: RAID
+ lookup: average -10s
+ units: percent
+ every: 10s
+ warn: $this <= (($status >= $WARNING) ? (85) : (80))
+ crit: $this <= (($status == $CRITICAL) ? (50) : (40))
+ summary: MegaCLI BBU charge state
+ info: Average battery backup unit (BBU) relative state of charge over the last 10 seconds
+ to: sysadmin
+
+ template: megacli_bbu_cycle_count
+ on: megacli.bbu_cycle_count
+ class: Workload
+ type: System
+component: RAID
+ lookup: average -10s
+ units: cycles
+ every: 10s
+ warn: $this >= 100
+ crit: $this >= 500
+ summary: MegaCLI BBU cycles count
+ info: Average battery backup unit (BBU) charge cycles count over the last 10 seconds
+ to: sysadmin
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
new file mode 100644
index 00000000..77ca0afa
--- /dev/null
+++ b/health/health.d/memcached.conf
@@ -0,0 +1,50 @@
+
+# detect if memcached cache is full
+
+ template: memcached_cache_memory_usage
+ on: memcached.cache
+ class: Utilization
+ type: KV Storage
+component: Memcached
+ calc: $used * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ summary: Memcached memory utilization
+ info: Cache memory utilization
+ to: dba
+
+
+# find the rate memcached cache is filling
+
+ template: memcached_cache_fill_rate
+ on: memcached.cache
+ class: Utilization
+ type: KV Storage
+component: Memcached
+ lookup: min -10m at -50m unaligned of available
+ calc: ($this - $available) / (($now - $after) / 3600)
+ units: KB/hour
+ every: 1m
+ info: Average rate the cache fills up (positive), or frees up (negative) space over the last hour
+
+
+# find the hours remaining until memcached cache is full
+
+ template: memcached_out_of_cache_space_time
+ on: memcached.cache
+ class: Utilization
+ type: KV Storage
+component: Memcached
+ calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Memcached estimation of lack of cache space
+ info: Estimated time the cache will run out of space \
+ if the system continues to add data at the same rate as the past hour
+ to: dba
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
new file mode 100644
index 00000000..5ab3d2d9
--- /dev/null
+++ b/health/health.d/memory.conf
@@ -0,0 +1,85 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 1hour_memory_hw_corrupted
+ on: mem.hwcorrupt
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ calc: $HardwareCorrupted
+ units: MB
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ summary: System corrupted memory
+ info: Amount of memory corrupted due to a hardware failure
+ to: sysadmin
+
+## ECC Controller
+
+ template: ecc_memory_mc_correctable
+ on: mem.edac_mc
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned of correctable, correctable_noinfo
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ summary: System ECC memory ${label:controller} correctable errors
+ info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes
+ to: sysadmin
+
+ template: ecc_memory_mc_uncorrectable
+ on: mem.edac_mc
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
+ units: errors
+ every: 1m
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ summary: System ECC memory ${label:controller} uncorrectable errors
+ info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
+ to: sysadmin
+
+## ECC DIMM
+
+ template: ecc_memory_dimm_correctable
+ on: mem.edac_mc_dimm
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned of correctable
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ summary: System ECC memory DIMM ${label:dimm} correctable errors
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+ to: sysadmin
+
+ template: ecc_memory_dimm_uncorrectable
+ on: mem.edac_mc_dimm
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned of uncorrectable
+ units: errors
+ every: 1m
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ summary: System ECC memory DIMM ${label:dimm} uncorrectable errors
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
new file mode 100644
index 00000000..aef9b036
--- /dev/null
+++ b/health/health.d/ml.conf
@@ -0,0 +1,56 @@
+# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly
+# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's
+# native anomaly detection here:
+# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal
+
+# some examples below are commented, you would need to uncomment and adjust as desired to enable them.
+
+# node level anomaly rate
+# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate
+# if node level anomaly rate is above 1% then warning (pick your own threshold that works best via trial and error).
+ template: ml_1min_node_ar
+ on: anomaly_detection.anomaly_rate
+ class: Workload
+ type: System
+component: ML
+ os: *
+ hosts: *
+ lookup: average -1m of anomaly_rate
+ calc: $this
+ units: %
+ every: 30s
+ warn: $this > 1
+ summary: ML node anomaly rate
+ info: Rolling 1min node level anomaly rate
+ to: silent
+
+# alert per dimension example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_dims
+# on: system.cpu
+# os: linux
+# hosts: *
+# lookup: average -5m anomaly-bit foreach *
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (5) : (20))
+# crit: $this > (($status == $CRITICAL) ? (20) : (100))
+# info: rolling 5min anomaly rate for each system.cpu dimension
+
+# alert per chart example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_chart
+# on: system.cpu
+# os: linux
+# hosts: *
+# lookup: average -5m anomaly-bit of *
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (5) : (20))
+# crit: $this > (($status == $CRITICAL) ? (20) : (100))
+# info: rolling 5min anomaly rate for system.cpu chart
+
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
new file mode 100644
index 00000000..572560b4
--- /dev/null
+++ b/health/health.d/mysql.conf
@@ -0,0 +1,187 @@
+
+# slow queries
+
+ template: mysql_10s_slow_queries
+ on: mysql.queries
+ class: Latency
+ type: Database
+component: MySQL
+ lookup: sum -10s of slow_queries
+ units: slow queries
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (10) : (20))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: MySQL slow queries
+ info: Number of slow queries in the last 10 seconds
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# lock waits
+
+ template: mysql_10s_table_locks_immediate
+ on: mysql.table_locks
+ class: Utilization
+ type: Database
+component: MySQL
+ lookup: sum -10s absolute of immediate
+ units: immediate locks
+ every: 10s
+ summary: MySQL table immediate locks
+ info: Number of table immediate locks in the last 10 seconds
+ to: dba
+
+ template: mysql_10s_table_locks_waited
+ on: mysql.table_locks
+ class: Latency
+ type: Database
+component: MySQL
+ lookup: sum -10s absolute of waited
+ units: waited locks
+ every: 10s
+ summary: MySQL table waited locks
+ info: Number of table waited locks in the last 10 seconds
+ to: dba
+
+ template: mysql_10s_waited_locks_ratio
+ on: mysql.table_locks
+ class: Latency
+ type: Database
+component: MySQL
+ calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (10) : (25))
+ crit: $this > (($status == $CRITICAL) ? (25) : (50))
+ delay: down 30m multiplier 1.5 max 1h
+ summary: MySQL waited table locks ratio
+ info: Ratio of waited table locks over the last 10 seconds
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# connections
+
+ template: mysql_connections
+ on: mysql.connections_active
+ class: Utilization
+ type: Database
+component: MySQL
+ calc: $active * 100 / $limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: MySQL connections utilization
+ info: Client connections utilization
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# replication
+
+ template: mysql_replication
+ on: mysql.slave_status
+ class: Errors
+ type: Database
+component: MySQL
+ calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
+ units: ok/failed
+ every: 10s
+ crit: $this == 0
+ delay: down 5m multiplier 1.5 max 1h
+ summary: MySQL replication status
+ info: Replication status (0: stopped, 1: working)
+ to: dba
+
+ template: mysql_replication_lag
+ on: mysql.slave_behind
+ class: Latency
+ type: Database
+component: MySQL
+ calc: $seconds
+ units: seconds
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (10) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: MySQL replication lag
+ info: Difference between the timestamp of the latest transaction processed by the SQL thread and \
+ the timestamp of the same transaction when it was processed on the master
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# galera cluster size
+
+ template: mysql_galera_cluster_size_max_2m
+ on: mysql.galera_cluster_size
+ class: Utilization
+ type: Database
+component: MySQL
+ lookup: max -2m at -1m unaligned
+ units: nodes
+ every: 10s
+ info: maximum galera cluster size in the last 2 minutes starting one minute ago
+ to: dba
+
+ template: mysql_galera_cluster_size
+ on: mysql.galera_cluster_size
+ class: Utilization
+ type: Database
+component: MySQL
+ calc: $nodes
+ units: nodes
+ every: 10s
+ warn: $this > $mysql_galera_cluster_size_max_2m
+ crit: $this < $mysql_galera_cluster_size_max_2m
+ delay: up 20s down 5m multiplier 1.5 max 1h
+ summary: MySQL galera cluster size
+ info: Current galera cluster size, compared to the maximum size in the last 2 minutes
+ to: dba
+
+# galera node state
+
+ template: mysql_galera_cluster_state_warn
+ on: mysql.galera_cluster_state
+ class: Errors
+ type: Database
+component: MySQL
+ calc: $donor + $joined
+ every: 10s
+ warn: $this != nan AND $this != 0
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ summary: MySQL galera node state
+ info: Galera node state is either Donor/Desynced or Joined.
+ to: dba
+
+ template: mysql_galera_cluster_state_crit
+ on: mysql.galera_cluster_state
+ class: Errors
+ type: Database
+component: MySQL
+ calc: $undefined + $joining + $error
+ every: 10s
+ crit: $this != nan AND $this != 0
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ summary: MySQL galera node state
+ info: Galera node state is either Undefined or Joining or Error.
+ to: dba
+
+# galera node status
+
+ template: mysql_galera_cluster_status
+ on: mysql.galera_cluster_status
+ class: Errors
+ type: Database
+component: MySQL
+ calc: $primary
+ every: 10s
+ crit: $this != nan AND $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ summary: MySQL galera cluster status
+ info: Galera node is part of a nonoperational component. \
+ This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations.
+ to: dba
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
new file mode 100644
index 00000000..2dfe6bba
--- /dev/null
+++ b/health/health.d/net.conf
@@ -0,0 +1,258 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: interface_speed
+ on: net.net
+ class: Latency
+ type: System
+component: Network
+ os: *
+ hosts: *
+ calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan )
+ units: Mbit
+ every: 10s
+ info: Network interface ${label:device} current speed
+
+ template: 1m_received_traffic_overflow
+ on: net.net
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -1m unaligned absolute of received
+ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: up 1m down 1m multiplier 1.5 max 1h
+ summary: System network interface ${label:device} inbound utilization
+ info: Average inbound utilization for the network interface ${label:device} over the last minute
+ to: silent
+
+ template: 1m_sent_traffic_overflow
+ on: net.net
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -1m unaligned absolute of sent
+ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: up 1m down 1m multiplier 1.5 max 1h
+ summary: System network interface ${label:device} outbound utilization
+ info: Average outbound utilization for the network interface ${label:device} over the last minute
+ to: silent
+
+# -----------------------------------------------------------------------------
+# dropped packets
+
+# check if an interface is dropping packets
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+#
+# it is possible to have expected packet drops on an interface for some network configurations
+# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information
+
+ template: net_interface_inbound_packets
+ on: net.packets
+ class: Workload
+ type: System
+component: Network
+ os: *
+ hosts: *
+ lookup: sum -10m unaligned absolute of received
+ units: packets
+ every: 1m
+ summary: Network interface ${label:device} received packets
+ info: Received packets for the network interface ${label:device} in the last 10 minutes
+
+ template: net_interface_outbound_packets
+ on: net.packets
+ class: Workload
+ type: System
+component: Network
+ os: *
+ hosts: *
+ lookup: sum -10m unaligned absolute of sent
+ units: packets
+ every: 1m
+ summary: Network interface ${label:device} sent packets
+ info: Sent packets for the network interface ${label:device} in the last 10 minutes
+
+ template: inbound_packets_dropped_ratio
+ on: net.drops
+ class: Errors
+ type: System
+component: Network
+ os: *
+ hosts: *
+chart labels: device=!wl* *
+ lookup: sum -10m unaligned absolute of inbound
+ calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} inbound drops
+ info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
+ to: silent
+
+ template: outbound_packets_dropped_ratio
+ on: net.drops
+ class: Errors
+ type: System
+component: Network
+ os: *
+ hosts: *
+chart labels: device=!wl* *
+ lookup: sum -10m unaligned absolute of outbound
+ calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} outbound drops
+ info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
+ to: silent
+
+ template: wifi_inbound_packets_dropped_ratio
+ on: net.drops
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+chart labels: device=wl*
+ lookup: sum -10m unaligned absolute of inbound
+ calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 10
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} inbound drops ratio
+ info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
+ to: silent
+
+ template: wifi_outbound_packets_dropped_ratio
+ on: net.drops
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+chart labels: device=wl*
+ lookup: sum -10m unaligned absolute of outbound
+ calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 10
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} outbound drops ratio
+ info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
+ to: silent
+
+# -----------------------------------------------------------------------------
+# interface errors
+
+ template: interface_inbound_errors
+ on: net.errors
+ class: Errors
+ type: System
+component: Network
+ os: freebsd
+ hosts: *
+ lookup: sum -10m unaligned absolute of inbound
+ units: errors
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} inbound errors
+ info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes
+ to: silent
+
+ template: interface_outbound_errors
+ on: net.errors
+ class: Errors
+ type: System
+component: Network
+ os: freebsd
+ hosts: *
+ lookup: sum -10m unaligned absolute of outbound
+ units: errors
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} outbound errors
+ info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes
+ to: silent
+
+# -----------------------------------------------------------------------------
+# FIFO errors
+
+# check if an interface is having FIFO
+# buffer errors
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+
+ template: 10min_fifo_errors
+ on: net.fifo
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned absolute
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System network interface ${label:device} FIFO errors
+ info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes
+ to: silent
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is more than 50x (warning) or 60x (critical) the first
+# the 1m rate is floored at 1000 packets/s in the calculation, so a packet storm
+# needs an average of more than 50000 packets/s over the last 10 seconds
+
+ template: 1m_received_packets_rate
+ on: net.packets
+ class: Workload
+ type: System
+component: Network
+ os: linux freebsd
+ hosts: *
+ lookup: average -1m unaligned of received
+ units: packets
+ every: 10s
+ info: Average number of packets received by the network interface ${label:device} over the last minute
+
+ template: 10s_received_packets_storm
+ on: net.packets
+ class: Workload
+ type: System
+component: Network
+ os: linux freebsd
+ hosts: *
+ lookup: average -10s unaligned of received
+ calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status == $CRITICAL)?(5000):(6000))
+ options: no-clear-notification
+ summary: System network interface ${label:device} inbound packet storm
+ info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+ compared to the rate over the last minute
+ to: silent
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
new file mode 100644
index 00000000..417105d4
--- /dev/null
+++ b/health/health.d/netfilter.conf
@@ -0,0 +1,20 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: netfilter_conntrack_full
+ on: netfilter.conntrack_sockets
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: max -10s unaligned of connections
+ calc: $this * 100 / $netfilter_conntrack_max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: System Netfilter connection tracker utilization
+ info: Netfilter connection tracker table size utilization
+ to: sysadmin
diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf
new file mode 100644
index 00000000..aea402e8
--- /dev/null
+++ b/health/health.d/nvme.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: nvme_device_critical_warnings_state
+ on: nvme.device_critical_warnings_state
+ class: Errors
+ type: System
+component: Disk
+ lookup: max -30s unaligned
+ units: state
+ every: 10s
+ crit: $this != nan AND $this != 0
+ delay: down 5m multiplier 1.5 max 2h
+ summary: NVMe device ${label:device} state
+ info: NVMe device ${label:device} has critical warnings
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
new file mode 100644
index 00000000..c4db835c
--- /dev/null
+++ b/health/health.d/pihole.conf
@@ -0,0 +1,33 @@
+
+# Blocklist last update time.
+# Default update interval is a week.
+
+ template: pihole_blocklist_last_update
+ on: pihole.blocklist_last_update
+ class: Errors
+ type: Ad Filtering
+component: Pi-hole
+ every: 10s
+ units: seconds
+ calc: $ago
+ warn: $this > 60 * 60 * 24 * 30
+ summary: Pi-hole blocklist last update
+ info: gravity.list (blocklist) file last update time
+ to: sysadmin
+
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
+
+ template: pihole_status
+ on: pihole.unwanted_domains_blocking_status
+ class: Errors
+ type: Ad Filtering
+component: Pi-hole
+ every: 10s
+ units: status
+ calc: $disabled
+ warn: $this != nan AND $this == 1
+ delay: up 2m down 5m
+ summary: Pi-hole domains blocking status
+ info: Unwanted domains blocking is disabled
+ to: sysadmin
diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf
new file mode 100644
index 00000000..0e434420
--- /dev/null
+++ b/health/health.d/ping.conf
@@ -0,0 +1,50 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: ping_host_reachable
+ on: ping.host_packet_loss
+ class: Errors
+ type: Other
+component: Network
+ lookup: average -30s unaligned of loss
+ calc: $this != nan AND $this < 100
+ units: up/down
+ every: 10s
+ crit: $this == 0
+ delay: down 30m multiplier 1.5 max 2h
+ summary: Host ${label:host} ping status
+ info: Network host ${label:host} reachability status
+ to: sysadmin
+
+ template: ping_packet_loss
+ on: ping.host_packet_loss
+ class: Errors
+ type: Other
+component: Network
+ lookup: average -10m unaligned of loss
+ green: 5
+ red: 10
+ units: %
+ every: 10s
+ warn: $this > $green
+ crit: $this > $red
+ delay: down 30m multiplier 1.5 max 2h
+ summary: Host ${label:host} ping packet loss
+ info: Packet loss percentage to the network host ${label:host} over the last 10 minutes
+ to: sysadmin
+
+ template: ping_host_latency
+ on: ping.host_rtt
+ class: Latency
+ type: Other
+component: Network
+ lookup: average -10s unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: $this > $green OR $max > $red
+ crit: $this > $red
+ delay: down 30m multiplier 1.5 max 2h
+ summary: Host ${label:host} ping latency
+ info: Average latency to the network host ${label:host} over the last 10 seconds
+ to: sysadmin
diff --git a/health/health.d/plugin.conf b/health/health.d/plugin.conf
new file mode 100644
index 00000000..8615a021
--- /dev/null
+++ b/health/health.d/plugin.conf
@@ -0,0 +1,12 @@
+ template: plugin_availability_status
+ on: netdata.plugin_availability_status
+ class: Errors
+ type: Netdata
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : (20 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Plugin ${label:_collect_plugin} availability status
+ info: the amount of time that ${label:_collect_plugin} did not report its availability status
+ to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
new file mode 100644
index 00000000..281731c8
--- /dev/null
+++ b/health/health.d/portcheck.conf
@@ -0,0 +1,44 @@
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+ template: portcheck_service_reachable
+ on: portcheck.status
+ class: Workload
+ type: Other
+component: TCP endpoint
+ lookup: average -1m unaligned percentage of success
+ calc: ($this < 75) ? (0) : ($this)
+ every: 5s
+ units: up/down
+ summary: Portcheck status for ${label:host}:${label:port}
+ info: TCP host ${label:host} port ${label:port} liveness status
+ to: silent
+
+ template: portcheck_connection_timeouts
+ on: portcheck.status
+ class: Errors
+ type: Other
+component: TCP endpoint
+ lookup: average -5m unaligned percentage of timeout
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Portcheck timeouts for ${label:host}:${label:port}
+ info: Percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
+ to: sysadmin
+
+ template: portcheck_connection_fails
+ on: portcheck.status
+ class: Errors
+ type: Other
+component: TCP endpoint
+ lookup: average -5m unaligned percentage of no_connection,failed
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Portcheck fails for ${label:host}:${label:port}
+ info: Percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
+ to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 00000000..de4c0078
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,228 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: postgres_total_connection_utilization
+ on: postgres.connections_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL connection utilization
+ info: Average total connection utilization over the last minute
+ to: dba
+
+ template: postgres_acquired_locks_utilization
+ on: postgres.locks_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL acquired locks utilization
+ info: Average acquired locks utilization over the last minute
+ to: dba
+
+ template: postgres_txid_exhaustion_perc
+ on: postgres.txid_exhaustion_perc
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $txid_exhaustion
+ units: %
+ every: 1m
+ warn: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL TXID exhaustion
+ info: Percent towards TXID wraparound
+ to: dba
+
+# Database alarms
+
+ template: postgres_db_cache_io_ratio
+ on: postgres.db_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL DB ${label:database} cache hit ratio
+ info: Average cache hit ratio in db ${label:database} over the last minute
+ to: dba
+
+ template: postgres_db_transactions_rollback_ratio
+ on: postgres.db_transactions_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -5m unaligned of rollback
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL DB ${label:database} aborted transactions
+ info: Average aborted transactions percentage in db ${label:database} over the last five minutes
+ to: dba
+
+ template: postgres_db_deadlocks_rate
+ on: postgres.db_deadlocks_rate
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: sum -1m unaligned of deadlocks
+ units: deadlocks
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL DB ${label:database} deadlocks rate
+ info: Number of deadlocks detected in db ${label:database} in the last minute
+ to: dba
+
+# Table alarms
+
+ template: postgres_table_cache_io_ratio
+ on: postgres.table_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio
+ info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute
+ to: dba
+
+ template: postgres_table_index_cache_io_ratio
+ on: postgres.table_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio
+ info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute
+ to: dba
+
+ template: postgres_table_toast_cache_io_ratio
+ on: postgres.table_toast_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio
+ info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
+ to: dba
+
+ template: postgres_table_toast_index_cache_io_ratio
+ on: postgres.table_toast_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio
+ info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
+ to: dba
+
+ template: postgres_table_bloat_size_perc
+ on: postgres.table_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL table ${label:table} db ${label:database} bloat size
+ info: Bloat size percentage in db ${label:database} table ${label:table}
+ to: dba
+
+ template: postgres_table_last_autovacuum_time
+ on: postgres.table_autovacuum_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: !*
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum
+ info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon
+ to: dba
+
+ template: postgres_table_last_autoanalyze_time
+ on: postgres.table_autoanalyze_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: !*
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze
+ info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon
+ to: dba
+
+# Index alarms
+
+ template: postgres_index_bloat_size_perc
+ on: postgres.index_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: PostgreSQL table ${label:table} db ${label:database} index bloat size
+ info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index}
+ to: dba
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
new file mode 100644
index 00000000..8f2e0fda
--- /dev/null
+++ b/health/health.d/processes.conf
@@ -0,0 +1,17 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: active_processes
+ on: system.active_processes
+ class: Workload
+ type: System
+component: Processes
+ hosts: *
+ calc: $active * 100 / $pidmax
+ units: %
+ every: 5s
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: System PIDs utilization
+ info: System process IDs (PID) space utilization
+ to: sysadmin
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
new file mode 100644
index 00000000..da27ad5b
--- /dev/null
+++ b/health/health.d/python.d.plugin.conf
@@ -0,0 +1,18 @@
+
+# make sure python.d.plugin data collection job is running
+
+ template: python.d_job_last_collected_secs
+ on: netdata.pythond_runtime
+ class: Errors
+ type: Netdata
+component: python.d.plugin
+ module: !* *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: Python.d plugin last collection
+ info: Number of seconds since the last successful data collection
+ to: webmaster
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
new file mode 100644
index 00000000..970ea636
--- /dev/null
+++ b/health/health.d/qos.conf
@@ -0,0 +1,18 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check if a QoS class is dropping packets
+# the alarm is checked every 30 seconds
+# and examines the last 5 minutes of data
+
+template: 10min_qos_packet_drops
+ on: tc.qos_dropped
+ os: linux
+ hosts: *
+ lookup: sum -5m unaligned absolute
+ every: 30s
+ warn: $this > 0
+ units: packets
+ summary: QOS packet drops
+ info: Dropped packets in the last 5 minutes
+ to: silent
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
new file mode 100644
index 00000000..51f307ca
--- /dev/null
+++ b/health/health.d/ram.conf
@@ -0,0 +1,82 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: ram_in_use
+ on: system.ram
+ class: Utilization
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ calc: $used * 100 / ($used + $cached + $free + $buffers)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System memory utilization
+ info: System memory utilization
+ to: sysadmin
+
+ alarm: ram_available
+ on: mem.available
+ class: Utilization
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System available memory
+ info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping
+ to: silent
+
+ alarm: oom_kill
+ on: mem.oom_kill
+ os: linux
+ hosts: *
+ lookup: sum -30m unaligned
+ units: kills
+ every: 5m
+ warn: $this > 0
+ delay: down 10m
+ summary: System OOM kills
+ info: Number of out of memory kills in the last 30 minutes
+ to: silent
+
+## FreeBSD
+ alarm: ram_in_use
+ on: system.ram
+ class: Utilization
+ type: System
+component: Memory
+ os: freebsd
+ hosts: *
+ calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System memory utilization
+ info: System memory utilization
+ to: sysadmin
+
+ alarm: ram_available
+ on: mem.available
+ class: Utilization
+ type: System
+component: Memory
+ os: freebsd
+ hosts: *
+ calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System available memory
+ info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping
+ to: silent
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
new file mode 100644
index 00000000..7c2945e6
--- /dev/null
+++ b/health/health.d/redis.conf
@@ -0,0 +1,57 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: redis_connections_rejected
+ on: redis.connections
+ class: Errors
+ type: KV Storage
+component: Redis
+ lookup: sum -1m unaligned of rejected
+ every: 10s
+ units: connections
+ warn: $this > 0
+ summary: Redis rejected connections
+ info: Connections rejected because of maxclients limit in the last minute
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+ template: redis_bgsave_broken
+ on: redis.bgsave_health
+ class: Errors
+ type: KV Storage
+component: Redis
+ every: 10s
+ crit: $last_bgsave != nan AND $last_bgsave != 0
+ units: ok/failed
+ summary: Redis background save
+ info: Status of the last RDB save operation (0: ok, 1: error)
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+ template: redis_bgsave_slow
+ on: redis.bgsave_now
+ class: Latency
+ type: KV Storage
+component: Redis
+ every: 10s
+ calc: $current_bgsave_time
+ warn: $this > 600
+ crit: $this > 1200
+ units: seconds
+ summary: Redis slow background save
+ info: Duration of the on-going RDB save operation
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+ template: redis_master_link_down
+ on: redis.master_link_down_since_time
+ class: Errors
+ type: KV Storage
+component: Redis
+ every: 10s
+ calc: $time
+ units: seconds
+ crit: $this != nan AND $this > 0
+ summary: Redis master link down
+ info: Time elapsed since the link between master and slave is down
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
new file mode 100644
index 00000000..c665430f
--- /dev/null
+++ b/health/health.d/retroshare.conf
@@ -0,0 +1,17 @@
+
+# make sure the DHT is fine when active
+
+ template: retroshare_dht_working
+ on: retroshare.dht
+ class: Utilization
+ type: Data Sharing
+component: Retroshare
+ calc: $dht_size_all
+ units: peers
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (120) : (100))
+ crit: $this < (($status == $CRITICAL) ? (10) : (1))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ summary: Retroshare DHT peers
+ info: Number of DHT peers
+ to: sysadmin
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
new file mode 100644
index 00000000..677e3cb4
--- /dev/null
+++ b/health/health.d/riakkv.conf
@@ -0,0 +1,98 @@
+
+# Warn if a list keys operation is running.
+ template: riakkv_list_keys_active
+ on: riak.core.fsm_active
+ class: Utilization
+ type: Database
+component: Riak KV
+ calc: $list_fsm_active
+ units: state machines
+ every: 10s
+ warn: $list_fsm_active > 0
+ summary: Riak KV active list keys
+ info: Number of currently running list keys finite state machines
+ to: dba
+
+
+## Timing healthchecks
+# KV GET
+ template: riakkv_1h_kv_get_mean_latency
+ on: riak.kv.latency.get
+ class: Latency
+ type: Database
+component: Riak KV
+ calc: $node_get_fsm_time_mean
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ info: average time between reception of client GET request and \
+ subsequent response to client over the last hour
+
+ template: riakkv_kv_get_slow
+ on: riak.kv.latency.get
+ class: Latency
+ type: Database
+component: Riak KV
+ calc: $mean
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+ crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
+ summary: Riak KV GET latency
+ info: Average time between reception of client GET request and \
+ subsequent response to the client over the last 3 minutes, \
+ compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+# KV PUT
+ template: riakkv_1h_kv_put_mean_latency
+ on: riak.kv.latency.put
+ class: Latency
+ type: Database
+component: Riak KV
+ calc: $node_put_fsm_time_mean
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ summary: Riak KV PUT mean latency
+ info: Average time between reception of client PUT request and \
+ subsequent response to the client over the last hour
+
+ template: riakkv_kv_put_slow
+ on: riak.kv.latency.put
+ class: Latency
+ type: Database
+component: Riak KV
+ calc: $mean
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+ crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
+ summary: Riak KV PUT latency
+ info: Average time between reception of client PUT request and \
+ subsequent response to the client over the last 3 minutes, \
+ compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+
+## VM healthchecks
+
+# Default Erlang VM process limit: 262144
+# On systems observed, this is < 2000, but may grow depending on load.
+ template: riakkv_vm_high_process_count
+ on: riak.vm
+ class: Utilization
+ type: Database
+component: Riak KV
+ calc: $sys_process_count
+ units: processes
+ every: 10s
+ warn: $this > 10000
+ crit: $this > 100000
+ summary: Riak KV number of processes
+ info: Number of processes running in the Erlang VM
+ to: dba
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
new file mode 100644
index 00000000..b089cb85
--- /dev/null
+++ b/health/health.d/scaleio.conf
@@ -0,0 +1,33 @@
+
+# make sure Storage Pool capacity utilization is under limit
+
+ template: scaleio_storage_pool_capacity_utilization
+ on: scaleio.storage_pool_capacity_utilization
+ class: Utilization
+ type: Storage
+component: ScaleIO
+ calc: $used
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: ScaleIO storage pool capacity utilization
+ info: Storage pool capacity utilization
+ to: sysadmin
+
+
+# make sure Sdc is connected to MDM
+
+ template: scaleio_sdc_mdm_connection_state
+ on: scaleio.sdc_mdm_connection_state
+ class: Utilization
+ type: Storage
+component: ScaleIO
+ calc: $connected
+ every: 10s
+ warn: $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ summary: ScaleIO SDC-MDM connection state
+ info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
+ to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
new file mode 100644
index 00000000..8d7ba566
--- /dev/null
+++ b/health/health.d/softnet.conf
@@ -0,0 +1,57 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check for common /proc/net/softnet_stat errors
+
+ alarm: 1min_netdev_backlog_exceeded
+ on: system.softnet_stat
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -1m unaligned absolute of dropped
+ units: packets
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System netdev dropped packets
+ info: Average number of dropped packets in the last minute \
+ due to exceeded net.core.netdev_max_backlog
+ to: silent
+
+ alarm: 1min_netdev_budget_ran_outs
+ on: system.softnet_stat
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -1m unaligned absolute of squeezed
+ units: events
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System netdev budget run outs
+ info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
+ net.core.netdev_budget_usecs with work remaining over the last minute \
+ (this can be a cause for dropped packets)
+ to: silent
+
+ alarm: 10min_netisr_backlog_exceeded
+ on: system.softnet_stat
+ class: Errors
+ type: System
+component: Network
+ os: freebsd
+ hosts: *
+ lookup: average -1m unaligned absolute of qdrops
+ units: packets
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 1h multiplier 1.5 max 2h
+ summary: System netisr drops
+ info: Average number of drops in the last minute \
+ due to exceeded sysctl net.route.netisr_maxqlen \
+ (this can be a cause for dropped packets)
+ to: silent
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
new file mode 100644
index 00000000..e3973399
--- /dev/null
+++ b/health/health.d/swap.conf
@@ -0,0 +1,37 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 30min_ram_swapped_out
+ on: mem.swapio
+ class: Workload
+ type: System
+component: Memory
+ os: linux freebsd
+ hosts: *
+ lookup: sum -30m unaligned absolute of out
+ # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
+ calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
+ units: % of RAM
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: System memory swapped out
+ info: Percentage of the system RAM swapped in the last 30 minutes
+ to: silent
+
+ alarm: used_swap
+ on: mem.swap
+ class: Utilization
+ type: System
+component: Memory
+ os: linux freebsd
+ hosts: *
+ calc: (($used + $free) > 0) ? ($used * 100 / ($used + $free)) : 0
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: up 30s down 15m multiplier 1.5 max 1h
+ summary: System swap memory utilization
+ info: Swap memory utilization
+ to: sysadmin
diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf
new file mode 100644
index 00000000..6c947d90
--- /dev/null
+++ b/health/health.d/synchronization.conf
@@ -0,0 +1,13 @@
+ alarm: sync_freq
+ on: mem.sync
+ lookup: sum -1m of sync
+ units: calls
+ plugin: ebpf.plugin
+ every: 1m
+ warn: $this > 6
+ delay: up 1m down 10m multiplier 1.5 max 1h
+ summary: Sync system call frequency
+ info: Number of sync() system calls. \
+ Every call causes all pending modifications to filesystem metadata and \
+ cached file data to be written to the underlying filesystems.
+ to: silent
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
new file mode 100644
index 00000000..ad53a0e1
--- /dev/null
+++ b/health/health.d/systemdunits.conf
@@ -0,0 +1,161 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+## Service units
+ template: systemd_service_unit_failed_state
+ on: systemd.service_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd service unit in the failed state
+ to: sysadmin
+
+## Socket units
+ template: systemd_socket_unit_failed_state
+ on: systemd.socket_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd socket unit in the failed state
+ to: sysadmin
+
+## Target units
+ template: systemd_target_unit_failed_state
+ on: systemd.target_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd target unit in the failed state
+ to: sysadmin
+
+## Path units
+ template: systemd_path_unit_failed_state
+ on: systemd.path_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd path unit in the failed state
+ to: sysadmin
+
+## Device units
+ template: systemd_device_unit_failed_state
+ on: systemd.device_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd device unit in the failed state
+ to: sysadmin
+
+## Mount units
+ template: systemd_mount_unit_failed_state
+ on: systemd.mount_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd mount units in the failed state
+ to: sysadmin
+
+## Automount units
+ template: systemd_automount_unit_failed_state
+ on: systemd.automount_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd automount unit in the failed state
+ to: sysadmin
+
+## Swap units
+ template: systemd_swap_unit_failed_state
+ on: systemd.swap_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd swap units in the failed state
+ to: sysadmin
+
+## Scope units
+ template: systemd_scope_unit_failed_state
+ on: systemd.scope_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd scope units in the failed state
+ to: sysadmin
+
+## Slice units
+ template: systemd_slice_unit_failed_state
+ on: systemd.slice_unit_state
+ class: Errors
+ type: Linux
+component: Systemd units
+ module: !* *
+ calc: $failed
+ units: state
+ every: 10s
+ warn: $this != nan AND $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ summary: systemd unit ${label:unit_name} state
+ info: systemd slice units in the failed state
+ to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
new file mode 100644
index 00000000..2b2f9740
--- /dev/null
+++ b/health/health.d/tcp_conn.conf
@@ -0,0 +1,23 @@
+
+#
+# ${tcp_max_connections} may be nan or -1 if the system
+# supports dynamic threshold for TCP connections.
+# In this case, the alarm will always be zero.
+#
+
+ alarm: tcp_connections
+ on: ip.tcpsock
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+ crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ summary: System TCP connections utilization
+ info: IPv4 TCP connections utilization
+ to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
new file mode 100644
index 00000000..9d1104a5
--- /dev/null
+++ b/health/health.d/tcp_listen.conf
@@ -0,0 +1,100 @@
+#
+# There are two queues involved when incoming TCP connections are handled
+# (both at the kernel):
+#
+# SYN queue
+# The SYN queue tracks TCP handshakes until connections are fully established.
+# It overflows when too many incoming TCP connection requests hang in the
+# half-open state and the server is not configured to fall back to SYN cookies.
+# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
+# lots of SYN packets and never completes the handshakes).
+#
+# Accept queue
+# The accept queue holds fully established TCP connections waiting to be handled
+# by the listening application. It overflows when the server application fails
+# to accept new connections at the rate they are coming in.
+#
+#
+# -----------------------------------------------------------------------------
+# tcp accept queue (at the kernel)
+
+ alarm: 1m_tcp_accept_queue_overflows
+ on: ip.tcp_accept_queue
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -60s unaligned absolute of ListenOverflows
+ units: overflows
+ every: 10s
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ summary: System TCP accept queue overflows
+ info: Average number of overflows in the TCP accept queue over the last minute
+ to: silent
+
+# THIS IS TOO GENERIC
+# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
+ alarm: 1m_tcp_accept_queue_drops
+ on: ip.tcp_accept_queue
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -60s unaligned absolute of ListenDrops
+ units: drops
+ every: 10s
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ summary: System TCP accept queue dropped packets
+ info: Average number of dropped packets in the TCP accept queue over the last minute
+ to: silent
+
+
+# -----------------------------------------------------------------------------
+# tcp SYN queue (at the kernel)
+
+# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
+# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
+# enabled or not. In both cases this probably indicates a SYN flood attack,
+# so I guess a notification should be sent.
+
+ alarm: 1m_tcp_syn_queue_drops
+ on: ip.tcp_syn_queue
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -60s unaligned absolute of TCPReqQFullDrop
+ units: drops
+ every: 10s
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (5))
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ summary: System TCP SYN queue drops
+ info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
+ (SYN cookies were not enabled)
+ to: silent
+
+ alarm: 1m_tcp_syn_queue_cookies
+ on: ip.tcp_syn_queue
+ class: Workload
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
+ units: cookies
+ every: 10s
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (5))
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ summary: System TCP SYN queue cookies
+ info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute
+ to: silent
+
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
new file mode 100644
index 00000000..4e422ec1
--- /dev/null
+++ b/health/health.d/tcp_mem.conf
@@ -0,0 +1,24 @@
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# We give a warning when TCP is under memory pressure
+# and a critical when TCP is 90% of its upper memory limit
+#
+
+ alarm: tcp_memory
+ on: ipv4.sockstat_tcp_mem
+ class: Utilization
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ calc: ${mem} * 100 / ${tcp_mem_high}
+ units: %
+ every: 10s
+ warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
+ crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ summary: System TCP memory utilization
+ info: TCP memory utilization
+ to: silent
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
new file mode 100644
index 00000000..8f665d50
--- /dev/null
+++ b/health/health.d/tcp_orphans.conf
@@ -0,0 +1,25 @@
+
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# The kernel may penalize orphans by 2x or even 4x
+# so we alarm warning at 25% and critical at 50%
+#
+
+ alarm: tcp_orphans
+ on: ipv4.sockstat_tcp_sockets
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ calc: ${orphan} * 100 / ${tcp_max_orphans}
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+ crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ summary: System TCP orphan sockets utilization
+ info: Orphan IPv4 TCP sockets utilization
+ to: silent
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
new file mode 100644
index 00000000..7c39db2d
--- /dev/null
+++ b/health/health.d/tcp_resets.conf
@@ -0,0 +1,71 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# tcp resets this host sends
+
+ alarm: 1m_ip_tcp_resets_sent
+ on: ip.tcphandshake
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -1m at -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ info: average number of sent TCP RESETS over the last minute
+
+ alarm: 10s_ip_tcp_resets_sent
+ on: ip.tcphandshake
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
+ delay: up 20s down 60m multiplier 1.2 max 2h
+ options: no-clear-notification
+ summary: System TCP outbound resets
+ info: Average number of sent TCP RESETS over the last 10 seconds. \
+ This can indicate a port scan, \
+ or that a service running on this host has crashed. \
+ Netdata will not send a clear notification for this alarm.
+ to: silent
+
+# -----------------------------------------------------------------------------
+# tcp resets this host receives
+
+ alarm: 1m_ip_tcp_resets_received
+ on: ip.tcphandshake
+ class: Errors
+ type: System
+component: Network
+ os: linux freebsd
+ hosts: *
+ lookup: average -1m at -10s unaligned absolute of AttemptFails
+ units: tcp resets/s
+ every: 10s
+ info: average number of received TCP RESETS over the last minute
+
+ alarm: 10s_ip_tcp_resets_received
+ on: ip.tcphandshake
+ class: Errors
+ type: System
+component: Network
+ os: linux freebsd
+ hosts: *
+ lookup: average -10s unaligned absolute of AttemptFails
+ units: tcp resets/s
+ every: 10s
+ warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
+ delay: up 20s down 60m multiplier 1.2 max 2h
+ options: no-clear-notification
+ summary: System TCP inbound resets
+ info: average number of received TCP RESETS over the last 10 seconds. \
+ This can be an indication that a service this host needs has crashed. \
+ Netdata will not send a clear notification for this alarm.
+ to: silent
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
new file mode 100644
index 00000000..65c9628b
--- /dev/null
+++ b/health/health.d/timex.conf
@@ -0,0 +1,18 @@
+
+# It can take several minutes before ntpd selects a server to synchronize with;
+# try checking after 17 minutes (1024 seconds).
+
+ alarm: system_clock_sync_state
+ on: system.clock_sync_state
+ os: linux
+ class: Errors
+ type: System
+component: Clock
+ calc: $state
+ units: synchronization state
+ every: 10s
+ warn: $system.uptime.uptime > 17 * 60 AND $this == 0
+ delay: down 5m
+ summary: System clock sync state
+ info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server
+ to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
new file mode 100644
index 00000000..dc094840
--- /dev/null
+++ b/health/health.d/udp_errors.conf
@@ -0,0 +1,40 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# UDP receive buffer errors
+
+ alarm: 1m_ipv4_udp_receive_buffer_errors
+ on: ipv4.udperrors
+ class: Errors
+ type: System
+component: Network
+ os: linux freebsd
+ hosts: *
+ lookup: average -1m unaligned absolute of RcvbufErrors
+ units: errors
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ summary: System UDP receive buffer errors
+ info: Average number of UDP receive buffer errors over the last minute
+ delay: up 1m down 60m multiplier 1.2 max 2h
+ to: silent
+
+# -----------------------------------------------------------------------------
+# UDP send buffer errors
+
+ alarm: 1m_ipv4_udp_send_buffer_errors
+ on: ipv4.udperrors
+ class: Errors
+ type: System
+component: Network
+ os: linux
+ hosts: *
+ lookup: average -1m unaligned absolute of SndbufErrors
+ units: errors
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ summary: System UDP send buffer errors
+ info: Average number of UDP send buffer errors over the last minute
+ delay: up 1m down 60m multiplier 1.2 max 2h
+ to: silent
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
new file mode 100644
index 00000000..3c898f1d
--- /dev/null
+++ b/health/health.d/unbound.conf
@@ -0,0 +1,30 @@
+
+# make sure there are no overwritten/dropped queries in the request-list
+
+ template: unbound_request_list_overwritten
+ on: unbound.request_list_jostle_list
+ class: Errors
+ type: DNS
+component: Unbound
+ lookup: average -60s unaligned absolute match-names of overwritten
+ units: queries
+ every: 10s
+ warn: $this > 5
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ summary: Unbound overwritten queries
+ info: Number of overwritten queries in the request-list
+ to: sysadmin
+
+ template: unbound_request_list_dropped
+ on: unbound.request_list_jostle_list
+ class: Errors
+ type: DNS
+component: Unbound
+ lookup: average -60s unaligned absolute match-names of dropped
+ units: queries
+ every: 10s
+ warn: $this > 0
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ summary: Unbound dropped queries
+ info: Number of dropped queries in the request-list
+ to: sysadmin
diff --git a/health/health.d/upsd.conf b/health/health.d/upsd.conf
new file mode 100644
index 00000000..703a6488
--- /dev/null
+++ b/health/health.d/upsd.conf
@@ -0,0 +1,50 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: upsd_10min_ups_load
+ on: upsd.ups_load
+ class: Utilization
+ type: Power Supply
+component: UPS
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of load
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 10m multiplier 1.5 max 1h
+ summary: UPS ${label:ups_name} load
+ info: UPS ${label:ups_name} average load over the last 10 minutes
+ to: sitemgr
+
+ template: upsd_ups_battery_charge
+ on: upsd.ups_battery_charge
+ class: Errors
+ type: Power Supply
+component: UPS
+ os: *
+ hosts: *
+ lookup: average -60s unaligned of charge
+ units: %
+ every: 60s
+ warn: $this < 75
+ crit: $this < 40
+ delay: down 10m multiplier 1.5 max 1h
+ summary: UPS ${label:ups_name} battery charge
+ info: UPS ${label:ups_name} average battery charge over the last minute
+ to: sitemgr
+
+ template: upsd_ups_last_collected_secs
+ on: upsd.ups_load
+ class: Latency
+ type: Power Supply
+component: UPS device
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ summary: UPS ${label:ups_name} last collected
+ info: UPS ${label:ups_name} number of seconds since the last successful data collection
+ to: sitemgr
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
new file mode 100644
index 00000000..3e20bfd1
--- /dev/null
+++ b/health/health.d/vcsa.conf
@@ -0,0 +1,230 @@
+
+# Overall system health:
+# - 0: all components are healthy.
+# - 1: one or more components might become overloaded soon.
+# - 2: one or more components in the appliance might be degraded.
+# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon.
+# - 4: no health data is available.
+
+ template: vcsa_system_health_warn
+ on: vcsa.system_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA system status
+ info: VCSA overall system status is orange. One or more components are degraded.
+ to: sysadmin
+
+ template: vcsa_system_health_crit
+ on: vcsa.system_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ crit: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA system status
+ info: VCSA overall system status is red. One or more components are unavailable or will stop functioning soon.
+ to: sysadmin
+
+# Components health:
+# - 0: healthy.
+# - 1: healthy, but may have some problems.
+# - 2: degraded, and may have serious problems.
+# - 3: unavailable, or will stop functioning soon.
+# - 4: no health data is available.
+
+ template: vcsa_applmgmt_health_warn
+ on: vcsa.applmgmt_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA ApplMgmt service status
+ info: VCSA ApplMgmt component status is orange. It is degraded, and may have serious problems.
+ to: silent
+
+ template: vcsa_applmgmt_health_crit
+ on: vcsa.applmgmt_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA ApplMgmt service status
+ info: VCSA ApplMgmt component status is red. It is unavailable, or will stop functioning soon.
+ to: sysadmin
+
+ template: vcsa_load_health_warn
+ on: vcsa.load_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Load status
+ info: VCSA Load component status is orange. It is degraded, and may have serious problems.
+ to: silent
+
+ template: vcsa_load_health_crit
+ on: vcsa.load_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Load status
+ info: VCSA Load component status is red. It is unavailable, or will stop functioning soon.
+ to: sysadmin
+
+ template: vcsa_mem_health_warn
+ on: vcsa.mem_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Memory status
+ info: VCSA Memory component status is orange. It is degraded, and may have serious problems.
+ to: silent
+
+ template: vcsa_mem_health_crit
+ on: vcsa.mem_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Memory status
+ info: VCSA Memory component status is red. It is unavailable, or will stop functioning soon.
+ to: sysadmin
+
+ template: vcsa_swap_health_warn
+ on: vcsa.swap_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Swap status
+ info: VCSA Swap component status is orange. It is degraded, and may have serious problems.
+ to: silent
+
+ template: vcsa_swap_health_crit
+ on: vcsa.swap_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Swap status
+ info: VCSA Swap component status is red. It is unavailable, or will stop functioning soon.
+ to: sysadmin
+
+ template: vcsa_database_storage_health_warn
+ on: vcsa.database_storage_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Database status
+ info: VCSA Database Storage component status is orange. It is degraded, and may have serious problems.
+ to: silent
+
+ template: vcsa_database_storage_health_crit
+ on: vcsa.database_storage_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Database status
+ info: VCSA Database Storage component status is red. It is unavailable, or will stop functioning soon.
+ to: sysadmin
+
+ template: vcsa_storage_health_warn
+ on: vcsa.storage_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Storage status
+ info: VCSA Storage component status is orange. It is degraded, and may have serious problems.
+ to: silent
+
+ template: vcsa_storage_health_crit
+ on: vcsa.storage_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $red
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA Storage status
+ info: VCSA Storage component status is red. It is unavailable, or will stop functioning soon.
+ to: sysadmin
+
+# Software updates health:
+# - 0: no updates available.
+# - 2: non-security updates are available.
+# - 3: security updates are available.
+# - 4: an error retrieving information on software updates.
+
+ template: vcsa_software_packages_health_warn
+ on: vcsa.software_packages_health_status
+ class: Errors
+ type: Virtual Machine
+component: VMware vCenter
+ calc: $orange
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: down 1m multiplier 1.5 max 1h
+ summary: VCSA software status
+ info: VCSA software packages non-security updates are available.
+ to: silent
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
new file mode 100644
index 00000000..6ea9f99d
--- /dev/null
+++ b/health/health.d/vernemq.conf
@@ -0,0 +1,391 @@
+
+# Socket errors
+
+ template: vernemq_socket_errors
+ on: vernemq.socket_errors
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: sum -1m unaligned absolute of socket_error
+ units: errors
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ socket errors
+ info: Number of socket errors in the last minute
+ to: sysadmin
+
+# Queues dropped/expired/unhandled PUBLISH messages
+
+ template: vernemq_queue_message_drop
+ on: vernemq.queue_undelivered_messages
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute of queue_message_drop
+ units: dropped messages
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ dropped messages
+ info: Number of dropped messages due to full queues in the last minute
+ to: sysadmin
+
+ template: vernemq_queue_message_expired
+ on: vernemq.queue_undelivered_messages
+ class: Latency
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute of queue_message_expired
+ units: expired messages
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ expired messages
+ info: number of messages which expired before delivery in the last minute
+ to: sysadmin
+
+ template: vernemq_queue_message_unhandled
+ on: vernemq.queue_undelivered_messages
+ class: Latency
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute of queue_message_unhandled
+ units: unhandled messages
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unhandled messages
+ info: Number of unhandled messages (connections with clean session=true) in the last minute
+ to: sysadmin
+
+# Erlang VM
+
+ template: vernemq_average_scheduler_utilization
+ on: vernemq.average_scheduler_utilization
+ class: Utilization
+ type: Messaging
+component: VerneMQ
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: VerneMQ scheduler utilization
+ info: Average scheduler utilization over the last 10 minutes
+ to: sysadmin
+
+# Cluster communication and netsplits
+
+ template: vernemq_cluster_dropped
+ on: vernemq.cluster_dropped
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: sum -1m unaligned
+ units: KiB
+ every: 1m
+ warn: $this > 0
+ delay: up 5m down 5m multiplier 1.5 max 1h
+ summary: VerneMQ dropped traffic
+ info: Amount of traffic dropped during communication with the cluster nodes in the last minute
+ to: sysadmin
+
+ template: vernemq_netsplits
+ on: vernemq.netsplits
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: sum -1m unaligned absolute of netsplit_detected
+ units: netsplits
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ summary: VerneMQ netsplits
+ info: Number of detected netsplits (split brain situation) in the last minute
+ to: sysadmin
+
+# Unsuccessful CONNACK
+
+ template: vernemq_mqtt_connack_sent_reason_unsuccessful
+ on: vernemq.mqtt_connack_sent_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful CONNACK
+ info: Number of sent unsuccessful v3/v5 CONNACK packets in the last minute
+ to: sysadmin
+
+# Not normal DISCONNECT
+
+ template: vernemq_mqtt_disconnect_received_reason_not_normal
+ on: vernemq.mqtt_disconnect_received_reason
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ received not normal DISCONNECT
+ info: Number of received not normal v5 DISCONNECT packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_disconnect_sent_reason_not_normal
+ on: vernemq.mqtt_disconnect_sent_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ sent not normal DISCONNECT
+ info: Number of sent not normal v5 DISCONNECT packets in the last minute
+ to: sysadmin
+
+# SUBSCRIBE errors and unauthorized attempts
+
+ template: vernemq_mqtt_subscribe_error
+ on: vernemq.mqtt_subscribe_error
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: failed ops
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ failed SUBSCRIBE
+ info: Number of failed v3/v5 SUBSCRIBE operations in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_subscribe_auth_error
+ on: vernemq.mqtt_subscribe_auth_error
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: attempts
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unauthorized SUBSCRIBE
+ info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+ to: sysadmin
+
+# UNSUBSCRIBE errors
+
+ template: vernemq_mqtt_unsubscribe_error
+ on: vernemq.mqtt_unsubscribe_error
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: failed ops
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ failed UNSUBSCRIBE
+ info: Number of failed v3/v5 UNSUBSCRIBE operations in the last minute
+ to: sysadmin
+
+# PUBLISH errors and unauthorized attempts
+
+ template: vernemq_mqtt_publish_errors
+ on: vernemq.mqtt_publish_errors
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: failed ops
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ failed PUBLISH
+ info: Number of failed v3/v5 PUBLISH operations in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_publish_auth_errors
+ on: vernemq.mqtt_publish_auth_errors
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: attempts
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unauthorized PUBLISH
+ info: Number of unauthorized v3/v5 PUBLISH attempts in the last minute
+ to: sysadmin
+
+# Unsuccessful and unexpected PUBACK
+
+ template: vernemq_mqtt_puback_received_reason_unsuccessful
+ on: vernemq.mqtt_puback_received_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful received PUBACK
+ info: Number of received unsuccessful v5 PUBACK packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_puback_sent_reason_unsuccessful
+ on: vernemq.mqtt_puback_sent_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful sent PUBACK
+ info: Number of sent unsuccessful v5 PUBACK packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_puback_unexpected
+ on: vernemq.mqtt_puback_invalid_error
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: messages
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unexpected received PUBACK
+ info: Number of received unexpected v3/v5 PUBACK packets in the last minute
+ to: sysadmin
+
+# Unsuccessful and unexpected PUBREC
+
+ template: vernemq_mqtt_pubrec_received_reason_unsuccessful
+ on: vernemq.mqtt_pubrec_received_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful received PUBREC
+ info: Number of received unsuccessful v5 PUBREC packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubrec_sent_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful sent PUBREC
+ info: Number of sent unsuccessful v5 PUBREC packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_pubrec_invalid_error
+ on: vernemq.mqtt_pubrec_invalid_error
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: messages
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ invalid received PUBREC
+ info: Number of received invalid v3 PUBREC packets in the last minute
+ to: sysadmin
+
+# Unsuccessful PUBREL
+
+ template: vernemq_mqtt_pubrel_received_reason_unsuccessful
+ on: vernemq.mqtt_pubrel_received_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful received PUBREL
+ info: Number of received unsuccessful v5 PUBREL packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubrel_sent_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful sent PUBREL
+ info: number of sent unsuccessful v5 PUBREL packets in the last minute
+ to: sysadmin
+
+# Unsuccessful and unexpected PUBCOMP
+
+ template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
+ on: vernemq.mqtt_pubcomp_received_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful received PUBCOMP
+ info: Number of received unsuccessful v5 PUBCOMP packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubcomp_sent_reason
+ class: Errors
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute match-names of !success,*
+ units: packets
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unsuccessful sent PUBCOMP
+ info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
+ to: sysadmin
+
+ template: vernemq_mqtt_pubcomp_unexpected
+ on: vernemq.mqtt_pubcomp_invalid_error
+ class: Workload
+ type: Messaging
+component: VerneMQ
+ lookup: average -1m unaligned absolute
+ units: messages
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ summary: VerneMQ unexpected received PUBCOMP
+ info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
+ to: sysadmin
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
new file mode 100644
index 00000000..b8ad9aee
--- /dev/null
+++ b/health/health.d/vsphere.conf
@@ -0,0 +1,70 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------Virtual Machine--------------------------------------------------------
+
+ template: vsphere_vm_cpu_utilization
+ on: vsphere.vm_cpu_utilization
+ class: Utilization
+ type: Virtual Machine
+component: CPU
+ hosts: *
+ lookup: average -10m unaligned match-names of used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: vSphere CPU utilization for VM ${label:vm}
+ info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
+ to: silent
+
+ template: vsphere_vm_mem_utilization
+ on: vsphere.vm_mem_utilization
+ class: Utilization
+ type: Virtual Machine
+component: Memory
+ hosts: *
+ calc: $used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: vSphere memory utilization for VM ${label:vm}
+ info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
+ to: silent
+
+# -----------------------------------------------ESXI host--------------------------------------------------------------
+
+ template: vsphere_host_cpu_utilization
+ on: vsphere.host_cpu_utilization
+ class: Utilization
+ type: Virtual Machine
+component: CPU
+ hosts: *
+ lookup: average -10m unaligned match-names of used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: vSphere ESXi CPU utilization for host ${label:host}
+ info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: vsphere_host_mem_utilization
+ on: vsphere.host_mem_utilization
+ class: Utilization
+ type: Virtual Machine
+component: Memory
+ hosts: *
+ calc: $used
+ units: %
+ every: 20s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: vSphere ESXi RAM utilization for host ${label:host}
+ info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
+ to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
new file mode 100644
index 00000000..78f1cc7f
--- /dev/null
+++ b/health/health.d/web_log.conf
@@ -0,0 +1,205 @@
+
+# unmatched lines
+
+# the following alarms trigger only when there is enough data.
+# we assume there is enough data when:
+#
+# $web_log_1m_total_requests > 120
+#
+# i.e. when there are more than 120 requests during the last minute
+
+ template: web_log_1m_total_requests
+ on: web_log.requests
+ class: Workload
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: number of HTTP requests in the last minute
+
+ template: web_log_1m_unmatched
+ on: web_log.excluded_requests
+ class: Errors
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned of unmatched
+ calc: $this * 100 / $web_log_1m_total_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ summary: Web log unparsed
+ info: Percentage of unparsed log lines over the last minute
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there is enough data.
+# we assume there is enough data when:
+#
+# $web_log_1m_requests > 120
+#
+# i.e. when there are more than 120 requests during the last minute
+
+ template: web_log_1m_requests
+ on: web_log.type_requests
+ class: Workload
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: number of HTTP requests in the last minute
+
+ template: web_log_1m_successful
+ on: web_log.type_requests
+ class: Workload
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned of success
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ summary: Web log successful
+ info: Ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
+ to: webmaster
+
+ template: web_log_1m_redirects
+ on: web_log.type_requests
+ class: Workload
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned of redirect
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ summary: Web log redirects
+ info: Ratio of redirection HTTP requests over the last minute (3xx except 304)
+ to: webmaster
+
+ template: web_log_1m_bad_requests
+ on: web_log.type_requests
+ class: Errors
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned of bad
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ summary: Web log bad requests
+ info: Ratio of client error HTTP requests over the last minute (4xx except 401)
+ to: webmaster
+
+ template: web_log_1m_internal_errors
+ on: web_log.type_requests
+ class: Errors
+ type: Web Server
+component: Web log
+ lookup: sum -1m unaligned of error
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ summary: Web log server errors
+ info: Ratio of server error HTTP requests over the last minute (5xx)
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there is enough data.
+# we assume there is enough data when:
+#
+# $web_log_1m_requests > 120
+#
+# i.e. when there are more than 120 requests during the last minute
+
+ template: web_log_10m_response_time
+ on: web_log.request_processing_time
+ class: Latency
+ type: System
+component: Web log
+ lookup: average -10m unaligned of avg
+ units: ms
+ every: 30s
+ info: average HTTP response time over the last 10 minutes
+
+ template: web_log_web_slow
+ on: web_log.request_processing_time
+ class: Latency
+ type: Web Server
+component: Web log
+ lookup: average -1m unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Web log processing time
+ info: Average HTTP response time over the last 1 minute
+ options: no-clear-notification
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there is enough data.
+# we assume there is enough data when:
+#
+# $web_log_5m_successful_old > 120
+#
+# i.e. when successful requests averaged more than 120 requests/s during the
+# 5 minutes starting at -10m and ending at -5m
+
+ template: web_log_5m_successful_old
+ on: web_log.type_requests
+ class: Workload
+ type: Web Server
+component: Web log
+ lookup: average -5m at -5m unaligned of success
+ units: requests/s
+ every: 30s
+ info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
+
+ template: web_log_5m_successful
+ on: web_log.type_requests
+ class: Workload
+ type: Web Server
+component: Web log
+ lookup: average -5m unaligned of success
+ units: requests/s
+ every: 30s
+ info: average number of successful HTTP requests over the last 5 minutes
+
+ template: web_log_5m_requests_ratio
+ on: web_log.type_requests
+ class: Workload
+ type: Web Server
+component: Web log
+ calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
+ units: %
+ every: 30s
+ warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+ crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+ delay: down 15m multiplier 1.5 max 1h
+ options: no-clear-notification
+ summary: Web log 5 minutes requests ratio
+ info: Ratio of successful HTTP requests over the last 5 minutes, \
+ compared with the previous 5 minutes \
+ (clear notification for this alarm will not be sent)
+ to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
new file mode 100644
index 00000000..0a328b59
--- /dev/null
+++ b/health/health.d/whoisquery.conf
@@ -0,0 +1,14 @@
+
+ template: whoisquery_days_until_expiration
+ on: whoisquery.time_until_expiration
+ class: Utilization
+ type: Other
+component: WHOIS
+ calc: $expiry
+ units: seconds
+ every: 60s
+ warn: $this < $days_until_expiration_warning*24*60*60
+ crit: $this < $days_until_expiration_critical*24*60*60
+ summary: Whois expiration time for domain ${label:domain}
+ info: Time until the domain name registration for ${label:domain} expires
+ to: webmaster
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
new file mode 100644
index 00000000..706fcbf2
--- /dev/null
+++ b/health/health.d/windows.conf
@@ -0,0 +1,126 @@
+
+## CPU
+
+ template: windows_10min_cpu_usage
+ on: windows.cpu_utilization_total
+ class: Utilization
+ type: Windows
+component: CPU
+ os: *
+ hosts: *
+ lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: CPU utilization
+ info: Average CPU utilization over the last 10 minutes
+ to: silent
+
+
+## Memory
+
+ template: windows_ram_in_use
+ on: windows.memory_utilization
+ class: Utilization
+ type: Windows
+component: Memory
+ os: *
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Ram utilization
+ info: Memory utilization
+ to: sysadmin
+
+
+## Network
+
+ template: windows_inbound_packets_discarded
+ on: windows.net_nic_discarded
+ class: Errors
+ type: Windows
+component: Network
+ os: *
+ hosts: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ summary: Inbound network packets discarded
+ info: Number of inbound discarded packets for the network interface in the last 10 minutes
+ to: silent
+
+ template: windows_outbound_packets_discarded
+ on: windows.net_nic_discarded
+ class: Errors
+ type: Windows
+component: Network
+ os: *
+ hosts: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ summary: Outbound network packets discarded
+ info: Number of outbound discarded packets for the network interface in the last 10 minutes
+ to: silent
+
+ template: windows_inbound_packets_errors
+ on: windows.net_nic_errors
+ class: Errors
+ type: Windows
+component: Network
+ os: *
+ hosts: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ summary: Inbound network errors
+ info: Number of inbound errors for the network interface in the last 10 minutes
+ to: silent
+
+ template: windows_outbound_packets_errors
+ on: windows.net_nic_errors
+ class: Errors
+ type: Windows
+component: Network
+ os: *
+ hosts: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ summary: Outbound network errors
+ info: Number of outbound errors for the network interface in the last 10 minutes
+ to: silent
+
+
+## Disk
+
+ template: windows_disk_in_use
+ on: windows.logical_disk_space_usage
+ class: Utilization
+ type: Windows
+component: Disk
+ os: *
+ hosts: *
+ calc: ($used) * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: Disk space usage
+ info: Disk space utilization
+ to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
new file mode 100644
index 00000000..d05f3ef0
--- /dev/null
+++ b/health/health.d/x509check.conf
@@ -0,0 +1,26 @@
+
+ template: x509check_days_until_expiration
+ on: x509check.time_until_expiration
+ class: Latency
+ type: Certificates
+component: x509 certificates
+ calc: $expiry
+ units: seconds
+ every: 60s
+ warn: $this < $days_until_expiration_warning*24*60*60
+ crit: $this < $days_until_expiration_critical*24*60*60
+ summary: x509 certificate expiration for ${label:source}
+ info: Time until x509 certificate expires for ${label:source}
+ to: webmaster
+
+ template: x509check_revocation_status
+ on: x509check.revocation_status
+ class: Errors
+ type: Certificates
+component: x509 certificates
+ calc: $revoked
+ every: 60s
+ crit: $this != nan AND $this != 0
+ summary: x509 certificate revocation status for ${label:source}
+ info: x509 certificate revocation status (0: valid, 1: revoked) for ${label:source}
+ to: webmaster
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
new file mode 100644
index 00000000..d2a56100
--- /dev/null
+++ b/health/health.d/zfs.conf
@@ -0,0 +1,44 @@
+
+ alarm: zfs_memory_throttle
+ on: zfs.memory_ops
+ class: Utilization
+ type: System
+component: File system
+ lookup: sum -10m unaligned absolute of throttled
+ units: events
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ summary: ZFS ARC growth throttling
+ info: number of times ZFS had to limit the ARC growth in the last 10 minutes
+ to: silent
+
+# ZFS pool state
+
+ template: zfs_pool_state_warn
+ on: zfspool.state
+ class: Errors
+ type: System
+component: File system
+ calc: $degraded
+ units: boolean
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 1h
+ summary: ZFS pool ${label:pool} state
+ info: ZFS pool ${label:pool} state is degraded
+ to: sysadmin
+
+ template: zfs_pool_state_crit
+ on: zfspool.state
+ class: Errors
+ type: System
+component: File system
+ calc: $faulted + $unavail
+ units: boolean
+ every: 10s
+ crit: $this > 0
+ delay: down 1m multiplier 1.5 max 1h
+ summary: Critical ZFS pool ${label:pool} state
+ info: ZFS pool ${label:pool} state is faulted or unavail
+ to: sysadmin