diff options
Diffstat (limited to 'conf.d/health.d')
49 files changed, 0 insertions, 1801 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf deleted file mode 100644 index 0c98b8778..000000000 --- a/conf.d/health.d/apache.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# make sure apache is running - -template: apache_last_collected_secs - on: apache.requests - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/conf.d/health.d/backend.conf b/conf.d/health.d/backend.conf deleted file mode 100644 index 7af100d8f..000000000 --- a/conf.d/health.d/backend.conf +++ /dev/null @@ -1,45 +0,0 @@ - -# make sure we are sending data to backend - - alarm: backend_last_buffering - on: netdata.backend_metrics - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful buffering of backend data - to: dba - - alarm: backend_metrics_sent - on: netdata.backend_metrics - units: % - calc: abs($sent) * 100 / abs($buffered) - every: 10s - warn: $this != 100 - delay: down 5m multiplier 1.5 max 1h - info: percentage of metrics sent to the backend server - to: dba - - alarm: backend_metrics_lost - on: netdata.backend_metrics - units: metrics - calc: abs($lost) - every: 10s - crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) - delay: down 5m multiplier 1.5 max 1h - info: number of metrics lost due to repeating failures to contact the backend server - to: dba - -# this chart has been removed from netdata -# alarm: backend_slow -# on: netdata.backend_latency -# units: % -# calc: $latency * 100 / ($update_every * 1000) -# every: 10s -# warn: $this > 50 -# crit: $this > 100 -# delay: down 5m multiplier 1.5 max 1h -# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata -# to: dba diff --git a/conf.d/health.d/beanstalkd.conf b/conf.d/health.d/beanstalkd.conf deleted file mode 100644 index 30dc27328..000000000 --- a/conf.d/health.d/beanstalkd.conf +++ /dev/null @@ -1,36 +0,0 @@ -# get the number of buried jobs in all queues - -template: server_buried_jobs - on: beanstalk.current_jobs - calc: $buried - units: jobs - every: 10s - warn: $this > 0 - crit: $this > 10 - delay: up 0 down 5m multiplier 1.2 max 1h - info: the number of buried jobs aggregated across all tubes - to: sysadmin - -# get the number of buried jobs per queue - -#template: tube_buried_jobs -# on: beanstalk.jobs -# calc: $buried -# units: jobs -# every: 10s -# warn: $this > 0 -# crit: $this > 10 -# delay: up 0 down 5m multiplier 1.2 max 1h -# info: the number of jobs buried per tube -# to: sysadmin - -# get the current number of tubes - -#template: number_of_tubes -# on: beanstalk.current_tubes -# calc: $tubes -# every: 10s -# warn: $this < 5 -# delay: up 0 down 5m multiplier 1.2 max 1h -# info: the current number of tubes on the server -# to: sysadmin diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf deleted file mode 100644 index 4145e77cd..000000000 --- a/conf.d/health.d/bind_rndc.conf +++ /dev/null @@ -1,9 +0,0 @@ - template: bind_rndc_stats_file_size - on: bind_rndc.stats_size - units: megabytes - every: 60 - calc: $stats_size - warn: $this > 512 - crit: $this > 1024 - info: Bind stats file is very large! Consider to create logrotate conf file for it! - to: sysadmin diff --git a/conf.d/health.d/btrfs.conf b/conf.d/health.d/btrfs.conf deleted file mode 100644 index b27aa544f..000000000 --- a/conf.d/health.d/btrfs.conf +++ /dev/null @@ -1,57 +0,0 @@ - -template: btrfs_allocated - on: btrfs.disk - os: * - hosts: * -families: * - calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) - crit: $this > (($status == $CRITICAL) ? (95) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of allocated BTRFS physical disk space - to: sysadmin - -template: btrfs_data - on: btrfs.data - os: * - hosts: * -families: * - calc: $used * 100 / ($used + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 - crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 - delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS data space - to: sysadmin - -template: btrfs_metadata - on: btrfs.metadata - os: * - hosts: * -families: * - calc: ($used + $reserved) * 100 / ($used + $free + $reserved) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 - crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 - delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS metadata space - to: sysadmin - -template: btrfs_system - on: btrfs.system - os: * - hosts: * -families: * - calc: $used * 100 / ($used + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 - crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 - delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS system space - to: sysadmin - diff --git a/conf.d/health.d/ceph.conf b/conf.d/health.d/ceph.conf deleted file mode 100644 index de16f7b6f..000000000 --- a/conf.d/health.d/ceph.conf +++ /dev/null @@ -1,13 +0,0 @@ -# low ceph disk available - -template: cluster_space_usage - on: ceph.general_usage - calc: $avail * 100 / ($avail + $used) - units: % - every: 10s - warn: $this < 10 - crit: $this < 1 - delay: down 5m multiplier 1.2 max 1h - info: ceph disk usage is almost full - to: sysadmin - diff --git a/conf.d/health.d/couchdb.conf b/conf.d/health.d/couchdb.conf deleted file mode 100644 index 4a2895280..000000000 --- a/conf.d/health.d/couchdb.conf +++ /dev/null @@ -1,13 +0,0 @@ - -# make sure couchdb is running - -template: couchdb_last_collected_secs - on: couchdb.request_methods - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf deleted file mode 100644 index fa8189856..000000000 --- a/conf.d/health.d/cpu.conf +++ /dev/null @@ -1,55 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -template: 10min_cpu_usage - on: system.cpu - os: linux - hosts: * - lookup: average -10m unaligned of user,system,softirq,irq,guest - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal) - to: sysadmin - -template: 10min_cpu_iowait - on: system.cpu - os: linux - hosts: * - lookup: average -10m unaligned of iowait - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (20) : (40)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) - delay: down 15m multiplier 1.5 max 1h - info: average CPU wait I/O for the last 10 minutes - to: sysadmin - -template: 20min_steal_cpu - on: system.cpu - os: linux - hosts: * - lookup: average -20m unaligned of steal - units: % - every: 5m - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (20) : (30)) - delay: down 1h multiplier 1.5 max 2h - info: average CPU steal time for the last 20 minutes - to: sysadmin - -## FreeBSD -template: 10min_cpu_usage - on: system.cpu - os: freebsd - hosts: * - lookup: average -10m unaligned of user,system,interrupt - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes (excluding nice) - to: sysadmin diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf deleted file mode 100644 index 26f85848a..000000000 --- a/conf.d/health.d/disks.conf +++ /dev/null @@ -1,167 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - -# ----------------------------------------------------------------------------- -# low disk space - -# checking the latest collected values -# raise an alarm if the disk is low on -# available disk space - -template: disk_space_usage - on: disk.space - os: linux freebsd - hosts: * -families: * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING ) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - info: current disk space usage - to: sysadmin - -template: disk_inode_usage - on: disk.inodes - os: linux freebsd - hosts: * -families: * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - info: current disk inode usage - to: sysadmin - - -# ----------------------------------------------------------------------------- -# disk fill rate - -# calculate the rate the disk fills -# use as base, the available space change -# during the last hour - -# this is just a calculation - it has no alarm -# we will use it in the next template to find -# the hours remaining - -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour - - -# calculate the hours remaining -# if the disk continues to fill -# in this rate - -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * -families: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour - to: sysadmin - - -# ----------------------------------------------------------------------------- -# disk inode fill rate - -# calculate the rate the disk inodes are allocated -# use as base, the available inodes change -# during the last hour - -# this is just a calculation - it has no alarm -# we will use it in the next template to find -# the hours remaining - -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour - -# calculate the hours remaining -# if the disk inodes are allocated -# in this rate - -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * -families: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: sysadmin - - -# ----------------------------------------------------------------------------- -# disk congestion - -# raise an alarm if the disk is congested -# by calculating the average disk utilization -# for the last 10 minutes - -template: 10min_disk_utilization - on: disk.util - os: linux freebsd - hosts: * -families: * - lookup: average -10m unaligned - units: % - every: 1m - green: 90 - red: 98 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - info: the percentage of time the disk was busy, during the last 10 minutes - to: sysadmin - - -# raise an alarm if the disk backlog -# is above 1000ms (1s) per second -# for 10 minutes -# (i.e. the disk cannot catch up) - -template: 10min_disk_backlog - on: disk.backlog - os: linux - hosts: * -families: * - lookup: average -10m unaligned - units: ms - every: 1m - green: 2000 - red: 5000 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - info: average of the kernel estimated disk backlog, for the last 10 minutes - to: sysadmin diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf deleted file mode 100644 index dffd40965..000000000 --- a/conf.d/health.d/elasticsearch.conf +++ /dev/null @@ -1,9 +0,0 @@ - alarm: elasticsearch_last_collected - on: elasticsearch_local.cluster_health_status - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf deleted file mode 100644 index 66d44ec13..000000000 --- a/conf.d/health.d/entropy.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# check if entropy is too low -# the alarm is checked every 1 minute -# and examines the last hour of data - - alarm: lowest_entropy - on: system.entropy - os: linux - hosts: * - lookup: min -10m unaligned - units: entries - every: 5m - warn: $this < (($status >= $WARNING) ? (200) : (100)) - delay: down 1h multiplier 1.5 max 2h - info: minimum entries in the random numbers pool in the last 10 minutes - to: silent diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf deleted file mode 100644 index 43658fef6..000000000 --- a/conf.d/health.d/fping.conf +++ /dev/null @@ -1,53 +0,0 @@ - -template: fping_last_collected_secs -families: * - on: fping.latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -template: host_reachable -families: * - on: fping.latency - calc: $average != nan - units: up/down - every: 10s - crit: $this == 0 - info: states if the remote host is reachable - delay: down 30m multiplier 1.5 max 2h - to: sysadmin - -template: host_latency -families: * - on: fping.latency - lookup: average -10s unaligned of average - units: ms - every: 10s - green: 500 - red: 1000 - warn: $this > $green OR $max > $red - crit: $this > $red - info: average round trip delay during the last 10 seconds - delay: down 30m multiplier 1.5 max 2h - to: sysadmin - -template: packet_loss -families: * - on: fping.quality - lookup: average -10m unaligned of returned - calc: 100 - $this - green: 1 - red: 10 - units: % - every: 10s - warn: $this > $green - crit: $this > $red - info: packet loss percentage - delay: down 30m multiplier 1.5 max 2h - to: sysadmin - diff --git a/conf.d/health.d/fronius.conf b/conf.d/health.d/fronius.conf deleted file mode 100644 index cdf6c8fcb..000000000 --- a/conf.d/health.d/fronius.conf +++ /dev/null @@ -1,11 +0,0 @@ -template: fronius_last_collected_secs -families: * - on: fronius.power - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sitemgr diff --git a/conf.d/health.d/haproxy.conf b/conf.d/health.d/haproxy.conf deleted file mode 100644 index e49c70d48..000000000 --- a/conf.d/health.d/haproxy.conf +++ /dev/null @@ -1,27 +0,0 @@ -template: haproxy_backend_server_status - on: haproxy_hs.down - units: failed servers - every: 10s - lookup: average -10s - crit: $this > 0 - info: number of failed haproxy backend servers - to: sysadmin - -template: haproxy_backend_status - on: haproxy_hb.down - units: failed backend - every: 10s - lookup: average -10s - crit: $this > 0 - info: number of failed haproxy backends - to: sysadmin - -template: haproxy_last_collected - on: haproxy_hb.down - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/conf.d/health.d/httpcheck.conf b/conf.d/health.d/httpcheck.conf deleted file mode 100644 index 0ddf35eab..000000000 --- a/conf.d/health.d/httpcheck.conf +++ /dev/null @@ -1,99 +0,0 @@ -template: httpcheck_last_collected_secs -families: * - on: httpcheck.status - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: web_service_up -families: * - on: httpcheck.status - lookup: average -1m unaligned percentage of success - calc: ($this < 75) ? (0) : ($this) - every: 5s - units: up/down - info: at least 75% verified responses during last 60 seconds, ideal for badges - to: silent - -template: web_service_bad_content -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of bad_content - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average of unexpected http response content during the last 5 minutes - options: no-clear-notification - to: webmaster - -template: web_service_bad_status -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of bad_status - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average of unexpected http status during the last 5 minutes - options: no-clear-notification - to: webmaster - -template: web_service_timeouts -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of timeout - every: 10s - units: % - info: average of timeouts during the last 5 minutes - -template: no_web_service_connections -families: * - on: httpcheck.status - lookup: average -5m unaligned percentage of no_connection - every: 10s - units: % - info: average of failed requests during the last 5 minutes - -# combined timeout & no connection alarm -template: web_service_unreachable -families: * - on: httpcheck.status - calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts) - units: % - every: 10s - warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40) - crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average of failed requests either due to timeouts or no connection during the last 5 minutes - options: no-clear-notification - to: webmaster - -template: 1h_web_service_response_time -families: * - on: httpcheck.responsetime - lookup: average -1h unaligned of time - every: 30s - units: ms - info: average response time over the last hour - -template: web_service_slow -families: * - on: httpcheck.responsetime - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($1h_web_service_response_time * 2) ) - crit: ($this > ($1h_web_service_response_time * 3) ) - info: average response time over the last 3 minutes, compared to the average over the last hour - delay: down 5m multiplier 1.5 max 1h - options: no-clear-notification - to: webmaster diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf deleted file mode 100644 index 03cf264d8..000000000 --- a/conf.d/health.d/ipc.conf +++ /dev/null @@ -1,28 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: semaphores_used - on: system.ipc_semaphores - os: linux - hosts: * - calc: $semaphores * 100 / $ipc.semaphores.max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (70) : (90)) - delay: down 5m multiplier 1.5 max 1h - info: the percentage of IPC semaphores used - to: sysadmin - - alarm: semaphore_arrays_used - on: system.ipc_semaphore_arrays - os: linux - hosts: * - calc: $arrays * 100 / $ipc.semaphores.arrays.max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (70) : (90)) - delay: down 5m multiplier 1.5 max 1h - info: the percentage of IPC semaphore arrays used - to: sysadmin diff --git a/conf.d/health.d/ipfs.conf b/conf.d/health.d/ipfs.conf deleted file mode 100644 index 3f77572d6..000000000 --- a/conf.d/health.d/ipfs.conf +++ /dev/null @@ -1,11 +0,0 @@ - -template: ipfs_datastore_usage - on: ipfs.repo_size - calc: $size * 100 / $avail - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - info: ipfs Datastore close to running out of space - to: sysadmin diff --git a/conf.d/health.d/ipmi.conf b/conf.d/health.d/ipmi.conf deleted file mode 100644 index c25581964..000000000 --- a/conf.d/health.d/ipmi.conf +++ /dev/null @@ -1,20 +0,0 @@ - alarm: ipmi_sensors_states - on: ipmi.sensors_states - calc: $warning + $critical - units: sensors - every: 10s - warn: $this > 0 - crit: $critical > 0 - delay: up 5m down 15m multiplier 1.5 max 1h - info: the number IPMI sensors in non-nominal state - to: sysadmin - - alarm: ipmi_events - on: ipmi.events - calc: $events - units: events - every: 10s - warn: $this > 0 - delay: up 5m down 15m multiplier 1.5 max 1h - info: the number of events in the IPMI System Event Log (SEL) - to: sysadmin diff --git a/conf.d/health.d/isc_dhcpd.conf b/conf.d/health.d/isc_dhcpd.conf deleted file mode 100644 index 8054656ff..000000000 --- a/conf.d/health.d/isc_dhcpd.conf +++ /dev/null @@ -1,10 +0,0 @@ - template: isc_dhcpd_leases_size - on: isc_dhcpd.leases_total - units: KB - every: 60 - calc: $leases_size - warn: $this > 3072 - crit: $this > 6144 - delay: up 2m down 5m - info: dhcpd.leases file too big! Module can slow down your server. - to: sysadmin diff --git a/conf.d/health.d/lighttpd.conf b/conf.d/health.d/lighttpd.conf deleted file mode 100644 index 915907a4a..000000000 --- a/conf.d/health.d/lighttpd.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# make sure lighttpd is running - -template: lighttpd_last_collected_secs - on: lighttpd.requests - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/conf.d/health.d/mdstat.conf b/conf.d/health.d/mdstat.conf deleted file mode 100644 index c9e7d20db..000000000 --- a/conf.d/health.d/mdstat.conf +++ /dev/null @@ -1,18 +0,0 @@ -template: mdstat_disks - on: md.disks - units: failed devices - every: 10s - calc: $total - $inuse - crit: $this > 0 - info: Array is degraded! - to: sysadmin - -template: mdstat_last_collected - on: md.disks - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf deleted file mode 100644 index d248ef57a..000000000 --- a/conf.d/health.d/memcached.conf +++ /dev/null @@ -1,52 +0,0 @@ - -# make sure memcached is running - -template: memcached_last_collected_secs - on: memcached.cache - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - -# detect if memcached cache is full - -template: memcached_cache_memory_usage - on: memcached.cache - calc: $used * 100 / ($used + $available) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) - delay: up 0 down 15m multiplier 1.5 max 1h - info: current cache memory usage - to: dba - - -# find the rate memcached cache is filling - -template: cache_fill_rate - on: memcached.cache - lookup: min -10m at -50m unaligned of available - calc: ($this - $available) / (($now - $after) / 3600) - units: KB/hour - every: 1m - info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour - - -# find the hours remaining until memcached cache is full - -template: out_of_cache_space_time - on: memcached.cache - calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.5 max 1h - info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour - to: dba diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf deleted file mode 100644 index 4a0e6e522..000000000 --- a/conf.d/health.d/memory.conf +++ /dev/null @@ -1,38 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 1hour_ecc_memory_correctable - on: mem.ecc_ce - os: linux - hosts: * - lookup: sum -10m unaligned - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - info: number of ECC correctable errors during the last hour - to: sysadmin - - alarm: 1hour_ecc_memory_uncorrectable - on: mem.ecc_ue - os: linux - hosts: * - lookup: sum -10m unaligned - units: errors - every: 1m - crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h - info: number of ECC uncorrectable errors during the last hour - to: sysadmin - - alarm: 1hour_memory_hw_corrupted - on: mem.hwcorrupt - os: linux - hosts: * - calc: $HardwareCorrupted - units: MB - every: 10s - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - info: amount of memory corrupted due to a hardware failure - to: sysadmin diff --git a/conf.d/health.d/mongodb.conf b/conf.d/health.d/mongodb.conf deleted file mode 100644 index a80cb3112..000000000 --- a/conf.d/health.d/mongodb.conf +++ /dev/null @@ -1,13 +0,0 @@ - -# make sure mongodb is running - -template: mongodb_last_collected_secs - on: mongodb.read_operations - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf deleted file mode 100644 index 1eeb993f0..000000000 --- a/conf.d/health.d/mysql.conf +++ /dev/null @@ -1,85 +0,0 @@ - -# make sure mysql is running - -template: mysql_last_collected_secs - on: mysql.queries - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - -# ----------------------------------------------------------------------------- -# slow queries - -template: mysql_10s_slow_queries - on: mysql.queries - lookup: sum -10s of slow_queries - units: slow queries - every: 10s - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (10) : (20)) - delay: down 5m multiplier 1.5 max 1h - info: number of mysql slow queries over the last 10 seconds - to: dba - - -# ----------------------------------------------------------------------------- -# lock waits - -template: mysql_10s_table_locks_immediate - on: mysql.table_locks - lookup: sum -10s absolute of immediate - units: immediate locks - every: 10s - info: number of table immediate locks over the last 10 seconds - to: dba - -template: mysql_10s_table_locks_waited - on: mysql.table_locks - lookup: sum -10s absolute of waited - units: waited locks - every: 10s - info: number of table waited locks over the last 10 seconds - to: dba - -template: mysql_10s_waited_locks_ratio - on: mysql.table_locks - calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (10) : (25)) - crit: $this > (($status == $CRITICAL) ? (25) : (50)) - delay: down 30m multiplier 1.5 max 1h - info: the ratio of mysql waited table locks, for the last 10 seconds - to: dba - - -# ----------------------------------------------------------------------------- -# replication - -template: mysql_replication - on: mysql.slave_status - calc: ($sql_running == -1 OR $io_running == -1)?0:1 - units: ok/failed - every: 10s - crit: $this == 0 - delay: down 5m multiplier 1.5 max 1h - info: checks if mysql replication has stopped - to: dba - -template: mysql_replication_lag - on: mysql.slave_behind - calc: $seconds - units: seconds - every: 10s - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (10) : (30)) - delay: down 15m multiplier 1.5 max 1h - info: the number of seconds mysql replication is behind this master - to: dba - diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf deleted file mode 100644 index 4fc65c8ee..000000000 --- a/conf.d/health.d/named.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# make sure named is running - -template: named_last_collected_secs - on: named.global_queries - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: domainadmin - diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf deleted file mode 100644 index 22a88927d..000000000 --- a/conf.d/health.d/net.conf +++ /dev/null @@ -1,122 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# dropped packets - -# check if an interface is dropping packets -# the alarm is checked every 1 minute -# and examines the last 10 minutes of data - -template: inbound_packets_dropped - on: net.drops - os: linux - hosts: * -families: * - lookup: sum -10m unaligned absolute of inbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin - -template: outbound_packets_dropped - on: net.drops - os: linux - hosts: * -families: * - lookup: sum -10m unaligned absolute of outbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin - -template: inbound_packets_dropped_ratio - on: net.packets - os: linux - hosts: * -families: * - lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) - units: % - every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes - to: sysadmin - -template: outbound_packets_dropped_ratio - on: net.packets - os: linux - hosts: * -families: * - lookup: sum -10m unaligned absolute of sent - calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) - units: % - every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes - to: sysadmin - - -# ----------------------------------------------------------------------------- -# FIFO errors - -# check if an interface is having FIFO -# buffer errors -# the alarm is checked every 1 minute -# and examines the last 10 minutes of data - -template: 10min_fifo_errors - on: net.fifo - os: linux - hosts: * -families: * - lookup: sum -10m unaligned absolute - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - info: interface fifo errors in the last 10 minutes - to: sysadmin - - -# ----------------------------------------------------------------------------- -# check for packet storms - -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - -template: 1m_received_packets_rate - on: net.packets - os: linux freebsd - hosts: * -families: * - lookup: average -1m of received - units: packets - every: 10s - info: the average number of packets received during the last minute - -template: 10s_received_packets_storm - on: net.packets - os: linux freebsd - hosts: * -families: * - lookup: average -10s of received - calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status >= $WARNING)?(5000):(6000)) -options: no-clear-notification - info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) - to: sysadmin diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf deleted file mode 100644 index fa1732b33..000000000 --- a/conf.d/health.d/netfilter.conf +++ /dev/null @@ -1,29 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: netfilter_last_collected_secs - on: netfilter.conntrack_sockets - os: linux - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - - alarm: netfilter_conntrack_full - on: netfilter.conntrack_sockets - os: linux - hosts: * - lookup: max -10s unaligned of connections - calc: $this * 100 / $netfilter.conntrack.max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) - delay: down 5m multiplier 1.5 max 1h - info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size - to: sysadmin diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf deleted file mode 100644 index a686c3d99..000000000 --- a/conf.d/health.d/nginx.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# make sure nginx is running - -template: nginx_last_collected_secs - on: nginx.requests - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/conf.d/health.d/nginx_plus.conf b/conf.d/health.d/nginx_plus.conf deleted file mode 100644 index 5a171a76d..000000000 --- a/conf.d/health.d/nginx_plus.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# make sure nginx_plus is running - -template: nginx_plus_last_collected_secs - on: nginx_plus.requests_total - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/conf.d/health.d/portcheck.conf b/conf.d/health.d/portcheck.conf deleted file mode 100644 index f42b63d30..000000000 --- a/conf.d/health.d/portcheck.conf +++ /dev/null @@ -1,48 +0,0 @@ -template: portcheck_last_collected_secs -families: * - on: portcheck.status - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: service_reachable -families: * - on: portcheck.status - lookup: average -1m unaligned percentage of success - calc: ($this < 75) ? (0) : ($this) - every: 5s - units: up/down - info: at least 75% successful connections during last 60 seconds, ideal for badges - to: silent - -template: connection_timeouts -families: * - on: portcheck.status - lookup: average -5m unaligned percentage of timeout - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average of timeouts during the last 5 minutes - options: no-clear-notification - to: sysadmin - -template: connection_fails -families: * - on: portcheck.status - lookup: average -5m unaligned percentage of no_connection - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - info: average of failed connections during the last 5 minutes - options: no-clear-notification - to: sysadmin diff --git a/conf.d/health.d/postgres.conf b/conf.d/health.d/postgres.conf deleted file mode 100644 index 4e0583b85..000000000 --- a/conf.d/health.d/postgres.conf +++ /dev/null @@ -1,13 +0,0 @@ - -# make sure postgres is running - -template: postgres_last_collected_secs - on: postgres.db_stat_transactions - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf deleted file mode 100644 index 7290d15ff..000000000 --- a/conf.d/health.d/qos.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# check if a QoS class is dropping packets -# the alarm is checked every 10 seconds -# and examines the last minute of data - -#template: 10min_qos_packet_drops -# on: tc.qos_dropped -# os: linux -# hosts: * -# lookup: sum -10m unaligned absolute -# every: 30s -# warn: $this > 0 -# delay: up 0 down 30m multiplier 1.5 max 1h -# units: packets -# info: dropped packets in the last 30 minutes -# to: sysadmin diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf deleted file mode 100644 index b6dc5f945..000000000 --- a/conf.d/health.d/ram.conf +++ /dev/null @@ -1,64 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: used_ram_to_ignore - on: system.ram - os: linux - hosts: * - calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) - every: 10s - info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) - - alarm: ram_in_use - on: system.ram - os: linux - hosts: * -# calc: $used * 100 / ($used + $cached + $free) - calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - info: system RAM used - to: sysadmin - - alarm: ram_available - on: mem.available - os: linux - hosts: * - calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? ( 5) : (10)) - crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) - delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping - to: sysadmin - -## FreeBSD -alarm: ram_in_use - on: system.ram - os: freebsd -hosts: * - calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free) -units: % -every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) -delay: down 15m multiplier 1.5 max 1h - info: system RAM usage - to: sysadmin - - alarm: ram_available - on: system.ram - os: freebsd - hosts: * - calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? ( 5) : (10)) - crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) - delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping - to: sysadmin diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf deleted file mode 100644 index c08a884a6..000000000 --- a/conf.d/health.d/redis.conf +++ /dev/null @@ -1,34 +0,0 @@ - -# make sure redis is running - -template: redis_last_collected_secs - on: redis.operations - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - -template: redis_bgsave_broken -families: * - on: redis.bgsave_health - every: 10s - crit: $rdb_last_bgsave_status != 0 - units: ok/failed - info: states if redis bgsave is working - delay: down 5m multiplier 1.5 max 1h - to: dba - -template: redis_bgsave_slow -families: * - on: redis.bgsave_now - every: 10s - warn: $rdb_bgsave_in_progress > 600 - crit: $rdb_bgsave_in_progress > 1200 - units: seconds - info: the time redis needs to save its database - delay: down 5m multiplier 1.5 max 1h - to: dba diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf deleted file mode 100644 index 2344b60ec..000000000 --- a/conf.d/health.d/retroshare.conf +++ /dev/null @@ -1,25 +0,0 @@ -# make sure RetroShare is running - -template: retroshare_last_collected_secs - on: retroshare.peers - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# make sure the DHT is fine when active - -template: retroshare_dht_working - on: retroshare.dht - calc: $dht_size_all - units: peers - every: 1m - warn: $this < (($status >= $WARNING) ? (120) : (100)) - crit: $this < (($status == $CRITICAL) ? (10) : (1)) - delay: up 0 down 15m multiplier 1.5 max 1h - info: Checks if the DHT has enough peers to operate - to: sysadmin diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf deleted file mode 100644 index 77c804bfd..000000000 --- a/conf.d/health.d/softnet.conf +++ /dev/null @@ -1,40 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# check for common /proc/net/softnet_stat errors - - alarm: 10min_netdev_backlog_exceeded - on: system.softnet_stat - os: linux - hosts: * - lookup: sum -10m unaligned absolute of dropped - units: packets - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) - to: sysadmin - - alarm: 10min_netdev_budget_ran_outs - on: system.softnet_stat - os: linux - hosts: * - lookup: sum -10m unaligned absolute of squeezed - units: events - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 1h multiplier 1.5 max 2h - info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets) - to: silent - - alarm: 10min_netisr_backlog_exceeded - on: system.softnet_stat - os: freebsd - hosts: * - lookup: sum -10m unaligned absolute of qdrops - units: packets - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) - to: sysadmin diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf deleted file mode 100644 index 06cc9678f..000000000 --- a/conf.d/health.d/squid.conf +++ /dev/null @@ -1,14 +0,0 @@ - -# make sure squid is running - -template: squid_last_collected_secs - on: squid.clients_requests - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: proxyadmin - diff --git a/conf.d/health.d/stiebeleltron.conf b/conf.d/health.d/stiebeleltron.conf deleted file mode 100644 index e0361eb20..000000000 --- a/conf.d/health.d/stiebeleltron.conf +++ /dev/null @@ -1,11 +0,0 @@ -template: stiebeleltron_last_collected_secs -families: * - on: stiebeleltron.heating.hc1 - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sitemgr diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf deleted file mode 100644 index f920b0807..000000000 --- a/conf.d/health.d/swap.conf +++ /dev/null @@ -1,43 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 30min_ram_swapped_out - on: system.swapio - os: linux freebsd - hosts: * - lookup: sum -30m unaligned absolute of out - # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 - calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - units: % of RAM - every: 1m - warn: $this > (($status >= $WARNING) ? (10) : (20)) - crit: $this > (($status == $CRITICAL) ? (20) : (30)) - delay: up 0 down 15m multiplier 1.5 max 1h - info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM - to: sysadmin - - alarm: ram_in_swap - on: system.swap - os: linux - hosts: * - calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - units: % of RAM - every: 10s - warn: $this > (($status >= $WARNING) ? (15) : (20)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) - delay: up 30s down 15m multiplier 1.5 max 1h - info: the swap memory used, as a percentage of the system RAM - to: sysadmin - - alarm: used_swap - on: system.swap - os: linux freebsd - hosts: * - calc: $used * 100 / ( $used + $free ) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: up 30s down 15m multiplier 1.5 max 1h - info: the percentage of swap memory used - to: sysadmin diff --git a/conf.d/health.d/tcp_conn.conf b/conf.d/health.d/tcp_conn.conf deleted file mode 100644 index 7aa9a9800..000000000 --- a/conf.d/health.d/tcp_conn.conf +++ /dev/null @@ -1,19 +0,0 @@ - -# -# ${tcp_max_connections} may be nan or -1 if the system -# supports dynamic threshold for TCP connections. -# In this case, the alarm will always be zero. -# - - alarm: tcp_connections - on: ipv4.tcpsock - os: linux - hosts: * - calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) - crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 )) - delay: up 0 down 5m multiplier 1.5 max 1h - info: the percentage of IPv4 TCP connections over the max allowed - to: sysadmin diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf deleted file mode 100644 index 957964ae4..000000000 --- a/conf.d/health.d/tcp_listen.conf +++ /dev/null @@ -1,27 +0,0 @@ -# ----------------------------------------------------------------------------- -# tcp listen sockets issues - - alarm: 1m_ipv4_tcp_listen_overflows - on: ipv4.tcplistenissues - os: linux freebsd - hosts: * - lookup: sum -60s unaligned absolute of ListenOverflows - units: overflows - every: 10s - crit: $this > 0 - delay: up 0 down 5m multiplier 1.5 max 1h - info: the number of TCP listen socket overflows during the last minute - to: sysadmin - - alarm: 1m_ipv4_tcp_listen_drops - on: ipv4.tcplistenissues - os: linux - hosts: * - lookup: sum -60s unaligned absolute of ListenDrops - units: drops - every: 10s - crit: $this > 0 - delay: up 0 down 5m multiplier 1.5 max 1h - info: the number of TCP listen socket drops during the last minute - to: sysadmin - diff --git a/conf.d/health.d/tcp_mem.conf b/conf.d/health.d/tcp_mem.conf deleted file mode 100644 index 6927d5765..000000000 --- a/conf.d/health.d/tcp_mem.conf +++ /dev/null @@ -1,20 +0,0 @@ -# -# check -# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html -# -# We give a warning when TCP is under memory pressure -# and a critical when TCP is 90% of its upper memory limit -# - - alarm: tcp_memory - on: ipv4.sockstat_tcp_mem - os: linux - hosts: * - calc: ${mem} * 100 / ${tcp_mem_high} - units: % - every: 10s - warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) - crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) - delay: up 0 down 5m multiplier 1.5 max 1h - info: the amount of TCP memory as a percentage of its max memory limit - to: sysadmin diff --git a/conf.d/health.d/tcp_orphans.conf b/conf.d/health.d/tcp_orphans.conf deleted file mode 100644 index 280d6590f..000000000 --- a/conf.d/health.d/tcp_orphans.conf +++ /dev/null @@ -1,21 +0,0 @@ - -# -# check -# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html -# -# The kernel may penalize orphans by 2x or even 4x -# so we alarm warning at 25% and critical at 50% -# - - alarm: tcp_orphans - on: ipv4.sockstat_tcp_sockets - os: linux - hosts: * - calc: ${orphan} * 100 / ${tcp_max_orphans} - units: % - every: 10s - warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) - crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 )) - delay: up 0 down 5m multiplier 1.5 max 1h - info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors) - to: sysadmin diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf deleted file mode 100644 index 91dad3c6a..000000000 --- a/conf.d/health.d/tcp_resets.conf +++ /dev/null @@ -1,67 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- - - alarm: ipv4_tcphandshake_last_collected_secs - on: ipv4.tcphandshake - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# ----------------------------------------------------------------------------- -# tcp resets this host sends - - alarm: 1m_ipv4_tcp_resets_sent - on: ipv4.tcphandshake - os: linux - hosts: * - lookup: average -1m at -10s unaligned absolute of OutRsts - units: tcp resets/s - every: 10s - info: average TCP RESETS this host is sending, over the last minute - - alarm: 10s_ipv4_tcp_resets_sent - on: ipv4.tcphandshake - os: linux - hosts: * - lookup: average -10s unaligned absolute of OutRsts - units: tcp resets/s - every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) - delay: up 0 down 60m multiplier 1.2 max 2h - options: no-clear-notification - info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent) - to: sysadmin - -# ----------------------------------------------------------------------------- -# tcp resets this host receives - - alarm: 1m_ipv4_tcp_resets_received - on: ipv4.tcphandshake - os: linux freebsd - hosts: * - lookup: average -1m at -10s unaligned absolute of AttemptFails - units: tcp resets/s - every: 10s - info: average TCP RESETS this host is sending, over the last minute - - alarm: 10s_ipv4_tcp_resets_received - on: ipv4.tcphandshake - os: linux freebsd - hosts: * - lookup: average -10s unaligned absolute of AttemptFails - units: tcp resets/s - every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) - delay: up 0 down 60m multiplier 1.2 max 2h - options: no-clear-notification - info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent) - to: sysadmin diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf deleted file mode 100644 index 382b39658..000000000 --- a/conf.d/health.d/udp_errors.conf +++ /dev/null @@ -1,49 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- - - alarm: ipv4_udperrors_last_collected_secs - on: ipv4.udperrors - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# ----------------------------------------------------------------------------- -# UDP receive buffer errors - - alarm: 1m_ipv4_udp_receive_buffer_errors - on: ipv4.udperrors - os: linux freebsd - hosts: * - lookup: sum -1m unaligned absolute of RcvbufErrors - units: errors - every: 10s - warn: $this > 0 - crit: $this > 100 - info: number of UDP receive buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h - to: sysadmin - -# ----------------------------------------------------------------------------- -# UDP send buffer errors - - alarm: 1m_ipv4_udp_send_buffer_errors - on: ipv4.udperrors - os: linux - hosts: * - lookup: sum -1m unaligned absolute of SndbufErrors - units: errors - every: 10s - warn: $this > 0 - crit: $this > 100 - info: number of UDP send buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h - to: sysadmin diff --git a/conf.d/health.d/varnish.conf b/conf.d/health.d/varnish.conf deleted file mode 100644 index cca7446b4..000000000 --- a/conf.d/health.d/varnish.conf +++ /dev/null @@ -1,9 +0,0 @@ - alarm: varnish_last_collected - on: varnish.uptime - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf deleted file mode 100644 index d8be88b47..000000000 --- a/conf.d/health.d/web_log.conf +++ /dev/null @@ -1,163 +0,0 @@ - -# make sure we can collect web log data - -template: last_collected_secs - on: web_log.response_codes -families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - -# ----------------------------------------------------------------------------- -# high level response code alarms - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_requests > 120 -# -# i.e. when there are at least 120 requests during the last minute - -template: 1m_requests - on: web_log.response_statuses -families: * - lookup: sum -1m unaligned - calc: ($this == 0)?(1):($this) - units: requests - every: 10s - info: the sum of all HTTP requests over the last minute - -template: 1m_successful - on: web_log.response_statuses -families: * - lookup: sum -1m unaligned of successful_requests - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute - to: webmaster - -template: 1m_redirects - on: web_log.response_statuses -families: * - lookup: sum -1m unaligned of redirects - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP redirects (3xx except 304) over the last minute - to: webmaster - -template: 1m_bad_requests - on: web_log.response_statuses -families: * - lookup: sum -1m unaligned of bad_requests - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP bad requests (4xx) over the last minute - to: webmaster - -template: 1m_internal_errors - on: web_log.response_statuses -families: * - lookup: sum -1m unaligned of server_errors - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP internal server errors (5xx), over the last minute - to: webmaster - - -# ----------------------------------------------------------------------------- -# web slow - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_requests > 120 -# -# i.e. when there are at least 120 requests during the last minute - -template: 10m_response_time - on: web_log.response_time -families: * - lookup: average -10m unaligned of avg - units: ms - every: 30s - info: the average time to respond to HTTP requests, over the last 10 minutes - -template: web_slow - on: web_log.response_time -families: * - lookup: average -1m unaligned of avg - units: ms - every: 10s - green: 500 - red: 1000 - warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 ) - delay: down 15m multiplier 1.5 max 1h - info: the average time to respond to HTTP requests, over the last 1 minute - options: no-clear-notification - to: webmaster - -# ----------------------------------------------------------------------------- -# web too many or too few requests - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $5m_successful_old > 120 -# -# i.e. when there were at least 120 requests during the 5 minutes starting -# at -10m and ending at -5m - -template: 5m_successful_old - on: web_log.response_statuses -families: * - lookup: average -5m at -5m unaligned of successful_requests - units: requests/s - every: 30s - info: average rate of successful HTTP requests over the last 5 minutes - -template: 5m_successful - on: web_log.response_statuses -families: * - lookup: average -5m unaligned of successful_requests - units: requests/s - every: 30s - info: average successful HTTP requests over the last 5 minutes - -template: 5m_requests_ratio - on: web_log.response_codes -families: * - calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100) - units: % - every: 30s - warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0) - crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) - delay: down 15m multiplier 1.5 max 1h -options: no-clear-notification - info: the percentage of successful web requests over the last 5 minutes, \ - compared with the previous 5 minutes \ - (clear notification for this alarm will not be sent) - to: webmaster - diff --git a/conf.d/health.d/zfs.conf b/conf.d/health.d/zfs.conf deleted file mode 100644 index af73824e6..000000000 --- a/conf.d/health.d/zfs.conf +++ /dev/null @@ -1,10 +0,0 @@ - - alarm: zfs_memory_throttle - on: zfs.memory_ops - lookup: sum -10m unaligned absolute of throttled - units: events - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - info: the number of times ZFS had to limit the ARC growth in the last 10 minutes - to: sysadmin |