summaryrefslogtreecommitdiffstats
path: root/conf.d/health.d
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:22:44 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:22:44 +0000
commit 1e6c93250172946eeb38e94a92a1fd12c9d3011e (patch)
tree 8ca5e16dfc7ad6b3bf2738ca0a48408a950f8f7e /conf.d/health.d
parent Update watch file (diff)
download netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.tar.xz
netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.zip
Merging upstream version 1.11.0+dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'conf.d/health.d')
-rw-r--r--conf.d/health.d/apache.conf14
-rw-r--r--conf.d/health.d/backend.conf45
-rw-r--r--conf.d/health.d/beanstalkd.conf36
-rw-r--r--conf.d/health.d/bind_rndc.conf9
-rw-r--r--conf.d/health.d/btrfs.conf57
-rw-r--r--conf.d/health.d/ceph.conf13
-rw-r--r--conf.d/health.d/couchdb.conf13
-rw-r--r--conf.d/health.d/cpu.conf55
-rw-r--r--conf.d/health.d/disks.conf167
-rw-r--r--conf.d/health.d/elasticsearch.conf9
-rw-r--r--conf.d/health.d/entropy.conf16
-rw-r--r--conf.d/health.d/fping.conf53
-rw-r--r--conf.d/health.d/fronius.conf11
-rw-r--r--conf.d/health.d/haproxy.conf27
-rw-r--r--conf.d/health.d/httpcheck.conf99
-rw-r--r--conf.d/health.d/ipc.conf28
-rw-r--r--conf.d/health.d/ipfs.conf11
-rw-r--r--conf.d/health.d/ipmi.conf20
-rw-r--r--conf.d/health.d/isc_dhcpd.conf10
-rw-r--r--conf.d/health.d/lighttpd.conf14
-rw-r--r--conf.d/health.d/mdstat.conf18
-rw-r--r--conf.d/health.d/memcached.conf52
-rw-r--r--conf.d/health.d/memory.conf38
-rw-r--r--conf.d/health.d/mongodb.conf13
-rw-r--r--conf.d/health.d/mysql.conf85
-rw-r--r--conf.d/health.d/named.conf14
-rw-r--r--conf.d/health.d/net.conf122
-rw-r--r--conf.d/health.d/netfilter.conf29
-rw-r--r--conf.d/health.d/nginx.conf14
-rw-r--r--conf.d/health.d/nginx_plus.conf14
-rw-r--r--conf.d/health.d/portcheck.conf48
-rw-r--r--conf.d/health.d/postgres.conf13
-rw-r--r--conf.d/health.d/qos.conf18
-rw-r--r--conf.d/health.d/ram.conf64
-rw-r--r--conf.d/health.d/redis.conf34
-rw-r--r--conf.d/health.d/retroshare.conf25
-rw-r--r--conf.d/health.d/softnet.conf40
-rw-r--r--conf.d/health.d/squid.conf14
-rw-r--r--conf.d/health.d/stiebeleltron.conf11
-rw-r--r--conf.d/health.d/swap.conf43
-rw-r--r--conf.d/health.d/tcp_conn.conf19
-rw-r--r--conf.d/health.d/tcp_listen.conf27
-rw-r--r--conf.d/health.d/tcp_mem.conf20
-rw-r--r--conf.d/health.d/tcp_orphans.conf21
-rw-r--r--conf.d/health.d/tcp_resets.conf67
-rw-r--r--conf.d/health.d/udp_errors.conf49
-rw-r--r--conf.d/health.d/varnish.conf9
-rw-r--r--conf.d/health.d/web_log.conf163
-rw-r--r--conf.d/health.d/zfs.conf10
49 files changed, 0 insertions, 1801 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
deleted file mode 100644
index 0c98b877..00000000
--- a/conf.d/health.d/apache.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure apache is running
-
-template: apache_last_collected_secs
- on: apache.requests
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/conf.d/health.d/backend.conf b/conf.d/health.d/backend.conf
deleted file mode 100644
index 7af100d8..00000000
--- a/conf.d/health.d/backend.conf
+++ /dev/null
@@ -1,45 +0,0 @@
-
-# make sure we are sending data to backend
-
- alarm: backend_last_buffering
- on: netdata.backend_metrics
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful buffering of backend data
- to: dba
-
- alarm: backend_metrics_sent
- on: netdata.backend_metrics
- units: %
- calc: abs($sent) * 100 / abs($buffered)
- every: 10s
- warn: $this != 100
- delay: down 5m multiplier 1.5 max 1h
- info: percentage of metrics sent to the backend server
- to: dba
-
- alarm: backend_metrics_lost
- on: netdata.backend_metrics
- units: metrics
- calc: abs($lost)
- every: 10s
- crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
- delay: down 5m multiplier 1.5 max 1h
- info: number of metrics lost due to repeating failures to contact the backend server
- to: dba
-
-# this chart has been removed from netdata
-# alarm: backend_slow
-# on: netdata.backend_latency
-# units: %
-# calc: $latency * 100 / ($update_every * 1000)
-# every: 10s
-# warn: $this > 50
-# crit: $this > 100
-# delay: down 5m multiplier 1.5 max 1h
-# info: the percentage of time between iterations needed by the backend to process the data sent by netdata
-# to: dba
diff --git a/conf.d/health.d/beanstalkd.conf b/conf.d/health.d/beanstalkd.conf
deleted file mode 100644
index 30dc2732..00000000
--- a/conf.d/health.d/beanstalkd.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-# get the number of buried jobs in all queues
-
-template: server_buried_jobs
- on: beanstalk.current_jobs
- calc: $buried
- units: jobs
- every: 10s
- warn: $this > 0
- crit: $this > 10
- delay: up 0 down 5m multiplier 1.2 max 1h
- info: the number of buried jobs aggregated across all tubes
- to: sysadmin
-
-# get the number of buried jobs per queue
-
-#template: tube_buried_jobs
-# on: beanstalk.jobs
-# calc: $buried
-# units: jobs
-# every: 10s
-# warn: $this > 0
-# crit: $this > 10
-# delay: up 0 down 5m multiplier 1.2 max 1h
-# info: the number of jobs buried per tube
-# to: sysadmin
-
-# get the current number of tubes
-
-#template: number_of_tubes
-# on: beanstalk.current_tubes
-# calc: $tubes
-# every: 10s
-# warn: $this < 5
-# delay: up 0 down 5m multiplier 1.2 max 1h
-# info: the current number of tubes on the server
-# to: sysadmin
diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf
deleted file mode 100644
index 4145e77c..00000000
--- a/conf.d/health.d/bind_rndc.conf
+++ /dev/null
@@ -1,9 +0,0 @@
- template: bind_rndc_stats_file_size
- on: bind_rndc.stats_size
- units: megabytes
- every: 60
- calc: $stats_size
- warn: $this > 512
- crit: $this > 1024
- info: Bind stats file is very large! Consider to create logrotate conf file for it!
- to: sysadmin
diff --git a/conf.d/health.d/btrfs.conf b/conf.d/health.d/btrfs.conf
deleted file mode 100644
index b27aa544..00000000
--- a/conf.d/health.d/btrfs.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-
-template: btrfs_allocated
- on: btrfs.disk
- os: *
- hosts: *
-families: *
- calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (90) : (95))
- crit: $this > (($status == $CRITICAL) ? (95) : (98))
- delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of allocated BTRFS physical disk space
- to: sysadmin
-
-template: btrfs_data
- on: btrfs.data
- os: *
- hosts: *
-families: *
- calc: $used * 100 / ($used + $free)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
- crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
- delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of used BTRFS data space
- to: sysadmin
-
-template: btrfs_metadata
- on: btrfs.metadata
- os: *
- hosts: *
-families: *
- calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
- crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
- delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of used BTRFS metadata space
- to: sysadmin
-
-template: btrfs_system
- on: btrfs.system
- os: *
- hosts: *
-families: *
- calc: $used * 100 / ($used + $free)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
- crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
- delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of used BTRFS system space
- to: sysadmin
-
diff --git a/conf.d/health.d/ceph.conf b/conf.d/health.d/ceph.conf
deleted file mode 100644
index de16f7b6..00000000
--- a/conf.d/health.d/ceph.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-# low ceph disk available
-
-template: cluster_space_usage
- on: ceph.general_usage
- calc: $avail * 100 / ($avail + $used)
- units: %
- every: 10s
- warn: $this < 10
- crit: $this < 1
- delay: down 5m multiplier 1.2 max 1h
- info: ceph disk usage is almost full
- to: sysadmin
-
diff --git a/conf.d/health.d/couchdb.conf b/conf.d/health.d/couchdb.conf
deleted file mode 100644
index 4a289528..00000000
--- a/conf.d/health.d/couchdb.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# make sure couchdb is running
-
-template: couchdb_last_collected_secs
- on: couchdb.request_methods
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
deleted file mode 100644
index fa818985..00000000
--- a/conf.d/health.d/cpu.conf
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-template: 10min_cpu_usage
- on: system.cpu
- os: linux
- hosts: *
- lookup: average -10m unaligned of user,system,softirq,irq,guest
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
- to: sysadmin
-
-template: 10min_cpu_iowait
- on: system.cpu
- os: linux
- hosts: *
- lookup: average -10m unaligned of iowait
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (20) : (40))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
- delay: down 15m multiplier 1.5 max 1h
- info: average CPU wait I/O for the last 10 minutes
- to: sysadmin
-
-template: 20min_steal_cpu
- on: system.cpu
- os: linux
- hosts: *
- lookup: average -20m unaligned of steal
- units: %
- every: 5m
- warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
- delay: down 1h multiplier 1.5 max 2h
- info: average CPU steal time for the last 20 minutes
- to: sysadmin
-
-## FreeBSD
-template: 10min_cpu_usage
- on: system.cpu
- os: freebsd
- hosts: *
- lookup: average -10m unaligned of user,system,interrupt
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 15m multiplier 1.5 max 1h
- info: average cpu utilization for the last 10 minutes (excluding nice)
- to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
deleted file mode 100644
index 26f85848..00000000
--- a/conf.d/health.d/disks.conf
+++ /dev/null
@@ -1,167 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-
-# -----------------------------------------------------------------------------
-# low disk space
-
-# checking the latest collected values
-# raise an alarm if the disk is low on
-# available disk space
-
-template: disk_space_usage
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- calc: $used * 100 / ($avail + $used)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING ) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: up 1m down 15m multiplier 1.5 max 1h
- info: current disk space usage
- to: sysadmin
-
-template: disk_inode_usage
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- calc: $used * 100 / ($avail + $used)
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: up 1m down 15m multiplier 1.5 max 1h
- info: current disk inode usage
- to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk fill rate
-
-# calculate the rate the disk fills
-# use as base, the available space change
-# during the last hour
-
-# this is just a calculation - it has no alarm
-# we will use it in the next template to find
-# the hours remaining
-
-template: disk_fill_rate
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: GB/hour
- info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
-
-
-# calculate the hours remaining
-# if the disk continues to fill
-# at this rate
-
-template: out_of_disk_space_time
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
- to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk inode fill rate
-
-# calculate the rate the disk inodes are allocated
-# use as base, the available inodes change
-# during the last hour
-
-# this is just a calculation - it has no alarm
-# we will use it in the next template to find
-# the hours remaining
-
-template: disk_inode_rate
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: inodes/hour
- info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
-
-# calculate the hours remaining
-# if the disk inodes are allocated
-# at this rate
-
-template: out_of_disk_inodes_time
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
- to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk congestion
-
-# raise an alarm if the disk is congested
-# by calculating the average disk utilization
-# for the last 10 minutes
-
-template: 10min_disk_utilization
- on: disk.util
- os: linux freebsd
- hosts: *
-families: *
- lookup: average -10m unaligned
- units: %
- every: 1m
- green: 90
- red: 98
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
- delay: down 15m multiplier 1.2 max 1h
- info: the percentage of time the disk was busy, during the last 10 minutes
- to: sysadmin
-
-
-# raise an alarm if the disk backlog
-# averages above 2000ms (2s) per second
-# over the last 10 minutes
-# (i.e. the disk cannot catch up)
-
-template: 10min_disk_backlog
- on: disk.backlog
- os: linux
- hosts: *
-families: *
- lookup: average -10m unaligned
- units: ms
- every: 1m
- green: 2000
- red: 5000
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
- delay: down 15m multiplier 1.2 max 1h
- info: average of the kernel estimated disk backlog, for the last 10 minutes
- to: sysadmin
diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf
deleted file mode 100644
index dffd4096..00000000
--- a/conf.d/health.d/elasticsearch.conf
+++ /dev/null
@@ -1,9 +0,0 @@
- alarm: elasticsearch_last_collected
- on: elasticsearch_local.cluster_health_status
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
deleted file mode 100644
index 66d44ec1..00000000
--- a/conf.d/health.d/entropy.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# check if entropy is too low
-# the alarm is checked every 5 minutes
-# and examines the last 10 minutes of data
-
- alarm: lowest_entropy
- on: system.entropy
- os: linux
- hosts: *
- lookup: min -10m unaligned
- units: entries
- every: 5m
- warn: $this < (($status >= $WARNING) ? (200) : (100))
- delay: down 1h multiplier 1.5 max 2h
- info: minimum entries in the random numbers pool in the last 10 minutes
- to: silent
diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf
deleted file mode 100644
index 43658fef..00000000
--- a/conf.d/health.d/fping.conf
+++ /dev/null
@@ -1,53 +0,0 @@
-
-template: fping_last_collected_secs
-families: *
- on: fping.latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-template: host_reachable
-families: *
- on: fping.latency
- calc: $average != nan
- units: up/down
- every: 10s
- crit: $this == 0
- info: states if the remote host is reachable
- delay: down 30m multiplier 1.5 max 2h
- to: sysadmin
-
-template: host_latency
-families: *
- on: fping.latency
- lookup: average -10s unaligned of average
- units: ms
- every: 10s
- green: 500
- red: 1000
- warn: $this > $green OR $max > $red
- crit: $this > $red
- info: average round trip delay during the last 10 seconds
- delay: down 30m multiplier 1.5 max 2h
- to: sysadmin
-
-template: packet_loss
-families: *
- on: fping.quality
- lookup: average -10m unaligned of returned
- calc: 100 - $this
- green: 1
- red: 10
- units: %
- every: 10s
- warn: $this > $green
- crit: $this > $red
- info: packet loss percentage
- delay: down 30m multiplier 1.5 max 2h
- to: sysadmin
-
diff --git a/conf.d/health.d/fronius.conf b/conf.d/health.d/fronius.conf
deleted file mode 100644
index cdf6c8fc..00000000
--- a/conf.d/health.d/fronius.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-template: fronius_last_collected_secs
-families: *
- on: fronius.power
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sitemgr
diff --git a/conf.d/health.d/haproxy.conf b/conf.d/health.d/haproxy.conf
deleted file mode 100644
index e49c70d4..00000000
--- a/conf.d/health.d/haproxy.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-template: haproxy_backend_server_status
- on: haproxy_hs.down
- units: failed servers
- every: 10s
- lookup: average -10s
- crit: $this > 0
- info: number of failed haproxy backend servers
- to: sysadmin
-
-template: haproxy_backend_status
- on: haproxy_hb.down
- units: failed backend
- every: 10s
- lookup: average -10s
- crit: $this > 0
- info: number of failed haproxy backends
- to: sysadmin
-
-template: haproxy_last_collected
- on: haproxy_hb.down
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/conf.d/health.d/httpcheck.conf b/conf.d/health.d/httpcheck.conf
deleted file mode 100644
index 0ddf35ea..00000000
--- a/conf.d/health.d/httpcheck.conf
+++ /dev/null
@@ -1,99 +0,0 @@
-template: httpcheck_last_collected_secs
-families: *
- on: httpcheck.status
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: web_service_up
-families: *
- on: httpcheck.status
- lookup: average -1m unaligned percentage of success
- calc: ($this < 75) ? (0) : ($this)
- every: 5s
- units: up/down
- info: at least 75% verified responses during last 60 seconds, ideal for badges
- to: silent
-
-template: web_service_bad_content
-families: *
- on: httpcheck.status
- lookup: average -5m unaligned percentage of bad_content
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- info: average of unexpected http response content during the last 5 minutes
- options: no-clear-notification
- to: webmaster
-
-template: web_service_bad_status
-families: *
- on: httpcheck.status
- lookup: average -5m unaligned percentage of bad_status
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- info: average of unexpected http status during the last 5 minutes
- options: no-clear-notification
- to: webmaster
-
-template: web_service_timeouts
-families: *
- on: httpcheck.status
- lookup: average -5m unaligned percentage of timeout
- every: 10s
- units: %
- info: average of timeouts during the last 5 minutes
-
-template: no_web_service_connections
-families: *
- on: httpcheck.status
- lookup: average -5m unaligned percentage of no_connection
- every: 10s
- units: %
- info: average of failed requests during the last 5 minutes
-
-# combined timeout & no connection alarm
-template: web_service_unreachable
-families: *
- on: httpcheck.status
- calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
- units: %
- every: 10s
- warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
- crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
- delay: down 5m multiplier 1.5 max 1h
- info: average of failed requests either due to timeouts or no connection during the last 5 minutes
- options: no-clear-notification
- to: webmaster
-
-template: 1h_web_service_response_time
-families: *
- on: httpcheck.responsetime
- lookup: average -1h unaligned of time
- every: 30s
- units: ms
- info: average response time over the last hour
-
-template: web_service_slow
-families: *
- on: httpcheck.responsetime
- lookup: average -3m unaligned of time
- units: ms
- every: 10s
- warn: ($this > ($1h_web_service_response_time * 2) )
- crit: ($this > ($1h_web_service_response_time * 3) )
- info: average response time over the last 3 minutes, compared to the average over the last hour
- delay: down 5m multiplier 1.5 max 1h
- options: no-clear-notification
- to: webmaster
diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf
deleted file mode 100644
index 03cf264d..00000000
--- a/conf.d/health.d/ipc.conf
+++ /dev/null
@@ -1,28 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: semaphores_used
- on: system.ipc_semaphores
- os: linux
- hosts: *
- calc: $semaphores * 100 / $ipc.semaphores.max
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
- delay: down 5m multiplier 1.5 max 1h
- info: the percentage of IPC semaphores used
- to: sysadmin
-
- alarm: semaphore_arrays_used
- on: system.ipc_semaphore_arrays
- os: linux
- hosts: *
- calc: $arrays * 100 / $ipc.semaphores.arrays.max
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
- delay: down 5m multiplier 1.5 max 1h
- info: the percentage of IPC semaphore arrays used
- to: sysadmin
diff --git a/conf.d/health.d/ipfs.conf b/conf.d/health.d/ipfs.conf
deleted file mode 100644
index 3f77572d..00000000
--- a/conf.d/health.d/ipfs.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-
-template: ipfs_datastore_usage
- on: ipfs.repo_size
- calc: $size * 100 / $avail
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: ipfs Datastore close to running out of space
- to: sysadmin
diff --git a/conf.d/health.d/ipmi.conf b/conf.d/health.d/ipmi.conf
deleted file mode 100644
index c2558196..00000000
--- a/conf.d/health.d/ipmi.conf
+++ /dev/null
@@ -1,20 +0,0 @@
- alarm: ipmi_sensors_states
- on: ipmi.sensors_states
- calc: $warning + $critical
- units: sensors
- every: 10s
- warn: $this > 0
- crit: $critical > 0
- delay: up 5m down 15m multiplier 1.5 max 1h
- info: the number IPMI sensors in non-nominal state
- to: sysadmin
-
- alarm: ipmi_events
- on: ipmi.events
- calc: $events
- units: events
- every: 10s
- warn: $this > 0
- delay: up 5m down 15m multiplier 1.5 max 1h
- info: the number of events in the IPMI System Event Log (SEL)
- to: sysadmin
diff --git a/conf.d/health.d/isc_dhcpd.conf b/conf.d/health.d/isc_dhcpd.conf
deleted file mode 100644
index 8054656f..00000000
--- a/conf.d/health.d/isc_dhcpd.conf
+++ /dev/null
@@ -1,10 +0,0 @@
- template: isc_dhcpd_leases_size
- on: isc_dhcpd.leases_total
- units: KB
- every: 60
- calc: $leases_size
- warn: $this > 3072
- crit: $this > 6144
- delay: up 2m down 5m
- info: dhcpd.leases file too big! Module can slow down your server.
- to: sysadmin
diff --git a/conf.d/health.d/lighttpd.conf b/conf.d/health.d/lighttpd.conf
deleted file mode 100644
index 915907a4..00000000
--- a/conf.d/health.d/lighttpd.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure lighttpd is running
-
-template: lighttpd_last_collected_secs
- on: lighttpd.requests
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/conf.d/health.d/mdstat.conf b/conf.d/health.d/mdstat.conf
deleted file mode 100644
index c9e7d20d..00000000
--- a/conf.d/health.d/mdstat.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-template: mdstat_disks
- on: md.disks
- units: failed devices
- every: 10s
- calc: $total - $inuse
- crit: $this > 0
- info: Array is degraded!
- to: sysadmin
-
-template: mdstat_last_collected
- on: md.disks
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
deleted file mode 100644
index d248ef57..00000000
--- a/conf.d/health.d/memcached.conf
+++ /dev/null
@@ -1,52 +0,0 @@
-
-# make sure memcached is running
-
-template: memcached_last_collected_secs
- on: memcached.cache
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
-# detect if memcached cache is full
-
-template: memcached_cache_memory_usage
- on: memcached.cache
- calc: $used * 100 / ($used + $available)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
- delay: up 0 down 15m multiplier 1.5 max 1h
- info: current cache memory usage
- to: dba
-
-
-# find the rate memcached cache is filling
-
-template: cache_fill_rate
- on: memcached.cache
- lookup: min -10m at -50m unaligned of available
- calc: ($this - $available) / (($now - $after) / 3600)
- units: KB/hour
- every: 1m
- info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
-
-
-# find the hours remaining until memcached cache is full
-
-template: out_of_cache_space_time
- on: memcached.cache
- calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.5 max 1h
- info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
- to: dba
diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf
deleted file mode 100644
index 4a0e6e52..00000000
--- a/conf.d/health.d/memory.conf
+++ /dev/null
@@ -1,38 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: 1hour_ecc_memory_correctable
- on: mem.ecc_ce
- os: linux
- hosts: *
- lookup: sum -10m unaligned
- units: errors
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- info: number of ECC correctable errors during the last hour
- to: sysadmin
-
- alarm: 1hour_ecc_memory_uncorrectable
- on: mem.ecc_ue
- os: linux
- hosts: *
- lookup: sum -10m unaligned
- units: errors
- every: 1m
- crit: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- info: number of ECC uncorrectable errors during the last hour
- to: sysadmin
-
- alarm: 1hour_memory_hw_corrupted
- on: mem.hwcorrupt
- os: linux
- hosts: *
- calc: $HardwareCorrupted
- units: MB
- every: 10s
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
- info: amount of memory corrupted due to a hardware failure
- to: sysadmin
diff --git a/conf.d/health.d/mongodb.conf b/conf.d/health.d/mongodb.conf
deleted file mode 100644
index a80cb311..00000000
--- a/conf.d/health.d/mongodb.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# make sure mongodb is running
-
-template: mongodb_last_collected_secs
- on: mongodb.read_operations
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
deleted file mode 100644
index 1eeb993f..00000000
--- a/conf.d/health.d/mysql.conf
+++ /dev/null
@@ -1,85 +0,0 @@
-
-# make sure mysql is running
-
-template: mysql_last_collected_secs
- on: mysql.queries
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
-# -----------------------------------------------------------------------------
-# slow queries
-
-template: mysql_10s_slow_queries
- on: mysql.queries
- lookup: sum -10s of slow_queries
- units: slow queries
- every: 10s
- warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (10) : (20))
- delay: down 5m multiplier 1.5 max 1h
- info: number of mysql slow queries over the last 10 seconds
- to: dba
-
-
-# -----------------------------------------------------------------------------
-# lock waits
-
-template: mysql_10s_table_locks_immediate
- on: mysql.table_locks
- lookup: sum -10s absolute of immediate
- units: immediate locks
- every: 10s
- info: number of table immediate locks over the last 10 seconds
- to: dba
-
-template: mysql_10s_table_locks_waited
- on: mysql.table_locks
- lookup: sum -10s absolute of waited
- units: waited locks
- every: 10s
- info: number of table waited locks over the last 10 seconds
- to: dba
-
-template: mysql_10s_waited_locks_ratio
- on: mysql.table_locks
- calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (10) : (25))
- crit: $this > (($status == $CRITICAL) ? (25) : (50))
- delay: down 30m multiplier 1.5 max 1h
- info: the ratio of mysql waited table locks, for the last 10 seconds
- to: dba
-
-
-# -----------------------------------------------------------------------------
-# replication
-
-template: mysql_replication
- on: mysql.slave_status
- calc: ($sql_running == -1 OR $io_running == -1)?0:1
- units: ok/failed
- every: 10s
- crit: $this == 0
- delay: down 5m multiplier 1.5 max 1h
- info: checks if mysql replication has stopped
- to: dba
-
-template: mysql_replication_lag
- on: mysql.slave_behind
- calc: $seconds
- units: seconds
- every: 10s
- warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (10) : (30))
- delay: down 15m multiplier 1.5 max 1h
- info: the number of seconds mysql replication is behind this master
- to: dba
-
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
deleted file mode 100644
index 4fc65c8e..00000000
--- a/conf.d/health.d/named.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure named is running
-
-template: named_last_collected_secs
- on: named.global_queries
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: domainadmin
-
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
deleted file mode 100644
index 22a88927..00000000
--- a/conf.d/health.d/net.conf
+++ /dev/null
@@ -1,122 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-# dropped packets
-
-# check if an interface is dropping packets
-# the alarm is checked every 1 minute
-# and examines the last 10 minutes of data
-
-template: inbound_packets_dropped
- on: net.drops
- os: linux
- hosts: *
-families: *
- lookup: sum -10m unaligned absolute of inbound
- units: packets
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface inbound dropped packets in the last 10 minutes
- to: sysadmin
-
-template: outbound_packets_dropped
- on: net.drops
- os: linux
- hosts: *
-families: *
- lookup: sum -10m unaligned absolute of outbound
- units: packets
- every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface outbound dropped packets in the last 10 minutes
- to: sysadmin
-
-template: inbound_packets_dropped_ratio
- on: net.packets
- os: linux
- hosts: *
-families: *
- lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
- units: %
- every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
- to: sysadmin
-
-template: outbound_packets_dropped_ratio
- on: net.packets
- os: linux
- hosts: *
-families: *
- lookup: sum -10m unaligned absolute of sent
- calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
- units: %
- every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
- to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# FIFO errors
-
-# check if an interface is having FIFO
-# buffer errors
-# the alarm is checked every 1 minute
-# and examines the last 10 minutes of data
-
-template: 10min_fifo_errors
- on: net.fifo
- os: linux
- hosts: *
-families: *
- lookup: sum -10m unaligned absolute
- units: errors
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 2h
- info: interface fifo errors in the last 10 minutes
- to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# check for packet storms
-
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the latter is more than 50x the former (60x to go straight to critical)
-# the 1m rate is floored at 1000 packets/s, so the minimum packet storm should have
-# more than 50000 packets/s, average of the last 10 seconds
-
-template: 1m_received_packets_rate
- on: net.packets
- os: linux freebsd
- hosts: *
-families: *
- lookup: average -1m of received
- units: packets
- every: 10s
- info: the average number of packets received during the last minute
-
-template: 10s_received_packets_storm
- on: net.packets
- os: linux freebsd
- hosts: *
-families: *
- lookup: average -10s of received
- calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
- every: 10s
- units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
- info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
- to: sysadmin
diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf
deleted file mode 100644
index fa1732b3..00000000
--- a/conf.d/health.d/netfilter.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: netfilter_last_collected_secs
- on: netfilter.conntrack_sockets
- os: linux
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
- alarm: netfilter_conntrack_full
- on: netfilter.conntrack_sockets
- os: linux
- hosts: *
- lookup: max -10s unaligned of connections
- calc: $this * 100 / $netfilter.conntrack.max
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
- delay: down 5m multiplier 1.5 max 1h
- info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
- to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
deleted file mode 100644
index a686c3d9..00000000
--- a/conf.d/health.d/nginx.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure nginx is running
-
-template: nginx_last_collected_secs
- on: nginx.requests
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/conf.d/health.d/nginx_plus.conf b/conf.d/health.d/nginx_plus.conf
deleted file mode 100644
index 5a171a76..00000000
--- a/conf.d/health.d/nginx_plus.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure nginx_plus is running
-
-template: nginx_plus_last_collected_secs
- on: nginx_plus.requests_total
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/conf.d/health.d/portcheck.conf b/conf.d/health.d/portcheck.conf
deleted file mode 100644
index f42b63d3..00000000
--- a/conf.d/health.d/portcheck.conf
+++ /dev/null
@@ -1,48 +0,0 @@
-template: portcheck_last_collected_secs
-families: *
- on: portcheck.status
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: service_reachable
-families: *
- on: portcheck.status
- lookup: average -1m unaligned percentage of success
- calc: ($this < 75) ? (0) : ($this)
- every: 5s
- units: up/down
- info: at least 75% successful connections during last 60 seconds, ideal for badges
- to: silent
-
-template: connection_timeouts
-families: *
- on: portcheck.status
- lookup: average -5m unaligned percentage of timeout
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- info: average of timeouts during the last 5 minutes
- options: no-clear-notification
- to: sysadmin
-
-template: connection_fails
-families: *
- on: portcheck.status
- lookup: average -5m unaligned percentage of no_connection
- every: 10s
- units: %
- warn: $this >= 10 AND $this < 40
- crit: $this >= 40
- delay: down 5m multiplier 1.5 max 1h
- info: average of failed connections during the last 5 minutes
- options: no-clear-notification
- to: sysadmin
diff --git a/conf.d/health.d/postgres.conf b/conf.d/health.d/postgres.conf
deleted file mode 100644
index 4e0583b8..00000000
--- a/conf.d/health.d/postgres.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# make sure postgres is running
-
-template: postgres_last_collected_secs
- on: postgres.db_stat_transactions
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
deleted file mode 100644
index 7290d15f..00000000
--- a/conf.d/health.d/qos.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# check if a QoS class is dropping packets
-# the alarm (commented out below) is checked every 30 seconds
-# and examines the last 10 minutes of data
-
-#template: 10min_qos_packet_drops
-# on: tc.qos_dropped
-# os: linux
-# hosts: *
-# lookup: sum -10m unaligned absolute
-# every: 30s
-# warn: $this > 0
-# delay: up 0 down 30m multiplier 1.5 max 1h
-# units: packets
-# info: dropped packets in the last 10 minutes
-# to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
deleted file mode 100644
index b6dc5f94..00000000
--- a/conf.d/health.d/ram.conf
+++ /dev/null
@@ -1,64 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: used_ram_to_ignore
- on: system.ram
- os: linux
- hosts: *
- calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
- every: 10s
- info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
-
- alarm: ram_in_use
- on: system.ram
- os: linux
- hosts: *
-# calc: $used * 100 / ($used + $cached + $free)
- calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: system RAM used
- to: sysadmin
-
- alarm: ram_available
- on: mem.available
- os: linux
- hosts: *
- calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? ( 5) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
-
-## FreeBSD
-alarm: ram_in_use
- on: system.ram
- os: freebsd
-hosts: *
- calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
- to: sysadmin
-
- alarm: ram_available
- on: system.ram
- os: freebsd
- hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? ( 5) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
deleted file mode 100644
index c08a884a..00000000
--- a/conf.d/health.d/redis.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-
-# make sure redis is running
-
-template: redis_last_collected_secs
- on: redis.operations
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-template: redis_bgsave_broken
-families: *
- on: redis.bgsave_health
- every: 10s
- crit: $rdb_last_bgsave_status != 0
- units: ok/failed
- info: states if redis bgsave is working
- delay: down 5m multiplier 1.5 max 1h
- to: dba
-
-template: redis_bgsave_slow
-families: *
- on: redis.bgsave_now
- every: 10s
- warn: $rdb_bgsave_in_progress > 600
- crit: $rdb_bgsave_in_progress > 1200
- units: seconds
- info: the time redis needs to save its database
- delay: down 5m multiplier 1.5 max 1h
- to: dba
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
deleted file mode 100644
index 2344b60e..00000000
--- a/conf.d/health.d/retroshare.conf
+++ /dev/null
@@ -1,25 +0,0 @@
-# make sure RetroShare is running
-
-template: retroshare_last_collected_secs
- on: retroshare.peers
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# make sure the DHT is fine when active
-
-template: retroshare_dht_working
- on: retroshare.dht
- calc: $dht_size_all
- units: peers
- every: 1m
- warn: $this < (($status >= $WARNING) ? (120) : (100))
- crit: $this < (($status == $CRITICAL) ? (10) : (1))
- delay: up 0 down 15m multiplier 1.5 max 1h
- info: Checks if the DHT has enough peers to operate
- to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
deleted file mode 100644
index 77c804bf..00000000
--- a/conf.d/health.d/softnet.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# check for common /proc/net/softnet_stat errors
-
- alarm: 10min_netdev_backlog_exceeded
- on: system.softnet_stat
- os: linux
- hosts: *
- lookup: sum -10m unaligned absolute of dropped
- units: packets
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 2h
- info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
- to: sysadmin
-
- alarm: 10min_netdev_budget_ran_outs
- on: system.softnet_stat
- os: linux
- hosts: *
- lookup: sum -10m unaligned absolute of squeezed
- units: events
- every: 1m
- warn: $this > (($status >= $WARNING) ? (0) : (10))
- delay: down 1h multiplier 1.5 max 2h
- info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
- to: silent
-
- alarm: 10min_netisr_backlog_exceeded
- on: system.softnet_stat
- os: freebsd
- hosts: *
- lookup: sum -10m unaligned absolute of qdrops
- units: packets
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 2h
- info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
- to: sysadmin
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
deleted file mode 100644
index 06cc9678..00000000
--- a/conf.d/health.d/squid.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure squid is running
-
-template: squid_last_collected_secs
- on: squid.clients_requests
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: proxyadmin
-
diff --git a/conf.d/health.d/stiebeleltron.conf b/conf.d/health.d/stiebeleltron.conf
deleted file mode 100644
index e0361eb2..00000000
--- a/conf.d/health.d/stiebeleltron.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-template: stiebeleltron_last_collected_secs
-families: *
- on: stiebeleltron.heating.hc1
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sitemgr
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
deleted file mode 100644
index f920b080..00000000
--- a/conf.d/health.d/swap.conf
+++ /dev/null
@@ -1,43 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- alarm: 30min_ram_swapped_out
- on: system.swapio
- os: linux freebsd
- hosts: *
- lookup: sum -30m unaligned absolute of out
- # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
- calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- units: % of RAM
- every: 1m
- warn: $this > (($status >= $WARNING) ? (10) : (20))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
- delay: up 0 down 15m multiplier 1.5 max 1h
- info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
- to: sysadmin
-
- alarm: ram_in_swap
- on: system.swap
- os: linux
- hosts: *
- calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- units: % of RAM
- every: 10s
- warn: $this > (($status >= $WARNING) ? (15) : (20))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
- delay: up 30s down 15m multiplier 1.5 max 1h
- info: the swap memory used, as a percentage of the system RAM
- to: sysadmin
-
- alarm: used_swap
- on: system.swap
- os: linux freebsd
- hosts: *
- calc: $used * 100 / ( $used + $free )
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: up 30s down 15m multiplier 1.5 max 1h
- info: the percentage of swap memory used
- to: sysadmin
diff --git a/conf.d/health.d/tcp_conn.conf b/conf.d/health.d/tcp_conn.conf
deleted file mode 100644
index 7aa9a980..00000000
--- a/conf.d/health.d/tcp_conn.conf
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#
-# ${tcp_max_connections} may be nan or -1 if the system
-# supports dynamic threshold for TCP connections.
-# In this case, the alarm will always be zero.
-#
-
- alarm: tcp_connections
- on: ipv4.tcpsock
- os: linux
- hosts: *
- calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
- crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the percentage of IPv4 TCP connections over the max allowed
- to: sysadmin
diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf
deleted file mode 100644
index 957964ae..00000000
--- a/conf.d/health.d/tcp_listen.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-# -----------------------------------------------------------------------------
-# tcp listen sockets issues
-
- alarm: 1m_ipv4_tcp_listen_overflows
- on: ipv4.tcplistenissues
- os: linux freebsd
- hosts: *
- lookup: sum -60s unaligned absolute of ListenOverflows
- units: overflows
- every: 10s
- crit: $this > 0
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of TCP listen socket overflows during the last minute
- to: sysadmin
-
- alarm: 1m_ipv4_tcp_listen_drops
- on: ipv4.tcplistenissues
- os: linux
- hosts: *
- lookup: sum -60s unaligned absolute of ListenDrops
- units: drops
- every: 10s
- crit: $this > 0
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of TCP listen socket drops during the last minute
- to: sysadmin
-
diff --git a/conf.d/health.d/tcp_mem.conf b/conf.d/health.d/tcp_mem.conf
deleted file mode 100644
index 6927d576..00000000
--- a/conf.d/health.d/tcp_mem.conf
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# check
-# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
-#
-# We give a warning when TCP is under memory pressure
-# and a critical when TCP is 90% of its upper memory limit
-#
-
- alarm: tcp_memory
- on: ipv4.sockstat_tcp_mem
- os: linux
- hosts: *
- calc: ${mem} * 100 / ${tcp_mem_high}
- units: %
- every: 10s
- warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
- crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the amount of TCP memory as a percentage of its max memory limit
- to: sysadmin
diff --git a/conf.d/health.d/tcp_orphans.conf b/conf.d/health.d/tcp_orphans.conf
deleted file mode 100644
index 280d6590..00000000
--- a/conf.d/health.d/tcp_orphans.conf
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#
-# check
-# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
-#
-# The kernel may penalize orphans by 2x or even 4x
-# so we alarm warning at 25% and critical at 50%
-#
-
- alarm: tcp_orphans
- on: ipv4.sockstat_tcp_sockets
- os: linux
- hosts: *
- calc: ${orphan} * 100 / ${tcp_max_orphans}
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
- crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
- to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
deleted file mode 100644
index 91dad3c6..00000000
--- a/conf.d/health.d/tcp_resets.conf
+++ /dev/null
@@ -1,67 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-
- alarm: ipv4_tcphandshake_last_collected_secs
- on: ipv4.tcphandshake
- os: linux freebsd
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# -----------------------------------------------------------------------------
-# tcp resets this host sends
-
- alarm: 1m_ipv4_tcp_resets_sent
- on: ipv4.tcphandshake
- os: linux
- hosts: *
- lookup: average -1m at -10s unaligned absolute of OutRsts
- units: tcp resets/s
- every: 10s
- info: average TCP RESETS this host is sending, over the last minute
-
- alarm: 10s_ipv4_tcp_resets_sent
- on: ipv4.tcphandshake
- os: linux
- hosts: *
- lookup: average -10s unaligned absolute of OutRsts
- units: tcp resets/s
- every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
- delay: up 0 down 60m multiplier 1.2 max 2h
- options: no-clear-notification
- info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
- to: sysadmin
-
-# -----------------------------------------------------------------------------
-# tcp resets this host receives
-
- alarm: 1m_ipv4_tcp_resets_received
- on: ipv4.tcphandshake
- os: linux freebsd
- hosts: *
- lookup: average -1m at -10s unaligned absolute of AttemptFails
- units: tcp resets/s
- every: 10s
- info: average TCP RESETS this host is sending, over the last minute
-
- alarm: 10s_ipv4_tcp_resets_received
- on: ipv4.tcphandshake
- os: linux freebsd
- hosts: *
- lookup: average -10s unaligned absolute of AttemptFails
- units: tcp resets/s
- every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
- delay: up 0 down 60m multiplier 1.2 max 2h
- options: no-clear-notification
- info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
- to: sysadmin
diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf
deleted file mode 100644
index 382b3965..00000000
--- a/conf.d/health.d/udp_errors.conf
+++ /dev/null
@@ -1,49 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-
- alarm: ipv4_udperrors_last_collected_secs
- on: ipv4.udperrors
- os: linux freebsd
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# -----------------------------------------------------------------------------
-# UDP receive buffer errors
-
- alarm: 1m_ipv4_udp_receive_buffer_errors
- on: ipv4.udperrors
- os: linux freebsd
- hosts: *
- lookup: sum -1m unaligned absolute of RcvbufErrors
- units: errors
- every: 10s
- warn: $this > 0
- crit: $this > 100
- info: number of UDP receive buffer errors during the last minute
- delay: up 0 down 60m multiplier 1.2 max 2h
- to: sysadmin
-
-# -----------------------------------------------------------------------------
-# UDP send buffer errors
-
- alarm: 1m_ipv4_udp_send_buffer_errors
- on: ipv4.udperrors
- os: linux
- hosts: *
- lookup: sum -1m unaligned absolute of SndbufErrors
- units: errors
- every: 10s
- warn: $this > 0
- crit: $this > 100
- info: number of UDP send buffer errors during the last minute
- delay: up 0 down 60m multiplier 1.2 max 2h
- to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/conf.d/health.d/varnish.conf
deleted file mode 100644
index cca7446b..00000000
--- a/conf.d/health.d/varnish.conf
+++ /dev/null
@@ -1,9 +0,0 @@
- alarm: varnish_last_collected
- on: varnish.uptime
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf
deleted file mode 100644
index d8be88b4..00000000
--- a/conf.d/health.d/web_log.conf
+++ /dev/null
@@ -1,163 +0,0 @@
-
-# make sure we can collect web log data
-
-template: last_collected_secs
- on: web_log.response_codes
-families: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
-# -----------------------------------------------------------------------------
-# high level response code alarms
-
-# the following alarms trigger only when there is enough data.
-# we assume there is enough data when:
-#
-# $1m_requests > 120
-#
-# i.e. when there were more than 120 requests during the last minute
-
-template: 1m_requests
- on: web_log.response_statuses
-families: *
- lookup: sum -1m unaligned
- calc: ($this == 0)?(1):($this)
- units: requests
- every: 10s
- info: the sum of all HTTP requests over the last minute
-
-template: 1m_successful
- on: web_log.response_statuses
-families: *
- lookup: sum -1m unaligned of successful_requests
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
- to: webmaster
-
-template: 1m_redirects
- on: web_log.response_statuses
-families: *
- lookup: sum -1m unaligned of redirects
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP redirects (3xx except 304) over the last minute
- to: webmaster
-
-template: 1m_bad_requests
- on: web_log.response_statuses
-families: *
- lookup: sum -1m unaligned of bad_requests
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP bad requests (4xx) over the last minute
- to: webmaster
-
-template: 1m_internal_errors
- on: web_log.response_statuses
-families: *
- lookup: sum -1m unaligned of server_errors
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP internal server errors (5xx), over the last minute
- to: webmaster
-
-
-# -----------------------------------------------------------------------------
-# web slow
-
-# the following alarms trigger only when there is enough data.
-# we assume there is enough data when:
-#
-# $1m_requests > 120
-#
-# i.e. when there were more than 120 requests during the last minute
-
-template: 10m_response_time
- on: web_log.response_time
-families: *
- lookup: average -10m unaligned of avg
- units: ms
- every: 30s
- info: the average time to respond to HTTP requests, over the last 10 minutes
-
-template: web_slow
- on: web_log.response_time
-families: *
- lookup: average -1m unaligned of avg
- units: ms
- every: 10s
- green: 500
- red: 1000
- warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
- delay: down 15m multiplier 1.5 max 1h
- info: the average time to respond to HTTP requests, over the last 1 minute
- options: no-clear-notification
- to: webmaster
-
-# -----------------------------------------------------------------------------
-# web too many or too few requests
-
-# the following alarms trigger only when there is enough data.
-# we assume there is enough data when:
-#
-# $5m_successful_old > 120
-#
-# i.e. when the average rate of successful requests (in requests/s) during the
-# 5 minutes starting at -10m and ending at -5m was above 120
-
-template: 5m_successful_old
- on: web_log.response_statuses
-families: *
- lookup: average -5m at -5m unaligned of successful_requests
- units: requests/s
- every: 30s
- info: average rate of successful HTTP requests over the last 5 minutes
-
-template: 5m_successful
- on: web_log.response_statuses
-families: *
- lookup: average -5m unaligned of successful_requests
- units: requests/s
- every: 30s
- info: average successful HTTP requests over the last 5 minutes
-
-template: 5m_requests_ratio
- on: web_log.response_codes
-families: *
- calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
- units: %
- every: 30s
- warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
- crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
- delay: down 15m multiplier 1.5 max 1h
-options: no-clear-notification
- info: the percentage of successful web requests over the last 5 minutes, \
- compared with the previous 5 minutes \
- (clear notification for this alarm will not be sent)
- to: webmaster
-
diff --git a/conf.d/health.d/zfs.conf b/conf.d/health.d/zfs.conf
deleted file mode 100644
index af73824e..00000000
--- a/conf.d/health.d/zfs.conf
+++ /dev/null
@@ -1,10 +0,0 @@
-
- alarm: zfs_memory_throttle
- on: zfs.memory_ops
- lookup: sum -10m unaligned absolute of throttled
- units: events
- every: 1m
- warn: $this > 0
- delay: down 1h multiplier 1.5 max 2h
- info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
- to: sysadmin