diff options
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- | conf.d/health.d/beanstalkd.conf | 36 | ||||
-rw-r--r-- | conf.d/health.d/bind_rndc.conf | 2 | ||||
-rw-r--r-- | conf.d/health.d/couchdb.conf | 13 | ||||
-rw-r--r-- | conf.d/health.d/redis.conf | 20 | ||||
-rw-r--r-- | conf.d/health.d/tcp_conn.conf | 19 | ||||
-rw-r--r-- | conf.d/health.d/tcp_listen.conf | 27 | ||||
-rw-r--r-- | conf.d/health.d/tcp_mem.conf | 20 | ||||
-rw-r--r-- | conf.d/health.d/tcp_orphans.conf | 21 | ||||
-rw-r--r-- | conf.d/health.d/tcp_resets.conf | 4 |
9 files changed, 159 insertions, 3 deletions
diff --git a/conf.d/health.d/beanstalkd.conf b/conf.d/health.d/beanstalkd.conf new file mode 100644 index 00000000..30dc2732 --- /dev/null +++ b/conf.d/health.d/beanstalkd.conf @@ -0,0 +1,36 @@ +# get the number of buried jobs in all queues + +template: server_buried_jobs + on: beanstalk.current_jobs + calc: $buried + units: jobs + every: 10s + warn: $this > 0 + crit: $this > 10 + delay: up 0 down 5m multiplier 1.2 max 1h + info: the number of buried jobs aggregated across all tubes + to: sysadmin + +# get the number of buried jobs per queue + +#template: tube_buried_jobs +# on: beanstalk.jobs +# calc: $buried +# units: jobs +# every: 10s +# warn: $this > 0 +# crit: $this > 10 +# delay: up 0 down 5m multiplier 1.2 max 1h +# info: the number of jobs buried per tube +# to: sysadmin + +# get the current number of tubes + +#template: number_of_tubes +# on: beanstalk.current_tubes +# calc: $tubes +# every: 10s +# warn: $this < 5 +# delay: up 0 down 5m multiplier 1.2 max 1h +# info: the current number of tubes on the server +# to: sysadmin diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf index 028bc9d0..4145e77c 100644 --- a/conf.d/health.d/bind_rndc.conf +++ b/conf.d/health.d/bind_rndc.conf @@ -1,4 +1,4 @@ - alarm: bind_rndc_stats_file_size + template: bind_rndc_stats_file_size on: bind_rndc.stats_size units: megabytes every: 60 diff --git a/conf.d/health.d/couchdb.conf b/conf.d/health.d/couchdb.conf new file mode 100644 index 00000000..4a289528 --- /dev/null +++ b/conf.d/health.d/couchdb.conf @@ -0,0 +1,13 @@ + +# make sure couchdb is running + +template: couchdb_last_collected_secs + on: couchdb.request_methods + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf index 5f6d397e..c08a884a 100644 --- a/conf.d/health.d/redis.conf +++ b/conf.d/health.d/redis.conf @@ -12,3 +12,23 @@ template: redis_last_collected_secs info: number of seconds since the last successful data collection to: dba +template: redis_bgsave_broken +families: * + on: redis.bgsave_health + every: 10s + crit: $rdb_last_bgsave_status != 0 + units: ok/failed + info: states if redis bgsave is working + delay: down 5m multiplier 1.5 max 1h + to: dba + +template: redis_bgsave_slow +families: * + on: redis.bgsave_now + every: 10s + warn: $rdb_bgsave_in_progress > 600 + crit: $rdb_bgsave_in_progress > 1200 + units: seconds + info: the time redis needs to save its database + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/conf.d/health.d/tcp_conn.conf b/conf.d/health.d/tcp_conn.conf new file mode 100644 index 00000000..7aa9a980 --- /dev/null +++ b/conf.d/health.d/tcp_conn.conf @@ -0,0 +1,19 @@ + +# +# ${tcp_max_connections} may be nan or -1 if the system +# supports dynamic threshold for TCP connections. +# In this case, the alarm will always be zero. +# + + alarm: tcp_connections + on: ipv4.tcpsock + os: linux + hosts: * + calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) + crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 )) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the percentage of IPv4 TCP connections over the max allowed + to: sysadmin diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf new file mode 100644 index 00000000..957964ae --- /dev/null +++ b/conf.d/health.d/tcp_listen.conf @@ -0,0 +1,27 @@ +# ----------------------------------------------------------------------------- +# tcp listen sockets issues + + alarm: 1m_ipv4_tcp_listen_overflows + on: ipv4.tcplistenissues + os: linux freebsd + hosts: * + lookup: sum -60s unaligned absolute of ListenOverflows + units: overflows + every: 10s + crit: $this > 0 + delay: up 0 down 5m multiplier 1.5 max 1h + info: the number of TCP listen socket overflows during the last minute + to: sysadmin + + alarm: 1m_ipv4_tcp_listen_drops + on: ipv4.tcplistenissues + os: linux + hosts: * + lookup: sum -60s unaligned absolute of ListenDrops + units: drops + every: 10s + crit: $this > 0 + delay: up 0 down 5m multiplier 1.5 max 1h + info: the number of TCP listen socket drops during the last minute + to: sysadmin + diff --git a/conf.d/health.d/tcp_mem.conf b/conf.d/health.d/tcp_mem.conf new file mode 100644 index 00000000..6927d576 --- /dev/null +++ b/conf.d/health.d/tcp_mem.conf @@ -0,0 +1,20 @@ +# +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# We give a warning when TCP is under memory pressure +# and a critical when TCP is 90% of its upper memory limit +# + + alarm: tcp_memory + on: ipv4.sockstat_tcp_mem + os: linux + hosts: * + calc: ${mem} * 100 / ${tcp_mem_high} + units: % + every: 10s + warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) + crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the amount of TCP memory as a percentage of its max memory limit + to: sysadmin diff --git a/conf.d/health.d/tcp_orphans.conf b/conf.d/health.d/tcp_orphans.conf new file mode 100644 index 00000000..280d6590 --- /dev/null +++ b/conf.d/health.d/tcp_orphans.conf @@ -0,0 +1,21 @@ + +# +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# The kernel may penalize orphans by 2x or even 4x +# so we alarm warning at 25% and critical at 50% +# + + alarm: tcp_orphans + on: ipv4.sockstat_tcp_sockets + os: linux + hosts: * + calc: ${orphan} * 100 / ${tcp_max_orphans} + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) + crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 )) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors) + to: sysadmin diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf index fec124ac..e6cfd39a 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/conf.d/health.d/tcp_resets.conf @@ -37,7 +37,7 @@ every: 10s warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) delay: up 0 down 60m multiplier 1.2 max 2h -options: no-clear-notification + options: no-clear-notification info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent) to: sysadmin @@ -62,6 +62,6 @@ options: no-clear-notification every: 10s warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) delay: up 0 down 60m multiplier 1.2 max 2h -options: no-clear-notification + options: no-clear-notification info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent) to: sysadmin |