summaryrefslogtreecommitdiffstats
path: root/conf.d/health.d
diff options
context:
space:
mode:
authorFederico Ceratto <federico.ceratto@gmail.com>2017-12-19 23:39:21 +0000
committerFederico Ceratto <federico.ceratto@gmail.com>2017-12-19 23:39:21 +0000
commit61aedf201c2c4bf0e5aa4db32e74f4d860b88593 (patch)
treebcf4f9a0cd8bc2daf38b2ff9f29bfcc1e5ed8968 /conf.d/health.d
parentNew upstream version 1.8.0+dfsg (diff)
downloadnetdata-61aedf201c2c4bf0e5aa4db32e74f4d860b88593.tar.xz
netdata-61aedf201c2c4bf0e5aa4db32e74f4d860b88593.zip
New upstream version 1.9.0+dfsgupstream/1.9.0+dfsg
Diffstat (limited to '')
-rw-r--r--conf.d/health.d/beanstalkd.conf36
-rw-r--r--conf.d/health.d/bind_rndc.conf2
-rw-r--r--conf.d/health.d/couchdb.conf13
-rw-r--r--conf.d/health.d/redis.conf20
-rw-r--r--conf.d/health.d/tcp_conn.conf19
-rw-r--r--conf.d/health.d/tcp_listen.conf27
-rw-r--r--conf.d/health.d/tcp_mem.conf20
-rw-r--r--conf.d/health.d/tcp_orphans.conf21
-rw-r--r--conf.d/health.d/tcp_resets.conf4
9 files changed, 159 insertions, 3 deletions
diff --git a/conf.d/health.d/beanstalkd.conf b/conf.d/health.d/beanstalkd.conf
new file mode 100644
index 00000000..30dc2732
--- /dev/null
+++ b/conf.d/health.d/beanstalkd.conf
@@ -0,0 +1,36 @@
+# get the number of buried jobs across all tubes (beanstalkd queues)
+
+template: server_buried_jobs
+ on: beanstalk.current_jobs
+ calc: $buried
+ units: jobs
+ every: 10s
+ warn: $this > 0
+ crit: $this > 10
+ delay: up 0 down 5m multiplier 1.2 max 1h
+ info: the number of buried jobs aggregated across all tubes
+ to: sysadmin
+
+# get the number of buried jobs per tube
+
+#template: tube_buried_jobs
+# on: beanstalk.jobs
+# calc: $buried
+# units: jobs
+# every: 10s
+# warn: $this > 0
+# crit: $this > 10
+# delay: up 0 down 5m multiplier 1.2 max 1h
+# info: the number of jobs buried per tube
+# to: sysadmin
+
+# get the current number of tubes
+
+#template: number_of_tubes
+# on: beanstalk.current_tubes
+# calc: $tubes
+# every: 10s
+# warn: $this < 5
+# delay: up 0 down 5m multiplier 1.2 max 1h
+# info: the current number of tubes on the server
+# to: sysadmin
diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf
index 028bc9d0..4145e77c 100644
--- a/conf.d/health.d/bind_rndc.conf
+++ b/conf.d/health.d/bind_rndc.conf
@@ -1,4 +1,4 @@
- alarm: bind_rndc_stats_file_size
+ template: bind_rndc_stats_file_size
on: bind_rndc.stats_size
units: megabytes
every: 60
diff --git a/conf.d/health.d/couchdb.conf b/conf.d/health.d/couchdb.conf
new file mode 100644
index 00000000..4a289528
--- /dev/null
+++ b/conf.d/health.d/couchdb.conf
@@ -0,0 +1,13 @@
+
+# make sure couchdb is running
+
+template: couchdb_last_collected_secs
+ on: couchdb.request_methods
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 5f6d397e..c08a884a 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -12,3 +12,23 @@ template: redis_last_collected_secs
info: number of seconds since the last successful data collection
to: dba
+template: redis_bgsave_broken
+families: *
+ on: redis.bgsave_health
+ every: 10s
+ crit: $rdb_last_bgsave_status != 0
+ units: ok/failed
+ info: states if redis bgsave is working
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+template: redis_bgsave_slow
+families: *
+ on: redis.bgsave_now
+ every: 10s
+ warn: $rdb_bgsave_in_progress > 600
+ crit: $rdb_bgsave_in_progress > 1200
+ units: seconds
+ info: the time redis needs to save its database
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
diff --git a/conf.d/health.d/tcp_conn.conf b/conf.d/health.d/tcp_conn.conf
new file mode 100644
index 00000000..7aa9a980
--- /dev/null
+++ b/conf.d/health.d/tcp_conn.conf
@@ -0,0 +1,19 @@
+
+#
+# ${tcp_max_connections} may be nan or -1 if the system
+# supports a dynamic threshold for TCP connections.
+# In this case, the value of this alarm will always be zero.
+#
+
+ alarm: tcp_connections
+ on: ipv4.tcpsock
+ os: linux
+ hosts: *
+ calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+ crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the percentage of IPv4 TCP connections over the max allowed
+ to: sysadmin
diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf
new file mode 100644
index 00000000..957964ae
--- /dev/null
+++ b/conf.d/health.d/tcp_listen.conf
@@ -0,0 +1,27 @@
+# -----------------------------------------------------------------------------
+# tcp listen sockets issues
+
+ alarm: 1m_ipv4_tcp_listen_overflows
+ on: ipv4.tcplistenissues
+ os: linux freebsd
+ hosts: *
+ lookup: sum -60s unaligned absolute of ListenOverflows
+ units: overflows
+ every: 10s
+ crit: $this > 0
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of TCP listen socket overflows during the last minute
+ to: sysadmin
+
+ alarm: 1m_ipv4_tcp_listen_drops
+ on: ipv4.tcplistenissues
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of ListenDrops
+ units: drops
+ every: 10s
+ crit: $this > 0
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of TCP listen socket drops during the last minute
+ to: sysadmin
+
diff --git a/conf.d/health.d/tcp_mem.conf b/conf.d/health.d/tcp_mem.conf
new file mode 100644
index 00000000..6927d576
--- /dev/null
+++ b/conf.d/health.d/tcp_mem.conf
@@ -0,0 +1,20 @@
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# We raise a warning when TCP is under memory pressure
+# and a critical alarm when TCP memory reaches 90% of its upper limit
+#
+
+ alarm: tcp_memory
+ on: ipv4.sockstat_tcp_mem
+ os: linux
+ hosts: *
+ calc: ${mem} * 100 / ${tcp_mem_high}
+ units: %
+ every: 10s
+ warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
+ crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the amount of TCP memory as a percentage of its max memory limit
+ to: sysadmin
diff --git a/conf.d/health.d/tcp_orphans.conf b/conf.d/health.d/tcp_orphans.conf
new file mode 100644
index 00000000..280d6590
--- /dev/null
+++ b/conf.d/health.d/tcp_orphans.conf
@@ -0,0 +1,21 @@
+
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# The kernel may penalize orphans by 2x or even 4x,
+# so we raise a warning at 25% and a critical alarm at 50%
+#
+
+ alarm: tcp_orphans
+ on: ipv4.sockstat_tcp_sockets
+ os: linux
+ hosts: *
+ calc: ${orphan} * 100 / ${tcp_max_orphans}
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+ crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
+ to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index fec124ac..e6cfd39a 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -37,7 +37,7 @@
every: 10s
warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
delay: up 0 down 60m multiplier 1.2 max 2h
-options: no-clear-notification
+ options: no-clear-notification
info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
to: sysadmin
@@ -62,6 +62,6 @@ options: no-clear-notification
every: 10s
warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 0 down 60m multiplier 1.2 max 2h
-options: no-clear-notification
+ options: no-clear-notification
info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
to: sysadmin