9 files changed, 159 insertions, 3 deletions
diff --git a/conf.d/health.d/beanstalkd.conf b/conf.d/health.d/beanstalkd.conf
new file mode 100644
index 000000000..30dc27328
--- /dev/null
+++ b/conf.d/health.d/beanstalkd.conf
@@ -0,0 +1,36 @@
+# get the number of buried jobs across all tubes (beanstalkd's name for queues)
+
+template: server_buried_jobs
+      on: beanstalk.current_jobs
+    calc: $buried
+   units: jobs
+   every: 10s
+    warn: $this > 0
+    crit: $this > 10
+   delay: up 0 down 5m multiplier 1.2 max 1h
+    info: the number of buried jobs aggregated across all tubes
+      to: sysadmin
+
+# get the number of buried jobs per tube (disabled by default; uncomment to enable)
+
+#template: tube_buried_jobs
+#      on: beanstalk.jobs
+#    calc: $buried
+#   units: jobs
+#   every: 10s
+#    warn: $this > 0
+#    crit: $this > 10
+#   delay: up 0 down 5m multiplier 1.2 max 1h
+#    info: the number of jobs buried per tube
+#      to: sysadmin
+
+# get the current number of tubes (disabled by default; uncomment to enable)
+
+#template: number_of_tubes
+#      on: beanstalk.current_tubes
+#    calc: $tubes
+#   every: 10s
+#    warn: $this < 5
+#   delay: up 0 down 5m multiplier 1.2 max 1h
+#    info: the current number of tubes on the server
+#      to: sysadmin
diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf
index 028bc9d08..4145e77cd 100644
--- a/conf.d/health.d/bind_rndc.conf
+++ b/conf.d/health.d/bind_rndc.conf
@@ -1,4 +1,4 @@
- alarm: bind_rndc_stats_file_size
+ template: bind_rndc_stats_file_size
       on: bind_rndc.stats_size
    units: megabytes
    every: 60
diff --git a/conf.d/health.d/couchdb.conf b/conf.d/health.d/couchdb.conf
new file mode 100644
index 000000000..4a2895280
--- /dev/null
+++ b/conf.d/health.d/couchdb.conf
@@ -0,0 +1,13 @@
+
+# make sure couchdb is running
+
+template: couchdb_last_collected_secs
+      on: couchdb.request_methods
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 5f6d397ea..c08a884a6 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -12,3 +12,23 @@ template: redis_last_collected_secs
     info: number of seconds since the last successful data collection
       to: dba
 
+template: redis_bgsave_broken
+families: *
+      on: redis.bgsave_health
+   every: 10s
+    crit: $rdb_last_bgsave_status != 0
+   units: ok/failed
+    info: states if redis bgsave is working
+   delay: down 5m multiplier 1.5 max 1h
+      to: dba
+
+template: redis_bgsave_slow
+families: *
+      on: redis.bgsave_now
+   every: 10s
+    warn: $rdb_bgsave_in_progress > 600
+    crit: $rdb_bgsave_in_progress > 1200
+   units: seconds
+    info: the time redis needs to save its database
+   delay: down 5m multiplier 1.5 max 1h
+      to: dba
diff --git a/conf.d/health.d/tcp_conn.conf b/conf.d/health.d/tcp_conn.conf
new file mode 100644
index 000000000..7aa9a9800
--- /dev/null
+++ b/conf.d/health.d/tcp_conn.conf
@@ -0,0 +1,19 @@
+
+#
+# ${tcp_max_connections} may be nan or -1 if the system
+# uses a dynamic threshold for TCP connections.
+# In that case, the calculated value of this alarm is always zero.
+#
+
+   alarm: tcp_connections
+      on: ipv4.tcpsock
+      os: linux
+   hosts: *
+    calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+    crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the percentage of IPv4 TCP connections over the max allowed
+      to: sysadmin
diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf
new file mode 100644
index 000000000..957964ae4
--- /dev/null
+++ b/conf.d/health.d/tcp_listen.conf
@@ -0,0 +1,27 @@
+# -----------------------------------------------------------------------------
+# TCP listen socket issues
+
+   alarm: 1m_ipv4_tcp_listen_overflows
+      on: ipv4.tcplistenissues
+      os: linux freebsd
+   hosts: *
+  lookup: sum -60s unaligned absolute of ListenOverflows
+   units: overflows
+   every: 10s
+    crit: $this > 0
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of TCP listen socket overflows during the last minute
+      to: sysadmin
+
+   alarm: 1m_ipv4_tcp_listen_drops
+      on: ipv4.tcplistenissues
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of ListenDrops
+   units: drops
+   every: 10s
+    crit: $this > 0
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of TCP listen socket drops during the last minute
+      to: sysadmin
+
diff --git a/conf.d/health.d/tcp_mem.conf b/conf.d/health.d/tcp_mem.conf
new file mode 100644
index 000000000..6927d5765
--- /dev/null
+++ b/conf.d/health.d/tcp_mem.conf
@@ -0,0 +1,20 @@
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# We raise a warning when TCP is under memory pressure
+# and a critical when TCP memory exceeds 90% of its upper memory limit
+#
+
+   alarm: tcp_memory
+      on: ipv4.sockstat_tcp_mem
+      os: linux
+   hosts: *
+    calc: ${mem} * 100 / ${tcp_mem_high}
+   units: %
+   every: 10s
+    warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
+    crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the amount of TCP memory as a percentage of its max memory limit
+      to: sysadmin
diff --git a/conf.d/health.d/tcp_orphans.conf b/conf.d/health.d/tcp_orphans.conf
new file mode 100644
index 000000000..280d6590f
--- /dev/null
+++ b/conf.d/health.d/tcp_orphans.conf
@@ -0,0 +1,21 @@
+
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# The kernel may penalize orphans by 2x or even 4x
+# so we raise a warning at 25% and a critical at 50%
+#
+
+   alarm: tcp_orphans
+      on: ipv4.sockstat_tcp_sockets
+      os: linux
+   hosts: *
+    calc: ${orphan} * 100 / ${tcp_max_orphans}
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+    crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
+      to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index fec124ac7..e6cfd39ab 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -37,7 +37,7 @@
    every: 10s
     warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
    delay: up 0 down 60m multiplier 1.2 max 2h
-options: no-clear-notification
+ options: no-clear-notification
     info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
       to: sysadmin
 
@@ -62,6 +62,6 @@ options: no-clear-notification
    every: 10s
     warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
    delay: up 0 down 60m multiplier 1.2 max 2h
-options: no-clear-notification
+ options: no-clear-notification
     info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
       to: sysadmin