summary | refs | log | tree | commit | diff | stats
path: root/conf.d/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'conf.d/health.d')
-rw-r--r--  conf.d/health.d/fping.conf       2
-rw-r--r--  conf.d/health.d/lighttpd.conf   14
-rw-r--r--  conf.d/health.d/mongodb.conf    13
-rw-r--r--  conf.d/health.d/net.conf         6
-rw-r--r--  conf.d/health.d/ram.conf         9
-rw-r--r--  conf.d/health.d/tcp_resets.conf  8
-rw-r--r--  conf.d/health.d/web_log.conf     3
-rw-r--r--  conf.d/health.d/zfs.conf        10
8 files changed, 55 insertions, 10 deletions
diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf
index 69251b182..43658fef6 100644
--- a/conf.d/health.d/fping.conf
+++ b/conf.d/health.d/fping.conf
@@ -28,7 +28,7 @@ families: *
lookup: average -10s unaligned of average
units: ms
every: 10s
- green: 300
+ green: 500
red: 1000
warn: $this > $green OR $max > $red
crit: $this > $red
diff --git a/conf.d/health.d/lighttpd.conf b/conf.d/health.d/lighttpd.conf
new file mode 100644
index 000000000..915907a4a
--- /dev/null
+++ b/conf.d/health.d/lighttpd.conf
@@ -0,0 +1,14 @@
+
+# make sure lighttpd is running
+
+template: lighttpd_last_collected_secs
+ on: lighttpd.requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
diff --git a/conf.d/health.d/mongodb.conf b/conf.d/health.d/mongodb.conf
new file mode 100644
index 000000000..a80cb3112
--- /dev/null
+++ b/conf.d/health.d/mongodb.conf
@@ -0,0 +1,13 @@
+
+# make sure mongodb is running
+
+template: mongodb_last_collected_secs
+ on: mongodb.read_operations
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index 0232395ac..bd288817b 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -99,9 +99,9 @@ families: *
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
units: %
- warn: $this > (($status >= $WARNING)?(200):(1000))
- crit: $this > (($status >= $WARNING)?(1000):(2000))
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status >= $WARNING)?(5000):(6000))
options: no-clear-notification
- info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute
+ info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index d60df75b2..b99e5e226 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,7 +1,14 @@
+ alarm: used_ram_to_ignore
+ on: system.ram
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ every: 10s
+ info: the amount of memory that is reported as used, but is actually able to resize itself based on the system's needs (e.g. ZFS ARC)
+
alarm: ram_in_use
on: system.ram
- calc: $used * 100 / ($used + $cached + $free)
+# calc: $used * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index 49fb1b924..803c88a81 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -26,10 +26,10 @@
lookup: average -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (4)))
+ warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
delay: up 0 down 60m multiplier 1.2 max 2h
options: no-clear-notification
- info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
+ info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
to: sysadmin
# -----------------------------------------------------------------------------
@@ -47,8 +47,8 @@ options: no-clear-notification
lookup: average -10s unaligned absolute of AttemptFails
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (4)))
+ warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 0 down 60m multiplier 1.2 max 2h
options: no-clear-notification
- info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed)
+ info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
to: sysadmin
diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf
index c668959f5..d18088172 100644
--- a/conf.d/health.d/web_log.conf
+++ b/conf.d/health.d/web_log.conf
@@ -156,6 +156,7 @@ families: *
delay: down 15m multiplier 1.5 max 1h
options: no-clear-notification
info: the percentage of successful web requests over the last 5 minutes, \
- compared with the previous 5 minutes
+ compared with the previous 5 minutes \
+ (clear notification for this alarm will not be sent)
to: webmaster
diff --git a/conf.d/health.d/zfs.conf b/conf.d/health.d/zfs.conf
new file mode 100644
index 000000000..af73824e6
--- /dev/null
+++ b/conf.d/health.d/zfs.conf
@@ -0,0 +1,10 @@
+
+ alarm: zfs_memory_throttle
+ on: zfs.memory_ops
+ lookup: sum -10m unaligned absolute of throttled
+ units: events
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+ to: sysadmin