8 files changed, 55 insertions, 10 deletions
diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf
index 69251b182..43658fef6 100644
--- a/conf.d/health.d/fping.conf
+++ b/conf.d/health.d/fping.conf
@@ -28,7 +28,7 @@ families: *
   lookup: average -10s unaligned of average
    units: ms
    every: 10s
-   green: 300
+   green: 500
      red: 1000
     warn: $this > $green OR $max > $red
     crit: $this > $red
diff --git a/conf.d/health.d/lighttpd.conf b/conf.d/health.d/lighttpd.conf
new file mode 100644
index 000000000..915907a4a
--- /dev/null
+++ b/conf.d/health.d/lighttpd.conf
@@ -0,0 +1,14 @@
+
+# make sure lighttpd is running
+
+template: lighttpd_last_collected_secs
+      on: lighttpd.requests
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
diff --git a/conf.d/health.d/mongodb.conf b/conf.d/health.d/mongodb.conf
new file mode 100644
index 000000000..a80cb3112
--- /dev/null
+++ b/conf.d/health.d/mongodb.conf
@@ -0,0 +1,13 @@
+
+# make sure mongodb is running
+
+template: mongodb_last_collected_secs
+      on: mongodb.read_operations
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index 0232395ac..bd288817b 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -99,9 +99,9 @@ families: *
     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
    every: 10s
    units: %
-   warn: $this > (($status >= $WARNING)?(200):(1000))
-   crit: $this > (($status >= $WARNING)?(1000):(2000))
+   warn: $this > (($status >= $WARNING)?(200):(5000))
+   crit: $this > (($status >= $WARNING)?(5000):(6000))
 options: no-clear-notification
-   info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute
+   info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
      to: sysadmin
 
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index d60df75b2..b99e5e226 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,7 +1,14 @@
 
+   alarm: used_ram_to_ignore
+      on: system.ram
+    calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+   every: 10s
+    info: the amount of memory that is reported as used, but is actually able to resize itself based on system needs (e.g. ZFS ARC)
+
    alarm: ram_in_use
       on: system.ram
-    calc: $used * 100 / ($used + $cached + $free)
+# previous calc, before excluding $used_ram_to_ignore: $used * 100 / ($used + $cached + $free)
+    calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index 49fb1b924..803c88a81 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -26,10 +26,10 @@
   lookup: average -10s unaligned absolute of OutRsts
    units: tcp resets/s
    every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (4)))
+    warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
    delay: up 0 down 60m multiplier 1.2 max 2h
 options: no-clear-notification
-    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
+    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is in progress, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -47,8 +47,8 @@ options: no-clear-notification
   lookup: average -10s unaligned absolute of AttemptFails
    units: tcp resets/s
    every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (4)))
+    warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
    delay: up 0 down 60m multiplier 1.2 max 2h
 options: no-clear-notification
-    info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed)
+    info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs has crashed; clear notification for this alarm will not be sent)
       to: sysadmin
diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf
index c668959f5..d18088172 100644
--- a/conf.d/health.d/web_log.conf
+++ b/conf.d/health.d/web_log.conf
@@ -156,6 +156,7 @@ families: *
    delay: down 15m multiplier 1.5 max 1h
 options: no-clear-notification
     info: the percentage of successful web requests over the last 5 minutes, \
-          compared with the previous 5 minutes
+          compared with the previous 5 minutes \
+          (clear notification for this alarm will not be sent)
       to: webmaster
 
diff --git a/conf.d/health.d/zfs.conf b/conf.d/health.d/zfs.conf
new file mode 100644
index 000000000..af73824e6
--- /dev/null
+++ b/conf.d/health.d/zfs.conf
@@ -0,0 +1,10 @@
+
+   alarm: zfs_memory_throttle
+      on: zfs.memory_ops
+  lookup: sum -10m unaligned absolute of throttled
+   units: events
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 2h
+    info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+      to: sysadmin