New upstream version 1.11.0+dfsg upstream/1.11.0+dfsg

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:19:29 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:20:17 +0000
commit: a64a253794ac64cb40befee54db53bde17dd0d49 (patch)
tree: c1024acc5f6e508814b944d99f112259bb28b1be /health/health.d
parent: New upstream version 1.10.0+dfsg (diff)
download: netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz
netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip
57 files changed, 2185 insertions, 0 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
new file mode 100644
index 00000000..a1301ce8
--- /dev/null
+++ b/health/health.d/adaptec_raid.conf
@@ -0,0 +1,24 @@
+
+# logical device status check
+
+template: adapter_raid_ld_status
+      on: adapter_raid.ld_status
+  lookup: max -5s
+   units: bool
+   every: 10s
+    crit: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: at least 1 logical device is failed or degraded
+      to: sysadmin
+
+# physical device state check
+
+template: adapter_raid_pd_state
+      on: adapter_raid.pd_state
+  lookup: max -5s
+   units: bool
+   every: 10s
+    crit: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: at least 1 physical device is not in online state
+      to: sysadmin
diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf
new file mode 100644
index 00000000..0c98b877
--- /dev/null
+++ b/health/health.d/apache.conf
@@ -0,0 +1,14 @@
+
+# make sure apache is running
+
+template: apache_last_collected_secs
+      on: apache.requests
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
new file mode 100644
index 00000000..4f86037b
--- /dev/null
+++ b/health/health.d/apcupsd.conf
@@ -0,0 +1,40 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: 10min_ups_load
+      on: apcupsd.load
+      os: *
+   hosts: *
+  lookup: average -10m unaligned of percentage
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 10m multiplier 1.5 max 1h
+    info: average UPS load for the last 10 minutes
+      to: sitemgr
+
+# Discussion in https://github.com/netdata/netdata/pull/3928:
+# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
+template: ups_charge
+      on: apcupsd.charge
+      os: *
+   hosts: *
+  lookup: average -60s unaligned of charge
+   units: %
+   every: 60s
+    warn: $this < 100
+    crit: $this < (($status == $CRITICAL) ? (60) : (50))
+   delay: down 10m multiplier 1.5 max 1h
+    info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+      to: sitemgr
+
+template: apcupsd_last_collected_secs
+      on: apcupsd.load
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sitemgr
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
new file mode 100644
index 00000000..7af100d8
--- /dev/null
+++ b/health/health.d/backend.conf
@@ -0,0 +1,45 @@
+
+# make sure we are sending data to backend
+
+   alarm: backend_last_buffering
+      on: netdata.backend_metrics
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful buffering of backend data
+      to: dba
+
+   alarm: backend_metrics_sent
+      on: netdata.backend_metrics
+   units: %
+    calc: abs($sent) * 100 / abs($buffered)
+   every: 10s
+    warn: $this != 100
+   delay: down 5m multiplier 1.5 max 1h
+    info: percentage of metrics sent to the backend server
+      to: dba
+
+   alarm: backend_metrics_lost
+      on: netdata.backend_metrics
+   units: metrics
+    calc: abs($lost)
+   every: 10s
+    crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of metrics lost due to repeating failures to contact the backend server
+      to: dba
+
+# this chart has been removed from netdata
+#   alarm: backend_slow
+#      on: netdata.backend_latency
+#   units: %
+#    calc: $latency * 100 / ($update_every * 1000)
+#   every: 10s
+#    warn: $this > 50
+#    crit: $this > 100
+#   delay: down 5m multiplier 1.5 max 1h
+#    info: the percentage of time between iterations needed by the backend to process the data sent by netdata
+#      to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
new file mode 100644
index 00000000..f0da9ac5
--- /dev/null
+++ b/health/health.d/bcache.conf
@@ -0,0 +1,22 @@
+
+template: bcache_cache_errors
+      on: disk.bcache_cache_read_races
+  lookup: sum -10m unaligned absolute
+   units: errors
+   every: 1m
+    warn: $this > 0
+    crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
+   delay: down 1h multiplier 1.5 max 2h
+    info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+      to: sysadmin
+
+template: bcache_cache_dirty
+      on: disk.bcache_cache_alloc
+    calc: $dirty + $metadata + $undefined
+   units: %
+   every: 1m
+    warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
+    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+      to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
new file mode 100644
index 00000000..30dc2732
--- /dev/null
+++ b/health/health.d/beanstalkd.conf
@@ -0,0 +1,36 @@
+# get the number of buried jobs in all queues
+
+template: server_buried_jobs
+      on: beanstalk.current_jobs
+    calc: $buried
+   units: jobs
+   every: 10s
+    warn: $this > 0
+    crit: $this > 10
+   delay: up 0 down 5m multiplier 1.2 max 1h
+    info: the number of buried jobs aggregated across all tubes
+      to: sysadmin
+      
+# get the number of buried jobs per queue
+
+#template: tube_buried_jobs
+#      on: beanstalk.jobs
+#    calc: $buried
+#   units: jobs
+#   every: 10s
+#    warn: $this > 0
+#    crit: $this > 10
+#   delay: up 0 down 5m multiplier 1.2 max 1h
+#    info: the number of jobs buried per tube
+#      to: sysadmin
+
+# get the current number of tubes
+
+#template: number_of_tubes
+#      on: beanstalk.current_tubes
+#    calc: $tubes
+#   every: 10s
+#    warn: $this < 5
+#   delay: up 0 down 5m multiplier 1.2 max 1h
+#    info: the current number of tubes on the server
+#      to: sysadmin
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
new file mode 100644
index 00000000..4145e77c
--- /dev/null
+++ b/health/health.d/bind_rndc.conf
@@ -0,0 +1,9 @@
+ template: bind_rndc_stats_file_size
+      on: bind_rndc.stats_size
+   units: megabytes
+   every: 60
+    calc: $stats_size
+    warn: $this > 512
+    crit: $this > 1024
+    info: Bind stats file is very large! Consider to create logrotate conf file for it!
+      to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
new file mode 100644
index 00000000..43c588db
--- /dev/null
+++ b/health/health.d/boinc.conf
@@ -0,0 +1,62 @@
+# Alarms for various BOINC issues.
+
+# Warn when the 10-minute average of tasks in compute-error state is above 0 (critical above 1).
+template: boinc_compute_errors
+      on: boinc.states
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of comperror
+   units: tasks
+   every: 1m
+    warn: $this > 0
+    crit: $this > 1
+   delay: up 1m down 5m multiplier 1.5 max 1h
+    info: the average number of compute errors over the past 10 minutes
+      to: sysadmin
+
+# Warn on lots of upload errors
+template: boinc_upload_errors
+      on: boinc.states
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of upload_failed
+   units: tasks
+   every: 1m
+    warn: $this > 0
+    crit: $this > 1
+   delay: up 1m down 5m multiplier 1.5 max 1h
+    info: the average number of failed uploads over the past 10 minutes
+      to: sysadmin
+
+# Warn on the task queue being empty
+template: boinc_total_tasks
+      on: boinc.tasks
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of total
+   units: tasks
+   every: 1m
+    warn: $this < 1
+    crit: $this < 0.1
+   delay: up 5m down 10m multiplier 1.5 max 1h
+    info: the total number of locally available tasks
+      to: sysadmin
+
+# Warn on no active tasks with a non-empty queue
+template: boinc_active_tasks
+      on: boinc.tasks
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of active
+    calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+   units: tasks
+   every: 1m
+    warn: $this < 1
+    crit: $this < 0.1
+   delay: up 5m down 10m multiplier 1.5 max 1h
+    info: the total number of active tasks
+      to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
new file mode 100644
index 00000000..b27aa544
--- /dev/null
+++ b/health/health.d/btrfs.conf
@@ -0,0 +1,57 @@
+
+template: btrfs_allocated
+      on: btrfs.disk
+      os: *
+   hosts: *
+families: *
+    calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (90) : (95))
+    crit: $this > (($status == $CRITICAL) ? (95) : (98))
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: the percentage of allocated BTRFS physical disk space
+      to: sysadmin
+
+template: btrfs_data
+      on: btrfs.data
+      os: *
+   hosts: *
+families: *
+    calc: $used * 100 / ($used + $free)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: the percentage of used BTRFS data space
+      to: sysadmin
+
+template: btrfs_metadata
+      on: btrfs.metadata
+      os: *
+   hosts: *
+families: *
+    calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: the percentage of used BTRFS metadata space
+      to: sysadmin
+
+template: btrfs_system
+      on: btrfs.system
+      os: *
+   hosts: *
+families: *
+    calc: $used * 100 / ($used + $free)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: the percentage of used BTRFS system space
+      to: sysadmin
+
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
new file mode 100644
index 00000000..de16f7b6
--- /dev/null
+++ b/health/health.d/ceph.conf
@@ -0,0 +1,13 @@
+# low ceph disk available
+
+template: cluster_space_usage
+      on: ceph.general_usage
+    calc: $avail * 100 / ($avail + $used)
+   units: %
+   every: 10s
+    warn: $this < 10
+    crit: $this < 1
+   delay: down 5m multiplier 1.2 max 1h
+    info: ceph disk usage is almost full
+      to: sysadmin
+
diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf
new file mode 100644
index 00000000..4a289528
--- /dev/null
+++ b/health/health.d/couchdb.conf
@@ -0,0 +1,13 @@
+
+# make sure couchdb is running
+
+template: couchdb_last_collected_secs
+      on: couchdb.request_methods
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
new file mode 100644
index 00000000..fa818985
--- /dev/null
+++ b/health/health.d/cpu.conf
@@ -0,0 +1,55 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: 10min_cpu_usage
+      on: system.cpu
+      os: linux
+   hosts: *
+  lookup: average -10m unaligned of user,system,softirq,irq,guest
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (75) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
+      to: sysadmin
+
+template: 10min_cpu_iowait
+      on: system.cpu
+      os: linux
+   hosts: *
+  lookup: average -10m unaligned of iowait
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (20) : (40))
+    crit: $this > (($status == $CRITICAL) ? (40) : (50))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average CPU wait I/O for the last 10 minutes
+      to: sysadmin
+
+template: 20min_steal_cpu
+      on: system.cpu
+      os: linux
+   hosts: *
+  lookup: average -20m unaligned of steal
+   units: %
+   every: 5m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 1h multiplier 1.5 max 2h
+    info: average CPU steal time for the last 20 minutes
+      to: sysadmin
+
+## FreeBSD
+template: 10min_cpu_usage
+      on: system.cpu
+      os: freebsd
+   hosts: *
+  lookup: average -10m unaligned of user,system,interrupt
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (75) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average cpu utilization for the last 10 minutes (excluding nice)
+      to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
new file mode 100644
index 00000000..26f85848
--- /dev/null
+++ b/health/health.d/disks.conf
@@ -0,0 +1,167 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+
+# -----------------------------------------------------------------------------
+# low disk space
+
+# checking the latest collected values
+# raise an alarm if the disk is low on
+# available disk space
+
+template: disk_space_usage
+      on: disk.space
+      os: linux freebsd
+   hosts: *
+families: *
+    calc: $used * 100 / ($avail + $used)
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING ) ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: current disk space usage
+      to: sysadmin
+
+template: disk_inode_usage
+      on: disk.inodes
+      os: linux freebsd
+   hosts: *
+families: *
+    calc: $used * 100 / ($avail + $used)
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: current disk inode usage
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk fill rate
+
+# calculate the rate the disk fills
+# use as base, the available space change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+template: disk_fill_rate
+      on: disk.space
+      os: linux freebsd
+   hosts: *
+families: *
+  lookup: min -10m at -50m unaligned of avail
+    calc: ($this - $avail) / (($now - $after) / 3600)
+   every: 1m
+   units: GB/hour
+    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+
+
+# calculate the hours remaining
+# if the disk continues to fill
+# in this rate
+
+template: out_of_disk_space_time
+      on: disk.space
+      os: linux freebsd
+   hosts: *
+families: *
+    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+   units: hours
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
+    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk inode fill rate
+
+# calculate the rate the disk inodes are allocated
+# use as base, the available inodes change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+template: disk_inode_rate
+      on: disk.inodes
+      os: linux freebsd
+   hosts: *
+families: *
+  lookup: min -10m at -50m unaligned of avail
+    calc: ($this - $avail) / (($now - $after) / 3600)
+   every: 1m
+   units: inodes/hour
+    info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+
+# calculate the hours remaining
+# if the disk inodes are allocated
+# in this rate
+
+template: out_of_disk_inodes_time
+      on: disk.inodes
+      os: linux freebsd
+   hosts: *
+families: *
+    calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+   units: hours
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
+    info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk congestion
+
+# raise an alarm if the disk is congested
+# by calculating the average disk utilization
+# for the last 10 minutes
+
+template: 10min_disk_utilization
+      on: disk.util
+      os: linux freebsd
+   hosts: *
+families: *
+  lookup: average -10m unaligned
+   units: %
+   every: 1m
+   green: 90
+     red: 98
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
+    info: the percentage of time the disk was busy, during the last 10 minutes
+      to: sysadmin
+
+
+# raise an alarm if the disk backlog
+# averages above 2000ms (2s) per second
+# over the last 10 minutes
+# (i.e. the disk cannot catch up)
+
+template: 10min_disk_backlog
+      on: disk.backlog
+      os: linux
+   hosts: *
+families: *
+  lookup: average -10m unaligned
+   units: ms
+   every: 1m
+   green: 2000
+     red: 5000
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
+    info: average of the kernel estimated disk backlog, for the last 10 minutes
+      to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
new file mode 100644
index 00000000..729906cd
--- /dev/null
+++ b/health/health.d/dockerd.conf
@@ -0,0 +1,8 @@
+template: docker_unhealthy_containers
+      on: docker.unhealthy_containers
+   units: unhealthy containers
+   every: 10s
+  lookup: average -10s
+    crit: $this > 0
+    info: number of unhealthy containers
+      to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
new file mode 100644
index 00000000..dffd4096
--- /dev/null
+++ b/health/health.d/elasticsearch.conf
@@ -0,0 +1,9 @@
+   alarm: elasticsearch_last_collected
+      on: elasticsearch_local.cluster_health_status
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
new file mode 100644
index 00000000..66d44ec1
--- /dev/null
+++ b/health/health.d/entropy.conf
@@ -0,0 +1,16 @@
+
+# check if entropy is too low
+# the alarm is checked every 5 minutes
+# and examines the last 10 minutes of data
+
+   alarm: lowest_entropy
+      on: system.entropy
+      os: linux
+   hosts: *
+  lookup: min -10m unaligned
+   units: entries
+   every: 5m
+    warn: $this < (($status >= $WARNING) ? (200) : (100))
+   delay: down 1h multiplier 1.5 max 2h
+    info: minimum entries in the random numbers pool in the last 10 minutes
+      to: silent
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
new file mode 100644
index 00000000..43658fef
--- /dev/null
+++ b/health/health.d/fping.conf
@@ -0,0 +1,53 @@
+
+template: fping_last_collected_secs
+families: *
+      on: fping.latency
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+template: host_reachable
+families: *
+      on: fping.latency
+    calc: $average != nan
+   units: up/down
+   every: 10s
+    crit: $this == 0
+    info: states if the remote host is reachable
+   delay: down 30m multiplier 1.5 max 2h
+      to: sysadmin
+
+template: host_latency
+families: *
+      on: fping.latency
+  lookup: average -10s unaligned of average
+   units: ms
+   every: 10s
+   green: 500
+     red: 1000
+    warn: $this > $green OR $max > $red
+    crit: $this > $red
+    info: average round trip delay during the last 10 seconds
+   delay: down 30m multiplier 1.5 max 2h
+      to: sysadmin
+
+template: packet_loss
+families: *
+      on: fping.quality
+  lookup: average -10m unaligned of returned
+    calc: 100 - $this
+   green: 1
+     red: 10
+   units: %
+   every: 10s
+    warn: $this > $green
+    crit: $this > $red
+    info: packet loss percentage
+   delay: down 30m multiplier 1.5 max 2h
+      to: sysadmin
+
diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf
new file mode 100644
index 00000000..cdf6c8fc
--- /dev/null
+++ b/health/health.d/fronius.conf
@@ -0,0 +1,11 @@
+template: fronius_last_collected_secs
+families: *
+      on: fronius.power
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sitemgr
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
new file mode 100644
index 00000000..e49c70d4
--- /dev/null
+++ b/health/health.d/haproxy.conf
@@ -0,0 +1,27 @@
+template: haproxy_backend_server_status
+      on: haproxy_hs.down
+   units: failed servers
+   every: 10s
+  lookup: average -10s
+    crit: $this > 0
+    info: number of failed haproxy backend servers
+      to: sysadmin
+
+template: haproxy_backend_status
+      on: haproxy_hb.down
+   units: failed backend
+   every: 10s
+  lookup: average -10s
+    crit: $this > 0
+    info: number of failed haproxy backends
+      to: sysadmin
+
+template: haproxy_last_collected
+      on: haproxy_hb.down
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
new file mode 100644
index 00000000..0ddf35ea
--- /dev/null
+++ b/health/health.d/httpcheck.conf
@@ -0,0 +1,99 @@
+template: httpcheck_last_collected_secs
+families: *
+      on: httpcheck.status
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+template: web_service_up
+families: *
+      on: httpcheck.status
+  lookup: average -1m unaligned percentage of success
+    calc: ($this < 75) ? (0) : ($this)
+   every: 5s
+   units: up/down
+    info: at least 75% verified responses during last 60 seconds, ideal for badges
+      to: silent
+
+template: web_service_bad_content
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of bad_content
+   every: 10s
+   units: %
+    warn: $this >= 10 AND $this < 40
+    crit: $this >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of unexpected http response content during the last 5 minutes
+ options: no-clear-notification
+      to: webmaster
+
+template: web_service_bad_status
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of bad_status
+   every: 10s
+   units: %
+    warn: $this >= 10 AND $this < 40
+    crit: $this >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of unexpected http status during the last 5 minutes
+ options: no-clear-notification
+      to: webmaster
+
+template: web_service_timeouts
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of timeout
+   every: 10s
+   units: %
+    info: average of timeouts during the last 5 minutes
+
+template: no_web_service_connections
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of no_connection
+   every: 10s
+   units: %
+    info: average of failed requests during the last 5 minutes
+
+# combined timeout & no connection alarm
+template: web_service_unreachable
+families: *
+      on: httpcheck.status
+    calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+   units: %
+   every: 10s
+    warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
+    crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of failed requests either due to timeouts or no connection during the last 5 minutes
+ options: no-clear-notification
+      to: webmaster
+
+template: 1h_web_service_response_time
+families: *
+      on: httpcheck.responsetime
+  lookup: average -1h unaligned of time
+   every: 30s
+   units: ms
+    info: average response time over the last hour
+
+template: web_service_slow
+families: *
+      on: httpcheck.responsetime
+  lookup: average -3m unaligned of time
+   units: ms
+   every: 10s
+    warn: ($this > ($1h_web_service_response_time * 2) )
+    crit: ($this > ($1h_web_service_response_time * 3) )
+    info: average response time over the last 3 minutes, compared to the average over the last hour
+   delay: down 5m multiplier 1.5 max 1h
+ options: no-clear-notification
+      to: webmaster
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
new file mode 100644
index 00000000..989d6e91
--- /dev/null
+++ b/health/health.d/ipc.conf
@@ -0,0 +1,28 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: semaphores_used
+      on: system.ipc_semaphores
+      os: linux
+   hosts: *
+    calc: $semaphores * 100 / $ipc_semaphores_max
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (70) : (90))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the percentage of IPC semaphores used
+      to: sysadmin
+
+   alarm: semaphore_arrays_used
+      on: system.ipc_semaphore_arrays
+      os: linux
+   hosts: *
+    calc: $arrays * 100 / $ipc_semaphores_arrays_max
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (70) : (90))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the percentage of IPC semaphore arrays used
+      to: sysadmin
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
new file mode 100644
index 00000000..3f77572d
--- /dev/null
+++ b/health/health.d/ipfs.conf
@@ -0,0 +1,11 @@
+
+template: ipfs_datastore_usage
+      on: ipfs.repo_size
+    calc: $size * 100 / $avail
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: ipfs Datastore close to running out of space
+      to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
new file mode 100644
index 00000000..c2558196
--- /dev/null
+++ b/health/health.d/ipmi.conf
@@ -0,0 +1,20 @@
+   alarm: ipmi_sensors_states
+      on: ipmi.sensors_states
+    calc: $warning + $critical
+   units: sensors
+   every: 10s
+    warn: $this > 0
+    crit: $critical > 0
+   delay: up 5m down 15m multiplier 1.5 max 1h
+    info: the number IPMI sensors in non-nominal state
+      to: sysadmin
+
+   alarm: ipmi_events
+      on: ipmi.events
+    calc: $events
+   units: events
+   every: 10s
+    warn: $this > 0
+   delay: up 5m down 15m multiplier 1.5 max 1h
+    info: the number of events in the IPMI System Event Log (SEL)
+      to: sysadmin
diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
new file mode 100644
index 00000000..8054656f
--- /dev/null
+++ b/health/health.d/isc_dhcpd.conf
@@ -0,0 +1,10 @@
+ template: isc_dhcpd_leases_size
+      on: isc_dhcpd.leases_total
+   units: KB
+   every: 60
+    calc: $leases_size
+    warn: $this > 3072
+    crit: $this > 6144
+   delay: up 2m down 5m
+    info: dhcpd.leases file too big! Module can slow down your server.
+      to: sysadmin
diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
new file mode 100644
index 00000000..915907a4
--- /dev/null
+++ b/health/health.d/lighttpd.conf
@@ -0,0 +1,14 @@
+
+# make sure lighttpd is running
+
+template: lighttpd_last_collected_secs
+      on: lighttpd.requests
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
new file mode 100644
index 00000000..27a172a1
--- /dev/null
+++ b/health/health.d/linux_power_supply.conf
@@ -0,0 +1,12 @@
+# Alert on low battery capacity.
+
+template: linux_power_supply_capacity
+      on: power_supply.capacity
+    calc: $capacity
+   units: %
+   every: 10s
+    warn: $this < 10
+    crit: $this < 5
+   delay: up 0 down 5m multiplier 1.2 max 1h
+    info: the percentage remaining capacity of the power supply
+      to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
new file mode 100644
index 00000000..ee0c54b8
--- /dev/null
+++ b/health/health.d/load.conf
@@ -0,0 +1,56 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Calculate the base trigger point for the load average alarms.
+# This is the maximum number of CPUs in the system over the past 1
+# minute, with a special case for a single CPU of setting the trigger at 2.
+   alarm: load_trigger
+      on: system.load
+      os: linux
+   hosts: *
+    calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+   units: cpus
+   every: 1m
+    info: trigger point for load average alarms
+
+# Send alarms if the load average is unusually high.
+# These intentionally _do not_ calculate the average over the sampled
+# time period because the values being checked already are averages.
+   alarm: load_average_15
+      on: system.load
+      os: linux
+   hosts: *
+  lookup: max -1m unaligned of load15
+   units: load
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (1.75 * $load_trigger) : (2 * $load_trigger))
+    crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+   delay: down 15m multiplier 1.5 max 1h
+    info: fifteen-minute load average
+      to: sysadmin
+
+   alarm: load_average_5
+      on: system.load
+      os: linux
+   hosts: *
+  lookup: max -1m unaligned of load5
+   units: load
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (3.5 * $load_trigger) : (4 * $load_trigger))
+    crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+   delay: down 15m multiplier 1.5 max 1h
+    info: five-minute load average
+      to: sysadmin
+
+   alarm: load_average_1
+      on: system.load
+      os: linux
+   hosts: *
+  lookup: max -1m unaligned of load1
+   units: load
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (7 * $load_trigger) : (8 * $load_trigger))
+    crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+   delay: down 15m multiplier 1.5 max 1h
+    info: one-minute load average
+      to: sysadmin
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
new file mode 100644
index 00000000..0f5f2837
--- /dev/null
+++ b/health/health.d/mdstat.conf
@@ -0,0 +1,27 @@
+template: mdstat_last_collected
+      on: md.disks
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+template: mdstat_disks
+      on: md.disks
+   units: failed devices
+   every: 10s
+    calc: $total - $inuse
+    crit: $this > 0
+    info: Array is degraded!
+      to: sysadmin
+
+template: mdstat_mismatch_cnt
+      on: md.mismatch_cnt
+   units: unsynchronized blocks
+    calc: $count
+   every: 10s
+    crit: $this > 0
+    info: Mismatch count!
+      to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
new file mode 100644
index 00000000..1881a7be
--- /dev/null
+++ b/health/health.d/megacli.conf
@@ -0,0 +1,48 @@
+   alarm: adapter_state
+      on: megacli.adapter_degraded
+   units: is degraded
+  lookup: sum -10s
+   every: 10s
+    crit: $this > 0
+    info: adapter state
+      to: sysadmin
+
+template: bbu_relative_charge
+      on: megacli.bbu_relative_charge
+   units: percent
+  lookup: average -10s
+   every: 10s
+    warn: $this <= (($status >= $WARNING)  ? (85) : (80))
+    crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
+    info: BBU relative state of charge
+      to: sysadmin
+
+template: bbu_cycle_count
+      on: megacli.bbu_cycle_count
+   units: cycle count
+  lookup: average -10s
+   every: 10s
+    warn: $this >= 100
+    crit: $this >= 500
+    info: BBU cycle count
+      to: sysadmin
+
+   alarm: pd_media_errors
+      on: megacli.pd_media_error
+   units: media errors
+  lookup: sum -10s
+   every: 10s
+    warn: $this > 0
+   delay: down 1m multiplier 2 max 10m
+    info: physical drive media errors
+      to: sysadmin
+
+   alarm: pd_predictive_failures
+      on: megacli.pd_predictive_failure
+   units: predictive failures
+  lookup: sum -10s
+   every: 10s
+    warn: $this > 0
+   delay: down 1m multiplier 2 max 10m
+    info: physical drive predictive failures
+      to: sysadmin
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
new file mode 100644
index 00000000..d248ef57
--- /dev/null
+++ b/health/health.d/memcached.conf
@@ -0,0 +1,52 @@
+
+# make sure memcached is running
+
+template: memcached_last_collected_secs
+      on: memcached.cache
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
+
+
+# detect if memcached cache is full
+
+template: memcached_cache_memory_usage
+      on: memcached.cache
+    calc: $used * 100 / ($used + $available)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: current cache memory usage
+      to: dba
+
+
+# find the rate memcached cache is filling
+
+template: cache_fill_rate
+      on: memcached.cache
+  lookup: min -10m at -50m unaligned of available
+    calc: ($this - $available) / (($now - $after) / 3600)
+   units: KB/hour
+   every: 1m
+    info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
+
+
+# find the hours remaining until memcached cache is full
+
+template: out_of_cache_space_time
+      on: memcached.cache
+    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
+   units: hours
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+      to: dba
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
new file mode 100644
index 00000000..4a0e6e52
--- /dev/null
+++ b/health/health.d/memory.conf
@@ -0,0 +1,38 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: 1hour_ecc_memory_correctable
+      on: mem.ecc_ce
+      os: linux
+   hosts: *
+  lookup: sum -10m unaligned
+   units: errors
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: number of ECC correctable errors in the last 10 minutes
+      to: sysadmin
+
+   alarm: 1hour_ecc_memory_uncorrectable
+      on: mem.ecc_ue
+      os: linux
+   hosts: *
+  lookup: sum -10m unaligned
+   units: errors
+   every: 1m
+    crit: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: number of ECC uncorrectable errors in the last 10 minutes
+      to: sysadmin
+
+   alarm: 1hour_memory_hw_corrupted
+      on: mem.hwcorrupt
+      os: linux
+   hosts: *
+    calc: $HardwareCorrupted
+   units: MB
+   every: 10s
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: amount of memory corrupted due to a hardware failure
+      to: sysadmin
diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf
new file mode 100644
index 00000000..a80cb311
--- /dev/null
+++ b/health/health.d/mongodb.conf
@@ -0,0 +1,13 @@
+
+# make sure mongodb is running
+
+template: mongodb_last_collected_secs
+      on: mongodb.read_operations
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
new file mode 100644
index 00000000..39c40191
--- /dev/null
+++ b/health/health.d/mysql.conf
@@ -0,0 +1,100 @@
+
+# make sure mysql is running
+
+template: mysql_last_collected_secs
+      on: mysql.queries
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
+
+
+# -----------------------------------------------------------------------------
+# slow queries
+
+template: mysql_10s_slow_queries
+      on: mysql.queries
+  lookup: sum -10s of slow_queries
+   units: slow queries
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (10) : (20))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of mysql slow queries over the last 10 seconds
+      to: dba
+
+
+# -----------------------------------------------------------------------------
+# lock waits
+
+template: mysql_10s_table_locks_immediate
+      on: mysql.table_locks
+  lookup: sum -10s absolute of immediate
+   units: immediate locks
+   every: 10s
+    info: number of table immediate locks over the last 10 seconds
+      to: dba
+
+template: mysql_10s_table_locks_waited
+      on: mysql.table_locks
+  lookup: sum -10s absolute of waited
+   units: waited locks
+   every: 10s
+    info: number of table waited locks over the last 10 seconds
+      to: dba
+
+template: mysql_10s_waited_locks_ratio
+      on: mysql.table_locks
+    calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (10) : (25))
+    crit: $this > (($status == $CRITICAL) ? (25) : (50))
+   delay: down 30m multiplier 1.5 max 1h
+    info: the ratio of mysql waited table locks, for the last 10 seconds
+      to: dba
+
+
+# -----------------------------------------------------------------------------
+# connections
+
+template: mysql_connections
+      on: mysql.connections_active
+    calc: $active * 100 / $limit
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (60) : (70))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
+    info: the ratio of current active connections vs the maximum possible number of connections
+      to: dba
+
+
+# -----------------------------------------------------------------------------
+# replication
+
+template: mysql_replication
+      on: mysql.slave_status
+    calc: ($sql_running == -1 OR $io_running == -1)?0:1
+   units: ok/failed
+   every: 10s
+    crit: $this == 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: checks if mysql replication has stopped
+      to: dba
+
+template: mysql_replication_lag
+      on: mysql.slave_behind
+    calc: $seconds
+   units: seconds
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (10) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: the number of seconds mysql replication is behind this master
+      to: dba
+
diff --git a/health/health.d/named.conf b/health/health.d/named.conf
new file mode 100644
index 00000000..4fc65c8e
--- /dev/null
+++ b/health/health.d/named.conf
@@ -0,0 +1,14 @@
+
+# make sure named is running
+
+template: named_last_collected_secs
+      on: named.global_queries
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: domainadmin
+
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
new file mode 100644
index 00000000..489016dd
--- /dev/null
+++ b/health/health.d/net.conf
@@ -0,0 +1,155 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: 1m_received_traffic_overflow
+       on: net.net
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -1m unaligned absolute of received
+     calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (90))
+    delay: down 1m multiplier 1.5 max 1h
+     info: interface received bandwidth usage over net device speed max
+       to: sysadmin
+
+ template: 1m_sent_traffic_overflow
+       on: net.net
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -1m unaligned absolute of sent
+     calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (90))
+    delay: down 1m multiplier 1.5 max 1h
+     info: interface sent bandwidth usage over net device speed max
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# dropped packets
+
+# check if an interface is dropping packets
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+
+template: inbound_packets_dropped
+      on: net.drops
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute of inbound
+   units: packets
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface inbound dropped packets in the last 10 minutes
+      to: sysadmin
+
+template: outbound_packets_dropped
+      on: net.drops
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute of outbound
+   units: packets
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface outbound dropped packets in the last 10 minutes
+      to: sysadmin
+
+template: inbound_packets_dropped_ratio
+      on: net.packets
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute of received
+    calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+   units: %
+   every: 1m
+    warn: $this >= 0.1
+    crit: $this >= 2
+   delay: down 1h multiplier 1.5 max 2h
+    info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+      to: sysadmin
+
+template: outbound_packets_dropped_ratio
+      on: net.packets
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute of sent
+    calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+   units: %
+   every: 1m
+    warn: $this >= 0.1
+    crit: $this >= 2
+   delay: down 1h multiplier 1.5 max 2h
+    info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# FIFO errors
+
+# check if an interface is having FIFO
+# buffer errors
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+
+template: 10min_fifo_errors
+      on: net.fifo
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute
+   units: errors
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface fifo errors in the last 10 minutes
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is more than 50x (warning) or 60x (critical) the former
+# the 1m rate is floored at 1000 packets/s, so a packet storm needs at least
+# 50000 packets/s, average of the last 10 seconds
+
+template: 1m_received_packets_rate
+      on: net.packets
+      os: linux freebsd
+   hosts: *
+families: *
+  lookup: average -1m unaligned of received
+   units: packets
+   every: 10s
+    info: the average number of packets received during the last minute
+
+template: 10s_received_packets_storm
+      on: net.packets
+      os: linux freebsd
+   hosts: *
+families: *
+  lookup: average -10s unaligned of received
+    calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+   every: 10s
+   units: %
+    warn: $this > (($status >= $WARNING)?(200):(5000))
+    crit: $this > (($status == $CRITICAL)?(5000):(6000))
+ options: no-clear-notification
+    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+      to: sysadmin
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
new file mode 100644
index 00000000..1d07752c
--- /dev/null
+++ b/health/health.d/netfilter.conf
@@ -0,0 +1,29 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: netfilter_last_collected_secs
+      on: netfilter.conntrack_sockets
+      os: linux
+   hosts: *
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+   alarm: netfilter_conntrack_full
+      on: netfilter.conntrack_sockets
+      os: linux
+   hosts: *
+  lookup: max -10s unaligned of connections
+    calc: $this * 100 / $netfilter_conntrack_max
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
+      to: sysadmin
diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf
new file mode 100644
index 00000000..a686c3d9
--- /dev/null
+++ b/health/health.d/nginx.conf
@@ -0,0 +1,14 @@
+
+# make sure nginx is running
+
+template: nginx_last_collected_secs
+      on: nginx.requests
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
diff --git a/health/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf
new file mode 100644
index 00000000..5a171a76
--- /dev/null
+++ b/health/health.d/nginx_plus.conf
@@ -0,0 +1,14 @@
+
+# make sure nginx_plus is running
+
+template: nginx_plus_last_collected_secs
+      on: nginx_plus.requests_total
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
new file mode 100644
index 00000000..f42b63d3
--- /dev/null
+++ b/health/health.d/portcheck.conf
@@ -0,0 +1,48 @@
+template: portcheck_last_collected_secs
+families: *
+      on: portcheck.status
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+template: service_reachable
+families: *
+      on: portcheck.status
+  lookup: average -1m unaligned percentage of success
+    calc: ($this < 75) ? (0) : ($this)
+   every: 5s
+   units: up/down
+    info: at least 75% successful connections during last 60 seconds, ideal for badges
+      to: silent
+
+template: connection_timeouts
+families: *
+      on: portcheck.status
+  lookup: average -5m unaligned percentage of timeout
+   every: 10s
+   units: %
+    warn: $this >= 10 AND $this < 40
+    crit: $this >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of timeouts during the last 5 minutes
+ options: no-clear-notification
+      to: sysadmin
+
+template: connection_fails
+families: *
+      on: portcheck.status
+  lookup: average -5m unaligned percentage of no_connection
+   every: 10s
+   units: %
+    warn: $this >= 10 AND $this < 40
+    crit: $this >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of failed connections during the last 5 minutes
+ options: no-clear-notification
+      to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 00000000..4e0583b8
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,13 @@
+
+# make sure postgres is running
+
+template: postgres_last_collected_secs
+      on: postgres.db_stat_transactions
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
new file mode 100644
index 00000000..7290d15f
--- /dev/null
+++ b/health/health.d/qos.conf
@@ -0,0 +1,18 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check if a QoS class is dropping packets
+# the alarm is checked every 30 seconds
+# and examines the last 10 minutes of data
+
+#template: 10min_qos_packet_drops
+#      on: tc.qos_dropped
+#      os: linux
+#   hosts: *
+#  lookup: sum -10m unaligned absolute
+#   every: 30s
+#    warn: $this > 0
+#   delay: up 0 down 30m multiplier 1.5 max 1h
+#   units: packets
+#    info: dropped packets in the last 10 minutes
+#      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
new file mode 100644
index 00000000..4e437322
--- /dev/null
+++ b/health/health.d/ram.conf
@@ -0,0 +1,64 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: used_ram_to_ignore
+      on: system.ram
+      os: linux freebsd
+   hosts: *
+    calc: ($zfs.arc_size.arcsz == nan)?(0):($zfs.arc_size.arcsz)
+   every: 10s
+    info: the amount of memory that is reported as used, but it is actually capable of resizing itself based on the system needs (eg. ZFS ARC)
+
+   alarm: ram_in_use
+      on: system.ram
+      os: linux
+   hosts: *
+#   calc: $used * 100 / ($used + $cached + $free)
+    calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: system RAM used
+      to: sysadmin
+
+   alarm: ram_available
+      on: mem.available
+      os: linux
+   hosts: *
+    calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+   units: %
+   every: 10s
+    warn: $this < (($status >= $WARNING)  ? (15) : (10))
+    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated amount of RAM available for userspace processes, without causing swapping
+      to: sysadmin
+
+## FreeBSD
+   alarm: ram_in_use
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: system RAM usage
+      to: sysadmin
+
+   alarm: ram_available
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+   units: %
+   every: 10s
+    warn: $this < (($status >= $WARNING)  ? (15) : (10))
+    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated amount of RAM available for userspace processes, without causing swapping
+      to: sysadmin
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
new file mode 100644
index 00000000..c08a884a
--- /dev/null
+++ b/health/health.d/redis.conf
@@ -0,0 +1,34 @@
+
+# make sure redis is running
+
+template: redis_last_collected_secs
+      on: redis.operations
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
+
+template: redis_bgsave_broken
+families: *
+      on: redis.bgsave_health
+   every: 10s
+    crit: $rdb_last_bgsave_status != 0
+   units: ok/failed
+    info: states if redis bgsave is working
+   delay: down 5m multiplier 1.5 max 1h
+      to: dba
+
+template: redis_bgsave_slow
+families: *
+      on: redis.bgsave_now
+   every: 10s
+    warn: $rdb_bgsave_in_progress > 600
+    crit: $rdb_bgsave_in_progress > 1200
+   units: seconds
+    info: the time redis needs to save its database
+   delay: down 5m multiplier 1.5 max 1h
+      to: dba
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
new file mode 100644
index 00000000..2344b60e
--- /dev/null
+++ b/health/health.d/retroshare.conf
@@ -0,0 +1,25 @@
+# make sure RetroShare is running
+
+template: retroshare_last_collected_secs
+      on: retroshare.peers
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# make sure the DHT is fine when active
+
+template: retroshare_dht_working
+      on: retroshare.dht
+    calc: $dht_size_all
+   units: peers
+   every: 1m
+    warn: $this < (($status >= $WARNING)  ? (120) : (100))
+    crit: $this < (($status == $CRITICAL) ? (10)  : (1))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: Checks if the DHT has enough peers to operate
+      to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
new file mode 100644
index 00000000..77c804bf
--- /dev/null
+++ b/health/health.d/softnet.conf
@@ -0,0 +1,40 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check for common /proc/net/softnet_stat errors
+
+   alarm: 10min_netdev_backlog_exceeded
+      on: system.softnet_stat
+      os: linux
+   hosts: *
+  lookup: sum -10m unaligned absolute of dropped
+   units: packets
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 2h
+    info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+      to: sysadmin
+
+   alarm: 10min_netdev_budget_ran_outs
+      on: system.softnet_stat
+      os: linux
+   hosts: *
+  lookup: sum -10m unaligned absolute of squeezed
+   units: events
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (0) : (10))
+   delay: down 1h multiplier 1.5 max 2h
+    info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+      to: silent
+
+   alarm: 10min_netisr_backlog_exceeded
+      on: system.softnet_stat
+      os: freebsd
+   hosts: *
+   lookup: sum -10m unaligned absolute of qdrops
+   units: packets
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 2h
+    info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+      to: sysadmin
diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf
new file mode 100644
index 00000000..06cc9678
--- /dev/null
+++ b/health/health.d/squid.conf
@@ -0,0 +1,14 @@
+
+# make sure squid is running
+
+template: squid_last_collected_secs
+      on: squid.clients_requests
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: proxyadmin
+
diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
new file mode 100644
index 00000000..e0361eb2
--- /dev/null
+++ b/health/health.d/stiebeleltron.conf
@@ -0,0 +1,11 @@
+template: stiebeleltron_last_collected_secs
+families: *
+      on: stiebeleltron.heating.hc1
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sitemgr
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
new file mode 100644
index 00000000..f920b080
--- /dev/null
+++ b/health/health.d/swap.conf
@@ -0,0 +1,43 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: 30min_ram_swapped_out
+      on: system.swapio
+      os: linux freebsd
+   hosts: *
+  lookup: sum -30m unaligned absolute of out
+          # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
+    calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
+   units: % of RAM
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (10) : (20))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
+      to: sysadmin
+
+   alarm: ram_in_swap
+      on: system.swap
+      os: linux
+   hosts: *
+    calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
+   units: % of RAM
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (15) : (20))
+    crit: $this > (($status == $CRITICAL) ? (40) : (50))
+   delay: up 30s down 15m multiplier 1.5 max 1h
+    info: the swap memory used, as a percentage of the system RAM
+      to: sysadmin
+
+   alarm: used_swap
+      on: system.swap
+      os: linux freebsd
+   hosts: *
+    calc: $used * 100 / ( $used + $free )
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: up 30s down 15m multiplier 1.5 max 1h
+    info: the percentage of swap memory used
+      to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
new file mode 100644
index 00000000..7aa9a980
--- /dev/null
+++ b/health/health.d/tcp_conn.conf
@@ -0,0 +1,19 @@
+
+#
+# ${tcp_max_connections} may be nan or -1 if the system
+# supports dynamic threshold for TCP connections.
+# In this case, the alarm will always be zero.
+#
+
+   alarm: tcp_connections
+      on: ipv4.tcpsock
+      os: linux
+   hosts: *
+    calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+    crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the percentage of IPv4 TCP connections over the max allowed
+      to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
new file mode 100644
index 00000000..552930ab
--- /dev/null
+++ b/health/health.d/tcp_listen.conf
@@ -0,0 +1,82 @@
+#
+# There are two queues involved when incoming TCP connections are handled
+# (both at the kernel):
+#
+# SYN queue
+# The SYN queue tracks TCP handshakes until connections are fully established.
+# It overflows when too many incoming TCP connection requests hang in the
+# half-open state and the server is not configured to fall back to SYN cookies.
+# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
+# lots of SYN packets and never completes the handshakes).
+#
+# Accept queue
+# The accept queue holds fully established TCP connections waiting to be handled
+# by the listening application. It overflows when the server application fails
+# to accept new connections at the rate they are coming in.
+#
+#
+# -----------------------------------------------------------------------------
+# tcp accept queue (at the kernel)
+
+   alarm: 1m_tcp_accept_queue_overflows
+      on: ip.tcp_accept_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of ListenOverflows
+   units: overflows
+   every: 10s
+    crit: $this > 0
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+      to: sysadmin
+
+# THIS IS TOO GENERIC
+# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
+   alarm: 1m_tcp_accept_queue_drops
+      on: ip.tcp_accept_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of ListenDrops
+   units: drops
+   every: 10s
+#    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (150))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# tcp SYN queue (at the kernel)
+
+# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
+# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
+# enabled or not. In both cases this probably indicates a SYN flood attack,
+# so I guess a notification should be sent.
+
+   alarm: 1m_tcp_syn_queue_drops
+      on: ip.tcp_syn_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of TCPReqQFullDrop
+   units: drops
+   every: 10s
+    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (60))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
+      to: sysadmin
+
+   alarm: 1m_tcp_syn_queue_cookies
+      on: ip.tcp_syn_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies
+   units: cookies
+   every: 10s
+    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (60))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
+      to: sysadmin
+
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
new file mode 100644
index 00000000..6927d576
--- /dev/null
+++ b/health/health.d/tcp_mem.conf
@@ -0,0 +1,20 @@
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# We give a warning when TCP is under memory pressure
+# and a critical when TCP is 90% of its upper memory limit
+#
+
+   alarm: tcp_memory
+      on: ipv4.sockstat_tcp_mem
+      os: linux
+   hosts: *
+    calc: ${mem} * 100 / ${tcp_mem_high}
+   units: %
+   every: 10s
+    warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
+    crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the amount of TCP memory as a percentage of its max memory limit
+      to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
new file mode 100644
index 00000000..280d6590
--- /dev/null
+++ b/health/health.d/tcp_orphans.conf
@@ -0,0 +1,21 @@
+
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# The kernel may penalize orphans by 2x or even 4x
+# so we alarm warning at 25% and critical at 50%
+#
+
+   alarm: tcp_orphans
+      on: ipv4.sockstat_tcp_sockets
+      os: linux
+   hosts: *
+    calc: ${orphan} * 100 / ${tcp_max_orphans}
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+    crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
+      to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
new file mode 100644
index 00000000..91dad3c6
--- /dev/null
+++ b/health/health.d/tcp_resets.conf
@@ -0,0 +1,67 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+
+   alarm: ipv4_tcphandshake_last_collected_secs
+      on: ipv4.tcphandshake
+      os: linux freebsd
+   hosts: *
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+# tcp resets this host sends
+
+   alarm: 1m_ipv4_tcp_resets_sent
+      on: ipv4.tcphandshake
+      os: linux
+   hosts: *
+  lookup: average -1m at -10s unaligned absolute of OutRsts
+   units: tcp resets/s
+   every: 10s
+    info: average TCP RESETS this host is sending, over the last minute
+
+   alarm: 10s_ipv4_tcp_resets_sent
+      on: ipv4.tcphandshake
+      os: linux
+   hosts: *
+  lookup: average -10s unaligned absolute of OutRsts
+   units: tcp resets/s
+   every: 10s
+    warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
+   delay: up 0 down 60m multiplier 1.2 max 2h
+ options: no-clear-notification
+    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+# tcp resets this host receives
+
+   alarm: 1m_ipv4_tcp_resets_received
+      on: ipv4.tcphandshake
+      os: linux freebsd
+   hosts: *
+  lookup: average -1m at -10s unaligned absolute of AttemptFails
+   units: tcp resets/s
+   every: 10s
+    info: average TCP RESETS this host is receiving, over the last minute
+
+   alarm: 10s_ipv4_tcp_resets_received
+      on: ipv4.tcphandshake
+      os: linux freebsd
+   hosts: *
+  lookup: average -10s unaligned absolute of AttemptFails
+   units: tcp resets/s
+   every: 10s
+    warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
+   delay: up 0 down 60m multiplier 1.2 max 2h
+ options: no-clear-notification
+    info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
+      to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
new file mode 100644
index 00000000..5140228f
--- /dev/null
+++ b/health/health.d/udp_errors.conf
@@ -0,0 +1,49 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+
+   alarm: ipv4_udperrors_last_collected_secs
+      on: ipv4.udperrors
+      os: linux freebsd
+   hosts: *
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP receive buffer errors
+
+   alarm: 1m_ipv4_udp_receive_buffer_errors
+      on: ipv4.udperrors
+      os: linux freebsd
+   hosts: *
+  lookup: sum -1m unaligned absolute of RcvbufErrors
+   units: errors
+   every: 10s
+    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (100))
+    info: number of UDP receive buffer errors during the last minute
+   delay: up 0 down 60m multiplier 1.2 max 2h
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP send buffer errors
+
+   alarm: 1m_ipv4_udp_send_buffer_errors
+      on: ipv4.udperrors
+      os: linux
+   hosts: *
+  lookup: sum -1m unaligned absolute of SndbufErrors
+   units: errors
+   every: 10s
+    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (100))
+    info: number of UDP send buffer errors during the last minute
+   delay: up 0 down 60m multiplier 1.2 max 2h
+      to: sysadmin
diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf
new file mode 100644
index 00000000..cca7446b
--- /dev/null
+++ b/health/health.d/varnish.conf
@@ -0,0 +1,9 @@
+   alarm: varnish_last_collected
+      on: varnish.uptime
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
new file mode 100644
index 00000000..d8be88b4
--- /dev/null
+++ b/health/health.d/web_log.conf
@@ -0,0 +1,163 @@
+
+# make sure we can collect web log data
+
+template: last_collected_secs
+      on: web_log.response_codes
+families: *
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 1m_requests
+      on: web_log.response_statuses
+families: *
+  lookup: sum -1m unaligned
+    calc: ($this == 0)?(1):($this)
+   units: requests
+   every: 10s
+    info: the sum of all HTTP requests over the last minute
+
+template: 1m_successful
+      on: web_log.response_statuses
+families: *
+  lookup: sum -1m unaligned of successful_requests
+    calc: $this * 100 / $1m_requests
+   units: %
+   every: 10s
+    warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+    crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
+      to: webmaster
+
+template: 1m_redirects
+      on: web_log.response_statuses
+families: *
+  lookup: sum -1m unaligned of redirects
+    calc: $this * 100 / $1m_requests
+   units: %
+   every: 10s
+    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
+    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of HTTP redirects (3xx except 304) over the last minute
+      to: webmaster
+
+template: 1m_bad_requests
+      on: web_log.response_statuses
+families: *
+  lookup: sum -1m unaligned of bad_requests
+    calc: $this * 100 / $1m_requests
+   units: %
+   every: 10s
+    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
+    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of HTTP bad requests (4xx) over the last minute
+      to: webmaster
+
+template: 1m_internal_errors
+      on: web_log.response_statuses
+families: *
+  lookup: sum -1m unaligned of server_errors
+    calc: $this * 100 / $1m_requests
+   units: %
+   every: 10s
+    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
+    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of HTTP internal server errors (5xx), over the last minute
+      to: webmaster
+
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 10m_response_time
+      on: web_log.response_time
+families: *
+  lookup: average -10m unaligned of avg
+   units: ms
+   every: 30s
+    info: the average time to respond to HTTP requests, over the last 10 minutes
+
+template: web_slow
+      on: web_log.response_time
+families: *
+  lookup: average -1m unaligned of avg
+   units: ms
+   every: 10s
+   green: 500
+     red: 1000
+    warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
+    crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
+   delay: down 15m multiplier 1.5 max 1h
+    info: the average time to respond to HTTP requests, over the last 1 minute
+ options: no-clear-notification
+      to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $5m_successful_old > 120
+#
+# i.e. when the average rate of successful requests was above 120 requests/s
+#      during the 5 minutes starting at -10m and ending at -5m
+
+template: 5m_successful_old
+      on: web_log.response_statuses
+families: *
+  lookup: average -5m at -5m unaligned of successful_requests
+   units: requests/s
+   every: 30s
+    info: average rate of successful HTTP requests over the 5 minutes starting 10 minutes ago
+
+template: 5m_successful
+      on: web_log.response_statuses
+families: *
+  lookup: average -5m unaligned of successful_requests
+   units: requests/s
+   every: 30s
+    info: average successful HTTP requests over the last 5 minutes
+
+template: 5m_requests_ratio
+      on: web_log.response_codes
+families: *
+    calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
+   units: %
+   every: 30s
+    warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+    crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+   delay: down 15m multiplier 1.5 max 1h
+ options: no-clear-notification
+    info: the percentage of successful web requests over the last 5 minutes, \
+          compared with the previous 5 minutes \
+          (clear notification for this alarm will not be sent)
+      to: webmaster
+
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
new file mode 100644
index 00000000..af73824e
--- /dev/null
+++ b/health/health.d/zfs.conf
@@ -0,0 +1,10 @@
+
+   alarm: zfs_memory_throttle
+      on: zfs.memory_ops
+  lookup: sum -10m unaligned absolute of throttled
+   units: events
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 2h
+    info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+      to: sysadmin
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:19:29 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:20:17 +0000
commit	a64a253794ac64cb40befee54db53bde17dd0d49 (patch)
tree	c1024acc5f6e508814b944d99f112259bb28b1be /health/health.d
parent	New upstream version 1.10.0+dfsg (diff)
download	netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip