Merging upstream version 1.11.0+dfsg.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:22:44 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:22:44 +0000
commit: 1e6c93250172946eeb38e94a92a1fd12c9d3011e (patch)
tree: 8ca5e16dfc7ad6b3bf2738ca0a48408a950f8f7e /conf.d/health.d
parent: Update watch file (diff)
download: netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.tar.xz
netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.zip
49 files changed, 0 insertions, 1801 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
deleted file mode 100644
index 0c98b8778..000000000
--- a/conf.d/health.d/apache.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure apache is running
-
-template: apache_last_collected_secs
-      on: apache.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
-
diff --git a/conf.d/health.d/backend.conf b/conf.d/health.d/backend.conf
deleted file mode 100644
index 7af100d8f..000000000
--- a/conf.d/health.d/backend.conf
+++ /dev/null
@@ -1,45 +0,0 @@
-
-# make sure we are sending data to backend
-
-   alarm: backend_last_buffering
-      on: netdata.backend_metrics
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful buffering of backend data
-      to: dba
-
-   alarm: backend_metrics_sent
-      on: netdata.backend_metrics
-   units: %
-    calc: abs($sent) * 100 / abs($buffered)
-   every: 10s
-    warn: $this != 100
-   delay: down 5m multiplier 1.5 max 1h
-    info: percentage of metrics sent to the backend server
-      to: dba
-
-   alarm: backend_metrics_lost
-      on: netdata.backend_metrics
-   units: metrics
-    calc: abs($lost)
-   every: 10s
-    crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of metrics lost due to repeating failures to contact the backend server
-      to: dba
-
-# this chart has been removed from netdata
-#   alarm: backend_slow
-#      on: netdata.backend_latency
-#   units: %
-#    calc: $latency * 100 / ($update_every * 1000)
-#   every: 10s
-#    warn: $this > 50
-#    crit: $this > 100
-#   delay: down 5m multiplier 1.5 max 1h
-#    info: the percentage of time between iterations needed by the backend time to process the data sent by netdata
-#      to: dba
diff --git a/conf.d/health.d/beanstalkd.conf b/conf.d/health.d/beanstalkd.conf
deleted file mode 100644
index 30dc27328..000000000
--- a/conf.d/health.d/beanstalkd.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-# get the number of buried jobs in all queues
-
-template: server_buried_jobs
-      on: beanstalk.current_jobs
-    calc: $buried
-   units: jobs
-   every: 10s
-    warn: $this > 0
-    crit: $this > 10
-   delay: up 0 down 5m multiplier 1.2 max 1h
-    info: the number of buried jobs aggregated across all tubes
-      to: sysadmin
-      
-# get the number of buried jobs per queue
-
-#template: tube_buried_jobs
-#      on: beanstalk.jobs
-#    calc: $buried
-#   units: jobs
-#   every: 10s
-#    warn: $this > 0
-#    crit: $this > 10
-#   delay: up 0 down 5m multiplier 1.2 max 1h
-#    info: the number of jobs buried per tube
-#      to: sysadmin
-
-# get the current number of tubes
-
-#template: number_of_tubes
-#      on: beanstalk.current_tubes
-#    calc: $tubes
-#   every: 10s
-#    warn: $this < 5
-#   delay: up 0 down 5m multiplier 1.2 max 1h
-#    info: the current number of tubes on the server
-#      to: sysadmin
diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf
deleted file mode 100644
index 4145e77cd..000000000
--- a/conf.d/health.d/bind_rndc.conf
+++ /dev/null
@@ -1,9 +0,0 @@
- template: bind_rndc_stats_file_size
-      on: bind_rndc.stats_size
-   units: megabytes
-   every: 60
-    calc: $stats_size
-    warn: $this > 512
-    crit: $this > 1024
-    info: Bind stats file is very large! Consider to create logrotate conf file for it!
-      to: sysadmin
diff --git a/conf.d/health.d/btrfs.conf b/conf.d/health.d/btrfs.conf
deleted file mode 100644
index b27aa544f..000000000
--- a/conf.d/health.d/btrfs.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-
-template: btrfs_allocated
-      on: btrfs.disk
-      os: *
-   hosts: *
-families: *
-    calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95))
-    crit: $this > (($status == $CRITICAL) ? (95) : (98))
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of allocated BTRFS physical disk space
-      to: sysadmin
-
-template: btrfs_data
-      on: btrfs.data
-      os: *
-   hosts: *
-families: *
-    calc: $used * 100 / ($used + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
-    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS data space
-      to: sysadmin
-
-template: btrfs_metadata
-      on: btrfs.metadata
-      os: *
-   hosts: *
-families: *
-    calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
-    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS metadata space
-      to: sysadmin
-
-template: btrfs_system
-      on: btrfs.system
-      os: *
-   hosts: *
-families: *
-    calc: $used * 100 / ($used + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
-    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS system space
-      to: sysadmin
-
diff --git a/conf.d/health.d/ceph.conf b/conf.d/health.d/ceph.conf
deleted file mode 100644
index de16f7b6f..000000000
--- a/conf.d/health.d/ceph.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-# low ceph disk available
-
-template: cluster_space_usage
-      on: ceph.general_usage
-    calc: $avail * 100 / ($avail + $used)
-   units: %
-   every: 10s
-    warn: $this < 10
-    crit: $this < 1
-   delay: down 5m multiplier 1.2 max 1h
-    info: ceph disk usage is almost full
-      to: sysadmin
-
diff --git a/conf.d/health.d/couchdb.conf b/conf.d/health.d/couchdb.conf
deleted file mode 100644
index 4a2895280..000000000
--- a/conf.d/health.d/couchdb.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# make sure couchdb is running
-
-template: couchdb_last_collected_secs
-      on: couchdb.request_methods
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
deleted file mode 100644
index fa8189856..000000000
--- a/conf.d/health.d/cpu.conf
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-template: 10min_cpu_usage
-      on: system.cpu
-      os: linux
-   hosts: *
-  lookup: average -10m unaligned of user,system,softirq,irq,guest
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
-      to: sysadmin
-
-template: 10min_cpu_iowait
-      on: system.cpu
-      os: linux
-   hosts: *
-  lookup: average -10m unaligned of iowait
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (20) : (40))
-    crit: $this > (($status == $CRITICAL) ? (40) : (50))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average CPU wait I/O for the last 10 minutes
-      to: sysadmin
-
-template: 20min_steal_cpu
-      on: system.cpu
-      os: linux
-   hosts: *
-  lookup: average -20m unaligned of steal
-   units: %
-   every: 5m
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (20) : (30))
-   delay: down 1h multiplier 1.5 max 2h
-    info: average CPU steal time for the last 20 minutes
-      to: sysadmin
-
-## FreeBSD
-template: 10min_cpu_usage
-      on: system.cpu
-      os: freebsd
-   hosts: *
-  lookup: average -10m unaligned of user,system,interrupt
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average cpu utilization for the last 10 minutes (excluding nice)
-      to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
deleted file mode 100644
index 26f85848a..000000000
--- a/conf.d/health.d/disks.conf
+++ /dev/null
@@ -1,167 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-
-# -----------------------------------------------------------------------------
-# low disk space
-
-# checking the latest collected values
-# raise an alarm if the disk is low on
-# available disk space
-
-template: disk_space_usage
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: $used * 100 / ($avail + $used)
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING ) ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: current disk space usage
-      to: sysadmin
-
-template: disk_inode_usage
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: $used * 100 / ($avail + $used)
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: current disk inode usage
-      to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk fill rate
-
-# calculate the rate the disk fills
-# based on the change in available space
-# during the last hour
-
-# this is just a calculation - it has no alarm
-# we will use it in the next template to find
-# the hours remaining
-
-template: disk_fill_rate
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: min -10m at -50m unaligned of avail
-    calc: ($this - $avail) / (($now - $after) / 3600)
-   every: 1m
-   units: GB/hour
-    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
-
-
-# calculate the hours remaining
-# if the disk continues to fill
-# at this rate
-
-template: out_of_disk_space_time
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.2 max 1h
-    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
-      to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk inode fill rate
-
-# calculate the rate the disk inodes are allocated
-# based on the change in available inodes
-# during the last hour
-
-# this is just a calculation - it has no alarm
-# we will use it in the next template to find
-# the hours remaining
-
-template: disk_inode_rate
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: min -10m at -50m unaligned of avail
-    calc: ($this - $avail) / (($now - $after) / 3600)
-   every: 1m
-   units: inodes/hour
-    info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
-
-# calculate the hours remaining
-# if the disk inodes are allocated
-# at this rate
-
-template: out_of_disk_inodes_time
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.2 max 1h
-    info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
-      to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# disk congestion
-
-# raise an alarm if the disk is congested
-# by calculating the average disk utilization
-# for the last 10 minutes
-
-template: 10min_disk_utilization
-      on: disk.util
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: average -10m unaligned
-   units: %
-   every: 1m
-   green: 90
-     red: 98
-    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
-    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
-   delay: down 15m multiplier 1.2 max 1h
-    info: the percentage of time the disk was busy, during the last 10 minutes
-      to: sysadmin
-
-
-# raise an alarm if the average disk backlog
-# over the last 10 minutes is above 2000ms
-# (warning) or 5000ms (critical)
-# (i.e. the disk cannot catch up)
-
-template: 10min_disk_backlog
-      on: disk.backlog
-      os: linux
-   hosts: *
-families: *
-  lookup: average -10m unaligned
-   units: ms
-   every: 1m
-   green: 2000
-     red: 5000
-    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
-    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
-   delay: down 15m multiplier 1.2 max 1h
-    info: average of the kernel estimated disk backlog, for the last 10 minutes
-      to: sysadmin
diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf
deleted file mode 100644
index dffd40965..000000000
--- a/conf.d/health.d/elasticsearch.conf
+++ /dev/null
@@ -1,9 +0,0 @@
-   alarm: elasticsearch_last_collected
-      on: elasticsearch_local.cluster_health_status
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
deleted file mode 100644
index 66d44ec13..000000000
--- a/conf.d/health.d/entropy.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# check if entropy is too low
-# the alarm is checked every 5 minutes
-# and examines the last 10 minutes of data
-
-   alarm: lowest_entropy
-      on: system.entropy
-      os: linux
-   hosts: *
-  lookup: min -10m unaligned
-   units: entries
-   every: 5m
-    warn: $this < (($status >= $WARNING) ? (200) : (100))
-   delay: down 1h multiplier 1.5 max 2h
-    info: minimum entries in the random numbers pool in the last 10 minutes
-      to: silent
diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf
deleted file mode 100644
index 43658fef6..000000000
--- a/conf.d/health.d/fping.conf
+++ /dev/null
@@ -1,53 +0,0 @@
-
-template: fping_last_collected_secs
-families: *
-      on: fping.latency
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-template: host_reachable
-families: *
-      on: fping.latency
-    calc: $average != nan
-   units: up/down
-   every: 10s
-    crit: $this == 0
-    info: states if the remote host is reachable
-   delay: down 30m multiplier 1.5 max 2h
-      to: sysadmin
-
-template: host_latency
-families: *
-      on: fping.latency
-  lookup: average -10s unaligned of average
-   units: ms
-   every: 10s
-   green: 500
-     red: 1000
-    warn: $this > $green OR $max > $red
-    crit: $this > $red
-    info: average round trip delay during the last 10 seconds
-   delay: down 30m multiplier 1.5 max 2h
-      to: sysadmin
-
-template: packet_loss
-families: *
-      on: fping.quality
-  lookup: average -10m unaligned of returned
-    calc: 100 - $this
-   green: 1
-     red: 10
-   units: %
-   every: 10s
-    warn: $this > $green
-    crit: $this > $red
-    info: packet loss percentage
-   delay: down 30m multiplier 1.5 max 2h
-      to: sysadmin
-
diff --git a/conf.d/health.d/fronius.conf b/conf.d/health.d/fronius.conf
deleted file mode 100644
index cdf6c8fcb..000000000
--- a/conf.d/health.d/fronius.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-template: fronius_last_collected_secs
-families: *
-      on: fronius.power
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sitemgr
diff --git a/conf.d/health.d/haproxy.conf b/conf.d/health.d/haproxy.conf
deleted file mode 100644
index e49c70d48..000000000
--- a/conf.d/health.d/haproxy.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-template: haproxy_backend_server_status
-      on: haproxy_hs.down
-   units: failed servers
-   every: 10s
-  lookup: average -10s
-    crit: $this > 0
-    info: number of failed haproxy backend servers
-      to: sysadmin
-
-template: haproxy_backend_status
-      on: haproxy_hb.down
-   units: failed backend
-   every: 10s
-  lookup: average -10s
-    crit: $this > 0
-    info: number of failed haproxy backends
-      to: sysadmin
-
-template: haproxy_last_collected
-      on: haproxy_hb.down
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
diff --git a/conf.d/health.d/httpcheck.conf b/conf.d/health.d/httpcheck.conf
deleted file mode 100644
index 0ddf35eab..000000000
--- a/conf.d/health.d/httpcheck.conf
+++ /dev/null
@@ -1,99 +0,0 @@
-template: httpcheck_last_collected_secs
-families: *
-      on: httpcheck.status
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: web_service_up
-families: *
-      on: httpcheck.status
-  lookup: average -1m unaligned percentage of success
-    calc: ($this < 75) ? (0) : ($this)
-   every: 5s
-   units: up/down
-    info: at least 75% verified responses during last 60 seconds, ideal for badges
-      to: silent
-
-template: web_service_bad_content
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of bad_content
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average of unexpected http response content during the last 5 minutes
- options: no-clear-notification
-      to: webmaster
-
-template: web_service_bad_status
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of bad_status
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average of unexpected http status during the last 5 minutes
- options: no-clear-notification
-      to: webmaster
-
-template: web_service_timeouts
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of timeout
-   every: 10s
-   units: %
-    info: average of timeouts during the last 5 minutes
-
-template: no_web_service_connections
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of no_connection
-   every: 10s
-   units: %
-    info: average of failed requests during the last 5 minutes
-
-# combined timeout & no connection alarm
-template: web_service_unreachable
-families: *
-      on: httpcheck.status
-    calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
-   units: %
-   every: 10s
-    warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
-    crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average of failed requests either due to timeouts or no connection during the last 5 minutes
- options: no-clear-notification
-      to: webmaster
-
-template: 1h_web_service_response_time
-families: *
-      on: httpcheck.responsetime
-  lookup: average -1h unaligned of time
-   every: 30s
-   units: ms
-    info: average response time over the last hour
-
-template: web_service_slow
-families: *
-      on: httpcheck.responsetime
-  lookup: average -3m unaligned of time
-   units: ms
-   every: 10s
-    warn: ($this > ($1h_web_service_response_time * 2) )
-    crit: ($this > ($1h_web_service_response_time * 3) )
-    info: average response time over the last 3 minutes, compared to the average over the last hour
-   delay: down 5m multiplier 1.5 max 1h
- options: no-clear-notification
-      to: webmaster
diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf
deleted file mode 100644
index 03cf264d8..000000000
--- a/conf.d/health.d/ipc.conf
+++ /dev/null
@@ -1,28 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-   alarm: semaphores_used
-      on: system.ipc_semaphores
-      os: linux
-   hosts: *
-    calc: $semaphores * 100 / $ipc.semaphores.max
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (70) : (90))
-   delay: down 5m multiplier 1.5 max 1h
-    info: the percentage of IPC semaphores used
-      to: sysadmin
-
-   alarm: semaphore_arrays_used
-      on: system.ipc_semaphore_arrays
-      os: linux
-   hosts: *
-    calc: $arrays * 100 / $ipc.semaphores.arrays.max
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (70) : (90))
-   delay: down 5m multiplier 1.5 max 1h
-    info: the percentage of IPC semaphore arrays used
-      to: sysadmin
diff --git a/conf.d/health.d/ipfs.conf b/conf.d/health.d/ipfs.conf
deleted file mode 100644
index 3f77572d6..000000000
--- a/conf.d/health.d/ipfs.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-
-template: ipfs_datastore_usage
-      on: ipfs.repo_size
-    calc: $size * 100 / $avail
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: ipfs Datastore close to running out of space
-      to: sysadmin
diff --git a/conf.d/health.d/ipmi.conf b/conf.d/health.d/ipmi.conf
deleted file mode 100644
index c25581964..000000000
--- a/conf.d/health.d/ipmi.conf
+++ /dev/null
@@ -1,20 +0,0 @@
-   alarm: ipmi_sensors_states
-      on: ipmi.sensors_states
-    calc: $warning + $critical
-   units: sensors
-   every: 10s
-    warn: $this > 0
-    crit: $critical > 0
-   delay: up 5m down 15m multiplier 1.5 max 1h
-    info: the number IPMI sensors in non-nominal state
-      to: sysadmin
-
-   alarm: ipmi_events
-      on: ipmi.events
-    calc: $events
-   units: events
-   every: 10s
-    warn: $this > 0
-   delay: up 5m down 15m multiplier 1.5 max 1h
-    info: the number of events in the IPMI System Event Log (SEL)
-      to: sysadmin
diff --git a/conf.d/health.d/isc_dhcpd.conf b/conf.d/health.d/isc_dhcpd.conf
deleted file mode 100644
index 8054656ff..000000000
--- a/conf.d/health.d/isc_dhcpd.conf
+++ /dev/null
@@ -1,10 +0,0 @@
- template: isc_dhcpd_leases_size
-      on: isc_dhcpd.leases_total
-   units: KB
-   every: 60
-    calc: $leases_size
-    warn: $this > 3072
-    crit: $this > 6144
-   delay: up 2m down 5m
-    info: dhcpd.leases file too big! Module can slow down your server.
-      to: sysadmin
diff --git a/conf.d/health.d/lighttpd.conf b/conf.d/health.d/lighttpd.conf
deleted file mode 100644
index 915907a4a..000000000
--- a/conf.d/health.d/lighttpd.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure lighttpd is running
-
-template: lighttpd_last_collected_secs
-      on: lighttpd.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
-
diff --git a/conf.d/health.d/mdstat.conf b/conf.d/health.d/mdstat.conf
deleted file mode 100644
index c9e7d20db..000000000
--- a/conf.d/health.d/mdstat.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-template: mdstat_disks
-      on: md.disks
-   units: failed devices
-   every: 10s
-    calc: $total - $inuse
-    crit: $this > 0
-    info: Array is degraded!
-      to: sysadmin
-
-template: mdstat_last_collected
-      on: md.disks
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
deleted file mode 100644
index d248ef57a..000000000
--- a/conf.d/health.d/memcached.conf
+++ /dev/null
@@ -1,52 +0,0 @@
-
-# make sure memcached is running
-
-template: memcached_last_collected_secs
-      on: memcached.cache
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
-
-
-# detect if memcached cache is full
-
-template: memcached_cache_memory_usage
-      on: memcached.cache
-    calc: $used * 100 / ($used + $available)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
-   delay: up 0 down 15m multiplier 1.5 max 1h
-    info: current cache memory usage
-      to: dba
-
-
-# find the rate memcached cache is filling
-
-template: cache_fill_rate
-      on: memcached.cache
-  lookup: min -10m at -50m unaligned of available
-    calc: ($this - $available) / (($now - $after) / 3600)
-   units: KB/hour
-   every: 1m
-    info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
-
-
-# find the hours remaining until memcached cache is full
-
-template: out_of_cache_space_time
-      on: memcached.cache
-    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.5 max 1h
-    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
-      to: dba
diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf
deleted file mode 100644
index 4a0e6e522..000000000
--- a/conf.d/health.d/memory.conf
+++ /dev/null
@@ -1,38 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-   alarm: 1hour_ecc_memory_correctable
-      on: mem.ecc_ce
-      os: linux
-   hosts: *
-  lookup: sum -10m unaligned
-   units: errors
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 1h
-    info: number of ECC correctable errors during the last hour
-      to: sysadmin
-
-   alarm: 1hour_ecc_memory_uncorrectable
-      on: mem.ecc_ue
-      os: linux
-   hosts: *
-  lookup: sum -10m unaligned
-   units: errors
-   every: 1m
-    crit: $this > 0
-   delay: down 1h multiplier 1.5 max 1h
-    info: number of ECC uncorrectable errors during the last hour
-      to: sysadmin
-
-   alarm: 1hour_memory_hw_corrupted
-      on: mem.hwcorrupt
-      os: linux
-   hosts: *
-    calc: $HardwareCorrupted
-   units: MB
-   every: 10s
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 1h
-    info: amount of memory corrupted due to a hardware failure
-      to: sysadmin
diff --git a/conf.d/health.d/mongodb.conf b/conf.d/health.d/mongodb.conf
deleted file mode 100644
index a80cb3112..000000000
--- a/conf.d/health.d/mongodb.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# make sure mongodb is running
-
-template: mongodb_last_collected_secs
-      on: mongodb.read_operations
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
deleted file mode 100644
index 1eeb993f0..000000000
--- a/conf.d/health.d/mysql.conf
+++ /dev/null
@@ -1,85 +0,0 @@
-
-# make sure mysql is running
-
-template: mysql_last_collected_secs
-      on: mysql.queries
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
-
-
-# -----------------------------------------------------------------------------
-# slow queries
-
-template: mysql_10s_slow_queries
-      on: mysql.queries
-  lookup: sum -10s of slow_queries
-   units: slow queries
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (10) : (20))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of mysql slow queries over the last 10 seconds
-      to: dba
-
-
-# -----------------------------------------------------------------------------
-# lock waits
-
-template: mysql_10s_table_locks_immediate
-      on: mysql.table_locks
-  lookup: sum -10s absolute of immediate
-   units: immediate locks
-   every: 10s
-    info: number of table immediate locks over the last 10 seconds
-      to: dba
-
-template: mysql_10s_table_locks_waited
-      on: mysql.table_locks
-  lookup: sum -10s absolute of waited
-   units: waited locks
-   every: 10s
-    info: number of table waited locks over the last 10 seconds
-      to: dba
-
-template: mysql_10s_waited_locks_ratio
-      on: mysql.table_locks
-    calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (10) : (25))
-    crit: $this > (($status == $CRITICAL) ? (25) : (50))
-   delay: down 30m multiplier 1.5 max 1h
-    info: the ratio of mysql waited table locks, for the last 10 seconds
-      to: dba
-
-
-# -----------------------------------------------------------------------------
-# replication
-
-template: mysql_replication
-      on: mysql.slave_status
-    calc: ($sql_running == -1 OR $io_running == -1)?0:1
-   units: ok/failed
-   every: 10s
-    crit: $this == 0
-   delay: down 5m multiplier 1.5 max 1h
-    info: checks if mysql replication has stopped
-      to: dba
-
-template: mysql_replication_lag
-      on: mysql.slave_behind
-    calc: $seconds
-   units: seconds
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (10) : (30))
-   delay: down 15m multiplier 1.5 max 1h
-    info: the number of seconds mysql replication is behind this master
-      to: dba
-
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
deleted file mode 100644
index 4fc65c8ee..000000000
--- a/conf.d/health.d/named.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure named is running
-
-template: named_last_collected_secs
-      on: named.global_queries
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: domainadmin
-
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
deleted file mode 100644
index 22a88927d..000000000
--- a/conf.d/health.d/net.conf
+++ /dev/null
@@ -1,122 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-# dropped packets
-
-# check if an interface is dropping packets
-# the alarm is checked every 1 minute
-# and examines the last 10 minutes of data
-
-template: inbound_packets_dropped
-      on: net.drops
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute of inbound
-   units: packets
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound dropped packets in the last 10 minutes
-      to: sysadmin
-
-template: outbound_packets_dropped
-      on: net.drops
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute of outbound
-   units: packets
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound dropped packets in the last 10 minutes
-      to: sysadmin
-
-template: inbound_packets_dropped_ratio
-      on: net.packets
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute of received
-    calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
-      to: sysadmin
-
-template: outbound_packets_dropped_ratio
-      on: net.packets
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute of sent
-    calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
-      to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# FIFO errors
-
-# check if an interface is having FIFO
-# buffer errors
-# the alarm is checked every 1 minute
-# and examines the last 10 minutes of data
-
-template: 10min_fifo_errors
-      on: net.fifo
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute
-   units: errors
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface fifo errors in the last 10 minutes
-      to: sysadmin
-
-
-# -----------------------------------------------------------------------------
-# check for packet storms
-
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the latter is 50x (warning) or 60x (critical) the first
-# the 1m rate is floored at 1000 when computing the ratio, so a packet storm
-# needs at least 50000 packets/s, average of the last 10 seconds
-
-template: 1m_received_packets_rate
-      on: net.packets
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: average -1m of received
-   units: packets
-   every: 10s
-    info: the average number of packets received during the last minute
-
-template: 10s_received_packets_storm
-      on: net.packets
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: average -10s of received
-    calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
-   every: 10s
-   units: %
-   warn: $this > (($status >= $WARNING)?(200):(5000))
-   crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
-   info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
-     to: sysadmin
diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf
deleted file mode 100644
index fa1732b33..000000000
--- a/conf.d/health.d/netfilter.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-   alarm: netfilter_last_collected_secs
-      on: netfilter.conntrack_sockets
-      os: linux
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-   alarm: netfilter_conntrack_full
-      on: netfilter.conntrack_sockets
-      os: linux
-   hosts: *
-  lookup: max -10s unaligned of connections
-    calc: $this * 100 / $netfilter.conntrack.max
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
-   delay: down 5m multiplier 1.5 max 1h
-    info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
-      to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
deleted file mode 100644
index a686c3d99..000000000
--- a/conf.d/health.d/nginx.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure nginx is running
-
-template: nginx_last_collected_secs
-      on: nginx.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
-
diff --git a/conf.d/health.d/nginx_plus.conf b/conf.d/health.d/nginx_plus.conf
deleted file mode 100644
index 5a171a76d..000000000
--- a/conf.d/health.d/nginx_plus.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure nginx_plus is running
-
-template: nginx_plus_last_collected_secs
-      on: nginx_plus.requests_total
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
-
diff --git a/conf.d/health.d/portcheck.conf b/conf.d/health.d/portcheck.conf
deleted file mode 100644
index f42b63d30..000000000
--- a/conf.d/health.d/portcheck.conf
+++ /dev/null
@@ -1,48 +0,0 @@
-template: portcheck_last_collected_secs
-families: *
-      on: portcheck.status
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: service_reachable
-families: *
-      on: portcheck.status
-  lookup: average -1m unaligned percentage of success
-    calc: ($this < 75) ? (0) : ($this)
-   every: 5s
-   units: up/down
-    info: at least 75% successful connections during last 60 seconds, ideal for badges
-      to: silent
-
-template: connection_timeouts
-families: *
-      on: portcheck.status
-  lookup: average -5m unaligned percentage of timeout
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average of timeouts during the last 5 minutes
- options: no-clear-notification
-      to: sysadmin
-
-template: connection_fails
-families: *
-      on: portcheck.status
-  lookup: average -5m unaligned percentage of no_connection
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average of failed connections during the last 5 minutes
- options: no-clear-notification
-      to: sysadmin
diff --git a/conf.d/health.d/postgres.conf b/conf.d/health.d/postgres.conf
deleted file mode 100644
index 4e0583b85..000000000
--- a/conf.d/health.d/postgres.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-
-# make sure postgres is running
-
-template: postgres_last_collected_secs
-      on: postgres.db_stat_transactions
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
deleted file mode 100644
index 7290d15ff..000000000
--- a/conf.d/health.d/qos.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# check if a QoS class is dropping packets
-# the alarm is checked every 30 seconds
-# and examines the last 10 minutes of data
-
-#template: 10min_qos_packet_drops
-#      on: tc.qos_dropped
-#      os: linux
-#   hosts: *
-#  lookup: sum -10m unaligned absolute
-#   every: 30s
-#    warn: $this > 0
-#   delay: up 0 down 30m multiplier 1.5 max 1h
-#   units: packets
-#    info: dropped packets in the last 10 minutes
-#      to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
deleted file mode 100644
index b6dc5f945..000000000
--- a/conf.d/health.d/ram.conf
+++ /dev/null
@@ -1,64 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-   alarm: used_ram_to_ignore
-      on: system.ram
-      os: linux
-   hosts: *
-    calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
-   every: 10s
-    info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
-
-   alarm: ram_in_use
-      on: system.ram
-      os: linux
-   hosts: *
-#   calc: $used * 100 / ($used + $cached + $free)
-    calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: system RAM used
-      to: sysadmin
-
-   alarm: ram_available
-      on: mem.available
-      os: linux
-   hosts: *
-    calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
-   units: %
-   every: 10s
-    warn: $this < (($status >= $WARNING)  ? ( 5) : (10))
-    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
-   delay: down 15m multiplier 1.5 max 1h
-    info: estimated amount of RAM available for userspace processes, without causing swapping
-      to: sysadmin
-
-## FreeBSD
-alarm: ram_in_use
-   on: system.ram
-   os: freebsd
-hosts: *
- calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING)  ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
-   to: sysadmin
-
- alarm: ram_available
-    on: system.ram
-    os: freebsd
- hosts: *
-  calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers)
- units: %
- every: 10s
-  warn: $this < (($status >= $WARNING)  ? ( 5) : (10))
-  crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
-  info: estimated amount of RAM available for userspace processes, without causing swapping
-    to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
deleted file mode 100644
index c08a884a6..000000000
--- a/conf.d/health.d/redis.conf
+++ /dev/null
@@ -1,34 +0,0 @@
-
-# make sure redis is running
-
-template: redis_last_collected_secs
-      on: redis.operations
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
-
-template: redis_bgsave_broken
-families: *
-      on: redis.bgsave_health
-   every: 10s
-    crit: $rdb_last_bgsave_status != 0
-   units: ok/failed
-    info: states if redis bgsave is working
-   delay: down 5m multiplier 1.5 max 1h
-      to: dba
-
-template: redis_bgsave_slow
-families: *
-      on: redis.bgsave_now
-   every: 10s
-    warn: $rdb_bgsave_in_progress > 600
-    crit: $rdb_bgsave_in_progress > 1200
-   units: seconds
-    info: the time redis needs to save its database
-   delay: down 5m multiplier 1.5 max 1h
-      to: dba
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
deleted file mode 100644
index 2344b60ec..000000000
--- a/conf.d/health.d/retroshare.conf
+++ /dev/null
@@ -1,25 +0,0 @@
-# make sure RetroShare is running
-
-template: retroshare_last_collected_secs
-      on: retroshare.peers
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# make sure the DHT is fine when active
-
-template: retroshare_dht_working
-      on: retroshare.dht
-    calc: $dht_size_all
-   units: peers
-   every: 1m
-    warn: $this < (($status >= $WARNING)  ? (120) : (100))
-    crit: $this < (($status == $CRITICAL) ? (10)  : (1))
-   delay: up 0 down 15m multiplier 1.5 max 1h
-    info: Checks if the DHT has enough peers to operate
-      to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
deleted file mode 100644
index 77c804bfd..000000000
--- a/conf.d/health.d/softnet.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# check for common /proc/net/softnet_stat errors
-
-   alarm: 10min_netdev_backlog_exceeded
-      on: system.softnet_stat
-      os: linux
-   hosts: *
-  lookup: sum -10m unaligned absolute of dropped
-   units: packets
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
-      to: sysadmin
-
-   alarm: 10min_netdev_budget_ran_outs
-      on: system.softnet_stat
-      os: linux
-   hosts: *
-  lookup: sum -10m unaligned absolute of squeezed
-   units: events
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (0) : (10))
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
-      to: silent
-
-   alarm: 10min_netisr_backlog_exceeded
-      on: system.softnet_stat
-      os: freebsd
-   hosts: *
-   lookup: sum -10m unaligned absolute of qdrops
-   units: packets
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
-      to: sysadmin
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
deleted file mode 100644
index 06cc9678f..000000000
--- a/conf.d/health.d/squid.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-
-# make sure squid is running
-
-template: squid_last_collected_secs
-      on: squid.clients_requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: proxyadmin
-
diff --git a/conf.d/health.d/stiebeleltron.conf b/conf.d/health.d/stiebeleltron.conf
deleted file mode 100644
index e0361eb20..000000000
--- a/conf.d/health.d/stiebeleltron.conf
+++ /dev/null
@@ -1,11 +0,0 @@
-template: stiebeleltron_last_collected_secs
-families: *
-      on: stiebeleltron.heating.hc1
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sitemgr
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
deleted file mode 100644
index f920b0807..000000000
--- a/conf.d/health.d/swap.conf
+++ /dev/null
@@ -1,43 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-   alarm: 30min_ram_swapped_out
-      on: system.swapio
-      os: linux freebsd
-   hosts: *
-  lookup: sum -30m unaligned absolute of out
-          # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
-    calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   units: % of RAM
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (10) : (20))
-    crit: $this > (($status == $CRITICAL) ? (20) : (30))
-   delay: up 0 down 15m multiplier 1.5 max 1h
-    info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
-      to: sysadmin
-
-   alarm: ram_in_swap
-      on: system.swap
-      os: linux
-   hosts: *
-    calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   units: % of RAM
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (15) : (20))
-    crit: $this > (($status == $CRITICAL) ? (40) : (50))
-   delay: up 30s down 15m multiplier 1.5 max 1h
-    info: the swap memory used, as a percentage of the system RAM
-      to: sysadmin
-
-   alarm: used_swap
-      on: system.swap
-      os: linux freebsd
-   hosts: *
-    calc: $used * 100 / ( $used + $free )
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: up 30s down 15m multiplier 1.5 max 1h
-    info: the percentage of swap memory used
-      to: sysadmin
diff --git a/conf.d/health.d/tcp_conn.conf b/conf.d/health.d/tcp_conn.conf
deleted file mode 100644
index 7aa9a9800..000000000
--- a/conf.d/health.d/tcp_conn.conf
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#
-# ${tcp_max_connections} may be nan or -1 if the system
-# supports dynamic threshold for TCP connections.
-# In this case, the alarm will always be zero.
-#
-
-   alarm: tcp_connections
-      on: ipv4.tcpsock
-      os: linux
-   hosts: *
-    calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
-    crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the percentage of IPv4 TCP connections over the max allowed
-      to: sysadmin
diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf
deleted file mode 100644
index 957964ae4..000000000
--- a/conf.d/health.d/tcp_listen.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-# -----------------------------------------------------------------------------
-# tcp listen sockets issues
-
-   alarm: 1m_ipv4_tcp_listen_overflows
-      on: ipv4.tcplistenissues
-      os: linux freebsd
-   hosts: *
-  lookup: sum -60s unaligned absolute of ListenOverflows
-   units: overflows
-   every: 10s
-    crit: $this > 0
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of TCP listen socket overflows during the last minute
-      to: sysadmin
-
-   alarm: 1m_ipv4_tcp_listen_drops
-      on: ipv4.tcplistenissues
-      os: linux
-   hosts: *
-  lookup: sum -60s unaligned absolute of ListenDrops
-   units: drops
-   every: 10s
-    crit: $this > 0
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of TCP listen socket drops during the last minute
-      to: sysadmin
-
diff --git a/conf.d/health.d/tcp_mem.conf b/conf.d/health.d/tcp_mem.conf
deleted file mode 100644
index 6927d5765..000000000
--- a/conf.d/health.d/tcp_mem.conf
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# check
-# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
-#
-# We give a warning when TCP is under memory pressure
-# and a critical when TCP is 90% of its upper memory limit
-#
-
-   alarm: tcp_memory
-      on: ipv4.sockstat_tcp_mem
-      os: linux
-   hosts: *
-    calc: ${mem} * 100 / ${tcp_mem_high}
-   units: %
-   every: 10s
-    warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
-    crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the amount of TCP memory as a percentage of its max memory limit
-      to: sysadmin
diff --git a/conf.d/health.d/tcp_orphans.conf b/conf.d/health.d/tcp_orphans.conf
deleted file mode 100644
index 280d6590f..000000000
--- a/conf.d/health.d/tcp_orphans.conf
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#
-# check
-# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
-#
-# The kernel may penalize orphans by 2x or even 4x
-# so we alarm warning at 25% and critical at 50%
-#
-
-   alarm: tcp_orphans
-      on: ipv4.sockstat_tcp_sockets
-      os: linux
-   hosts: *
-    calc: ${orphan} * 100 / ${tcp_max_orphans}
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
-    crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
-      to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
deleted file mode 100644
index 91dad3c6a..000000000
--- a/conf.d/health.d/tcp_resets.conf
+++ /dev/null
@@ -1,67 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-
-   alarm: ipv4_tcphandshake_last_collected_secs
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
-# tcp resets this host sends
-
-   alarm: 1m_ipv4_tcp_resets_sent
-      on: ipv4.tcphandshake
-      os: linux
-   hosts: *
-  lookup: average -1m at -10s unaligned absolute of OutRsts
-   units: tcp resets/s
-   every: 10s
-    info: average TCP RESETS this host is sending, over the last minute
-
-   alarm: 10s_ipv4_tcp_resets_sent
-      on: ipv4.tcphandshake
-      os: linux
-   hosts: *
-  lookup: average -10s unaligned absolute of OutRsts
-   units: tcp resets/s
-   every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
-   delay: up 0 down 60m multiplier 1.2 max 2h
- options: no-clear-notification
-    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
-# tcp resets this host receives
-
-   alarm: 1m_ipv4_tcp_resets_received
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-  lookup: average -1m at -10s unaligned absolute of AttemptFails
-   units: tcp resets/s
-   every: 10s
-    info: average TCP RESETS this host is receiving, over the last minute
-
-   alarm: 10s_ipv4_tcp_resets_received
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-  lookup: average -10s unaligned absolute of AttemptFails
-   units: tcp resets/s
-   every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
-   delay: up 0 down 60m multiplier 1.2 max 2h
- options: no-clear-notification
-    info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
-      to: sysadmin
diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf
deleted file mode 100644
index 382b39658..000000000
--- a/conf.d/health.d/udp_errors.conf
+++ /dev/null
@@ -1,49 +0,0 @@
-
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# -----------------------------------------------------------------------------
-
-   alarm: ipv4_udperrors_last_collected_secs
-      on: ipv4.udperrors
-      os: linux freebsd
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
-# UDP receive buffer errors
-
-   alarm: 1m_ipv4_udp_receive_buffer_errors
-      on: ipv4.udperrors
-      os: linux freebsd
-   hosts: *
-  lookup: sum -1m unaligned absolute of RcvbufErrors
-   units: errors
-   every: 10s
-    warn: $this > 0
-    crit: $this > 100
-    info: number of UDP receive buffer errors during the last minute
-   delay: up 0 down 60m multiplier 1.2 max 2h
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
-# UDP send buffer errors
-
-   alarm: 1m_ipv4_udp_send_buffer_errors
-      on: ipv4.udperrors
-      os: linux
-   hosts: *
-  lookup: sum -1m unaligned absolute of SndbufErrors
-   units: errors
-   every: 10s
-    warn: $this > 0
-    crit: $this > 100
-    info: number of UDP send buffer errors during the last minute
-   delay: up 0 down 60m multiplier 1.2 max 2h
-      to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/conf.d/health.d/varnish.conf
deleted file mode 100644
index cca7446b4..000000000
--- a/conf.d/health.d/varnish.conf
+++ /dev/null
@@ -1,9 +0,0 @@
-   alarm: varnish_last_collected
-      on: varnish.uptime
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf
deleted file mode 100644
index d8be88b47..000000000
--- a/conf.d/health.d/web_log.conf
+++ /dev/null
@@ -1,163 +0,0 @@
-
-# make sure we can collect web log data
-
-template: last_collected_secs
-      on: web_log.response_codes
-families: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
-
-
-# -----------------------------------------------------------------------------
-# high level response code alarms
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
-template: 1m_requests
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned
-    calc: ($this == 0)?(1):($this)
-   units: requests
-   every: 10s
-    info: the sum of all HTTP requests over the last minute
-
-template: 1m_successful
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of successful_requests
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
-      to: webmaster
-
-template: 1m_redirects
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of redirects
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP redirects (3xx except 304) over the last minute
-      to: webmaster
-
-template: 1m_bad_requests
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of bad_requests
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP bad requests (4xx) over the last minute
-      to: webmaster
-
-template: 1m_internal_errors
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of server_errors
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP internal server errors (5xx), over the last minute
-      to: webmaster
-
-
-# -----------------------------------------------------------------------------
-# web slow
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
-template: 10m_response_time
-      on: web_log.response_time
-families: *
-  lookup: average -10m unaligned of avg
-   units: ms
-   every: 30s
-    info: the average time to respond to HTTP requests, over the last 10 minutes
-
-template: web_slow
-      on: web_log.response_time
-families: *
-  lookup: average -1m unaligned of avg
-   units: ms
-   every: 10s
-   green: 500
-     red: 1000
-    warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
-   delay: down 15m multiplier 1.5 max 1h
-    info: the average time to respond to HTTP requests, over the last 1 minute
- options: no-clear-notification
-      to: webmaster
-
-# -----------------------------------------------------------------------------
-# web too many or too few requests
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $5m_successful_old > 120
-#
-# i.e. when there were at least 120 requests during the 5 minutes starting
-#      at -10m and ending at -5m
-
-template: 5m_successful_old
-      on: web_log.response_statuses
-families: *
-  lookup: average -5m at -5m unaligned of successful_requests
-   units: requests/s
-   every: 30s
-    info: average rate of successful HTTP requests over the last 5 minutes
-
-template: 5m_successful
-      on: web_log.response_statuses
-families: *
-  lookup: average -5m unaligned of successful_requests
-   units: requests/s
-   every: 30s
-    info: average successful HTTP requests over the last 5 minutes
-
-template: 5m_requests_ratio
-      on: web_log.response_codes
-families: *
-    calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
-   units: %
-   every: 30s
-    warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
-    crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
-   delay: down 15m multiplier 1.5 max 1h
-options: no-clear-notification
-    info: the percentage of successful web requests over the last 5 minutes, \
-          compared with the previous 5 minutes \
-          (clear notification for this alarm will not be sent)
-      to: webmaster
-
diff --git a/conf.d/health.d/zfs.conf b/conf.d/health.d/zfs.conf
deleted file mode 100644
index af73824e6..000000000
--- a/conf.d/health.d/zfs.conf
+++ /dev/null
@@ -1,10 +0,0 @@
-
-   alarm: zfs_memory_throttle
-      on: zfs.memory_ops
-  lookup: sum -10m unaligned absolute of throttled
-   units: events
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 2h
-    info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
-      to: sysadmin
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:22:44 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:22:44 +0000
commit	1e6c93250172946eeb38e94a92a1fd12c9d3011e (patch)
tree	8ca5e16dfc7ad6b3bf2738ca0a48408a950f8f7e /conf.d/health.d
parent	Update watch file (diff)
download	netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.tar.xz netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.zip