24 files changed, 61 insertions, 72 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 65f1a69ab..7a0afcd18 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -11,7 +11,6 @@ component: UPS
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
-     crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 10m multiplier 1.5 max 1h
      info: average UPS load over the last 10 minutes
        to: sitemgr
@@ -29,7 +28,7 @@ component: UPS
     units: %
     every: 60s
      warn: $this < 100
-     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+     crit: $this < 40
     delay: down 10m multiplier 1.5 max 1h
      info: average UPS charge over the last minute
        to: sitemgr
@@ -43,7 +42,6 @@ component: UPS device
     every: 10s
     units: seconds ago
      warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
     delay: down 5m multiplier 1.5 max 1h
      info: number of seconds since the last successful data collection
        to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 49cb5ad0f..3f92e80df 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -22,8 +22,7 @@ component: Disk
      calc: $dirty + $metadata + $undefined
     units: %
     every: 1m
-     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
-     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+     warn: $this > 75
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: percentage of cache space used for dirty data and metadata \
            (this usually means your SSD cache is too small)
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 13ac8c182..4ee8bc0bd 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -8,8 +8,7 @@ component: Beanstalk
      calc: $buried
     units: jobs
     every: 10s
-     warn: $this > 0
-     crit: $this > 10
+     warn: $this > 3
     delay: up 0 down 5m multiplier 1.2 max 1h
      info: number of buried jobs across all tubes. \
            You need to manually kick them so they can be processed. \
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 7c09225ff..b3e75a239 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -7,6 +7,5 @@ component: BIND
     every: 60
      calc: $stats_size
      warn: $this > 512
-     crit: $this > 1024
      info: BIND statistics-file size
        to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 6f37787d7..b7dcbe316 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this > 0
-     crit: $this > 1
     delay: up 1m down 5m multiplier 1.5 max 1h
      info: average number of compute errors over the last 10 minutes
        to: sysadmin
@@ -29,7 +28,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this > 0
-     crit: $this > 1
     delay: up 1m down 5m multiplier 1.5 max 1h
      info: average number of failed uploads over the last 10 minutes
        to: sysadmin
@@ -46,7 +44,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this < 1
-     crit: $this < 0.1
     delay: up 5m down 10m multiplier 1.5 max 1h
      info: average number of total tasks over the last 10 minutes
        to: sysadmin
@@ -64,7 +61,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this < 1
-     crit: $this < 0.1
     delay: up 5m down 10m multiplier 1.5 max 1h
      info: average number of active tasks over the last 10 minutes
        to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 08260ff6d..f625e5455 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -64,7 +64,6 @@ component: Network
     every: 10s
     units: %
      warn: $this > (($status >= $WARNING)?(200):(5000))
-     crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
      info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
@@ -83,7 +82,6 @@ component: CPU
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (75) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
      info: average cgroup CPU utilization over the last 10 minutes
        to: sysadmin
@@ -134,7 +132,6 @@ component: Network
     every: 10s
     units: %
      warn: $this > (($status >= $WARNING)?(200):(5000))
-     crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
      info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
index dff6d2df3..7edca6563 100644
--- a/health/health.d/consul.conf
+++ b/health/health.d/consul.conf
@@ -10,7 +10,7 @@ component: Consul
     units: seconds
      warn: $this < 14*24*60*60
      crit: $this < 7*24*60*60
-     info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
+     info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
        to: sysadmin
 
  template: consul_autopilot_health_status
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index ad6952825..907d6ff8a 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -28,7 +28,6 @@ component: CPU
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (20) : (40))
-     crit: $this > (($status == $CRITICAL) ? (40) : (50))
     delay: down 15m multiplier 1.5 max 1h
      info: average CPU iowait time over the last 10 minutes
        to: sysadmin
@@ -44,7 +43,6 @@ component: CPU
     units: %
     every: 5m
      warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-     crit: $this > (($status == $CRITICAL) ? (20) : (30))
     delay: down 1h multiplier 1.5 max 2h
      info: average CPU steal time over the last 20 minutes
        to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 010b94599..81d37df64 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -9,7 +9,6 @@ component: Dnsmasq
     units: %
      calc: $used
      warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
-     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
     delay: down 5m
      info: DHCP range utilization
        to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
index f17028472..01919dc0d 100644
--- a/health/health.d/docker.conf
+++ b/health/health.d/docker.conf
@@ -6,6 +6,6 @@ component: Docker
     units: status
     every: 10s
    lookup: average -10s of unhealthy
-     crit: $this > 0
+     warn: $this > 0
      info: ${label:container_name} docker container health status is unhealthy
        to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index 47f8e1eb9..29f1e9b27 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -10,7 +10,7 @@ component: Elasticsearch
    lookup: average -5s unaligned of *ed
     every: 10s
     units: status
-     warn: $this == 1
+     crit: $this == 1
     delay: down 5m multiplier 1.5 max 1h
      info: cluster health status is red.
        to: sysadmin
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
new file mode 100644
index 000000000..d136ea517
--- /dev/null
+++ b/health/health.d/file_descriptors.conf
@@ -0,0 +1,31 @@
+ # you can disable an alarm notification by setting the 'to' line to: silent
+
+  template: system_file_descriptors_utilization
+        on: system.file_nr_utilization
+     class: Utilization
+      type: System
+ component: Processes
+     hosts: *
+    lookup: max -1m unaligned
+     units: %
+     every: 1m
+      crit: $this > 90
+     delay: down 15m multiplier 1.5 max 1h
+      info: system-wide utilization of open files
+        to: sysadmin
+
+ template: apps_group_file_descriptors_utilization
+       on: apps.fd_limit
+    class: Utilization
+     type: System
+component: Process
+       os: linux
+   module: !* *
+    hosts: *
+   lookup: max -1m unaligned foreach *
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: down 15m multiplier 1.5 max 1h
+     info: maximum utilization of open files among all application group PIDs
+       to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index 14010d445..580d114f8 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -8,7 +8,6 @@ component: Gearman
     units: workers
     every: 10s
      warn: $this > 30000
-     crit: $this > 100000
     delay: down 5m multiplier 1.5 max 1h
      info: average number of queued jobs over the last 10 minutes
        to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
index dd1eb4701..361b6b41f 100644
--- a/health/health.d/geth.conf
+++ b/health/health.d/geth.conf
@@ -8,5 +8,4 @@ component: geth
      calc: $chain_head_block -  $chain_head_header
     units: blocks
      warn: $this != 0
-     crit: $this > 5
     delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 2786cbd62..47ac4453c 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -6,10 +6,8 @@ component: Disk
    lookup: average -10s unaligned of latency
     units: microseconds
     every: 10s
-    green: 5000
-      red: 10000
+    green: 10000
      warn: $this > $green
-     crit: $this > $red
     delay: down 30m multiplier 1.5 max 2h
      info: average I/O latency over the last 10 seconds
        to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index c178a410a..3d1b46c02 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -12,7 +12,6 @@ component: IPC
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
-     crit: $this > (($status == $CRITICAL) ? (70) : (90))
     delay: down 5m multiplier 1.5 max 1h
      info: IPC semaphore utilization
        to: sysadmin
@@ -28,7 +27,6 @@ component: IPC
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
-     crit: $this > (($status == $CRITICAL) ? (70) : (90))
     delay: down 5m multiplier 1.5 max 1h
      info: IPC semaphore arrays utilization
        to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index feadba1b7..4d6478cca 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -1,15 +1,15 @@
-    alarm: ipmi_sensors_states
-       on: ipmi.sensors_states
+ template: ipmi_sensor_state
+       on: ipmi.sensor_state
     class: Errors
      type: System
 component: IPMI
      calc: $warning + $critical
-    units: sensors
+    units: state
     every: 10s
-     warn: $this > 0
+     warn: $warning > 0
      crit: $critical > 0
     delay: up 5m down 15m multiplier 1.5 max 1h
-     info: number of IPMI sensors in non-nominal state
+     info: IPMI sensor ${label:sensor} (${label:component}) state
        to: sysadmin
 
     alarm: ipmi_events
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index c0bc6de8a..4562122ca 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -9,7 +9,6 @@ component: Battery
     units: %
     every: 10s
      warn: $this < 10
-     crit: $this < 5
     delay: up 30s down 5m multiplier 1.2 max 1h
      info: percentage of remaining power supply capacity
        to: sysadmin
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
index 6231dd97b..67843205c 100644
--- a/health/health.d/nut.conf
+++ b/health/health.d/nut.conf
@@ -26,8 +26,8 @@ component: UPS
    lookup: average -60s unaligned of battery_charge
     units: %
     every: 60s
-     warn: $this < 100
-     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+     warn: $this < 75
+     crit: $this < 40
     delay: down 10m multiplier 1.5 max 1h
      info: average UPS charge over the last minute
        to: sitemgr
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index ee6c57cc5..045930ae5 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -10,8 +10,7 @@ component: Pi-hole
     every: 10s
     units: seconds
      calc: $ago
-     warn: $this > 60 * 60 * 24 * 8
-     crit: $this > 60 * 60 * 24 * 8 * 2
+     warn: $this > 60 * 60 * 24 * 30
      info: gravity.list (blocklist) file last update time
        to: sysadmin
 
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ab382c43b..34e5431a8 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -28,7 +28,6 @@ component: Memory
     units: %
     every: 10s
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
-     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
        to: sysadmin
@@ -74,7 +73,6 @@ component: Memory
     units: %
     every: 10s
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
-     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
        to: sysadmin
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index ab110bf07..27a857fcd 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -9,8 +9,8 @@ component: ScaleIO
      calc: $used
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (90))
-     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (90))
     delay: down 15m multiplier 1.5 max 1h
      info: storage pool capacity utilization
        to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index a9cc7ceef..bff34cd39 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -133,8 +133,7 @@ component: VMware vCenter
    lookup: max -10s unaligned of software_packages
     units: status
     every: 10s
-     warn: $this == 4
-     crit: $this == 3
+     warn: ($this == 3) || ($this == 4)
     delay: down 1m multiplier 1.5 max 1h
      info: software updates availability status \
            (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index d4bc7639c..28a886386 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -6,7 +6,7 @@
     class: Utilization
      type: Windows
 component: CPU
-       os: linux
+       os: *
     hosts: *
    lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
     units: %
@@ -25,7 +25,7 @@ component: CPU
     class: Utilization
      type: Windows
 component: Memory
-       os: linux
+       os: *
     hosts: *
      calc: ($used) * 100 / ($used + $available)
     units: %
@@ -36,31 +36,15 @@ component: Memory
      info: memory utilization
        to: sysadmin
 
- template: windows_swap_in_use
-       on: windows.memory_swap_utilization
-    class: Utilization
-     type: Windows
-component: Memory
-       os: linux
-    hosts: *
-     calc: ($used) * 100 / ($used + $available)
-    units: %
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (90))
-     crit: $this > (($status == $CRITICAL) ? (90) : (98))
-    delay: down 15m multiplier 1.5 max 1h
-     info: swap memory utilization
-       to: sysadmin
-
 
 ## Network
 
  template: windows_inbound_packets_discarded
-       on: windows.net_discarded
+       on: windows.net_nic_discarded
     class: Errors
      type: Windows
 component: Network
-       os: linux
+       os: *
     hosts: *
    lookup: sum -10m unaligned absolute match-names of inbound
     units: packets
@@ -71,11 +55,11 @@ component: Network
        to: sysadmin
 
  template: windows_outbound_packets_discarded
-       on: windows.net_discarded
+       on: windows.net_nic_discarded
     class: Errors
      type: Windows
 component: Network
-       os: linux
+       os: *
     hosts: *
    lookup: sum -10m unaligned absolute match-names of outbound
     units: packets
@@ -86,11 +70,11 @@ component: Network
        to: sysadmin
 
  template: windows_inbound_packets_errors
-       on: windows.net_errors
+       on: windows.net_nic_errors
     class: Errors
      type: Windows
 component: Network
-       os: linux
+       os: *
     hosts: *
    lookup: sum -10m unaligned absolute match-names of inbound
     units: packets
@@ -101,11 +85,11 @@ component: Network
        to: sysadmin
 
  template: windows_outbound_packets_errors
-       on: windows.net_errors
+       on: windows.net_nic_errors
     class: Errors
      type: Windows
 component: Network
-       os: linux
+       os: *
     hosts: *
    lookup: sum -10m unaligned absolute match-names of outbound
     units: packets
@@ -119,11 +103,11 @@ component: Network
 ## Disk
 
  template: windows_disk_in_use
-       on: windows.logical_disk_utilization
+       on: windows.logical_disk_space_usage
     class: Utilization
      type: Windows
 component: Disk
-       os: linux
+       os: *
     hosts: *
      calc: ($used) * 100 / ($used + $free)
     units: %