Merging upstream version 1.29.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2021-02-07 11:49:00 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2021-02-07 12:42:05 +0000
commit: 2e85f9325a797977eea9dfea0a925775ddd211d9 (patch)
tree: 452c7f30d62fca5755f659b99e4e53c7b03afc21 /health/health.d
parent: Releasing debian version 1.19.0-4. (diff)
download: netdata-2e85f9325a797977eea9dfea0a925775ddd211d9.tar.xz
netdata-2e85f9325a797977eea9dfea0a925775ddd211d9.zip
23 files changed, 767 insertions, 45 deletions
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
new file mode 100644
index 000000000..a2d248efe
--- /dev/null
+++ b/health/health.d/anomalies.conf
@@ -0,0 +1,17 @@
+# raise a warning alarm if an anomaly probability is consistently above 50%
+
+template: anomaly_probabilities
+      on: anomalies.probability
+  lookup: average -2m foreach *
+   every: 1m
+    warn: $this > 50
+    info: average anomaly probability > 50% for last 2 minutes
+
+# raise a warning alarm if an anomaly flag is consistently firing
+
+template: anomaly_flags
+      on: anomalies.anomaly
+  lookup: sum -2m foreach *
+   every: 1m
+    warn: $this > 10
+    info: count of anomalies > 10 for last 2 minutes
diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf
new file mode 100644
index 000000000..9a27bc6ba
--- /dev/null
+++ b/health/health.d/apps_plugin.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+#  disabled due to https://github.com/netdata/netdata/issues/10327
+#
+#   alarm: used_file_descriptors
+#      on: apps.files
+#   hosts: *
+#    calc: $fdperc
+#   units: %
+#   every: 5s
+#    warn: $this > (($status >= $WARNING)  ? (75) : (80))
+#    crit: $this > (($status == $CRITICAL) ? (85) : (90))
+#   delay: down 5m multiplier 1.5 max 1h
+#    info: Peak percentage of file descriptors used
+#      to: sysadmin
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 7af100d8f..e51b8aa5f 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -1,3 +1,13 @@
+# Alert that the backends subsystem will be disabled soon
+   alarm: backend_metrics_eol
+      on: netdata.backend_metrics
+   units: boolean
+    calc: $now - $last_collected_t 
+   every: 1m
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+      to: sysadmin
 
 # make sure we are sending data to backend
 
@@ -32,6 +42,7 @@
     info: number of metrics lost due to repeating failures to contact the backend server
       to: dba
 
+
 # this chart has been removed from netdata
 #   alarm: backend_slow
 #      on: netdata.backend_latency
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
new file mode 100644
index 000000000..8ab2c9d0f
--- /dev/null
+++ b/health/health.d/cockroachdb.conf
@@ -0,0 +1,91 @@
+
+# Availability
+
+template: cockroachdb_last_collected_secs
+      on: cockroachdb.live_nodes
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
+
+# Capacity
+
+template: cockroachdb_used_storage_capacity
+      on: cockroachdb.storage_used_capacity_percentage
+    calc: $capacity_used_percent
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 15m multiplier 1.5 max 1h
+    info: entire disk usage percentage
+      to: dba
+
+template: cockroachdb_used_usable_storage_capacity
+      on: cockroachdb.storage_used_capacity_percentage
+    calc: $capacity_usable_used_percent
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 15m multiplier 1.5 max 1h
+    info: usable space usage percentage
+      to: dba
+
+# Replication
+
+template: cockroachdb_unavailable_ranges
+      on: cockroachdb.ranges_replication_problem
+    calc: $ranges_unavailable
+   units: num
+   every: 10s
+    warn: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: number of ranges with fewer live replicas than the replication target
+      to: dba
+
+template: cockroachdb_replicas_leaders_not_leaseholders
+      on: cockroachdb.replicas_leaders
+    calc: $replicas_leaders_not_leaseholders
+   units: num
+   every: 10s
+    warn: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: number of replicas that are Raft leaders whose range lease is held by another store
+      to: dba
+
+# FD
+
+template: cockroachdb_open_file_descriptors_limit
+      on: cockroachdb.process_file_descriptors
+    calc: $sys_fd_open/$sys_fd_softlimit * 100
+   units: %
+   every: 10s
+    warn: $this > 80
+   delay: down 15m multiplier 1.5 max 1h
+    info: open file descriptors usage percentage
+      to: dba
+
+# SQL
+
+template: cockroachdb_sql_active_connections
+      on: cockroachdb.sql_connections
+    calc: $sql_conns
+   units: active connections
+   every: 10s
+    info: number of active SQL connections
+      to: dba
+
+template: cockroachdb_sql_executed_statements_total_last_5m
+      on: cockroachdb.sql_statements_total
+  lookup: sum -5m absolute of sql_query_count
+   units: statements
+   every: 10s
+    warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
+   delay: down 15m up 30s multiplier 1.5 max 1h
+    info: number of executed SQL statements in the last 5 minutes
+      to: dba
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index ce9839ef1..274673e3e 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -5,7 +5,7 @@
     on: netdata.dbengine_global_errors
     os: linux freebsd macos
  hosts: *
-lookup: sum -10m unaligned of FS errors
+lookup: sum -10m unaligned of fs_errors
  units: errors
  every: 10s
   crit: $this > 0
@@ -17,7 +17,7 @@ lookup: sum -10m unaligned of FS errors
     on: netdata.dbengine_global_errors
     os: linux freebsd macos
  hosts: *
-lookup: sum -10m unaligned of I/O errors
+lookup: sum -10m unaligned of io_errors
  units: errors
  every: 10s
   crit: $this > 0
@@ -25,14 +25,26 @@ lookup: sum -10m unaligned of I/O errors
   info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
     to: sysadmin
 
- alarm: 10min_dbengine_global_flushing_errors
+ alarm: 10min_dbengine_global_flushing_warnings
     on: netdata.dbengine_global_errors
     os: linux freebsd macos
  hosts: *
-lookup: sum -10m unaligned of flushing errors
+lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
  units: errors
- every: 3s
-  crit: $this > 0
+ every: 10s
+  warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+  info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+    to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+    on: netdata.dbengine_long_term_page_stats
+    os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of flushing_pressure_deletions
+ units: pages
+ every: 10s
+  crit: $this != 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk
+  info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks
     to: sysadmin
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
new file mode 100644
index 000000000..113c950e6
--- /dev/null
+++ b/health/health.d/dns_query.conf
@@ -0,0 +1,12 @@
+
+# detect dns query failure
+
+template: dns_query_time_query_time
+      on: dns_query_time.query_time
+  lookup: average -10s unaligned foreach *
+   units: ms
+   every: 10s
+    warn: $this == nan
+   delay: up 20s down 5m multiplier 1.5 max 1h
+    info: query round trip time
+      to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index dffd40965..f4423449f 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -1,5 +1,8 @@
-   alarm: elasticsearch_last_collected
-      on: elasticsearch_local.cluster_health_status
+
+# make sure elasticsearch is running
+
+template: elasticsearch_last_collected
+      on: elasticsearch.cluster_health_status
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
new file mode 100644
index 000000000..506cb0cf7
--- /dev/null
+++ b/health/health.d/exporting.conf
@@ -0,0 +1,34 @@
+
+template: exporting_last_buffering
+families: *
+      on: exporting_data_size
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful buffering of exporting data
+      to: dba
+
+template: exporting_metrics_sent
+families: *
+      on: exporting_data_size
+   units: %
+    calc: abs($sent) * 100 / abs($buffered)
+   every: 10s
+    warn: $this != 100
+   delay: down 5m multiplier 1.5 max 1h
+    info: percentage of metrics sent to the external database server
+      to: dba
+
+template: exporting_metrics_lost
+families: *
+      on: exporting_data_size
+   units: metrics
+    calc: abs($lost)
+   every: 10s
+    crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of metrics lost due to repeating failures to contact the external database server
+      to: dba
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index a53ec7a56..2f906e187 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -12,7 +12,7 @@ template: mdstat_disks
       on: md.disks
    units: failed devices
    every: 10s
-    calc: $total - $inuse
+    calc: $down
     crit: $this > 0
     info: Array is degraded!
       to: sysadmin
@@ -21,8 +21,9 @@ template: mdstat_mismatch_cnt
       on: md.mismatch_cnt
    units: unsynchronized blocks
     calc: $count
-   every: 10s
-    crit: $this > 0
+   every: 60s
+    warn: $this > 1024
+   delay: up 30m
     info: Mismatch count!
       to: sysadmin
 
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 73b87dcc0..6e81a2a0e 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,4 +1,4 @@
-   alarm: adapter_state
+template: adapter_state
       on: megacli.adapter_degraded
    units: is degraded
   lookup: sum -10s
@@ -27,7 +27,7 @@ template: bbu_cycle_count
     info: BBU cycle count
       to: sysadmin
 
-   alarm: pd_media_errors
+template: pd_media_errors
       on: megacli.pd_media_error
    units: media errors
   lookup: sum -10s
@@ -37,7 +37,7 @@ template: bbu_cycle_count
     info: physical drive media errors
       to: sysadmin
 
-   alarm: pd_predictive_failures
+template: pd_predictive_failures
       on: megacli.pd_predictive_failure
    units: predictive failures
   lookup: sum -10s
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 2bec56387..62cef5a2e 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -79,7 +79,7 @@ template: mysql_connections
 
 template: mysql_replication
       on: mysql.slave_status
-    calc: ($sql_running == -1 OR $io_running == -1)?0:1
+    calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
    units: ok/failed
    every: 10s
     crit: $this == 0
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index e43cb1691..261290e51 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -110,6 +110,34 @@ families: *
     info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
       to: sysadmin
 
+# -----------------------------------------------------------------------------
+# interface errors
+
+template: interface_inbound_errors
+      on: net.errors
+      os: freebsd
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute of inbound
+   units: errors
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface inbound errors in the last 10 minutes
+      to: sysadmin
+
+template: interface_outbound_errors
+      on: net.errors
+      os: freebsd
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute of outbound
+   units: errors
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface outbound errors in the last 10 minutes
+      to: sysadmin
 
 # -----------------------------------------------------------------------------
 # FIFO errors
@@ -132,7 +160,6 @@ families: *
     info: interface fifo errors in the last 10 minutes
       to: sysadmin
 
-
 # -----------------------------------------------------------------------------
 # check for packet storms
 
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index f42b63d30..696333fd8 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -31,18 +31,16 @@ families: *
     crit: $this >= 40
    delay: down 5m multiplier 1.5 max 1h
     info: average of timeouts during the last 5 minutes
- options: no-clear-notification
       to: sysadmin
 
 template: connection_fails
 families: *
       on: portcheck.status
-  lookup: average -5m unaligned percentage of no_connection
+  lookup: average -5m unaligned percentage of no_connection,failed
    every: 10s
    units: %
     warn: $this >= 10 AND $this < 40
     crit: $this >= 40
    delay: down 5m multiplier 1.5 max 1h
     info: average of failed connections during the last 5 minutes
- options: no-clear-notification
       to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index d96998fdf..293f1aa0d 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -1,27 +1,13 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-   alarm: active_processes_limit_freebsd
+   alarm: active_processes
       on: system.active_processes
-      os: freebsd
    hosts: *
-    calc: $active
-   units: processes
+    calc: $active * 100 / $pidmax
+   units: %
    every: 5s
-    warn: $this > (($status >= $WARNING)  ? (75000) : (80000))
-    crit: $this > (($status == $CRITICAL) ? (85000) : (90000))
+    warn: $this > (($status >= $WARNING)  ? (75) : (80))
+    crit: $this > (($status == $CRITICAL) ? (85) : (90))
    delay: down 5m multiplier 1.5 max 1h
-    info: the number of active processes
-      to: sysadmin
-
-   alarm: active_processes_limit
-      on: system.active_processes
-      os: linux
-   hosts: *
-    calc: $active
-   units: processes
-   every: 5s
-    warn: $this > (($status >= $WARNING)  ? (25000) : (26000))
-    crit: $this > (($status == $CRITICAL) ? (28000) : (30000))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of active processes
+    info: the percentage of active processes
       to: sysadmin
diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf
new file mode 100644
index 000000000..014789451
--- /dev/null
+++ b/health/health.d/pulsar.conf
@@ -0,0 +1,13 @@
+
+# Availability
+
+template: pulsar_last_collected_secs
+      on: pulsar.broker_components
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 15e8e8464..0a71dac84 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -5,7 +5,7 @@
       on: system.ram
       os: linux freebsd
    hosts: *
-    calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+    calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
    every: 10s
     info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
 
@@ -14,7 +14,7 @@
       os: linux
    hosts: *
 #   calc: $used * 100 / ($used + $cached + $free)
-    calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+    calc: ($used - $used_ram_to_ignore) * 100 / ($used  + $cached + $free)
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
new file mode 100644
index 000000000..1a3088a2a
--- /dev/null
+++ b/health/health.d/scaleio.conf
@@ -0,0 +1,38 @@
+
+# make sure scaleio is running
+
+template: scaleio_last_collected_secs
+      on: scaleio.system_capacity_total
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# make sure Storage Pool capacity utilization is under limit
+
+template: scaleio_storage_pool_capacity_utilization
+      on: scaleio.storage_pool_capacity_utilization
+    calc: $used
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: Storage Pool capacity utilization
+      to: sysadmin
+
+
+# make sure Sdc is connected to MDM
+
+template: scaleio_sdc_mdm_connection_state
+      on: scaleio.sdc_mdm_connection_state
+    calc: $connected
+   every: 10s
+    warn: $this != 1
+   delay: up 30s down 5m multiplier 1.5 max 1h
+    info: Sdc connection to MDM state
+      to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index ff3648626..f835f2aee 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -10,7 +10,7 @@
   lookup: average -1m unaligned absolute of dropped
    units: packets
    every: 10s
-    warn: $this > (($status >= $WARNING) ? (0) : (10)
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
     info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
       to: sysadmin
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
new file mode 100644
index 000000000..bdedc11a0
--- /dev/null
+++ b/health/health.d/unbound.conf
@@ -0,0 +1,35 @@
+
+# make sure unbound is running
+
+template: unbound_last_collected_secs
+      on: unbound.queries
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# make sure there are no overwritten/dropped queries in the request-list
+
+template: unbound_request_list_overwritten
+      on: unbound.request_list_jostle_list
+  lookup: average -60s unaligned absolute match-names of overwritten
+   units: queries
+   every: 10s
+    warn: $this > 5
+   delay: up 10 down 5m multiplier 1.5 max 1h
+    info: the number of overwritten queries in the request-list
+      to: sysadmin
+
+template: unbound_request_list_dropped
+      on: unbound.request_list_jostle_list
+  lookup: average -60s unaligned absolute match-names of dropped
+   units: queries
+   every: 10s
+    warn: $this > 0
+   delay: up 10 down 5m multiplier 1.5 max 1h
+    info: the number of dropped queries in the request-list
+      to: sysadmin
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
new file mode 100644
index 000000000..36bbaf82b
--- /dev/null
+++ b/health/health.d/vernemq.conf
@@ -0,0 +1,399 @@
+
+# Availability
+
+template: vernemq_last_collected_secs
+      on: vernemq.node_uptime
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# Socket errors
+
+template: vernemq_socket_errors
+      on: vernemq.socket_errors
+  lookup: sum -1m unaligned absolute of socket_error
+   units: errors
+   every: 10s
+    warn: $this > (($status == $WARNING) ? (0) : (5))
+   delay: down 5m multiplier 1.5 max 2h
+    info: socket errors in the last minute
+      to: sysadmin
+
+# Queues dropped/expired/unhandled PUBLISH messages
+
+template: vernemq_queue_message_drop
+      on: vernemq.queue_undelivered_messages
+  lookup: sum -1m unaligned absolute of queue_message_drop
+   units: dropped messages
+   every: 10s
+    warn: $this > (($status == $WARNING) ? (0) : (5))
+   delay: down 5m multiplier 1.5 max 2h
+    info: dropped messages due to full queues in the last minute
+      to: sysadmin
+
+template: vernemq_queue_message_expired
+      on: vernemq.queue_undelivered_messages
+  lookup: sum -1m unaligned absolute of queue_message_expired
+   units: expired messages
+   every: 10s
+    warn: $this > (($status == $WARNING) ? (0) : (15))
+   delay: down 5m multiplier 1.5 max 2h
+    info: messages which expired before delivery in the last minute
+      to: sysadmin
+
+template: vernemq_queue_message_unhandled
+      on: vernemq.queue_undelivered_messages
+  lookup: sum -1m unaligned absolute of queue_message_unhandled
+   units: unhandled messages
+   every: 10s
+    warn: $this > (($status == $WARNING) ? (0) : (5))
+   delay: down 5m multiplier 1.5 max 2h
+    info: unhandled messages (connections with clean session=true) in the last minute
+      to: sysadmin
+
+# Erlang VM
+
+template: vernemq_average_scheduler_utilization
+      on: vernemq.average_scheduler_utilization
+  lookup: average -10m unaligned
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (75) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average scheduler utilization for the last 10 minutes
+      to: sysadmin
+
+# Cluster communication and netsplits
+
+template: vernemq_cluster_dropped
+      on: vernemq.cluster_dropped
+  lookup: average -1m unaligned
+   units: KiB/s
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: the amount of traffic dropped during communication with the cluster nodes in the last minute
+      to: sysadmin
+
+template: vernemq_netsplits
+      on: vernemq.netsplits
+  lookup: sum -1m unaligned absolute of netsplit_detected
+   units: netsplits
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: detected netsplits in the last minute
+      to: sysadmin
+
+# Unsuccessful CONNACK
+
+template: vernemq_mqtt_connack_sent_reason_success
+      on: vernemq.mqtt_connack_sent_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v3/v5 CONNACK sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_connack_sent_reason_unsuccessful
+      on: vernemq.mqtt_connack_sent_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_connack_sent_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v3/v5 CONNACK sent in the last minute
+      to: sysadmin
+
+# Not normal DISCONNECT
+
+template: vernemq_mqtt_disconnect_received_reason_normal_disconnect
+      on: vernemq.mqtt_disconnect_received_reason
+  lookup: sum -1m unaligned absolute match-names of normal_disconnect
+   units: packets
+   every: 10s
+    info: normal v5 DISCONNECT received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+      on: vernemq.mqtt_disconnect_sent_reason
+  lookup: sum -1m unaligned absolute match-names of normal_disconnect
+   units: packets
+   every: 10s
+    info: normal v5 DISCONNECT sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_disconnect_received_reason_not_normal
+      on: vernemq.mqtt_disconnect_received_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: not normal v5 DISCONNECT received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_disconnect_sent_reason_not_normal
+      on: vernemq.mqtt_disconnect_sent_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: not normal v5 DISCONNECT sent in the last minute
+      to: sysadmin
+
+# SUBSCRIBE errors and unauthorized attempts
+
+template: vernemq_mqtt_subscribe_error
+      on: vernemq.mqtt_subscribe_error
+  lookup: sum -1m unaligned absolute
+   units: failed ops
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: failed v3/v5 SUBSCRIBE operations in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_subscribe_auth_error
+      on: vernemq.mqtt_subscribe_auth_error
+  lookup: sum -1m unaligned absolute
+   units: attempts
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+      to: sysadmin
+
+# UNSUBSCRIBE errors
+
+template: vernemq_mqtt_unsubscribe_error
+      on: vernemq.mqtt_unsubscribe_error
+  lookup: sum -1m unaligned absolute
+   units: failed ops
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: failed v3/v5 UNSUBSCRIBE operations in the last minute
+      to: sysadmin
+
+# PUBLISH errors and unauthorized attempts
+
+template: vernemq_mqtt_publish_errors
+      on: vernemq.mqtt_publish_errors
+  lookup: sum -1m unaligned absolute
+   units: failed ops
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: failed v3/v5 PUBLISH operations in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_publish_auth_errors
+      on: vernemq.mqtt_publish_auth_errors
+  lookup: sum -1m unaligned absolute
+   units: attempts
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unauthorized v3/v5 PUBLISH attempts in the last minute
+      to: sysadmin
+
+# Unsuccessful and unexpected PUBACK
+
+template: vernemq_mqtt_puback_received_reason_success
+      on: vernemq.mqtt_puback_received_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBACK received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_puback_sent_reason_success
+      on: vernemq.mqtt_puback_sent_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBACK sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_puback_received_reason_unsuccessful
+      on: vernemq.mqtt_puback_received_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_puback_received_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBACK received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_puback_sent_reason_unsuccessful
+      on: vernemq.mqtt_puback_sent_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_puback_sent_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBACK sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_puback_unexpected
+      on: vernemq.mqtt_puback_invalid_error
+  lookup: sum -1m unaligned absolute
+   units: messages
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unexpected v3/v5 PUBACK received in the last minute
+      to: sysadmin
+
+# Unsuccessful and unexpected PUBREC
+
+template: vernemq_mqtt_pubrec_received_reason_success
+      on: vernemq.mqtt_pubrec_received_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBREC received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrec_sent_reason_success
+      on: vernemq.mqtt_pubrec_sent_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBREC sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrec_received_reason_unsuccessful
+      on: vernemq.mqtt_pubrec_received_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_pubrec_received_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBREC received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
+      on: vernemq.mqtt_pubrec_sent_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_pubrec_sent_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBREC sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrec_invalid_error
+      on: vernemq.mqtt_pubrec_invalid_error
+  lookup: sum -1m unaligned absolute
+   units: messages
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unexpected v3 PUBREC received in the last minute
+      to: sysadmin
+
+# Unsuccessful PUBREL
+
+template: vernemq_mqtt_pubrel_received_reason_success
+      on: vernemq.mqtt_pubrel_received_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBREL received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrel_sent_reason_success
+      on: vernemq.mqtt_pubrel_sent_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBREL sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrel_received_reason_unsuccessful
+      on: vernemq.mqtt_pubrel_received_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_pubrel_received_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBREL received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
+      on: vernemq.mqtt_pubrel_sent_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_pubrel_sent_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBREL sent in the last minute
+      to: sysadmin
+
+# Unsuccessful and unexpected PUBCOMP
+
+template: vernemq_mqtt_pubcomp_received_reason_success
+      on: vernemq.mqtt_pubcomp_received_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBCOMP received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubcomp_sent_reason_success
+      on: vernemq.mqtt_pubcomp_sent_reason
+  lookup: sum -1m unaligned absolute match-names of success
+   units: packets
+   every: 10s
+    info: successful v5 PUBCOMP sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
+      on: vernemq.mqtt_pubcomp_received_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_pubcomp_received_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBCOMP received in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
+      on: vernemq.mqtt_pubcomp_sent_reason
+  lookup: sum -1m unaligned absolute
+    calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success
+   units: packets
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unsuccessful v5 PUBCOMP sent in the last minute
+      to: sysadmin
+
+template: vernemq_mqtt_pubcomp_unexpected
+      on: vernemq.mqtt_pubcomp_invalid_error
+  lookup: sum -1m unaligned absolute
+   units: messages
+   every: 10s
+    warn: $this > 0
+   delay: down 5m multiplier 1.5 max 2h
+    info: unexpected v3/v5 PUBCOMP received in the last minute
+      to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 1aefd7b00..44de38a48 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -111,7 +111,6 @@ families: *
    units: %
    every: 10s
     warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
-    crit: ($1m_total_requests > 120) ? ($this > 5) : ( 0 )
    delay: up 1m down 5m multiplier 1.5 max 1h
     info: the ratio of unmatched lines, over the last minute
       to: webmaster
@@ -235,7 +234,6 @@ families: *
    units: %
    every: 10s
     warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
-    crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 )
    delay: up 1m down 5m multiplier 1.5 max 1h
     info: the ratio of unmatched lines, over the last minute
       to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
new file mode 100644
index 000000000..275e11dd9
--- /dev/null
+++ b/health/health.d/whoisquery.conf
@@ -0,0 +1,24 @@
+
+# make sure whoisquery is running
+
+template: whoisquery_last_collected_secs
+      on: whoisquery.time_until_expiration
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 60s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
+
+template: whoisquery_days_until_expiration
+      on: whoisquery.time_until_expiration
+    calc: $expiry
+   units: seconds
+   every: 60s
+    warn: $this < $days_until_expiration_warning*24*60*60
+    crit: $this < $days_until_expiration_critical*24*60*60
+    info: domain time until expiration
+      to: webmaster
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index a56f48fc3..dfca37706 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -22,3 +22,11 @@ template: x509check_days_until_expiration
     crit: $this < $days_until_expiration_critical*24*60*60
     info: certificate time until expiration
       to: webmaster
+
+template: x509check_revocation_status
+      on: x509check.revocation_status
+    calc: $revoked
+   every: 60s
+    crit: $this != nan AND $this != 0
+    info: certificate revocation status
+      to: webmaster
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2021-02-07 11:49:00 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2021-02-07 12:42:05 +0000
commit	2e85f9325a797977eea9dfea0a925775ddd211d9 (patch)
tree	452c7f30d62fca5755f659b99e4e53c7b03afc21 /health/health.d
parent	Releasing debian version 1.19.0-4. (diff)
download	netdata-2e85f9325a797977eea9dfea0a925775ddd211d9.tar.xz netdata-2e85f9325a797977eea9dfea0a925775ddd211d9.zip