summary refs log tree commit diff stats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r-- health/health.d/anomalies.conf 17
-rw-r--r-- health/health.d/apps_plugin.conf 15
-rw-r--r-- health/health.d/backend.conf 11
-rw-r--r-- health/health.d/cockroachdb.conf 91
-rw-r--r-- health/health.d/dbengine.conf 26
-rw-r--r-- health/health.d/dns_query.conf 12
-rw-r--r-- health/health.d/elasticsearch.conf 7
-rw-r--r-- health/health.d/exporting.conf 34
-rw-r--r-- health/health.d/mdstat.conf 7
-rw-r--r-- health/health.d/megacli.conf 6
-rw-r--r-- health/health.d/mysql.conf 2
-rw-r--r-- health/health.d/net.conf 29
-rw-r--r-- health/health.d/portcheck.conf 4
-rw-r--r-- health/health.d/processes.conf 26
-rw-r--r-- health/health.d/pulsar.conf 13
-rw-r--r-- health/health.d/ram.conf 4
-rw-r--r-- health/health.d/scaleio.conf 38
-rw-r--r-- health/health.d/softnet.conf 2
-rw-r--r-- health/health.d/unbound.conf 35
-rw-r--r-- health/health.d/vernemq.conf 399
-rw-r--r-- health/health.d/web_log.conf 2
-rw-r--r-- health/health.d/whoisquery.conf 24
-rw-r--r-- health/health.d/x509check.conf 8
23 files changed, 767 insertions, 45 deletions
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
new file mode 100644
index 000000000..a2d248efe
--- /dev/null
+++ b/health/health.d/anomalies.conf
@@ -0,0 +1,17 @@
+# raise a warning alarm if an anomaly probability is consistently above 50%
+
+template: anomaly_probabilities
+ on: anomalies.probability
+ lookup: average -2m foreach *
+ every: 1m
+ warn: $this > 50
+ info: average anomaly probability > 50% for last 2 minutes
+
+# raise a warning alarm if an anomaly flag is consistently firing
+
+template: anomaly_flags
+ on: anomalies.anomaly
+ lookup: sum -2m foreach *
+ every: 1m
+ warn: $this > 10
+ info: count of anomalies > 10 for last 2 minutes
diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf
new file mode 100644
index 000000000..9a27bc6ba
--- /dev/null
+++ b/health/health.d/apps_plugin.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# disabled due to https://github.com/netdata/netdata/issues/10327
+#
+# alarm: used_file_descriptors
+# on: apps.files
+# hosts: *
+# calc: $fdperc
+# units: %
+# every: 5s
+# warn: $this > (($status >= $WARNING) ? (75) : (80))
+# crit: $this > (($status == $CRITICAL) ? (85) : (90))
+# delay: down 5m multiplier 1.5 max 1h
+# info: Peak percentage of file descriptors used
+# to: sysadmin
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 7af100d8f..e51b8aa5f 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -1,3 +1,13 @@
+# Alert that the backends subsystem will be disabled soon
+ alarm: backend_metrics_eol
+ on: netdata.backend_metrics
+ units: boolean
+ calc: $now - $last_collected_t
+ every: 1m
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+ to: sysadmin
# make sure we are sending data to backend
@@ -32,6 +42,7 @@
info: number of metrics lost due to repeating failures to contact the backend server
to: dba
+
# this chart has been removed from netdata
# alarm: backend_slow
# on: netdata.backend_latency
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
new file mode 100644
index 000000000..8ab2c9d0f
--- /dev/null
+++ b/health/health.d/cockroachdb.conf
@@ -0,0 +1,91 @@
+
+# Availability
+
+template: cockroachdb_last_collected_secs
+ on: cockroachdb.live_nodes
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
+
+# Capacity
+
+template: cockroachdb_used_storage_capacity
+ on: cockroachdb.storage_used_capacity_percentage
+ calc: $capacity_used_percent
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: entire disk usage percentage
+ to: dba
+
+template: cockroachdb_used_usable_storage_capacity
+ on: cockroachdb.storage_used_capacity_percentage
+ calc: $capacity_usable_used_percent
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: usable space usage percentage
+ to: dba
+
+# Replication
+
+template: cockroachdb_unavailable_ranges
+ on: cockroachdb.ranges_replication_problem
+ calc: $ranges_unavailable
+ units: num
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of ranges with fewer live replicas than the replication target
+ to: dba
+
+template: cockroachdb_replicas_leaders_not_leaseholders
+ on: cockroachdb.replicas_leaders
+ calc: $replicas_leaders_not_leaseholders
+ units: num
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of replicas that are Raft leaders whose range lease is held by another store
+ to: dba
+
+# FD
+
+template: cockroachdb_open_file_descriptors_limit
+ on: cockroachdb.process_file_descriptors
+ calc: $sys_fd_open/$sys_fd_softlimit * 100
+ units: %
+ every: 10s
+ warn: $this > 80
+ delay: down 15m multiplier 1.5 max 1h
+ info: open file descriptors usage percentage
+ to: dba
+
+# SQL
+
+template: cockroachdb_sql_active_connections
+ on: cockroachdb.sql_connections
+ calc: $sql_conns
+ units: active connections
+ every: 10s
+ info: number of active SQL connections
+ to: dba
+
+template: cockroachdb_sql_executed_statements_total_last_5m
+ on: cockroachdb.sql_statements_total
+ lookup: sum -5m absolute of sql_query_count
+ units: statements
+ every: 10s
+ warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
+ delay: down 15m up 30s multiplier 1.5 max 1h
+ info: number of executed SQL statements in the last 5 minutes
+ to: dba
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index ce9839ef1..274673e3e 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -5,7 +5,7 @@
on: netdata.dbengine_global_errors
os: linux freebsd macos
hosts: *
-lookup: sum -10m unaligned of FS errors
+lookup: sum -10m unaligned of fs_errors
units: errors
every: 10s
crit: $this > 0
@@ -17,7 +17,7 @@ lookup: sum -10m unaligned of FS errors
on: netdata.dbengine_global_errors
os: linux freebsd macos
hosts: *
-lookup: sum -10m unaligned of I/O errors
+lookup: sum -10m unaligned of io_errors
units: errors
every: 10s
crit: $this > 0
@@ -25,14 +25,26 @@ lookup: sum -10m unaligned of I/O errors
info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
to: sysadmin
- alarm: 10min_dbengine_global_flushing_errors
+ alarm: 10min_dbengine_global_flushing_warnings
on: netdata.dbengine_global_errors
os: linux freebsd macos
hosts: *
-lookup: sum -10m unaligned of flushing errors
+lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
units: errors
- every: 3s
- crit: $this > 0
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+ on: netdata.dbengine_long_term_page_stats
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of flushing_pressure_deletions
+ units: pages
+ every: 10s
+ crit: $this != 0
delay: down 1h multiplier 1.5 max 3h
- info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk
+ info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks
to: sysadmin
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
new file mode 100644
index 000000000..113c950e6
--- /dev/null
+++ b/health/health.d/dns_query.conf
@@ -0,0 +1,12 @@
+
+# detect dns query failure
+
+template: dns_query_time_query_time
+ on: dns_query_time.query_time
+ lookup: average -10s unaligned foreach *
+ units: ms
+ every: 10s
+ warn: $this == nan
+ delay: up 20s down 5m multiplier 1.5 max 1h
+ info: query round trip time
+ to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index dffd40965..f4423449f 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -1,5 +1,8 @@
- alarm: elasticsearch_last_collected
- on: elasticsearch_local.cluster_health_status
+
+# make sure elasticsearch is running
+
+template: elasticsearch_last_collected
+ on: elasticsearch.cluster_health_status
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
new file mode 100644
index 000000000..506cb0cf7
--- /dev/null
+++ b/health/health.d/exporting.conf
@@ -0,0 +1,34 @@
+
+template: exporting_last_buffering
+families: *
+ on: exporting_data_size
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful buffering of exporting data
+ to: dba
+
+template: exporting_metrics_sent
+families: *
+ on: exporting_data_size
+ units: %
+ calc: abs($sent) * 100 / abs($buffered)
+ every: 10s
+ warn: $this != 100
+ delay: down 5m multiplier 1.5 max 1h
+ info: percentage of metrics sent to the external database server
+ to: dba
+
+template: exporting_metrics_lost
+families: *
+ on: exporting_data_size
+ units: metrics
+ calc: abs($lost)
+ every: 10s
+ crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of metrics lost due to repeating failures to contact the external database server
+ to: dba
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index a53ec7a56..2f906e187 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -12,7 +12,7 @@ template: mdstat_disks
on: md.disks
units: failed devices
every: 10s
- calc: $total - $inuse
+ calc: $down
crit: $this > 0
info: Array is degraded!
to: sysadmin
@@ -21,8 +21,9 @@ template: mdstat_mismatch_cnt
on: md.mismatch_cnt
units: unsynchronized blocks
calc: $count
- every: 10s
- crit: $this > 0
+ every: 60s
+ warn: $this > 1024
+ delay: up 30m
info: Mismatch count!
to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 73b87dcc0..6e81a2a0e 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,4 +1,4 @@
- alarm: adapter_state
+template: adapter_state
on: megacli.adapter_degraded
units: is degraded
lookup: sum -10s
@@ -27,7 +27,7 @@ template: bbu_cycle_count
info: BBU cycle count
to: sysadmin
- alarm: pd_media_errors
+template: pd_media_errors
on: megacli.pd_media_error
units: media errors
lookup: sum -10s
@@ -37,7 +37,7 @@ template: bbu_cycle_count
info: physical drive media errors
to: sysadmin
- alarm: pd_predictive_failures
+template: pd_predictive_failures
on: megacli.pd_predictive_failure
units: predictive failures
lookup: sum -10s
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 2bec56387..62cef5a2e 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -79,7 +79,7 @@ template: mysql_connections
template: mysql_replication
on: mysql.slave_status
- calc: ($sql_running == -1 OR $io_running == -1)?0:1
+ calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
units: ok/failed
every: 10s
crit: $this == 0
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index e43cb1691..261290e51 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -110,6 +110,34 @@ families: *
info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
to: sysadmin
+# -----------------------------------------------------------------------------
+# interface errors
+
+template: interface_inbound_errors
+ on: net.errors
+ os: freebsd
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute of inbound
+ units: errors
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound errors in the last 10 minutes
+ to: sysadmin
+
+template: interface_outbound_errors
+ on: net.errors
+ os: freebsd
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute of outbound
+ units: errors
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound errors in the last 10 minutes
+ to: sysadmin
# -----------------------------------------------------------------------------
# FIFO errors
@@ -132,7 +160,6 @@ families: *
info: interface fifo errors in the last 10 minutes
to: sysadmin
-
# -----------------------------------------------------------------------------
# check for packet storms
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index f42b63d30..696333fd8 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -31,18 +31,16 @@ families: *
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
info: average of timeouts during the last 5 minutes
- options: no-clear-notification
to: sysadmin
template: connection_fails
families: *
on: portcheck.status
- lookup: average -5m unaligned percentage of no_connection
+ lookup: average -5m unaligned percentage of no_connection,failed
every: 10s
units: %
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
info: average of failed connections during the last 5 minutes
- options: no-clear-notification
to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index d96998fdf..293f1aa0d 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -1,27 +1,13 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- alarm: active_processes_limit_freebsd
+ alarm: active_processes
on: system.active_processes
- os: freebsd
hosts: *
- calc: $active
- units: processes
+ calc: $active * 100 / $pidmax
+ units: %
every: 5s
- warn: $this > (($status >= $WARNING) ? (75000) : (80000))
- crit: $this > (($status == $CRITICAL) ? (85000) : (90000))
+ warn: $this > (($status >= $WARNING) ? (75) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
delay: down 5m multiplier 1.5 max 1h
- info: the number of active processes
- to: sysadmin
-
- alarm: active_processes_limit
- on: system.active_processes
- os: linux
- hosts: *
- calc: $active
- units: processes
- every: 5s
- warn: $this > (($status >= $WARNING) ? (25000) : (26000))
- crit: $this > (($status == $CRITICAL) ? (28000) : (30000))
- delay: down 5m multiplier 1.5 max 1h
- info: number of active processes
+ info: the percentage of active processes
to: sysadmin
diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf
new file mode 100644
index 000000000..014789451
--- /dev/null
+++ b/health/health.d/pulsar.conf
@@ -0,0 +1,13 @@
+
+# Availability
+
+template: pulsar_last_collected_secs
+ on: pulsar.broker_components
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 15e8e8464..0a71dac84 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -5,7 +5,7 @@
on: system.ram
os: linux freebsd
hosts: *
- calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
every: 10s
info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
@@ -14,7 +14,7 @@
os: linux
hosts: *
# calc: $used * 100 / ($used + $cached + $free)
- calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
new file mode 100644
index 000000000..1a3088a2a
--- /dev/null
+++ b/health/health.d/scaleio.conf
@@ -0,0 +1,38 @@
+
+# make sure scaleio is running
+
+template: scaleio_last_collected_secs
+ on: scaleio.system_capacity_total
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure Storage Pool capacity utilization is under limit
+
+template: scaleio_storage_pool_capacity_utilization
+ on: scaleio.storage_pool_capacity_utilization
+ calc: $used
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: Storage Pool capacity utilization
+ to: sysadmin
+
+
+# make sure Sdc is connected to MDM
+
+template: scaleio_sdc_mdm_connection_state
+ on: scaleio.sdc_mdm_connection_state
+ calc: $connected
+ every: 10s
+ warn: $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: Sdc connection to MDM state
+ to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index ff3648626..f835f2aee 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -10,7 +10,7 @@
lookup: average -1m unaligned absolute of dropped
units: packets
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : (10)
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
to: sysadmin
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
new file mode 100644
index 000000000..bdedc11a0
--- /dev/null
+++ b/health/health.d/unbound.conf
@@ -0,0 +1,35 @@
+
+# make sure unbound is running
+
+template: unbound_last_collected_secs
+ on: unbound.queries
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure there are no overwritten/dropped queries in the request-list
+
+template: unbound_request_list_overwritten
+ on: unbound.request_list_jostle_list
+ lookup: average -60s unaligned absolute match-names of overwritten
+ units: queries
+ every: 10s
+ warn: $this > 5
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ info: the number of overwritten queries in the request-list
+ to: sysadmin
+
+template: unbound_request_list_dropped
+ on: unbound.request_list_jostle_list
+ lookup: average -60s unaligned absolute match-names of dropped
+ units: queries
+ every: 10s
+ warn: $this > 0
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ info: the number of dropped queries in the request-list
+ to: sysadmin
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
new file mode 100644
index 000000000..36bbaf82b
--- /dev/null
+++ b/health/health.d/vernemq.conf
@@ -0,0 +1,399 @@
+
+# Availability
+
+template: vernemq_last_collected_secs
+ on: vernemq.node_uptime
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# Socket errors
+
+template: vernemq_socket_errors
+ on: vernemq.socket_errors
+ lookup: sum -1m unaligned absolute of socket_error
+ units: errors
+ every: 10s
+ warn: $this > (($status == $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 2h
+ info: socket errors in the last minute
+ to: sysadmin
+
+# Queues dropped/expired/unhandled PUBLISH messages
+
+template: vernemq_queue_message_drop
+ on: vernemq.queue_undelivered_messages
+ lookup: sum -1m unaligned absolute of queue_message_drop
+ units: dropped messages
+ every: 10s
+ warn: $this > (($status == $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 2h
+ info: dropped messages due to full queues in the last minute
+ to: sysadmin
+
+template: vernemq_queue_message_expired
+ on: vernemq.queue_undelivered_messages
+ lookup: sum -1m unaligned absolute of queue_message_expired
+ units: expired messages
+ every: 10s
+ warn: $this > (($status == $WARNING) ? (0) : (15))
+ delay: down 5m multiplier 1.5 max 2h
+ info: messages which expired before delivery in the last minute
+ to: sysadmin
+
+template: vernemq_queue_message_unhandled
+ on: vernemq.queue_undelivered_messages
+ lookup: sum -1m unaligned absolute of queue_message_unhandled
+ units: unhandled messages
+ every: 10s
+ warn: $this > (($status == $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 2h
+ info: unhandled messages (connections with clean session=true) in the last minute
+ to: sysadmin
+
+# Erlang VM
+
+template: vernemq_average_scheduler_utilization
+ on: vernemq.average_scheduler_utilization
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average scheduler utilization for the last 10 minutes
+ to: sysadmin
+
+# Cluster communication and netsplits
+
+template: vernemq_cluster_dropped
+ on: vernemq.cluster_dropped
+ lookup: average -1m unaligned
+ units: KiB/s
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: the amount of traffic dropped during communication with the cluster nodes in the last minute
+ to: sysadmin
+
+template: vernemq_netsplits
+ on: vernemq.netsplits
+ lookup: sum -1m unaligned absolute of netsplit_detected
+ units: netsplits
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: detected netsplits in the last minute
+ to: sysadmin
+
+# Unsuccessful CONNACK
+
+template: vernemq_mqtt_connack_sent_reason_success
+ on: vernemq.mqtt_connack_sent_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v3/v5 CONNACK sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_connack_sent_reason_unsuccessful
+ on: vernemq.mqtt_connack_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_connack_sent_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v3/v5 CONNACK sent in the last minute
+ to: sysadmin
+
+# Not normal DISCONNECT
+
+template: vernemq_mqtt_disconnect_received_reason_normal_disconnect
+ on: vernemq.mqtt_disconnect_received_reason
+ lookup: sum -1m unaligned absolute match-names of normal_disconnect
+ units: packets
+ every: 10s
+ info: normal v5 DISCONNECT received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+ on: vernemq.mqtt_disconnect_sent_reason
+ lookup: sum -1m unaligned absolute match-names of normal_disconnect
+ units: packets
+ every: 10s
+ info: normal v5 DISCONNECT sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_disconnect_received_reason_not_normal
+ on: vernemq.mqtt_disconnect_received_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: not normal v5 DISCONNECT received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_disconnect_sent_reason_not_normal
+ on: vernemq.mqtt_disconnect_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: not normal v5 DISCONNECT sent in the last minute
+ to: sysadmin
+
+# SUBSCRIBE errors and unauthorized attempts
+
+template: vernemq_mqtt_subscribe_error
+ on: vernemq.mqtt_subscribe_error
+ lookup: sum -1m unaligned absolute
+ units: failed ops
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: failed v3/v5 SUBSCRIBE operations in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_subscribe_auth_error
+ on: vernemq.mqtt_subscribe_auth_error
+ lookup: sum -1m unaligned absolute
+ units: attempts
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+ to: sysadmin
+
+# UNSUBSCRIBE errors
+
+template: vernemq_mqtt_unsubscribe_error
+ on: vernemq.mqtt_unsubscribe_error
+ lookup: sum -1m unaligned absolute
+ units: failed ops
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: failed v3/v5 UNSUBSCRIBE operations in the last minute
+ to: sysadmin
+
+# PUBLISH errors and unauthorized attempts
+
+template: vernemq_mqtt_publish_errors
+ on: vernemq.mqtt_publish_errors
+ lookup: sum -1m unaligned absolute
+ units: failed ops
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: failed v3/v5 PUBLISH operations in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_publish_auth_errors
+ on: vernemq.mqtt_publish_auth_errors
+ lookup: sum -1m unaligned absolute
+ units: attempts
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unauthorized v3/v5 PUBLISH attempts in the last minute
+ to: sysadmin
+
+# Unsuccessful and unexpected PUBACK
+
+template: vernemq_mqtt_puback_received_reason_success
+ on: vernemq.mqtt_puback_received_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBACK received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_puback_sent_reason_success
+ on: vernemq.mqtt_puback_sent_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBACK sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_puback_received_reason_unsuccessful
+ on: vernemq.mqtt_puback_received_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_puback_received_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBACK received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_puback_sent_reason_unsuccessful
+ on: vernemq.mqtt_puback_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_puback_sent_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBACK sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_puback_unexpected
+ on: vernemq.mqtt_puback_invalid_error
+ lookup: sum -1m unaligned absolute
+ units: messages
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unexpected v3/v5 PUBACK received in the last minute
+ to: sysadmin
+
+# Unsuccessful and unexpected PUBREC
+
+template: vernemq_mqtt_pubrec_received_reason_success
+ on: vernemq.mqtt_pubrec_received_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBREC received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrec_sent_reason_success
+ on: vernemq.mqtt_pubrec_sent_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBREC sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrec_received_reason_unsuccessful
+ on: vernemq.mqtt_pubrec_received_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubrec_received_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBREC received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubrec_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubrec_sent_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBREC sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrec_invalid_error
+ on: vernemq.mqtt_pubrec_invalid_error
+ lookup: sum -1m unaligned absolute
+ units: messages
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unexpected v3 PUBREC received in the last minute
+ to: sysadmin
+
+# Unsuccessful PUBREL
+
+template: vernemq_mqtt_pubrel_received_reason_success
+ on: vernemq.mqtt_pubrel_received_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBREL received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrel_sent_reason_success
+ on: vernemq.mqtt_pubrel_sent_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBREL sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrel_received_reason_unsuccessful
+ on: vernemq.mqtt_pubrel_received_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubrel_received_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBREL received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubrel_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubrel_sent_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBREL sent in the last minute
+ to: sysadmin
+
+# Unsuccessful and unexpected PUBCOMP
+
+template: vernemq_mqtt_pubcomp_received_reason_success
+ on: vernemq.mqtt_pubcomp_received_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBCOMP received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubcomp_sent_reason_success
+ on: vernemq.mqtt_pubcomp_sent_reason
+ lookup: sum -1m unaligned absolute match-names of success
+ units: packets
+ every: 10s
+ info: successful v5 PUBCOMP sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
+ on: vernemq.mqtt_pubcomp_received_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubcomp_received_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBCOMP received in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
+ on: vernemq.mqtt_pubcomp_sent_reason
+ lookup: sum -1m unaligned absolute
+ calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success
+ units: packets
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unsuccessful v5 PUBCOMP sent in the last minute
+ to: sysadmin
+
+template: vernemq_mqtt_pubcomp_unexpected
+ on: vernemq.mqtt_pubcomp_invalid_error
+ lookup: sum -1m unaligned absolute
+ units: messages
+ every: 10s
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: unexpected v3/v5 PUBCOMP received in the last minute
+ to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 1aefd7b00..44de38a48 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -111,7 +111,6 @@ families: *
units: %
every: 10s
warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
- crit: ($1m_total_requests > 120) ? ($this > 5) : ( 0 )
delay: up 1m down 5m multiplier 1.5 max 1h
info: the ratio of unmatched lines, over the last minute
to: webmaster
@@ -235,7 +234,6 @@ families: *
units: %
every: 10s
warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
- crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 )
delay: up 1m down 5m multiplier 1.5 max 1h
info: the ratio of unmatched lines, over the last minute
to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
new file mode 100644
index 000000000..275e11dd9
--- /dev/null
+++ b/health/health.d/whoisquery.conf
@@ -0,0 +1,24 @@
+
+# make sure whoisquery is running
+
+template: whoisquery_last_collected_secs
+ on: whoisquery.time_until_expiration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 60s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
+template: whoisquery_days_until_expiration
+ on: whoisquery.time_until_expiration
+ calc: $expiry
+ units: seconds
+ every: 60s
+ warn: $this < $days_until_expiration_warning*24*60*60
+ crit: $this < $days_until_expiration_critical*24*60*60
+ info: domain time until expiration
+ to: webmaster
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index a56f48fc3..dfca37706 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -22,3 +22,11 @@ template: x509check_days_until_expiration
crit: $this < $days_until_expiration_critical*24*60*60
info: certificate time until expiration
to: webmaster
+
+template: x509check_revocation_status
+ on: x509check.revocation_status
+ calc: $revoked
+ every: 60s
+ crit: $this != nan AND $this != 0
+ info: certificate revocation status
+ to: webmaster