diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/anomalies.conf | 17 | ||||
-rw-r--r-- | health/health.d/apps_plugin.conf | 15 | ||||
-rw-r--r-- | health/health.d/backend.conf | 11 | ||||
-rw-r--r-- | health/health.d/cockroachdb.conf | 91 | ||||
-rw-r--r-- | health/health.d/dbengine.conf | 26 | ||||
-rw-r--r-- | health/health.d/dns_query.conf | 12 | ||||
-rw-r--r-- | health/health.d/elasticsearch.conf | 7 | ||||
-rw-r--r-- | health/health.d/exporting.conf | 34 | ||||
-rw-r--r-- | health/health.d/mdstat.conf | 7 | ||||
-rw-r--r-- | health/health.d/megacli.conf | 6 | ||||
-rw-r--r-- | health/health.d/mysql.conf | 2 | ||||
-rw-r--r-- | health/health.d/net.conf | 29 | ||||
-rw-r--r-- | health/health.d/portcheck.conf | 4 | ||||
-rw-r--r-- | health/health.d/processes.conf | 26 | ||||
-rw-r--r-- | health/health.d/pulsar.conf | 13 | ||||
-rw-r--r-- | health/health.d/ram.conf | 4 | ||||
-rw-r--r-- | health/health.d/scaleio.conf | 38 | ||||
-rw-r--r-- | health/health.d/softnet.conf | 2 | ||||
-rw-r--r-- | health/health.d/unbound.conf | 35 | ||||
-rw-r--r-- | health/health.d/vernemq.conf | 399 | ||||
-rw-r--r-- | health/health.d/web_log.conf | 2 | ||||
-rw-r--r-- | health/health.d/whoisquery.conf | 24 | ||||
-rw-r--r-- | health/health.d/x509check.conf | 8 |
23 files changed, 767 insertions, 45 deletions
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf new file mode 100644 index 000000000..a2d248efe --- /dev/null +++ b/health/health.d/anomalies.conf @@ -0,0 +1,17 @@ +# raise a warning alarm if an anomaly probability is consistently above 50% + +template: anomaly_probabilities + on: anomalies.probability + lookup: average -2m foreach * + every: 1m + warn: $this > 50 + info: average anomaly probability > 50% for last 2 minutes + +# raise a warning alarm if an anomaly flag is consistently firing + +template: anomaly_flags + on: anomalies.anomaly + lookup: sum -2m foreach * + every: 1m + warn: $this > 10 + info: count of anomalies > 10 for last 2 minutes diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf new file mode 100644 index 000000000..9a27bc6ba --- /dev/null +++ b/health/health.d/apps_plugin.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# disabled due to https://github.com/netdata/netdata/issues/10327 +# +# alarm: used_file_descriptors +# on: apps.files +# hosts: * +# calc: $fdperc +# units: % +# every: 5s +# warn: $this > (($status >= $WARNING) ? (75) : (80)) +# crit: $this > (($status == $CRITICAL) ? (85) : (90)) +# delay: down 5m multiplier 1.5 max 1h +# info: Peak percentage of file descriptors used +# to: sysadmin diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index 7af100d8f..e51b8aa5f 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -1,3 +1,13 @@ +# Alert that backends subsystem will be disabled soon + alarm: backend_metrics_eol + on: netdata.backend_metrics + units: boolean + calc: $now - $last_collected_t + every: 1m + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. + to: sysadmin # make sure we are sending data to backend @@ -32,6 +42,7 @@ info: number of metrics lost due to repeating failures to contact the backend server to: dba + # this chart has been removed from netdata # alarm: backend_slow # on: netdata.backend_latency diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf new file mode 100644 index 000000000..8ab2c9d0f --- /dev/null +++ b/health/health.d/cockroachdb.conf @@ -0,0 +1,91 @@ + +# Availability + +template: cockroachdb_last_collected_secs + on: cockroachdb.live_nodes + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Capacity + +template: cockroachdb_used_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + calc: $capacity_used_percent + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: entire disk usage percentage + to: dba + +template: cockroachdb_used_usable_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + calc: $capacity_usable_used_percent + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: usable space usage percentage + to: dba + +# Replication + +template: cockroachdb_unavailable_ranges + on: cockroachdb.ranges_replication_problem + calc: $ranges_unavailable + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of ranges with fewer live replicas than the replication target + to: dba + +template: cockroachdb_replicas_leaders_not_leaseholders + on: cockroachdb.replicas_leaders + calc: $replicas_leaders_not_leaseholders + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of replicas that are Raft leaders whose range lease is held by another store + to: dba + +# FD + +template: cockroachdb_open_file_descriptors_limit + on: cockroachdb.process_file_descriptors + calc: $sys_fd_open/$sys_fd_softlimit * 100 + units: % + every: 10s + warn: $this > 80 + delay: down 15m multiplier 1.5 max 1h + info: open file descriptors usage percentage + to: dba + +# SQL + +template: cockroachdb_sql_active_connections + on: cockroachdb.sql_connections + calc: $sql_conns + units: active connections + every: 10s + info: number of active SQL connections + to: dba + +template: cockroachdb_sql_executed_statements_total_last_5m + on: cockroachdb.sql_statements_total + lookup: sum -5m absolute of sql_query_count + units: statements + every: 10s + warn: $this == 0 AND $cockroachdb_sql_active_connections != 0 + delay: down 15m up 30s multiplier 1.5 max 1h + info: number of executed SQL statements in the last 5 minutes + to: dba diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index ce9839ef1..274673e3e 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -5,7 +5,7 @@ on: netdata.dbengine_global_errors os: linux freebsd macos hosts: * -lookup: sum -10m unaligned of FS errors +lookup: sum -10m unaligned of fs_errors units: errors every: 10s crit: $this > 0 @@ -17,7 +17,7 @@ lookup: sum -10m unaligned of FS errors on: netdata.dbengine_global_errors os: linux freebsd macos hosts: * -lookup: sum -10m unaligned of I/O errors +lookup: sum -10m unaligned of io_errors units: errors every: 10s crit: $this > 0 @@ -25,14 +25,26 @@ lookup: sum -10m unaligned of I/O errors info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) to: sysadmin - alarm: 10min_dbengine_global_flushing_errors + alarm: 10min_dbengine_global_flushing_warnings on: netdata.dbengine_global_errors os: linux freebsd macos hosts: * -lookup: sum -10m unaligned of flushing errors +lookup: sum -10m unaligned of pg_cache_over_half_dirty_events units: errors - every: 3s - crit: $this > 0 + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks + to: sysadmin + + alarm: 10min_dbengine_global_flushing_errors + on: netdata.dbengine_long_term_page_stats + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of flushing_pressure_deletions + units: pages + every: 10s + crit: $this != 0 delay: down 1h multiplier 1.5 max 3h - info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk + info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks to: sysadmin diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf new file mode 100644 index 000000000..113c950e6 --- /dev/null +++ b/health/health.d/dns_query.conf @@ -0,0 +1,12 @@ + +# detect dns query failure + +template: dns_query_time_query_time + on: dns_query_time.query_time + lookup: average -10s unaligned foreach * + units: ms + every: 10s + warn: $this == nan + delay: up 20s down 5m multiplier 1.5 max 1h + info: query round trip time + to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf index dffd40965..f4423449f 100644 --- a/health/health.d/elasticsearch.conf +++ b/health/health.d/elasticsearch.conf @@ -1,5 +1,8 @@ - alarm: elasticsearch_last_collected - on: elasticsearch_local.cluster_health_status + +# make sure elasticsearch is running + +template: elasticsearch_last_collected + on: elasticsearch.cluster_health_status calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf new file mode 100644 index 000000000..506cb0cf7 --- /dev/null +++ b/health/health.d/exporting.conf @@ -0,0 +1,34 @@ + +template: exporting_last_buffering +families: * + on: exporting_data_size + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of exporting data + to: dba + +template: exporting_metrics_sent +families: * + on: exporting_data_size + units: % + calc: abs($sent) * 100 / abs($buffered) + every: 10s + warn: $this != 100 + delay: down 5m multiplier 1.5 max 1h + info: percentage of metrics sent to the external database server + to: dba + +template: exporting_metrics_lost +families: * + on: exporting_data_size + units: metrics + calc: abs($lost) + every: 10s + crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) + delay: down 5m multiplier 1.5 max 1h + info: number of metrics lost due to repeating failures to contact the external database server + to: dba diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index a53ec7a56..2f906e187 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -12,7 +12,7 @@ template: mdstat_disks on: md.disks units: failed devices every: 10s - calc: $total - $inuse + calc: $down crit: $this > 0 info: Array is degraded! to: sysadmin @@ -21,8 +21,9 @@ template: mdstat_mismatch_cnt on: md.mismatch_cnt units: unsynchronized blocks calc: $count - every: 10s - crit: $this > 0 + every: 60s + warn: $this > 1024 + delay: up 30m info: Mismatch count! to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 73b87dcc0..6e81a2a0e 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -1,4 +1,4 @@ - alarm: adapter_state +template: adapter_state on: megacli.adapter_degraded units: is degraded lookup: sum -10s @@ -27,7 +27,7 @@ template: bbu_cycle_count info: BBU cycle count to: sysadmin - alarm: pd_media_errors +template: pd_media_errors on: megacli.pd_media_error units: media errors lookup: sum -10s @@ -37,7 +37,7 @@ template: bbu_cycle_count info: physical drive media errors to: sysadmin - alarm: pd_predictive_failures +template: pd_predictive_failures on: megacli.pd_predictive_failure units: predictive failures lookup: sum -10s diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 2bec56387..62cef5a2e 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -79,7 +79,7 @@ template: mysql_connections template: mysql_replication on: mysql.slave_status - calc: ($sql_running == -1 OR $io_running == -1)?0:1 + calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 units: ok/failed every: 10s crit: $this == 0 diff --git a/health/health.d/net.conf b/health/health.d/net.conf index e43cb1691..261290e51 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -110,6 +110,34 @@ families: * info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes to: sysadmin +# ----------------------------------------------------------------------------- +# interface errors + +template: interface_inbound_errors + on: net.errors + os: freebsd + hosts: * +families: * + lookup: sum -10m unaligned absolute of inbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound errors in the last 10 minutes + to: sysadmin + +template: interface_outbound_errors + on: net.errors + os: freebsd + hosts: * +families: * + lookup: sum -10m unaligned absolute of outbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound errors in the last 10 minutes + to: sysadmin # ----------------------------------------------------------------------------- # FIFO errors @@ -132,7 +160,6 @@ families: * info: interface fifo errors in the last 10 minutes to: sysadmin - # ----------------------------------------------------------------------------- # check for packet storms diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index f42b63d30..696333fd8 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -31,18 +31,16 @@ families: * crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h info: average of timeouts during the last 5 minutes - options: no-clear-notification to: sysadmin template: connection_fails families: * on: portcheck.status - lookup: average -5m unaligned percentage of no_connection + lookup: average -5m unaligned percentage of no_connection,failed every: 10s units: % warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h info: average of failed connections during the last 5 minutes - options: no-clear-notification to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index d96998fdf..293f1aa0d 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -1,27 +1,13 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: active_processes_limit_freebsd + alarm: active_processes on: system.active_processes - os: freebsd hosts: * - calc: $active - units: processes + calc: $active * 100 / $pidmax + units: % every: 5s - warn: $this > (($status >= $WARNING) ? (75000) : (80000)) - crit: $this > (($status == $CRITICAL) ? (85000) : (90000)) + warn: $this > (($status >= $WARNING) ? (75) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) delay: down 5m multiplier 1.5 max 1h - info: the number of active processes - to: sysadmin - - alarm: active_processes_limit - on: system.active_processes - os: linux - hosts: * - calc: $active - units: processes - every: 5s - warn: $this > (($status >= $WARNING) ? (25000) : (26000)) - crit: $this > (($status == $CRITICAL) ? (28000) : (30000)) - delay: down 5m multiplier 1.5 max 1h - info: number of active processes + info: the percentage of active processes to: sysadmin diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf new file mode 100644 index 000000000..014789451 --- /dev/null +++ b/health/health.d/pulsar.conf @@ -0,0 +1,13 @@ + +# Availability + +template: pulsar_last_collected_secs + on: pulsar.broker_components + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 15e8e8464..0a71dac84 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -5,7 +5,7 @@ on: system.ram os: linux freebsd hosts: * - calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) + calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) every: 10s info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) @@ -14,7 +14,7 @@ os: linux hosts: * # calc: $used * 100 / ($used + $cached + $free) - calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free) units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf new file mode 100644 index 000000000..1a3088a2a --- /dev/null +++ b/health/health.d/scaleio.conf @@ -0,0 +1,38 @@ + +# make sure scaleio is running + +template: scaleio_last_collected_secs + on: scaleio.system_capacity_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure Storage Pool capacity utilization is under limit + +template: scaleio_storage_pool_capacity_utilization + on: scaleio.storage_pool_capacity_utilization + calc: $used + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: Storage Pool capacity utilization + to: sysadmin + + +# make sure Sdc is connected to MDM + +template: scaleio_sdc_mdm_connection_state + on: scaleio.sdc_mdm_connection_state + calc: $connected + every: 10s + warn: $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + info: Sdc connection to MDM state + to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index ff3648626..f835f2aee 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -10,7 +10,7 @@ lookup: average -1m unaligned absolute of dropped units: packets every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10) + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) to: sysadmin diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf new file mode 100644 index 000000000..bdedc11a0 --- /dev/null +++ b/health/health.d/unbound.conf @@ -0,0 +1,35 @@ + +# make sure unbound is running + +template: unbound_last_collected_secs + on: unbound.queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure there is no overwritten/dropped queries in the request-list + +template: unbound_request_list_overwritten + on: unbound.request_list_jostle_list + lookup: average -60s unaligned absolute match-names of overwritten + units: queries + every: 10s + warn: $this > 5 + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of overwritten queries in the request-list + to: sysadmin + +template: unbound_request_list_dropped + on: unbound.request_list_jostle_list + lookup: average -60s unaligned absolute match-names of dropped + units: queries + every: 10s + warn: $this > 0 + delay: up 10 down 5m multiplier 1.5 max 1h + info: the number of dropped queries in the request-list + to: sysadmin diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf new file mode 100644 index 000000000..36bbaf82b --- /dev/null +++ b/health/health.d/vernemq.conf @@ -0,0 +1,399 @@ + +# Availability + +template: vernemq_last_collected_secs + on: vernemq.node_uptime + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# Socket errors + +template: vernemq_socket_errors + on: vernemq.socket_errors + lookup: sum -1m unaligned absolute of socket_error + units: errors + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: socket errors in the last minute + to: sysadmin + +# Queues dropped/expired/unhandled PUBLISH messages + +template: vernemq_queue_message_drop + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_drop + units: dropped messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: dropped messaged due to full queues in the last minute + to: sysadmin + +template: vernemq_queue_message_expired + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_expired + units: expired messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (15)) + delay: down 5m multiplier 1.5 max 2h + info: messages which expired before delivery in the last minute + to: sysadmin + +template: vernemq_queue_message_unhandled + on: vernemq.queue_undelivered_messages + lookup: sum -1m unaligned absolute of queue_message_unhandled + units: unhandled messages + every: 10s + warn: $this > (($status == $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 2h + info: unhandled messages (connections with clean session=true) in the last minute + to: sysadmin + +# Erlang VM + +template: vernemq_average_scheduler_utilization + on: vernemq.average_scheduler_utilization + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average scheduler utilization for the last 10 minutes + to: sysadmin + +# Cluster communication and netsplits + +template: vernemq_cluster_dropped + on: vernemq.cluster_dropped + lookup: average -1m unaligned + units: KiB/s + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: the amount of traffic dropped during communication with the cluster nodes in the last minute + to: sysadmin + +template: vernemq_netsplits + on: vernemq.netsplits + lookup: sum -1m unaligned absolute of netsplit_detected + units: netsplits + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: detected netsplits in the last minute + to: sysadmin + +# Unsuccessful CONNACK + +template: vernemq_mqtt_connack_sent_reason_success + on: vernemq.mqtt_connack_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v3/v5 CONNACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_connack_sent_reason_unsuccessful + on: vernemq.mqtt_connack_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_connack_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v3/v5 CONNACK sent in the last minute + to: sysadmin + +# Not normal DISCONNECT + +template: vernemq_mqtt_disconnect_received_reason_normal_disconnect + on: vernemq.mqtt_disconnect_received_reason + lookup: sum -1m unaligned absolute match-names of normal_disconnect + units: packets + every: 10s + info: normal v5 DISCONNECT received in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect + on: vernemq.mqtt_disconnect_sent_reason + lookup: sum -1m unaligned absolute match-names of normal_disconnect + units: packets + every: 10s + info: normal v5 DISCONNECT sent in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_received_reason_not_normal + on: vernemq.mqtt_disconnect_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: not normal v5 DISCONNECT received in the last minute + to: sysadmin + +template: vernemq_mqtt_disconnect_sent_reason_not_normal + on: vernemq.mqtt_disconnect_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: not normal v5 DISCONNECT sent in the last minute + to: sysadmin + +# SUBSCRIBE errors and unauthorized attempts + +template: vernemq_mqtt_subscribe_error + on: vernemq.mqtt_subscribe_error + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 SUBSCRIBE operations in the last minute + to: sysadmin + +template: vernemq_mqtt_subscribe_auth_error + on: vernemq.mqtt_subscribe_auth_error + lookup: sum -1m unaligned absolute + units: attempts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute + to: sysadmin + +# UNSUBSCRIBE errors + +template: vernemq_mqtt_unsubscribe_error + on: vernemq.mqtt_unsubscribe_error + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 UNSUBSCRIBE operations in the last minute + to: sysadmin + +# PUBLISH errors and unauthorized attempts + +template: vernemq_mqtt_publish_errors + on: vernemq.mqtt_publish_errors + lookup: sum -1m unaligned absolute + units: failed ops + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: failed v3/v5 PUBLISH operations in the last minute + to: sysadmin + +template: vernemq_mqtt_publish_auth_errors + on: vernemq.mqtt_publish_auth_errors + lookup: sum -1m unaligned absolute + units: attempts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unauthorized v3/v5 PUBLISH attempts in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBACK + +template: vernemq_mqtt_puback_received_reason_success + on: vernemq.mqtt_puback_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBACK received in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_sent_reason_success + on: vernemq.mqtt_puback_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_received_reason_unsuccessful + on: vernemq.mqtt_puback_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_puback_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBACK received in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_sent_reason_unsuccessful + on: vernemq.mqtt_puback_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_puback_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBACK sent in the last minute + to: sysadmin + +template: vernemq_mqtt_puback_unexpected + on: vernemq.mqtt_puback_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3/v5 PUBACK received in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBREC + +template: vernemq_mqtt_pubrec_received_reason_success + on: vernemq.mqtt_pubrec_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREC received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_sent_reason_success + on: vernemq.mqtt_pubrec_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREC sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_received_reason_unsuccessful + on: vernemq.mqtt_pubrec_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrec_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREC received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_sent_reason_unsuccessful + on: vernemq.mqtt_pubrec_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrec_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREC sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrec_invalid_error + on: vernemq.mqtt_pubrec_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3 PUBREC received in the last minute + to: sysadmin + +# Unsuccessful PUBREL + +template: vernemq_mqtt_pubrel_received_reason_success + on: vernemq.mqtt_pubrel_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREL received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_sent_reason_success + on: vernemq.mqtt_pubrel_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBREL sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_received_reason_unsuccessful + on: vernemq.mqtt_pubrel_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrel_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREL received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubrel_sent_reason_unsuccessful + on: vernemq.mqtt_pubrel_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubrel_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBREL sent in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBCOMP + +template: vernemq_mqtt_pubcomp_received_reason_success + on: vernemq.mqtt_pubcomp_received_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBCOMP received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_sent_reason_success + on: vernemq.mqtt_pubcomp_sent_reason + lookup: sum -1m unaligned absolute match-names of success + units: packets + every: 10s + info: successful v5 PUBCOMP sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_received_reason_unsuccessful + on: vernemq.mqtt_pubcomp_received_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubcomp_received_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBCOMP received in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful + on: vernemq.mqtt_pubcomp_sent_reason + lookup: sum -1m unaligned absolute + calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success + units: packets + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unsuccessful v5 PUBCOMP sent in the last minute + to: sysadmin + +template: vernemq_mqtt_pubcomp_unexpected + on: vernemq.mqtt_pubcomp_invalid_error + lookup: sum -1m unaligned absolute + units: messages + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + info: unexpected v3/v5 PUBCOMP received in the last minute + to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 1aefd7b00..44de38a48 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -111,7 +111,6 @@ families: * units: % every: 10s warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 ) - crit: ($1m_total_requests > 120) ? ($this > 5) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h info: the ratio of unmatched lines, over the last minute to: webmaster @@ -235,7 +234,6 @@ families: * units: % every: 10s warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 ) - crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h info: the ratio of unmatched lines, over the last minute to: webmaster diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf new file mode 100644 index 000000000..275e11dd9 --- /dev/null +++ b/health/health.d/whoisquery.conf @@ -0,0 +1,24 @@ + +# make sure whoisquery is running + +template: whoisquery_last_collected_secs + on: whoisquery.time_until_expiration + calc: $now - $last_collected_t + units: seconds ago + every: 60s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +template: whoisquery_days_until_expiration + on: whoisquery.time_until_expiration + calc: $expiry + units: seconds + every: 60s + warn: $this < $days_until_expiration_warning*24*60*60 + crit: $this < $days_until_expiration_critical*24*60*60 + info: domain time until expiration + to: webmaster diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index a56f48fc3..dfca37706 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -22,3 +22,11 @@ template: x509check_days_until_expiration crit: $this < $days_until_expiration_critical*24*60*60 info: certificate time until expiration to: webmaster + +template: x509check_revocation_status + on: x509check.revocation_status + calc: $revoked + every: 60s + crit: $this != nan AND $this != 0 + info: certificate revocation status + to: webmaster |