diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-11-30 18:47:00 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-11-30 18:47:00 +0000 |
commit | 03bf87dcb06f7021bfb2df2fa8691593c6148aff (patch) | |
tree | e16b06711a2ed77cafb4b7754be0220c3d14a9d7 /health/health.d | |
parent | Adding upstream version 1.36.1. (diff) | |
download | netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.tar.xz netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.zip |
Adding upstream version 1.37.0. (upstream/1.37.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | health/health.d/dns_query.conf | 17 | ||||
-rw-r--r-- | health/health.d/go.d.plugin.conf | 2 | ||||
-rw-r--r-- | health/health.d/ml.conf | 21 | ||||
-rw-r--r-- | health/health.d/mysql.conf | 34 | ||||
-rw-r--r-- | health/health.d/nvme.conf | 15 | ||||
-rw-r--r-- | health/health.d/pihole.conf | 23 | ||||
-rw-r--r-- | health/health.d/ping.conf | 50 | ||||
-rw-r--r-- | health/health.d/postgres.conf | 214 | ||||
-rw-r--r-- | health/health.d/python.d.plugin.conf | 2 | ||||
-rw-r--r-- | health/health.d/redis.conf | 29 | ||||
-rw-r--r-- | health/health.d/systemdunits.conf | 105 | ||||
-rw-r--r-- | health/health.d/tcp_resets.conf | 4 | ||||
-rw-r--r-- | health/health.d/timex.conf | 2 |
13 files changed, 417 insertions, 101 deletions
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index ec4937c0a..b9d6c2374 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -1,15 +1,14 @@ - # detect dns query failure - template: dns_query_time_query_time - on: dns_query_time.query_time - class: Latency + template: dns_query_query_status + on: dns_query.query_status + class: Errors type: DNS component: DNS - lookup: average -10s unaligned foreach * - units: ms + calc: $success + units: status every: 10s - warn: $this == nan - delay: up 20s down 5m multiplier 1.5 max 1h - info: average DNS query round trip time over the last 10 seconds + warn: $this != nan && $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + info: DNS request type $label:record_type to server $label:server is unsuccessful to: sysadmin diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf index a84ab342f..cd87fe0e7 100644 --- a/health/health.d/go.d.plugin.conf +++ b/health/health.d/go.d.plugin.conf @@ -3,7 +3,7 @@ template: go.d_job_last_collected_secs on: netdata.go_plugin_execution_time - class: Error + class: Errors type: Netdata component: go.d.plugin module: !* * diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf index 9bcc81e76..6836ce7b1 100644 --- a/health/health.d/ml.conf +++ b/health/health.d/ml.conf @@ -1,10 +1,26 @@ # below are some examples of using the `anomaly-bit` option to define alerts based on anomaly # rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's # native anomaly detection here: -# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal +# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal # examples below are commented, you would need to uncomment and adjust as desired to enable them. 
+# node level anomaly rate example +# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate +# if node level anomaly rate is between 1-5% then warning (pick your own threshold that works best via trial and error). +# if node level anomaly rate is above 5% then critical (pick your own threshold that works best via trial and error). +# template: ml_1min_node_ar +# on: anomaly_detection.anomaly_rate +# os: linux +# hosts: * +# lookup: average -1m foreach anomaly_rate +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (1) : (5)) +# crit: $this > (($status == $CRITICAL) ? (5) : (100)) +# info: rolling 1min node level anomaly rate + # alert per dimension example # if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error). # if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). @@ -33,4 +49,5 @@ # every: 30s # warn: $this > (($status >= $WARNING) ? (5) : (20)) # crit: $this > (($status == $CRITICAL) ? (20) : (100)) -# info: rolling 5min anomaly rate for system.cpu chart
\ No newline at end of file +# info: rolling 5min anomaly rate for system.cpu chart + diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 34452d983..3941c71cc 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -114,10 +114,10 @@ component: MySQL class: Utilization type: Database component: MySQL - lookup: max -2m absolute + lookup: max -2m at -1m unaligned units: nodes every: 10s - info: maximum galera cluster size in the last 2 minutes + info: maximum galera cluster size in the last 2 minutes starting one minute ago to: dba template: mysql_galera_cluster_size @@ -136,20 +136,29 @@ component: MySQL # galera node state - template: mysql_galera_cluster_state + template: mysql_galera_cluster_state_warn on: mysql.galera_cluster_state class: Errors type: Database component: MySQL - calc: $state + calc: $donor + $joined every: 10s - warn: $this == 2 OR $this == 3 - crit: $this == 0 OR $this == 1 OR $this >= 5 + warn: $this != nan AND $this != 0 delay: up 30s down 5m multiplier 1.5 max 1h - info: galera node state \ - (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent) + info: galera node state is either Donor/Desynced or Joined. to: dba + template: mysql_galera_cluster_state_crit + on: mysql.galera_cluster_state + class: Errors + type: Database +component: MySQL + calc: $undefined + $joining + $error + every: 10s + crit: $this != nan AND $this != 0 + delay: up 30s down 5m multiplier 1.5 max 1h + info: galera node state is either Undefined or Joining or Error. 
+ to: dba # galera node status @@ -158,11 +167,10 @@ component: MySQL class: Errors type: Database component: MySQL - calc: $wsrep_cluster_status + calc: $primary every: 10s - crit: $mysql_galera_cluster_state != nan AND $this != 0 + crit: $this != nan AND $this != 1 delay: up 30s down 5m multiplier 1.5 max 1h - info: galera node cluster component status \ - (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \ - Any other value than primary indicates that the node is part of a nonoperational component. + info: galera node is part of a nonoperational component. \ + This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. to: dba diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf new file mode 100644 index 000000000..5f729d52b --- /dev/null +++ b/health/health.d/nvme.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: nvme_device_critical_warnings_state + families: * + on: nvme.device_critical_warnings_state + class: Errors + type: System +component: Disk + lookup: max -30s unaligned + units: state + every: 10s + crit: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 2h + info: NVMe device $label:device has critical warnings + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 2e5c1cbfd..ee6c57cc5 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -15,21 +15,6 @@ component: Pi-hole info: gravity.list (blocklist) file last update time to: sysadmin -# Gravity file check (gravity.list). 
- - template: pihole_blocklist_gravity_file - on: pihole.blocklist_last_update - class: Errors - type: Ad Filtering -component: Pi-hole - every: 10s - units: boolean - calc: $file_exists - crit: $this != 1 - delay: up 2m down 5m - info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists) - to: sysadmin - # Pi-hole's ability to block unwanted domains. # Should be enabled. The whole point of Pi-hole! @@ -39,9 +24,9 @@ component: Pi-hole type: Ad Filtering component: Pi-hole every: 10s - units: boolean - calc: $enabled - warn: $this != 1 + units: status + calc: $disabled + warn: $this != nan AND $this == 1 delay: up 2m down 5m - info: unwanted domains blocking status (0: disabled, 1: enabled) + info: unwanted domains blocking is disabled to: sysadmin diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf new file mode 100644 index 000000000..cbe7c30c9 --- /dev/null +++ b/health/health.d/ping.conf @@ -0,0 +1,50 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: ping_host_reachable + families: * + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -30s unaligned of loss + calc: $this != nan AND $this < 100 + units: up/down + every: 10s + crit: $this == 0 + delay: down 30m multiplier 1.5 max 2h + info: network host $label:host reachability status + to: sysadmin + + template: ping_packet_loss + families: * + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -10m unaligned of loss + green: 5 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + info: packet loss percentage to the network host $label:host over the last 10 minutes + to: sysadmin + + template: ping_host_latency + families: * + on: ping.host_rtt + class: Latency + type: Other +component: Network + lookup: average -10s unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + 
warn: $this > $green OR $max > $red + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + info: average latency to the network host $label:host over the last 10 seconds + to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf new file mode 100644 index 000000000..66d034cfe --- /dev/null +++ b/health/health.d/postgres.conf @@ -0,0 +1,214 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: postgres_total_connection_utilization + on: postgres.connections_utilization + class: Utilization + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: average total connection utilization over the last minute + to: dba + + template: postgres_acquired_locks_utilization + on: postgres.locks_utilization + class: Utilization + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (15) : (20)) + delay: down 15m multiplier 1.5 max 1h + info: average acquired locks utilization over the last minute + to: dba + + template: postgres_txid_exhaustion_perc + on: postgres.txid_exhaustion_perc + class: Utilization + type: Database +component: PostgreSQL + hosts: * + calc: $txid_exhaustion + units: % + every: 1m + warn: $this > 90 + delay: down 15m multiplier 1.5 max 1h + info: percent towards TXID wraparound + to: dba + +# Database alarms + + template: postgres_db_cache_io_ratio + on: postgres.db_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average cache hit ratio in db $label:database over the last minute + to: dba + + template: postgres_db_transactions_rollback_ratio + on: postgres.db_transactions_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -5m unaligned of rollback + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (2)) + delay: down 15m multiplier 1.5 max 1h + info: average aborted transactions percentage in db $label:database over the last five minutes + to: dba + + template: postgres_db_deadlocks_rate + on: postgres.db_deadlocks_rate + class: Errors + type: Database +component: PostgreSQL + hosts: * + lookup: sum -1m unaligned of deadlocks + units: deadlocks + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 15m multiplier 1.5 max 1h + info: number of deadlocks detected in db $label:database in the last minute + to: dba + +# Table alarms + + template: postgres_table_cache_io_ratio + on: postgres.table_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average cache hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_index_cache_io_ratio + on: postgres.table_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average index cache hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_toast_cache_io_ratio + on: postgres.table_toast_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average TOAST hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_toast_index_cache_io_ratio + on: postgres.table_toast_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average index TOAST hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_bloat_size_perc + on: postgres.table_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + hosts: * + calc: $bloat + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + info: bloat size percentage in db $label:database table $label:table + to: dba + + template: postgres_table_last_autovacuum_time + on: postgres.table_autovacuum_since_time + class: Errors + type: Database +component: PostgreSQL + hosts: !* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon + to: dba + + template: postgres_table_last_autoanalyze_time + on: postgres.table_autoanalyze_since_time + class: Errors + type: Database +component: PostgreSQL + hosts: !* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon + to: dba + +# Index alarms + + template: postgres_index_bloat_size_perc + on: postgres.index_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + hosts: * + calc: $bloat + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + info: bloat size percentage in db $label:database table $label:table index $label:index + to: dba diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf index e3b3d11cf..0e81a482f 100644 --- a/health/health.d/python.d.plugin.conf +++ b/health/health.d/python.d.plugin.conf @@ -3,7 +3,7 @@ template: python.d_job_last_collected_secs on: netdata.pythond_runtime - class: Error + class: Errors type: Netdata component: python.d.plugin module: !* * diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index cad5230c5..34d00b5df 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -1,3 +1,18 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: redis_connections_rejected + families: * + on: redis.connections + class: Errors + type: KV Storage +component: Redis + lookup: sum -1m unaligned of rejected + every: 10s + units: connections + warn: $this > 0 + info: connections rejected because of maxclients limit in the last minute + delay: down 5m multiplier 1.5 max 1h + to: dba template: redis_bgsave_broken families: * @@ -26,3 +41,17 @@ component: Redis info: duration of the on-going RDB save operation delay: down 5m multiplier 1.5 max 1h to: dba + + template: redis_master_link_down + families: * + on: redis.master_link_down_since_time + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $time + units: seconds + crit: $this != nan AND $this > 0 + info: time elapsed since the link between master and slave is down + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf index 38213a8db..531d62fac 100644 --- a/health/health.d/systemdunits.conf +++ b/health/health.d/systemdunits.conf @@ -1,142 +1,141 @@ -## Check if the are any systemd units in the failed state (crashed). 
-## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed. +# you can disable an alarm notification by setting the 'to' line to: silent ## Service units - template: systemd_service_units_state - on: systemd.service_units_state + template: systemd_service_unit_failed_state + on: systemd.service_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd service units are in the failed state + info: systemd service unit in the failed state to: sysadmin ## Socket units - template: systemd_socket_units_state + template: systemd_socket_unit_failed_state on: systemd.socket_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd socket units are in the failed state + info: systemd socket unit in the failed state to: sysadmin ## Target units - template: systemd_target_units_state + template: systemd_target_unit_failed_state on: systemd.target_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd target units are in the failed state + info: systemd target unit in the failed state to: sysadmin ## Path units - template: systemd_path_units_state + template: systemd_path_unit_failed_state on: systemd.path_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND 
$this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd path units are in the failed state + info: systemd path unit in the failed state to: sysadmin ## Device units - template: systemd_device_units_state + template: systemd_device_unit_failed_state on: systemd.device_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more the systemd device units are in the failed state + info: systemd device unit in the failed state to: sysadmin ## Mount units - template: systemd_mount_units_state + template: systemd_mount_unit_failed_state on: systemd.mount_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more the systemd mount units are in the failed state + info: systemd mount units in the failed state to: sysadmin ## Automount units - template: systemd_automount_units_state + template: systemd_automount_unit_failed_state on: systemd.automount_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd automount units are in the failed state + info: systemd automount unit in the failed state to: sysadmin ## Swap units - template: systemd_swap_units_state + template: systemd_swap_unit_failed_state on: systemd.swap_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - 
warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd swap units are in the failed state + info: systemd swap units in the failed state to: sysadmin ## Scope units - template: systemd_scope_units_state + template: systemd_scope_unit_failed_state on: systemd.scope_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd scope units are in the failed state + info: systemd scope units in the failed state to: sysadmin ## Slice units - template: systemd_slice_units_state + template: systemd_slice_unit_failed_state on: systemd.slice_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd slice units are in the failed state + info: systemd slice units in the failed state to: sysadmin diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 35cb6366c..ff116db64 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -26,7 +26,7 @@ component: Network lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification info: average number of sent TCP RESETS over the last 10 seconds. 
\ @@ -60,7 +60,7 @@ component: Network lookup: average -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification info: average number of received TCP RESETS over the last 10 seconds. \ diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf index 23c18ba10..2e9b1a3cf 100644 --- a/health/health.d/timex.conf +++ b/health/health.d/timex.conf @@ -5,7 +5,7 @@ alarm: system_clock_sync_state on: system.clock_sync_state os: linux - class: Error + class: Errors type: System component: Clock calc: $state |