summary | refs | log | tree | commit | diff | stats
path: root/health/health.d
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-11-30 18:47:00 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-11-30 18:47:00 +0000
commit: 03bf87dcb06f7021bfb2df2fa8691593c6148aff (patch)
tree: e16b06711a2ed77cafb4b7754be0220c3d14a9d7 /health/health.d
parent: Adding upstream version 1.36.1. (diff)
download: netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.tar.xz
netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.zip
Adding upstream version 1.37.0. upstream/1.37.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- health/health.d/dns_query.conf 17
-rw-r--r-- health/health.d/go.d.plugin.conf 2
-rw-r--r-- health/health.d/ml.conf 21
-rw-r--r-- health/health.d/mysql.conf 34
-rw-r--r-- health/health.d/nvme.conf 15
-rw-r--r-- health/health.d/pihole.conf 23
-rw-r--r-- health/health.d/ping.conf 50
-rw-r--r-- health/health.d/postgres.conf 214
-rw-r--r-- health/health.d/python.d.plugin.conf 2
-rw-r--r-- health/health.d/redis.conf 29
-rw-r--r-- health/health.d/systemdunits.conf 105
-rw-r--r-- health/health.d/tcp_resets.conf 4
-rw-r--r-- health/health.d/timex.conf 2
13 files changed, 417 insertions, 101 deletions
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index ec4937c0a..b9d6c2374 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -1,15 +1,14 @@
-
# detect dns query failure
- template: dns_query_time_query_time
- on: dns_query_time.query_time
- class: Latency
+ template: dns_query_query_status
+ on: dns_query.query_status
+ class: Errors
type: DNS
component: DNS
- lookup: average -10s unaligned foreach *
- units: ms
+ calc: $success
+ units: status
every: 10s
- warn: $this == nan
- delay: up 20s down 5m multiplier 1.5 max 1h
- info: average DNS query round trip time over the last 10 seconds
+ warn: $this != nan && $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: DNS request type $label:record_type to server $label:server is unsuccessful
to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index a84ab342f..cd87fe0e7 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -3,7 +3,7 @@
template: go.d_job_last_collected_secs
on: netdata.go_plugin_execution_time
- class: Error
+ class: Errors
type: Netdata
component: go.d.plugin
module: !* *
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
index 9bcc81e76..6836ce7b1 100644
--- a/health/health.d/ml.conf
+++ b/health/health.d/ml.conf
@@ -1,10 +1,26 @@
# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly
# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's
# native anomaly detection here:
-# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal
# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+# node level anomaly rate example
+# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate
+# if node level anomaly rate is between 1-5% then warning (pick your own threshold that works best via trial and error).
+# if node level anomaly rate is above 5% then critical (pick your own threshold that works best via trial and error).
+# template: ml_1min_node_ar
+# on: anomaly_detection.anomaly_rate
+# os: linux
+# hosts: *
+# lookup: average -1m foreach anomaly_rate
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (1) : (5))
+# crit: $this > (($status == $CRITICAL) ? (5) : (100))
+# info: rolling 1min node level anomaly rate
+
# alert per dimension example
# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
@@ -33,4 +49,5 @@
# every: 30s
# warn: $this > (($status >= $WARNING) ? (5) : (20))
# crit: $this > (($status == $CRITICAL) ? (20) : (100))
-# info: rolling 5min anomaly rate for system.cpu chart \ No newline at end of file
+# info: rolling 5min anomaly rate for system.cpu chart
+
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 34452d983..3941c71cc 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -114,10 +114,10 @@ component: MySQL
class: Utilization
type: Database
component: MySQL
- lookup: max -2m absolute
+ lookup: max -2m at -1m unaligned
units: nodes
every: 10s
- info: maximum galera cluster size in the last 2 minutes
+ info: maximum galera cluster size in the last 2 minutes starting one minute ago
to: dba
template: mysql_galera_cluster_size
@@ -136,20 +136,29 @@ component: MySQL
# galera node state
- template: mysql_galera_cluster_state
+ template: mysql_galera_cluster_state_warn
on: mysql.galera_cluster_state
class: Errors
type: Database
component: MySQL
- calc: $state
+ calc: $donor + $joined
every: 10s
- warn: $this == 2 OR $this == 3
- crit: $this == 0 OR $this == 1 OR $this >= 5
+ warn: $this != nan AND $this != 0
delay: up 30s down 5m multiplier 1.5 max 1h
- info: galera node state \
- (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
+ info: galera node state is either Donor/Desynced or Joined.
to: dba
+ template: mysql_galera_cluster_state_crit
+ on: mysql.galera_cluster_state
+ class: Errors
+ type: Database
+component: MySQL
+ calc: $undefined + $joining + $error
+ every: 10s
+ crit: $this != nan AND $this != 0
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: galera node state is either Undefined or Joining or Error.
+ to: dba
# galera node status
@@ -158,11 +167,10 @@ component: MySQL
class: Errors
type: Database
component: MySQL
- calc: $wsrep_cluster_status
+ calc: $primary
every: 10s
- crit: $mysql_galera_cluster_state != nan AND $this != 0
+ crit: $this != nan AND $this != 1
delay: up 30s down 5m multiplier 1.5 max 1h
- info: galera node cluster component status \
- (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
- Any other value than primary indicates that the node is part of a nonoperational component.
+ info: galera node is part of a nonoperational component. \
+ This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations.
to: dba
diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf
new file mode 100644
index 000000000..5f729d52b
--- /dev/null
+++ b/health/health.d/nvme.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: nvme_device_critical_warnings_state
+ families: *
+ on: nvme.device_critical_warnings_state
+ class: Errors
+ type: System
+component: Disk
+ lookup: max -30s unaligned
+ units: state
+ every: 10s
+ crit: $this != nan AND $this != 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: NVMe device $label:device has critical warnings
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 2e5c1cbfd..ee6c57cc5 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -15,21 +15,6 @@ component: Pi-hole
info: gravity.list (blocklist) file last update time
to: sysadmin
-# Gravity file check (gravity.list).
-
- template: pihole_blocklist_gravity_file
- on: pihole.blocklist_last_update
- class: Errors
- type: Ad Filtering
-component: Pi-hole
- every: 10s
- units: boolean
- calc: $file_exists
- crit: $this != 1
- delay: up 2m down 5m
- info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
- to: sysadmin
-
# Pi-hole's ability to block unwanted domains.
# Should be enabled. The whole point of Pi-hole!
@@ -39,9 +24,9 @@ component: Pi-hole
type: Ad Filtering
component: Pi-hole
every: 10s
- units: boolean
- calc: $enabled
- warn: $this != 1
+ units: status
+ calc: $disabled
+ warn: $this != nan AND $this == 1
delay: up 2m down 5m
- info: unwanted domains blocking status (0: disabled, 1: enabled)
+ info: unwanted domains blocking is disabled
to: sysadmin
diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf
new file mode 100644
index 000000000..cbe7c30c9
--- /dev/null
+++ b/health/health.d/ping.conf
@@ -0,0 +1,50 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: ping_host_reachable
+ families: *
+ on: ping.host_packet_loss
+ class: Errors
+ type: Other
+component: Network
+ lookup: average -30s unaligned of loss
+ calc: $this != nan AND $this < 100
+ units: up/down
+ every: 10s
+ crit: $this == 0
+ delay: down 30m multiplier 1.5 max 2h
+ info: network host $label:host reachability status
+ to: sysadmin
+
+ template: ping_packet_loss
+ families: *
+ on: ping.host_packet_loss
+ class: Errors
+ type: Other
+component: Network
+ lookup: average -10m unaligned of loss
+ green: 5
+ red: 10
+ units: %
+ every: 10s
+ warn: $this > $green
+ crit: $this > $red
+ delay: down 30m multiplier 1.5 max 2h
+ info: packet loss percentage to the network host $label:host over the last 10 minutes
+ to: sysadmin
+
+ template: ping_host_latency
+ families: *
+ on: ping.host_rtt
+ class: Latency
+ type: Other
+component: Network
+ lookup: average -10s unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: $this > $green OR $max > $red
+ crit: $this > $red
+ delay: down 30m multiplier 1.5 max 2h
+ info: average latency to the network host $label:host over the last 10 seconds
+ to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 000000000..66d034cfe
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,214 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: postgres_total_connection_utilization
+ on: postgres.connections_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average total connection utilization over the last minute
+ to: dba
+
+ template: postgres_acquired_locks_utilization
+ on: postgres.locks_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average acquired locks utilization over the last minute
+ to: dba
+
+ template: postgres_txid_exhaustion_perc
+ on: postgres.txid_exhaustion_perc
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $txid_exhaustion
+ units: %
+ every: 1m
+ warn: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ info: percent towards TXID wraparound
+ to: dba
+
+# Database alarms
+
+ template: postgres_db_cache_io_ratio
+ on: postgres.db_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cache hit ratio in db $label:database over the last minute
+ to: dba
+
+ template: postgres_db_transactions_rollback_ratio
+ on: postgres.db_transactions_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -5m unaligned of rollback
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average aborted transactions percentage in db $label:database over the last five minutes
+ to: dba
+
+ template: postgres_db_deadlocks_rate
+ on: postgres.db_deadlocks_rate
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: sum -1m unaligned of deadlocks
+ units: deadlocks
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of deadlocks detected in db $label:database in the last minute
+ to: dba
+
+# Table alarms
+
+ template: postgres_table_cache_io_ratio
+ on: postgres.table_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cache hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_index_cache_io_ratio
+ on: postgres.table_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average index cache hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_toast_cache_io_ratio
+ on: postgres.table_toast_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average TOAST hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_toast_index_cache_io_ratio
+ on: postgres.table_toast_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average index TOAST hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_bloat_size_perc
+ on: postgres.table_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $bloat
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ info: bloat size percentage in db $label:database table $label:table
+ to: dba
+
+ template: postgres_table_last_autovacuum_time
+ on: postgres.table_autovacuum_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: !*
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon
+ to: dba
+
+ template: postgres_table_last_autoanalyze_time
+ on: postgres.table_autoanalyze_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: !*
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon
+ to: dba
+
+# Index alarms
+
+ template: postgres_index_bloat_size_perc
+ on: postgres.index_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $bloat
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ info: bloat size percentage in db $label:database table $label:table index $label:index
+ to: dba
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index e3b3d11cf..0e81a482f 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -3,7 +3,7 @@
template: python.d_job_last_collected_secs
on: netdata.pythond_runtime
- class: Error
+ class: Errors
type: Netdata
component: python.d.plugin
module: !* *
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index cad5230c5..34d00b5df 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -1,3 +1,18 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: redis_connections_rejected
+ families: *
+ on: redis.connections
+ class: Errors
+ type: KV Storage
+component: Redis
+ lookup: sum -1m unaligned of rejected
+ every: 10s
+ units: connections
+ warn: $this > 0
+ info: connections rejected because of maxclients limit in the last minute
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
template: redis_bgsave_broken
families: *
@@ -26,3 +41,17 @@ component: Redis
info: duration of the on-going RDB save operation
delay: down 5m multiplier 1.5 max 1h
to: dba
+
+ template: redis_master_link_down
+ families: *
+ on: redis.master_link_down_since_time
+ class: Errors
+ type: KV Storage
+component: Redis
+ every: 10s
+ calc: $time
+ units: seconds
+ crit: $this != nan AND $this > 0
+ info: time elapsed since the link between master and slave is down
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index 38213a8db..531d62fac 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -1,142 +1,141 @@
-## Check if the are any systemd units in the failed state (crashed).
-## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed.
+# you can disable an alarm notification by setting the 'to' line to: silent
## Service units
- template: systemd_service_units_state
- on: systemd.service_units_state
+ template: systemd_service_unit_failed_state
+ on: systemd.service_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd service units are in the failed state
+ info: systemd service unit in the failed state
to: sysadmin
## Socket units
- template: systemd_socket_units_state
+ template: systemd_socket_unit_failed_state
on: systemd.socket_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd socket units are in the failed state
+ info: systemd socket unit in the failed state
to: sysadmin
## Target units
- template: systemd_target_units_state
+ template: systemd_target_unit_failed_state
on: systemd.target_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd target units are in the failed state
+ info: systemd target unit in the failed state
to: sysadmin
## Path units
- template: systemd_path_units_state
+ template: systemd_path_unit_failed_state
on: systemd.path_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd path units are in the failed state
+ info: systemd path unit in the failed state
to: sysadmin
## Device units
- template: systemd_device_units_state
+ template: systemd_device_unit_failed_state
on: systemd.device_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more the systemd device units are in the failed state
+ info: systemd device unit in the failed state
to: sysadmin
## Mount units
- template: systemd_mount_units_state
+ template: systemd_mount_unit_failed_state
on: systemd.mount_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more the systemd mount units are in the failed state
+ info: systemd mount unit in the failed state
to: sysadmin
## Automount units
- template: systemd_automount_units_state
+ template: systemd_automount_unit_failed_state
on: systemd.automount_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd automount units are in the failed state
+ info: systemd automount unit in the failed state
to: sysadmin
## Swap units
- template: systemd_swap_units_state
+ template: systemd_swap_unit_failed_state
on: systemd.swap_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd swap units are in the failed state
+ info: systemd swap unit in the failed state
to: sysadmin
## Scope units
- template: systemd_scope_units_state
+ template: systemd_scope_unit_failed_state
on: systemd.scope_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd scope units are in the failed state
+ info: systemd scope unit in the failed state
to: sysadmin
## Slice units
- template: systemd_slice_units_state
+ template: systemd_slice_unit_failed_state
on: systemd.slice_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd slice units are in the failed state
+ info: systemd slice unit in the failed state
to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 35cb6366c..ff116db64 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -26,7 +26,7 @@ component: Network
lookup: average -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
+ warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
info: average number of sent TCP RESETS over the last 10 seconds. \
@@ -60,7 +60,7 @@ component: Network
lookup: average -10s unaligned absolute of AttemptFails
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
+ warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
info: average number of received TCP RESETS over the last 10 seconds. \
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
index 23c18ba10..2e9b1a3cf 100644
--- a/health/health.d/timex.conf
+++ b/health/health.d/timex.conf
@@ -5,7 +5,7 @@
alarm: system_clock_sync_state
on: system.clock_sync_state
os: linux
- class: Error
+ class: Errors
type: System
component: Clock
calc: $state