Adding upstream version 1.37.0. (upstream/1.37.0)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-11-30 18:47:00 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-11-30 18:47:00 +0000
commit: 03bf87dcb06f7021bfb2df2fa8691593c6148aff (patch)
tree: e16b06711a2ed77cafb4b7754be0220c3d14a9d7 /health/health.d
parent: Adding upstream version 1.36.1. (diff)
download: netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.tar.xz
netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.zip
13 files changed, 417 insertions, 101 deletions
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index ec4937c0a..b9d6c2374 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -1,15 +1,14 @@
-
 # detect dns query failure
 
- template: dns_query_time_query_time
-       on: dns_query_time.query_time
-    class: Latency
+ template: dns_query_query_status
+       on: dns_query.query_status
+    class: Errors
      type: DNS
 component: DNS
-   lookup: average -10s unaligned foreach *
-    units: ms
+     calc: $success
+    units: status
     every: 10s
-     warn: $this == nan
-    delay: up 20s down 5m multiplier 1.5 max 1h
-     info: average DNS query round trip time over the last 10 seconds
+     warn: $this != nan && $this != 1
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: DNS request type $label:record_type to server $label:server is unsuccessful
        to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index a84ab342f..cd87fe0e7 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -3,7 +3,7 @@
 
  template: go.d_job_last_collected_secs
        on: netdata.go_plugin_execution_time
-    class: Error
+    class: Errors
      type: Netdata
 component: go.d.plugin
    module: !* *
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
index 9bcc81e76..6836ce7b1 100644
--- a/health/health.d/ml.conf
+++ b/health/health.d/ml.conf
@@ -1,10 +1,26 @@
 # below are some examples of using the `anomaly-bit` option to define alerts based on anomaly 
 # rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's 
 # native anomaly detection here: 
-# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal
 
 # examples below are commented, you would need to uncomment and adjust as desired to enable them.
 
+# node level anomaly rate example
+# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate
+# if node level anomaly rate is between 1-5% then warning (pick your own threshold that works best via trial and error).
+# if node level anomaly rate is above 5% then critical (pick your own threshold that works best via trial and error).
+# template: ml_1min_node_ar
+#       on: anomaly_detection.anomaly_rate
+#       os: linux
+#    hosts: *
+#   lookup: average -1m foreach anomaly_rate
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (1) : (5))
+#     crit: $this > (($status == $CRITICAL) ? (5) : (100))
+#     info: rolling 1min node level anomaly rate
+
 # alert per dimension example
 # if anomaly rate is between 5-20% then warning (pick your own threshold that works best via tial and error).
 # if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error).
@@ -33,4 +49,5 @@
 #    every: 30s
 #     warn: $this > (($status >= $WARNING)  ? (5) : (20))
 #     crit: $this > (($status == $CRITICAL) ? (20) : (100))
-#     info: rolling 5min anomaly rate for system.cpu chart
-\ No newline at end of file
+#     info: rolling 5min anomaly rate for system.cpu chart
+
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 34452d983..3941c71cc 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -114,10 +114,10 @@ component: MySQL
     class: Utilization
      type: Database
 component: MySQL
-   lookup: max -2m absolute
+   lookup: max -2m at -1m unaligned
     units: nodes
     every: 10s
-     info: maximum galera cluster size in the last 2 minutes
+     info: maximum galera cluster size in the last 2 minutes starting one minute ago
        to: dba
 
  template: mysql_galera_cluster_size
@@ -136,20 +136,29 @@ component: MySQL
 
 # galera node state
 
- template: mysql_galera_cluster_state
+ template: mysql_galera_cluster_state_warn
        on: mysql.galera_cluster_state
     class: Errors
      type: Database
 component: MySQL
-     calc: $state
+     calc: $donor + $joined
     every: 10s
-     warn: $this == 2 OR $this == 3
-     crit: $this == 0 OR $this == 1 OR $this >= 5
+     warn: $this != nan AND $this != 0
     delay: up 30s down 5m multiplier 1.5 max 1h
-     info: galera node state \
-           (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
+     info: galera node state is either Donor/Desynced or Joined.
        to: dba
 
+ template: mysql_galera_cluster_state_crit
+       on: mysql.galera_cluster_state
+    class: Errors
+     type: Database
+component: MySQL
+     calc: $undefined + $joining + $error
+    every: 10s
+     crit: $this != nan AND $this != 0
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: galera node state is either Undefined or Joining or Error.
+       to: dba
 
 # galera node status
 
@@ -158,11 +167,10 @@ component: MySQL
     class: Errors
      type: Database
 component: MySQL
-     calc: $wsrep_cluster_status
+     calc: $primary
     every: 10s
-     crit: $mysql_galera_cluster_state != nan AND $this != 0
+     crit: $this != nan AND $this != 1
     delay: up 30s down 5m multiplier 1.5 max 1h
-     info: galera node cluster component status \
-           (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
-           Any other value than primary indicates that the node is part of a nonoperational component.
+     info: galera node is part of a nonoperational component. \
+           This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations.
        to: dba
diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf
new file mode 100644
index 000000000..5f729d52b
--- /dev/null
+++ b/health/health.d/nvme.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: nvme_device_critical_warnings_state
+ families: *
+       on: nvme.device_critical_warnings_state
+    class: Errors
+     type: System
+component: Disk
+   lookup: max -30s unaligned
+    units: state
+    every: 10s
+     crit: $this != nan AND $this != 0
+    delay: down 5m multiplier 1.5 max 2h
+     info: NVMe device $label:device has critical warnings
+       to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 2e5c1cbfd..ee6c57cc5 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -15,21 +15,6 @@ component: Pi-hole
      info: gravity.list (blocklist) file last update time
        to: sysadmin
 
-# Gravity file check (gravity.list).
-
- template: pihole_blocklist_gravity_file
-       on: pihole.blocklist_last_update
-    class: Errors
-     type: Ad Filtering
-component: Pi-hole
-    every: 10s
-    units: boolean
-     calc: $file_exists
-     crit: $this != 1
-    delay: up 2m down 5m
-     info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
-       to: sysadmin
-
 # Pi-hole's ability to block unwanted domains.
 # Should be enabled. The whole point of Pi-hole!
 
@@ -39,9 +24,9 @@ component: Pi-hole
      type: Ad Filtering
 component: Pi-hole
     every: 10s
-    units: boolean
-     calc: $enabled
-     warn: $this != 1
+    units: status
+     calc: $disabled
+     warn: $this != nan AND $this == 1
     delay: up 2m down 5m
-     info: unwanted domains blocking status (0: disabled, 1: enabled)
+     info: unwanted domains blocking is disabled
        to: sysadmin
diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf
new file mode 100644
index 000000000..cbe7c30c9
--- /dev/null
+++ b/health/health.d/ping.conf
@@ -0,0 +1,50 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: ping_host_reachable
+ families: *
+       on: ping.host_packet_loss
+    class: Errors
+     type: Other
+component: Network
+   lookup: average -30s unaligned of loss
+     calc: $this != nan AND $this < 100
+    units: up/down
+    every: 10s
+     crit: $this == 0
+    delay: down 30m multiplier 1.5 max 2h
+     info: network host $label:host reachability status
+       to: sysadmin
+
+ template: ping_packet_loss
+ families: *
+       on: ping.host_packet_loss
+    class: Errors
+     type: Other
+component: Network
+   lookup: average -10m unaligned of loss
+    green: 5
+      red: 10
+    units: %
+    every: 10s
+     warn: $this > $green
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: packet loss percentage to the network host $label:host over the last 10 minutes
+       to: sysadmin
+
+ template: ping_host_latency
+ families: *
+       on: ping.host_rtt
+    class: Latency
+     type: Other
+component: Network
+   lookup: average -10s unaligned of avg
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: $this > $green OR $max > $red
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: average latency to the network host $label:host over the last 10 seconds
+       to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 000000000..66d034cfe
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,214 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: postgres_total_connection_utilization
+       on: postgres.connections_utilization
+    class: Utilization
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of used
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average total connection utilization over the last minute
+       to: dba
+
+ template: postgres_acquired_locks_utilization
+       on: postgres.locks_utilization
+    class: Utilization
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of used
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (15) : (20))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average acquired locks utilization over the last minute
+       to: dba
+
+ template: postgres_txid_exhaustion_perc
+       on: postgres.txid_exhaustion_perc
+    class: Utilization
+     type: Database
+component: PostgreSQL
+    hosts: *
+     calc: $txid_exhaustion
+    units: %
+    every: 1m
+     warn: $this > 90
+    delay: down 15m multiplier 1.5 max 1h
+     info: percent towards TXID wraparound
+       to: dba
+
+# Database alarms
+
+ template: postgres_db_cache_io_ratio
+       on: postgres.db_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cache hit ratio in db $label:database over the last minute
+       to: dba
+
+ template: postgres_db_transactions_rollback_ratio
+       on: postgres.db_transactions_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -5m unaligned of rollback
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (0) : (2))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average aborted transactions percentage in db $label:database over the last five minutes
+       to: dba
+
+ template: postgres_db_deadlocks_rate
+       on: postgres.db_deadlocks_rate
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: sum -1m unaligned of deadlocks
+    units: deadlocks
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (0) : (10))
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of deadlocks detected in db $label:database in the last minute
+       to: dba
+
+# Table alarms
+
+ template: postgres_table_cache_io_ratio
+       on: postgres.table_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cache hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_index_cache_io_ratio
+       on: postgres.table_index_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average index cache hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_toast_cache_io_ratio
+       on: postgres.table_toast_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average TOAST hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_toast_index_cache_io_ratio
+       on: postgres.table_toast_index_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average index TOAST hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_bloat_size_perc
+       on: postgres.table_bloat_size_perc
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: *
+     calc: $bloat
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (60) : (70))
+     crit: $this > (($status == $CRITICAL) ? (70) : (80))
+    delay: down 15m multiplier 1.5 max 1h
+     info: bloat size percentage in db $label:database table $label:table
+       to: dba
+
+ template: postgres_table_last_autovacuum_time
+       on: postgres.table_autovacuum_since_time
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: !*
+     calc: $time
+    units: seconds
+    every: 1m
+     warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+     info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon
+       to: dba
+
+ template: postgres_table_last_autoanalyze_time
+       on: postgres.table_autoanalyze_since_time
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: !*
+     calc: $time
+    units: seconds
+    every: 1m
+     warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+     info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon
+       to: dba
+
+# Index alarms
+
+ template: postgres_index_bloat_size_perc
+       on: postgres.index_bloat_size_perc
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: *
+     calc: $bloat
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (60) : (70))
+     crit: $this > (($status == $CRITICAL) ? (70) : (80))
+    delay: down 15m multiplier 1.5 max 1h
+     info: bloat size percentage in db $label:database table $label:table index $label:index
+       to: dba
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index e3b3d11cf..0e81a482f 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -3,7 +3,7 @@
 
  template: python.d_job_last_collected_secs
        on: netdata.pythond_runtime
-    class: Error
+    class: Errors
      type: Netdata
 component: python.d.plugin
    module: !* *
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index cad5230c5..34d00b5df 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -1,3 +1,18 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: redis_connections_rejected
+ families: *
+       on: redis.connections
+    class: Errors
+     type: KV Storage
+component: Redis
+   lookup: sum -1m unaligned of rejected
+    every: 10s
+    units: connections
+     warn: $this > 0
+     info: connections rejected because of maxclients limit in the last minute
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
 
  template: redis_bgsave_broken
  families: *
@@ -26,3 +41,17 @@ component: Redis
      info: duration of the on-going RDB save operation
     delay: down 5m multiplier 1.5 max 1h
        to: dba
+
+ template: redis_master_link_down
+ families: *
+       on: redis.master_link_down_since_time
+    class: Errors
+     type: KV Storage
+component: Redis
+    every: 10s
+     calc: $time
+    units: seconds
+     crit: $this != nan AND $this > 0
+     info: time elapsed since the link between master and slave is down
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index 38213a8db..531d62fac 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -1,142 +1,141 @@
-## Check if the are any systemd units in the failed state (crashed).
-## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed.
+# you can disable an alarm notification by setting the 'to' line to: silent
 
 ## Service units
- template: systemd_service_units_state
-       on: systemd.service_units_state
+ template: systemd_service_unit_failed_state
+       on: systemd.service_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd service units are in the failed state
+     info: systemd service unit in the failed state
        to: sysadmin
 
 ## Socket units
- template: systemd_socket_units_state
+ template: systemd_socket_unit_failed_state
        on: systemd.socket_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd socket units are in the failed state
+     info: systemd socket unit in the failed state
        to: sysadmin
 
 ## Target units
- template: systemd_target_units_state
+ template: systemd_target_unit_failed_state
        on: systemd.target_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd target units are in the failed state
+     info: systemd target unit in the failed state
        to: sysadmin
 
 ## Path units
- template: systemd_path_units_state
+ template: systemd_path_unit_failed_state
        on: systemd.path_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd path units are in the failed state
+     info: systemd path unit in the failed state
        to: sysadmin
 
 ## Device units
- template: systemd_device_units_state
+ template: systemd_device_unit_failed_state
        on: systemd.device_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more the systemd device units are in the failed state
+     info: systemd device unit in the failed state
        to: sysadmin
 
 ## Mount units
- template: systemd_mount_units_state
+ template: systemd_mount_unit_failed_state
        on: systemd.mount_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more the systemd mount units are in the failed state
+     info: systemd mount unit in the failed state
        to: sysadmin
 
 ## Automount units
- template: systemd_automount_units_state
+ template: systemd_automount_unit_failed_state
        on: systemd.automount_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd automount units are in the failed state
+     info: systemd automount unit in the failed state
        to: sysadmin
 
 ## Swap units
- template: systemd_swap_units_state
+ template: systemd_swap_unit_failed_state
        on: systemd.swap_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd swap units are in the failed state
+     info: systemd swap unit in the failed state
        to: sysadmin
 
 ## Scope units
- template: systemd_scope_units_state
+ template: systemd_scope_unit_failed_state
        on: systemd.scope_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd scope units are in the failed state
+     info: systemd scope unit in the failed state
        to: sysadmin
 
 ## Slice units
- template: systemd_slice_units_state
+ template: systemd_slice_unit_failed_state
        on: systemd.slice_unit_state
     class: Errors
      type: Linux
 component: Systemd units
-   lookup: max -1s min2max
-    units: ok/failed
+     calc: $failed
+    units: state
     every: 10s
-     warn: $this != nan AND $this == 5
+     warn: $this != nan AND $this == 1
     delay: down 5m multiplier 1.5 max 1h
-     info: one or more systemd slice units are in the failed state
+     info: systemd slice unit in the failed state
        to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 35cb6366c..ff116db64 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -26,7 +26,7 @@ component: Network
    lookup: average -10s unaligned absolute of OutRsts
     units: tcp resets/s
     every: 10s
-     warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (10)))
+     warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (10)))
     delay: up 20s down 60m multiplier 1.2 max 2h
   options: no-clear-notification
      info: average number of sent TCP RESETS over the last 10 seconds. \
@@ -60,7 +60,7 @@ component: Network
    lookup: average -10s unaligned absolute of AttemptFails
     units: tcp resets/s
     every: 10s
-     warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
+     warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
     delay: up 20s down 60m multiplier 1.2 max 2h
   options: no-clear-notification
      info: average number of received TCP RESETS over the last 10 seconds. \
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
index 23c18ba10..2e9b1a3cf 100644
--- a/health/health.d/timex.conf
+++ b/health/health.d/timex.conf
@@ -5,7 +5,7 @@
     alarm: system_clock_sync_state
        on: system.clock_sync_state
        os: linux
-    class: Error
+    class: Errors
      type: System
 component: Clock
      calc: $state
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2022-11-30 18:47:00 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2022-11-30 18:47:00 +0000
commit	03bf87dcb06f7021bfb2df2fa8691593c6148aff (patch)
tree	e16b06711a2ed77cafb4b7754be0220c3d14a9d7 /health/health.d
parent	Adding upstream version 1.36.1. (diff)
download	netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.tar.xz netdata-3ddbe8d6a93ed16235bde4af7f6195e6f24165e8.zip