Adding upstream version 1.47.0. (upstream/1.47.0)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-26 08:15:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-26 08:15:20 +0000
commit: 87d772a7d708fec12f48cd8adc0dedff6e1025da (patch)
tree: 1fee344c64cc3f43074a01981e21126c8482a522 /src/health/health.d
parent: Adding upstream version 1.46.3. (diff)
download: netdata-87d772a7d708fec12f48cd8adc0dedff6e1025da.tar.xz
netdata-87d772a7d708fec12f48cd8adc0dedff6e1025da.zip
6 files changed, 61 insertions, 49 deletions
diff --git a/src/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf
index 0d37f28e0..51b280491 100644
--- a/src/health/health.d/beanstalkd.conf
+++ b/src/health/health.d/beanstalkd.conf
@@ -11,31 +11,5 @@ component: Beanstalk
      warn: $this > 3
     delay: up 0 down 5m multiplier 1.2 max 1h
   summary: Beanstalk buried jobs
-     info: Number of buried jobs across all tubes. \
-           You need to manually kick them so they can be processed. \
-           Presence of buried jobs in a tube does not affect new jobs.
-       to: sysadmin
-      
-# get the number of buried jobs per queue
-
-#template: beanstalk_tube_buried_jobs
-#      on: beanstalk.jobs
-#    calc: $buried
-#   units: jobs
-#   every: 10s
-#    warn: $this > 0
-#    crit: $this > 10
-#   delay: up 0 down 5m multiplier 1.2 max 1h
-#    info: the number of jobs buried per tube
-#      to: sysadmin
-
-# get the current number of tubes
-
-#template: beanstalk_number_of_tubes
-#      on: beanstalk.current_tubes
-#    calc: $tubes
-#   every: 10s
-#    warn: $this < 5
-#   delay: up 0 down 5m multiplier 1.2 max 1h
-#    info: the current number of tubes on the server
-#      to: sysadmin
+     info: Number of buried jobs across all tubes.
+       to: silent
diff --git a/src/health/health.d/docker.conf b/src/health/health.d/docker.conf
index 668614d4d..edb63a08c 100644
--- a/src/health/health.d/docker.conf
+++ b/src/health/health.d/docker.conf
@@ -1,4 +1,6 @@
- template: docker_container_unhealthy
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: docker_container_unhealthy
        on: docker.container_health_status
     class: Errors
      type: Containers
@@ -10,3 +12,22 @@ component: Docker
   summary: Docker container ${label:container_name} health
      info: ${label:container_name} docker container health status is unhealthy
        to: sysadmin
+
+# This alert monitors the status of Docker containers and triggers if any container has exited (is down).
+# To enable this alert for specific containers, you need to modify the "chart labels" filter.
+# This filter uses Netdata's simple pattern matching syntax.
+
+    template: docker_container_down
+          on: docker.container_state
+       class: Errors
+        type: Containers
+   component: Docker
+chart labels: container_name=!*
+       units: status
+       every: 10s
+      lookup: average -10s of exited
+        warn: $this > 0
+       delay: down 1m multiplier 1.5 max 2h
+     summary: Docker container ${label:container_name} down
+        info: Docker container ${label:container_name} is currently not running
+          to: sysadmin
diff --git a/src/health/health.d/gearman.conf b/src/health/health.d/gearman.conf
index 78e1165d1..2b19105b5 100644
--- a/src/health/health.d/gearman.conf
+++ b/src/health/health.d/gearman.conf
@@ -1,14 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
 
- template: gearman_workers_queued
-       on: gearman.single_job
-    class: Latency
-     type: Computing
-component: Gearman
-   lookup: average -10m unaligned match-names of Pending
-    units: workers
-    every: 10s
-     warn: $this > 30000
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Gearman queued jobs
-     info: Average number of queued jobs over the last 10 minutes
-       to: sysadmin
+# template: gearman_function_waiting_jobs
+#       on: gearman.function_queued_jobs_activity
+#    class: Latency
+#     type: Computing
+#component: Gearman
+#   lookup: average -10m unaligned of waiting
+#    units: jobs
+#    every: 10s
+#     warn: $this > 30000
+#    delay: down 5m multiplier 1.5 max 1h
+#  summary: Waiting jobs for ${label:function_name} function
+#     info: Average number of waiting jobs for ${label:function_name} function over the last 10 minutes
+#       to: sysadmin
diff --git a/src/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf
index 4dfee3c7f..bc3b0b1ea 100644
--- a/src/health/health.d/ipfs.conf
+++ b/src/health/health.d/ipfs.conf
@@ -1,10 +1,10 @@
 
  template: ipfs_datastore_usage
-       on: ipfs.repo_size
+       on: ipfs.datastore_space_utilization
     class: Utilization
      type: Data Sharing
 component: IPFS
-     calc: $size * 100 / $avail
+     calc: $used
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf
index 1d40c8602..38187326f 100644
--- a/src/health/health.d/x509check.conf
+++ b/src/health/health.d/x509check.conf
@@ -12,15 +12,16 @@ component: x509 certificates
   summary: x509 certificate expiration for ${label:source}
      info: Time until x509 certificate expires for ${label:source}
        to: webmaster
-      
+
  template: x509check_revocation_status
        on: x509check.revocation_status
     class: Errors
      type: Certificates
 component: x509 certificates
      calc: $revoked
+    units: status
     every: 60s
-     crit: $this != nan AND $this != 0
+     crit: $this == 1
   summary: x509 certificate revocation status for ${label:source}
-     info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source}
+     info: x509 certificate revocation status for ${label:source}
        to: webmaster
diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf
index 9c1f0018b..5c8065aa3 100644
--- a/src/health/health.d/zfs.conf
+++ b/src/health/health.d/zfs.conf
@@ -67,7 +67,7 @@ component: File system
      type: System
 component: File system
      calc: $degraded
-    units: boolean
+    units: status
     every: 10s
      warn: $this > 0
     delay: down 1m multiplier 1.5 max 1h
@@ -81,10 +81,25 @@ component: File system
      type: System
 component: File system
      calc: $faulted + $unavail
-    units: boolean
+    units: status
     every: 10s
      crit: $this > 0
     delay: down 1m multiplier 1.5 max 1h
   summary: Critical ZFS pool ${label:pool} state
      info: ZFS pool ${label:pool} state is faulted or unavail
        to: sysadmin
+
+
+ template: zfs_vdev_health_state
+       on: zfspool.vdev_health_state
+    class: Errors
+     type: System
+component: File system
+     calc: $degraded + $faulted
+    units: status
+    every: 10s
+     warn: $this > 0
+    delay: down 1m multiplier 1.5 max 1h
+  summary: ZFS vdev ${label:vdev} pool ${label:pool} state
+     info: ZFS vdev ${label:vdev} state is faulted or degraded
+       to: sysadmin
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-26 08:15:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-26 08:15:20 +0000
commit	87d772a7d708fec12f48cd8adc0dedff6e1025da (patch)
tree	1fee344c64cc3f43074a01981e21126c8482a522 /src/health/health.d
parent	Adding upstream version 1.46.3. (diff)
download	netdata-87d772a7d708fec12f48cd8adc0dedff6e1025da.tar.xz netdata-87d772a7d708fec12f48cd8adc0dedff6e1025da.zip