summaryrefslogtreecommitdiffstats
path: root/src/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'src/health/health.d')
-rw-r--r--src/health/health.d/beanstalkd.conf30
-rw-r--r--src/health/health.d/docker.conf23
-rw-r--r--src/health/health.d/gearman.conf27
-rw-r--r--src/health/health.d/ipfs.conf4
-rw-r--r--src/health/health.d/x509check.conf7
-rw-r--r--src/health/health.d/zfs.conf19
6 files changed, 61 insertions, 49 deletions
diff --git a/src/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf
index 0d37f28e0..51b280491 100644
--- a/src/health/health.d/beanstalkd.conf
+++ b/src/health/health.d/beanstalkd.conf
@@ -11,31 +11,5 @@ component: Beanstalk
warn: $this > 3
delay: up 0 down 5m multiplier 1.2 max 1h
summary: Beanstalk buried jobs
- info: Number of buried jobs across all tubes. \
- You need to manually kick them so they can be processed. \
- Presence of buried jobs in a tube does not affect new jobs.
- to: sysadmin
-
-# get the number of buried jobs per queue
-
-#template: beanstalk_tube_buried_jobs
-# on: beanstalk.jobs
-# calc: $buried
-# units: jobs
-# every: 10s
-# warn: $this > 0
-# crit: $this > 10
-# delay: up 0 down 5m multiplier 1.2 max 1h
-# info: the number of jobs buried per tube
-# to: sysadmin
-
-# get the current number of tubes
-
-#template: beanstalk_number_of_tubes
-# on: beanstalk.current_tubes
-# calc: $tubes
-# every: 10s
-# warn: $this < 5
-# delay: up 0 down 5m multiplier 1.2 max 1h
-# info: the current number of tubes on the server
-# to: sysadmin
+ info: Number of buried jobs across all tubes.
+ to: silent
diff --git a/src/health/health.d/docker.conf b/src/health/health.d/docker.conf
index 668614d4d..edb63a08c 100644
--- a/src/health/health.d/docker.conf
+++ b/src/health/health.d/docker.conf
@@ -1,4 +1,6 @@
- template: docker_container_unhealthy
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: docker_container_unhealthy
on: docker.container_health_status
class: Errors
type: Containers
@@ -10,3 +12,22 @@ component: Docker
summary: Docker container ${label:container_name} health
info: ${label:container_name} docker container health status is unhealthy
to: sysadmin
+
+# This alert monitors the status of Docker containers and triggers if any container is exited (down).
+# To enable this alert for specific containers, you need to modify the "chart labels" filter.
+# This filter uses Netdata's simple pattern matching syntax.
+
+ template: docker_container_down
+ on: docker.container_state
+ class: Errors
+ type: Containers
+ component: Docker
+chart labels: container_name=!*
+ units: status
+ every: 10s
+ lookup: average -10s of exited
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 2h
+ summary: Docker container ${label:container_name} down
+ info: Docker container ${label:container_name} is currently not running
+ to: sysadmin
diff --git a/src/health/health.d/gearman.conf b/src/health/health.d/gearman.conf
index 78e1165d1..2b19105b5 100644
--- a/src/health/health.d/gearman.conf
+++ b/src/health/health.d/gearman.conf
@@ -1,14 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
- template: gearman_workers_queued
- on: gearman.single_job
- class: Latency
- type: Computing
-component: Gearman
- lookup: average -10m unaligned match-names of Pending
- units: workers
- every: 10s
- warn: $this > 30000
- delay: down 5m multiplier 1.5 max 1h
- summary: Gearman queued jobs
- info: Average number of queued jobs over the last 10 minutes
- to: sysadmin
+# template: gearman_function_waiting_jobs
+# on: gearman.function_queued_jobs_activity
+# class: Latency
+# type: Computing
+#component: Gearman
+# lookup: average -10m unaligned of waiting
+# units: jobs
+# every: 10s
+# warn: $this > 30000
+# delay: down 5m multiplier 1.5 max 1h
+# summary: Waiting jobs for ${label:function_name} function
+# info: Average number of waiting jobs for ${label:function_name} function over the last 10 minutes
+# to: sysadmin
diff --git a/src/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf
index 4dfee3c7f..bc3b0b1ea 100644
--- a/src/health/health.d/ipfs.conf
+++ b/src/health/health.d/ipfs.conf
@@ -1,10 +1,10 @@
template: ipfs_datastore_usage
- on: ipfs.repo_size
+ on: ipfs.datastore_space_utilization
class: Utilization
type: Data Sharing
component: IPFS
- calc: $size * 100 / $avail
+ calc: $used
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf
index 1d40c8602..38187326f 100644
--- a/src/health/health.d/x509check.conf
+++ b/src/health/health.d/x509check.conf
@@ -12,15 +12,16 @@ component: x509 certificates
summary: x509 certificate expiration for ${label:source}
info: Time until x509 certificate expires for ${label:source}
to: webmaster
-
+
template: x509check_revocation_status
on: x509check.revocation_status
class: Errors
type: Certificates
component: x509 certificates
calc: $revoked
+ units: status
every: 60s
- crit: $this != nan AND $this != 0
+ crit: $this == 1
summary: x509 certificate revocation status for ${label:source}
- info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source}
+ info: x509 certificate revocation status for ${label:source}
to: webmaster
diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf
index 9c1f0018b..5c8065aa3 100644
--- a/src/health/health.d/zfs.conf
+++ b/src/health/health.d/zfs.conf
@@ -67,7 +67,7 @@ component: File system
type: System
component: File system
calc: $degraded
- units: boolean
+ units: status
every: 10s
warn: $this > 0
delay: down 1m multiplier 1.5 max 1h
@@ -81,10 +81,25 @@ component: File system
type: System
component: File system
calc: $faulted + $unavail
- units: boolean
+ units: status
every: 10s
crit: $this > 0
delay: down 1m multiplier 1.5 max 1h
summary: Critical ZFS pool ${label:pool} state
info: ZFS pool ${label:pool} state is faulted or unavail
to: sysadmin
+
+
+ template: zfs_vdev_health_state
+ on: zfspool.vdev_health_state
+ class: Errors
+ type: System
+component: File system
+ calc: $degraded + $faulted
+ units: status
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 1.5 max 1h
+ summary: ZFS vdev ${label:vdev} pool ${label:pool} state
+ info: ZFS vdev ${label:vdev} state is faulted or degraded
+ to: sysadmin