diff options
Diffstat (limited to 'src/health/health.d')
-rw-r--r-- | src/health/health.d/beanstalkd.conf | 30 | ||||
-rw-r--r-- | src/health/health.d/docker.conf | 23 | ||||
-rw-r--r-- | src/health/health.d/gearman.conf | 27 | ||||
-rw-r--r-- | src/health/health.d/ipfs.conf | 4 | ||||
-rw-r--r-- | src/health/health.d/x509check.conf | 7 | ||||
-rw-r--r-- | src/health/health.d/zfs.conf | 19 |
6 files changed, 61 insertions, 49 deletions
diff --git a/src/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf index 0d37f28e0..51b280491 100644 --- a/src/health/health.d/beanstalkd.conf +++ b/src/health/health.d/beanstalkd.conf @@ -11,31 +11,5 @@ component: Beanstalk warn: $this > 3 delay: up 0 down 5m multiplier 1.2 max 1h summary: Beanstalk buried jobs - info: Number of buried jobs across all tubes. \ - You need to manually kick them so they can be processed. \ - Presence of buried jobs in a tube does not affect new jobs. - to: sysadmin - -# get the number of buried jobs per queue - -#template: beanstalk_tube_buried_jobs -# on: beanstalk.jobs -# calc: $buried -# units: jobs -# every: 10s -# warn: $this > 0 -# crit: $this > 10 -# delay: up 0 down 5m multiplier 1.2 max 1h -# info: the number of jobs buried per tube -# to: sysadmin - -# get the current number of tubes - -#template: beanstalk_number_of_tubes -# on: beanstalk.current_tubes -# calc: $tubes -# every: 10s -# warn: $this < 5 -# delay: up 0 down 5m multiplier 1.2 max 1h -# info: the current number of tubes on the server -# to: sysadmin + info: Number of buried jobs across all tubes. + to: silent diff --git a/src/health/health.d/docker.conf b/src/health/health.d/docker.conf index 668614d4d..edb63a08c 100644 --- a/src/health/health.d/docker.conf +++ b/src/health/health.d/docker.conf @@ -1,4 +1,6 @@ - template: docker_container_unhealthy +# you can disable an alarm notification by setting the 'to' line to: silent + +template: docker_container_unhealthy on: docker.container_health_status class: Errors type: Containers @@ -10,3 +12,22 @@ component: Docker summary: Docker container ${label:container_name} health info: ${label:container_name} docker container health status is unhealthy to: sysadmin + +# This alert monitors the status of Docker containers and triggers if any container is exited (down). +# To enable this alert for specific containers, you need to modify the "chart labels" filter. +# This filter uses Netdata's simple pattern matching syntax. + + template: docker_container_down + on: docker.container_state + class: Errors + type: Containers + component: Docker +chart labels: container_name=!* + units: status + every: 10s + lookup: average -10s of exited + warn: $this > 0 + delay: down 1m multiplier 1.5 max 2h + summary: Docker container ${label:container_name} down + info: Docker container ${label:container_name} is currently not running + to: sysadmin diff --git a/src/health/health.d/gearman.conf b/src/health/health.d/gearman.conf index 78e1165d1..2b19105b5 100644 --- a/src/health/health.d/gearman.conf +++ b/src/health/health.d/gearman.conf @@ -1,14 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent - template: gearman_workers_queued - on: gearman.single_job - class: Latency - type: Computing -component: Gearman - lookup: average -10m unaligned match-names of Pending - units: workers - every: 10s - warn: $this > 30000 - delay: down 5m multiplier 1.5 max 1h - summary: Gearman queued jobs - info: Average number of queued jobs over the last 10 minutes - to: sysadmin +# template: gearman_function_waiting_jobs +# on: gearman.function_queued_jobs_activity +# class: Latency +# type: Computing +#component: Gearman +# lookup: average -10m unaligned of waiting +# units: jobs +# every: 10s +# warn: $this > 30000 +# delay: down 5m multiplier 1.5 max 1h +# summary: Waiting jobs for ${label:task_name} function +# info: Average number of waiting jobs for ${label:function_name} function over the last 10 minutes +# to: sysadmin diff --git a/src/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf index 4dfee3c7f..bc3b0b1ea 100644 --- a/src/health/health.d/ipfs.conf +++ b/src/health/health.d/ipfs.conf @@ -1,10 +1,10 @@ template: ipfs_datastore_usage - on: ipfs.repo_size + on: ipfs.datastore_space_utilization class: Utilization type: Data Sharing component: IPFS - calc: $size * 100 / $avail + calc: $used units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf index 1d40c8602..38187326f 100644 --- a/src/health/health.d/x509check.conf +++ b/src/health/health.d/x509check.conf @@ -12,15 +12,16 @@ component: x509 certificates summary: x509 certificate expiration for ${label:source} info: Time until x509 certificate expires for ${label:source} to: webmaster - + template: x509check_revocation_status on: x509check.revocation_status class: Errors type: Certificates component: x509 certificates calc: $revoked + units: status every: 60s - crit: $this != nan AND $this != 0 + crit: $this == 1 summary: x509 certificate revocation status for ${label:source} - info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source} + info: x509 certificate revocation status for ${label:source} to: webmaster diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf index 9c1f0018b..5c8065aa3 100644 --- a/src/health/health.d/zfs.conf +++ b/src/health/health.d/zfs.conf @@ -67,7 +67,7 @@ component: File system type: System component: File system calc: $degraded - units: boolean + units: status every: 10s warn: $this > 0 delay: down 1m multiplier 1.5 max 1h @@ -81,10 +81,25 @@ component: File system type: System component: File system calc: $faulted + $unavail - units: boolean + units: status every: 10s crit: $this > 0 delay: down 1m multiplier 1.5 max 1h summary: Critical ZFS pool ${label:pool} state info: ZFS pool ${label:pool} state is faulted or unavail to: sysadmin + + + template: zfs_vdev_health_state + on: zfspool.vdev_health_state + class: Errors + type: System +component: File system + calc: $degraded + $faulted + units: status + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: ZFS vdev ${label:vdev} pool ${label:pool} state + info: ZFS vdev ${label:vdev} state is faulted or degraded + to: sysadmin |