diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-23 16:45:17 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-23 16:45:44 +0000 |
commit | 17d6a993fc17d533460c5f40f3908c708e057c18 (patch) | |
tree | 1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /monitoring/ceph-mixin/prometheus_alerts.libsonnet | |
parent | Releasing progress-linux version 18.2.2-0progress7.99u1. (diff) | |
download | ceph-17d6a993fc17d533460c5f40f3908c708e057c18.tar.xz ceph-17d6a993fc17d533460c5f40f3908c708e057c18.zip |
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring/ceph-mixin/prometheus_alerts.libsonnet')
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.libsonnet | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index b7ec0da2f..a6ab4c2a3 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -690,6 +690,71 @@ ], }, { + name: 'hardware', + rules: [ + { + alert: 'HardwareStorageError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.1' }, + annotations: { + summary: 'Storage devices error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Some storage devices are in error. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareMemoryError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.2' }, + annotations: { + summary: 'DIMM error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'DIMM error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareProcessorError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.3' }, + annotations: { + summary: 'Processor error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Processor error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareNetworkError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.4' }, + annotations: { + summary: 'Network error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Network error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwarePowerError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.5' }, + annotations: { + summary: 'Power supply error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Power supply error(s) detected. Check `ceph health detail`.', + }, + }, + { + alert: 'HardwareFanError', + 'for': '30s', + expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0', + labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.6' }, + annotations: { + summary: 'Fan error(s) detected%(cluster)s' % $.MultiClusterSummary(), + description: 'Fan error(s) detected. Check `ceph health detail`.', + }, + }, + ], + }, + { name: 'PrometheusServer', rules: [ { |