diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-23 16:45:13 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-23 16:45:13 +0000 |
commit | 389020e14594e4894e28d1eb9103c210b142509e (patch) | |
tree | 2ba734cdd7a243f46dda7c3d0cc88c2293d9699f /monitoring/ceph-mixin/prometheus_alerts.yml | |
parent | Adding upstream version 18.2.2. (diff) | |
download | ceph-389020e14594e4894e28d1eb9103c210b142509e.tar.xz ceph-389020e14594e4894e28d1eb9103c210b142509e.zip |
Adding upstream version 18.2.3. (ref: upstream/18.2.3)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring/ceph-mixin/prometheus_alerts.yml')
-rw-r--r-- | monitoring/ceph-mixin/prometheus_alerts.yml | 62 |
1 files changed, 62 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 4a3e6acf3..e491c753f 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -614,6 +614,68 @@ groups:
         labels:
           severity: "warning"
           type: "ceph_default"
+  - name: "hardware"
+    rules:
+      - alert: "HardwareStorageError"
+        annotations:
+          description: "Some storage devices are in error. Check `ceph health detail`."
+          summary: "Storage devices error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareMemoryError"
+        annotations:
+          description: "DIMM error(s) detected. Check `ceph health detail`."
+          summary: "DIMM error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareProcessorError"
+        annotations:
+          description: "Processor error(s) detected. Check `ceph health detail`."
+          summary: "Processor error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareNetworkError"
+        annotations:
+          description: "Network error(s) detected. Check `ceph health detail`."
+          summary: "Network error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwarePowerError"
+        annotations:
+          description: "Power supply error(s) detected. Check `ceph health detail`."
+          summary: "Power supply error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareFanError"
+        annotations:
+          description: "Fan error(s) detected. Check `ceph health detail`."
+          summary: "Fan error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
+          severity: "critical"
+          type: "ceph_default"
   - name: "PrometheusServer"
     rules:
       - alert: "PrometheusJobMissing"