summary refs log tree commit diff stats
path: root/monitoring/ceph-mixin/prometheus_alerts.yml
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-23 16:45:17 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-23 16:45:44 +0000
commit 17d6a993fc17d533460c5f40f3908c708e057c18 (patch)
tree 1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /monitoring/ceph-mixin/prometheus_alerts.yml
parent Releasing progress-linux version 18.2.2-0progress7.99u1. (diff)
download ceph-17d6a993fc17d533460c5f40f3908c708e057c18.tar.xz
ceph-17d6a993fc17d533460c5f40f3908c708e057c18.zip
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring/ceph-mixin/prometheus_alerts.yml')
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.yml 62
1 files changed, 62 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 4a3e6acf3..e491c753f 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -614,6 +614,68 @@ groups:
         labels:
           severity: "warning"
           type: "ceph_default"
+  - name: "hardware"
+    rules:
+      - alert: "HardwareStorageError"
+        annotations:
+          description: "Some storage devices are in error. Check `ceph health detail`."
+          summary: "Storage devices error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareMemoryError"
+        annotations:
+          description: "DIMM error(s) detected. Check `ceph health detail`."
+          summary: "DIMM error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareProcessorError"
+        annotations:
+          description: "Processor error(s) detected. Check `ceph health detail`."
+          summary: "Processor error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareNetworkError"
+        annotations:
+          description: "Network error(s) detected. Check `ceph health detail`."
+          summary: "Network error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwarePowerError"
+        annotations:
+          description: "Power supply error(s) detected. Check `ceph health detail`."
+          summary: "Power supply error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
+          severity: "critical"
+          type: "ceph_default"
+      - alert: "HardwareFanError"
+        annotations:
+          description: "Fan error(s) detected. Check `ceph health detail`."
+          summary: "Fan error(s) detected"
+        expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
+        for: "30s"
+        labels:
+          oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
+          severity: "critical"
+          type: "ceph_default"
   - name: "PrometheusServer"
     rules:
       - alert: "PrometheusJobMissing"