summary refs log tree commit diff stats
path: root/monitoring/ceph-mixin/tests_alerts
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-23 16:45:17 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-23 16:45:44 +0000
commit 17d6a993fc17d533460c5f40f3908c708e057c18 (patch)
tree 1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /monitoring/ceph-mixin/tests_alerts
parent Releasing progress-linux version 18.2.2-0progress7.99u1. (diff)
download ceph-17d6a993fc17d533460c5f40f3908c708e057c18.tar.xz
ceph-17d6a993fc17d533460c5f40f3908c708e057c18.zip
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring/ceph-mixin/tests_alerts')
-rw-r--r-- monitoring/ceph-mixin/tests_alerts/test_alerts.yml 144
1 files changed, 144 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index 1aaea88e7..4768af7de 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -1886,3 +1886,147 @@ tests:
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
summary: One or more Ceph daemons have crashed, and are pending acknowledgement
description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.
+ - interval: 30s  # HARDWARE_STORAGE case; spacing between input_series samples
+   input_series:
+     - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}'
+       values: '1+0x40'  # expanding notation: constant 1 for 41 samples
+   promql_expr_test:
+     - expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 1m  # no exp_alerts listed: no alert may be firing yet
+       alertname: HardwareStorageError
+     - eval_time: 5m  # alert is expected to be firing at this point
+       alertname: HardwareStorageError
+       exp_alerts:
+         - exp_labels:
+             name: HARDWARE_STORAGE
+             severity: critical
+             type: ceph_default
+             oid: 1.3.6.1.4.1.50495.1.2.1.13.1
+           exp_annotations:
+             summary: Storage devices error(s) detected
+             description: "Some storage devices are in error. Check `ceph health detail`."
+ - interval: 30s  # HARDWARE_MEMORY case; spacing between input_series samples
+   input_series:
+     - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}'
+       values: '1+0x40'  # expanding notation: constant 1 for 41 samples
+   promql_expr_test:
+     - expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 1m  # no exp_alerts listed: no alert may be firing yet
+       alertname: HardwareMemoryError
+     - eval_time: 5m  # alert is expected to be firing at this point
+       alertname: HardwareMemoryError
+       exp_alerts:
+         - exp_labels:
+             name: HARDWARE_MEMORY
+             severity: critical
+             type: ceph_default
+             oid: 1.3.6.1.4.1.50495.1.2.1.13.2
+           exp_annotations:
+             summary: DIMM error(s) detected
+             description: "DIMM error(s) detected. Check `ceph health detail`."
+ - interval: 30s  # HARDWARE_PROCESSOR case; spacing between input_series samples
+   input_series:
+     - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}'
+       values: '1+0x40'  # expanding notation: constant 1 for 41 samples
+   promql_expr_test:
+     - expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 1m  # no exp_alerts listed: no alert may be firing yet
+       alertname: HardwareProcessorError
+     - eval_time: 5m  # alert is expected to be firing at this point
+       alertname: HardwareProcessorError
+       exp_alerts:
+         - exp_labels:
+             name: HARDWARE_PROCESSOR
+             severity: critical
+             type: ceph_default
+             oid: 1.3.6.1.4.1.50495.1.2.1.13.3
+           exp_annotations:
+             summary: Processor error(s) detected
+             description: "Processor error(s) detected. Check `ceph health detail`."
+ - interval: 30s  # HARDWARE_NETWORK case; spacing between input_series samples
+   input_series:
+     - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}'
+       values: '1+0x40'  # expanding notation: constant 1 for 41 samples
+   promql_expr_test:
+     - expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 1m  # no exp_alerts listed: no alert may be firing yet
+       alertname: HardwareNetworkError
+     - eval_time: 5m  # alert is expected to be firing at this point
+       alertname: HardwareNetworkError
+       exp_alerts:
+         - exp_labels:
+             name: HARDWARE_NETWORK
+             severity: critical
+             type: ceph_default
+             oid: 1.3.6.1.4.1.50495.1.2.1.13.4
+           exp_annotations:
+             summary: Network error(s) detected
+             description: "Network error(s) detected. Check `ceph health detail`."
+ - interval: 30s  # HARDWARE_POWER case; spacing between input_series samples
+   input_series:
+     - series: 'ceph_health_detail{name="HARDWARE_POWER"}'
+       values: '1+0x40'  # expanding notation: constant 1 for 41 samples
+   promql_expr_test:
+     - expr: ceph_health_detail{name="HARDWARE_POWER"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 1m  # no exp_alerts listed: no alert may be firing yet
+       alertname: HardwarePowerError
+     - eval_time: 5m  # alert is expected to be firing at this point
+       alertname: HardwarePowerError
+       exp_alerts:
+         - exp_labels:
+             name: HARDWARE_POWER
+             severity: critical
+             type: ceph_default
+             oid: 1.3.6.1.4.1.50495.1.2.1.13.5
+           exp_annotations:
+             summary: Power supply error(s) detected
+             description: "Power supply error(s) detected. Check `ceph health detail`."
+ - interval: 30s  # HARDWARE_FANS case; spacing between input_series samples
+   input_series:
+     - series: 'ceph_health_detail{name="HARDWARE_FANS"}'
+       values: '1+0x40'  # expanding notation: constant 1 for 41 samples
+   promql_expr_test:
+     - expr: ceph_health_detail{name="HARDWARE_FANS"} > 0
+       eval_time: 2m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 1m  # no exp_alerts listed: no alert may be firing yet
+       alertname: HardwareFanError
+     - eval_time: 5m  # alert is expected to be firing at this point
+       alertname: HardwareFanError
+       exp_alerts:
+         - exp_labels:
+             name: HARDWARE_FANS
+             severity: critical
+             type: ceph_default
+             oid: 1.3.6.1.4.1.50495.1.2.1.13.6
+           exp_annotations:
+             summary: Fan error(s) detected
+             description: "Fan error(s) detected. Check `ceph health detail`."