summary | refs | log | tree | commit | diff | stats
path: root/monitoring/ceph-mixin/prometheus_alerts.libsonnet
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-23 16:45:17 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-23 16:45:44 +0000
commit 17d6a993fc17d533460c5f40f3908c708e057c18 (patch)
tree 1a3bd93e0ecd74fa02f93a528fe2f87e5314c4b5 /monitoring/ceph-mixin/prometheus_alerts.libsonnet
parent Releasing progress-linux version 18.2.2-0progress7.99u1. (diff)
download ceph-17d6a993fc17d533460c5f40f3908c708e057c18.tar.xz
ceph-17d6a993fc17d533460c5f40f3908c708e057c18.zip
Merging upstream version 18.2.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'monitoring/ceph-mixin/prometheus_alerts.libsonnet')
-rw-r--r-- monitoring/ceph-mixin/prometheus_alerts.libsonnet 65
1 files changed, 65 insertions, 0 deletions
diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index b7ec0da2f..a6ab4c2a3 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -690,6 +690,71 @@
],
},
{
+ name: 'hardware',
+ rules: [
+ {
+ alert: 'HardwareStorageError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.1' },
+ annotations: {
+ summary: 'Storage devices error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Some storage devices are in error. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareMemoryError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.2' },
+ annotations: {
+ summary: 'DIMM error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'DIMM error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareProcessorError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.3' },
+ annotations: {
+ summary: 'Processor error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Processor error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareNetworkError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.4' },
+ annotations: {
+ summary: 'Network error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Network error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwarePowerError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.5' },
+ annotations: {
+ summary: 'Power supply error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Power supply error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ {
+ alert: 'HardwareFanError',
+ 'for': '30s',
+ expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0',
+ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.13.6' },
+ annotations: {
+ summary: 'Fan error(s) detected%(cluster)s' % $.MultiClusterSummary(),
+ description: 'Fan error(s) detected. Check `ceph health detail`.',
+ },
+ },
+ ],
+ },
+ {
name: 'PrometheusServer',
rules: [
{