Diffstat (limited to 'monitoring/prometheus/alerts/ceph_default_alerts.yml')
-rw-r--r--  monitoring/prometheus/alerts/ceph_default_alerts.yml  268
1 file changed, 268 insertions, 0 deletions
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
new file mode 100644
index 00000000..ddd19c67
--- /dev/null
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -0,0 +1,268 @@
+groups:
+ - name: cluster health
+ rules:
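+ # ceph_health_status is exported by the mgr prometheus module: 0 = HEALTH_OK, 1 = HEALTH_WARN, 2 = HEALTH_ERR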
+ - alert: health error
+ expr: ceph_health_status == 2
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ Ceph has been in HEALTH_ERROR state for more than 5 minutes.
+ Please check "ceph health detail" for more information.
+
+ - alert: health warn
+ expr: ceph_health_status == 1
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Ceph has been in HEALTH_WARN for more than 15 minutes.
+ Please check "ceph health detail" for more information.
+
+ - name: mon
+ rules:
+ - alert: low monitor quorum count
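+ # ceph_mon_quorum_status is 1 for every monitor currently in quorum, so the sum gives the quorum size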
+ expr: sum(ceph_mon_quorum_status) < 3
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: |
+ Monitor count in quorum is below three.
+
+ Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
+
+ The following monitors are down:
+ {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - name: osd
+ rules:
+ - alert: 10% OSDs down
+ expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: |
+ {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).
+
+ The following OSDs are down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: OSD down
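+ # the description template below sets $s to "s" so the wording pluralises when more than one OSD is down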
+ expr: count(ceph_osd_up == 0) > 0
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: |
+ {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
+ {{ $value }} OSD{{ $s }} down for more than 15 minutes.
+
+ {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
+
+ The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
+ {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
+ - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
+ {{- end }}
+
+ - alert: OSDs near full
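+ # per-OSD usage ratio, limited to OSDs that are up; the join with ceph_osd_metadata adds the hostname label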
+ expr: |
+ (
+ ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
+ * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+ ) * 100 > 90
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
+ dangerously full: {{ $value | humanize }}%
+
+ - alert: flapping OSD
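+ # rate() over the 0/1 ceph_osd_up gauge, scaled to changes per minute; above 1 the OSD is bouncing between up and down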
+ expr: |
+ (
+ rate(ceph_osd_up[5m])
+ * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
+ ) * 60 > 1
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
+ marked down and back up {{ $value | humanize }} times a minute
+ over the last 5 minutes.
+
+ # alert on high deviation from average PG count
+ - alert: high pg count deviation
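+ # relative deviation of each OSD's PG count from the per-job average; fires when it exceeds 30%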
+ expr: |
+ abs(
+ (
+ (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+ ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+ for: 5m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
+ by more than 30% from average PG count.
+ # alert on high commit latency...but how high is too high
+ - name: mds
+ rules:
+ # no mds metrics are exported yet
+ - name: mgr
+ rules:
+ # no mgr metrics are exported yet
+ - name: pgs
+ rules:
+ - alert: pgs inactive
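+ # ceph_pg_total - ceph_pg_active counts PGs that are not in an active state; the ceph_pool_metadata join adds the pool name label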
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
+ Inactive placement groups aren't able to serve read/write
+ requests.
+ - alert: pgs unclean
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
+ for: 15m
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
+ Unclean PGs haven't been able to completely recover from a
+ previous failure.
+ - name: nodes
+ rules:
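+ # these rules use node_exporter metrics (node_filesystem_*, node_network_*, node_uname_info)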
+ - alert: root volume full
+ expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
+ for: 5m
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: >
+ Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
+
+ # alert on NIC packet error and drop rates ≥ 0.01% of packets or ≥ 10 packets per minute
+ - alert: network packets dropped
+ expr: |
+ (
+ increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_drop_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_drop_total{device!="lo"}[1m])
+ ) >= 10
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Node {{ $labels.instance }} is dropping at least 0.01% of packets
+ or at least 10 packets per minute on interface {{ $labels.device }}.
+
+ - alert: network packet errors
+ expr: |
+ (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) / (
+ increase(node_network_receive_packets_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_packets_total{device!="lo"}[1m])
+ ) >= 0.0001 or (
+ increase(node_network_receive_errs_total{device!="lo"}[1m]) +
+ increase(node_network_transmit_errs_total{device!="lo"}[1m])
+ ) >= 10
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Node {{ $labels.instance }} is seeing packet errors on at least 0.01%
+ of packets or at least 10 packets per minute on interface {{ $labels.device }}.
+
+ - alert: storage filling up
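+ # linear extrapolation of free bytes over the last 2 days, projected 5 days (3600 * 24 * 5 seconds) ahead; node_uname_info adds the nodename label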
+ expr: |
+ predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
+ on(instance) group_left(nodename) node_uname_info < 0
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
+ will be full in less than 5 days assuming the average fill-up
+ rate of the past 48 hours.
+
+ - alert: MTU Mismatch
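+ # compares each non-loopback interface's MTU against the median (quantile 0.5) MTU of all reporting interfaces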
+ expr: node_network_mtu_bytes{device!="lo"} != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
+ labels:
+ severity: warning
+ type: ceph_default
+ oid: 1.3.6.1.4.1.50495.15.1.2.8.5
+ annotations:
+ description: >
+ Node {{ $labels.instance }} has an MTU of {{ $value }} on device
+ {{ $labels.device }}, which differs from the median MTU of all monitored interfaces.
+
+ - name: pools
+ rules:
+ - alert: pool full
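+ # stored / (stored + max_avail) is the fraction of the pool's usable capacity in use; ceph_pool_metadata supplies the pool name label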
+ expr: |
+ ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
+ * on(pool_id) group_right ceph_pool_metadata * 100 > 90
+ labels:
+ severity: critical
+ type: ceph_default
+ annotations:
+ description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.
+
+ - alert: pool filling up
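+ # projects ceph_pool_stored 5 days ahead from the last 2 days of growth and fires if that exceeds the pool's total capacity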
+ expr: |
+ (
+ predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
+ >= ceph_pool_stored + ceph_pool_max_avail
+ ) * on(pool_id) group_left(name) ceph_pool_metadata
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ Pool {{ $labels.name }} will be full in less than 5 days
+ assuming the average fill-up rate of the past 48 hours.
+
+ - name: healthchecks
+ rules:
+ - alert: Slow OSD Ops
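+ # ceph_healthcheck_slow_ops mirrors the cluster's SLOW_OPS health check (requests exceeding osd_op_complaint_time)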
+ expr: ceph_healthcheck_slow_ops > 0
+ for: 30s
+ labels:
+ severity: warning
+ type: ceph_default
+ annotations:
+ description: >
+ {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded).