Adding upstream version 16.2.11+ds.upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /monitoring/snmp
parent: Initial commit. (diff)
download: ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
2 files changed, 391 insertions, 0 deletions
diff --git a/monitoring/snmp/CEPH-MIB.txt b/monitoring/snmp/CEPH-MIB.txt
new file mode 100644
index 000000000..f54cb3610
--- /dev/null
+++ b/monitoring/snmp/CEPH-MIB.txt
@@ -0,0 +1,337 @@
+CEPH-MIB DEFINITIONS ::= BEGIN
+
+IMPORTS
+    MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
+        FROM SNMPv2-SMI
+    MODULE-COMPLIANCE, NOTIFICATION-GROUP
+        FROM SNMPv2-CONF
+;
+
+-- Linting information:
+--
+-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
+--
+-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
+--
+
+ceph MODULE-IDENTITY
+    LAST-UPDATED
+        "202111010000Z" -- Nov 01, 2021
+    ORGANIZATION
+        "The Ceph Project
+         https://ceph.io"
+    CONTACT-INFO
+        "Email: <dev@ceph.io>
+
+        Send comments to: <dev@ceph.io>"
+    DESCRIPTION
+        "The MIB module for Ceph. In it's current form it only
+        supports Notifications, since Ceph itself doesn't provide
+        any SNMP agent functionality.
+
+        Notifications are provided through a Prometheus/Alertmanager
+        webhook passing alerts to an external gateway service that is
+        responsible for formatting, forwarding and authenticating to
+        the SNMP receiver.
+        "
+    REVISION
+        "202111010000Z" --Nov 01, 2021
+    DESCRIPTION
+        "Latest version including the following updates;
+
+        - MIB restructure to align with linting
+        - names shortened and simplified (less verbose)
+        - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
+          - objects removed
+          - notifications updated
+        - Added module compliance
+        - Updated to latest prometheus alert rule definitions
+        "
+    ::= { enterprises 50495 }
+
+cephCluster       OBJECT IDENTIFIER ::= { ceph 1 }
+cephConformance   OBJECT IDENTIFIER ::= { ceph 2 }
+
+-- cephMetadata is a placeholder for possible future expansion via an agent
+-- where we could provide an overview of the clusters configuration
+cephMetadata      OBJECT IDENTIFIER ::= { cephCluster 1 }
+cephNotifications OBJECT IDENTIFIER ::= { cephCluster 2 }
+
+prometheus OBJECT IDENTIFIER ::= { cephNotifications 1 }
+
+--
+-- Notifications: first we define the notification 'branches' for the
+-- different categories of notifications / alerts
+promGeneric       OBJECT IDENTIFIER ::= { prometheus 1 }
+promHealthStatus  OBJECT IDENTIFIER ::= { prometheus 2 }
+promMon           OBJECT IDENTIFIER ::= { prometheus 3 }
+promOsd           OBJECT IDENTIFIER ::= { prometheus 4 }
+promMds           OBJECT IDENTIFIER ::= { prometheus 5 }
+promMgr           OBJECT IDENTIFIER ::= { prometheus 6 }
+promPGs           OBJECT IDENTIFIER ::= { prometheus 7 }
+promNode          OBJECT IDENTIFIER ::= { prometheus 8 }
+promPool          OBJECT IDENTIFIER ::= { prometheus 9 }
+promRados         OBJECT IDENTIFIER ::= { prometheus 10 }
+promCephadm       OBJECT IDENTIFIER ::= { prometheus 11 }
+promPrometheus    OBJECT IDENTIFIER ::= { prometheus 12 }
+
+promGenericNotification NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Generic alert issued when the Prometheus rule doesn't provide an OID."
+::= { promGeneric 1 }
+
+promGenericDaemonCrash NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more daemons have crashed recently, and are yet to be archived"
+::= { promGeneric 2 }
+
+promHealthStatusError NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph in health_error state for too long."
+::= { promHealthStatus 1 }
+
+promHealthStatusWarning NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph in health_warn for too long."
+::= { promHealthStatus 2 }
+
+promMonLowQuorum NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Monitor count in quorum is low."
+::= { promMon 1 }
+
+promMonDiskSpaceCritical NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Monitor diskspace is critically low."
+::= { promMon 2 }
+
+promOsdDownHigh NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A high number of OSDs are down."
+::= { promOsd 1 }
+
+promOsdDown NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more Osds down."
+::= { promOsd 2 }
+
+promOsdNearFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD is dangerously full."
+::= { promOsd 3 }
+
+promOsdFlapping NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD was marked down at back up at least once a minute for 5 minutes."
+::= { promOsd 4 }
+
+promOsdHighPgDeviation NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD deviates by more then 30% from average PG count."
+::= { promOsd 5 }
+
+promOsdFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "An OSD has reached its full threshold."
+::= { promOsd 6 }
+
+promOsdHighPredictedFailures NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Normal self healing unable to cope with the number of devices predicted to fail."
+::= { promOsd 7 }
+
+promOsdHostDown NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph OSD host is down."
+::= { promOsd 8 }
+
+promMdsDamaged NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem is damaged."
+::= { promMds 1 }
+
+promMdsReadOnly NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem marked as READ-ONLY"
+::= { promMds 2 }
+
+promMdsOffline NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem is unavailable/offline."
+::= { promMds 3 }
+
+promMdsDegraded NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs filesystem is in a degraded state."
+::= { promMds 4 }
+
+promMdsNoStandby NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephfs MDS daemon failure, no standby available"
+::= { promMds 5 }
+
+promMgrModuleCrash NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph mgr module has crashed recently"
+::= { promMgr 1 }
+
+promMgrPrometheusInactive NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Ceph mgr prometheus module not responding"
+::= { promMgr 2 }
+
+promPGsInactive NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs are inactive for more than 5 minutes."
+::= { promPGs 1 }
+
+promPGsUnclean NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs are not clean for more than 15 minutes."
+::= { promPGs 2 }
+
+promPGsUnavailable NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs is unavailable, blocking I/O to those objects."
+::= { promPGs 3 }
+
+promPGsDamaged NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "One or more PGs is damaged."
+::= { promPGs 4 }
+
+promPGsRecoveryFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "PG recovery is impaired due to full OSDs."
+::= { promPGs 5 }
+
+promPGsBackfillFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "PG backfill is impaired due to full OSDs."
+::= { promPGs 6 }
+
+promNodeRootVolumeFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)."
+::= { promNode 1 }
+
+promNodeNetworkPacketDrops NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface."
+::= { promNode 2 }
+
+promNodeNetworkPacketErrors NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface."
+::= { promNode 3 }
+
+promNodeStorageFilling NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A mountpoint will be full in less then 5 days assuming the average fillup rate of the past 48 hours."
+::= { promNode 4 }
+
+promPoolFull NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A pool is at 90% capacity or over."
+::= { promPool 1 }
+
+promPoolFilling NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A pool will be full in less then 5 days assuming the average fillup rate of the past 48 hours."
+::= { promPool 2 }
+
+promRadosUnfound NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "A RADOS object can not be found, even though all OSDs are online."
+::= { promRados 1 }
+
+promCephadmDaemonDown NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephadm has determined that a daemon is down."
+::= { promCephadm 1 }
+
+promCephadmUpgradeFailure NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "Cephadm attempted to upgrade the cluster and encountered a problem."
+::= { promCephadm 2 }
+
+promPrometheusJobMissing NOTIFICATION-TYPE
+    STATUS      current
+    DESCRIPTION "The prometheus scrape job is not defined."
+::= { promPrometheus 1 }
+-- ---------------------------------------------------------- --
+-- IEEE 802.1D MIB - Conformance Information
+-- ---------------------------------------------------------- --
+
+cephAlertGroups   OBJECT IDENTIFIER ::= { cephConformance 1 }
+cephCompliances   OBJECT IDENTIFIER ::= { cephConformance 2 }
+
+-- ---------------------------------------------------------- --
+-- units of conformance
+-- ---------------------------------------------------------- --
+
+-- ---------------------------------------------------------- --
+-- The Trap Notification Group
+-- ---------------------------------------------------------- --
+
+cephNotificationGroup NOTIFICATION-GROUP
+    NOTIFICATIONS {
+        promGenericNotification,
+        promGenericDaemonCrash,
+        promHealthStatusError,
+        promHealthStatusWarning,
+        promMonLowQuorum,
+        promMonDiskSpaceCritical,
+        promOsdDownHigh,
+        promOsdDown,
+        promOsdNearFull,
+        promOsdFlapping,
+        promOsdHighPgDeviation,
+        promOsdFull,
+        promOsdHighPredictedFailures,
+        promOsdHostDown,
+        promMdsDamaged,
+        promMdsReadOnly,
+        promMdsOffline,
+        promMdsDegraded,
+        promMdsNoStandby,
+        promMgrModuleCrash,
+        promMgrPrometheusInactive,
+        promPGsInactive,
+        promPGsUnclean,
+        promPGsUnavailable,
+        promPGsDamaged,
+        promPGsRecoveryFull,
+        promPGsBackfillFull,
+        promNodeRootVolumeFull,
+        promNodeNetworkPacketDrops,
+        promNodeNetworkPacketErrors,
+        promNodeStorageFilling,
+        promPoolFull,
+        promPoolFilling,
+        promRadosUnfound,
+        promCephadmDaemonDown,
+        promCephadmUpgradeFailure,
+        promPrometheusJobMissing
+    }
+    STATUS current
+    DESCRIPTION
+        "A collection of notifications triggered by the Prometheus
+        rules to convey Ceph cluster state"
+    ::= { cephAlertGroups 2 }
+
+-- ---------------------------------------------------------- --
+-- compliance statements
+-- ---------------------------------------------------------- --
+
+cephCompliance MODULE-COMPLIANCE
+    STATUS current
+    DESCRIPTION
+        "The Compliance statement for the Ceph MIB"
+    MODULE
+        MANDATORY-GROUPS {
+            cephNotificationGroup
+        }
+    ::= { cephCompliances 1 }
+
+END
diff --git a/monitoring/snmp/README.md b/monitoring/snmp/README.md
new file mode 100644
index 000000000..1a5b60955
--- /dev/null
+++ b/monitoring/snmp/README.md
@@ -0,0 +1,54 @@
+# SNMP schema
+To show the [OID](https://en.wikipedia.org/wiki/Object_identifier)'s supported by the MIB, use the snmptranslate command. Here's an example:
+```
+snmptranslate -Pu -Tz -M ~/git/ceph/monitoring/snmp:/usr/share/snmp/mibs -m CEPH-MIB
+```
+*The `snmptranslate` command is in the net-snmp-utils package*
+
+The MIB provides a NOTIFICATION only implementation since ceph doesn't have an SNMP
+agent feature.
+
+## Integration
+The SNMP MIB is has been aligned to the Prometheus rules. Any rule that defines a 
+critical alert should have a corresponding oid in the CEPH-MIB.txt file. To generate
+an SNMP notification, you must use an SNMP gateway that the Prometheus Alertmanager
+service can forward alerts through to, via it's webhooks feature.
+
+&nbsp;
+
+## SNMP Gateway
+The recommended SNMP gateway is https://github.com/maxwo/snmp_notifier. This is a widely
+used and generic SNMP gateway implementation written in go. It's usage (syntax and
+parameters) is very similar to Prometheus, AlertManager and even node-exporter.
+
+&nbsp;
+## SNMP OIDs
+The main components of the Ceph MIB is can be broken down into discrete areas
+
+
+```
+internet private enterprise   ceph   ceph    Notifications   Prometheus  Notification
+                               org  cluster   (alerts)         source      Category
+1.3.6.1   .4     .1          .50495   .1        .2               .1         .2  (Ceph Health)
+                                                                            .3  (MON)
+                                                                            .4  (OSD)
+                                                                            .5  (MDS)
+                                                                            .6  (MGR)
+                                                                            .7  (PGs)
+                                                                            .8  (Nodes)
+                                                                            .9  (Pools)
+                                                                            .10  (Rados)
+                                                                            .11 (cephadm)
+                                                                            .12 (prometheus)
+
+```
+Individual alerts are placed within the appropriate alert category. For example, to add
+a notification relating to a MGR issue, you would use the oid 1.3.6.1.4.1.50495.1.2.1.6.x
+
+The SNMP gateway also adds additional components to the SNMP notification ;
+
+| Suffix | Description |
+|--------|-------------|
+| .1 | The oid |
+| .2 | Severity of the alert. When an alert is resolved, severity is 'info', and the description is set to Status:OK|
+| .3 | Text of the alert(s) |
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /monitoring/snmp
parent	Initial commit. (diff)
download	ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip