diff options
Diffstat (limited to '')
-rw-r--r-- | monitoring/snmp/CEPH-MIB.txt | 337 |
1 files changed, 337 insertions, 0 deletions
diff --git a/monitoring/snmp/CEPH-MIB.txt b/monitoring/snmp/CEPH-MIB.txt new file mode 100644 index 000000000..f54cb3610 --- /dev/null +++ b/monitoring/snmp/CEPH-MIB.txt @@ -0,0 +1,337 @@ +CEPH-MIB DEFINITIONS ::= BEGIN + +IMPORTS + MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises + FROM SNMPv2-SMI + MODULE-COMPLIANCE, NOTIFICATION-GROUP + FROM SNMPv2-CONF +; + +-- Linting information: +-- +-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt +-- +-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1 +-- + +ceph MODULE-IDENTITY + LAST-UPDATED + "202111010000Z" -- Nov 01, 2021 + ORGANIZATION + "The Ceph Project + https://ceph.io" + CONTACT-INFO + "Email: <dev@ceph.io> + + Send comments to: <dev@ceph.io>" + DESCRIPTION + "The MIB module for Ceph. In its current form it only + supports Notifications, since Ceph itself doesn't provide + any SNMP agent functionality. + + Notifications are provided through a Prometheus/Alertmanager + webhook passing alerts to an external gateway service that is + responsible for formatting, forwarding and authenticating to + the SNMP receiver.
+ " + REVISION + "202111010000Z" --Nov 01, 2021 + DESCRIPTION + "Latest version including the following updates: + + - MIB restructure to align with linting + - names shortened and simplified (less verbose) + - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier + - objects removed + - notifications updated + - Added module compliance + - Updated to latest prometheus alert rule definitions + " + ::= { enterprises 50495 } + +cephCluster OBJECT IDENTIFIER ::= { ceph 1 } +cephConformance OBJECT IDENTIFIER ::= { ceph 2 } + +-- cephMetadata is a placeholder for possible future expansion via an agent +-- where we could provide an overview of the cluster's configuration +cephMetadata OBJECT IDENTIFIER ::= { cephCluster 1 } +cephNotifications OBJECT IDENTIFIER ::= { cephCluster 2 } + +prometheus OBJECT IDENTIFIER ::= { cephNotifications 1 } + +-- +-- Notifications: first we define the notification 'branches' for the +-- different categories of notifications / alerts +promGeneric OBJECT IDENTIFIER ::= { prometheus 1 } +promHealthStatus OBJECT IDENTIFIER ::= { prometheus 2 } +promMon OBJECT IDENTIFIER ::= { prometheus 3 } +promOsd OBJECT IDENTIFIER ::= { prometheus 4 } +promMds OBJECT IDENTIFIER ::= { prometheus 5 } +promMgr OBJECT IDENTIFIER ::= { prometheus 6 } +promPGs OBJECT IDENTIFIER ::= { prometheus 7 } +promNode OBJECT IDENTIFIER ::= { prometheus 8 } +promPool OBJECT IDENTIFIER ::= { prometheus 9 } +promRados OBJECT IDENTIFIER ::= { prometheus 10 } +promCephadm OBJECT IDENTIFIER ::= { prometheus 11 } +promPrometheus OBJECT IDENTIFIER ::= { prometheus 12 } + +promGenericNotification NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Generic alert issued when the Prometheus rule doesn't provide an OID."
+::= { promGeneric 1 } + +promGenericDaemonCrash NOTIFICATION-TYPE + STATUS current + DESCRIPTION "One or more daemons have crashed recently, and are yet to be archived" +::= { promGeneric 2 } + +promHealthStatusError NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Ceph in health_error state for too long." +::= { promHealthStatus 1 } + +promHealthStatusWarning NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Ceph in health_warn for too long." +::= { promHealthStatus 2 } + +promMonLowQuorum NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Monitor count in quorum is low." +::= { promMon 1 } + +promMonDiskSpaceCritical NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Monitor diskspace is critically low." +::= { promMon 2 } + +promOsdDownHigh NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A high number of OSDs are down." +::= { promOsd 1 } + +promOsdDown NOTIFICATION-TYPE + STATUS current + DESCRIPTION "One or more OSDs are down." +::= { promOsd 2 } + +promOsdNearFull NOTIFICATION-TYPE + STATUS current + DESCRIPTION "An OSD is dangerously full." +::= { promOsd 3 } + +promOsdFlapping NOTIFICATION-TYPE + STATUS current + DESCRIPTION "An OSD was marked down and back up at least once a minute for 5 minutes." +::= { promOsd 4 } + +promOsdHighPgDeviation NOTIFICATION-TYPE + STATUS current + DESCRIPTION "An OSD deviates by more than 30% from average PG count." +::= { promOsd 5 } + +promOsdFull NOTIFICATION-TYPE + STATUS current + DESCRIPTION "An OSD has reached its full threshold." +::= { promOsd 6 } + +promOsdHighPredictedFailures NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Normal self healing unable to cope with the number of devices predicted to fail." +::= { promOsd 7 } + +promOsdHostDown NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Ceph OSD host is down." +::= { promOsd 8 } + +promMdsDamaged NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephfs filesystem is damaged."
+::= { promMds 1 } + +promMdsReadOnly NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephfs filesystem marked as READ-ONLY" +::= { promMds 2 } + +promMdsOffline NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephfs filesystem is unavailable/offline." +::= { promMds 3 } + +promMdsDegraded NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephfs filesystem is in a degraded state." +::= { promMds 4 } + +promMdsNoStandby NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephfs MDS daemon failure, no standby available" +::= { promMds 5 } + +promMgrModuleCrash NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Ceph mgr module has crashed recently" +::= { promMgr 1 } + +promMgrPrometheusInactive NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Ceph mgr prometheus module not responding" +::= { promMgr 2 } + +promPGsInactive NOTIFICATION-TYPE + STATUS current + DESCRIPTION "One or more PGs are inactive for more than 5 minutes." +::= { promPGs 1 } + +promPGsUnclean NOTIFICATION-TYPE + STATUS current + DESCRIPTION "One or more PGs are not clean for more than 15 minutes." +::= { promPGs 2 } + +promPGsUnavailable NOTIFICATION-TYPE + STATUS current + DESCRIPTION "One or more PGs are unavailable, blocking I/O to those objects." +::= { promPGs 3 } + +promPGsDamaged NOTIFICATION-TYPE + STATUS current + DESCRIPTION "One or more PGs are damaged." +::= { promPGs 4 } + +promPGsRecoveryFull NOTIFICATION-TYPE + STATUS current + DESCRIPTION "PG recovery is impaired due to full OSDs." +::= { promPGs 5 } + +promPGsBackfillFull NOTIFICATION-TYPE + STATUS current + DESCRIPTION "PG backfill is impaired due to full OSDs." +::= { promPGs 6 } + +promNodeRootVolumeFull NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)." +::= { promNode 1 } + +promNodeNetworkPacketDrops NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface."
+::= { promNode 2 } + +promNodeNetworkPacketErrors NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface." +::= { promNode 3 } + +promNodeStorageFilling NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours." +::= { promNode 4 } + +promPoolFull NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A pool is at 90% capacity or over." +::= { promPool 1 } + +promPoolFilling NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours." +::= { promPool 2 } + +promRadosUnfound NOTIFICATION-TYPE + STATUS current + DESCRIPTION "A RADOS object cannot be found, even though all OSDs are online." +::= { promRados 1 } + +promCephadmDaemonDown NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephadm has determined that a daemon is down." +::= { promCephadm 1 } + +promCephadmUpgradeFailure NOTIFICATION-TYPE + STATUS current + DESCRIPTION "Cephadm attempted to upgrade the cluster and encountered a problem." +::= { promCephadm 2 } + +promPrometheusJobMissing NOTIFICATION-TYPE + STATUS current + DESCRIPTION "The prometheus scrape job is not defined."
+::= { promPrometheus 1 } +-- ---------------------------------------------------------- -- +-- CEPH-MIB - Conformance Information +-- ---------------------------------------------------------- -- + +cephAlertGroups OBJECT IDENTIFIER ::= { cephConformance 1 } +cephCompliances OBJECT IDENTIFIER ::= { cephConformance 2 } + +-- ---------------------------------------------------------- -- +-- units of conformance +-- ---------------------------------------------------------- -- + +-- ---------------------------------------------------------- -- +-- The Trap Notification Group +-- ---------------------------------------------------------- -- + +cephNotificationGroup NOTIFICATION-GROUP + NOTIFICATIONS { + promGenericNotification, + promGenericDaemonCrash, + promHealthStatusError, + promHealthStatusWarning, + promMonLowQuorum, + promMonDiskSpaceCritical, + promOsdDownHigh, + promOsdDown, + promOsdNearFull, + promOsdFlapping, + promOsdHighPgDeviation, + promOsdFull, + promOsdHighPredictedFailures, + promOsdHostDown, + promMdsDamaged, + promMdsReadOnly, + promMdsOffline, + promMdsDegraded, + promMdsNoStandby, + promMgrModuleCrash, + promMgrPrometheusInactive, + promPGsInactive, + promPGsUnclean, + promPGsUnavailable, + promPGsDamaged, + promPGsRecoveryFull, + promPGsBackfillFull, + promNodeRootVolumeFull, + promNodeNetworkPacketDrops, + promNodeNetworkPacketErrors, + promNodeStorageFilling, + promPoolFull, + promPoolFilling, + promRadosUnfound, + promCephadmDaemonDown, + promCephadmUpgradeFailure, + promPrometheusJobMissing + } + STATUS current + DESCRIPTION + "A collection of notifications triggered by the Prometheus + rules to convey Ceph cluster state" + ::= { cephAlertGroups 2 } + +-- ---------------------------------------------------------- -- +-- compliance statements +-- ---------------------------------------------------------- -- + +cephCompliance MODULE-COMPLIANCE + STATUS current + DESCRIPTION + "The Compliance statement for the Ceph MIB" + MODULE
MANDATORY-GROUPS { + cephNotificationGroup + } + ::= { cephCompliances 1 } + +END |