summaryrefslogtreecommitdiffstats
path: root/monitoring/snmp/CEPH-MIB.txt
blob: f54cb3610377a68f29f5310d704559dff80c851b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
CEPH-MIB DEFINITIONS ::= BEGIN

IMPORTS
    MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
        FROM SNMPv2-SMI
    MODULE-COMPLIANCE, NOTIFICATION-GROUP
        FROM SNMPv2-CONF
;

-- Linting information:
--
-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
--
-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
--

-- Module identity: registers the Ceph MIB root at { enterprises 50495 } and
-- carries the organization, contact details and revision history.
ceph MODULE-IDENTITY
    LAST-UPDATED
        "202111010000Z" -- Nov 01, 2021
    ORGANIZATION
        "The Ceph Project
         https://ceph.io"
    CONTACT-INFO
        "Email: <dev@ceph.io>

        Send comments to: <dev@ceph.io>"
    DESCRIPTION
        "The MIB module for Ceph. In its current form it only
        supports Notifications, since Ceph itself doesn't provide
        any SNMP agent functionality.

        Notifications are provided through a Prometheus/Alertmanager
        webhook passing alerts to an external gateway service that is
        responsible for formatting, forwarding and authenticating to
        the SNMP receiver.
        "
    REVISION
        "202111010000Z" --Nov 01, 2021
    DESCRIPTION
        "Latest version including the following updates:

        - MIB restructure to align with linting
        - names shortened and simplified (less verbose)
        - Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
          - objects removed
          - notifications updated
        - Added module compliance
        - Updated to latest prometheus alert rule definitions
        "
    ::= { enterprises 50495 }

-- Top-level subtrees directly beneath the ceph module root.
cephCluster       OBJECT IDENTIFIER ::= { ceph 1 }
cephConformance   OBJECT IDENTIFIER ::= { ceph 2 }

-- cephMetadata is a placeholder for possible future expansion via an agent
-- where we could provide an overview of the cluster's configuration
cephMetadata      OBJECT IDENTIFIER ::= { cephCluster 1 }
cephNotifications OBJECT IDENTIFIER ::= { cephCluster 2 }

-- Parent node for the notifications that are triggered by Prometheus alert rules.
prometheus OBJECT IDENTIFIER ::= { cephNotifications 1 }

--
-- Notifications: first we define the notification 'branches' for the
-- different categories of notifications / alerts
--
-- Every branch is a direct child of the prometheus node; the number in
-- braces is the sub-identifier of that category.
promGeneric       OBJECT IDENTIFIER ::= { prometheus 1 }
promHealthStatus  OBJECT IDENTIFIER ::= { prometheus 2 }
promMon           OBJECT IDENTIFIER ::= { prometheus 3 }
promOsd           OBJECT IDENTIFIER ::= { prometheus 4 }
promMds           OBJECT IDENTIFIER ::= { prometheus 5 }
promMgr           OBJECT IDENTIFIER ::= { prometheus 6 }
promPGs           OBJECT IDENTIFIER ::= { prometheus 7 }
promNode          OBJECT IDENTIFIER ::= { prometheus 8 }
promPool          OBJECT IDENTIFIER ::= { prometheus 9 }
promRados         OBJECT IDENTIFIER ::= { prometheus 10 }
promCephadm       OBJECT IDENTIFIER ::= { prometheus 11 }
promPrometheus    OBJECT IDENTIFIER ::= { prometheus 12 }

-- Generic notifications, registered under the promGeneric branch.
-- promGenericNotification is the fallback used when a Prometheus rule
-- carries no OID of its own.
promGenericNotification NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Generic alert issued when the Prometheus rule doesn't provide an OID."
::= { promGeneric 1 }

promGenericDaemonCrash NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "One or more daemons have crashed recently, and are yet to be archived"
::= { promGeneric 2 }

-- Cluster health status notifications, registered under the
-- promHealthStatus branch.
promHealthStatusError NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Ceph in health_error state for too long."
::= { promHealthStatus 1 }

promHealthStatusWarning NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Ceph in health_warn for too long."
::= { promHealthStatus 2 }

-- Monitor (MON) notifications, registered under the promMon branch.
promMonLowQuorum NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Monitor count in quorum is low."
::= { promMon 1 }

promMonDiskSpaceCritical NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Monitor diskspace is critically low."
::= { promMon 2 }

-- OSD notifications, registered under the promOsd branch.
promOsdDownHigh NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A high number of OSDs are down."
::= { promOsd 1 }

promOsdDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "One or more OSDs down."
::= { promOsd 2 }

promOsdNearFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "An OSD is dangerously full."
::= { promOsd 3 }

promOsdFlapping NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "An OSD was marked down and back up at least once a minute for 5 minutes."
::= { promOsd 4 }

promOsdHighPgDeviation NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "An OSD deviates by more than 30% from average PG count."
::= { promOsd 5 }

promOsdFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "An OSD has reached its full threshold."
::= { promOsd 6 }

promOsdHighPredictedFailures NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Normal self healing unable to cope with the number of devices predicted to fail."
::= { promOsd 7 }

promOsdHostDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Ceph OSD host is down."
::= { promOsd 8 }

-- MDS / CephFS notifications, registered under the promMds branch.
promMdsDamaged NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephfs filesystem is damaged."
::= { promMds 1 }

promMdsReadOnly NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephfs filesystem marked as READ-ONLY"
::= { promMds 2 }

promMdsOffline NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephfs filesystem is unavailable/offline."
::= { promMds 3 }

promMdsDegraded NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephfs filesystem is in a degraded state."
::= { promMds 4 }

promMdsNoStandby NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephfs MDS daemon failure, no standby available"
::= { promMds 5 }

-- Manager (mgr) notifications, registered under the promMgr branch.
promMgrModuleCrash NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Ceph mgr module has crashed recently"
::= { promMgr 1 }

promMgrPrometheusInactive NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Ceph mgr prometheus module not responding"
::= { promMgr 2 }

-- Placement group (PG) notifications, registered under the promPGs branch.
promPGsInactive NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "One or more PGs are inactive for more than 5 minutes."
::= { promPGs 1 }

promPGsUnclean NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "One or more PGs are not clean for more than 15 minutes."
::= { promPGs 2 }

promPGsUnavailable NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "One or more PGs are unavailable, blocking I/O to those objects."
::= { promPGs 3 }

promPGsDamaged NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "One or more PGs are damaged."
::= { promPGs 4 }

promPGsRecoveryFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "PG recovery is impaired due to full OSDs."
::= { promPGs 5 }

promPGsBackfillFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "PG backfill is impaired due to full OSDs."
::= { promPGs 6 }

-- Host / node level notifications, registered under the promNode branch.
promNodeRootVolumeFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Root volume (OSD and MON store) is dangerously full (< 5% free)."
::= { promNode 1 }

promNodeNetworkPacketDrops NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A node experiences packet drop > 1 packet/s on an interface."
::= { promNode 2 }

promNodeNetworkPacketErrors NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A node experiences packet errors > 1 packet/s on an interface."
::= { promNode 3 }

promNodeStorageFilling NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
::= { promNode 4 }

-- Pool capacity notifications, registered under the promPool branch.
promPoolFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A pool is at 90% capacity or over."
::= { promPool 1 }

promPoolFilling NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
::= { promPool 2 }

-- RADOS object notifications, registered under the promRados branch.
promRadosUnfound NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "A RADOS object can not be found, even though all OSDs are online."
::= { promRados 1 }

-- Cephadm notifications, registered under the promCephadm branch.
promCephadmDaemonDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephadm has determined that a daemon is down."
::= { promCephadm 1 }

promCephadmUpgradeFailure NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "Cephadm attempted to upgrade the cluster and encountered a problem."
::= { promCephadm 2 }

-- Notifications about the Prometheus scrape setup, registered under the
-- promPrometheus branch.
promPrometheusJobMissing NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION "The prometheus scrape job is not defined."
::= { promPrometheus 1 }
-- ---------------------------------------------------------- --
-- Ceph MIB - Conformance Information
-- ---------------------------------------------------------- --

-- Subtrees of cephConformance: one holds the conformance groups, the other
-- holds the compliance statements.
cephAlertGroups   OBJECT IDENTIFIER ::= { cephConformance 1 }
cephCompliances   OBJECT IDENTIFIER ::= { cephConformance 2 }

-- ---------------------------------------------------------- --
-- units of conformance
-- ---------------------------------------------------------- --

-- ---------------------------------------------------------- --
-- The Trap Notification Group
-- ---------------------------------------------------------- --

-- Conformance group listing the NOTIFICATION-TYPE definitions of this module,
-- in the same order as they are defined. When a notification is added or
-- removed, update this list to match.
-- NOTE(review): registered as { cephAlertGroups 2 }; no sibling with
-- sub-identifier 1 is defined in this file.
cephNotificationGroup NOTIFICATION-GROUP
    NOTIFICATIONS {
        promGenericNotification,
        promGenericDaemonCrash,
        promHealthStatusError,
        promHealthStatusWarning,
        promMonLowQuorum,
        promMonDiskSpaceCritical,
        promOsdDownHigh,
        promOsdDown,
        promOsdNearFull,
        promOsdFlapping,
        promOsdHighPgDeviation,
        promOsdFull,
        promOsdHighPredictedFailures,
        promOsdHostDown,
        promMdsDamaged,
        promMdsReadOnly,
        promMdsOffline,
        promMdsDegraded,
        promMdsNoStandby,
        promMgrModuleCrash,
        promMgrPrometheusInactive,
        promPGsInactive,
        promPGsUnclean,
        promPGsUnavailable,
        promPGsDamaged,
        promPGsRecoveryFull,
        promPGsBackfillFull,
        promNodeRootVolumeFull,
        promNodeNetworkPacketDrops,
        promNodeNetworkPacketErrors,
        promNodeStorageFilling,
        promPoolFull,
        promPoolFilling,
        promRadosUnfound,
        promCephadmDaemonDown,
        promCephadmUpgradeFailure,
        promPrometheusJobMissing
    }
    STATUS current
    DESCRIPTION
        "A collection of notifications triggered by the Prometheus
        rules to convey Ceph cluster state"
    ::= { cephAlertGroups 2 }

-- ---------------------------------------------------------- --
-- compliance statements
-- ---------------------------------------------------------- --

-- Compliance statement for the Ceph MIB: cephNotificationGroup is the only
-- mandatory group.
cephCompliance MODULE-COMPLIANCE
    STATUS current
    DESCRIPTION
        "The Compliance statement for the Ceph MIB"
    MODULE -- no module name given, so this refers to the enclosing module
        MANDATORY-GROUPS {
            cephNotificationGroup
        }
    ::= { cephCompliances 1 }

END