1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
|
CEPH-MIB DEFINITIONS ::= BEGIN
IMPORTS
MODULE-IDENTITY, NOTIFICATION-TYPE, enterprises
FROM SNMPv2-SMI
MODULE-COMPLIANCE, NOTIFICATION-GROUP
FROM SNMPv2-CONF
;
-- Linting information:
--
-- # smilint -l 6 -i notification-not-reversible ./CEPH-MIB.txt
--
-- ignore: notification-not-reversible since our SNMP gateway doesn't use SNMPv1
--
-- Module identity for CEPH-MIB, registered at enterprises.50495.
-- LAST-UPDATED carries the same timestamp as the single REVISION entry below.
ceph MODULE-IDENTITY
LAST-UPDATED
"202111010000Z" -- Nov 01, 2021
ORGANIZATION
"The Ceph Project
https://ceph.io"
CONTACT-INFO
"Email: <dev@ceph.io>
Send comments to: <dev@ceph.io>"
DESCRIPTION
"The MIB module for Ceph. In its current form it only
supports Notifications, since Ceph itself doesn't provide
any SNMP agent functionality.
Notifications are provided through a Prometheus/Alertmanager
webhook passing alerts to an external gateway service that is
responsible for formatting, forwarding and authenticating to
the SNMP receiver.
"
REVISION
"202111010000Z" -- Nov 01, 2021
DESCRIPTION
"Latest version including the following updates;
- MIB restructure to align with linting
- names shortened and simplified (less verbose)
- Simplified structure due to switch to https://github.com/maxwo/snmp_notifier
- objects removed
- notifications updated
- Added module compliance
- Updated to latest prometheus alert rule definitions
"
::= { enterprises 50495 }
-- Top-level layout beneath the ceph root (enterprises.50495):
--   ceph.1  cephCluster      cluster subtree (metadata placeholder, notifications)
--   ceph.2  cephConformance  conformance subtree (groups, compliance statements)
cephCluster         OBJECT IDENTIFIER ::= { ceph 1 }
cephConformance     OBJECT IDENTIFIER ::= { ceph 2 }

-- cephMetadata is a placeholder for possible future expansion via an agent,
-- where an overview of the cluster's configuration could be provided.
cephMetadata        OBJECT IDENTIFIER ::= { cephCluster 1 }
cephNotifications   OBJECT IDENTIFIER ::= { cephCluster 2 }

-- Parent node of the prom* notification branches.
prometheus          OBJECT IDENTIFIER ::= { cephNotifications 1 }
--
-- Notification branches: one sub-identifier under 'prometheus' for each
-- category of notification / alert. Every branch resolves to
-- enterprises.50495.1.2.1.<n>, where <n> is the number assigned below.
--
promGeneric         OBJECT IDENTIFIER ::= { prometheus 1 }
promHealthStatus    OBJECT IDENTIFIER ::= { prometheus 2 }
promMon             OBJECT IDENTIFIER ::= { prometheus 3 }
promOsd             OBJECT IDENTIFIER ::= { prometheus 4 }
promMds             OBJECT IDENTIFIER ::= { prometheus 5 }
promMgr             OBJECT IDENTIFIER ::= { prometheus 6 }
promPGs             OBJECT IDENTIFIER ::= { prometheus 7 }
promNode            OBJECT IDENTIFIER ::= { prometheus 8 }
promPool            OBJECT IDENTIFIER ::= { prometheus 9 }
promRados           OBJECT IDENTIFIER ::= { prometheus 10 }
promCephadm         OBJECT IDENTIFIER ::= { prometheus 11 }
promPrometheus      OBJECT IDENTIFIER ::= { prometheus 12 }
-- Catch-all notification for Prometheus rules that supply no OID.
-- Resolves to enterprises.50495.1.2.1.1.1; declares no OBJECTS clause.
promGenericNotification NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Generic alert issued when the Prometheus rule doesn't provide an OID."
    ::= { promGeneric 1 }
-- Recent daemon crashes that have not yet been archived.
-- Resolves to enterprises.50495.1.2.1.1.2; declares no OBJECTS clause.
promGenericDaemonCrash NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more daemons have crashed recently, and are yet to be archived"
    ::= { promGeneric 2 }
-- Cluster has remained in the health_error state for too long.
-- Resolves to enterprises.50495.1.2.1.2.1; declares no OBJECTS clause.
promHealthStatusError NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph in health_error state for too long."
    ::= { promHealthStatus 1 }
-- Cluster has remained in health_warn for too long.
-- Resolves to enterprises.50495.1.2.1.2.2; declares no OBJECTS clause.
promHealthStatusWarning NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph in health_warn for too long."
    ::= { promHealthStatus 2 }
-- Number of monitors in quorum is low.
-- Resolves to enterprises.50495.1.2.1.3.1; declares no OBJECTS clause.
promMonLowQuorum NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Monitor count in quorum is low."
    ::= { promMon 1 }
-- Monitor disk space is critically low.
-- Resolves to enterprises.50495.1.2.1.3.2; declares no OBJECTS clause.
promMonDiskSpaceCritical NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Monitor diskspace is critically low."
    ::= { promMon 2 }
-- A high number of OSDs are down.
-- Resolves to enterprises.50495.1.2.1.4.1; declares no OBJECTS clause.
promOsdDownHigh NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A high number of OSDs are down."
    ::= { promOsd 1 }
-- At least one OSD is down.
-- Resolves to enterprises.50495.1.2.1.4.2; declares no OBJECTS clause.
promOsdDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more OSDs down."
    ::= { promOsd 2 }
-- An OSD is dangerously full.
-- Resolves to enterprises.50495.1.2.1.4.3; declares no OBJECTS clause.
promOsdNearFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD is dangerously full."
    ::= { promOsd 3 }
-- An OSD keeps toggling between down and up (flapping).
-- Resolves to enterprises.50495.1.2.1.4.4; declares no OBJECTS clause.
promOsdFlapping NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD was marked down and back up at least once a minute for 5 minutes."
    ::= { promOsd 4 }
-- An OSD's PG count deviates from the average by more than 30%.
-- Resolves to enterprises.50495.1.2.1.4.5; declares no OBJECTS clause.
promOsdHighPgDeviation NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD deviates by more than 30% from average PG count."
    ::= { promOsd 5 }
-- An OSD has reached its full threshold.
-- Resolves to enterprises.50495.1.2.1.4.6; declares no OBJECTS clause.
promOsdFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "An OSD has reached its full threshold."
    ::= { promOsd 6 }
-- Too many devices are predicted to fail for normal self healing to cope.
-- Resolves to enterprises.50495.1.2.1.4.7; declares no OBJECTS clause.
promOsdHighPredictedFailures NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Normal self healing unable to cope with the number of devices predicted to fail."
    ::= { promOsd 7 }
-- A Ceph OSD host is down.
-- Resolves to enterprises.50495.1.2.1.4.8; declares no OBJECTS clause.
promOsdHostDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph OSD host is down."
    ::= { promOsd 8 }
-- The Cephfs filesystem is damaged.
-- Resolves to enterprises.50495.1.2.1.5.1; declares no OBJECTS clause.
promMdsDamaged NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem is damaged."
    ::= { promMds 1 }
-- The Cephfs filesystem has been marked read-only.
-- Resolves to enterprises.50495.1.2.1.5.2; declares no OBJECTS clause.
promMdsReadOnly NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem marked as READ-ONLY"
    ::= { promMds 2 }
-- The Cephfs filesystem is unavailable/offline.
-- Resolves to enterprises.50495.1.2.1.5.3; declares no OBJECTS clause.
promMdsOffline NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem is unavailable/offline."
    ::= { promMds 3 }
-- The Cephfs filesystem is in a degraded state.
-- Resolves to enterprises.50495.1.2.1.5.4; declares no OBJECTS clause.
promMdsDegraded NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs filesystem is in a degraded state."
    ::= { promMds 4 }
-- A Cephfs MDS daemon failed and no standby is available.
-- Resolves to enterprises.50495.1.2.1.5.5; declares no OBJECTS clause.
promMdsNoStandby NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephfs MDS daemon failure, no standby available"
    ::= { promMds 5 }
-- A Ceph mgr module has crashed recently.
-- Resolves to enterprises.50495.1.2.1.6.1; declares no OBJECTS clause.
promMgrModuleCrash NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph mgr module has crashed recently"
    ::= { promMgr 1 }
-- The Ceph mgr prometheus module is not responding.
-- Resolves to enterprises.50495.1.2.1.6.2; declares no OBJECTS clause.
promMgrPrometheusInactive NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Ceph mgr prometheus module not responding"
    ::= { promMgr 2 }
-- PGs have been inactive for more than 5 minutes.
-- Resolves to enterprises.50495.1.2.1.7.1; declares no OBJECTS clause.
promPGsInactive NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are inactive for more than 5 minutes."
    ::= { promPGs 1 }
-- PGs have been not clean for more than 15 minutes.
-- Resolves to enterprises.50495.1.2.1.7.2; declares no OBJECTS clause.
promPGsUnclean NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are not clean for more than 15 minutes."
    ::= { promPGs 2 }
-- Unavailable PGs are blocking I/O to the objects they hold.
-- Resolves to enterprises.50495.1.2.1.7.3; declares no OBJECTS clause.
promPGsUnavailable NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are unavailable, blocking I/O to those objects."
    ::= { promPGs 3 }
-- At least one PG is damaged.
-- Resolves to enterprises.50495.1.2.1.7.4; declares no OBJECTS clause.
promPGsDamaged NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "One or more PGs are damaged."
    ::= { promPGs 4 }
-- PG recovery is impaired because OSDs are full.
-- Resolves to enterprises.50495.1.2.1.7.5; declares no OBJECTS clause.
promPGsRecoveryFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "PG recovery is impaired due to full OSDs."
    ::= { promPGs 5 }
-- PG backfill is impaired because OSDs are full.
-- Resolves to enterprises.50495.1.2.1.7.6; declares no OBJECTS clause.
promPGsBackfillFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "PG backfill is impaired due to full OSDs."
    ::= { promPGs 6 }
-- Root volume (OSD and MON store) has less than 5% free space.
-- Resolves to enterprises.50495.1.2.1.8.1; declares no OBJECTS clause.
promNodeRootVolumeFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Root volume (OSD and MON store) is dangerously full (< 5% free)."
    ::= { promNode 1 }
-- A node interface is dropping more than 1 packet per second.
-- Resolves to enterprises.50495.1.2.1.8.2; declares no OBJECTS clause.
promNodeNetworkPacketDrops NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A node experiences packet drop > 1 packet/s on an interface."
    ::= { promNode 2 }
-- A node interface is seeing more than 1 packet error per second.
-- Resolves to enterprises.50495.1.2.1.8.3; declares no OBJECTS clause.
promNodeNetworkPacketErrors NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A node experiences packet errors > 1 packet/s on an interface."
    ::= { promNode 3 }
-- A mountpoint is projected to fill up within 5 days.
-- Resolves to enterprises.50495.1.2.1.8.4; declares no OBJECTS clause.
promNodeStorageFilling NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A mountpoint will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
    ::= { promNode 4 }
-- A pool is at or above 90% capacity.
-- Resolves to enterprises.50495.1.2.1.9.1; declares no OBJECTS clause.
promPoolFull NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A pool is at 90% capacity or over."
    ::= { promPool 1 }
-- A pool is projected to fill up within 5 days.
-- Resolves to enterprises.50495.1.2.1.9.2; declares no OBJECTS clause.
promPoolFilling NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A pool will be full in less than 5 days assuming the average fillup rate of the past 48 hours."
    ::= { promPool 2 }
-- A RADOS object cannot be found although all OSDs are online.
-- Resolves to enterprises.50495.1.2.1.10.1; declares no OBJECTS clause.
promRadosUnfound NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "A RADOS object can not be found, even though all OSDs are online."
    ::= { promRados 1 }
-- Cephadm has determined that a daemon is down.
-- Resolves to enterprises.50495.1.2.1.11.1; declares no OBJECTS clause.
promCephadmDaemonDown NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephadm has determined that a daemon is down."
    ::= { promCephadm 1 }
-- A cluster upgrade attempted by Cephadm encountered a problem.
-- Resolves to enterprises.50495.1.2.1.11.2; declares no OBJECTS clause.
promCephadmUpgradeFailure NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "Cephadm attempted to upgrade the cluster and encountered a problem."
    ::= { promCephadm 2 }
-- The prometheus scrape job is not defined.
-- Resolves to enterprises.50495.1.2.1.12.1; declares no OBJECTS clause.
promPrometheusJobMissing NOTIFICATION-TYPE
    STATUS      current
    DESCRIPTION
        "The prometheus scrape job is not defined."
    ::= { promPrometheus 1 }
-- ---------------------------------------------------------- --
-- Ceph MIB - Conformance Information
-- ---------------------------------------------------------- --
-- Conformance subtree layout: cephAlertGroups is the parent of
-- cephNotificationGroup, cephCompliances the parent of cephCompliance.
cephAlertGroups     OBJECT IDENTIFIER ::= { cephConformance 1 }
cephCompliances     OBJECT IDENTIFIER ::= { cephConformance 2 }
-- ---------------------------------------------------------- --
-- units of conformance
-- ---------------------------------------------------------- --
-- ---------------------------------------------------------- --
-- The Trap Notification Group
-- ---------------------------------------------------------- --
-- Conformance group listing all 37 NOTIFICATION-TYPE definitions of this
-- module, in branch order (promGeneric first, promPrometheus last).
-- The continuation line of the DESCRIPTION text is deliberately left
-- unindented so that the string content stays unchanged.
cephNotificationGroup NOTIFICATION-GROUP
    NOTIFICATIONS {
        promGenericNotification, promGenericDaemonCrash,
        promHealthStatusError, promHealthStatusWarning,
        promMonLowQuorum, promMonDiskSpaceCritical,
        promOsdDownHigh, promOsdDown, promOsdNearFull, promOsdFlapping,
        promOsdHighPgDeviation, promOsdFull, promOsdHighPredictedFailures,
        promOsdHostDown,
        promMdsDamaged, promMdsReadOnly, promMdsOffline, promMdsDegraded,
        promMdsNoStandby,
        promMgrModuleCrash, promMgrPrometheusInactive,
        promPGsInactive, promPGsUnclean, promPGsUnavailable, promPGsDamaged,
        promPGsRecoveryFull, promPGsBackfillFull,
        promNodeRootVolumeFull, promNodeNetworkPacketDrops,
        promNodeNetworkPacketErrors, promNodeStorageFilling,
        promPoolFull, promPoolFilling,
        promRadosUnfound,
        promCephadmDaemonDown, promCephadmUpgradeFailure,
        promPrometheusJobMissing
    }
    STATUS      current
    DESCRIPTION
        "A collection of notifications triggered by the Prometheus
rules to convey Ceph cluster state"
    ::= { cephAlertGroups 2 }
-- ---------------------------------------------------------- --
-- compliance statements
-- ---------------------------------------------------------- --
-- Compliance statement for the Ceph MIB; its only mandatory group is
-- cephNotificationGroup.
cephCompliance MODULE-COMPLIANCE
    STATUS      current
    DESCRIPTION
        "The Compliance statement for the Ceph MIB"
    MODULE      -- no module name follows, i.e. this module
        MANDATORY-GROUPS { cephNotificationGroup }
    ::= { cephCompliances 1 }
END
|