conf.d/health.d/disks.conf


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

# -----------------------------------------------------------------------------
# make sure we collect values for each disk

# for mount points
template: disk_space_last_collected_secs
      on: disk.space
families: *
    calc: $now - $last_collected_t
   units: seconds ago
   every: 10s
    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
   delay: down 5m multiplier 1.5 max 1h
    info: number of seconds since the last successful data collection of the mount point
      to: sysadmin

# for block devices
template: disk_last_collected_secs
      on: disk.io
families: *
    calc: $now - $last_collected_t
   units: seconds ago
   every: 10s
    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
   delay: down 5m multiplier 1.5 max 1h
    info: number of seconds since the last successful data collection of the block device
      to: sysadmin


# -----------------------------------------------------------------------------
# low disk space

# checking the latest collected values
# raise an alarm if the disk is low on
# available disk space

template: disk_space_usage
      on: disk.space
families: *
    calc: $used * 100 / ($avail + $used)
   units: %
   every: 1m
    warn: $this > (($status >= $WARNING ) ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: up 1m down 15m multiplier 1.5 max 1h
    info: current disk space usage
      to: sysadmin

template: disk_inode_usage
      on: disk.inodes
families: *
    calc: $used * 100 / ($avail + $used)
   units: %
   every: 1m
    warn: $this > (($status >= $WARNING)  ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: up 1m down 15m multiplier 1.5 max 1h
    info: current disk inode usage
      to: sysadmin


# -----------------------------------------------------------------------------
# disk fill rate

# calculate the rate the disk fills
# use as base, the available space change
# during the last hour

# this is just a calculation - it has no alarm
# we will use it in the next template to find
# the hours remaining

template: disk_fill_rate
      on: disk.space
families: *
  lookup: min -10m at -50m unaligned of avail
    calc: ($this - $avail) / (($now - $after) / 3600)
   every: 1m
   units: GB/hour
    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour


# calculate the hours remaining
# if the disk continues to fill
# in this rate

template: out_of_disk_space_time
      on: disk.space
families: *
    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (0)
   units: hours
   every: 10s
    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
   delay: down 15m multiplier 1.2 max 1h
    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
      to: sysadmin


# -----------------------------------------------------------------------------
# disk congestion

# raise an alarm if the disk is congested
# by calculating the average disk utilization
# for the last 10 minutes

template: 10min_disk_utilization
      on: disk.util
families: *
  lookup: average -10m unaligned
   units: %
   every: 1m
   green: 90
     red: 98
    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
   delay: down 15m multiplier 1.2 max 1h
    info: the percentage of time the disk was busy, during the last 10 minutes
      to: sysadmin


# raise an alarm if the disk backlog
# is above 1000ms (1s) per second
# for 10 minutes
# (i.e. the disk cannot catch up)

template: 10min_disk_backlog
      on: disk.backlog
families: *
  lookup: average -10m unaligned
   units: ms
   every: 1m
   green: 2000
     red: 5000
    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
   delay: down 15m multiplier 1.2 max 1h
    info: average of the kernel estimated disk backlog, for the last 10 minutes
      to: sysadmin