summaryrefslogtreecommitdiffstats
path: root/health/health.d/disks.conf
blob: 2e417fd4a3e03199ced92359c122af92302e4ca3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# you can disable an alarm notification by setting the 'to' line to: silent


# -----------------------------------------------------------------------------
# low disk space

# checking the latest collected values
# raise an alarm if the disk is low on
# available disk space

 template: disk_space_usage
# Space utilization alarm, attached to the disk.space charts of linux and
# freebsd hosts.
       on: disk.space
    class: Utilization
     type: System
component: Disk
       os: linux freebsd
    hosts: *
# Pattern list on the mount_point chart label: /dev, /dev/*, /run and /run/*
# are negated (!) ahead of the catch-all *, so every other mount point matches.
chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
# Used space as a percentage of used + available space.
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
# Hysteresis: the threshold is 90%, lowered to 80% while the alarm is already
# in WARNING or a higher state.
     warn: $this > (($status >= $WARNING ) ? (80) : (90))
# The threshold is 98%, lowered to 90% while already CRITICAL. In both cases
# the alarm only goes critical while $avail is below 5.
# NOTE(review): $avail is in the chart's own unit - presumably GiB; confirm.
     crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
    delay: up 1m down 15m multiplier 1.5 max 1h
  summary: Disk ${label:mount_point} space usage
     info: Total space utilization of disk ${label:mount_point}
       to: sysadmin

 template: disk_inode_usage
# Inode utilization alarm, attached to the disk.inodes charts of linux and
# freebsd hosts.
       on: disk.inodes
    class: Utilization
     type: System
component: Disk
       os: linux freebsd
    hosts: *
# Pattern list on the mount_point chart label: /dev, /dev/*, /run and /run/*
# are negated (!) ahead of the catch-all *, so every other mount point matches.
chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
# Used inodes as a percentage of used + available inodes.
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
# Hysteresis: the threshold is 90%, lowered to 80% while the alarm is already
# in WARNING or a higher state.
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
# The threshold is 98%, lowered to 90% while the alarm is already CRITICAL.
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
  summary: Disk ${label:mount_point} inode usage
     info: Total inode utilization of disk ${label:mount_point}
       to: sysadmin


# -----------------------------------------------------------------------------
# disk fill rate

# calculate the rate at which the disk fills,
# based on the change in available space
# during the last hour

# this is just a calculation - it has no alarm
# we will use it in the next template to find
# the hours remaining

# Calculation-only template: it has no warn/crit lines, so it never raises an
# alarm. Its value is read as $disk_fill_rate by out_of_disk_space_time.
template: disk_fill_rate
      on: disk.space
      os: linux freebsd
   hosts: *
# Minimum of the 'avail' dimension over a 10-minute window that ended
# 50 minutes ago; the result becomes $this.
  lookup: min -10m at -50m unaligned of avail
# (older minimum - current avail) divided by the hours elapsed since the start
# of the lookup window ($after). Positive means available space is shrinking.
    calc: ($this - $avail) / (($now - $after) / 3600)
   every: 1m
   units: GB/hour
    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour

# calculate the hours remaining
# if the disk continues to fill
# at this rate

# Estimated hours until the disk runs out of space, derived from
# $disk_fill_rate (the disk_fill_rate template on the same chart).
template: out_of_disk_space_time
      on: disk.space
# Classification, matching the other alerting templates in this file.
   class: Utilization
    type: System
component: Disk
      os: linux freebsd
   hosts: *
# Hours left = available space / fill rate; inf while the disk is not filling.
    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
   units: hours
   every: 10s
# Warn when fewer than 8 hours remain; once in WARNING or a higher state the
# threshold is 48 hours. The "$this > 0" guard ignores non-positive estimates.
    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
# Critical when fewer than 2 hours remain; once CRITICAL the threshold is
# 24 hours.
    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
   delay: down 15m multiplier 1.2 max 1h
 summary: Disk ${label:mount_point} estimation of lack of space
    info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour
      to: silent


# -----------------------------------------------------------------------------
# disk inode fill rate

# calculate the rate at which disk inodes are allocated,
# based on the change in available inodes
# during the last hour

# this is just a calculation - it has no alarm
# we will use it in the next template to find
# the hours remaining

# Calculation-only template: it has no warn/crit lines, so it never raises an
# alarm. Its value is read as $disk_inode_rate by out_of_disk_inodes_time.
template: disk_inode_rate
      on: disk.inodes
      os: linux freebsd
   hosts: *
# Minimum of the 'avail' dimension over a 10-minute window that ended
# 50 minutes ago; the result becomes $this.
  lookup: min -10m at -50m unaligned of avail
# (older minimum - current avail) divided by the hours elapsed since the start
# of the lookup window ($after). Positive means free inodes are shrinking.
    calc: ($this - $avail) / (($now - $after) / 3600)
   every: 1m
   units: inodes/hour
    info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour

# calculate the hours remaining
# if the disk inodes continue to be
# allocated at this rate

# Estimated hours until the disk runs out of inodes, derived from
# $disk_inode_rate (the disk_inode_rate template on the same chart).
template: out_of_disk_inodes_time
      on: disk.inodes
# Classification, matching the other alerting templates in this file.
   class: Utilization
    type: System
component: Disk
      os: linux freebsd
   hosts: *
# Hours left = available inodes / allocation rate; inf while inodes are not
# being consumed.
    calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
   units: hours
   every: 10s
# Warn when fewer than 8 hours remain; once in WARNING or a higher state the
# threshold is 48 hours. The "$this > 0" guard ignores non-positive estimates.
    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
# Critical when fewer than 2 hours remain; once CRITICAL the threshold is
# 24 hours.
    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
   delay: down 15m multiplier 1.2 max 1h
 summary: Disk ${label:mount_point} estimation of lack of inodes
    info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
      to: silent


# -----------------------------------------------------------------------------
# disk congestion

# raise an alarm if the disk is congested
# by calculating the average disk utilization
# for the last 10 minutes

 template: 10min_disk_utilization
# Warning-only alarm (no crit line) on the disk.util charts of linux and
# freebsd hosts.
       on: disk.util
    class: Utilization
     type: System
component: Disk
       os: linux freebsd
    hosts: *
# Average of the chart over the last 10 minutes; the result becomes $this.
   lookup: average -10m unaligned
    units: %
    every: 1m
# Hysteresis: the threshold is 98%, scaled by 0.7 (68.6%) while the alarm is
# already in WARNING or a higher state.
     warn: $this > 98 * (($status >= $WARNING)  ? (0.7) : (1))
    delay: down 15m multiplier 1.2 max 1h
  summary: Disk ${label:device} utilization
     info: Average percentage of time ${label:device} disk was busy over the last 10 minutes
       to: silent


# raise an alarm if the average disk backlog
# over the last 10 minutes is above 5000ms
# (3500ms once the alarm is already raised),
# i.e. the disk cannot catch up

 template: 10min_disk_backlog
# Warning-only alarm (no crit line) on the disk.backlog charts; unlike the
# other templates in this file it is restricted to linux hosts.
       on: disk.backlog
    class: Latency
     type: System
component: Disk
       os: linux
    hosts: *
# Average of the chart over the last 10 minutes; the result becomes $this.
   lookup: average -10m unaligned
    units: ms
    every: 1m
# Hysteresis: the threshold is 5000 ms, scaled by 0.7 (3500 ms) while the
# alarm is already in WARNING or a higher state.
     warn: $this > 5000 * (($status >= $WARNING)  ? (0.7) : (1))
    delay: down 15m multiplier 1.2 max 1h
  summary: Disk ${label:device} backlog
     info: Average backlog size of the ${label:device} disk over the last 10 minutes
       to: silent