summaryrefslogtreecommitdiffstats
path: root/health/health.d/vsphere.conf
blob: d8b2be1907f70fb55a7764170cdcd37baeaf318e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# you can disable an alarm notification by setting the 'to' line to: silent

# -----------------------------------------------VM Specific------------------------------------------------------------
# Memory

# Alarm on the `used` dimension of the VM memory usage percentage chart.
# Evaluated every 20s. The thresholds are hysteretic:
#   - WARNING is raised above 90% and is kept until the value is no longer above 80%
#   - CRITICAL is raised above 98% and is kept until the value is no longer above 90%
# Notifications go to the `sysadmin` role, like every other alarm in this file.
template: vsphere_vm_mem_usage
      on: vsphere.vm_mem_usage_percentage
   hosts: *
    calc: $used
   units: %
   every: 20s
    warn: $this > (($status >= $WARNING)  ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: down 15m multiplier 1.5 max 1h
    info: used RAM
      to: sysadmin

# -----------------------------------------------HOST Specific----------------------------------------------------------
# Memory

# Alarm on the `used` dimension of the host memory usage percentage chart.
# Evaluated every 20s. The thresholds are hysteretic:
#   - WARNING is raised above 90% and is kept until the value is no longer above 80%
#   - CRITICAL is raised above 98% and is kept until the value is no longer above 90%
# Notifications go to the `sysadmin` role, like every other alarm in this file.
template: vsphere_host_mem_usage
      on: vsphere.host_mem_usage_percentage
   hosts: *
    calc: $used
   units: %
   every: 20s
    warn: $this > (($status >= $WARNING)  ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: down 15m multiplier 1.5 max 1h
    info: used RAM
      to: sysadmin

# Network errors

# Sum of the `rx` dimension of the network errors chart over the last 10 minutes,
# re-evaluated every minute. Raises WARNING when 5 or more errors were counted.
# The value of this alarm is also read by vsphere_inbound_packets_errors_ratio
# (as $vsphere_inbound_packets_errors), so the template name must stay in sync.
template: vsphere_inbound_packets_errors
      on: vsphere.net_errors_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of rx
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface inbound errors in the last 10 minutes
      to: sysadmin

# Sum of the `tx` dimension of the network errors chart over the last 10 minutes,
# re-evaluated every minute. Raises WARNING when 5 or more errors were counted.
# The value of this alarm is also read by vsphere_outbound_packets_errors_ratio
# (as $vsphere_outbound_packets_errors), so the template name must stay in sync.
template: vsphere_outbound_packets_errors
      on: vsphere.net_errors_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of tx
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface outbound errors in the last 10 minutes
      to: sysadmin

# Network errors ratio

# Inbound errors as a percentage of received packets over the last 10 minutes.
# `lookup` sums the `rx` dimension of the packets chart; `calc` then divides
# $vsphere_inbound_packets_errors by that sum and scales by 100.
# The result is forced to 0 when the errors variable is nan or the packet sum
# is not positive, which also avoids a division by zero.
# WARNING at >= 0.1%, CRITICAL at >= 2%.
# NOTE(review): $vsphere_inbound_packets_errors presumably resolves to the
# alarm of that name defined on vsphere.net_errors_total - confirm that the
# variable is visible from this chart.
template: vsphere_inbound_packets_errors_ratio
      on: vsphere.net_packets_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of rx
    calc: (($vsphere_inbound_packets_errors != nan AND $this > 0) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
   units: %
   every: 1m
    warn: $this >= 0.1
    crit: $this >= 2
   delay: down 1h multiplier 1.5 max 2h
    info: the ratio of inbound errors vs the total number of received packets of the network interface, during the last 10 minutes
      to: sysadmin

# Outbound errors as a percentage of sent packets over the last 10 minutes.
# `lookup` sums the `tx` dimension of the packets chart; `calc` then divides
# $vsphere_outbound_packets_errors by that sum and scales by 100.
# The result is forced to 0 when the errors variable is nan or the packet sum
# is not positive, which also avoids a division by zero.
# WARNING at >= 0.1%, CRITICAL at >= 2%.
# NOTE(review): $vsphere_outbound_packets_errors presumably resolves to the
# alarm of that name defined on vsphere.net_errors_total - confirm that the
# variable is visible from this chart.
template: vsphere_outbound_packets_errors_ratio
      on: vsphere.net_packets_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of tx
    calc: (($vsphere_outbound_packets_errors != nan AND $this > 0) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
   units: %
   every: 1m
    warn: $this >= 0.1
    crit: $this >= 2
   delay: down 1h multiplier 1.5 max 2h
    info: the ratio of outbound errors vs the total number of sent packets of the network interface, during the last 10 minutes
      to: sysadmin

# -----------------------------------------------Common-------------------------------------------------------------------
# CPU

# Average of the `used` dimension of the CPU usage chart over the last
# 10 minutes, re-evaluated every 20s. The thresholds are hysteretic:
#   - WARNING is raised above 85% and is kept until the value is no longer above 75%
#   - CRITICAL is raised above 95% and is kept until the value is no longer above 85%
# This section is labelled "Common"; presumably the chart exists for both
# hosts and VMs - confirm against the collector.
template: vsphere_cpu_usage
      on: vsphere.cpu_usage_total
   hosts: *
  lookup: average -10m unaligned match-names of used
   units: %
   every: 20s
    warn: $this > (($status >= $WARNING)  ? (75) : (85))
    crit: $this > (($status == $CRITICAL) ? (85) : (95))
   delay: down 15m multiplier 1.5 max 1h
    info: cpu utilization for the last 10 minutes
      to: sysadmin

# Network drops

# Sum of the `rx` dimension of the network drops chart over the last 10 minutes,
# re-evaluated every minute. Raises WARNING when 5 or more drops were counted.
# The value of this alarm is also read by vsphere_inbound_packets_dropped_ratio
# (as $vsphere_inbound_packets_dropped), so the template name must stay in sync.
template: vsphere_inbound_packets_dropped
      on: vsphere.net_drops_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of rx
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface inbound dropped packets in the last 10 minutes
      to: sysadmin

# Sum of the `tx` dimension of the network drops chart over the last 10 minutes,
# re-evaluated every minute. Raises WARNING when 5 or more drops were counted.
# The value of this alarm is also read by vsphere_outbound_packets_dropped_ratio
# (as $vsphere_outbound_packets_dropped), so the template name must stay in sync.
template: vsphere_outbound_packets_dropped
      on: vsphere.net_drops_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of tx
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface outbound dropped packets in the last 10 minutes
      to: sysadmin

# Network drops ratio

# Inbound dropped packets as a percentage of received packets over the last
# 10 minutes. `lookup` sums the `rx` dimension of the packets chart; `calc`
# then divides $vsphere_inbound_packets_dropped by that sum and scales by 100.
# The result is forced to 0 when the drops variable is nan or the packet sum
# is not positive, which also avoids a division by zero.
# WARNING at >= 0.1%, CRITICAL at >= 2%.
# NOTE(review): $vsphere_inbound_packets_dropped presumably resolves to the
# alarm of that name defined on vsphere.net_drops_total - confirm that the
# variable is visible from this chart.
template: vsphere_inbound_packets_dropped_ratio
      on: vsphere.net_packets_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of rx
    calc: (($vsphere_inbound_packets_dropped != nan AND $this > 0) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
   units: %
   every: 1m
    warn: $this >= 0.1
    crit: $this >= 2
   delay: down 1h multiplier 1.5 max 2h
    info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
      to: sysadmin

# Outbound dropped packets as a percentage of sent packets over the last
# 10 minutes. `lookup` sums the `tx` dimension of the packets chart; `calc`
# then divides $vsphere_outbound_packets_dropped by that sum and scales by 100.
# The result is forced to 0 when the drops variable is nan or the packet sum
# is not positive, which also avoids a division by zero.
# WARNING at >= 0.1%, CRITICAL at >= 2%.
# NOTE(review): $vsphere_outbound_packets_dropped presumably resolves to the
# alarm of that name defined on vsphere.net_drops_total - confirm that the
# variable is visible from this chart.
template: vsphere_outbound_packets_dropped_ratio
      on: vsphere.net_packets_total
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of tx
    calc: (($vsphere_outbound_packets_dropped != nan AND $this > 0) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
   units: %
   every: 1m
    warn: $this >= 0.1
    crit: $this >= 2
   delay: down 1h multiplier 1.5 max 2h
    info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
      to: sysadmin