summaryrefslogtreecommitdiffstats
path: root/health/health.d/wmi.conf
blob: 0441fc1f3ce834ce708368fe61f81aca7aa4adc4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# you can disable an alarm notification by setting the 'to' line to: silent

## Availability

# Staleness check: alerts when the matched chart has not been collected recently.
# - calc: seconds elapsed since the chart's last collection ($now - $last_collected_t).
# - warn: raised when the value exceeds 5x the chart's update interval; once the
#         alarm is in WARNING or above, the threshold drops to 1x the interval,
#         so it only clears after collection is current again.
# - crit: same scheme, raised above 60x the update interval, held until <= 1x.
# - This template has no 'os:' or 'hosts:' filter, unlike the others in this file.
# NOTE(review): the context is 'cpu.collector_duration', while every other template
# here uses a 'wmi.*' context - confirm this matches the chart context the
# collector actually creates.
template: wmi_last_collected_secs
      on: cpu.collector_duration
    calc: $now - $last_collected_t
   units: seconds ago
   every: 10s
    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
   delay: down 5m multiplier 1.5 max 1h
    info: number of seconds since the last successful data collection
      to: sysadmin

## CPU

# CPU utilization over the last 10 minutes, evaluated once per minute.
# - lookup: 10-minute average of the dpc, user, privileged and interrupt
#           dimensions of wmi.cpu_utilization_total.
# - warn: raised above 85%; once in WARNING or above it stays raised until the
#         value is no longer above 75% (hysteresis to avoid flapping).
# - crit: raised above 95%; once CRITICAL it stays raised until no longer above 85%.
# NOTE(review): 'os: linux' presumably refers to the OS of the Netdata host
# running the collector, not the monitored Windows machine - confirm.
template: wmi_10min_cpu_usage
      on: wmi.cpu_utilization_total
      os: linux
   hosts: *
  lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
   units: %
   every: 1m
    warn: $this > (($status >= $WARNING)  ? (75) : (85))
    crit: $this > (($status == $CRITICAL) ? (85) : (95))
   delay: down 15m multiplier 1.5 max 1h
    info: cpu utilization for the last 10 minutes
      to: sysadmin


## Memory

# RAM usage as a percentage, evaluated every 10 seconds.
# - calc: used * 100 / (used + available), using the 'used' and 'available'
#         dimensions of wmi.memory_utilization.
# - warn: raised above 90%; once in WARNING or above it stays raised until the
#         value is no longer above 80% (hysteresis to avoid flapping).
# - crit: raised above 98%; once CRITICAL it stays raised until no longer above 90%.
template: wmi_ram_in_use
      on: wmi.memory_utilization
      os: linux
   hosts: *
    calc: ($used) * 100 / ($used + $available)
   units: %
   every: 10s
    warn: $this > (($status >= $WARNING)  ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: down 15m multiplier 1.5 max 1h
    info: used RAM
      to: sysadmin

# Swap usage as a percentage, evaluated every 10 seconds.
# - calc: used * 100 / (used + available), using the 'used' and 'available'
#         dimensions of wmi.memory_swap_utilization.
# - warn: raised above 90%; once in WARNING or above it stays raised until the
#         value is no longer above 80% (hysteresis to avoid flapping).
# - crit: raised above 98%; once CRITICAL it stays raised until no longer above 90%.
template: wmi_swap_in_use
      on: wmi.memory_swap_utilization
      os: linux
   hosts: *
    calc: ($used) * 100 / ($used + $available)
   units: %
   every: 10s
    warn: $this > (($status >= $WARNING)  ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: down 15m multiplier 1.5 max 1h
    info: used Swap
      to: sysadmin


## Network

# Inbound discarded packets over the last 10 minutes, evaluated once per minute.
# - lookup: sum of the 'inbound' dimension of wmi.net_discarded over the last
#           10 minutes, taken as absolute values.
# - warn: raised when 5 or more packets were discarded in that window.
# - No 'crit' line is defined, so this alarm only ever reaches WARNING.
template: inbound_packets_discarded
      on: wmi.net_discarded
      os: linux
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of inbound
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface inbound discarded packets in the last 10 minutes
      to: sysadmin

# Outbound discarded packets over the last 10 minutes, evaluated once per minute.
# - lookup: sum of the 'outbound' dimension of wmi.net_discarded over the last
#           10 minutes, taken as absolute values.
# - warn: raised when 5 or more packets were discarded in that window.
# - No 'crit' line is defined, so this alarm only ever reaches WARNING.
template: outbound_packets_discarded
      on: wmi.net_discarded
      os: linux
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of outbound
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface outbound discarded packets in the last 10 minutes
      to: sysadmin

# Inbound packet errors over the last 10 minutes, evaluated once per minute.
# - lookup: sum of the 'inbound' dimension of wmi.net_errors over the last
#           10 minutes, taken as absolute values.
# - warn: raised when 5 or more errors occurred in that window.
# - No 'crit' line is defined, so this alarm only ever reaches WARNING.
template: inbound_packets_errors
      on: wmi.net_errors
      os: linux
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of inbound
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface inbound errors in the last 10 minutes
      to: sysadmin

# Outbound packet errors over the last 10 minutes, evaluated once per minute.
# - lookup: sum of the 'outbound' dimension of wmi.net_errors over the last
#           10 minutes, taken as absolute values.
# - warn: raised when 5 or more errors occurred in that window.
# - No 'crit' line is defined, so this alarm only ever reaches WARNING.
template: outbound_packets_errors
      on: wmi.net_errors
      os: linux
   hosts: *
families: *
  lookup: sum -10m unaligned absolute match-names of outbound
   units: packets
   every: 1m
    warn: $this >= 5
   delay: down 1h multiplier 1.5 max 2h
    info: interface outbound errors in the last 10 minutes
      to: sysadmin


## Disk

# Logical disk space usage as a percentage, evaluated every 10 seconds.
# - calc: used * 100 / (used + free), using the 'used' and 'free' dimensions
#         of wmi.logical_disk_utilization.
# - warn: raised above 90%; once in WARNING or above it stays raised until the
#         value is no longer above 80% (hysteresis to avoid flapping).
# - crit: raised above 98%; once CRITICAL it stays raised until no longer above 90%.
template: wmi_disk_in_use
      on: wmi.logical_disk_utilization
      os: linux
   hosts: *
    calc: ($used) * 100 / ($used + $free)
   units: %
   every: 10s
    warn: $this > (($status >= $WARNING)  ? (80) : (90))
    crit: $this > (($status == $CRITICAL) ? (90) : (98))
   delay: down 15m multiplier 1.5 max 1h
    info: used disk space
      to: sysadmin