1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
|
#!/usr/bin/env bash
#
#(c) 2004-present, Facebook, all rights reserved.
# See the LICENSE file for usage and distribution rights.
#
# On HUP/INT/QUIT/TERM, report on stderr and exit with a non-zero status so a
# caller can tell an interrupted run from one that finished normally.
trap 'echo "Caught exception, dying" >&2; exit 1' 1 2 3 15
# Script name without its directory; used in the usage and config banners.
ME=${0##*/}
# Host name, quoted in the message printed by the 'die' action.
SERVER=$(hostname)
#
# Parameters used by the rest of the script.
#
# Dump_Config: set to 1 by -d (print the configuration and exit).
Dump_Config=0
# DEBUG: non-empty (set by -v) makes verbose() print its arguments.
DEBUG=
# OS: kernel name, inspected by oscheck(). Resolved through PATH rather than
# a hard-coded /bin/uname, which does not exist on every system.
OS=$(uname -s)
# VMEM / RSS / CPU: filled in by oscheck() for the detected OS.
VMEM=
RSS=
CPU=
# VERBOSE: not referenced anywhere else in the visible script.
VERBOSE=
# PID: process to monitor (-p). Cleared here so that a PID variable inherited
# from the environment cannot be mistaken for a command-line argument.
PID=
# VAR: metric to watch (-x); LIMIT: threshold (-l); ACTION: what to do (-a).
VAR=
LIMIT=
ACTION=
# N: number of monitoring cycles (-n); WAIT: seconds slept per cycle.
N=
WAIT=
#
# Supported OS: Linux only for now. Easy to add more.
#
#######################################
# Fill in the per-OS metric names.
# Globals:   OS (read); VMEM, RSS, CPU (written)
# Outputs:   nothing on success
# Returns:   0 on Linux; on any other OS it hands an error message to die()
#            and sets nothing.
# NOTE(review): the values look like ps(1) -o column names - confirm.
#######################################
oscheck() {
  if [ "${OS}" != "Linux" ]; then
    die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
    return
  fi
  VMEM=vsz
  RSS=rss
  CPU=bsdtime
}
#######################################
# Print a diagnostic message, but only when verbose mode is on.
# Globals:   DEBUG (read) - any non-empty value enables output
# Arguments: the words of the message
# Outputs:   the message on stderr when DEBUG is non-empty, otherwise nothing
#######################################
verbose() {
  [ -z "${DEBUG}" ] && return 0
  echo "$@" >&2
}
#######################################
# Print a message unconditionally on stderr.
# Arguments: the words of the message
# Outputs:   the message on stderr; nothing on stdout
#######################################
warn() {
  >&2 echo "$@"
}
#######################################
# Report a fatal error and terminate the script.
# Arguments: the words of the error message
# Outputs:   "ERROR: " followed by the message, on stderr
# Returns:   never returns; exits with status 1. (A bare `exit` here would
#            reuse the status of the preceding echo, i.e. 0, and make a
#            fatal error look like success.)
#######################################
die() {
  echo "ERROR: " "$@" >&2
  exit 1
}
#######################################
# Print the effective configuration of this run.
# Globals:   ME, HOSTNAME, PID, VAR, LIMIT, WAIT, N, ACTION (all read)
# Outputs:   a banner line with the current date, then one line per setting,
#            on stdout
#######################################
dump_config() {
  local now
  now=$(date)
  printf '%s\n' \
    "${ME} running on ${HOSTNAME} at ${now}" \
    "Configuration for this run:" \
    "PID to monitor : ${PID}" \
    "Resource monitored : ${VAR}" \
    "Resource limit : ${LIMIT}" \
    "Check every : ${WAIT} seconds" \
    "No. of times run : ${N}" \
    "What to do : ${ACTION}"
}
#######################################
# Print the help text and terminate the script.
# Globals:   ME (read)
# Arguments: optional error message, printed as the first line
# Outputs:   help text on stdout when called without a message (plain help
#            request); on stderr when a message is given (usage error)
# Returns:   never returns; exits 0 for plain help, 1 when an error message
#            was supplied, so callers can detect a bad invocation.
#######################################
usage() {
  local status=0
  local fd=1
  if [ "$#" -gt 0 ]; then
    status=1
    fd=2
  fi
  cat >&"${fd}" <<USAGE
$*
Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]
Monitor a process for set of violations. Options:
-p: PID of process to monitor
-x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
-l: what is the threshold/limit for the metric that is being sensed.
Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
NOTE: defaults to 1GB
-a: action. Currently {warn|die|kill} are supported.
The default action is to 'warn'. Here is the behavior:
warn: complain if usage exceeds threshold, but continue monitoring
kill: complain, kill the db_bench process and exit
die: if usage exceeds threshold, die immediately
-n: number of cycles to monitor. Default is to monitor until PID no longer exists.
-w: wait time per cycle of monitoring. Default is 5 seconds.
-v: verbose messaging
-d: dump the configuration for this run and exit
-h: show this help and exit
USAGE
  exit "${status}"
}
#######################################
# Give every tunable that is still unset or empty its default value.
# Globals:   VAR, LIMIT, WAIT, N, ACTION (read and written)
# Defaults:  VAR=vsz, LIMIT=1024000, WAIT=5, N=999999, ACTION=warn
#######################################
set_defaults_if_noopt_given() {
  VAR=${VAR:-vsz}
  LIMIT=${LIMIT:-1024000}
  WAIT=${WAIT:-5}
  N=${N:-999999}
  ACTION=${ACTION:-warn}
}
#######################################
# Reject an invocation that names no process to monitor.
# Globals:   PID, Dump_Config (read)
# Outputs:   whatever usage() prints when the check fails
# Returns:   0 when a PID was given or when only a config dump was requested
#            (Dump_Config is 1); otherwise calls usage "PID is mandatory".
#######################################
validate_options() {
  [ -z "${PID}" ] || return 0
  [ "${Dump_Config}" -ne 1 ] || return 0
  usage "PID is mandatory"
}
###### START

#######################################
# Turn a metric reading or a user-supplied limit into a plain integer so the
# two can be compared numerically.
#   "M:SS" (and "H:M:SS")  -> seconds
#   trailing m / M         -> value * 1024        (reading is in KiB)
#   trailing g / G         -> value * 1024 * 1024
#   anything else          -> numeric value as-is
# Fractions such as "1.5g" are accepted; the result is rounded to an integer.
# Arguments: $1 - the text to convert
# Outputs:   the integer on stdout
#######################################
_to_comparable() {
  awk -v raw="$1" 'BEGIN {
    if (raw ~ /:/) {
      n = split(raw, part, ":")
      total = 0
      for (i = 1; i <= n; i++) total = total * 60 + part[i]
      printf "%.0f\n", total
      exit
    }
    value = raw + 0
    if (raw ~ /[gG]$/) value *= 1024 * 1024
    else if (raw ~ /[mM]$/) value *= 1024
    printf "%.0f\n", value
  }'
}

# Option parsing. The leading ':' selects getopts' silent mode, so missing
# arguments and unknown options are reported by the ':' and '?' arms below.
# 'w:' was missing from the option string, which made the -w handler
# unreachable; 't:' is still accepted so old invocations keep parsing.
# NOTE(review): -t is treated as an alias of -w here - confirm that intent.
while getopts ":p:x:l:a:n:w:t:vhd" opt; do
  case "${opt}" in
    d) Dump_Config=1 ;;
    h) usage ;;
    a) ACTION=${OPTARG} ;;
    v) DEBUG=1 ;;
    p) PID=${OPTARG} ;;
    x) VAR=${OPTARG} ;;
    l) LIMIT=${OPTARG} ;;
    w | t) WAIT=${OPTARG} ;;
    n) N=${OPTARG} ;;
    :) usage "Option -${OPTARG} requires an argument" ;;
    \?) usage "Unknown option -${OPTARG}" ;;
  esac
done

oscheck
set_defaults_if_noopt_given
validate_options

if [ "${Dump_Config}" -eq 1 ]; then
  dump_config
  exit
fi

# Fail fast on an action the loop below would not know how to carry out.
case "${ACTION}" in
  warn | kill | die) ;;
  *) usage "Unknown action '${ACTION}' (expected warn, die or kill)" ;;
esac

# The cycle count is used in arithmetic, so it must be a plain integer.
case "${N}" in
  '' | *[!0-9]*) usage "Number of cycles (-n) must be a non-negative integer" ;;
esac
max_cycles=$((10#${N}))

# The usage text advertises the symbolic names VMEM/RSS/CPU; translate them
# to the per-OS names chosen by oscheck. Anything else is passed to ps as-is.
case "${VAR}" in
  VMEM) VAR=${VMEM} ;;
  RSS) VAR=${RSS} ;;
  CPU) VAR=${CPU} ;;
esac

limit_value=$(_to_comparable "${LIMIT}")

verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration"

cycle=0
while ((cycle < max_cycles)); do
  cycle=$((cycle + 1))

  # 'h' suppresses the ps header; whitespace padding is stripped.
  reading=$(ps h -p "${PID}" -o "${VAR}" | tr -d '[:space:]')

  # No output means the process is gone. A literal 0 is also treated as
  # "ended", as before; a CPU reading such as 0:00 is not.
  if [ -z "${reading}" ] || [ "${reading}" = "0" ]; then
    warn "Process $PID ended without incident."
    exit 0
  fi

  value=$(_to_comparable "${reading}")

  if ((value < limit_value)); then
    echo "Value of '${VAR}' (${reading}) is less than ${LIMIT} for PID ${PID}"
  else
    case "${ACTION}" in
      kill)
        warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${reading}; killing PID ${PID}"
        # Polite TERM first; fall back to QUIT if that could not be sent.
        kill "${PID}" || kill -3 "${PID}"
        exit
        ;;
      warn)
        warn "$(date) WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${reading}"
        ;;
      die)
        warn "WARNING: dying without killing process ${PID} on ${SERVER}"
        warn "The process details are below: "
        warn "$(ps -p "${PID}" -o pid,ppid,bsdtime,rss,vsz,cmd,args)"
        warn ""
        #should we send email/notify someone? TODO... for now, bail.
        exit 1
        ;;
    esac
  fi

  # Sleep on every cycle, including after a warning, so that a process
  # sitting above its limit does not turn this loop into a busy spin.
  sleep "${WAIT}"
done

verbose "Stopped monitoring PID ${PID} after ${cycle} cycles"
|