blob: c1ac0f1e1ad67961d839156f77f11201018a2519 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
#!/bin/sh

# Set-up for the hung script debug report: work out CTDB_BASE, pull in
# the shared shell code and this script's options, and optionally
# divert all output to a log file for testing.
#
# This script only works on Linux.  Please modify (and submit patches)
# for other operating systems.

# Honour a CTDB_BASE supplied by the caller; otherwise fall back to the
# physical (symlink-free) directory that contains this script.
if [ -z "$CTDB_BASE" ] ; then
    CTDB_BASE=$(here=$(dirname "$0") && cd -P "$here" && echo "$PWD")
fi

# load_script_options is not defined in this file; presumably it comes
# from the sourced "functions" file.
. "${CTDB_BASE}/functions"

load_script_options

# Testing hook: when CTDB_DEBUG_HUNG_SCRIPT_LOGFILE is non-empty, append
# both stdout and stderr to "<logfile>.part".  The path is remembered in
# $tmp so that it can be renamed to the final log file name later on.
[ -z "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] || {
    tmp="${CTDB_DEBUG_HUNG_SCRIPT_LOGFILE}.part"
    exec >>"$tmp" 2>&1
}
# Print a debug report for a hung event script.
#
#   $1 - PID at the root of the process tree to inspect
#   $2 - name of the event that was being run
#
# The report goes to stdout.  Everything runs in a subshell that has
# the lock file open on file descriptor 9, so that concurrent
# invocations can be serialised with flock.
(
    # No use running several of these in parallel if, say, "releaseip"
    # event hangs for multiple IPs.  In that case the output would be
    # interleaved in the log and would just be confusing.
    flock --wait 2 9 || exit 1

    echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" ====="

    echo "pstree -p -a ${1}:"
    out=$(pstree -p -a "$1")
    echo "$out"

    # Check for processes matching a regular expression and print
    # stack traces.  This could help confirm that certain processes
    # are stuck in certain places such as the cluster filesystem.  The
    # regexp must separate items with "|" and must not contain
    # parentheses.  The default pattern can be replaced for testing.
    default_pat='exportfs|rpcinfo'
    pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}"

    # The sed expression rewrites each matching pstree line as
    # "<pid> <command text>".  read -r is required so that any
    # backslashes in the command text are passed through literally
    # instead of being treated as escape characters.
    echo "$out" |
    sed -r -n "s@.*-(.*(${pat}).*),([0-9]*).*@\\3 \\1@p" |
    while read -r pid name ; do
        # Reading the stack can fail (e.g. the process is gone or the
        # file is not readable); in that case print nothing for it.
        trace=$(cat "/proc/${pid}/stack" 2>/dev/null)
        # No! Checking the exit code afterwards is actually clearer...
        # shellcheck disable=SC2181
        if [ $? -eq 0 ] ; then
            echo "---- Stack trace of interesting process ${pid}[${name}] ----"
            echo "$trace"
        fi
    done

    # Script status is printed for every event except "init".
    if [ "$2" != "init" ] ; then
        echo "---- ctdb scriptstatus ${2}: ----"
        # NOTE(review): $CTDB is not set in this file and is left
        # unquoted as before - presumably it may carry extra
        # arguments; confirm against the sourced "functions" file.
        $CTDB scriptstatus "$2"
    fi

    echo "===== End of hung script debug for PID=\"$1\", event=\"$2\" ====="

    # Testing hook: the report is complete, so move the partial log
    # file ($tmp, set when output was redirected) to its final name.
    if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then
        mv "$tmp" "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE"
    fi

) 9>"${CTDB_SCRIPT_VARDIR}/debug-hung-script.lock"
|