summaryrefslogtreecommitdiffstats
path: root/ctdb/config/debug_locks.sh
blob: 6c730ee18023cf947ba5d013d54b6f2b81cec15f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/bin/sh

# This script attempts to find processes holding locks on a particular
# CTDB database and dumps a stack trace for each such process.
#
# There are 2 cases:
#
# * Samba is configured to use fcntl locks
#
#   In this case /proc/locks is parsed to find potential lock holders
#
# * Samba is configured to use POSIX robust mutexes
#
#   In this case the helper program tdb_mutex_check is used to find
#   potential lock holders.
#
#   This helper program uses a private glibc struct field, so is
#   neither portable nor supported.  If this field is not available
#   then the helper is not built.  Unexpected changes in internal
#   glibc structures may cause unexpected results, including crashes.
#   Bug reports for this helper program are not accepted without an
#   accompanying patch.

# Unless the caller already provided CTDB_BASE, default it to the
# physical (symlink-free) directory that contains this script.
if [ -z "$CTDB_BASE" ] ; then
	CTDB_BASE=$(d=$(dirname "$0") && cd -P "$d" && echo "$PWD")
fi

# Source the common functions file.  Helpers used below that are not
# defined in this script (e.g. die) are expected to come from there.
. "${CTDB_BASE}/functions"

# Exactly 4 arguments are required
[ $# -eq 4 ] || \
	die "usage: $0 <pid> { DB | RECORD } <tdb_path> { FCNTL | MUTEX }"

# PID of the lock helper process that is waiting for the lock
lock_helper_pid=$1
# lock_scope is unused for now
# shellcheck disable=SC2034
lock_scope=$2
# Path of the TDB file being locked
tdb_path=$3
# Locking mechanism in use: FCNTL or MUTEX
lock_type=$4

# If no gstack command is available then provide a gdb-based
# substitute with the same calling convention: gstack <pid>
#
# type is at least mentioned in POSIX and is more portable than which(1)
# shellcheck disable=SC2039
if ! type gstack >/dev/null 2>&1 ; then
	gstack ()
	{
		_pid="$1"

		# Ask gdb for a backtrace of every thread, then keep
		# only the thread headers and the stack frame lines
		gdb -batch --quiet -nx "/proc/${_pid}/exe" "$_pid" \
		    -ex "thread apply all bt" 2>/dev/null |
			grep -e '^#' -e '^Thread '
	}
fi

# Load/cache database options from configuration file.
# NOTE(review): ctdb_get_db_options is not defined in this script
# (presumably it comes from the sourced functions file) and nothing
# below visibly reads what it loads - confirm that this call is still
# needed here.
ctdb_get_db_options

# Print a stack trace for a single process.
#
# Arguments: $1 - PID of the process to inspect
# Outputs:   a header line followed by either the userspace stack
#            (via gstack) or, for a process in D state, the kernel
#            stack read via get_proc
dump_stack ()
{
	_target="$1"

	echo "----- Stack trace for PID=${_target} -----"

	# Only the first character of the ps state column is of interest
	_state=$(ps -p "$_target" -o state= | cut -c 1)

	case "$_state" in
	D)
		# gstack must not be run against a process in D state
		# because it would hang until the process leaves that
		# state.  The process could still enter D state right
		# after this check, but that is unlikely: a process
		# stuck in D state is probably why this script was
		# called in the first place.  A kernel stack almost
		# certainly won't help to diagnose a deadlock... but
		# it will probably give us someone to blame!
		echo "----- Process in D state, printing kernel stack only"
		get_proc "${_target}/stack"
		;;
	*)
		gstack "$_target"
		;;
	esac
}

# Print a stack trace for each distinct PID in a list.
#
# Arguments: $1 - whitespace-separated list of PIDs, duplicates allowed
# Outputs:   the output of dump_stack for every unique PID, in sort(1)
#            order
dump_stacks ()
{
	# The expansion is left unquoted on purpose: word splitting
	# squashes any run of whitespace, so after tr each PID sits on
	# a line of its own, ready for de-duplication
	# shellcheck disable=SC2086
	_unique_pids=$(echo $1 | tr ' ' '\n' | sort -u)

	for _p in $_unique_pids ; do
		dump_stack "$_p"
	done
}

get_tdb_file_id ()
{
	if ! _device_inode=$(stat -c "%d:%i" "$tdb_path" 2>/dev/null) ; then
		die "Unable to stat \"${tdb_path}\""
	fi
	_device="${_device_inode%%:*}"
	_device_major=$((_device >> 8))
	_device_minor=$((_device & 0xff))
	_inode="${_device_inode#*:}"
	printf '%02x:%02x:%u\n' "$_device_major" "$_device_minor" "$_inode"
}

# Report the waiter and the holders of fcntl write locks on $tdb_path,
# based on the contents of /proc/locks.
#
# Globals: tdb_path, lock_helper_pid (read)
#          lock_holder_pids (PIDs of any lock holders are appended)
# Outputs: an optional "Waiter:" section and an optional
#          "Lock holders:" section on stdout
debug_via_proc_locks ()
{
	# File ID in the same form as the file column of /proc/locks
	_id=$(get_tdb_file_id)
	_name=$(basename "$tdb_path")

	# Blocked entries in /proc/locks carry an extra "->" field.
	# Report such an entry for the lock helper (the waiting
	# process) as:
	#    pid process_name tdb_name offsets
	_helper_comm=$(ps -p "$lock_helper_pid" -o comm=)
	_waiter=$(get_proc "locks" |
		  awk -v pid="$lock_helper_pid" \
		      -v file_id="$_id" \
		      -v file="$_name" \
		      -v comm="$_helper_comm" '
			$2 != "->" { next }
			$3 == "POSIX" && $4 == "ADVISORY" && $5 == "WRITE" &&
			$6 == pid && $7 == file_id {
				print $6, comm, file, $8, $9
			}')
	if [ -n "$_waiter" ] ; then
		echo "Waiter:"
		echo "$_waiter"
	fi

	# Now find write locks on the same file that belong to any
	# other process and report them in the same format:
	#    pid process_name tdb_name offsets
	_holders=$(get_proc "locks" |
		   awk -v pid="$lock_helper_pid" \
		       -v file_id="$_id" \
		       -v file="$_name" '
			$2 == "POSIX" && $3 == "ADVISORY" && $4 == "WRITE" &&
			$5 != pid && $6 == file_id {
				print $5, file, $7, $8
			}' |
		   while read -r _holder _details ; do
			   # Insert the process name after the PID
			   echo "$_holder $(ps -p "$_holder" -o comm=) $_details"
		   done)

	# Without any holders there is nothing more to report
	if [ -z "$_holders" ] ; then
		return 0
	fi

	echo "Lock holders:"
	echo "$_holders"

	# Remember the holders so that their stacks can be dumped later
	_pids=$(echo "$_holders" | awk '{ print $1 }')
	lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
}

# Report the holders of mutex locks on $tdb_path, as found by the
# tdb_mutex_check helper program.
#
# Globals: CTDB_HELPER_BINDIR, tdb_path, lock_helper_pid (read)
#          lock_holder_pids (PIDs of holders other than the lock
#          helper are appended)
# Outputs: a "Lock holders:" section on stdout or, when no holder
#          can be identified, any "trylock failed" lines printed by
#          the helper
debug_via_tdb_mutex ()
{
	_checker="${CTDB_HELPER_BINDIR}/tdb_mutex_check"

	# The mutex helper may not be available (not supported?), so
	# leave quietly instead of provoking a "not found" error
	[ -x "$_checker" ] || return 0

	# The helper should always succeed; report nothing if it fails
	_raw=$("$_checker" "$tdb_path") || return 0

	# Rewrite helper lines of the form "[<info>] pid=<pid>" as
	# "<pid> <info>" and drop everything else
	_holders=$(echo "$_raw" |
		   sed -n -e 's#^\[\(.*\)\] pid=\(.*\)#\2 \1#p')

	if [ -z "$_holders" ] ; then
		# No holder was identified, so just pass on any
		# trylock failures that the helper printed
		[ -z "$_raw" ] || echo "$_raw" | grep -F 'trylock failed'
		return
	fi

	# Insert the process name and the database file name after
	# each PID
	_holders=$(echo "$_holders" | {
		_tdb=$(basename "$tdb_path")
		while read -r _holder _details ; do
			_comm=$(ps -p "$_holder" -o comm=)
			echo "${_holder} ${_comm} ${_tdb} ${_details}"
		done
	})

	# Log information about locks
	echo "Lock holders:"
	echo "$_holders"

	# Every listed process except the lock helper itself is
	# remembered so that its stack can be dumped later
	_pids=$(echo "$_holders" |
		awk -v pid="$lock_helper_pid" '$1 != pid {print $1}')

	lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
}

# Main: everything runs in a subshell that holds file descriptor 9
# open on the lock file, with all output passed to script_log.
(
	# Take the lock without blocking.  If it can not be obtained
	# then leave the subshell without doing any debugging.
	if ! flock -n 9 ; then
		exit 1
	fi

	echo "===== Start of debug locks PID=$$ ====="

	lock_holder_pids=""

	# /proc/locks is examined regardless of the lock type
	debug_via_proc_locks

	# The mutex helper is only consulted for mutex locking
	case "$lock_type" in
	MUTEX)
		debug_via_tdb_mutex
		;;
	esac

	dump_stacks "$lock_holder_pids"

	echo "===== End of debug locks PID=$$ ====="
)9>"${CTDB_SCRIPT_VARDIR}/debug_locks.lock" | script_log "ctdbd-lock"

# Always report success, whatever happened in the subshell above
exit 0