summaryrefslogtreecommitdiffstats
path: root/ctdb/config/debug_locks.sh
diff options
context:
space:
mode:
Diffstat (limited to 'ctdb/config/debug_locks.sh')
-rwxr-xr-xctdb/config/debug_locks.sh218
1 files changed, 218 insertions, 0 deletions
diff --git a/ctdb/config/debug_locks.sh b/ctdb/config/debug_locks.sh
new file mode 100755
index 0000000..6c730ee
--- /dev/null
+++ b/ctdb/config/debug_locks.sh
@@ -0,0 +1,218 @@
+#!/bin/sh
+
+# This script attempts to find processes holding locks on a particular
+# CTDB database and dumps a stack trace for each such process.
+#
+# There are 2 cases:
+#
+# * Samba is configured to use fcntl locks
+#
+# In this case /proc/locks is parsed to find potential lock holders
+#
+# * Samba is configured to use POSIX robust mutexes
+#
+# In this case the helper program tdb_mutex_check is used to find
+# potential lock holders.
+#
+# This helper program uses a private glibc struct field, so is
+# neither portable nor supported. If this field is not available
+# then the helper is not built. Unexpected changes in internal
+# glibc structures may cause unexpected results, including crashes.
+# Bug reports for this helper program are not accepted without an
+# accompanying patch.
+
+[ -n "$CTDB_BASE" ] || \
+ CTDB_BASE=$(d=$(dirname "$0") && cd -P "$d" && echo "$PWD")
+
+. "${CTDB_BASE}/functions"
+
+# Exactly 4 positional arguments are required; anything else is a
+# usage error
+[ $# -eq 4 ] ||
+	die "usage: $0 <pid> { DB | RECORD } <tdb_path> { FCNTL | MUTEX }"
+
+# $1: PID of the process waiting for the lock
+lock_helper_pid="$1"
+# $2: lock_scope is unused for now
+# shellcheck disable=SC2034
+lock_scope="$2"
+# $3: path of the TDB file to examine
+tdb_path="$3"
+# $4: locking mechanism in use, FCNTL or MUTEX
+lock_type="$4"
+
+# Provide a gdb-based stand-in when no gstack command is available.
+# type is at least mentioned in POSIX and is more portable than which(1)
+# shellcheck disable=SC2039
+type gstack >/dev/null 2>&1 || {
+	# Print a backtrace of every thread of the process with PID $1,
+	# keeping only the frame ("#...") and "Thread ..." lines
+	gstack ()
+	{
+		gdb -batch --quiet -nx "/proc/${1}/exe" "$1" \
+			-ex "thread apply all bt" 2>/dev/null |
+			grep '^\(#\|Thread \)'
+	}
+}
+
+# Load/cache database options from configuration file
+ctdb_get_db_options
+
+# Print a stack trace for a single process.
+#
+# $1 - PID of the process to examine
+#
+# A banner line is printed first.  It is followed by the output of
+# gstack or, for a process in D state, by <pid>/stack as returned by
+# get_proc.
+dump_stack ()
+{
+	_pid="$1"
+
+	echo "----- Stack trace for PID=${_pid} -----"
+
+	# Only the first character of the ps state column is significant
+	case "$(ps -p "$_pid" -o state= | cut -c 1)" in
+	D)
+		# gstack would hang until the process leaves D state, so
+		# do not run it here.  The process could still enter D
+		# state after this check, but that is unlikely: a
+		# process stuck in D state is probably the reason this
+		# script was called in the first place.  A kernel stack
+		# almost certainly won't help diagnose a deadlock... but
+		# it will probably give us someone to blame!
+		echo "----- Process in D state, printing kernel stack only"
+		get_proc "${_pid}/stack"
+		;;
+	*)
+		gstack "$_pid"
+		;;
+	esac
+}
+
+# Print a stack trace for each distinct PID in a list.
+#
+# $1 - whitespace-separated list of PIDs; duplicates are dropped and an
+#      empty list results in no output
+dump_stacks ()
+{
+	# Leaving $1 unquoted lets word splitting collapse any mix of
+	# spaces and newlines into single spaces before de-duplicating
+	# shellcheck disable=SC2086
+	_unique_pids=$(echo $1 | tr ' ' '\n' | sort -u)
+
+	for _p in $_unique_pids ; do
+		dump_stack "$_p"
+	done
+}
+
+get_tdb_file_id ()
+{
+ if ! _device_inode=$(stat -c "%d:%i" "$tdb_path" 2>/dev/null) ; then
+ die "Unable to stat \"${tdb_path}\""
+ fi
+ _device="${_device_inode%%:*}"
+ _device_major=$((_device >> 8))
+ _device_minor=$((_device & 0xff))
+ _inode="${_device_inode#*:}"
+ printf '%02x:%02x:%u\n' "$_device_major" "$_device_minor" "$_inode"
+}
+
+# Find processes holding fcntl write locks on $tdb_path using the lock
+# table returned by get_proc "locks".
+#
+# Globals read: tdb_path, lock_helper_pid
+# Globals written: lock_holder_pids (PIDs of any holders are appended)
+# Output: an optional "Waiter:" section and, if any holders are found,
+# a "Lock holders:" section, both on stdout
+debug_via_proc_locks ()
+{
+ # Get file ID to match relevant column in /proc/locks
+ # NOTE(review): get_tdb_file_id runs in a command substitution, so a
+ # failure inside it cannot stop this function; _file_id would
+ # presumably be empty and simply match nothing below - confirm.
+ _file_id=$(get_tdb_file_id)
+
+ # Log information from /proc/locks about the waiting process
+ # The waiter's entry is recognised by "->" in column 2, so every
+ # later column sits one position to the right of where the holder
+ # query further down expects it.  Printed fields: PID, command
+ # name, database file name and the last two columns of the entry.
+ _tdb=$(basename "$tdb_path")
+ _comm=$(ps -p "$lock_helper_pid" -o comm=)
+ _out=$(get_proc "locks" |
+ awk -v pid="$lock_helper_pid" \
+ -v file_id="$_file_id" \
+ -v file="$_tdb" \
+ -v comm="$_comm" \
+ '$2 == "->" &&
+ $3 == "POSIX" &&
+ $4 == "ADVISORY" &&
+ $5 == "WRITE" &&
+ $6 == pid &&
+ $7 == file_id { print $6, comm, file, $8, $9 }')
+ if [ -n "$_out" ] ; then
+ echo "Waiter:"
+ echo "$_out"
+ fi
+
+ # Parse /proc/locks and find processes holding locks on $tdb_path,
+ # extracting the following information:
+ # pid process_name tdb_name offsets
+ # Entries belonging to $lock_helper_pid itself are skipped.  awk
+ # emits "pid tdb_name offsets" and the loop then inserts the
+ # process name after the PID.
+ _out=$(get_proc "locks" |
+ awk -v pid="$lock_helper_pid" \
+ -v file_id="$_file_id" \
+ -v file="$_tdb" \
+ '$2 == "POSIX" &&
+ $3 == "ADVISORY" &&
+ $4 == "WRITE" &&
+ $5 != pid &&
+ $6 == file_id { print $5, file, $7, $8 }' |
+ while read -r _pid _rest ; do
+ _pname=$(ps -p "$_pid" -o comm=)
+ echo "$_pid $_pname $_rest"
+ done)
+
+ # No holders found: leave lock_holder_pids untouched
+ if [ -z "$_out" ]; then
+ return
+ fi
+
+ # Log information about locks
+ echo "Lock holders:"
+ echo "$_out"
+
+ # The first column of each holder line is the PID
+ _pids=$(echo "$_out" | awk '{ print $1 }')
+
+ # Append to the list, adding a separating space only if the list
+ # is already non-empty
+ lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
+}
+
+debug_via_tdb_mutex ()
+{
+ _helper="${CTDB_HELPER_BINDIR}/tdb_mutex_check"
+ if [ ! -x "$_helper" ] ; then
+ # Mutex helper not available - not supported?
+ # Avoid not found error...
+ return
+ fi
+
+ # Helper should always succeed
+ if ! _t=$("$_helper" "$tdb_path") ; then
+ return
+ fi
+
+ _out=$(echo "$_t" | sed -n -e 's#^\[\(.*\)\] pid=\(.*\)#\2 \1#p')
+
+ if [ -z "$_out" ]; then
+ if [ -n "$_t" ] ; then
+ echo "$_t" | grep -F 'trylock failed'
+ fi
+ return
+ fi
+
+ # Get process names, append $tdb_path
+ _out=$(echo "$_out" |
+ while read -r _pid _rest ; do
+ _pname=$(ps -p "$_pid" -o comm=)
+ _tdb=$(basename "$tdb_path")
+ echo "${_pid} ${_pname} ${_tdb} ${_rest}"
+ done)
+
+ # Log information about locks
+ echo "Lock holders:"
+ echo "$_out"
+
+ # Get PIDs of processes that are holding locks
+ _pids=$(echo "$_out" |
+ awk -v pid="$lock_helper_pid" '$1 != pid {print $1}')
+
+ lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
+}
+
+# Main body.  It runs in a subshell with file descriptor 9 opened for
+# writing on a lock file under $CTDB_SCRIPT_VARDIR, and everything the
+# subshell writes to stdout is piped to script_log with the tag
+# "ctdbd-lock".  script_log is not defined in this script; presumably it
+# comes from the sourced "functions" file.
+(
+ # Non-blocking lock on fd 9: if it cannot be taken immediately the
+ # subshell exits with status 1 before producing any output.
+ # NOTE(review): presumably that means another instance of this
+ # script is still running - confirm.
+ flock -n 9 || exit 1
+
+ # $$ is the PID of the top-level script, not of this subshell
+ echo "===== Start of debug locks PID=$$ ====="
+
+ # Filled in by the debug_via_* functions below
+ lock_holder_pids=""
+
+ # The lock table check is done regardless of $lock_type
+ debug_via_proc_locks
+
+ if [ "$lock_type" = "MUTEX" ] ; then
+ debug_via_tdb_mutex
+ fi
+
+ dump_stacks "$lock_holder_pids"
+
+ echo "===== End of debug locks PID=$$ ====="
+)9>"${CTDB_SCRIPT_VARDIR}/debug_locks.lock" | script_log "ctdbd-lock"
+
+# Exit successfully no matter how the pipeline above ended
+exit 0