diff options
Diffstat (limited to 'ctdb/config/debug_locks.sh')
-rwxr-xr-x | ctdb/config/debug_locks.sh | 218 |
1 files changed, 218 insertions, 0 deletions
#!/bin/sh

# This script attempts to find processes holding locks on a particular
# CTDB database and dumps a stack trace for each such process.
#
# Usage: debug_locks.sh <pid> { DB | RECORD } <tdb_path> { FCNTL | MUTEX }
#
# There are 2 cases:
#
# * Samba is configured to use fcntl locks
#
#   In this case /proc/locks is parsed to find potential lock holders
#
# * Samba is configured to use POSIX robust mutexes
#
#   In this case the helper program tdb_mutex_check is used to find
#   potential lock holders.
#
#   This helper program uses a private glibc struct field, so is
#   neither portable nor supported.  If this field is not available
#   then the helper is not built.  Unexpected changes in internal
#   glibc structures may cause unexpected results, including crashes.
#   Bug reports for this helper program are not accepted without an
#   accompanying patch.

# Default CTDB_BASE to the (physical) directory containing this script
[ -n "$CTDB_BASE" ] || \
	CTDB_BASE=$(d=$(dirname "$0") && cd -P "$d" && echo "$PWD")

# Provides the helpers used below that are not defined in this file:
# die, get_proc, script_log, ctdb_get_db_options
. "${CTDB_BASE}/functions"

if [ $# -ne 4 ] ; then
	die "usage: $0 <pid> { DB | RECORD } <tdb_path> { FCNTL | MUTEX }"
fi

# PID of the process waiting for the lock; it is never reported as a
# lock holder
lock_helper_pid="$1"
# lock_scope is unused for now
# shellcheck disable=SC2034
lock_scope="$2"
tdb_path="$3"
lock_type="$4"

# type is at least mentioned in POSIX and is more portable than which(1)
# shellcheck disable=SC2039
if ! type gstack >/dev/null 2>&1 ; then
	# Fallback for systems without gstack: ask gdb for a backtrace
	# of every thread in the given PID and keep only the frame
	# ("#...") and thread header ("Thread ...") lines.
	gstack ()
	{
		_pid="$1"

		# Two -e patterns instead of '\|': alternation in a
		# basic regular expression is a GNU extension
		gdb -batch --quiet -nx "/proc/${_pid}/exe" "$_pid" \
		    -ex "thread apply all bt" 2>/dev/null |
			grep -e '^#' -e '^Thread '
	}
fi

# Load/cache database options from configuration file
ctdb_get_db_options

# Print a stack trace for a single process
#
# $1 - PID of the process
#
# A user-space stack is printed via gstack, unless the process is in
# D state, in which case only /proc/<pid>/stack is printed.
dump_stack ()
{
	_pid="$1"

	echo "----- Stack trace for PID=${_pid} -----"
	_state=$(ps -p "$_pid" -o state= | cut -c 1)
	if [ "$_state" = "D" ] ; then
		# Don't run gstack on a process in D state since
		# gstack will hang until the process exits D state.
		# Although it is possible for a process to transition
		# to D state after this check, it is unlikely because
		# if a process is stuck in D state then it is probably
		# the reason why this script was called.  Note that a
		# kernel stack almost certainly won't help diagnose a
		# deadlock... but it will probably give us someone to
		# blame!
		echo "----- Process in D state, printing kernel stack only"
		get_proc "${_pid}/stack"
	else
		gstack "$_pid"
	fi
}

# Print stack traces for a list of processes
#
# $1 - whitespace-separated list of PIDs, possibly containing
#      duplicates and newlines; each unique PID is dumped once
dump_stacks ()
{
	_pids="$1"

	# Use word splitting to squash whitespace
	# shellcheck disable=SC2086
	_pids=$(echo $_pids | tr ' ' '\n' | sort -u)

	for _pid in $_pids; do
		dump_stack "$_pid"
	done
}

# Print the ID of $tdb_path in the form used by /proc/locks, that is
# <major>:<minor>:<inode>, with major and minor in hex
#
# Calls die if $tdb_path can not be stat-ed.  Note that when this
# function is run in a command substitution, die only terminates the
# substitution's subshell, so callers must check the exit status.
get_tdb_file_id ()
{
	if ! _device_inode=$(stat -c "%d:%i" "$tdb_path" 2>/dev/null) ; then
		die "Unable to stat \"${tdb_path}\""
	fi
	_device="${_device_inode%%:*}"
	# Decode the device number using the Linux/glibc dev_t layout:
	# the major number is in bits 8-19 (plus bits 44 and up) and
	# the minor number is in bits 0-7 plus bits 20-31.  A plain
	# "dev >> 8" / "dev & 0xff" split is only correct for minor
	# numbers below 256.
	_device_major=$(( ((_device >> 8) & 0xfff) | \
			  ((_device >> 32) & ~0xfff) ))
	_device_minor=$(( (_device & 0xff) | ((_device >> 12) & ~0xff) ))
	_inode="${_device_inode#*:}"
	printf '%02x:%02x:%u\n' "$_device_major" "$_device_minor" "$_inode"
}

# Use /proc/locks to report the waiting lock helper and any other
# processes holding POSIX advisory write locks on $tdb_path
#
# PIDs of lock holders are appended to $lock_holder_pids.
debug_via_proc_locks ()
{
	# Get file ID to match relevant column in /proc/locks.  The
	# failure has already been reported by get_tdb_file_id and
	# nothing can be matched without a file ID.
	if ! _file_id=$(get_tdb_file_id) ; then
		return 1
	fi

	# Log information from /proc/locks about the waiting process.
	# Blocked lock requests have "->" as an extra 2nd field, so
	# the other fields are shifted by 1.
	_tdb=$(basename "$tdb_path")
	_comm=$(ps -p "$lock_helper_pid" -o comm=)
	_out=$(get_proc "locks" |
		awk -v pid="$lock_helper_pid" \
		    -v file_id="$_file_id" \
		    -v file="$_tdb" \
		    -v comm="$_comm" \
		    '$2 == "->" &&
		     $3 == "POSIX" &&
		     $4 == "ADVISORY" &&
		     $5 == "WRITE" &&
		     $6 == pid &&
		     $7 == file_id { print $6, comm, file, $8, $9 }')
	if [ -n "$_out" ] ; then
		echo "Waiter:"
		echo "$_out"
	fi

	# Parse /proc/locks and find process holding locks on $tdb_path
	# extract following information
	#    pid process_name tdb_name offsets
	_out=$(get_proc "locks" |
		awk -v pid="$lock_helper_pid" \
		    -v file_id="$_file_id" \
		    -v file="$_tdb" \
		    '$2 == "POSIX" &&
		     $3 == "ADVISORY" &&
		     $4 == "WRITE" &&
		     $5 != pid &&
		     $6 == file_id { print $5, file, $7, $8 }' |
		while read -r _pid _rest ; do
			_pname=$(ps -p "$_pid" -o comm=)
			echo "$_pid $_pname $_rest"
		done)

	if [ -z "$_out" ]; then
		return
	fi

	# Log information about locks
	echo "Lock holders:"
	echo "$_out"

	_pids=$(echo "$_out" | awk '{ print $1 }')

	lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
}

# Use the tdb_mutex_check helper to report processes holding mutexes
# in $tdb_path
#
# PIDs of lock holders, other than the lock helper itself, are
# appended to $lock_holder_pids.  Silently does nothing if the helper
# is not installed or fails.
debug_via_tdb_mutex ()
{
	_helper="${CTDB_HELPER_BINDIR}/tdb_mutex_check"
	if [ ! -x "$_helper" ] ; then
		# Mutex helper not available - not supported?
		# Avoid not found error...
		return
	fi

	# Helper should always succeed
	if ! _t=$("$_helper" "$tdb_path") ; then
		return
	fi

	# Convert helper lines of the form "[<lock>] pid=<pid>" into
	# "<pid> <lock>"; any other lines are dropped
	_out=$(echo "$_t" | sed -n -e 's#^\[\(.*\)\] pid=\(.*\)#\2 \1#p')

	if [ -z "$_out" ]; then
		# No holders found: still pass on any trylock failures
		if [ -n "$_t" ] ; then
			echo "$_t" | grep -F 'trylock failed'
		fi
		return
	fi

	# Get process names, append $tdb_path
	_tdb=$(basename "$tdb_path")
	_out=$(echo "$_out" |
		while read -r _pid _rest ; do
			_pname=$(ps -p "$_pid" -o comm=)
			echo "${_pid} ${_pname} ${_tdb} ${_rest}"
		done)

	# Log information about locks
	echo "Lock holders:"
	echo "$_out"

	# Get PIDs of processes that are holding locks
	_pids=$(echo "$_out" |
		awk -v pid="$lock_helper_pid" '$1 != pid {print $1}')

	lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
}

# Run in a subshell holding a non-blocking exclusive lock on file
# descriptor 9: if another instance already holds the lock then give
# up straight away.  All output is passed to script_log.
(
	flock -n 9 || exit 1

	echo "===== Start of debug locks PID=$$ ====="

	lock_holder_pids=""

	debug_via_proc_locks

	if [ "$lock_type" = "MUTEX" ] ; then
		debug_via_tdb_mutex
	fi

	dump_stacks "$lock_holder_pids"

	echo "===== End of debug locks PID=$$ ====="
)9>"${CTDB_SCRIPT_VARDIR}/debug_locks.lock" | script_log "ctdbd-lock"

exit 0