summaryrefslogtreecommitdiffstats
path: root/mysys/waiting_threads.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:00:34 +0000
commit3f619478f796eddbba6e39502fe941b285dd97b1 (patch)
treee2c7b5777f728320e5b5542b6213fd3591ba51e2 /mysys/waiting_threads.c
parentInitial commit. (diff)
downloadmariadb-upstream.tar.xz
mariadb-upstream.zip
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--mysys/waiting_threads.c1143
1 files changed, 1143 insertions, 0 deletions
diff --git a/mysys/waiting_threads.c b/mysys/waiting_threads.c
new file mode 100644
index 00000000..a03f8da3
--- /dev/null
+++ b/mysys/waiting_threads.c
@@ -0,0 +1,1143 @@
+/* Copyright (C) 2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+ Copyright (c) 2011, 2013, Monty Program Ab.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+/**
+ @file
+
+ "waiting threads" subsystem - a unified interface for threads to wait
+ on each other, with built-in deadlock detection.
+
+ Main concepts
+ ^^^^^^^^^^^^^
+ a thread - is represented by a WT_THD structure. One physical thread
+ can have only one WT_THD descriptor at any given moment.
+
+ a resource - a thread does not wait for other threads directly,
+ instead it waits for a "resource", which is "owned" by other threads.
+ It waits, exactly, for all "owners" to "release" a resource.
+ It does not have to correspond to a physical resource. For example, it
+ may be convenient in certain cases to force resource == thread.
+ A resource is represented by a WT_RESOURCE structure.
+
+ a resource identifier - a pair of {resource type, value}. A value is
+ an ulonglong number. Represented by a WT_RESOURCE_ID structure.
+
+ a resource type - a pointer to a statically defined instance of
+ WT_RESOURCE_TYPE structure. This structure contains a pointer to
+ a function that knows how to compare values of this resource type.
+ In the simple case it could be wt_resource_id_memcmp().
+
+ a wait-for graph - a graph, that represenst "wait-for" relationships.
+ It has two types of nodes - threads and resources. There are directed
+ edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
+ from a thread to resources that it "owns" (WT_THD::my_resources),
+ and from a resource to threads that "own" it (WT_RESOURCE::owners)
+
+ Graph completeness
+ ^^^^^^^^^^^^^^^^^^
+
+ For flawless deadlock detection wait-for graph must be complete.
+ It means that when a thread starts waiting it needs to know *all* its
+ blockers, and call wt_thd_will_wait_for() for every one of them.
+ Otherwise two phenomena should be expected:
+
+ 1. Fuzzy timeouts:
+
+ thread A needs to get a lock, and is blocked by a thread B.
+ it waits.
+ Just before the timeout thread B releases the lock.
+ thread A is ready to grab the lock but discovers that it is also
+ blocked by a thread C.
+ It waits and times out.
+
+ As a result thread A has waited two timeout intervals, instead of one.
+
+ 2. Unreliable cycle detection:
+
+ Thread A waits for threads B and C
+ Thread C waits for D
+ Thread D wants to start waiting for A
+
+ one can see immediately that thread D creates a cycle, and thus
+ a deadlock is detected.
+
+ But if thread A would only wait for B, and start waiting for C
+ when B would unlock, thread D would be allowed to wait, a deadlock
+ would be only detected when B unlocks or somebody times out.
+
+ These two phenomena don't affect a correctness, and strictly speaking,
+ the caller is not required to call wt_thd_will_wait_for() for *all*
+ blockers - it may optimize wt_thd_will_wait_for() calls. But they
+ may be perceived as bugs by users, it must be understood that such
+ an optimization comes with its price.
+
+ Usage
+ ^^^^^
+
+ First, the wt* subsystem must be initialized by calling
+ wt_init(). In the server you don't need to do it, it's done
+ in mysqld.cc.
+
+ Similarly, wt_end() frees wt* structures, should be called
+ at the end, but in the server mysqld.cc takes care of that.
+
+ Every WT_THD should be initialized with wt_thd_lazy_init().
+ After that they can be used in other wt_thd_* calls.
+ Before discarding, WT_THD should be free'd with
+ wt_thd_destroy(). In the server both are handled in sql_class.cc,
+ it's an error to try to do it manually.
+
+ To use the deadlock detection one needs to use this thread's WT_THD,
+ call wt_thd_will_wait_for() for every thread it needs to wait on,
+ then call wt_thd_cond_timedwait(). When thread releases a resource
+ it should call wt_thd_release() (or wt_thd_release_all()) - it will
+ notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
+ if appropriate.
+
+ Just like with pthread's cond_wait, there could be spurious
+ wake-ups from wt_thd_cond_timedwait(). A caller is expected to
+ handle that (that is, to re-check the blocking criteria).
+
+ wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
+ WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
+ WT_TIMEOUT. Out of memory and other fatal errors are reported as
+ WT_DEADLOCK - and a transaction must be aborted just the same.
+
+ Configuration
+ ^^^^^^^^^^^^^
+ There are four config variables. Two deadlock search depths - short and
+ long - and two timeouts. Deadlock search is performed with the short
+ depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
+ waits with a short timeout, performs a deadlock search with the long
+ depth, and waits with a long timeout. As most deadlock cycles are supposed
+ to be short, most deadlocks will be detected at once, and waits will
+ rarely be necessary.
+
+ These config variables are thread-local. Different threads may have
+ different search depth and timeout values.
+
+ Also, deadlock detector supports different killing strategies, the victim
+ in a deadlock cycle is selected based on the "weight". See "weight"
+ description in waiting_threads.h for details. It's up to the caller to
+ set weights accordingly.
+
+ Status
+ ^^^^^^
+ We calculate the number of successful waits (WT_OK returned from
+ wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
+ length distribution - number of deadlocks with every length from
+ 1 to WT_CYCLE_STATS, and a wait time distribution - number
+ of waits with a time from 1 us to 1 min in WT_WAIT_STATS
+ intervals on a log e scale.
+*/
+
+/*
+ Note that if your lock system satisfy the following condition:
+
+ there exist four lock levels A, B, C, D, such as
+ A is compatible with B
+ A is not compatible with C
+ D is not compatible with B
+
+ (example A=IX, B=IS, C=S, D=X)
+
+ you need to include lock level in the resource identifier - a
+ thread waiting for lock of the type A on resource R and another
+ thread waiting for lock of the type B on resource R should wait on
+ different WT_RESOURCE structures, on different {lock, resource}
+ pairs. Otherwise the following is possible:
+
+ thread1> take S-lock on R
+ thread2> take IS-lock on R
+ thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
+ thread3 is killed (or timeout or whatever)
+ WT_RESOURCE structure for R is still in the hash, as it has two owners
+ thread4> wants an IX-lock on R
+ WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
+ !! now thread4 is waiting for both thread1 and thread2
+ !! while, in fact, IX-lock and IS-lock are compatible and
+ !! thread4 should not wait for thread2.
+*/
+
+#include <my_global.h>
+#include <waiting_threads.h>
+#include <m_string.h>
+#include "my_cpu.h"
+
+/* status variables */
+
+/**
+ preset table of wait intervals
+*/
+ulonglong wt_wait_table[WT_WAIT_STATS];
+/**
+ wait time distribution (log e scale)
+*/
+uint32 wt_wait_stats[WT_WAIT_STATS+1];
+/**
+ distribution of cycle lengths
+ first column tells whether this was during short or long detection
+*/
+uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
+uint32 wt_success_stats;
+
+#ifdef HAVE_PSI_INTERFACE
+extern PSI_cond_key key_WT_RESOURCE_cond;
+#endif
+
+#ifdef SAFE_STATISTICS
+#define incr(VAR, LOCK) do { my_atomic_add32(&(VAR), 1); } while(0)
+#else
+#define incr(VAR,LOCK) do { (VAR)++; } while(0)
+#endif
+
+static void increment_success_stats()
+{
+ incr(wt_success_stats, success_stats_lock);
+}
+
+static void increment_cycle_stats(uint depth, uint slot)
+{
+ if (depth >= WT_CYCLE_STATS)
+ depth= WT_CYCLE_STATS;
+ incr(wt_cycle_stats[slot][depth], cycle_stats_lock);
+}
+
+static void increment_wait_stats(ulonglong waited,int ret)
+{
+ uint i;
+ if ((ret) == ETIMEDOUT)
+ i= WT_WAIT_STATS;
+ else
+ for (i= 0; i < WT_WAIT_STATS && waited/10 > wt_wait_table[i]; i++) ;
+ incr(wt_wait_stats[i], wait_stats_lock);
+}
+
+/*
+ 'lock' protects 'owners', 'state', and 'waiter_count'
+ 'id' is read-only
+
+ a resource is picked up from a hash in a lock-free manner
+ it's returned pinned, so it cannot be freed at once
+ but it may be freed right after the pin is removed
+ to free a resource it should
+ 1. have no owners
+ 2. have no waiters
+
+ two ways to access a resource:
+ 1. find it in a hash
+ - it's returned pinned.
+ a) take a lock in exclusive mode
+ b) check the state, it should be ACTIVE to be usable
+ c) unpin
+ 2. by a direct reference
+ - could only used if a resource cannot be freed
+ e.g. accessing a resource by thd->waiting_for is safe,
+ a resource cannot be freed as there's a thread waiting for it
+*/
+struct st_wt_resource {
+ WT_RESOURCE_ID id;
+ uint waiter_count;
+ enum { ACTIVE, FREE } state;
+#ifndef DBUG_OFF
+ mysql_mutex_t *cond_mutex; /* a mutex for the 'cond' below */
+#endif
+
+#ifdef WT_RWLOCKS_USE_MUTEXES
+ /*
+ we need a special rwlock-like 'lock' to allow readers bypass
+ waiting writers, otherwise readers can deadlock. For example:
+
+ A waits on resource x, owned by B, B waits on resource y, owned
+ by A, we have a cycle (A->x->B->y->A)
+ Both A and B start deadlock detection:
+
+ A locks x B locks y
+ A goes deeper B goes deeper
+ A locks y B locks x
+
+ with mutexes it would deadlock. With rwlocks it won't, as long
+ as both A and B are taking read locks (and they do).
+ But other threads may take write locks. Assume there's
+ C who wants to start waiting on x, and D who wants to start
+ waiting on y.
+
+ A read-locks x B read-locks y
+ A goes deeper B goes deeper
+ => C write-locks x (to add a new edge) D write-locks y
+ .. C is blocked D is blocked
+ A read-locks y B read-locks x
+
+ Now, if a read lock can bypass a pending wrote lock request, we're fine.
+ If it can not, we have a deadlock.
+
+ writer starvation is technically possible, but unlikely, because
+ the contention is expected to be low.
+ */
+ struct {
+ pthread_cond_t cond;
+ pthread_mutex_t mutex;
+ uint readers: 16;
+ uint pending_writers: 15;
+ uint write_locked: 1;
+ } lock;
+#else
+ rw_lock_t lock;
+#endif
+ mysql_cond_t cond; /* the corresponding mutex is provided by the caller */
+ DYNAMIC_ARRAY owners;
+};
+
+#ifdef WT_RWLOCKS_USE_MUTEXES
+static void rc_rwlock_init(WT_RESOURCE *rc)
+{
+ pthread_cond_init(&rc->lock.cond, 0);
+ pthread_mutex_init(&rc->lock.mutex, MY_MUTEX_INIT_FAST);
+}
+static void rc_rwlock_destroy(WT_RESOURCE *rc)
+{
+ DBUG_ASSERT(rc->lock.write_locked == 0);
+ DBUG_ASSERT(rc->lock.readers == 0);
+ pthread_cond_destroy(&rc->lock.cond);
+ pthread_mutex_destroy(&rc->lock.mutex);
+}
+static void rc_rdlock(WT_RESOURCE *rc)
+{
+ DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
+ pthread_mutex_lock(&rc->lock.mutex);
+ while (rc->lock.write_locked)
+ pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
+ rc->lock.readers++;
+ pthread_mutex_unlock(&rc->lock.mutex);
+ DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
+}
+static void rc_wrlock(WT_RESOURCE *rc)
+{
+ DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
+ pthread_mutex_lock(&rc->lock.mutex);
+ while (rc->lock.write_locked || rc->lock.readers)
+ pthread_cond_wait(&rc->lock.cond, &rc->lock.mutex);
+ rc->lock.write_locked= 1;
+ pthread_mutex_unlock(&rc->lock.mutex);
+ DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
+}
+static void rc_unlock(WT_RESOURCE *rc)
+{
+ DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
+ pthread_mutex_lock(&rc->lock.mutex);
+ if (rc->lock.write_locked)
+ {
+ rc->lock.write_locked= 0;
+ pthread_cond_broadcast(&rc->lock.cond);
+ }
+ else if (--rc->lock.readers == 0)
+ pthread_cond_broadcast(&rc->lock.cond);
+ pthread_mutex_unlock(&rc->lock.mutex);
+}
+#else
+static void rc_rwlock_init(WT_RESOURCE *rc)
+{
+ my_rwlock_init(&rc->lock, 0);
+}
+static void rc_rwlock_destroy(WT_RESOURCE *rc)
+{
+ rwlock_destroy(&rc->lock);
+}
+static void rc_rdlock(WT_RESOURCE *rc)
+{
+ DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
+ rw_rdlock(&rc->lock);
+ DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
+}
+static void rc_wrlock(WT_RESOURCE *rc)
+{
+ DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
+ rw_wrlock(&rc->lock);
+ DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
+}
+static void rc_unlock(WT_RESOURCE *rc)
+{
+ DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
+ rw_unlock(&rc->lock);
+}
+#endif
+
+/*
+ All resources are stored in a lock-free hash. Different threads
+ may add new resources and perform deadlock detection concurrently.
+*/
+static LF_HASH reshash;
+
+/**
+ WT_RESOURCE constructor
+
+ It's called from lf_hash and takes a pointer to an LF_SLIST instance.
+ WT_RESOURCE is located at arg+sizeof(LF_SLIST)
+*/
+static void wt_resource_create(uchar *arg)
+{
+ WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
+ DBUG_ENTER("wt_resource_create");
+
+ bzero(rc, sizeof(*rc));
+ rc_rwlock_init(rc);
+ mysql_cond_init(key_WT_RESOURCE_cond, &rc->cond, 0);
+ my_init_dynamic_array(PSI_INSTRUMENT_ME, &rc->owners,
+ sizeof(WT_THD *), 0, 5, MYF(0));
+ DBUG_VOID_RETURN;
+}
+
+/**
+ WT_RESOURCE destructor
+
+ It's called from lf_hash and takes a pointer to an LF_SLIST instance.
+ WT_RESOURCE is located at arg+sizeof(LF_SLIST)
+*/
+static void wt_resource_destroy(uchar *arg)
+{
+ WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
+ DBUG_ENTER("wt_resource_destroy");
+
+ DBUG_ASSERT(rc->owners.elements == 0);
+ rc_rwlock_destroy(rc);
+ mysql_cond_destroy(&rc->cond);
+ delete_dynamic(&rc->owners);
+ DBUG_VOID_RETURN;
+}
+
+/**
+ WT_RESOURCE initializer
+
+ It's called from lf_hash when an element is inserted.
+*/
+static void wt_resource_init(LF_HASH *hash __attribute__((unused)),
+ WT_RESOURCE *rc, WT_RESOURCE_ID *id)
+{
+ DBUG_ENTER("wt_resource_init");
+ rc->id= *id;
+ rc->waiter_count= 0;
+ rc->state= ACTIVE;
+#ifndef DBUG_OFF
+ rc->cond_mutex= 0;
+#endif
+ DBUG_VOID_RETURN;
+}
+
+static int wt_init_done;
+
+void wt_init()
+{
+ DBUG_ENTER("wt_init");
+ DBUG_ASSERT(reshash.alloc.constructor != wt_resource_create);
+
+ lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
+ sizeof_WT_RESOURCE_ID, 0, 0);
+ reshash.alloc.constructor= wt_resource_create;
+ reshash.alloc.destructor= wt_resource_destroy;
+ reshash.initializer= (lf_hash_initializer) wt_resource_init;
+
+ bzero(wt_wait_stats, sizeof(wt_wait_stats));
+ bzero(wt_cycle_stats, sizeof(wt_cycle_stats));
+ wt_success_stats= 0;
+ { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
+ int i;
+ double from= log(1); /* 1 us */
+ double to= log(60e6); /* 1 min */
+ for (i= 0; i < WT_WAIT_STATS; i++)
+ {
+ wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
+ DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
+ }
+ }
+ wt_init_done= 1;
+ DBUG_VOID_RETURN;
+}
+
+void wt_end()
+{
+ DBUG_ENTER("wt_end");
+ if (!wt_init_done)
+ DBUG_VOID_RETURN;
+
+ DBUG_ASSERT(reshash.count == 0);
+ lf_hash_destroy(&reshash);
+ reshash.alloc.constructor= NULL;
+ wt_init_done= 0;
+ DBUG_VOID_RETURN;
+}
+
+/**
+ Lazy WT_THD initialization
+
+ Cheap initialization of WT_THD. Only initialize fields that don't require
+ memory allocations - basically, it only does assignments. The rest of the
+ WT_THD structure will be initialized on demand, on the first use.
+ This allows one to initialize lazily all WT_THD structures, even if some
+ (or even most) of them will never be used for deadlock detection.
+
+ @param ds a pointer to deadlock search depth short value
+ @param ts a pointer to deadlock timeout short value (microseconds)
+ @param dl a pointer to deadlock search depth long value
+ @param tl a pointer to deadlock timeout long value (microseconds)
+
+ @note these are pointers to values, and WT_THD stores them as pointers.
+ It allows one later to change search depths and timeouts for existing
+ threads. It also means that the pointers must stay valid for the lifetime
+ of WT_THD.
+*/
+void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
+ const ulong *dl, const ulong *tl)
+{
+ DBUG_ENTER("wt_thd_lazy_init");
+ thd->waiting_for= 0;
+ thd->weight= 0;
+ thd->deadlock_search_depth_short= ds;
+ thd->timeout_short= ts;
+ thd->deadlock_search_depth_long= dl;
+ thd->timeout_long= tl;
+ /* dynamic array is also initialized lazily - without memory allocations */
+ my_init_dynamic_array(PSI_INSTRUMENT_ME, &thd->my_resources,
+ sizeof(WT_RESOURCE *), 0, 5, MYF(0));
+#ifndef DBUG_OFF
+ thd->name= my_thread_name();
+#endif
+ DBUG_VOID_RETURN;
+}
+
+/**
+ Finalize WT_THD initialization
+
+ After lazy WT_THD initialization, parts of the structure are still
+ uninitialized. This function completes the initialization, allocating
+ memory, if necessary. It's called automatically on demand, when WT_THD
+ is about to be used.
+*/
+static int fix_thd_pins(WT_THD *thd)
+{
+ if (unlikely(thd->pins == 0))
+ {
+ thd->pins= lf_hash_get_pins(&reshash);
+#ifndef DBUG_OFF
+ thd->name= my_thread_name();
+#endif
+ }
+ return thd->pins == 0;
+}
+
+void wt_thd_destroy(WT_THD *thd)
+{
+ DBUG_ENTER("wt_thd_destroy");
+
+ DBUG_ASSERT(thd->my_resources.elements == 0);
+ DBUG_ASSERT(thd->waiting_for == 0);
+
+ if (thd->pins != 0)
+ lf_hash_put_pins(thd->pins);
+
+ delete_dynamic(&thd->my_resources);
+ DBUG_VOID_RETURN;
+}
+/**
+ Trivial resource id comparison function - bytewise memcmp.
+
+ It can be used in WT_RESOURCE_TYPE structures where bytewise
+ comparison of values is sufficient.
+*/
+my_bool wt_resource_id_memcmp(const void *a, const void *b)
+{
+ /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
+ compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
+ return MY_TEST(memcmp(a, b, sizeof_WT_RESOURCE_ID));
+}
+
+/**
+ arguments for the recursive deadlock_search function
+*/
+struct deadlock_arg {
+ WT_THD * const thd; /**< starting point of a search */
+ uint const max_depth; /**< search depth limit */
+ WT_THD *victim; /**< a thread to be killed to resolve a deadlock */
+ WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
+};
+
+/**
+ helper function to change the victim, according to the weight
+*/
+static void change_victim(WT_THD* found, struct deadlock_arg *arg)
+{
+ if (found->weight < arg->victim->weight)
+ {
+ if (arg->victim != arg->thd)
+ {
+ rc_unlock(arg->victim->waiting_for); /* release the previous victim */
+ DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
+ }
+ arg->victim= found;
+ arg->last_locked_rc= 0;
+ }
+}
+
+/**
+ recursive loop detection in a wait-for graph with a limited search depth
+*/
+static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
+ uint depth)
+{
+ WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for;
+ WT_THD *cursor;
+ size_t i;
+ int ret= WT_OK;
+ DBUG_ENTER("deadlock_search");
+ DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u",
+ arg->thd->name, blocker->name, depth));
+
+ arg->last_locked_rc= 0;
+
+ if (depth > arg->max_depth)
+ {
+ DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)"));
+ DBUG_RETURN(WT_DEPTH_EXCEEDED);
+ }
+
+retry:
+ /*
+ safe dereference as explained in lf_alloc-pin.c
+ (in short: protects against lf_alloc_free() in lf_hash_delete())
+ */
+ do
+ {
+ rc= *shared_ptr;
+ lf_pin(arg->thd->pins, 0, rc);
+ } while (rc != *shared_ptr && LF_BACKOFF());
+
+ if (rc == 0)
+ {
+ DBUG_PRINT("wt", ("exit: OK (early)"));
+ DBUG_RETURN(0);
+ }
+
+ rc_rdlock(rc);
+ if (rc->state != ACTIVE || *shared_ptr != rc)
+ {
+ /* blocker is not waiting on this resource anymore */
+ rc_unlock(rc);
+ lf_unpin(arg->thd->pins, 0);
+ goto retry;
+ }
+ /* as the state is locked, we can unpin now */
+ lf_unpin(arg->thd->pins, 0);
+
+ /*
+ Below is not a pure depth-first search. It's a depth-first with a
+ slightest hint of breadth-first. Depth-first is:
+
+ check(element, X):
+ foreach current in element->nodes[] do:
+ if current == X return error;
+ check(current, X);
+
+ while we do
+
+ check(element, X):
+ foreach current in element->nodes[] do:
+ if current == X return error;
+ foreach current in element->nodes[] do:
+ check(current, X);
+
+ preferring shorter deadlocks over longer ones.
+ */
+ for (i= 0; i < rc->owners.elements; i++)
+ {
+ cursor= *dynamic_element(&rc->owners, i, WT_THD**);
+ /*
+ We're only looking for (and detecting) cycles that include 'arg->thd'.
+ That is, only deadlocks that *we* have created. For example,
+ thd->A->B->thd
+ (thd waits for A, A waits for B, while B is waiting for thd).
+ While walking the graph we can encounter other cicles, e.g.
+ thd->A->B->C->A
+ This will not be detected. Instead we will walk it in circles until
+ the search depth limit is reached (the latter guarantees that an
+ infinite loop is impossible). We expect the thread that has created
+ the cycle (one of A, B, and C) to detect its deadlock.
+ */
+ if (cursor == arg->thd)
+ {
+ ret= WT_DEADLOCK;
+ increment_cycle_stats(depth, arg->max_depth ==
+ *arg->thd->deadlock_search_depth_long);
+ arg->victim= cursor;
+ goto end;
+ }
+ }
+ for (i= 0; i < rc->owners.elements; i++)
+ {
+ cursor= *dynamic_element(&rc->owners, i, WT_THD**);
+ switch (deadlock_search(arg, cursor, depth+1)) {
+ case WT_OK:
+ break;
+ case WT_DEPTH_EXCEEDED:
+ ret= WT_DEPTH_EXCEEDED;
+ break;
+ case WT_DEADLOCK:
+ ret= WT_DEADLOCK;
+ change_victim(cursor, arg); /* also sets arg->last_locked_rc to 0 */
+ i= rc->owners.elements; /* jump out of the loop */
+ break;
+ default:
+ DBUG_ASSERT(0);
+ }
+ if (arg->last_locked_rc)
+ rc_unlock(arg->last_locked_rc);
+ }
+end:
+ /*
+ Note that 'rc' is locked in this function, but it's never unlocked here.
+ Instead it's saved in arg->last_locked_rc and the *caller* is
+ expected to unlock it. It's done to support different killing
+ strategies. This is how it works:
+ Assuming a graph
+
+ thd->A->B->C->thd
+
+ deadlock_search() function starts from thd, locks it (in fact it locks not
+ a thd, but a resource it is waiting on, but below, for simplicity, I'll
+ talk about "locking a thd"). Then it goes down recursively, locks A, and so
+ on. Goes down recursively, locks B. Goes down recursively, locks C.
+ Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
+ Returns from the last deadlock_search() call. C stays locked!
+ Now it checks whether C is a more appropriate victim than 'thd'.
+ If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
+ Now it checks whether B is a more appropriate victim than arg->victim.
+ If yes - old arg->victim is unlocked and arg->victim=B,
+ otherwise B is unlocked. Return.
+ And so on.
+
+ In short, a resource is locked in a frame. But it's not unlocked in the
+ same frame, it's unlocked by the caller, and only after the caller checks
+ that it doesn't need to use current WT_THD as a victim. If it does - the
+ lock is kept and the old victim's resource is unlocked. When the recursion
+ is unrolled and we are back to deadlock() function, there are only two
+ locks left - on thd and on the victim.
+ */
+ arg->last_locked_rc= rc;
+ DBUG_PRINT("wt", ("exit: %s",
+ ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
+ ret ? "WT_DEADLOCK" : "OK"));
+ DBUG_RETURN(ret);
+}
+
+/**
+ Deadlock detection in a wait-for graph
+
+ A wrapper for recursive deadlock_search() - prepares deadlock_arg structure,
+ invokes deadlock_search(), increments statistics, notifies the victim.
+
+ @param thd thread that is going to wait. Deadlock is detected
+ if, while walking the graph, we reach a thread that
+ is waiting on thd
+ @param blocker starting point of a search. In wt_thd_cond_timedwait()
+ it's thd, in wt_thd_will_wait_for() it's a thread that
+ thd is going to wait for
+ @param depth starting search depth. In general it's the number of
+ edges in the wait-for graph between thd and the
+ blocker. Practically only two values are used (and
+ supported) - when thd == blocker it's 0, when thd
+ waits directly for blocker, it's 1
+ @param max_depth search depth limit
+*/
+static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
+ uint max_depth)
+{
+ struct deadlock_arg arg= {thd, max_depth, 0, 0};
+ int ret;
+ DBUG_ENTER("deadlock");
+ DBUG_ASSERT(depth < 2);
+ ret= deadlock_search(&arg, blocker, depth);
+ if (ret == WT_DEPTH_EXCEEDED)
+ {
+ increment_cycle_stats(WT_CYCLE_STATS, max_depth ==
+ *thd->deadlock_search_depth_long);
+ ret= WT_OK;
+ }
+ /*
+ if we started with depth==1, blocker was never considered for a victim
+ in deadlock_search(). Do it here.
+ */
+ if (ret == WT_DEADLOCK && depth)
+ change_victim(blocker, &arg);
+ if (arg.last_locked_rc)
+ {
+ /*
+ Special return code if there's nobody to wait for.
+
+ depth == 0 means that we start the search from thd (thd == blocker).
+ ret == WT_OK means that no cycle was found and
+ arg.last_locked_rc == thd->waiting_for.
+ and arg.last_locked_rc->owners.elements == 0 means that
+ (applying the rule above) thd->waiting_for->owners.elements == 0,
+ and thd doesn't have anybody to wait for.
+ */
+ if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
+ {
+ DBUG_ASSERT(thd == blocker);
+ DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
+ ret= WT_FREE_TO_GO;
+ }
+ rc_unlock(arg.last_locked_rc);
+ }
+ /* notify the victim, if appropriate */
+ if (ret == WT_DEADLOCK && arg.victim != thd)
+ {
+ DBUG_PRINT("wt", ("killing %s", arg.victim->name));
+ arg.victim->killed= 1;
+ mysql_cond_broadcast(&arg.victim->waiting_for->cond);
+ rc_unlock(arg.victim->waiting_for);
+ ret= WT_OK;
+ }
+ DBUG_RETURN(ret);
+}
+
+
+/**
+ Delete an element from reshash if it has no waiters or owners
+
+ rc->lock must be locked by the caller and it's unlocked on return.
+*/
+static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
+{
+ uint keylen;
+ const void *key;
+ DBUG_ENTER("unlock_lock_and_free_resource");
+
+ DBUG_ASSERT(rc->state == ACTIVE);
+
+ if (rc->owners.elements || rc->waiter_count)
+ {
+ DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
+ rc->owners.elements, rc->waiter_count));
+ rc_unlock(rc);
+ DBUG_RETURN(0);
+ }
+
+ if (fix_thd_pins(thd))
+ {
+ rc_unlock(rc);
+ DBUG_RETURN(1);
+ }
+
+ /* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */
+ {
+ key= &rc->id;
+ keylen= sizeof_WT_RESOURCE_ID;
+ }
+
+ /*
+ To free the element correctly we need to:
+ 1. take its lock (already done).
+ 2. set the state to FREE
+ 3. release the lock
+ 4. remove from the hash
+ */
+ rc->state= FREE;
+ rc_unlock(rc);
+ DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
+}
+
+
+/**
+ register the fact that thd is not waiting anymore
+
+ decrease waiter_count, clear waiting_for, free the resource if appropriate.
+ thd->waiting_for must be locked!
+*/
+static int stop_waiting_locked(WT_THD *thd)
+{
+ int ret;
+ WT_RESOURCE *rc= thd->waiting_for;
+ DBUG_ENTER("stop_waiting_locked");
+
+ DBUG_ASSERT(rc->waiter_count);
+ DBUG_ASSERT(rc->state == ACTIVE);
+ rc->waiter_count--;
+ thd->waiting_for= 0;
+ ret= unlock_lock_and_free_resource(thd, rc);
+ DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK);
+}
+
+/**
+ register the fact that thd is not waiting anymore
+
+ locks thd->waiting_for and calls stop_waiting_locked().
+*/
+static int stop_waiting(WT_THD *thd)
+{
+ int ret;
+ WT_RESOURCE *rc= thd->waiting_for;
+ DBUG_ENTER("stop_waiting");
+
+ if (!rc)
+ DBUG_RETURN(WT_OK);
+ /*
+ nobody's trying to free the resource now,
+ as its waiter_count is guaranteed to be non-zero
+ */
+ rc_wrlock(rc);
+ ret= stop_waiting_locked(thd);
+ DBUG_RETURN(ret);
+}
+
+/**
+ notify the system that a thread needs to wait for another thread
+
+ called by a *waiter* to declare that it (thd) will wait for another
+ thread (blocker) on a specific resource (resid).
+ can be called many times, if many blockers own a blocking resource.
+ but must always be called with the same resource id - a thread cannot
+ wait for more than one resource at a time.
+
+ @return WT_OK or WT_DEADLOCK
+
+ As a new edge is added to the wait-for graph, a deadlock detection is
+ performed for this new edge.
+*/
+int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
+ const WT_RESOURCE_ID *resid)
+{
+ uint i;
+ WT_RESOURCE *rc;
+ DBUG_ENTER("wt_thd_will_wait_for");
+
+ DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu",
+ thd->name, blocker->name, (ulong)resid->value));
+
+ if (fix_thd_pins(thd))
+ DBUG_RETURN(WT_DEADLOCK);
+
+ if (thd->waiting_for == 0)
+ {
+ uint keylen;
+ const void *key;
+ /* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */
+ {
+ key= resid;
+ keylen= sizeof_WT_RESOURCE_ID;
+ }
+
+ DBUG_PRINT("wt", ("first blocker"));
+
+retry:
+ while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0)
+ {
+ DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));
+
+ if (lf_hash_insert(&reshash, thd->pins, resid) == -1) /* if OOM */
+ DBUG_RETURN(WT_DEADLOCK);
+ /*
+ Two cases: either lf_hash_insert() failed - because another thread
+ has just inserted a resource with the same id - and we need to retry.
+ Or lf_hash_insert() succeeded, and then we need to repeat
+ lf_hash_search() to find a real address of the newly inserted element.
+ That is, we don't care what lf_hash_insert() has returned.
+ And we need to repeat the loop anyway.
+ */
+ }
+ if (rc == MY_ERRPTR)
+ DBUG_RETURN(WT_DEADLOCK);
+
+ DBUG_PRINT("wt", ("found in hash rc=%p", rc));
+
+ rc_wrlock(rc);
+ if (rc->state != ACTIVE)
+ {
+ DBUG_PRINT("wt", ("but it's not active, retrying"));
+ /* Somebody has freed the element while we weren't looking */
+ rc_unlock(rc);
+ lf_hash_search_unpin(thd->pins);
+ goto retry;
+ }
+
+ lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */
+ thd->waiting_for= rc;
+ rc->waiter_count++;
+ thd->killed= 0;
+ }
+ else
+ {
+ DBUG_ASSERT(thd->waiting_for->id.type == resid->type);
+ DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0);
+ DBUG_PRINT("wt", ("adding another blocker"));
+
+ /*
+ we can safely access the resource here, it's in the hash as it has
+ non-zero waiter_count
+ */
+ rc= thd->waiting_for;
+ rc_wrlock(rc);
+ DBUG_ASSERT(rc->waiter_count);
+ DBUG_ASSERT(rc->state == ACTIVE);
+
+ if (thd->killed)
+ {
+ stop_waiting_locked(thd);
+ DBUG_RETURN(WT_DEADLOCK);
+ }
+ }
+ /*
+ Another thread could be waiting on this resource for this very 'blocker'.
+ In this case we should not add it to the list for the second time.
+ */
+ for (i= 0; i < rc->owners.elements; i++)
+ if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
+ break;
+ if (i >= rc->owners.elements)
+ {
+ if (push_dynamic(&blocker->my_resources, (void*)&rc))
+ {
+ stop_waiting_locked(thd);
+ DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
+ }
+ if (push_dynamic(&rc->owners, (void*)&blocker))
+ {
+ pop_dynamic(&blocker->my_resources);
+ stop_waiting_locked(thd);
+ DBUG_RETURN(WT_DEADLOCK);
+ }
+ }
+ rc_unlock(rc);
+
+ if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
+ {
+ stop_waiting(thd);
+ DBUG_RETURN(WT_DEADLOCK);
+ }
+ DBUG_RETURN(WT_OK);
+}
+
+/**
+ called by a *waiter* (thd) to start waiting
+
+ It's supposed to be a drop-in replacement for
+ mysql_cond_timedwait(), and it takes mutex as an argument.
+
+ @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
+*/
+int wt_thd_cond_timedwait(WT_THD *thd, mysql_mutex_t *mutex)
+{
+ int ret= WT_TIMEOUT;
+ struct timespec timeout;
+ my_hrtime_t before, after, starttime;
+ WT_RESOURCE *rc= thd->waiting_for;
+ ulonglong end_wait_time;
+ DBUG_ENTER("wt_thd_cond_timedwait");
+ DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));
+
+#ifndef DBUG_OFF
+ if (rc->cond_mutex)
+ DBUG_ASSERT(rc->cond_mutex == mutex);
+ else
+ rc->cond_mutex= mutex;
+ mysql_mutex_assert_owner(mutex);
+#endif
+
+ before= starttime= my_hrtime();
+
+ rc_wrlock(rc);
+ if (rc->owners.elements == 0)
+ ret= WT_OK;
+ rc_unlock(rc);
+
+ end_wait_time= starttime.val *1000 + (*thd->timeout_short)*1000000ULL;
+ set_timespec_time_nsec(timeout, end_wait_time);
+ if (ret == WT_TIMEOUT && !thd->killed)
+ ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
+ if (ret == WT_TIMEOUT && !thd->killed)
+ {
+ int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
+ if (r == WT_FREE_TO_GO)
+ ret= WT_OK;
+ else if (r != WT_OK)
+ ret= WT_DEADLOCK;
+ else if (*thd->timeout_long > *thd->timeout_short)
+ {
+ end_wait_time= starttime.val *1000 + (*thd->timeout_long)*1000000ULL;
+ set_timespec_time_nsec(timeout, end_wait_time);
+ if (!thd->killed)
+ ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
+ }
+ }
+ after= my_hrtime();
+ if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
+ ret= WT_DEADLOCK;
+ increment_wait_stats(after.val-before.val, ret);
+ if (ret == WT_OK)
+ increment_success_stats();
+ DBUG_RETURN(ret);
+}
+
+/**
+ called by a *blocker* when it releases a resource
+
+ it's conceptually similar to pthread_cond_broadcast, and must be done
+ under the same mutex as wt_thd_cond_timedwait().
+
+ @param resid a resource to release. 0 to release all resources
+*/
+
+void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
+{
+ uint i;
+ DBUG_ENTER("wt_thd_release");
+
+ for (i= 0; i < thd->my_resources.elements; i++)
+ {
+ WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
+ if (!resid || (resid->type->compare(&rc->id, resid) == 0))
+ {
+ uint j;
+
+ rc_wrlock(rc);
+ /*
+ nobody's trying to free the resource now,
+ as its owners[] array is not empty (at least thd must be there)
+ */
+ DBUG_ASSERT(rc->state == ACTIVE);
+ for (j= 0; j < rc->owners.elements; j++)
+ if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
+ break;
+ DBUG_ASSERT(j < rc->owners.elements);
+ delete_dynamic_element(&rc->owners, j);
+ if (rc->owners.elements == 0)
+ {
+ mysql_cond_broadcast(&rc->cond);
+#ifndef DBUG_OFF
+ if (rc->cond_mutex)
+ mysql_mutex_assert_owner(rc->cond_mutex);
+#endif
+ }
+ unlock_lock_and_free_resource(thd, rc);
+ if (resid)
+ {
+ delete_dynamic_element(&thd->my_resources, i);
+ DBUG_VOID_RETURN;
+ }
+ }
+ }
+ if (!resid)
+ reset_dynamic(&thd->my_resources);
+ DBUG_VOID_RETURN;
+}
+