Diffstat (limited to 'debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch')
-rw-r--r-- | debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch | 207
1 file changed, 207 insertions, 0 deletions
diff --git a/debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch b/debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch
new file mode 100644
index 000000000..10fe17fc0
--- /dev/null
+++ b/debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch
@@ -0,0 +1,207 @@
+From 95516b448b8dcc30ac9e5fc0e38c1c3d3f8ef930 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Fri, 27 Jun 2014 16:24:52 +0200
+Subject: [PATCH 258/347] workqueue: Prevent deadlock/stall on RT
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.19/older/patches-4.19.246-rt110.tar.xz
+
+Austin reported an XFS deadlock/stall on RT where scheduled work never
+gets executed and tasks wait for each other forever.
+
+The underlying problem is the RT modification to the handling of
+workers which are about to go to sleep. In mainline a worker thread
+which goes to sleep wakes an idle worker if there is more work to do.
+This happens from the guts of the schedule() function. On RT this must
+happen outside of schedule(), and the accessed data structures are not
+protected against scheduling due to the spinlock to rtmutex
+conversion. So the naive solution was to move the code outside of the
+scheduler and protect the data structures with the pool lock. That
+approach turned out to be a little naive, as we cannot call into that
+code when the thread blocks on a lock: it is not allowed to block on
+two locks in parallel. So we don't call into the worker wakeup magic
+when the worker is blocked on a lock, which causes the deadlock/stall
+observed by Austin and Mike.
+
+Looking deeper into that worker code, it turns out that the only
+relevant data structure which needs to be protected is the list of
+idle workers which can be woken up.
+
+So the solution is to protect the list manipulation operations with
+preempt_disable/enable pairs on RT and to call unconditionally into
+the worker code even when the worker is blocked on a lock. The
+preemption protection is safe as there is nothing which can fiddle
+with the list outside of thread context.
+
+Reported-and-tested-by: Austin Schuh <austin@peloton-tech.com>
+Reported-and-tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos
+Cc: Richard Weinberger <richard.weinberger@gmail.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+---
+ kernel/sched/core.c |  6 +++--
+ kernel/workqueue.c  | 60 +++++++++++++++++++++++++++++++++++----------
+ 2 files changed, 51 insertions(+), 15 deletions(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 0995748a3a1d..feaf4d5683af 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -3615,9 +3615,8 @@ void __noreturn do_task_dead(void)
+ 
+ static inline void sched_submit_work(struct task_struct *tsk)
+ {
+-	if (!tsk->state || tsk_is_pi_blocked(tsk))
++	if (!tsk->state)
+ 		return;
+-
+ 	/*
+ 	 * If a worker went to sleep, notify and ask workqueue whether
+ 	 * it wants to wake up a task to maintain concurrency.
+@@ -3631,6 +3630,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
+ 		preempt_enable_no_resched();
+ 	}
+ 
++	if (tsk_is_pi_blocked(tsk))
++		return;
++
+ 	/*
+ 	 * If we are going to sleep and we have plugged IO queued,
+ 	 * make sure to submit it to avoid deadlocks.
+diff --git a/kernel/workqueue.c b/kernel/workqueue.c
+index 0c529f920232..91f5696cf335 100644
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -126,6 +126,11 @@ enum {
+  * cpu or grabbing pool->lock is enough for read access. If
+  * POOL_DISASSOCIATED is set, it's identical to L.
+  *
++ * On RT we need the extra protection via rt_lock_idle_list() for
++ * the list manipulations against read access from
++ * wq_worker_sleeping(). All other places are nicely serialized via
++ * pool->lock.
++ *
+  * A: wq_pool_attach_mutex protected.
+  *
+  * PL: wq_pool_mutex protected.
+@@ -431,6 +436,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
+ 		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
+ 		else
+ 
++#ifdef CONFIG_PREEMPT_RT_BASE
++static inline void rt_lock_idle_list(struct worker_pool *pool)
++{
++	preempt_disable();
++}
++static inline void rt_unlock_idle_list(struct worker_pool *pool)
++{
++	preempt_enable();
++}
++static inline void sched_lock_idle_list(struct worker_pool *pool) { }
++static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
++#else
++static inline void rt_lock_idle_list(struct worker_pool *pool) { }
++static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
++static inline void sched_lock_idle_list(struct worker_pool *pool)
++{
++	spin_lock_irq(&pool->lock);
++}
++static inline void sched_unlock_idle_list(struct worker_pool *pool)
++{
++	spin_unlock_irq(&pool->lock);
++}
++#endif
++
++
+ #ifdef CONFIG_DEBUG_OBJECTS_WORK
+ 
+ static struct debug_obj_descr work_debug_descr;
+@@ -837,10 +867,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
+  */
+ static void wake_up_worker(struct worker_pool *pool)
+ {
+-	struct worker *worker = first_idle_worker(pool);
++	struct worker *worker;
++
++	rt_lock_idle_list(pool);
++
++	worker = first_idle_worker(pool);
+ 
+ 	if (likely(worker))
+ 		wake_up_process(worker->task);
++
++	rt_unlock_idle_list(pool);
+ }
+ 
+ /**
+@@ -869,7 +905,7 @@ void wq_worker_running(struct task_struct *task)
+  */
+ void wq_worker_sleeping(struct task_struct *task)
+ {
+-	struct worker *next, *worker = kthread_data(task);
++	struct worker *worker = kthread_data(task);
+ 	struct worker_pool *pool;
+ 
+ 	/*
+@@ -886,26 +922,18 @@ void wq_worker_sleeping(struct task_struct *task)
+ 		return;
+ 
+ 	worker->sleeping = 1;
+-	spin_lock_irq(&pool->lock);
+ 
+ 	/*
+ 	 * The counterpart of the following dec_and_test, implied mb,
+ 	 * worklist not empty test sequence is in insert_work().
+ 	 * Please read comment there.
+-	 *
+-	 * NOT_RUNNING is clear. This means that we're bound to and
+-	 * running on the local cpu w/ rq lock held and preemption
+-	 * disabled, which in turn means that none else could be
+-	 * manipulating idle_list, so dereferencing idle_list without pool
+-	 * lock is safe.
+ 	 */
+ 	if (atomic_dec_and_test(&pool->nr_running) &&
+ 	    !list_empty(&pool->worklist)) {
+-		next = first_idle_worker(pool);
+-		if (next)
+-			wake_up_process(next->task);
++		sched_lock_idle_list(pool);
++		wake_up_worker(pool);
++		sched_unlock_idle_list(pool);
+ 	}
+-	spin_unlock_irq(&pool->lock);
+ }
+ 
+ /**
+@@ -1678,7 +1706,9 @@ static void worker_enter_idle(struct worker *worker)
+ 	worker->last_active = jiffies;
+ 
+ 	/* idle_list is LIFO */
++	rt_lock_idle_list(pool);
+ 	list_add(&worker->entry, &pool->idle_list);
++	rt_unlock_idle_list(pool);
+ 
+ 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+@@ -1711,7 +1741,9 @@ static void worker_leave_idle(struct worker *worker)
+ 		return;
+ 	worker_clr_flags(worker, WORKER_IDLE);
+ 	pool->nr_idle--;
++	rt_lock_idle_list(pool);
+ 	list_del_init(&worker->entry);
++	rt_unlock_idle_list(pool);
+ }
+ 
+ static struct worker *alloc_worker(int node)
+@@ -1876,7 +1908,9 @@ static void destroy_worker(struct worker *worker)
+ 	pool->nr_workers--;
+ 	pool->nr_idle--;
+ 
++	rt_lock_idle_list(pool);
+ 	list_del_init(&worker->entry);
++	rt_unlock_idle_list(pool);
+ 	worker->flags |= WORKER_DIE;
+ 	wake_up_process(worker->task);
+ }
+-- 
+2.36.1
+
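To make the locking split easier to follow outside the kernel tree, below is a minimal user-space sketch of the scheme the patch introduces. It is an illustration only, not kernel code: preempt_disable()/preempt_enable() are empty stubs, a pthread mutex stands in for the rtmutex-backed pool->lock, and an nr_idle counter stands in for the idle_list. The helper names (rt_lock_idle_list(), sched_lock_idle_list(), wake_up_worker(), wq_worker_sleeping()) and the CONFIG_PREEMPT_RT_BASE switch mirror the patch; everything else is a simplifying assumption.

/* idle_list_sketch.c - illustration of the RT vs !RT idle-list protection */
#include <pthread.h>
#include <stdio.h>

struct worker_pool {
	pthread_mutex_t lock;	/* stand-in for pool->lock */
	int nr_idle;		/* stand-in for pool->idle_list */
};

/* stubs for the kernel's preemption control */
static void preempt_disable(void) { }
static void preempt_enable(void) { }

#ifdef CONFIG_PREEMPT_RT_BASE
/* RT: the idle-list reader is fenced off by disabling preemption */
static void rt_lock_idle_list(struct worker_pool *p)      { preempt_disable(); }
static void rt_unlock_idle_list(struct worker_pool *p)    { preempt_enable(); }
static void sched_lock_idle_list(struct worker_pool *p)   { (void)p; }
static void sched_unlock_idle_list(struct worker_pool *p) { (void)p; }
#else
/* !RT: the sleeping path takes pool->lock, as mainline does */
static void rt_lock_idle_list(struct worker_pool *p)      { (void)p; }
static void rt_unlock_idle_list(struct worker_pool *p)    { (void)p; }
static void sched_lock_idle_list(struct worker_pool *p)   { pthread_mutex_lock(&p->lock); }
static void sched_unlock_idle_list(struct worker_pool *p) { pthread_mutex_unlock(&p->lock); }
#endif

/* wake_up_worker(): look up and wake the first idle worker */
static void wake_up_worker(struct worker_pool *pool)
{
	rt_lock_idle_list(pool);
	if (pool->nr_idle > 0)
		printf("waking first idle worker\n");
	rt_unlock_idle_list(pool);
}

/* wq_worker_sleeping(): a worker is about to block; maintain concurrency */
static void wq_worker_sleeping(struct worker_pool *pool)
{
	sched_lock_idle_list(pool);
	wake_up_worker(pool);
	sched_unlock_idle_list(pool);
}

int main(void)
{
	struct worker_pool pool = { PTHREAD_MUTEX_INITIALIZER, 1 };

	wq_worker_sleeping(&pool);	/* prints once: one idle worker present */
	return 0;
}

Compiling with or without -DCONFIG_PREEMPT_RT_BASE selects the two variants. The point of the split is that on RT the sleeping path may call into the wakeup code unconditionally, even when the worker blocks on a lock, because the idle-list access is serialized by the preemption-disabled section rather than by the sleeping pool lock.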