Diffstat (limited to 'debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch')
-rw-r--r-- | debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch | 207
1 file changed, 207 insertions, 0 deletions
diff --git a/debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch b/debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch
new file mode 100644
index 000000000..10fe17fc0
--- /dev/null
+++ b/debian/patches-rt/0258-workqueue-Prevent-deadlock-stall-on-RT.patch
@@ -0,0 +1,207 @@
+From 95516b448b8dcc30ac9e5fc0e38c1c3d3f8ef930 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Fri, 27 Jun 2014 16:24:52 +0200
+Subject: [PATCH 258/347] workqueue: Prevent deadlock/stall on RT
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.19/older/patches-4.19.246-rt110.tar.xz
+
+Austin reported an XFS deadlock/stall on RT where scheduled work never
+gets executed and tasks wait for each other forever.
+
+The underlying problem is the RT modification to the handling of
+workers which are about to go to sleep. In mainline a worker thread
+which goes to sleep wakes an idle worker if there is more work to do.
+This happens from the guts of the schedule() function. On RT this must
+happen outside of schedule(), and the accessed data structures are not
+protected against scheduling due to the spinlock to rtmutex
+conversion. So the naive solution was to move the code outside of the
+scheduler and protect the data structures with the pool lock. That
+approach turned out to be a little naive, as we cannot call into that
+code when the thread blocks on a lock: it is not allowed to block on
+two locks in parallel. So we don't call into the worker wakeup magic
+when the worker is blocked on a lock, which causes the deadlock/stall
+observed by Austin and Mike.
+
+Looking deeper into that worker code, it turns out that the only
+relevant data structure which needs to be protected is the list of
+idle workers which can be woken up.
+
+So the solution is to protect the list manipulation operations with
+preempt_disable/enable pairs on RT and to call unconditionally into
+the worker code even when the worker is blocked on a lock. The
+preemption protection is safe as there is nothing which can fiddle
+with the list outside of thread context.
+
+Reported-and-tested-by: Austin Schuh <austin@peloton-tech.com>
+Reported-and-tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos
+Cc: Richard Weinberger <richard.weinberger@gmail.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+---
+ kernel/sched/core.c |  6 +++--
+ kernel/workqueue.c  | 60 +++++++++++++++++++++++++++++++++++----------
+ 2 files changed, 51 insertions(+), 15 deletions(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 0995748a3a1d..feaf4d5683af 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -3615,9 +3615,8 @@ void __noreturn do_task_dead(void)
+ 
+ static inline void sched_submit_work(struct task_struct *tsk)
+ {
+-	if (!tsk->state || tsk_is_pi_blocked(tsk))
++	if (!tsk->state)
+ 		return;
+-
+ 	/*
+ 	 * If a worker went to sleep, notify and ask workqueue whether
+ 	 * it wants to wake up a task to maintain concurrency.
+@@ -3631,6 +3630,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
+ 		preempt_enable_no_resched();
+ 	}
+ 
++	if (tsk_is_pi_blocked(tsk))
++		return;
++
+ 	/*
+ 	 * If we are going to sleep and we have plugged IO queued,
+ 	 * make sure to submit it to avoid deadlocks.
+diff --git a/kernel/workqueue.c b/kernel/workqueue.c
+index 0c529f920232..91f5696cf335 100644
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -126,6 +126,11 @@ enum {
+  * cpu or grabbing pool->lock is enough for read access. If
+  * POOL_DISASSOCIATED is set, it's identical to L.
+  *
++ * On RT we need the extra protection via rt_lock_idle_list() for
++ * the list manipulations against read access from
++ * wq_worker_sleeping(). All other places are nicely serialized via
++ * pool->lock.
++ *
+  * A: wq_pool_attach_mutex protected.
+  *
+  * PL: wq_pool_mutex protected.
+@@ -431,6 +436,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
+ 		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
+ 		else
+ 
++#ifdef CONFIG_PREEMPT_RT_BASE
++static inline void rt_lock_idle_list(struct worker_pool *pool)
++{
++	preempt_disable();
++}
++static inline void rt_unlock_idle_list(struct worker_pool *pool)
++{
++	preempt_enable();
++}
++static inline void sched_lock_idle_list(struct worker_pool *pool) { }
++static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
++#else
++static inline void rt_lock_idle_list(struct worker_pool *pool) { }
++static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
++static inline void sched_lock_idle_list(struct worker_pool *pool)
++{
++	spin_lock_irq(&pool->lock);
++}
++static inline void sched_unlock_idle_list(struct worker_pool *pool)
++{
++	spin_unlock_irq(&pool->lock);
++}
++#endif
++
++
+ #ifdef CONFIG_DEBUG_OBJECTS_WORK
+ 
+ static struct debug_obj_descr work_debug_descr;
+@@ -837,10 +867,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
+  */
+ static void wake_up_worker(struct worker_pool *pool)
+ {
+-	struct worker *worker = first_idle_worker(pool);
++	struct worker *worker;
++
++	rt_lock_idle_list(pool);
++
++	worker = first_idle_worker(pool);
+ 
+ 	if (likely(worker))
+ 		wake_up_process(worker->task);
++
++	rt_unlock_idle_list(pool);
+ }
+ 
+ /**
+@@ -869,7 +905,7 @@ void wq_worker_running(struct task_struct *task)
+  */
+ void wq_worker_sleeping(struct task_struct *task)
+ {
+-	struct worker *next, *worker = kthread_data(task);
++	struct worker *worker = kthread_data(task);
+ 	struct worker_pool *pool;
+ 
+ 	/*
+@@ -886,26 +922,18 @@ void wq_worker_sleeping(struct task_struct *task)
+ 		return;
+ 
+ 	worker->sleeping = 1;
+-	spin_lock_irq(&pool->lock);
+ 
+ 	/*
+ 	 * The counterpart of the following dec_and_test, implied mb,
+ 	 * worklist not empty test sequence is in insert_work().
+ 	 * Please read comment there.
+-	 *
+-	 * NOT_RUNNING is clear. This means that we're bound to and
+-	 * running on the local cpu w/ rq lock held and preemption
+-	 * disabled, which in turn means that none else could be
+-	 * manipulating idle_list, so dereferencing idle_list without pool
+-	 * lock is safe.
+ 	 */
+ 	if (atomic_dec_and_test(&pool->nr_running) &&
+ 	    !list_empty(&pool->worklist)) {
+-		next = first_idle_worker(pool);
+-		if (next)
+-			wake_up_process(next->task);
++		sched_lock_idle_list(pool);
++		wake_up_worker(pool);
++		sched_unlock_idle_list(pool);
+ 	}
+-	spin_unlock_irq(&pool->lock);
+ }
+ 
+ /**
+@@ -1678,7 +1706,9 @@ static void worker_enter_idle(struct worker *worker)
+ 	worker->last_active = jiffies;
+ 
+ 	/* idle_list is LIFO */
++	rt_lock_idle_list(pool);
+ 	list_add(&worker->entry, &pool->idle_list);
++	rt_unlock_idle_list(pool);
+ 
+ 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+@@ -1711,7 +1741,9 @@ static void worker_leave_idle(struct worker *worker)
+ 		return;
+ 	worker_clr_flags(worker, WORKER_IDLE);
+ 	pool->nr_idle--;
++	rt_lock_idle_list(pool);
+ 	list_del_init(&worker->entry);
++	rt_unlock_idle_list(pool);
+ }
+ 
+ static struct worker *alloc_worker(int node)
+@@ -1876,7 +1908,9 @@ static void destroy_worker(struct worker *worker)
+ 	pool->nr_workers--;
+ 	pool->nr_idle--;
+ 
++	rt_lock_idle_list(pool);
+ 	list_del_init(&worker->entry);
++	rt_unlock_idle_list(pool);
+ 	worker->flags |= WORKER_DIE;
+ 	wake_up_process(worker->task);
+ }
+-- 
+2.36.1
+
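To make the locking split easier to follow outside the kernel tree, below is a minimal user-space sketch of the scheme the patch introduces. It is an illustration only, not kernel code: preempt_disable()/preempt_enable() are empty stubs, a pthread mutex stands in for the rtmutex-backed pool->lock, and an nr_idle counter stands in for the idle_list. The helper names (rt_lock_idle_list(), sched_lock_idle_list(), wake_up_worker(), wq_worker_sleeping()) and the CONFIG_PREEMPT_RT_BASE switch mirror the patch; everything else is a simplifying assumption.

/* idle_list_sketch.c - illustration of the RT vs !RT idle-list protection */
#include <pthread.h>
#include <stdio.h>

struct worker_pool {
	pthread_mutex_t lock;	/* stand-in for pool->lock */
	int nr_idle;		/* stand-in for pool->idle_list */
};

/* stubs for the kernel's preemption control */
static void preempt_disable(void) { }
static void preempt_enable(void) { }

#ifdef CONFIG_PREEMPT_RT_BASE
/* RT: the idle-list reader is fenced off by disabling preemption */
static void rt_lock_idle_list(struct worker_pool *p)      { preempt_disable(); }
static void rt_unlock_idle_list(struct worker_pool *p)    { preempt_enable(); }
static void sched_lock_idle_list(struct worker_pool *p)   { (void)p; }
static void sched_unlock_idle_list(struct worker_pool *p) { (void)p; }
#else
/* !RT: the sleeping path takes pool->lock, as mainline does */
static void rt_lock_idle_list(struct worker_pool *p)      { (void)p; }
static void rt_unlock_idle_list(struct worker_pool *p)    { (void)p; }
static void sched_lock_idle_list(struct worker_pool *p)   { pthread_mutex_lock(&p->lock); }
static void sched_unlock_idle_list(struct worker_pool *p) { pthread_mutex_unlock(&p->lock); }
#endif

/* wake_up_worker(): look up and wake the first idle worker */
static void wake_up_worker(struct worker_pool *pool)
{
	rt_lock_idle_list(pool);
	if (pool->nr_idle > 0)
		printf("waking first idle worker\n");
	rt_unlock_idle_list(pool);
}

/* wq_worker_sleeping(): a worker is about to block; maintain concurrency */
static void wq_worker_sleeping(struct worker_pool *pool)
{
	sched_lock_idle_list(pool);
	wake_up_worker(pool);
	sched_unlock_idle_list(pool);
}

int main(void)
{
	struct worker_pool pool = { PTHREAD_MUTEX_INITIALIZER, 1 };

	wq_worker_sleeping(&pool);	/* prints once: one idle worker present */
	return 0;
}

Compiling with or without -DCONFIG_PREEMPT_RT_BASE selects the two variants. The point of the split is that on RT the sleeping path may call into the wakeup code unconditionally, even when the worker blocks on a lock, because the idle-list access is serialized by the preemption-disabled section rather than by the sleeping pool lock.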