From: Thomas Gleixner Date: Fri, 27 Jun 2014 16:24:52 +0200 Subject: [PATCH 257/353] workqueue: Prevent deadlock/stall on RT Origin: https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit?id=a62b1ced9d802c6c953e854a71ef4c2bfabfb7e1 Austin reported a XFS deadlock/stall on RT where scheduled work gets never exececuted and tasks are waiting for each other for ever. The underlying problem is the modification of the RT code to the handling of workers which are about to go to sleep. In mainline a worker thread which goes to sleep wakes an idle worker if there is more work to do. This happens from the guts of the schedule() function. On RT this must be outside and the accessed data structures are not protected against scheduling due to the spinlock to rtmutex conversion. So the naive solution to this was to move the code outside of the scheduler and protect the data structures by the pool lock. That approach turned out to be a little naive as we cannot call into that code when the thread blocks on a lock, as it is not allowed to block on two locks in parallel. So we dont call into the worker wakeup magic when the worker is blocked on a lock, which causes the deadlock/stall observed by Austin and Mike. Looking deeper into that worker code it turns out that the only relevant data structure which needs to be protected is the list of idle workers which can be woken up. So the solution is to protect the list manipulation operations with preempt_enable/disable pairs on RT and call unconditionally into the worker code even when the worker is blocked on a lock. The preemption protection is safe as there is nothing which can fiddle with the list outside of thread context. Reported-and_tested-by: Austin Schuh Reported-and_tested-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos Cc: Richard Weinberger Cc: Steven Rostedt --- kernel/sched/core.c | 6 +++-- kernel/workqueue.c | 60 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e581ce1edc12..fd1a7d1ff9f4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3617,9 +3617,8 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + if (!tsk->state) return; - /* * If a worker went to sleep, notify and ask workqueue whether * it wants to wake up a task to maintain concurrency. @@ -3633,6 +3632,9 @@ static inline void sched_submit_work(struct task_struct *tsk) preempt_enable_no_resched(); } + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7561ac884a3e..cb81c7767b17 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -126,6 +126,11 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * + * On RT we need the extra protection via rt_lock_idle_list() for + * the list manipulations against read access from + * wq_worker_sleeping(). All other places are nicely serialized via + * pool->lock. + * * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. @@ -431,6 +436,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else +#ifdef CONFIG_PREEMPT_RT_BASE +static inline void rt_lock_idle_list(struct worker_pool *pool) +{ + preempt_disable(); +} +static inline void rt_unlock_idle_list(struct worker_pool *pool) +{ + preempt_enable(); +} +static inline void sched_lock_idle_list(struct worker_pool *pool) { } +static inline void sched_unlock_idle_list(struct worker_pool *pool) { } +#else +static inline void rt_lock_idle_list(struct worker_pool *pool) { } +static inline void rt_unlock_idle_list(struct worker_pool *pool) { } +static inline void sched_lock_idle_list(struct worker_pool *pool) +{ + spin_lock_irq(&pool->lock); +} +static inline void sched_unlock_idle_list(struct worker_pool *pool) +{ + spin_unlock_irq(&pool->lock); +} +#endif + + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -837,10 +867,16 @@ static struct worker *first_idle_worker(struct worker_pool *pool) */ static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_idle_worker(pool); + struct worker *worker; + + rt_lock_idle_list(pool); + + worker = first_idle_worker(pool); if (likely(worker)) wake_up_process(worker->task); + + rt_unlock_idle_list(pool); } /** @@ -869,7 +905,7 @@ void wq_worker_running(struct task_struct *task) */ void wq_worker_sleeping(struct task_struct *task) { - struct worker *next, *worker = kthread_data(task); + struct worker *worker = kthread_data(task); struct worker_pool *pool; /* @@ -886,26 +922,18 @@ void wq_worker_sleeping(struct task_struct *task) return; worker->sleeping = 1; - spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). * Please read comment there. - * - * NOT_RUNNING is clear. This means that we're bound to and - * running on the local cpu w/ rq lock held and preemption - * disabled, which in turn means that none else could be - * manipulating idle_list, so dereferencing idle_list without pool - * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) { - next = first_idle_worker(pool); - if (next) - wake_up_process(next->task); + sched_lock_idle_list(pool); + wake_up_worker(pool); + sched_unlock_idle_list(pool); } - spin_unlock_irq(&pool->lock); } /** @@ -1678,7 +1706,9 @@ static void worker_enter_idle(struct worker *worker) worker->last_active = jiffies; /* idle_list is LIFO */ + rt_lock_idle_list(pool); list_add(&worker->entry, &pool->idle_list); + rt_unlock_idle_list(pool); if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); @@ -1711,7 +1741,9 @@ static void worker_leave_idle(struct worker *worker) return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; + rt_lock_idle_list(pool); list_del_init(&worker->entry); + rt_unlock_idle_list(pool); } static struct worker *alloc_worker(int node) @@ -1876,7 +1908,9 @@ static void destroy_worker(struct worker *worker) pool->nr_workers--; pool->nr_idle--; + rt_lock_idle_list(pool); list_del_init(&worker->entry); + rt_unlock_idle_list(pool); worker->flags |= WORKER_DIE; wake_up_process(worker->task); }