Diffstat (limited to 'debian/patches-rt/0302-futex-Make-the-futex_hash_bucket-spinlock_t-again-an.patch')
-rw-r--r-- | debian/patches-rt/0302-futex-Make-the-futex_hash_bucket-spinlock_t-again-an.patch | 731 |
1 file changed, 731 insertions, 0 deletions
diff --git a/debian/patches-rt/0302-futex-Make-the-futex_hash_bucket-spinlock_t-again-an.patch b/debian/patches-rt/0302-futex-Make-the-futex_hash_bucket-spinlock_t-again-an.patch
new file mode 100644
index 000000000..224b6a81a
--- /dev/null
+++ b/debian/patches-rt/0302-futex-Make-the-futex_hash_bucket-spinlock_t-again-an.patch
@@ -0,0 +1,731 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 7 Oct 2019 16:45:18 +0200
+Subject: [PATCH 302/342] futex: Make the futex_hash_bucket spinlock_t again
+ and bring back its old state
+Origin: https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit?id=db07179f5403257fab1645f3d1b5016389d2d3bb
+
+[ Upstream commit 954ad80c23edfe71f4e8ce70b961eac884320c3a ]
+
+This is an all-in-one patch that reverts the patches:
+   futex: Make the futex_hash_bucket lock raw
+   futex: Delay deallocation of pi_state
+
+and adds back the old patches we had:
+   futex: workaround migrate_disable/enable in different context
+   rtmutex: Handle the various new futex race conditions
+   futex: Fix bug on when a requeued RT task times out
+   futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock
+
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+---
+ kernel/futex.c                  | 228 ++++++++++++++++++--------------
+ kernel/locking/rtmutex.c        |  65 ++++++++-
+ kernel/locking/rtmutex_common.h |   3 +
+ 3 files changed, 193 insertions(+), 103 deletions(-)
+
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 2fc6bedb460e..81090f7a3ed9 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -243,7 +243,7 @@ struct futex_q {
+ 	struct plist_node list;
+ 
+ 	struct task_struct *task;
+-	raw_spinlock_t *lock_ptr;
++	spinlock_t *lock_ptr;
+ 	union futex_key key;
+ 	struct futex_pi_state *pi_state;
+ 	struct rt_mutex_waiter *rt_waiter;
+@@ -264,7 +264,7 @@ static const struct futex_q futex_q_init = {
+  */
+ struct futex_hash_bucket {
+ 	atomic_t waiters;
+-	raw_spinlock_t lock;
++	spinlock_t lock;
+ 	struct plist_head chain;
+ } ____cacheline_aligned_in_smp;
+ 
+@@ -871,13 +871,13 @@ static void get_pi_state(struct futex_pi_state *pi_state)
+  * Drops a reference to the pi_state object and frees or caches it
+  * when the last reference is gone.
+  */
+-static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state)
++static void put_pi_state(struct futex_pi_state *pi_state)
+ {
+ 	if (!pi_state)
+-		return NULL;
++		return;
+ 
+ 	if (!atomic_dec_and_test(&pi_state->refcount))
+-		return NULL;
++		return;
+ 
+ 	/*
+ 	 * If pi_state->owner is NULL, the owner is most probably dying
+@@ -892,7 +892,9 @@ static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state)
+ 		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
+ 	}
+ 
+-	if (!current->pi_state_cache) {
++	if (current->pi_state_cache) {
++		kfree(pi_state);
++	} else {
+ 		/*
+ 		 * pi_state->list is already empty.
+ 		 * clear pi_state->owner.
+@@ -901,30 +903,6 @@ static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state)
+ 		pi_state->owner = NULL;
+ 		atomic_set(&pi_state->refcount, 1);
+ 		current->pi_state_cache = pi_state;
+-		pi_state = NULL;
+-	}
+-	return pi_state;
+-}
+-
+-static void put_pi_state(struct futex_pi_state *pi_state)
+-{
+-	kfree(__put_pi_state(pi_state));
+-}
+-
+-static void put_pi_state_atomic(struct futex_pi_state *pi_state,
+-				struct list_head *to_free)
+-{
+-	if (__put_pi_state(pi_state))
+-		list_add(&pi_state->list, to_free);
+-}
+-
+-static void free_pi_state_list(struct list_head *to_free)
+-{
+-	struct futex_pi_state *p, *next;
+-
+-	list_for_each_entry_safe(p, next, to_free, list) {
+-		list_del(&p->list);
+-		kfree(p);
+ 	}
+ }
+ 
+@@ -941,7 +919,6 @@ static void exit_pi_state_list(struct task_struct *curr)
+ 	struct futex_pi_state *pi_state;
+ 	struct futex_hash_bucket *hb;
+ 	union futex_key key = FUTEX_KEY_INIT;
+-	LIST_HEAD(to_free);
+ 
+ 	if (!futex_cmpxchg_enabled)
+ 		return;
+@@ -975,7 +952,7 @@ static void exit_pi_state_list(struct task_struct *curr)
+ 		}
+ 		raw_spin_unlock_irq(&curr->pi_lock);
+ 
+-		raw_spin_lock(&hb->lock);
++		spin_lock(&hb->lock);
+ 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ 		raw_spin_lock(&curr->pi_lock);
+ 		/*
+@@ -985,8 +962,10 @@ static void exit_pi_state_list(struct task_struct *curr)
+ 		if (head->next != next) {
+ 			/* retain curr->pi_lock for the loop invariant */
+ 			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+-			raw_spin_unlock(&hb->lock);
+-			put_pi_state_atomic(pi_state, &to_free);
++			raw_spin_unlock_irq(&curr->pi_lock);
++			spin_unlock(&hb->lock);
++			raw_spin_lock_irq(&curr->pi_lock);
++			put_pi_state(pi_state);
+ 			continue;
+ 		}
+ 
+@@ -997,7 +976,7 @@ static void exit_pi_state_list(struct task_struct *curr)
+ 
+ 		raw_spin_unlock(&curr->pi_lock);
+ 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+-		raw_spin_unlock(&hb->lock);
++		spin_unlock(&hb->lock);
+ 
+ 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ 		put_pi_state(pi_state);
+@@ -1005,8 +984,6 @@ static void exit_pi_state_list(struct task_struct *curr)
+ 		raw_spin_lock_irq(&curr->pi_lock);
+ 	}
+ 	raw_spin_unlock_irq(&curr->pi_lock);
+-
+-	free_pi_state_list(&to_free);
+ }
+ #else
+ static inline void exit_pi_state_list(struct task_struct *curr) { }
+@@ -1548,7 +1525,7 @@ static void __unqueue_futex(struct futex_q *q)
+ {
+ 	struct futex_hash_bucket *hb;
+ 
+-	if (WARN_ON_SMP(!q->lock_ptr || !raw_spin_is_locked(q->lock_ptr))
++	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+ 	    || WARN_ON(plist_node_empty(&q->list)))
+ 		return;
+ 
+@@ -1668,21 +1645,21 @@ static inline void
+ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+ {
+ 	if (hb1 <= hb2) {
+-		raw_spin_lock(&hb1->lock);
++		spin_lock(&hb1->lock);
+ 		if (hb1 < hb2)
+-			raw_spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
++			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ 	} else { /* hb1 > hb2 */
+-		raw_spin_lock(&hb2->lock);
+-		raw_spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
++		spin_lock(&hb2->lock);
++		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ 	}
+ }
+ 
+ static inline void
+ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+ {
+-	raw_spin_unlock(&hb1->lock);
++	spin_unlock(&hb1->lock);
+ 	if (hb1 != hb2)
+-		raw_spin_unlock(&hb2->lock);
++		spin_unlock(&hb2->lock);
+ }
+ 
+ /*
+@@ -1710,7 +1687,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+ 	if (!hb_waiters_pending(hb))
+ 		goto out_put_key;
+ 
+-	raw_spin_lock(&hb->lock);
++	spin_lock(&hb->lock);
+ 
+ 	plist_for_each_entry_safe(this, next, &hb->chain, list) {
+ 		if (match_futex (&this->key, &key)) {
+@@ -1729,7 +1706,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+ 		}
+ 	}
+ 
+-	raw_spin_unlock(&hb->lock);
++	spin_unlock(&hb->lock);
+ 	wake_up_q(&wake_q);
+ out_put_key:
+ 	put_futex_key(&key);
+@@ -2042,7 +2019,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ 	struct futex_hash_bucket *hb1, *hb2;
+ 	struct futex_q *this, *next;
+ 	DEFINE_WAKE_Q(wake_q);
+-	LIST_HEAD(to_free);
+ 
+ 	if (nr_wake < 0 || nr_requeue < 0)
+ 		return -EINVAL;
+@@ -2281,6 +2257,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ 				requeue_pi_wake_futex(this, &key2, hb2);
+ 				drop_count++;
+ 				continue;
++			} else if (ret == -EAGAIN) {
++				/*
++				 * Waiter was woken by timeout or
++				 * signal and has set pi_blocked_on to
++				 * PI_WAKEUP_INPROGRESS before we
++				 * tried to enqueue it on the rtmutex.
++				 */
++				this->pi_state = NULL;
++				put_pi_state(pi_state);
++				continue;
+ 			} else if (ret) {
+ 				/*
+ 				 * rt_mutex_start_proxy_lock() detected a
+@@ -2291,7 +2277,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ 				 * object.
+ 				 */
+ 				this->pi_state = NULL;
+-				put_pi_state_atomic(pi_state, &to_free);
++				put_pi_state(pi_state);
+ 				/*
+ 				 * We stop queueing more waiters and let user
+ 				 * space deal with the mess.
+@@ -2308,7 +2294,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ 	 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+ 	 * need to drop it here again.
+ 	 */
+-	put_pi_state_atomic(pi_state, &to_free);
++	put_pi_state(pi_state);
+ 
+ out_unlock:
+ 	double_unlock_hb(hb1, hb2);
+@@ -2329,7 +2315,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ out_put_key1:
+ 	put_futex_key(&key1);
+ out:
+-	free_pi_state_list(&to_free);
+ 	return ret ? ret : task_count;
+ }
+ 
+@@ -2353,7 +2338,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+ 
+ 	q->lock_ptr = &hb->lock;
+ 
+-	raw_spin_lock(&hb->lock); /* implies smp_mb(); (A) */
++	spin_lock(&hb->lock); /* implies smp_mb(); (A) */
+ 	return hb;
+ }
+ 
+@@ -2361,7 +2346,7 @@ static inline void
+ queue_unlock(struct futex_hash_bucket *hb)
+ 	__releases(&hb->lock)
+ {
+-	raw_spin_unlock(&hb->lock);
++	spin_unlock(&hb->lock);
+ 	hb_waiters_dec(hb);
+ }
+ 
+@@ -2400,7 +2385,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ 	__releases(&hb->lock)
+ {
+ 	__queue_me(q, hb);
+-	raw_spin_unlock(&hb->lock);
++	spin_unlock(&hb->lock);
+ }
+ 
+ /**
+@@ -2416,41 +2401,41 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+  */
+ static int unqueue_me(struct futex_q *q)
+ {
+-	raw_spinlock_t *lock_ptr;
++	spinlock_t *lock_ptr;
+ 	int ret = 0;
+ 
+ 	/* In the common case we don't take the spinlock, which is nice. */
+ retry:
+ 	/*
+-	 * q->lock_ptr can change between this read and the following
+-	 * raw_spin_lock. Use READ_ONCE to forbid the compiler from reloading
+-	 * q->lock_ptr and optimizing lock_ptr out of the logic below.
++	 * q->lock_ptr can change between this read and the following spin_lock.
++	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
++	 * optimizing lock_ptr out of the logic below.
+ 	 */
+ 	lock_ptr = READ_ONCE(q->lock_ptr);
+ 	if (lock_ptr != NULL) {
+-		raw_spin_lock(lock_ptr);
++		spin_lock(lock_ptr);
+ 		/*
+ 		 * q->lock_ptr can change between reading it and
+-		 * raw_spin_lock(), causing us to take the wrong lock. This
++		 * spin_lock(), causing us to take the wrong lock. This
+ 		 * corrects the race condition.
+ 		 *
+ 		 * Reasoning goes like this: if we have the wrong lock,
+ 		 * q->lock_ptr must have changed (maybe several times)
+-		 * between reading it and the raw_spin_lock(). It can
+-		 * change again after the raw_spin_lock() but only if it was
+-		 * already changed before the raw_spin_lock(). It cannot,
++		 * between reading it and the spin_lock(). It can
++		 * change again after the spin_lock() but only if it was
++		 * already changed before the spin_lock(). It cannot,
+ 		 * however, change back to the original value. Therefore
+ 		 * we can detect whether we acquired the correct lock.
+ 		 */
+ 		if (unlikely(lock_ptr != q->lock_ptr)) {
+-			raw_spin_unlock(lock_ptr);
++			spin_unlock(lock_ptr);
+ 			goto retry;
+ 		}
+ 		__unqueue_futex(q);
+ 
+ 		BUG_ON(q->pi_state);
+ 
+-		raw_spin_unlock(lock_ptr);
++		spin_unlock(lock_ptr);
+ 		ret = 1;
+ 	}
+ 
+@@ -2466,16 +2451,13 @@ static int unqueue_me(struct futex_q *q)
+ static void unqueue_me_pi(struct futex_q *q)
+ 	__releases(q->lock_ptr)
+ {
+-	struct futex_pi_state *ps;
+-
+ 	__unqueue_futex(q);
+ 
+ 	BUG_ON(!q->pi_state);
+-	ps = __put_pi_state(q->pi_state);
++	put_pi_state(q->pi_state);
+ 	q->pi_state = NULL;
+ 
+-	raw_spin_unlock(q->lock_ptr);
+-	kfree(ps);
++	spin_unlock(q->lock_ptr);
+ }
+ 
+ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+@@ -2599,7 +2581,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ 	 */
+ handle_err:
+ 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+-	raw_spin_unlock(q->lock_ptr);
++	spin_unlock(q->lock_ptr);
+ 
+ 	switch (err) {
+ 	case -EFAULT:
+@@ -2616,7 +2598,7 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ 		break;
+ 	}
+ 
+-	raw_spin_lock(q->lock_ptr);
++	spin_lock(q->lock_ptr);
+ 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ 
+ 	/*
+@@ -2730,7 +2712,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ 	/*
+ 	 * The task state is guaranteed to be set before another task can
+ 	 * wake it. set_current_state() is implemented using smp_store_mb() and
+-	 * queue_me() calls raw_spin_unlock() upon completion, both serializing
++	 * queue_me() calls spin_unlock() upon completion, both serializing
+ 	 * access to the hash list and forcing another memory barrier.
+ 	 */
+ 	set_current_state(TASK_INTERRUPTIBLE);
+@@ -3028,7 +3010,15 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ 	 * before __rt_mutex_start_proxy_lock() is done.
+ 	 */
+ 	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+-	raw_spin_unlock(q.lock_ptr);
++	/*
++	 * the migrate_disable() here disables migration in the in_atomic() fast
++	 * path which is enabled again in the following spin_unlock(). We have
++	 * one migrate_disable() pending in the slow-path which is reversed
++	 * after the raw_spin_unlock_irq() where we leave the atomic context.
++	 */
++	migrate_disable();
++
++	spin_unlock(q.lock_ptr);
+ 	/*
+ 	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ 	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
+@@ -3036,6 +3026,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ 	 */
+ 	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ 	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
++	migrate_enable();
+ 
+ 	if (ret) {
+ 		if (ret == 1)
+@@ -3049,7 +3040,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+ 
+ cleanup:
+-	raw_spin_lock(q.lock_ptr);
++	spin_lock(q.lock_ptr);
+ 	/*
+ 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ 	 * first acquire the hb->lock before removing the lock from the
+@@ -3136,7 +3127,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+ 		return ret;
+ 
+ 	hb = hash_futex(&key);
+-	raw_spin_lock(&hb->lock);
++	spin_lock(&hb->lock);
+ 
+ 	/*
+ 	 * Check waiters first. We do not trust user space values at
+@@ -3170,10 +3161,19 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+ 		 * rt_waiter. Also see the WARN in wake_futex_pi().
+ 		 */
+ 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+-		raw_spin_unlock(&hb->lock);
++		/*
++		 * Magic trickery for now to make the RT migrate disable
++		 * logic happy. The following spin_unlock() happens with
++		 * interrupts disabled so the internal migrate_enable()
++		 * won't undo the migrate_disable() which was issued when
++		 * locking hb->lock.
++		 */
++		migrate_disable();
++		spin_unlock(&hb->lock);
+ 
+ 		/* drops pi_state->pi_mutex.wait_lock */
+ 		ret = wake_futex_pi(uaddr, uval, pi_state);
++		migrate_enable();
+ 
+ 		put_pi_state(pi_state);
+ 
+@@ -3209,7 +3209,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+ 	 * owner.
+ 	 */
+ 	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
+-		raw_spin_unlock(&hb->lock);
++		spin_unlock(&hb->lock);
+ 		switch (ret) {
+ 		case -EFAULT:
+ 			goto pi_faulted;
+@@ -3229,7 +3229,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+ 	ret = (curval == uval) ? 0 : -EAGAIN;
+ 
+ out_unlock:
+-	raw_spin_unlock(&hb->lock);
++	spin_unlock(&hb->lock);
+ out_putkey:
+ 	put_futex_key(&key);
+ 	return ret;
+@@ -3344,7 +3344,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ {
+ 	struct hrtimer_sleeper timeout, *to = NULL;
+ 	struct rt_mutex_waiter rt_waiter;
+-	struct futex_hash_bucket *hb;
++	struct futex_hash_bucket *hb, *hb2;
+ 	union futex_key key2 = FUTEX_KEY_INIT;
+ 	struct futex_q q = futex_q_init;
+ 	int res, ret;
+@@ -3402,20 +3402,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ 	futex_wait_queue_me(hb, &q, to);
+ 
+-	raw_spin_lock(&hb->lock);
+-	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+-	raw_spin_unlock(&hb->lock);
+-	if (ret)
+-		goto out_put_keys;
++	/*
++	 * On RT we must avoid races with requeue and trying to block
++	 * on two mutexes (hb->lock and uaddr2's rtmutex) by
++	 * serializing access to pi_blocked_on with pi_lock.
++	 */
++	raw_spin_lock_irq(&current->pi_lock);
++	if (current->pi_blocked_on) {
++		/*
++		 * We have been requeued or are in the process of
++		 * being requeued.
++		 */
++		raw_spin_unlock_irq(&current->pi_lock);
++	} else {
++		/*
++		 * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
++		 * prevents a concurrent requeue from moving us to the
++		 * uaddr2 rtmutex. After that we can safely acquire
++		 * (and possibly block on) hb->lock.
++		 */
++		current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
++		raw_spin_unlock_irq(&current->pi_lock);
++
++		spin_lock(&hb->lock);
++
++		/*
++		 * Clean up pi_blocked_on. We might leak it otherwise
++		 * when we succeeded with the hb->lock in the fast
++		 * path.
++		 */
++		raw_spin_lock_irq(&current->pi_lock);
++		current->pi_blocked_on = NULL;
++		raw_spin_unlock_irq(&current->pi_lock);
++
++		ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
++		spin_unlock(&hb->lock);
++		if (ret)
++			goto out_put_keys;
++	}
+ 
+ 	/*
+-	 * In order for us to be here, we know our q.key == key2, and since
+-	 * we took the hb->lock above, we also know that futex_requeue() has
+-	 * completed and we no longer have to concern ourselves with a wakeup
+-	 * race with the atomic proxy lock acquisition by the requeue code. The
+-	 * futex_requeue dropped our key1 reference and incremented our key2
+-	 * reference count.
++	 * In order to be here, we have either been requeued, are in
++	 * the process of being requeued, or requeue successfully
++	 * acquired uaddr2 on our behalf. If pi_blocked_on was
++	 * non-null above, we may be racing with a requeue. Do not
++	 * rely on q->lock_ptr to be hb2->lock until after blocking on
++	 * hb->lock or hb2->lock. The futex_requeue dropped our key1
++	 * reference and incremented our key2 reference count.
+ 	 */
++	hb2 = hash_futex(&key2);
+ 
+ 	/* Check if the requeue code acquired the second futex for us. */
+ 	if (!q.rt_waiter) {
+@@ -3424,17 +3459,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ 		 * did a lock-steal - fix up the PI-state in that case.
+ 		 */
+ 		if (q.pi_state && (q.pi_state->owner != current)) {
+-			struct futex_pi_state *ps_free;
+-
+-			raw_spin_lock(q.lock_ptr);
++			spin_lock(&hb2->lock);
++			BUG_ON(&hb2->lock != q.lock_ptr);
+ 			ret = fixup_pi_state_owner(uaddr2, &q, current);
+ 			/*
+ 			 * Drop the reference to the pi state which
+ 			 * the requeue_pi() code acquired for us.
+ 			 */
+-			ps_free = __put_pi_state(q.pi_state);
++			put_pi_state(q.pi_state);
+ 			spin_unlock(&hb2->lock);
+-			kfree(ps_free);
+ 			/*
+ 			 * Adjust the return value. It's either -EFAULT or
+ 			 * success (1) but the caller expects 0 for success.
+@@ -3453,7 +3486,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ 		pi_mutex = &q.pi_state->pi_mutex;
+ 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+ 
+-		raw_spin_lock(q.lock_ptr);
++		spin_lock(&hb2->lock);
++		BUG_ON(&hb2->lock != q.lock_ptr);
+ 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ 			ret = 0;
+ 
+@@ -4212,7 +4246,7 @@ static int __init futex_init(void)
+ 	for (i = 0; i < futex_hashsize; i++) {
+ 		atomic_set(&futex_queues[i].waiters, 0);
+ 		plist_head_init(&futex_queues[i].chain);
+-		raw_spin_lock_init(&futex_queues[i].lock);
++		spin_lock_init(&futex_queues[i].lock);
+ 	}
+ 
+ 	return 0;
+diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
+index 05fcf8a75a51..ded2296f848a 100644
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -142,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+ 	WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
+ 
++static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
++{
++	return waiter && waiter != PI_WAKEUP_INPROGRESS &&
++		waiter != PI_REQUEUE_INPROGRESS;
++}
++
+ /*
+  * We can speed up the acquire/release, if there's no debugging state to be
+  * set up.
+@@ -415,7 +421,8 @@ int max_lock_depth = 1024;
+ 
+ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+ {
+-	return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
++	return rt_mutex_real_waiter(p->pi_blocked_on) ?
++		p->pi_blocked_on->lock : NULL;
+ }
+ 
+ /*
+@@ -551,7 +558,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ 	 * reached or the state of the chain has changed while we
+ 	 * dropped the locks.
+ 	 */
+-	if (!waiter)
++	if (!rt_mutex_real_waiter(waiter))
+ 		goto out_unlock_pi;
+ 
+ 	/*
+@@ -1321,6 +1328,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ 		return -EDEADLK;
+ 
+ 	raw_spin_lock(&task->pi_lock);
++	/*
++	 * In the case of futex requeue PI, this will be a proxy
++	 * lock. The task will wake unaware that it is enqueueed on
++	 * this lock. Avoid blocking on two locks and corrupting
++	 * pi_blocked_on via the PI_WAKEUP_INPROGRESS
++	 * flag. futex_wait_requeue_pi() sets this when it wakes up
++	 * before requeue (due to a signal or timeout). Do not enqueue
++	 * the task if PI_WAKEUP_INPROGRESS is set.
++	 */
++	if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
++		raw_spin_unlock(&task->pi_lock);
++		return -EAGAIN;
++	}
++
++	BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
++
+ 	waiter->task = task;
+ 	waiter->lock = lock;
+ 	waiter->prio = task->prio;
+@@ -1344,7 +1367,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ 		rt_mutex_enqueue_pi(owner, waiter);
+ 
+ 		rt_mutex_adjust_prio(owner);
+-		if (owner->pi_blocked_on)
++		if (rt_mutex_real_waiter(owner->pi_blocked_on))
+ 			chain_walk = 1;
+ 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
+ 		chain_walk = 1;
+@@ -1444,7 +1467,7 @@ static void remove_waiter(struct rt_mutex *lock,
+ {
+ 	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ 	struct task_struct *owner = rt_mutex_owner(lock);
+-	struct rt_mutex *next_lock;
++	struct rt_mutex *next_lock = NULL;
+ 
+ 	lockdep_assert_held(&lock->wait_lock);
+ 
+@@ -1470,7 +1493,8 @@ static void remove_waiter(struct rt_mutex *lock,
+ 	rt_mutex_adjust_prio(owner);
+ 
+ 	/* Store the lock on which owner is blocked or NULL */
+-	next_lock = task_blocked_on_lock(owner);
++	if (rt_mutex_real_waiter(owner->pi_blocked_on))
++		next_lock = task_blocked_on_lock(owner);
+ 
+ 	raw_spin_unlock(&owner->pi_lock);
+ 
+@@ -1506,7 +1530,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
+ 	raw_spin_lock_irqsave(&task->pi_lock, flags);
+ 
+ 	waiter = task->pi_blocked_on;
+-	if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
++	if (!rt_mutex_real_waiter(waiter) ||
++	    rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ 		return;
+ 	}
+@@ -2324,6 +2349,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ 	if (try_to_take_rt_mutex(lock, task, NULL))
+ 		return 1;
+ 
++#ifdef CONFIG_PREEMPT_RT_FULL
++	/*
++	 * In PREEMPT_RT there's an added race.
++	 * If the task, that we are about to requeue, times out,
++	 * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
++	 * to skip this task. But right after the task sets
++	 * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
++	 * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
++	 * This will replace the PI_WAKEUP_INPROGRESS with the actual
++	 * lock that it blocks on. We *must not* place this task
++	 * on this proxy lock in that case.
++	 *
++	 * To prevent this race, we first take the task's pi_lock
++	 * and check if it has updated its pi_blocked_on. If it has,
++	 * we assume that it woke up and we return -EAGAIN.
++	 * Otherwise, we set the task's pi_blocked_on to
++	 * PI_REQUEUE_INPROGRESS, so that if the task is waking up
++	 * it will know that we are in the process of requeuing it.
++	 */
++	raw_spin_lock(&task->pi_lock);
++	if (task->pi_blocked_on) {
++		raw_spin_unlock(&task->pi_lock);
++		return -EAGAIN;
++	}
++	task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
++	raw_spin_unlock(&task->pi_lock);
++#endif
++
+ 	/* We enforce deadlock detection for futexes */
+ 	ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ 				      RT_MUTEX_FULL_CHAINWALK);
+diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
+index f587e0422d23..8e0c592273e6 100644
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -132,6 +132,9 @@ enum rtmutex_chainwalk {
+ /*
+  * PI-futex support (proxy locking functions, etc.):
+  */
++#define PI_WAKEUP_INPROGRESS	((struct rt_mutex_waiter *) 1)
++#define PI_REQUEUE_INPROGRESS	((struct rt_mutex_waiter *) 2)
++
+ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ 				       struct task_struct *proxy_owner);
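
The hb->lock that this patch turns back into a sleeping spinlock_t guards the kernel side of the PI-futex slow paths (futex_lock_pi()/futex_unlock_pi() above). For context only, and not part of the patch, here is a minimal user-space sketch of the contract those paths serve: FUTEX_LOCK_PI/FUTEX_UNLOCK_PI with the owner's TID stored in the futex word. The helper names and the bare-bones error handling are illustrative assumptions; only the opcodes and the TID-in-word protocol come from the documented futex(2) interface.

/*
 * Illustrative only: an uncontended PI futex is taken and released purely
 * in user space; the kernel (and therefore hb->lock) is involved only on
 * the contended slow path.
 */
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static uint32_t futex_word;	/* 0 = unlocked, otherwise the owner's TID */

static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
		      const struct timespec *timeout)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

static void pi_lock(uint32_t *uaddr)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);
	uint32_t expected = 0;

	/* Fast path: uncontended, claim the lock entirely in user space. */
	if (__atomic_compare_exchange_n(uaddr, &expected, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;

	/* Slow path: the kernel queues us on the futex hash bucket. */
	while (sys_futex(uaddr, FUTEX_LOCK_PI, 0, NULL) == -1 && errno == EINTR)
		;
}

static void pi_unlock(uint32_t *uaddr)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);
	uint32_t expected = tid;

	/* Fast path: no waiters recorded, release in user space. */
	if (__atomic_compare_exchange_n(uaddr, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;

	/* Slow path: FUTEX_WAITERS is set, let the kernel hand the lock over. */
	sys_futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL);
}

int main(void)
{
	pi_lock(&futex_word);
	printf("PI futex acquired, word = %u (owner TID)\n", futex_word);
	pi_unlock(&futex_word);
	return 0;
}

On a PREEMPT_RT kernel both slow paths serialize on the hash-bucket lock discussed in this patch, which is why its raw versus sleeping flavour matters for latency and for the pi_blocked_on bookkeeping restored here.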