diff options
Diffstat (limited to 'debian/patches-rt/0007-futex-pi-Fix-recursive-rt_mutex-waiter-state.patch')
-rw-r--r-- | debian/patches-rt/0007-futex-pi-Fix-recursive-rt_mutex-waiter-state.patch | 198 |
1 files changed, 198 insertions, 0 deletions
diff --git a/debian/patches-rt/0007-futex-pi-Fix-recursive-rt_mutex-waiter-state.patch b/debian/patches-rt/0007-futex-pi-Fix-recursive-rt_mutex-waiter-state.patch new file mode 100644 index 0000000000..76a9b60581 --- /dev/null +++ b/debian/patches-rt/0007-futex-pi-Fix-recursive-rt_mutex-waiter-state.patch @@ -0,0 +1,198 @@ +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 15 Sep 2023 17:19:44 +0200 +Subject: [PATCH 7/7] futex/pi: Fix recursive rt_mutex waiter state +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/6.6/older/patches-6.6.7-rt18.tar.xz + +Some new assertions pointed out that the existing code has nested rt_mutex wait +state in the futex code. + +Specifically, the futex_lock_pi() cancel case uses spin_lock() while there +still is a rt_waiter enqueued for this task, resulting in a state where there +are two waiters for the same task (and task_struct::pi_blocked_on gets +scrambled). + +The reason to take hb->lock at this point is to avoid the wake_futex_pi() +EAGAIN case. + +This happens when futex_top_waiter() and rt_mutex_top_waiter() state becomes +inconsistent. The current rules are such that this inconsistency will not be +observed. + +Notably the case that needs to be avoided is where futex_lock_pi() and +futex_unlock_pi() interleave such that unlock will fail to observe a new +waiter. + +*However* the case at hand is where a waiter is leaving, in this case the race +means a waiter that is going away is not observed -- which is harmless, +provided this race is explicitly handled. + +This is a somewhat dangerous proposition because the converse race is not +observing a new waiter, which must absolutely not happen. But since the race is +valid this cannot be asserted. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lkml.kernel.org/r/20230915151943.GD6743@noisy.programming.kicks-ass.net +--- + kernel/futex/pi.c | 76 ++++++++++++++++++++++++++++++------------------- + kernel/futex/requeue.c | 6 ++- + 2 files changed, 52 insertions(+), 30 deletions(-) + +--- a/kernel/futex/pi.c ++++ b/kernel/futex/pi.c +@@ -611,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uad + /* + * Caller must hold a reference on @pi_state. + */ +-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) ++static int wake_futex_pi(u32 __user *uaddr, u32 uval, ++ struct futex_pi_state *pi_state, ++ struct rt_mutex_waiter *top_waiter) + { +- struct rt_mutex_waiter *top_waiter; + struct task_struct *new_owner; + bool postunlock = false; + DEFINE_RT_WAKE_Q(wqh); + u32 curval, newval; + int ret = 0; + +- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); +- if (WARN_ON_ONCE(!top_waiter)) { +- /* +- * As per the comment in futex_unlock_pi() this should not happen. +- * +- * When this happens, give up our locks and try again, giving +- * the futex_lock_pi() instance time to complete, either by +- * waiting on the rtmutex or removing itself from the futex +- * queue. +- */ +- ret = -EAGAIN; +- goto out_unlock; +- } +- + new_owner = top_waiter->task; + + /* +@@ -1046,20 +1033,34 @@ int futex_lock_pi(u32 __user *uaddr, uns + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + + cleanup: +- spin_lock(q.lock_ptr); + /* + * If we failed to acquire the lock (deadlock/signal/timeout), we must +- * first acquire the hb->lock before removing the lock from the +- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait +- * lists consistent. ++ * must unwind the above, however we canont lock hb->lock because ++ * rt_mutex already has a waiter enqueued and hb->lock can itself try ++ * and enqueue an rt_waiter through rtlock. ++ * ++ * Doing the cleanup without holding hb->lock can cause inconsistent ++ * state between hb and pi_state, but only in the direction of not ++ * seeing a waiter that is leaving. ++ * ++ * See futex_unlock_pi(), it deals with this inconsistency. ++ * ++ * There be dragons here, since we must deal with the inconsistency on ++ * the way out (here), it is impossible to detect/warn about the race ++ * the other way around (missing an incoming waiter). + * +- * In particular; it is important that futex_unlock_pi() can not +- * observe this inconsistency. ++ * What could possibly go wrong... + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + + /* ++ * Now that the rt_waiter has been dequeued, it is safe to use ++ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up ++ * the ++ */ ++ spin_lock(q.lock_ptr); ++ /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); +@@ -1143,6 +1144,7 @@ int futex_unlock_pi(u32 __user *uaddr, u + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; ++ struct rt_mutex_waiter *rt_waiter; + + ret = -EINVAL; + if (!pi_state) +@@ -1155,22 +1157,39 @@ int futex_unlock_pi(u32 __user *uaddr, u + if (pi_state->owner != current) + goto out_unlock; + +- get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure +- * there is no point where we hold neither; and therefore +- * wake_futex_p() must observe a state consistent with what we +- * observed. ++ * there is no point where we hold neither; and thereby ++ * wake_futex_pi() must observe any new waiters. ++ * ++ * Since the cleanup: case in futex_lock_pi() removes the ++ * rt_waiter without holding hb->lock, it is possible for ++ * wake_futex_pi() to not find a waiter while the above does, ++ * in this case the waiter is on the way out and it can be ++ * ignored. + * + * In particular; this forces __rt_mutex_start_proxy() to + * complete such that we're guaranteed to observe the +- * rt_waiter. Also see the WARN in wake_futex_pi(). ++ * rt_waiter. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ /* ++ * Futex vs rt_mutex waiter state -- if there are no rt_mutex ++ * waiters even though futex thinks there are, then the waiter ++ * is leaving and the uncontended path is safe to take. ++ */ ++ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); ++ if (!rt_waiter) { ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ goto do_uncontended; ++ } ++ ++ get_pi_state(pi_state); + spin_unlock(&hb->lock); + + /* drops pi_state->pi_mutex.wait_lock */ +- ret = wake_futex_pi(uaddr, uval, pi_state); ++ ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter); + + put_pi_state(pi_state); + +@@ -1198,6 +1217,7 @@ int futex_unlock_pi(u32 __user *uaddr, u + return ret; + } + ++do_uncontended: + /* + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck +--- a/kernel/futex/requeue.c ++++ b/kernel/futex/requeue.c +@@ -850,11 +850,13 @@ int futex_wait_requeue_pi(u32 __user *ua + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + +- /* Current is not longer pi_blocked_on */ +- spin_lock(q.lock_ptr); ++ /* ++ * See futex_unlock_pi()'s cleanup: comment. ++ */ + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + ++ spin_lock(q.lock_ptr); + debug_rt_mutex_free_waiter(&rt_waiter); + /* + * Fixup the pi_state owner and possibly acquire the lock if we |