diff options
Diffstat (limited to '')
-rw-r--r-- | debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch | 147 |
1 file changed, 147 insertions, 0 deletions
diff --git a/debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch b/debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch new file mode 100644 index 000000000..54a798b25 --- /dev/null +++ b/debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch @@ -0,0 +1,147 @@ +From 013f5d7501b106952e04d54b0803c2d1ae9a4876 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:30 -0400 +Subject: [PATCH 289/323] sched: Fix migration_cpu_stop() requeueing +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 8a6edb5257e2a84720fe78cb179eca58ba76126f upstream. + +When affine_move_task(p) is called on a running task @p, which is not +otherwise already changing affinity, we'll first set +p->migration_pending and then do: + + stop_one_cpu(cpu_of_rq(rq), migration_cpu_stop, &arg); + +This then gets us to migration_cpu_stop() running on the CPU that was +previously running our victim task @p. + +If we find that our task is no longer on that runqueue (this can +happen because of a concurrent migration due to load-balance etc.), +then we'll end up at the: + + } else if (dest_cpu < 1 || pending) { + +branch. Which we'll take because we set pending earlier. Here we first +check if the task @p has already satisfied the affinity constraints, +if so we bail early [A]. Otherwise we'll reissue migration_cpu_stop() +onto the CPU that is now hosting our task @p: + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + +Except, we've never initialized pending->arg, which will be all 0s. + +This then results in running migration_cpu_stop() on the next CPU with +arg->p == NULL, which gives the by now obvious result of fireworks. 
+ +The cure is to change affine_move_task() to always use pending->arg, +furthermore we can use the exact same pattern as the +SCA_MIGRATE_ENABLE case, since we'll block on the pending->done +completion anyway, no point in adding yet another completion in +stop_one_cpu(). + +This then gives a clear distinction between the two +migration_cpu_stop() use cases: + + - sched_exec() / migrate_task_to() : arg->pending == NULL + - affine_move_task() : arg->pending != NULL; + +And we can have it ignore p->migration_pending when !arg->pending. Any +stop work from sched_exec() / migrate_task_to() is in addition to stop +works from affine_move_task(), which will be sufficient to issue the +completion. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.357743989@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 39 ++++++++++++++++++++++++++++----------- + 1 file changed, 28 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c847d17e3b04..df9c4c3838c0 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1974,6 +1974,24 @@ static int migration_cpu_stop(void *data) + rq_lock(rq, &rf); + + pending = p->migration_pending; ++ if (pending && !arg->pending) { ++ /* ++ * This happens from sched_exec() and migrate_task_to(), ++ * neither of them care about pending and just want a task to ++ * maybe move about. ++ * ++ * Even if there is a pending, we can ignore it, since ++ * affine_move_task() will have its own stop_work's in flight ++ * which will manage the completion. ++ * ++ * Notably, pending doesn't need to match arg->pending. 
This can ++ * happen when triple concurrent affine_move_task() first sets ++ * pending, then clears pending and eventually sets another ++ * pending. ++ */ ++ pending = NULL; ++ } ++ + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because +@@ -2246,10 +2264,6 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + int dest_cpu, unsigned int flags) + { + struct set_affinity_pending my_pending = { }, *pending = NULL; +- struct migration_arg arg = { +- .task = p, +- .dest_cpu = dest_cpu, +- }; + bool complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ +@@ -2287,6 +2301,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); ++ my_pending.arg = (struct migration_arg) { ++ .task = p, ++ .dest_cpu = -1, /* any */ ++ .pending = &my_pending, ++ }; ++ + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; +@@ -2317,12 +2337,6 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + +- pending->arg = (struct migration_arg) { +- .task = p, +- .dest_cpu = -1, +- .pending = pending, +- }; +- + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + +@@ -2335,8 +2349,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. + */ ++ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + task_rq_unlock(rq, p, rf); +- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); + + } else { + +-- +2.43.0 + |