Diffstat (limited to 'debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch')
-rw-r--r--  debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch  370
1 files changed, 370 insertions, 0 deletions
diff --git a/debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch b/debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch
new file mode 100644
index 000000000..d89f86138
--- /dev/null
+++ b/debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch
@@ -0,0 +1,370 @@
+From f6fcadc60ec427b9eeb4b734b77b7b110c050b83 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 23 Oct 2020 12:12:08 +0200
+Subject: [PATCH 011/323] sched: Fix migrate_disable() vs
+ set_cpus_allowed_ptr()
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz
+
+Concurrent migrate_disable() and set_cpus_allowed_ptr() have
+interesting features. We rely on set_cpus_allowed_ptr() not to return
+until the task runs inside the provided mask. This expectation is
+exported to userspace.
+
+This means that any set_cpus_allowed_ptr() caller must wait until
+migrate_enable() allows migrations.
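+
+A rough userspace sketch of that contract (illustrative only, assuming
+CPU 0 is online): once sched_setaffinity() returns for the calling
+thread, the thread must already be running inside the new mask:
+
+	#define _GNU_SOURCE
+	#include <sched.h>
+	#include <assert.h>
+
+	int main(void)
+	{
+		cpu_set_t set;
+
+		CPU_ZERO(&set);
+		CPU_SET(0, &set);	/* allow only CPU 0 */
+
+		/* Must not return before we run inside the new mask. */
+		assert(sched_setaffinity(0, sizeof(set), &set) == 0);
+
+		/* The mask holds a single CPU, so this must now hold. */
+		assert(sched_getcpu() == 0);
+		return 0;
+	}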
+
+At the same time, we don't want migrate_enable() to schedule, due to
+patterns like:
+
+ preempt_disable();
+ migrate_disable();
+ ...
+ migrate_enable();
+ preempt_enable();
+
+And:
+
+ raw_spin_lock(&B);
+ spin_unlock(&A);
+
+This means that when migrate_enable() must restore the affinity
+mask, it cannot wait for that restore to complete. As luck would have
+it, that is exactly the case where there is a pending
+set_cpus_allowed_ptr(), so let that provide the storage for the async
+stop machine.
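+
+A loose userspace analogy of that storage hand-off (pthread stand-ins
+and illustrative names, not the kernel mechanism): the side that is
+allowed to block keeps the request on its own stack and waits for it
+to complete, while the side that must not block only hands that same
+storage to an asynchronous worker:
+
+	#include <pthread.h>
+	#include <stdbool.h>
+	#include <stddef.h>
+
+	/* One outstanding request; the waiter owns the storage. */
+	struct pending {
+		pthread_mutex_t	lock;
+		pthread_cond_t	cond;
+		bool		done;
+		int		dest_cpu;	/* payload for the worker */
+	};
+
+	static struct pending *pending_req;	/* cf. p->migration_pending */
+	static pthread_mutex_t publish_lock = PTHREAD_MUTEX_INITIALIZER;
+
+	/* Asynchronous worker, standing in for the stopper thread. */
+	static void *async_worker(void *data)
+	{
+		struct pending *req = data;
+
+		/* ... act on req->dest_cpu ... */
+
+		pthread_mutex_lock(&req->lock);
+		req->done = true;
+		pthread_cond_signal(&req->cond);
+		pthread_mutex_unlock(&req->lock);
+		return NULL;
+	}
+
+	/* Non-blocking side (cf. migrate_enable()): only queue work. */
+	static void restore_async(void)
+	{
+		struct pending *req;
+		pthread_t tid;
+
+		/* Take ownership of the published request, if any. */
+		pthread_mutex_lock(&publish_lock);
+		req = pending_req;
+		pending_req = NULL;
+		pthread_mutex_unlock(&publish_lock);
+
+		if (req && pthread_create(&tid, NULL, async_worker, req) == 0)
+			pthread_detach(tid);
+	}
+
+	/* Blocking side (cf. set_cpus_allowed_ptr()): provide storage, wait. */
+	static void set_affinity_and_wait(int dest_cpu)
+	{
+		struct pending req = { .done = false, .dest_cpu = dest_cpu };
+
+		pthread_mutex_init(&req.lock, NULL);
+		pthread_cond_init(&req.cond, NULL);
+
+		/* Publish the stack-allocated request. */
+		pthread_mutex_lock(&publish_lock);
+		pending_req = &req;
+		pthread_mutex_unlock(&publish_lock);
+
+		/* Stack storage stays valid until the worker completes it. */
+		pthread_mutex_lock(&req.lock);
+		while (!req.done)
+			pthread_cond_wait(&req.cond, &req.lock);
+		pthread_mutex_unlock(&req.lock);
+	}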
+
+Many thanks to Valentin, who used TLA+ most effectively and found lots
+of 'interesting' cases.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/sched.h | 1 +
+ kernel/sched/core.c | 234 ++++++++++++++++++++++++++++++++++++------
+ 2 files changed, 205 insertions(+), 30 deletions(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 76907e9876d5..5b5c194f5a62 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -726,6 +726,7 @@ struct task_struct {
+ int nr_cpus_allowed;
+ const cpumask_t *cpus_ptr;
+ cpumask_t cpus_mask;
++ void *migration_pending;
+ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+ int migration_disabled;
+ #endif
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 5c9db4b2b6ec..3af7c42896c9 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1746,15 +1746,26 @@ void migrate_enable(void)
+ {
+ struct task_struct *p = current;
+
+- if (--p->migration_disabled)
++ if (p->migration_disabled > 1) {
++ p->migration_disabled--;
+ return;
++ }
+
++ /*
++ * Ensure stop_task runs either before or after this, and that
++ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
++ */
++ preempt_disable();
++ if (p->cpus_ptr != &p->cpus_mask)
++ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
++ /*
++ * Mustn't clear migration_disabled() until cpus_ptr points back at the
++ * regular cpus_mask, otherwise things that race (eg.
++ * select_fallback_rq) get confused.
++ */
+ barrier();
+-
+- if (p->cpus_ptr == &p->cpus_mask)
+- return;
+-
+- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
++ p->migration_disabled = 0;
++ preempt_enable();
+ }
+ EXPORT_SYMBOL_GPL(migrate_enable);
+
+@@ -1819,8 +1830,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+ }
+
+ struct migration_arg {
+- struct task_struct *task;
+- int dest_cpu;
++ struct task_struct *task;
++ int dest_cpu;
++ struct set_affinity_pending *pending;
++};
++
++struct set_affinity_pending {
++ refcount_t refs;
++ struct completion done;
++ struct cpu_stop_work stop_work;
++ struct migration_arg arg;
+ };
+
+ /*
+@@ -1852,16 +1871,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+ */
+ static int migration_cpu_stop(void *data)
+ {
++ struct set_affinity_pending *pending;
+ struct migration_arg *arg = data;
+ struct task_struct *p = arg->task;
++ int dest_cpu = arg->dest_cpu;
+ struct rq *rq = this_rq();
++ bool complete = false;
+ struct rq_flags rf;
+
+ /*
+ * The original target CPU might have gone down and we might
+ * be on another CPU but it doesn't matter.
+ */
+- local_irq_disable();
++ local_irq_save(rf.flags);
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
+@@ -1871,21 +1893,83 @@ static int migration_cpu_stop(void *data)
+
+ raw_spin_lock(&p->pi_lock);
+ rq_lock(rq, &rf);
++
++ pending = p->migration_pending;
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq) {
++ if (is_migration_disabled(p))
++ goto out;
++
++ if (pending) {
++ p->migration_pending = NULL;
++ complete = true;
++ }
++
++ /* migrate_enable() -- we must not race against SCA */
++ if (dest_cpu < 0) {
++ /*
++ * When this was migrate_enable() but we no longer
++ * have a @pending, a concurrent SCA 'fixed' things
++ * and we should be valid again. Nothing to do.
++ */
++ if (!pending) {
++ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
++ goto out;
++ }
++
++ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
++ }
++
+ if (task_on_rq_queued(p))
+- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
++ rq = __migrate_task(rq, &rf, p, dest_cpu);
+ else
+- p->wake_cpu = arg->dest_cpu;
++ p->wake_cpu = dest_cpu;
++
++ } else if (dest_cpu < 0) {
++ /*
++ * This happens when we get migrated between migrate_enable()'s
++ * preempt_enable() and scheduling the stopper task. At that
++ * point we're a regular task again and not current anymore.
++ *
++ * A !PREEMPT kernel has a giant hole here, which makes it far
++ * more likely.
++ */
++
++ /*
++ * When this was migrate_enable() but we no longer have an
++ * @pending, a concurrent SCA 'fixed' things and we should be
++ * valid again. Nothing to do.
++ */
++ if (!pending) {
++ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
++ goto out;
++ }
++
++ /*
++ * When migrate_enable() hits a rq mis-match we can't reliably
++ * determine is_migration_disabled() and so have to chase after
++ * it.
++ */
++ task_rq_unlock(rq, p, &rf);
++ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
++ &pending->arg, &pending->stop_work);
++ return 0;
+ }
+- rq_unlock(rq, &rf);
+- raw_spin_unlock(&p->pi_lock);
++out:
++ task_rq_unlock(rq, p, &rf);
++
++ if (complete)
++ complete_all(&pending->done);
++
++ /* For pending->{arg,stop_work} */
++ pending = arg->pending;
++ if (pending && refcount_dec_and_test(&pending->refs))
++ wake_up_var(&pending->refs);
+
+- local_irq_enable();
+ return 0;
+ }
+
+@@ -1954,6 +2038,110 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+ __do_set_cpus_allowed(p, new_mask, 0);
+ }
+
++/*
++ * This function is wildly self concurrent, consider at least 3 times.
++ */
++static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
++ int dest_cpu, unsigned int flags)
++{
++ struct set_affinity_pending my_pending = { }, *pending = NULL;
++ struct migration_arg arg = {
++ .task = p,
++ .dest_cpu = dest_cpu,
++ };
++ bool complete = false;
++
++ /* Can the task run on the task's current CPU? If so, we're done */
++ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
++ pending = p->migration_pending;
++ if (pending) {
++ refcount_inc(&pending->refs);
++ p->migration_pending = NULL;
++ complete = true;
++ }
++ task_rq_unlock(rq, p, rf);
++
++ if (complete)
++ goto do_complete;
++
++ return 0;
++ }
++
++ if (!(flags & SCA_MIGRATE_ENABLE)) {
++ /* serialized by p->pi_lock */
++ if (!p->migration_pending) {
++ refcount_set(&my_pending.refs, 1);
++ init_completion(&my_pending.done);
++ p->migration_pending = &my_pending;
++ } else {
++ pending = p->migration_pending;
++ refcount_inc(&pending->refs);
++ }
++ }
++ pending = p->migration_pending;
++ /*
++ * - !MIGRATE_ENABLE:
++ * we'll have installed a pending if there wasn't one already.
++ *
++ * - MIGRATE_ENABLE:
++ * we're here because the current CPU isn't matching anymore,
++ * the only way that can happen is because of a concurrent
++ * set_cpus_allowed_ptr() call, which should then still be
++ * pending completion.
++ *
++ * Either way, we really should have a @pending here.
++ */
++ if (WARN_ON_ONCE(!pending))
++ return -EINVAL;
++
++ if (flags & SCA_MIGRATE_ENABLE) {
++
++ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
++ task_rq_unlock(rq, p, rf);
++
++ pending->arg = (struct migration_arg) {
++ .task = p,
++ .dest_cpu = -1,
++ .pending = pending,
++ };
++
++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
++ &pending->arg, &pending->stop_work);
++
++ return 0;
++ }
++
++ if (task_running(rq, p) || p->state == TASK_WAKING) {
++
++ task_rq_unlock(rq, p, rf);
++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
++
++ } else {
++
++ if (!is_migration_disabled(p)) {
++ if (task_on_rq_queued(p))
++ rq = move_queued_task(rq, rf, p, dest_cpu);
++
++ p->migration_pending = NULL;
++ complete = true;
++ }
++ task_rq_unlock(rq, p, rf);
++
++do_complete:
++ if (complete)
++ complete_all(&pending->done);
++ }
++
++ wait_for_completion(&pending->done);
++
++ if (refcount_dec_and_test(&pending->refs))
++ wake_up_var(&pending->refs);
++
++ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
++
++ return 0;
++}
++
+ /*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+@@ -2023,23 +2211,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ p->nr_cpus_allowed != 1);
+ }
+
+- /* Can the task run on the task's current CPU? If so, we're done */
+- if (cpumask_test_cpu(task_cpu(p), new_mask))
+- goto out;
++ return affine_move_task(rq, p, &rf, dest_cpu, flags);
+
+- if (task_running(rq, p) || p->state == TASK_WAKING) {
+- struct migration_arg arg = { p, dest_cpu };
+- /* Need help from migration thread: drop lock and wait. */
+- task_rq_unlock(rq, p, &rf);
+- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+- return 0;
+- } else if (task_on_rq_queued(p)) {
+- /*
+- * OK, since we're going to drop the lock immediately
+- * afterwards anyway.
+- */
+- rq = move_queued_task(rq, &rf, p, dest_cpu);
+- }
+ out:
+ task_rq_unlock(rq, p, &rf);
+
+@@ -3230,6 +3403,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+ init_numa_balancing(clone_flags, p);
+ #ifdef CONFIG_SMP
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
++ p->migration_pending = NULL;
+ #endif
+ }
+
+--
+2.43.0
+