Diffstat (limited to 'debian/patches-rt/0010-sched-Add-migrate_disable.patch')
-rw-r--r--   debian/patches-rt/0010-sched-Add-migrate_disable.patch   356
1 file changed, 356 insertions, 0 deletions
diff --git a/debian/patches-rt/0010-sched-Add-migrate_disable.patch b/debian/patches-rt/0010-sched-Add-migrate_disable.patch
new file mode 100644
index 000000000..d0ea31279
--- /dev/null
+++ b/debian/patches-rt/0010-sched-Add-migrate_disable.patch
@@ -0,0 +1,356 @@
+From e9f7c2225ee3e1ce9317762393618c1c81a8febe Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 23 Oct 2020 12:12:07 +0200
+Subject: [PATCH 010/323] sched: Add migrate_disable()
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz
+
+Add the base migrate_disable() support (under protest).
+
+While migrate_disable() is (currently) required for PREEMPT_RT, it is
+also one of the biggest flaws in the system.
+
+Notably, this is just the base implementation; it is broken vs.
+sched_setaffinity() and hotplug, both of which are solved in additional
+patches for ease of review.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/preempt.h | 65 +++++++++++++++++++++++
+ include/linux/sched.h | 3 ++
+ kernel/sched/core.c | 112 +++++++++++++++++++++++++++++++++++++---
+ kernel/sched/sched.h | 6 ++-
+ lib/smp_processor_id.c | 5 ++
+ 5 files changed, 183 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/preempt.h b/include/linux/preempt.h
+index 7d9c1c0e149c..97ba7c920653 100644
+--- a/include/linux/preempt.h
++++ b/include/linux/preempt.h
+@@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
+
+ #endif
+
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++
++/*
++ * Migrate-Disable and why it is (strongly) undesired.
++ *
++ * The premise of the Real-Time schedulers we have on Linux
++ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
++ * concurrently, provided there are sufficient runnable tasks, also known as
++ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
++ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
++ *
++ * The correctness of various scheduling models depends on this, but it is
++ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
++ * preempt_disable() implies an immediate priority ceiling, preemptible
++ * migrate_disable() allows nesting.
++ *
++ * The worst case is that all tasks preempt one another in a migrate_disable()
++ * region and stack on a single CPU. This then reduces the available bandwidth
++ * to a single CPU. And since Real-Time schedulability theory considers the
++ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
++ * (instantly solving the SMP analysis problem).
++ *
++ *
++ * The reason we have it anyway.
++ *
++ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
++ * number of primitives to become preemptible, PREEMPT_RT also allows
++ * migration inside them. This breaks a bunch of per-cpu usage. To this end,
++ * all these primitives employ migrate_disable() to restore this implicit
++ * assumption.
++ *
++ * This is a 'temporary' work-around at best. The correct solution is getting
++ * rid of the above assumptions and reworking the code to employ explicit
++ * per-cpu locking or short preempt-disable regions.
++ *
++ * The end goal must be to get rid of migrate_disable(), alternatively we need
++ * a schedulability theory that does not depend on arbitrary migration.
++ *
++ *
++ * Notes on the implementation.
++ *
++ * The implementation is particularly tricky since existing code patterns
++ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
++ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
++ * nor can it easily migrate itself into a pending affinity mask change on
++ * migrate_enable().
++ *
++ *
++ * Note: even non-work-conserving schedulers like semi-partitioned depend on
++ * migration, so migrate_disable() is not only a problem for
++ * work-conserving schedulers.
++ *
++ */
++extern void migrate_disable(void);
++extern void migrate_enable(void);
++
++#elif defined(CONFIG_PREEMPT_RT)
++
++static inline void migrate_disable(void) { }
++static inline void migrate_enable(void) { }
++
++#else /* !CONFIG_PREEMPT_RT */
++
+ /**
+ * migrate_disable - Prevent migration of the current task
+ *
+@@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void)
+ preempt_enable();
+ }
+
++#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
++
+ #endif /* __LINUX_PREEMPT_H */
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index aa015416c569..76907e9876d5 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -726,6 +726,9 @@ struct task_struct {
+ int nr_cpus_allowed;
+ const cpumask_t *cpus_ptr;
+ cpumask_t cpus_mask;
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++ int migration_disabled;
++#endif
+
+ #ifdef CONFIG_PREEMPT_RCU
+ int rcu_read_lock_nesting;
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 569cc5e48e68..5c9db4b2b6ec 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1710,6 +1710,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+
+ #ifdef CONFIG_SMP
+
++#ifdef CONFIG_PREEMPT_RT
++
++static void
++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
++
++static int __set_cpus_allowed_ptr(struct task_struct *p,
++ const struct cpumask *new_mask,
++ u32 flags);
++
++static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
++{
++ if (likely(!p->migration_disabled))
++ return;
++
++ if (p->cpus_ptr != &p->cpus_mask)
++ return;
++
++ /*
++ * Violates locking rules! see comment in __do_set_cpus_allowed().
++ */
++ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
++}
++
++void migrate_disable(void)
++{
++ if (current->migration_disabled++)
++ return;
++
++ barrier();
++}
++EXPORT_SYMBOL_GPL(migrate_disable);
++
++void migrate_enable(void)
++{
++ struct task_struct *p = current;
++
++ if (--p->migration_disabled)
++ return;
++
++ barrier();
++
++ if (p->cpus_ptr == &p->cpus_mask)
++ return;
++
++ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
++}
++EXPORT_SYMBOL_GPL(migrate_enable);
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++ return p->migration_disabled;
++}
++
++#endif
++
+ /*
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+@@ -1719,7 +1774,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+- if (is_per_cpu_kthread(p))
++ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+ return cpu_online(cpu);
+
+ return cpu_active(cpu);
+@@ -1840,6 +1895,11 @@ static int migration_cpu_stop(void *data)
+ */
+ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+ {
++ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
++ p->cpus_ptr = new_mask;
++ return;
++ }
++
+ cpumask_copy(&p->cpus_mask, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+@@ -1850,7 +1910,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+- lockdep_assert_held(&p->pi_lock);
++ /*
++ * This here violates the locking rules for affinity, since we're only
++ * supposed to change these variables while holding both rq->lock and
++ * p->pi_lock.
++ *
++ * HOWEVER, it magically works, because ttwu() is the only code that
++ * accesses these variables under p->pi_lock and only does so after
++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
++ * before finish_task().
++ *
++ * XXX do further audits, this smells like something putrid.
++ */
++ if (flags & SCA_MIGRATE_DISABLE)
++ SCHED_WARN_ON(!p->on_cpu);
++ else
++ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+@@ -1901,9 +1976,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+- if (p->flags & PF_KTHREAD) {
++ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ /*
+- * Kernel threads are allowed on online && !active CPUs
++ * Kernel threads are allowed on online && !active CPUs.
++ *
++ * Specifically, migration_disabled() tasks must not fail the
++ * cpumask_any_and_distribute() pick below, esp. so on
++ * SCA_MIGRATE_ENABLE, otherwise we'll not call
++ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+@@ -1917,7 +1997,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ goto out;
+ }
+
+- if (cpumask_equal(&p->cpus_mask, new_mask))
++ if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
+ goto out;
+
+ /*
+@@ -2009,6 +2089,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
++
++ WARN_ON_ONCE(is_migration_disabled(p));
+ #endif
+
+ trace_sched_migrate_task(p, new_cpu);
+@@ -2339,6 +2421,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
+ }
+ fallthrough;
+ case possible:
++ /*
++ * XXX When called from select_task_rq() we only
++ * hold p->pi_lock and again violate locking order.
++ *
++ * More yuck to audit.
++ */
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+@@ -2373,7 +2461,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+ {
+ lockdep_assert_held(&p->pi_lock);
+
+- if (p->nr_cpus_allowed > 1)
++ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ else
+ cpu = cpumask_any(p->cpus_ptr);
+@@ -2435,6 +2523,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+
+ #endif /* CONFIG_SMP */
+
++#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
++
++static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++ return false;
++}
++
++#endif
++
+ static void
+ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+ {
+@@ -4595,6 +4694,7 @@ static void __sched notrace __schedule(bool preempt)
+ */
+ ++*switch_count;
+
++ migrate_disable_switch(rq, prev);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
+ trace_sched_switch(preempt, prev, next);
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index f0f396cc1bee..f3109adda484 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1906,14 +1906,16 @@ static inline bool sched_fair_runnable(struct rq *rq)
+ extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ extern struct task_struct *pick_next_task_idle(struct rq *rq);
+
++#define SCA_CHECK 0x01
++#define SCA_MIGRATE_DISABLE 0x02
++#define SCA_MIGRATE_ENABLE 0x04
++
+ #ifdef CONFIG_SMP
+
+ extern void update_group_capacity(struct sched_domain *sd, int cpu);
+
+ extern void trigger_load_balance(struct rq *rq);
+
+-#define SCA_CHECK 0x01
+-
+ extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+ #endif
+diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
+index 2916606a9333..dbb96ebf661f 100644
+--- a/lib/smp_processor_id.c
++++ b/lib/smp_processor_id.c
+@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
+ if (current->nr_cpus_allowed == 1)
+ goto out;
+
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++ if (current->migration_disabled)
++ goto out;
++#endif
++
+ /*
+ * It is valid to assume CPU-locality during early bootup:
+ */
+--
+2.43.0
+
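
For context, the usage pattern this patch enables looks roughly as follows. This is an illustrative sketch, not part of the patch; the per-CPU variable and function names (my_rt_counter, bump_local_counter) are made up for the example. Inside a migrate_disable() section the task stays preemptible on PREEMPT_RT but cannot change CPUs, so a this_cpu_ptr() reference remains valid for the whole section, and the lib/smp_processor_id.c hunk above keeps debug_smp_processor_id() from warning in that state.

#include <linux/percpu.h>
#include <linux/preempt.h>

/* Hypothetical per-CPU state, used only for this illustration. */
static DEFINE_PER_CPU(unsigned long, my_rt_counter);

static void bump_local_counter(void)
{
        unsigned long *cnt;

        /*
         * Pin the task to its current CPU. Calls may nest; the task stays
         * preemptible, unlike with preempt_disable() under PREEMPT_RT.
         */
        migrate_disable();

        cnt = this_cpu_ptr(&my_rt_counter);
        (*cnt)++;       /* still racy against other tasks on this CPU */

        migrate_enable();
}

On !PREEMPT_RT kernels the existing preempt.h fallback (visible at the end of the first hunk) maps migrate_disable()/migrate_enable() onto preempt_disable()/preempt_enable(), and on PREEMPT_RT && !SMP they are empty inline stubs, so the same pattern compiles in every configuration.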