Diffstat (limited to '')
-rw-r--r--  debian/patches-rt/0010-sched-Add-migrate_disable.patch   356
 1 file changed, 356 insertions(+), 0 deletions(-)
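
The patch below adds migrate_disable()/migrate_enable() for PREEMPT_RT. The intended usage pattern looks roughly like this minimal sketch (the demo_events counter and demo_account_event() helper are hypothetical, not part of the patch): the section stays fully preemptible, but the task may not migrate off its CPU while it manipulates per-CPU data, which is also why the patch teaches check_preemption_disabled() to accept a non-zero migration_disabled count.

	#include <linux/preempt.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, demo_events);

	static void demo_account_event(void)
	{
		unsigned long *cnt;

		migrate_disable();		/* stay on this CPU, remain preemptible */
		cnt = this_cpu_ptr(&demo_events);
		*cnt += 1;			/* multi-step per-CPU access stays CPU-local */
		migrate_enable();		/* outermost enable may restore full affinity */
	}
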
diff --git a/debian/patches-rt/0010-sched-Add-migrate_disable.patch b/debian/patches-rt/0010-sched-Add-migrate_disable.patch
new file mode 100644
index 000000000..d0ea31279
--- /dev/null
+++ b/debian/patches-rt/0010-sched-Add-migrate_disable.patch
@@ -0,0 +1,356 @@
+From e9f7c2225ee3e1ce9317762393618c1c81a8febe Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 23 Oct 2020 12:12:07 +0200
+Subject: [PATCH 010/323] sched: Add migrate_disable()
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz
+
+Add the base migrate_disable() support (under protest).
+
+While migrate_disable() is (currently) required for PREEMPT_RT, it is
+also one of the biggest flaws in the system.
+
+Notably, this is just the base implementation; it is broken vs
+sched_setaffinity() and hotplug, both of which are solved in additional
+patches for ease of review.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/preempt.h | 65 +++++++++++++++++++++++
+ include/linux/sched.h | 3 ++
+ kernel/sched/core.c | 112 +++++++++++++++++++++++++++++++++++++---
+ kernel/sched/sched.h | 6 ++-
+ lib/smp_processor_id.c | 5 ++
+ 5 files changed, 183 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/preempt.h b/include/linux/preempt.h
+index 7d9c1c0e149c..97ba7c920653 100644
+--- a/include/linux/preempt.h
++++ b/include/linux/preempt.h
+@@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
+
+ #endif
+
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++
++/*
++ * Migrate-Disable and why it is (strongly) undesired.
++ *
++ * The premise of the Real-Time schedulers we have on Linux
++ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
++ * concurrently, provided there are sufficient runnable tasks, also known as
++ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
++ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
++ *
++ * The correctness of various scheduling models depends on this, but it is
++ * broken by migrate_disable(), which doesn't imply preempt_disable(). Where
++ * preempt_disable() implies an immediate priority ceiling, preemptible
++ * migrate_disable() allows nesting.
++ *
++ * The worst case is that all tasks preempt one another in a migrate_disable()
++ * region and stack on a single CPU. This then reduces the available bandwidth
++ * to a single CPU. And since Real-Time schedulability theory considers the
++ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
++ * (instantly solving the SMP analysis problem).
++ *
++ *
++ * The reason we have it anyway.
++ *
++ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
++ * number of primitives into becoming preemptible, they would also allow
++ * migration. This turns out to break a bunch of per-cpu usage. To this end,
++ * all these primitives employ migrate_disable() to restore this implicit
++ * assumption.
++ *
++ * This is a 'temporary' work-around at best. The correct solution is getting
++ * rid of the above assumptions and reworking the code to employ explicit
++ * per-cpu locking or short preempt-disable regions.
++ *
++ * The end goal must be to get rid of migrate_disable(), alternatively we need
++ * a schedulability theory that does not depend on arbitrary migration.
++ *
++ *
++ * Notes on the implementation.
++ *
++ * The implementation is particularly tricky since existing code patterns
++ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
++ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
++ * nor can it easily migrate itself into a pending affinity mask change on
++ * migrate_enable().
++ *
++ *
++ * Note: even non-work-conserving schedulers like semi-partitioned depend on
++ * migration, so migrate_disable() is not only a problem for
++ * work-conserving schedulers.
++ *
++ */
++extern void migrate_disable(void);
++extern void migrate_enable(void);
++
++#elif defined(CONFIG_PREEMPT_RT)
++
++static inline void migrate_disable(void) { }
++static inline void migrate_enable(void) { }
++
++#else /* !CONFIG_PREEMPT_RT */
++
+ /**
+ * migrate_disable - Prevent migration of the current task
+ *
+@@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void)
+ preempt_enable();
+ }
+
++#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
++
+ #endif /* __LINUX_PREEMPT_H */
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index aa015416c569..76907e9876d5 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -726,6 +726,9 @@ struct task_struct {
+ int nr_cpus_allowed;
+ const cpumask_t *cpus_ptr;
+ cpumask_t cpus_mask;
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++ int migration_disabled;
++#endif
+
+ #ifdef CONFIG_PREEMPT_RCU
+ int rcu_read_lock_nesting;
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 569cc5e48e68..5c9db4b2b6ec 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1710,6 +1710,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+
+ #ifdef CONFIG_SMP
+
++#ifdef CONFIG_PREEMPT_RT
++
++static void
++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
++
++static int __set_cpus_allowed_ptr(struct task_struct *p,
++ const struct cpumask *new_mask,
++ u32 flags);
++
++static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
++{
++ if (likely(!p->migration_disabled))
++ return;
++
++ if (p->cpus_ptr != &p->cpus_mask)
++ return;
++
++ /*
++ * Violates locking rules! see comment in __do_set_cpus_allowed().
++ */
++ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
++}
++
++void migrate_disable(void)
++{
++ if (current->migration_disabled++)
++ return;
++
++ barrier();
++}
++EXPORT_SYMBOL_GPL(migrate_disable);
++
++void migrate_enable(void)
++{
++ struct task_struct *p = current;
++
++ if (--p->migration_disabled)
++ return;
++
++ barrier();
++
++ if (p->cpus_ptr == &p->cpus_mask)
++ return;
++
++ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
++}
++EXPORT_SYMBOL_GPL(migrate_enable);
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++ return p->migration_disabled;
++}
++
++#endif
++
+ /*
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+@@ -1719,7 +1774,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+- if (is_per_cpu_kthread(p))
++ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+ return cpu_online(cpu);
+
+ return cpu_active(cpu);
+@@ -1840,6 +1895,11 @@ static int migration_cpu_stop(void *data)
+ */
+ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+ {
++ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
++ p->cpus_ptr = new_mask;
++ return;
++ }
++
+ cpumask_copy(&p->cpus_mask, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+@@ -1850,7 +1910,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+- lockdep_assert_held(&p->pi_lock);
++ /*
++ * This here violates the locking rules for affinity, since we're only
++ * supposed to change these variables while holding both rq->lock and
++ * p->pi_lock.
++ *
++ * HOWEVER, it magically works, because ttwu() is the only code that
++ * accesses these variables under p->pi_lock and only does so after
++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
++ * before finish_task().
++ *
++ * XXX do further audits, this smells like something putrid.
++ */
++ if (flags & SCA_MIGRATE_DISABLE)
++ SCHED_WARN_ON(!p->on_cpu);
++ else
++ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+@@ -1901,9 +1976,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+- if (p->flags & PF_KTHREAD) {
++ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ /*
+- * Kernel threads are allowed on online && !active CPUs
++ * Kernel threads are allowed on online && !active CPUs.
++ *
++ * Specifically, migration_disabled() tasks must not fail the
++ * cpumask_any_and_distribute() pick below, esp. so on
++ * SCA_MIGRATE_ENABLE, otherwise we'll not call
++ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+@@ -1917,7 +1997,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ goto out;
+ }
+
+- if (cpumask_equal(&p->cpus_mask, new_mask))
++ if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
+ goto out;
+
+ /*
+@@ -2009,6 +2089,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
++
++ WARN_ON_ONCE(is_migration_disabled(p));
+ #endif
+
+ trace_sched_migrate_task(p, new_cpu);
+@@ -2339,6 +2421,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
+ }
+ fallthrough;
+ case possible:
++ /*
++ * XXX When called from select_task_rq() we only
++ * hold p->pi_lock and again violate locking order.
++ *
++ * More yuck to audit.
++ */
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+@@ -2373,7 +2461,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+ {
+ lockdep_assert_held(&p->pi_lock);
+
+- if (p->nr_cpus_allowed > 1)
++ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ else
+ cpu = cpumask_any(p->cpus_ptr);
+@@ -2435,6 +2523,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+
+ #endif /* CONFIG_SMP */
+
++#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
++
++static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++ return false;
++}
++
++#endif
++
+ static void
+ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+ {
+@@ -4595,6 +4694,7 @@ static void __sched notrace __schedule(bool preempt)
+ */
+ ++*switch_count;
+
++ migrate_disable_switch(rq, prev);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+
+ trace_sched_switch(preempt, prev, next);
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index f0f396cc1bee..f3109adda484 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1906,14 +1906,16 @@ static inline bool sched_fair_runnable(struct rq *rq)
+ extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ extern struct task_struct *pick_next_task_idle(struct rq *rq);
+
++#define SCA_CHECK 0x01
++#define SCA_MIGRATE_DISABLE 0x02
++#define SCA_MIGRATE_ENABLE 0x04
++
+ #ifdef CONFIG_SMP
+
+ extern void update_group_capacity(struct sched_domain *sd, int cpu);
+
+ extern void trigger_load_balance(struct rq *rq);
+
+-#define SCA_CHECK 0x01
+-
+ extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+ #endif
+diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
+index 2916606a9333..dbb96ebf661f 100644
+--- a/lib/smp_processor_id.c
++++ b/lib/smp_processor_id.c
+@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
+ if (current->nr_cpus_allowed == 1)
+ goto out;
+
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++ if (current->migration_disabled)
++ goto out;
++#endif
++
+ /*
+ * It is valid to assume CPU-locality during early bootup:
+ */
+--
+2.43.0
+
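
For the nesting semantics implemented above, a rough sketch follows (demo_outer()/demo_inner() are hypothetical callers, not part of the patch): migrate_disable() only increments current->migration_disabled, the task is pinned to its CPU by migrate_disable_switch() at the next __schedule(), and only the outermost migrate_enable() restores the full affinity via __set_cpus_allowed_ptr() with SCA_MIGRATE_ENABLE.

	#include <linux/preempt.h>

	static void demo_inner(void)
	{
		migrate_disable();	/* counter 1 -> 2: already pinned, stays pinned */
		/* still preemptible, still on the same CPU */
		migrate_enable();	/* counter 2 -> 1: no affinity change yet */
	}

	static void demo_outer(void)
	{
		migrate_disable();	/* counter 0 -> 1: pinned at the next context switch */
		demo_inner();		/* nesting is allowed */
		migrate_enable();	/* counter 1 -> 0: cpus_mask is restored if it was narrowed */
	}
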