Diffstat
-rw-r--r-- | debian/patches-rt/0010-sched-Add-migrate_disable.patch | 356
1 file changed, 356 insertions, 0 deletions
diff --git a/debian/patches-rt/0010-sched-Add-migrate_disable.patch b/debian/patches-rt/0010-sched-Add-migrate_disable.patch
new file mode 100644
index 000000000..d0ea31279
--- /dev/null
+++ b/debian/patches-rt/0010-sched-Add-migrate_disable.patch
@@ -0,0 +1,356 @@
+From e9f7c2225ee3e1ce9317762393618c1c81a8febe Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 23 Oct 2020 12:12:07 +0200
+Subject: [PATCH 010/323] sched: Add migrate_disable()
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz
+
+Add the base migrate_disable() support (under protest).
+
+While migrate_disable() is (currently) required for PREEMPT_RT, it is
+also one of the biggest flaws in the system.
+
+Notably this is just the base implementation, it is broken vs
+sched_setaffinity() and hotplug, both solved in additional patches for
+ease of review.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/preempt.h |  65 +++++++++++++++++++++++
+ include/linux/sched.h   |   3 ++
+ kernel/sched/core.c     | 112 +++++++++++++++++++++++++++++++++++++---
+ kernel/sched/sched.h    |   6 ++-
+ lib/smp_processor_id.c  |   5 ++
+ 5 files changed, 183 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/preempt.h b/include/linux/preempt.h
+index 7d9c1c0e149c..97ba7c920653 100644
+--- a/include/linux/preempt.h
++++ b/include/linux/preempt.h
+@@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
+ 
+ #endif
+ 
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++
++/*
++ * Migrate-Disable and why it is (strongly) undesired.
++ *
++ * The premise of the Real-Time schedulers we have on Linux
++ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
++ * concurrently, provided there are sufficient runnable tasks, also known as
++ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
++ * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
++ *
++ * The correctness of various scheduling models depends on this, but is it
++ * broken by migrate_disable() that doesn't imply preempt_disable(). Where
++ * preempt_disable() implies an immediate priority ceiling, preemptible
++ * migrate_disable() allows nesting.
++ *
++ * The worst case is that all tasks preempt one another in a migrate_disable()
++ * region and stack on a single CPU. This then reduces the available bandwidth
++ * to a single CPU. And since Real-Time schedulability theory considers the
++ * Worst-Case only, all Real-Time analysis shall revert to single-CPU
++ * (instantly solving the SMP analysis problem).
++ *
++ *
++ * The reason we have it anyway.
++ *
++ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
++ * number of primitives into becoming preemptible, they would also allow
++ * migration. This turns out to break a bunch of per-cpu usage. To this end,
++ * all these primitives employ migirate_disable() to restore this implicit
++ * assumption.
++ *
++ * This is a 'temporary' work-around at best. The correct solution is getting
++ * rid of the above assumptions and reworking the code to employ explicit
++ * per-cpu locking or short preempt-disable regions.
++ *
++ * The end goal must be to get rid of migrate_disable(), alternatively we need
++ * a schedulability theory that does not depend on abritrary migration.
++ *
++ *
++ * Notes on the implementation.
++ *
++ * The implementation is particularly tricky since existing code patterns
++ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
++ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
++ * nor can it easily migrate itself into a pending affinity mask change on
++ * migrate_enable().
++ *
++ *
++ * Note: even non-work-conserving schedulers like semi-partitioned depends on
++ *       migration, so migrate_disable() is not only a problem for
++ *       work-conserving schedulers.
++ *
++ */
++extern void migrate_disable(void);
++extern void migrate_enable(void);
++
++#elif defined(CONFIG_PREEMPT_RT)
++
++static inline void migrate_disable(void) { }
++static inline void migrate_enable(void) { }
++
++#else /* !CONFIG_PREEMPT_RT */
++
+ /**
+  * migrate_disable - Prevent migration of the current task
+  *
+@@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void)
+ 	preempt_enable();
+ }
+ 
++#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */
++
+ #endif /* __LINUX_PREEMPT_H */
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index aa015416c569..76907e9876d5 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -726,6 +726,9 @@ struct task_struct {
+ 	int				nr_cpus_allowed;
+ 	const cpumask_t			*cpus_ptr;
+ 	cpumask_t			cpus_mask;
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++	int				migration_disabled;
++#endif
+ 
+ #ifdef CONFIG_PREEMPT_RCU
+ 	int				rcu_read_lock_nesting;
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 569cc5e48e68..5c9db4b2b6ec 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1710,6 +1710,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+ 
+ #ifdef CONFIG_SMP
+ 
++#ifdef CONFIG_PREEMPT_RT
++
++static void
++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
++
++static int __set_cpus_allowed_ptr(struct task_struct *p,
++				  const struct cpumask *new_mask,
++				  u32 flags);
++
++static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
++{
++	if (likely(!p->migration_disabled))
++		return;
++
++	if (p->cpus_ptr != &p->cpus_mask)
++		return;
++
++	/*
++	 * Violates locking rules! see comment in __do_set_cpus_allowed().
++	 */
++	__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
++}
++
++void migrate_disable(void)
++{
++	if (current->migration_disabled++)
++		return;
++
++	barrier();
++}
++EXPORT_SYMBOL_GPL(migrate_disable);
++
++void migrate_enable(void)
++{
++	struct task_struct *p = current;
++
++	if (--p->migration_disabled)
++		return;
++
++	barrier();
++
++	if (p->cpus_ptr == &p->cpus_mask)
++		return;
++
++	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
++}
++EXPORT_SYMBOL_GPL(migrate_enable);
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++	return p->migration_disabled;
++}
++
++#endif
++
+ /*
+  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
+  * __set_cpus_allowed_ptr() and select_fallback_rq().
+@@ -1719,7 +1774,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+ 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ 		return false;
+ 
+-	if (is_per_cpu_kthread(p))
++	if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+ 		return cpu_online(cpu);
+ 
+ 	return cpu_active(cpu);
+@@ -1840,6 +1895,11 @@ static int migration_cpu_stop(void *data)
+  */
+ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+ {
++	if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
++		p->cpus_ptr = new_mask;
++		return;
++	}
++
+ 	cpumask_copy(&p->cpus_mask, new_mask);
+ 	p->nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+@@ -1850,7 +1910,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
+ 	struct rq *rq = task_rq(p);
+ 	bool queued, running;
+ 
+-	lockdep_assert_held(&p->pi_lock);
++	/*
++	 * This here violates the locking rules for affinity, since we're only
++	 * supposed to change these variables while holding both rq->lock and
++	 * p->pi_lock.
++	 *
++	 * HOWEVER, it magically works, because ttwu() is the only code that
++	 * accesses these variables under p->pi_lock and only does so after
++	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
++	 * before finish_task().
++	 *
++	 * XXX do further audits, this smells like something putrid.
++	 */
++	if (flags & SCA_MIGRATE_DISABLE)
++		SCHED_WARN_ON(!p->on_cpu);
++	else
++		lockdep_assert_held(&p->pi_lock);
+ 
+ 	queued = task_on_rq_queued(p);
+ 	running = task_current(rq, p);
+@@ -1901,9 +1976,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ 	rq = task_rq_lock(p, &rf);
+ 	update_rq_clock(rq);
+ 
+-	if (p->flags & PF_KTHREAD) {
++	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ 		/*
+-		 * Kernel threads are allowed on online && !active CPUs
++		 * Kernel threads are allowed on online && !active CPUs.
++		 *
++		 * Specifically, migration_disabled() tasks must not fail the
++		 * cpumask_any_and_distribute() pick below, esp. so on
++		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
++		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
+ 		 */
+ 		cpu_valid_mask = cpu_online_mask;
+ 	}
+@@ -1917,7 +1997,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ 		goto out;
+ 	}
+ 
+-	if (cpumask_equal(&p->cpus_mask, new_mask))
++	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
+ 		goto out;
+ 
+ 	/*
+@@ -2009,6 +2089,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+ 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ 	 */
+ 	WARN_ON_ONCE(!cpu_online(new_cpu));
++
++	WARN_ON_ONCE(is_migration_disabled(p));
+ #endif
+ 
+ 	trace_sched_migrate_task(p, new_cpu);
+@@ -2339,6 +2421,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
+ 		}
+ 		fallthrough;
+ 	case possible:
++		/*
++		 * XXX When called from select_task_rq() we only
++		 * hold p->pi_lock and again violate locking order.
++		 *
++		 * More yuck to audit.
++		 */
+ 		do_set_cpus_allowed(p, cpu_possible_mask);
+ 		state = fail;
+ 		break;
+@@ -2373,7 +2461,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+ {
+ 	lockdep_assert_held(&p->pi_lock);
+ 
+-	if (p->nr_cpus_allowed > 1)
++	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+ 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ 	else
+ 		cpu = cpumask_any(p->cpus_ptr);
+@@ -2435,6 +2523,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ 
+ #endif /* CONFIG_SMP */
+ 
++#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
++
++static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++	return false;
++}
++
++#endif
++
+ static void
+ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+ {
+@@ -4595,6 +4694,7 @@ static void __sched notrace __schedule(bool preempt)
+ 		 */
+ 		++*switch_count;
+ 
++		migrate_disable_switch(rq, prev);
+ 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+ 
+ 		trace_sched_switch(preempt, prev, next);
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index f0f396cc1bee..f3109adda484 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1906,14 +1906,16 @@ static inline bool sched_fair_runnable(struct rq *rq)
+ extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ extern struct task_struct *pick_next_task_idle(struct rq *rq);
+ 
++#define SCA_CHECK		0x01
++#define SCA_MIGRATE_DISABLE	0x02
++#define SCA_MIGRATE_ENABLE	0x04
++
+ #ifdef CONFIG_SMP
+ 
+ extern void update_group_capacity(struct sched_domain *sd, int cpu);
+ 
+ extern void trigger_load_balance(struct rq *rq);
+ 
+-#define SCA_CHECK		0x01
+-
+ extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+ 
+ #endif
+diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
+index 2916606a9333..dbb96ebf661f 100644
+--- a/lib/smp_processor_id.c
++++ b/lib/smp_processor_id.c
+@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
+ 	if (current->nr_cpus_allowed == 1)
+ 		goto out;
+ 
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++	if (current->migration_disabled)
++		goto out;
++#endif
++
+ 	/*
+ 	 * It is valid to assume CPU-locality during early bootup:
+ 	 */
+-- 
+2.43.0
+
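Reader note (not part of the patch): the hunks above only add the primitive itself. As a minimal sketch of the per-CPU usage pattern migrate_disable()/migrate_enable() are meant to cover on PREEMPT_RT, consider the following hypothetical module snippet. The identifiers my_stats, my_account_hit and my_example_* are made up for illustration; only migrate_disable(), migrate_enable(), DEFINE_PER_CPU() and this_cpu_ptr() are real kernel APIs.

// SPDX-License-Identifier: GPL-2.0
/*
 * Illustrative example only -- NOT part of the patch above.
 * Shows the pattern migrate_disable() exists to support on PREEMPT_RT:
 * the section stays preemptible (and may block on RT-converted locks),
 * but the task cannot migrate, so a this_cpu_ptr() result stays valid.
 */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

struct my_stats {
	unsigned long hits;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_account_hit(void)
{
	struct my_stats *s;

	migrate_disable();	/* nests; current->migration_disabled is a counter */

	s = this_cpu_ptr(&my_stats);
	/*
	 * Still preemptible: another task on this CPU could preempt us and
	 * touch the same data, so a real user would pair this with a
	 * local_lock or use this_cpu_inc() for the update itself.
	 */
	s->hits++;

	migrate_enable();	/* outermost call restores the affinity mask */
}

static int __init my_example_init(void)
{
	my_account_hit();
	return 0;
}

static void __exit my_example_exit(void)
{
}

module_init(my_example_init);
module_exit(my_example_exit);
MODULE_LICENSE("GPL");	/* migrate_disable() is EXPORT_SYMBOL_GPL in this patch */

Per the kernel/sched/core.c hunks above, the first migrate_disable() only bumps current->migration_disabled; if the task is then switched out, migrate_disable_switch() pins p->cpus_ptr to the current CPU (SCA_MIGRATE_DISABLE), and only the outermost migrate_enable() calls __set_cpus_allowed_ptr(..., SCA_MIGRATE_ENABLE) to restore the original cpus_mask.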