Diffstat
-rw-r--r-- | debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch | 495
1 file changed, 495 insertions, 0 deletions
diff --git a/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch b/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch
new file mode 100644
index 000000000..b665e6d22
--- /dev/null
+++ b/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch
@@ -0,0 +1,495 @@
+From 8121011ded919f172bbbc7f5c095ca29d83a5aed Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 23 Oct 2020 12:12:13 +0200
+Subject: [PATCH 016/323] sched: Fix migrate_disable() vs rt/dl balancing
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz
+
+In order to minimize the interference of migrate_disable() on lower
+priority tasks, which can be deprived of runtime due to being stuck
+below a higher priority task, teach the RT/DL balancers to push away
+these higher priority tasks when a lower priority task gets selected
+to run on a freshly demoted CPU (pull).
+
+This adds migration interference to the higher priority task, but
+restores to the system bandwidth that would otherwise be irrevocably lost.
+Without this it would be possible to have all tasks on the system
+stuck on a single CPU, each task preempted in a migrate_disable()
+section with a single high priority task running.
+
+This way we can still approximate running the M highest priority tasks
+on the system.
+
+Migrating the top task away is (of course) still subject to
+migrate_disable() too, which means the lower task is subject to an
+interference equivalent to the worst case migrate_disable() section.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/preempt.h | 40 +++++++++++++-----------
+ include/linux/sched.h   |  3 +-
+ kernel/sched/core.c     | 67 +++++++++++++++++++++++++++++++++++------
+ kernel/sched/deadline.c | 29 +++++++++++++-----
+ kernel/sched/rt.c       | 63 ++++++++++++++++++++++++++++++--------
+ kernel/sched/sched.h    | 32 ++++++++++++++++++++
+ 6 files changed, 186 insertions(+), 48 deletions(-)
+
+diff --git a/include/linux/preempt.h b/include/linux/preempt.h
+index 97ba7c920653..8b43922e65df 100644
+--- a/include/linux/preempt.h
++++ b/include/linux/preempt.h
+@@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
+ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+ 
+ /*
+- * Migrate-Disable and why it is (strongly) undesired.
+- *
+- * The premise of the Real-Time schedulers we have on Linux
+- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+- * concurrently, provided there are sufficient runnable tasks, also known as
+- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+- *
+- * The correctness of various scheduling models depends on this, but is it
+- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+- * preempt_disable() implies an immediate priority ceiling, preemptible
+- * migrate_disable() allows nesting.
+- *
+- * The worst case is that all tasks preempt one another in a migrate_disable()
+- * region and stack on a single CPU. This then reduces the available bandwidth
+- * to a single CPU. And since Real-Time schedulability theory considers the
+- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+- * (instantly solving the SMP analysis problem).
++ * Migrate-Disable and why it is undesired.
++ * ++ * When a preempted task becomes elegible to run under the ideal model (IOW it ++ * becomes one of the M highest priority tasks), it might still have to wait ++ * for the preemptee's migrate_disable() section to complete. Thereby suffering ++ * a reduction in bandwidth in the exact duration of the migrate_disable() ++ * section. ++ * ++ * Per this argument, the change from preempt_disable() to migrate_disable() ++ * gets us: ++ * ++ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() ++ * it would have had to wait for the lower priority task. ++ * ++ * - a lower priority tasks; which under preempt_disable() could've instantly ++ * migrated away when another CPU becomes available, is now constrained ++ * by the ability to push the higher priority task away, which might itself be ++ * in a migrate_disable() section, reducing it's available bandwidth. ++ * ++ * IOW it trades latency / moves the interference term, but it stays in the ++ * system, and as long as it remains unbounded, the system is not fully ++ * deterministic. + * + * + * The reason we have it anyway. +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5b5c194f5a62..7ca1f3e740dd 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -728,8 +728,9 @@ struct task_struct { + cpumask_t cpus_mask; + void *migration_pending; + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +- int migration_disabled; ++ unsigned short migration_disabled; + #endif ++ unsigned short migration_flags; + + #ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index abbf01f77a76..452fc1dfb143 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1777,11 +1777,6 @@ void migrate_enable(void) + } + EXPORT_SYMBOL_GPL(migrate_enable); + +-static inline bool is_migration_disabled(struct task_struct *p) +-{ +- return p->migration_disabled; +-} +- + static inline bool rq_has_pinned_tasks(struct rq *rq) + { + return rq->nr_pinned; +@@ -1986,6 +1981,49 @@ static int migration_cpu_stop(void *data) + return 0; + } + ++int push_cpu_stop(void *arg) ++{ ++ struct rq *lowest_rq = NULL, *rq = this_rq(); ++ struct task_struct *p = arg; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ if (task_rq(p) != rq) ++ goto out_unlock; ++ ++ if (is_migration_disabled(p)) { ++ p->migration_flags |= MDF_PUSH; ++ goto out_unlock; ++ } ++ ++ p->migration_flags &= ~MDF_PUSH; ++ ++ if (p->sched_class->find_lock_rq) ++ lowest_rq = p->sched_class->find_lock_rq(p, rq); ++ ++ if (!lowest_rq) ++ goto out_unlock; ++ ++ // XXX validate p is still the highest prio task ++ if (task_rq(p) == rq) { ++ deactivate_task(rq, p, 0); ++ set_task_cpu(p, lowest_rq->cpu); ++ activate_task(lowest_rq, p, 0); ++ resched_curr(lowest_rq); ++ } ++ ++ double_unlock_balance(rq, lowest_rq); ++ ++out_unlock: ++ rq->push_busy = false; ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ return 0; ++} ++ + /* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. +@@ -2066,6 +2104,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + + /* Can the task run on the task's current CPU? 
If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { ++ struct task_struct *push_task = NULL; ++ ++ if ((flags & SCA_MIGRATE_ENABLE) && ++ (p->migration_flags & MDF_PUSH) && !rq->push_busy) { ++ rq->push_busy = true; ++ push_task = get_task_struct(p); ++ } ++ + pending = p->migration_pending; + if (pending) { + refcount_inc(&pending->refs); +@@ -2074,6 +2120,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + } + task_rq_unlock(rq, p, rf); + ++ if (push_task) { ++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, ++ p, &rq->push_work); ++ } ++ + if (complete) + goto do_complete; + +@@ -2110,6 +2161,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + if (flags & SCA_MIGRATE_ENABLE) { + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ ++ p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + + pending->arg = (struct migration_arg) { +@@ -2728,11 +2780,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, + + static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +-static inline bool is_migration_disabled(struct task_struct *p) +-{ +- return false; +-} +- + static inline bool rq_has_pinned_tasks(struct rq *rq) + { + return false; +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 94977a6ced8b..7cf3248894a9 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2139,6 +2139,9 @@ static int push_dl_task(struct rq *rq) + return 0; + + retry: ++ if (is_migration_disabled(next_task)) ++ return 0; ++ + if (WARN_ON(next_task == rq->curr)) + return 0; + +@@ -2216,7 +2219,7 @@ static void push_dl_tasks(struct rq *rq) + static void pull_dl_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, cpu; +- struct task_struct *p; ++ struct task_struct *p, *push_task; + bool resched = false; + struct rq *src_rq; + u64 dmin = LONG_MAX; +@@ -2246,6 +2249,7 @@ static void pull_dl_task(struct rq *this_rq) + continue; + + /* Might drop this_rq->lock */ ++ push_task = NULL; + double_lock_balance(this_rq, src_rq); + + /* +@@ -2277,17 +2281,27 @@ static void pull_dl_task(struct rq *this_rq) + src_rq->curr->dl.deadline)) + goto skip; + +- resched = true; +- +- deactivate_task(src_rq, p, 0); +- set_task_cpu(p, this_cpu); +- activate_task(this_rq, p, 0); +- dmin = p->dl.deadline; ++ if (is_migration_disabled(p)) { ++ push_task = get_push_task(src_rq); ++ } else { ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, this_cpu); ++ activate_task(this_rq, p, 0); ++ dmin = p->dl.deadline; ++ resched = true; ++ } + + /* Is there any other task even earlier? */ + } + skip: + double_unlock_balance(this_rq, src_rq); ++ ++ if (push_task) { ++ raw_spin_unlock(&this_rq->lock); ++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, ++ push_task, &src_rq->push_work); ++ raw_spin_lock(&this_rq->lock); ++ } + } + + if (resched) +@@ -2548,6 +2562,7 @@ const struct sched_class dl_sched_class + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .task_woken = task_woken_dl, ++ .find_lock_rq = find_lock_later_rq, + #endif + + .task_tick = task_tick_dl, +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index f91339c0dee8..c25e35f41555 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1873,7 +1873,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) + * running task can migrate over to a CPU that is running a task + * of lesser priority. 
+ */ +-static int push_rt_task(struct rq *rq) ++static int push_rt_task(struct rq *rq, bool pull) + { + struct task_struct *next_task; + struct rq *lowest_rq; +@@ -1887,6 +1887,34 @@ static int push_rt_task(struct rq *rq) + return 0; + + retry: ++ if (is_migration_disabled(next_task)) { ++ struct task_struct *push_task = NULL; ++ int cpu; ++ ++ if (!pull || rq->push_busy) ++ return 0; ++ ++ cpu = find_lowest_rq(rq->curr); ++ if (cpu == -1 || cpu == rq->cpu) ++ return 0; ++ ++ /* ++ * Given we found a CPU with lower priority than @next_task, ++ * therefore it should be running. However we cannot migrate it ++ * to this other CPU, instead attempt to push the current ++ * running task on this CPU away. ++ */ ++ push_task = get_push_task(rq); ++ if (push_task) { ++ raw_spin_unlock(&rq->lock); ++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, ++ push_task, &rq->push_work); ++ raw_spin_lock(&rq->lock); ++ } ++ ++ return 0; ++ } ++ + if (WARN_ON(next_task == rq->curr)) + return 0; + +@@ -1941,12 +1969,10 @@ static int push_rt_task(struct rq *rq) + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); +- ret = 1; +- + resched_curr(lowest_rq); ++ ret = 1; + + double_unlock_balance(rq, lowest_rq); +- + out: + put_task_struct(next_task); + +@@ -1956,7 +1982,7 @@ static int push_rt_task(struct rq *rq) + static void push_rt_tasks(struct rq *rq) + { + /* push_rt_task will return true if it moved an RT */ +- while (push_rt_task(rq)) ++ while (push_rt_task(rq, false)) + ; + } + +@@ -2109,7 +2135,8 @@ void rto_push_irq_work_func(struct irq_work *work) + */ + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); +- push_rt_tasks(rq); ++ while (push_rt_task(rq, true)) ++ ; + raw_spin_unlock(&rq->lock); + } + +@@ -2134,7 +2161,7 @@ static void pull_rt_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, cpu; + bool resched = false; +- struct task_struct *p; ++ struct task_struct *p, *push_task; + struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); + +@@ -2181,6 +2208,7 @@ static void pull_rt_task(struct rq *this_rq) + * double_lock_balance, and another CPU could + * alter this_rq + */ ++ push_task = NULL; + double_lock_balance(this_rq, src_rq); + + /* +@@ -2208,11 +2236,14 @@ static void pull_rt_task(struct rq *this_rq) + if (p->prio < src_rq->curr->prio) + goto skip; + +- resched = true; +- +- deactivate_task(src_rq, p, 0); +- set_task_cpu(p, this_cpu); +- activate_task(this_rq, p, 0); ++ if (is_migration_disabled(p)) { ++ push_task = get_push_task(src_rq); ++ } else { ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, this_cpu); ++ activate_task(this_rq, p, 0); ++ resched = true; ++ } + /* + * We continue with the search, just in + * case there's an even higher prio task +@@ -2222,6 +2253,13 @@ static void pull_rt_task(struct rq *this_rq) + } + skip: + double_unlock_balance(this_rq, src_rq); ++ ++ if (push_task) { ++ raw_spin_unlock(&this_rq->lock); ++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, ++ push_task, &src_rq->push_work); ++ raw_spin_lock(&this_rq->lock); ++ } + } + + if (resched) +@@ -2470,6 +2508,7 @@ const struct sched_class rt_sched_class + .rq_offline = rq_offline_rt, + .task_woken = task_woken_rt, + .switched_from = switched_from_rt, ++ .find_lock_rq = find_lock_lowest_rq, + #endif + + .task_tick = task_tick_rt, +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8237c9ab2bb8..69ef7cac3d29 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1061,6 +1061,8 @@ struct rq { 
+ #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + unsigned int nr_pinned; + #endif ++ unsigned int push_busy; ++ struct cpu_stop_work push_work; + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -1088,6 +1090,16 @@ static inline int cpu_of(struct rq *rq) + #endif + } + ++#define MDF_PUSH 0x01 ++ ++static inline bool is_migration_disabled(struct task_struct *p) ++{ ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ return p->migration_disabled; ++#else ++ return false; ++#endif ++} + + #ifdef CONFIG_SCHED_SMT + extern void __update_idle_core(struct rq *rq); +@@ -1827,6 +1839,8 @@ struct sched_class { + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); ++ ++ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + #endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); +@@ -1922,6 +1936,24 @@ extern void trigger_load_balance(struct rq *rq); + + extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + ++static inline struct task_struct *get_push_task(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (rq->push_busy) ++ return NULL; ++ ++ if (p->nr_cpus_allowed == 1) ++ return NULL; ++ ++ rq->push_busy = true; ++ return get_task_struct(p); ++} ++ ++extern int push_cpu_stop(void *arg); ++ + #endif + + #ifdef CONFIG_CPU_IDLE +-- +2.43.0 + |
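
The core of the change above is the new pull-side branch in pull_rt_task() and pull_dl_task(): when the task a freshly free CPU wants to pull is pinned by migrate_disable(), the balancer no longer gives up on that bandwidth; it takes a reference to the source CPU's currently running, higher priority task via get_push_task() and hands it to the stopper thread (push_cpu_stop) so that task gets pushed elsewhere and the pinned task gets its CPU back. The stand-alone user-space C sketch below models only that decision; struct toy_rq, struct toy_task and try_pull() are invented for illustration and are not kernel APIs, and only the guard conditions and branch structure mirror the patch.

/*
 * Toy user-space model of the pull-side decision added by this patch.
 * NOT kernel code: only the branch structure of try_pull() and the
 * guards in get_push_task() mirror pull_rt_task()/pull_dl_task() and
 * get_push_task() from the hunks above.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	const char *name;
	int prio;		/* lower value = higher priority, as in rt.c */
	int nr_cpus_allowed;
	bool migration_disabled;
};

struct toy_rq {
	int cpu;
	struct toy_task *curr;		/* currently running task */
	struct toy_task *pushable;	/* best queued-but-preempted task */
	bool push_busy;			/* a push is already in flight */
};

/* Mirrors get_push_task(): hand back the running task so it can be pushed. */
static struct toy_task *get_push_task(struct toy_rq *rq)
{
	struct toy_task *p = rq->curr;

	if (rq->push_busy || p->nr_cpus_allowed == 1)
		return NULL;

	rq->push_busy = true;	/* the real helper also takes a task reference */
	return p;
}

static void try_pull(struct toy_rq *this_rq, struct toy_rq *src_rq)
{
	struct toy_task *p = src_rq->pushable;

	/* Only pull something more important than what we run right now. */
	if (!p || p->prio >= this_rq->curr->prio)
		return;

	/* If the candidate outranks src's running task it will run there anyway. */
	if (p->prio < src_rq->curr->prio)
		return;

	if (p->migration_disabled) {
		/*
		 * The candidate is pinned by migrate_disable(); instead of
		 * moving it, push the source CPU's current (higher priority)
		 * task away so the pinned task gets its CPU back.
		 */
		struct toy_task *push_task = get_push_task(src_rq);

		if (push_task)
			printf("CPU%d: %s is migration-disabled, pushing %s off CPU%d instead\n",
			       this_rq->cpu, p->name, push_task->name, src_rq->cpu);
		return;
	}

	printf("CPU%d: pulling %s from CPU%d\n", this_rq->cpu, p->name, src_rq->cpu);
}

int main(void)
{
	struct toy_task high = { "high-rt",  10, 4, false };
	struct toy_task low  = { "low-rt",   50, 4, true  };	/* preempted inside migrate_disable() */
	struct toy_task idle = { "idle",    100, 1, false };

	struct toy_rq cpu0 = { 0, &high, &low,  false };
	struct toy_rq cpu1 = { 1, &idle, NULL,  false };	/* just became free */

	try_pull(&cpu1, &cpu0);
	return 0;
}

In the kernel the push itself is asynchronous: the pull path drops this_rq->lock and queues push_cpu_stop() via stop_one_cpu_nowait(), and push_cpu_stop() re-validates everything under pi_lock and rq->lock before actually moving the task.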
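
The other moving part is the MDF_PUSH handshake visible in push_cpu_stop() and affine_move_task(): if the stopper finds that the task it was asked to push has itself entered a migrate_disable() section, it records MDF_PUSH in p->migration_flags and bails out; when that task later leaves the section, the SCA_MIGRATE_ENABLE path notices the flag and queues the push again. The sketch below compresses that protocol into direct calls; the toy types and function bodies are stand-ins, only the flag handling follows the patch.

/* Toy model of the deferred-push (MDF_PUSH) protocol; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define MDF_PUSH 0x01

struct toy_task {
	const char *name;
	bool migration_disabled;
	unsigned short migration_flags;
};

/* Stand-in for the stopper callback push_cpu_stop(). */
static void toy_push_cpu_stop(struct toy_task *p)
{
	if (p->migration_disabled) {
		p->migration_flags |= MDF_PUSH;	/* defer: retry once migrate_enable() runs */
		printf("%s: pinned, push deferred (MDF_PUSH set)\n", p->name);
		return;
	}
	p->migration_flags &= ~MDF_PUSH;
	printf("%s: pushed to another CPU\n", p->name);
}

/* Stand-in for leaving the migrate_disable() section. */
static void toy_migrate_enable(struct toy_task *p)
{
	p->migration_disabled = false;
	if (p->migration_flags & MDF_PUSH)
		toy_push_cpu_stop(p);	/* the kernel requeues the stopper work here */
}

int main(void)
{
	struct toy_task t = { "high-rt", true, 0 };

	toy_push_cpu_stop(&t);	/* only sets MDF_PUSH, task is pinned */
	toy_migrate_enable(&t);	/* the deferred push now happens */
	return 0;
}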