summaryrefslogtreecommitdiffstats
path: root/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch495
1 files changed, 495 insertions, 0 deletions
diff --git a/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch b/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch
new file mode 100644
index 000000000..b665e6d22
--- /dev/null
+++ b/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch
@@ -0,0 +1,495 @@
+From 8121011ded919f172bbbc7f5c095ca29d83a5aed Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 23 Oct 2020 12:12:13 +0200
+Subject: [PATCH 016/323] sched: Fix migrate_disable() vs rt/dl balancing
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz
+
+In order to minimize the interference of migrate_disable() on lower
+priority tasks, which can be deprived of runtime due to being stuck
+below a higher priority task, teach the RT/DL balancers to push away
+these higher priority tasks when a lower priority task gets selected
+to run on a freshly demoted CPU (pull).
+
+This adds migration interference to the higher priority task, but
+restores bandwidth to the system that would otherwise be irrevocably lost.
+Without this it would be possible to have all tasks on the system
+stuck on a single CPU, each task preempted in a migrate_disable()
+section with a single high priority task running.
+
+This way we can still approximate running the M highest priority tasks
+on the system.
+
+Migrating the top task away is (of course) still subject to
+migrate_disable() too, which means the lower task is subject to an
+interference equivalent to the worst case migrate_disable() section.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/preempt.h | 40 +++++++++++++-----------
+ include/linux/sched.h | 3 +-
+ kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++------
+ kernel/sched/deadline.c | 29 +++++++++++++-----
+ kernel/sched/rt.c | 63 ++++++++++++++++++++++++++++++--------
+ kernel/sched/sched.h | 32 ++++++++++++++++++++
+ 6 files changed, 186 insertions(+), 48 deletions(-)
+
+diff --git a/include/linux/preempt.h b/include/linux/preempt.h
+index 97ba7c920653..8b43922e65df 100644
+--- a/include/linux/preempt.h
++++ b/include/linux/preempt.h
+@@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
+ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+
+ /*
+- * Migrate-Disable and why it is (strongly) undesired.
+- *
+- * The premise of the Real-Time schedulers we have on Linux
+- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
+- * concurrently, provided there are sufficient runnable tasks, also known as
+- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
+- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
+- *
+- * The correctness of various scheduling models depends on this, but is it
+- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
+- * preempt_disable() implies an immediate priority ceiling, preemptible
+- * migrate_disable() allows nesting.
+- *
+- * The worst case is that all tasks preempt one another in a migrate_disable()
+- * region and stack on a single CPU. This then reduces the available bandwidth
+- * to a single CPU. And since Real-Time schedulability theory considers the
+- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
+- * (instantly solving the SMP analysis problem).
++ * Migrate-Disable and why it is undesired.
++ *
++ * When a preempted task becomes elegible to run under the ideal model (IOW it
++ * becomes one of the M highest priority tasks), it might still have to wait
++ * for the preemptee's migrate_disable() section to complete. Thereby suffering
++ * a reduction in bandwidth in the exact duration of the migrate_disable()
++ * section.
++ *
++ * Per this argument, the change from preempt_disable() to migrate_disable()
++ * gets us:
++ *
++ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
++ * it would have had to wait for the lower priority task.
++ *
++ * - a lower priority tasks; which under preempt_disable() could've instantly
++ * migrated away when another CPU becomes available, is now constrained
++ * by the ability to push the higher priority task away, which might itself be
++ * in a migrate_disable() section, reducing it's available bandwidth.
++ *
++ * IOW it trades latency / moves the interference term, but it stays in the
++ * system, and as long as it remains unbounded, the system is not fully
++ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 5b5c194f5a62..7ca1f3e740dd 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -728,8 +728,9 @@ struct task_struct {
+ cpumask_t cpus_mask;
+ void *migration_pending;
+ #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
+- int migration_disabled;
++ unsigned short migration_disabled;
+ #endif
++ unsigned short migration_flags;
+
+ #ifdef CONFIG_PREEMPT_RCU
+ int rcu_read_lock_nesting;
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index abbf01f77a76..452fc1dfb143 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1777,11 +1777,6 @@ void migrate_enable(void)
+ }
+ EXPORT_SYMBOL_GPL(migrate_enable);
+
+-static inline bool is_migration_disabled(struct task_struct *p)
+-{
+- return p->migration_disabled;
+-}
+-
+ static inline bool rq_has_pinned_tasks(struct rq *rq)
+ {
+ return rq->nr_pinned;
+@@ -1986,6 +1981,49 @@ static int migration_cpu_stop(void *data)
+ return 0;
+ }
+
++int push_cpu_stop(void *arg)
++{
++ struct rq *lowest_rq = NULL, *rq = this_rq();
++ struct task_struct *p = arg;
++
++ raw_spin_lock_irq(&p->pi_lock);
++ raw_spin_lock(&rq->lock);
++
++ if (task_rq(p) != rq)
++ goto out_unlock;
++
++ if (is_migration_disabled(p)) {
++ p->migration_flags |= MDF_PUSH;
++ goto out_unlock;
++ }
++
++ p->migration_flags &= ~MDF_PUSH;
++
++ if (p->sched_class->find_lock_rq)
++ lowest_rq = p->sched_class->find_lock_rq(p, rq);
++
++ if (!lowest_rq)
++ goto out_unlock;
++
++ // XXX validate p is still the highest prio task
++ if (task_rq(p) == rq) {
++ deactivate_task(rq, p, 0);
++ set_task_cpu(p, lowest_rq->cpu);
++ activate_task(lowest_rq, p, 0);
++ resched_curr(lowest_rq);
++ }
++
++ double_unlock_balance(rq, lowest_rq);
++
++out_unlock:
++ rq->push_busy = false;
++ raw_spin_unlock(&rq->lock);
++ raw_spin_unlock_irq(&p->pi_lock);
++
++ put_task_struct(p);
++ return 0;
++}
++
+ /*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+@@ -2066,6 +2104,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
++ struct task_struct *push_task = NULL;
++
++ if ((flags & SCA_MIGRATE_ENABLE) &&
++ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
++ rq->push_busy = true;
++ push_task = get_task_struct(p);
++ }
++
+ pending = p->migration_pending;
+ if (pending) {
+ refcount_inc(&pending->refs);
+@@ -2074,6 +2120,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
+ }
+ task_rq_unlock(rq, p, rf);
+
++ if (push_task) {
++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
++ p, &rq->push_work);
++ }
++
+ if (complete)
+ goto do_complete;
+
+@@ -2110,6 +2161,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
+ if (flags & SCA_MIGRATE_ENABLE) {
+
+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
++ p->migration_flags &= ~MDF_PUSH;
+ task_rq_unlock(rq, p, rf);
+
+ pending->arg = (struct migration_arg) {
+@@ -2728,11 +2780,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+
+ static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+-static inline bool is_migration_disabled(struct task_struct *p)
+-{
+- return false;
+-}
+-
+ static inline bool rq_has_pinned_tasks(struct rq *rq)
+ {
+ return false;
+diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
+index 94977a6ced8b..7cf3248894a9 100644
+--- a/kernel/sched/deadline.c
++++ b/kernel/sched/deadline.c
+@@ -2139,6 +2139,9 @@ static int push_dl_task(struct rq *rq)
+ return 0;
+
+ retry:
++ if (is_migration_disabled(next_task))
++ return 0;
++
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
+@@ -2216,7 +2219,7 @@ static void push_dl_tasks(struct rq *rq)
+ static void pull_dl_task(struct rq *this_rq)
+ {
+ int this_cpu = this_rq->cpu, cpu;
+- struct task_struct *p;
++ struct task_struct *p, *push_task;
+ bool resched = false;
+ struct rq *src_rq;
+ u64 dmin = LONG_MAX;
+@@ -2246,6 +2249,7 @@ static void pull_dl_task(struct rq *this_rq)
+ continue;
+
+ /* Might drop this_rq->lock */
++ push_task = NULL;
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+@@ -2277,17 +2281,27 @@ static void pull_dl_task(struct rq *this_rq)
+ src_rq->curr->dl.deadline))
+ goto skip;
+
+- resched = true;
+-
+- deactivate_task(src_rq, p, 0);
+- set_task_cpu(p, this_cpu);
+- activate_task(this_rq, p, 0);
+- dmin = p->dl.deadline;
++ if (is_migration_disabled(p)) {
++ push_task = get_push_task(src_rq);
++ } else {
++ deactivate_task(src_rq, p, 0);
++ set_task_cpu(p, this_cpu);
++ activate_task(this_rq, p, 0);
++ dmin = p->dl.deadline;
++ resched = true;
++ }
+
+ /* Is there any other task even earlier? */
+ }
+ skip:
+ double_unlock_balance(this_rq, src_rq);
++
++ if (push_task) {
++ raw_spin_unlock(&this_rq->lock);
++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
++ push_task, &src_rq->push_work);
++ raw_spin_lock(&this_rq->lock);
++ }
+ }
+
+ if (resched)
+@@ -2548,6 +2562,7 @@ const struct sched_class dl_sched_class
+ .rq_online = rq_online_dl,
+ .rq_offline = rq_offline_dl,
+ .task_woken = task_woken_dl,
++ .find_lock_rq = find_lock_later_rq,
+ #endif
+
+ .task_tick = task_tick_dl,
+diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
+index f91339c0dee8..c25e35f41555 100644
+--- a/kernel/sched/rt.c
++++ b/kernel/sched/rt.c
+@@ -1873,7 +1873,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+-static int push_rt_task(struct rq *rq)
++static int push_rt_task(struct rq *rq, bool pull)
+ {
+ struct task_struct *next_task;
+ struct rq *lowest_rq;
+@@ -1887,6 +1887,34 @@ static int push_rt_task(struct rq *rq)
+ return 0;
+
+ retry:
++ if (is_migration_disabled(next_task)) {
++ struct task_struct *push_task = NULL;
++ int cpu;
++
++ if (!pull || rq->push_busy)
++ return 0;
++
++ cpu = find_lowest_rq(rq->curr);
++ if (cpu == -1 || cpu == rq->cpu)
++ return 0;
++
++ /*
++ * Given we found a CPU with lower priority than @next_task,
++ * therefore it should be running. However we cannot migrate it
++ * to this other CPU, instead attempt to push the current
++ * running task on this CPU away.
++ */
++ push_task = get_push_task(rq);
++ if (push_task) {
++ raw_spin_unlock(&rq->lock);
++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
++ push_task, &rq->push_work);
++ raw_spin_lock(&rq->lock);
++ }
++
++ return 0;
++ }
++
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
+@@ -1941,12 +1969,10 @@ static int push_rt_task(struct rq *rq)
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, lowest_rq->cpu);
+ activate_task(lowest_rq, next_task, 0);
+- ret = 1;
+-
+ resched_curr(lowest_rq);
++ ret = 1;
+
+ double_unlock_balance(rq, lowest_rq);
+-
+ out:
+ put_task_struct(next_task);
+
+@@ -1956,7 +1982,7 @@ static int push_rt_task(struct rq *rq)
+ static void push_rt_tasks(struct rq *rq)
+ {
+ /* push_rt_task will return true if it moved an RT */
+- while (push_rt_task(rq))
++ while (push_rt_task(rq, false))
+ ;
+ }
+
+@@ -2109,7 +2135,8 @@ void rto_push_irq_work_func(struct irq_work *work)
+ */
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+- push_rt_tasks(rq);
++ while (push_rt_task(rq, true))
++ ;
+ raw_spin_unlock(&rq->lock);
+ }
+
+@@ -2134,7 +2161,7 @@ static void pull_rt_task(struct rq *this_rq)
+ {
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
+- struct task_struct *p;
++ struct task_struct *p, *push_task;
+ struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
+
+@@ -2181,6 +2208,7 @@ static void pull_rt_task(struct rq *this_rq)
+ * double_lock_balance, and another CPU could
+ * alter this_rq
+ */
++ push_task = NULL;
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+@@ -2208,11 +2236,14 @@ static void pull_rt_task(struct rq *this_rq)
+ if (p->prio < src_rq->curr->prio)
+ goto skip;
+
+- resched = true;
+-
+- deactivate_task(src_rq, p, 0);
+- set_task_cpu(p, this_cpu);
+- activate_task(this_rq, p, 0);
++ if (is_migration_disabled(p)) {
++ push_task = get_push_task(src_rq);
++ } else {
++ deactivate_task(src_rq, p, 0);
++ set_task_cpu(p, this_cpu);
++ activate_task(this_rq, p, 0);
++ resched = true;
++ }
+ /*
+ * We continue with the search, just in
+ * case there's an even higher prio task
+@@ -2222,6 +2253,13 @@ static void pull_rt_task(struct rq *this_rq)
+ }
+ skip:
+ double_unlock_balance(this_rq, src_rq);
++
++ if (push_task) {
++ raw_spin_unlock(&this_rq->lock);
++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
++ push_task, &src_rq->push_work);
++ raw_spin_lock(&this_rq->lock);
++ }
+ }
+
+ if (resched)
+@@ -2470,6 +2508,7 @@ const struct sched_class rt_sched_class
+ .rq_offline = rq_offline_rt,
+ .task_woken = task_woken_rt,
+ .switched_from = switched_from_rt,
++ .find_lock_rq = find_lock_lowest_rq,
+ #endif
+
+ .task_tick = task_tick_rt,
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index 8237c9ab2bb8..69ef7cac3d29 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1061,6 +1061,8 @@ struct rq {
+ #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+ unsigned int nr_pinned;
+ #endif
++ unsigned int push_busy;
++ struct cpu_stop_work push_work;
+ };
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+@@ -1088,6 +1090,16 @@ static inline int cpu_of(struct rq *rq)
+ #endif
+ }
+
++#define MDF_PUSH 0x01
++
++static inline bool is_migration_disabled(struct task_struct *p)
++{
++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
++ return p->migration_disabled;
++#else
++ return false;
++#endif
++}
+
+ #ifdef CONFIG_SCHED_SMT
+ extern void __update_idle_core(struct rq *rq);
+@@ -1827,6 +1839,8 @@ struct sched_class {
+
+ void (*rq_online)(struct rq *rq);
+ void (*rq_offline)(struct rq *rq);
++
++ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ #endif
+
+ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+@@ -1922,6 +1936,24 @@ extern void trigger_load_balance(struct rq *rq);
+
+ extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
++static inline struct task_struct *get_push_task(struct rq *rq)
++{
++ struct task_struct *p = rq->curr;
++
++ lockdep_assert_held(&rq->lock);
++
++ if (rq->push_busy)
++ return NULL;
++
++ if (p->nr_cpus_allowed == 1)
++ return NULL;
++
++ rq->push_busy = true;
++ return get_task_struct(p);
++}
++
++extern int push_cpu_stop(void *arg);
++
+ #endif
+
+ #ifdef CONFIG_CPU_IDLE
+--
+2.43.0
+