From: Scott Wood Date: Sat, 12 Oct 2019 01:52:13 -0500 Subject: [PATCH 307/342] sched: Lazy migrate_disable processing Origin: https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit?id=80bc8f45b65be158feec72e427e42b77fcf0745a [ Upstream commit 425c5b38779a860062aa62219dc920d374b13c17 ] Avoid overhead on the majority of migrate disable/enable sequences by only manipulating scheduler data (and grabbing the relevant locks) when the task actually schedules while migrate-disabled. A kernel build showed around a 10% reduction in system time (with CONFIG_NR_CPUS=512). Instead of cpuhp_pin_lock, CPU hotplug is handled by keeping a per-CPU count of the number of pinned tasks (including tasks which have not scheduled in the migrate-disabled section); takedown_cpu() will wait until that reaches zero (confirmed by take_cpu_down() in stop machine context to deal with races) before migrating tasks off of the cpu. To simplify synchronization, updating cpus_mask is no longer deferred until migrate_enable(). This lets us not have to worry about migrate_enable() missing the update if it's on the fast path (didn't schedule during the migrate disabled section). It also makes the code a bit simpler and reduces deviation from mainline. While the main motivation for this is the performance benefit, lazy migrate disable also eliminates the restriction on calling migrate_disable() while atomic but leaving the atomic region prior to calling migrate_enable() -- though this won't help with local_bh_disable() (and thus rcutorture) unless something similar is done with the recently added local_lock. Signed-off-by: Scott Wood Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- include/linux/cpu.h | 4 - include/linux/sched.h | 11 +-- init/init_task.c | 4 + kernel/cpu.c | 103 +++++++++-------------- kernel/sched/core.c | 182 +++++++++++++++++------------------------ kernel/sched/sched.h | 4 + lib/smp_processor_id.c | 3 + 7 files changed, 129 insertions(+), 182 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a3c246214d28..81ce109caec3 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -122,8 +122,6 @@ extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -extern void pin_current_cpu(void); -extern void unpin_current_cpu(void); #else /* CONFIG_HOTPLUG_CPU */ @@ -135,8 +133,6 @@ static inline int cpus_read_trylock(void) { return true; } static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } -static inline void pin_current_cpu(void) { } -static inline void unpin_current_cpu(void) { } #endif /* !CONFIG_HOTPLUG_CPU */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e567fe2d7058..65069db8923c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -227,6 +227,8 @@ extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +int cpu_nr_pinned(int cpu); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode @@ -670,16 +672,13 @@ struct task_struct { cpumask_t cpus_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) int migrate_disable; - int migrate_disable_update; - int pinned_on_cpu; + bool migrate_disable_scheduled; # ifdef CONFIG_SCHED_DEBUG - int migrate_disable_atomic; + int pinned_on_cpu; # endif - #elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) # ifdef CONFIG_SCHED_DEBUG int migrate_disable; - int migrate_disable_atomic; # endif #endif #ifdef CONFIG_PREEMPT_RT_FULL @@ -2058,4 +2057,6 @@ static inline void rseq_syscall(struct pt_regs *regs) #endif +extern struct task_struct *takedown_cpu_task; + #endif diff --git a/init/init_task.c b/init/init_task.c index 634becebd713..45b84137c4b3 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -80,6 +80,10 @@ struct task_struct init_task .cpus_ptr = &init_task.cpus_mask, .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) && \ + defined(CONFIG_SCHED_DEBUG) + .pinned_on_cpu = -1, +#endif .mm = NULL, .active_mm = &init_mm, .restart_block = { diff --git a/kernel/cpu.c b/kernel/cpu.c index 4bf82665f28c..f603ea80af2e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -78,11 +78,6 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; -#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL) -static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \ - __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock); -#endif - #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); @@ -289,57 +284,6 @@ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU -/** - * pin_current_cpu - Prevent the current cpu from being unplugged - */ -void pin_current_cpu(void) -{ -#ifdef CONFIG_PREEMPT_RT_FULL - struct rt_rw_lock *cpuhp_pin; - unsigned int cpu; - int ret; - -again: - cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock); - ret = __read_rt_trylock(cpuhp_pin); - if (ret) { - current->pinned_on_cpu = smp_processor_id(); - return; - } - cpu = smp_processor_id(); - preempt_lazy_enable(); - preempt_enable(); - - sleeping_lock_inc(); - __read_rt_lock(cpuhp_pin); - sleeping_lock_dec(); - - preempt_disable(); - preempt_lazy_disable(); - if (cpu != smp_processor_id()) { - __read_rt_unlock(cpuhp_pin); - goto again; - } - current->pinned_on_cpu = cpu; -#endif -} - -/** - * unpin_current_cpu - Allow unplug of current cpu - */ -void unpin_current_cpu(void) -{ -#ifdef CONFIG_PREEMPT_RT_FULL - struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock); - - if (WARN_ON(current->pinned_on_cpu != smp_processor_id())) - cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu); - - current->pinned_on_cpu = -1; - __read_rt_unlock(cpuhp_pin); -#endif -} - DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); void cpus_read_lock(void) @@ -933,6 +877,15 @@ static int take_cpu_down(void *_param) int err, cpu = smp_processor_id(); int ret; +#ifdef CONFIG_PREEMPT_RT_BASE + /* + * If any tasks disabled migration before we got here, + * go back and sleep again. + */ + if (cpu_nr_pinned(cpu)) + return -EAGAIN; +#endif + /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) @@ -960,11 +913,10 @@ static int take_cpu_down(void *_param) return 0; } +struct task_struct *takedown_cpu_task; + static int takedown_cpu(unsigned int cpu) { -#ifdef CONFIG_PREEMPT_RT_FULL - struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu); -#endif struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; @@ -977,17 +929,38 @@ static int takedown_cpu(unsigned int cpu) */ irq_lock_sparse(); -#ifdef CONFIG_PREEMPT_RT_FULL - __write_rt_lock(cpuhp_pin); +#ifdef CONFIG_PREEMPT_RT_BASE + WARN_ON_ONCE(takedown_cpu_task); + takedown_cpu_task = current; + +again: + /* + * If a task pins this CPU after we pass this check, take_cpu_down + * will return -EAGAIN. + */ + for (;;) { + int nr_pinned; + + set_current_state(TASK_UNINTERRUPTIBLE); + nr_pinned = cpu_nr_pinned(cpu); + if (nr_pinned == 0) + break; + schedule(); + } + set_current_state(TASK_RUNNING); #endif /* * So now all preempt/rcu users must observe !cpu_active(). */ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); +#ifdef CONFIG_PREEMPT_RT_BASE + if (err == -EAGAIN) + goto again; +#endif if (err) { -#ifdef CONFIG_PREEMPT_RT_FULL - __write_rt_unlock(cpuhp_pin); +#ifdef CONFIG_PREEMPT_RT_BASE + takedown_cpu_task = NULL; #endif /* CPU refused to die */ irq_unlock_sparse(); @@ -1007,8 +980,8 @@ static int takedown_cpu(unsigned int cpu) wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); -#ifdef CONFIG_PREEMPT_RT_FULL - __write_rt_unlock(cpuhp_pin); +#ifdef CONFIG_PREEMPT_RT_BASE + takedown_cpu_task = NULL; #endif /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7eb3037c0b35..de6514e13e0c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1065,7 +1065,8 @@ static int migration_cpu_stop(void *data) void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { cpumask_copy(&p->cpus_mask, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); + if (p->cpus_ptr == &p->cpus_mask) + p->nr_cpus_allowed = cpumask_weight(new_mask); } #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) @@ -1076,8 +1077,7 @@ int __migrate_disabled(struct task_struct *p) EXPORT_SYMBOL_GPL(__migrate_disabled); #endif -static void __do_set_cpus_allowed_tail(struct task_struct *p, - const struct cpumask *new_mask) +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq = task_rq(p); bool queued, running; @@ -1106,20 +1106,6 @@ static void __do_set_cpus_allowed_tail(struct task_struct *p, set_curr_task(rq, p); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) - if (__migrate_disabled(p)) { - lockdep_assert_held(&p->pi_lock); - - cpumask_copy(&p->cpus_mask, new_mask); - p->migrate_disable_update = 1; - return; - } -#endif - __do_set_cpus_allowed_tail(p, new_mask); -} - /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1179,7 +1165,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || + p->cpus_ptr != &p->cpus_mask) goto out; if (task_running(rq, p) || p->state == TASK_WAKING) { @@ -3459,6 +3446,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) BUG(); } +static void migrate_disabled_sched(struct task_struct *p); + /* * __schedule() is the main scheduler function. * @@ -3529,6 +3518,9 @@ static void __sched notrace __schedule(bool preempt) rq_lock(rq, &rf); smp_mb__after_spinlock(); + if (__migrate_disabled(prev)) + migrate_disabled_sched(prev); + /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); @@ -5777,6 +5769,8 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) BUG_ON(!next); put_prev_task(rq, next); + WARN_ON_ONCE(__migrate_disabled(next)); + /* * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either @@ -7241,14 +7235,9 @@ update_nr_migratory(struct task_struct *p, long delta) static inline void migrate_disable_update_cpus_allowed(struct task_struct *p) { - struct rq *rq; - struct rq_flags rf; - - rq = task_rq_lock(p, &rf); p->cpus_ptr = cpumask_of(smp_processor_id()); update_nr_migratory(p, -1); p->nr_cpus_allowed = 1; - task_rq_unlock(rq, p, &rf); } static inline void @@ -7266,54 +7255,35 @@ migrate_enable_update_cpus_allowed(struct task_struct *p) void migrate_disable(void) { - struct task_struct *p = current; + preempt_disable(); - if (in_atomic() || irqs_disabled()) { + if (++current->migrate_disable == 1) { + this_rq()->nr_pinned++; + preempt_lazy_disable(); #ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; + WARN_ON_ONCE(current->pinned_on_cpu >= 0); + current->pinned_on_cpu = smp_processor_id(); #endif - return; - } -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); } -#endif - if (p->migrate_disable) { - p->migrate_disable++; - return; - } + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); - preempt_disable(); - preempt_lazy_disable(); - pin_current_cpu(); +static void migrate_disabled_sched(struct task_struct *p) +{ + if (p->migrate_disable_scheduled) + return; migrate_disable_update_cpus_allowed(p); - p->migrate_disable = 1; - - preempt_enable(); + p->migrate_disable_scheduled = 1; } -EXPORT_SYMBOL(migrate_disable); void migrate_enable(void) { struct task_struct *p = current; - - if (in_atomic() || irqs_disabled()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } -#endif + struct rq *rq = this_rq(); + int cpu = task_cpu(p); WARN_ON_ONCE(p->migrate_disable <= 0); if (p->migrate_disable > 1) { @@ -7323,67 +7293,69 @@ void migrate_enable(void) preempt_disable(); +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(current->pinned_on_cpu != cpu); + current->pinned_on_cpu = -1; +#endif + + WARN_ON_ONCE(rq->nr_pinned < 1); + p->migrate_disable = 0; + rq->nr_pinned--; + if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) && + takedown_cpu_task) + wake_up_process(takedown_cpu_task); + + if (!p->migrate_disable_scheduled) + goto out; + + p->migrate_disable_scheduled = 0; + migrate_enable_update_cpus_allowed(p); - if (p->migrate_disable_update) { - struct rq *rq; + WARN_ON(smp_processor_id() != cpu); + if (!is_cpu_allowed(p, cpu)) { + struct migration_arg arg = { p }; struct rq_flags rf; - int cpu = task_cpu(p); rq = task_rq_lock(p, &rf); update_rq_clock(rq); - - __do_set_cpus_allowed_tail(p, &p->cpus_mask); + arg.dest_cpu = select_fallback_rq(cpu, p); task_rq_unlock(rq, p, &rf); - p->migrate_disable_update = 0; - - WARN_ON(smp_processor_id() != cpu); - if (!cpumask_test_cpu(cpu, &p->cpus_mask)) { - struct migration_arg arg = { p }; - struct rq_flags rf; + preempt_lazy_enable(); + preempt_enable(); - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - arg.dest_cpu = select_fallback_rq(cpu, p); - task_rq_unlock(rq, p, &rf); - - unpin_current_cpu(); - preempt_lazy_enable(); - preempt_enable(); - - sleeping_lock_inc(); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - sleeping_lock_dec(); - tlb_migrate_finish(p->mm); + sleeping_lock_inc(); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + sleeping_lock_dec(); + tlb_migrate_finish(p->mm); - return; - } + return; } - unpin_current_cpu(); + +out: preempt_lazy_enable(); preempt_enable(); } EXPORT_SYMBOL(migrate_enable); -#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) -void migrate_disable(void) +int cpu_nr_pinned(int cpu) { -#ifdef CONFIG_SCHED_DEBUG - struct task_struct *p = current; + struct rq *rq = cpu_rq(cpu); - if (in_atomic() || irqs_disabled()) { - p->migrate_disable_atomic++; - return; - } + return rq->nr_pinned; +} - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) +static void migrate_disabled_sched(struct task_struct *p) +{ +} - p->migrate_disable++; +void migrate_disable(void) +{ +#ifdef CONFIG_SCHED_DEBUG + current->migrate_disable++; #endif barrier(); } @@ -7394,20 +7366,14 @@ void migrate_enable(void) #ifdef CONFIG_SCHED_DEBUG struct task_struct *p = current; - if (in_atomic() || irqs_disabled()) { - p->migrate_disable_atomic--; - return; - } - - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } - WARN_ON_ONCE(p->migrate_disable <= 0); p->migrate_disable--; #endif barrier(); } EXPORT_SYMBOL(migrate_enable); +#else +static void migrate_disabled_sched(struct task_struct *p) +{ +} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 87a05bb90124..45b3f135d205 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -915,6 +915,10 @@ struct rq { /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; #endif + +#if defined(CONFIG_PREEMPT_RT_BASE) && defined(CONFIG_SMP) + int nr_pinned; +#endif }; static inline int cpu_of(struct rq *rq) diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index b8a8a8db2d75..0c80992aa337 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -22,6 +22,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1, * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ + if (current->migrate_disable) + goto out; + if (current->nr_cpus_allowed == 1) goto out;