diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 723 |
1 files changed, 319 insertions, 404 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a854b71836..a708d225c2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -85,7 +85,6 @@ #include "sched.h" #include "stats.h" -#include "autogroup.h" #include "autogroup.h" #include "pelt.h" @@ -114,6 +113,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); typeof(ti->flags) val = READ_ONCE(ti->flags); - for (;;) { + do { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) - break; - } + } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); + return true; } @@ -1480,16 +1479,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p) static void uclamp_update_util_min_rt_default(struct task_struct *p) { - struct rq_flags rf; - struct rq *rq; - if (!rt_task(p)) return; /* Protect updates to p->uclamp_* */ - rq = task_rq_lock(p, &rf); + guard(task_rq_lock)(p); __uclamp_update_util_min_rt_default(p); - task_rq_unlock(rq, p, &rf); } static inline struct uclamp_se @@ -1785,9 +1780,8 @@ static void uclamp_update_root_tg(void) uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); - rcu_read_lock(); + guard(rcu)(); cpu_util_update_eff(&root_task_group.css); - rcu_read_unlock(); } #else static void uclamp_update_root_tg(void) { } @@ -1814,10 +1808,9 @@ static void uclamp_sync_util_min_rt_default(void) smp_mb__after_spinlock(); read_unlock(&tasklist_lock); - rcu_read_lock(); + guard(rcu)(); for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); } static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, @@ -2218,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->check_preempt_curr(rq, p, flags); + rq->curr->sched_class->wakeup_preempt(rq, p, flags); else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); @@ -2239,31 +2232,21 @@ int __task_state_match(struct task_struct *p, unsigned int state) if (READ_ONCE(p->__state) & state) return 1; -#ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) return -1; -#endif + return 0; } static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { -#ifdef CONFIG_PREEMPT_RT - int match; - /* - * Serialize against current_save_and_set_rtlock_wait_state() and - * current_restore_rtlock_saved_state(). + * Serialize against current_save_and_set_rtlock_wait_state(), + * current_restore_rtlock_saved_state(), and __refrigerator(). */ - raw_spin_lock_irq(&p->pi_lock); - match = __task_state_match(p, state); - raw_spin_unlock_irq(&p->pi_lock); - - return match; -#else + guard(raw_spinlock_irq)(&p->pi_lock); return __task_state_match(p, state); -#endif } /* @@ -2417,10 +2400,9 @@ void migrate_disable(void) return; } - preempt_disable(); + guard(preempt)(); this_rq()->nr_pinned++; p->migration_disabled = 1; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -2444,7 +2426,7 @@ void migrate_enable(void) * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ - preempt_disable(); + guard(preempt)(); if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); /* @@ -2455,7 +2437,6 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -2527,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); return rq; } @@ -3413,7 +3394,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); + wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); rq_unpin_lock(src_rq, &srf); @@ -3520,13 +3501,11 @@ out: */ void kick_process(struct task_struct *p) { - int cpu; + guard(preempt)(); + int cpu = task_cpu(p); - preempt_disable(); - cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); - preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -3789,7 +3768,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } activate_task(rq, p, en_flags); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); ttwu_do_wakeup(p); @@ -3813,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif @@ -3860,7 +3836,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) * it should preempt the task that is current now. */ update_rq_clock(rq); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); ret = 1; @@ -3960,6 +3936,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. + */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* @@ -4040,13 +4028,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. * - * The rules of PREEMPT_RT saved_state: + * The rules of saved_state: * * The related locking code always holds p::pi_lock when updating * p::saved_state, which means the code is fully serialized in both cases. * - * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other - * bits set. This allows to distinguish all wakeup scenarios. + * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. + * No other bits set. This allows to distinguish all wakeup scenarios. + * + * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This + * allows us to prevent early wakeup of tasks before they can be run on + * asymmetric ISA architectures (eg ARMv9). */ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) @@ -4060,13 +4052,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) *success = !!(match = __task_state_match(p, state)); -#ifdef CONFIG_PREEMPT_RT /* * Saved state preserves the task state across blocking on - * an RT lock. If the state matches, set p::saved_state to - * TASK_RUNNING, but do not wake the task because it waits - * for a lock wakeup. Also indicate success because from - * the regular waker's point of view this has succeeded. + * an RT lock or TASK_FREEZABLE tasks. If the state matches, + * set p::saved_state to TASK_RUNNING, but do not wake the task + * because it waits for a lock wakeup or __thaw_task(). Also + * indicate success because from the regular waker's point of + * view this has succeeded. * * After acquiring the lock the task will restore p::__state * from p::saved_state which ensures that the regular @@ -4076,7 +4068,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ if (match < 0) p->saved_state = TASK_RUNNING; -#endif + return match > 0; } @@ -4258,7 +4250,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + * A similar smp_rmb() lives in __task_needs_rq_lock(). */ smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) @@ -4875,7 +4867,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); - check_preempt_curr(rq, p, WF_FORK); + wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5918,8 +5910,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } @@ -6370,8 +6361,9 @@ static void sched_core_balance(struct rq *rq) struct sched_domain *sd; int cpu = cpu_of(rq); - preempt_disable(); - rcu_read_lock(); + guard(preempt)(); + guard(rcu)(); + raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { if (need_resched()) @@ -6381,8 +6373,6 @@ static void sched_core_balance(struct rq *rq) break; } raw_spin_rq_lock_irq(rq); - rcu_read_unlock(); - preempt_enable(); } static DEFINE_PER_CPU(struct balance_callback, core_balance_head); @@ -6721,22 +6711,24 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); unsigned int task_flags; - if (task_is_running(tsk)) - return; + /* + * Establish LD_WAIT_CONFIG context to ensure none of the code called + * will use a blocking primitive -- which would lead to recursion. + */ + lock_map_acquire_try(&sched_map); task_flags = tsk->flags; /* * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. */ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - if (task_flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - else - io_wq_worker_sleeping(tsk); - } + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else if (task_flags & PF_IO_WORKER) + io_wq_worker_sleeping(tsk); /* * spinlock and rwlock must not flush block requests. This will @@ -6750,6 +6742,8 @@ static inline void sched_submit_work(struct task_struct *tsk) * make sure to submit it to avoid deadlocks. */ blk_flush_plug(tsk->plug, true); + + lock_map_release(&sched_map); } static void sched_update_worker(struct task_struct *tsk) @@ -6762,16 +6756,26 @@ static void sched_update_worker(struct task_struct *tsk) } } -asmlinkage __visible void __sched schedule(void) +static __always_inline void __schedule_loop(unsigned int sched_mode) { - struct task_struct *tsk = current; - - sched_submit_work(tsk); do { preempt_disable(); - __schedule(SM_NONE); + __schedule(sched_mode); sched_preempt_enable_no_resched(); } while (need_resched()); +} + +asmlinkage __visible void __sched schedule(void) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_RT_MUTEXES + lockdep_assert(!tsk->sched_rt_mutex); +#endif + + if (!task_is_running(tsk)) + sched_submit_work(tsk); + __schedule_loop(SM_NONE); sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -6835,11 +6839,7 @@ void __sched schedule_preempt_disabled(void) #ifdef CONFIG_PREEMPT_RT void __sched notrace schedule_rtlock(void) { - do { - preempt_disable(); - __schedule(SM_RTLOCK_WAIT); - sched_preempt_enable_no_resched(); - } while (need_resched()); + __schedule_loop(SM_RTLOCK_WAIT); } NOKPROBE_SYMBOL(schedule_rtlock); #endif @@ -7035,6 +7035,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio) #ifdef CONFIG_RT_MUTEXES +/* + * Would be more useful with typeof()/auto_type but they don't mix with + * bit-fields. Since it's a local thing, use int. Keep the generic sounding + * name such that if someone were to implement this function we get to compare + * notes. + */ +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; }) + +void rt_mutex_pre_schedule(void) +{ + lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1)); + sched_submit_work(current); +} + +void rt_mutex_schedule(void) +{ + lockdep_assert(current->sched_rt_mutex); + __schedule_loop(SM_NONE); +} + +void rt_mutex_post_schedule(void) +{ + sched_update_worker(current); + lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); +} + static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) { if (pi_task) @@ -7188,9 +7214,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio; - struct rq_flags rf; struct rq *rq; + int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; @@ -7198,7 +7223,9 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &rf); + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + update_rq_clock(rq); /* @@ -7209,8 +7236,9 @@ void set_user_nice(struct task_struct *p, long nice) */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; + return; } + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7233,9 +7261,6 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ p->sched_class->prio_changed(rq, p, old_prio); - -out_unlock: - task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -7508,6 +7533,21 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -7545,14 +7585,11 @@ static void __setscheduler_params(struct task_struct *p, static bool check_same_owner(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred; - bool match; + guard(rcu)(); - rcu_read_lock(); pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); } /* @@ -7964,27 +8001,17 @@ static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - struct task_struct *p; - int retval; if (!param || pid < 0) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - return retval; + return sched_setscheduler(p, policy, &lparam); } /* @@ -8080,7 +8107,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) { struct sched_attr attr; - struct task_struct *p; int retval; if (!uattr || pid < 0 || flags) @@ -8095,21 +8121,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) attr.sched_policy = SETPARAM_POLICY; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - if (likely(p)) { - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - retval = sched_setattr(p, &attr); - put_task_struct(p); - } + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); - return retval; + return sched_setattr(p, &attr); } /** @@ -8127,16 +8146,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); + guard(rcu)(); p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; } - rcu_read_unlock(); return retval; } @@ -8157,30 +8177,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } /* * This one might sleep, we cannot do it with a spinlock held ... */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; + return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } /* @@ -8240,46 +8253,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif - - rcu_read_unlock(); + } return sched_attr_copy_to_user(uattr, &kattr, usize); - -out_unlock: - rcu_read_unlock(); - return retval; } #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { - int ret = 0; - /* * If the task isn't a deadline task or admission control is * disabled then we don't care about affinity changes. @@ -8293,11 +8298,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) * tasks allowed to run on all the CPUs in the task's * root_domain. */ - rcu_read_lock(); + guard(rcu)(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) - ret = -EBUSY; - rcu_read_unlock(); - return ret; + return -EBUSY; + + return 0; } #endif @@ -8367,39 +8372,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { struct affinity_context ac; struct cpumask *user_mask; - struct task_struct *p; int retval; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - retval = -EPERM; - goto out_put_task; - } - rcu_read_unlock(); + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; } retval = security_task_setscheduler(p); if (retval) - goto out_put_task; + return retval; /* * With non-SMP configs, user_cpus_ptr/user_mask isn't used and @@ -8409,8 +8399,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (user_mask) { cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { - retval = -ENOMEM; - goto out_put_task; + return -ENOMEM; } ac = (struct affinity_context){ @@ -8422,8 +8411,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); -out_put_task: - put_task_struct(p); return retval; } @@ -8465,28 +8452,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; - unsigned long flags; int retval; - rcu_read_lock(); - - retval = -ESRCH; + guard(rcu)(); p = find_process_by_pid(pid); if (!p) - goto out_unlock; + return -ESRCH; retval = security_task_getscheduler(p); if (retval) - goto out_unlock; + return retval; - raw_spin_lock_irqsave(&p->pi_lock, flags); + guard(raw_spinlock_irqsave)(&p->pi_lock); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -out_unlock: - rcu_read_unlock(); - return retval; + return 0; } /** @@ -8933,55 +8913,46 @@ int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; - unsigned long flags; int yielded = 0; - local_irq_save(flags); - rq = this_rq(); + scoped_guard (irqsave) { + rq = this_rq(); again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; - } + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; - if (!curr->sched_class->yield_to_task) - goto out_unlock; + if (!curr->sched_class->yield_to_task) + return 0; - if (curr->sched_class != p->sched_class) - goto out_unlock; + if (curr->sched_class != p->sched_class) + return 0; - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - goto out_unlock; + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } } -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); - - if (yielded > 0) + if (yielded) schedule(); return yielded; @@ -9084,38 +9055,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { - struct task_struct *p; - unsigned int time_slice; - struct rq_flags rf; - struct rq *rq; + unsigned int time_slice = 0; int retval; if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - rq = task_rq_lock(p, &rf); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &rf); + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } - rcu_read_unlock(); jiffies_to_timespec64(time_slice, t); return 0; - -out_unlock: - rcu_read_unlock(); - return retval; } /** @@ -9174,9 +9137,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", - free, task_pid_nr(p), ppid, - read_task_thread_flags(p)); + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + free, task_pid_nr(p), task_tgid_nr(p), + ppid, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -9906,7 +9869,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; +static struct kmem_cache *task_group_cache __ro_after_init; #endif void __init sched_init(void) @@ -10016,7 +9979,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; @@ -10025,8 +9988,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -10292,9 +10253,9 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +#if defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for the IA64 MCA handling, or kdb. + * These functions are only useful for kdb. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -10316,30 +10277,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * ia64_set_curr_task - set the current task for a given CPU. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a CPU in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -void ia64_set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif +#endif /* defined(CONFIG_KGDB_KDB) */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -10501,17 +10439,18 @@ void sched_move_task(struct task_struct *tsk) int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct task_group *group; - struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &rf); + CLASS(task_rq_lock, rq_guard)(tsk); + rq = rq_guard.rq; + /* * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous * group changes. */ group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) - goto unlock; + return; update_rq_clock(rq); @@ -10536,9 +10475,6 @@ void sched_move_task(struct task_struct *tsk) */ resched_curr(rq); } - -unlock: - task_rq_unlock(rq, tsk, &rf); } static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -10575,11 +10511,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); cpu_util_update_eff(css); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); #endif return 0; @@ -10730,8 +10664,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, static_branch_enable(&sched_uclamp_used); - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) @@ -10746,9 +10680,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, /* Update effective clamps to track the most restrictive value */ cpu_util_update_eff(of_css(of)); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); - return nbytes; } @@ -10774,10 +10705,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf, u64 percent; u32 rem; - rcu_read_lock(); - tg = css_tg(seq_css(sf)); - util_clamp = tg->uclamp_req[clamp_id].value; - rcu_read_unlock(); + scoped_guard (rcu) { + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + } if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); @@ -10868,11 +10799,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - cpus_read_lock(); - mutex_lock(&cfs_constraints_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); if (ret) - goto out_unlock; + return ret; runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; @@ -10882,39 +10814,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, */ if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - cfs_b->burst = burst; - __refill_cfs_bandwidth_runtime(cfs_b); + scoped_guard (raw_spinlock_irq, &cfs_b->lock) { + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + cfs_b->burst = burst; - /* Restart the period timer (if active) to handle new period expiry: */ - if (runtime_enabled) - start_cfs_bandwidth(cfs_b); + __refill_cfs_bandwidth_runtime(cfs_b); - raw_spin_unlock_irq(&cfs_b->lock); + /* + * Restart the period timer (if active) to handle new + * period expiry: + */ + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); + } for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - struct rq_flags rf; - rq_lock_irq(rq, &rf); + guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - rq_unlock_irq(rq, &rf); } + if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - cpus_read_unlock(); - return ret; + return 0; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) @@ -11099,7 +11030,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { - int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -11111,11 +11041,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) @@ -11720,14 +11647,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, * are not the last task to be migrated from this cpu for this mm, so * there is no need to move src_cid to the destination cpu. */ - rcu_read_lock(); + guard(rcu)(); src_task = rcu_dereference(src_rq->curr); if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); t->last_mm_cid = -1; return -1; } - rcu_read_unlock(); return src_cid; } @@ -11771,18 +11696,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, * the lazy-put flag, this task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; + scoped_guard (rcu) { + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } } - rcu_read_unlock(); /* * The src_cid is unused, so it can be unset. @@ -11855,7 +11779,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ { struct rq *rq = cpu_rq(cpu); struct task_struct *t; - unsigned long flags; int cid, lazy_cid; cid = READ_ONCE(pcpu_cid->cid); @@ -11890,23 +11813,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ * the lazy-put flag, that task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { - rcu_read_unlock(); - return; + scoped_guard (rcu) { + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) + return; } - rcu_read_unlock(); /* * The cid is unused, so it can be unset. * Disable interrupts to keep the window of cid ownership without rq * lock small. */ - local_irq_save(flags); - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - local_irq_restore(flags); + scoped_guard (irqsave) { + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); + } } static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) @@ -11928,14 +11849,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) * snapshot associated with this cid if an active task using the mm is * observed on this rq. */ - rcu_read_lock(); - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - rcu_read_unlock(); - return; + scoped_guard (rcu) { + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + return; + } } - rcu_read_unlock(); if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) return; @@ -12029,7 +11949,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12037,7 +11956,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12047,13 +11966,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12061,7 +11978,7 @@ void sched_mm_cid_before_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12071,13 +11988,11 @@ void sched_mm_cid_before_execve(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12085,16 +12000,16 @@ void sched_mm_cid_after_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); - rq_unlock_irqrestore(rq, &rf); + scoped_guard (rq_lock_irqsave, rq) { + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + } rseq_set_notify_resume(t); } |