Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c         |  37
-rw-r--r--   kernel/sched/fair.c         | 100
-rw-r--r--   kernel/sched/idle.c         |  22
-rw-r--r--   kernel/sched/isolation.c    |  11
-rw-r--r--   kernel/sched/membarrier.c   |  13
-rw-r--r--   kernel/sched/sched.h        |   2
-rw-r--r--   kernel/sched/topology.c     |  35
7 files changed, 129 insertions, 91 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d3aef62839..d211d40a2e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1792,7 +1792,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 #endif
 
 #ifdef CONFIG_SYSCTL
-#ifdef CONFIG_UCLAMP_TASK
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 static void uclamp_update_root_tg(void)
 {
@@ -1898,7 +1897,6 @@ undo:
 	return result;
 }
 #endif
-#endif
 
 static int uclamp_validate(struct task_struct *p,
 			   const struct sched_attr *attr)
@@ -2065,7 +2063,7 @@ static void __init init_uclamp(void)
 	}
 }
 
-#else /* CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK */
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
 static inline int uclamp_validate(struct task_struct *p,
@@ -3955,6 +3953,17 @@ void wake_up_if_idle(int cpu)
 	}
 }
 
+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+	if (!sched_asym_cpucap_active())
+		return true;
+
+	if (this_cpu == that_cpu)
+		return true;
+
+	return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
+}
+
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	if (this_cpu == that_cpu)
@@ -6638,7 +6647,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	 *   if (signal_pending_state())	    if (p->state & @state)
 	 *
 	 * Also, the membarrier system call requires a full memory barrier
-	 * after coming from user-space, before storing to rq->curr.
+	 * after coming from user-space, before storing to rq->curr; this
+	 * barrier matches a full barrier in the proximity of the membarrier
+	 * system call exit.
 	 */
 	rq_lock(rq, &rf);
 	smp_mb__after_spinlock();
@@ -6709,12 +6720,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		 *
 		 * Here are the schemes providing that barrier on the
 		 * various architectures:
-		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
-		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+		 *   RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+		 *   on PowerPC and on RISC-V.
 		 * - finish_lock_switch() for weakly-ordered
 		 *   architectures where spin_unlock is a full barrier,
 		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
 		 *   is a RELEASE barrier),
+		 *
+		 * The barrier matches a full barrier in the proximity of
+		 * the membarrier system call entry.
+		 *
+		 * On RISC-V, this barrier pairing is also needed for the
+		 * SYNC_CORE command when switching between processes, cf.
+		 * the inline comments in membarrier_arch_switch_mm().
 		 */
 		++*switch_count;
 
@@ -6787,10 +6806,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+		if (tsk->flags & PF_BLOCK_TS)
+			blk_plug_invalidate_ts(tsk);
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
-		else
+		else if (tsk->flags & PF_IO_WORKER)
 			io_wq_worker_running(tsk);
 	}
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a01269ed96..213c94d027 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1837,6 +1837,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 	int last_cpupid, this_cpupid;
 
 	/*
+	 * Cannot migrate to memoryless nodes.
+	 */
+	if (!node_state(dst_nid, N_MEMORY))
+		return false;
+
+	/*
 	 * The pages in slow memory node should be migrated according
 	 * to hot/cold instead of private/shared.
 	 */
@@ -9269,19 +9275,17 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
 
 static inline bool others_have_blocked(struct rq *rq)
 {
-	if (READ_ONCE(rq->avg_rt.util_avg))
+	if (cpu_util_rt(rq))
 		return true;
 
-	if (READ_ONCE(rq->avg_dl.util_avg))
+	if (cpu_util_dl(rq))
 		return true;
 
 	if (thermal_load_avg(rq))
 		return true;
 
-#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
-	if (READ_ONCE(rq->avg_irq.util_avg))
+	if (cpu_util_irq(rq))
 		return true;
-#endif
 
 	return false;
 }
@@ -9538,8 +9542,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * avg_thermal.load_avg tracks thermal pressure and the weighted
 	 * average uses the actual delta max capacity(load).
 	 */
-	used = READ_ONCE(rq->avg_rt.util_avg);
-	used += READ_ONCE(rq->avg_dl.util_avg);
+	used = cpu_util_rt(rq);
+	used += cpu_util_dl(rq);
 	used += thermal_load_avg(rq);
 
 	if (unlikely(used >= max))
@@ -9772,51 +9776,49 @@ group_type group_classify(unsigned int imbalance_pct,
  */
 static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
 {
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return false;
+
 	if (!sched_smt_active())
 		return true;
 
 	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
 }
 
+static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
+{
+	/*
+	 * First check if @dst_cpu can do asym_packing load balance. Only do it
+	 * if it has higher priority than @src_cpu.
+	 */
+	return sched_use_asym_prio(sd, dst_cpu) &&
+	       sched_asym_prefer(dst_cpu, src_cpu);
+}
+
 /**
- * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * sched_group_asym - Check if the destination CPU can do asym_packing balance
  * @env: The load balancing environment
- * @sds: Load-balancing data with statistics of the local group
  * @sgs: Load-balancing statistics of the candidate busiest group
  * @group: The candidate busiest group
 *
  * @env::dst_cpu can do asym_packing if it has higher priority than the
  * preferred CPU of @group.
  *
- * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
- * can do asym_packing balance only if all its SMT siblings are idle. Also, it
- * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
- * imbalances in the number of CPUS are dealt with in find_busiest_group().
- *
- * If we are balancing load within an SMT core, or at PKG domain level, always
- * proceed.
- *
 * Return: true if @env::dst_cpu can do with asym_packing load balance. False
  * otherwise.
  */
 static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
-	   struct sched_group *group)
+sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
 {
-	/* Ensure that the whole local core is idle, if applicable. */
-	if (!sched_use_asym_prio(env->sd, env->dst_cpu))
-		return false;
-
 	/*
-	 * CPU priorities does not make sense for SMT cores with more than one
+	 * CPU priorities do not make sense for SMT cores with more than one
 	 * busy sibling.
 	 */
-	if (group->flags & SD_SHARE_CPUCAPACITY) {
-		if (sgs->group_weight - sgs->idle_cpus != 1)
-			return false;
-	}
+	if ((group->flags & SD_SHARE_CPUCAPACITY) &&
+	    (sgs->group_weight - sgs->idle_cpus != 1))
+		return false;
 
-	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+	return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
 }
 
 /* One group has more than one SMT CPU while the other group does not */
@@ -9970,11 +9972,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	sgs->group_weight = group->group_weight;
 
 	/* Check if dst CPU is idle and preferred to this group */
-	if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
-	    env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
-	    sched_asym(env, sds, sgs, group)) {
+	if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+	    sched_group_asym(env, sgs, group))
 		sgs->group_asym_packing = 1;
-	}
 
 	/* Check for loaded SMT group to be balanced to dst CPU */
 	if (!local_group && smt_balance(env, sgs, group))
@@ -10038,9 +10038,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	switch (sgs->group_type) {
 	case group_overloaded:
 		/* Select the overloaded group with highest avg_load. */
-		if (sgs->avg_load <= busiest->avg_load)
-			return false;
-		break;
+		return sgs->avg_load > busiest->avg_load;
 
 	case group_imbalanced:
 		/*
@@ -10051,18 +10049,14 @@
 
 	case group_asym_packing:
 		/* Prefer to move from lowest priority CPU's work */
-		if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
-			return false;
-		break;
+		return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
 
 	case group_misfit_task:
 		/*
 		 * If we have more than one misfit sg go with the biggest
 		 * misfit.
 		 */
-		if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
-			return false;
-		break;
+		return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
 
 	case group_smt_balance:
 		/*
@@ -10214,10 +10208,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
 	 * be computed and tested before calling idle_cpu_without().
 	 */
 
-#ifdef CONFIG_SMP
 	if (rq->ttwu_pending)
 		return 0;
-#endif
 
 	return 1;
 }
@@ -10610,16 +10602,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 		update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
 
-		if (local_group)
-			goto next_group;
-
-
-		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
+		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}
 
-next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
@@ -10718,7 +10705,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 */
 	if (local->group_type == group_has_spare) {
 		if ((busiest->group_type > group_fully_busy) &&
-		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
+		    !(env->sd->flags & SD_SHARE_LLC)) {
 			/*
 			 * If busiest is overloaded, try to fill spare
 			 * capacity. This might end up creating spare capacity
@@ -11065,10 +11052,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * If balancing between cores, let lower priority CPUs help
 		 * SMT cores with more than one busy sibling.
 		 */
-		if ((env->sd->flags & SD_ASYM_PACKING) &&
-		    sched_use_asym_prio(env->sd, i) &&
-		    sched_asym_prefer(i, env->dst_cpu) &&
-		    nr_running == 1)
+		if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
 			continue;
 
 		switch (env->migration_type) {
@@ -11164,8 +11148,7 @@ asym_active_balance(struct lb_env *env)
 	 * the lower priority @env::dst_cpu help it. Do not follow
 	 * CPU priority.
 	 */
-	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
-	       sched_use_asym_prio(env->sd, env->dst_cpu) &&
+	return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
 	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
 		!sched_use_asym_prio(env->sd, env->src_cpu));
 }
@@ -11937,8 +11920,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * preferred CPU must be idle.
 		 */
 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
-			if (sched_use_asym_prio(sd, i) &&
-			    sched_asym_prefer(i, cpu)) {
+			if (sched_asym(sd, i, cpu)) {
 				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
 			}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 31231925f1..6135fbe83d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -81,6 +81,25 @@ void __weak arch_cpu_idle(void)
 	cpu_idle_force_poll = 1;
 }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+DEFINE_STATIC_KEY_FALSE(arch_needs_tick_broadcast);
+
+static inline void cond_tick_broadcast_enter(void)
+{
+	if (static_branch_unlikely(&arch_needs_tick_broadcast))
+		tick_broadcast_enter();
+}
+
+static inline void cond_tick_broadcast_exit(void)
+{
+	if (static_branch_unlikely(&arch_needs_tick_broadcast))
+		tick_broadcast_exit();
+}
+#else
+static inline void cond_tick_broadcast_enter(void) { }
+static inline void cond_tick_broadcast_exit(void) { }
+#endif
+
 /**
  * default_idle_call - Default CPU idle routine.
  *
@@ -90,6 +109,7 @@ void __cpuidle default_idle_call(void)
 {
 	instrumentation_begin();
 	if (!current_clr_polling_and_test()) {
+		cond_tick_broadcast_enter();
 		trace_cpu_idle(1, smp_processor_id());
 		stop_critical_timings();
 
@@ -99,6 +119,7 @@ void __cpuidle default_idle_call(void)
 
 		start_critical_timings();
 		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+		cond_tick_broadcast_exit();
 	}
 	local_irq_enable();
 	instrumentation_end();
@@ -291,7 +312,6 @@ static void do_idle(void)
 		local_irq_disable();
 
 		if (cpu_is_offline(cpu)) {
-			tick_nohz_idle_stop_tick();
 			cpuhp_report_idle_dead();
 			arch_cpu_idle_dead();
 		}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 82e2f7fc7c..5891e715f0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -46,7 +46,16 @@ int housekeeping_any_cpu(enum hk_type type)
 			if (cpu < nr_cpu_ids)
 				return cpu;
 
-			return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+			cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+			if (likely(cpu < nr_cpu_ids))
+				return cpu;
+			/*
+			 * Unless we have another problem this can only happen
+			 * at boot time before start_secondary() brings the 1st
+			 * housekeeping CPU up.
+			 */
+			WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
+				     type != HK_TYPE_TIMER);
 		}
 	}
 	return smp_processor_id();
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 4e715b9b27..809194cd77 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -254,7 +254,7 @@ static int membarrier_global_expedited(void)
 		return 0;
 
 	/*
-	 * Matches memory barriers around rq->curr modification in
+	 * Matches memory barriers after rq->curr modification in
 	 * scheduler.
 	 */
 	smp_mb();	/* system call entry is not a mb. */
@@ -304,7 +304,7 @@ static int membarrier_global_expedited(void)
 
 	/*
 	 * Memory barrier on the caller thread _after_ we finished
-	 * waiting for the last IPI. Matches memory barriers around
+	 * waiting for the last IPI. Matches memory barriers before
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
@@ -324,6 +324,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
 		ipi_func = ipi_sync_core;
+		prepare_sync_core_cmd(mm);
 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
 		if (!IS_ENABLED(CONFIG_RSEQ))
 			return -EINVAL;
@@ -343,8 +344,12 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 		return 0;
 
 	/*
-	 * Matches memory barriers around rq->curr modification in
+	 * Matches memory barriers after rq->curr modification in
 	 * scheduler.
+	 *
+	 * On RISC-V, this barrier pairing is also needed for the
+	 * SYNC_CORE command when switching between processes, cf.
+	 * the inline comments in membarrier_arch_switch_mm().
 	 */
 	smp_mb();	/* system call entry is not a mb. */
 
@@ -420,7 +425,7 @@ out:
 
 	/*
 	 * Memory barrier on the caller thread _after_ we finished
-	 * waiting for the last IPI. Matches memory barriers around
+	 * waiting for the last IPI. Matches memory barriers before
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8c817d0a92..ae50f21277 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3138,7 +3138,7 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 static inline unsigned long cpu_util_irq(struct rq *rq)
 {
-	return rq->avg_irq.util_avg;
+	return READ_ONCE(rq->avg_irq.util_avg);
 }
 
 static inline
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4fdab14953..3127c9b30a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -657,13 +657,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
 }
 
 /*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
+ * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set
+ * (Last Level Cache Domain) for this allows us to avoid some pointer chasing
+ * select_idle_sibling().
  *
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * Also keep a unique ID per domain (we use the first CPU number in the cpumask
+ * of the domain), this allows us to quickly tell if two CPUs are in the same
+ * cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
@@ -684,7 +684,7 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;
 
-	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
@@ -1551,11 +1551,12 @@ static struct cpumask ***sched_domains_numa_masks;
  *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
- * function:
+ * function. For details, see include/linux/sched/sd_flags.h.
 *
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_CPUCAPACITY
+ * SD_SHARE_LLC
+ * SD_CLUSTER
+ * SD_NUMA
 *
 * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
@@ -1565,7 +1566,7 @@ static struct cpumask ***sched_domains_numa_masks;
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_CLUSTER		|	\
-	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_SHARE_LLC		|	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
 
@@ -1608,7 +1609,7 @@ sd_init(struct sched_domain_topology_level *tl,
 			| 0*SD_BALANCE_WAKE
 			| 1*SD_WAKE_AFFINE
 			| 0*SD_SHARE_CPUCAPACITY
-			| 0*SD_SHARE_PKG_RESOURCES
+			| 0*SD_SHARE_LLC
 			| 0*SD_SERIALIZE
 			| 1*SD_PREFER_SIBLING
 			| 0*SD_NUMA
@@ -1645,7 +1646,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
 
-	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+	} else if (sd->flags & SD_SHARE_LLC) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
 
@@ -1670,7 +1671,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	 * For all levels sharing cache; connect a sched_domain_shared
 	 * instance.
 	 */
-	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+	if (sd->flags & SD_SHARE_LLC) {
 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
 		atomic_inc(&sd->shared->ref);
 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
@@ -2445,8 +2446,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			struct sched_domain *child = sd->child;
 
-			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
-			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
+			if (!(sd->flags & SD_SHARE_LLC) && child &&
+			    (child->flags & SD_SHARE_LLC)) {
 				struct sched_domain __rcu *top_p;
 				unsigned int nr_llcs;
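
For reference, the cpus_equal_capacity() helper added to kernel/sched/core.c above can be tried out in isolation. Below is a minimal, self-contained userspace C sketch of its logic; sched_asym_cpucap_active() and arch_scale_cpu_capacity() here are made-up stubs standing in for the kernel's implementations, and the capacity values are purely illustrative, not taken from this commit.

#include <stdbool.h>
#include <stdio.h>

/* Stub: pretend the system has asymmetric CPU capacities (hypothetical). */
static bool sched_asym_cpucap_active(void)
{
	return true;
}

/* Stub: CPUs 0-3 are "little" cores, CPUs 4-7 are "big" cores (made up). */
static unsigned long arch_scale_cpu_capacity(int cpu)
{
	return cpu < 4 ? 512 : 1024;
}

/* Same shape as the helper added to kernel/sched/core.c in this diff. */
static bool cpus_equal_capacity(int this_cpu, int that_cpu)
{
	if (!sched_asym_cpucap_active())
		return true;

	if (this_cpu == that_cpu)
		return true;

	return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
}

int main(void)
{
	printf("cpu0 vs cpu1: %d\n", cpus_equal_capacity(0, 1)); /* 1: both little */
	printf("cpu0 vs cpu5: %d\n", cpus_equal_capacity(0, 5)); /* 0: little vs big */
	printf("cpu6 vs cpu6: %d\n", cpus_equal_capacity(6, 6)); /* 1: same CPU */
	return 0;
}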