diff options
Diffstat (limited to '')
-rw-r--r-- | kernel/sched/fair.c | 491 |
1 files changed, 274 insertions, 217 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 98d03b34a8..483c137b9d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_thermal_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -157,7 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_fair_sysctl_init(void) @@ -388,8 +381,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -406,7 +399,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq) SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -595,13 +588,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the - * virtual time has non-continguous motion equivalent to: + * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * - * However, since v_i is u64, and the multiplcation could easily overflow + * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 @@ -671,7 +664,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) } if (load) { - /* sign flips effective floor / ceil */ + /* sign flips effective floor / ceiling */ if (avg < 0) avg -= (load - 1); avg = div_s64(avg, load); @@ -727,7 +720,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * - * Note: using 'avg_vruntime() > se->vruntime' is inacurate due + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. */ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) @@ -1030,14 +1023,15 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -1084,7 +1078,7 @@ void post_init_entity_util_avg(struct task_struct *p) if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); sa->util_avg /= (cfs_rq->avg.load_avg + 1); if (sa->util_avg > cap) @@ -1622,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -3296,7 +3290,7 @@ retry_pids: /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || @@ -3307,7 +3301,7 @@ retry_pids: /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ if (!vma_is_accessible(vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); @@ -3339,7 +3333,7 @@ retry_pids: } /* - * Scanning the VMA's of short lived tasks add more overhead. So + * Scanning the VMAs of short lived tasks add more overhead. So * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, @@ -3383,7 +3377,7 @@ retry_pids: /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. @@ -3690,7 +3684,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, /* * VRUNTIME - * ======== + * -------- * * COROLLARY #1: The virtual runtime of the entity needs to be * adjusted if re-weight at !0-lag point. @@ -3773,7 +3767,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, /* * DEADLINE - * ======== + * -------- * * When the weight changes, the virtual time slope changes and * we should adjust the relative virtual deadline accordingly. @@ -3841,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } -void reweight_task(struct task_struct *p, int prio) +void reweight_task(struct task_struct *p, const struct load_weight *lw) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct load_weight *load = &se->load; - unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); - load->inv_weight = sched_prio_to_wmult[prio]; + reweight_entity(cfs_rq, se, lw->weight); + load->inv_weight = lw->inv_weight; } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -4745,7 +4738,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -4828,7 +4821,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { @@ -4971,13 +4964,22 @@ done: trace_sched_util_est_se_tp(&p->se); } +static inline unsigned long get_actual_cpu_capacity(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + static inline int util_fits_cpu(unsigned long util, unsigned long uclamp_min, unsigned long uclamp_max, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; bool fits, uclamp_max_fits; /* @@ -4999,7 +5001,7 @@ static inline int util_fits_cpu(unsigned long util, * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * - * Only exception is for thermal pressure since it has a direct impact + * Only exception is for HW or cpufreq pressure since it has a direct impact * on available OPP of the system. * * We honour it for uclamp_min only as a drop in performance level @@ -5009,7 +5011,6 @@ static inline int util_fits_cpu(unsigned long util, * goal is to cap the task. So it's okay if it's getting less. */ capacity_orig = arch_scale_cpu_capacity(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -5026,14 +5027,14 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * In the above example if a task is capped to a specific performance * point, y, then when: * - * * util = 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util = 80% of y then it is forced to fit on cpu1 to honour + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour * uclamp_max request. * * which is what we're enforcing here. A task always fits if @@ -5064,7 +5065,7 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | (region c, boosted, util < uclamp_min) * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * a) If util > uclamp_max, then we're capped, we don't care about * actual fitness value here. We only care if uclamp_max fits @@ -5084,7 +5085,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) return -1; return fits; @@ -5104,15 +5106,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + int cpu = cpu_of(rq); + if (!sched_asym_cpucap_active()) return; - if (!p || p->nr_cpus_allowed == 1) { - rq->misfit_task_load = 0; - return; - } + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { - if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -5148,7 +5154,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -5254,7 +5260,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; /* - * When joining the competition; the exisiting tasks will be, + * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition. */ @@ -5403,7 +5409,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); @@ -5439,7 +5445,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && @@ -6687,14 +6693,21 @@ static inline bool cpu_overutilized(int cpu) return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } -static inline void set_rd_overutilized_status(struct root_domain *rd, - unsigned int status) +/* + * overutilized value make sense only if EAS is enabled + */ +static inline bool is_rd_overutilized(struct root_domain *rd) +{ + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); +} + +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { if (!sched_energy_enabled()) return; - WRITE_ONCE(rd->overutilized, status); - trace_sched_overutilized_tp(rd, !!status); + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); } static inline void check_update_overutilized_status(struct rq *rq) @@ -6703,11 +6716,9 @@ static inline void check_update_overutilized_status(struct rq *rq) * overutilized field is used for load balancing decisions only * if energy aware scheduler is being used */ - if (!sched_energy_enabled()) - return; - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) - set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) + set_rd_overutilized(rq->rd, 1); } #else static inline void check_update_overutilized_status(struct rq *rq) { } @@ -6898,7 +6909,7 @@ dequeue_throttle: #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); @@ -7130,13 +7141,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -7192,7 +7203,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -7217,13 +7228,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; @@ -7491,7 +7502,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = get_actual_cpu_capacity(cpu); /* * First, select CPU which fits better (-1 being better than 0). @@ -7535,7 +7546,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * On asymmetric system, update task utilization because we will check - * that the task fits with cpu's capacity. + * that the task fits with CPU's capacity. */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); @@ -7887,8 +7898,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. */ eff_util = effective_cpu_util(cpu, util, &min, &max); @@ -7968,7 +7979,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -7984,15 +7995,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; int prev_fits = -1, best_fits = -1; - unsigned long best_thermal_cap = 0; - unsigned long prev_thermal_cap = 0; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) + if (!pd) goto unlock; /* @@ -8015,7 +8026,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; - unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util; long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; unsigned long cur_delta, base_energy; @@ -8027,18 +8038,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpumask_empty(cpus)) continue; - /* Account thermal pressure for the energy estimation */ + /* Account external pressure for the energy estimation */ cpu = cpumask_first(cpus); - cpu_thermal_cap = arch_scale_cpu_capacity(cpu); - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + cpu_actual_cap = get_actual_cpu_capacity(cpu); - eenv.cpu_cap = cpu_thermal_cap; + eenv.cpu_cap = cpu_actual_cap; eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { struct rq *rq = cpu_rq(cpu); - eenv.pd_cap += cpu_thermal_cap; + eenv.pd_cap += cpu_actual_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; @@ -8059,7 +8069,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { /* * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation + * the clamp() part. I.e.: apply max aggregation * only. util_fits_cpu() logic requires to * operate on non clamped util but must use the * max-aggregated uclamp_{min, max}. @@ -8109,7 +8119,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; - prev_thermal_cap = cpu_thermal_cap; + prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); } @@ -8124,7 +8134,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * but best energy cpu has better capacity. */ if ((max_fits < 0) && - (cpu_thermal_cap <= best_thermal_cap)) + (cpu_actual_cap <= best_actual_cap)) continue; cur_delta = compute_energy(&eenv, pd, cpus, p, @@ -8145,14 +8155,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; best_fits = max_fits; - best_thermal_cap = cpu_thermal_cap; + best_actual_cap = cpu_actual_cap; } } rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) target = best_energy_cpu; return target; @@ -8195,7 +8205,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) cpumask_test_cpu(cpu, p->cpus_ptr)) return cpu; - if (sched_energy_enabled()) { + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -8233,7 +8243,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -8279,14 +8289,46 @@ static void task_dead_fair(struct task_struct *p) remove_entity_load_avg(&p->se); } +/* + * Set the max capacity the task is allowed to run at for misfit detection. + */ +static void set_task_max_allowed_capacity(struct task_struct *p) +{ + struct asym_cap_data *entry; + + if (!sched_asym_cpucap_active()) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; + + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); +} + +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) +{ + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} + static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { if (rq->nr_running) return 1; - return newidle_balance(rq, rf) != 0; + return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) @@ -8537,10 +8579,10 @@ idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -8618,7 +8660,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); yield_task_fair(rq); @@ -8956,7 +8998,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - /* Disregard pcpu kthreads; they are where they need to be. */ + /* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0; @@ -9102,7 +9144,7 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; env->loop++; @@ -9277,7 +9319,7 @@ static inline bool others_have_blocked(struct rq *rq) if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; if (cpu_util_irq(rq)) @@ -9307,7 +9349,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; + unsigned long hw_pressure; bool decayed; /* @@ -9316,11 +9358,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done) */ curr_class = rq->curr->sched_class; - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_hw_load_avg(now, rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) @@ -9439,7 +9481,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); @@ -9458,25 +9500,25 @@ static void update_blocked_averages(int cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -9484,19 +9526,18 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. + * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) @@ -9522,8 +9563,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -9535,12 +9576,9 @@ static unsigned long scale_rt_capacity(int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. - * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ used = cpu_util_rt(rq); used += cpu_util_dl(rq); - used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -9633,16 +9671,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -9666,7 +9698,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -9831,7 +9863,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) return false; /* @@ -9855,7 +9887,7 @@ static inline long sibling_imbalance(struct lb_env *env, int ncores_busiest, ncores_local; long imbalance; - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) return 0; ncores_busiest = sds->busiest->cores; @@ -9901,13 +9933,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. - * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { int i, nr_running, local_group; @@ -9928,10 +9962,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (nr_running > 1) - *sg_status |= SG_OVERLOAD; + *sg_overloaded = 1; if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -9953,10 +9987,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + *sg_overloaded = 1; } - } else if ((env->idle != CPU_NOT_IDLE) && - sched_reduced_capacity(rq, env->sd)) { + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ if (sgs->group_misfit_task_load < load) sgs->group_misfit_task_load = load; @@ -9968,7 +10001,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + if (!local_group && env->idle && sgs->sum_h_nr_running && sched_group_asym(env, sgs, group)) sgs->group_asym_packing = 1; @@ -10106,7 +10139,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -10326,13 +10359,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -10580,7 +10613,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; unsigned long sum_util = 0; - int sg_status = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -10596,7 +10629,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -10625,13 +10658,12 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - set_rd_overutilized_status(env->dst_rq->rd, - sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } update_idle_cpu_scan(env, sum_util); @@ -10721,7 +10753,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -10740,7 +10772,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; env->imbalance = max_t(long, 0, @@ -10813,7 +10845,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -10836,7 +10868,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * @@ -10845,7 +10877,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * * Return: - The busiest group if imbalance exists. */ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -10868,12 +10900,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_misfit_task) goto force_balance; - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) @@ -10936,7 +10965,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) { + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already @@ -10984,9 +11013,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -11144,7 +11173,7 @@ asym_active_balance(struct lb_env *env) * the lower priority @env::dst_cpu help it. Do not follow * CPU priority. */ - return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) && + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && (sched_asym_prefer(env->dst_cpu, env->src_cpu) || !sched_use_asym_prio(env->sd, env->src_cpu)); } @@ -11182,7 +11211,7 @@ static int need_active_balance(struct lb_env *env) * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && + if (env->idle && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) @@ -11267,7 +11296,7 @@ static int should_we_balance(struct lb_env *env) * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -11299,13 +11328,13 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; @@ -11323,7 +11352,7 @@ redo: env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. @@ -11436,8 +11465,12 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. + * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -11516,12 +11549,17 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ @@ -11654,10 +11692,23 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) @@ -11695,7 +11746,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -11732,25 +11783,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; @@ -11910,7 +11961,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. * - * When balancing betwen cores, all the SMT siblings of the + * When balancing between cores, all the SMT siblings of the * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { @@ -11927,7 +11978,7 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { + if (check_misfit_status(rq)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12071,7 +12122,7 @@ void nohz_balance_enter_idle(int cpu) out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } @@ -12089,13 +12140,13 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; - update_blocked_averages(cpu); + sched_balance_update_blocked_averages(cpu); return rq->has_blocked_load; } /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ @@ -12171,7 +12222,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -12200,7 +12251,7 @@ abort: /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -12231,7 +12282,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * called from this function on (this) CPU that's not yet in the mask. That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of - * those idle CPUs and outside the preempt disable / irq off phase of the local + * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) @@ -12242,7 +12293,7 @@ void nohz_run_idle_balance(int cpu) /* * Update the blocked load only if no SCHED_SOFTIRQ is about to happen - * (ie NOHZ_STATS_KICK set) and will do the same. + * (i.e. NOHZ_STATS_KICK set) and will do the same. */ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); @@ -12287,7 +12338,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * newidle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -12295,10 +12346,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; @@ -12313,8 +12365,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) return 0; /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); @@ -12335,7 +12388,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!READ_ONCE(this_rq->rd->overload) || + if (!get_rd_overloaded(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) @@ -12349,11 +12402,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); - update_blocked_averages(this_cpu); + sched_balance_update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; u64 domain_cost; update_next_balance(sd, &next_balance); @@ -12363,7 +12415,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -12379,8 +12431,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0 || - this_rq->ttwu_pending) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); @@ -12418,19 +12469,21 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local scheduler_tick() for periodic load balancing + * + * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(struct softirq_action *h) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. @@ -12439,14 +12492,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_update_blocked_averages(this_rq->cpu); + sched_balance_domains(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { /* * Don't need to rebalance while attached to NULL domain or @@ -12650,6 +12703,8 @@ static void task_fork_fair(struct task_struct *p) rq_lock(rq, &rf); update_rq_clock(rq); + set_task_max_allowed_capacity(p); + cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; if (curr) @@ -12773,6 +12828,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) { attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so @@ -13144,7 +13201,7 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, @@ -13224,7 +13281,7 @@ __init void init_sched_fair_class(void) #endif } - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; |