summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:17:52 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:17:52 +0000
commit3afb00d3f86d3d924f88b56fa8285d4e9db85852 (patch)
tree95a985d3019522cea546b7d8df621369bc44fc6c /kernel/sched
parentAdding debian version 6.9.12-1. (diff)
downloadlinux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.tar.xz
linux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.zip
Merging upstream version 6.10.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/autogroup.c1
-rw-r--r--kernel/sched/core.c59
-rw-r--r--kernel/sched/cputime.c13
-rw-r--r--kernel/sched/deadline.c1
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c491
-rw-r--r--kernel/sched/loadavg.c2
-rw-r--r--kernel/sched/pelt.c22
-rw-r--r--kernel/sched/pelt.h16
-rw-r--r--kernel/sched/psi.c21
-rw-r--r--kernel/sched/rt.c1
-rw-r--r--kernel/sched/sched.h74
-rw-r--r--kernel/sched/stats.c5
-rw-r--r--kernel/sched/stats.h11
-rw-r--r--kernel/sched/topology.c63
15 files changed, 416 insertions, 365 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 991fc90025..db68a964e3 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -19,7 +19,6 @@ static struct ctl_table sched_autogroup_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static void __init sched_autogroup_sysctl_init(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d211d40a2e..ebf21373f6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
-EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -1327,27 +1326,24 @@ int tg_nop(struct task_group *tg, void *data)
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
- struct load_weight *load = &p->se.load;
+ struct load_weight lw;
- /*
- * SCHED_IDLE tasks get minimal weight:
- */
if (task_has_idle_policy(p)) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
- return;
+ lw.weight = scale_load(WEIGHT_IDLEPRIO);
+ lw.inv_weight = WMULT_IDLEPRIO;
+ } else {
+ lw.weight = scale_load(sched_prio_to_weight[prio]);
+ lw.inv_weight = sched_prio_to_wmult[prio];
}
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
- if (update_load && p->sched_class == &fair_sched_class) {
- reweight_task(p, prio);
- } else {
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
- }
+ if (update_load && p->sched_class == &fair_sched_class)
+ reweight_task(p, &lw);
+ else
+ p->se.load = lw;
}
#ifdef CONFIG_UCLAMP_TASK
@@ -4467,12 +4463,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
* @cpu: The CPU on which to snapshot the task.
*
* Returns the task_struct pointer of the task "currently" running on
- * the specified CPU. If the same task is running on that CPU throughout,
- * the return value will be a pointer to that task's task_struct structure.
- * If the CPU did any context switches even vaguely concurrently with the
- * execution of this function, the return value will be a pointer to the
- * task_struct structure of a randomly chosen task that was running on
- * that CPU somewhere around the time that this function was executing.
+ * the specified CPU.
*
* If the specified CPU was offline, the return value is whatever it
* is, perhaps a pointer to the task_struct structure of that CPU's idle
@@ -4486,11 +4477,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
*/
struct task_struct *cpu_curr_snapshot(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
+ struct rq_flags rf;
- smp_mb(); /* Pairing determined by caller's synchronization design. */
+ rq_lock_irqsave(rq, &rf);
+ smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */
t = rcu_dereference(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &rf);
smp_mb(); /* Pairing determined by caller's synchronization design. */
+
return t;
}
@@ -4741,7 +4737,6 @@ static struct ctl_table sched_core_sysctls[] = {
.extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
- {}
};
static int __init sched_core_sysctl_init(void)
{
@@ -5662,13 +5657,13 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(void)
+void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr;
struct rq_flags rf;
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
u64 resched_latency;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@ -5678,9 +5673,12 @@ void scheduler_tick(void)
rq_lock(rq, &rf);
+ curr = rq->curr;
+ psi_account_irqtime(rq, curr, NULL);
+
update_rq_clock(rq);
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
@@ -5700,7 +5698,7 @@ void scheduler_tick(void)
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
+ sched_balance_trigger(rq);
#endif
}
@@ -6585,7 +6583,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
+ * interrupt handler sched_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
@@ -6738,6 +6736,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e..aa48b2ec87 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -424,19 +424,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_kernel(prev);
-
- vtime_flush(prev);
- arch_vtime_task_switch(prev);
-}
-# endif
-
void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
unsigned int pc = irq_count() - offset;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index dce51bf2d3..9bedd148f0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = {
.proc_handler = proc_douintvec_minmax,
.extra2 = (void *)&sysctl_sched_dl_period_max,
},
- {}
};
static int __init sched_dl_sysctl_init(void)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a583..c1eb9a1afd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -425,6 +425,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
+ debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
}
void update_sched_domain_debugfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 98d03b34a8..483c137b9d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
- int _shift = 0;
-
- if (kstrtoint(str, 0, &_shift))
- pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
-
- sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -157,7 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = {
.extra1 = SYSCTL_ZERO,
},
#endif /* CONFIG_NUMA_BALANCING */
- {}
};
static int __init sched_fair_sysctl_init(void)
@@ -388,8 +381,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
/*
* With cfs_rq being unthrottled/throttled during an enqueue,
- * it can happen the tmp_alone_branch points the a leaf that
- * we finally want to del. In this case, tmp_alone_branch moves
+ * it can happen the tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
* to the prev element but it will point to rq->leaf_cfs_rq_list
* at the end of the enqueue.
*/
@@ -406,7 +399,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
@@ -595,13 +588,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* [[ NOTE: this is only equal to the ideal scheduler under the condition
* that join/leave operations happen at lag_i = 0, otherwise the
- * virtual time has non-continguous motion equivalent to:
+ * virtual time has non-contiguous motion equivalent to:
*
* V +-= lag_i / W
*
* Also see the comment in place_entity() that deals with this. ]]
*
- * However, since v_i is u64, and the multiplcation could easily overflow
+ * However, since v_i is u64, and the multiplication could easily overflow
* transform it into a relative form that uses smaller quantities:
*
* Substitute: v_i == (v_i - v0) + v0
@@ -671,7 +664,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
}
if (load) {
- /* sign flips effective floor / ceil */
+ /* sign flips effective floor / ceiling */
if (avg < 0)
avg -= (load - 1);
avg = div_s64(avg, load);
@@ -727,7 +720,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
*
- * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
*/
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -1030,14 +1023,15 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
*
- * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
+ * * se_weight(se)
*
* However, in many cases, the above util_avg does not give a desired
* value. Moreover, the sum of the util_avgs may be divergent, such
@@ -1084,7 +1078,7 @@ void post_init_entity_util_avg(struct task_struct *p)
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
sa->util_avg /= (cfs_rq->avg.load_avg + 1);
if (sa->util_avg > cap)
@@ -1622,7 +1616,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
- * which should be ok given the number of nodes rarely exceeds 8.
+ * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
@@ -3296,7 +3290,7 @@ retry_pids:
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
+ * hinting faults in read-only file-backed mappings or the vDSO
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
@@ -3307,7 +3301,7 @@ retry_pids:
/*
* Skip inaccessible VMAs to avoid any confusion between
- * PROT_NONE and NUMA hinting ptes
+ * PROT_NONE and NUMA hinting PTEs
*/
if (!vma_is_accessible(vma)) {
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
@@ -3339,7 +3333,7 @@ retry_pids:
}
/*
- * Scanning the VMA's of short lived tasks add more overhead. So
+ * Scanning the VMAs of short lived tasks add more overhead. So
* delay the scan for new VMAs.
*/
if (mm->numa_scan_seq && time_before(jiffies,
@@ -3383,7 +3377,7 @@ retry_pids:
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
- * is not already pte-numa. If the VMA contains
+ * is not already PTE-numa. If the VMA contains
* areas that are unused or already full of prot_numa
* PTEs, scan up to virtpages, to skip through those
* areas faster.
@@ -3690,7 +3684,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
/*
* VRUNTIME
- * ========
+ * --------
*
* COROLLARY #1: The virtual runtime of the entity needs to be
* adjusted if re-weight at !0-lag point.
@@ -3773,7 +3767,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
/*
* DEADLINE
- * ========
+ * --------
*
* When the weight changes, the virtual time slope changes and
* we should adjust the relative virtual deadline accordingly.
@@ -3841,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
}
-void reweight_task(struct task_struct *p, int prio)
+void reweight_task(struct task_struct *p, const struct load_weight *lw)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct load_weight *load = &se->load;
- unsigned long weight = scale_load(sched_prio_to_weight[prio]);
- reweight_entity(cfs_rq, se, weight);
- load->inv_weight = sched_prio_to_wmult[prio];
+ reweight_entity(cfs_rq, se, lw->weight);
+ load->inv_weight = lw->inv_weight;
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -4745,7 +4738,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
+ * track group sched_entity load average for task_h_load calculation in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -4828,7 +4821,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
@@ -4971,13 +4964,22 @@ done:
trace_sched_util_est_se_tp(&p->se);
}
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
+
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
- unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
bool fits, uclamp_max_fits;
/*
@@ -4999,7 +5001,7 @@ static inline int util_fits_cpu(unsigned long util,
* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
- * Only exception is for thermal pressure since it has a direct impact
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
* on available OPP of the system.
*
* We honour it for uclamp_min only as a drop in performance level
@@ -5009,7 +5011,6 @@ static inline int util_fits_cpu(unsigned long util,
* goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5026,14 +5027,14 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | |
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* In the above example if a task is capped to a specific performance
* point, y, then when:
*
- * * util = 80% of x then it does not fit on cpu0 and should migrate
- * to cpu1
- * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * * util = 80% of x then it does not fit on CPU0 and should migrate
+ * to CPU1
+ * * util = 80% of y then it is forced to fit on CPU1 to honour
* uclamp_max request.
*
* which is what we're enforcing here. A task always fits if
@@ -5064,7 +5065,7 @@ static inline int util_fits_cpu(unsigned long util,
* | | | | | | |
* | | | | | | | (region c, boosted, util < uclamp_min)
* +----------------------------------------
- * cpu0 cpu1 cpu2
+ * CPU0 CPU1 CPU2
*
* a) If util > uclamp_max, then we're capped, we don't care about
* actual fitness value here. We only care if uclamp_max fits
@@ -5084,7 +5085,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
return fits;
@@ -5104,15 +5106,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;
- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already? Or do we fit into this CPU ?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {
- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -5148,7 +5154,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -5254,7 +5260,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
/*
- * When joining the competition; the exisiting tasks will be,
+ * When joining the competition; the existing tasks will be,
* on average, halfway through their slice, as such start tasks
* off with half a slice to ease into the competition.
*/
@@ -5403,7 +5409,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- ie. we'll be penalized.
+ * further than we started -- i.e. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
@@ -5439,7 +5445,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
+ * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
@@ -6687,14 +6693,21 @@ static inline bool cpu_overutilized(int cpu)
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
-static inline void set_rd_overutilized_status(struct root_domain *rd,
- unsigned int status)
+/*
+ * overutilized value make sense only if EAS is enabled
+ */
+static inline bool is_rd_overutilized(struct root_domain *rd)
+{
+ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
+}
+
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
{
if (!sched_energy_enabled())
return;
- WRITE_ONCE(rd->overutilized, status);
- trace_sched_overutilized_tp(rd, !!status);
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
}
static inline void check_update_overutilized_status(struct rq *rq)
@@ -6703,11 +6716,9 @@ static inline void check_update_overutilized_status(struct rq *rq)
* overutilized field is used for load balancing decisions only
* if energy aware scheduler is being used
*/
- if (!sched_energy_enabled())
- return;
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
- set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED);
+ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized(rq->rd, 1);
}
#else
static inline void check_update_overutilized_status(struct rq *rq) { }
@@ -6898,7 +6909,7 @@ dequeue_throttle:
#ifdef CONFIG_SMP
-/* Working cpumask for: load_balance, load_balance_newidle. */
+/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
@@ -7130,13 +7141,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
- * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
-find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -7192,7 +7203,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
@@ -7217,13 +7228,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu);
+ group = sched_balance_find_dst_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_group_cpu(group, p, cpu);
+ new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
@@ -7491,7 +7502,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = get_actual_cpu_capacity(cpu);
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7535,7 +7546,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/*
* On asymmetric system, update task utilization because we will check
- * that the task fits with cpu's capacity.
+ * that the task fits with CPU's capacity.
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
@@ -7887,8 +7898,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* Performance domain frequency: utilization clamping
* must be considered since it affects the selection
* of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the
- * FREQUENCY_UTIL's utilization can be max OPP.
+ * NOTE: in case RT tasks are running, by default the min
+ * utilization can be max OPP.
*/
eff_util = effective_cpu_util(cpu, util, &min, &max);
@@ -7968,7 +7979,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
@@ -7984,15 +7995,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
- unsigned long best_thermal_cap = 0;
- unsigned long prev_thermal_cap = 0;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
- if (!pd || READ_ONCE(rd->overutilized))
+ if (!pd)
goto unlock;
/*
@@ -8015,7 +8026,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -8027,18 +8038,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
- /* Account thermal pressure for the energy estimation */
+ /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
- cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_thermal_cap;
+ eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
- eenv.pd_cap += cpu_thermal_cap;
+ eenv.pd_cap += cpu_actual_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8059,7 +8069,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
/*
* Open code uclamp_rq_util_with() except for
- * the clamp() part. Ie: apply max aggregation
+ * the clamp() part. I.e.: apply max aggregation
* only. util_fits_cpu() logic requires to
* operate on non clamped util but must use the
* max-aggregated uclamp_{min, max}.
@@ -8109,7 +8119,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
- prev_thermal_cap = cpu_thermal_cap;
+ prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
@@ -8124,7 +8134,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* but best energy cpu has better capacity.
*/
if ((max_fits < 0) &&
- (cpu_thermal_cap <= best_thermal_cap))
+ (cpu_actual_cap <= best_actual_cap))
continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8145,14 +8155,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
best_fits = max_fits;
- best_thermal_cap = cpu_thermal_cap;
+ best_actual_cap = cpu_actual_cap;
}
}
rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
target = best_energy_cpu;
return target;
@@ -8195,7 +8205,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
cpumask_test_cpu(cpu, p->cpus_ptr))
return cpu;
- if (sched_energy_enabled()) {
+ if (!is_rd_overutilized(this_rq()->rd)) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -8233,7 +8243,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
if (unlikely(sd)) {
/* Slow path */
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@ -8279,14 +8289,46 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}
+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
if (rq->nr_running)
return 1;
- return newidle_balance(rq, rf) != 0;
+ return sched_balance_newidle(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
@@ -8537,10 +8579,10 @@ idle:
if (!rf)
return NULL;
- new_tasks = newidle_balance(rq, rf);
+ new_tasks = sched_balance_newidle(rq, rf);
/*
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+ * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -8618,7 +8660,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
+ /* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
@@ -8956,7 +8998,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- /* Disregard pcpu kthreads; they are where they need to be. */
+ /* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
@@ -9102,7 +9144,7 @@ static int detach_tasks(struct lb_env *env)
* We don't want to steal all, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ if (env->idle && env->src_rq->nr_running <= 1)
break;
env->loop++;
@@ -9277,7 +9319,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (cpu_util_dl(rq))
return true;
- if (thermal_load_avg(rq))
+ if (hw_load_avg(rq))
return true;
if (cpu_util_irq(rq))
@@ -9307,7 +9349,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
bool decayed;
/*
@@ -9316,11 +9358,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
*/
curr_class = rq->curr->sched_class;
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+ update_hw_load_avg(now, rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
@@ -9439,7 +9481,7 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
-static void update_blocked_averages(int cpu)
+static void sched_balance_update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
@@ -9458,25 +9500,25 @@ static void update_blocked_averages(int cpu)
rq_unlock_irqrestore(rq, &rf);
}
-/********** Helpers for find_busiest_group ************************/
+/********** Helpers for sched_balance_find_src_group ************************/
/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
+ * sg_lb_stats - stats of a sched_group required for load-balancing:
*/
struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long group_capacity;
- unsigned long group_util; /* Total utilization over the CPUs of the group */
- unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
- unsigned int sum_nr_running; /* Nr of tasks running in the group */
- unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
- unsigned int idle_cpus;
+ unsigned long avg_load; /* Avg load over the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long group_capacity; /* Capacity over the CPUs of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of all tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
+ unsigned int idle_cpus; /* Nr of idle CPUs in the group */
unsigned int group_weight;
enum group_type group_type;
- unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
- unsigned int group_smt_balance; /* Task on busy SMT be moved */
- unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -9484,19 +9526,18 @@ struct sg_lb_stats {
};
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
+ * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/
struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *local; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_capacity; /* Total capacity of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
- unsigned int prefer_sibling; /* tasks should go to sibling first */
-
- struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- struct sg_lb_stats local_stat; /* Statistics of the local group */
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* Tasks should go to sibling first */
+
+ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -9522,8 +9563,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -9535,12 +9576,9 @@ static unsigned long scale_rt_capacity(int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
used = cpu_util_rt(rq);
used += cpu_util_dl(rq);
- used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
@@ -9633,16 +9671,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}
/*
@@ -9666,7 +9698,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -9831,7 +9863,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
struct sched_group *group)
{
- if (env->idle == CPU_NOT_IDLE)
+ if (!env->idle)
return false;
/*
@@ -9855,7 +9887,7 @@ static inline long sibling_imbalance(struct lb_env *env,
int ncores_busiest, ncores_local;
long imbalance;
- if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ if (!env->idle || !busiest->sum_nr_running)
return 0;
ncores_busiest = sds->busiest->cores;
@@ -9901,13 +9933,15 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
- * @sg_status: Holds flag indicating the status of the sched_group
+ * @sg_overloaded: sched_group is overloaded
+ * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- int *sg_status)
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
int i, nr_running, local_group;
@@ -9928,10 +9962,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->sum_nr_running += nr_running;
if (nr_running > 1)
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
if (cpu_overutilized(i))
- *sg_status |= SG_OVERUTILIZED;
+ *sg_overutilized = 1;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -9953,10 +9987,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ *sg_overloaded = 1;
}
- } else if ((env->idle != CPU_NOT_IDLE) &&
- sched_reduced_capacity(rq, env->sd)) {
+ } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
/* Check for a task running on a CPU with reduced capacity */
if (sgs->group_misfit_task_load < load)
sgs->group_misfit_task_load = load;
@@ -9968,7 +10001,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
/* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ if (!local_group && env->idle && sgs->sum_h_nr_running &&
sched_group_asym(env, sgs, group))
sgs->group_asym_packing = 1;
@@ -10106,7 +10139,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
has_spare:
/*
- * Select not overloaded group with lowest number of idle cpus
+ * Select not overloaded group with lowest number of idle CPUs
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
* that the group has less spare capacity but finally more idle
@@ -10326,13 +10359,13 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * find_idlest_group() finds and returns the least busy CPU group within the
+ * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -10580,7 +10613,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
unsigned long sum_util = 0;
- int sg_status = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;
do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -10596,7 +10629,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -10625,13 +10658,12 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
- WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
/* Update over-utilization (tipping point, U >= 0) indicator */
- set_rd_overutilized_status(env->dst_rq->rd,
- sg_status & SG_OVERUTILIZED);
- } else if (sg_status & SG_OVERUTILIZED) {
- set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED);
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
update_idle_cpu_scan(env, sum_util);
@@ -10721,7 +10753,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
- if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ if (env->idle && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
@@ -10740,7 +10772,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* If there is no overload, we just want to even the number of
- * idle cpus.
+ * idle CPUs.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0,
@@ -10813,7 +10845,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
}
-/******* find_busiest_group() helpers end here *********************/
+/******* sched_balance_find_src_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
@@ -10836,7 +10868,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
/**
- * find_busiest_group - Returns the busiest group within the sched_domain
+ * sched_balance_find_src_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
* @env: The load balancing environment.
*
@@ -10845,7 +10877,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*
* Return: - The busiest group if imbalance exists.
*/
-static struct sched_group *find_busiest_group(struct lb_env *env)
+static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
@@ -10868,12 +10900,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_misfit_task)
goto force_balance;
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
+ if (!is_rd_overutilized(env->dst_rq->rd) &&
+ rcu_dereference(env->dst_rq->rd->pd))
+ goto out_balanced;
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
@@ -10936,7 +10965,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE) {
+ if (!env->idle) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
@@ -10984,9 +11013,9 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/
-static struct rq *find_busiest_queue(struct lb_env *env,
+static struct rq *sched_balance_find_src_rq(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
@@ -11144,7 +11173,7 @@ asym_active_balance(struct lb_env *env)
* the lower priority @env::dst_cpu help it. Do not follow
* CPU priority.
*/
- return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
@@ -11182,7 +11211,7 @@ static int need_active_balance(struct lb_env *env)
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
*/
- if ((env->idle != CPU_NOT_IDLE) &&
+ if (env->idle &&
(env->src_rq->cfs.h_nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@ -11267,7 +11296,7 @@ static int should_we_balance(struct lb_env *env)
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
@@ -11299,13 +11328,13 @@ redo:
goto out_balanced;
}
- group = find_busiest_group(&env);
+ group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group);
+ busiest = sched_balance_find_src_rq(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
@@ -11323,7 +11352,7 @@ redo:
env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
- * Attempt to move tasks. If find_busiest_group has found
+ * Attempt to move tasks. If sched_balance_find_src_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
@@ -11436,8 +11465,12 @@ more_balance:
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migration_misfit which is not related to
+ * load/util migration, don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
@@ -11516,12 +11549,17 @@ out_one_pinned:
ld_moved = 0;
/*
- * newidle_balance() disregards balance intervals, so we could
+ * sched_balance_newidle() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+ * Similarly misfit migration which is not necessarily an indication of
+ * the system being busy and requires lb to backoff to let it settle
+ * down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;
/* tune up the balancing interval */
@@ -11654,10 +11692,23 @@ out_unlock:
return 0;
}
-static DEFINE_SPINLOCK(balancing);
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
/*
- * Scale the max load_balance interval with the number of CPUs in the system.
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
void update_max_interval(void)
@@ -11695,7 +11746,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
*
* Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
@@ -11732,25 +11783,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
- if (!spin_trylock(&balancing))
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ idle = idle_cpu(cpu);
+ busy = !idle && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
- spin_unlock(&balancing);
+ atomic_set_release(&sched_balance_running, 0);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
@@ -11910,7 +11961,7 @@ static void nohz_balancer_kick(struct rq *rq)
* currently idle; in which case, kick the ILB to move tasks
* around.
*
- * When balancing betwen cores, all the SMT siblings of the
+ * When balancing between cores, all the SMT siblings of the
* preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
@@ -11927,7 +11978,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12071,7 +12122,7 @@ void nohz_balance_enter_idle(int cpu)
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
+ * enable the periodic update of the load of idle CPUs
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
@@ -12089,13 +12140,13 @@ static bool update_nohz_stats(struct rq *rq)
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
- update_blocked_averages(cpu);
+ sched_balance_update_blocked_averages(cpu);
return rq->has_blocked_load;
}
/*
- * Internal function that runs load balance for all idle cpus. The load balance
+ * Internal function that runs load balance for all idle CPUs. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
*/
@@ -12171,7 +12222,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(rq, CPU_IDLE);
+ sched_balance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
@@ -12200,7 +12251,7 @@ abort:
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
@@ -12231,7 +12282,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* called from this function on (this) CPU that's not yet in the mask. That's
* OK because the goal of nohz_run_idle_balance() is to run ILB only for
* updating the blocked load of already idle CPUs without waking up one of
- * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * those idle CPUs and outside the preempt disable / IRQ off phase of the local
* cpu about to enter idle, because it can take a long time.
*/
void nohz_run_idle_balance(int cpu)
@@ -12242,7 +12293,7 @@ void nohz_run_idle_balance(int cpu)
/*
* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
- * (ie NOHZ_STATS_KICK set) and will do the same.
+ * (i.e. NOHZ_STATS_KICK set) and will do the same.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
@@ -12287,7 +12338,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * newidle_balance is called by schedule() if this_cpu is about to become
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@@ -12295,10 +12346,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ int continue_balancing = 1;
u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
@@ -12313,8 +12365,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
return 0;
/*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
+ * We must set idle_stamp _before_ calling sched_balance_rq()
+ * for CPU_NEWLY_IDLE, such that we measure the this duration
+ * as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
@@ -12335,7 +12388,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!READ_ONCE(this_rq->rd->overload) ||
+ if (!get_rd_overloaded(this_rq->rd) ||
(sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
if (sd)
@@ -12349,11 +12402,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
raw_spin_rq_unlock(this_rq);
t0 = sched_clock_cpu(this_cpu);
- update_blocked_averages(this_cpu);
+ sched_balance_update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
u64 domain_cost;
update_next_balance(sd, &next_balance);
@@ -12363,7 +12415,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
- pulled_task = load_balance(this_cpu, this_rq,
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -12379,8 +12431,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0 ||
- this_rq->ttwu_pending)
+ if (pulled_task || !continue_balancing)
break;
}
rcu_read_unlock();
@@ -12418,19 +12469,21 @@ out:
}
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ *
+ * - directly from the local scheduler_tick() for periodic load balancing
+ *
+ * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
+ enum cpu_idle_type idle = this_rq->idle_balance;
/*
- * If this CPU has a pending nohz_balance_kick, then do the
+ * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
* balancing on behalf of the other idle CPUs whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * stopped. Do nohz_idle_balance *before* sched_balance_domains to
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
@@ -12439,14 +12492,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
return;
/* normal load balance */
- update_blocked_averages(this_rq->cpu);
- rebalance_domains(this_rq, idle);
+ sched_balance_update_blocked_averages(this_rq->cpu);
+ sched_balance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void sched_balance_trigger(struct rq *rq)
{
/*
* Don't need to rebalance while attached to NULL domain or
@@ -12650,6 +12703,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);
+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12773,6 +12828,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);
+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13144,7 +13201,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif
.task_tick = task_tick_fair,
@@ -13224,7 +13281,7 @@ __init void init_sched_fair_class(void)
#endif
}
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+ open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 52c8f8226b..ca9da66cc8 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -379,7 +379,7 @@ void calc_global_load(void)
}
/*
- * Called from scheduler_tick() to periodically update this CPU's
+ * Called from sched_tick() to periodically update this CPU's
* active count.
*/
void calc_global_load_tick(struct rq *this_rq)
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 63b6cf8982..ef00382de5 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -208,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
- * this happens during idle_balance() which calls
- * update_blocked_averages().
+ * this happens during sched_balance_newidle() which calls
+ * sched_balance_update_blocked_averages().
*
* Also see the comment in accumulate_sum().
*/
@@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#ifdef CONFIG_SCHED_HW_PRESSURE
/*
- * thermal:
+ * hardware:
*
* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
*
* util_avg and runnable_load_avg are not supported and meaningless.
*
* Unlike rt/dl utilization tracking that track time spent by a cpu
- * running a rt/dl task through util_avg, the average thermal pressure is
- * tracked through load_avg. This is because thermal pressure signal is
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
* time weighted "delta" capacity unlike util_avg which is binary.
* "delta capacity" = actual capacity -
- * capped capacity a cpu due to a thermal event.
+ * capped capacity a cpu due to a HW event.
*/
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
- if (___update_load_sum(now, &rq->avg_thermal,
+ if (___update_load_sum(now, &rq->avg_hw,
capacity,
capacity,
capacity)) {
- ___update_load_avg(&rq->avg_thermal, 1);
- trace_pelt_thermal_tp(rq);
+ ___update_load_avg(&rq->avg_hw, 1);
+ trace_pelt_hw_tp(rq);
return 1;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9e1083465f..2150062949 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
- return READ_ONCE(rq->avg_thermal.load_avg);
+ return READ_ONCE(rq->avg_hw.load_avg);
}
#else
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
@@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa5809c..507d7b8d79 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
enum psi_states s;
u32 state_mask;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3261b067b6..aa4c1c874f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -56,7 +56,6 @@ static struct ctl_table sched_rt_sysctls[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
- {}
};
static int __init sched_rt_sysctl_init(void)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ae50f21277..38aeedd8a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,20 @@ extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;
/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
+/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -701,7 +715,7 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- int overloaded;
+ bool overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
@@ -745,7 +759,7 @@ struct dl_rq {
u64 next;
} earliest_dl;
- int overloaded;
+ bool overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
@@ -838,10 +852,6 @@ struct perf_domain {
struct rcu_head rcu;
};
-/* Scheduling group status flags */
-#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
-#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
-
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -862,10 +872,10 @@ struct root_domain {
* - More than one runnable task
* - Running task is misfit
*/
- int overload;
+ bool overloaded;
/* Indicate one or more cpus over-utilized (tipping point) */
- int overutilized;
+ bool overutilized;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
@@ -905,8 +915,6 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
- unsigned long max_cpu_capacity;
-
/*
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
@@ -920,6 +928,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);
+static inline int get_rd_overloaded(struct root_domain *rd)
+{
+ return READ_ONCE(rd->overloaded);
+}
+
+static inline void set_rd_overloaded(struct root_domain *rd, int status)
+{
+ if (get_rd_overloaded(rd) != status)
+ WRITE_ONCE(rd->overloaded, status);
+}
+
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
@@ -1091,8 +1110,8 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
- struct sched_avg avg_thermal;
+#ifdef CONFIG_SCHED_HW_PRESSURE
+ struct sched_avg avg_hw;
#endif
u64 idle_stamp;
u64 avg_idle;
@@ -1107,6 +1126,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
@@ -1533,24 +1553,6 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-/**
- * By default the decay is the default pelt decay period.
- * The decay shift can change the decay period in
- * multiples of 32.
- * Decay shift Decay period(ms)
- * 0 32
- * 1 64
- * 2 128
- * 3 256
- * 4 512
- */
-extern int sched_thermal_decay_shift;
-
-static inline u64 rq_clock_thermal(struct rq *rq)
-{
- return rq_clock_task(rq) >> sched_thermal_decay_shift;
-}
-
static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
@@ -2399,7 +2401,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
-extern void trigger_load_balance(struct rq *rq);
+extern void sched_balance_trigger(struct rq *rq);
extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
@@ -2462,7 +2464,7 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, int prio);
+extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -2519,10 +2521,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}
#ifdef CONFIG_SMP
- if (prev_nr < 2 && rq->nr_running >= 2) {
- if (!READ_ONCE(rq->rd->overload))
- WRITE_ONCE(rq->rd->overload, 1);
- }
+ if (prev_nr < 2 && rq->nr_running >= 2)
+ set_rd_overloaded(rq->rd, 1);
#endif
sched_update_tick_dependency(rq);
@@ -2906,7 +2906,7 @@ extern void cfs_bandwidth_usage_dec(void);
#define NOHZ_NEWILB_KICK_BIT 2
#define NOHZ_NEXT_KICK_BIT 3
-/* Run rebalance_domains() */
+/* Run sched_balance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 857f837f52..78e48f5426 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -113,7 +113,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 15
+#define SCHEDSTAT_VERSION 16
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -150,8 +150,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "domain%d %*pb", dcount++,
cpumask_pr_args(sched_domain_span(sd)));
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
+ for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698f5e..b02dfc3229 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3127c9b30a..a6994a1fcc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -322,7 +322,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static int __init sched_energy_aware_sysctl_init(void)
@@ -1330,23 +1329,12 @@ next:
}
/*
- * Asymmetric CPU capacity bits
- */
-struct asym_cap_data {
- struct list_head link;
- unsigned long capacity;
- unsigned long cpus[];
-};
-
-/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
* capacity.
* The lifespan of data is unlimited.
*/
-static LIST_HEAD(asym_cap_list);
-
-#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+LIST_HEAD(asym_cap_list);
/*
* Verify whether there is any CPU capacity asymmetry in a given sched domain.
@@ -1386,21 +1374,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,
}
+static void free_asym_cap_entry(struct rcu_head *head)
+{
+ struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ kfree(entry);
+}
+
static inline void asym_cpu_capacity_update_data(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
- struct asym_cap_data *entry = NULL;
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;
+ /*
+ * Search if capacity already exits. If not, track which the entry
+ * where we should insert to keep the list ordered descendingly.
+ */
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
}
entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
return;
entry->capacity = capacity;
- list_add(&entry->link, &asym_cap_list);
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
done:
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
@@ -1423,8 +1429,8 @@ static void asym_cpu_capacity_scan(void)
list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
if (cpumask_empty(cpu_capacity_span(entry))) {
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -1434,8 +1440,8 @@ static void asym_cpu_capacity_scan(void)
*/
if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -2347,7 +2353,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
static bool topology_span_sane(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, int cpu)
{
- int i;
+ int i = cpu + 1;
/* NUMA levels are allowed to overlap */
if (tl->flags & SDTL_OVERLAP)
@@ -2359,9 +2365,7 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
* breaking the sched_group lists - i.e. a later get_group() pass
* breaks the linking done for an earlier span.
*/
- for_each_cpu(i, cpu_map) {
- if (i == cpu)
- continue;
+ for_each_cpu_from(i, cpu_map) {
/*
* We should 'and' all those masks with 'cpu_map' to exactly
* match the topology we're about to build, but that can only
@@ -2507,16 +2511,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- unsigned long capacity;
-
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
- capacity = arch_scale_cpu_capacity(i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
-
cpu_attach_domain(sd, d.rd, i);
if (lowest_flag_domain(i, SD_CLUSTER))
@@ -2530,10 +2527,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_cluster)
static_branch_inc_cpuslocked(&sched_cluster_active);
- if (rq && sched_debug_verbose) {
- pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
+ if (rq && sched_debug_verbose)
+ pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
ret = 0;
error: