summaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:40:19 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 17:40:19 +0000
commit9f0fc191371843c4fc000a226b0a26b6c059aacd (patch)
tree35f8be3ef04506ac891ad001e8c41e535ae8d01d /kernel/sched/fair.c
parentReleasing progress-linux version 6.6.15-2~progress7.99u1. (diff)
downloadlinux-9f0fc191371843c4fc000a226b0a26b6c059aacd.tar.xz
linux-9f0fc191371843c4fc000a226b0a26b6c059aacd.zip
Merging upstream version 6.7.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c465
1 files changed, 297 insertions, 168 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d336af9cb..7ac9f4b1d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,8 +51,6 @@
#include <asm/switch_to.h>
-#include <linux/sched/cond_resched.h>
-
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
@@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
unsigned int sysctl_sched_base_slice = 750000ULL;
static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
- {
- .procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
@@ -1774,12 +1759,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
* The smaller the hint page fault latency, the higher the possibility
* for the page to be hot.
*/
-static int numa_hint_fault_latency(struct page *page)
+static int numa_hint_fault_latency(struct folio *folio)
{
int last_time, time;
time = jiffies_to_msecs(jiffies);
- last_time = xchg_page_access_time(page, time);
+ last_time = folio_xchg_access_time(folio, time);
return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}
@@ -1836,7 +1821,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
}
}
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int src_nid, int dst_cpu)
{
struct numa_group *ng = deref_curr_numa_group(p);
@@ -1866,16 +1851,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
th = pgdat->nbp_threshold ? : def_th;
- latency = numa_hint_fault_latency(page);
+ latency = numa_hint_fault_latency(folio);
if (latency >= th)
return false;
return !numa_promotion_rate_limit(pgdat, rate_limit,
- thp_nr_pages(page));
+ folio_nr_pages(folio));
}
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
@@ -2899,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p)
}
/* Cannot migrate task to CPU-less node */
- if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
- int near_nid = max_nid;
- int distance, near_distance = INT_MAX;
-
- for_each_node_state(nid, N_CPU) {
- distance = node_distance(max_nid, nid);
- if (distance < near_distance) {
- near_nid = nid;
- near_distance = distance;
- }
- }
- max_nid = near_nid;
- }
+ max_nid = numa_nearest_node(max_nid, N_CPU);
if (ng) {
numa_group_count_active_nodes(ng);
@@ -3182,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
{
unsigned long pids;
/*
@@ -3191,11 +3164,23 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
* This is also done to avoid any side effect of task scanning
* amplifying the unfairness of disjoint set of VMAs' access.
*/
- if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
+ return true;
+
+ pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+ if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+ return true;
+
+ /*
+ * Complete a scan that has already started regardless of PID access, or
+ * some VMAs may never be scanned in multi-threaded applications:
+ */
+ if (mm->numa_scan_offset > vma->vm_start) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
return true;
+ }
- pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
- return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+ return false;
}
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3215,6 +3200,8 @@ static void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0;
long pages, virtpages;
struct vma_iterator vmi;
+ bool vma_pids_skipped;
+ bool vma_pids_forced = false;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -3257,7 +3244,6 @@ static void task_numa_work(struct callback_head *work)
*/
p->node_stamp += 2 * TICK_NSEC;
- start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
@@ -3267,6 +3253,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
+
+ /*
+ * VMAs are skipped if the current PID has not trapped a fault within
+ * the VMA recently. Allow scanning to be forced if there is no
+ * suitable VMA remaining.
+ */
+ vma_pids_skipped = false;
+
+retry_pids:
+ start = mm->numa_scan_offset;
vma_iter_init(&vmi, mm, start);
vma = vma_next(&vmi);
if (!vma) {
@@ -3279,6 +3275,7 @@ static void task_numa_work(struct callback_head *work)
do {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
continue;
}
@@ -3289,15 +3286,19 @@ static void task_numa_work(struct callback_head *work)
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
- (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
continue;
+ }
/*
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
- if (!vma_is_accessible(vma))
+ if (!vma_is_accessible(vma)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
continue;
+ }
/* Initialise new per-VMA NUMAB state. */
if (!vma->numab_state) {
@@ -3306,12 +3307,21 @@ static void task_numa_work(struct callback_head *work)
if (!vma->numab_state)
continue;
+ vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
vma->numab_state->next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
/* Reset happens after 4 times scan delay of scan start */
- vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
+ vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+ /*
+ * Ensure prev_scan_seq does not match numa_scan_seq,
+ * to prevent VMAs being skipped prematurely on the
+ * first scan:
+ */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
}
/*
@@ -3319,23 +3329,35 @@ static void task_numa_work(struct callback_head *work)
* delay the scan for new VMAs.
*/
if (mm->numa_scan_seq && time_before(jiffies,
- vma->numab_state->next_scan))
+ vma->numab_state->next_scan)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
continue;
+ }
- /* Do not scan the VMA if task has not accessed */
- if (!vma_is_accessed(vma))
+ /* RESET access PIDs regularly for old VMAs. */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->pids_active_reset)) {
+ vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+ vma->numab_state->pids_active[1] = 0;
+ }
+
+ /* Do not rescan VMAs twice within the same sequence. */
+ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+ mm->numa_scan_offset = vma->vm_end;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
continue;
+ }
/*
- * RESET access PIDs regularly for old VMAs. Resetting after checking
- * vma for recent access to avoid clearing PID info before access..
+ * Do not scan the VMA if task has not accessed it, unless no other
+ * VMA candidate exists.
*/
- if (mm->numa_scan_seq &&
- time_after(jiffies, vma->numab_state->next_pid_reset)) {
- vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
- msecs_to_jiffies(VMA_PID_RESET_PERIOD);
- vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
- vma->numab_state->access_pids[1] = 0;
+ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+ vma_pids_skipped = true;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+ continue;
}
do {
@@ -3362,8 +3384,28 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
+
+ /* VMA scan is complete, do not scan until next sequence. */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+ /*
+ * Only force scan within one VMA at a time, to limit the
+ * cost of scanning a potentially uninteresting VMA.
+ */
+ if (vma_pids_forced)
+ break;
} for_each_vma(vmi, vma);
+ /*
+ * If no VMAs are remaining and VMAs were skipped due to the PID
+ * not accessing the VMA previously, then force a scan to ensure
+ * forward progress:
+ */
+ if (!vma && !vma_pids_forced && vma_pids_skipped) {
+ vma_pids_forced = true;
+ goto retry_pids;
+ }
+
out:
/*
* It is possible to reach the end of the VMA list but the last few
@@ -4047,7 +4089,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ long delta;
+ u64 now;
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -4055,10 +4098,67 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;
+ /* rq has been offline and doesn't contribute to the share anymore: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
+ /*
+ * For migration heavy workloads, access to tg->load_avg can be
+ * unbound. Limit the update rate to at most once per ms.
+ */
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+ return;
+
+ delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->last_update_tg_load_avg = now;
+ }
+}
+
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
}
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
/*
@@ -4357,6 +4457,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
return 0;
@@ -4834,7 +4936,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+ if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
@@ -4882,14 +4984,14 @@ static inline int util_fits_cpu(unsigned long util,
return fits;
/*
- * We must use capacity_orig_of() for comparing against uclamp_min and
+ * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
* uclamp_max. We only care about capacity pressure (by using
* capacity_of()) for comparing against the real util.
*
* If a task is boosted to 1024 for example, we don't want a tiny
* pressure to skew the check whether it fits a CPU or not.
*
- * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
* Only exception is for thermal pressure since it has a direct impact
@@ -4901,7 +5003,7 @@ static inline int util_fits_cpu(unsigned long util,
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
*/
- capacity_orig = capacity_orig_of(cpu);
+ capacity_orig = arch_scale_cpu_capacity(cpu);
capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
@@ -5356,7 +5458,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
{
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
@@ -5900,13 +6002,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
- struct cfs_rq *local_unthrottle = NULL;
int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
bool throttled = false;
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *tmp;
struct rq_flags rf;
struct rq *rq;
+ LIST_HEAD(local_unthrottle);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5922,11 +6024,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
if (!cfs_rq_throttled(cfs_rq))
goto next;
-#ifdef CONFIG_SMP
/* Already queued for async unthrottle */
if (!list_empty(&cfs_rq->throttled_csd_list))
goto next;
-#endif
/* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -5943,11 +6043,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0) {
- if (cpu_of(rq) != this_cpu ||
- SCHED_WARN_ON(local_unthrottle))
+ if (cpu_of(rq) != this_cpu) {
unthrottle_cfs_rq_async(cfs_rq);
- else
- local_unthrottle = cfs_rq;
+ } else {
+ /*
+ * We currently only expect to be unthrottling
+ * a single cfs_rq locally.
+ */
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ list_add_tail(&cfs_rq->throttled_csd_list,
+ &local_unthrottle);
+ }
} else {
throttled = true;
}
@@ -5955,15 +6061,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
next:
rq_unlock_irqrestore(rq, &rf);
}
- rcu_read_unlock();
- if (local_unthrottle) {
- rq = cpu_rq(this_cpu);
+ list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+ throttled_csd_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
rq_lock_irqsave(rq, &rf);
- if (cfs_rq_throttled(local_unthrottle))
- unthrottle_cfs_rq(local_unthrottle);
+
+ list_del_init(&cfs_rq->throttled_csd_list);
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
rq_unlock_irqrestore(rq, &rf);
}
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+ rcu_read_unlock();
return throttled;
}
@@ -6293,9 +6407,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7253,45 +7365,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
struct sched_domain_shared *sd_share;
- struct rq *this_rq = this_rq();
- int this = smp_processor_id();
- struct sched_domain *this_sd = NULL;
- u64 time = 0;
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_PROP) && !has_idle_core) {
- u64 avg_cost, avg_idle, span_avg;
- unsigned long now = jiffies;
-
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
- /*
- * If we're busy, the assumption that the last idle period
- * predicts the future is flawed; age away the remaining
- * predicted idle time.
- */
- if (unlikely(this_rq->wake_stamp < now)) {
- while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
- this_rq->wake_stamp++;
- this_rq->wake_avg_idle >>= 1;
- }
- }
-
- avg_idle = this_rq->wake_avg_idle;
- avg_cost = this_sd->avg_scan_cost + 1;
-
- span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
- nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
-
- time = cpu_clock(this);
- }
-
if (sched_feat(SIS_UTIL)) {
sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
if (sd_share) {
@@ -7303,6 +7379,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
}
}
+ if (static_branch_unlikely(&sched_cluster_active)) {
+ struct sched_group *sg = sd->groups;
+
+ if (sg->flags & SD_CLUSTER) {
+ for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+ if (!cpumask_test_cpu(cpu, cpus))
+ continue;
+
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ return idle_cpu;
+ }
+ }
+ cpumask_andnot(cpus, cpus, sched_group_span(sg));
+ }
+ }
+
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7310,7 +7410,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
return i;
} else {
- if (!--nr)
+ if (--nr <= 0)
return -1;
idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7321,18 +7421,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
if (has_idle_core)
set_idle_cores(target, false);
- if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
- time = cpu_clock(this) - time;
-
- /*
- * Account for the scan cost of wakeups against the average
- * idle time.
- */
- this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
- update_avg(&this_sd->avg_scan_cost, time);
- }
-
return idle_cpu;
}
@@ -7372,7 +7460,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7412,7 +7500,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util, util_min, util_max;
- int i, recent_used_cpu;
+ int i, recent_used_cpu, prev_aff = -1;
/*
* On asymmetric system, update task utilization because we will check
@@ -7439,8 +7527,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- asym_fits_cpu(task_util, util_min, util_max, prev))
- return prev;
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(prev, target))
+ return prev;
+
+ prev_aff = prev;
+ }
/*
* Allow a per-cpu kthread to stack with the wakee if the
@@ -7467,7 +7561,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
- return recent_used_cpu;
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(recent_used_cpu, target))
+ return recent_used_cpu;
+
+ } else {
+ recent_used_cpu = -1;
}
/*
@@ -7508,6 +7608,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
+ /*
+ * For cluster machines which have lower sharing cache like L2 or
+ * LLC Tag, we tend to find an idle CPU in the target's cluster
+ * first. But prev_cpu or recent_used_cpu may also be a good candidate,
+ * use them if possible when no idle CPU found in select_idle_cpu().
+ */
+ if ((unsigned int)prev_aff < nr_cpumask_bits)
+ return prev_aff;
+ if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+ return recent_used_cpu;
+
return target;
}
@@ -7614,7 +7725,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
util = max(util, util_est);
}
- return min(util, capacity_orig_of(cpu));
+ return min(util, arch_scale_cpu_capacity(cpu));
}
unsigned long cpu_util_cfs(int cpu)
@@ -7766,11 +7877,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
{
unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
+ unsigned long energy;
if (dst_cpu >= 0)
busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
- return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+ energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+ trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+ return energy;
}
/*
@@ -8140,7 +8256,7 @@ static void set_next_buddy(struct sched_entity *se)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
@@ -8153,7 +8269,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_preempt_curr() after an enqueue (which may have
+ * unconditionally wakeup_preempt() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
@@ -8245,7 +8361,7 @@ again:
goto again;
}
- se = pick_next_entity(cfs_rq, curr);
+ se = pick_next_entity(cfs_rq);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -8308,7 +8424,7 @@ again:
}
}
- se = pick_next_entity(cfs_rq, curr);
+ se = pick_next_entity(cfs_rq);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -8347,7 +8463,7 @@ simple:
put_prev_task(rq, prev);
do {
- se = pick_next_entity(cfs_rq, NULL);
+ se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -9060,7 +9176,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
/*
@@ -9400,8 +9516,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
if (!capacity)
capacity = 1;
@@ -9477,7 +9591,7 @@ static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
- (rq->cpu_capacity_orig * 100));
+ (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
/*
@@ -9488,7 +9602,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
return rq->misfit_task_load &&
- (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
check_cpu_capacity(rq, sd));
}
@@ -9640,7 +9754,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
* can only do it if @group is an SMT group and has exactly on busy CPU. Larger
* imbalances in the number of CPUS are dealt with in find_busiest_group().
*
- * If we are balancing load within an SMT core, or at DIE domain level, always
+ * If we are balancing load within an SMT core, or at PKG domain level, always
* proceed.
*
* Return: true if @env::dst_cpu can do with asym_packing load balance. False
@@ -11659,36 +11773,39 @@ static inline int on_null_domain(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
* anywhere yet.
*/
-
static inline int find_new_ilb(void)
{
- int ilb;
const struct cpumask *hk_mask;
+ int ilb_cpu;
hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
- for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
+ for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
- if (ilb == smp_processor_id())
+ if (ilb_cpu == smp_processor_id())
continue;
- if (idle_cpu(ilb))
- return ilb;
+ if (idle_cpu(ilb_cpu))
+ return ilb_cpu;
}
- return nr_cpu_ids;
+ return -1;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -11702,8 +11819,7 @@ static void kick_ilb(unsigned int flags)
nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
-
- if (ilb_cpu >= nr_cpu_ids)
+ if (ilb_cpu < 0)
return;
/*
@@ -11716,7 +11832,7 @@ static void kick_ilb(unsigned int flags)
/*
* This way we generate an IPI on the target CPU which
- * is idle. And the softirq performing nohz idle load balance
+ * is idle, and the softirq performing NOHZ idle load balancing
* will be run before returning from the IPI.
*/
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -11745,7 +11861,7 @@ static void nohz_balancer_kick(struct rq *rq)
/*
* None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
+ * balancing:
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
@@ -11767,9 +11883,8 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(rq->sd);
if (sd) {
/*
- * If there's a CFS task and the current CPU has reduced
- * capacity; kick the ILB to see if there's a better CPU to run
- * on.
+ * If there's a runnable CFS task and the current CPU has reduced
+ * capacity, kick the ILB to see if there's a better CPU to run on:
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
@@ -11821,11 +11936,11 @@ static void nohz_balancer_kick(struct rq *rq)
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
- * domain to pull some load. Likewise, we may need to spread
+ * increase the overall cache utilization), we need a less-loaded LLC
+ * domain to pull some load from. Likewise, we may need to spread
* load within the current LLC domain (e.g. packed SMT cores but
* other CPUs are idle). We can't really know from here how busy
- * the others are - so just get a nohz balance going if it looks
+ * the others are - so just get a NOHZ balance going if it looks
* like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
@@ -12095,8 +12210,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
}
/*
- * Check if we need to run the ILB for updating blocked load before entering
- * idle state.
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not yet be stopped on
+ * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * cpu about to enter idle, because it can take a long time.
*/
void nohz_run_idle_balance(int cpu)
{
@@ -12338,6 +12464,9 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
@@ -12541,7 +12670,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (p->prio > oldprio)
resched_curr(rq);
} else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12643,7 +12772,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
if (task_current(rq, p))
resched_curr(rq);
else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
}
@@ -13002,7 +13131,7 @@ DEFINE_SCHED_CLASS(fair) = {
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .check_preempt_curr = check_preempt_wakeup,
+ .wakeup_preempt = check_preempt_wakeup_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,