diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 141 |
1 files changed, 83 insertions, 58 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a708d225c2..9116bcc903 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -57,6 +57,7 @@ #include <linux/profile.h> #include <linux/psi.h> #include <linux/rcuwait_api.h> +#include <linux/rseq.h> #include <linux/sched/wake_q.h> #include <linux/scs.h> #include <linux/slab.h> @@ -1131,6 +1132,28 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; + /* + * Set TIF_NEED_RESCHED and send an IPI if in the non-polling + * part of the idle loop. This forces an exit from the idle loop + * and a round trip to schedule(). Now this could be optimized + * because a simple new idle loop iteration is enough to + * re-evaluate the next tick. Provided some re-ordering of tick + * nohz functions that would need to follow TIF_NR_POLLING + * clearing: + * + * - On most archs, a simple fetch_or on ti::flags with a + * "0" value would be enough to know if an IPI needs to be sent. + * + * - x86 needs to perform a last need_resched() check between + * monitor and mwait which doesn't take timers into account. + * There a dedicated TIF_TIMER flag would be required to + * fetch_or here and be checked along with TIF_NEED_RESCHED + * before mwait(). + * + * However, remote timer enqueue is not such a frequent event + * and testing of the above solutions didn't appear to report + * much benefits. + */ if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else @@ -2124,12 +2147,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); - p->on_rq = TASK_ON_RQ_QUEUED; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); dequeue_task(rq, p, flags); } @@ -3795,6 +3820,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif + + p->dl_server = NULL; } /* @@ -4509,10 +4536,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->stats, 0, sizeof(p->stats)); #endif - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - init_dl_inactive_task_timer(&p->dl); - __dl_clear_params(p); + init_dl_entity(&p->dl); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; @@ -6004,12 +6028,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = pick_next_task_idle(rq); } + /* + * This is the fast path; it cannot be a DL server pick; + * therefore even if @p == @prev, ->dl_server must be NULL. + */ + if (p->dl_server) + p->dl_server = NULL; + return p; } restart: put_prev_task_balance(rq, prev, rf); + /* + * We've updated @prev and no longer need the server link, clear it. + * Must be done before ->pick_next_task() because that can (re)set + * ->dl_server. + */ + if (prev->dl_server) + prev->dl_server = NULL; + for_each_class(class) { p = class->pick_next_task(rq); if (p) @@ -7429,18 +7468,13 @@ int sched_core_idle_cpu(int cpu) * required to meet deadlines. */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p) + unsigned long *min, + unsigned long *max) { - unsigned long dl_util, util, irq, max; + unsigned long util, irq, scale; struct rq *rq = cpu_rq(cpu); - max = arch_scale_cpu_capacity(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } + scale = arch_scale_cpu_capacity(cpu); /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -7448,45 +7482,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * update_irq_load_avg(). */ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } /* * Because the time spend on RT/DL tasks is visible as 'lost' time to * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. */ util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); + util += cpu_util_dl(rq); /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. */ - if (util + dl_util >= max) - return max; + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; + if (util >= scale) + return scale; /* * There is still idle time; further improve the number by using the @@ -7497,28 +7535,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * U' = irq + --------- * U * max */ - util = scale_irq_capacity(util, irq, max); + util = scale_irq_capacity(util, irq, scale); util += irq; - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); + return min(scale, util); } unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); } #endif /* CONFIG_SMP */ |