summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
commit76cb841cb886eef6b3bee341a2266c76578724ad (patch)
treef5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /kernel/sched
parentInitial commit. (diff)
downloadlinux-upstream.tar.xz
linux-upstream.zip
Adding upstream version 4.19.249.upstream/4.19.249upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile31
-rw-r--r--kernel/sched/autogroup.c270
-rw-r--r--kernel/sched/autogroup.h60
-rw-r--r--kernel/sched/clock.c481
-rw-r--r--kernel/sched/completion.c329
-rw-r--r--kernel/sched/core.c7110
-rw-r--r--kernel/sched/cpuacct.c375
-rw-r--r--kernel/sched/cpudeadline.c276
-rw-r--r--kernel/sched/cpudeadline.h26
-rw-r--r--kernel/sched/cpufreq.c80
-rw-r--r--kernel/sched/cpufreq_schedutil.c892
-rw-r--r--kernel/sched/cpupri.c241
-rw-r--r--kernel/sched/cpupri.h25
-rw-r--r--kernel/sched/cputime.c895
-rw-r--r--kernel/sched/deadline.c2793
-rw-r--r--kernel/sched/debug.c1016
-rw-r--r--kernel/sched/fair.c10365
-rw-r--r--kernel/sched/features.h92
-rw-r--r--kernel/sched/idle.c483
-rw-r--r--kernel/sched/isolation.c153
-rw-r--r--kernel/sched/loadavg.c400
-rw-r--r--kernel/sched/membarrier.c318
-rw-r--r--kernel/sched/pelt.c399
-rw-r--r--kernel/sched/pelt.h72
-rw-r--r--kernel/sched/rt.c2749
-rw-r--r--kernel/sched/sched-pelt.h14
-rw-r--r--kernel/sched/sched.h2248
-rw-r--r--kernel/sched/stats.c128
-rw-r--r--kernel/sched/stats.h167
-rw-r--r--kernel/sched/stop_task.c145
-rw-r--r--kernel/sched/swait.c132
-rw-r--r--kernel/sched/topology.c1922
-rw-r--r--kernel/sched/wait.c449
-rw-r--r--kernel/sched/wait_bit.c249
34 files changed, 35385 insertions, 0 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000..7fe183404
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
+endif
+
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only. Why this used to be enabled for all architectures is beyond
+# me. I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+obj-y += core.o loadavg.o clock.o cputime.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
+obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
+obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
new file mode 100644
index 000000000..2d4ff5353
--- /dev/null
+++ b/kernel/sched/autogroup.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Auto-group scheduling implementation:
+ */
+#include <linux/nospec.h>
+#include "sched.h"
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+void __init autogroup_init(struct task_struct *init_task)
+{
+ autogroup_default.tg = &root_task_group;
+ kref_init(&autogroup_default.kref);
+ init_rwsem(&autogroup_default.lock);
+ init_task->signal->autogroup = &autogroup_default;
+}
+
+void autogroup_free(struct task_group *tg)
+{
+ kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+ struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* We've redirected RT tasks to the root task group... */
+ ag->tg->rt_se = NULL;
+ ag->tg->rt_rq = NULL;
+#endif
+ sched_offline_group(ag->tg);
+ sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+ kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+ kref_get(&ag->kref);
+ return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+ struct autogroup *ag;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return autogroup_kref_get(&autogroup_default);
+
+ ag = autogroup_kref_get(p->signal->autogroup);
+ unlock_task_sighand(p, &flags);
+
+ return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+ struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+ struct task_group *tg;
+
+ if (!ag)
+ goto out_fail;
+
+ tg = sched_create_group(&root_task_group);
+ if (IS_ERR(tg))
+ goto out_free;
+
+ kref_init(&ag->kref);
+ init_rwsem(&ag->lock);
+ ag->id = atomic_inc_return(&autogroup_seq_nr);
+ ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Autogroup RT tasks are redirected to the root task group
+ * so we don't have to move tasks around upon policy change,
+ * or flail around trying to allocate bandwidth on the fly.
+ * A bandwidth exception in __sched_setscheduler() allows
+ * the policy change to proceed.
+ */
+ free_rt_sched_group(tg);
+ tg->rt_se = root_task_group.rt_se;
+ tg->rt_rq = root_task_group.rt_rq;
+#endif
+ tg->autogroup = ag;
+
+ sched_online_group(tg, &root_task_group);
+ return ag;
+
+out_free:
+ kfree(ag);
+out_fail:
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "autogroup_create: %s failure.\n",
+ ag ? "sched_create_group()" : "kzalloc()");
+ }
+
+ return autogroup_kref_get(&autogroup_default);
+}
+
+bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+ if (tg != &root_task_group)
+ return false;
+ /*
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
+ *
+ * However, there is no way sched_autogroup_exit_task() could tell us
+ * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
+ */
+ if (p->flags & PF_EXITING)
+ return false;
+
+ return true;
+}
+
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+ /*
+ * We are going to call exit_notify() and autogroup_move_group() can't
+ * see this thread after that: we can no longer use signal->autogroup.
+ * See the PF_EXITING check in task_wants_autogroup().
+ */
+ sched_move_task(p);
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+ struct autogroup *prev;
+ struct task_struct *t;
+ unsigned long flags;
+
+ BUG_ON(!lock_task_sighand(p, &flags));
+
+ prev = p->signal->autogroup;
+ if (prev == ag) {
+ unlock_task_sighand(p, &flags);
+ return;
+ }
+
+ p->signal->autogroup = autogroup_kref_get(ag);
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+ * can't be removed from thread list, we hold ->siglock.
+ *
+ * If an exiting thread was already removed from thread list we rely on
+ * sched_autogroup_exit_task().
+ */
+ for_each_thread(p, t)
+ sched_move_task(t);
+
+ unlock_task_sighand(p, &flags);
+ autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+ struct autogroup *ag = autogroup_create();
+
+ autogroup_move_group(p, ag);
+
+ /* Drop extra reference added by autogroup_create(): */
+ autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock. Currently has no users: */
+void sched_autogroup_detach(struct task_struct *p)
+{
+ autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+ sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+ autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
+{
+ static unsigned long next = INITIAL_JIFFIES;
+ struct autogroup *ag;
+ unsigned long shares;
+ int err, idx;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -EINVAL;
+
+ err = security_task_setnice(current, nice);
+ if (err)
+ return err;
+
+ if (nice < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ /* This is a heavy operation, taking global locks.. */
+ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+ return -EAGAIN;
+
+ next = HZ / 10 + jiffies;
+ ag = autogroup_task_get(p);
+
+ idx = array_index_nospec(nice + 20, 40);
+ shares = scale_load(sched_prio_to_weight[idx]);
+
+ down_write(&ag->lock);
+ err = sched_group_set_shares(ag->tg, shares);
+ if (!err)
+ ag->nice = nice;
+ up_write(&ag->lock);
+
+ autogroup_kref_put(ag);
+
+ return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+ struct autogroup *ag = autogroup_task_get(p);
+
+ if (!task_group_is_autogroup(ag->tg))
+ goto out;
+
+ down_read(&ag->lock);
+ seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+ up_read(&ag->lock);
+
+out:
+ autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ if (!task_group_is_autogroup(tg))
+ return 0;
+
+ return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
new file mode 100644
index 000000000..b96419974
--- /dev/null
+++ b/kernel/sched/autogroup.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+ /*
+ * Reference doesn't mean how many threads attach to this
+ * autogroup now. It just stands for the number of tasks
+ * which could use this autogroup.
+ */
+ struct kref kref;
+ struct task_group *tg;
+ struct rw_semaphore lock;
+ unsigned long id;
+ int nice;
+};
+
+extern void autogroup_init(struct task_struct *init_task);
+extern void autogroup_free(struct task_group *tg);
+
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return !!tg->autogroup;
+}
+
+extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (enabled && task_wants_autogroup(p, tg))
+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+
+extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return 0;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ return tg;
+}
+
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
new file mode 100644
index 000000000..e3e3b979f
--- /dev/null
+++ b/kernel/sched/clock.c
@@ -0,0 +1,481 @@
+/*
+ * sched_clock() for unstable CPU clocks
+ *
+ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
+ *
+ * Updates and enhancements:
+ * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
+ * Based on code by:
+ * Ingo Molnar <mingo@redhat.com>
+ * Guillaume Chazarain <guichaz@gmail.com>
+ *
+ *
+ * What this file implements:
+ *
+ * cpu_clock(i) provides a fast (execution time) high resolution
+ * clock with bounded drift between CPUs. The value of cpu_clock(i)
+ * is monotonic for constant i. The timestamp returned is in nanoseconds.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ *
+ * There is no strict promise about the base, although it tends to start
+ * at 0 on boot (but people really shouldn't rely on that).
+ *
+ * cpu_clock(i) -- can be used from any context, including NMI.
+ * local_clock() -- is cpu_clock() on the current CPU.
+ *
+ * sched_clock_cpu(i)
+ *
+ * How it is implemented:
+ *
+ * The implementation either uses sched_clock() when
+ * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
+ * sched_clock() is assumed to provide these properties (mostly it means
+ * the architecture provides a globally synchronized highres time source).
+ *
+ * Otherwise it tries to create a semi stable clock from a mixture of other
+ * clocks, including:
+ *
+ * - GTOD (clock monotomic)
+ * - sched_clock()
+ * - explicit idle events
+ *
+ * We use GTOD as base and use sched_clock() deltas to improve resolution. The
+ * deltas are filtered to provide monotonicity and keeping it within an
+ * expected window.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ */
+#include "sched.h"
+#include <linux/sched_clock.h>
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __weak sched_clock(void)
+{
+ return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+ * (NSEC_PER_SEC / HZ);
+}
+EXPORT_SYMBOL_GPL(sched_clock);
+
+static DEFINE_STATIC_KEY_FALSE(sched_clock_running);
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
+static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
+static int __sched_clock_stable_early = 1;
+
+/*
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
+ */
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;
+
+struct sched_clock_data {
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+ return this_cpu_ptr(&sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+ return &per_cpu(sched_clock_data, cpu);
+}
+
+int sched_clock_stable(void)
+{
+ return static_branch_likely(&__sched_clock_stable);
+}
+
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+ scd->tick_gtod = ktime_get_ns();
+ scd->tick_raw = sched_clock();
+}
+
+static void __set_sched_clock_stable(void)
+{
+ struct sched_clock_data *scd;
+
+ /*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
+ * Attempt to make the (initial) unstable->stable transition continuous.
+ */
+ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
+
+ printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
+ static_branch_enable(&__sched_clock_stable);
+ tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
+}
+
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
+static void __sched_clock_work(struct work_struct *work)
+{
+ struct sched_clock_data *scd;
+ int cpu;
+
+ /* take a current timestamp and set 'now' */
+ preempt_disable();
+ scd = this_scd();
+ __scd_stamp(scd);
+ scd->clock = scd->tick_gtod + __gtod_offset;
+ preempt_enable();
+
+ /* clone to all CPUs */
+ for_each_possible_cpu(cpu)
+ per_cpu(sched_clock_data, cpu) = *scd;
+
+ printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
+ static_branch_disable(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __sched_clock_work);
+
+static void __clear_sched_clock_stable(void)
+{
+ if (!sched_clock_stable())
+ return;
+
+ tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
+ schedule_work(&sched_clock_work);
+}
+
+void clear_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 0;
+
+ smp_mb(); /* matches sched_clock_init_late() */
+
+ if (static_key_count(&sched_clock_running.key) == 2)
+ __clear_sched_clock_stable();
+}
+
+static void __sched_clock_gtod_offset(void)
+{
+ struct sched_clock_data *scd = this_scd();
+
+ __scd_stamp(scd);
+ __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
+}
+
+void __init sched_clock_init(void)
+{
+ /*
+ * Set __gtod_offset such that once we mark sched_clock_running,
+ * sched_clock_tick() continues where sched_clock() left off.
+ *
+ * Even if TSC is buggered, we're still UP at this point so it
+ * can't really be out of sync.
+ */
+ local_irq_disable();
+ __sched_clock_gtod_offset();
+ local_irq_enable();
+
+ static_branch_inc(&sched_clock_running);
+}
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
+{
+ static_branch_inc(&sched_clock_running);
+ /*
+ * Ensure that it is impossible to not do a static_key update.
+ *
+ * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+ * and do the update, or we must see their __sched_clock_stable_early
+ * and do the update, or both.
+ */
+ smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+ if (__sched_clock_stable_early)
+ __set_sched_clock_stable();
+
+ return 0;
+}
+late_initcall(sched_clock_init_late);
+
+/*
+ * min, max except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+ return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+ return (s64)(x - y) > 0 ? x : y;
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ * - filter out backward motion
+ * - use the GTOD tick value to create a window to filter crazy TSC values
+ */
+static u64 sched_clock_local(struct sched_clock_data *scd)
+{
+ u64 now, clock, old_clock, min_clock, max_clock, gtod;
+ s64 delta;
+
+again:
+ now = sched_clock();
+ delta = now - scd->tick_raw;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ old_clock = scd->clock;
+
+ /*
+ * scd->clock = clamp(scd->tick_gtod + delta,
+ * max(scd->tick_gtod, scd->clock),
+ * scd->tick_gtod + TICK_NSEC);
+ */
+
+ gtod = scd->tick_gtod + __gtod_offset;
+ clock = gtod + delta;
+ min_clock = wrap_max(gtod, old_clock);
+ max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
+
+ clock = wrap_max(clock, min_clock);
+ clock = wrap_min(clock, max_clock);
+
+ if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ goto again;
+
+ return clock;
+}
+
+static u64 sched_clock_remote(struct sched_clock_data *scd)
+{
+ struct sched_clock_data *my_scd = this_scd();
+ u64 this_clock, remote_clock;
+ u64 *ptr, old_val, val;
+
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32-bit kernels as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low 32-bit and the high 32-bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32-bit, otherwise the
+ * update on the remote CPU can hit inbetween the readout of
+ * the low 32-bit and the high 32-bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64-bit kernels the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32-bit dance.
+ */
+ sched_clock_local(my_scd);
+again:
+ this_clock = my_scd->clock;
+ remote_clock = scd->clock;
+#endif
+
+ /*
+ * Use the opportunity that we have both locks
+ * taken to couple the two clocks: we take the
+ * larger time as the latest time for both
+ * runqueues. (this creates monotonic movement)
+ */
+ if (likely((s64)(remote_clock - this_clock) < 0)) {
+ ptr = &scd->clock;
+ old_val = remote_clock;
+ val = this_clock;
+ } else {
+ /*
+ * Should be rare, but possible:
+ */
+ ptr = &my_scd->clock;
+ old_val = this_clock;
+ val = remote_clock;
+ }
+
+ if (cmpxchg64(ptr, old_val, val) != old_val)
+ goto again;
+
+ return val;
+}
+
+/*
+ * Similar to cpu_clock(), but requires local IRQs to be disabled.
+ *
+ * See cpu_clock().
+ */
+u64 sched_clock_cpu(int cpu)
+{
+ struct sched_clock_data *scd;
+ u64 clock;
+
+ if (sched_clock_stable())
+ return sched_clock() + __sched_clock_offset;
+
+ if (!static_branch_unlikely(&sched_clock_running))
+ return sched_clock();
+
+ preempt_disable_notrace();
+ scd = cpu_sdc(cpu);
+
+ if (cpu != smp_processor_id())
+ clock = sched_clock_remote(scd);
+ else
+ clock = sched_clock_local(scd);
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(sched_clock_cpu);
+
+void sched_clock_tick(void)
+{
+ struct sched_clock_data *scd;
+
+ if (sched_clock_stable())
+ return;
+
+ if (!static_branch_unlikely(&sched_clock_running))
+ return;
+
+ lockdep_assert_irqs_disabled();
+
+ scd = this_scd();
+ __scd_stamp(scd);
+ sched_clock_local(scd);
+}
+
+void sched_clock_tick_stable(void)
+{
+ if (!sched_clock_stable())
+ return;
+
+ /*
+ * Called under watchdog_lock.
+ *
+ * The watchdog just found this TSC to (still) be stable, so now is a
+ * good moment to update our __gtod_offset. Because once we find the
+ * TSC to be unstable, any computation will be computing crap.
+ */
+ local_irq_disable();
+ __sched_clock_gtod_offset();
+ local_irq_enable();
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+ sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled; resync with ktime.
+ */
+void sched_clock_idle_wakeup_event(void)
+{
+ unsigned long flags;
+
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(timekeeping_suspended))
+ return;
+
+ local_irq_save(flags);
+ sched_clock_tick();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void __init sched_clock_init(void)
+{
+ static_branch_inc(&sched_clock_running);
+ local_irq_disable();
+ generic_sched_clock_init();
+ local_irq_enable();
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+ if (!static_branch_unlikely(&sched_clock_running))
+ return 0;
+
+ return sched_clock();
+}
+
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000..a1ad5b7d5
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic wait-for-completion handler;
+ *
+ * It differs from semaphores in that their default case is the opposite,
+ * wait_for_completion default blocks whereas semaphore default non-block. The
+ * interface also makes it easy to 'complete' multiple waiting threads,
+ * something which isn't entirely natural for semaphores.
+ *
+ * But more importantly, the primitive documents the usage. Semaphores would
+ * typically be used for exclusion which gives rise to priority inversion.
+ * Waiting for completion is a typically sync point, but not an exclusion point.
+ */
+#include "sched.h"
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done = UINT_MAX;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 0);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ if (x->done != UINT_MAX)
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ complete_acquire(x);
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+
+ complete_release(x);
+
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO (which traditionally means blkio only).
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ bool ret = true;
+
+ /*
+ * Since x->done will need to be locked only
+ * in the non-blocking case, we check x->done
+ * first without taking the lock so we can
+ * return early in the blocking case.
+ */
+ if (!READ_ONCE(x->done))
+ return false;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = false;
+ else if (x->done != UINT_MAX)
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ * Note, this will always return true if complete_all() was called on @X.
+ */
+bool completion_done(struct completion *x)
+{
+ unsigned long flags;
+
+ if (!READ_ONCE(x->done))
+ return false;
+
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ */
+ spin_lock_irqsave(&x->wait.lock, flags);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return true;
+}
+EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
new file mode 100644
index 000000000..32af895bd
--- /dev/null
+++ b/kernel/sched/core.c
@@ -0,0 +1,7110 @@
+/*
+ * kernel/sched/core.c
+ *
+ * Core kernel scheduler code and related syscalls
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ */
+#include "sched.h"
+
+#include <linux/nospec.h>
+
+#include <linux/kcov.h>
+
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
+
+#include "pelt.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
+ */
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+const_debug unsigned int sysctl_sched_features =
+#include "features.h"
+ 0;
+#undef SCHED_FEAT
+#endif
+
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
+ * period over which we measure -rt task CPU usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+__read_mostly int scheduler_running;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rq_pin_lock(rq, rf);
+ return rq;
+ }
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old CPU in task_rq_lock(), the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rq_pin_lock(rq, rf);
+ return rq;
+ }
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * RQ-clock updating methods:
+ */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ update_irq_load_avg(rq, irq_delta + steal);
+#endif
+}
+
+void update_rq_clock(struct rq *rq)
+{
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_update_flags & RQCF_ACT_SKIP)
+ return;
+
+#ifdef CONFIG_SCHED_DEBUG
+ if (sched_feat(WARN_DOUBLE_CLOCK))
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
+ rq->clock_update_flags |= RQCF_UPDATED;
+#endif
+
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ */
+
+static void hrtick_clear(struct rq *rq)
+{
+ if (hrtimer_active(&rq->hrtick_timer))
+ hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+ struct rq_flags rf;
+
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ rq_unlock(rq, &rf);
+
+ return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_SMP
+
+static void __hrtick_restart(struct rq *rq)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
+{
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ __hrtick_restart(rq);
+ rq->hrtick_csd_pending = 0;
+ rq_unlock(rq, &rf);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time;
+ s64 delta;
+
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense and can cause timer DoS.
+ */
+ delta = max_t(s64, delay, 10000LL);
+ time = ktime_add_ns(timer->base->get_time(), delta);
+
+ hrtimer_set_expires(timer, time);
+
+ if (rq == this_rq()) {
+ __hrtick_restart(rq);
+ } else if (!rq->hrtick_csd_pending) {
+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
+ rq->hrtick_csd_pending = 1;
+ }
+}
+
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ delay = max_t(u64, delay, 10000LL);
+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
+}
+#endif /* CONFIG_SMP */
+
+static void hrtick_rq_init(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ rq->hrtick_csd_pending = 0;
+
+ rq->hrtick_csd.flags = 0;
+ rq->hrtick_csd.func = __hrtick_start;
+ rq->hrtick_csd.info = rq;
+#endif
+
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrtick_timer.function = hrtick;
+}
+#else /* CONFIG_SCHED_HRTICK */
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void hrtick_rq_init(struct rq *rq)
+{
+}
+#endif /* CONFIG_SCHED_HRTICK */
+
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _old, _val = *_ptr; \
+ \
+ for (;;) { \
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
+ if (_old == _val) \
+ break; \
+ _val = _old; \
+ } \
+ _old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * its already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * In order to ensure that a pending wakeup will observe our pending
+ * state, even in the failed case, an explicit smp_mb() must be used.
+ */
+ smp_mb__before_atomic();
+ if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* Task can safely be re-inserted now: */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() executes a full barrier, which pairs with
+ * the queueing in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
+/*
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_curr(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ int cpu;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (test_tsk_need_resched(curr))
+ return;
+
+ cpu = cpu_of(rq);
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(curr);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(curr))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * In the semi idle case, use the nearest busy CPU for migrating timers
+ * from an idle CPU. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle CPU will add more delays to the timers than intended
+ * (as that CPU's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int i, cpu = smp_processor_id();
+ struct sched_domain *sd;
+
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ return cpu;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
+ }
+
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+unlock:
+ rcu_read_unlock();
+ return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+static void wake_up_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (cpu == smp_processor_id())
+ return;
+
+ if (set_nr_and_not_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
+ if (cpu_is_offline(cpu))
+ return true; /* Don't try to wake offline CPUs. */
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ tick_nohz_full_kick_cpu(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Wake up the specified CPU. If the CPU is going offline, it is the
+ * caller's responsibility to deal with the lost wakeup, for example,
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
+ */
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+
+static inline bool got_nohz_idle_kick(void)
+{
+ int cpu = smp_processor_id();
+
+ if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle_cpu(cpu) && !need_resched())
+ return true;
+
+ /*
+ * We can't run Idle Load Balance on this CPU for this time so we
+ * cancel it and clear NOHZ_BALANCE_KICK
+ */
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ return false;
+}
+
+#else /* CONFIG_NO_HZ_COMMON */
+
+static inline bool got_nohz_idle_kick(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(struct rq *rq)
+{
+ int fifo_nr_running;
+
+ /* Deadline tasks, even if single, need the tick */
+ if (rq->dl.dl_nr_running)
+ return false;
+
+ /*
+ * If there are more than one RR tasks, we need the tick to effect the
+ * actual RR behaviour.
+ */
+ if (rq->rt.rr_nr_running) {
+ if (rq->rt.rr_nr_running == 1)
+ return true;
+ else
+ return false;
+ }
+
+ /*
+ * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
+ * forced preemption between FIFO tasks.
+ */
+ fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ if (fifo_nr_running)
+ return true;
+
+ /*
+ * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
+ * if there's more than one we need the tick for involuntary
+ * preemption.
+ */
+ if (rq->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
+#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+/*
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data)
+{
+ struct task_group *parent, *child;
+ int ret;
+
+ parent = from;
+
+down:
+ ret = (*down)(parent, data);
+ if (ret)
+ goto out;
+ list_for_each_entry_rcu(child, &parent->children, siblings) {
+ parent = child;
+ goto down;
+
+up:
+ continue;
+ }
+ ret = (*up)(parent, data);
+ if (ret || parent == from)
+ goto out;
+
+ child = parent;
+ parent = parent->parent;
+ if (parent)
+ goto up;
+out:
+ return ret;
+}
+
+int tg_nop(struct task_group *tg, void *data)
+{
+ return 0;
+}
+#endif
+
+static void set_load_weight(struct task_struct *p, bool update_load)
+{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
+ /*
+ * SCHED_IDLE tasks get minimal weight:
+ */
+ if (idle_policy(p->policy)) {
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
+ return;
+ }
+
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load && p->sched_class == &fair_sched_class) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
+}
+
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (!(flags & ENQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
+
+ p->sched_class->enqueue_task(rq, p, flags);
+}
+
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (!(flags & DEQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(rq, p);
+
+ p->sched_class->dequeue_task(rq, p, flags);
+}
+
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible--;
+
+ enqueue_task(rq, p, flags);
+}
+
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible++;
+
+ dequeue_task(rq, p, flags);
+}
+
+/*
+ * __normal_prio - return the priority that is based on the static prio
+ */
+static inline int __normal_prio(struct task_struct *p)
+{
+ return p->static_prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ int prio;
+
+ if (task_has_dl_policy(p))
+ prio = MAX_DL_PRIO-1;
+ else if (task_has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+/*
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
+ */
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio)
+{
+ if (prev_class != p->sched_class) {
+ if (prev_class->switched_from)
+ prev_class->switched_from(rq, p);
+
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio || dl_task(p))
+ p->sched_class->prio_changed(rq, p, oldprio);
+}
+
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+ const struct sched_class *class;
+
+ if (p->sched_class == rq->curr->sched_class) {
+ rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ } else {
+ for_each_class(class) {
+ if (class == rq->curr->sched_class)
+ break;
+ if (class == p->sched_class) {
+ resched_curr(rq);
+ break;
+ }
+ }
+ }
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
+ rq_clock_skip_update(rq);
+}
+
+#ifdef CONFIG_SMP
+
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+
+/*
+ * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return false;
+
+ if (is_per_cpu_kthread(p))
+ return cpu_online(cpu);
+
+ return cpu_active(cpu);
+}
+
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int new_cpu)
+{
+ lockdep_assert_held(&rq->lock);
+
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+ set_task_cpu(p, new_cpu);
+ rq_unlock(rq, rf);
+
+ rq = cpu_rq(new_cpu);
+
+ rq_lock(rq, rf);
+ BUG_ON(task_cpu(p) != new_cpu);
+ enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this CPU, onto the destination CPU. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int dest_cpu)
+{
+ /* Affinity changed (again). */
+ if (!is_cpu_allowed(p, dest_cpu))
+ return rq;
+
+ update_rq_clock(rq);
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+ struct task_struct *p = arg->task;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+
+ /*
+ * The original target CPU might have gone down and we might
+ * be on another CPU but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+
+ raw_spin_lock(&p->pi_lock);
+ rq_lock(rq, &rf);
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq) {
+ if (task_on_rq_queued(p))
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ else
+ p->wake_cpu = arg->dest_cpu;
+ }
+ rq_unlock(rq, &rf);
+ raw_spin_unlock(&p->pi_lock);
+
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ }
+ if (running)
+ put_prev_task(rq, p);
+
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ if (running)
+ set_curr_task(rq, p);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * For kernel threads that do indeed end up on online &&
+ * !active we want to ensure they are strict per-CPU threads.
+ */
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
+ p->nr_cpus_allowed != 1);
+ }
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p)) {
+ /*
+ * OK, since we're going to drop the lock immediately
+ * afterwards anyway.
+ */
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
+ }
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * We should never call set_task_cpu() on a blocked task,
+ * ttwu() will sort out the placement.
+ */
+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
+ !p->on_rq);
+
+ /*
+ * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase migrating task's wait_start
+ * time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * sched_move_task() holds both and thus holding either pins the cgroup,
+ * see task_group().
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
+#endif
+
+ trace_sched_migrate_task(p, new_cpu);
+
+ if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
+ p->se.nr_migrations++;
+ rseq_migrate(p);
+ perf_event_task_migrate(p);
+ }
+
+ __set_task_cpu(p, new_cpu);
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+ if (task_on_rq_queued(p)) {
+ struct rq *src_rq, *dst_rq;
+ struct rq_flags srf, drf;
+
+ src_rq = task_rq(p);
+ dst_rq = cpu_rq(cpu);
+
+ rq_pin_lock(src_rq, &srf);
+ rq_pin_lock(dst_rq, &drf);
+
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu);
+ activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ check_preempt_curr(dst_rq, p, 0);
+
+ rq_unpin_lock(dst_rq, &drf);
+ rq_unpin_lock(src_rq, &srf);
+
+ } else {
+ /*
+ * Task isn't running anymore; make it appear like we migrated
+ * it before it went to sleep. This means on wakeup we make the
+ * previous CPU our target instead of where it really is.
+ */
+ p->wake_cpu = cpu;
+ }
+}
+
+struct migration_swap_arg {
+ struct task_struct *src_task, *dst_task;
+ int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+ struct migration_swap_arg *arg = data;
+ struct rq *src_rq, *dst_rq;
+ int ret = -EAGAIN;
+
+ if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+ return -EAGAIN;
+
+ src_rq = cpu_rq(arg->src_cpu);
+ dst_rq = cpu_rq(arg->dst_cpu);
+
+ double_raw_lock(&arg->src_task->pi_lock,
+ &arg->dst_task->pi_lock);
+ double_rq_lock(src_rq, dst_rq);
+
+ if (task_cpu(arg->dst_task) != arg->dst_cpu)
+ goto unlock;
+
+ if (task_cpu(arg->src_task) != arg->src_cpu)
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+ goto unlock;
+
+ __migrate_swap_task(arg->src_task, arg->dst_cpu);
+ __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+ ret = 0;
+
+unlock:
+ double_rq_unlock(src_rq, dst_rq);
+ raw_spin_unlock(&arg->dst_task->pi_lock);
+ raw_spin_unlock(&arg->src_task->pi_lock);
+
+ return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p,
+ int target_cpu, int curr_cpu)
+{
+ struct migration_swap_arg arg;
+ int ret = -EINVAL;
+
+ arg = (struct migration_swap_arg){
+ .src_task = cur,
+ .src_cpu = curr_cpu,
+ .dst_task = p,
+ .dst_cpu = target_cpu,
+ };
+
+ if (arg.src_cpu == arg.dst_cpu)
+ goto out;
+
+ /*
+ * These three tests are all lockless; this is OK since all of them
+ * will be re-checked with proper locks held further down the line.
+ */
+ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+ goto out;
+
+ trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
+ ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+ return ret;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+ int running, queued;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_running()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_running(rq, p)) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_running(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
+ smp_send_reschedule(cpu);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+
+/*
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ * - cpu_active must be a subset of cpu_online
+ *
+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
+ * see __set_cpus_allowed_ptr(). At this point the newly online
+ * CPU isn't yet part of the sched domains, and balancing will not
+ * see it.
+ *
+ * - on CPU-down we clear cpu_active() to mask the sched domains and
+ * avoid the load balancer to place new tasks on the to be removed
+ * CPU. Existing tasks will remain running there and will be taken
+ * off.
+ *
+ * This means that fallback selection must not select !active CPUs.
+ * And can assume that any active CPU must be online. Conversely
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
+ */
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
+ enum { cpuset, possible, fail } state = cpuset;
+ int dest_cpu;
+
+ /*
+ * If the node that the CPU is on has been offlined, cpu_to_node()
+ * will return -1. There is no CPU on the node, and we should
+ * select the CPU on the other node.
+ */
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ return dest_cpu;
+ }
+ }
+
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu(dest_cpu, &p->cpus_allowed) {
+ if (!is_cpu_allowed(p, dest_cpu))
+ continue;
+
+ goto out;
+ }
+
+ /* No more Mr. Nice Guy. */
+ switch (state) {
+ case cpuset:
+ if (IS_ENABLED(CONFIG_CPUSETS)) {
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+ }
+ /* Fall-through */
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
+ }
+
+ return dest_cpu;
+}
+
+/*
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ */
+static inline
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+{
+ lockdep_assert_held(&p->pi_lock);
+
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ else
+ cpu = cpumask_any(&p->cpus_allowed);
+
+ /*
+ * In order not to call set_task_cpu() on a blocking task we need
+ * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * CPU.
+ *
+ * Since this is common to all placement strategies, this lives here.
+ *
+ * [ this allows ->select_task() to simply return task_cpu(p) and
+ * not worry about this generic constraint ]
+ */
+ if (unlikely(!is_cpu_allowed(p, cpu)))
+ cpu = select_fallback_rq(task_cpu(p), p);
+
+ return cpu;
+}
+
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
+#endif /* CONFIG_SMP */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq;
+
+ if (!schedstat_enabled())
+ return;
+
+ rq = this_rq();
+
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu) {
+ __schedstat_inc(rq->ttwu_local);
+ __schedstat_inc(p->se.statistics.nr_wakeups_local);
+ } else {
+ struct sched_domain *sd;
+
+ __schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ rcu_read_lock();
+ for_each_domain(rq->cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ __schedstat_inc(sd->ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (wake_flags & WF_MIGRATED)
+ __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+#endif /* CONFIG_SMP */
+
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->se.statistics.nr_wakeups);
+
+ if (wake_flags & WF_SYNC)
+ __schedstat_inc(p->se.statistics.nr_wakeups_sync);
+}
+
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
+ activate_task(rq, p, en_flags);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+
+ /* If a worker is waking up, notify the workqueue: */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ check_preempt_curr(rq, p, wake_flags);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken) {
+ /*
+ * Our task @p is fully woken up and running; so its safe to
+ * drop the rq->lock, hereafter rq is only used for statistics.
+ */
+ rq_unpin_lock(rq, rf);
+ p->sched_class->task_woken(rq, p);
+ rq_repin_lock(rq, rf);
+ }
+
+ if (rq->idle_stamp) {
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
+
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
+ rq->avg_idle = max;
+
+ rq->idle_stamp = 0;
+ }
+#endif
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_held(&rq->lock);
+
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+#endif
+
+ ttwu_activate(rq, p, en_flags);
+ ttwu_do_wakeup(rq, p, wake_flags, rf);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p, &rf);
+ if (task_on_rq_queued(p)) {
+ /* check_preempt_curr() may use rq clock */
+ update_rq_clock(rq);
+ ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ ret = 1;
+ }
+ __task_rq_unlock(rq, &rf);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct task_struct *p, *t;
+ struct rq_flags rf;
+
+ if (!llist)
+ return;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ llist_for_each_entry_safe(p, t, llist, wake_entry)
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+void scheduler_ipi(void)
+{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_pending();
+
+ /*
+ * Check if someone kicked us for doing the nohz idle load balance.
+ */
+ if (unlikely(got_nohz_idle_kick())) {
+ this_rq()->idle_balance = 1;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
+ irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ rq_lock_irqsave(rq, &rf);
+ if (is_idle_task(rq->curr))
+ smp_send_reschedule(cpu);
+ /* Else CPU is not idle, do nothing here: */
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+#if defined(CONFIG_SMP)
+ if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ ttwu_queue_remote(p, cpu, wake_flags);
+ return;
+ }
+#endif
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
+ rq_unlock(rq, &rf);
+}
+
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
+ * execution on its new CPU [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Release/acquire chaining guarantees that B happens after A and C after B.
+ * Note: the CPU doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0)
+ * 2) smp_cond_load_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must ensure that CONDITION=1 done by the caller can not be reordered with
+ * accesses to the task state; see try_to_wake_up() and set_current_state().
+ */
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * Atomic against schedule() which would dequeue a task, also see
+ * set_current_state().
+ *
+ * This function executes a full memory barrier before accessing the task
+ * state; see set_current_state().
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ * %false otherwise.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+ unsigned long flags;
+ int cpu, success = 0;
+
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ smp_mb__after_spinlock();
+ if (!(p->state & state))
+ goto out;
+
+ trace_sched_waking(p);
+
+ /* We're going to change ->state: */
+ success = 1;
+ cpu = task_cpu(p);
+
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ */
+ smp_rmb();
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
+
+#ifdef CONFIG_SMP
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ */
+ smp_rmb();
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ if (task_cpu(p) != cpu) {
+ wake_flags |= WF_MIGRATED;
+ set_task_cpu(p, cpu);
+ }
+
+#else /* CONFIG_SMP */
+
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+#endif /* CONFIG_SMP */
+
+ ttwu_queue(p, cpu, wake_flags);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
+out:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ * @rf: request-queue flags for pinning
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.
+ */
+static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
+{
+ struct rq *rq = task_rq(p);
+
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ rq_unlock(rq, rf);
+ raw_spin_lock(&p->pi_lock);
+ rq_relock(rq, rf);
+ }
+
+ if (!(p->state & TASK_NORMAL))
+ goto out;
+
+ trace_sched_waking(p);
+
+ if (!task_on_rq_queued(p)) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&rq->nr_iowait);
+ }
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
+ }
+
+ ttwu_do_wakeup(rq, p, 0, rf);
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * This function executes a full memory barrier before accessing the task state.
+ */
+int wake_up_process(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ *
+ * __sched_fork() is basic setup used by init_idle() too:
+ */
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+{
+ p->on_rq = 0;
+
+ p->se.on_rq = 0;
+ p->se.exec_start = 0;
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+#endif
+
+ RB_CLEAR_NODE(&p->dl.rb_node);
+ init_dl_task_timer(&p->dl);
+ init_dl_inactive_task_timer(&p->dl);
+ __dl_clear_params(p);
+
+ INIT_LIST_HEAD(&p->rt.run_list);
+ p->rt.timeout = 0;
+ p->rt.time_slice = sched_rr_timeslice;
+ p->rt.on_rq = 0;
+ p->rt.on_list = 0;
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
+ init_numa_balancing(clone_flags, p);
+}
+
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
+#ifdef CONFIG_NUMA_BALANCING
+
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_numa_balancing);
+ else
+ static_branch_disable(&sched_numa_balancing);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_numa_balancing);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_numabalancing_state(state);
+ return err;
+}
+#endif
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
+
+static void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ /*
+ * This code is called before jump labels have been set up, so we can't
+ * change the static branch directly just yet. Instead set a temporary
+ * variable so init_schedstats() can do it later.
+ */
+ if (!strcmp(str, "enable")) {
+ __sched_schedstats = true;
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ __sched_schedstats = false;
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+static void __init init_schedstats(void)
+{
+ set_schedstats(__sched_schedstats);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+#else /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
+
+/*
+ * fork()/clone()-time setup:
+ */
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
+{
+ unsigned long flags;
+
+ __sched_fork(clone_flags, p);
+ /*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child.
+ */
+ p->prio = current->normal_prio;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->policy = SCHED_NORMAL;
+ p->static_prio = NICE_TO_PRIO(0);
+ p->rt_priority = 0;
+ } else if (PRIO_TO_NICE(p->static_prio) < 0)
+ p->static_prio = NICE_TO_PRIO(0);
+
+ p->prio = p->normal_prio = __normal_prio(p);
+ set_load_weight(p, false);
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
+ if (dl_prio(p->prio))
+ return -EAGAIN;
+ else if (rt_prio(p->prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ init_entity_runnable_average(&p->se);
+
+ /*
+ * The child is not yet in the pid-hash so no cgroup attach races,
+ * and the cgroup is pinned to this child due to cgroup_fork()
+ * is ran before sched_fork().
+ *
+ * Silence PROVE_RCU.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ rseq_migrate(p);
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+#ifdef CONFIG_SCHED_INFO
+ if (likely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
+#endif
+ init_task_preempt_count(p);
+#ifdef CONFIG_SMP
+ plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
+#endif
+ return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return BW_UNIT;
+
+ /*
+ * Doing this here saves a lot of checks in all
+ * the calling paths, and returning zero seems
+ * safe for them anyway.
+ */
+ if (period == 0)
+ return 0;
+
+ return div64_u64(runtime << BW_SHIFT, period);
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+ /*
+ * Fork balancing, do it here and not earlier because:
+ * - cpus_allowed can change in the fork path
+ * - any previously selected CPU might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
+ */
+ p->recent_used_cpu = task_cpu(p);
+ rseq_migrate(p);
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+#endif
+ rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ post_init_entity_util_avg(&p->se);
+
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ trace_sched_wakeup_new(p);
+ check_preempt_curr(rq, p, WF_FORK);
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken) {
+ /*
+ * Nothing relies on rq->lock after this, so its fine to
+ * drop it.
+ */
+ rq_unpin_lock(rq, &rf);
+ p->sched_class->task_woken(rq, p);
+ rq_repin_lock(rq, &rf);
+ }
+#endif
+ task_rq_unlock(rq, p, &rf);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
+
+void preempt_notifier_inc(void)
+{
+ static_branch_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_branch_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ if (!static_branch_unlikely(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is *not* safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_branch_unlikely(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
+static void
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_branch_unlikely(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void prepare_task(struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Claim the task as running, we do this before switching to it
+ * such that any running task will have this set.
+ */
+ next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
+{
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ rq_unpin_lock(rq, rf);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = next;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+/*
+ * NOP if the arch has not defined these:
+ */
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ kcov_prepare_switch(prev);
+ sched_info_switch(rq, prev, next);
+ perf_event_task_sched_out(prev, next);
+ rseq_preempt(prev);
+ fire_sched_out_preempt_notifiers(prev, next);
+ prepare_task(next);
+ prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static struct rq *finish_task_switch(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+ long prev_state;
+
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(&rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
+ rq->prev_mm = NULL;
+
+ /*
+ * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_task), otherwise a concurrent wakeup can get prev
+ * running on another CPU and we could rave with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+ perf_event_task_sched_in(prev, current);
+ finish_task(prev);
+ finish_lock_switch(rq);
+ finish_arch_post_lock_switch();
+ kcov_finish_switch(current);
+
+ fire_sched_in_preempt_notifiers(current);
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+ * schedule between user->kernel->user threads without passing though
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
+ mmdrop(mm);
+ }
+ if (unlikely(prev_state == TASK_DEAD)) {
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
+
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+
+ /* Task is done with its stack. */
+ put_task_stack(prev);
+
+ put_task_struct(prev);
+ }
+
+ tick_nohz_task_switch();
+ return rq;
+}
+
+#ifdef CONFIG_SMP
+
+/* rq->lock is NOT held, but preemption is disabled */
+static void __balance_callback(struct rq *rq)
+{
+ struct callback_head *head, *next;
+ void (*func)(struct rq *rq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ head = rq->balance_callback;
+ rq->balance_callback = NULL;
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+ if (unlikely(rq->balance_callback))
+ __balance_callback(rq);
+}
+
+#else
+
+static inline void balance_callback(struct rq *rq)
+{
+}
+
+#endif
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq;
+
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
+ rq = finish_task_switch(prev);
+ balance_callback(rq);
+ preempt_enable();
+
+ if (current->set_child_tid)
+ put_user(task_pid_vnr(current), current->set_child_tid);
+
+ calculate_sigpending();
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next, struct rq_flags *rf)
+{
+ struct mm_struct *mm, *oldmm;
+
+ prepare_task_switch(rq, prev, next);
+
+ mm = next->mm;
+ oldmm = prev->active_mm;
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_start_context_switch(prev);
+
+ /*
+ * If mm is non-NULL, we pass through switch_mm(). If mm is
+ * NULL, we will pass through mmdrop() in finish_task_switch().
+ * Both of these contain the full memory barrier required by
+ * membarrier after storing to rq->curr, before returning to
+ * user-space.
+ */
+ if (!mm) {
+ next->active_mm = oldmm;
+ mmgrab(oldmm);
+ enter_lazy_tlb(oldmm, next);
+ } else
+ switch_mm_irqs_off(oldmm, mm, next);
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+ rq->prev_mm = oldmm;
+ }
+
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
+ prepare_lock_switch(rq, next, rf);
+
+ /* Here we just switch the register state and the stack. */
+ switch_to(prev, next, prev);
+ barrier();
+
+ return finish_task_switch(prev);
+}
+
+/*
+ * nr_running and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_running;
+
+ return sum;
+}
+
+/*
+ * Check if only the current task is running on the CPU.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
+ */
+bool single_task_running(void)
+{
+ return raw_rq()->nr_running == 1;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+ int i;
+ unsigned long long sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += cpu_rq(i)->nr_switches;
+
+ return sum;
+}
+
+/*
+ * IO-wait accounting, and how its mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait account is to account the idle time that we could
+ * have spend running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've ran at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
+unsigned long nr_iowait(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+ return sum;
+}
+
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor are using nonsensical data. Boosting frequency for a CPU that has
+ * IO-wait which might not even end up running the task when it does become
+ * runnable.
+ */
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ struct rq *this = cpu_rq(cpu);
+ return atomic_read(&this->nr_iowait);
+}
+
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
+ */
+void sched_exec(void)
+{
+ struct task_struct *p = current;
+ unsigned long flags;
+ int dest_cpu;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ if (dest_cpu == smp_processor_id())
+ goto unlock;
+
+ if (likely(cpu_active(dest_cpu))) {
+ struct migration_arg arg = { p, dest_cpu };
+
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+ return;
+ }
+unlock:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64-bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
+ * If we race with it entering CPU, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
+ */
+ if (!p->on_cpu || !task_on_rq_queued(p))
+ return p->se.sum_exec_runtime;
+#endif
+
+ rq = task_rq_lock(p, &rf);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
+ task_rq_unlock(rq, p, &rf);
+
+ return ns;
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr = rq->curr;
+ struct rq_flags rf;
+
+ sched_clock_tick();
+
+ rq_lock(rq, &rf);
+
+ update_rq_clock(rq);
+ curr->sched_class->task_tick(rq, curr, 0);
+ cpu_load_update_active(rq);
+ calc_global_load_tick(rq);
+
+ rq_unlock(rq, &rf);
+
+ perf_event_task_tick();
+
+#ifdef CONFIG_SMP
+ rq->idle_balance = idle_cpu(cpu);
+ trigger_load_balance(rq);
+#endif
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+
+struct tick_work {
+ int cpu;
+ atomic_t state;
+ struct delayed_work work;
+};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tick_work *twork = container_of(dwork, struct tick_work, work);
+ int cpu = twork->cpu;
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr;
+ struct rq_flags rf;
+ u64 delta;
+ int os;
+
+ /*
+ * Handle the tick only if it appears the remote CPU is running in full
+ * dynticks mode. The check is racy by nature, but missing a tick or
+ * having one too much is no big deal because the scheduler tick updates
+ * statistics and checks timeslices in a time-independent way, regardless
+ * of when exactly it is running.
+ */
+ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+ goto out_requeue;
+
+ rq_lock_irq(rq, &rf);
+ curr = rq->curr;
+ if (is_idle_task(curr) || cpu_is_offline(cpu))
+ goto out_unlock;
+
+ update_rq_clock(rq);
+ delta = rq_clock_task(rq) - curr->se.exec_start;
+
+ /*
+ * Make sure the next tick runs within a reasonable
+ * amount of time.
+ */
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+ rq_unlock_irq(rq, &rf);
+
+out_requeue:
+ /*
+ * Run the remote tick once per second (1Hz). This arbitrary
+ * frequency is large enough to avoid overload but short enough
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
+ */
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+ int os;
+ struct tick_work *twork;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+ struct tick_work *twork;
+ int os;
+
+ if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+ tick_work_cpu = alloc_percpu(struct tick_work);
+ BUG_ON(!tick_work_cpu);
+ return 0;
+}
+
+#else /* !CONFIG_NO_HZ_FULL */
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
+#endif
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_TRACE_PREEMPT_TOGGLE))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+ if (preempt_count() == val) {
+ unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
+
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ __preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ preempt_latency_start(val);
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ preempt_latency_stop(val);
+ __preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
+#endif
+
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ return p->preempt_disable_ip;
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
+ if (oops_in_progress)
+ return;
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ print_modules();
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(preempt_disable_ip);
+ pr_cont("\n");
+ }
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
+#endif
+
+ if (unlikely(in_atomic_preempt_off())) {
+ __schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
+ rcu_sleep_check();
+
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+ schedstat_inc(this_rq()->sched_count);
+}
+
+/*
+ * Pick up the highest-prio task:
+ */
+static inline struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ const struct sched_class *class;
+ struct task_struct *p;
+
+ /*
+ * Optimization: we know that if all tasks are in the fair class we can
+ * call that function directly, but only if the @prev task wasn't of a
+ * higher scheduling class, because otherwise those loose the
+ * opportunity to pull in more work from other CPUs.
+ */
+ if (likely((prev->sched_class == &idle_sched_class ||
+ prev->sched_class == &fair_sched_class) &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+
+ p = fair_sched_class.pick_next_task(rq, prev, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* Assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev, rf);
+
+ return p;
+ }
+
+again:
+ for_each_class(class) {
+ p = class->pick_next_task(rq, prev, rf);
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+ return p;
+ }
+ }
+
+ /* The idle class should always have a runnable task: */
+ BUG();
+}
+
+/*
+ * __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
+ *
+ * WARNING: must be called with preemption disabled!
+ */
+static void __sched notrace __schedule(bool preempt)
+{
+ struct task_struct *prev, *next;
+ unsigned long *switch_count;
+ struct rq_flags rf;
+ struct rq *rq;
+ int cpu;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ prev = rq->curr;
+
+ schedule_debug(prev);
+
+ if (sched_feat(HRTICK))
+ hrtick_clear(rq);
+
+ local_irq_disable();
+ rcu_note_context_switch(preempt);
+
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ *
+ * The membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
+ */
+ rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
+
+ /* Promote REQ to ACT */
+ rq->clock_update_flags <<= 1;
+ update_rq_clock(rq);
+
+ switch_count = &prev->nivcsw;
+ if (!preempt && prev->state) {
+ if (unlikely(signal_pending_state(prev->state, prev))) {
+ prev->state = TASK_RUNNING;
+ } else {
+ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+ prev->on_rq = 0;
+
+ if (prev->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+
+ /*
+ * If a worker went to sleep, notify and ask workqueue
+ * whether it wants to wake up a task to maintain
+ * concurrency.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev);
+ if (to_wakeup)
+ try_to_wake_up_local(to_wakeup, &rf);
+ }
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ next = pick_next_task(rq, prev, &rf);
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+
+ if (likely(prev != next)) {
+ rq->nr_switches++;
+ rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
+ */
+ ++*switch_count;
+
+ trace_sched_switch(preempt, prev, next);
+
+ /* Also unlocks the rq: */
+ rq = context_switch(rq, prev, next, &rf);
+ } else {
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+ rq_unlock_irq(rq, &rf);
+ }
+
+ balance_callback(rq);
+}
+
+void __noreturn do_task_dead(void)
+{
+ /* Causes final put_task_struct in finish_task_switch(): */
+ set_special_state(TASK_DEAD);
+
+ /* Tell freezer to ignore us: */
+ current->flags |= PF_NOFREEZE;
+
+ __schedule(false);
+ BUG();
+
+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
+ for (;;)
+ cpu_relax();
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ if (!tsk->state || tsk_is_pi_blocked(tsk))
+ return;
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ do {
+ preempt_disable();
+ __schedule(false);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+EXPORT_SYMBOL(schedule);
+
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (schedule out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disable() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+ /*
+ * As this skips calling sched_submit_work(), which the idle task does
+ * regardless because that function is a nop when the task is in a
+ * TASK_RUNNING state, make sure this isn't used someplace that the
+ * current task can be in any other state. Note, idle is always in the
+ * TASK_RUNNING state.
+ */
+ WARN_ON_ONCE(current->state);
+ do {
+ __schedule(false);
+ } while (need_resched());
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
+{
+ /*
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * too frequently to make sense yet.
+ */
+ enum ctx_state prev_state = exception_enter();
+ schedule();
+ exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ __schedule(true);
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ } while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(!preemptible()))
+ return;
+
+ preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+/**
+ * preempt_schedule_notrace - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule(true);
+ exception_exit(prev_ctx);
+
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+ enum ctx_state prev_state;
+
+ /* Catch callers which need to be fixed */
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
+
+ do {
+ preempt_disable();
+ local_irq_enable();
+ __schedule(true);
+ local_irq_disable();
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+
+ exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
+ void *key)
+{
+ return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task to boost
+ * @pi_task: donor task
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
+{
+ int prio, oldprio, queued, running, queue_flag =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ const struct sched_class *prev_class;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
+
+ rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that there is loads of tricky to make this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guaratees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
+
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
+ trace_sched_pi_setprio(p, pi_task);
+ oldprio = p->prio;
+
+ if (oldprio == prio)
+ queue_flag &= ~DEQUEUE_MOVE;
+
+ prev_class = p->sched_class;
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, queue_flag);
+ if (running)
+ put_prev_task(rq, p);
+
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.dl_boosted = 1;
+ queue_flag |= ENQUEUE_REPLENISH;
+ } else
+ p->dl.dl_boosted = 0;
+ p->sched_class = &dl_sched_class;
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (oldprio < prio)
+ queue_flag |= ENQUEUE_HEAD;
+ p->sched_class = &rt_sched_class;
+ } else {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
+ p->sched_class = &fair_sched_class;
+ }
+
+ p->prio = prio;
+
+ if (queued)
+ enqueue_task(rq, p, queue_flag);
+ if (running)
+ set_curr_task(rq, p);
+
+ check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
+ /* Avoid rq from going away on us: */
+ preempt_disable();
+ __task_rq_unlock(rq, &rf);
+
+ balance_callback(rq);
+ preempt_enable();
+}
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ bool queued, running;
+ int old_prio, delta;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it wont have any effect on scheduling until the task is
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
+ */
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ goto out_unlock;
+ }
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ if (running)
+ put_prev_task(rq, p);
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
+
+ if (queued) {
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ resched_curr(rq);
+ }
+ if (running)
+ set_curr_task(rq, p);
+out_unlock:
+ task_rq_unlock(rq, p, &rf);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+ capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * idle_cpu - is a given CPU idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (!llist_empty(&rq->wake_list))
+ return 0;
+#endif
+
+ return 1;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the CPU @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == SETPARAM_POLICY)
+ policy = p->policy;
+
+ p->policy = policy;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+ p->normal_prio = normal_prio(p);
+ set_load_weight(p, true);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr, bool keep_boost)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
+ */
+ p->prio = normal_prio(p);
+ if (keep_boost)
+ p->prio = rt_effective_prio(p, p->prio);
+
+ if (dl_prio(p->prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(p->prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+}
+
+/*
+ * Check the target process has a UID that matches the current process's:
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+ rcu_read_unlock();
+ return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user, bool pi)
+{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
+ int retval, oldprio, oldpolicy = -1, queued, running;
+ int new_effective_prio, policy = attr->sched_policy;
+ const struct sched_class *prev_class;
+ struct rq_flags rf;
+ int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct rq *rq;
+
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
+recheck:
+ /* Double check policy once rq lock held: */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+
+ if (!valid_policy(policy))
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
+ */
+ if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+ return -EINVAL;
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (user && !capable(CAP_SYS_NICE)) {
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !can_nice(p, attr->sched_nice))
+ return -EPERM;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ return -EPERM;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (idle_policy(p->policy) && !idle_policy(policy)) {
+ if (!can_nice(p, task_nice(p)))
+ return -EPERM;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
+ }
+
+ if (user) {
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * Make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * Changing the policy of the stop threads its a very bad idea:
+ */
+ if (p == rq->stop) {
+ task_rq_unlock(rq, p, &rf);
+ return -EINVAL;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+ */
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy) && dl_param_changed(p, attr))
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ task_rq_unlock(rq, p, &rf);
+ return 0;
+ }
+change:
+
+ if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
+ task_rq_unlock(rq, p, &rf);
+ return -EPERM;
+ }
+#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, &p->cpus_allowed) ||
+ rq->rd->dl_bw.bw == 0) {
+ task_rq_unlock(rq, p, &rf);
+ return -EPERM;
+ }
+ }
+#endif
+ }
+
+ /* Re-check policy now with rq lock held: */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &rf);
+ goto recheck;
+ }
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
+ task_rq_unlock(rq, p, &rf);
+ return -EBUSY;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ new_effective_prio = rt_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, queue_flags);
+ if (running)
+ put_prev_task(rq, p);
+
+ prev_class = p->sched_class;
+ __setscheduler(rq, p, attr, pi);
+
+ if (queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
+
+ enqueue_task(rq, p, queue_flags);
+ }
+ if (running)
+ set_curr_task(rq, p);
+
+ check_class_changed(rq, p, prev_class, oldprio);
+
+ /* Avoid rq from going away on us: */
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+
+ if (pi)
+ rt_mutex_adjust_pi(p);
+
+ /* Run balance callbacks after we've adjusted the PI chain: */
+ balance_callback(rq);
+ preempt_enable();
+
+ return 0;
+}
+
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check, true);
+}
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, false);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /* Zero the full structure, so that a short copy will be nice: */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ /* Bail out on silly large: */
+ if (size > PAGE_SIZE)
+ goto err_size;
+
+ /* ABI compatibility quirk: */
+ if (!size)
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+{
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy
+ | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ }
+ rcu_read_unlock();
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ rcu_read_unlock();
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ return -EFBIG;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, &attr);
+ else if (task_has_rt_policy(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ cpumask_var_t cpus_allowed, new_mask;
+ struct task_struct *p;
+ int retval;
+
+ rcu_read_lock();
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ /* Prevent p going away */
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+ retval = -EPERM;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_free_new_mask;
+ }
+ rcu_read_unlock();
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ goto out_free_new_mask;
+
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+#ifdef CONFIG_SMP
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
+ retval = -EBUSY;
+ rcu_read_unlock();
+ goto out_free_new_mask;
+ }
+ rcu_read_unlock();
+ }
+#endif
+again:
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+ }
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+out_put_task:
+ put_task_struct(p);
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ struct cpumask *new_mask)
+{
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+ struct task_struct *p;
+ unsigned long flags;
+ int retval;
+
+ rcu_read_lock();
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
+ *
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ unsigned int retlen = min(len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+static void do_sched_yield(void)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, &rf);
+
+ schedstat_inc(rq->yld_count);
+ current->sched_class->yield_task(rq);
+
+ preempt_disable();
+ rq_unlock_irq(rq, &rf);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+}
+
+SYSCALL_DEFINE0(sched_yield)
+{
+ do_sched_yield();
+ return 0;
+}
+
+#ifndef CONFIG_PREEMPT
+int __sched _cond_resched(void)
+{
+ if (should_resched(0)) {
+ preempt_schedule_common();
+ return 1;
+ }
+ rcu_all_qs();
+ return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+#endif
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ do_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ int yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
+ double_rq_lock(rq, p_rq);
+ if (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out_unlock;
+
+ if (curr->sched_class != p->sched_class)
+ goto out_unlock;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out_unlock;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity takes care of
+ * fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
+
+out_unlock:
+ double_rq_unlock(rq, p_rq);
+out_irq:
+ local_irq_restore(flags);
+
+ if (yielded > 0)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+int io_schedule_prepare(void)
+{
+ int old_iowait = current->in_iowait;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ return old_iowait;
+}
+
+void io_schedule_finish(int token)
+{
+ current->in_iowait = token;
+}
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ */
+long __sched io_schedule_timeout(long timeout)
+{
+ int token;
+ long ret;
+
+ token = io_schedule_prepare();
+ ret = schedule_timeout(timeout);
+ io_schedule_finish(token);
+
+ return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
+
+void __sched io_schedule(void)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ schedule();
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(io_schedule);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_USER_RT_PRIO-1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ }
+ return ret;
+}
+
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
+{
+ struct task_struct *p;
+ unsigned int time_slice;
+ struct rq_flags rf;
+ struct rq *rq;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ rq = task_rq_lock(p, &rf);
+ time_slice = 0;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ task_rq_unlock(rq, p, &rf);
+
+ rcu_read_unlock();
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = compat_put_timespec64(&t, interval);
+ return retval;
+}
+#endif
+
+void sched_show_task(struct task_struct *p)
+{
+ unsigned long free = 0;
+ int ppid;
+
+ if (!try_get_task_stack(p))
+ return;
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
+ printk(KERN_CONT " running task ");
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ free = stack_not_used(p);
+#endif
+ ppid = 0;
+ rcu_read_lock();
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), ppid,
+ (unsigned long)task_thread_info(p)->flags);
+
+ print_worker_info(KERN_INFO, p);
+ show_stack(p, NULL);
+ put_task_stack(p);
+}
+EXPORT_SYMBOL_GPL(sched_show_task);
+
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#else
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#endif
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
+ if (state_filter_match(state_filter, p))
+ sched_show_task(p);
+ }
+
+#ifdef CONFIG_SCHED_DEBUG
+ if (!state_filter)
+ sysrq_sched_debug_show();
+#endif
+ rcu_read_unlock();
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (!state_filter)
+ debug_show_all_locks();
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: CPU the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ __sched_fork(0, idle);
+
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
+
+ idle->state = TASK_RUNNING;
+ idle->se.exec_start = sched_clock();
+ idle->flags |= PF_IDLE;
+
+ kasan_unpoison_task_stack(idle);
+
+#ifdef CONFIG_SMP
+ /*
+ * Its possible that init_idle() gets called multiple times on a task,
+ * in that case do_set_cpus_allowed() will not do the right thing.
+ *
+ * And since this is boot we can forgo the serialization.
+ */
+ set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
+ /*
+ * We're having a chicken and egg problem, even though we are
+ * holding rq->lock, the CPU isn't yet set to this CPU so the
+ * lockdep check in task_group() will fail.
+ *
+ * Similar case to sched_fork(). / Alternatively we could
+ * use task_rq_lock() here and obtain the other rq->lock.
+ *
+ * Silence PROVE_RCU
+ */
+ rcu_read_lock();
+ __set_task_cpu(idle, cpu);
+ rcu_read_unlock();
+
+ rq->curr = rq->idle = idle;
+ idle->on_rq = TASK_ON_RQ_QUEUED;
+#ifdef CONFIG_SMP
+ idle->on_cpu = 1;
+#endif
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+ init_idle_preempt_count(idle, cpu);
+
+ /*
+ * The idle tasks have their own, simple scheduling class:
+ */
+ idle->sched_class = &idle_sched_class;
+ ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle, cpu);
+#ifdef CONFIG_SMP
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+#ifdef CONFIG_SMP
+
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1;
+
+ if (!cpumask_weight(cur))
+ return ret;
+
+ ret = dl_cpuset_cpumask_can_shrink(cur, trial);
+
+ return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their CPU
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed))
+ ret = dl_task_can_attach(p, cs_cpus_allowed);
+
+out:
+ return ret;
+}
+
+bool sched_smp_initialized __read_mostly;
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+ struct migration_arg arg = { p, target_cpu };
+ int curr_cpu = task_cpu(p);
+
+ if (curr_cpu == target_cpu)
+ return 0;
+
+ if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+ return -EINVAL;
+
+ /* TODO: This is not properly updating schedstats */
+
+ trace_sched_move_numa(p, curr_cpu, target_cpu);
+ return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+ bool queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued)
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ if (running)
+ put_prev_task(rq, p);
+
+ p->numa_preferred_nid = nid;
+
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ if (running)
+ set_curr_task(rq, p);
+ task_rq_unlock(rq, p, &rf);
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Ensure that the idle task is using init_mm right before its CPU goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
+
+ if (mm != &init_mm) {
+ switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+ long delta = calc_load_fold_active(rq, 1);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+}
+
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
+/*
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
+ */
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+{
+ struct rq *rq = dead_rq;
+ struct task_struct *next, *stop = rq->stop;
+ struct rq_flags orf = *rf;
+ int dest_cpu;
+
+ /*
+ * Fudge the rq selection such that the below task selection loop
+ * doesn't get stuck on the currently eligible stop task.
+ *
+ * We're currently inside stop_machine() and the rq is either stuck
+ * in the stop_machine_cpu_stop() loop, or we're executing this code,
+ * either way we should never end up calling schedule() until we're
+ * done here.
+ */
+ rq->stop = NULL;
+
+ /*
+ * put_prev_task() and pick_next_task() sched
+ * class method both need to have an up-to-date
+ * value of rq->clock[_task]
+ */
+ update_rq_clock(rq);
+
+ for (;;) {
+ /*
+ * There's this thread running, bail when that's the only
+ * remaining thread:
+ */
+ if (rq->nr_running == 1)
+ break;
+
+ /*
+ * pick_next_task() assumes pinned rq->lock:
+ */
+ next = pick_next_task(rq, &fake_task, rf);
+ BUG_ON(!next);
+ put_prev_task(rq, next);
+
+ /*
+ * Rules for changing task_struct::cpus_allowed are holding
+ * both pi_lock and rq->lock, such that holding either
+ * stabilizes the mask.
+ *
+ * Drop rq->lock is not quite as disastrous as it usually is
+ * because !cpu_active at this point, which means load-balance
+ * will not interfere. Also, stop-machine.
+ */
+ rq_unlock(rq, rf);
+ raw_spin_lock(&next->pi_lock);
+ rq_relock(rq, rf);
+
+ /*
+ * Since we're inside stop-machine, _nothing_ should have
+ * changed the task, WARN if weird stuff happened, because in
+ * that case the above rq->lock drop is a fail too.
+ */
+ if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+ raw_spin_unlock(&next->pi_lock);
+ continue;
+ }
+
+ /* Find suitable destination for @next, with force if needed. */
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+ rq = __migrate_task(rq, rf, next, dest_cpu);
+ if (rq != dead_rq) {
+ rq_unlock(rq, rf);
+ rq = dead_rq;
+ *rf = orf;
+ rq_relock(rq, rf);
+ }
+ raw_spin_unlock(&next->pi_lock);
+ }
+
+ rq->stop = stop;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ const struct sched_class *class;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->online);
+ rq->online = 1;
+
+ for_each_class(class) {
+ if (class->rq_online)
+ class->rq_online(rq);
+ }
+ }
+}
+
+void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ const struct sched_class *class;
+
+ for_each_class(class) {
+ if (class->rq_offline)
+ class->rq_offline(rq);
+ }
+
+ cpumask_clear_cpu(rq->cpu, rq->rd->online);
+ rq->online = 0;
+ }
+}
+
+/*
+ * used to mark begin/end of suspend/resume:
+ */
+static int num_cpus_frozen;
+
+/*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static void cpuset_cpu_active(void)
+{
+ if (cpuhp_tasks_frozen) {
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
+ return;
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+ cpuset_force_rebuild();
+ }
+ cpuset_update_active_cpus();
+}
+
+static int cpuset_cpu_inactive(unsigned int cpu)
+{
+ if (!cpuhp_tasks_frozen) {
+ if (dl_cpu_busy(cpu))
+ return -EBUSY;
+ cpuset_update_active_cpus();
+ } else {
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ }
+ return 0;
+}
+
+int sched_cpu_activate(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going up, increment the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
+ set_cpu_active(cpu, true);
+
+ if (sched_smp_initialized) {
+ sched_domains_numa_masks_set(cpu);
+ cpuset_cpu_active();
+ }
+
+ /*
+ * Put the rq online, if not already. This happens:
+ *
+ * 1) In the early boot process, because we build the real domains
+ * after all CPUs have been brought up.
+ *
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+ * domains.
+ */
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
+ update_max_interval();
+
+ return 0;
+}
+
+int sched_cpu_deactivate(unsigned int cpu)
+{
+ int ret;
+
+ set_cpu_active(cpu, false);
+ /*
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+ * users of this state to go away such that all new such users will
+ * observe it.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When going down, decrement the number of cores with SMT present.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+
+ if (!sched_smp_initialized)
+ return 0;
+
+ ret = cpuset_cpu_inactive(cpu);
+ if (ret) {
+ set_cpu_active(cpu, true);
+ return ret;
+ }
+ sched_domains_numa_masks_clear(cpu);
+ return 0;
+}
+
+static void sched_rq_cpu_starting(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->calc_load_update = calc_load_update;
+ update_max_interval();
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+ sched_rq_cpu_starting(cpu);
+ sched_tick_start(cpu);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ /* Handle pending wakeups and then migrate everything off */
+ sched_ttwu_pending();
+ sched_tick_stop(cpu);
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ migrate_tasks(rq, &rf);
+ BUG_ON(rq->nr_running != 1);
+ rq_unlock_irqrestore(rq, &rf);
+
+ calc_load_migrate(rq);
+ update_max_interval();
+ nohz_balance_exit_idle(rq);
+ hrtick_clear(rq);
+ return 0;
+}
+#endif
+
+void __init sched_init_smp(void)
+{
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * CPU masks are stable and all blatant races in the below code cannot
+ * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
+ * but there won't be any contention on it.
+ */
+ cpus_read_lock();
+ mutex_lock(&sched_domains_mutex);
+ sched_init_domains(cpu_active_mask);
+ mutex_unlock(&sched_domains_mutex);
+ cpus_read_unlock();
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ BUG();
+ sched_init_granularity();
+
+ init_sched_rt_class();
+ init_sched_dl_class();
+
+ sched_smp_initialized = true;
+}
+
+static int __init migration_init(void)
+{
+ sched_rq_cpu_starting(smp_processor_id());
+ return 0;
+}
+early_initcall(migration_init);
+
+#else
+void __init sched_init_smp(void)
+{
+ sched_init_granularity();
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+ return in_lock_functions(addr) ||
+ (addr >= (unsigned long)__sched_text_start
+ && addr < (unsigned long)__sched_text_end);
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
+#endif
+
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+
+void __init sched_init(void)
+{
+ int i, j;
+ unsigned long alloc_size = 0, ptr;
+
+ wait_bit_init();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+ if (alloc_size) {
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ root_task_group.se = (struct sched_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ root_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+ }
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
+
+ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
+
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_rt_bandwidth(&root_task_group.rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
+ autogroup_init(&init_task);
+#endif /* CONFIG_CGROUP_SCHED */
+
+ for_each_possible_cpu(i) {
+ struct rq *rq;
+
+ rq = cpu_rq(i);
+ raw_spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
+ rq->calc_load_active = 0;
+ rq->calc_load_update = jiffies + LOAD_FREQ;
+ init_cfs_rq(&rq->cfs);
+ init_rt_rq(&rq->rt);
+ init_dl_rq(&rq->dl);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ /*
+ * How much CPU bandwidth does root_task_group get?
+ *
+ * In case of task-groups formed thr' the cgroup filesystem, it
+ * gets 100% of the CPU resources in the system. This overall
+ * system CPU resource is divided among the tasks of
+ * root_task_group and its child task-groups in a fair manner,
+ * based on each entity's (task or task-group's) weight
+ * (se->load.weight).
+ *
+ * In other words, if root_task_group has 10 tasks of weight
+ * 1024) and two child groups A0 and A1 (of weight 1024 each),
+ * then A0's share of the CPU resource is:
+ *
+ * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ *
+ * We achieve this by letting root_task_group's tasks sit
+ * directly in rq->cfs (i.e root_task_group->se[] = NULL).
+ */
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
+#endif
+
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ rq->cpu_load[j] = 0;
+
+#ifdef CONFIG_SMP
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->balance_callback = NULL;
+ rq->active_balance = 0;
+ rq->next_balance = jiffies;
+ rq->push_cpu = 0;
+ rq->cpu = i;
+ rq->online = 0;
+ rq->idle_stamp = 0;
+ rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+
+ INIT_LIST_HEAD(&rq->cfs_tasks);
+
+ rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_load_update_tick = jiffies;
+ rq->last_blocked_load_update_tick = jiffies;
+ atomic_set(&rq->nohz_flags, 0);
+#endif
+#endif /* CONFIG_SMP */
+ hrtick_rq_init(rq);
+ atomic_set(&rq->nr_iowait, 0);
+ }
+
+ set_load_weight(&init_task, false);
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ mmgrab(&init_mm);
+ enter_lazy_tlb(&init_mm, current);
+
+ /*
+ * Make us the idle thread. Technically, schedule() should not be
+ * called from this thread, however somewhere below it might be,
+ * but because we are the idle thread, we just pick up running again
+ * when this runqueue becomes "idle".
+ */
+ init_idle(current, smp_processor_id());
+
+ calc_load_update = jiffies + LOAD_FREQ;
+
+#ifdef CONFIG_SMP
+ idle_thread_set_boot_cpu();
+#endif
+ init_sched_fair_class();
+
+ init_schedstats();
+
+ scheduler_running = 1;
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
+
+ unsigned long preempt_disable_ip;
+
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
+
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ /* Save this before calling printk(), since that will clobber it: */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+ && !preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(preempt_disable_ip);
+ pr_cont("\n");
+ }
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+void normalize_rt_tasks(void)
+{
+ struct task_struct *g, *p;
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ };
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ /*
+ * Only normalize user tasks:
+ */
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ p->se.exec_start = 0;
+ schedstat_set(p->se.statistics.wait_start, 0);
+ schedstat_set(p->se.statistics.sleep_start, 0);
+ schedstat_set(p->se.statistics.block_start, 0);
+
+ if (!dl_task(p) && !rt_task(p)) {
+ /*
+ * Renice negative nice level userspace
+ * tasks back to 0:
+ */
+ if (task_nice(p) < 0)
+ set_user_nice(p, 0);
+ continue;
+ }
+
+ __sched_setscheduler(p, &attr, false, false);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given CPU.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a CPU in a non-blocking manner. This function
+ * must be called with all CPU's synchronized, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void ia64_set_curr_task(int cpu, struct task_struct *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+
+static void sched_free_group(struct task_group *tg)
+{
+ free_fair_sched_group(tg);
+ free_rt_sched_group(tg);
+ autogroup_free(tg);
+ kmem_cache_free(task_group_cache, tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ if (!alloc_fair_sched_group(tg, parent))
+ goto err;
+
+ if (!alloc_rt_sched_group(tg, parent))
+ goto err;
+
+ return tg;
+
+err:
+ sched_free_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_add_rcu(&tg->list, &task_groups);
+
+ /* Root should already exist: */
+ WARN_ON(!parent);
+
+ tg->parent = parent;
+ INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ /* Now it should be safe to free those cfs_rqs: */
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* Wait for possible concurrent references to cfs_rqs complete: */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+ unsigned long flags;
+
+ /* End participation in shares distribution: */
+ unregister_fair_sched_group(tg);
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_del_rcu(&tg->list);
+ list_del_rcu(&tg->siblings);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+static void sched_change_group(struct task_struct *tsk, int type)
+{
+ struct task_group *tg;
+
+ /*
+ * All callers are synchronized by task_rq_lock(); we do not use RCU
+ * which is pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
+ else
+#endif
+ set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running, queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+ update_rq_clock(rq);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, queue_flags);
+ if (running)
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
+
+ if (queued)
+ enqueue_task(rq, tsk, queue_flags);
+ if (running)
+ set_curr_task(rq, tsk);
+
+ task_rq_unlock(rq, tsk, &rf);
+}
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
+
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
+ }
+
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+
+ return &tg->css;
+}
+
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
+}
+
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ update_rq_clock(rq);
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
+
+ cgroup_taskset_for_each(task, css, tset) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (!sched_rt_can_attach(css_tg(css), task))
+ return -EINVAL;
+#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(task, css, tset)
+ sched_move_task(task);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
+{
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
+}
+
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return (u64) scale_load_down(tg->shares);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+ int i, ret = 0, runtime_enabled, runtime_was_enabled;
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /*
+ * Ensure we have at some amount of bandwidth every period. This is
+ * to prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the otherside by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period > max_cfs_quota_period)
+ return -EINVAL;
+
+ /*
+ * Prevent race between setting of cfs_rq->runtime_enabled and
+ * unthrottle_offline_cfs_rqs().
+ */
+ get_online_cpus();
+ mutex_lock(&cfs_constraints_mutex);
+ ret = __cfs_schedulable(tg, period, quota);
+ if (ret)
+ goto out_unlock;
+
+ runtime_enabled = quota != RUNTIME_INF;
+ runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
+ raw_spin_lock_irq(&cfs_b->lock);
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
+ /* Restart the period timer (if active) to handle new period expiry: */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+
+ raw_spin_unlock_irq(&cfs_b->lock);
+
+ for_each_online_cpu(i) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+ struct rq *rq = cfs_rq->rq;
+ struct rq_flags rf;
+
+ rq_lock_irq(rq, &rf);
+ cfs_rq->runtime_enabled = runtime_enabled;
+ cfs_rq->runtime_remaining = 0;
+
+ if (cfs_rq->throttled)
+ unthrottle_cfs_rq(cfs_rq);
+ rq_unlock_irq(rq, &rf);
+ }
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
+out_unlock:
+ mutex_unlock(&cfs_constraints_mutex);
+ put_online_cpus();
+
+ return ret;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+ u64 quota, period;
+
+ period = ktime_to_ns(tg->cfs_bandwidth.period);
+ if (cfs_quota_us < 0)
+ quota = RUNTIME_INF;
+ else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
+ quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+ else
+ return -EINVAL;
+
+ return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+ u64 quota_us;
+
+ if (tg->cfs_bandwidth.quota == RUNTIME_INF)
+ return -1;
+
+ quota_us = tg->cfs_bandwidth.quota;
+ do_div(quota_us, NSEC_PER_USEC);
+
+ return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+ u64 quota, period;
+
+ if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ period = (u64)cfs_period_us * NSEC_PER_USEC;
+ quota = tg->cfs_bandwidth.quota;
+
+ return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+ u64 cfs_period_us;
+
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
+
+ return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_quota(css_tg(css));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
+{
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_period(css_tg(css));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
+{
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
+}
+
+struct cfs_schedulable_data {
+ struct task_group *tg;
+ u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+ struct cfs_schedulable_data *d)
+{
+ u64 quota, period;
+
+ if (tg == d->tg) {
+ period = d->period;
+ quota = d->quota;
+ } else {
+ period = tg_get_cfs_period(tg);
+ quota = tg_get_cfs_quota(tg);
+ }
+
+ /* note: these should typically be equivalent */
+ if (quota == RUNTIME_INF || quota == -1)
+ return RUNTIME_INF;
+
+ return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+ struct cfs_schedulable_data *d = data;
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ s64 quota = 0, parent_quota = -1;
+
+ if (!tg->parent) {
+ quota = RUNTIME_INF;
+ } else {
+ struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
+
+ quota = normalize_cfs_quota(tg, d);
+ parent_quota = parent_b->hierarchical_quota;
+
+ /*
+ * Ensure max(child_quota) <= parent_quota. On cgroup2,
+ * always take the min. On cgroup1, only inherit when no
+ * limit is set:
+ */
+ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
+ quota = min(quota, parent_quota);
+ } else {
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+ return -EINVAL;
+ }
+ }
+ cfs_b->hierarchical_quota = quota;
+
+ return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+ int ret;
+ struct cfs_schedulable_data data = {
+ .tg = tg,
+ .period = period,
+ .quota = quota,
+ };
+
+ if (quota != RUNTIME_INF) {
+ do_div(data.period, NSEC_PER_USEC);
+ do_div(data.quota, NSEC_PER_USEC);
+ }
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+ seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+ seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+
+ if (schedstat_enabled() && tg != &root_task_group) {
+ u64 ws = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+
+ seq_printf(sf, "wait_sum %llu\n", ws);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
+{
+ return sched_group_set_rt_runtime(css_tg(css), val);
+}
+
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return sched_group_rt_runtime(css_tg(css));
+}
+
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
+{
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return sched_group_rt_period(css_tg(css));
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static struct cftype cpu_legacy_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "shares",
+ .read_u64 = cpu_shares_read_u64,
+ .write_u64 = cpu_shares_write_u64,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_cfs_quota_read_s64,
+ .write_s64 = cpu_cfs_quota_write_s64,
+ },
+ {
+ .name = "cfs_period_us",
+ .read_u64 = cpu_cfs_period_read_u64,
+ .write_u64 = cpu_cfs_period_write_u64,
+ },
+ {
+ .name = "stat",
+ .seq_show = cpu_cfs_stat_show,
+ },
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ {
+ .name = "rt_runtime_us",
+ .read_s64 = cpu_rt_runtime_read,
+ .write_s64 = cpu_rt_runtime_write,
+ },
+ {
+ .name = "rt_period_us",
+ .read_u64 = cpu_rt_period_read_uint,
+ .write_u64 = cpu_rt_period_write_uint,
+ },
+#endif
+ { } /* Terminate */
+};
+
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 throttled_usec;
+
+ throttled_usec = cfs_b->throttled_time;
+ do_div(throttled_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "nr_periods %d\n"
+ "nr_throttled %d\n"
+ "throttled_usec %llu\n",
+ cfs_b->nr_periods, cfs_b->nr_throttled,
+ throttled_usec);
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+ u64 weight = scale_load_down(tg->shares);
+
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 weight)
+{
+ /*
+ * cgroup weight knobs should use the common MIN, DFL and MAX
+ * values which are 1, 100 and 10000 respectively. While it loses
+ * a bit of range on both ends, it maps pretty well onto the shares
+ * value used by scheduler and the round-trip conversions preserve
+ * the original value over the entire range.
+ */
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ return -ERANGE;
+
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ unsigned long weight = scale_load_down(css_tg(css)->shares);
+ int last_delta = INT_MAX;
+ int prio, delta;
+
+ /* find the closest nice value to the current weight */
+ for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+ delta = abs(sched_prio_to_weight[prio] - weight);
+ if (delta >= last_delta)
+ break;
+ last_delta = delta;
+ }
+
+ return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ unsigned long weight;
+ int idx;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+ idx = array_index_nospec(idx, 40);
+ weight = sched_prio_to_weight[idx];
+
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+{
+ if (quota < 0)
+ seq_puts(sf, "max");
+ else
+ seq_printf(sf, "%ld", quota);
+
+ seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+ u64 *periodp, u64 *quotap)
+{
+ char tok[21]; /* U64_MAX */
+
+ if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+ return -EINVAL;
+
+ *periodp *= NSEC_PER_USEC;
+
+ if (sscanf(tok, "%llu", quotap))
+ *quotap *= NSEC_PER_USEC;
+ else if (!strcmp(tok, "max"))
+ *quotap = RUNTIME_INF;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct task_group *tg = css_tg(of_css(of));
+ u64 period = tg_get_cfs_period(tg);
+ u64 quota;
+ int ret;
+
+ ret = cpu_period_quota_parse(buf, &period, &quota);
+ if (!ret)
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
+ return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_weight_read_u64,
+ .write_u64 = cpu_weight_write_u64,
+ },
+ {
+ .name = "weight.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_weight_nice_read_s64,
+ .write_s64 = cpu_weight_nice_write_s64,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_max_show,
+ .write = cpu_max_write,
+ },
+#endif
+ { } /* terminate */
+};
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
+ .fork = cpu_cgroup_fork,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
+ .early_init = true,
+ .threaded = true,
+};
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
+
+#undef CREATE_TRACE_POINTS
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000..9fbb10383
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+#include "sched.h"
+
+/* Time spent by the tasks of the CPU accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+struct cpuacct_usage {
+ u64 usages[CPUACCT_STAT_NSTATS];
+};
+
+/* track CPU usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every CPU */
+ struct cpuacct_usage __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuacct, css) : NULL;
+}
+
+/* Return CPU accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ return css_ca(ca->css.parent);
+}
+
+static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* Create a new CPU accounting group */
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuacct *ca;
+
+ if (!parent_css)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* Destroy an existing CPU accounting group */
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuacct *ca = css_ca(css);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
+ enum cpuacct_stat_index index)
+{
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+ /*
+ * We allow index == CPUACCT_STAT_NSTATS here to read
+ * the sum of suages.
+ */
+ BUG_ON(index > CPUACCT_STAT_NSTATS);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ if (index == CPUACCT_STAT_NSTATS) {
+ int i = 0;
+
+ data = 0;
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ data += cpuusage->usages[i];
+ } else {
+ data = cpuusage->usages[index];
+ }
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ int i;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ cpuusage->usages[i] = val;
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+}
+
+/* Return total CPU usage (in nanoseconds) of a group */
+static u64 __cpuusage_read(struct cgroup_subsys_state *css,
+ enum cpuacct_stat_index index)
+{
+ struct cpuacct *ca = css_ca(css);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i, index);
+
+ return totalcpuusage;
+}
+
+static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_STAT_USER);
+}
+
+static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
+}
+
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
+}
+
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
+{
+ struct cpuacct *ca = css_ca(css);
+ int cpu;
+
+ /*
+ * Only allow '0' here to do a reset.
+ */
+ if (val)
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu)
+ cpuacct_cpuusage_write(ca, cpu, 0);
+
+ return 0;
+}
+
+static int __cpuacct_percpu_seq_show(struct seq_file *m,
+ enum cpuacct_stat_index index)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ u64 percpu;
+ int i;
+
+ for_each_possible_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i, index);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
+}
+
+static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
+}
+
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ int index;
+ int cpu;
+
+ seq_puts(m, "cpu");
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %s", cpuacct_stat_desc[index]);
+ seq_puts(m, "\n");
+
+ for_each_possible_cpu(cpu) {
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+ seq_printf(m, "%d", cpu);
+
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit
+ * platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ seq_printf(m, " %llu", cpuusage->usages[index]);
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+ }
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
+
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+ struct cpuacct *ca = css_ca(seq_css(sf));
+ s64 val[CPUACCT_STAT_NSTATS];
+ int cpu;
+ int stat;
+
+ memset(val, 0, sizeof(val));
+ for_each_possible_cpu(cpu) {
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+ seq_printf(sf, "%s %lld\n",
+ cpuacct_stat_desc[stat],
+ (long long)nsec_to_clock_t(val[stat]));
+ }
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_user",
+ .read_u64 = cpuusage_user_read,
+ },
+ {
+ .name = "usage_sys",
+ .read_u64 = cpuusage_sys_read,
+ },
+ {
+ .name = "usage_percpu",
+ .seq_show = cpuacct_percpu_seq_show,
+ },
+ {
+ .name = "usage_percpu_user",
+ .seq_show = cpuacct_percpu_user_seq_show,
+ },
+ {
+ .name = "usage_percpu_sys",
+ .seq_show = cpuacct_percpu_sys_seq_show,
+ },
+ {
+ .name = "usage_all",
+ .seq_show = cpuacct_all_seq_show,
+ },
+ {
+ .name = "stat",
+ .seq_show = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int index = CPUACCT_STAT_SYSTEM;
+ struct pt_regs *regs = task_pt_regs(tsk);
+
+ if (regs && user_mode(regs))
+ index = CPUACCT_STAT_USER;
+
+ rcu_read_lock();
+
+ for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+ this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
+{
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+ this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_cgrp_subsys = {
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .legacy_cftypes = files,
+ .early_init = true,
+};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000..50316455e
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,276 @@
+/*
+ * kernel/sched/cpudl.c
+ *
+ * Global CPU deadline management
+ *
+ * Author: Juri Lelli <j.lelli@sssup.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include "sched.h"
+
+static inline int parent(int i)
+{
+ return (i - 1) >> 1;
+}
+
+static inline int left_child(int i)
+{
+ return (i << 1) + 1;
+}
+
+static inline int right_child(int i)
+{
+ return (i << 1) + 2;
+}
+
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
+{
+ int l, r, largest;
+
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
+
+ if (left_child(idx) >= cp->size)
+ return;
+
+ /* adapted from lib/prio_heap.c */
+ while (1) {
+ u64 largest_dl;
+
+ l = left_child(idx);
+ r = right_child(idx);
+ largest = idx;
+ largest_dl = orig_dl;
+
+ if ((l < cp->size) && dl_time_before(orig_dl,
+ cp->elements[l].dl)) {
+ largest = l;
+ largest_dl = cp->elements[l].dl;
+ }
+ if ((r < cp->size) && dl_time_before(largest_dl,
+ cp->elements[r].dl))
+ largest = r;
+
+ if (largest == idx)
+ break;
+
+ /* pull largest child onto idx */
+ cp->elements[idx].cpu = cp->elements[largest].cpu;
+ cp->elements[idx].dl = cp->elements[largest].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+ idx = largest;
+ }
+ /* actual push down of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
+{
+ int p;
+
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
+
+ if (idx == 0)
+ return;
+
+ do {
+ p = parent(idx);
+ if (dl_time_before(orig_dl, cp->elements[p].dl))
+ break;
+ /* pull parent onto idx */
+ cp->elements[idx].cpu = cp->elements[p].cpu;
+ cp->elements[idx].dl = cp->elements[p].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+ idx = p;
+ } while (idx != 0);
+ /* actual push up of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl))
+ cpudl_heapify_up(cp, idx);
+ else
+ cpudl_heapify_down(cp, idx);
+}
+
+static inline int cpudl_maximum(struct cpudl *cp)
+{
+ return cp->elements[0].cpu;
+}
+
+/*
+ * cpudl_find - find the best (later-dl) CPU in the system
+ * @cp: the cpudl max-heap context
+ * @p: the task
+ * @later_mask: a mask to fill in with the selected CPUs (or NULL)
+ *
+ * Returns: int - CPUs were found
+ */
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask)
+{
+ const struct sched_dl_entity *dl_se = &p->dl;
+
+ if (later_mask &&
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
+
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+
+ if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
+
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * cpudl_clear - remove a CPU from the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target CPU
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_clear(struct cpudl *cp, int cpu)
+{
+ int old_idx, new_cpu;
+ unsigned long flags;
+
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+
+ old_idx = cp->elements[cpu].idx;
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if a rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ } else {
+ new_cpu = cp->elements[cp->size - 1].cpu;
+ cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
+ cp->elements[old_idx].cpu = new_cpu;
+ cp->size--;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
+ cpudl_heapify(cp, old_idx);
+
+ cpumask_set_cpu(cpu, cp->free_cpus);
+ }
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target CPU
+ * @dl: the new earliest deadline for this CPU
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
+{
+ int old_idx;
+ unsigned long flags;
+
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+
+ old_idx = cp->elements[cpu].idx;
+ if (old_idx == IDX_INVALID) {
+ int new_idx = cp->size++;
+
+ cp->elements[new_idx].dl = dl;
+ cp->elements[new_idx].cpu = cpu;
+ cp->elements[cpu].idx = new_idx;
+ cpudl_heapify_up(cp, new_idx);
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+ } else {
+ cp->elements[old_idx].dl = dl;
+ cpudl_heapify(cp, old_idx);
+ }
+
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached CPU
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached CPU
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_init - initialize the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+int cpudl_init(struct cpudl *cp)
+{
+ int i;
+
+ raw_spin_lock_init(&cp->lock);
+ cp->size = 0;
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
+ return 0;
+}
+
+/*
+ * cpudl_cleanup - clean up the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+void cpudl_cleanup(struct cpudl *cp)
+{
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
+}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000..0adeda93b
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define IDX_INVALID -1
+
+struct cpudl_item {
+ u64 dl;
+ int cpu;
+ int idx;
+};
+
+struct cpudl {
+ raw_spinlock_t lock;
+ int size;
+ cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
+};
+
+#ifdef CONFIG_SMP
+int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu);
+int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
+void cpudl_cleanup(struct cpudl *cp);
+#endif /* CONFIG_SMP */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000..42ce32a1a
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,80 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/cpufreq.h>
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned int flags))
+{
+ if (WARN_ON(!data || !func))
+ return;
+
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000..60f0e0e04
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,892 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "sched.h"
+
+#include <trace/events/power.h>
+
+struct sugov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int rate_limit_us;
+};
+
+struct sugov_policy {
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time;
+ s64 freq_update_delay_ns;
+ unsigned int next_freq;
+ unsigned int cached_raw_freq;
+
+ /* The next fields are only needed if fast switch cannot be used: */
+ struct irq_work irq_work;
+ struct kthread_work work;
+ struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
+ bool work_in_progress;
+
+ bool limits_changed;
+ bool need_freq_update;
+};
+
+struct sugov_cpu {
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+ unsigned int cpu;
+
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ u64 last_update;
+
+ unsigned long bw_dl;
+ unsigned long min;
+ unsigned long max;
+
+ /* The field below is for single-CPU policies only: */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+ s64 delta_ns;
+
+ /*
+ * Since cpufreq_update_util() is called with rq->lock held for
+ * the @target_cpu, our per-CPU data is fully serialized.
+ *
+ * However, drivers cannot in general deal with cross-CPU
+ * requests, so while get_next_freq() will work, our
+ * sugov_update_commit() call may not for the fast switching platforms.
+ *
+ * Hence stop here for remote requests if they aren't supported
+ * by the hardware, as calculating the frequency is pointless if
+ * we cannot in fact act on it.
+ *
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
+ */
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
+ return false;
+
+ if (unlikely(sg_policy->limits_changed)) {
+ sg_policy->limits_changed = false;
+ sg_policy->need_freq_update = true;
+ return true;
+ }
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ return delta_ns >= sg_policy->freq_update_delay_ns;
+}
+
+static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ if (sg_policy->next_freq == next_freq)
+ return false;
+
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+
+ return true;
+}
+
+static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int cpu;
+
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (!next_freq)
+ return;
+
+ policy->cur = next_freq;
+
+ if (trace_cpu_frequency_enabled()) {
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_frequency(next_freq, cpu);
+ }
+}
+
+static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ if (!sugov_update_next_freq(sg_policy, time, next_freq))
+ return;
+
+ if (!sg_policy->work_in_progress) {
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
+ }
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
+ */
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int freq = arch_scale_freq_invariant() ?
+ policy->cpuinfo.max_freq : policy->cur;
+
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
+ return sg_policy->next_freq;
+
+ sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
+}
+
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+{
+ struct rq *rq = cpu_rq(sg_cpu->cpu);
+ unsigned long util, irq, max;
+
+ sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ sg_cpu->bw_dl = cpu_bw_dl(rq);
+
+ if (rt_rq_is_runnable(&rq->rt))
+ return max;
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = cpu_util_cfs(rq);
+ util += cpu_util_rt(rq);
+
+ /*
+ * We do not make cpu_util_dl() a permanent part of this sum because we
+ * want to use cpu_bw_dl() later on, but we need to check if the
+ * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
+ * f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if ((util + cpu_util_dl(rq)) >= max)
+ return max;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * 1 - irq
+ * U' = irq + ------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
+
+ /*
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
+ */
+ return min(max, util + sg_cpu->bw_dl);
+}
+
+/**
+ * sugov_iowait_reset() - Reset the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @set_iowait_boost: true if an IO boost has been requested
+ *
+ * The IO wait boost of a task is disabled after a tick since the last update
+ * of a CPU. If a new IO wait boost is requested after more then a tick, then
+ * we enable the boost starting from the minimum frequency, which improves
+ * energy efficiency by ignoring sporadic wakeups from IO.
+ */
+static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
+ bool set_iowait_boost)
+{
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Reset boost only if a tick has elapsed since last request */
+ if (delta_ns <= TICK_NSEC)
+ return false;
+
+ sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
+ sg_cpu->iowait_boost_pending = set_iowait_boost;
+
+ return true;
+}
+
+/**
+ * sugov_iowait_boost() - Updates the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
+ *
+ * Each time a task wakes up after an IO operation, the CPU utilization can be
+ * boosted to a certain utilization which doubles at each "frequent and
+ * successive" wakeup from IO, ranging from the utilization of the minimum
+ * OPP to the utilization of the maximum OPP.
+ * To keep doubling, an IO boost has to be requested at least once per tick,
+ * otherwise we restart from the utilization of the minimum OPP.
+ */
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sg_cpu->iowait_boost &&
+ sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
+ return;
+
+ /* Boost only tasks waking up after IO */
+ if (!set_iowait_boost)
+ return;
+
+ /* Ensure boost doubles only one time at each request */
+ if (sg_cpu->iowait_boost_pending)
+ return;
+ sg_cpu->iowait_boost_pending = true;
+
+ /* Double the boost at each request */
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost =
+ min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
+ return;
+ }
+
+ /* First wakeup after IO: start with minimum boost */
+ sg_cpu->iowait_boost = sg_cpu->min;
+}
+
+/**
+ * sugov_iowait_apply() - Apply the IO boost to a CPU.
+ * @sg_cpu: the sugov data for the cpu to boost
+ * @time: the update time from the caller
+ * @util: the utilization to (eventually) boost
+ * @max: the maximum value the utilization can be boosted to
+ *
+ * A CPU running a task which woken up after an IO operation can have its
+ * utilization boosted to speed up the completion of those IO operations.
+ * The IO boost value is increased each time a task wakes up from IO, in
+ * sugov_iowait_apply(), and it's instead decreased by this function,
+ * each time an increase has not been requested (!iowait_boost_pending).
+ *
+ * A CPU which also appears to have been idle for at least one tick has also
+ * its IO boost utilization reset.
+ *
+ * This mechanism is designed to boost high frequently IO waiting tasks, while
+ * being more conservative on tasks which does sporadic IO operations.
+ */
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long util, unsigned long max)
+{
+ unsigned long boost;
+
+ /* No boost currently required */
+ if (!sg_cpu->iowait_boost)
+ return util;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sugov_iowait_reset(sg_cpu, time, false))
+ return util;
+
+ if (!sg_cpu->iowait_boost_pending) {
+ /*
+ * No boost pending; reduce the boost value.
+ */
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->min) {
+ sg_cpu->iowait_boost = 0;
+ return util;
+ }
+ }
+
+ sg_cpu->iowait_boost_pending = false;
+
+ /*
+ * @util is already in capacity scale; convert iowait_boost
+ * into the same scale so we can compare.
+ */
+ boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
+ return max(boost, util);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Make sugov_should_update_freq() ignore the rate limit when DL
+ * has increased the utilization.
+ */
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+{
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
+ sg_policy->limits_changed = true;
+}
+
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned long util, max;
+ unsigned int next_f;
+ bool busy;
+
+ sugov_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ ignore_dl_rate_limit(sg_cpu, sg_policy);
+
+ if (!sugov_should_update_freq(sg_policy, time))
+ return;
+
+ /* Limits may have changed, don't skip frequency update */
+ busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
+
+ util = sugov_get_util(sg_cpu);
+ max = sg_cpu->max;
+ util = sugov_iowait_apply(sg_cpu, time, util, max);
+ next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq) {
+ next_f = sg_policy->next_freq;
+
+ /* Reset cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = 0;
+ }
+
+ /*
+ * This code runs under rq->lock for the target CPU, so it won't run
+ * concurrently on two different CPUs for the same target and it is not
+ * necessary to acquire the lock in the fast switch case.
+ */
+ if (sg_policy->policy->fast_switch_enabled) {
+ sugov_fast_switch(sg_policy, time, next_f);
+ } else {
+ raw_spin_lock(&sg_policy->update_lock);
+ sugov_deferred_update(sg_policy, time, next_f);
+ raw_spin_unlock(&sg_policy->update_lock);
+ }
+}
+
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
+{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned long util = 0, max = 1;
+ unsigned int j;
+
+ for_each_cpu(j, policy->cpus) {
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+ unsigned long j_util, j_max;
+
+ j_util = sugov_get_util(j_sg_cpu);
+ j_max = j_sg_cpu->max;
+ j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
+
+ if (j_util * max > j_max * util) {
+ util = j_util;
+ max = j_max;
+ }
+ }
+
+ return get_next_freq(sg_policy, util, max);
+}
+
+static void
+sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int next_f;
+
+ raw_spin_lock(&sg_policy->update_lock);
+
+ sugov_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ ignore_dl_rate_limit(sg_cpu, sg_policy);
+
+ if (sugov_should_update_freq(sg_policy, time)) {
+ next_f = sugov_next_freq_shared(sg_cpu, time);
+
+ if (sg_policy->policy->fast_switch_enabled)
+ sugov_fast_switch(sg_policy, time, next_f);
+ else
+ sugov_deferred_update(sg_policy, time, next_f);
+ }
+
+ raw_spin_unlock(&sg_policy->update_lock);
+}
+
+static void sugov_work(struct kthread_work *work)
+{
+ struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock shortly to handle the case where:
+ * incase sg_policy->next_freq is read here, and then updated by
+ * sugov_deferred_update() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work was queued after the update_lock is released,
+ * sugov_work() will just be called again by kthread_work code; and the
+ * request will be proceed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
+
+ mutex_lock(&sg_policy->work_lock);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
+ mutex_unlock(&sg_policy->work_lock);
+}
+
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
+
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->rate_limit_us);
+}
+
+static ssize_t
+rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+
+ return count;
+}
+
+static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+
+static struct attribute *sugov_attributes[] = {
+ &rate_limit_us.attr,
+ NULL
+};
+
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
+static struct kobj_type sugov_tunables_ktype = {
+ .default_attrs = sugov_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
+};
+
+/********************** cpufreq governor interface *********************/
+
+static struct cpufreq_governor schedutil_gov;
+
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+ if (!sg_policy)
+ return NULL;
+
+ sg_policy->policy = policy;
+ raw_spin_lock_init(&sg_policy->update_lock);
+ return sg_policy;
+}
+
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+ kfree(sg_policy);
+}
+
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ .sched_policy = SCHED_DEADLINE,
+ .sched_flags = SCHED_FLAG_SUGOV,
+ .sched_nice = 0,
+ .sched_priority = 0,
+ /*
+ * Fake (unused) bandwidth; workaround to "fix"
+ * priority inheritance.
+ */
+ .sched_runtime = 1000000,
+ .sched_deadline = 10000000,
+ .sched_period = 10000000,
+ };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setattr_nocheck(thread, &attr);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+ struct sugov_tunables *tunables;
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (tunables) {
+ gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+ if (!have_governor_per_policy())
+ global_tunables = tunables;
+ }
+ return tunables;
+}
+
+static void sugov_clear_global_tunables(void)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+}
+
+static int sugov_init(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+ struct sugov_tunables *tunables;
+ int ret = 0;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ cpufreq_enable_fast_switch(policy);
+
+ sg_policy = sugov_policy_alloc(policy);
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
+
+ mutex_lock(&global_tunables_lock);
+
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto stop_kthread;
+ }
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+ goto out;
+ }
+
+ tunables = sugov_tunables_alloc(sg_policy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto stop_kthread;
+ }
+
+ tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = tunables;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ schedutil_gov.name);
+ if (ret)
+ goto fail;
+
+out:
+ mutex_unlock(&global_tunables_lock);
+ return 0;
+
+fail:
+ kobject_put(&tunables->attr_set.kobj);
+ policy->governor_data = NULL;
+ sugov_clear_global_tunables();
+
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+ mutex_unlock(&global_tunables_lock);
+
+free_sg_policy:
+ sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
+ pr_err("initialization failed (error %d)\n", ret);
+ return ret;
+}
+
+static void sugov_exit(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ unsigned int count;
+
+ mutex_lock(&global_tunables_lock);
+
+ count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count)
+ sugov_clear_global_tunables();
+
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_kthread_stop(sg_policy);
+ sugov_policy_free(sg_policy);
+ cpufreq_disable_fast_switch(policy);
+}
+
+static int sugov_start(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+ sg_policy->last_freq_update_time = 0;
+ sg_policy->next_freq = 0;
+ sg_policy->work_in_progress = false;
+ sg_policy->limits_changed = false;
+ sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
+ sg_cpu->sg_policy = sg_policy;
+ sg_cpu->min =
+ (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
+ policy->cpuinfo.max_freq;
+ }
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ policy_is_shared(policy) ?
+ sugov_update_shared :
+ sugov_update_single);
+ }
+ return 0;
+}
+
+static void sugov_stop(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ cpufreq_remove_update_util_hook(cpu);
+
+ synchronize_sched();
+
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
+}
+
+static void sugov_limits(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+
+ if (!policy->fast_switch_enabled) {
+ mutex_lock(&sg_policy->work_lock);
+ cpufreq_policy_apply_limits(policy);
+ mutex_unlock(&sg_policy->work_lock);
+ }
+
+ sg_policy->limits_changed = true;
+}
+
+static struct cpufreq_governor schedutil_gov = {
+ .name = "schedutil",
+ .owner = THIS_MODULE,
+ .dynamic_switching = true,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
+};
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return &schedutil_gov;
+}
+#endif
+
+static int __init sugov_register(void)
+{
+ return cpufreq_register_governor(&schedutil_gov);
+}
+fs_initcall(sugov_register);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
new file mode 100644
index 000000000..daaadf939
--- /dev/null
+++ b/kernel/sched/cpupri.c
@@ -0,0 +1,241 @@
+/*
+ * kernel/sched/cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007-2008 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in a state as follows:
+ *
+ * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2 dimensional bitmap (the first for priority class, the second for CPUs
+ * in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include "sched.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+ int cpupri;
+
+ if (prio == CPUPRI_INVALID)
+ cpupri = CPUPRI_INVALID;
+ else if (prio == MAX_PRIO)
+ cpupri = CPUPRI_IDLE;
+ else if (prio >= MAX_RT_PRIO)
+ cpupri = CPUPRI_NORMAL;
+ else
+ cpupri = MAX_RT_PRIO - prio + 1;
+
+ return cpupri;
+}
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Return: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask)
+{
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
+
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+
+ for (idx = 0; idx < task_pri; idx++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ int skip = 0;
+
+ if (!atomic_read(&(vec)->count))
+ skip = 1;
+ /*
+ * When looking at the vector, we need to read the counter,
+ * do a memory barrier, then read the mask.
+ *
+ * Note: This is still all racey, but we can deal with it.
+ * Ideally, we only want to look at masks that are set.
+ *
+ * If a mask is not set, then the only thing wrong is that we
+ * did a little more work than necessary.
+ *
+ * If we read a zero count but the mask is set, because of the
+ * memory barriers, that can only happen when the highest prio
+ * task for a run queue has left the run queue, in which case,
+ * it will be followed by a pull. If the task we are processing
+ * fails to find a proper place to go, that pull request will
+ * pull this task if the run queue is running at a lower
+ * priority.
+ */
+ smp_rmb();
+
+ /* Need to do the rmb for every iteration */
+ if (skip)
+ continue;
+
+ if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ continue;
+
+ if (lowest_mask) {
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ continue;
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * cpupri_set - update the CPU priority setting
+ * @cp: The cpupri context
+ * @cpu: The target CPU
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
+
+ newpri = convert_prio(newpri);
+
+ BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+ if (newpri == oldpri)
+ return;
+
+ /*
+ * If the CPU was currently mapped to a different value, we
+ * need to map it to the new value then remove the old value.
+ * Note, we must add the new value first, otherwise we risk the
+ * cpu being missed by the priority loop in cpupri_find.
+ */
+ if (likely(newpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+ cpumask_set_cpu(cpu, vec->mask);
+ /*
+ * When adding a new vector, we update the mask first,
+ * do a write memory barrier, and then update the count, to
+ * make sure the vector is visible when count is set.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&(vec)->count);
+ do_mb = 1;
+ }
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ /*
+ * Because the order of modification of the vec->count
+ * is important, we must make sure that the update
+ * of the new prio is seen before we decrement the
+ * old prio. This makes sure that the loop sees
+ * one or the other when we raise the priority of
+ * the run queue. We don't care about when we lower the
+ * priority, as that will trigger an rt pull anyway.
+ *
+ * We only need to do a memory barrier if we updated
+ * the new priority vec.
+ */
+ if (do_mb)
+ smp_mb__after_atomic();
+
+ /*
+ * When removing from the vector, we decrement the counter first
+ * do a memory barrier and then clear the mask.
+ */
+ atomic_dec(&(vec)->count);
+ smp_mb__after_atomic();
+ cpumask_clear_cpu(cpu, vec->mask);
+ }
+
+ *currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Return: -ENOMEM on memory allocation failure.
+ */
+int cpupri_init(struct cpupri *cp)
+{
+ int i;
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ atomic_set(&vec->count, 0);
+ if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ goto cleanup;
+ }
+
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
+ for_each_possible_cpu(i)
+ cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
+ return 0;
+
+cleanup:
+ for (i--; i >= 0; i--)
+ free_cpumask_var(cp->pri_to_cpu[i].mask);
+ return -ENOMEM;
+}
+
+/**
+ * cpupri_cleanup - clean up the cpupri structure
+ * @cp: The cpupri context
+ */
+void cpupri_cleanup(struct cpupri *cp)
+{
+ int i;
+
+ kfree(cp->cpu_to_pri);
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+ free_cpumask_var(cp->pri_to_cpu[i].mask);
+}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
new file mode 100644
index 000000000..7dc20a323
--- /dev/null
+++ b/kernel/sched/cpupri.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE 0
+#define CPUPRI_NORMAL 1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+ atomic_t count;
+ cpumask_var_t mask;
+};
+
+struct cpupri {
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ int *cpu_to_pri;
+};
+
+#ifdef CONFIG_SMP
+int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+int cpupri_init(struct cpupri *cp);
+void cpupri_cleanup(struct cpupri *cp);
+#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 000000000..532682125
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,895 @@
+/*
+ * Simple CPU accounting cgroup controller
+ */
+#include "sched.h"
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in vtime_account, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/vtime_account on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
+
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void irqtime_account_irq(struct task_struct *curr)
+{
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+ irqtime->irq_start_time += delta;
+
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+}
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
+
+static u64 irqtime_tick_accounted(u64 maxtime)
+{
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 delta;
+
+ delta = min(irqtime->tick_delta, maxtime);
+ irqtime->tick_delta -= delta;
+
+ return delta;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime (0)
+
+static u64 irqtime_tick_accounted(u64 dummy)
+{
+ return 0;
+}
+
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
+
+ cgroup_account_cputime_field(p, index, tmp);
+}
+
+/*
+ * Account user CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in user space since the last update
+ */
+void account_user_time(struct task_struct *p, u64 cputime)
+{
+ int index;
+
+ /* Add user time to process. */
+ p->utime += cputime;
+ account_group_user_time(p, cputime);
+
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+ /* Add user time to cpustat. */
+ task_group_account_field(p, index, cputime);
+
+ /* Account for user time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account guest CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in virtual machine since the last update
+ */
+void account_guest_time(struct task_struct *p, u64 cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ /* Add guest time to process. */
+ p->utime += cputime;
+ account_group_user_time(p, cputime);
+ p->gtime += cputime;
+
+ /* Add guest time to cpustat. */
+ if (task_nice(p) > 0) {
+ task_group_account_field(p, CPUTIME_NICE, cputime);
+ cpustat[CPUTIME_GUEST_NICE] += cputime;
+ } else {
+ task_group_account_field(p, CPUTIME_USER, cputime);
+ cpustat[CPUTIME_GUEST] += cputime;
+ }
+}
+
+/*
+ * Account system CPU time to a process and desired cpustat field
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in kernel space since the last update
+ * @index: pointer to cpustat field that has to be updated
+ */
+void account_system_index_time(struct task_struct *p,
+ u64 cputime, enum cpu_usage_stat index)
+{
+ /* Add system time to process. */
+ p->stime += cputime;
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ task_group_account_field(p, index, cputime);
+
+ /* Account for system time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account system CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the CPU time spent in kernel space since the last update
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
+{
+ int index;
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime);
+ return;
+ }
+
+ if (hardirq_count() - hardirq_offset)
+ index = CPUTIME_IRQ;
+ else if (in_serving_softirq())
+ index = CPUTIME_SOFTIRQ;
+ else
+ index = CPUTIME_SYSTEM;
+
+ account_system_index_time(p, cputime, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the CPU time spent in involuntary wait
+ */
+void account_steal_time(u64 cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ cpustat[CPUTIME_STEAL] += cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the CPU time spent in idle wait
+ */
+void account_idle_time(u64 cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct rq *rq = this_rq();
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += cputime;
+ else
+ cpustat[CPUTIME_IDLE] += cputime;
+}
+
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
+static __always_inline u64 steal_account_process_time(u64 maxtime)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ u64 steal;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+ steal = min(steal, maxtime);
+ account_steal_time(steal);
+ this_rq()->prev_steal_time += steal;
+
+ return steal;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline u64 account_other_time(u64 max)
+{
+ u64 accounted;
+
+ lockdep_assert_irqs_disabled();
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_tick_accounted(max - accounted);
+
+ return accounted;
+}
+
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime;
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct signal_struct *sig = tsk->signal;
+ u64 utime, stime;
+ struct task_struct *t;
+ unsigned int seq, nextseq;
+ unsigned long flags;
+
+ /*
+ * Update current task runtime to account pending time since last
+ * scheduler action or thread_group_cputime() call. This thread group
+ * might have other running tasks on different CPUs, but updating
+ * their runtime can affect syscall performance, so we skip account
+ * those pending times and rely only on values updated on tick or
+ * other scheduler action.
+ */
+ if (same_thread_group(current, tsk))
+ (void) task_sched_runtime(current);
+
+ rcu_read_lock();
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
+ do {
+ seq = nextseq;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += read_sum_exec_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the CPU time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int ticks)
+{
+ u64 other, cputime = TICK_NSEC * ticks;
+
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, irq, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(ULONG_MAX);
+ if (other >= cputime)
+ return;
+
+ cputime -= other;
+
+ if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
+ } else if (user_tick) {
+ account_user_time(p, cputime);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime);
+ } else {
+ account_system_index_time(p, cputime, CPUTIME_SYSTEM);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ struct rq *rq = this_rq();
+
+ irqtime_account_process_tick(current, 0, rq, ticks);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static inline void irqtime_account_idle_ticks(int ticks) { }
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int nr_ticks) { }
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+ vtime_flush(prev);
+ arch_vtime_task_switch(prev);
+}
+# endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_account_irq_enter(struct task_struct *tsk)
+{
+ if (!in_interrupt() && is_idle_task(tsk))
+ vtime_account_idle(tsk);
+ else
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
+{
+ *ut = curr->utime;
+ *st = curr->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
+
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
+
+/*
+ * Account a single tick of CPU time.
+ * @p: the process that the CPU time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+ u64 cputime, steal;
+ struct rq *rq = this_rq();
+
+ if (vtime_accounting_cpu_enabled())
+ return;
+
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq, 1);
+ return;
+ }
+
+ cputime = TICK_NSEC;
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+
+ if (user_tick)
+ account_user_time(p, cputime);
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime);
+ else
+ account_idle_time(cputime);
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+ u64 cputime, steal;
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
+ cputime = ticks * TICK_NSEC;
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
+}
+
+/*
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
+ */
+static u64 scale_stime(u64 stime, u64 rtime, u64 total)
+{
+ u64 scaled;
+
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime)
+ swap(rtime, stime);
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
+
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return scaled;
+}
+
+/*
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
+ *
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ * stime + utime == rtime
+ * stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
+ */
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
+{
+ u64 rtime, stime, utime;
+ unsigned long flags;
+
+ /* Serialize concurrent callers such that we can honour our guarantees */
+ raw_spin_lock_irqsave(&prev->lock, flags);
+ rtime = curr->sum_exec_runtime;
+
+ /*
+ * This is possible under two circumstances:
+ * - rtime isn't monotonic after all (a bug);
+ * - we got reordered by the lock.
+ *
+ * In both cases this acts as a filter such that the rest of the code
+ * can assume it is monotonic regardless of anything else.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ stime = curr->stime;
+ utime = curr->utime;
+
+ /*
+ * If either stime or utime are 0, assume all runtime is userspace.
+ * Once a task gets some ticks, the monotonicy code at 'update:'
+ * will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
+ }
+
+ if (utime == 0) {
+ stime = rtime;
+ goto update;
+ }
+
+ stime = scale_stime(stime, rtime, stime + utime);
+
+update:
+ /*
+ * Make sure stime doesn't go backwards; this preserves monotonicity
+ * for utime because rtime is monotonic.
+ *
+ * utime_i+1 = rtime_i+1 - stime_i
+ * = rtime_i+1 - (rtime_i - utime_i)
+ * = (rtime_i+1 - rtime_i) + utime_i
+ * >= utime_i
+ */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /*
+ * Make sure utime doesn't go backwards; this still preserves
+ * monotonicity for stime, analogous argument to above.
+ */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+
+ prev->stime = stime;
+ prev->utime = utime;
+out:
+ *ut = prev->utime;
+ *st = prev->stime;
+ raw_spin_unlock_irqrestore(&prev->lock, flags);
+}
+
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
+{
+ struct task_cputime cputime = {
+ .sum_exec_runtime = p->se.sum_exec_runtime,
+ };
+
+ task_cputime(p, &cputime.utime, &cputime.stime);
+ cputime_adjust(&cputime, &p->prev_cputime, ut, st);
+}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
+
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
+}
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static u64 vtime_delta(struct vtime *vtime)
+{
+ unsigned long long clock;
+
+ clock = sched_clock();
+ if (clock < vtime->starttime)
+ return 0;
+
+ return clock - vtime->starttime;
+}
+
+static u64 get_vtime_delta(struct vtime *vtime)
+{
+ u64 delta = vtime_delta(vtime);
+ u64 other;
+
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+ other = account_other_time(delta);
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ vtime->starttime += delta;
+
+ return delta - other;
+}
+
+static void __vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ vtime->stime += get_vtime_delta(vtime);
+ if (vtime->stime >= TICK_NSEC) {
+ account_system_time(tsk, irq_count(), vtime->stime);
+ vtime->stime = 0;
+ }
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ vtime->gtime += get_vtime_delta(vtime);
+ if (vtime->gtime >= TICK_NSEC) {
+ account_guest_time(tsk, vtime->gtime);
+ vtime->gtime = 0;
+ }
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ struct vtime *vtime = &tsk->vtime;
+
+ if (!vtime_delta(vtime))
+ return;
+
+ write_seqcount_begin(&vtime->seqcount);
+ /* We might have scheduled out from guest path */
+ if (tsk->flags & PF_VCPU)
+ vtime_account_guest(tsk, vtime);
+ else
+ __vtime_account_system(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_user_exit(struct task_struct *tsk)
+{
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->utime += get_vtime_delta(vtime);
+ if (vtime->utime >= TICK_NSEC) {
+ account_user_time(tsk, vtime->utime);
+ vtime->utime = 0;
+ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ struct vtime *vtime = &tsk->vtime;
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_starttime flush and update.
+ * That enforces a right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
+ tsk->flags |= PF_VCPU;
+ write_seqcount_end(&vtime->seqcount);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_guest(tsk, vtime);
+ tsk->flags &= ~PF_VCPU;
+ write_seqcount_end(&vtime->seqcount);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ account_idle_time(get_vtime_delta(&tsk->vtime));
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ struct vtime *vtime = &prev->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_INACTIVE;
+ write_seqcount_end(&vtime->seqcount);
+
+ vtime = &current->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_init_idle(struct task_struct *t, int cpu)
+{
+ struct vtime *vtime = &t->vtime;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ write_seqcount_end(&vtime->seqcount);
+ local_irq_restore(flags);
+}
+
+u64 task_gtime(struct task_struct *t)
+{
+ struct vtime *vtime = &t->vtime;
+ unsigned int seq;
+ u64 gtime;
+
+ if (!vtime_accounting_enabled())
+ return t->gtime;
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ gtime = t->gtime;
+ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ gtime += vtime->gtime + vtime_delta(vtime);
+
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+{
+ struct vtime *vtime = &t->vtime;
+ unsigned int seq;
+ u64 delta;
+
+ if (!vtime_accounting_enabled()) {
+ *utime = t->utime;
+ *stime = t->stime;
+ return;
+ }
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ *utime = t->utime;
+ *stime = t->stime;
+
+ /* Task is sleeping, nothing to add */
+ if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += vtime->utime + delta;
+ else if (vtime->state == VTIME_SYS)
+ *stime += vtime->stime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000..beec5081a
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,2793 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deadline Scheduling Class (SCHED_DEADLINE)
+ *
+ * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
+ *
+ * Tasks that periodically executes their instances for less than their
+ * runtime won't miss any of their deadlines.
+ * Tasks that are not periodic or sporadic or that tries to execute more
+ * than their reserved bandwidth will be slowed down (and may potentially
+ * miss some of their deadlines), and won't affect any other task.
+ *
+ * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
+ * Juri Lelli <juri.lelli@gmail.com>,
+ * Michael Trimarchi <michael@amarulasolutions.com>,
+ * Fabio Checconi <fchecconi@gmail.com>
+ */
+#include "sched.h"
+#include "pelt.h"
+
+struct dl_bandwidth def_dl_bandwidth;
+
+static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
+{
+ return container_of(dl_se, struct task_struct, dl);
+}
+
+static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
+{
+ return container_of(dl_rq, struct rq, dl);
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->dl;
+}
+
+static inline int on_dl_rq(struct sched_dl_entity *dl_se)
+{
+ return !RB_EMPTY_NODE(&dl_se->rb_node);
+}
+
+#ifdef CONFIG_SMP
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+static inline
+void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
+}
+
+static inline
+void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->running_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ if (dl_rq->running_bw > old)
+ dl_rq->running_bw = 0;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
+}
+
+static inline
+void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw += dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+}
+
+static inline
+void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ dl_rq->this_bw -= dl_bw;
+ SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ if (dl_rq->this_bw > old)
+ dl_rq->this_bw = 0;
+ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+static inline
+void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ struct rq *rq;
+
+ BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+
+ if (task_on_rq_queued(p))
+ return;
+
+ rq = task_rq(p);
+ if (p->dl.dl_non_contending) {
+ sub_running_bw(&p->dl, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __add_rq_bw(new_bw, &rq->dl);
+}
+
+/*
+ * The utilization of a task cannot be immediately removed from
+ * the rq active utilization (running_bw) when the task blocks.
+ * Instead, we have to wait for the so called "0-lag time".
+ *
+ * If a task blocks before the "0-lag time", a timer (the inactive
+ * timer) is armed, and running_bw is decreased when the timer
+ * fires.
+ *
+ * If the task wakes up again before the inactive timer fires,
+ * the timer is cancelled, whereas if the task wakes up after the
+ * inactive timer fired (and running_bw has been decreased) the
+ * task's utilization has to be added to running_bw again.
+ * A flag in the deadline scheduling entity (dl_non_contending)
+ * is used to avoid race conditions between the inactive timer handler
+ * and task wakeups.
+ *
+ * The following diagram shows how running_bw is updated. A task is
+ * "ACTIVE" when its utilization contributes to running_bw; an
+ * "ACTIVE contending" task is in the TASK_RUNNING state, while an
+ * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
+ * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
+ * time already passed, which does not contribute to running_bw anymore.
+ * +------------------+
+ * wakeup | ACTIVE |
+ * +------------------>+ contending |
+ * | add_running_bw | |
+ * | +----+------+------+
+ * | | ^
+ * | dequeue | |
+ * +--------+-------+ | |
+ * | | t >= 0-lag | | wakeup
+ * | INACTIVE |<---------------+ |
+ * | | sub_running_bw | |
+ * +--------+-------+ | |
+ * ^ | |
+ * | t < 0-lag | |
+ * | | |
+ * | V |
+ * | +----+------+------+
+ * | sub_running_bw | ACTIVE |
+ * +-------------------+ |
+ * inactive timer | non contending |
+ * fired +------------------+
+ *
+ * The task_non_contending() function is invoked when a task
+ * blocks, and checks if the 0-lag time already passed or
+ * not (in the first case, it directly updates running_bw;
+ * in the second case, it arms the inactive timer).
+ *
+ * The task_contending() function is invoked when a task wakes
+ * up, and checks if the task is still in the "ACTIVE non contending"
+ * state or not (in the second case, it updates running_bw).
+ */
+static void task_non_contending(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->inactive_timer;
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ s64 zerolag_time;
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ WARN_ON(dl_se->dl_non_contending);
+
+ zerolag_time = dl_se->deadline -
+ div64_long((dl_se->runtime * dl_se->dl_period),
+ dl_se->dl_runtime);
+
+ /*
+ * Using relative times instead of the absolute "0-lag time"
+ * allows to simplify the code
+ */
+ zerolag_time -= rq_clock(rq);
+
+ /*
+ * If the "0-lag time" already passed, decrease the active
+ * utilization now, instead of starting a timer
+ */
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
+ if (dl_task(p))
+ sub_running_bw(dl_se, dl_rq);
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD)
+ sub_rq_bw(&p->dl, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_clear_params(p);
+ raw_spin_unlock(&dl_b->lock);
+ }
+
+ return;
+ }
+
+ dl_se->dl_non_contending = 1;
+ get_task_struct(p);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+}
+
+static void task_contending(struct sched_dl_entity *dl_se, int flags)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (flags & ENQUEUE_MIGRATED)
+ add_rq_bw(dl_se, dl_rq);
+
+ if (dl_se->dl_non_contending) {
+ dl_se->dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
+ put_task_struct(dl_task_of(dl_se));
+ } else {
+ /*
+ * Since "dl_non_contending" is not set, the
+ * task's utilization has already been removed from
+ * active utilization (either when the task blocked,
+ * when the "inactive timer" fired).
+ * So, add it back.
+ */
+ add_running_bw(dl_se, dl_rq);
+ }
+}
+
+static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+}
+
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+{
+ raw_spin_lock_init(&dl_b->dl_runtime_lock);
+ dl_b->dl_period = period;
+ dl_b->dl_runtime = runtime;
+}
+
+void init_dl_bw(struct dl_bw *dl_b)
+{
+ raw_spin_lock_init(&dl_b->lock);
+ raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
+ if (global_rt_runtime() == RUNTIME_INF)
+ dl_b->bw = -1;
+ else
+ dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
+ raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+ dl_b->total_bw = 0;
+}
+
+void init_dl_rq(struct dl_rq *dl_rq)
+{
+ dl_rq->root = RB_ROOT_CACHED;
+
+#ifdef CONFIG_SMP
+ /* zero means no -deadline tasks */
+ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
+
+ dl_rq->dl_nr_migratory = 0;
+ dl_rq->overloaded = 0;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
+#else
+ init_dl_bw(&dl_rq->dl_bw);
+#endif
+
+ dl_rq->running_bw = 0;
+ dl_rq->this_bw = 0;
+ init_dl_rq_bw_ratio(dl_rq);
+}
+
+#ifdef CONFIG_SMP
+
+static inline int dl_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->dlo_count);
+}
+
+static inline void dl_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
+ /*
+ * Must be visible before the overload count is
+ * set (as in sched_rt.c).
+ *
+ * Matched by the barrier in pull_dl_task().
+ */
+ smp_wmb();
+ atomic_inc(&rq->rd->dlo_count);
+}
+
+static inline void dl_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ atomic_dec(&rq->rd->dlo_count);
+ cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
+}
+
+static void update_dl_migration(struct dl_rq *dl_rq)
+{
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+ if (!dl_rq->overloaded) {
+ dl_set_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 1;
+ }
+ } else if (dl_rq->overloaded) {
+ dl_clear_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 0;
+ }
+}
+
+static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory++;
+
+ update_dl_migration(dl_rq);
+}
+
+static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory--;
+
+ update_dl_migration(dl_rq);
+}
+
+/*
+ * The list of pushable -deadline task is not a plist, like in
+ * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
+ */
+static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct task_struct *entry;
+ bool leftmost = true;
+
+ BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct task_struct,
+ pushable_dl_tasks);
+ if (dl_entity_preempt(&p->dl, &entry->dl))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->earliest_dl.next = p->dl.deadline;
+
+ rb_link_node(&p->pushable_dl_tasks, parent, link);
+ rb_insert_color_cached(&p->pushable_dl_tasks,
+ &dl_rq->pushable_dl_tasks_root, leftmost);
+}
+
+static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
+ return;
+
+ if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&p->pushable_dl_tasks);
+ if (next_node) {
+ dl_rq->earliest_dl.next = rb_entry(next_node,
+ struct task_struct, pushable_dl_tasks)->dl.deadline;
+ }
+ }
+
+ rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
+}
+
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+}
+
+static int push_dl_task(struct rq *rq);
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return dl_task(prev);
+}
+
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+
+static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
+
+static inline void deadline_queue_push_tasks(struct rq *rq)
+{
+ if (!has_pushable_dl_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void deadline_queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
+}
+
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ struct dl_bw *dl_b;
+
+ later_rq = find_lock_later_rq(p, rq);
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to pick any
+ * online CPU:
+ */
+ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Failed to find any suitable CPU.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+ /*
+ * Inactive timer is armed (or callback is running, but
+ * waiting for us to release rq locks). In any case, when it
+ * will fire (or continue), it will see running_bw of this
+ * task migrated to later_rq (and correctly handle it).
+ */
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+
+ add_rq_bw(&p->dl, &later_rq->dl);
+ add_running_bw(&p->dl, &later_rq->dl);
+ } else {
+ sub_rq_bw(&p->dl, &rq->dl);
+ add_rq_bw(&p->dl, &later_rq->dl);
+ }
+
+ /*
+ * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * since p is still hanging out in the old (now moved to default) root
+ * domain.
+ */
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
+ dl_b = &later_rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
+ set_task_cpu(p, later_rq->cpu);
+ double_unlock_balance(later_rq, rq);
+
+ return later_rq;
+}
+
+#else
+
+static inline
+void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline
+void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline void pull_dl_task(struct rq *rq)
+{
+}
+
+static inline void deadline_queue_push_tasks(struct rq *rq)
+{
+}
+
+static inline void deadline_queue_pull_task(struct rq *rq)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+
+/*
+ * We are being explicitly informed that a new instance is starting,
+ * and this means that:
+ * - the absolute deadline of the entity has to be placed at
+ * current time + relative deadline;
+ * - the runtime of the entity has to be set to the maximum value.
+ *
+ * The capability of specifying such event is useful whenever a -deadline
+ * entity wants to (try to!) synchronize its behaviour with the scheduler's
+ * one, and to (try to!) reconcile itself with its own scheduling
+ * parameters.
+ */
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ WARN_ON(dl_se->dl_boosted);
+ WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+ /*
+ * We are racing with the deadline timer. So, do nothing because
+ * the deadline timer handler will take care of properly recharging
+ * the runtime and postponing the deadline
+ */
+ if (dl_se->dl_throttled)
+ return;
+
+ /*
+ * We use the regular wall clock time to set deadlines in the
+ * future; in fact, we must consider execution overheads (time
+ * spent on hardirq context, etc.).
+ */
+ dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+ dl_se->runtime = dl_se->dl_runtime;
+}
+
+/*
+ * Pure Earliest Deadline First (EDF) scheduling does not deal with the
+ * possibility of a entity lasting more than what it declared, and thus
+ * exhausting its runtime.
+ *
+ * Here we are interested in making runtime overrun possible, but we do
+ * not want a entity which is misbehaving to affect the scheduling of all
+ * other entities.
+ * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
+ * is used, in order to confine each entity within its own bandwidth.
+ *
+ * This function deals exactly with that, and ensures that when the runtime
+ * of a entity is replenished, its deadline is also postponed. That ensures
+ * the overrunning entity can't interfere with other entity in the system and
+ * can't make them miss their deadlines. Reasons why this kind of overruns
+ * could happen are, typically, a entity voluntarily trying to overcome its
+ * runtime, or it just underestimated it during sched_setattr().
+ */
+static void replenish_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ BUG_ON(pi_se->dl_runtime <= 0);
+
+ /*
+ * This could be the case for a !-dl task that is boosted.
+ * Just go with full inherited parameters.
+ */
+ if (dl_se->dl_deadline == 0) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+
+ if (dl_se->dl_yielded && dl_se->runtime > 0)
+ dl_se->runtime = 0;
+
+ /*
+ * We keep moving the deadline away until we get some
+ * available runtime for the entity. This ensures correct
+ * handling of situations where the runtime overrun is
+ * arbitrary large.
+ */
+ while (dl_se->runtime <= 0) {
+ dl_se->deadline += pi_se->dl_period;
+ dl_se->runtime += pi_se->dl_runtime;
+ }
+
+ /*
+ * At this point, the deadline really should be "in
+ * the future" with respect to rq->clock. If it's
+ * not, we are, for some reason, lagging too much!
+ * Anyway, after having warn userspace abut that,
+ * we still try to keep the things running by
+ * resetting the deadline and the budget of the
+ * entity.
+ */
+ if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
+ printk_deferred_once("sched: DL replenish lagged too much\n");
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
+}
+
+/*
+ * Here we check if --at time t-- an entity (which is probably being
+ * [re]activated or, in general, enqueued) can use its remaining runtime
+ * and its current deadline _without_ exceeding the bandwidth it is
+ * assigned (function returns true if it can't). We are in fact applying
+ * one of the CBS rules: when a task wakes up, if the residual runtime
+ * over residual deadline fits within the allocated bandwidth, then we
+ * can keep the current (absolute) deadline and residual budget without
+ * disrupting the schedulability of the system. Otherwise, we should
+ * refill the runtime and set the deadline a period in the future,
+ * because keeping the current (absolute) deadline of the task would
+ * result in breaking guarantees promised to other tasks (refer to
+ * Documentation/scheduler/sched-deadline.txt for more informations).
+ *
+ * This function returns true if:
+ *
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
+ *
+ * IOW we can't recycle current parameters.
+ *
+ * Notice that the bandwidth check is done against the deadline. For
+ * task with deadline equal to period this is the same of using
+ * dl_period instead of dl_deadline in the equation above.
+ */
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, u64 t)
+{
+ u64 left, right;
+
+ /*
+ * left and right are the two sides of the equation above,
+ * after a bit of shuffling to use multiplications instead
+ * of divisions.
+ *
+ * Note that none of the time values involved in the two
+ * multiplications are absolute: dl_deadline and dl_runtime
+ * are the relative deadline and the maximum runtime of each
+ * instance, runtime is the runtime left for the last instance
+ * and (deadline - t), since t is rq->clock, is the time left
+ * to the (absolute) deadline. Even if overflowing the u64 type
+ * is very unlikely to occur in both cases, here we scale down
+ * as we want to avoid that risk at all. Scaling down by 10
+ * means that we reduce granularity to 1us. We are fine with it,
+ * since this is only a true/false check and, anyway, thinking
+ * of anything below microseconds resolution is actually fiction
+ * (but still we want to give the user that illusion >;).
+ */
+ left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ right = ((dl_se->deadline - t) >> DL_SCALE) *
+ (pi_se->dl_runtime >> DL_SCALE);
+
+ return dl_time_before(right, left);
+}
+
+/*
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
+ *
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 laxity = dl_se->deadline - rq_clock(rq);
+
+ /*
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
+ */
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
+ */
+static void update_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
+ dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+}
+
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+ return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
+}
+
+/*
+ * If the entity depleted all its runtime, and if we want it to sleep
+ * while waiting for some new execution time to become available, we
+ * set the bandwidth replenishment timer to the replenishment instant
+ * and try to activate it.
+ *
+ * Notice that it is important for the caller to know if the timer
+ * actually started or not (i.e., the replenishment instant is in
+ * the future or in the past).
+ */
+static int start_dl_timer(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->dl_timer;
+ struct rq *rq = task_rq(p);
+ ktime_t now, act;
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+
+ /*
+ * We want the timer to fire at the deadline, but considering
+ * that it is actually coming from rq->clock and not from
+ * hrtimer's time base reading.
+ */
+ act = ns_to_ktime(dl_next_period(dl_se));
+ now = hrtimer_cb_get_time(timer);
+ delta = ktime_to_ns(now) - rq_clock(rq);
+ act = ktime_add_ns(act, delta);
+
+ /*
+ * If the expiry time already passed, e.g., because the value
+ * chosen as the deadline is too small, don't even try to
+ * start the timer in the past!
+ */
+ if (ktime_us_delta(act, now) < 0)
+ return 0;
+
+ /*
+ * !enqueued will guarantee another callback; even if one is already in
+ * progress. This ensures a balanced {get,put}_task_struct().
+ *
+ * The race against __run_timer() clearing the enqueued state is
+ * harmless because we're holding task_rq()->lock, therefore the timer
+ * expiring after we've done the check will wait on its task_rq_lock()
+ * and observe our state.
+ */
+ if (!hrtimer_is_queued(timer)) {
+ get_task_struct(p);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ }
+
+ return 1;
+}
+
+/*
+ * This is the bandwidth enforcement timer callback. If here, we know
+ * a task is not on its dl_rq, since the fact that the timer was running
+ * means the task is throttled and needs a runtime replenishment.
+ *
+ * However, what we actually do depends on the fact the task is active,
+ * (it is on its rq) or has been removed from there by a call to
+ * dequeue_task_dl(). In the former case we must issue the runtime
+ * replenishment and add the task back to the dl_rq; in the latter, we just
+ * do nothing but clearing dl_throttled, so that runtime and deadline
+ * updating (and the queueing back to dl_rq) will be done by the
+ * next call to enqueue_task_dl().
+ */
+static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ dl_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * The task might have changed its scheduling policy to something
+ * different than SCHED_DEADLINE (through switched_from_dl()).
+ */
+ if (!dl_task(p))
+ goto unlock;
+
+ /*
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path, its not throttled.
+ */
+ if (dl_se->dl_boosted)
+ goto unlock;
+
+ /*
+ * Spurious timer due to start_dl_timer() race; or we already received
+ * a replenishment from rt_mutex_setprio().
+ */
+ if (!dl_se->dl_throttled)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
+#ifdef CONFIG_SMP
+ if (unlikely(!rq->online)) {
+ /*
+ * If the runqueue is no longer available, migrate the
+ * task elsewhere. This necessarily changes rq.
+ */
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(&rq->lock);
+ update_rq_clock(rq);
+
+ /*
+ * Now that the task has been migrated to the new RQ and we
+ * have that locked, proceed as normal and enqueue the task
+ * there.
+ */
+ }
+#endif
+
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so its safe to drop
+ * rq->lock.
+ */
+ rq_unpin_lock(rq, &rf);
+ push_dl_task(rq);
+ rq_repin_lock(rq, &rf);
+ }
+#endif
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * This can free the task_struct, including this hrtimer, do not touch
+ * anything related to that after this.
+ */
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->dl_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = dl_task_timer;
+}
+
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadine < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the begin of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ return;
+ dl_se->dl_throttled = 1;
+ if (dl_se->runtime > 0)
+ dl_se->runtime = 0;
+ }
+}
+
+static
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
+{
+ return (dl_se->runtime <= 0);
+}
+
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
+/*
+ * This function implements the GRUB accounting rule:
+ * according to the GRUB reclaiming algorithm, the runtime is
+ * not decreased as "dq = -dt", but as
+ * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by
+ * BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
+ */
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
+ u64 u_act;
+ u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+
+ /*
+ * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
+ * we compare u_inact + rq->dl.extra_bw with
+ * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
+ * u_inact + rq->dl.extra_bw can be larger than
+ * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
+ * leading to wrong results)
+ */
+ if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
+ u_act = u_act_min;
+ else
+ u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+
+ return (delta * u_act) >> BW_SHIFT;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ u64 delta_exec, scaled_delta_exec;
+ int cpu = cpu_of(rq);
+ u64 now;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ now = rq_clock_task(rq);
+ delta_exec = now - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0)) {
+ if (unlikely(dl_se->dl_yielded))
+ goto throttle;
+ return;
+ }
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = now;
+ cgroup_account_cputime(curr, delta_exec);
+
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ /*
+ * For tasks that participate in GRUB, we implement GRUB-PA: the
+ * spare reclaimed bandwidth is used to clock down frequency.
+ *
+ * For the others, we still need to scale reservation parameters
+ * according to current frequency and CPU maximum capacity.
+ */
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+ scaled_delta_exec = grub_reclaim(delta_exec,
+ rq,
+ &curr->dl);
+ } else {
+ unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+ scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+ }
+
+ dl_se->runtime -= scaled_delta_exec;
+
+throttle:
+ if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
+ dl_se->dl_throttled = 1;
+
+ /* If requested, inform the user about runtime overruns. */
+ if (dl_runtime_exceeded(dl_se) &&
+ (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+ dl_se->dl_overrun = 1;
+
+ __dequeue_task_dl(rq, curr, 0);
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+ enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+
+ if (!is_leftmost(curr, &rq->dl))
+ resched_curr(rq);
+ }
+
+ /*
+ * Because -- for now -- we share the rt bandwidth, we need to
+ * account our runtime there too, otherwise actual rt tasks
+ * would be able to exceed the shared quota.
+ *
+ * Account to the root rt group for now.
+ *
+ * The solution we're working towards is having the RT groups scheduled
+ * using deadline servers -- however there's a few nasties to figure
+ * out before that can happen.
+ */
+ if (rt_bandwidth_enabled()) {
+ struct rt_rq *rt_rq = &rq->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We'll let actual RT tasks worry about the overflow here, we
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
+ */
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+}
+
+static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ inactive_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (!dl_task(p) || p->state == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
+ sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
+ dl_se->dl_non_contending = 0;
+ }
+
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(p);
+
+ goto unlock;
+ }
+ if (dl_se->dl_non_contending == 0)
+ goto unlock;
+
+ sub_running_bw(dl_se, &rq->dl);
+ dl_se->dl_non_contending = 0;
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->inactive_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = inactive_task_timer;
+}
+
+#ifdef CONFIG_SMP
+
+static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ if (dl_rq->earliest_dl.curr == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ dl_rq->earliest_dl.curr = deadline;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
+ }
+}
+
+static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * Since we may have removed our earliest (and/or next earliest)
+ * task we must recompute them.
+ */
+ if (!dl_rq->dl_nr_running) {
+ dl_rq->earliest_dl.curr = 0;
+ dl_rq->earliest_dl.next = 0;
+ cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ } else {
+ struct rb_node *leftmost = dl_rq->root.rb_leftmost;
+ struct sched_dl_entity *entry;
+
+ entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
+ dl_rq->earliest_dl.curr = entry->deadline;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
+ }
+}
+
+#else
+
+static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+
+#endif /* CONFIG_SMP */
+
+static inline
+void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+ u64 deadline = dl_se->deadline;
+
+ WARN_ON(!dl_prio(prio));
+ dl_rq->dl_nr_running++;
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ inc_dl_deadline(dl_rq, deadline);
+ inc_dl_migration(dl_se, dl_rq);
+}
+
+static inline
+void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+
+ WARN_ON(!dl_prio(prio));
+ WARN_ON(!dl_rq->dl_nr_running);
+ dl_rq->dl_nr_running--;
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ dec_dl_deadline(dl_rq, dl_se->deadline);
+ dec_dl_migration(dl_se, dl_rq);
+}
+
+static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rb_node **link = &dl_rq->root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_dl_entity *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_dl_entity, rb_node);
+ if (dl_time_before(dl_se->deadline, entry->deadline))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ rb_link_node(&dl_se->rb_node, parent, link);
+ rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
+
+ inc_dl_tasks(dl_se, dl_rq);
+}
+
+static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ if (RB_EMPTY_NODE(&dl_se->rb_node))
+ return;
+
+ rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
+ RB_CLEAR_NODE(&dl_se->rb_node);
+
+ dec_dl_tasks(dl_se, dl_rq);
+}
+
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, int flags)
+{
+ BUG_ON(on_dl_rq(dl_se));
+
+ /*
+ * If this is a wakeup or a new instance, the scheduling
+ * parameters of the task might need updating. Otherwise,
+ * we want a replenishment of its runtime.
+ */
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
+ update_dl_entity(dl_se, pi_se);
+ } else if (flags & ENQUEUE_REPLENISH) {
+ replenish_dl_entity(dl_se, pi_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+ rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ setup_new_dl_entity(dl_se);
+ }
+
+ __enqueue_dl_entity(dl_se);
+}
+
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ __dequeue_dl_entity(dl_se);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct sched_dl_entity *pi_se = &p->dl;
+
+ /*
+ * Use the scheduling parameters of the top pi-waiter task if:
+ * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
+ * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
+ * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
+ * boosted due to a SCHED_DEADLINE pi-waiter).
+ * Otherwise we keep our runtime and deadline.
+ */
+ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
+ pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceeds its
+ * runtime while doing so. No point in replenishing
+ * it, as it's going to return back to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
+
+ /*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
+ dl_check_constrained_dl(&p->dl);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
+ add_rq_bw(&p->dl, &rq->dl);
+ add_running_bw(&p->dl, &rq->dl);
+ }
+
+ /*
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ * However, the active utilization does not depend on the fact
+ * that the task is on the runqueue or not (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
+ */
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(&p->dl, flags);
+
+ return;
+ }
+
+ enqueue_dl_entity(&p->dl, pi_se, flags);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ dequeue_dl_entity(&p->dl);
+ dequeue_pushable_dl_task(rq, p);
+}
+
+static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_curr_dl(rq);
+ __dequeue_task_dl(rq, p, flags);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+ }
+
+ /*
+ * This check allows to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
+ */
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(p);
+}
+
+/*
+ * Yield task semantic for -deadline tasks is:
+ *
+ * get off from the CPU until our next instance, with
+ * a new runtime. This is of little use now, since we
+ * don't have a bandwidth reclaiming mechanism. Anyway,
+ * bandwidth reclaiming is planned for the future, and
+ * yield_task_dl will indicate that some spare budget
+ * is available for other task instances to use it.
+ */
+static void yield_task_dl(struct rq *rq)
+{
+ /*
+ * We make the task go to sleep until its current deadline by
+ * forcing its runtime to zero. This way, update_curr_dl() stops
+ * it and the bandwidth timer will wake it up and will give it
+ * new scheduling parameters (thanks to dl_yielded=1).
+ */
+ rq->curr->dl.dl_yielded = 1;
+
+ update_rq_clock(rq);
+ update_curr_dl(rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
+}
+
+#ifdef CONFIG_SMP
+
+static int find_later_rq(struct task_struct *task);
+
+static int
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ struct task_struct *curr;
+ struct rq *rq;
+
+ if (sd_flag != SD_BALANCE_WAKE)
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = READ_ONCE(rq->curr); /* unlocked access */
+
+ /*
+ * If we are dealing with a -deadline task, we must
+ * decide where to wake it up.
+ * If it has a later deadline and the current task
+ * on this rq can't move (provided the waking task
+ * can!) we prefer to send it somewhere else. On the
+ * other hand, if it has a shorter deadline, we
+ * try to make it stay here, it might be important.
+ */
+ if (unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ (p->nr_cpus_allowed > 1)) {
+ int target = find_later_rq(p);
+
+ if (target != -1 &&
+ (dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr) ||
+ (cpu_rq(target)->dl.dl_nr_running == 0)))
+ cpu = target;
+ }
+ rcu_read_unlock();
+
+out:
+ return cpu;
+}
+
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
+{
+ struct rq *rq;
+
+ if (p->state != TASK_WAKING)
+ return;
+
+ rq = task_rq(p);
+ /*
+ * Since p->state == TASK_WAKING, set_task_cpu() has been called
+ * from try_to_wake_up(). Hence, p->pi_lock is locked, but
+ * rq->lock is not... So, lock it
+ */
+ raw_spin_lock(&rq->lock);
+ if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
+ sub_running_bw(&p->dl, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be cancelled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+ }
+ sub_rq_bw(&p->dl, &rq->dl);
+ raw_spin_unlock(&rq->lock);
+}
+
+static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
+ return;
+
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
+ if (p->nr_cpus_allowed != 1 &&
+ cpudl_find(&rq->rd->cpudl, p, NULL))
+ return;
+
+ resched_curr(rq);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Only called when both the current and waking task are -deadline
+ * tasks.
+ */
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ resched_curr(rq);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * In the unlikely case current and p have the same deadline
+ * let us try to decide what's the best thing to do...
+ */
+ if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ !test_tsk_need_resched(rq->curr))
+ check_preempt_equal_dl(rq, p);
+#endif /* CONFIG_SMP */
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+ hrtick_start(rq, p->dl.runtime);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
+static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
+ struct dl_rq *dl_rq)
+{
+ struct rb_node *left = rb_first_cached(&dl_rq->root);
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_dl_entity, rb_node);
+}
+
+static struct task_struct *
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p;
+ struct dl_rq *dl_rq;
+
+ dl_rq = &rq->dl;
+
+ if (need_pull_dl_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ /*
+ * pull_dl_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && task_on_rq_queued(rq->stop))
+ return RETRY_TASK;
+ }
+
+ /*
+ * When prev is DL, we may throttle it in put_prev_task().
+ * So, we update time before we check for dl_nr_running.
+ */
+ if (prev->sched_class == &dl_sched_class)
+ update_curr_dl(rq);
+
+ if (unlikely(!dl_rq->dl_nr_running))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ dl_se = pick_next_dl_entity(rq, dl_rq);
+ BUG_ON(!dl_se);
+
+ p = dl_task_of(dl_se);
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* Running task will never be pushed. */
+ dequeue_pushable_dl_task(rq, p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ deadline_queue_push_tasks(rq);
+
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ return p;
+}
+
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+{
+ update_curr_dl(rq);
+
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
+{
+ update_curr_dl(rq);
+
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ /*
+ * Even when we have runtime, update_curr_dl() might have resulted in us
+ * not being the leftmost task anymore. In that case NEED_RESCHED will
+ * be set and schedule() will start a new hrtick for the next task.
+ */
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(p, &rq->dl))
+ start_hrtick_dl(rq, p);
+}
+
+static void task_fork_dl(struct task_struct *p)
+{
+ /*
+ * SCHED_DEADLINE tasks cannot fork and this is achieved through
+ * sched_fork()
+ */
+}
+
+static void set_curr_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* You can't push away the running task */
+ dequeue_pushable_dl_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define DL_MAX_TRIES 3
+
+static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return 1;
+ return 0;
+}
+
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
+ struct task_struct *p = NULL;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+next_node:
+ if (next_node) {
+ p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ next_node = rb_next(next_node);
+ goto next_node;
+ }
+
+ return NULL;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
+
+static int find_later_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+ int this_cpu = smp_processor_id();
+ int cpu = task_cpu(task);
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!later_mask))
+ return -1;
+
+ if (task->nr_cpus_allowed == 1)
+ return -1;
+
+ /*
+ * We have to consider system topology and task affinity
+ * first, then we can look for a suitable CPU.
+ */
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
+ return -1;
+
+ /*
+ * If we are here, some targets have been found, including
+ * the most suitable which is, among the runqueues where the
+ * current tasks have later deadlines than the task's one, the
+ * rq with the latest possible one.
+ *
+ * Now we check how well this matches with task's
+ * affinity and system topology.
+ *
+ * The last CPU where the task run is our first
+ * guess, since it is most likely cache-hot there.
+ */
+ if (cpumask_test_cpu(cpu, later_mask))
+ return cpu;
+ /*
+ * Check if this_cpu is to be skipped (i.e., it is
+ * not in the mask) or not.
+ */
+ if (!cpumask_test_cpu(this_cpu, later_mask))
+ this_cpu = -1;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
+
+ /*
+ * If possible, preempting this_cpu is
+ * cheaper than migrating.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
+ /*
+ * Last chance: if a CPU being in both later_mask
+ * and current sd span is valid, that becomes our
+ * choice. Of course, the latest possible CPU is
+ * already under consideration through later_mask.
+ */
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * At this point, all our guesses failed, we just return
+ * 'something', and let the caller sort the things out.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any(later_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ return -1;
+}
+
+/* Locks the rq it finds */
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *later_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < DL_MAX_TRIES; tries++) {
+ cpu = find_later_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ later_rq = cpu_rq(cpu);
+
+ if (later_rq->dl.dl_nr_running &&
+ !dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr)) {
+ /*
+ * Target rq has tasks of equal or earlier deadline,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ later_rq = NULL;
+ break;
+ }
+
+ /* Retry if something changed. */
+ if (double_lock_balance(rq, later_rq)) {
+ if (unlikely(task_rq(task) != rq ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+ task_running(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) {
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ break;
+ }
+ }
+
+ /*
+ * If the rq we found has no -deadline task, or
+ * its earliest one has a later deadline than our
+ * task, the rq is a good one.
+ */
+ if (!later_rq->dl.dl_nr_running ||
+ dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr))
+ break;
+
+ /* Otherwise we try again. */
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ }
+
+ return later_rq;
+}
+
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
+ struct task_struct, pushable_dl_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!dl_task(p));
+
+ return p;
+}
+
+/*
+ * See if the non running -deadline tasks on this rq
+ * can be sent to some other CPU where they can preempt
+ * and start executing.
+ */
+static int push_dl_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *later_rq;
+ int ret = 0;
+
+ if (!rq->dl.overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_dl_task(rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * If next_task preempts rq->curr, and rq->curr
+ * can move away, it makes sense to just reschedule
+ * without going further in pushing next_task.
+ */
+ if (dl_task(rq->curr) &&
+ dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ rq->curr->nr_cpus_allowed > 1) {
+ resched_curr(rq);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* Will lock the rq it'll find */
+ later_rq = find_lock_later_rq(next_task, rq);
+ if (!later_rq) {
+ struct task_struct *task;
+
+ /*
+ * We must check all this again, since
+ * find_lock_later_rq releases rq->lock and it is
+ * then possible that next_task has migrated.
+ */
+ task = pick_next_pushable_dl_task(rq);
+ if (task == next_task) {
+ /*
+ * The task is still there. We don't try
+ * again, some other CPU will pull it when ready.
+ */
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks */
+ goto out;
+
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ sub_running_bw(&next_task->dl, &rq->dl);
+ sub_rq_bw(&next_task->dl, &rq->dl);
+ set_task_cpu(next_task, later_rq->cpu);
+ add_rq_bw(&next_task->dl, &later_rq->dl);
+
+ /*
+ * Update the later_rq clock here, because the clock is used
+ * by the cpufreq_update_util() inside __add_running_bw().
+ */
+ update_rq_clock(later_rq);
+ add_running_bw(&next_task->dl, &later_rq->dl);
+ activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ ret = 1;
+
+ resched_curr(later_rq);
+
+ double_unlock_balance(rq, later_rq);
+
+out:
+ put_task_struct(next_task);
+
+ return ret;
+}
+
+static void push_dl_tasks(struct rq *rq)
+{
+ /* push_dl_task() will return true if it moved a -deadline task */
+ while (push_dl_task(rq))
+ ;
+}
+
+static void pull_dl_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, cpu;
+ struct task_struct *p;
+ bool resched = false;
+ struct rq *src_rq;
+ u64 dmin = LONG_MAX;
+
+ if (likely(!dl_overloaded(this_rq)))
+ return;
+
+ /*
+ * Match the barrier from dl_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the dlo_mask bit.
+ */
+ smp_rmb();
+
+ for_each_cpu(cpu, this_rq->rd->dlo_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+
+ /*
+ * It looks racy, abd it is! However, as in sched_rt.c,
+ * we are fine with this.
+ */
+ if (this_rq->dl.dl_nr_running &&
+ dl_time_before(this_rq->dl.earliest_dl.curr,
+ src_rq->dl.earliest_dl.next))
+ continue;
+
+ /* Might drop this_rq->lock */
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * If there are no more pullable tasks on the
+ * rq, we're done with it.
+ */
+ if (src_rq->dl.dl_nr_running <= 1)
+ goto skip;
+
+ p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
+
+ /*
+ * We found a task to be pulled if:
+ * - it preempts our current (if there's one),
+ * - it will preempt the last one we pulled (if any).
+ */
+ if (p && dl_time_before(p->dl.deadline, dmin) &&
+ (!this_rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ this_rq->dl.earliest_dl.curr))) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!task_on_rq_queued(p));
+
+ /*
+ * Then we pull iff p has actually an earlier
+ * deadline than the current task of its runqueue.
+ */
+ if (dl_time_before(p->dl.deadline,
+ src_rq->curr->dl.deadline))
+ goto skip;
+
+ resched = true;
+
+ deactivate_task(src_rq, p, 0);
+ sub_running_bw(&p->dl, &src_rq->dl);
+ sub_rq_bw(&p->dl, &src_rq->dl);
+ set_task_cpu(p, this_cpu);
+ add_rq_bw(&p->dl, &this_rq->dl);
+ add_running_bw(&p->dl, &this_rq->dl);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+
+ /* Is there any other task even earlier? */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ if (resched)
+ resched_curr(this_rq);
+}
+
+/*
+ * Since the task is not running and a reschedule is not going to happen
+ * anytime soon on its runqueue, we try pushing it away now.
+ */
+static void task_woken_dl(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ dl_task(rq->curr) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
+ push_dl_tasks(rq);
+ }
+}
+
+static void set_cpus_allowed_dl(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ struct root_domain *src_rd;
+ struct rq *rq;
+
+ BUG_ON(!dl_task(p));
+
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporary fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
+ set_cpus_allowed_common(p, new_mask);
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_set_overload(rq);
+
+ cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
+ if (rq->dl.dl_nr_running > 0)
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_clear_overload(rq);
+
+ cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+}
+
+void __init init_sched_dl_class(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
+ GFP_KERNEL, cpu_to_node(i));
+}
+
+#endif /* CONFIG_SMP */
+
+static void switched_from_dl(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * task_non_contending() can start the "inactive timer" (if the 0-lag
+ * time is in the future). If the task switches back to dl before
+ * the "inactive timer" fires, it can continue to consume its current
+ * runtime using its current deadline. If it stays outside of
+ * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
+ * will reset the task parameters.
+ */
+ if (task_on_rq_queued(p) && p->dl.dl_runtime)
+ task_non_contending(p);
+
+ if (!task_on_rq_queued(p)) {
+ /*
+ * Inactive timer is armed. However, p is leaving DEADLINE and
+ * might migrate away from this rq while continuing to run on
+ * some other class. We need to remove its contribution from
+ * this rq running_bw now, or sub_rq_bw (below) will complain.
+ */
+ if (p->dl.dl_non_contending)
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+ }
+
+ /*
+ * We cannot use inactive_task_timer() to invoke sub_running_bw()
+ * at the 0-lag time, because the task could have been migrated
+ * while SCHED_OTHER in the meanwhile.
+ */
+ if (p->dl.dl_non_contending)
+ p->dl.dl_non_contending = 0;
+
+ /*
+ * Since this might be the only -deadline task on the rq,
+ * this is the right place to try to pull some other one
+ * from an overloaded CPU, if any.
+ */
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ deadline_queue_pull_task(rq);
+}
+
+/*
+ * When switching to -deadline, we may overload the rq, then
+ * we try to push someone off, if possible.
+ */
+static void switched_to_dl(struct rq *rq, struct task_struct *p)
+{
+ if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+ put_task_struct(p);
+
+ /* If p is not queued we will update its parameters at next wakeup. */
+ if (!task_on_rq_queued(p)) {
+ add_rq_bw(&p->dl, &rq->dl);
+
+ return;
+ }
+
+ if (rq->curr != p) {
+#ifdef CONFIG_SMP
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+ deadline_queue_push_tasks(rq);
+#endif
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
+}
+
+/*
+ * If the scheduling parameters of a -deadline task changed,
+ * a push or pull operation might be needed.
+ */
+static void prio_changed_dl(struct rq *rq, struct task_struct *p,
+ int oldprio)
+{
+ if (task_on_rq_queued(p) || rq->curr == p) {
+#ifdef CONFIG_SMP
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+
+ /*
+ * If we now have a earlier deadline task than p,
+ * then reschedule, provided p is still on this
+ * runqueue.
+ */
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
+ resched_curr(rq);
+#else
+ /*
+ * Again, we don't know if p has a earlier
+ * or later deadline, so let's blindly set a
+ * (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif /* CONFIG_SMP */
+ }
+}
+
+const struct sched_class dl_sched_class = {
+ .next = &rt_sched_class,
+ .enqueue_task = enqueue_task_dl,
+ .dequeue_task = dequeue_task_dl,
+ .yield_task = yield_task_dl,
+
+ .check_preempt_curr = check_preempt_curr_dl,
+
+ .pick_next_task = pick_next_task_dl,
+ .put_prev_task = put_prev_task_dl,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
+ .set_cpus_allowed = set_cpus_allowed_dl,
+ .rq_online = rq_online_dl,
+ .rq_offline = rq_offline_dl,
+ .task_woken = task_woken_dl,
+#endif
+
+ .set_curr_task = set_curr_task_dl,
+ .task_tick = task_tick_dl,
+ .task_fork = task_fork_dl,
+
+ .prio_changed = prio_changed_dl,
+ .switched_from = switched_from_dl,
+ .switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
+};
+
+int sched_dl_global_validate(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ struct dl_bw *dl_b;
+ int cpu, cpus, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw * cpus < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+ if (global_rt_runtime() == RUNTIME_INF) {
+ dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+ dl_rq->extra_bw = 1 << BW_SHIFT;
+ } else {
+ dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+ global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+ dl_rq->extra_bw = to_ratio(global_rt_period(),
+ global_rt_runtime());
+ }
+}
+
+void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ struct dl_bw *dl_b;
+ int cpu;
+ unsigned long flags;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+ }
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+int sched_dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return 0;
+
+ /* !deadline task may carry old deadline bandwidth */
+ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ if (hrtimer_active(&p->dl.inactive_timer))
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ /*
+ * XXX this is slightly incorrect: when the task
+ * utilization decreases, we should delay the total
+ * utilization change until the task's 0-lag point.
+ * But this would require to set the task's "inactive
+ * timer" when the task is not inactive.
+ */
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ dl_change_utilization(p, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ /*
+ * Do not decrease the total deadline utilization here,
+ * switched_from_dl() will take care to do it at the correct
+ * (0-lag) time.
+ */
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+}
+
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+bool __checkparam_dl(const struct sched_attr *attr)
+{
+ /* special dl tasks don't actually use any parameter */
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return true;
+
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
+}
+
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
+
+ dl_se->dl_boosted = 0;
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
+}
+
+bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_SMP
+int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
+{
+ unsigned int dest_cpu;
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus, ret;
+ unsigned long flags;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow) {
+ ret = -EBUSY;
+ } else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw, cpus);
+ ret = 0;
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+bool dl_cpu_busy(unsigned int cpu)
+{
+ unsigned long flags;
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return overflow;
+}
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
new file mode 100644
index 000000000..b1ef4f2e7
--- /dev/null
+++ b/kernel/sched/debug.c
@@ -0,0 +1,1016 @@
+/*
+ * kernel/sched/debug.c
+ *
+ * Print the CFS rbtree and other debugging details
+ *
+ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include "sched.h"
+
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ pr_cont(x); \
+ } while (0)
+
+/*
+ * Ease the printing of nsec fields:
+ */
+static long long nsec_high(unsigned long long nsec)
+{
+ if ((long long)nsec < 0) {
+ nsec = -nsec;
+ do_div(nsec, 1000000);
+ return -nsec;
+ }
+ do_div(nsec, 1000000);
+
+ return nsec;
+}
+
+static unsigned long nsec_low(unsigned long long nsec)
+{
+ if ((long long)nsec < 0)
+ nsec = -nsec;
+
+ return do_div(nsec, 1000000);
+}
+
+#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+
+#define SCHED_FEAT(name, enabled) \
+ #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+#ifdef CONFIG_JUMP_LABEL
+
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ static_key_disable_cpuslocked(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ static_key_enable_cpuslocked(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* CONFIG_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+ int i;
+ int neg = 0;
+
+ if (strncmp(cmp, "NO_", 3) == 0) {
+ neg = 1;
+ cmp += 3;
+ }
+
+ i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
+ if (i < 0)
+ return i;
+
+ if (neg) {
+ sysctl_sched_features &= ~(1UL << i);
+ sched_feat_disable(i);
+ } else {
+ sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
+
+ return 0;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int ret;
+ struct inode *inode;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ cpus_read_lock();
+ inode_lock(inode);
+ ret = sched_feat_set(cmp);
+ inode_unlock(inode);
+ cpus_read_unlock();
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+__read_mostly bool sched_debug_enabled;
+
+static __init int sched_init_debug(void)
+{
+ debugfs_create_file("sched_features", 0644, NULL, NULL,
+ &sched_feat_fops);
+
+ debugfs_create_bool("sched_debug", 0644, NULL,
+ &sched_debug_enabled);
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
+ set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static cpumask_var_t sd_sysctl_cpus;
+static struct ctl_table_header *sd_sysctl_header;
+
+void register_sched_domain_sysctl(void)
+{
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
+ static bool init_done = false;
+ char buf[32];
+ int i;
+
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
+
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+ }
+
+ if (!init_done) {
+ init_done = true;
+ /* init to possible to not have holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
+{
+ struct sched_entity *se = tg->se[cpu];
+
+#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+
+ if (!se)
+ return;
+
+ PN(se->exec_start);
+ PN(se->vruntime);
+ PN(se->sum_exec_runtime);
+
+ if (schedstat_enabled()) {
+ PN_SCHEDSTAT(se->statistics.wait_start);
+ PN_SCHEDSTAT(se->statistics.sleep_start);
+ PN_SCHEDSTAT(se->statistics.block_start);
+ PN_SCHEDSTAT(se->statistics.sleep_max);
+ PN_SCHEDSTAT(se->statistics.block_max);
+ PN_SCHEDSTAT(se->statistics.exec_max);
+ PN_SCHEDSTAT(se->statistics.slice_max);
+ PN_SCHEDSTAT(se->statistics.wait_max);
+ PN_SCHEDSTAT(se->statistics.wait_sum);
+ P_SCHEDSTAT(se->statistics.wait_count);
+ }
+
+ P(se->load.weight);
+ P(se->runnable_weight);
+#ifdef CONFIG_SMP
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
+ P(se->avg.runnable_load_avg);
+#endif
+
+#undef PN_SCHEDSTAT
+#undef PN
+#undef P_SCHEDSTAT
+#undef P
+}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
+static char group_path[PATH_MAX];
+
+static void task_group_path(struct task_group *tg, char *path, int plen)
+{
+ if (autogroup_path(tg, path, plen))
+ return;
+
+ cgroup_path(tg->css.cgroup, path, plen);
+}
+
+/*
+ * Only 1 SEQ_printf_task_group_path() caller can use the full length
+ * group_path[] for cgroup path. Other simultaneous callers will have
+ * to use a shorter stack buffer. A "..." suffix is appended at the end
+ * of the stack buffer so that it will show up in case the output length
+ * matches the given buffer size to indicate possible path name truncation.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...) \
+{ \
+ if (spin_trylock(&sched_debug_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, fmt, group_path); \
+ spin_unlock(&sched_debug_lock); \
+ } else { \
+ char buf[128]; \
+ char *bufend = buf + sizeof(buf) - 3; \
+ task_group_path(tg, buf, bufend - buf); \
+ strcpy(bufend - 1, "..."); \
+ SEQ_printf(m, fmt, buf); \
+ } \
+}
+#endif
+
+static void
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+{
+ if (rq->curr == p)
+ SEQ_printf(m, ">R");
+ else
+ SEQ_printf(m, " %c", task_state_to_char(p));
+
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ p->comm, task_pid_nr(p),
+ SPLIT_NS(p->se.vruntime),
+ (long long)(p->nvcsw + p->nivcsw),
+ p->prio);
+
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
+
+#ifdef CONFIG_NUMA_BALANCING
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
+#endif
+
+ SEQ_printf(m, "\n");
+}
+
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+{
+ struct task_struct *g, *p;
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "runnable tasks:\n");
+ SEQ_printf(m, " S task PID tree-key switches prio"
+ " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, "-------------------------------------------------------"
+ "----------------------------------------------------\n");
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != rq_cpu)
+ continue;
+
+ print_task(m, rq, p);
+ }
+ rcu_read_unlock();
+}
+
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+ s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+ spread, rq0_min_vruntime, spread0;
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_entity *last;
+ unsigned long flags;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, "\n");
+ SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
+#else
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
+#endif
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
+ SPLIT_NS(cfs_rq->exec_clock));
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rb_first_cached(&cfs_rq->tasks_timeline))
+ MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ last = __pick_last_entity(cfs_rq);
+ if (last)
+ max_vruntime = last->vruntime;
+ min_vruntime = cfs_rq->min_vruntime;
+ rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+ SPLIT_NS(MIN_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
+ SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
+ SPLIT_NS(max_vruntime));
+ spread = max_vruntime - MIN_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
+ SPLIT_NS(spread));
+ spread0 = min_vruntime - rq0_min_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
+ SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
+ cfs_rq->nr_spread_over);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
+ cfs_rq->avg.runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
+ cfs_rq->avg.util_est.enqueued);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
+ cfs_rq->removed.load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
+ cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
+ cfs_rq->removed.runnable_sum);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic_long_read(&cfs_rq->tg->load_avg));
+#endif
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ print_cfs_group_stats(m, cpu, cfs_rq->tg);
+#endif
+}
+
+void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ SEQ_printf(m, "\n");
+ SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
+#else
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:\n", cpu);
+#endif
+
+#define P(x) \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PU(x) \
+ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x))
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
+
+ PU(rt_nr_running);
+#ifdef CONFIG_SMP
+ PU(rt_nr_migratory);
+#endif
+ P(rt_throttled);
+ PN(rt_time);
+ PN(rt_runtime);
+
+#undef PN
+#undef PU
+#undef P
+}
+
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ struct dl_bw *dl_bw;
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "dl_rq[%d]:\n", cpu);
+
+#define PU(x) \
+ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
+
+ PU(dl_nr_running);
+#ifdef CONFIG_SMP
+ PU(dl_nr_migratory);
+ dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+ dl_bw = &dl_rq->dl_bw;
+#endif
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
+
+#undef PU
+}
+
+static void print_cpu(struct seq_file *m, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#ifdef CONFIG_X86
+ {
+ unsigned int freq = cpu_khz ? : 1;
+
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
+ cpu, freq / 1000, (freq % 1000));
+ }
+#else
+ SEQ_printf(m, "cpu#%d\n", cpu);
+#endif
+
+#define P(x) \
+do { \
+ if (sizeof(rq->x) == 4) \
+ SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ else \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
+} while (0)
+
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
+
+ P(nr_running);
+ SEQ_printf(m, " .%-30s: %lu\n", "load",
+ rq->load.weight);
+ P(nr_switches);
+ P(nr_load_updates);
+ P(nr_uninterruptible);
+ PN(next_balance);
+ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
+ PN(clock);
+ PN(clock_task);
+ P(cpu_load[0]);
+ P(cpu_load[1]);
+ P(cpu_load[2]);
+ P(cpu_load[3]);
+ P(cpu_load[4]);
+#undef P
+#undef PN
+
+#ifdef CONFIG_SMP
+#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
+ P64(avg_idle);
+ P64(max_idle_balance_cost);
+#undef P64
+#endif
+
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
+ if (schedstat_enabled()) {
+ P(yld_count);
+ P(sched_count);
+ P(sched_goidle);
+ P(ttwu_count);
+ P(ttwu_local);
+ }
+#undef P
+
+ print_cfs_stats(m, cpu);
+ print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
+
+ print_rq(m, rq, cpu);
+ SEQ_printf(m, "\n");
+}
+
+static const char *sched_tunable_scaling_names[] = {
+ "none",
+ "logaritmic",
+ "linear"
+};
+
+static void sched_debug_header(struct seq_file *m)
+{
+ u64 ktime, sched_clk, cpu_clk;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ktime = ktime_to_ns(ktime_get());
+ sched_clk = sched_clock();
+ cpu_clk = local_clock();
+ local_irq_restore(flags);
+
+ SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+#define P(x) \
+ SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(ktime);
+ PN(sched_clk);
+ PN(cpu_clk);
+ P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ P(sched_clock_stable());
+#endif
+#undef PN
+#undef P
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "sysctl_sched\n");
+
+#define P(x) \
+ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(sysctl_sched_latency);
+ PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_wakeup_granularity);
+ P(sysctl_sched_child_runs_first);
+ P(sysctl_sched_features);
+#undef PN
+#undef P
+
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
+ sysctl_sched_tunable_scaling,
+ sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
+
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
+
+ return 0;
+}
+
+void sysrq_sched_debug_show(void)
+{
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is CPU 0.
+ * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int __init init_sched_debug_procfs(void)
+{
+ if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
+ return -ENOMEM;
+ return 0;
+}
+
+__initcall(init_sched_debug_procfs);
+
+#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+
+#ifdef CONFIG_NUMA_BALANCING
+void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf)
+{
+ SEQ_printf(m, "numa_faults node=%d ", node);
+ SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
+ SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
+}
+#endif
+
+
+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm)
+ P(mm->numa_scan_seq);
+
+ P(numa_pages_migrated);
+ P(numa_preferred_nid);
+ P(total_numa_faults);
+ SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
+ task_node(p), task_numa_group_id(p));
+ show_numa_stats(p, m);
+#endif
+}
+
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
+{
+ unsigned long nr_switches;
+
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
+ get_nr_threads(p));
+ SEQ_printf(m,
+ "---------------------------------------------------------"
+ "----------\n");
+#define __P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
+#define __PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
+
+ PN(se.exec_start);
+ PN(se.vruntime);
+ PN(se.sum_exec_runtime);
+
+ nr_switches = p->nvcsw + p->nivcsw;
+
+ P(se.nr_migrations);
+
+ if (schedstat_enabled()) {
+ u64 avg_atom, avg_per_cpu;
+
+ PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+ PN_SCHEDSTAT(se.statistics.wait_start);
+ PN_SCHEDSTAT(se.statistics.sleep_start);
+ PN_SCHEDSTAT(se.statistics.block_start);
+ PN_SCHEDSTAT(se.statistics.sleep_max);
+ PN_SCHEDSTAT(se.statistics.block_max);
+ PN_SCHEDSTAT(se.statistics.exec_max);
+ PN_SCHEDSTAT(se.statistics.slice_max);
+ PN_SCHEDSTAT(se.statistics.wait_max);
+ PN_SCHEDSTAT(se.statistics.wait_sum);
+ P_SCHEDSTAT(se.statistics.wait_count);
+ PN_SCHEDSTAT(se.statistics.iowait_sum);
+ P_SCHEDSTAT(se.statistics.iowait_count);
+ P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+ P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+ P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+ P_SCHEDSTAT(se.statistics.nr_wakeups);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+ P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
+
+ avg_atom = p->se.sum_exec_runtime;
+ if (nr_switches)
+ avg_atom = div64_ul(avg_atom, nr_switches);
+ else
+ avg_atom = -1LL;
+
+ avg_per_cpu = p->se.sum_exec_runtime;
+ if (p->se.nr_migrations) {
+ avg_per_cpu = div64_u64(avg_per_cpu,
+ p->se.nr_migrations);
+ } else {
+ avg_per_cpu = -1LL;
+ }
+
+ __PN(avg_atom);
+ __PN(avg_per_cpu);
+ }
+
+ __P(nr_switches);
+ SEQ_printf(m, "%-45s:%21Ld\n",
+ "nr_voluntary_switches", (long long)p->nvcsw);
+ SEQ_printf(m, "%-45s:%21Ld\n",
+ "nr_involuntary_switches", (long long)p->nivcsw);
+
+ P(se.load.weight);
+ P(se.runnable_weight);
+#ifdef CONFIG_SMP
+ P(se.avg.load_sum);
+ P(se.avg.runnable_load_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.runnable_load_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
+ P(se.avg.util_est.ewma);
+ P(se.avg.util_est.enqueued);
+#endif
+ P(policy);
+ P(prio);
+ if (p->policy == SCHED_DEADLINE) {
+ P(dl.runtime);
+ P(dl.deadline);
+ }
+#undef PN_SCHEDSTAT
+#undef PN
+#undef __PN
+#undef P_SCHEDSTAT
+#undef P
+#undef __P
+
+ {
+ unsigned int this_cpu = raw_smp_processor_id();
+ u64 t0, t1;
+
+ t0 = cpu_clock(this_cpu);
+ t1 = cpu_clock(this_cpu);
+ SEQ_printf(m, "%-45s:%21Ld\n",
+ "clock-delta", (long long)(t1-t0));
+ }
+
+ sched_show_numa(p, m);
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{
+#ifdef CONFIG_SCHEDSTATS
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+#endif
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
new file mode 100644
index 000000000..84e7efda9
--- /dev/null
+++ b/kernel/sched/fair.c
@@ -0,0 +1,10365 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ */
+#include "sched.h"
+
+#include <trace/events/sched.h>
+
+/*
+ * Targeted preemption latency for CPU-bound tasks:
+ *
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length
+ * and have no persistent notion like in traditional, time-slice
+ * based scheduling concepts.
+ *
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ *
+ * Options are:
+ *
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ *
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+
+/*
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
+ */
+static unsigned int sched_nr_latency = 8;
+
+/*
+ * After fork, child runs first. If set to 0 (default) then
+ * parent will (try to) run first.
+ */
+unsigned int sysctl_sched_child_runs_first __read_mostly;
+
+/*
+ * SCHED_OTHER wake-up granularity.
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
+#ifdef CONFIG_SMP
+/*
+ * For asym packing, by default the lower numbered CPU has higher priority.
+ */
+int __weak arch_asym_cpu_priority(int cpu)
+{
+ return -cpu;
+}
+#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+unsigned int capacity_margin = 1280;
+
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static unsigned int get_update_sysctl_factor(void)
+{
+ unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
+ unsigned int factor;
+
+ switch (sysctl_sched_tunable_scaling) {
+ case SCHED_TUNABLESCALING_NONE:
+ factor = 1;
+ break;
+ case SCHED_TUNABLESCALING_LINEAR:
+ factor = cpus;
+ break;
+ case SCHED_TUNABLESCALING_LOG:
+ default:
+ factor = 1 + ilog2(cpus);
+ break;
+ }
+
+ return factor;
+}
+
+static void update_sysctl(void)
+{
+ unsigned int factor = get_update_sysctl_factor();
+
+#define SET_SYSCTL(name) \
+ (sysctl_##name = (factor) * normalized_sysctl_##name)
+ SET_SYSCTL(sched_min_granularity);
+ SET_SYSCTL(sched_latency);
+ SET_SYSCTL(sched_wakeup_granularity);
+#undef SET_SYSCTL
+}
+
+void sched_init_granularity(void)
+{
+ update_sysctl();
+}
+
+#define WMULT_CONST (~0U)
+#define WMULT_SHIFT 32
+
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
+
+/*
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
+ */
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
+
+ __update_inv_weight(lw);
+
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+ }
+
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
+
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+
+ return mul_u64_u32_shr(delta_exec, fact, shift);
+}
+
+
+const struct sched_class fair_sched_class;
+
+/**************************************************************
+ * CFS operations on generic schedulable entities:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ SCHED_WARN_ON(!entity_is_task(se));
+ return container_of(se, struct task_struct, se);
+}
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+ for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;
+
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
+
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the top of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the begin of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new begin
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ /*
+ * With cfs_rq being unthrottled/throttled during an enqueue,
+ * it can happen the tmp_alone_branch points the a leaf that
+ * we finally want to del. In this case, tmp_alone_branch moves
+ * to the prev element but it will point to rq->leaf_cfs_rq_list
+ * at the end of the enqueue.
+ */
+ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+ rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ cfs_rq->on_list = 0;
+ }
+}
+
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline struct cfs_rq *
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ if (se->cfs_rq == pse->cfs_rq)
+ return se->cfs_rq;
+
+ return NULL;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return se->parent;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+ int se_depth, pse_depth;
+
+ /*
+ * preemption test can be made between sibling entities who are in the
+ * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+ * both tasks until we find their ancestors who are siblings of common
+ * parent.
+ */
+
+ /* First walk up until both entities are at same depth */
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;
+
+ while (se_depth > pse_depth) {
+ se_depth--;
+ *se = parent_entity(*se);
+ }
+
+ while (pse_depth > se_depth) {
+ pse_depth--;
+ *pse = parent_entity(*pse);
+ }
+
+ while (!is_same_group(*se, *pse)) {
+ *se = parent_entity(*se);
+ *pse = parent_entity(*pse);
+ }
+}
+
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+
+
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ return true;
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return NULL;
+}
+
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ */
+
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - max_vruntime);
+ if (delta > 0)
+ max_vruntime = vruntime;
+
+ return max_vruntime;
+}
+
+static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta < 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
+
+static inline int entity_before(struct sched_entity *a,
+ struct sched_entity *b)
+{
+ return (s64)(a->vruntime - b->vruntime) < 0;
+}
+
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
+
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (curr) {
+ if (curr->on_rq)
+ vruntime = curr->vruntime;
+ else
+ curr = NULL;
+ }
+
+ if (leftmost) { /* non-empty tree */
+ struct sched_entity *se;
+ se = rb_entry(leftmost, struct sched_entity, run_node);
+
+ if (!curr)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+}
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_entity *entry;
+ bool leftmost = true;
+
+ /*
+ * Find the right place in the rbtree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_entity, run_node);
+ /*
+ * We dont care about collisions. Nodes with
+ * the same key stay together.
+ */
+ if (entity_before(se, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&se->run_node, parent, link);
+ rb_insert_color_cached(&se->run_node,
+ &cfs_rq->tasks_timeline, leftmost);
+}
+
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
+}
+
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+ struct rb_node *next = rb_next(&se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
+
+ if (!last)
+ return NULL;
+
+ return rb_entry(last, struct sched_entity, run_node);
+}
+
+/**************************************************************
+ * Scheduling class statistics methods:
+ */
+
+int sched_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ unsigned int factor = get_update_sysctl_factor();
+
+ if (ret || !write)
+ return ret;
+
+ sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+ sysctl_sched_min_granularity);
+
+#define WRT_SYSCTL(name) \
+ (normalized_sysctl_##name = sysctl_##name / (factor))
+ WRT_SYSCTL(sched_min_granularity);
+ WRT_SYSCTL(sched_latency);
+ WRT_SYSCTL(sched_wakeup_granularity);
+#undef WRT_SYSCTL
+
+ return 0;
+}
+#endif
+
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
+
+/*
+ * The idea is to set a period in which each task runs once.
+ *
+ * When there are too many tasks (sched_nr_latency) we have to stretch
+ * this period because otherwise the slices get too small.
+ *
+ * p = (nr <= nl) ? l : l*nr/nl
+ */
+static u64 __sched_period(unsigned long nr_running)
+{
+ if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+ else
+ return sysctl_sched_latency;
+}
+
+/*
+ * We calculate the wall-time slice from the period by taking a part
+ * proportional to the weight.
+ *
+ * s = p*P[w/rw]
+ */
+static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+
+ for_each_sched_entity(se) {
+ struct load_weight *load;
+ struct load_weight lw;
+
+ cfs_rq = cfs_rq_of(se);
+ load = &cfs_rq->load;
+
+ if (unlikely(!se->on_rq)) {
+ lw = cfs_rq->load;
+
+ update_load_add(&lw, se->load.weight);
+ load = &lw;
+ }
+ slice = __calc_delta(slice, se->load.weight, load);
+ }
+ return slice;
+}
+
+/*
+ * We calculate the vruntime slice of a to-be-inserted task.
+ *
+ * vs = s/w
+ */
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return calc_delta_fair(sched_slice(cfs_rq, se), se);
+}
+
+#ifdef CONFIG_SMP
+#include "pelt.h"
+#include "sched-pelt.h"
+
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
+static unsigned long task_h_load(struct task_struct *p);
+
+/* Give new sched_entity start runnable values to heavy its load in infant time */
+void init_entity_runnable_average(struct sched_entity *se)
+{
+ struct sched_avg *sa = &se->avg;
+
+ memset(sa, 0, sizeof(*sa));
+
+ /*
+ * Tasks are intialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are intialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+ se->runnable_weight = se->load.weight;
+
+ /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task and cpu_scale the CPU capacity.
+ *
+ * For example, for a CPU with 1024 of capacity, a simplest series from
+ * the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq);
+ attach_entity_load_avg(cfs_rq, se, 0);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ return;
+ }
+ }
+
+ attach_entity_cfs_rq(se);
+}
+
+#else /* !CONFIG_SMP */
+void init_entity_runnable_average(struct sched_entity *se)
+{
+}
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 now = rq_clock_task(rq_of(cfs_rq));
+ u64 delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ curr->exec_start = now;
+
+ schedstat_set(curr->statistics.exec_max,
+ max(delta_exec, curr->statistics.exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
+
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+
+ if (entity_is_task(curr)) {
+ struct task_struct *curtask = task_of(curr);
+
+ trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
+ cgroup_account_cputime(curtask, delta_exec);
+ account_group_exec_runtime(curtask, delta_exec);
+ }
+
+ account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 wait_start, prev_wait_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ wait_start = rq_clock(rq_of(cfs_rq));
+ prev_wait_start = schedstat_val(se->statistics.wait_start);
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
+
+ __schedstat_set(se->statistics.wait_start, wait_start);
+}
+
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta;
+
+ if (!schedstat_enabled())
+ return;
+
+ delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ __schedstat_set(se->statistics.wait_start, delta);
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ __schedstat_set(se->statistics.wait_max,
+ max(schedstat_val(se->statistics.wait_max), delta));
+ __schedstat_inc(se->statistics.wait_count);
+ __schedstat_add(se->statistics.wait_sum, delta);
+ __schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *tsk = NULL;
+ u64 sleep_start, block_start;
+
+ if (!schedstat_enabled())
+ return;
+
+ sleep_start = schedstat_val(se->statistics.sleep_start);
+ block_start = schedstat_val(se->statistics.block_start);
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
+ __schedstat_set(se->statistics.sleep_max, delta);
+
+ __schedstat_set(se->statistics.sleep_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
+ }
+ if (block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(se->statistics.block_max)))
+ __schedstat_set(se->statistics.block_max, delta);
+
+ __schedstat_set(se->statistics.block_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
+
+ if (tsk) {
+ if (tsk->in_iowait) {
+ __schedstat_add(se->statistics.iowait_sum, delta);
+ __schedstat_inc(se->statistics.iowait_count);
+ trace_sched_stat_iowait(tsk, delta);
+ }
+
+ trace_sched_stat_blocked(tsk, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+ }
+}
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ /*
+ * Are we enqueueing a waiting task? (for current tasks
+ * a dequeue/enqueue event is a NOP)
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_start(cfs_rq, se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper(cfs_rq, se);
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+
+ if (!schedstat_enabled())
+ return;
+
+ /*
+ * Mark the end of the wait period if dequeueing a
+ * waiting task:
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_end(cfs_rq, se);
+
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ __schedstat_set(se->statistics.sleep_start,
+ rq_clock(rq_of(cfs_rq)));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(se->statistics.block_start,
+ rq_clock(rq_of(cfs_rq)));
+ }
+}
+
+/*
+ * We are picking a new current task - update its stats:
+ */
+static inline void
+update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * We are starting a new run period:
+ */
+ se->exec_start = rq_clock_task(rq_of(cfs_rq));
+}
+
+/**************************************************
+ * Scheduling class queueing methods:
+ */
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_check(p->numa_group, p == current ||
+ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_protected(p->numa_group, p == current);
+}
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+ unsigned long rss = 0;
+ unsigned long nr_scan_pages;
+
+ /*
+ * Calculations based on RSS as non-present and empty pages are skipped
+ * by the PTE scanner and NUMA hinting faults should be trapped based
+ * on resident pages
+ */
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ rss = get_mm_rss(p->mm);
+ if (!rss)
+ rss = nr_scan_pages;
+
+ rss = round_up(rss, nr_scan_pages);
+ return rss / nr_scan_pages;
+}
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan, floor;
+ unsigned int windows = 1;
+
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
+ floor = 1000 / windows;
+
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+ return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+ struct numa_group *ng;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng) {
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+ rcu_read_unlock();
+
+ return max(smin, period);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
+ struct numa_group *ng;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
+ return max(smin, smax);
+}
+
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = -1;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+ struct numa_group *ng;
+ pid_t gid = 0;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ gid = ng->gid;
+ rcu_read_unlock();
+
+ return gid;
+}
+
+/*
+ * The averaged statistics, shared & private, memory & CPU,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
+{
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults)
+ return 0;
+
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+ struct numa_group *ng = deref_task_numa_group(p);
+
+ if (!ng)
+ return 0;
+
+ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+ return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist >= maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
+{
+ unsigned long faults, total_faults;
+
+ if (!p->numa_faults)
+ return 0;
+
+ total_faults = p->total_numa_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
+{
+ struct numa_group *ng = deref_task_numa_group(p);
+ unsigned long faults, total_faults;
+
+ if (!ng)
+ return 0;
+
+ total_faults = ng->total_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = deref_curr_numa_group(p);
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+ /*
+ * Allow first faults or private faults to migrate immediately early in
+ * the lifetime of a task. The magic number 4 is based on waiting for
+ * two full passes of the "multi-stage node selection" test that is
+ * executed below.
+ */
+ if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+ return true;
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and getting the
+ * same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely we
+ * act on an unlikely task<->page relation.
+ */
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Destination node is much more heavily used than the source
+ * node? Allow migration.
+ */
+ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+ ACTIVE_NODE_FRACTION)
+ return true;
+
+ /*
+ * Distribute memory according to CPU & memory use on each node,
+ * with 3/4 hysteresis to avoid unnecessary memory migrations:
+ *
+ * faults_cpu(dst) 3 faults_cpu(src)
+ * --------------- * - > ---------------
+ * faults_mem(dst) 4 faults_mem(src)
+ */
+ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
+}
+
+static unsigned long weighted_cpuload(struct rq *rq);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long capacity_of(int cpu);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+ unsigned long load;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long compute_capacity;
+
+ unsigned int nr_running;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int smt, cpu, cpus = 0;
+ unsigned long capacity;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(rq);
+ ns->compute_capacity += capacity_of(cpu);
+
+ cpus++;
+ }
+
+ /*
+ * If we raced with hotplug and there are no CPUs left in our mask
+ * the @ns structure is NULL'ed and task_numa_compare() will
+ * not find this node attractive.
+ *
+ * We'll detect a huge imbalance and bail there.
+ */
+ if (!cpus)
+ return;
+
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+ capacity = cpus / smt; /* cores */
+
+ capacity = min_t(unsigned, capacity,
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
+}
+
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+
+ struct numa_stats src_stats, dst_stats;
+
+ int imbalance_pct;
+ int dist;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
+{
+ struct rq *rq = cpu_rq(env->dst_cpu);
+
+ /* Bail out if run-queue part of active NUMA balance. */
+ if (xchg(&rq->numa_migrate_on, 1))
+ return;
+
+ /*
+ * Clear previous best_cpu/rq numa-migrate flag, since task now
+ * found a better CPU to move/swap.
+ */
+ if (env->best_cpu != -1) {
+ rq = cpu_rq(env->best_cpu);
+ WRITE_ONCE(rq->numa_migrate_on, 0);
+ }
+
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
+
+static bool load_too_imbalanced(long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long imb, old_imb;
+ long orig_src_load, orig_dst_load;
+ long src_capacity, dst_capacity;
+
+ /*
+ * The load is corrected for the CPU capacity available on each node.
+ *
+ * src_load dst_load
+ * ------------ vs ---------
+ * src_capacity dst_capacity
+ */
+ src_capacity = env->src_stats.compute_capacity;
+ dst_capacity = env->dst_stats.compute_capacity;
+
+ imb = abs(dst_load * src_capacity - src_load * dst_capacity);
+
+ orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
+
+ old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
+}
+
+/*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP 30
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp, bool maymove)
+{
+ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ long imp = p_ng ? groupimp : taskimp;
+ struct task_struct *cur;
+ long src_load, dst_load;
+ int dist = env->dist;
+ long moveimp = imp;
+ long load;
+
+ if (READ_ONCE(dst_rq->numa_migrate_on))
+ return;
+
+ rcu_read_lock();
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+ cur = NULL;
+
+ /*
+ * Because we have preemption enabled we can get migrated around and
+ * end try selecting ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
+
+ if (!cur) {
+ if (maymove && moveimp >= env->best_imp)
+ goto assign;
+ else
+ goto unlock;
+ }
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more remote accesses that would be expected to
+ * be incurred if the tasks were swapped.
+ */
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ goto unlock;
+
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ cur_ng = rcu_dereference(cur->numa_group);
+ if (cur_ng == p_ng) {
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur_ng)
+ imp -= imp / 16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by itself
+ * (not part of a group), use the task weight instead.
+ */
+ if (cur_ng && p_ng)
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
+ else
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
+ }
+
+ if (maymove && moveimp > imp && moveimp > env->best_imp) {
+ imp = moveimp;
+ cur = NULL;
+ goto assign;
+ }
+
+ /*
+ * If the NUMA importance is less than SMALLIMP,
+ * task migration might only result in ping pong
+ * of tasks and also hurt performance due to cache
+ * misses.
+ */
+ if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+ goto unlock;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+ load = task_h_load(env->p) - task_h_load(cur);
+ if (!load)
+ goto assign;
+
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ if (load_too_imbalanced(src_load, dst_load, env))
+ goto unlock;
+
+assign:
+ /*
+ * One idle CPU per node is evaluated for a task numa move.
+ * Call select_idle_sibling to maybe find a better one.
+ */
+ if (!cur) {
+ /*
+ * select_idle_siblings() uses an per-CPU cpumask that
+ * can be used from IRQ context.
+ */
+ local_irq_disable();
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
+ local_irq_enable();
+ }
+
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ long src_load, dst_load, load;
+ bool maymove = false;
+ int cpu;
+
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ /*
+ * If the improvement from just moving env->p direction is better
+ * than swapping tasks around, check if a move is possible.
+ */
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp, maymove);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1,
+ };
+ unsigned long taskweight, groupweight;
+ struct sched_domain *sd;
+ long taskimp, groupimp;
+ struct numa_group *ng;
+ struct rq *best_rq;
+ int nid, ret, dist;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- counter the numa conditions we're trying
+ * to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
+ /*
+ * Cpusets can break the scheduler domain tree into smaller
+ * balance domains, some of which do not cross NUMA boundaries.
+ * Tasks that are "trapped" in such domains cannot be migrated
+ * elsewhere, so there is no point in (re)trying.
+ */
+ if (unlikely(!sd)) {
+ sched_setnuma(p, task_node(p));
+ return -EINVAL;
+ }
+
+ env.dst_nid = p->numa_preferred_nid;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* Try to find a spot on the preferred nid. */
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ ng = deref_curr_numa_group(p);
+ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
+ /* Only consider nodes where both task and groups benefit */
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dist = dist;
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (ng) {
+ if (env.best_cpu == -1)
+ nid = env.src_nid;
+ else
+ nid = cpu_to_node(env.best_cpu);
+
+ if (nid != p->numa_preferred_nid)
+ sched_setnuma(p, nid);
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ best_rq = cpu_rq(env.best_cpu);
+ if (env.best_task == NULL) {
+ ret = migrate_task_to(p, env.best_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
+
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ unsigned long interval = HZ;
+
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ return;
+
+ /* Periodically retry migrating the task to the preferred node */
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
+
+ /* Success if task is already running on preferred CPU */
+ if (task_node(p) == p->numa_preferred_nid)
+ return;
+
+ /* Otherwise, try migrate to a CPU on the preferred node */
+ task_numa_migrate(p);
+}
+
+/*
+ * Find out how many nodes on the workload is actively running on. Do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ */
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid, active_nodes = 0;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults * ACTIVE_NODE_FRACTION > max_faults)
+ active_nodes++;
+ }
+
+ numa_group->max_faults_cpu = max_faults;
+ numa_group->active_nodes = active_nodes;
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 7
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int lr_ratio, ps_ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Related to that, if there were failed
+ * migration then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, scan slower
+ */
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
+
+ return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ /*
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
+ */
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
+ } else {
+ delta = p->se.avg.load_sum;
+ *period = LOAD_AVG_MAX;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group = NODE_MASK_NONE;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ if (!max_faults)
+ break;
+ nodes = max_group;
+ }
+ return nid;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq, nid, max_nid = -1;
+ unsigned long max_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
+ spinlock_t *group_lock = NULL;
+ struct numa_group *ng;
+
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+
+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
+ /* If the task is part of a group prevent parallel updates to group stats */
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ group_lock = &ng->lock;
+ spin_lock_irq(group_lock);
+ }
+
+ /* Find the node with the highest number of faults */
+ for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
+ unsigned long faults = 0, group_faults = 0;
+ int priv;
+
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;
+
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
+
+ /* Decay existing window, copy faults since last scan */
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
+
+ /*
+ * Normalize the faults_from, so all tasks in a group
+ * count according to CPU use, instead of by the raw
+ * number of faults. Tasks with little runtime have
+ * little over-all impact on throughput, and thus their
+ * faults are less important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
+
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
+ p->total_numa_faults += diff;
+ if (ng) {
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ ng->faults[mem_idx] += diff;
+ ng->faults_cpu[mem_idx] += f_diff;
+ ng->total_faults += diff;
+ group_faults += ng->faults[mem_idx];
+ }
+ }
+
+ if (!ng) {
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+ } else if (group_faults > max_faults) {
+ max_faults = group_faults;
+ max_nid = nid;
+ }
+ }
+
+ if (ng) {
+ numa_group_count_active_nodes(ng);
+ spin_unlock_irq(group_lock);
+ max_nid = preferred_group_nid(p, max_nid);
+ }
+
+ if (max_faults) {
+ /* Set the new preferred node */
+ if (max_nid != p->numa_preferred_nid)
+ sched_setnuma(p, max_nid);
+ }
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+ return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+ if (atomic_dec_and_test(&grp->refcount))
+ kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
+{
+ struct numa_group *grp, *my_grp;
+ struct task_struct *tsk;
+ bool join = false;
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+ if (unlikely(!deref_curr_numa_group(p))) {
+ unsigned int size = sizeof(struct numa_group) +
+ 4*nr_node_ids*sizeof(unsigned long);
+
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!grp)
+ return;
+
+ atomic_set(&grp->refcount, 1);
+ grp->active_nodes = 1;
+ grp->max_faults_cpu = 0;
+ spin_lock_init(&grp->lock);
+ grp->gid = p->pid;
+ /* Second half of the array tracks nids where faults happen */
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+ nr_node_ids;
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults[i];
+
+ grp->total_faults = p->total_numa_faults;
+
+ grp->nr_tasks++;
+ rcu_assign_pointer(p->numa_group, grp);
+ }
+
+ rcu_read_lock();
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);
+
+ if (!cpupid_match_pid(tsk, cpupid))
+ goto no_join;
+
+ grp = rcu_dereference(tsk->numa_group);
+ if (!grp)
+ goto no_join;
+
+ my_grp = deref_curr_numa_group(p);
+ if (grp == my_grp)
+ goto no_join;
+
+ /*
+ * Only join the other group if its bigger; if we're the bigger group,
+ * the other task will join us.
+ */
+ if (my_grp->nr_tasks > grp->nr_tasks)
+ goto no_join;
+
+ /*
+ * Tie-break on the grp address.
+ */
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+ goto no_join;
+
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
+
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ goto no_join;
+
+ rcu_read_unlock();
+
+ if (!join)
+ return;
+
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
+ }
+ my_grp->total_faults -= p->total_numa_faults;
+ grp->total_faults += p->total_numa_faults;
+
+ my_grp->nr_tasks--;
+ grp->nr_tasks++;
+
+ spin_unlock(&my_grp->lock);
+ spin_unlock_irq(&grp->lock);
+
+ rcu_assign_pointer(p->numa_group, grp);
+
+ put_numa_group(my_grp);
+ return;
+
+no_join:
+ rcu_read_unlock();
+ return;
+}
+
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
+{
+ /* safe: p either is current or is being freed by current */
+ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
+ unsigned long flags;
+ int i;
+
+ if (!numa_faults)
+ return;
+
+ if (grp) {
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults[i];
+ grp->total_faults -= p->total_numa_faults;
+
+ grp->nr_tasks--;
+ spin_unlock_irqrestore(&grp->lock, flags);
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ put_numa_group(grp);
+ }
+
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
+{
+ struct task_struct *p = current;
+ bool migrated = flags & TNF_MIGRATED;
+ int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
+ struct numa_group *ng;
+ int priv;
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return;
+
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
+
+ /* Allocate buffer to track faults on a per-node basis */
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
+
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
+ return;
+
+ p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+ }
+
+ /*
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
+ */
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+ priv = 1;
+ } else {
+ priv = cpupid_match_pid(p, last_cpupid);
+ if (!priv && !(flags & TNF_NO_GROUP))
+ task_numa_group(p, last_cpupid, flags, &priv);
+ }
+
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ ng = deref_curr_numa_group(p);
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
+ local = 1;
+
+ /*
+ * Retry task to preferred node migration periodically, in case it
+ * case it previously failed, or the scheduler moved us.
+ */
+ if (time_after(jiffies, p->numa_migrate_retry)) {
+ task_numa_placement(p);
+ numa_migrate_preferred(p);
+ }
+
+ if (migrated)
+ p->numa_pages_migrated += pages;
+ if (flags & TNF_MIGRATE_FAIL)
+ p->numa_faults_locality[2] += pages;
+
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
+ p->numa_faults_locality[local] += pages;
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations:
+ */
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ u64 runtime = p->se.sum_exec_runtime;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
+ long pages, virtpages;
+
+ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!mm->numa_next_scan) {
+ mm->numa_next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_start(p);
+ }
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Delay this task enough that another task of this mm will likely win
+ * the next time around.
+ */
+ p->node_stamp += 2 * TICK_NSEC;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ virtpages = pages * 8; /* Scan up to this much virtual space */
+ if (!pages)
+ return;
+
+
+ if (!down_read_trylock(&mm->mmap_sem))
+ return;
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+ is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ continue;
+ }
+
+ /*
+ * Shared library pages mapped by multiple processes are not
+ * migrated as it is expected they are cache replicated. Avoid
+ * hinting faults in read-only file-backed mappings or the vdso
+ * as migrating the pages will be of marginal benefit.
+ */
+ if (!vma->vm_mm ||
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ continue;
+
+ /*
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
+ */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ nr_pte_updates = change_prot_numa(vma, start, end);
+
+ /*
+ * Try to scan sysctl_numa_balancing_size worth of
+ * hpages that have at least one present PTE that
+ * is not already pte-numa. If the VMA contains
+ * areas that are unused or already full of prot_numa
+ * PTEs, scan up to virtpages, to skip through those
+ * areas faster.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
+ virtpages -= (end - start) >> PAGE_SHIFT;
+
+ start = end;
+ if (pages <= 0 || virtpages <= 0)
+ goto out;
+
+ cond_resched();
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few
+ * VMAs are not guaranteed to the vma_migratable. If they are not, we
+ * would find the !migratable VMA on the next scan but not reset the
+ * scanner to the start so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Make sure tasks use at least 32x as much time to run other code
+ * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+ * Usually update_task_scan_period slows down scanning enough; on an
+ * overloaded system we need to limit overhead on a per task basis.
+ */
+ if (unlikely(p->se.sum_exec_runtime != runtime)) {
+ u64 diff = p->se.sum_exec_runtime - runtime;
+ p->node_stamp += 32 * diff;
+ }
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now > curr->node_stamp + period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = task_scan_start(curr);
+ curr->node_stamp += period;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
+
+static void update_scan_period(struct task_struct *p, int new_cpu)
+{
+ int src_nid = cpu_to_node(task_cpu(p));
+ int dst_nid = cpu_to_node(new_cpu);
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return;
+
+ if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
+ return;
+
+ if (src_nid == dst_nid)
+ return;
+
+ /*
+ * Allow resets if faults have been trapped before one scan
+ * has completed. This is most likely due to a new task that
+ * is pulled cross-node due to wakeups or load balancing.
+ */
+ if (p->numa_scan_seq) {
+ /*
+ * Avoid scan adjustments if moving to the preferred
+ * node or if the task was not previously running on
+ * the preferred node.
+ */
+ if (dst_nid == p->numa_preferred_nid ||
+ (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ return;
+ }
+
+ p->numa_scan_period = task_scan_start(p);
+}
+
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void update_scan_period(struct task_struct *p, int new_cpu)
+{
+}
+
+#endif /* CONFIG_NUMA_BALANCING */
+
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
+ cfs_rq->nr_running++;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ list_del_init(&se->group_node);
+ }
+#endif
+ cfs_rq->nr_running--;
+}
+
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_SMP
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight += se->runnable_weight;
+
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->runnable_weight -= se->runnable_weight;
+
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
+ se_runnable(se) * se->avg.runnable_load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight, unsigned long runnable)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ dequeue_runnable_load_avg(cfs_rq, se);
+ }
+ dequeue_load_avg(cfs_rq, se);
+
+ se->runnable_weight = runnable;
+ update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+ do {
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ se->avg.runnable_load_avg =
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+ } while (0);
+#endif
+
+ enqueue_load_avg(cfs_rq, se);
+ if (se->on_rq) {
+ account_entity_enqueue(cfs_rq, se);
+ enqueue_runnable_load_avg(cfs_rq, se);
+ }
+}
+
+void reweight_task(struct task_struct *p, int prio)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct load_weight *load = &se->load;
+ unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+ reweight_entity(cfs_rq, se, weight, weight);
+ load->inv_weight = sched_prio_to_wmult[prio];
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (1)
+ * \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ * grq->load.weight -> grq->avg.load_avg (2)
+ *
+ * which yields the following:
+ *
+ * tg->weight * grq->avg.load_avg
+ * ge->load.weight = ------------------------------ (3)
+ * tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- = tg->weight (4)
+ * grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ * ge->load.weight =
+ *
+ * tg->weight * grq->load.weight
+ * --------------------------------------------------- (5)
+ * tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (6)
+ * tg_load_avg'
+ *
+ * Where:
+ *
+ * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ * max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ * \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
+{
+ long tg_weight, tg_shares, load, shares;
+ struct task_group *tg = cfs_rq->tg;
+
+ tg_shares = READ_ONCE(tg->shares);
+
+ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
+
+ tg_weight = atomic_long_read(&tg->load_avg);
+
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
+
+ shares = (tg_shares * load);
+ if (tg_weight)
+ shares /= tg_weight;
+
+ /*
+ * MIN_SHARES has to be unscaled here to support per-CPU partitioning
+ * of a group with small tg->shares value. It is a floor value which is
+ * assigned as a minimum load.weight to the sched_entity representing
+ * the group on a CPU.
+ *
+ * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
+ * on an 8-core system with 8 tasks each runnable on one CPU shares has
+ * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
+ * case no task is runnable on a CPU MIN_SHARES=2 should be returned
+ * instead of 0.
+ */
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
+}
+
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ * grq->avg.runnable_load_avg
+ * ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ * grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ * ge->runnable_weight = ge->load.weight *
+ *
+ * max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ * ----------------------------------------------------- (8)
+ * max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
+{
+ long runnable, load_avg;
+
+ load_avg = max(cfs_rq->avg.load_avg,
+ scale_load_down(cfs_rq->load.weight));
+
+ runnable = max(cfs_rq->avg.runnable_load_avg,
+ scale_load_down(cfs_rq->runnable_weight));
+
+ runnable *= shares;
+ if (load_avg)
+ runnable /= load_avg;
+
+ return clamp_t(long, runnable, MIN_SHARES, shares);
+}
+#endif /* CONFIG_SMP */
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long shares, runnable;
+
+ if (!gcfs_rq)
+ return;
+
+ if (throttled_hierarchy(gcfs_rq))
+ return;
+
+#ifndef CONFIG_SMP
+ runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
+
+ if (likely(se->load.weight == shares))
+ return;
+#else
+ shares = calc_group_shares(gcfs_rq);
+ runnable = calc_group_runnable(gcfs_rq, shares);
+#endif
+
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
+}
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_group(struct sched_entity *se)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq, flags);
+ }
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share().
+ */
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ }
+}
+
+/*
+ * Called within set_task_rq() right before setting a task's CPU. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, then its up to
+ * date and ready to go to new CPU/cfs_rq. But we have difficulty in
+ * getting what current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task is less decayed, but giving
+ * the wakee more load sounds not bad.
+ */
+ if (!(se->avg.last_update_time && prev))
+ return;
+
+#ifndef CONFIG_64BIT
+ {
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+ }
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ se->avg.last_update_time = n_last_update_time;
+}
+
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ * Per the above update_tg_cfs_util() is trivial and simply copies the running
+ * sum over (but still wrong, because the group entity and group rq do not have
+ * their PELT windows aligned).
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
+ *
+ * And per (1) we have:
+ *
+ * ge->avg.runnable_avg == grq->avg.runnable_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * Another reason this doesn't work is that runnable isn't a 0-sum entity.
+ * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
+ * rq itself is runnable anywhere between 2/3 and 1 depending on how the
+ * runnable section of these tasks overlap (or not). If they were to perfectly
+ * align the rq as a whole would be runnable 2/3 of the time. If however we
+ * always have at least 1 runnable task, the rq as a whole is always runnable.
+ *
+ * So we'll have to approximate.. :/
+ *
+ * Given the constraint:
+ *
+ * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
+ *
+ * We can construct a rule that adds runnable to a rq by assuming minimal
+ * overlap.
+ *
+ * On removal, we'll assume each task is equally runnable; which yields:
+ *
+ * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
+ *
+ * XXX: only do this for the part of runnable > running ?
+ *
+ */
+
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /*
+ * The relation between sum and avg is:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * however, the PELT windows are not aligned between grq and gse.
+ */
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+static inline void
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ unsigned long runnable_load_avg, load_avg;
+ u64 runnable_load_sum, load_sum = 0;
+ s64 delta_sum;
+
+ if (!runnable_sum)
+ return;
+
+ gcfs_rq->prop_runnable_sum = 0;
+
+ if (runnable_sum >= 0) {
+ /*
+ * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
+ * the CPU is saturated running == runnable.
+ */
+ runnable_sum += se->avg.load_sum;
+ runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ } else {
+ /*
+ * Estimate the new unweighted runnable_sum of the gcfs_rq by
+ * assuming all tasks are equally runnable.
+ */
+ if (scale_load_down(gcfs_rq->load.weight)) {
+ load_sum = div_s64(gcfs_rq->avg.load_sum,
+ scale_load_down(gcfs_rq->load.weight));
+ }
+
+ /* But make sure to not inflate se's runnable */
+ runnable_sum = min(se->avg.load_sum, load_sum);
+ }
+
+ /*
+ * runnable_sum can't be lower than running_sum
+ * As running sum is scale with CPU capacity wehreas the runnable sum
+ * is not we rescale running_sum 1st
+ */
+ running_sum = se->avg.util_sum /
+ arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ runnable_sum = max(runnable_sum, running_sum);
+
+ load_sum = (s64)se_weight(se) * runnable_sum;
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
+
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
+ delta_avg = load_avg - se->avg.load_avg;
+
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+
+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+ delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
+
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
+
+ if (se->on_rq) {
+ add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
+ }
+}
+
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
+{
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq, *gcfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
+ return 0;
+
+ gcfs_rq->propagate = 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
+
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+
+ return 1;
+}
+
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If sched_entity still have not zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
+
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate)
+ return false;
+
+ /*
+ * Otherwise, the load and the utilization of the sched_entity is
+ * already zero and there is no pending propagation, so it will be a
+ * waste of time to try to decay it:
+ */
+ return true;
+}
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+{
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
+ struct sched_avg *sa = &cfs_rq->avg;
+ int decayed = 0;
+
+ if (cfs_rq->removed.nr) {
+ unsigned long r;
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+ raw_spin_lock(&cfs_rq->removed.lock);
+ swap(cfs_rq->removed.util_avg, removed_util);
+ swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ cfs_rq->removed.nr = 0;
+ raw_spin_unlock(&cfs_rq->removed.lock);
+
+ r = removed_load;
+ sub_positive(&sa->load_avg, r);
+ sub_positive(&sa->load_sum, r * divider);
+
+ r = removed_util;
+ sub_positive(&sa->util_avg, r);
+ sub_positive(&sa->util_sum, r * divider);
+
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+
+ decayed = 1;
+ }
+
+ decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
+
+ if (decayed)
+ cfs_rq_util_change(cfs_rq, 0);
+
+ return decayed;
+}
+
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ * @flags: migration hints
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+ /*
+ * When we attach the @se to the @cfs_rq, we must align the decay
+ * window because without that, really weird and wonderful things can
+ * happen.
+ *
+ * XXX illustrate
+ */
+ se->avg.last_update_time = cfs_rq->avg.last_update_time;
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+ /*
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+ * period_contrib. This isn't strictly correct, but since we're
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
+ * _sum a little.
+ */
+ se->avg.util_sum = se->avg.util_avg * divider;
+
+ se->avg.load_sum = divider;
+ if (se_weight(se)) {
+ se->avg.load_sum =
+ div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+ }
+
+ se->avg.runnable_load_sum = se->avg.load_sum;
+
+ enqueue_load_avg(cfs_rq, se);
+ cfs_rq->avg.util_avg += se->avg.util_avg;
+ cfs_rq->avg.util_sum += se->avg.util_sum;
+
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
+
+ cfs_rq_util_change(cfs_rq, flags);
+}
+
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ dequeue_load_avg(cfs_rq, se);
+ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
+
+ cfs_rq_util_change(cfs_rq, 0);
+}
+
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+#define DO_ATTACH 0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+ int decayed;
+
+ /*
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
+
+ /*
+ * DO_ATTACH means we're here from enqueue_entity().
+ * !last_update_time means we've passed through
+ * migrate_task_rq_fair() indicating we migrated.
+ *
+ * IOW we're enqueueing a task on a new CPU.
+ */
+ attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
+ update_tg_load_avg(cfs_rq, 0);
+
+ } else if (decayed && (flags & UPDATE_TG))
+ update_tg_load_avg(cfs_rq, 0);
+}
+
+#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ u64 last_update_time_copy;
+ u64 last_update_time;
+
+ do {
+ last_update_time_copy = cfs_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = cfs_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
+}
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.last_update_time;
+}
+#endif
+
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+}
+
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ unsigned long flags;
+
+ /*
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
+ */
+
+ sync_entity_load_avg(se);
+
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+ ++cfs_rq->removed.nr;
+ cfs_rq->removed.util_avg += se->avg.util_avg;
+ cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.runnable_load_avg;
+}
+
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.load_avg;
+}
+
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+ return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+ return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + maring < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+ return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+ long last_ewma_diff;
+ struct util_est ue;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ ue.enqueued = cfs_rq->avg.util_est.enqueued;
+ ue.enqueued -= min_t(unsigned int, ue.enqueued,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+ /*
+ * Skip update of task's estimated utilization when the task has not
+ * yet completed an activation, e.g. being migrated.
+ */
+ if (!task_sleep)
+ return;
+
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ ue = p->se.avg.util_est;
+ if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ return;
+
+ /*
+ * Skip update of task's estimated utilization when its EWMA is
+ * already ~1% close to its last activation value.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ last_ewma_diff = ue.enqueued - ue.ewma;
+ if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ return;
+
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by storing the current PELT value
+ * as ue.enqueued and by using this value to update the Exponential
+ * Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( last_ewma_diff ) + ewma(t-1)
+ * = w * (last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+ */
+ ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ue.ewma += last_ewma_diff;
+ ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
+#else /* CONFIG_SMP */
+
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+#define DO_ATTACH 0x0
+
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
+{
+ cfs_rq_util_change(cfs_rq, 0);
+}
+
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
+
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
+static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
+{
+ return 0;
+}
+
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
+
+#endif /* CONFIG_SMP */
+
+static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ s64 d = se->vruntime - cfs_rq->min_vruntime;
+
+ if (d < 0)
+ d = -d;
+
+ if (d > 3*sysctl_sched_latency)
+ schedstat_inc(cfs_rq->nr_spread_over);
+#endif
+}
+
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ /*
+ * The 'current' period is already promised to the current tasks,
+ * however the extra weight of the new task will slow them down a
+ * little, place the new task so that it fits in the slot that
+ * stays open at the end.
+ */
+ if (initial && sched_feat(START_DEBIT))
+ vruntime += sched_vslice(cfs_rq, se);
+
+ /* sleeps up to a single latency don't count. */
+ if (!initial) {
+ unsigned long thresh = sysctl_sched_latency;
+
+ /*
+ * Halve their sleep time's effect, to allow
+ * for a gentler effect of sleepers:
+ */
+ if (sched_feat(GENTLE_FAIR_SLEEPERS))
+ thresh >>= 1;
+
+ vruntime -= thresh;
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
+}
+
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled()) {
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+ "stat_blocked and stat_runtime require the "
+ "kernel parameter schedstats=enable or "
+ "kernel.sched_schedstats=1\n");
+ }
+#endif
+}
+
+
+/*
+ * MIGRATION
+ *
+ * dequeue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU and an up-to-date min_vruntime on the destination CPU.
+ */
+
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
+ bool curr = cfs_rq->curr == se;
+
+ /*
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
+ */
+ if (renorm && curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
+ update_curr(cfs_rq);
+
+ /*
+ * Otherwise, renormalise after, such that we're placed at the current
+ * moment in time, instead of some random moment in the past. Being
+ * placed in the past could significantly boost this task to the
+ * fairness detriment of existing tasks.
+ */
+ if (renorm && !curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ update_cfs_group(se);
+ enqueue_runnable_load_avg(cfs_rq, se);
+ account_entity_enqueue(cfs_rq, se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ place_entity(cfs_rq, se, 0);
+
+ check_schedstat_required();
+ update_stats_enqueue(cfs_rq, se, flags);
+ check_spread(cfs_rq, se);
+ if (!curr)
+ __enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
+
+ if (cfs_rq->nr_running == 1) {
+ list_add_leaf_cfs_rq(cfs_rq);
+ check_enqueue_throttle(cfs_rq);
+ }
+}
+
+static void __clear_buddies_last(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->last != se)
+ break;
+
+ cfs_rq->last = NULL;
+ }
+}
+
+static void __clear_buddies_next(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->next != se)
+ break;
+
+ cfs_rq->next = NULL;
+ }
+}
+
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->skip != se)
+ break;
+
+ cfs_rq->skip = NULL;
+ }
+}
+
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->last == se)
+ __clear_buddies_last(se);
+
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+
+ if (cfs_rq->skip == se)
+ __clear_buddies_skip(se);
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Substract its load from the cfs_rq->runnable_avg.
+ * - Substract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ dequeue_runnable_load_avg(cfs_rq, se);
+
+ update_stats_dequeue(cfs_rq, se, flags);
+
+ clear_buddies(cfs_rq, se);
+
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ account_entity_dequeue(cfs_rq, se);
+
+ /*
+ * Normalize after update_curr(); which will also have moved
+ * min_vruntime if @se is the one holding it back. But before doing
+ * update_min_vruntime() again, which will discount @se's position and
+ * can move min_vruntime forward still more.
+ */
+ if (!(flags & DEQUEUE_SLEEP))
+ se->vruntime -= cfs_rq->min_vruntime;
+
+ /* return excess runtime on last dequeue */
+ return_cfs_rq_runtime(cfs_rq);
+
+ update_cfs_group(se);
+
+ /*
+ * Now advance min_vruntime if @se was the entity holding it back,
+ * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
+ * put back on, and if we advance min_vruntime, we'll be placed back
+ * further than we started -- ie. we'll be penalized.
+ */
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
+ update_min_vruntime(cfs_rq);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void
+check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ unsigned long ideal_runtime, delta_exec;
+ struct sched_entity *se;
+ s64 delta;
+
+ ideal_runtime = sched_slice(cfs_rq, curr);
+ delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ if (delta_exec > ideal_runtime) {
+ resched_curr(rq_of(cfs_rq));
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies(cfs_rq, curr);
+ return;
+ }
+
+ /*
+ * Ensure that a task that missed wakeup preemption by a
+ * narrow margin doesn't have to wait for a full slice.
+ * This also mitigates buddy induced latencies under load.
+ */
+ if (delta_exec < sysctl_sched_min_granularity)
+ return;
+
+ se = __pick_first_entity(cfs_rq);
+ delta = curr->vruntime - se->vruntime;
+
+ if (delta < 0)
+ return;
+
+ if (delta > ideal_runtime)
+ resched_curr(rq_of(cfs_rq));
+}
+
+static void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /* 'current' is not kept within the tree. */
+ if (se->on_rq) {
+ /*
+ * Any task has to be enqueued before it get to execute on
+ * a CPU. So account for the time it spent waiting on the
+ * runqueue.
+ */
+ update_stats_wait_end(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ }
+
+ update_stats_curr_start(cfs_rq, se);
+ cfs_rq->curr = se;
+
+ /*
+ * Track our maximum slice length, if the CPU's load is at
+ * least twice that of our own weight (i.e. dont track it
+ * when there are only lesser-weight tasks around):
+ */
+ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ schedstat_set(se->statistics.slice_max,
+ max((u64)schedstat_val(se->statistics.slice_max),
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
+ }
+
+ se->prev_sum_exec_runtime = se->sum_exec_runtime;
+}
+
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */
+
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (cfs_rq->skip == se) {
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
+ if (second && wakeup_preempt_entity(second, left) < 1)
+ se = second;
+ }
+
+ /*
+ * Prefer last buddy, try to return the CPU to a preempted task.
+ */
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+ se = cfs_rq->last;
+
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
+
+ clear_buddies(cfs_rq, se);
+
+ return se;
+}
+
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+ /*
+ * If still on the runqueue then deactivate_task()
+ * was not called and update_curr() has to be done:
+ */
+ if (prev->on_rq)
+ update_curr(cfs_rq);
+
+ /* throttle cfs_rqs exceeding runtime */
+ check_cfs_rq_runtime(cfs_rq);
+
+ check_spread(cfs_rq, prev);
+
+ if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
+ /* Put 'current' back into the tree. */
+ __enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_load_avg(cfs_rq, prev, 0);
+ }
+ cfs_rq->curr = NULL;
+}
+
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ /*
+ * Ensure that runnable average is periodically updated.
+ */
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
+ update_cfs_group(curr);
+
+#ifdef CONFIG_SCHED_HRTICK
+ /*
+ * queued ticks are scheduled to match the slice, so don't bother
+ * validating it and just reschedule.
+ */
+ if (queued) {
+ resched_curr(rq_of(cfs_rq));
+ return;
+ }
+ /*
+ * don't let the period tick interfere with the hrtick preemption
+ */
+ if (!sched_feat(DOUBLE_TICK) &&
+ hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+ return;
+#endif
+
+ if (cfs_rq->nr_running > 1)
+ check_preempt_tick(cfs_rq, curr);
+}
+
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef CONFIG_JUMP_LABEL
+static struct static_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return static_key_false(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_inc(void)
+{
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
+}
+#else /* CONFIG_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+ return true;
+}
+
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
+#endif /* CONFIG_JUMP_LABEL */
+
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+ return 100000000ULL;
+}
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+ return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+/*
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+ if (cfs_b->quota != RUNTIME_INF)
+ cfs_b->runtime = cfs_b->quota;
+}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return &tg->cfs_bandwidth;
+}
+
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+ return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ u64 amount = 0, min_amount;
+
+ /* note: this is a positive sum as runtime_remaining <= 0 */
+ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota == RUNTIME_INF)
+ amount = min_amount;
+ else {
+ start_cfs_bandwidth(cfs_b);
+
+ if (cfs_b->runtime > 0) {
+ amount = min(cfs_b->runtime, min_amount);
+ cfs_b->runtime -= amount;
+ cfs_b->idle = 0;
+ }
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ cfs_rq->runtime_remaining += amount;
+
+ return cfs_rq->runtime_remaining > 0;
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+{
+ /* dock delta_exec before expiring quota (as it could span periods) */
+ cfs_rq->runtime_remaining -= delta_exec;
+
+ if (likely(cfs_rq->runtime_remaining > 0))
+ return;
+
+ if (cfs_rq->throttled)
+ return;
+ /*
+ * if we're unable to extend our runtime we resched so that the active
+ * hierarchy can be throttled
+ */
+ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+ resched_curr(rq_of(cfs_rq));
+}
+
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+{
+ if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+ return;
+
+ __account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+ int src_cpu, int dest_cpu)
+{
+ struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+ src_cfs_rq = tg->cfs_rq[src_cpu];
+ dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+ return throttled_hierarchy(src_cfs_rq) ||
+ throttled_hierarchy(dest_cfs_rq);
+}
+
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+ struct rq *rq = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ cfs_rq->throttle_count--;
+ if (!cfs_rq->throttle_count) {
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
+ cfs_rq->throttled_clock_task;
+
+ /* Add cfs_rq with already running entity in the list */
+ if (cfs_rq->nr_running >= 1)
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
+
+ return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+ struct rq *rq = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ /* group is entering throttled state, stop time */
+ if (!cfs_rq->throttle_count) {
+ cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ cfs_rq->throttle_count++;
+
+ return 0;
+}
+
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *se;
+ long task_delta, dequeue = 1;
+ bool empty;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ /* freeze hierarchy runnable averages while throttled */
+ rcu_read_lock();
+ walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+ rcu_read_unlock();
+
+ task_delta = cfs_rq->h_nr_running;
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ /* throttled entity or throttle-on-deactivate */
+ if (!se->on_rq)
+ break;
+
+ if (dequeue)
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ qcfs_rq->h_nr_running -= task_delta;
+
+ if (qcfs_rq->load.weight)
+ dequeue = 0;
+ }
+
+ if (!se)
+ sub_nr_running(rq, task_delta);
+
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ raw_spin_lock(&cfs_b->lock);
+ empty = list_empty(&cfs_b->throttled_cfs_rq);
+
+ /*
+ * Add to the _head_ of the list, so that an already-started
+ * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
+ * not running add to the tail so that later runqueues don't get starved.
+ */
+ if (cfs_b->distribute_running)
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ else
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+
+ /*
+ * If we're the first throttled task, make sure the bandwidth
+ * timer is running.
+ */
+ if (empty)
+ start_cfs_bandwidth(cfs_b);
+
+ raw_spin_unlock(&cfs_b->lock);
+}
+
+void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *se;
+ int enqueue = 1;
+ long task_delta;
+
+ se = cfs_rq->tg->se[cpu_of(rq)];
+
+ cfs_rq->throttled = 0;
+
+ update_rq_clock(rq);
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ list_del_rcu(&cfs_rq->throttled_list);
+ raw_spin_unlock(&cfs_b->lock);
+
+ /* update hierarchical throttle state */
+ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
+ if (!cfs_rq->load.weight)
+ return;
+
+ task_delta = cfs_rq->h_nr_running;
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ enqueue = 0;
+
+ cfs_rq = cfs_rq_of(se);
+ if (enqueue)
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ cfs_rq->h_nr_running += task_delta;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ assert_list_leaf_cfs_rq(rq);
+
+ if (!se)
+ add_nr_running(rq, task_delta);
+
+ /* Determine whether we need to wake up potentially idle CPU: */
+ if (rq->curr == rq->idle && rq->cfs.nr_running)
+ resched_curr(rq);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
+{
+ struct cfs_rq *cfs_rq;
+ u64 runtime;
+ u64 starting_runtime = remaining;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+ throttled_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ if (!cfs_rq_throttled(cfs_rq))
+ goto next;
+
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
+ runtime = -cfs_rq->runtime_remaining + 1;
+ if (runtime > remaining)
+ runtime = remaining;
+ remaining -= runtime;
+
+ cfs_rq->runtime_remaining += runtime;
+
+ /* we check whether we're throttled above */
+ if (cfs_rq->runtime_remaining > 0)
+ unthrottle_cfs_rq(cfs_rq);
+
+next:
+ rq_unlock(rq, &rf);
+
+ if (!remaining)
+ break;
+ }
+ rcu_read_unlock();
+
+ return starting_runtime - remaining;
+}
+
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+ u64 runtime;
+ int throttled;
+
+ /* no need to continue the timer with no bandwidth constraint */
+ if (cfs_b->quota == RUNTIME_INF)
+ goto out_deactivate;
+
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+ cfs_b->nr_periods += overrun;
+
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
+
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
+ if (!throttled) {
+ /* mark as potentially idle for the upcoming period */
+ cfs_b->idle = 1;
+ return 0;
+ }
+
+ /* account preceding periods in which throttling occurred */
+ cfs_b->nr_throttled += overrun;
+
+ /*
+ * This check is repeated as we are holding onto the new bandwidth while
+ * we unthrottle. This can potentially race with an unthrottled group
+ * trying to acquire new bandwidth from the global pool. This can result
+ * in us over-using our runtime if it is all used during this loop, but
+ * only by limited amounts in that extreme case.
+ */
+ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
+ runtime = cfs_b->runtime;
+ cfs_b->distribute_running = 1;
+ raw_spin_unlock(&cfs_b->lock);
+ /* we can't nest cfs_b->lock while distributing bandwidth */
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
+ raw_spin_lock(&cfs_b->lock);
+
+ cfs_b->distribute_running = 0;
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ }
+
+ /*
+ * While we are ensured activity in the period following an
+ * unthrottle, this also covers the case in which the new bandwidth is
+ * insufficient to cover the existing bandwidth deficit. (Forcing the
+ * timer to remain active while there are any throttled entities.)
+ */
+ cfs_b->idle = 0;
+
+ return 0;
+
+out_deactivate:
+ return 1;
+}
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by hrtimer_start. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+ struct hrtimer *refresh_timer = &cfs_b->period_timer;
+ s64 remaining;
+
+ /* if the call-back is running a quota refresh is already occurring */
+ if (hrtimer_callback_running(refresh_timer))
+ return 1;
+
+ /* is a quota refresh about to occur? */
+ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+ if (remaining < (s64)min_expire)
+ return 1;
+
+ return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+ /* if there's a quota refresh soon don't bother with slack */
+ if (runtime_refresh_within(cfs_b, min_left))
+ return;
+
+ hrtimer_start(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period),
+ HRTIMER_MODE_REL);
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+ if (slack_runtime <= 0)
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF) {
+ cfs_b->runtime += slack_runtime;
+
+ /* we are under rq->lock, defer unthrottling using a timer */
+ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+ !list_empty(&cfs_b->throttled_cfs_rq))
+ start_cfs_slack_bandwidth(cfs_b);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ /* even if it's not valid for return we don't want to try again */
+ cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ return;
+
+ __return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+
+ /* confirm we're still not at a refresh boundary */
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->distribute_running) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
+ runtime = cfs_b->runtime;
+
+ if (runtime)
+ cfs_b->distribute_running = 1;
+
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!runtime)
+ return;
+
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->distribute_running = 0;
+ raw_spin_unlock(&cfs_b->lock);
+}
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return;
+
+ /* an active group must be handled by the update_curr()->put() path */
+ if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ return;
+
+ /* ensure the group is not already throttled */
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ /* update runtime allocation */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ if (cfs_rq->runtime_remaining <= 0)
+ throttle_cfs_rq(cfs_rq);
+}
+
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ return false;
+
+ /*
+ * it's possible for a throttled entity to be forced into a running
+ * state (e.g. set_curr_task), in this case we're finished.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ return true;
+
+ throttle_cfs_rq(cfs_rq);
+ return true;
+}
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}
+
+extern const u64 max_cfs_quota_period;
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer);
+ int overrun;
+ int idle = 0;
+ int count = 0;
+
+ raw_spin_lock(&cfs_b->lock);
+ for (;;) {
+ overrun = hrtimer_forward_now(timer, cfs_b->period);
+ if (!overrun)
+ break;
+
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ }
+ if (idle)
+ cfs_b->period_active = 0;
+ raw_spin_unlock(&cfs_b->lock);
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->runtime = 0;
+ cfs_b->quota = RUNTIME_INF;
+ cfs_b->period = ns_to_ktime(default_cfs_period());
+
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ cfs_b->distribute_running = 0;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->runtime_enabled = 0;
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ lockdep_assert_held(&cfs_b->lock);
+
+ if (cfs_b->period_active)
+ return;
+
+ cfs_b->period_active = 1;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ /* init_cfs_bandwidth() was not called */
+ if (!cfs_b->throttled_cfs_rq.next)
+ return;
+
+ hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
+}
+
+/*
+ * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online calback */
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+ raw_spin_unlock(&cfs_b->lock);
+ }
+ rcu_read_unlock();
+}
+
+/* cpu offline callback */
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_held(&rq->lock);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ if (!cfs_rq->runtime_enabled)
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ /*
+ * Offline rq is schedulable till CPU is completely disabled
+ * in take_cpu_down(), so we prevent new cfs throttling here.
+ */
+ cfs_rq->runtime_enabled = 0;
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+ }
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_task(rq_of(cfs_rq));
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+ int src_cpu, int dest_cpu)
+{
+ return 0;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+#endif
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return NULL;
+}
+static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/**************************************************
+ * CFS operations on tasks:
+ */
+
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ SCHED_WARN_ON(task_rq(p) != rq);
+
+ if (rq->cfs.h_nr_running > 1) {
+ u64 slice = sched_slice(cfs_rq, se);
+ u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ s64 delta = slice - ran;
+
+ if (delta < 0) {
+ if (rq->curr == p)
+ resched_curr(rq);
+ return;
+ }
+ hrtick_start(rq, delta);
+ }
+}
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+
+ if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ return;
+
+ if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ hrtick_start_fair(rq, curr);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
+#endif
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's add the task's estimated utilization to the cfs_rq's
+ * estimated utilization, before we update schedutil.
+ */
+ util_est_enqueue(&rq->cfs, p);
+
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ break;
+ cfs_rq = cfs_rq_of(se);
+ enqueue_entity(cfs_rq, se, flags);
+
+ /*
+ * end evaluation on encountering a throttled cfs_rq
+ *
+ * note: in the case of encountering a throttled cfs_rq we will
+ * post the final h_nr_running increment below.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ cfs_rq->h_nr_running++;
+
+ flags = ENQUEUE_WAKEUP;
+ }
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_running++;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
+ }
+
+ if (!se)
+ add_nr_running(rq, 1);
+
+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled; the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, resulting in triggering the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
+ assert_list_leaf_cfs_rq(rq);
+
+ hrtick_update(rq);
+}
+
+static void set_next_buddy(struct sched_entity *se);
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ int task_sleep = flags & DEQUEUE_SLEEP;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dequeue_entity(cfs_rq, se, flags);
+
+ /*
+ * end evaluation on encountering a throttled cfs_rq
+ *
+ * note: in the case of encountering a throttled cfs_rq we will
+ * post the final h_nr_running decrement below.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ cfs_rq->h_nr_running--;
+
+ /* Don't dequeue parent if it has other entities besides us */
+ if (cfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
+ /*
+ * Bias pick_next to pick a task from this cfs_rq, as
+ * p is sleeping when it is within its sched_slice.
+ */
+ if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ set_next_buddy(se);
+ break;
+ }
+ flags |= DEQUEUE_SLEEP;
+ }
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_running--;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_cfs_group(se);
+ }
+
+ if (!se)
+ sub_nr_running(rq, 1);
+
+ util_est_dequeue(&rq->cfs, p, task_sleep);
+ hrtick_update(rq);
+}
+
+#ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload calculated at every tick would be:
+ *
+ * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
+ *
+ * If a CPU misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when CPU may be busy, then we have:
+ *
+ * load_n = (1 - 1/2^i)^n * load_0
+ * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ *
+ * load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
+ *
+ * The calculation is approximated on a 128 point scale.
+ */
+#define DEGRADE_SHIFT 7
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 64, 32, 8, 0, 0, 0, 0, 0 },
+ { 96, 72, 40, 12, 1, 0, 0, 0 },
+ { 112, 98, 75, 43, 15, 1, 0, 0 },
+ { 120, 112, 98, 76, 45, 16, 2, 0 }
+};
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ int has_blocked; /* Idle CPUS has blocked load */
+ unsigned long next_balance; /* in jiffy units */
+ unsigned long next_blocked; /* Next update of blocked load in jiffies */
+} nohz ____cacheline_aligned;
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/**
+ * __cpu_load_update - update the rq->cpu_load[] statistics
+ * @this_rq: The rq to update statistics for
+ * @this_load: The current load
+ * @pending_updates: The number of missed updates
+ *
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC).
+ *
+ * This function computes a decaying average:
+ *
+ * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
+ *
+ * Because of NOHZ it might not get called on every tick which gives need for
+ * the @pending_updates argument.
+ *
+ * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
+ * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
+ * = A * (A * load[i]_n-2 + B) + B
+ * = A * (A * (A * load[i]_n-3 + B) + B) + B
+ * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
+ * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
+ * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
+ * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
+ *
+ * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
+ * any change in load would have resulted in the tick being turned back on.
+ *
+ * For regular NOHZ, this reduces to:
+ *
+ * load[i]_n = (1 - 1/2^i)^n * load[i]_0
+ *
+ * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
+ * term.
+ */
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ if (tickless_load) {
+ old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+ /*
+ * old_load can never be a negative value because a
+ * decayed tickless_load cannot be greater than the
+ * original tickless_load.
+ */
+ old_load += tickless_load;
+ }
+#endif
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+}
+
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(struct rq *rq)
+{
+ return cfs_rq_runnable_load_avg(&rq->cfs);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load)
+{
+ unsigned long pending_updates;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * In the regular NOHZ case, we were idle, this means load 0.
+ * In the NOHZ_FULL case, we were non-idle, we should consider
+ * its weighted load.
+ */
+ cpu_load_update(this_rq, load, pending_updates);
+ }
+}
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void cpu_load_update_idle(struct rq *this_rq)
+{
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (weighted_cpuload(this_rq))
+ return;
+
+ cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
+}
+
+/*
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely into synchronized cpu_load[*] updates.
+ */
+void cpu_load_update_nohz_start(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * This is all lockless but should be fine. If weighted_cpuload changes
+ * concurrently we'll exit nohz. And cpu_load write can race with
+ * cpu_load_update_idle() but both updater would be writing the same.
+ */
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
+}
+
+/*
+ * Account the tickless load in the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
+ unsigned long curr_jiffies = READ_ONCE(jiffies);
+ struct rq *this_rq = this_rq();
+ unsigned long load;
+ struct rq_flags rf;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ load = weighted_cpuload(this_rq);
+ rq_lock(this_rq, &rf);
+ update_rq_clock(this_rq);
+ cpu_load_update_nohz(this_rq, curr_jiffies, load);
+ rq_unlock(this_rq, &rf);
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ /* See the mess around cpu_load_update_nohz(). */
+ this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
+ cpu_load_update(this_rq, load, 1);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
+void cpu_load_update_active(struct rq *this_rq)
+{
+ unsigned long load = weighted_cpuload(this_rq);
+
+ if (tick_nohz_tick_stopped())
+ cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+ else
+ cpu_load_update_periodic(this_rq, load);
+}
+
+/*
+ * Return a low guess at the load of a migration-source CPU weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(rq);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target CPU weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(rq);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return max(rq->cpu_load[type-1], total);
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
+}
+
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
+ unsigned long load_avg = weighted_cpuload(rq);
+
+ if (nr_running)
+ return load_avg / nr_running;
+
+ return 0;
+}
+
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Only decay a single time; tasks that have less then 1 wakeup per
+ * jiffy will not have built up many flips.
+ */
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
+
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
+ */
+static int wake_wide(struct task_struct *p)
+{
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
+ int factor = this_cpu_read(sd_llc_size);
+
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
+}
+
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ * cache-affine and is (or will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
+{
+ /*
+ * If this_cpu is idle, it implies the wakeup is from interrupt
+ * context. Only allow the move if cache is shared. Otherwise an
+ * interrupt intensive workload could force all tasks onto one
+ * node depending on the IO topology or IRQ affinity settings.
+ *
+ * If the prev_cpu is idle and cache affine then avoid a migration.
+ * There is no guarantee that the cache hot data from an interrupt
+ * is more important than cache hot data on the prev_cpu and from
+ * a cpufreq perspective, it's better to have higher utilisation
+ * on one CPU.
+ */
+ if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+
+ if (sync && cpu_rq(this_cpu)->nr_running == 1)
+ return this_cpu;
+
+ return nr_cpumask_bits;
+}
+
+static int
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ this_eff_load = target_load(this_cpu, sd->wake_idx);
+
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ if (current_load > this_eff_load)
+ return this_cpu;
+
+ this_eff_load -= current_load;
+ }
+
+ task_load = task_h_load(p);
+
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ /*
+ * If sync, adjust the weight of prev_eff_load such that if
+ * prev_eff == this_eff that select_idle_sibling() will consider
+ * stacking the wakee on top of the waker if no other CPU is
+ * idle.
+ */
+ if (sync)
+ prev_eff_load += 1;
+
+ return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ int target = nr_cpumask_bits;
+
+ if (sched_feat(WA_IDLE))
+ target = wake_affine_idle(this_cpu, prev_cpu, sync);
+
+ if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+ target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+ if (target == nr_cpumask_bits)
+ return prev_cpu;
+
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ return target;
+}
+
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
+{
+ return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
+}
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int sd_flag)
+{
+ struct sched_group *idlest = NULL, *group = sd->groups;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
+ unsigned long most_spare = 0, this_spare = 0;
+ int load_idx = sd->forkexec_idx;
+ int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+ unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
+
+ if (sd_flag & SD_BALANCE_WAKE)
+ load_idx = sd->wake_idx;
+
+ do {
+ unsigned long load, avg_load, runnable_load;
+ unsigned long spare_cap, max_spare_cap;
+ int local_group;
+ int i;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_span(group),
+ &p->cpus_allowed))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_span(group));
+
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
+ avg_load = 0;
+ runnable_load = 0;
+ max_spare_cap = 0;
+
+ for_each_cpu(i, sched_group_span(group)) {
+ /* Bias balancing toward CPUs of our domain */
+ if (local_group)
+ load = source_load(i, load_idx);
+ else
+ load = target_load(i, load_idx);
+
+ runnable_load += load;
+
+ avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+ spare_cap = capacity_spare_without(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
+ }
+
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+ runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+ group->sgc->capacity;
+
+ if (local_group) {
+ this_runnable_load = runnable_load;
+ this_avg_load = avg_load;
+ this_spare = max_spare_cap;
+ } else {
+ if (min_runnable_load > (runnable_load + imbalance)) {
+ /*
+ * The runnable load is significantly smaller
+ * so we can pick this new CPU:
+ */
+ min_runnable_load = runnable_load;
+ min_avg_load = avg_load;
+ idlest = group;
+ } else if ((runnable_load < (min_runnable_load + imbalance)) &&
+ (100*min_avg_load > imbalance_scale*avg_load)) {
+ /*
+ * The runnable loads are close so take the
+ * blocked load into account through avg_load:
+ */
+ min_avg_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
+ }
+ } while (group = group->next, group != sd->groups);
+
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet, we must first select a rq to compute the initial
+ * utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
+
+ if (this_spare > task_util(p) / 2 &&
+ imbalance_scale*this_spare > 100*most_spare)
+ return NULL;
+
+ if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
+skip_spare:
+ if (!idlest)
+ return NULL;
+
+ /*
+ * When comparing groups across NUMA domains, it's possible for the
+ * local domain to be very lightly loaded relative to the remote
+ * domains but "imbalance" skews the comparison making remote CPUs
+ * look much more favourable. When considering cross-domain, add
+ * imbalance to the runnable load on the remote node and consider
+ * staying local.
+ */
+ if ((sd->flags & SD_NUMA) &&
+ min_runnable_load + imbalance >= this_runnable_load)
+ return NULL;
+
+ if (min_runnable_load > (this_runnable_load + imbalance))
+ return NULL;
+
+ if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+ (100*this_avg_load < imbalance_scale*min_avg_load))
+ return NULL;
+
+ return idlest;
+}
+
+/*
+ * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ */
+static int
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+ unsigned long load, min_load = ULONG_MAX;
+ unsigned int min_exit_latency = UINT_MAX;
+ u64 latest_idle_timestamp = 0;
+ int least_loaded_cpu = this_cpu;
+ int shallowest_idle_cpu = -1;
+ int i;
+
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_span(group));
+
+ /* Traverse only the allowed CPUs */
+ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+ if (available_idle_cpu(i)) {
+ struct rq *rq = cpu_rq(i);
+ struct cpuidle_state *idle = idle_get_state(rq);
+ if (idle && idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ rq->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If equal or no active idle state, then
+ * the most recently idled CPU might have
+ * a warmer cache.
+ */
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ }
+ } else if (shallowest_idle_cpu == -1) {
+ load = weighted_cpuload(cpu_rq(i));
+ if (load < min_load) {
+ min_load = load;
+ least_loaded_cpu = i;
+ }
+ }
+ }
+
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+}
+
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ /*
+ * We need task's util for capacity_spare_without, sync it up to
+ * prev_cpu's last_update_time.
+ */
+ if (!(sd_flag & SD_BALANCE_FORK))
+ sync_entity_load_avg(&p->se);
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of 'cpu': */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of 'new_cpu': */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ }
+
+ return new_cpu;
+}
+
+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core, true))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!available_idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ int core, cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+
+ for_each_cpu_wrap(core, cpus, target) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!available_idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ return core;
+ }
+
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ set_idle_cores(target, 0);
+
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
+ if (available_idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ struct sched_domain *this_sd;
+ u64 avg_cost, avg_idle;
+ u64 time, cost;
+ s64 delta;
+ int cpu, nr = INT_MAX;
+
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+ avg_idle = this_rq()->avg_idle / 512;
+ avg_cost = this_sd->avg_scan_cost + 1;
+
+ if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
+ return -1;
+
+ if (sched_feat(SIS_PROP)) {
+ u64 span_avg = sd->span_weight * avg_idle;
+ if (span_avg > 4*avg_cost)
+ nr = div_u64(span_avg, avg_cost);
+ else
+ nr = 4;
+ }
+
+ time = local_clock();
+
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
+ if (!--nr)
+ return -1;
+ if (available_idle_cpu(cpu))
+ break;
+ }
+
+ time = local_clock() - time;
+ cost = this_sd->avg_scan_cost;
+ delta = (s64)(time - cost) / 8;
+ this_sd->avg_scan_cost += delta;
+
+ return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
+ */
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
+{
+ struct sched_domain *sd;
+ int i, recent_used_cpu;
+
+ if (available_idle_cpu(target))
+ return target;
+
+ /*
+ * If the previous CPU is cache affine and idle, don't be stupid:
+ */
+ if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+ return prev;
+
+ /* Check a recently used CPU as a potential idle candidate: */
+ recent_used_cpu = p->recent_used_cpu;
+ if (recent_used_cpu != prev &&
+ recent_used_cpu != target &&
+ cpus_share_cache(recent_used_cpu, target) &&
+ available_idle_cpu(recent_used_cpu) &&
+ cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ /*
+ * Replace recent_used_cpu with prev as it is a potential
+ * candidate for the next wake:
+ */
+ p->recent_used_cpu = prev;
+ return recent_used_cpu;
+ }
+
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ if (!sd)
+ return target;
+
+ i = select_idle_core(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_cpu(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ i = select_idle_smt(p, sd, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ return target;
+}
+
+/**
+ * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * @cpu: the CPU to get the utilization of
+ *
+ * The unit of the return value must be the one of capacity so we can compare
+ * the utilization with the capacity of the CPU that is available for CFS task
+ * (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * The estimated utilization of a CPU is defined to be the maximum between its
+ * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
+ * currently RUNNABLE on that CPU.
+ * This allows to properly represent the expected utilization of a CPU which
+ * has just got a big task running since a long sleep period. At the same time
+ * however it preserves the benefits of the "blocked utilization" in
+ * describing the potential for other tasks waking up on the same CPU.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ *
+ * Return: the (estimated) utilization for the specified CPU
+ */
+static inline unsigned long cpu_util(int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
+
+/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
+
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_util(cpu);
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /* Discount task's util from CPU's util */
+ util -= min_t(unsigned int, util, task_util(p));
+
+ /*
+ * Covered cases:
+ *
+ * a) if *p is the only task sleeping on this CPU, then:
+ * cpu_util (== task_util) > util_est (== 0)
+ * and thus we return:
+ * cpu_util_without = (cpu_util - task_util) = 0
+ *
+ * b) if other tasks are SLEEPING on this CPU, which is now exiting
+ * IDLE, then:
+ * cpu_util >= task_util
+ * cpu_util > util_est (== 0)
+ * and thus we discount *p's blocked utilization to return:
+ * cpu_util_without = (cpu_util - task_util) >= 0
+ *
+ * c) if other tasks are RUNNABLE on that CPU and
+ * util_est > cpu_util
+ * then we use util_est since it returns a more restrictive
+ * estimation of the spare capacity on that CPU, by just
+ * considering the expected utilization of tasks already
+ * runnable on that CPU.
+ *
+ * Cases a) and b) are covered by the above code, while case c) is
+ * covered by the following code when estimated utilization is
+ * enabled.
+ */
+ if (sched_feat(UTIL_EST)) {
+ unsigned int estimated =
+ READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * Despite the following checks we still have a small window
+ * for a possible race, when an execl's select_task_rq_fair()
+ * races with LB's detach_task():
+ *
+ * detach_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * ---------------------------------- A
+ * deactivate_task() \
+ * dequeue_task() + RaceTime
+ * util_est_dequeue() /
+ * ---------------------------------- B
+ *
+ * The additional check on "current == p" it's required to
+ * properly fix the execl regression and it helps in further
+ * reducing the chances for the above race.
+ */
+ if (unlikely(task_on_rq_queued(p) || current == p)) {
+ estimated -= min_t(unsigned int, estimated,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ }
+ util = max(util, estimated);
+ }
+
+ /*
+ * Utilization (estimated) can exceed the CPU capacity, thus let's
+ * clamp to the maximum CPU capacity to ensure consistency with
+ * the cpu_util call.
+ */
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
+/*
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
+ *
+ * Balances load by selecting the idlest CPU in the idlest group, or under
+ * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
+ *
+ * Returns the target CPU number.
+ *
+ * preempt must be disabled.
+ */
+static int
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+{
+ struct sched_domain *tmp, *sd = NULL;
+ int cpu = smp_processor_id();
+ int new_cpu = prev_cpu;
+ int want_affine = 0;
+ int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+
+ if (sd_flag & SD_BALANCE_WAKE) {
+ record_wakee(p);
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ && cpumask_test_cpu(cpu, &p->cpus_allowed);
+ }
+
+ rcu_read_lock();
+ for_each_domain(cpu, tmp) {
+ if (!(tmp->flags & SD_LOAD_BALANCE))
+ break;
+
+ /*
+ * If both 'cpu' and 'prev_cpu' are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
+ */
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ if (cpu != prev_cpu)
+ new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+ sd = NULL; /* Prefer wake_affine over balance flags */
+ break;
+ }
+
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ else if (!want_affine)
+ break;
+ }
+
+ if (unlikely(sd)) {
+ /* Slow path */
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ /* Fast path */
+
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+
+ if (want_affine)
+ current->recent_used_cpu = cpu;
+ }
+ rcu_read_unlock();
+
+ return new_cpu;
+}
+
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
+ */
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
+{
+ /*
+ * As blocked tasks retain absolute vruntime the migration needs to
+ * deal with this by subtracting the old and adding the new
+ * min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
+ if (p->state == TASK_WAKING) {
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
+ }
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+ /*
+ * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+ * rq->lock and can modify state directly.
+ */
+ lockdep_assert_held(&task_rq(p)->lock);
+ detach_entity_cfs_rq(&p->se);
+
+ } else {
+ /*
+ * We are supposed to update the task to "current" time, then
+ * its up to date and ready to go to new CPU/cfs_rq. But we
+ * have difficulty in getting what current time is, so simply
+ * throw away the out-of-date time. This will result in the
+ * wakee task is less decayed, but giving the wakee more load
+ * sounds not bad.
+ */
+ remove_entity_load_avg(&p->se);
+ }
+
+ /* Tell new CPU we are migrated */
+ p->se.avg.last_update_time = 0;
+
+ /* We have migrated, no longer consider this task hot */
+ p->se.exec_start = 0;
+
+ update_scan_period(p, new_cpu);
+}
+
+static void task_dead_fair(struct task_struct *p)
+{
+ remove_entity_load_avg(&p->se);
+}
+#endif /* CONFIG_SMP */
+
+static unsigned long wakeup_gran(struct sched_entity *se)
+{
+ unsigned long gran = sysctl_sched_wakeup_granularity;
+
+ /*
+ * Since its curr running now, convert the gran from real-time
+ * to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
+ */
+ return calc_delta_fair(gran, se);
+}
+
+/*
+ * Should 'se' preempt 'curr'.
+ *
+ * |s1
+ * |s2
+ * |s3
+ * g
+ * |<--->|c
+ *
+ * w(c, s1) = -1
+ * w(c, s2) = 0
+ * w(c, s3) = 1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran(se);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
+
+static void set_last_buddy(struct sched_entity *se)
+{
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se) {
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
+ cfs_rq_of(se)->last = se;
+ }
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se) {
+ if (SCHED_WARN_ON(!se->on_rq))
+ return;
+ cfs_rq_of(se)->next = se;
+ }
+}
+
+static void set_skip_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->skip = se;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_entity *se = &curr->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ int scale = cfs_rq->nr_running >= sched_nr_latency;
+ int next_buddy_marked = 0;
+
+ if (unlikely(se == pse))
+ return;
+
+ /*
+ * This is possible from callers such as attach_tasks(), in which we
+ * unconditionally check_prempt_curr() after an enqueue (which may have
+ * lead to a throttle). This both saves work and prevents false
+ * next-buddy nomination below.
+ */
+ if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ return;
+
+ if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ set_next_buddy(pse);
+ next_buddy_marked = 1;
+ }
+
+ /*
+ * We can come here with TIF_NEED_RESCHED already set from new task
+ * wake up path.
+ *
+ * Note: this also catches the edge-case of curr being in a throttled
+ * group (e.g. via set_curr_task), since update_curr() (in the
+ * enqueue of curr) will have resulted in resched being set. This
+ * prevents us from potentially nominating it as a false LAST_BUDDY
+ * below.
+ */
+ if (test_tsk_need_resched(curr))
+ return;
+
+ /* Idle tasks are by definition preempted by non-idle tasks. */
+ if (unlikely(curr->policy == SCHED_IDLE) &&
+ likely(p->policy != SCHED_IDLE))
+ goto preempt;
+
+ /*
+ * Batch and idle tasks do not preempt non-idle tasks (their preemption
+ * is driven by the tick):
+ */
+ if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+ return;
+
+ find_matching_se(&se, &pse);
+ update_curr(cfs_rq_of(se));
+ BUG_ON(!pse);
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ /*
+ * Bias pick_next to pick the sched entity that is
+ * triggering this preemption.
+ */
+ if (!next_buddy_marked)
+ set_next_buddy(pse);
+ goto preempt;
+ }
+
+ return;
+
+preempt:
+ resched_curr(rq);
+ /*
+ * Only set the backward buddy when the current task is still
+ * on the rq. This can happen when a wakeup gets interleaved
+ * with schedule on the ->pre_schedule() or idle_balance()
+ * point, either of which can * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class,
+ * for obvious reasons its a bad idea to schedule back to it.
+ */
+ if (unlikely(!se->on_rq || curr == rq->idle))
+ return;
+
+ if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+ set_last_buddy(se);
+}
+
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+
+again:
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the
+ * throttle and dequeue its entity in the parent(s).
+ * Therefore the nr_running test will indeed
+ * be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ goto simple;
+ }
+ }
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ goto done;
+simple:
+#endif
+
+ put_prev_task(rq, prev);
+
+ do {
+ se = pick_next_entity(cfs_rq, NULL);
+ set_next_entity(cfs_rq, se);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+done: __maybe_unused;
+#ifdef CONFIG_SMP
+ /*
+ * Move the next running task to the front of
+ * the list, so our cfs_tasks list becomes MRU
+ * one.
+ */
+ list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+
+idle:
+ new_tasks = idle_balance(rq, rf);
+
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
+
+ if (new_tasks > 0)
+ goto again;
+
+ return NULL;
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_entity *se = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
+ }
+}
+
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
+ }
+
+ set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+ struct sched_entity *se = &p->se;
+
+ /* throttled hierarchies are not runnable */
+ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ return false;
+
+ /* Tell the scheduler that we'd really like pse to run next. */
+ set_next_buddy(se);
+
+ yield_task_fair(rq);
+
+ return true;
+}
+
+#ifdef CONFIG_SMP
+/**************************************************
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-CPU scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
+ * is derived from the nice value as per sched_prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * C_i is the compute capacity of CPU i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of CPUs that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of CPUs in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of CPUs doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other CPU in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every CPU in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle CPU iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
+
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+enum fbq_type { regular, remote, all };
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02
+#define LBF_DST_PINNED 0x04
+#define LBF_SOME_PINNED 0x08
+#define LBF_NOHZ_STATS 0x10
+#define LBF_NOHZ_AGAIN 0x20
+
+struct lb_env {
+ struct sched_domain *sd;
+
+ struct rq *src_rq;
+ int src_cpu;
+
+ int dst_cpu;
+ struct rq *dst_rq;
+
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
+ enum cpu_idle_type idle;
+ long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
+ unsigned int flags;
+
+ unsigned int loop;
+ unsigned int loop_break;
+ unsigned int loop_max;
+
+ enum fbq_type fbq_type;
+ struct list_head tasks;
+};
+
+/*
+ * Is this task likely cache-hot:
+ */
+static int task_hot(struct task_struct *p, struct lb_env *env)
+{
+ s64 delta;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
+ return 1;
+
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
+
+ delta = rq_clock_task(env->src_rq) - p->se.exec_start;
+
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns 1, if task migration degrades locality
+ * Returns 0, if task migration improves locality i.e migration preferred.
+ * Returns -1, if task migration is not affected by locality.
+ */
+static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ unsigned long src_weight, dst_weight;
+ int src_nid, dst_nid, dist;
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return -1;
+
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ return -1;
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return -1;
+
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid) {
+ if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+ return 1;
+ else
+ return -1;
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return 0;
+
+ /* Leaving a core idle is often worse than degrading locality. */
+ if (env->idle == CPU_IDLE)
+ return -1;
+
+ dist = node_distance(src_nid, dst_nid);
+ if (numa_group) {
+ src_weight = group_weight(p, src_nid, dist);
+ dst_weight = group_weight(p, dst_nid, dist);
+ } else {
+ src_weight = task_weight(p, src_nid, dist);
+ dst_weight = task_weight(p, dst_nid, dist);
+ }
+
+ return dst_weight < src_weight;
+}
+
+#else
+static inline int migrate_degrades_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return -1;
+}
+#endif
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) throttled_lb_pair, or
+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
+ */
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
+ if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+ int cpu;
+
+ schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+
+ env->flags |= LBF_SOME_PINNED;
+
+ /*
+ * Remember if this task can be migrated to any other CPU in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+ * already computed one in current iteration.
+ */
+ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
+ return 0;
+
+ /* Prevent to re-select dst_cpu via env's CPUs: */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
+ }
+
+ return 0;
+ }
+
+ /* Record that we found atleast one task that could run on dst_cpu */
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) destination numa is preferred
+ * 2) task is cache cold, or
+ * 3) too many balance attempts have failed.
+ */
+ tsk_cache_hot = migrate_degrades_locality(p, env);
+ if (tsk_cache_hot == -1)
+ tsk_cache_hot = task_hot(p, env);
+
+ if (tsk_cache_hot <= 0 ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (tsk_cache_hot == 1) {
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->se.statistics.nr_forced_migrations);
+ }
+ return 1;
+ }
+
+ schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
+ return 0;
+}
+
+/*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
+ * part of active balancing operations within "domain".
+ *
+ * Returns a task if successful and NULL otherwise.
+ */
+static struct task_struct *detach_one_task(struct lb_env *env)
+{
+ struct task_struct *p;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ list_for_each_entry_reverse(p,
+ &env->src_rq->cfs_tasks, se.group_node) {
+ if (!can_migrate_task(p, env))
+ continue;
+
+ detach_task(p, env);
+
+ /*
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is detach_tasks)
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
+ */
+ schedstat_inc(env->sd->lb_gained[env->idle]);
+ return p;
+ }
+ return NULL;
+}
+
+static const unsigned int sched_nr_migrate_break = 32;
+
+/*
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
+ *
+ * Returns number of detached tasks if successful and 0 otherwise.
+ */
+static int detach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->src_rq->cfs_tasks;
+ struct task_struct *p;
+ unsigned long load;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ if (env->imbalance <= 0)
+ return 0;
+
+ while (!list_empty(tasks)) {
+ /*
+ * We don't want to steal all, otherwise we may be treated likewise,
+ * which could at worst lead to a livelock crash.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ break;
+
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
+ env->loop++;
+ /* We've more or less seen every task there is, call it quits */
+ if (env->loop > env->loop_max)
+ break;
+
+ /* take a breather every nr_migrate tasks */
+ if (env->loop > env->loop_break) {
+ env->loop_break += sched_nr_migrate_break;
+ env->flags |= LBF_NEED_BREAK;
+ break;
+ }
+
+ if (!can_migrate_task(p, env))
+ goto next;
+
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
+
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
+ goto next;
+
+ if ((load / 2) > env->imbalance)
+ goto next;
+
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
+ env->imbalance -= load;
+
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is detached to minimize
+ * the critical section.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ break;
+#endif
+
+ /*
+ * We only want to steal up to the prescribed amount of
+ * weighted load.
+ */
+ if (env->imbalance <= 0)
+ break;
+
+ continue;
+next:
+ list_move(&p->se.group_node, tasks);
+ }
+
+ /*
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
+ */
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
+
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
+
+ BUG_ON(task_rq(p) != rq);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ attach_task(rq, p);
+ rq_unlock(rq, &rf);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+ struct rq_flags rf;
+
+ rq_lock(env->dst_rq, &rf);
+ update_rq_clock(env->dst_rq);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ rq_unlock(env->dst_rq, &rf);
+}
+
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->avg.load_avg)
+ return true;
+
+ if (cfs_rq->avg.util_avg)
+ return true;
+
+ return false;
+}
+
+static inline bool others_have_blocked(struct rq *rq)
+{
+ if (READ_ONCE(rq->avg_rt.util_avg))
+ return true;
+
+ if (READ_ONCE(rq->avg_dl.util_avg))
+ return true;
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ if (READ_ONCE(rq->avg_irq.util_avg))
+ return true;
+#endif
+
+ return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->avg.runnable_load_sum)
+ return false;
+
+ return true;
+}
+
+static void update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq, *pos;
+ const struct sched_class *curr_class;
+ struct rq_flags rf;
+ bool done = true;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * Iterates the task_group tree in a bottom up fashion, see
+ * list_add_leaf_cfs_rq() for details.
+ */
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+ struct sched_entity *se;
+
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
+ done = false;
+ }
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_irq_load_avg(rq, 0);
+ /* Don't need periodic decay once load/util_avg are null */
+ if (others_have_blocked(rq))
+ done = false;
+
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_blocked_load_update_tick = jiffies;
+ if (done)
+ rq->has_blocked_load = 0;
+#endif
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+/*
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
+ */
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ unsigned long now = jiffies;
+ unsigned long load;
+
+ if (cfs_rq->last_h_load_update == now)
+ return;
+
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ WRITE_ONCE(cfs_rq->h_load_next, se);
+ if (cfs_rq->last_h_load_update == now)
+ break;
+ }
+
+ if (!se) {
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
+ cfs_rq->last_h_load_update = now;
+ }
+
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
+ load = cfs_rq->h_load;
+ load = div64_ul(load * se->avg.load_avg,
+ cfs_rq_load_avg(cfs_rq) + 1);
+ cfs_rq = group_cfs_rq(se);
+ cfs_rq->h_load = load;
+ cfs_rq->last_h_load_update = now;
+ }
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ update_cfs_rq_h_load(cfs_rq);
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+ cfs_rq_load_avg(cfs_rq) + 1);
+}
+#else
+static inline void update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ const struct sched_class *curr_class;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_irq_load_avg(rq, 0);
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_blocked_load_update_tick = jiffies;
+ if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
+ rq->has_blocked_load = 0;
+#endif
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+ return p->se.avg.load_avg;
+}
+#endif
+
+/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+ unsigned long avg_load; /*Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long load_per_task;
+ unsigned long group_capacity;
+ unsigned long group_util; /* Total utilization of the group */
+ unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int idle_cpus;
+ unsigned int group_weight;
+ enum group_type group_type;
+ int group_no_capacity;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
+};
+
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+ /*
+ * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+ * local_stat because update_sg_lb_stats() does a full clear/assignment.
+ * We must however clear busiest_stat::avg_load because
+ * update_sd_pick_busiest() reads this before assignment.
+ */
+ *sds = (struct sd_lb_stats){
+ .busiest = NULL,
+ .local = NULL,
+ .total_running = 0UL,
+ .total_load = 0UL,
+ .total_capacity = 0UL,
+ .busiest_stat = {
+ .avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
+ },
+ };
+}
+
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ *
+ * Return: The load index.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ int load_idx;
+
+ switch (idle) {
+ case CPU_NOT_IDLE:
+ load_idx = sd->busy_idx;
+ break;
+
+ case CPU_NEWLY_IDLE:
+ load_idx = sd->newidle_idx;
+ break;
+ default:
+ load_idx = sd->idle_idx;
+ break;
+ }
+
+ return load_idx;
+}
+
+static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long max = arch_scale_cpu_capacity(sd, cpu);
+ unsigned long used, free;
+ unsigned long irq;
+
+ irq = cpu_util_irq(rq);
+
+ if (unlikely(irq >= max))
+ return 1;
+
+ used = READ_ONCE(rq->avg_rt.util_avg);
+ used += READ_ONCE(rq->avg_dl.util_avg);
+
+ if (unlikely(used >= max))
+ return 1;
+
+ free = max - used;
+
+ return scale_irq_capacity(free, irq, max);
+}
+
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ unsigned long capacity = scale_rt_capacity(sd, cpu);
+ struct sched_group *sdg = sd->groups;
+
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
+
+ if (!capacity)
+ capacity = 1;
+
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
+}
+
+void update_group_capacity(struct sched_domain *sd, int cpu)
+{
+ struct sched_domain *child = sd->child;
+ struct sched_group *group, *sdg = sd->groups;
+ unsigned long capacity, min_capacity;
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+ sdg->sgc->next_update = jiffies + interval;
+
+ if (!child) {
+ update_cpu_capacity(sd, cpu);
+ return;
+ }
+
+ capacity = 0;
+ min_capacity = ULONG_MAX;
+
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_span(sdg)) {
+ struct sched_group_capacity *sgc;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * build_sched_domains() -> init_sched_groups_capacity()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
+ *
+ * This avoids capacity from being 0 and
+ * causing divide-by-zero issues on boot.
+ */
+ if (unlikely(!rq->sd)) {
+ capacity += capacity_of(cpu);
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
+ }
+
+ min_capacity = min(capacity, min_capacity);
+ }
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ min_capacity = min(sgc->min_capacity, min_capacity);
+ group = group->next;
+ } while (group != child->groups);
+ }
+
+ sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = min_capacity;
+}
+
+/*
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
+ */
+static inline int
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
+}
+
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to ->cpus_allowed constraints.
+ *
+ * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
+ * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
+ * Something like:
+ *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the CPUs in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+static inline int sg_imbalanced(struct sched_group *group)
+{
+ return group->sgc->imbalance;
+}
+
+/*
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the * number of task is
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity in meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't make
+ * any benefit for the load balance.
+ */
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
+
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_util * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equals to !group_has_capacity because a group
+ * with the exact right number of tasks, has no more spare capacity but is not
+ * overloaded so both group_has_capacity and group_is_overloaded return
+ * false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
+
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_util * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->min_capacity * capacity_margin <
+ ref->sgc->min_capacity * 1024;
+}
+
+static inline enum
+group_type group_classify(struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_no_capacity)
+ return group_overloaded;
+
+ if (sg_imbalanced(group))
+ return group_imbalanced;
+
+ return group_other;
+}
+
+static bool update_nohz_stats(struct rq *rq, bool force)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+ return true;
+
+ update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+#else
+ return false;
+#endif
+}
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @env: The load balancing environment.
+ * @group: sched_group whose statistics are to be updated.
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @local_group: Does group contain this_cpu.
+ * @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
+ */
+static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sched_group *group, int load_idx,
+ int local_group, struct sg_lb_stats *sgs,
+ bool *overload)
+{
+ unsigned long load;
+ int i, nr_running;
+
+ memset(sgs, 0, sizeof(*sgs));
+
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
+ env->flags |= LBF_NOHZ_AGAIN;
+
+ /* Bias balancing toward CPUs of our domain: */
+ if (local_group)
+ load = target_load(i, load_idx);
+ else
+ load = source_load(i, load_idx);
+
+ sgs->group_load += load;
+ sgs->group_util += cpu_util(i);
+ sgs->sum_nr_running += rq->cfs.h_nr_running;
+
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
+ *overload = true;
+
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
+ sgs->sum_weighted_load += weighted_cpuload(rq);
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
+ sgs->idle_cpus++;
+ }
+
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+
+ if (sgs->sum_nr_running)
+ sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(group, sgs);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @env: The load balancing environment.
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
+ */
+static bool update_sd_pick_busiest(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
+
+ if (sgs->group_type > busiest->group_type)
+ return true;
+
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+
+asym_packing:
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
+ return true;
+
+ /* No ASYM_PACKING if target CPU is already busy */
+ if (env->idle == CPU_NOT_IDLE)
+ return true;
+ /*
+ * ASYM_PACKING needs to move all the work to the highest
+ * prority CPUs in the group, therefore mark all groups
+ * of lower priority than ourself as busy.
+ */
+ if (sgs->sum_nr_running &&
+ sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
+ if (!sds->busiest)
+ return true;
+
+ /* Prefer to move from lowest priority CPU's work */
+ if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+ sg->asym_prefer_cpu))
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @env: The load balancing environment.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ struct sched_domain *child = env->sd->child;
+ struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats *local = &sds->local_stat;
+ struct sg_lb_stats tmp_sgs;
+ int load_idx, prefer_sibling = 0;
+ bool overload = false;
+
+ if (child && child->flags & SD_PREFER_SIBLING)
+ prefer_sibling = 1;
+
+#ifdef CONFIG_NO_HZ_COMMON
+ if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
+ env->flags |= LBF_NOHZ_STATS;
+#endif
+
+ load_idx = get_sd_load_idx(env->sd, env->idle);
+
+ do {
+ struct sg_lb_stats *sgs = &tmp_sgs;
+ int local_group;
+
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
+ if (local_group) {
+ sds->local = sg;
+ sgs = local;
+
+ if (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
+ }
+
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ &overload);
+
+ if (local_group)
+ goto next_group;
+
+ /*
+ * In case the child domain prefers tasks go to siblings
+ * first, lower the sg capacity so that we'll try
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
+ */
+ if (prefer_sibling && sds->local &&
+ group_has_capacity(env, local) &&
+ (sgs->sum_nr_running > local->sum_nr_running + 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_classify(sg, sgs);
+ }
+
+ if (update_sd_pick_busiest(env, sds, sg, sgs)) {
+ sds->busiest = sg;
+ sds->busiest_stat = *sgs;
+ }
+
+next_group:
+ /* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
+ sds->total_load += sgs->group_load;
+ sds->total_capacity += sgs->group_capacity;
+
+ sg = sg->next;
+ } while (sg != env->sd->groups);
+
+#ifdef CONFIG_NO_HZ_COMMON
+ if ((env->flags & LBF_NOHZ_AGAIN) &&
+ cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+ WRITE_ONCE(nohz.next_blocked,
+ jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ }
+#endif
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ if (env->dst_rq->rd->overload != overload)
+ env->dst_rq->rd->overload = overload;
+ }
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched domain.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Return: 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
+ *
+ * @env: The load balancing environment.
+ * @sds: Statistics of the sched_domain which is to be packed
+ */
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ int busiest_cpu;
+
+ if (!(env->sd->flags & SD_ASYM_PACKING))
+ return 0;
+
+ if (env->idle == CPU_NOT_IDLE)
+ return 0;
+
+ if (!sds->busiest)
+ return 0;
+
+ busiest_cpu = sds->busiest->asym_prefer_cpu;
+ if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
+ return 0;
+
+ env->imbalance = DIV_ROUND_CLOSEST(
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
+
+ return 1;
+}
+
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * amongst the groups of a sched_domain, during
+ * load balancing.
+ * @env: The load balancing environment.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ */
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ unsigned long tmp, capa_now = 0, capa_move = 0;
+ unsigned int imbn = 2;
+ unsigned long scaled_busy_load_per_task;
+ struct sg_lb_stats *local, *busiest;
+
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (!local->sum_nr_running)
+ local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+ else if (busiest->load_per_task > local->load_per_task)
+ imbn = 1;
+
+ scaled_busy_load_per_task =
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
+
+ if (busiest->avg_load + scaled_busy_load_per_task >=
+ local->avg_load + (scaled_busy_load_per_task * imbn)) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
+ /*
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU capacity used by
+ * moving them.
+ */
+
+ capa_now += busiest->group_capacity *
+ min(busiest->load_per_task, busiest->avg_load);
+ capa_now += local->group_capacity *
+ min(local->load_per_task, local->avg_load);
+ capa_now /= SCHED_CAPACITY_SCALE;
+
+ /* Amount of load we'd subtract */
+ if (busiest->avg_load > scaled_busy_load_per_task) {
+ capa_move += busiest->group_capacity *
+ min(busiest->load_per_task,
+ busiest->avg_load - scaled_busy_load_per_task);
+ }
+
+ /* Amount of load we'd add */
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
+ } else {
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+ }
+ capa_move += local->group_capacity *
+ min(local->load_per_task, local->avg_load + tmp);
+ capa_move /= SCHED_CAPACITY_SCALE;
+
+ /* Move if we gain throughput */
+ if (capa_move > capa_now)
+ env->imbalance = busiest->load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @env: load balance environment
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ */
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ unsigned long max_pull, load_above_capacity = ~0UL;
+ struct sg_lb_stats *local, *busiest;
+
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (busiest->group_type == group_imbalanced) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure CPU-load equilibrium, look at wider averages. XXX
+ */
+ busiest->load_per_task =
+ min(busiest->load_per_task, sds->avg_load);
+ }
+
+ /*
+ * Avg load of busiest sg can be less and avg load of local sg can
+ * be greater than avg load across all sgs of sd because avg load
+ * factors in sg capacity and sgs with smaller group_type are
+ * skipped when updating the busiest sg:
+ */
+ if (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return fix_small_imbalance(env, sds);
+ }
+
+ /*
+ * If there aren't any idle CPUs, avoid creating some.
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type == group_overloaded) {
+ load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
+ if (load_above_capacity > busiest->group_capacity) {
+ load_above_capacity -= busiest->group_capacity;
+ load_above_capacity *= scale_load_down(NICE_0_LOAD);
+ load_above_capacity /= busiest->group_capacity;
+ } else
+ load_above_capacity = ~0UL;
+ }
+
+ /*
+ * We're trying to get all the CPUs to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded CPU below the average load. At the same time,
+ * we also don't want to reduce the group load below the group
+ * capacity. Thus we look for the minimum possible imbalance.
+ */
+ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
+
+ /* How much load to actually move to equalise the imbalance */
+ env->imbalance = min(
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
+
+ /*
+ * if *imbalance is less than the average load per runnable task
+ * there is no guarantee that any tasks will be moved so we'll have
+ * a think about bumping its value to force at least one task to be
+ * moved
+ */
+ if (env->imbalance < busiest->load_per_task)
+ return fix_small_imbalance(env, sds);
+}
+
+/******* find_busiest_group() helpers end here *********************/
+
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @env: The load balancing environment.
+ *
+ * Return: - The busiest group if imbalance exists.
+ */
+static struct sched_group *find_busiest_group(struct lb_env *env)
+{
+ struct sg_lb_stats *local, *busiest;
+ struct sd_lb_stats sds;
+
+ init_sd_lb_stats(&sds);
+
+ /*
+ * Compute the various statistics relavent for load balancing at
+ * this level.
+ */
+ update_sd_lb_stats(env, &sds);
+ local = &sds.local_stat;
+ busiest = &sds.busiest_stat;
+
+ /* ASYM feature bypasses nice load balance check */
+ if (check_asym_packing(env, &sds))
+ return sds.busiest;
+
+ /* There is no busy sibling group to pull tasks from */
+ if (!sds.busiest || busiest->sum_nr_running == 0)
+ goto out_balanced;
+
+ /* XXX broken for overlapping NUMA groups */
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
+
+ /*
+ * If the busiest group is imbalanced the below checks don't
+ * work because they assume all things are equal, which typically
+ * isn't true due to cpus_allowed constraints and the like.
+ */
+ if (busiest->group_type == group_imbalanced)
+ goto force_balance;
+
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
+ goto force_balance;
+
+ /*
+ * If the local group is busier than the selected busiest group
+ * don't try and pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load)
+ goto out_balanced;
+
+ /*
+ * Don't pull any tasks if this group is already above the domain
+ * average load.
+ */
+ if (local->avg_load >= sds.avg_load)
+ goto out_balanced;
+
+ if (env->idle == CPU_IDLE) {
+ /*
+ * This CPU is idle. If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest group
+ * wrt idle CPUs, it is balanced. The imbalance becomes
+ * significant if the diff is greater than 1 otherwise we
+ * might end up to just move the imbalance on another group
+ */
+ if ((busiest->group_type != group_overloaded) &&
+ (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ goto out_balanced;
+ } else {
+ /*
+ * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+ * imbalance_pct to be conservative.
+ */
+ if (100 * busiest->avg_load <=
+ env->sd->imbalance_pct * local->avg_load)
+ goto out_balanced;
+ }
+
+force_balance:
+ /* Looks like there is an imbalance. Compute it */
+ calculate_imbalance(env, &sds);
+ return env->imbalance ? sds.busiest : NULL;
+
+out_balanced:
+ env->imbalance = 0;
+ return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ */
+static struct rq *find_busiest_queue(struct lb_env *env,
+ struct sched_group *group)
+{
+ struct rq *busiest = NULL, *rq;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
+ int i;
+
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
+ unsigned long capacity, wl;
+ enum fbq_type rt;
+
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
+
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
+ continue;
+
+ capacity = capacity_of(i);
+
+ wl = weighted_cpuload(rq);
+
+ /*
+ * When comparing with imbalance, use weighted_cpuload()
+ * which is not scaled with the CPU capacity.
+ */
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ continue;
+
+ /*
+ * For the load comparisons with the other CPU's, consider
+ * the weighted_cpuload() scaled with the CPU capacity, so
+ * that the load can be moved away from the CPU that is
+ * potentially running at a lower capacity.
+ *
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * multiplication to rid ourselves of the division works out
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
+ */
+ if (wl * busiest_capacity > busiest_load * capacity) {
+ busiest_load = wl;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ }
+
+ return busiest;
+}
+
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL 512
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (env->idle == CPU_NEWLY_IDLE) {
+
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu))
+ return 1;
+ }
+
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs if more capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
+ return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+}
+
+static int active_load_balance_cpu_stop(void *data);
+
+static int should_we_balance(struct lb_env *env)
+{
+ struct sched_group *sg = env->sd->groups;
+ int cpu, balance_cpu = -1;
+
+ /*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
+
+ /*
+ * In the newly idle case, we will allow all the CPUs
+ * to do the newly idle load balance.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ return 1;
+
+ /* Try to find first idle CPU */
+ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ if (!idle_cpu(cpu))
+ continue;
+
+ balance_cpu = cpu;
+ break;
+ }
+
+ if (balance_cpu == -1)
+ balance_cpu = group_balance_cpu(sg);
+
+ /*
+ * First idle CPU or the first CPU(busiest) in this sched group
+ * is eligible for doing load balancing at this and above domains.
+ */
+ return balance_cpu == env->dst_cpu;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ */
+static int load_balance(int this_cpu, struct rq *this_rq,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *continue_balancing)
+{
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = sd->parent;
+ struct sched_group *group;
+ struct rq *busiest;
+ struct rq_flags rf;
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_span(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ };
+
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
+
+ schedstat_inc(sd->lb_count[idle]);
+
+redo:
+ if (!should_we_balance(&env)) {
+ *continue_balancing = 0;
+ goto out_balanced;
+ }
+
+ group = find_busiest_group(&env);
+ if (!group) {
+ schedstat_inc(sd->lb_nobusyg[idle]);
+ goto out_balanced;
+ }
+
+ busiest = find_busiest_queue(&env, group);
+ if (!busiest) {
+ schedstat_inc(sd->lb_nobusyq[idle]);
+ goto out_balanced;
+ }
+
+ BUG_ON(busiest == env.dst_rq);
+
+ schedstat_add(sd->lb_imbalance[idle], env.imbalance);
+
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
+ ld_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. ld_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ env.flags |= LBF_ALL_PINNED;
+ env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+
+more_balance:
+ rq_lock_irqsave(busiest, &rf);
+ update_rq_clock(busiest);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = detach_tasks(&env);
+
+ /*
+ * We've detached some tasks from busiest_rq. Every
+ * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
+ */
+
+ rq_unlock(busiest, &rf);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
+ local_irq_restore(rf.flags);
+
+ if (env.flags & LBF_NEED_BREAK) {
+ env.flags &= ~LBF_NEED_BREAK;
+ goto more_balance;
+ }
+
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of CPUs in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or a ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing exceess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+ /* Prevent to re-select dst_cpu via env's CPUs */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
+ env.dst_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_DST_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
+
+ /*
+ * We failed to reach balance because of affinity.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
+ *group_imbalance = 1;
+ }
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(env.flags & LBF_ALL_PINNED)) {
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+ /*
+ * Attempting to continue load balancing at the current
+ * sched_domain level only makes sense if there are
+ * active CPUs remaining as possible busiest CPUs to
+ * pull load from which are not contained within the
+ * destination group that is receiving any migrated
+ * load.
+ */
+ if (!cpumask_subset(cpus, env.dst_grpmask)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ goto redo;
+ }
+ goto out_all_pinned;
+ }
+ }
+
+ if (!ld_moved) {
+ schedstat_inc(sd->lb_failed[idle]);
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
+
+ if (need_active_balance(&env)) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&busiest->lock, flags);
+
+ /*
+ * Don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest CPU can't be
+ * moved to this_cpu:
+ */
+ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+ raw_spin_unlock_irqrestore(&busiest->lock,
+ flags);
+ env.flags |= LBF_ALL_PINNED;
+ goto out_one_pinned;
+ }
+
+ /*
+ * ->active_balance synchronizes accesses to
+ * ->active_balance_work. Once set, it's cleared
+ * only after active load balance is finished.
+ */
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ active_balance = 1;
+ }
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
+ if (active_balance) {
+ stop_one_cpu_nowait(cpu_of(busiest),
+ active_load_balance_cpu_stop, busiest,
+ &busiest->active_balance_work);
+ }
+
+ /* We've kicked active balancing, force task migration. */
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
+ }
+ } else
+ sd->nr_balance_failed = 0;
+
+ if (likely(!active_balance)) {
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * detach_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+ }
+
+ goto out;
+
+out_balanced:
+ /*
+ * We reach balance although we may have faced some affinity
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
+ */
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
+out_all_pinned:
+ /*
+ * We reach balance because all tasks are pinned at this level so
+ * we can't migrate them. Let the imbalance flag set so parent level
+ * can try to migrate them.
+ */
+ schedstat_inc(sd->lb_balanced[idle]);
+
+ sd->nr_balance_failed = 0;
+
+out_one_pinned:
+ ld_moved = 0;
+
+ /*
+ * idle_balance() disregards balance intervals, so we could repeatedly
+ * reach this code, which would lead to balance_interval skyrocketting
+ * in a short amount of time. Skip the balance_interval increase logic
+ * to avoid that.
+ */
+ if (env.idle == CPU_NEWLY_IDLE)
+ goto out;
+
+ /* tune up the balancing interval */
+ if (((env.flags & LBF_ALL_PINNED) &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ (sd->balance_interval < sd->max_interval))
+ sd->balance_interval *= 2;
+out:
+ return ld_moved;
+}
+
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ /* used by idle balance, so cpu_busy = 0 */
+ interval = get_sd_balance_interval(sd, 0);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
+/*
+ * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
+ */
+static int active_load_balance_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+ struct task_struct *p = NULL;
+ struct rq_flags rf;
+
+ rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
+
+ /* Make sure the requested CPU hasn't gone down in the meantime: */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
+
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-CPU setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ /*
+ * can_migrate_task() doesn't need to compute new_dst_cpu
+ * for active balancing. Since we have CPU_IDLE, but no
+ * @dst_grpmask we need to make that test go away with lying
+ * about DST_PINNED.
+ */
+ .flags = LBF_DST_PINNED,
+ };
+
+ schedstat_inc(sd->alb_count);
+ update_rq_clock(busiest_rq);
+
+ p = detach_one_task(&env);
+ if (p) {
+ schedstat_inc(sd->alb_pushed);
+ /* Active balancing done, reset the failure counter. */
+ sd->nr_balance_failed = 0;
+ } else {
+ schedstat_inc(sd->alb_failed);
+ }
+ }
+ rcu_read_unlock();
+out_unlock:
+ busiest_rq->active_balance = 0;
+ rq_unlock(busiest_rq, &rf);
+
+ if (p)
+ attach_one_task(target_rq, p);
+
+ local_irq_enable();
+
+ return 0;
+}
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in init_sched_domains.
+ */
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+{
+ int continue_balancing = 1;
+ int cpu = rq->cpu;
+ unsigned long interval;
+ struct sched_domain *sd;
+ /* Earliest time when we have to do rebalance again */
+ unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
+ int need_serialize, need_decay = 0;
+ u64 max_cost = 0;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ /*
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains. Decay ~1% per second.
+ */
+ if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+ sd->max_newidle_lb_cost =
+ (sd->max_newidle_lb_cost * 253) / 256;
+ sd->next_decay_max_lb_cost = jiffies + HZ;
+ need_decay = 1;
+ }
+ max_cost += sd->max_newidle_lb_cost;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!continue_balancing) {
+ if (need_decay)
+ continue;
+ break;
+ }
+
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+
+ need_serialize = sd->flags & SD_SERIALIZE;
+ if (need_serialize) {
+ if (!spin_trylock(&balancing))
+ goto out;
+ }
+
+ if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ /*
+ * The LBF_DST_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
+ */
+ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ }
+ sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ }
+ if (need_serialize)
+ spin_unlock(&balancing);
+out:
+ if (time_after(next_balance, sd->last_balance + interval)) {
+ next_balance = sd->last_balance + interval;
+ update_next_balance = 1;
+ }
+ }
+ if (need_decay) {
+ /*
+ * Ensure the rq-wide value also decays but keep it at a
+ * reasonable floor to avoid funnies with rq->avg_idle.
+ */
+ rq->max_idle_balance_cost =
+ max((u64)sysctl_sched_migration_cost, max_cost);
+ }
+ rcu_read_unlock();
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the cpu is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance)) {
+ rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * If this CPU has been elected to perform the nohz idle
+ * balance. Other idle CPUs have already rebalanced with
+ * nohz_idle_balance() and nohz.next_balance has been
+ * updated accordingly. This CPU is now running the idle load
+ * balance for itself and we need to update the
+ * nohz.next_balance accordingly.
+ */
+ if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+ nohz.next_balance = rq->next_balance;
+#endif
+ }
+}
+
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing details
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * anywhere yet.
+ */
+
+static inline int find_new_ilb(void)
+{
+ int ilb;
+
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_FLAG_MISC)) {
+ if (idle_cpu(ilb))
+ return ilb;
+ }
+
+ return nr_cpu_ids;
+}
+
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
+ * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ */
+static void kick_ilb(unsigned int flags)
+{
+ int ilb_cpu;
+
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
+
+ ilb_cpu = find_new_ilb();
+
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+
+ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
+ if (flags & NOHZ_KICK_MASK)
+ return;
+
+ /*
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target CPU which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
+ */
+ smp_send_reschedule(ilb_cpu);
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu in the system.
+ * - This rq has more than one task.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpu.
+ * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ * domain span are idle.
+ */
+static void nohz_balancer_kick(struct rq *rq)
+{
+ unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
+ struct sched_domain *sd;
+ int nr_busy, i, cpu = rq->cpu;
+ unsigned int flags = 0;
+
+ if (unlikely(rq->idle_balance))
+ return;
+
+ /*
+ * We may be recently in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ nohz_balance_exit_idle(rq);
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return;
+
+ if (READ_ONCE(nohz.has_blocked) &&
+ time_after(now, READ_ONCE(nohz.next_blocked)))
+ flags = NOHZ_STATS_KICK;
+
+ if (time_before(now, nohz.next_balance))
+ goto out;
+
+ if (rq->nr_running >= 2) {
+ flags = NOHZ_KICK_MASK;
+ goto out;
+ }
+
+ rcu_read_lock();
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * XXX: write a coherent comment on why we do this.
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
+ if (nr_busy > 1) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+
+ }
+
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+ }
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ if (sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (i == cpu ||
+ !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+ continue;
+
+ if (sched_asym_prefer(i, cpu)) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+out:
+ if (flags)
+ kick_ilb(flags);
+}
+
+static void set_cpu_sd_state_busy(int cpu)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+void nohz_balance_exit_idle(struct rq *rq)
+{
+ SCHED_WARN_ON(rq != this_rq());
+
+ if (likely(!rq->nohz_tick_stopped))
+ return;
+
+ rq->nohz_tick_stopped = 0;
+ cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+
+ set_cpu_sd_state_busy(rq->cpu);
+}
+
+static void set_cpu_sd_state_idle(int cpu)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ atomic_dec(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * This routine will record that the CPU is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void nohz_balance_enter_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ SCHED_WARN_ON(cpu != smp_processor_id());
+
+ /* If this CPU is going down, then nothing needs to be done: */
+ if (!cpu_active(cpu))
+ return;
+
+ /* Spare idle load balancing on CPUs that don't want to be disturbed: */
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ return;
+
+ /*
+ * Can be set safely without rq->lock held
+ * If a clear happens, it will have evaluated last additions because
+ * rq->lock is held during the check and the clear
+ */
+ rq->has_blocked_load = 1;
+
+ /*
+ * The tick is still stopped but load could have been added in the
+ * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+ * of nohz.has_blocked can only happen after checking the new load
+ */
+ if (rq->nohz_tick_stopped)
+ goto out;
+
+ /* If we're a completely isolated CPU, we don't play: */
+ if (on_null_domain(rq))
+ return;
+
+ rq->nohz_tick_stopped = 1;
+
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+
+ /*
+ * Ensures that if nohz_idle_balance() fails to observe our
+ * @idle_cpus_mask store, it must observe the @has_blocked
+ * store.
+ */
+ smp_mb__after_atomic();
+
+ set_cpu_sd_state_idle(cpu);
+
+out:
+ /*
+ * Each time a cpu enter idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle cpus
+ */
+ WRITE_ONCE(nohz.has_blocked, 1);
+}
+
+/*
+ * Internal function that runs load balance for all idle cpus. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
+ * The function returns false if the loop has stopped before running
+ * through all idle CPUs.
+ */
+static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+ enum cpu_idle_type idle)
+{
+ /* Earliest time when we have to do rebalance again */
+ unsigned long now = jiffies;
+ unsigned long next_balance = now + 60*HZ;
+ bool has_blocked_load = false;
+ int update_next_balance = 0;
+ int this_cpu = this_rq->cpu;
+ int balance_cpu;
+ int ret = false;
+ struct rq *rq;
+
+ SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+
+ /*
+ * We assume there will be no idle load after this update and clear
+ * the has_blocked flag. If a cpu enters idle in the mean time, it will
+ * set the has_blocked flag and trig another update of idle load.
+ * Because a cpu that becomes idle, is added to idle_cpus_mask before
+ * setting the flag, we are sure to not clear the state and not
+ * check the load of an idle cpu.
+ */
+ WRITE_ONCE(nohz.has_blocked, 0);
+
+ /*
+ * Ensures that if we miss the CPU, we must see the has_blocked
+ * store from nohz_balance_enter_idle().
+ */
+ smp_mb();
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ continue;
+
+ /*
+ * If this CPU gets work to do, stop the load balancing
+ * work being done for other CPUs. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ has_blocked_load = true;
+ goto abort;
+ }
+
+ rq = cpu_rq(balance_cpu);
+
+ has_blocked_load |= update_nohz_stats(rq, true);
+
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ cpu_load_update_idle(rq);
+ rq_unlock_irqrestore(rq, &rf);
+
+ if (flags & NOHZ_BALANCE_KICK)
+ rebalance_domains(rq, CPU_IDLE);
+ }
+
+ if (time_after(next_balance, rq->next_balance)) {
+ next_balance = rq->next_balance;
+ update_next_balance = 1;
+ }
+ }
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
+ /* Newly idle CPU doesn't need an update */
+ if (idle != CPU_NEWLY_IDLE) {
+ update_blocked_averages(this_cpu);
+ has_blocked_load |= this_rq->has_blocked_load;
+ }
+
+ if (flags & NOHZ_BALANCE_KICK)
+ rebalance_domains(this_rq, CPU_IDLE);
+
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+ /* The full idle balance loop has been done */
+ ret = true;
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
+
+ return ret;
+}
+
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
+ return false;
+
+ if (idle != CPU_IDLE) {
+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ return false;
+ }
+
+ /*
+ * barrier, pairs with nohz_balance_enter_idle(), ensures ...
+ */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
+ if (!(flags & NOHZ_KICK_MASK))
+ return false;
+
+ _nohz_idle_balance(this_rq, flags, idle);
+
+ return true;
+}
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu;
+
+ /*
+ * This CPU doesn't want to be disturbed by scheduler
+ * housekeeping
+ */
+ if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ return;
+
+ /* Will wake up very soon. No time for doing anything else*/
+ if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ return;
+
+ /* Don't need to update blocked load of idle CPUs*/
+ if (!READ_ONCE(nohz.has_blocked) ||
+ time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+ return;
+
+ raw_spin_unlock(&this_rq->lock);
+ /*
+ * This CPU is going to be idle and blocked load of idle CPUs
+ * need to be updated. Run the ilb locally as it is a good
+ * candidate for ilb instead of waking up another idle CPU.
+ * Kick an normal ilb if we failed to do the update.
+ */
+ if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
+ kick_ilb(NOHZ_STATS_KICK);
+ raw_spin_lock(&this_rq->lock);
+}
+
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void nohz_balancer_kick(struct rq *rq) { }
+
+static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ return false;
+}
+
+static inline void nohz_newidle_balance(struct rq *this_rq) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
+ struct sched_domain *sd;
+ int pulled_task = 0;
+ u64 curr_cost = 0;
+
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ /*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
+
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
+
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, &next_balance);
+ rcu_read_unlock();
+
+ nohz_newidle_balance(this_rq);
+
+ goto out;
+ }
+
+ raw_spin_unlock(&this_rq->lock);
+
+ update_blocked_averages(this_cpu);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, &next_balance);
+ break;
+ }
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ t0 = sched_clock_cpu(this_cpu);
+
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
+ }
+
+ update_next_balance(sd, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ raw_spin_lock(&this_rq->lock);
+
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+out:
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task)
+ this_rq->idle_stamp = 0;
+
+ rq_repin_lock(this_rq, rf);
+
+ return pulled_task;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
+static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+{
+ struct rq *this_rq = this_rq();
+ enum cpu_idle_type idle = this_rq->idle_balance ?
+ CPU_IDLE : CPU_NOT_IDLE;
+
+ /*
+ * If this CPU has a pending nohz_balance_kick, then do the
+ * balancing on behalf of the other idle CPUs whose ticks are
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle CPUs a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
+ */
+ if (nohz_idle_balance(this_rq, idle))
+ return;
+
+ /* normal load balance */
+ update_blocked_averages(this_rq->cpu);
+ rebalance_domains(this_rq, idle);
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ */
+void trigger_load_balance(struct rq *rq)
+{
+ /* Don't need to rebalance while attached to NULL domain */
+ if (unlikely(on_null_domain(rq)))
+ return;
+
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
+
+ nohz_balancer_kick(rq);
+}
+
+static void rq_online_fair(struct rq *rq)
+{
+ update_sysctl();
+
+ update_runtime_enabled(rq);
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+ update_sysctl();
+
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &curr->se;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ entity_tick(cfs_rq, se, queued);
+ }
+
+ if (static_branch_unlikely(&sched_numa_balancing))
+ task_tick_numa(rq, curr);
+}
+
+/*
+ * called on fork with the child task as argument from the parent's context
+ * - child not yet on the tasklist
+ * - preemption disabled
+ */
+static void task_fork_fair(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se, *curr;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+
+ cfs_rq = task_cfs_rq(current);
+ curr = cfs_rq->curr;
+ if (curr) {
+ update_curr(cfs_rq);
+ se->vruntime = curr->vruntime;
+ }
+ place_entity(cfs_rq, se, 1);
+
+ if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+ /*
+ * Upon rescheduling, sched_class::put_prev_task() will place
+ * 'current' within the tree based on its new key value.
+ */
+ swap(curr->vruntime, se->vruntime);
+ resched_curr(rq);
+ }
+
+ se->vruntime -= cfs_rq->min_vruntime;
+ rq_unlock(rq, &rf);
+}
+
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ if (!task_on_rq_queued(p))
+ return;
+
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (rq->curr == p) {
+ if (p->prio > oldprio)
+ resched_curr(rq);
+ } else
+ check_preempt_curr(rq, p, 0);
+}
+
+static inline bool vruntime_normalized(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ /*
+ * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
+ * the dequeue_entity(.flags=0) will already have normalized the
+ * vruntime.
+ */
+ if (p->on_rq)
+ return true;
+
+ /*
+ * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
+ *
+ * - A forked child which is waiting for being woken up by
+ * wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() and
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_remote_wakeup))
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ list_add_leaf_cfs_rq(cfs_rq_of(se));
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (!cfs_rq_throttled(cfs_rq)){
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ list_add_leaf_cfs_rq(cfs_rq);
+ continue;
+ }
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(cfs_rq, se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se, 0);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!vruntime_normalized(p)) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+
+ detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ attach_entity_cfs_rq(se);
+
+ if (!vruntime_normalized(p))
+ se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ detach_task_cfs_rq(p);
+}
+
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+ attach_task_cfs_rq(p);
+
+ if (task_on_rq_queued(p)) {
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (rq->curr == p)
+ resched_curr(rq);
+ else
+ check_preempt_curr(rq, p, 0);
+ }
+}
+
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+ struct sched_entity *se = &rq->curr->se;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ set_next_entity(cfs_rq, se);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ }
+}
+
+void init_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+#ifdef CONFIG_SMP
+ raw_spin_lock_init(&cfs_rq->removed.lock);
+#endif
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
+static void task_move_group_fair(struct task_struct *p)
+{
+ detach_task_cfs_rq(p);
+ set_task_rq(p, task_cpu(p));
+
+#ifdef CONFIG_SMP
+ /* Tell se's cfs_rq has been changed -- migrated */
+ p->se.avg.last_update_time = 0;
+#endif
+ attach_task_cfs_rq(p);
+}
+
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
+void free_fair_sched_group(struct task_group *tg)
+{
+ int i;
+
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+}
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ int i;
+
+ tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ tg->shares = NICE_0_LOAD;
+
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kzalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!se)
+ goto err_free_rq;
+
+ init_cfs_rq(cfs_rq);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_entity_runnable_average(se);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(cfs_rq);
+err:
+ return 0;
+}
+
+void online_fair_sched_group(struct task_group *tg)
+{
+ struct sched_entity *se;
+ struct rq_flags rf;
+ struct rq *rq;
+ int i;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+ rq_lock_irq(rq, &rf);
+ update_rq_clock(rq);
+ attach_entity_cfs_rq(se);
+ sync_throttle(tg, i);
+ rq_unlock_irq(rq, &rf);
+ }
+}
+
+void unregister_fair_sched_group(struct task_group *tg)
+{
+ unsigned long flags;
+ struct rq *rq;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (tg->se[cpu])
+ remove_entity_load_avg(tg->se[cpu]);
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ continue;
+
+ rq = cpu_rq(cpu);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+ init_cfs_rq_runtime(cfs_rq);
+
+ tg->cfs_rq[cpu] = cfs_rq;
+ tg->se[cpu] = se;
+
+ /* se could be NULL for root_task_group */
+ if (!se)
+ return;
+
+ if (!parent) {
+ se->cfs_rq = &rq->cfs;
+ se->depth = 0;
+ } else {
+ se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }
+
+ se->my_q = cfs_rq;
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
+ se->parent = parent;
+}
+
+static DEFINE_MUTEX(shares_mutex);
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int i;
+
+ /*
+ * We can't change the weight of the root cgroup.
+ */
+ if (!tg->se[0])
+ return -EINVAL;
+
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+
+ mutex_lock(&shares_mutex);
+ if (tg->shares == shares)
+ goto done;
+
+ tg->shares = shares;
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct rq_flags rf;
+
+ /* Propagate contribution to hierarchy */
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ for_each_sched_entity(se) {
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+ update_cfs_group(se);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+done:
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+void free_fair_sched_group(struct task_group *tg) { }
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+
+void online_fair_sched_group(struct task_group *tg) { }
+
+void unregister_fair_sched_group(struct task_group *tg) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+
+static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
+{
+ struct sched_entity *se = &task->se;
+ unsigned int rr_interval = 0;
+
+ /*
+ * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
+ * idle runqueue:
+ */
+ if (rq->cfs.load.weight)
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+
+ return rr_interval;
+}
+
+/*
+ * All the scheduling class methods:
+ */
+const struct sched_class fair_sched_class = {
+ .next = &idle_sched_class,
+ .enqueue_task = enqueue_task_fair,
+ .dequeue_task = dequeue_task_fair,
+ .yield_task = yield_task_fair,
+ .yield_to_task = yield_to_task_fair,
+
+ .check_preempt_curr = check_preempt_wakeup,
+
+ .pick_next_task = pick_next_task_fair,
+ .put_prev_task = put_prev_task_fair,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_fair,
+ .migrate_task_rq = migrate_task_rq_fair,
+
+ .rq_online = rq_online_fair,
+ .rq_offline = rq_offline_fair,
+
+ .task_dead = task_dead_fair,
+ .set_cpus_allowed = set_cpus_allowed_common,
+#endif
+
+ .set_curr_task = set_curr_task_fair,
+ .task_tick = task_tick_fair,
+ .task_fork = task_fork_fair,
+
+ .prio_changed = prio_changed_fair,
+ .switched_from = switched_from_fair,
+ .switched_to = switched_to_fair,
+
+ .get_rr_interval = get_rr_interval_fair,
+
+ .update_curr = update_curr_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .task_change_group = task_change_group_fair,
+#endif
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+void print_cfs_stats(struct seq_file *m, int cpu)
+{
+ struct cfs_rq *cfs_rq, *pos;
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+ print_cfs_rq(m, cpu, cfs_rq);
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+void show_numa_stats(struct task_struct *p, struct seq_file *m)
+{
+ int node;
+ unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+ struct numa_group *ng;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ for_each_online_node(node) {
+ if (p->numa_faults) {
+ tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+ tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ if (ng) {
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+ }
+ rcu_read_unlock();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+
+__init void init_sched_fair_class(void)
+{
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+
+#ifdef CONFIG_NO_HZ_COMMON
+ nohz.next_balance = jiffies;
+ nohz.next_blocked = jiffies;
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+#endif
+#endif /* SMP */
+
+}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
new file mode 100644
index 000000000..85ae84880
--- /dev/null
+++ b/kernel/sched/features.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Only give sleepers 50% of their service deficit. This allows
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
+
+/*
+ * Place new tasks ahead so that they do not starve already running
+ * tasks
+ */
+SCHED_FEAT(START_DEBIT, true)
+
+/*
+ * Prefer to schedule the task we woke last (assuming it failed
+ * wakeup-preemption), since its likely going to consume data we
+ * touched, increases cache locality.
+ */
+SCHED_FEAT(NEXT_BUDDY, false)
+
+/*
+ * Prefer to schedule the task that ran last (when we did
+ * wake-preempt) as that likely will touch the same data, increases
+ * cache locality.
+ */
+SCHED_FEAT(LAST_BUDDY, true)
+
+/*
+ * Consider buddies to be cache hot, decreases the likelyness of a
+ * cache buddy being migrated away, increases cache locality.
+ */
+SCHED_FEAT(CACHE_HOT_BUDDY, true)
+
+/*
+ * Allow wakeup-time preemption of the current task:
+ */
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
+SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(DOUBLE_TICK, false)
+SCHED_FEAT(LB_BIAS, true)
+
+/*
+ * Decrement CPU capacity based on time not spent running tasks
+ */
+SCHED_FEAT(NONTASK_CAPACITY, true)
+
+/*
+ * Queue remote wakeups on the target CPU and process them
+ * using the scheduler IPI. Reduces rq->lock contention/bounces.
+ */
+SCHED_FEAT(TTWU_QUEUE, true)
+
+/*
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+SCHED_FEAT(SIS_AVG_CPU, false)
+SCHED_FEAT(SIS_PROP, true)
+
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * In order to avoid a thundering herd attack of CPUs that are
+ * lowering their priorities at the same time, and there being
+ * a single CPU that has an RT task that can migrate and is waiting
+ * to run, where the other CPUs will try to take that CPUs
+ * rq lock and possibly create a large contention, sending an
+ * IPI to that CPU and let that CPU push the RT task to where
+ * it should go may be a better scenario.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
+SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
+
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * UtilEstimation. Use estimated CPU utilization.
+ */
+SCHED_FEAT(UTIL_EST, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000..44a17366c
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,483 @@
+/*
+ * Generic entry points for the idle threads and
+ * implementation of the idle task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE batch scheduled
+ * tasks which are handled in sched/fair.c )
+ */
+#include "sched.h"
+
+#include <trace/events/power.h>
+
+/* Linker adds these: start and end of __cpuidle functions */
+extern char __cpuidle_text_start[], __cpuidle_text_end[];
+
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state)
+{
+ idle_set_state(this_rq(), idle_state);
+}
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static noinline int __cpuidle cpu_idle_poll(void)
+{
+ rcu_idle_enter();
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ stop_critical_timings();
+
+ while (!tif_need_resched() &&
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ cpu_relax();
+ start_critical_timings();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ rcu_idle_exit();
+
+ return 1;
+}
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+ local_irq_enable();
+}
+
+/**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used.
+ */
+void __cpuidle default_idle_call(void)
+{
+ if (current_clr_polling_and_test()) {
+ local_irq_enable();
+ } else {
+ stop_critical_timings();
+ arch_cpu_idle();
+ start_critical_timings();
+ }
+}
+
+static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ int next_state)
+{
+ /*
+ * The idle task must be scheduled, it is pointless to go to idle, just
+ * update no idle residency and return.
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ local_irq_enable();
+ return -EBUSY;
+ }
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ return cpuidle_enter(drv, dev, next_state);
+}
+
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
+ */
+static void cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = cpuidle_get_device();
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state;
+
+ /*
+ * Check if the idle task must be rescheduled. If it is the
+ * case, exit the function after re-enabling the local irq.
+ */
+ if (need_resched()) {
+ local_irq_enable();
+ return;
+ }
+
+ /*
+ * The RCU framework needs to be told that we are entering an idle
+ * section, so no more rcu read side critical sections and one more
+ * step to the grace period
+ */
+
+ if (cpuidle_not_available(drv, dev)) {
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
+ default_idle_call();
+ goto exit_idle;
+ }
+
+ /*
+ * Suspend-to-idle ("s2idle") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in iterrupts (if any). In that case bypass
+ * the cpuidle governor and go stratight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
+ */
+
+ if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+ if (idle_should_enter_s2idle()) {
+ rcu_idle_enter();
+
+ entered_state = cpuidle_enter_s2idle(drv, dev);
+ if (entered_state > 0) {
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ rcu_idle_exit();
+ }
+
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
+ next_state = cpuidle_find_deepest_state(drv, dev);
+ call_cpuidle(drv, dev, next_state);
+ } else {
+ bool stop_tick = true;
+
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ */
+ next_state = cpuidle_select(drv, dev, &stop_tick);
+
+ if (stop_tick || tick_nohz_tick_stopped())
+ tick_nohz_idle_stop_tick();
+ else
+ tick_nohz_idle_retain_tick();
+
+ rcu_idle_enter();
+
+ entered_state = call_cpuidle(drv, dev, next_state);
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+ }
+
+exit_idle:
+ __current_set_polling();
+
+ /*
+ * It is up to the idle functions to reenable local interrupts
+ */
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
+
+ rcu_idle_exit();
+}
+
+/*
+ * Generic idle loop implementation
+ *
+ * Called with polling cleared.
+ */
+static void do_idle(void)
+{
+ int cpu = smp_processor_id();
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+ * rq->idle). This means that, if rq->idle has the polling bit set,
+ * then setting need_resched is guaranteed to cause the CPU to
+ * reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ local_irq_disable();
+
+ if (cpu_is_offline(cpu)) {
+ tick_nohz_idle_stop_tick();
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
+ }
+
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin. Also if we
+ * detected in the wakeup from idle path that the tick
+ * broadcast device expired for us, we don't want to go deep
+ * idle as we know that the IPI is going to arrive right away.
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ tick_nohz_idle_restart_tick();
+ cpu_idle_poll();
+ } else {
+ cpuidle_idle_call();
+ }
+ arch_cpu_idle_exit();
+ }
+
+ /*
+ * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ * be set, propagate it into PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will not have had
+ * an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending() and reschedule if
+ * need_resched() is set while polling is set. That means that clearing
+ * polling needs to be visible before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_idle();
+
+ if (unlikely(klp_patch_pending(current)))
+ klp_update_patch_state(current);
+}
+
+bool cpu_in_idle(unsigned long pc)
+{
+ return pc >= (unsigned long)__cpuidle_text_start &&
+ pc < (unsigned long)__cpuidle_text_end;
+}
+
+struct idle_timer {
+ struct hrtimer timer;
+ int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+ struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+ WRITE_ONCE(it->done, 1);
+ set_tsk_need_resched(current);
+
+ return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+ struct idle_timer it;
+
+ /*
+ * Only FIFO tasks can disable the tick since they don't need the forced
+ * preemption.
+ */
+ WARN_ON_ONCE(current->policy != SCHED_FIFO);
+ WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+ WARN_ON_ONCE(!duration_ms);
+
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(true);
+
+ it.done = 0;
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ it.timer.function = idle_inject_timer_fn;
+ hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+ while (!READ_ONCE(it.done))
+ do_idle();
+
+ cpuidle_use_deepest_state(false);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ /*
+ * This #ifdef needs to die, but it's too late in the cycle to
+ * make this generic (ARM and SH have never invoked the canary
+ * init for the non boot CPUs!). Will be fixed in 3.11
+ */
+#ifdef CONFIG_X86
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. The boot CPU already has it initialized but no harm
+ * in doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+#endif
+ arch_cpu_idle_prepare();
+ cpuhp_online_idle(state);
+ while (1)
+ do_idle();
+}
+
+/*
+ * idle-task scheduling class.
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ return task_cpu(p); /* IDLE tasks as never migrated */
+}
+#endif
+
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+ resched_curr(rq);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ put_prev_task(rq, prev);
+ update_idle_core(rq);
+ schedstat_inc(rq->sched_goidle);
+
+ return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+ raw_spin_unlock_irq(&rq->lock);
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ raw_spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
+{
+ BUG();
+}
+
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ BUG();
+}
+
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
+{
+ return 0;
+}
+
+static void update_curr_idle(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+const struct sched_class idle_sched_class = {
+ /* .next is NULL */
+ /* no enqueue/yield_task for idle tasks */
+
+ /* dequeue is not valid, we print a debug message there: */
+ .dequeue_task = dequeue_task_idle,
+
+ .check_preempt_curr = check_preempt_curr_idle,
+
+ .pick_next_task = pick_next_task_idle,
+ .put_prev_task = put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+ .set_cpus_allowed = set_cpus_allowed_common,
+#endif
+
+ .set_curr_task = set_curr_task_idle,
+ .task_tick = task_tick_idle,
+
+ .get_rr_interval = get_rr_interval_idle,
+
+ .prio_changed = prio_changed_idle,
+ .switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
+};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000..e68021819
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,153 @@
+/*
+ * Housekeeping management. Manage the targets for routine code that can run on
+ * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
+ *
+ * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
+ *
+ */
+#include "sched.h"
+
+DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
+EXPORT_SYMBOL_GPL(housekeeping_overriden);
+static cpumask_var_t housekeeping_mask;
+static unsigned int housekeeping_flags;
+
+int housekeeping_any_cpu(enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ return smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
+
+const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return housekeeping_mask;
+ return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
+void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ set_cpus_allowed_ptr(t, housekeeping_mask);
+}
+EXPORT_SYMBOL_GPL(housekeeping_affine);
+
+bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+{
+ if (static_branch_unlikely(&housekeeping_overriden))
+ if (housekeeping_flags & flags)
+ return cpumask_test_cpu(cpu, housekeeping_mask);
+ return true;
+}
+EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
+
+void __init housekeeping_init(void)
+{
+ if (!housekeeping_flags)
+ return;
+
+ static_branch_enable(&housekeeping_overriden);
+
+ if (housekeeping_flags & HK_FLAG_TICK)
+ sched_tick_offload_init();
+
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
+
+static int __init housekeeping_setup(char *str, enum hk_flags flags)
+{
+ cpumask_var_t non_housekeeping_mask;
+ int err;
+
+ alloc_bootmem_cpumask_var(&non_housekeeping_mask);
+ err = cpulist_parse(str, non_housekeeping_mask);
+ if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+
+ if (!housekeeping_flags) {
+ alloc_bootmem_cpumask_var(&housekeeping_mask);
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, non_housekeeping_mask);
+ if (cpumask_empty(housekeeping_mask))
+ cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ } else {
+ cpumask_var_t tmp;
+
+ alloc_bootmem_cpumask_var(&tmp);
+ cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+ if (!cpumask_equal(tmp, housekeeping_mask)) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ free_bootmem_cpumask_var(tmp);
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ free_bootmem_cpumask_var(tmp);
+ }
+
+ if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+ if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ tick_nohz_full_setup(non_housekeeping_mask);
+ } else {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ }
+
+ housekeeping_flags |= flags;
+
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+
+ return 1;
+}
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+ unsigned int flags;
+
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("nohz_full=", housekeeping_nohz_full_setup);
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+ unsigned int flags = 0;
+
+ while (isalpha(*str)) {
+ if (!strncmp(str, "nohz,", 5)) {
+ str += 5;
+ flags |= HK_FLAG_TICK;
+ continue;
+ }
+
+ if (!strncmp(str, "domain,", 7)) {
+ str += 7;
+ flags |= HK_FLAG_DOMAIN;
+ continue;
+ }
+
+ pr_warn("isolcpus: Error, unknown flag\n");
+ return 0;
+ }
+
+ /* Default behaviour for isolcpus without flags */
+ if (!flags)
+ flags |= HK_FLAG_DOMAIN;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
new file mode 100644
index 000000000..a171c1258
--- /dev/null
+++ b/kernel/sched/loadavg.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kernel/sched/loadavg.c
+ *
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
+ */
+#include "sched.h"
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of CPUs, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-CPU deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-CPU delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * CPU to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
+ * this would add another cross-CPU cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever CPU the task ran
+ * when it went into uninterruptible state and decrement on whatever CPU
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all CPUs yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq, long adjust)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running - adjust;
+ nr_active += (long)this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-CPU sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' CPU delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two NO_HZ-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old NO_HZ contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
+ * intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_nohz[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next NO_HZ-delta.
+ */
+ if (!time_before(jiffies, READ_ONCE(calc_load_update)))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_nohz_start(void)
+{
+ struct rq *this_rq = this_rq();
+ long delta;
+
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ delta = calc_load_fold_active(this_rq, 0);
+ if (delta) {
+ int idx = calc_load_write_idx();
+
+ atomic_long_add(delta, &calc_load_nohz[idx]);
+ }
+}
+
+void calc_load_nohz_stop(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the pending sample window, we're done.
+ */
+ this_rq->calc_load_update = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_nohz_fold(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_nohz[idx]))
+ delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
+
+ return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-CPU ticks calling
+ * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
+ * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
+ * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+ unsigned long sample_window;
+ long delta, active, n;
+
+ sample_window = READ_ONCE(calc_load_update);
+ if (!time_before(jiffies, sample_window + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - sample_window - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
+ }
+
+ /*
+ * Flip the NO_HZ index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_nohz_fold(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ unsigned long sample_window;
+ long active, delta;
+
+ sample_window = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, sample_window + 10))
+ return;
+
+ /*
+ * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
+ */
+ delta = calc_load_nohz_fold();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
+
+ /*
+ * In case we went to NO_HZ for multiple LOAD_FREQ intervals
+ * catch up in bulk.
+ */
+ calc_global_nohz();
+}
+
+/*
+ * Called from scheduler_tick() to periodically update this CPU's
+ * active count.
+ */
+void calc_global_load_tick(struct rq *this_rq)
+{
+ long delta;
+
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ delta = calc_load_fold_active(this_rq, 0);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ this_rq->calc_load_update += LOAD_FREQ;
+}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000..dd27e632b
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,318 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include "sched.h"
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static int membarrier_global_expedited(void)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static int membarrier_private_expedited(int flags)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+ return -EPERM;
+ } else {
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+ }
+
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == current->mm) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+
+ return 0;
+}
+
+static int membarrier_register_global_expedited(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+
+ if (atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+ if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+ /*
+ * For single mm user, single threaded process, we can
+ * simply issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+ * no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+ } else {
+ /*
+ * For multi-mm user threads, we need to ensure all
+ * future scheduler executions will observe the new
+ * thread flag state for this mm.
+ */
+ synchronize_sched();
+ }
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ &mm->membarrier_state);
+
+ return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ }
+
+ /*
+ * We need to consider threads belonging to different thread
+ * groups, which use the same mm. (CLONE_VM but not
+ * CLONE_THREAD).
+ */
+ if ((atomic_read(&mm->membarrier_state) & state) == state)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+ &mm->membarrier_state);
+ if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
+ /*
+ * Ensure all future scheduler executions will observe the
+ * new thread flag state for this process.
+ */
+ synchronize_sched();
+ }
+ atomic_or(state, &mm->membarrier_state);
+
+ return 0;
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_GLOBAL:
+ /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ return membarrier_global_expedited();
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ return membarrier_register_global_expedited();
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ return membarrier_private_expedited(0);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ return membarrier_register_private_expedited(0);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
new file mode 100644
index 000000000..48a126486
--- /dev/null
+++ b/kernel/sched/pelt.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per Entity Load Tracking
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Move PELT related code from fair.c into this pelt.c file
+ * Author: Vincent Guittot <vincent.guittot@linaro.org>
+ */
+
+#include <linux/sched.h>
+#include "sched.h"
+#include "sched-pelt.h"
+#include "pelt.h"
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static u64 decay_load(u64 val, u64 n)
+{
+ unsigned int local_n;
+
+ if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
+}
+
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+ /*
+ * c1 = d1 y^p
+ */
+ c1 = decay_load((u64)d1, periods);
+
+ /*
+ * p-1
+ * c2 = 1024 \Sum y^n
+ * n=1
+ *
+ * inf inf
+ * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ * n=0 n=p
+ */
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+ return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ * d1 d2 d3
+ * ^ ^ ^
+ * | | |
+ * |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ * p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ * n=1
+ *
+ * = u y^p + (Step 1)
+ *
+ * p-1
+ * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
+{
+ unsigned long scale_freq, scale_cpu;
+ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u64 periods;
+
+ scale_freq = arch_scale_freq_capacity(cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ sa->runnable_load_sum =
+ decay_load(sa->runnable_load_sum, periods);
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_load_sum += runnable * contrib;
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
+{
+ u64 delta;
+
+ delta = now - sa->last_update_time;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+
+ sa->last_update_time += delta << 10;
+
+ /*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!load)
+ runnable = running = 0;
+
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ return 0;
+
+ return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
+}
+
+/*
+ * sched_entity:
+ *
+ * task:
+ * se_runnable() == se_weight()
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ * load_sum := runnable_sum
+ * load_avg = se_weight(se) * runnable_avg
+ *
+ * runnable_load_sum := runnable_sum
+ * runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
+ * cfq_rq:
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ *
+ * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ * runnable_load_avg = \Sum se->avg.runable_load_avg
+ */
+
+int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
+}
+
+int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ cfs_rq->curr == se)) {
+
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ cfs_se_util_change(&se->avg);
+ return 1;
+ }
+
+ return 0;
+}
+
+int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->curr != NULL)) {
+
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * rt_rq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_load_sum = load_sum
+ *
+ * load_avg and runnable_load_avg are not supported and meaningless.
+ *
+ */
+
+int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+ running,
+ running,
+ running)) {
+
+ ___update_load_avg(&rq->avg_rt, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * dl_rq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_load_sum = load_sum
+ *
+ */
+
+int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+ running,
+ running,
+ running)) {
+
+ ___update_load_avg(&rq->avg_dl, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+/*
+ * irq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_load_sum = load_sum
+ *
+ */
+
+int update_irq_load_avg(struct rq *rq, u64 running)
+{
+ int ret = 0;
+ /*
+ * We know the time that has been used by interrupt since last update
+ * but we don't when. Let be pessimistic and assume that interrupt has
+ * happened just before the update. This is not so far from reality
+ * because interrupt will most probably wake up task and trig an update
+ * of rq clock during which the metric si updated.
+ * We start to decay with normal context time and then we add the
+ * interrupt context time.
+ * We can safely remove running from rq->clock because
+ * rq->clock += delta with delta >= running
+ */
+ ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+ 0,
+ 0,
+ 0);
+ ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+ 1,
+ 1,
+ 1);
+
+ if (ret)
+ ___update_load_avg(&rq->avg_irq, 1, 1);
+
+ return ret;
+}
+#endif
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
new file mode 100644
index 000000000..7e56b489f
--- /dev/null
+++ b/kernel/sched/pelt.h
@@ -0,0 +1,72 @@
+#ifdef CONFIG_SMP
+
+int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
+int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
+int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+int update_irq_load_avg(struct rq *rq, u64 running);
+#else
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+ return 0;
+}
+#endif
+
+/*
+ * When a task is dequeued, its estimated utilization should not be update if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Avoid store if the flag has been already set */
+ enqueued = avg->util_est.enqueued;
+ if (!(enqueued & UTIL_AVG_UNCHANGED))
+ return;
+
+ /* Reset flag to report util_avg has been updated */
+ enqueued &= ~UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
+#else
+
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int
+update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ return 0;
+}
+
+static inline int
+update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ return 0;
+}
+
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+ return 0;
+}
+#endif
+
+
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
new file mode 100644
index 000000000..70e8cd395
--- /dev/null
+++ b/kernel/sched/rt.c
@@ -0,0 +1,2749 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
+ * policies)
+ */
+#include "sched.h"
+
+#include "pelt.h"
+
+int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+struct rt_bandwidth def_rt_bandwidth;
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+ struct rt_bandwidth *rt_b =
+ container_of(timer, struct rt_bandwidth, rt_period_timer);
+ int idle = 0;
+ int overrun;
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ for (;;) {
+ overrun = hrtimer_forward_now(timer, rt_b->rt_period);
+ if (!overrun)
+ break;
+
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+ idle = do_sched_rt_period_timer(rt_b, overrun);
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ }
+ if (idle)
+ rt_b->rt_period_active = 0;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
+{
+ rt_b->rt_period = ns_to_ktime(period);
+ rt_b->rt_runtime = runtime;
+
+ raw_spin_lock_init(&rt_b->rt_runtime_lock);
+
+ hrtimer_init(&rt_b->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.function = sched_rt_period_timer;
+}
+
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ if (!rt_b->rt_period_active) {
+ rt_b->rt_period_active = 1;
+ /*
+ * SCHED_DEADLINE updates the bandwidth, as a run away
+ * RT task with a DL task could hog a CPU. But DL does
+ * not reset the period. If a deadline task was running
+ * without an RT task running, it can cause RT tasks to
+ * throttle when they start up. Kick the timer right away
+ * to update the period.
+ */
+ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
+ hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ do_start_rt_bandwidth(rt_b);
+}
+
+void init_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+ rt_rq->rt_nr_migratory = 0;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+#endif /* CONFIG_SMP */
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ hrtimer_cancel(&rt_b->rt_period_timer);
+}
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+ for_each_possible_cpu(i) {
+ if (tg->rt_rq)
+ kfree(tg->rt_rq[i]);
+ if (tg->rt_se)
+ kfree(tg->rt_se[i]);
+ }
+
+ kfree(tg->rt_rq);
+ kfree(tg->rt_se);
+}
+
+void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu,
+ struct sched_rt_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
+ rt_rq->tg = tg;
+
+ tg->rt_rq[cpu] = rt_rq;
+ tg->rt_se[cpu] = rt_se;
+
+ if (!rt_se)
+ return;
+
+ if (!parent)
+ rt_se->rt_rq = &rq->rt;
+ else
+ rt_se->rt_rq = parent->my_q;
+
+ rt_se->my_q = rt_rq;
+ rt_se->parent = parent;
+ INIT_LIST_HEAD(&rt_se->run_list);
+}
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct rt_rq *rt_rq;
+ struct sched_rt_entity *rt_se;
+ int i;
+
+ tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
+ if (!tg->rt_rq)
+ goto err;
+ tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
+ if (!tg->rt_se)
+ goto err;
+
+ init_rt_bandwidth(&tg->rt_bandwidth,
+ ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+
+ for_each_possible_cpu(i) {
+ rt_rq = kzalloc_node(sizeof(struct rt_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!rt_rq)
+ goto err;
+
+ rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!rt_se)
+ goto err_free_rq;
+
+ init_rt_rq(rt_rq);
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(rt_rq);
+err:
+ return 0;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+#define rt_entity_is_task(rt_se) (1)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ return &rq->rt;
+}
+
+void free_rt_sched_group(struct task_group *tg) { }
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+
+static void pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ return rq->rt.highest_prio.curr > prev->prio;
+}
+
+static inline int rt_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->rto_count);
+}
+
+static inline void rt_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
+ /*
+ * Make sure the mask is visible before we set
+ * the overload count. That is checked to determine
+ * if we should look at the mask. It would be a shame
+ * if we looked at the mask, but the mask was not
+ * updated yet.
+ *
+ * Matched by the barrier in pull_rt_task().
+ */
+ smp_wmb();
+ atomic_inc(&rq->rd->rto_count);
+}
+
+static inline void rt_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ /* the order here really doesn't matter */
+ atomic_dec(&rq->rd->rto_count);
+ cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
+}
+
+static void update_rt_migration(struct rt_rq *rt_rq)
+{
+ if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+ if (!rt_rq->overloaded) {
+ rt_set_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 1;
+ }
+ } else if (rt_rq->overloaded) {
+ rt_clear_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 0;
+ }
+}
+
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ p = rt_task_of(rt_se);
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total++;
+ if (p->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory++;
+
+ update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ p = rt_task_of(rt_se);
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total--;
+ if (p->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory--;
+
+ update_rt_migration(rt_rq);
+}
+
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+
+static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
+
+static inline void rt_queue_push_tasks(struct rq *rq)
+{
+ if (!has_pushable_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void rt_queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
+}
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_node_init(&p->pushable_tasks, p->prio);
+ plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+ /* Update the highest prio pushable task */
+ if (p->prio < rq->rt.highest_prio.next)
+ rq->rt.highest_prio.next = p->prio;
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+ /* Update the new highest prio pushable task */
+ if (has_pushable_tasks(rq)) {
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+ rq->rt.highest_prio.next = p->prio;
+ } else
+ rq->rt.highest_prio.next = MAX_RT_PRIO;
+}
+
+#else
+
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline void pull_rt_task(struct rq *this_rq)
+{
+}
+
+static inline void rt_queue_push_tasks(struct rq *rq)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return rt_se->on_rq;
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ if (!rt_rq->tg)
+ return RUNTIME_INF;
+
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
+}
+
+typedef struct task_group *rt_rq_iter_t;
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ do {
+ tg = list_entry_rcu(tg->list.next,
+ typeof(struct task_group), list);
+ } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
+
+ if (&tg->list == &task_groups)
+ tg = NULL;
+
+ return tg;
+}
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for (iter = container_of(&task_groups, typeof(*iter), list); \
+ (iter = next_task_group(iter)) && \
+ (rt_rq = iter->rt_rq[cpu_of(rq)]);)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = rt_se->parent)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return rt_se->my_q;
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct sched_rt_entity *rt_se;
+
+ int cpu = cpu_of(rq);
+
+ rt_se = rt_rq->tg->rt_se[cpu];
+
+ if (rt_rq->rt_nr_running) {
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
+ enqueue_rt_entity(rt_se, 0);
+
+ if (rt_rq->highest_prio.curr < curr->prio)
+ resched_curr(rq);
+ }
+}
+
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ struct sched_rt_entity *rt_se;
+ int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+
+ rt_se = rt_rq->tg->rt_se[cpu];
+
+ if (!rt_se) {
+ dequeue_top_rt_rq(rt_rq);
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+ }
+ else if (on_rt_rq(rt_se))
+ dequeue_rt_entity(rt_se, 0);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ struct task_struct *p;
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
+#ifdef CONFIG_SMP
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return this_rq()->rd->span;
+}
+#else
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+#endif
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &rt_rq->tg->rt_bandwidth;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(def_rt_bandwidth.rt_period);
+}
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ dequeue_top_rt_rq(rt_rq);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &def_rt_bandwidth;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
+static void do_balance_runtime(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
+ int i, weight;
+ u64 rt_period;
+
+ weight = cpumask_weight(rd->span);
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ rt_period = ktime_to_ns(rt_b->rt_period);
+ for_each_cpu(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ if (iter == rt_rq)
+ continue;
+
+ raw_spin_lock(&iter->rt_runtime_lock);
+ /*
+ * Either all rqs have inf runtime and there's nothing to steal
+ * or __disable_runtime() below sets a specific rq to inf to
+ * indicate its been disabled and disalow stealing.
+ */
+ if (iter->rt_runtime == RUNTIME_INF)
+ goto next;
+
+ /*
+ * From runqueues with spare time, take 1/n part of their
+ * spare time, but no more than our period.
+ */
+ diff = iter->rt_runtime - iter->rt_time;
+ if (diff > 0) {
+ diff = div_u64((u64)diff, weight);
+ if (rt_rq->rt_runtime + diff > rt_period)
+ diff = rt_period - rt_rq->rt_runtime;
+ iter->rt_runtime -= diff;
+ rt_rq->rt_runtime += diff;
+ if (rt_rq->rt_runtime == rt_period) {
+ raw_spin_unlock(&iter->rt_runtime_lock);
+ break;
+ }
+ }
+next:
+ raw_spin_unlock(&iter->rt_runtime_lock);
+ }
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+/*
+ * Ensure this RQ takes back all the runtime it lend to its neighbours.
+ */
+static void __disable_runtime(struct rq *rq)
+{
+ struct root_domain *rd = rq->rd;
+ rt_rq_iter_t iter;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ for_each_rt_rq(rt_rq, iter, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ s64 want;
+ int i;
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * Either we're all inf and nobody needs to borrow, or we're
+ * already disabled and thus have nothing to do, or we have
+ * exactly the right amount of runtime to take out.
+ */
+ if (rt_rq->rt_runtime == RUNTIME_INF ||
+ rt_rq->rt_runtime == rt_b->rt_runtime)
+ goto balanced;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+
+ /*
+ * Calculate the difference between what we started out with
+ * and what we current have, that's the amount of runtime
+ * we lend and now have to reclaim.
+ */
+ want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+ /*
+ * Greedy reclaim, take back as much as we can.
+ */
+ for_each_cpu(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ /*
+ * Can't reclaim from ourselves or disabled runqueues.
+ */
+ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
+ continue;
+
+ raw_spin_lock(&iter->rt_runtime_lock);
+ if (want > 0) {
+ diff = min_t(s64, iter->rt_runtime, want);
+ iter->rt_runtime -= diff;
+ want -= diff;
+ } else {
+ iter->rt_runtime -= want;
+ want -= want;
+ }
+ raw_spin_unlock(&iter->rt_runtime_lock);
+
+ if (!want)
+ break;
+ }
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We cannot be left wanting - that would mean some runtime
+ * leaked out of the system.
+ */
+ BUG_ON(want);
+balanced:
+ /*
+ * Disable all the borrow logic by pretending we have inf
+ * runtime - in which case borrowing doesn't make sense.
+ */
+ rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ /* Make rt_rq available for pick_next_task() */
+ sched_rt_rq_enqueue(rt_rq);
+ }
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+ rt_rq_iter_t iter;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ /*
+ * Reset each runqueue's bandwidth settings
+ */
+ for_each_rt_rq(rt_rq, iter, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_b->rt_runtime;
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+ }
+}
+
+static void balance_runtime(struct rt_rq *rt_rq)
+{
+ if (!sched_feat(RT_RUNTIME_SHARE))
+ return;
+
+ if (rt_rq->rt_time > rt_rq->rt_runtime) {
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ do_balance_runtime(rt_rq);
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ }
+}
+#else /* !CONFIG_SMP */
+static inline void balance_runtime(struct rt_rq *rt_rq) {}
+#endif /* CONFIG_SMP */
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+{
+ int i, idle = 1, throttled = 0;
+ const struct cpumask *span;
+
+ span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
+ for_each_cpu(i, span) {
+ int enqueue = 0;
+ struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
+ rt_rq->rt_runtime = rt_b->rt_runtime;
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+
+ if (rt_rq->rt_time) {
+ u64 runtime;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ if (rt_rq->rt_throttled)
+ balance_runtime(rt_rq);
+ runtime = rt_rq->rt_runtime;
+ rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+ rt_rq->rt_throttled = 0;
+ enqueue = 1;
+
+ /*
+ * When we're idle and a woken (rt) task is
+ * throttled check_preempt_curr() will set
+ * skip_update and the time between the wakeup
+ * and this unthrottle will get accounted as
+ * 'runtime'.
+ */
+ if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+ rq_clock_cancel_skipupdate(rq);
+ }
+ if (rt_rq->rt_time || rt_rq->rt_nr_running)
+ idle = 0;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ } else if (rt_rq->rt_nr_running) {
+ idle = 0;
+ if (!rt_rq_throttled(rt_rq))
+ enqueue = 1;
+ }
+ if (rt_rq->rt_throttled)
+ throttled = 1;
+
+ if (enqueue)
+ sched_rt_rq_enqueue(rt_rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+ return 1;
+
+ return idle;
+}
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio.curr;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+{
+ u64 runtime = sched_rt_runtime(rt_rq);
+
+ if (rt_rq->rt_throttled)
+ return rt_rq_throttled(rt_rq);
+
+ if (runtime >= sched_rt_period(rt_rq))
+ return 0;
+
+ balance_runtime(rt_rq);
+ runtime = sched_rt_runtime(rt_rq);
+ if (runtime == RUNTIME_INF)
+ return 0;
+
+ if (rt_rq->rt_time > runtime) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ /*
+ * Don't actually throttle groups that have no runtime assigned
+ * but accrue some time due to boosting.
+ */
+ if (likely(rt_b->rt_runtime)) {
+ rt_rq->rt_throttled = 1;
+ printk_deferred_once("sched: RT throttling activated\n");
+ } else {
+ /*
+ * In case we did anyway, make it go away,
+ * replenishment is a joke, since it will replenish us
+ * with exactly 0 ns.
+ */
+ rt_rq->rt_time = 0;
+ }
+
+ if (rt_rq_throttled(rt_rq)) {
+ sched_rt_rq_dequeue(rt_rq);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static void update_curr_rt(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_rt_entity *rt_se = &curr->rt;
+ u64 delta_exec;
+ u64 now;
+
+ if (curr->sched_class != &rt_sched_class)
+ return;
+
+ now = rq_clock_task(rq);
+ delta_exec = now - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = now;
+ cgroup_account_cputime(curr, delta_exec);
+
+ if (!rt_bandwidth_enabled())
+ return;
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
+
+ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_time += delta_exec;
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
+ resched_curr(rq);
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
+ }
+ }
+}
+
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+
+ if (rt_rq_throttled(rt_rq))
+ return;
+
+ if (rt_rq->rt_nr_running) {
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+ }
+
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq, 0);
+}
+
+#if defined CONFIG_SMP
+
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
+ if (rq->online && prio < prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
+}
+
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
+ if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
+}
+
+#else /* CONFIG_SMP */
+
+static inline
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
+
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (prio < prev_prio)
+ rt_rq->highest_prio.curr = prio;
+
+ inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (rt_rq->rt_nr_running) {
+
+ WARN_ON(prio < prev_prio);
+
+ /*
+ * This may have been our highest task, and therefore
+ * we may have some recomputation to do
+ */
+ if (prio == prev_prio) {
+ struct rt_prio_array *array = &rt_rq->active;
+
+ rt_rq->highest_prio.curr =
+ sched_find_first_bit(array->bitmap);
+ }
+
+ } else
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+
+ dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
+unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+ struct task_struct *tsk;
+
+ if (group_rq)
+ return group_rq->rr_nr_running;
+
+ tsk = rt_task_of(rt_se);
+
+ return (tsk->policy == SCHED_RR) ? 1 : 0;
+}
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ int prio = rt_se_prio(rt_se);
+
+ WARN_ON(!rt_prio(prio));
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+ rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
+
+ inc_rt_prio(rt_rq, prio);
+ inc_rt_migration(rt_se, rt_rq);
+ inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ WARN_ON(!rt_rq->rt_nr_running);
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+ rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
+
+ dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ dec_rt_migration(rt_se, rt_rq);
+ dec_rt_group(rt_se, rt_rq);
+}
+
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ return false;
+
+ return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+ list_del_init(&rt_se->run_list);
+
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+ /*
+ * Don't enqueue the group if its throttled, or when empty.
+ * The latter is a consequence of the former when a child group
+ * get throttled and the current group doesn't have any other
+ * active members.
+ */
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+ if (rt_se->on_list)
+ __delist_rt_entity(rt_se, array);
+ return;
+ }
+
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(rt_se->on_list);
+ if (flags & ENQUEUE_HEAD)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+ rt_se->on_list = 1;
+ }
+ rt_se->on_rq = 1;
+
+ inc_rt_tasks(rt_se, rt_rq);
+}
+
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(!rt_se->on_list);
+ __delist_rt_entity(rt_se, array);
+ }
+ rt_se->on_rq = 0;
+
+ dec_rt_tasks(rt_se, rt_rq);
+}
+
+/*
+ * Because the prio of an upper entry depends on the lower
+ * entries, we must remove entries top - down.
+ */
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
+{
+ struct sched_rt_entity *back = NULL;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_se->back = back;
+ back = rt_se;
+ }
+
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
+ for (rt_se = back; rt_se; rt_se = rt_se->back) {
+ if (on_rt_rq(rt_se))
+ __dequeue_rt_entity(rt_se, flags);
+ }
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ dequeue_rt_stack(rt_se, flags);
+ for_each_sched_rt_entity(rt_se)
+ __enqueue_rt_entity(rt_se, flags);
+ enqueue_top_rt_rq(&rq->rt);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ dequeue_rt_stack(rt_se, flags);
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq && rt_rq->rt_nr_running)
+ __enqueue_rt_entity(rt_se, flags);
+ }
+ enqueue_top_rt_rq(&rq->rt);
+}
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ if (flags & ENQUEUE_WAKEUP)
+ rt_se->timeout = 0;
+
+ enqueue_rt_entity(rt_se, flags);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+}
+
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ update_curr_rt(rq);
+ dequeue_rt_entity(rt_se, flags);
+
+ dequeue_pushable_task(rq, p);
+}
+
+/*
+ * Put task to the head or the end of the run list without the overhead of
+ * dequeue followed by enqueue.
+ */
+static void
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
+{
+ if (on_rt_rq(rt_se)) {
+ struct rt_prio_array *array = &rt_rq->active;
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+ if (head)
+ list_move(&rt_se->run_list, queue);
+ else
+ list_move_tail(&rt_se->run_list, queue);
+ }
+}
+
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_rq = rt_rq_of_se(rt_se);
+ requeue_rt_entity(rt_rq, rt_se, head);
+ }
+}
+
+static void yield_task_rt(struct rq *rq)
+{
+ requeue_task_rt(rq, rq->curr, 0);
+}
+
+#ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
+static int
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ struct task_struct *curr;
+ struct rq *rq;
+
+ /* For anything but wake ups, just return the task_cpu */
+ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = READ_ONCE(rq->curr); /* unlocked access */
+
+ /*
+ * If the current task on @p's runqueue is an RT task, then
+ * try to see if we can wake this RT task up on another
+ * runqueue. Otherwise simply start this RT task
+ * on its current runqueue.
+ *
+ * We want to avoid overloading runqueues. If the woken
+ * task is a higher priority, then it will stay on this CPU
+ * and the lower prio task should be moved to another CPU.
+ * Even though this will probably make the lower prio task
+ * lose its cache, we do not want to bounce a higher task
+ * around just because it gave up its CPU, perhaps for a
+ * lock?
+ *
+ * For equal prio tasks, we just let the scheduler sort it out.
+ *
+ * Otherwise, just let it ride on the affined RQ and the
+ * post-schedule router will push the preempted task away
+ *
+ * This test is optimistic, if we get it wrong the load-balancer
+ * will have to sort it out.
+ */
+ if (curr && unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ curr->prio <= p->prio)) {
+ int target = find_lowest_rq(p);
+
+ /*
+ * Don't bother moving it if the destination CPU is
+ * not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ cpu = target;
+ }
+ rcu_read_unlock();
+
+out:
+ return cpu;
+}
+
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ return;
+
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
+ if (p->nr_cpus_allowed != 1
+ && cpupri_find(&rq->rd->cpupri, p, NULL))
+ return;
+
+ /*
+ * There appear to be other CPUs that can accept
+ * the current task but none can run 'p', so lets reschedule
+ * to try and push the current task away:
+ */
+ requeue_task_rt(rq, p, 1);
+ resched_curr(rq);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (p->prio < rq->curr->prio) {
+ resched_curr(rq);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If:
+ *
+ * - the newly woken task is of equal priority to the current task
+ * - the newly woken task is non-migratable while current is migratable
+ * - current will be preempted on the next reschedule
+ *
+ * we should check to see if current can readily move to a different
+ * cpu. If so, we will reschedule to allow the push logic to try
+ * to move current somewhere else, making room for our non-migratable
+ * task.
+ */
+ if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+ check_preempt_equal_prio(rq, p);
+#endif
+}
+
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+ struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *next = NULL;
+ struct list_head *queue;
+ int idx;
+
+ idx = sched_find_first_bit(array->bitmap);
+ BUG_ON(idx >= MAX_RT_PRIO);
+
+ queue = array->queue + idx;
+ next = list_entry(queue->next, struct sched_rt_entity, run_list);
+
+ return next;
+}
+
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
+{
+ struct sched_rt_entity *rt_se;
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ do {
+ rt_se = pick_next_rt_entity(rq, rt_rq);
+ BUG_ON(!rt_se);
+ rt_rq = group_rt_rq(rt_se);
+ } while (rt_rq);
+
+ p = rt_task_of(rt_se);
+ p->se.exec_start = rq_clock_task(rq);
+
+ return p;
+}
+
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (need_pull_rt_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
+ */
+ if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
+ rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }
+
+ /*
+ * We may dequeue prev's rt_rq in put_prev_task().
+ * So, we update time before rt_nr_running check.
+ */
+ if (prev->sched_class == &rt_sched_class)
+ update_curr_rt(rq);
+
+ if (!rt_rq->rt_queued)
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
+
+ rt_queue_push_tasks(rq);
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ return p;
+}
+
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+{
+ update_curr_rt(rq);
+
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+
+ /*
+ * The previous task needs to be made eligible for pushing
+ * if it is still active
+ */
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Return the highest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise
+ */
+static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+{
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ plist_for_each_entry(p, head, pushable_tasks) {
+ if (pick_rt_task(rq, p, cpu))
+ return p;
+ }
+
+ return NULL;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+
+static int find_lowest_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
+ int this_cpu = smp_processor_id();
+ int cpu = task_cpu(task);
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return -1;
+
+ if (task->nr_cpus_allowed == 1)
+ return -1; /* No other targets possible */
+
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return -1; /* No targets found */
+
+ /*
+ * At this point we have built a mask of CPUs representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ *
+ * We prioritize the last CPU that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+ if (cpumask_test_cpu(cpu, lowest_mask))
+ return cpu;
+
+ /*
+ * Otherwise, we consult the sched_domains span maps to figure
+ * out which CPU is logically closest to our hot cache data.
+ */
+ if (!cpumask_test_cpu(this_cpu, lowest_mask))
+ this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
+
+ /*
+ * "this_cpu" is cheaper to preempt than a
+ * remote processor.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ best_cpu = cpumask_first_and(lowest_mask,
+ sched_domain_span(sd));
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * And finally, if there were no matches within the domains
+ * just give the caller *something* to work with from the compatible
+ * locations.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any(lowest_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ return -1;
+}
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *lowest_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+ cpu = find_lowest_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ lowest_rq = cpu_rq(cpu);
+
+ if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rq = NULL;
+ break;
+ }
+
+ /* if the prio of this runqueue changed, try again */
+ if (double_lock_balance(rq, lowest_rq)) {
+ /*
+ * We had to unlock the run queue. In
+ * the mean time, task could have
+ * migrated already or had its affinity changed.
+ * Also make sure that it wasn't scheduled on its rq.
+ */
+ if (unlikely(task_rq(task) != rq ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+ task_running(rq, task) ||
+ !rt_task(task) ||
+ !task_on_rq_queued(task))) {
+
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rq = NULL;
+ break;
+ }
+ }
+
+ /* If this rq is still suitable use it. */
+ if (lowest_rq->rt.highest_prio.curr > task->prio)
+ break;
+
+ /* try again */
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rq = NULL;
+ }
+
+ return lowest_rq;
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *lowest_rq;
+ int ret = 0;
+
+ if (!rq->rt.overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_task(rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * It's possible that the next_task slipped in of
+ * higher priority than current. If that's the case
+ * just reschedule current.
+ */
+ if (unlikely(next_task->prio < rq->curr->prio)) {
+ resched_curr(rq);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* find_lock_lowest_rq locks the rq if found */
+ lowest_rq = find_lock_lowest_rq(next_task, rq);
+ if (!lowest_rq) {
+ struct task_struct *task;
+ /*
+ * find_lock_lowest_rq releases rq->lock
+ * so it is possible that next_task has migrated.
+ *
+ * We need to make sure that the task is still on the same
+ * run-queue and is also still the next task eligible for
+ * pushing.
+ */
+ task = pick_next_pushable_task(rq);
+ if (task == next_task) {
+ /*
+ * The task hasn't migrated, and is still the next
+ * eligible task, but we failed to find a run-queue
+ * to push it to. Do not retry in this case, since
+ * other CPUs will pull from us when ready.
+ */
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks, just exit */
+ goto out;
+
+ /*
+ * Something has shifted, try again.
+ */
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, lowest_rq->cpu);
+ activate_task(lowest_rq, next_task, 0);
+ ret = 1;
+
+ resched_curr(lowest_rq);
+
+ double_unlock_balance(rq, lowest_rq);
+
+out:
+ put_task_struct(next_task);
+
+ return ret;
+}
+
+static void push_rt_tasks(struct rq *rq)
+{
+ /* push_rt_task will return true if it moved an RT */
+ while (push_rt_task(rq))
+ ;
+}
+
+#ifdef HAVE_RT_PUSH_IPI
+
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
+ *
+ */
+static int rto_next_cpu(struct root_domain *rd)
+{
+ int next;
+ int cpu;
+
+ /*
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rt_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
+ */
+ for (;;) {
+
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
+
+ rd->rto_cpu = cpu;
+
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ rd->rto_cpu = -1;
+
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
+
+ if (rd->rto_loop == next)
+ break;
+
+ rd->rto_loop = next;
+ }
+
+ return -1;
+}
+
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
+
+static inline void rto_start_unlock(atomic_t *v)
+{
+ atomic_set_release(v, 0);
+}
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
+
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
+
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
+ return;
+
+ raw_spin_lock(&rq->rd->rto_lock);
+
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid CPU
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq->rd);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
+}
+
+/* Called from hardirq context */
+void rto_push_irq_work_func(struct irq_work *work)
+{
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
+ struct rq *rq;
+ int cpu;
+
+ rq = this_rq();
+
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+ push_rt_tasks(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ raw_spin_lock(&rd->rto_lock);
+
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rd);
+
+ raw_spin_unlock(&rd->rto_lock);
+
+ if (cpu < 0) {
+ sched_put_rd(rd);
+ return;
+ }
+
+ /* Try the next RT overloaded CPU */
+ irq_work_queue_on(&rd->rto_push_work, cpu);
+}
+#endif /* HAVE_RT_PUSH_IPI */
+
+static void pull_rt_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
+ struct task_struct *p;
+ struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
+
+ if (likely(!rt_overload_count))
+ return;
+
+ /*
+ * Match the barrier from rt_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the rto_mask bit.
+ */
+ smp_rmb();
+
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
+#ifdef HAVE_RT_PUSH_IPI
+ if (sched_feat(RT_PUSH_IPI)) {
+ tell_cpu_to_push(this_rq);
+ return;
+ }
+#endif
+
+ for_each_cpu(cpu, this_rq->rd->rto_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+
+ /*
+ * Don't bother taking the src_rq->lock if the next highest
+ * task is known to be lower-priority than our current task.
+ * This may look racy, but if this value is about to go
+ * logically higher, the src_rq will push this task away.
+ * And if its going logically lower, we do not care
+ */
+ if (src_rq->rt.highest_prio.next >=
+ this_rq->rt.highest_prio.curr)
+ continue;
+
+ /*
+ * We can potentially drop this_rq's lock in
+ * double_lock_balance, and another CPU could
+ * alter this_rq
+ */
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * We can pull only a task, which is pushable
+ * on its rq, and no others.
+ */
+ p = pick_highest_pushable_task(src_rq, this_cpu);
+
+ /*
+ * Do we have an RT task that preempts
+ * the to-be-scheduled task?
+ */
+ if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!task_on_rq_queued(p));
+
+ /*
+ * There's a chance that p is higher in priority
+ * than what's currently running on its CPU.
+ * This is just that p is wakeing up and hasn't
+ * had a chance to schedule. We only pull
+ * p if it is lower in priority than the
+ * current task on the run queue
+ */
+ if (p->prio < src_rq->curr->prio)
+ goto skip;
+
+ resched = true;
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ /*
+ * We continue with the search, just in
+ * case there's an even higher prio task
+ * in another runqueue. (low likelihood
+ * but possible)
+ */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ if (resched)
+ resched_curr(this_rq);
+}
+
+/*
+ * If we are not running and we are not going to reschedule soon, we should
+ * try to push tasks away now
+ */
+static void task_woken_rt(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio))
+ push_rt_tasks(rq);
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_set_overload(rq);
+
+ __enable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_clear_overload(rq);
+
+ __disable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * If there are other RT tasks then we will reschedule
+ * and the scheduling of the other RT tasks will handle
+ * the balancing. But if we are the last RT task
+ * we may need to handle the pulling of RT tasks
+ * now.
+ */
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ return;
+
+ rt_queue_pull_task(rq);
+}
+
+void __init init_sched_rt_class(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+ GFP_KERNEL, cpu_to_node(i));
+ }
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * If we are already running, then there's nothing
+ * that needs to be done. But if we are not running
+ * we may need to preempt the current running task.
+ * If that current running task is also an RT task
+ * then see if we can move to another run queue.
+ */
+ if (task_on_rq_queued(p) && rq->curr != p) {
+#ifdef CONFIG_SMP
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ rt_queue_push_tasks(rq);
+#endif /* CONFIG_SMP */
+ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
+ resched_curr(rq);
+ }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ if (!task_on_rq_queued(p))
+ return;
+
+ if (rq->curr == p) {
+#ifdef CONFIG_SMP
+ /*
+ * If our priority decreases while running, we
+ * may need to pull tasks to this runqueue.
+ */
+ if (oldprio < p->prio)
+ rt_queue_pull_task(rq);
+
+ /*
+ * If there's a higher priority task waiting to run
+ * then reschedule.
+ */
+ if (p->prio > rq->rt.highest_prio.curr)
+ resched_curr(rq);
+#else
+ /* For UP simply resched on drop of prio */
+ if (oldprio < p->prio)
+ resched_curr(rq);
+#endif /* CONFIG_SMP */
+ } else {
+ /*
+ * This task is not running, but if it is
+ * greater than the current running task
+ * then reschedule.
+ */
+ if (p->prio < rq->curr->prio)
+ resched_curr(rq);
+ }
+}
+
+#ifdef CONFIG_POSIX_TIMERS
+static void watchdog(struct rq *rq, struct task_struct *p)
+{
+ unsigned long soft, hard;
+
+ /* max may change after cur was read, this will be fixed next tick */
+ soft = task_rlimit(p, RLIMIT_RTTIME);
+ hard = task_rlimit_max(p, RLIMIT_RTTIME);
+
+ if (soft != RLIM_INFINITY) {
+ unsigned long next;
+
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
+ next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
+ if (p->rt.timeout > next)
+ p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ }
+}
+#else
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ update_curr_rt(rq);
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+
+ watchdog(rq, p);
+
+ /*
+ * RR tasks need a special form of timeslice management.
+ * FIFO tasks have no timeslices.
+ */
+ if (p->policy != SCHED_RR)
+ return;
+
+ if (--p->rt.time_slice)
+ return;
+
+ p->rt.time_slice = sched_rr_timeslice;
+
+ /*
+ * Requeue to the end of queue if we (and all of our ancestors) are not
+ * the only element on the queue
+ */
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ resched_curr(rq);
+ return;
+ }
+ }
+}
+
+static void set_curr_task_rt(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
+}
+
+static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
+{
+ /*
+ * Time slice is 0 for SCHED_FIFO tasks
+ */
+ if (task->policy == SCHED_RR)
+ return sched_rr_timeslice;
+ else
+ return 0;
+}
+
+const struct sched_class rt_sched_class = {
+ .next = &fair_sched_class,
+ .enqueue_task = enqueue_task_rt,
+ .dequeue_task = dequeue_task_rt,
+ .yield_task = yield_task_rt,
+
+ .check_preempt_curr = check_preempt_curr_rt,
+
+ .pick_next_task = pick_next_task_rt,
+ .put_prev_task = put_prev_task_rt,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_rt,
+
+ .set_cpus_allowed = set_cpus_allowed_common,
+ .rq_online = rq_online_rt,
+ .rq_offline = rq_offline_rt,
+ .task_woken = task_woken_rt,
+ .switched_from = switched_from_rt,
+#endif
+
+ .set_curr_task = set_curr_task_rt,
+ .task_tick = task_tick_rt,
+
+ .get_rr_interval = get_rr_interval_rt,
+
+ .prio_changed = prio_changed_rt,
+ .switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
+};
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
+ for_each_process_thread(g, p) {
+ if (rt_task(p) && task_group(p) == tg)
+ return 1;
+ }
+
+ return 0;
+}
+
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+};
+
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
+ }
+
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+ int ret;
+
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
+{
+ int i, err = 0;
+
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
+ goto unlock;
+
+ raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+ else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ if (rt_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ rt_period = rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+ int ret = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = global_rt_runtime();
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+ return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+}
+
+int sched_rt_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_rt_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /*
+ * Make sure that internally we keep jiffies.
+ * Also, writing zero resets the timeslice to default:
+ */
+ if (!ret && write) {
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+void print_rt_stats(struct seq_file *m, int cpu)
+{
+ rt_rq_iter_t iter;
+ struct rt_rq *rt_rq;
+
+ rcu_read_lock();
+ for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
+ print_rt_rq(m, cpu, rt_rq);
+ rcu_read_unlock();
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000..c529706be
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+
+static const u32 runnable_avg_yN_inv[] __maybe_unused = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000..55e695080
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,2248 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler internal types and methods:
+ */
+#include <linux/sched.h>
+
+#include <linux/sched/autogroup.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/jobctl.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/prio.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/user.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/xacct.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include <linux/binfmts.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/kprobes.h>
+#include <linux/kthread.h>
+#include <linux/membarrier.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/nmi.h>
+#include <linux/proc_fs.h>
+#include <linux/prefetch.h>
+#include <linux/profile.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/security.h>
+#include <linux/stackprotector.h>
+#include <linux/stop_machine.h>
+#include <linux/suspend.h>
+#include <linux/swait.h>
+#include <linux/syscalls.h>
+#include <linux/task_work.h>
+#include <linux/tsacct_kern.h>
+
+#include <asm/tlb.h>
+
+#ifdef CONFIG_PARAVIRT
+# include <asm/paravirt.h>
+#endif
+
+#include "cpupri.h"
+#include "cpudeadline.h"
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+#else
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
+#endif
+
+struct rq;
+struct cpuidle_state;
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
+
+extern __read_mostly int scheduler_running;
+
+extern unsigned long calc_load_update;
+extern atomic_long_t calc_load_tasks;
+
+extern void calc_global_load_tick(struct rq *this_rq);
+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
+
+#ifdef CONFIG_SMP
+extern void cpu_load_update_active(struct rq *this_rq);
+#else
+static inline void cpu_load_update_active(struct rq *this_rq) { }
+#endif
+
+/*
+ * Helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
+ * are pretty high and the returns do not justify the increased costs.
+ *
+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
+ * increase coverage and consistency always enable it on 64-bit platforms.
+ */
+#ifdef CONFIG_64BIT
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
+#else
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w) (w)
+# define scale_load_down(w) (w)
+#endif
+
+/*
+ * Task weight (visible to users) and its load (invisible to users) have
+ * independent resolution, but they should be well calibrated. We use
+ * scale_load() and scale_load_down(w) to convert between them. The
+ * following must be true:
+ *
+ * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ *
+ */
+#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
+
+/*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9 -> just above 0.5us
+ */
+#define DL_SCALE 10
+
+/*
+ * Single value that denotes runtime == period, ie unlimited time.
+ */
+#define RUNTIME_INF ((u64)~0ULL)
+
+static inline int idle_policy(int policy)
+{
+ return policy == SCHED_IDLE;
+}
+static inline int fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
+static inline int rt_policy(int policy)
+{
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
+
+static inline int dl_policy(int policy)
+{
+ return policy == SCHED_DEADLINE;
+}
+static inline bool valid_policy(int policy)
+{
+ return idle_policy(policy) || fair_policy(policy) ||
+ rt_policy(policy) || dl_policy(policy);
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+ return rt_policy(p->policy);
+}
+
+static inline int task_has_dl_policy(struct task_struct *p)
+{
+ return dl_policy(p->policy);
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ *
+ * SUGOV stands for SchedUtil GOVernor.
+ */
+#define SCHED_FLAG_SUGOV 0x10000000
+
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
+static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+{
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
+#else
+ return false;
+#endif
+}
+
+/*
+ * Tells if entity @a should preempt entity @b.
+ */
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+{
+ return dl_entity_is_special(a) ||
+ dl_time_before(a->deadline, b->deadline);
+}
+
+/*
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ struct list_head queue[MAX_RT_PRIO];
+};
+
+struct rt_bandwidth {
+ /* nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
+ ktime_t rt_period;
+ u64 rt_runtime;
+ struct hrtimer rt_period_timer;
+ unsigned int rt_period_active;
+};
+
+void __dl_clear_params(struct task_struct *p);
+
+struct dl_bandwidth {
+ raw_spinlock_t dl_runtime_lock;
+ u64 dl_runtime;
+ u64 dl_period;
+};
+
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+/*
+ * To keep the bandwidth of -deadline tasks under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, bandwidth is given on a per root domain basis,
+ * meaning that:
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
+ */
+struct dl_bw {
+ raw_spinlock_t lock;
+ u64 bw;
+ u64 total_bw;
+};
+
+static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
+
+static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
+extern void init_dl_bw(struct dl_bw *dl_b);
+extern int sched_dl_global_validate(void);
+extern void sched_dl_do_global(void);
+extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
+extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern bool __checkparam_dl(const struct sched_attr *attr);
+extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
+extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
+extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
+extern bool dl_cpu_busy(unsigned int cpu);
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#include <linux/cgroup.h>
+
+struct cfs_rq;
+struct rt_rq;
+
+extern struct list_head task_groups;
+
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota;
+ u64 runtime;
+ s64 hierarchical_quota;
+
+ short idle;
+ short period_active;
+ struct hrtimer period_timer;
+ struct hrtimer slack_timer;
+ struct list_head throttled_cfs_rq;
+
+ /* Statistics: */
+ int nr_periods;
+ int nr_throttled;
+ u64 throttled_time;
+
+ bool distribute_running;
+#endif
+};
+
+/* Task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each CPU */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each CPU */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+
+#ifdef CONFIG_SMP
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+ * it in its own cacheline separated from the fields above which
+ * will also be accessed at each tick.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
+#endif
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
+
+ struct rt_bandwidth rt_bandwidth;
+#endif
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+
+ struct cfs_bandwidth cfs_bandwidth;
+};
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
+
+/*
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * A weight of a cfs_rq is the sum of weights of which entities
+ * are queued on this cfs_rq, so a weight of a entity should not be
+ * too large, so as the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
+#endif
+
+typedef int (*tg_visitor)(struct task_group *, void *);
+
+extern int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+ return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
+extern int tg_nop(struct task_group *tg, void *data);
+
+extern void free_fair_sched_group(struct task_group *tg);
+extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void online_fair_sched_group(struct task_group *tg);
+extern void unregister_fair_sched_group(struct task_group *tg);
+extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+
+extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu,
+ struct sched_rt_entity *parent);
+extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
+extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
+extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
+
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#else /* CONFIG_CGROUP_SCHED */
+
+struct cfs_bandwidth { };
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+ struct load_weight load;
+ unsigned long runnable_weight;
+ unsigned int nr_running;
+ unsigned int h_nr_running;
+
+ u64 exec_clock;
+ u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
+
+ struct rb_root_cached tasks_timeline;
+
+ /*
+ * 'curr' points to currently running entity on this cfs_rq.
+ * It is set to NULL otherwise (i.e when none are currently running).
+ */
+ struct sched_entity *curr;
+ struct sched_entity *next;
+ struct sched_entity *last;
+ struct sched_entity *skip;
+
+#ifdef CONFIG_SCHED_DEBUG
+ unsigned int nr_spread_over;
+#endif
+
+#ifdef CONFIG_SMP
+ /*
+ * CFS load tracking
+ */
+ struct sched_avg avg;
+#ifndef CONFIG_64BIT
+ u64 load_last_update_time_copy;
+#endif
+ struct {
+ raw_spinlock_t lock ____cacheline_aligned;
+ int nr;
+ unsigned long load_avg;
+ unsigned long util_avg;
+ unsigned long runnable_sum;
+ } removed;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
+
+ /*
+ * h_load = weight * f(tg)
+ *
+ * Where f(tg) is the recursive weight fraction assigned to
+ * this group.
+ */
+ unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
+
+ /*
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
+ *
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
+ * This list is used during load balance.
+ */
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ int runtime_enabled;
+ s64 runtime_remaining;
+
+ u64 throttled_clock;
+ u64 throttled_clock_task;
+ u64 throttled_clock_task_time;
+ int throttled;
+ int throttle_count;
+ struct list_head throttled_list;
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+};
+
+static inline int rt_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+/* RT IPI pull logic requires IRQ_WORK */
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
+# define HAVE_RT_PUSH_IPI
+#endif
+
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+ struct rt_prio_array active;
+ unsigned int rt_nr_running;
+ unsigned int rr_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ struct {
+ int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+ int next; /* next highest */
+#endif
+ } highest_prio;
+#endif
+#ifdef CONFIG_SMP
+ unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
+ int overloaded;
+ struct plist_head pushable_tasks;
+
+#endif /* CONFIG_SMP */
+ int rt_queued;
+
+ int rt_throttled;
+ u64 rt_time;
+ u64 rt_runtime;
+ /* Nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
+ struct rq *rq;
+ struct task_group *tg;
+#endif
+};
+
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
+/* Deadline class' related fields in a runqueue */
+struct dl_rq {
+ /* runqueue is an rbtree, ordered by deadline */
+ struct rb_root_cached root;
+
+ unsigned long dl_nr_running;
+
+#ifdef CONFIG_SMP
+ /*
+ * Deadline values of the currently executing and the
+ * earliest ready task on this rq. Caching these facilitates
+ * the decision wether or not a ready but not running task
+ * should migrate somewhere else.
+ */
+ struct {
+ u64 curr;
+ u64 next;
+ } earliest_dl;
+
+ unsigned long dl_nr_migratory;
+ int overloaded;
+
+ /*
+ * Tasks on this rq that can be pushed away. They are kept in
+ * an rb-tree, ordered by tasks' deadlines, with caching
+ * of the leftmost (earliest deadline) element.
+ */
+ struct rb_root_cached pushable_dl_tasks_root;
+#else
+ struct dl_bw dl_bw;
+#endif
+ /*
+ * "Active utilization" for this runqueue: increased when a
+ * task wakes up (becomes TASK_RUNNING) and decreased when a
+ * task blocks
+ */
+ u64 running_bw;
+
+ /*
+ * Utilization of the tasks "assigned" to this runqueue (including
+ * the tasks that are in runqueue and the tasks that executed on this
+ * CPU and blocked). Increased when a task moves to this runqueue, and
+ * decreased when the task moves away (migrates, changes scheduling
+ * policy, or terminates).
+ * This is needed to compute the "inactive utilization" for the
+ * runqueue (inactive utilization = this_bw - running_bw).
+ */
+ u64 this_bw;
+ u64 extra_bw;
+
+ /*
+ * Inverse of the fraction of CPU utilization that can be reclaimed
+ * by the GRUB algorithm.
+ */
+ u64 bw_ratio;
+};
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se) (!se->my_q)
+#else
+#define entity_is_task(se) 1
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return scale_load_down(se->runnable_weight);
+}
+
+static inline bool sched_asym_prefer(int a, int b)
+{
+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
+}
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member CPUs from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
+
+ /* Indicate more than one runnable task for any CPU */
+ bool overload;
+
+ /*
+ * The bit corresponding to a CPU gets set here if such CPU has more
+ * than one runnable -deadline task (as it is below for RT tasks).
+ */
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
+
+ unsigned long max_cpu_capacity;
+};
+
+extern struct root_domain def_root_domain;
+extern struct mutex sched_domains_mutex;
+
+extern void init_defrootdomain(void);
+extern int sched_init_domains(const struct cpumask *cpu_map);
+extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
+
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
+#endif /* CONFIG_SMP */
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct rq {
+ /* runqueue lock: */
+ raw_spinlock_t lock;
+
+ /*
+ * nr_running and cpu_load should be in the same cacheline because
+ * remote CPUs use both these fields when doing load calculation.
+ */
+ unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+ unsigned int numa_migrate_on;
+#endif
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+#ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
+ unsigned long last_load_update_tick;
+ unsigned long last_blocked_load_update_tick;
+ unsigned int has_blocked_load;
+#endif /* CONFIG_SMP */
+ unsigned int nohz_tick_stopped;
+ atomic_t nohz_flags;
+#endif /* CONFIG_NO_HZ_COMMON */
+
+ /* capture load from *all* tasks on this CPU: */
+ struct load_weight load;
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
+ struct dl_rq dl;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* list of leaf cfs_rq on this CPU: */
+ struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ /*
+ * This is part of a global counter where only the total sum
+ * over all CPUs matters. A task can increase this counter on
+ * one CPU and if it got migrated afterwards it may decrease
+ * it on another CPU. Always updated under the runqueue lock:
+ */
+ unsigned long nr_uninterruptible;
+
+ struct task_struct *curr;
+ struct task_struct *idle;
+ struct task_struct *stop;
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
+
+ unsigned int clock_update_flags;
+ u64 clock;
+ u64 clock_task;
+
+ atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+ struct root_domain *rd;
+ struct sched_domain *sd;
+
+ unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
+
+ struct callback_head *balance_callback;
+
+ unsigned char idle_balance;
+
+ /* For active balancing */
+ int active_balance;
+ int push_cpu;
+ struct cpu_stop_work active_balance_work;
+
+ /* CPU of this runqueue: */
+ int cpu;
+ int online;
+
+ struct list_head cfs_tasks;
+
+ struct sched_avg avg_rt;
+ struct sched_avg avg_dl;
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ struct sched_avg avg_irq;
+#endif
+ u64 idle_stamp;
+ u64 avg_idle;
+
+ /* This is used to determine avg_idle's max value */
+ u64 max_idle_balance_cost;
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
+
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+ int hrtick_csd_pending;
+ call_single_data_t hrtick_csd;
+#endif
+ struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+ /* latency stats */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+#endif
+
+#ifdef CONFIG_SMP
+ struct llist_head wake_list;
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within a rcu lock section */
+ struct cpuidle_state *idle_state;
+#endif
+};
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq->cpu;
+#else
+ return 0;
+#endif
+}
+
+
+#ifdef CONFIG_SCHED_SMT
+extern void __update_idle_core(struct rq *rq);
+
+static inline void update_idle_core(struct rq *rq)
+{
+ if (static_branch_unlikely(&sched_smt_present))
+ __update_idle_core(rq);
+}
+
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() this_cpu_ptr(&runqueues)
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return READ_ONCE(rq->clock);
+}
+
+/*
+ * rq::clock_update_flags bits
+ *
+ * %RQCF_REQ_SKIP - will request skipping of clock update on the next
+ * call to __schedule(). This is an optimisation to avoid
+ * neighbouring rq clock updates.
+ *
+ * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
+ * in effect and calls to update_rq_clock() are being ignored.
+ *
+ * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
+ * made to update_rq_clock() since the last time rq::lock was pinned.
+ *
+ * If inside of __schedule(), clock_update_flags will have been
+ * shifted left (a left shift is a cheap operation for the fast path
+ * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
+ *
+ * if (rq-clock_update_flags >= RQCF_UPDATED)
+ *
+ * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * one position though, because the next rq_unpin_lock() will shift it
+ * back.
+ */
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+#define RQCF_UPDATED 0x04
+
+static inline void assert_clock_updated(struct rq *rq)
+{
+ /*
+ * The only reason for not seeing a clock update since the
+ * last rq_pin_lock() is if we're currently skipping updates.
+ */
+ SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
+ return rq->clock_task;
+}
+
+static inline void rq_clock_skip_update(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
+}
+
+/*
+ * See rt task throttling, which is the only time a skip
+ * request is cancelled.
+ */
+static inline void rq_clock_cancel_skipupdate(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+}
+
+struct rq_flags {
+ unsigned long flags;
+ struct pin_cookie cookie;
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
+ * current pin context is stashed here in case it needs to be
+ * restored in rq_repin_lock().
+ */
+ unsigned int clock_update_flags;
+#endif
+};
+
+static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rf->clock_update_flags = 0;
+#endif
+}
+
+static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ if (rq->clock_update_flags > RQCF_ACT_SKIP)
+ rf->clock_update_flags = RQCF_UPDATED;
+#endif
+
+ lockdep_unpin_lock(&rq->lock, rf->cookie);
+}
+
+static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ lockdep_repin_lock(&rq->lock, rf->cookie);
+
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * Restore the value we stashed in @rf for this pin context.
+ */
+ rq->clock_update_flags |= rf->clock_update_flags;
+#endif
+}
+
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
+#ifdef CONFIG_NUMA
+extern void sched_init_numa(void);
+extern void sched_domains_numa_masks_set(unsigned int cpu);
+extern void sched_domains_numa_masks_clear(unsigned int cpu);
+#else
+static inline void sched_init_numa(void) { }
+static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
+extern void sched_setnuma(struct task_struct *p, int node);
+extern int migrate_task_to(struct task_struct *p, int cpu);
+extern int migrate_swap(struct task_struct *p, struct task_struct *t,
+ int cpu, int scpu);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_SMP
+
+static inline void
+queue_balance_callback(struct rq *rq,
+ struct callback_head *head,
+ void (*func)(struct rq *rq))
+{
+ lockdep_assert_held(&rq->lock);
+
+ if (unlikely(head->next))
+ return;
+
+ head->func = (void (*)(struct callback_head *))func;
+ head->next = rq->balance_callback;
+ rq->balance_callback = head;
+}
+
+extern void sched_ttwu_pending(void);
+
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ __sd; __sd = __sd->parent)
+
+#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
+
+/**
+ * highest_flag_domain - Return highest sched_domain containing flag.
+ * @cpu: The CPU whose highest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the highest sched_domain
+ * for the given CPU.
+ *
+ * Returns the highest sched_domain of a CPU which contains the given flag.
+ */
+static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd, *hsd = NULL;
+
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & flag))
+ break;
+ hsd = sd;
+ }
+
+ return hsd;
+}
+
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
+DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
+DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+
+struct sched_group_capacity {
+ atomic_t ref;
+ /*
+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
+ * for a single CPU.
+ */
+ unsigned long capacity;
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
+
+#ifdef CONFIG_SCHED_DEBUG
+ int id;
+#endif
+
+ unsigned long cpumask[0]; /* Balance mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_capacity *sgc;
+ int asym_prefer_cpu; /* CPU of highest priority in group */
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * See build_balance_mask().
+ */
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgc->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
+ * @group: The group whose first CPU is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_span(group));
+}
+
+extern int group_balance_cpu(struct sched_group *sg);
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
+#endif /* CONFIG_SMP */
+
+#include "stats.h"
+#include "autogroup.h"
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return p->sched_task_group;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
+ struct task_group *tg = task_group(p);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
+ p->se.cfs_rq = tg->cfs_rq[cpu];
+ p->se.parent = tg->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ p->rt.rt_rq = tg->rt_rq[cpu];
+ p->rt.parent = tg->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+ /*
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfuly executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ WRITE_ONCE(p->cpu, cpu);
+#else
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
+#endif
+ p->wake_cpu = cpu;
+#endif
+}
+
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/static_key.h>
+# define const_debug __read_mostly
+#else
+# define const_debug const
+#endif
+
+#define SCHED_FEAT(name, enabled) \
+ __SCHED_FEAT_##name ,
+
+enum {
+#include "features.h"
+ __SCHED_FEAT_NR,
+};
+
+#undef SCHED_FEAT
+
+#ifdef CONFIG_SCHED_DEBUG
+
+/*
+ * To support run-time toggling of sched features, all the translation units
+ * (but core.c) reference the sysctl_sched_features defined in core.c.
+ */
+extern const_debug unsigned int sysctl_sched_features;
+
+#ifdef CONFIG_JUMP_LABEL
+#define SCHED_FEAT(name, enabled) \
+static __always_inline bool static_branch_##name(struct static_key *key) \
+{ \
+ return static_key_##enabled(key); \
+}
+
+#include "features.h"
+#undef SCHED_FEAT
+
+extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
+#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+
+#else /* !CONFIG_JUMP_LABEL */
+
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+#endif /* CONFIG_JUMP_LABEL */
+
+#else /* !SCHED_DEBUG */
+
+/*
+ * Each translation unit has its own copy of sysctl_sched_features to allow
+ * constants propagation at compile time and compiler optimization based on
+ * features default.
+ */
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+static const_debug __maybe_unused unsigned int sysctl_sched_features =
+#include "features.h"
+ 0;
+#undef SCHED_FEAT
+
+#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+#endif /* SCHED_DEBUG */
+
+extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
+
+static inline u64 global_rt_period(void)
+{
+ return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_rt_runtime(void)
+{
+ if (sysctl_sched_rt_runtime < 0)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+}
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
+ return task_current(rq, p);
+#endif
+}
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
+}
+
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* Child wakeup after fork */
+#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
+
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+#define WEIGHT_IDLEPRIO 3
+#define WMULT_IDLEPRIO 1431655765
+
+extern const int sched_prio_to_weight[40];
+extern const u32 sched_prio_to_wmult[40];
+
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+#define ENQUEUE_NOCLOCK 0x08
+
+#define ENQUEUE_HEAD 0x10
+#define ENQUEUE_REPLENISH 0x20
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED 0x40
+#else
+#define ENQUEUE_MIGRATED 0x00
+#endif
+
+#define RETRY_TASK ((void *)-1UL)
+
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+
+ void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
+ */
+ struct task_struct * (*pick_next_task)(struct rq *rq,
+ struct task_struct *prev,
+ struct rq_flags *rf);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+
+ void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+
+ void (*set_cpus_allowed)(struct task_struct *p,
+ const struct cpumask *newmask);
+
+ void (*rq_online)(struct rq *rq);
+ void (*rq_offline)(struct rq *rq);
+#endif
+
+ void (*set_curr_task)(struct rq *rq);
+ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ void (*task_fork)(struct task_struct *p);
+ void (*task_dead)(struct task_struct *p);
+
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serliazed by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
+ void (*switched_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ int oldprio);
+
+ unsigned int (*get_rr_interval)(struct rq *rq,
+ struct task_struct *task);
+
+ void (*update_curr)(struct rq *rq);
+
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ void (*task_change_group)(struct task_struct *p, int type);
+#endif
+};
+
+static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
+static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+{
+ curr->sched_class->set_curr_task(rq);
+}
+
+#ifdef CONFIG_SMP
+#define sched_class_highest (&stop_sched_class)
+#else
+#define sched_class_highest (&dl_sched_class)
+#endif
+#define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
+
+extern const struct sched_class stop_sched_class;
+extern const struct sched_class dl_sched_class;
+extern const struct sched_class rt_sched_class;
+extern const struct sched_class fair_sched_class;
+extern const struct sched_class idle_sched_class;
+
+
+#ifdef CONFIG_SMP
+
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
+
+extern void trigger_load_balance(struct rq *rq);
+
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ SCHED_WARN_ON(!rcu_read_lock_held());
+
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+
+extern void schedule_idle(void);
+
+extern void sysrq_sched_debug_show(void);
+extern void sched_init_granularity(void);
+extern void update_max_interval(void);
+
+extern void init_sched_dl_class(void);
+extern void init_sched_rt_class(void);
+extern void init_sched_fair_class(void);
+
+extern void reweight_task(struct task_struct *p, int prio);
+
+extern void resched_curr(struct rq *rq);
+extern void resched_cpu(int cpu);
+
+extern struct rt_bandwidth def_rt_bandwidth;
+extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
+extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
+
+#define BW_SHIFT 20
+#define BW_UNIT (1 << BW_SHIFT)
+#define RATIO_SHIFT 8
+unsigned long to_ratio(u64 period, u64 runtime);
+
+extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
+
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
+
+/*
+ * Tick may be needed by tasks in the runqueue depending on their policy and
+ * requirements. If tick is needed, lets send the target an IPI to kick it out of
+ * nohz mode if necessary.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+ int cpu;
+
+ if (!tick_nohz_full_enabled())
+ return;
+
+ cpu = cpu_of(rq);
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (sched_can_stop_tick(rq))
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+ else
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else
+static inline int sched_tick_offload_init(void) { return 0; }
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+ unsigned prev_nr = rq->nr_running;
+
+ rq->nr_running = prev_nr + count;
+
+ if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+ if (!rq->rd->overload)
+ rq->rd->overload = true;
+#endif
+ }
+
+ sched_update_tick_dependency(rq);
+}
+
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+ rq->nr_running -= count;
+ /* Check if we still need preemption */
+ sched_update_tick_dependency(rq);
+}
+
+extern void update_rq_clock(struct rq *rq);
+
+extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
+extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+
+extern const_debug unsigned int sysctl_sched_nr_migrate;
+extern const_debug unsigned int sysctl_sched_migration_cost;
+
+#ifdef CONFIG_SCHED_HRTICK
+
+/*
+ * Use hrtick when:
+ * - enabled by features
+ * - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ if (!cpu_active(cpu_of(rq)))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+void hrtick_start(struct rq *rq, u64 delay);
+
+#else
+
+static inline int hrtick_enabled(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HRTICK */
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifdef CONFIG_SMP
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
+
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+#else
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+#endif
+
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock);
+
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock);
+
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+}
+
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+/*
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ raw_spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower CPU-ids and will
+ * grant the double lock to lower CPUs over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ int ret = 0;
+
+ if (unlikely(!raw_spin_trylock(&busiest->lock))) {
+ if (busiest < this_rq) {
+ raw_spin_unlock(&this_rq->lock);
+ raw_spin_lock(&busiest->lock);
+ raw_spin_lock_nested(&this_rq->lock,
+ SINGLE_DEPTH_NESTING);
+ ret = 1;
+ } else
+ raw_spin_lock_nested(&busiest->lock,
+ SINGLE_DEPTH_NESTING);
+ }
+ return ret;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work well under rq->lock */
+ raw_spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ raw_spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ raw_spin_lock(l1);
+ raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else {
+ if (rq1 < rq2) {
+ raw_spin_lock(&rq1->lock);
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(&rq2->lock);
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+ }
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ raw_spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+extern void set_rq_online (struct rq *rq);
+extern void set_rq_offline(struct rq *rq);
+extern bool sched_smp_initialized;
+
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
+#endif
+
+extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
+extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_SCHED_DEBUG
+extern bool sched_debug_enabled;
+
+extern void print_cfs_stats(struct seq_file *m, int cpu);
+extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+#ifdef CONFIG_NUMA_BALANCING
+extern void
+show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+
+extern void init_cfs_rq(struct cfs_rq *cfs_rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
+
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
+
+#ifdef CONFIG_NO_HZ_COMMON
+#define NOHZ_BALANCE_KICK_BIT 0
+#define NOHZ_STATS_KICK_BIT 1
+
+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+
+extern void nohz_balance_exit_idle(struct rq *rq);
+#else
+static inline void nohz_balance_exit_idle(struct rq *rq) { }
+#endif
+
+
+#ifdef CONFIG_SMP
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
+#else
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
+#endif
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+ u64 total;
+ u64 tick_delta;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
+
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
+static inline u64 irq_time_read(int cpu)
+{
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
+
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->total;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+
+ return total;
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS
+ * and DL, though, because they may not be coming in if only RT tasks are
+ * active all the time (or there are RT tasks only).
+ *
+ * As a workaround for that issue, this function is called periodically by the
+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT tasks.
+ */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
+ if (data)
+ data->func(data, rq_clock(rq), flags);
+}
+#else
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+#endif /* CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+# ifndef arch_scale_freq_invariant
+# define arch_scale_freq_invariant() true
+# endif
+#else
+# define arch_scale_freq_invariant() false
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+static inline unsigned long cpu_bw_dl(struct rq *rq)
+{
+ return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_dl.util_avg);
+}
+
+static inline unsigned long cpu_util_cfs(struct rq *rq)
+{
+ unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+
+ if (sched_feat(UTIL_EST)) {
+ util = max_t(unsigned long, util,
+ READ_ONCE(rq->cfs.avg.util_est.enqueued));
+ }
+
+ return util;
+}
+
+static inline unsigned long cpu_util_rt(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_rt.util_avg);
+}
+#endif
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return rq->avg_irq.util_avg;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ util *= (max - irq);
+ util /= max;
+
+ return util;
+
+}
+#else
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return 0;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ return util;
+}
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000..750fb3c67
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/schedstat implementation
+ */
+#include "sched.h"
+
+/*
+ * Current schedstat API version.
+ *
+ * Bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 15
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+ int cpu;
+
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
+#ifdef CONFIG_SMP
+ struct sched_domain *sd;
+ int dcount = 0;
+#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
+
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_count,
+ rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+ seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+ /* domain-specific stats */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ enum cpu_idle_type itype;
+
+ seq_printf(seq, "domain%d %*pb", dcount++,
+ cpumask_pr_args(sched_domain_span(sd)));
+ for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+ itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
+ sd->lb_balanced[itype],
+ sd->lb_failed[itype],
+ sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
+ sd->lb_nobusyq[itype],
+ sd->lb_nobusyg[itype]);
+ }
+ seq_printf(seq,
+ " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
+ }
+ rcu_read_unlock();
+#endif
+ }
+ return 0;
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int __init proc_schedstat_init(void)
+{
+ proc_create_seq("schedstat", 0, NULL, &schedstat_sops);
+ return 0;
+}
+subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
new file mode 100644
index 000000000..8aea199a3
--- /dev/null
+++ b/kernel/sched/stats.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CONFIG_SCHEDSTATS
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{
+ if (rq) {
+ rq->rq_sched_info.run_delay += delta;
+ rq->rq_sched_info.pcount++;
+ }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_cpu_time += delta;
+}
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_sched_info.run_delay += delta;
+}
+#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var) do { var++; } while (0)
+#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt) do { var += (amt); } while (0)
+#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val) do { var = (val); } while (0)
+#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var) (var)
+#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+
+#else /* !CONFIG_SCHEDSTATS: */
+static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
+# define schedstat_enabled() 0
+# define __schedstat_inc(var) do { } while (0)
+# define schedstat_inc(var) do { } while (0)
+# define __schedstat_add(var, amt) do { } while (0)
+# define schedstat_add(var, amt) do { } while (0)
+# define __schedstat_set(var, val) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+# define schedstat_val(var) 0
+# define schedstat_val_or_zero(var) 0
+#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SCHED_INFO
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+ t->sched_info.last_queued = 0;
+}
+
+/*
+ * We are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a CPU, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across CPUs. The
+ * delta taken on each CPU would annul the skew.
+ */
+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+{
+ unsigned long long now = rq_clock(rq), delta = 0;
+
+ if (unlikely(sched_info_on()))
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+
+ rq_sched_info_dequeued(rq, delta);
+}
+
+/*
+ * Called when a task finally hits the CPU. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
+{
+ unsigned long long now = rq_clock(rq), delta = 0;
+
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+ t->sched_info.last_arrival = now;
+ t->sched_info.pcount++;
+
+ rq_sched_info_arrive(rq, delta);
+}
+
+/*
+ * This function is only called from enqueue_task(), but also only updates
+ * the timestamp if it is already not set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+{
+ if (unlikely(sched_info_on())) {
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
+ }
+}
+
+/*
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
+ */
+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
+{
+ unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
+
+ rq_sched_info_depart(rq, delta);
+
+ if (t->state == TASK_RUNNING)
+ sched_info_queued(rq, t);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+ /*
+ * prev now departs the CPU. It's not interesting to record
+ * stats about how efficient we were at scheduling the idle
+ * process, however.
+ */
+ if (prev != rq->idle)
+ sched_info_depart(rq, prev);
+
+ if (next != rq->idle)
+ sched_info_arrive(rq, next);
+}
+
+static inline void
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(sched_info_on()))
+ __sched_info_switch(rq, prev, next);
+}
+
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_queued(rq, t) do { } while (0)
+# define sched_info_reset_dequeued(t) do { } while (0)
+# define sched_info_dequeued(rq, t) do { } while (0)
+# define sched_info_depart(rq, t) do { } while (0)
+# define sched_info_arrive(rq, next) do { } while (0)
+# define sched_info_switch(rq, t, next) do { } while (0)
+#endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
new file mode 100644
index 000000000..c183b790c
--- /dev/null
+++ b/kernel/sched/stop_task.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+#include "sched.h"
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ return task_cpu(p); /* stop tasks as never migrate */
+}
+#endif /* CONFIG_SMP */
+
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ /* we're never preempted */
+}
+
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *stop = rq->stop;
+
+ if (!stop || !task_on_rq_queued(stop))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
+}
+
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ add_nr_running(rq, 1);
+}
+
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ sub_nr_running(rq, 1);
+}
+
+static void yield_task_stop(struct rq *rq)
+{
+ BUG(); /* the stop task should never yield, its pointless. */
+}
+
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cgroup_account_cputime(curr, delta_exec);
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_stop(struct rq *rq)
+{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq_clock_task(rq);
+}
+
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
+{
+ BUG(); /* its impossible to change to this class */
+}
+
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ BUG(); /* how!?, what priority? */
+}
+
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+ return 0;
+}
+
+static void update_curr_stop(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+const struct sched_class stop_sched_class = {
+ .next = &dl_sched_class,
+
+ .enqueue_task = enqueue_task_stop,
+ .dequeue_task = dequeue_task_stop,
+ .yield_task = yield_task_stop,
+
+ .check_preempt_curr = check_preempt_curr_stop,
+
+ .pick_next_task = pick_next_task_stop,
+ .put_prev_task = put_prev_task_stop,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_stop,
+ .set_cpus_allowed = set_cpus_allowed_common,
+#endif
+
+ .set_curr_task = set_curr_task_stop,
+ .task_tick = task_tick_stop,
+
+ .get_rr_interval = get_rr_interval_stop,
+
+ .prio_changed = prio_changed_stop,
+ .switched_to = switched_to_stop,
+ .update_curr = update_curr_stop,
+};
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 000000000..66b59ac77
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * <linux/swait.h> (simple wait queues ) implementation:
+ */
+#include "sched.h"
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+ struct lock_class_key *key)
+{
+ raw_spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ wake_up_process(curr->task);
+ list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up_one(struct swait_queue_head *q)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ swake_up_locked(q);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up_one);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ LIST_HEAD(tmp);
+
+ raw_spin_lock_irq(&q->lock);
+ list_splice_init(&q->task_list, &tmp);
+ while (!list_empty(&tmp)) {
+ curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+ wake_up_state(curr->task, TASK_NORMAL);
+ list_del_init(&curr->task_list);
+
+ if (list_empty(&tmp))
+ break;
+
+ raw_spin_unlock_irq(&q->lock);
+ raw_spin_lock_irq(&q->lock);
+ }
+ raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ wait->task = current;
+ if (list_empty(&wait->task_list))
+ list_add_tail(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait_exclusive);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+ long ret = 0;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ if (unlikely(signal_pending_state(state, current))) {
+ /*
+ * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
+ * must not see us.
+ */
+ list_del_init(&wait->task_list);
+ ret = -ERESTARTSYS;
+ } else {
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ }
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ __set_current_state(TASK_RUNNING);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait->task_list)) {
+ raw_spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_swait);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
new file mode 100644
index 000000000..02e85cd23
--- /dev/null
+++ b/kernel/sched/topology.c
@@ -0,0 +1,1922 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Scheduler topology setup/handling methods
+ */
+#include "sched.h"
+
+DEFINE_MUTEX(sched_domains_mutex);
+
+/* Protected by sched_domains_mutex: */
+cpumask_var_t sched_domains_tmpmask;
+cpumask_var_t sched_domains_tmpmask2;
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static int __init sched_debug_setup(char *str)
+{
+ sched_debug_enabled = true;
+
+ return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ struct cpumask *groupmask)
+{
+ struct sched_group *group = sd->groups;
+
+ cpumask_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
+ return -1;
+ }
+
+ printk(KERN_CONT "span=%*pbl level=%s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+ }
+ if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+ }
+
+ printk(KERN_DEBUG "%*s groups:", level + 1, "");
+ do {
+ if (!group) {
+ printk("\n");
+ printk(KERN_ERR "ERROR: group is NULL\n");
+ break;
+ }
+
+ if (!cpumask_weight(sched_group_span(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: empty group\n");
+ break;
+ }
+
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_span(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: repeated CPUs\n");
+ break;
+ }
+
+ cpumask_or(groupmask, groupmask, sched_group_span(group));
+
+ printk(KERN_CONT " %d:{ span=%*pbl",
+ group->sgc->id,
+ cpumask_pr_args(sched_group_span(group)));
+
+ if ((sd->flags & SD_OVERLAP) &&
+ !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
+ printk(KERN_CONT " mask=%*pbl",
+ cpumask_pr_args(group_balance_mask(group)));
+ }
+
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+ printk(KERN_CONT " cap=%lu", group->sgc->capacity);
+
+ if (group == sd->groups && sd->child &&
+ !cpumask_equal(sched_domain_span(sd->child),
+ sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
+ }
+
+ printk(KERN_CONT " }");
+
+ group = group->next;
+
+ if (group != sd->groups)
+ printk(KERN_CONT ",");
+
+ } while (group != sd->groups);
+ printk(KERN_CONT "\n");
+
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+ printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
+
+ if (!sched_debug_enabled)
+ return;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+}
+#else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUCAPACITY |
+ SD_ASYM_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_AFFINE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+ return 0;
+
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next) {
+ pflags &= ~(SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
+ if (nr_node_ids == 1)
+ pflags &= ~SD_SERIALIZE;
+ }
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+ cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ kfree(rd);
+}
+
+void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we dont want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ if (old_rd)
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+void sched_get_rd(struct root_domain *rd)
+{
+ atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+ if (!atomic_dec_and_test(&rd->refcount))
+ return;
+
+ call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ goto free_online;
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_rto_mask;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_cpudl;
+ return 0;
+
+free_cpudl:
+ cpudl_cleanup(&rd->cpudl);
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+/*
+ * By default the system creates a single root-domain with all CPUs as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
+void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ if (init_rootdomain(rd) != 0) {
+ kfree(rd);
+ return NULL;
+ }
+
+ return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
+
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd)
+{
+ /*
+ * A normal sched domain may have multiple group references, an
+ * overlapping domain, having private groups, only one. Iterate,
+ * dropping group/capacity references, freeing where none remain.
+ */
+ free_sched_groups(sd->groups, 1);
+
+ if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+ kfree(sd->shared);
+ kfree(sd);
+}
+
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ while (sd) {
+ struct sched_domain *parent = sd->parent;
+ destroy_sched_domain(sd);
+ sd = parent;
+ }
+}
+
+static void destroy_sched_domains(struct sched_domain *sd)
+{
+ if (sd)
+ call_rcu(&sd->rcu, destroy_sched_domains_rcu);
+}
+
+/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
+ * allows us to avoid some pointer chasing select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first CPU number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two CPUs are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
+DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+
+static void update_top_cache_domain(int cpu)
+{
+ struct sched_domain_shared *sds = NULL;
+ struct sched_domain *sd;
+ int id = cpu;
+ int size = 1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ if (sd) {
+ id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ sds = sd->shared;
+ }
+
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
+ per_cpu(sd_llc_id, cpu) = id;
+ rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent);
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp);
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ dirty_sched_domain_sysctl(cpu);
+ destroy_sched_domains(tmp);
+
+ update_top_cache_domain(cpu);
+}
+
+struct s_data {
+ struct sched_domain * __percpu *sd;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_rootdomain,
+ sa_sd,
+ sa_sd_storage,
+ sa_none,
+};
+
+/*
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the balance mask.
+ *
+ * The balance mask are all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
+ *
+ * Also see should_we_balance().
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first(group_balance_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 20
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 20 30 20 10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ * 0 ----- 1
+ * | |
+ * | |
+ * | |
+ * 3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2 0-3 0-3 0-3 0-3
+ * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
+ *
+ * NUMA-1 0-1,3 0-2 1-3 0,2-3
+ * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ *
+ * As can be seen; things don't nicely line up as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ * - the first group of each domain is its child domain; this
+ * gets us the first 0-1,3
+ * - the only uncovered node is 2, who's child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
+ * groups include the CPUs of Node-0, while those CPUs would not in fact ever
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group balance mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 20 30
+ * 1: 20 10 20 20
+ * 2: 20 20 10 20
+ * 3: 30 20 20 10
+ *
+ * Which looks a little like:
+ *
+ * 0 ----- 1
+ * | / |
+ * | / |
+ * | / |
+ * 2 ----- 3
+ *
+ * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the sched_domain's are
+ * not of the same number for each CPU. Consider:
+ *
+ * NUMA-2 0-3 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-1 0-2 0-3 0-3 1-3
+ *
+ * NUMA-0 0 1 2 3
+ *
+ */
+
+
+/*
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
+ *
+ * We do this during the group creation pass, therefore the group information
+ * isn't complete yet, however since each group represents a (child) domain we
+ * can fully construct this using the sched_domain bits (which are already
+ * complete).
+ */
+static void
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+{
+ const struct cpumask *sg_span = sched_group_span(sg);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(mask);
+
+ for_each_cpu(i, sg_span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ continue;
+
+ cpumask_set_cpu(i, mask);
+ }
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(mask));
+}
+
+/*
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics having the groups node local is of dubious benefit.
+ */
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *sg;
+ struct cpumask *sg_span;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ return NULL;
+
+ sg_span = sched_group_span(sg);
+ if (sd->child)
+ cpumask_copy(sg_span, sched_domain_span(sd->child));
+ else
+ cpumask_copy(sg_span, sched_domain_span(sd));
+
+ atomic_inc(&sg->ref);
+ return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+ struct sched_group *sg)
+{
+ struct cpumask *mask = sched_domains_tmpmask2;
+ struct sd_data *sdd = sd->private;
+ struct cpumask *sg_span;
+ int cpu;
+
+ build_balance_mask(sd, sg, mask);
+ cpu = cpumask_first_and(sched_group_span(sg), mask);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ cpumask_copy(group_balance_mask(sg), mask);
+ else
+ WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg_span = sched_group_span(sg);
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+}
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu_wrap(i, span, cpu) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /*
+ * Asymmetric node setups can result in situations where the
+ * domain tree is of unequal depth, make sure to skip domains
+ * that already cover the entire range.
+ *
+ * In that case build_sched_domains() will have terminated the
+ * iteration early and our sibling sd spans will be empty.
+ * Domains should always include the CPU they're built on, so
+ * check that.
+ */
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ sg = build_group_from_child_sched_domain(sibling, cpu);
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_span(sg);
+ cpumask_or(covered, covered, sg_span);
+
+ init_overlap_sched_group(sd, sg);
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = first;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ * - Simultaneous multithreading (SMT)
+ * - Multi-Core Cache (MC)
+ * - Package (DIE)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ * sched_domain -> sched_group -> sched_group_capacity
+ * ^ ^ ^ ^
+ * `-' `-'
+ *
+ * The sched_domains are per-CPU and have a two way link (parent & child) and
+ * denote the ever growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (double) linked list of sched_group's, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * DIE [ ]
+ * MC [ ] [ ]
+ * SMT [ ] [ ] [ ] [ ]
+ *
+ * - or -
+ *
+ * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both require a CPU that
+ * uniquely identify each group (for a given domain):
+ *
+ * - The first is the balance_cpu (see should_we_balance() and the
+ * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * continue balancing at a higher domain.
+ *
+ * - The second is the sched_group_capacity; we want all identical groups
+ * to share a single sched_group_capacity.
+ *
+ * Since these topologies are exclusive by construction. That is, its
+ * impossible for an SMT thread to belong to multiple cores, and cores to
+ * be part of multiple caches. There is a very clear and unique location
+ * for each CPU in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
+ * group), we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
+{
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
+ struct sched_group *sg;
+
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
+
+ sg = *per_cpu_ptr(sdd->sg, cpu);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* For claim_allocations: */
+ atomic_inc(&sg->ref);
+ atomic_inc(&sg->sgc->ref);
+
+ if (child) {
+ cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ } else {
+ cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, group_balance_mask(sg));
+ }
+
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+ return sg;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_capacity to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
+
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+
+ cpumask_clear(covered);
+
+ for_each_cpu_wrap(i, span, cpu) {
+ struct sched_group *sg;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sg = get_group(i, sdd);
+
+ cpumask_or(covered, covered, sched_group_span(sg));
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ last->next = first;
+ sd->groups = first;
+
+ return 0;
+}
+
+/*
+ * Initialize sched groups cpu_capacity.
+ *
+ * cpu_capacity indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
+ */
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+{
+ struct sched_group *sg = sd->groups;
+
+ WARN_ON(!sg);
+
+ do {
+ int cpu, max_cpu = -1;
+
+ sg->group_weight = cpumask_weight(sched_group_span(sg));
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ goto next;
+
+ for_each_cpu(cpu, sched_group_span(sg)) {
+ if (max_cpu < 0)
+ max_cpu = cpu;
+ else if (sched_asym_prefer(cpu, max_cpu))
+ max_cpu = cpu;
+ }
+ sg->asym_prefer_cpu = max_cpu;
+
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_balance_cpu(sg))
+ return;
+
+ update_group_capacity(sd, cpu);
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ else
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+ if (request < sd->level) {
+ /* Turn off idle balance on this domain: */
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ } else {
+ /* Turn on idle balance on this domain: */
+ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu);
+ /* Fall through */
+ case sa_sd:
+ free_percpu(d->sd);
+ /* Fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map);
+ /* Fall through */
+ case sa_none:
+ break;
+ }
+}
+
+static enum s_alloc
+__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
+ d->rd = alloc_rootdomain();
+ if (!d->rd)
+ return sa_sd;
+
+ return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+ *per_cpu_ptr(sdd->sds, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+enum numa_topology_type sched_numa_topology_type;
+
+static int sched_domains_numa_levels;
+static int sched_domains_curr_level;
+
+int sched_max_numa_distance;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map,
+ struct sched_domain *child, int cpu)
+{
+ struct sd_data *sdd = &tl->data;
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ int sd_id, sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+ .child = child,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
+ };
+
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ /*
+ * For all levels sharing cache; connect a sched_domain_shared
+ * instance.
+ */
+ if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+ }
+
+ sd->private = sdd;
+
+ return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ if (WARN_ON_ONCE(sched_smp_initialized))
+ return;
+
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (sched_domains_numa_levels <= 2) {
+ sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
+void sched_init_numa(void)
+{
+ struct sched_domain_topology_level *tl;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ */
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
+
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ int distance = node_distance(i, j);
+
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ return;
+ }
+
+ bitmap_set(distance_map, distance, 1);
+ }
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+ sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!sched_domains_numa_distance) {
+ bitmap_free(distance_map);
+ return;
+ }
+
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ sched_domains_numa_distance[i] = j;
+ }
+
+ bitmap_free(distance_map);
+
+ /*
+ * 'nr_levels' contains the number of unique distances
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'nr_levels' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'nr_levels' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * CPUs of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < nr_levels; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for_each_node(k) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + nr_levels + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * Add the NUMA identity distance, aka single NODE.
+ */
+ tl[i++] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .numa_level = 0,
+ SD_INIT_NAME(NODE)
+ };
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 1; j < nr_levels; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = nr_levels;
+ sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
+
+ init_numa_topology_type();
+}
+
+void sched_domains_numa_masks_set(unsigned int cpu)
+{
+ int node = cpu_to_node(cpu);
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+void sched_domains_numa_masks_clear(unsigned int cpu)
+{
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!sdd->sds)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_domain_shared *sds;
+ struct sched_group *sg;
+ struct sched_group_capacity *sgc;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sds, j) = sds;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ sg->next = sg;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgc)
+ return -ENOMEM;
+
+#ifdef CONFIG_SCHED_DEBUG
+ sgc->id = j;
+#endif
+
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sds)
+ kfree(*per_cpu_ptr(sdd->sds, j));
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
+ }
+ free_percpu(sdd->sd);
+ sdd->sd = NULL;
+ free_percpu(sdd->sds);
+ sdd->sds = NULL;
+ free_percpu(sdd->sg);
+ sdd->sg = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
+ }
+}
+
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
+{
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child CPUs. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Build sched domains for a given set of CPUs and attach the sched domains
+ * to the individual CPUs
+ */
+static int
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ struct rq *rq = NULL;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for CPUs specified by the cpu_map: */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP)
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_capacity(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
+ sd = *per_cpu_ptr(d.sd, i);
+
+ /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+ if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+ WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ if (rq && sched_debug_enabled) {
+ pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
+ cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+ }
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+
+ return ret;
+}
+
+/* Current sched domains: */
+static cpumask_var_t *doms_cur;
+
+/* Number of sched domains in 'doms_cur': */
+static int ndoms_cur;
+
+/* Attribues of custom domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+
+/*
+ * Special case: If a kmalloc() of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * CPU core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+ int i;
+ cpumask_var_t *doms;
+
+ doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
+ if (!doms)
+ return NULL;
+ for (i = 0; i < ndoms; i++) {
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+ free_sched_domains(doms, i);
+ return NULL;
+ }
+ }
+ return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+ unsigned int i;
+ for (i = 0; i < ndoms; i++)
+ free_cpumask_var(doms[i]);
+ kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated CPUs, but could be used to
+ * exclude other special cases in the future.
+ */
+int sched_init_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ err = build_sched_domains(doms_cur[0], NULL);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+/*
+ * Detach sched domains from a group of CPUs specified in cpu_map
+ * These CPUs will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+ int i;
+
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* Fast path: */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+ int new_topology;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* Always unregister in case we don't destroy any domains: */
+ unregister_sched_domain_sysctl();
+
+ /* Let the architecture update CPU core mappings: */
+ new_topology = arch_update_cpu_topology();
+
+ if (!doms_new) {
+ WARN_ON_ONCE(dattr_new);
+ n = 0;
+ doms_new = alloc_sched_domains(1);
+ if (doms_new) {
+ n = 1;
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
+ }
+ } else {
+ n = ndoms_new;
+ }
+
+ /* Destroy deleted domains: */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* No match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (!doms_new) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
+ }
+
+ /* Build new domains: */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* No match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains: */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+
+ kfree(dattr_cur);
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
new file mode 100644
index 000000000..a44a57fb9
--- /dev/null
+++ b/kernel/sched/wait.c
@@ -0,0 +1,449 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 Nadia Yvette Chambers, Oracle
+ */
+#include "sched.h"
+
+void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
+{
+ spin_lock_init(&wq_head->lock);
+ lockdep_set_class_and_name(&wq_head->lock, key, name);
+ INIT_LIST_HEAD(&wq_head->head);
+}
+
+EXPORT_SYMBOL(__init_waitqueue_head);
+
+void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __remove_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key,
+ wait_queue_entry_t *bookmark)
+{
+ wait_queue_entry_t *curr, *next;
+ int cnt = 0;
+
+ lockdep_assert_held(&wq_head->lock);
+
+ if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+ curr = list_next_entry(bookmark, entry);
+
+ list_del(&bookmark->entry);
+ bookmark->flags = 0;
+ } else
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+ if (&curr->entry == &wq_head->head)
+ return nr_exclusive;
+
+ list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
+ unsigned flags = curr->flags;
+ int ret;
+
+ if (flags & WQ_FLAG_BOOKMARK)
+ continue;
+
+ ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+
+ if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+ (&next->entry != &wq_head->head)) {
+ bookmark->flags = WQ_FLAG_BOOKMARK;
+ list_add_tail(&bookmark->entry, &next->entry);
+ break;
+ }
+ }
+
+ return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ unsigned long flags;
+ wait_queue_entry_t bookmark;
+
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
+{
+ __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+ __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ int wake_flags = 1; /* XXX WF_SYNC */
+
+ if (unlikely(!wq_head))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the critical region).
+ */
+void
+prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+ unsigned long flags;
+
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry))
+ __add_wait_queue(wq_head, wq_entry);
+ set_current_state(state);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
+
+void
+prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry))
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ set_current_state(state);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
+{
+ wq_entry->flags = flags;
+ wq_entry->private = current;
+ wq_entry->func = autoremove_wake_function;
+ INIT_LIST_HEAD(&wq_entry->entry);
+}
+EXPORT_SYMBOL(init_wait_entry);
+
+long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+ unsigned long flags;
+ long ret = 0;
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (unlikely(signal_pending_state(state, current))) {
+ /*
+ * Exclusive waiter must not fail if it was selected by wakeup,
+ * it should "consume" the condition we were waiting for.
+ *
+ * The caller will recheck the condition and return success if
+ * we were already woken up, we can not miss the event because
+ * wakeup locks/unlocks the same wq_head->lock.
+ *
+ * But we need to ensure that set-condition + wakeup after that
+ * can't see us, it should wake up another exclusive waiter if
+ * we fail.
+ */
+ list_del_init(&wq_entry->entry);
+ ret = -ERESTARTSYS;
+ } else {
+ if (list_empty(&wq_entry->entry)) {
+ if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ else
+ __add_wait_queue(wq_head, wq_entry);
+ }
+ set_current_state(state);
+ }
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
+/*
+ * Note! These two wait functions are entered with the
+ * wait-queue lock held (and interrupts off in the _irq
+ * case), so there is no race with testing the wakeup
+ * condition in the caller before they add the wait
+ * entry to the wake queue.
+ */
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
+{
+ if (likely(list_empty(&wait->entry)))
+ __add_wait_queue_entry_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock(&wq->lock);
+ schedule();
+ spin_lock(&wq->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr);
+
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
+{
+ if (likely(list_empty(&wait->entry)))
+ __add_wait_queue_entry_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock_irq(&wq->lock);
+ schedule();
+ spin_lock_irq(&wq->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr_irq);
+
+/**
+ * finish_wait - clean up after waiting in a queue
+ * @wq_head: waitqueue waited on
+ * @wq_entry: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
+void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ /*
+ * We can check for list emptiness outside the lock
+ * IFF:
+ * - we use the "careful" check that verifies both
+ * the next and prev pointers, so that there cannot
+ * be any half-pending updates in progress on other
+ * CPU's that we haven't seen yet (and that might
+ * still change the stack area.
+ * and
+ * - all other users take the lock (ie we can only
+ * have _one_ other CPU that looks at or modifies
+ * the list).
+ */
+ if (!list_empty_careful(&wq_entry->entry)) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ list_del_init(&wq_entry->entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_wait);
+
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wq_entry, mode, sync, key);
+
+ if (ret)
+ list_del_init(&wq_entry->entry);
+
+ return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_func);
+ *
+ * add_wait_queue(&wq_head, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * // in wait_woken() // in woken_wake_function()
+ *
+ * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN;
+ * smp_mb(); // A try_to_wake_up():
+ * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) <full barrier>
+ * schedule() if (p->state & mode)
+ * p->state = TASK_RUNNING; p->state = TASK_RUNNING;
+ * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~
+ * smp_mb(); // B condition = true;
+ * } smp_mb(); // C
+ * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN;
+ */
+long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
+{
+ /*
+ * The below executes an smp_mb(), which matches with the full barrier
+ * executed by the try_to_wake_up() in woken_wake_function() such that
+ * either we see the store to wq_entry->flags in woken_wake_function()
+ * or woken_wake_function() sees our store to current->state.
+ */
+ set_current_state(mode); /* A */
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below executes an smp_mb(), which matches with the smp_mb() (C)
+ * in woken_wake_function() such that either we see the wait condition
+ * being true or the store to wq_entry->flags in woken_wake_function()
+ * follows ours in the coherence order.
+ */
+ smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+{
+ /* Pairs with the smp_store_mb() in wait_woken(). */
+ smp_mb(); /* C */
+ wq_entry->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wq_entry, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
new file mode 100644
index 000000000..c67c6d24a
--- /dev/null
+++ b/kernel/sched/wait_bit.c
@@ -0,0 +1,249 @@
+/*
+ * The implementation of the wait_bit*() and related waiting APIs:
+ */
+#include "sched.h"
+
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+ ret = (*action)(&wbq_entry->key, mode);
+ } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ wq_entry.key.timeout = jiffies + timeout;
+
+ return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
+int __sched
+__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ for (;;) {
+ prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
+ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+ ret = action(&wbq_entry->key, mode);
+ /*
+ * See the comment in prepare_to_wait_event().
+ * finish_wait() does not necessarily takes wwq_head->lock,
+ * but test_and_set_bit() implies mb() which pairs with
+ * smp_mb__after_atomic() before wake_up_page().
+ */
+ if (ret)
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ }
+ if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+ if (!ret)
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ return 0;
+ } else if (ret) {
+ return ret;
+ }
+ }
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+
+ if (waitqueue_active(wq_head))
+ __wake_up(wq_head, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+wait_queue_head_t *__var_waitqueue(void *p)
+{
+ return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(__var_waitqueue);
+
+static int
+var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+ int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wbq_entry =
+ container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+ if (wbq_entry->key.flags != key->flags ||
+ wbq_entry->key.bit_nr != key->bit_nr)
+ return 0;
+
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
+void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
+{
+ *wbq_entry = (struct wait_bit_queue_entry){
+ .key = {
+ .flags = (var),
+ .bit_nr = -1,
+ },
+ .wq_entry = {
+ .private = current,
+ .func = var_wake_function,
+ .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
+ },
+ };
+}
+EXPORT_SYMBOL(init_wait_var_entry);
+
+void wake_up_var(void *var)
+{
+ __wake_up_bit(__var_waitqueue(var), var, -1);
+}
+EXPORT_SYMBOL(wake_up_var);
+
+__sched int bit_wait(struct wait_bit_key *word, int mode)
+{
+ schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word, int mode)
+{
+ io_schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = READ_ONCE(jiffies);
+
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = READ_ONCE(jiffies);
+
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
+
+void __init wait_bit_init(void)
+{
+ int i;
+
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+}