diff options
Diffstat (limited to '')
30 files changed, 4798 insertions, 0 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h new file mode 100644 index 000000000..f92d5ae6d --- /dev/null +++ b/include/linux/sched.h @@ -0,0 +1,1911 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +/* + * Define 'struct task_struct' and provide the main scheduler + * APIs (schedule(), wakeup variants, etc.) + */ + +#include <uapi/linux/sched.h> + +#include <asm/current.h> + +#include <linux/pid.h> +#include <linux/sem.h> +#include <linux/shm.h> +#include <linux/kcov.h> +#include <linux/mutex.h> +#include <linux/plist.h> +#include <linux/hrtimer.h> +#include <linux/seccomp.h> +#include <linux/nodemask.h> +#include <linux/rcupdate.h> +#include <linux/resource.h> +#include <linux/latencytop.h> +#include <linux/sched/prio.h> +#include <linux/signal_types.h> +#include <linux/mm_types_task.h> +#include <linux/task_io_accounting.h> +#include <linux/rseq.h> + +/* task_struct member predeclarations (sorted alphabetically): */ +struct audit_context; +struct backing_dev_info; +struct bio_list; +struct blk_plug; +struct cfs_rq; +struct fs_struct; +struct futex_pi_state; +struct io_context; +struct mempolicy; +struct nameidata; +struct nsproxy; +struct perf_event_context; +struct pid_namespace; +struct pipe_inode_info; +struct rcu_node; +struct reclaim_state; +struct robust_list_head; +struct sched_attr; +struct sched_param; +struct seq_file; +struct sighand_struct; +struct signal_struct; +struct task_delay_info; +struct task_group; + +/* + * Task state bitmask. NOTE! These bits are also + * encoded in fs/proc/array.c: get_task_state(). + * + * We have two separate sets of flags: task->state + * is about runnability, while task->exit_state are + * about the task exiting. Confusing, but this way + * modifying one set can't modify the other one by + * mistake. + */ + +/* Used in tsk->state: */ +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 +/* Used in tsk->exit_state: */ +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) +/* Used in tsk->state again: */ +#define TASK_PARKED 0x0040 +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 +#define TASK_STATE_MAX 0x1000 + +/* Convenience macros for the sake of set_current_state: */ +#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) +#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) +#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) + +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) + +/* Convenience macros for the sake of wake_up(): */ +#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) + +/* get_task_state(): */ +#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ + TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) + +#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) + +#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) + +#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) + +#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ + (task->flags & PF_FROZEN) == 0 && \ + (task->state & TASK_NOLOAD) == 0) + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + +/* + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). + */ +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) + +#define __set_current_state(state_value) \ + do { \ + WARN_ON_ONCE(is_special_task_state(state_value));\ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + } while (0) + +#define set_current_state(state_value) \ + do { \ + WARN_ON_ONCE(is_special_task_state(state_value));\ + current->task_state_change = _THIS_IP_; \ + smp_store_mb(current->state, (state_value)); \ + } while (0) + +#define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ + WARN_ON_ONCE(!is_special_task_state(state_value)); \ + raw_spin_lock_irqsave(¤t->pi_lock, flags); \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ + } while (0) +#else +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (!need_sleep) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * If the caller does not need such serialisation (because, for instance, the + * condition test and condition change and wakeup are under the same lock) then + * use __set_current_state(). + * + * The above is typically ordered against the wakeup, which does: + * + * need_sleep = false; + * wake_up_state(p, TASK_UNINTERRUPTIBLE); + * + * where wake_up_state() executes a full memory barrier before accessing the + * task state. + * + * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a + * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). + * + * However, with slightly different timing the wakeup TASK_RUNNING store can + * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not + * a problem either because that will result in one extra go around the loop + * and our @cond test will save the day. + * + * Also see the comments of try_to_wake_up(). + */ +#define __set_current_state(state_value) \ + current->state = (state_value) + +#define set_current_state(state_value) \ + smp_store_mb(current->state, (state_value)) + +/* + * set_special_state() should be used for those states when the blocking task + * can not use the regular condition based wait-loop. In that case we must + * serialize against wakeups such that any possible in-flight TASK_RUNNING stores + * will not collide with our state change. + */ +#define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ + raw_spin_lock_irqsave(¤t->pi_lock, flags); \ + current->state = (state_value); \ + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ + } while (0) + +#endif + +/* Task command name length: */ +#define TASK_COMM_LEN 16 + +extern void scheduler_tick(void); + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX + +extern long schedule_timeout(long timeout); +extern long schedule_timeout_interruptible(long timeout); +extern long schedule_timeout_killable(long timeout); +extern long schedule_timeout_uninterruptible(long timeout); +extern long schedule_timeout_idle(long timeout); +asmlinkage void schedule(void); +extern void schedule_preempt_disabled(void); + +extern int __must_check io_schedule_prepare(void); +extern void io_schedule_finish(int token); +extern long io_schedule_timeout(long timeout); +extern void io_schedule(void); + +/** + * struct prev_cputime - snapshot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * @lock: protects the above two fields + * + * Stores previous user/system time values such that we can guarantee + * monotonicity. + */ +struct prev_cputime { +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + u64 utime; + u64 stime; + raw_spinlock_t lock; +#endif +}; + +/** + * struct task_cputime - collected CPU time counts + * @utime: time spent in user mode, in nanoseconds + * @stime: time spent in kernel mode, in nanoseconds + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * + * This structure groups together three kinds of CPU time that are tracked for + * threads and thread groups. Most things considering CPU time want to group + * these counts together and treat all three of them in parallel. + */ +struct task_cputime { + u64 utime; + u64 stime; + unsigned long long sum_exec_runtime; +}; + +/* Alternate field names when used on cache expirations: */ +#define virt_exp utime +#define prof_exp stime +#define sched_exp sum_exec_runtime + +enum vtime_state { + /* Task is sleeping or running in a CPU with VTIME inactive: */ + VTIME_INACTIVE = 0, + /* Task runs in userspace in a CPU with VTIME active: */ + VTIME_USER, + /* Task runs in kernelspace in a CPU with VTIME active: */ + VTIME_SYS, +}; + +struct vtime { + seqcount_t seqcount; + unsigned long long starttime; + enum vtime_state state; + u64 utime; + u64 stime; + u64 gtime; +}; + +struct sched_info { +#ifdef CONFIG_SCHED_INFO + /* Cumulative counters: */ + + /* # of times we have run on this CPU: */ + unsigned long pcount; + + /* Time spent waiting on a runqueue: */ + unsigned long long run_delay; + + /* Timestamps: */ + + /* When did we last run on a CPU? */ + unsigned long long last_arrival; + + /* When were we last queued to run? */ + unsigned long long last_queued; + +#endif /* CONFIG_SCHED_INFO */ +}; + +/* + * Integer metrics need fixed point arithmetic, e.g., sched/fair + * has a few: load, load_avg, util_avg, freq, and capacity. + * + * We define a basic fixed point arithmetic range, and then formalize + * all these metrics based on that basic range. + */ +# define SCHED_FIXEDPOINT_SHIFT 10 +# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) + +struct load_weight { + unsigned long weight; + u32 inv_weight; +}; + +/** + * struct util_est - Estimation utilization of FAIR tasks + * @enqueued: instantaneous estimated utilization of a task/cpu + * @ewma: the Exponential Weighted Moving Average (EWMA) + * utilization of a task + * + * Support data structure to track an Exponential Weighted Moving Average + * (EWMA) of a FAIR task's utilization. New samples are added to the moving + * average each time a task completes an activation. Sample's weight is chosen + * so that the EWMA will be relatively insensitive to transient changes to the + * task's workload. + * + * The enqueued attribute has a slightly different meaning for tasks and cpus: + * - task: the task's util_avg at last task dequeue time + * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU + * Thus, the util_est.enqueued of a task represents the contribution on the + * estimated utilization of the CPU where that task is currently enqueued. + * + * Only for tasks we track a moving average of the past instantaneous + * estimated utilization. This allows to absorb sporadic drops in utilization + * of an otherwise almost periodic task. + */ +struct util_est { + unsigned int enqueued; + unsigned int ewma; +#define UTIL_EST_WEIGHT_SHIFT 2 +} __attribute__((__aligned__(sizeof(u64)))); + +/* + * The load_avg/util_avg accumulates an infinite geometric series + * (see __update_load_avg() in kernel/sched/fair.c). + * + * [load_avg definition] + * + * load_avg = runnable% * scale_load_down(load) + * + * where runnable% is the time ratio that a sched_entity is runnable. + * For cfs_rq, it is the aggregated load_avg of all runnable and + * blocked sched_entities. + * + * load_avg may also take frequency scaling into account: + * + * load_avg = runnable% * scale_load_down(load) * freq% + * + * where freq% is the CPU frequency normalized to the highest frequency. + * + * [util_avg definition] + * + * util_avg = running% * SCHED_CAPACITY_SCALE + * + * where running% is the time ratio that a sched_entity is running on + * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable + * and blocked sched_entities. + * + * util_avg may also factor frequency scaling and CPU capacity scaling: + * + * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% + * + * where freq% is the same as above, and capacity% is the CPU capacity + * normalized to the greatest capacity (due to uarch differences, etc). + * + * N.B., the above ratios (runnable%, running%, freq%, and capacity%) + * themselves are in the range of [0, 1]. To do fixed point arithmetics, + * we therefore scale them to as large a range as necessary. This is for + * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * + * [Overflow issue] + * + * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities + * with the highest load (=88761), always runnable on a single cfs_rq, + * and should not overflow as the number already hits PID_MAX_LIMIT. + * + * For all other cases (including 32-bit kernels), struct load_weight's + * weight will overflow first before we do, because: + * + * Max(load_avg) <= Max(load.weight) + * + * Then it is the load_weight's responsibility to consider overflow + * issues. + */ +struct sched_avg { + u64 last_update_time; + u64 load_sum; + u64 runnable_load_sum; + u32 util_sum; + u32 period_contrib; + unsigned long load_avg; + unsigned long runnable_load_avg; + unsigned long util_avg; + struct util_est util_est; +} ____cacheline_aligned; + +struct sched_statistics { +#ifdef CONFIG_SCHEDSTATS + u64 wait_start; + u64 wait_max; + u64 wait_count; + u64 wait_sum; + u64 iowait_count; + u64 iowait_sum; + + u64 sleep_start; + u64 sleep_max; + s64 sum_sleep_runtime; + + u64 block_start; + u64 block_max; + u64 exec_max; + u64 slice_max; + + u64 nr_migrations_cold; + u64 nr_failed_migrations_affine; + u64 nr_failed_migrations_running; + u64 nr_failed_migrations_hot; + u64 nr_forced_migrations; + + u64 nr_wakeups; + u64 nr_wakeups_sync; + u64 nr_wakeups_migrate; + u64 nr_wakeups_local; + u64 nr_wakeups_remote; + u64 nr_wakeups_affine; + u64 nr_wakeups_affine_attempts; + u64 nr_wakeups_passive; + u64 nr_wakeups_idle; +#endif +}; + +struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + unsigned long runnable_weight; + struct rb_node run_node; + struct list_head group_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; + + u64 nr_migrations; + + struct sched_statistics statistics; + +#ifdef CONFIG_FAIR_GROUP_SCHED + int depth; + struct sched_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct cfs_rq *cfs_rq; + /* rq "owned" by this entity/group: */ + struct cfs_rq *my_q; +#endif + +#ifdef CONFIG_SMP + /* + * Per entity load average tracking. + * + * Put into separate cache line so it does not + * collide with read-mostly values above. + */ + struct sched_avg avg; +#endif +}; + +struct sched_rt_entity { + struct list_head run_list; + unsigned long timeout; + unsigned long watchdog_stamp; + unsigned int time_slice; + unsigned short on_rq; + unsigned short on_list; + + struct sched_rt_entity *back; +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct rt_rq *rt_rq; + /* rq "owned" by this entity/group: */ + struct rt_rq *my_q; +#endif +} __randomize_layout; + +struct sched_dl_entity { + struct rb_node rb_node; + + /* + * Original scheduling parameters. Copied here from sched_attr + * during sched_setattr(), they will remain the same until + * the next sched_setattr(). + */ + u64 dl_runtime; /* Maximum runtime for each instance */ + u64 dl_deadline; /* Relative deadline of each instance */ + u64 dl_period; /* Separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_period */ + u64 dl_density; /* dl_runtime / dl_deadline */ + + /* + * Actual scheduling parameters. Initialized with the values above, + * they are continously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. + */ + s64 runtime; /* Remaining runtime for this instance */ + u64 deadline; /* Absolute deadline for this instance */ + unsigned int flags; /* Specifying the scheduler behaviour */ + + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_boosted tells if we are boosted due to DI. If so we are + * outside bandwidth enforcement mechanism (but only until we + * exit the critical section); + * + * @dl_yielded tells if task gave up the CPU before consuming + * all its available runtime during the last job. + * + * @dl_non_contending tells if the task is inactive while still + * contributing to the active utilization. In other words, it + * indicates if the inactive timer has been armed and its handler + * has not been executed yet. This flag is useful to avoid race + * conditions between the inactive timer handler and the wakeup + * code. + * + * @dl_overrun tells if the task asked to be informed about runtime + * overruns. + */ + unsigned int dl_throttled : 1; + unsigned int dl_boosted : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; + unsigned int dl_overrun : 1; + + /* + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. + */ + struct hrtimer dl_timer; + + /* + * Inactive timer, responsible for decreasing the active utilization + * at the "0-lag time". When a -deadline task blocks, it contributes + * to GRUB's active utilization until the "0-lag time", hence a + * timer is needed to decrease the active utilization at the correct + * time. + */ + struct hrtimer inactive_timer; +}; + +union rcu_special { + struct { + u8 blocked; + u8 need_qs; + u8 exp_need_qs; + + /* Otherwise the compiler can store garbage here: */ + u8 pad; + } b; /* Bits. */ + u32 s; /* Set of bits. */ +}; + +enum perf_event_task_context { + perf_invalid_context = -1, + perf_hw_context = 0, + perf_sw_context, + perf_nr_task_contexts, +}; + +struct wake_q_node { + struct wake_q_node *next; +}; + +struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. + */ + struct thread_info thread_info; +#endif + /* -1 unrunnable, 0 runnable, >0 stopped: */ + volatile long state; + + /* + * This begins the randomizable portion of task_struct. Only + * scheduling-critical items should be added above here. + */ + randomized_struct_fields_start + + void *stack; + atomic_t usage; + /* Per task flags (PF_*), defined further below: */ + unsigned int flags; + unsigned int ptrace; + +#ifdef CONFIG_SMP + struct llist_node wake_entry; + int on_cpu; +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +#endif + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can + * push tasks around a CPU where each wakeup moves to the next one. + * Tracking a recently used CPU allows a quick search for a recently + * used CPU that may be idle. + */ + int recent_used_cpu; + int wake_cpu; +#endif + int on_rq; + + int prio; + int static_prio; + int normal_prio; + unsigned int rt_priority; + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; +#ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; +#endif + struct sched_dl_entity dl; + +#ifdef CONFIG_PREEMPT_NOTIFIERS + /* List of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; +#endif + +#ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; +#endif + + unsigned int policy; + int nr_cpus_allowed; + cpumask_t cpus_allowed; + +#ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; + union rcu_special rcu_read_unlock_special; + struct list_head rcu_node_entry; + struct rcu_node *rcu_blocked_node; +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + +#ifdef CONFIG_TASKS_RCU + unsigned long rcu_tasks_nvcsw; + u8 rcu_tasks_holdout; + u8 rcu_tasks_idx; + int rcu_tasks_idle_cpu; + struct list_head rcu_tasks_holdout_list; +#endif /* #ifdef CONFIG_TASKS_RCU */ + + struct sched_info sched_info; + + struct list_head tasks; +#ifdef CONFIG_SMP + struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; +#endif + + struct mm_struct *mm; + struct mm_struct *active_mm; + + /* Per-thread vma caching: */ + struct vmacache vmacache; + +#ifdef SPLIT_RSS_COUNTING + struct task_rss_stat rss_stat; +#endif + int exit_state; + int exit_code; + int exit_signal; + /* The signal sent when the parent dies: */ + int pdeath_signal; + /* JOBCTL_*, siglock protected: */ + unsigned long jobctl; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; + + /* Scheduler bits, serialized by scheduler locks: */ + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; + unsigned sched_remote_wakeup:1; + /* Force alignment to the next boundary: */ + unsigned :0; + + /* Unserialized, strictly 'current' */ + + /* Bit to tell LSMs we're in execve(): */ + unsigned in_execve:1; + unsigned in_iowait:1; +#ifndef TIF_RESTORE_SIGMASK + unsigned restore_sigmask:1; +#endif +#ifdef CONFIG_MEMCG + unsigned in_user_fault:1; +#ifdef CONFIG_MEMCG_KMEM + unsigned memcg_kmem_skip_account:1; +#endif +#endif +#ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; +#endif +#ifdef CONFIG_CGROUPS + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; +#endif +#ifdef CONFIG_BLK_CGROUP + /* to be used once the psi infrastructure lands upstream. */ + unsigned use_memdelay:1; +#endif + + unsigned long atomic_flags; /* Flags requiring atomic access. */ + + struct restart_block restart_block; + + pid_t pid; + pid_t tgid; + +#ifdef CONFIG_STACKPROTECTOR + /* Canary value for the -fstack-protector GCC feature: */ + unsigned long stack_canary; +#endif + /* + * Pointers to the (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->real_parent->pid) + */ + + /* Real parent process: */ + struct task_struct __rcu *real_parent; + + /* Recipient of SIGCHLD, wait4() reports: */ + struct task_struct __rcu *parent; + + /* + * Children/sibling form the list of natural children: + */ + struct list_head children; + struct list_head sibling; + struct task_struct *group_leader; + + /* + * 'ptraced' is the list of tasks this task is using ptrace() on. + * + * This includes both natural children and PTRACE_ATTACH targets. + * 'ptrace_entry' is this task's link on the p->parent->ptraced list. + */ + struct list_head ptraced; + struct list_head ptrace_entry; + + /* PID/PID hash table linkage. */ + struct pid *thread_pid; + struct hlist_node pid_links[PIDTYPE_MAX]; + struct list_head thread_group; + struct list_head thread_node; + + struct completion *vfork_done; + + /* CLONE_CHILD_SETTID: */ + int __user *set_child_tid; + + /* CLONE_CHILD_CLEARTID: */ + int __user *clear_child_tid; + + u64 utime; + u64 stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; +#endif + u64 gtime; + struct prev_cputime prev_cputime; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + struct vtime vtime; +#endif + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + /* Context switch counts: */ + unsigned long nvcsw; + unsigned long nivcsw; + + /* Monotonic time in nsecs: */ + u64 start_time; + + /* Boot based time in nsecs: */ + u64 real_start_time; + + /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ + unsigned long min_flt; + unsigned long maj_flt; + +#ifdef CONFIG_POSIX_TIMERS + struct task_cputime cputime_expires; + struct list_head cpu_timers[3]; +#endif + + /* Process credentials: */ + + /* Tracer's credentials at attach: */ + const struct cred __rcu *ptracer_cred; + + /* Objective and real subjective task credentials (COW): */ + const struct cred __rcu *real_cred; + + /* Effective (overridable) subjective task credentials (COW): */ + const struct cred __rcu *cred; + + /* + * executable name, excluding path. + * + * - normally initialized setup_new_exec() + * - access it with [gs]et_task_comm() + * - lock it with task_lock() + */ + char comm[TASK_COMM_LEN]; + + struct nameidata *nameidata; + +#ifdef CONFIG_SYSVIPC + struct sysv_sem sysvsem; + struct sysv_shm sysvshm; +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + unsigned long last_switch_count; + unsigned long last_switch_time; +#endif + /* Filesystem information: */ + struct fs_struct *fs; + + /* Open file information: */ + struct files_struct *files; + + /* Namespaces: */ + struct nsproxy *nsproxy; + + /* Signal handlers: */ + struct signal_struct *signal; + struct sighand_struct *sighand; + sigset_t blocked; + sigset_t real_blocked; + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; + + struct callback_head *task_works; + + struct audit_context *audit_context; +#ifdef CONFIG_AUDITSYSCALL + kuid_t loginuid; + unsigned int sessionid; +#endif + struct seccomp seccomp; + + /* Thread group tracking: */ + u64 parent_exec_id; + u64 self_exec_id; + + /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ + spinlock_t alloc_lock; + + /* Protection of the PI data structures: */ + raw_spinlock_t pi_lock; + + struct wake_q_node wake_q; + +#ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task: */ + struct rb_root_cached pi_waiters; + /* Updated under owner's pi_lock and rq lock */ + struct task_struct *pi_top_task; + /* Deadlock detection and priority inheritance handling: */ + struct rt_mutex_waiter *pi_blocked_on; +#endif + +#ifdef CONFIG_DEBUG_MUTEXES + /* Mutex deadlock detection: */ + struct mutex_waiter *blocked_on; +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + unsigned int irq_events; + unsigned long hardirq_enable_ip; + unsigned long hardirq_disable_ip; + unsigned int hardirq_enable_event; + unsigned int hardirq_disable_event; + int hardirqs_enabled; + int hardirq_context; + unsigned long softirq_disable_ip; + unsigned long softirq_enable_ip; + unsigned int softirq_disable_event; + unsigned int softirq_enable_event; + int softirqs_enabled; + int softirq_context; +#endif + +#ifdef CONFIG_LOCKDEP +# define MAX_LOCK_DEPTH 48UL + u64 curr_chain_key; + int lockdep_depth; + unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; +#endif + +#ifdef CONFIG_UBSAN + unsigned int in_ubsan; +#endif + + /* Journalling filesystem info: */ + void *journal_info; + + /* Stacked block device info: */ + struct bio_list *bio_list; + +#ifdef CONFIG_BLOCK + /* Stack plugging: */ + struct blk_plug *plug; +#endif + + /* VM state: */ + struct reclaim_state *reclaim_state; + + struct backing_dev_info *backing_dev_info; + + struct io_context *io_context; + + /* Ptrace state: */ + unsigned long ptrace_message; + siginfo_t *last_siginfo; + + struct task_io_accounting ioac; +#ifdef CONFIG_TASK_XACCT + /* Accumulated RSS usage: */ + u64 acct_rss_mem1; + /* Accumulated virtual memory usage: */ + u64 acct_vm_mem1; + /* stime + utime since last update: */ + u64 acct_timexpd; +#endif +#ifdef CONFIG_CPUSETS + /* Protected by ->alloc_lock: */ + nodemask_t mems_allowed; + /* Seqence number to catch updates: */ + seqcount_t mems_allowed_seq; + int cpuset_mem_spread_rotor; + int cpuset_slab_spread_rotor; +#endif +#ifdef CONFIG_CGROUPS + /* Control Group info protected by css_set_lock: */ + struct css_set __rcu *cgroups; + /* cg_list protected by css_set_lock and tsk->alloc_lock: */ + struct list_head cg_list; +#endif +#ifdef CONFIG_INTEL_RDT + u32 closid; + u32 rmid; +#endif +#ifdef CONFIG_FUTEX + struct robust_list_head __user *robust_list; +#ifdef CONFIG_COMPAT + struct compat_robust_list_head __user *compat_robust_list; +#endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; + struct mutex futex_exit_mutex; + unsigned int futex_state; +#endif +#ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; + struct mutex perf_event_mutex; + struct list_head perf_event_list; +#endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif +#ifdef CONFIG_NUMA + /* Protected by alloc_lock: */ + struct mempolicy *mempolicy; + short il_prev; + short pref_node_fork; +#endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + unsigned int numa_scan_period; + unsigned int numa_scan_period_max; + int numa_preferred_nid; + unsigned long numa_migrate_retry; + /* Migration stamp: */ + u64 node_stamp; + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; + struct callback_head numa_work; + + /* + * This pointer is only modified for current in syscall and + * pagefault context (and for tasks being destroyed), so it can be read + * from any of the following contexts: + * - RCU read-side critical section + * - current->numa_group from everywhere + * - task's runqueue locked, task not running + */ + struct numa_group __rcu *numa_group; + + /* + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. + */ + unsigned long *numa_faults; + unsigned long total_numa_faults; + + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[3]; + + unsigned long numa_pages_migrated; +#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; + u32 rseq_sig; + /* + * RmW on rseq_event_mask must be performed atomically + * with respect to preemption. + */ + unsigned long rseq_event_mask; +#endif + + struct tlbflush_unmap_batch tlb_ubc; + + struct rcu_head rcu; + + /* Cache last used pipe for splice(): */ + struct pipe_inode_info *splice_pipe; + + struct page_frag task_frag; + +#ifdef CONFIG_TASK_DELAY_ACCT + struct task_delay_info *delays; +#endif + +#ifdef CONFIG_FAULT_INJECTION + int make_it_fail; + unsigned int fail_nth; +#endif + /* + * When (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for a dirty throttling pause: + */ + int nr_dirtied; + int nr_dirtied_pause; + /* Start of a write-and-pause period: */ + unsigned long dirty_paused_when; + +#ifdef CONFIG_LATENCYTOP + int latency_record_count; + struct latency_record latency_record[LT_SAVECOUNT]; +#endif + /* + * Time slack values; these are used to round up poll() and + * select() etc timeout values. These are in nanoseconds. + */ + u64 timer_slack_ns; + u64 default_timer_slack_ns; + +#ifdef CONFIG_KASAN + unsigned int kasan_depth; +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Index of current stored address in ret_stack: */ + int curr_ret_stack; + int curr_ret_depth; + + /* Stack of return addresses for return function tracing: */ + struct ftrace_ret_stack *ret_stack; + + /* Timestamp for last schedule: */ + unsigned long long ftrace_timestamp; + + /* + * Number of functions that haven't been traced + * because of depth overrun: + */ + atomic_t trace_overrun; + + /* Pause tracing: */ + atomic_t tracing_graph_pause; +#endif + +#ifdef CONFIG_TRACING + /* State flags for use by tracers: */ + unsigned long trace; + + /* Bitmask and counter of trace recursion: */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ + +#ifdef CONFIG_KCOV + /* Coverage collection mode enabled for this task (0 if disabled): */ + unsigned int kcov_mode; + + /* Size of the kcov_area: */ + unsigned int kcov_size; + + /* Buffer for coverage collection: */ + void *kcov_area; + + /* KCOV descriptor wired with this task or NULL: */ + struct kcov *kcov; +#endif + +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg_in_oom; + gfp_t memcg_oom_gfp_mask; + int memcg_oom_order; + + /* Number of pages to reclaim on returning to userland: */ + unsigned int memcg_nr_pages_over_high; + + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *active_memcg; +#endif + +#ifdef CONFIG_BLK_CGROUP + struct request_queue *throttle_queue; +#endif + +#ifdef CONFIG_UPROBES + struct uprobe_task *utask; +#endif +#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) + unsigned int sequential_io; + unsigned int sequential_io_avg; +#endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +#endif + int pagefault_disabled; +#ifdef CONFIG_MMU + struct task_struct *oom_reaper_list; +#endif +#ifdef CONFIG_VMAP_STACK + struct vm_struct *stack_vm_area; +#endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* A live task holds one reference: */ + atomic_t stack_refcount; +#endif +#ifdef CONFIG_LIVEPATCH + int patch_state; +#endif +#ifdef CONFIG_SECURITY + /* Used by LSM modules for access restriction: */ + void *security; +#endif + + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. + */ + randomized_struct_fields_end + + /* CPU-specific state of this task: */ + struct thread_struct thread; + + /* + * WARNING: on x86, 'thread_struct' contains a variable-sized + * structure. It *MUST* be at the end of 'task_struct'. + * + * Do not put anything below here! + */ +}; + +static inline struct pid *task_pid(struct task_struct *task) +{ + return task->thread_pid; +} + +/* + * the helpers to get the task's different pids as they are seen + * from various namespaces + * + * task_xid_nr() : global id, i.e. the id seen from the init namespace; + * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of + * current. + * task_xid_nr_ns() : id seen from the ns specified; + * + * see also pid_nr() etc in include/linux/pid.h + */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); + +static inline pid_t task_pid_nr(struct task_struct *tsk) +{ + return tsk->pid; +} + +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} + +static inline pid_t task_pid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); +} + + +static inline pid_t task_tgid_nr(struct task_struct *tsk) +{ + return tsk->tgid; +} + +/** + * pid_alive - check that a task structure is not stale + * @p: Task structure to be checked. + * + * Test if a process is not yet dead (at most zombie state) + * If pid_alive fails, then pointers within the task structure + * can be stale and must not be dereferenced. + * + * Return: 1 if the process is alive. 0 otherwise. + */ +static inline int pid_alive(const struct task_struct *p) +{ + return p->thread_pid != NULL; +} + +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} + +static inline pid_t task_pgrp_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); +} + + +static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} + +static inline pid_t task_session_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); +} + +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + +/* Obsolete, do not use: */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + +static inline unsigned int task_state_index(struct task_struct *tsk) +{ + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + + return fls(state); +} + +static inline char task_index_to_char(unsigned int state) +{ + static const char state_char[] = "RSDTtXZPI"; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + + return state_char[state]; +} + +static inline char task_state_to_char(struct task_struct *tsk) +{ + return task_index_to_char(task_state_index(tsk)); +} + +/** + * is_global_init - check if a task structure is init. Since init + * is free to have sub-threads we need to check tgid. + * @tsk: Task structure to be checked. + * + * Check if a task structure is the first user space task the kernel created. + * + * Return: 1 if the task structure is init. 0 otherwise. + */ +static inline int is_global_init(struct task_struct *tsk) +{ + return task_tgid_nr(tsk) == 1; +} + +extern struct pid *cad_pid; + +/* + * Per process flags + */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_EXITING 0x00000004 /* Getting shut down */ +#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ +#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ +#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* Dumped core */ +#define PF_SIGNALED 0x00000400 /* Killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ +#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ +#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ +#define PF_FROZEN 0x00010000 /* Frozen for system suspend */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ +#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ +#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ +#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ +#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ +#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ + +/* + * Only the _current_ task can read/write to tsk->flags, but other + * tasks can access tsk->flags in readonly mode for example + * with tsk_used_math (like during threaded core dumping). + * There is however an exception to this rule during ptrace + * or during fork: the ptracer task is allowed to write to the + * child->flags of its traced child (same goes for fork, the parent + * can write to the child->flags), because we're guaranteed the + * child is not running and in turn not changing child->flags + * at the same time the parent does it. + */ +#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) +#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) +#define clear_used_math() clear_stopped_child_used_math(current) +#define set_used_math() set_stopped_child_used_math(current) + +#define conditional_stopped_child_used_math(condition, child) \ + do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) + +#define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) + +#define copy_to_stopped_child_used_math(child) \ + do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) + +/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ +#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) +#define used_math() tsk_used_math(current) + +static __always_inline bool is_percpu_thread(void) +{ +#ifdef CONFIG_SMP + return (current->flags & PF_NO_SETAFFINITY) && + (current->nr_cpus_allowed == 1); +#else + return true; +#endif +} + +/* Per-process atomic flags. */ +#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ +#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ +#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ +#define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ +#define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ +#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ +#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ + +#define TASK_PFA_TEST(name, func) \ + static inline bool task_##func(struct task_struct *p) \ + { return test_bit(PFA_##name, &p->atomic_flags); } + +#define TASK_PFA_SET(name, func) \ + static inline void task_set_##func(struct task_struct *p) \ + { set_bit(PFA_##name, &p->atomic_flags); } + +#define TASK_PFA_CLEAR(name, func) \ + static inline void task_clear_##func(struct task_struct *p) \ + { clear_bit(PFA_##name, &p->atomic_flags); } + +TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) +TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) + +TASK_PFA_TEST(SPREAD_PAGE, spread_page) +TASK_PFA_SET(SPREAD_PAGE, spread_page) +TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) + +TASK_PFA_TEST(SPREAD_SLAB, spread_slab) +TASK_PFA_SET(SPREAD_SLAB, spread_slab) +TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) + +TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) + +TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) +TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) + +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) + +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) +TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) + +static inline void +current_restore_flags(unsigned long orig_flags, unsigned long flags) +{ + current->flags &= ~flags; + current->flags |= orig_flags & flags; +} + +extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); +#ifdef CONFIG_SMP +extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +#else +static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ +} +static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + if (!cpumask_test_cpu(0, new_mask)) + return -EINVAL; + return 0; +} +#endif + +#ifndef cpu_relax_yield +#define cpu_relax_yield() cpu_relax() +#endif + +extern int yield_to(struct task_struct *p, bool preempt); +extern void set_user_nice(struct task_struct *p, long nice); +extern int task_prio(const struct task_struct *p); + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. + */ +static inline int task_nice(const struct task_struct *p) +{ + return PRIO_TO_NICE((p)->static_prio); +} + +extern int can_nice(const struct task_struct *p, const int nice); +extern int task_curr(const struct task_struct *p); +extern int idle_cpu(int cpu); +extern int available_idle_cpu(int cpu); +extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); +extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_setattr(struct task_struct *, const struct sched_attr *); +extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); +extern struct task_struct *idle_task(int cpu); + +/** + * is_idle_task - is the specified task an idle task? + * @p: the task in question. + * + * Return: 1 if @p is an idle task. 0 otherwise. + */ +static inline bool is_idle_task(const struct task_struct *p) +{ + return !!(p->flags & PF_IDLE); +} + +extern struct task_struct *curr_task(int cpu); +extern void ia64_set_curr_task(int cpu, struct task_struct *p); + +void yield(void); + +union thread_union { +#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK + struct task_struct task; +#endif +#ifndef CONFIG_THREAD_INFO_IN_TASK + struct thread_info thread_info; +#endif + unsigned long stack[THREAD_SIZE/sizeof(long)]; +}; + +#ifndef CONFIG_THREAD_INFO_IN_TASK +extern struct thread_info init_thread_info; +#endif + +extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; + +#ifdef CONFIG_THREAD_INFO_IN_TASK +static inline struct thread_info *task_thread_info(struct task_struct *task) +{ + return &task->thread_info; +} +#elif !defined(__HAVE_THREAD_FUNCTIONS) +# define task_thread_info(task) ((struct thread_info *)(task)->stack) +#endif + +/* + * find a task by one of its numerical ids + * + * find_task_by_pid_ns(): + * finds a task by its pid in the specified namespace + * find_task_by_vpid(): + * finds a task by its virtual pid + * + * see also find_vpid() etc in include/linux/pid.h + */ + +extern struct task_struct *find_task_by_vpid(pid_t nr); +extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); + +/* + * find a task by its virtual pid and get the task struct + */ +extern struct task_struct *find_get_task_by_vpid(pid_t nr); + +extern int wake_up_state(struct task_struct *tsk, unsigned int state); +extern int wake_up_process(struct task_struct *tsk); +extern void wake_up_new_task(struct task_struct *tsk); + +#ifdef CONFIG_SMP +extern void kick_process(struct task_struct *tsk); +#else +static inline void kick_process(struct task_struct *tsk) { } +#endif + +extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); + +static inline void set_task_comm(struct task_struct *tsk, const char *from) +{ + __set_task_comm(tsk, from, false); +} + +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) + +#ifdef CONFIG_SMP +void scheduler_ipi(void); +extern unsigned long wait_task_inactive(struct task_struct *, long match_state); +#else +static inline void scheduler_ipi(void) { } +static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + return 1; +} +#endif + +/* + * Set thread flags in other task's structures. + * See asm/thread_info.h for TIF_xxxx flags available: + */ +static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + set_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + clear_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, + bool value) +{ + update_ti_thread_flag(task_thread_info(tsk), flag, value); +} + +static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline void set_tsk_need_resched(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline void clear_tsk_need_resched(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline int test_tsk_need_resched(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); +} + +/* + * cond_resched() and cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. The return + * value indicates whether a reschedule was done in fact. + * cond_resched_lock() will drop the spinlock before scheduling, + */ +#ifndef CONFIG_PREEMPT +extern int _cond_resched(void); +#else +static inline int _cond_resched(void) { return 0; } +#endif + +#define cond_resched() ({ \ + ___might_sleep(__FILE__, __LINE__, 0); \ + _cond_resched(); \ +}) + +extern int __cond_resched_lock(spinlock_t *lock); + +#define cond_resched_lock(lock) ({ \ + ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ + __cond_resched_lock(lock); \ +}) + +static inline void cond_resched_rcu(void) +{ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); +#endif +} + +/* + * Does a critical section need to be broken due to another + * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * but a general need for low latency) + */ +static inline int spin_needbreak(spinlock_t *lock) +{ +#ifdef CONFIG_PREEMPT + return spin_is_contended(lock); +#else + return 0; +#endif +} + +static __always_inline bool need_resched(void) +{ + return unlikely(tif_need_resched()); +} + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. + */ +#ifdef CONFIG_SMP + +static inline unsigned int task_cpu(const struct task_struct *p) +{ +#ifdef CONFIG_THREAD_INFO_IN_TASK + return READ_ONCE(p->cpu); +#else + return READ_ONCE(task_thread_info(p)->cpu); +#endif +} + +extern void set_task_cpu(struct task_struct *p, unsigned int cpu); + +#else + +static inline unsigned int task_cpu(const struct task_struct *p) +{ + return 0; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + +#endif /* CONFIG_SMP */ + +/* + * In order to reduce various lock holder preemption latencies provide an + * interface to see if a vCPU is currently running or not. + * + * This allows us to terminate optimistic spin loops and block, analogous to + * the native optimistic spin heuristic of testing if the lock owner task is + * running or not. + */ +#ifndef vcpu_is_preempted +# define vcpu_is_preempted(cpu) false +#endif + +extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); +extern long sched_getaffinity(pid_t pid, struct cpumask *mask); + +#ifndef TASK_SIZE_OF +#define TASK_SIZE_OF(tsk) TASK_SIZE +#endif + +#ifdef CONFIG_RSEQ + +/* + * Map the event mask on the user-space ABI enum rseq_cs_flags + * for direct mask checks. + */ +enum rseq_event_mask_bits { + RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, + RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, + RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, +}; + +enum rseq_event_mask { + RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), + RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), + RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), +}; + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); + +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(ksig, regs); +} + +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ + preempt_disable(); + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + preempt_enable(); + rseq_handle_notify_resume(ksig, regs); +} + +/* rseq_preempt() requires preemption to be disabled. */ +static inline void rseq_preempt(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* rseq_migrate() requires preemption to be disabled. */ +static inline void rseq_migrate(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Unregister rseq for a clone with CLONE_VM set. + */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_VM) { + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; + } else { + t->rseq = current->rseq; + t->rseq_len = current->rseq_len; + t->rseq_sig = current->rseq_sig; + t->rseq_event_mask = current->rseq_event_mask; + } +} + +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; +} + +#else + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_preempt(struct task_struct *t) +{ +} +static inline void rseq_migrate(struct task_struct *t) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} + +#endif + +#ifdef CONFIG_DEBUG_RSEQ + +void rseq_syscall(struct pt_regs *regs); + +#else + +static inline void rseq_syscall(struct pt_regs *regs) +{ +} + +#endif + +#endif diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h new file mode 100644 index 000000000..704391cc1 --- /dev/null +++ b/include/linux/sched/autogroup.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_AUTOGROUP_H +#define _LINUX_SCHED_AUTOGROUP_H + +struct signal_struct; +struct task_struct; +struct task_group; +struct seq_file; + +#ifdef CONFIG_SCHED_AUTOGROUP +extern void sched_autogroup_create_attach(struct task_struct *p); +extern void sched_autogroup_detach(struct task_struct *p); +extern void sched_autogroup_fork(struct signal_struct *sig); +extern void sched_autogroup_exit(struct signal_struct *sig); +extern void sched_autogroup_exit_task(struct task_struct *p); +#ifdef CONFIG_PROC_FS +extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); +#endif +#else +static inline void sched_autogroup_create_attach(struct task_struct *p) { } +static inline void sched_autogroup_detach(struct task_struct *p) { } +static inline void sched_autogroup_fork(struct signal_struct *sig) { } +static inline void sched_autogroup_exit(struct signal_struct *sig) { } +static inline void sched_autogroup_exit_task(struct task_struct *p) { } +#endif + +#ifdef CONFIG_CGROUP_SCHED +extern struct task_group root_task_group; +#endif /* CONFIG_CGROUP_SCHED */ + +#endif /* _LINUX_SCHED_AUTOGROUP_H */ diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h new file mode 100644 index 000000000..867d58831 --- /dev/null +++ b/include/linux/sched/clock.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CLOCK_H +#define _LINUX_SCHED_CLOCK_H + +#include <linux/smp.h> + +/* + * Do not use outside of architecture code which knows its limitations. + * + * sched_clock() has no promise of monotonicity or bounded drift between + * CPUs, use (which you should not) requires disabling IRQs. + * + * Please use one of the three interfaces below. + */ +extern unsigned long long notrace sched_clock(void); + +/* + * See the comment in kernel/sched/clock.c + */ +extern u64 running_clock(void); +extern u64 sched_clock_cpu(int cpu); + + +extern void sched_clock_init(void); + +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_tick(void) +{ +} + +static inline void clear_sched_clock_stable(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(void) +{ +} + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static inline u64 local_clock(void) +{ + return sched_clock(); +} +#else +extern int sched_clock_stable(void); +extern void clear_sched_clock_stable(void); + +/* + * When sched_clock_stable(), __sched_clock_offset provides the offset + * between local_clock() and sched_clock(). + */ +extern u64 __sched_clock_offset; + +extern void sched_clock_tick(void); +extern void sched_clock_tick_stable(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(void); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +static inline u64 local_clock(void) +{ + return sched_clock_cpu(raw_smp_processor_id()); +} +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * An i/f to runtime opt-in for irq time accounting based off of sched_clock. + * The reason for this explicit opt-in is not to have perf penalty with + * slow sched_clocks. + */ +extern void enable_sched_clock_irqtime(void); +extern void disable_sched_clock_irqtime(void); +#else +static inline void enable_sched_clock_irqtime(void) {} +static inline void disable_sched_clock_irqtime(void) {} +#endif + +#endif /* _LINUX_SCHED_CLOCK_H */ diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h new file mode 100644 index 000000000..dfd82eab2 --- /dev/null +++ b/include/linux/sched/coredump.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_COREDUMP_H +#define _LINUX_SCHED_COREDUMP_H + +#include <linux/mm_types.h> + +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + +/* mm flags */ + +/* for SUID_DUMP_* above */ +#define MMF_DUMPABLE_BITS 2 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) + +extern void set_dumpable(struct mm_struct *mm, int value); +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ +static inline int __get_dumpable(unsigned long mm_flags) +{ + return mm_flags & MMF_DUMPABLE_MASK; +} + +static inline int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + +/* coredump filter bits */ +#define MMF_DUMP_ANON_PRIVATE 2 +#define MMF_DUMP_ANON_SHARED 3 +#define MMF_DUMP_MAPPED_PRIVATE 4 +#define MMF_DUMP_MAPPED_SHARED 5 +#define MMF_DUMP_ELF_HEADERS 6 +#define MMF_DUMP_HUGETLB_PRIVATE 7 +#define MMF_DUMP_HUGETLB_SHARED 8 +#define MMF_DUMP_DAX_PRIVATE 9 +#define MMF_DUMP_DAX_SHARED 10 + +#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS +#define MMF_DUMP_FILTER_BITS 9 +#define MMF_DUMP_FILTER_MASK \ + (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) +#define MMF_DUMP_FILTER_DEFAULT \ + ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ + (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + +#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS +# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +#else +# define MMF_DUMP_MASK_DEFAULT_ELF 0 +#endif + /* leave room for more dump flags */ +#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ + +#define MMF_HAS_UPROBES 19 /* has uprobes */ +#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ +#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ +#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ +#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK) + +#endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h new file mode 100644 index 000000000..a4530d782 --- /dev/null +++ b/include/linux/sched/cpufreq.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CPUFREQ_H +#define _LINUX_SCHED_CPUFREQ_H + +#include <linux/types.h> + +/* + * Interface between cpufreq drivers and the scheduler: + */ + +#define SCHED_CPUFREQ_IOWAIT (1U << 0) +#define SCHED_CPUFREQ_MIGRATION (1U << 1) + +#ifdef CONFIG_CPU_FREQ +struct cpufreq_policy; + +struct update_util_data { + void (*func)(struct update_util_data *data, u64 time, unsigned int flags); +}; + +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)); +void cpufreq_remove_update_util_hook(int cpu); +bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy); +#endif /* CONFIG_CPU_FREQ */ + +#endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h new file mode 100644 index 000000000..53f883f5a --- /dev/null +++ b/include/linux/sched/cputime.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CPUTIME_H +#define _LINUX_SCHED_CPUTIME_H + +#include <linux/sched/signal.h> + +/* + * cputime accounting APIs: + */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +#include <asm/cputime.h> + +#ifndef cputime_to_nsecs +# define cputime_to_nsecs(__ct) \ + (cputime_to_usecs(__ct) * NSEC_PER_USEC) +#endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern void task_cputime(struct task_struct *t, + u64 *utime, u64 *stime); +extern u64 task_gtime(struct task_struct *t); +#else +static inline void task_cputime(struct task_struct *t, + u64 *utime, u64 *stime) +{ + *utime = t->utime; + *stime = t->stime; +} + +static inline u64 task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif + +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME +static inline void task_cputime_scaled(struct task_struct *t, + u64 *utimescaled, + u64 *stimescaled) +{ + *utimescaled = t->utimescaled; + *stimescaled = t->stimescaled; +} +#else +static inline void task_cputime_scaled(struct task_struct *t, + u64 *utimescaled, + u64 *stimescaled) +{ + task_cputime(t, utimescaled, stimescaled); +} +#endif + +extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); +extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st); + +/* + * Thread group CPU time accounting. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); + + +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running + * + * @tsk: Pointer to target task. + */ +#ifdef CONFIG_POSIX_TIMERS +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + /* Check if cputimer isn't running. This is accessed without locking. */ + if (!READ_ONCE(cputimer->running)) + return NULL; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return NULL; + + return cputimer; +} +#else +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + return NULL; +} +#endif + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + u64 cputime) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(cputime, &cputimer->cputime_atomic.utime); +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + u64 cputime) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(cputime, &cputimer->cputime_atomic.stime); +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); +} + +static inline void prev_cputime_init(struct prev_cputime *prev) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + prev->utime = prev->stime = 0; + raw_spin_lock_init(&prev->lock); +#endif +} + +extern unsigned long long +task_sched_runtime(struct task_struct *task); + +#endif /* _LINUX_SCHED_CPUTIME_H */ diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h new file mode 100644 index 000000000..0cb034331 --- /dev/null +++ b/include/linux/sched/deadline.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and + * NORMAL/BATCH tasks. + */ + +#define MAX_DL_PRIO 0 + +static inline int dl_prio(int prio) +{ + if (unlikely(prio < MAX_DL_PRIO)) + return 1; + return 0; +} + +static inline int dl_task(struct task_struct *p) +{ + return dl_prio(p->prio); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h new file mode 100644 index 000000000..95fb9e025 --- /dev/null +++ b/include/linux/sched/debug.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEBUG_H +#define _LINUX_SCHED_DEBUG_H + +/* + * Various scheduler/task debugging interfaces: + */ + +struct task_struct; +struct pid_namespace; + +extern void dump_cpu_task(int cpu); + +/* + * Only dump TASK_* tasks. (0 for all tasks) + */ +extern void show_state_filter(unsigned long state_filter); + +static inline void show_state(void) +{ + show_state_filter(0); +} + +struct pt_regs; + +extern void show_regs(struct pt_regs *); + +/* + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current + * task), SP is the stack pointer of the first frame that should be shown in the back + * trace (or NULL if the entire call-chain of the task should be shown). + */ +extern void show_stack(struct task_struct *task, unsigned long *sp); + +extern void sched_show_task(struct task_struct *p); + +#ifdef CONFIG_SCHED_DEBUG +struct seq_file; +extern void proc_sched_show_task(struct task_struct *p, + struct pid_namespace *ns, struct seq_file *m); +extern void proc_sched_set_task(struct task_struct *p); +#endif + +/* Attach to any functions which should be ignored in wchan output. */ +#define __sched __attribute__((__section__(".sched.text"))) + +/* Linker adds these: start and end of __sched functions */ +extern char __sched_text_start[], __sched_text_end[]; + +/* Is this address in the __sched functions? */ +extern int in_sched_functions(unsigned long addr); + +#endif /* _LINUX_SCHED_DEBUG_H */ diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h new file mode 100644 index 000000000..9a62ffdd2 --- /dev/null +++ b/include/linux/sched/hotplug.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_HOTPLUG_H +#define _LINUX_SCHED_HOTPLUG_H + +/* + * Scheduler interfaces for hotplug CPU support: + */ + +extern int sched_cpu_starting(unsigned int cpu); +extern int sched_cpu_activate(unsigned int cpu); +extern int sched_cpu_deactivate(unsigned int cpu); + +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_dying(unsigned int cpu); +#else +# define sched_cpu_dying NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU +extern void idle_task_exit(void); +#else +static inline void idle_task_exit(void) {} +#endif + +#endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h new file mode 100644 index 000000000..22873d276 --- /dev/null +++ b/include/linux/sched/idle.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_IDLE_H +#define _LINUX_SCHED_IDLE_H + +#include <linux/sched.h> + +enum cpu_idle_type { + CPU_IDLE, + CPU_NOT_IDLE, + CPU_NEWLY_IDLE, + CPU_MAX_IDLE_TYPES +}; + +extern void wake_up_if_idle(int cpu); + +/* + * Idle thread specific functions to determine the need_resched + * polling state. + */ +#ifdef TIF_POLLING_NRFLAG + +static inline void __current_set_polling(void) +{ + set_thread_flag(TIF_POLLING_NRFLAG); +} + +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_curr() + */ + smp_mb__after_atomic(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) +{ + clear_thread_flag(TIF_POLLING_NRFLAG); +} + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_curr() + */ + smp_mb__after_atomic(); + + return unlikely(tif_need_resched()); +} + +#else +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } + +static inline bool __must_check current_set_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +static inline bool __must_check current_clr_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +#endif + +static inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + +#endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/init.h b/include/linux/sched/init.h new file mode 100644 index 000000000..03542575f --- /dev/null +++ b/include/linux/sched/init.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_INIT_H +#define _LINUX_SCHED_INIT_H + +/* + * Scheduler init related prototypes: + */ + +extern void sched_init(void); +extern void sched_init_smp(void); + +#endif /* _LINUX_SCHED_INIT_H */ diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h new file mode 100644 index 000000000..4a6582c27 --- /dev/null +++ b/include/linux/sched/isolation.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_SCHED_ISOLATION_H +#define _LINUX_SCHED_ISOLATION_H + +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/tick.h> + +enum hk_flags { + HK_FLAG_TIMER = 1, + HK_FLAG_RCU = (1 << 1), + HK_FLAG_MISC = (1 << 2), + HK_FLAG_SCHED = (1 << 3), + HK_FLAG_TICK = (1 << 4), + HK_FLAG_DOMAIN = (1 << 5), + HK_FLAG_WQ = (1 << 6), +}; + +#ifdef CONFIG_CPU_ISOLATION +DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); +extern int housekeeping_any_cpu(enum hk_flags flags); +extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); +extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); +extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); +extern void __init housekeeping_init(void); + +#else + +static inline int housekeeping_any_cpu(enum hk_flags flags) +{ + return smp_processor_id(); +} + +static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +{ + return cpu_possible_mask; +} + +static inline void housekeeping_affine(struct task_struct *t, + enum hk_flags flags) { } +static inline void housekeeping_init(void) { } +#endif /* CONFIG_CPU_ISOLATION */ + +static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) +{ +#ifdef CONFIG_CPU_ISOLATION + if (static_branch_unlikely(&housekeeping_overriden)) + return housekeeping_test_cpu(cpu, flags); +#endif + return true; +} + +#endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h new file mode 100644 index 000000000..98228bd48 --- /dev/null +++ b/include/linux/sched/jobctl.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_JOBCTL_H +#define _LINUX_SCHED_JOBCTL_H + +#include <linux/types.h> + +struct task_struct; + +/* + * task->jobctl flags + */ +#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ + +#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ +#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ +#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ +#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ +#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ +#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ +#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ + +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) + +#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) + +extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); +extern void task_clear_jobctl_trapping(struct task_struct *task); +extern void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask); + +#endif /* _LINUX_SCHED_JOBCTL_H */ diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h new file mode 100644 index 000000000..80bc84ba5 --- /dev/null +++ b/include/linux/sched/loadavg.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_LOADAVG_H +#define _LINUX_SCHED_LOADAVG_H + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ +#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ +#define EXP_5 2014 /* 1/exp(5sec/5min) */ +#define EXP_15 2037 /* 1/exp(5sec/15min) */ + +#define CALC_LOAD(load,exp,n) \ + load *= exp; \ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + +extern void calc_global_load(unsigned long ticks); + +#endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h new file mode 100644 index 000000000..ef54f4b3f --- /dev/null +++ b/include/linux/sched/mm.h @@ -0,0 +1,366 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_MM_H +#define _LINUX_SCHED_MM_H + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/sched.h> +#include <linux/mm_types.h> +#include <linux/gfp.h> +#include <linux/sync_core.h> + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct *mm_alloc(void); + +/** + * mmgrab() - Pin a &struct mm_struct. + * @mm: The &struct mm_struct to pin. + * + * Make sure that @mm will not get freed even after the owning task + * exits. This doesn't guarantee that the associated address space + * will still exist later on and mmget_not_zero() has to be used before + * accessing it. + * + * This is a preferred way to to pin @mm for a longer/unbounded amount + * of time. + * + * Use mmdrop() to release the reference acquired by mmgrab(). + * + * See also <Documentation/vm/active_mm.rst> for an in-depth explanation + * of &mm_struct.mm_count vs &mm_struct.mm_users. + */ +static inline void mmgrab(struct mm_struct *mm) +{ + atomic_inc(&mm->mm_count); +} + +extern void __mmdrop(struct mm_struct *mm); + +static inline void mmdrop(struct mm_struct *mm) +{ + /* + * The implicit full barrier implied by atomic_dec_and_test() is + * required by the membarrier system call before returning to + * user-space, after storing to rq->curr. + */ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} + +void mmdrop(struct mm_struct *mm); + +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * It also has to be called when mmgrab() is used in the context of + * the process, but then the mm_count refcount is transferred outside + * the context of the process to run down_write() on that pinned mm. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + +/** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. + * + * Make sure that the address space of the given &struct mm_struct doesn't + * go away. This does not protect against parts of the address space being + * modified or freed, however. + * + * Never use this function to pin this address space for an + * unbounded/indefinite amount of time. + * + * Use mmput() to release the reference acquired by mmget(). + * + * See also <Documentation/vm/active_mm.rst> for an in-depth explanation + * of &mm_struct.mm_count vs &mm_struct.mm_users. + */ +static inline void mmget(struct mm_struct *mm) +{ + atomic_inc(&mm->mm_users); +} + +static inline bool mmget_not_zero(struct mm_struct *mm) +{ + return atomic_inc_not_zero(&mm->mm_users); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. Can + * be called from the atomic context as well + */ +void mmput_async(struct mm_struct *); +#endif + +/* Grab a reference to a task's mm, if it is not already going away */ +extern struct mm_struct *get_task_mm(struct task_struct *task); +/* + * Grab a reference to a task's mm, if it is not already going away + * and ptrace_may_access with the mode parameter passed to it + * succeeds. + */ +extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); +/* Remove the current tasks stale references to the old mm_struct on exit() */ +extern void exit_mm_release(struct task_struct *, struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct on exec() */ +extern void exec_mm_release(struct task_struct *, struct mm_struct *); + +#ifdef CONFIG_MEMCG +extern void mm_update_next_owner(struct mm_struct *mm); +#else +static inline void mm_update_next_owner(struct mm_struct *mm) +{ +} +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_MMU +extern void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack); +extern unsigned long +arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); +extern unsigned long +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +#else +static inline void arch_pick_mmap_layout(struct mm_struct *mm, + struct rlimit *rlim_stack) {} +#endif + +static inline bool in_vfork(struct task_struct *tsk) +{ + bool ret; + + /* + * need RCU to access ->real_parent if CLONE_VM was used along with + * CLONE_PARENT. + * + * We check real_parent->mm == tsk->mm because CLONE_VFORK does not + * imply CLONE_VM + * + * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus + * ->real_parent is not necessarily the task doing vfork(), so in + * theory we can't rely on task_lock() if we want to dereference it. + * + * And in this case we can't trust the real_parent->mm == tsk->mm + * check, it can be false negative. But we do not care, if init or + * another oom-unkillable task does this it should blame itself. + */ + rcu_read_lock(); + ret = tsk->vfork_done && + rcu_dereference(tsk->real_parent)->mm == tsk->mm; + rcu_read_unlock(); + + return ret; +} + +/* + * Applies per-task gfp context to the given allocation flags. + * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS + */ +static inline gfp_t current_gfp_context(gfp_t flags) +{ + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ + if (unlikely(current->flags & PF_MEMALLOC_NOIO)) + flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; + return flags; +} + +#ifdef CONFIG_LOCKDEP +extern void __fs_reclaim_acquire(void); +extern void __fs_reclaim_release(void); +extern void fs_reclaim_acquire(gfp_t gfp_mask); +extern void fs_reclaim_release(gfp_t gfp_mask); +#else +static inline void __fs_reclaim_acquire(void) { } +static inline void __fs_reclaim_release(void) { } +static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } +static inline void fs_reclaim_release(gfp_t gfp_mask) { } +#endif + +/** + * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. + * + * This functions marks the beginning of the GFP_NOIO allocation scope. + * All further allocations will implicitly drop __GFP_IO flag and so + * they are safe for the IO critical section from the allocation recursion + * point of view. Use memalloc_noio_restore to end the scope with flags + * returned by this function. + * + * This function is safe to be used from any context. + */ +static inline unsigned int memalloc_noio_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOIO; + current->flags |= PF_MEMALLOC_NOIO; + return flags; +} + +/** + * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. + * @flags: Flags to restore. + * + * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. + * Always make sure that that the given flags is the return value from the + * pairing memalloc_noio_save call. + */ +static inline void memalloc_noio_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; +} + +/** + * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. + * + * This functions marks the beginning of the GFP_NOFS allocation scope. + * All further allocations will implicitly drop __GFP_FS flag and so + * they are safe for the FS critical section from the allocation recursion + * point of view. Use memalloc_nofs_restore to end the scope with flags + * returned by this function. + * + * This function is safe to be used from any context. + */ +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +/** + * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. + * @flags: Flags to restore. + * + * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. + * Always make sure that that the given flags is the return value from the + * pairing memalloc_nofs_save call. + */ +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} + +#ifdef CONFIG_MEMCG +/** + * memalloc_use_memcg - Starts the remote memcg charging scope. + * @memcg: memcg to charge. + * + * This function marks the beginning of the remote memcg charging scope. All the + * __GFP_ACCOUNT allocations till the end of the scope will be charged to the + * given memcg. + * + * NOTE: This function is not nesting safe. + */ +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ + WARN_ON_ONCE(current->active_memcg); + current->active_memcg = memcg; +} + +/** + * memalloc_unuse_memcg - Ends the remote memcg charging scope. + * + * This function marks the end of the remote memcg charging scope started by + * memalloc_use_memcg(). + */ +static inline void memalloc_unuse_memcg(void) +{ + current->active_memcg = NULL; +} +#else +static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void memalloc_unuse_memcg(void) +{ +} +#endif + +#ifdef CONFIG_MEMBARRIER +enum { + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), + MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), + MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), +}; + +enum { + MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), +}; + +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS +#include <asm/membarrier.h> +#endif + +static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) +{ + if (current->mm != mm) + return; + if (likely(!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) + return; + sync_core_before_usermode(); +} + +static inline void membarrier_execve(struct task_struct *t) +{ + atomic_set(&t->mm->membarrier_state, 0); +} +#else +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS +static inline void membarrier_arch_switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ +} +#endif +static inline void membarrier_execve(struct task_struct *t) +{ +} +static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) +{ +} +#endif + +#endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h new file mode 100644 index 000000000..b36f4cf38 --- /dev/null +++ b/include/linux/sched/nohz.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_NOHZ_H +#define _LINUX_SCHED_NOHZ_H + +/* + * This is the interface between the scheduler and nohz/dynticks: + */ + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void cpu_load_update_nohz_start(void); +extern void cpu_load_update_nohz_stop(void); +#else +static inline void cpu_load_update_nohz_start(void) { } +static inline void cpu_load_update_nohz_stop(void) { } +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void nohz_balance_enter_idle(int cpu); +extern int get_nohz_timer_target(void); +#else +static inline void nohz_balance_enter_idle(int cpu) { } +#endif + +#ifdef CONFIG_NO_HZ_COMMON +void calc_load_nohz_start(void); +void calc_load_nohz_stop(void); +#else +static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_stop(void) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +extern void wake_up_nohz_cpu(int cpu); +#else +static inline void wake_up_nohz_cpu(int cpu) { } +#endif + +#endif /* _LINUX_SCHED_NOHZ_H */ diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h new file mode 100644 index 000000000..3988762ef --- /dev/null +++ b/include/linux/sched/numa_balancing.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_NUMA_BALANCING_H +#define _LINUX_SCHED_NUMA_BALANCING_H + +/* + * This is the interface between the scheduler and the MM that + * implements memory access pattern based NUMA-balancing: + */ + +#include <linux/sched.h> + +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 + +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int last_node, int node, int pages, int flags); +extern pid_t task_numa_group_id(struct task_struct *p); +extern void set_numabalancing_state(bool enabled); +extern void task_numa_free(struct task_struct *p, bool final); +extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, + int src_nid, int dst_cpu); +#else +static inline void task_numa_fault(int last_node, int node, int pages, + int flags) +{ +} +static inline pid_t task_numa_group_id(struct task_struct *p) +{ + return 0; +} +static inline void set_numabalancing_state(bool enabled) +{ +} +static inline void task_numa_free(struct task_struct *p, bool final) +{ +} +static inline bool should_numa_migrate_memory(struct task_struct *p, + struct page *page, int src_nid, int dst_cpu) +{ + return true; +} +#endif + +#endif /* _LINUX_SCHED_NUMA_BALANCING_H */ diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h new file mode 100644 index 000000000..7d64feafc --- /dev/null +++ b/include/linux/sched/prio.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_PRIO_H +#define _LINUX_SCHED_PRIO_H + +#define MAX_NICE 19 +#define MIN_NICE -20 +#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1) + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) +#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO) +#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Convert nice value [19,-20] to rlimit style value [1,40]. + */ +static inline long nice_to_rlimit(long nice) +{ + return (MAX_NICE - nice + 1); +} + +/* + * Convert rlimit style value [1,40] to nice value [-20, 19]. + */ +static inline long rlimit_to_nice(long prio) +{ + return (MAX_NICE - prio + 1); +} + +#endif /* _LINUX_SCHED_PRIO_H */ diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h new file mode 100644 index 000000000..e5af028c0 --- /dev/null +++ b/include/linux/sched/rt.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_RT_H +#define _LINUX_SCHED_RT_H + +#include <linux/sched.h> + +struct task_struct; + +static inline int rt_prio(int prio) +{ + if (unlikely(prio < MAX_RT_PRIO)) + return 1; + return 0; +} + +static inline int rt_task(struct task_struct *p) +{ + return rt_prio(p->prio); +} + +static inline bool task_is_realtime(struct task_struct *tsk) +{ + int policy = tsk->policy; + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; + if (policy == SCHED_DEADLINE) + return true; + return false; +} + +#ifdef CONFIG_RT_MUTEXES +/* + * Must hold either p->pi_lock or task_rq(p)->lock. + */ +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) +{ + return p->pi_top_task; +} +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); +extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} +#else +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + return NULL; +} +# define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} +#endif + +extern void normalize_rt_tasks(void); + + +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define RR_TIMESLICE (100 * HZ / 1000) + +#endif /* _LINUX_SCHED_RT_H */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h new file mode 100644 index 000000000..660d78c9a --- /dev/null +++ b/include/linux/sched/signal.h @@ -0,0 +1,702 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SIGNAL_H +#define _LINUX_SCHED_SIGNAL_H + +#include <linux/rculist.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/sched/jobctl.h> +#include <linux/sched/task.h> +#include <linux/cred.h> + +/* + * Types defining task->signal and task->sighand and APIs using them: + */ + +struct sighand_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; + wait_queue_head_t signalfd_wqh; +}; + +/* + * Per-process accounting stats: + */ +struct pacct_struct { + int ac_flag; + long ac_exitcode; + unsigned long ac_mem; + u64 ac_utime, ac_stime; + unsigned long ac_minflt, ac_majflt; +}; + +struct cpu_itimer { + u64 expires; + u64 incr; +}; + +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } +/** + * struct thread_group_cputimer - thread group interval timer counts + * @cputime_atomic: atomic thread group interval timers. + * @running: true when there are timers running and + * @cputime_atomic receives updates. + * @checking_timer: true when a thread in the group is in the + * process of checking for thread group timers. + * + * This structure contains the version of task_cputime, above, that is + * used for thread group CPU timer calculations. + */ +struct thread_group_cputimer { + struct task_cputime_atomic cputime_atomic; + bool running; + bool checking_timer; +}; + +struct multiprocess_signals { + sigset_t signal; + struct hlist_node node; +}; + +/* + * NOTE! "signal_struct" does not have its own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct. + */ +struct signal_struct { + atomic_t sigcnt; + atomic_t live; + int nr_threads; + struct list_head thread_head; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + + /* current thread group signal load-balancing target: */ + struct task_struct *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* For collecting multiprocess signals during fork */ + struct hlist_head multiprocess; + + /* thread group exit support */ + int group_exit_code; + /* overloaded: + * - notify group_exit_task when ->count is equal to notify_count + * - everyone except group_exit_task is stopped during signal delivery + * of fatal signals, group_exit_task processes the signal. + */ + int notify_count; + struct task_struct *group_exit_task; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; + unsigned int flags; /* see SIGNAL_* flags below */ + + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. + */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + +#ifdef CONFIG_POSIX_TIMERS + + /* POSIX.1b Interval Timers */ + int posix_timer_id; + struct list_head posix_timers; + + /* ITIMER_REAL timer for the process */ + struct hrtimer real_timer; + ktime_t it_real_incr; + + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; + + /* + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. + */ + struct thread_group_cputimer cputimer; + + /* Earliest-expiration cache. */ + struct task_cputime cputime_expires; + + struct list_head cpu_timers[3]; + +#endif + + /* PID/PID hash table linkage. */ + struct pid *pids[PIDTYPE_MAX]; + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + + struct pid *tty_old_pgrp; + + /* boolean value for session group leader */ + int leader; + + struct tty_struct *tty; /* NULL if no tty */ + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + /* + * Cumulative resource counters for dead threads in the group, + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. + */ + seqlock_t stats_lock; + u64 utime, stime, cutime, cstime; + u64 gtime; + u64 cgtime; + struct prev_cputime prev_cputime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long inblock, oublock, cinblock, coublock; + unsigned long maxrss, cmaxrss; + struct task_io_accounting ioac; + + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; + +#ifdef CONFIG_BSD_PROCESS_ACCT + struct pacct_struct pacct; /* per-process accounting information */ +#endif +#ifdef CONFIG_TASKSTATS + struct taskstats *stats; +#endif +#ifdef CONFIG_AUDIT + unsigned audit_tty; + struct tty_audit_buf *tty_audit_buf; +#endif + + /* + * Thread is the potential origin of an oom condition; kill first on + * oom + */ + bool oom_flag_origin; + short oom_score_adj; /* OOM kill score adjustment */ + short oom_score_adj_min; /* OOM kill score adjustment min value. + * Only settable by CAP_SYS_RESOURCE. */ + struct mm_struct *oom_mm; /* recorded mm when the thread group got + * killed by the oom killer */ + + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) */ +} __randomize_layout; + +/* + * Bits in flags field of signal_struct. + */ +#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ +#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ +#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ +#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ +/* + * Pending notifications to parent. + */ +#define SIGNAL_CLD_STOPPED 0x00000010 +#define SIGNAL_CLD_CONTINUED 0x00000020 +#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) + +#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ + +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + +/* If true, all threads except ->group_exit_task have pending SIGKILL */ +static inline int signal_group_exit(const struct signal_struct *sig) +{ + return (sig->flags & SIGNAL_GROUP_EXIT) || + (sig->group_exit_task != NULL); +} + +extern void flush_signals(struct task_struct *); +extern void ignore_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *, int force_default); +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); + +static inline int kernel_dequeue_signal(siginfo_t *info) +{ + struct task_struct *tsk = current; + siginfo_t __info; + int ret; + + spin_lock_irq(&tsk->sighand->siglock); + ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); + spin_unlock_irq(&tsk->sighand->siglock); + + return ret; +} + +static inline void kernel_signal_stop(void) +{ + spin_lock_irq(¤t->sighand->siglock); + if (current->jobctl & JOBCTL_STOP_DEQUEUED) + set_special_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + + schedule(); +} +#ifdef __ARCH_SI_TRAPNO +# define ___ARCH_SI_TRAPNO(_a1) , _a1 +#else +# define ___ARCH_SI_TRAPNO(_a1) +#endif +#ifdef __ia64__ +# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 +#else +# define ___ARCH_SI_IA64(_a1, _a2, _a3) +#endif + +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t); +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t); + +int force_sig_mceerr(int code, void __user *, short, struct task_struct *); +int send_sig_mceerr(int code, void __user *, short, struct task_struct *); + +int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); +int force_sig_pkuerr(void __user *addr, u32 pkey); + +int force_sig_ptrace_errno_trap(int errno, void __user *addr); + +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern void force_sigsegv(int sig, struct task_struct *p); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); +extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); +extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, + const struct cred *); +extern int kill_pgrp(struct pid *pid, int sig, int priv); +extern int kill_pid(struct pid *pid, int sig, int priv); +extern __must_check bool do_notify_parent(struct task_struct *, int); +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); +extern void force_sig(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern int zap_other_threads(struct task_struct *p); +extern struct sigqueue *sigqueue_alloc(void); +extern void sigqueue_free(struct sigqueue *); +extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); + +static inline int restart_syscall(void) +{ + set_tsk_thread_flag(current, TIF_SIGPENDING); + return -ERESTARTNOINTR; +} + +static inline int signal_pending(struct task_struct *p) +{ + return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); +} + +static inline int __fatal_signal_pending(struct task_struct *p) +{ + return unlikely(sigismember(&p->pending.signal, SIGKILL)); +} + +static inline int fatal_signal_pending(struct task_struct *p) +{ + return signal_pending(p) && __fatal_signal_pending(p); +} + +static inline int signal_pending_state(long state, struct task_struct *p) +{ + if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) + return 0; + if (!signal_pending(p)) + return 0; + + return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); +} + +/* + * Reevaluate whether the task has signals pending delivery. + * Wake the task if so. + * This is required every time the blocked sigset_t changes. + * callers must hold sighand->siglock. + */ +extern void recalc_sigpending_and_wake(struct task_struct *t); +extern void recalc_sigpending(void); +extern void calculate_sigpending(void); + +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? __TASK_TRACED : 0); +} + +void task_join_group_stop(struct task_struct *task); + +#ifdef TIF_RESTORE_SIGMASK +/* + * Legacy restore_sigmask accessors. These are inefficient on + * SMP architectures because they require atomic operations. + */ + +/** + * set_restore_sigmask() - make sure saved_sigmask processing gets done + * + * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code + * will run before returning to user mode, to process the flag. For + * all callers, TIF_SIGPENDING is already set or it's no harm to set + * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the + * arch code will notice on return to user mode, in case those bits + * are scarce. We set TIF_SIGPENDING here to ensure that the arch + * signal code always gets run when TIF_RESTORE_SIGMASK is set. + */ +static inline void set_restore_sigmask(void) +{ + set_thread_flag(TIF_RESTORE_SIGMASK); + WARN_ON(!test_thread_flag(TIF_SIGPENDING)); +} + +static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); +} + +static inline void clear_restore_sigmask(void) +{ + clear_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +{ + return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK); +} +static inline bool test_restore_sigmask(void) +{ + return test_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_and_clear_restore_sigmask(void) +{ + return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); +} + +#else /* TIF_RESTORE_SIGMASK */ + +/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ +static inline void set_restore_sigmask(void) +{ + current->restore_sigmask = true; + WARN_ON(!test_thread_flag(TIF_SIGPENDING)); +} +static inline void clear_tsk_restore_sigmask(struct task_struct *tsk) +{ + tsk->restore_sigmask = false; +} +static inline void clear_restore_sigmask(void) +{ + current->restore_sigmask = false; +} +static inline bool test_restore_sigmask(void) +{ + return current->restore_sigmask; +} +static inline bool test_tsk_restore_sigmask(struct task_struct *tsk) +{ + return tsk->restore_sigmask; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + if (!current->restore_sigmask) + return false; + current->restore_sigmask = false; + return true; +} +#endif + +static inline void restore_saved_sigmask(void) +{ + if (test_and_clear_restore_sigmask()) + __set_current_blocked(¤t->saved_sigmask); +} + +static inline sigset_t *sigmask_to_save(void) +{ + sigset_t *res = ¤t->blocked; + if (unlikely(test_restore_sigmask())) + res = ¤t->saved_sigmask; + return res; +} + +static inline int kill_cad_pid(int sig, int priv) +{ + return kill_pid(cad_pid, sig, priv); +} + +/* These can be the second arg to send_sig_info/send_group_sig_info. */ +#define SEND_SIG_NOINFO ((struct siginfo *) 0) +#define SEND_SIG_PRIV ((struct siginfo *) 1) +#define SEND_SIG_FORCED ((struct siginfo *) 2) + +/* + * True if we are on the alternate signal stack. + */ +static inline int on_sig_stack(unsigned long sp) +{ + /* + * If the signal stack is SS_AUTODISARM then, by construction, we + * can't be on the signal stack unless user code deliberately set + * SS_AUTODISARM when we were already on it. + * + * This improves reliability: if user state gets corrupted such that + * the stack pointer points very close to the end of the signal stack, + * then this check will enable the signal to be handled anyway. + */ + if (current->sas_ss_flags & SS_AUTODISARM) + return 0; + +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + +static inline int sas_ss_flags(unsigned long sp) +{ + if (!current->sas_ss_size) + return SS_DISABLE; + + return on_sig_stack(sp) ? SS_ONSTACK : 0; +} + +static inline void sas_ss_reset(struct task_struct *p) +{ + p->sas_ss_sp = 0; + p->sas_ss_size = 0; + p->sas_ss_flags = SS_DISABLE; +} + +static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) +{ + if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) +#ifdef CONFIG_STACK_GROWSUP + return current->sas_ss_sp; +#else + return current->sas_ss_sp + current->sas_ss_size; +#endif + return sp; +} + +extern void __cleanup_sighand(struct sighand_struct *); +extern void flush_itimer_signals(void); + +#define tasklist_empty() \ + list_empty(&init_task.tasks) + +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +extern bool current_is_single_threaded(void); + +/* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + +typedef int (*proc_visitor)(struct task_struct *p, void *data); +void walk_process_tree(struct task_struct *top, proc_visitor, void *); + +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else + pid = task->signal->pids[type]; + return pid; +} + +static inline struct pid *task_tgid(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_TGID]; +} + +/* + * Without tasklist or RCU lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ +static inline struct pid *task_pgrp(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_PGID]; +} + +static inline struct pid *task_session(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_SID]; +} + +static inline int get_nr_threads(struct task_struct *tsk) +{ + return tsk->signal->nr_threads; +} + +static inline bool thread_group_leader(struct task_struct *p) +{ + return p->exit_signal >= 0; +} + +/* Do to the insanities of de_thread it is possible for a process + * to have the pid of the thread group leader without actually being + * the thread group leader. For iteration through the pids in proc + * all we care about is that we have a task with the appropriate + * pid, we don't actually care if we have the right task. + */ +static inline bool has_group_leader_pid(struct task_struct *p) +{ + return task_pid(p) == task_tgid(p); +} + +static inline +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) +{ + return p1->signal == p2->signal; +} + +static inline struct task_struct *next_thread(const struct task_struct *p) +{ + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); +} + +static inline int thread_group_empty(struct task_struct *p) +{ + return list_empty(&p->thread_group); +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, + unsigned long *flags); + +static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) +{ + struct sighand_struct *ret; + + ret = __lock_task_sighand(tsk, flags); + (void)__cond_lock(&tsk->sighand->siglock, ret); + return ret; +} + +static inline void unlock_task_sighand(struct task_struct *tsk, + unsigned long *flags) +{ + spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); +} + +static inline unsigned long task_rlimit(const struct task_struct *tsk, + unsigned int limit) +{ + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); +} + +static inline unsigned long task_rlimit_max(const struct task_struct *tsk, + unsigned int limit) +{ + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); +} + +static inline unsigned long rlimit(unsigned int limit) +{ + return task_rlimit(current, limit); +} + +static inline unsigned long rlimit_max(unsigned int limit) +{ + return task_rlimit_max(current, limit); +} + +#endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h new file mode 100644 index 000000000..59d3736c4 --- /dev/null +++ b/include/linux/sched/smt.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMT_H +#define _LINUX_SCHED_SMT_H + +#include <linux/static_key.h> + +#ifdef CONFIG_SCHED_SMT +extern struct static_key_false sched_smt_present; + +static __always_inline bool sched_smt_active(void) +{ + return static_branch_likely(&sched_smt_present); +} +#else +static inline bool sched_smt_active(void) { return false; } +#endif + +void arch_smt_update(void); + +#endif diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h new file mode 100644 index 000000000..04f1321d1 --- /dev/null +++ b/include/linux/sched/stat.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_STAT_H +#define _LINUX_SCHED_STAT_H + +#include <linux/percpu.h> + +/* + * Various counters maintained by the scheduler and fork(), + * exposed via /proc, sys.c or used by drivers via these APIs. + * + * ( Note that all these values are aquired without locking, + * so they can only be relied on in narrow circumstances. ) + */ + +extern unsigned long total_forks; +extern int nr_threads; +DECLARE_PER_CPU(unsigned long, process_counts); +extern int nr_processes(void); +extern unsigned long nr_running(void); +extern bool single_task_running(void); +extern unsigned long nr_iowait(void); +extern unsigned long nr_iowait_cpu(int cpu); +extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); + +static inline int sched_info_on(void) +{ +#ifdef CONFIG_SCHEDSTATS + return 1; +#elif defined(CONFIG_TASK_DELAY_ACCT) + extern int delayacct_on; + return delayacct_on; +#else + return 0; +#endif +} + +#ifdef CONFIG_SCHEDSTATS +void force_schedstat_enabled(void); +#endif + +#endif /* _LINUX_SCHED_STAT_H */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h new file mode 100644 index 000000000..a9c32daeb --- /dev/null +++ b/include/linux/sched/sysctl.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SYSCTL_H +#define _LINUX_SCHED_SYSCTL_H + +#include <linux/types.h> + +struct ctl_table; + +#ifdef CONFIG_DETECT_HUNG_TASK +extern int sysctl_hung_task_check_count; +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_check_interval_secs; +extern int sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; +#endif + +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_child_runs_first; + +enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, + SCHED_TUNABLESCALING_LINEAR, + SCHED_TUNABLESCALING_END, +}; +extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; + +#ifdef CONFIG_SCHED_DEBUG +extern __read_mostly unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; + +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +#endif + +/* + * control realtime throttling: + * + * /proc/sys/kernel/sched_rt_period_us + * /proc/sys/kernel/sched_rt_runtime_us + */ +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; + +#ifdef CONFIG_CFS_BANDWIDTH +extern unsigned int sysctl_sched_cfs_bandwidth_slice; +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; +#endif + +extern int sysctl_sched_rr_timeslice; +extern int sched_rr_timeslice; + +extern int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +extern int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +extern int sysctl_numa_balancing(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +extern int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +#endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h new file mode 100644 index 000000000..91401309b --- /dev/null +++ b/include/linux/sched/task.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TASK_H +#define _LINUX_SCHED_TASK_H + +/* + * Interface between the scheduler and various task lifetime (fork()/exit()) + * functionality: + */ + +#include <linux/sched.h> + +struct task_struct; +struct rusage; +union thread_union; + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t mmlist_lock; + +extern union thread_union init_thread_union; +extern struct task_struct init_task; + +#ifdef CONFIG_PROVE_RCU +extern int lockdep_tasklist_lock_is_held(void); +#endif /* #ifdef CONFIG_PROVE_RCU */ + +extern asmlinkage void schedule_tail(struct task_struct *prev); +extern void init_idle(struct task_struct *idle, int cpu); + +extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern void sched_dead(struct task_struct *p); + +void __noreturn do_task_dead(void); + +extern void proc_caches_init(void); + +extern void fork_init(void); + +extern void release_task(struct task_struct * p); + +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif +extern void flush_thread(void); + +#ifdef CONFIG_HAVE_EXIT_THREAD +extern void exit_thread(struct task_struct *tsk); +#else +static inline void exit_thread(struct task_struct *tsk) +{ +} +#endif +extern void do_group_exit(int); + +extern void exit_files(struct task_struct *); +extern void exit_itimers(struct signal_struct *); + +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); +extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); +struct task_struct *fork_idle(int); +extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + +extern void free_task(struct task_struct *tsk); + +/* sched_exec is called by processes performing an exec */ +#ifdef CONFIG_SMP +extern void sched_exec(void); +#else +#define sched_exec() {} +#endif + +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) + +extern void __put_task_struct(struct task_struct *t); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + __put_task_struct(t); +} + +struct task_struct *task_rcu_dereference(struct task_struct **ptask); + +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +extern int arch_task_struct_size __read_mostly; +#else +# define arch_task_struct_size (sizeof(struct task_struct)) +#endif + +#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST +/* + * If an architecture has not declared a thread_struct whitelist we + * must assume something there may need to be copied to userspace. + */ +static inline void arch_thread_struct_whitelist(unsigned long *offset, + unsigned long *size) +{ + *offset = 0; + /* Handle dynamically sized thread_struct. */ + *size = arch_task_struct_size - offsetof(struct task_struct, thread); +} +#endif + +#ifdef CONFIG_VMAP_STACK +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return t->stack_vm_area; +} +#else +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return NULL; +} +#endif + +/* + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), + * neither inside nor outside. + */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +#endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h new file mode 100644 index 000000000..4f099d3fe --- /dev/null +++ b/include/linux/sched/task_stack.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TASK_STACK_H +#define _LINUX_SCHED_TASK_STACK_H + +/* + * task->stack (kernel stack) handling interfaces: + */ + +#include <linux/sched.h> +#include <linux/magic.h> + +#ifdef CONFIG_THREAD_INFO_IN_TASK + +/* + * When accessing the stack of a non-current task that might exit, use + * try_get_task_stack() instead. task_stack_page will return a pointer + * that could get freed out from under you. + */ +static inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} + +#define setup_thread_stack(new,old) do { } while(0) + +static inline unsigned long *end_of_stack(const struct task_struct *task) +{ +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; +#else + return task->stack; +#endif +} + +#elif !defined(__HAVE_THREAD_FUNCTIONS) + +#define task_stack_page(task) ((void *)(task)->stack) + +static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) +{ + *task_thread_info(p) = *task_thread_info(org); + task_thread_info(p)->task = p; +} + +/* + * Return the address of the last usable long on the stack. + * + * When the stack grows down, this is just above the thread + * info struct. Going any lower will corrupt the threadinfo. + * + * When the stack grows up, this is the highest address. + * Beyond that position, we corrupt data on the next page. + */ +static inline unsigned long *end_of_stack(struct task_struct *p) +{ +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; +#else + return (unsigned long *)(task_thread_info(p) + 1); +#endif +} + +#endif + +#ifdef CONFIG_THREAD_INFO_IN_TASK +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return atomic_inc_not_zero(&tsk->stack_refcount) ? + task_stack_page(tsk) : NULL; +} + +extern void put_task_stack(struct task_struct *tsk); +#else +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return task_stack_page(tsk); +} + +static inline void put_task_stack(struct task_struct *tsk) {} +#endif + +#define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) + +static inline int object_is_on_stack(const void *obj) +{ + void *stack = task_stack_page(current); + + return (obj >= stack) && (obj < (stack + THREAD_SIZE)); +} + +extern void thread_stack_cache_init(void); + +#ifdef CONFIG_DEBUG_STACK_USAGE +static inline unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ +# ifdef CONFIG_STACK_GROWSUP + n--; +# else + n++; +# endif + } while (!*n); + +# ifdef CONFIG_STACK_GROWSUP + return (unsigned long)end_of_stack(p) - (unsigned long)n; +# else + return (unsigned long)n - (unsigned long)end_of_stack(p); +# endif +} +#endif +extern void set_task_stack_end_magic(struct task_struct *tsk); + +#ifndef __HAVE_ARCH_KSTACK_END +static inline int kstack_end(void *addr) +{ + /* Reliable end of stack detection: + * Some APM bios versions misalign the stack + */ + return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); +} +#endif + +#endif /* _LINUX_SCHED_TASK_STACK_H */ diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h new file mode 100644 index 000000000..15f3f61f7 --- /dev/null +++ b/include/linux/sched/topology.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TOPOLOGY_H +#define _LINUX_SCHED_TOPOLOGY_H + +#include <linux/topology.h> + +#include <linux/sched/idle.h> + +/* + * Increase resolution of cpu_capacity calculations + */ +#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT +#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) + +/* + * sched-domains (multiprocessor balancing) declarations: + */ +#ifdef CONFIG_SMP + +#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ +#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ +#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ +#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ +#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ +#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ +#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x4000 /* cross-node balancing */ + +#ifdef CONFIG_SCHED_SMT +static inline int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_SCHED_MC +static inline int cpu_core_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_NUMA +static inline int cpu_numa_flags(void) +{ + return SD_NUMA; +} +#endif + +extern int arch_asym_cpu_priority(int cpu); + +struct sched_domain_attr { + int relax_domain_level; +}; + +#define SD_ATTR_INIT (struct sched_domain_attr) { \ + .relax_domain_level = -1, \ +} + +extern int sched_domain_level_max; + +struct sched_group; + +struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; + int has_idle_cores; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_domain *child; /* bottom domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int busy_idx; + unsigned int idle_idx; + unsigned int newidle_idx; + unsigned int wake_idx; + unsigned int forkexec_idx; + unsigned int smt_gain; + + int nohz_idle; /* NOHZ IDLE status */ + int flags; /* See SD_* */ + int level; + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ + + /* idle_balance() stats */ + u64 max_newidle_lb_cost; + unsigned long next_decay_max_lb_cost; + + u64 avg_scan_cost; /* select_idle_sibling */ + +#ifdef CONFIG_SCHEDSTATS + /* load_balance() stats */ + unsigned int lb_count[CPU_MAX_IDLE_TYPES]; + unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; + unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; + + /* Active load balancing */ + unsigned int alb_count; + unsigned int alb_failed; + unsigned int alb_pushed; + + /* SD_BALANCE_EXEC stats */ + unsigned int sbe_count; + unsigned int sbe_balanced; + unsigned int sbe_pushed; + + /* SD_BALANCE_FORK stats */ + unsigned int sbf_count; + unsigned int sbf_balanced; + unsigned int sbf_pushed; + + /* try_to_wake_up() stats */ + unsigned int ttwu_wake_remote; + unsigned int ttwu_move_affine; + unsigned int ttwu_move_balance; +#endif +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif + union { + void *private; /* used during construction */ + struct rcu_head rcu; /* used during destruction */ + }; + struct sched_domain_shared *shared; + + unsigned int span_weight; + /* + * Span of all CPUs in this domain. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long span[0]; +}; + +static inline struct cpumask *sched_domain_span(struct sched_domain *sd) +{ + return to_cpumask(sd->span); +} + +extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new); + +/* Allocate an array of sched domains, for partition_sched_domains(). */ +cpumask_var_t *alloc_sched_domains(unsigned int ndoms); +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); + +bool cpus_share_cache(int this_cpu, int that_cpu); + +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef int (*sched_domain_flags_f)(void); + +#define SDTL_OVERLAP 0x01 + +struct sd_data { + struct sched_domain *__percpu *sd; + struct sched_domain_shared *__percpu *sds; + struct sched_group *__percpu *sg; + struct sched_group_capacity *__percpu *sgc; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int flags; + int numa_level; + struct sd_data data; +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif +}; + +extern void set_sched_topology(struct sched_domain_topology_level *tl); + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(type) .name = #type +#else +# define SD_INIT_NAME(type) +#endif + +#else /* CONFIG_SMP */ + +struct sched_domain_attr; + +static inline void +partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ +} + +static inline bool cpus_share_cache(int this_cpu, int that_cpu) +{ + return true; +} + +#endif /* !CONFIG_SMP */ + +static inline int task_node(const struct task_struct *p) +{ + return cpu_to_node(task_cpu(p)); +} + +#endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h new file mode 100644 index 000000000..39ad98c09 --- /dev/null +++ b/include/linux/sched/user.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_USER_H +#define _LINUX_SCHED_USER_H + +#include <linux/uidgid.h> +#include <linux/atomic.h> +#include <linux/refcount.h> +#include <linux/ratelimit.h> + +struct key; + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + refcount_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? */ + atomic_t sigpending; /* How many pending signals does this user have? */ +#ifdef CONFIG_FANOTIFY + atomic_t fanotify_listeners; +#endif +#ifdef CONFIG_EPOLL + atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ +#endif +#ifdef CONFIG_POSIX_MQUEUE + /* protected by mq_lock */ + unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ +#endif + unsigned long locked_shm; /* How many pages of mlocked shm ? */ + unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ + +#ifdef CONFIG_KEYS + struct key *uid_keyring; /* UID specific keyring */ + struct key *session_keyring; /* UID's default session keyring */ +#endif + + /* Hash table maintenance information */ + struct hlist_node uidhash_node; + kuid_t uid; + +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ + defined(CONFIG_NET) + atomic_long_t locked_vm; +#endif + + /* Miscellaneous per-user rate limit */ + struct ratelimit_state ratelimit; +}; + +extern int uids_sysfs_init(void); + +extern struct user_struct *find_user(kuid_t); + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + + +/* per-UID process charging. */ +extern struct user_struct * alloc_uid(kuid_t); +static inline struct user_struct *get_uid(struct user_struct *u) +{ + refcount_inc(&u->__count); + return u; +} +extern void free_uid(struct user_struct *); + +#endif /* _LINUX_SCHED_USER_H */ diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h new file mode 100644 index 000000000..10b19a192 --- /dev/null +++ b/include/linux/sched/wake_q.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_WAKE_Q_H +#define _LINUX_SCHED_WAKE_Q_H + +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The DEFINE_WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function. Otherwise, the list can be + * re-initialized for later re-use by wake_q_init(). + * + * Note that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. + */ + +#include <linux/sched.h> + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define DEFINE_WAKE_Q(name) \ + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + +static inline void wake_q_init(struct wake_q_head *head) +{ + head->first = WAKE_Q_TAIL; + head->lastp = &head->first; +} + +extern void wake_q_add(struct wake_q_head *head, + struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); + +#endif /* _LINUX_SCHED_WAKE_Q_H */ diff --git a/include/linux/sched/xacct.h b/include/linux/sched/xacct.h new file mode 100644 index 000000000..c078f0a94 --- /dev/null +++ b/include/linux/sched/xacct.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_XACCT_H +#define _LINUX_SCHED_XACCT_H + +/* + * Extended task accounting methods: + */ + +#include <linux/sched.h> + +#ifdef CONFIG_TASK_XACCT +static inline void add_rchar(struct task_struct *tsk, ssize_t amt) +{ + tsk->ioac.rchar += amt; +} + +static inline void add_wchar(struct task_struct *tsk, ssize_t amt) +{ + tsk->ioac.wchar += amt; +} + +static inline void inc_syscr(struct task_struct *tsk) +{ + tsk->ioac.syscr++; +} + +static inline void inc_syscw(struct task_struct *tsk) +{ + tsk->ioac.syscw++; +} +#else +static inline void add_rchar(struct task_struct *tsk, ssize_t amt) +{ +} + +static inline void add_wchar(struct task_struct *tsk, ssize_t amt) +{ +} + +static inline void inc_syscr(struct task_struct *tsk) +{ +} + +static inline void inc_syscw(struct task_struct *tsk) +{ +} +#endif + +#endif /* _LINUX_SCHED_XACCT_H */ diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h new file mode 100644 index 000000000..abe28d5cb --- /dev/null +++ b/include/linux/sched_clock.h @@ -0,0 +1,25 @@ +/* + * sched_clock.h: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef LINUX_SCHED_CLOCK +#define LINUX_SCHED_CLOCK + +#ifdef CONFIG_GENERIC_SCHED_CLOCK +extern void generic_sched_clock_init(void); + +extern void sched_clock_register(u64 (*read)(void), int bits, + unsigned long rate); +#else +static inline void generic_sched_clock_init(void) { } + +static inline void sched_clock_register(u64 (*read)(void), int bits, + unsigned long rate) +{ +} +#endif + +#endif |