diff options
Diffstat (limited to 'include/linux/psi_types.h')
-rw-r--r-- | include/linux/psi_types.h | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h new file mode 100644 index 000000000..f1fd3a804 --- /dev/null +++ b/include/linux/psi_types.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PSI_TYPES_H +#define _LINUX_PSI_TYPES_H + +#include <linux/kthread.h> +#include <linux/seqlock.h> +#include <linux/types.h> +#include <linux/kref.h> +#include <linux/wait.h> + +#ifdef CONFIG_PSI + +/* Tracked task states */ +enum psi_task_count { + NR_IOWAIT, + NR_MEMSTALL, + NR_RUNNING, + /* + * For IO and CPU stalls the presence of running/oncpu tasks + * in the domain means a partial rather than a full stall. + * For memory it's not so simple because of page reclaimers: + * they are running/oncpu while representing a stall. To tell + * whether a domain has productivity left or not, we need to + * distinguish between regular running (i.e. productive) + * threads and memstall ones. + */ + NR_MEMSTALL_RUNNING, + NR_PSI_TASK_COUNTS = 4, +}; + +/* Task state bitmasks */ +#define TSK_IOWAIT (1 << NR_IOWAIT) +#define TSK_MEMSTALL (1 << NR_MEMSTALL) +#define TSK_RUNNING (1 << NR_RUNNING) +#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) + +/* Only one task can be scheduled, no corresponding task count */ +#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS) + +/* Resources that workloads could be stalled on */ +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ, +#endif + NR_PSI_RESOURCES, +}; + +/* + * Pressure states for each resource: + * + * SOME: Stalled tasks & working tasks + * FULL: Stalled tasks & no working tasks + */ +enum psi_states { + PSI_IO_SOME, + PSI_IO_FULL, + PSI_MEM_SOME, + PSI_MEM_FULL, + PSI_CPU_SOME, + PSI_CPU_FULL, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + PSI_IRQ_FULL, +#endif + /* Only per-CPU, to weigh the CPU in the global average: */ + PSI_NONIDLE, + NR_PSI_STATES, +}; + +/* Use one bit in the state mask to track TSK_ONCPU */ +#define PSI_ONCPU (1 << NR_PSI_STATES) + +/* Flag whether to re-arm avgs_work, see details in get_recent_times() */ +#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1)) + +enum psi_aggregators { + PSI_AVGS = 0, + PSI_POLL, + NR_PSI_AGGREGATORS, +}; + +struct psi_group_cpu { + /* 1st cacheline updated by the scheduler */ + + /* Aggregator needs to know of concurrent changes */ + seqcount_t seq ____cacheline_aligned_in_smp; + + /* States of the tasks belonging to this group */ + unsigned int tasks[NR_PSI_TASK_COUNTS]; + + /* Aggregate pressure state derived from the tasks */ + u32 state_mask; + + /* Period time sampling buckets for each state of interest (ns) */ + u32 times[NR_PSI_STATES]; + + /* Time of last task change in this group (rq_clock) */ + u64 state_start; + + /* 2nd cacheline updated by the aggregator */ + + /* Delta detection against the sampling buckets */ + u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES] + ____cacheline_aligned_in_smp; +}; + +/* PSI growth tracking window */ +struct psi_window { + /* Window size in ns */ + u64 size; + + /* Start time of the current window in ns */ + u64 start_time; + + /* Value at the start of the window */ + u64 start_value; + + /* Value growth in the previous window */ + u64 prev_growth; +}; + +struct psi_trigger { + /* PSI state being monitored by the trigger */ + enum psi_states state; + + /* User-spacified threshold in ns */ + u64 threshold; + + /* List node inside triggers list */ + struct list_head node; + + /* Backpointer needed during trigger destruction */ + struct psi_group *group; + + /* Wait queue for polling */ + wait_queue_head_t event_wait; + + /* Kernfs file for cgroup triggers */ + struct kernfs_open_file *of; + + /* Pending event flag */ + int event; + + /* Tracking window */ + struct psi_window win; + + /* + * Time last event was generated. Used for rate-limiting + * events to one per window + */ + u64 last_event_time; + + /* Deferred event(s) from previous ratelimit window */ + bool pending_event; + + /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */ + enum psi_aggregators aggregator; +}; + +struct psi_group { + struct psi_group *parent; + bool enabled; + + /* Protects data used by the aggregator */ + struct mutex avgs_lock; + + /* Per-cpu task state & time tracking */ + struct psi_group_cpu __percpu *pcpu; + + /* Running pressure averages */ + u64 avg_total[NR_PSI_STATES - 1]; + u64 avg_last_update; + u64 avg_next_update; + + /* Aggregator work control */ + struct delayed_work avgs_work; + + /* Unprivileged triggers against N*PSI_FREQ windows */ + struct list_head avg_triggers; + u32 avg_nr_triggers[NR_PSI_STATES - 1]; + + /* Total stall times and sampled pressure averages */ + u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1]; + unsigned long avg[NR_PSI_STATES - 1][3]; + + /* Monitor RT polling work control */ + struct task_struct __rcu *rtpoll_task; + struct timer_list rtpoll_timer; + wait_queue_head_t rtpoll_wait; + atomic_t rtpoll_wakeup; + atomic_t rtpoll_scheduled; + + /* Protects data used by the monitor */ + struct mutex rtpoll_trigger_lock; + + /* Configured RT polling triggers */ + struct list_head rtpoll_triggers; + u32 rtpoll_nr_triggers[NR_PSI_STATES - 1]; + u32 rtpoll_states; + u64 rtpoll_min_period; + + /* Total stall times at the start of RT polling monitor activation */ + u64 rtpoll_total[NR_PSI_STATES - 1]; + u64 rtpoll_next_update; + u64 rtpoll_until; +}; + +#else /* CONFIG_PSI */ + +#define NR_PSI_RESOURCES 0 + +struct psi_group { }; + +#endif /* CONFIG_PSI */ + +#endif /* _LINUX_PSI_TYPES_H */ |