From 3afb00d3f86d3d924f88b56fa8285d4e9db85852 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 7 Aug 2024 15:17:52 +0200
Subject: Merging upstream version 6.10.3.

Signed-off-by: Daniel Baumann
---
 kernel/cgroup/cgroup-v1.c      |   1 +
 kernel/cgroup/cgroup.c         |   3 +-
 kernel/cgroup/cpuset.c         |  91 +++++++++++++++++++++----------
 kernel/cgroup/legacy_freezer.c |   5 +-
 kernel/cgroup/pids.c           |   2 -
 kernel/cgroup/rstat.c          | 118 ++++++++++++++++++++++++++++++++++------
 6 files changed, 165 insertions(+), 55 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 520a11cb12..b9dbf6bf27 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str)
 				continue;
 
 			cgroup_no_v1_mask |= 1 << i;
+			break;
 		}
 	}
 	return 1;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a66c088c85..e32b6972c4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work)
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
-		cgroup1_pidlist_destroy_all(cgrp);
+		if (!cgroup_on_dfl(cgrp))
+			cgroup1_pidlist_destroy_all(cgrp);
 		cancel_work_sync(&cgrp->release_agent_work);
 
 		bpf_cgrp_storage_free(cgrp);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 73ef0dabc3..5e468db958 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
  * License. See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
+#include "cgroup-internal.h"
 
 #include 
 #include 
@@ -169,7 +170,7 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of valid sub-partitions */
+	/* number of valid local child partitions */
 	int nr_subparts;
 
 	/* partition root state */
@@ -368,9 +369,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
 }
 
 static struct cpuset top_cpuset = {
-	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
-		  (1 << CS_MEM_EXCLUSIVE)),
+	.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
 	.partition_root_state = PRS_ROOT,
+	.relax_domain_level = -1,
 	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
 };
 
@@ -956,13 +958,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
+	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	doms = NULL;
 	dattr = NULL;
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts) {
+	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
 		if (!doms)
@@ -990,16 +994,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 		if (cp == &top_cpuset)
 			continue;
+
+		if (cgrpv2)
+			goto v2;
+
 		/*
+		 * v1:
 		 * Continue traversing beyond @cp iff @cp has some CPUs and
 		 * isn't load balancing. The former is obvious. The
 		 * latter: All child cpusets contain a subset of the
 		 * parent's cpus, so just skip them, and then we call
 		 * update_domain_attr_tree() to calc relax_domain_level of
 		 * the corresponding sched domain.
-		 *
-		 * If root is load-balancing, we can skip @cp if it
-		 * is a subset of the root's effective_cpus.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
@@ -1007,20 +1013,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
-		if (root_load_balance &&
-		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
-			continue;
-
 		if (is_sched_load_balance(cp) &&
 		    !cpumask_empty(cp->effective_cpus))
 			csa[csn++] = cp;
 
-		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_valid(cp))
+		/* skip @cp's subtree */
+		pos_css = css_rightmost_descendant(pos_css);
+		continue;
+
+v2:
+		/*
+		 * Only valid partition roots that are not isolated and with
+		 * non-empty effective_cpus will be saved into csn[].
+		 */
+		if ((cp->partition_root_state == PRS_ROOT) &&
+		    !cpumask_empty(cp->effective_cpus))
+			csa[csn++] = cp;
+
+		/*
+		 * Skip @cp's subtree if not a partition root and has no
+		 * exclusive CPUs to be granted to child cpusets.
+		 */
+		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
 			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
 
+	/*
+	 * If there are only isolated partitions underneath the cgroup root,
+	 * we can optimize out unneeded sched domains scanning.
+	 */
+	if (root_load_balance && (csn == 1))
+		goto single_root_domain;
+
 	for (i = 0; i < csn; i++)
 		csa[i]->pn = i;
 	ndoms = csn;
@@ -1063,6 +1088,20 @@ restart:
 		dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 				      GFP_KERNEL);
 
+	/*
+	 * Cgroup v2 doesn't support domain attributes, just set all of them
+	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 */
+	if (cgrpv2) {
+		for (i = 0; i < ndoms; i++) {
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			if (dattr)
+				dattr[i] = SD_ATTR_INIT;
+		}
+		goto done;
+	}
+
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
 		struct cpumask *dp;
@@ -1222,7 +1261,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs. Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts) {
+	if (!cpumask_empty(subpartitions_cpus)) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -3773,9 +3812,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 
 	buf = strstrip(buf);
 
-	/*
-	 * Convert "root" to ENABLED, and convert "member" to DISABLED.
-	 */
 	if (!strcmp(buf, "root"))
 		val = PRS_ROOT;
 	else if (!strcmp(buf, "member"))
@@ -4051,11 +4087,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->effective_mems = parent->effective_mems;
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
-		/*
-		 * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
-		 */
-		if (!is_sched_load_balance(parent))
-			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	}
 
 	/*
@@ -4309,8 +4340,6 @@ int __init cpuset_init(void)
 	nodes_setall(top_cpuset.effective_mems);
 
 	fmeter_init(&top_cpuset.fmeter);
-	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
-	top_cpuset.relax_domain_level = -1;
 	INIT_LIST_HEAD(&remote_children);
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4580,7 +4609,7 @@ static void cpuset_handle_hotplug(void)
 	 * In the rare case that hotplug removes all the cpus in
 	 * subpartitions_cpus, we assumed that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts)
+	if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
 		cpus_updated = true;
 
 	/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5060,10 +5089,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 	if (!buf)
 		goto out;
 
-	css = task_get_css(tsk, cpuset_cgrp_id);
-	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-				current->nsproxy->cgroup_ns);
-	css_put(css);
+	rcu_read_lock();
+	spin_lock_irq(&css_set_lock);
+	css = task_css(tsk, cpuset_cgrp_id);
+	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+				       current->nsproxy->cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
+	rcu_read_unlock();
+
 	if (retval == -E2BIG)
 		retval = -ENAMETOOLONG;
 	if (retval < 0)
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 66d1708042..074653f964 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
  * @css: css being created
  *
  * We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+ * parent's freezing state while holding cpus read lock and freezer_mutex.
  */
 static int freezer_css_online(struct cgroup_subsys_state *css)
 {
@@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
  * freezer_css_offline - initiate destruction of a freezer css
  * @css: css being destroyed
  *
- * @css is going away. Mark it dead and decrement system_freezing_count if
+ * @css is going away. Mark it dead and decrement freezer_active if
 * it was holding one.
 */
 static void freezer_css_offline(struct cgroup_subsys_state *css)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 7695e60bcb..0e5ec7d59b 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
 	if (!pids)
 		return ERR_PTR(-ENOMEM);
 
-	atomic64_set(&pids->counter, 0);
 	atomic64_set(&pids->limit, PIDS_MAX);
-	atomic64_set(&pids->events_limit, 0);
 	return &pids->css;
 }
 
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb4..fb8b494375 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
 #include 
 #include 
 
+#include 
+
 static DEFINE_SPINLOCK(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
 
@@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
 	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
 }
 
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determine the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+				     struct cgroup *cgrp, const bool fast_path)
+{
+	unsigned long flags;
+	bool contended;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+	if (contended) {
+		if (fast_path)
+			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+		else
+			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+		raw_spin_lock_irqsave(cpu_lock, flags);
+	}
+
+	if (fast_path)
+		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+	else
+		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+	return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+			      struct cgroup *cgrp, unsigned long flags,
+			      const bool fast_path)
+{
+	if (fast_path)
+		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+	else
+		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
 /**
  * cgroup_rstat_updated - keep track of updated rstat_cpu
  * @cgrp: target cgroup
@@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
 		return;
 
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
 	while (true) {
@@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 		cgrp = parent;
 	}
 
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
 }
 
 /**
@@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	struct cgroup *head = NULL, *parent, *child;
 	unsigned long flags;
 
-	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
-	 */
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
@@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	if (child != root)
 		head = cgroup_rstat_push_children(head, child, cpu);
 unlock_ret:
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
 	return head;
 }
 
@@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 
 __bpf_hook_end();
 
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+	__acquires(&cgroup_rstat_lock)
+{
+	bool contended;
+
+	contended = !spin_trylock_irq(&cgroup_rstat_lock);
+	if (contended) {
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+		spin_lock_irq(&cgroup_rstat_lock);
+	}
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+	__releases(&cgroup_rstat_lock)
+{
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+	spin_unlock_irq(&cgroup_rstat_lock);
+}
+
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 
 		/* play nice and yield if necessary */
 		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-			spin_unlock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_unlock(cgrp, cpu);
 			if (!cond_resched())
 				cpu_relax();
-			spin_lock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_lock(cgrp, cpu);
 		}
 	}
 }
@@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 {
 	might_sleep();
 
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 /**
@@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 	__acquires(&cgroup_rstat_lock)
 {
 	might_sleep();
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
 }
 
 /**
  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ * @cgrp: cgroup used by tracepoint
  */
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock)
 {
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 		forceidle_time = cgrp->bstat.forceidle_sum;
 #endif
-		cgroup_rstat_flush_release();
+		cgroup_rstat_flush_release(cgrp);
 	} else {
 		root_cgroup_cputime(&bstat);
 		usage = bstat.cputime.sum_exec_runtime;
--
cgit v1.2.3