summaryrefslogtreecommitdiffstats
path: root/kernel/cgroup
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:17:52 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:17:52 +0000
commit3afb00d3f86d3d924f88b56fa8285d4e9db85852 (patch)
tree95a985d3019522cea546b7d8df621369bc44fc6c /kernel/cgroup
parentAdding debian version 6.9.12-1. (diff)
downloadlinux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.tar.xz
linux-3afb00d3f86d3d924f88b56fa8285d4e9db85852.zip
Merging upstream version 6.10.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/cgroup-v1.c1
-rw-r--r--kernel/cgroup/cgroup.c3
-rw-r--r--kernel/cgroup/cpuset.c91
-rw-r--r--kernel/cgroup/legacy_freezer.c5
-rw-r--r--kernel/cgroup/pids.c2
-rw-r--r--kernel/cgroup/rstat.c118
6 files changed, 165 insertions, 55 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 520a11cb12..b9dbf6bf27 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str)
continue;
cgroup_no_v1_mask |= 1 << i;
+ break;
}
}
return 1;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a66c088c85..e32b6972c4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work)
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
- cgroup1_pidlist_destroy_all(cgrp);
+ if (!cgroup_on_dfl(cgrp))
+ cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
bpf_cgrp_storage_free(cgrp);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 73ef0dabc3..5e468db958 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
+#include "cgroup-internal.h"
#include <linux/cpu.h>
#include <linux/cpumask.h>
@@ -169,7 +170,7 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of valid sub-partitions */
+ /* number of valid local child partitions */
int nr_subparts;
/* partition root state */
@@ -368,9 +369,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
- (1 << CS_MEM_EXCLUSIVE)),
+ .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
+ .relax_domain_level = -1,
.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};
@@ -956,13 +958,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts) {
+ if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -990,16 +994,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
+
+ if (cgrpv2)
+ goto v2;
+
/*
+ * v1:
* Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
* latter: All child cpusets contain a subset of the
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
- *
- * If root is load-balancing, we can skip @cp if it
- * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -1007,20 +1013,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
- if (root_load_balance &&
- cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
- continue;
-
if (is_sched_load_balance(cp) &&
!cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
- /* skip @cp's subtree if not a partition root */
- if (!is_partition_valid(cp))
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+
+v2:
+ /*
+ * Only valid partition roots that are not isolated and with
+ * non-empty effective_cpus will be saved into csn[].
+ */
+ if ((cp->partition_root_state == PRS_ROOT) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /*
+ * Skip @cp's subtree if not a partition root and has no
+ * exclusive CPUs to be granted to child cpusets.
+ */
+ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;
+
for (i = 0; i < csn; i++)
csa[i]->pn = i;
ndoms = csn;
@@ -1063,6 +1088,20 @@ restart:
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
GFP_KERNEL);
+ /*
+ * Cgroup v2 doesn't support domain attributes, just set all of them
+ * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+ * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
+ }
+ goto done;
+ }
+
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
struct cpumask *dp;
@@ -1222,7 +1261,7 @@ static void rebuild_sched_domains_locked(void)
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
- if (top_cpuset.nr_subparts) {
+ if (!cpumask_empty(subpartitions_cpus)) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_valid(cs)) {
@@ -3773,9 +3812,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
buf = strstrip(buf);
- /*
- * Convert "root" to ENABLED, and convert "member" to DISABLED.
- */
if (!strcmp(buf, "root"))
val = PRS_ROOT;
else if (!strcmp(buf, "member"))
@@ -4051,11 +4087,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
- /*
- * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
- */
- if (!is_sched_load_balance(parent))
- clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
/*
@@ -4309,8 +4340,6 @@ int __init cpuset_init(void)
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
- set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
- top_cpuset.relax_domain_level = -1;
INIT_LIST_HEAD(&remote_children);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4580,7 +4609,7 @@ static void cpuset_handle_hotplug(void)
* In the rare case that hotplug removes all the cpus in
* subpartitions_cpus, we assumed that cpus are updated.
*/
- if (!cpus_updated && top_cpuset.nr_subparts)
+ if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
cpus_updated = true;
/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5060,10 +5089,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- css = task_get_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- css_put(css);
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 66d1708042..074653f964 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
* @css: css being created
*
* We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+ * parent's freezing state while holding cpus read lock and freezer_mutex.
*/
static int freezer_css_online(struct cgroup_subsys_state *css)
{
@@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
* freezer_css_offline - initiate destruction of a freezer css
* @css: css being destroyed
*
- * @css is going away. Mark it dead and decrement system_freezing_count if
+ * @css is going away. Mark it dead and decrement freezer_active if
* it was holding one.
*/
static void freezer_css_offline(struct cgroup_subsys_state *css)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 7695e60bcb..0e5ec7d59b 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
if (!pids)
return ERR_PTR(-ENOMEM);
- atomic64_set(&pids->counter, 0);
atomic64_set(&pids->limit, PIDS_MAX);
- atomic64_set(&pids->events_limit, 0);
return &pids->css;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb4..fb8b494375 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <trace/events/cgroup.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determine the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+ struct cgroup *cgrp, const bool fast_path)
+{
+ unsigned long flags;
+ bool contended;
+
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+ if (contended) {
+ if (fast_path)
+ trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+ else
+ trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+ }
+
+ if (fast_path)
+ trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+ else
+ trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+ return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+ struct cgroup *cgrp, unsigned long flags,
+ const bool fast_path)
+{
+ if (fast_path)
+ trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+ else
+ trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
/**
* cgroup_rstat_updated - keep track of updated rstat_cpu
* @cgrp: target cgroup
@@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
- raw_spin_lock_irqsave(cpu_lock, flags);
+ flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
/* put @cgrp and all ancestors on the corresponding updated lists */
while (true) {
@@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
cgrp = parent;
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}
/**
@@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
struct cgroup *head = NULL, *parent, *child;
unsigned long flags;
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
+ flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
@@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
if (child != root)
head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
return head;
}
@@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__bpf_hook_end();
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+ __acquires(&cgroup_rstat_lock)
+{
+ bool contended;
+
+ contended = !spin_trylock_irq(&cgroup_rstat_lock);
+ if (contended) {
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+ spin_lock_irq(&cgroup_rstat_lock);
+ }
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+ __releases(&cgroup_rstat_lock)
+{
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, cpu);
if (!cond_resched())
cpu_relax();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, cpu);
}
}
}
@@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, -1);
cgroup_rstat_flush_locked(cgrp);
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, -1);
}
/**
@@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
__acquires(&cgroup_rstat_lock)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, -1);
cgroup_rstat_flush_locked(cgrp);
}
/**
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ * @cgrp: cgroup used by tracepoint
*/
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock)
{
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, -1);
}
int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
#ifdef CONFIG_SCHED_CORE
forceidle_time = cgrp->bstat.forceidle_sum;
#endif
- cgroup_rstat_flush_release();
+ cgroup_rstat_flush_release(cgrp);
} else {
root_cgroup_cputime(&bstat);
usage = bstat.cputime.sum_exec_runtime;