author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-18 18:50:03 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-18 18:50:03 +0000
commit     01a69402cf9d38ff180345d55c2ee51c7e89fbc7 (patch)
tree       b406c5242a088c4f59c6e4b719b783f43aca6ae9 /kernel/workqueue.c
parent     Adding upstream version 6.7.12. (diff)
Adding upstream version 6.8.9. (upstream/6.8.9)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--  kernel/workqueue.c  924
1 file changed, 249 insertions, 675 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8f761417a9..7b482a26d7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -108,7 +108,7 @@ enum {
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
- WQ_NAME_LEN = 32,
+ WQ_NAME_LEN = 24,
};
/*
@@ -122,9 +122,6 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
- * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
- * reads.
- *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -146,9 +143,6 @@ enum {
*
* WR: wq->mutex protected for writes. RCU protected for reads.
*
- * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
- * with READ_ONCE() without locking.
- *
* MD: wq_mayday_lock protected.
*
* WD: Used internally by the watchdog.
@@ -246,18 +240,18 @@ struct pool_workqueue {
* pwq->inactive_works instead of pool->worklist and marked with
* WORK_STRUCT_INACTIVE.
*
- * All work items marked with WORK_STRUCT_INACTIVE do not participate in
- * nr_active and all work items in pwq->inactive_works are marked with
- * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
- * in pwq->inactive_works. Some of them are ready to run in
- * pool->worklist or worker->scheduled. Those work items are only struct
- * wq_barrier which is used for flush_work() and should not participate
- * in nr_active. For non-barrier work item, it is marked with
- * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate
+ * in pwq->nr_active and all work items in pwq->inactive_works are
+ * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
+ * work items are in pwq->inactive_works. Some of them are ready to
+ * run in pool->worklist or worker->scheduled. Those work items are
+ * only struct wq_barrier which is used for flush_work() and should
+ * not participate in pwq->nr_active. For non-barrier work item, it
+ * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/
int nr_active; /* L: nr of active works */
+ int max_active; /* L: max active works */
struct list_head inactive_works; /* L: inactive works */
- struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
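
The fields above are what the patch falls back to: a work item counts toward nr_active and enters the pool's worklist only while nr_active is below max_active; otherwise it is parked on inactive_works with WORK_STRUCT_INACTIVE set. A minimal sketch of that gating, with hypothetical names (pwq_sketch, sketch_may_activate) rather than the kernel code:

#include <stdbool.h>

/* Illustrative only: the per-pwq limit the fields above implement. */
struct pwq_sketch {
        int nr_active;   /* work items currently runnable on the pool */
        int max_active;  /* per-pwq concurrency limit */
};

/* Returns true if a newly queued item may run immediately; false means it
 * must wait on inactive_works, marked WORK_STRUCT_INACTIVE. */
static bool sketch_may_activate(struct pwq_sketch *pwq)
{
        if (pwq->nr_active < pwq->max_active) {
                pwq->nr_active++;
                return true;
        }
        return false;
}
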
@@ -285,26 +279,6 @@ struct wq_flusher {
struct wq_device;
/*
- * Unlike in a per-cpu workqueue where max_active limits its concurrency level
- * on each CPU, in an unbound workqueue, max_active applies to the whole system.
- * As sharing a single nr_active across multiple sockets can be very expensive,
- * the counting and enforcement is per NUMA node.
- *
- * The following struct is used to enforce per-node max_active. When a pwq wants
- * to start executing a work item, it should increment ->nr using
- * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
- * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
- * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
- * round-robin order.
- */
-struct wq_node_nr_active {
- int max; /* per-node max_active */
- atomic_t nr; /* per-node nr_active */
- raw_spinlock_t lock; /* nests inside pool locks */
- struct list_head pending_pwqs; /* LN: pwqs with inactive works */
-};
-
-/*
* The externally visible workqueue. It relays the issued work items to
* the appropriate worker_pool through its pool_workqueues.
*/
@@ -324,15 +298,10 @@ struct workqueue_struct {
struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
-
- /* See alloc_workqueue() function comment for info on min/max_active */
- int max_active; /* WO: max active works */
- int min_active; /* WO: min active works */
- int saved_max_active; /* WQ: saved max_active */
- int saved_min_active; /* WQ: saved min_active */
+ int saved_max_active; /* WQ: saved pwq max_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
- struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
+ struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
@@ -354,7 +323,6 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
- struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
static struct kmem_cache *pwq_cache;
@@ -413,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
/* to further constrain wq_unbound_cpumask by cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;
@@ -658,36 +632,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
-static struct pool_workqueue __rcu **
-unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
-{
- if (cpu >= 0)
- return per_cpu_ptr(wq->cpu_pwq, cpu);
- else
- return &wq->dfl_pwq;
-}
-
-/* @cpu < 0 for dfl_pwq */
-static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
-{
- return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
- lockdep_is_held(&wq_pool_mutex) ||
- lockdep_is_held(&wq->mutex));
-}
-
-/**
- * unbound_effective_cpumask - effective cpumask of an unbound workqueue
- * @wq: workqueue of interest
- *
- * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
- * is masked with wq_unbound_cpumask to determine the effective cpumask. The
- * default pwq is always mapped to the pool with the current effective cpumask.
- */
-static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
-{
- return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
-}
-
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -1458,71 +1402,6 @@ work_func_t wq_worker_last_func(struct task_struct *task)
}
/**
- * wq_node_nr_active - Determine wq_node_nr_active to use
- * @wq: workqueue of interest
- * @node: NUMA node, can be %NUMA_NO_NODE
- *
- * Determine wq_node_nr_active to use for @wq on @node. Returns:
- *
- * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
- *
- * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
- *
- * - Otherwise, node_nr_active[@node].
- */
-static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
- int node)
-{
- if (!(wq->flags & WQ_UNBOUND))
- return NULL;
-
- if (node == NUMA_NO_NODE)
- node = nr_node_ids;
-
- return wq->node_nr_active[node];
-}
-
-/**
- * wq_update_node_max_active - Update per-node max_actives to use
- * @wq: workqueue to update
- * @off_cpu: CPU that's going down, -1 if a CPU is not going down
- *
- * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
- * distributed among nodes according to the proportions of numbers of online
- * cpus. The result is always between @wq->min_active and max_active.
- */
-static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
-{
- struct cpumask *effective = unbound_effective_cpumask(wq);
- int min_active = READ_ONCE(wq->min_active);
- int max_active = READ_ONCE(wq->max_active);
- int total_cpus, node;
-
- lockdep_assert_held(&wq->mutex);
-
- if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
- off_cpu = -1;
-
- total_cpus = cpumask_weight_and(effective, cpu_online_mask);
- if (off_cpu >= 0)
- total_cpus--;
-
- for_each_node(node) {
- int node_cpus;
-
- node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
- if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
- node_cpus--;
-
- wq_node_nr_active(wq, node)->max =
- clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
- min_active, max_active);
- }
-
- wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
-}
-
-/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
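
The removed wq_update_node_max_active() above distributes an unbound workqueue's max_active across NUMA nodes in proportion to each node's share of online CPUs, clamped to [min_active, max_active]. A standalone sketch of just that arithmetic with made-up CPU counts; div_round_up() and clamp_int() stand in for the kernel's DIV_ROUND_UP() and clamp():

#include <stdio.h>

static int div_round_up(int a, int b) { return (a + b - 1) / b; }
static int clamp_int(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); }

int main(void)
{
        int max_active = 16, min_active = 8;
        int node_cpus[2] = { 6, 2 };   /* hypothetical online CPUs per node */
        int total_cpus = 8;

        for (int node = 0; node < 2; node++) {
                int m = clamp_int(div_round_up(max_active * node_cpus[node], total_cpus),
                                  min_active, max_active);
                printf("node %d: max = %d\n", node, m);  /* 12 for node 0, 8 for node 1 */
        }
        return 0;
}

The min_active floor lifts the smaller node from 4 up to 8, which is what keeps sparsely populated nodes making forward progress.
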
@@ -1574,293 +1453,24 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
}
}
-static bool pwq_is_empty(struct pool_workqueue *pwq)
-{
- return !pwq->nr_active && list_empty(&pwq->inactive_works);
-}
-
-static void __pwq_activate_work(struct pool_workqueue *pwq,
- struct work_struct *work)
+static void pwq_activate_inactive_work(struct work_struct *work)
{
- unsigned long *wdb = work_data_bits(work);
+ struct pool_workqueue *pwq = get_work_pwq(work);
- WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
-}
-
-/**
- * pwq_activate_work - Activate a work item if inactive
- * @pwq: pool_workqueue @work belongs to
- * @work: work item to activate
- *
- * Returns %true if activated. %false if already active.
- */
-static bool pwq_activate_work(struct pool_workqueue *pwq,
- struct work_struct *work)
-{
- struct worker_pool *pool = pwq->pool;
- struct wq_node_nr_active *nna;
-
- lockdep_assert_held(&pool->lock);
-
- if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
- return false;
-
- nna = wq_node_nr_active(pwq->wq, pool->node);
- if (nna)
- atomic_inc(&nna->nr);
-
+ __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
pwq->nr_active++;
- __pwq_activate_work(pwq, work);
- return true;
-}
-
-static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
-{
- int max = READ_ONCE(nna->max);
-
- while (true) {
- int old, tmp;
-
- old = atomic_read(&nna->nr);
- if (old >= max)
- return false;
- tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
- if (tmp == old)
- return true;
- }
-}
-
-/**
- * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
- * @pwq: pool_workqueue of interest
- * @fill: max_active may have increased, try to increase concurrency level
- *
- * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
- * successfully obtained. %false otherwise.
- */
-static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
-{
- struct workqueue_struct *wq = pwq->wq;
- struct worker_pool *pool = pwq->pool;
- struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
- bool obtained = false;
-
- lockdep_assert_held(&pool->lock);
-
- if (!nna) {
- /* per-cpu workqueue, pwq->nr_active is sufficient */
- obtained = pwq->nr_active < READ_ONCE(wq->max_active);
- goto out;
- }
-
- /*
- * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
- * already waiting on $nna, pwq_dec_nr_active() will maintain the
- * concurrency level. Don't jump the line.
- *
- * We need to ignore the pending test after max_active has increased as
- * pwq_dec_nr_active() can only maintain the concurrency level but not
- * increase it. This is indicated by @fill.
- */
- if (!list_empty(&pwq->pending_node) && likely(!fill))
- goto out;
-
- obtained = tryinc_node_nr_active(nna);
- if (obtained)
- goto out;
-
- /*
- * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
- * and try again. The smp_mb() is paired with the implied memory barrier
- * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
- * we see the decremented $nna->nr or they see non-empty
- * $nna->pending_pwqs.
- */
- raw_spin_lock(&nna->lock);
-
- if (list_empty(&pwq->pending_node))
- list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
- else if (likely(!fill))
- goto out_unlock;
-
- smp_mb();
-
- obtained = tryinc_node_nr_active(nna);
-
- /*
- * If @fill, @pwq might have already been pending. Being spuriously
- * pending in cold paths doesn't affect anything. Let's leave it be.
- */
- if (obtained && likely(!fill))
- list_del_init(&pwq->pending_node);
-
-out_unlock:
- raw_spin_unlock(&nna->lock);
-out:
- if (obtained)
- pwq->nr_active++;
- return obtained;
-}
-
-/**
- * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
- * @pwq: pool_workqueue of interest
- * @fill: max_active may have increased, try to increase concurrency level
- *
- * Activate the first inactive work item of @pwq if available and allowed by
- * max_active limit.
- *
- * Returns %true if an inactive work item has been activated. %false if no
- * inactive work item is found or max_active limit is reached.
- */
-static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
-{
- struct work_struct *work =
- list_first_entry_or_null(&pwq->inactive_works,
- struct work_struct, entry);
-
- if (work && pwq_tryinc_nr_active(pwq, fill)) {
- __pwq_activate_work(pwq, work);
- return true;
- } else {
- return false;
- }
-}
-
-/**
- * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
- * @nna: wq_node_nr_active to activate a pending pwq for
- * @caller_pool: worker_pool the caller is locking
- *
- * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
- * @caller_pool may be unlocked and relocked to lock other worker_pools.
- */
-static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
- struct worker_pool *caller_pool)
-{
- struct worker_pool *locked_pool = caller_pool;
- struct pool_workqueue *pwq;
- struct work_struct *work;
-
- lockdep_assert_held(&caller_pool->lock);
-
- raw_spin_lock(&nna->lock);
-retry:
- pwq = list_first_entry_or_null(&nna->pending_pwqs,
- struct pool_workqueue, pending_node);
- if (!pwq)
- goto out_unlock;
-
- /*
- * If @pwq is for a different pool than @locked_pool, we need to lock
- * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
- * / lock dance. For that, we also need to release @nna->lock as it's
- * nested inside pool locks.
- */
- if (pwq->pool != locked_pool) {
- raw_spin_unlock(&locked_pool->lock);
- locked_pool = pwq->pool;
- if (!raw_spin_trylock(&locked_pool->lock)) {
- raw_spin_unlock(&nna->lock);
- raw_spin_lock(&locked_pool->lock);
- raw_spin_lock(&nna->lock);
- goto retry;
- }
- }
-
- /*
- * $pwq may not have any inactive work items due to e.g. cancellations.
- * Drop it from pending_pwqs and see if there's another one.
- */
- work = list_first_entry_or_null(&pwq->inactive_works,
- struct work_struct, entry);
- if (!work) {
- list_del_init(&pwq->pending_node);
- goto retry;
- }
-
- /*
- * Acquire an nr_active count and activate the inactive work item. If
- * $pwq still has inactive work items, rotate it to the end of the
- * pending_pwqs so that we round-robin through them. This means that
- * inactive work items are not activated in queueing order which is fine
- * given that there has never been any ordering across different pwqs.
- */
- if (likely(tryinc_node_nr_active(nna))) {
- pwq->nr_active++;
- __pwq_activate_work(pwq, work);
-
- if (list_empty(&pwq->inactive_works))
- list_del_init(&pwq->pending_node);
- else
- list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
-
- /* if activating a foreign pool, make sure it's running */
- if (pwq->pool != caller_pool)
- kick_pool(pwq->pool);
- }
-
-out_unlock:
- raw_spin_unlock(&nna->lock);
- if (locked_pool != caller_pool) {
- raw_spin_unlock(&locked_pool->lock);
- raw_spin_lock(&caller_pool->lock);
- }
}
-/**
- * pwq_dec_nr_active - Retire an active count
- * @pwq: pool_workqueue of interest
- *
- * Decrement @pwq's nr_active and try to activate the first inactive work item.
- * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
- */
-static void pwq_dec_nr_active(struct pool_workqueue *pwq)
+static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
{
- struct worker_pool *pool = pwq->pool;
- struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
-
- lockdep_assert_held(&pool->lock);
-
- /*
- * @pwq->nr_active should be decremented for both percpu and unbound
- * workqueues.
- */
- pwq->nr_active--;
-
- /*
- * For a percpu workqueue, it's simple. Just need to kick the first
- * inactive work item on @pwq itself.
- */
- if (!nna) {
- pwq_activate_first_inactive(pwq, false);
- return;
- }
-
- /*
- * If @pwq is for an unbound workqueue, it's more complicated because
- * multiple pwqs and pools may be sharing the nr_active count. When a
- * pwq needs to wait for an nr_active count, it puts itself on
- * $nna->pending_pwqs. The following atomic_dec_return()'s implied
- * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
- * guarantee that either we see non-empty pending_pwqs or they see
- * decremented $nna->nr.
- *
- * $nna->max may change as CPUs come online/offline and @pwq->wq's
- * max_active gets updated. However, it is guaranteed to be equal to or
- * larger than @pwq->wq->min_active which is above zero unless freezing.
- * This maintains the forward progress guarantee.
- */
- if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
- return;
+ struct work_struct *work = list_first_entry(&pwq->inactive_works,
+ struct work_struct, entry);
- if (!list_empty(&nna->pending_pwqs))
- node_activate_pending_pwq(nna, pool);
+ pwq_activate_inactive_work(work);
}
/**
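
The block above removes the shared per-node counter and its helpers; the dropped tryinc_node_nr_active() was a bounded, lockless increment: read the counter, give up if it already sits at max, otherwise cmpxchg and retry on contention. A standalone C11 sketch of the same pattern (try_inc_bounded is a hypothetical name, not the removed kernel helper):

#include <stdatomic.h>
#include <stdbool.h>

/* Try to take one slot of a bounded counter without locking.
 * Returns true if a slot was obtained, false if the counter is full. */
static bool try_inc_bounded(atomic_int *nr, int max)
{
        int old = atomic_load_explicit(nr, memory_order_relaxed);

        while (old < max) {
                /* On success, *nr moves from old to old + 1 and the caller
                 * owns a slot; on failure, old is refreshed and the bound
                 * is rechecked. */
                if (atomic_compare_exchange_weak_explicit(nr, &old, old + 1,
                                                          memory_order_relaxed,
                                                          memory_order_relaxed))
                        return true;
        }
        return false;
}
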
@@ -1878,8 +1488,14 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_
{
int color = get_work_color(work_data);
- if (!(work_data & WORK_STRUCT_INACTIVE))
- pwq_dec_nr_active(pwq);
+ if (!(work_data & WORK_STRUCT_INACTIVE)) {
+ pwq->nr_active--;
+ if (!list_empty(&pwq->inactive_works)) {
+ /* one down, submit an inactive one */
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_inactive(pwq);
+ }
+ }
pwq->nr_in_flight[color]--;
@@ -1992,7 +1608,8 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
- pwq_activate_work(pwq, work);
+ if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
+ pwq_activate_inactive_work(work);
list_del_init(&work->entry);
pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
@@ -2176,16 +1793,12 @@ retry:
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
- /*
- * Limit the number of concurrently active work items to max_active.
- * @work must also queue behind existing inactive work items to maintain
- * ordering when max_active changes. See wq_adjust_max_active().
- */
- if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
+ if (likely(pwq->nr_active < pwq->max_active)) {
if (list_empty(&pool->worklist))
pool->watchdog_ts = jiffies;
trace_workqueue_activate_work(work);
+ pwq->nr_active++;
insert_work(pwq, work, &pool->worklist, work_flags);
kick_pool(pool);
} else {
@@ -3414,7 +3027,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
barr->task = current;
- /* The barrier work item does not participate in nr_active. */
+ /* The barrier work item does not participate in pwq->nr_active. */
work_flags |= WORK_STRUCT_INACTIVE;
/*
@@ -3703,7 +3316,7 @@ reflush:
bool drained;
raw_spin_lock_irq(&pwq->pool->lock);
- drained = pwq_is_empty(pwq);
+ drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
@@ -4314,65 +3927,11 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
}
#endif
-static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
-{
- int node;
-
- for_each_node(node) {
- kfree(nna_ar[node]);
- nna_ar[node] = NULL;
- }
-
- kfree(nna_ar[nr_node_ids]);
- nna_ar[nr_node_ids] = NULL;
-}
-
-static void init_node_nr_active(struct wq_node_nr_active *nna)
-{
- atomic_set(&nna->nr, 0);
- raw_spin_lock_init(&nna->lock);
- INIT_LIST_HEAD(&nna->pending_pwqs);
-}
-
-/*
- * Each node's nr_active counter will be accessed mostly from its own node and
- * should be allocated in the node.
- */
-static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
-{
- struct wq_node_nr_active *nna;
- int node;
-
- for_each_node(node) {
- nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
- if (!nna)
- goto err_free;
- init_node_nr_active(nna);
- nna_ar[node] = nna;
- }
-
- /* [nr_node_ids] is used as the fallback */
- nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
- if (!nna)
- goto err_free;
- init_node_nr_active(nna);
- nna_ar[nr_node_ids] = nna;
-
- return 0;
-
-err_free:
- free_node_nr_active(nna_ar);
- return -ENOMEM;
-}
-
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
- if (wq->flags & WQ_UNBOUND)
- free_node_nr_active(wq->node_nr_active);
-
wq_free_lockdep(wq);
free_percpu(wq->cpu_pwq);
free_workqueue_attrs(wq->unbound_attrs);
@@ -4571,15 +4130,6 @@ static void pwq_release_workfn(struct kthread_work *work)
mutex_unlock(&wq_pool_mutex);
}
- if (!list_empty(&pwq->pending_node)) {
- struct wq_node_nr_active *nna =
- wq_node_nr_active(pwq->wq, pwq->pool->node);
-
- raw_spin_lock_irq(&nna->lock);
- list_del_init(&pwq->pending_node);
- raw_spin_unlock_irq(&nna->lock);
- }
-
call_rcu(&pwq->rcu, rcu_free_pwq);
/*
@@ -4592,6 +4142,50 @@ static void pwq_release_workfn(struct kthread_work *work)
}
}
+/**
+ * pwq_adjust_max_active - update a pwq's max_active to the current setting
+ * @pwq: target pool_workqueue
+ *
+ * If @pwq isn't freezing, set @pwq->max_active to the associated
+ * workqueue's saved_max_active and activate inactive work items
+ * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
+ */
+static void pwq_adjust_max_active(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ bool freezable = wq->flags & WQ_FREEZABLE;
+ unsigned long flags;
+
+ /* for @wq->saved_max_active */
+ lockdep_assert_held(&wq->mutex);
+
+ /* fast exit for non-freezable wqs */
+ if (!freezable && pwq->max_active == wq->saved_max_active)
+ return;
+
+ /* this function can be called during early boot w/ irq disabled */
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
+
+ /*
+ * During [un]freezing, the caller is responsible for ensuring that
+ * this function is called at least once after @workqueue_freezing
+ * is updated and visible.
+ */
+ if (!freezable || !workqueue_freezing) {
+ pwq->max_active = wq->saved_max_active;
+
+ while (!list_empty(&pwq->inactive_works) &&
+ pwq->nr_active < pwq->max_active)
+ pwq_activate_first_inactive(pwq);
+
+ kick_pool(pwq->pool);
+ } else {
+ pwq->max_active = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+}
+
/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
@@ -4605,7 +4199,6 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->inactive_works);
- INIT_LIST_HEAD(&pwq->pending_node);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
@@ -4625,6 +4218,9 @@ static void link_pwq(struct pool_workqueue *pwq)
/* set the matching work_color */
pwq->work_color = wq->work_color;
+ /* sync max_active to the current setting */
+ pwq_adjust_max_active(pwq);
+
/* link in @pwq */
list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}
@@ -4693,11 +4289,10 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
"possible intersect\n");
}
-/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
+/* install @pwq into @wq's cpu_pwq and return the old pwq */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
int cpu, struct pool_workqueue *pwq)
{
- struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
struct pool_workqueue *old_pwq;
lockdep_assert_held(&wq_pool_mutex);
@@ -4706,8 +4301,8 @@ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(*slot);
- rcu_assign_pointer(*slot, pwq);
+ old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
return old_pwq;
}
@@ -4807,31 +4402,18 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
- /* save the previous pwqs and install the new ones */
+ /* save the previous pwq and install the new one */
for_each_possible_cpu(cpu)
ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
ctx->pwq_tbl[cpu]);
- ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
- /* update node_nr_active->max */
- wq_update_node_max_active(ctx->wq, -1);
+ /* @dfl_pwq might not have been used, ensure it's linked */
+ link_pwq(ctx->dfl_pwq);
+ swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
mutex_unlock(&ctx->wq->mutex);
}
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- cpus_read_lock();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- cpus_read_unlock();
-}
-
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
@@ -4937,7 +4519,9 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
/* nothing to do if the target cpumask matches the current pwq */
wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
- if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
+ pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
+ lockdep_is_held(&wq_pool_mutex));
+ if (wqattrs_equal(target_attrs, pwq->pool->attrs))
return;
/* create a new pwq */
@@ -4955,11 +4539,10 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
use_dfl_pwq:
mutex_lock(&wq->mutex);
- pwq = unbound_pwq(wq, -1);
- raw_spin_lock_irq(&pwq->pool->lock);
- get_pwq(pwq);
- raw_spin_unlock_irq(&pwq->pool->lock);
- old_pwq = install_unbound_pwq(wq, cpu, pwq);
+ raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ get_pwq(wq->dfl_pwq);
+ raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4997,13 +4580,10 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
- struct pool_workqueue *dfl_pwq;
-
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
- dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
- WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
- wq->pwqs.prev != &dfl_pwq->pwqs_node),
+ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
@@ -5078,69 +4658,6 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
}
-/**
- * wq_adjust_max_active - update a wq's max_active to the current setting
- * @wq: target workqueue
- *
- * If @wq isn't freezing, set @wq->max_active to the saved_max_active and
- * activate inactive work items accordingly. If @wq is freezing, clear
- * @wq->max_active to zero.
- */
-static void wq_adjust_max_active(struct workqueue_struct *wq)
-{
- bool activated;
- int new_max, new_min;
-
- lockdep_assert_held(&wq->mutex);
-
- if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
- new_max = 0;
- new_min = 0;
- } else {
- new_max = wq->saved_max_active;
- new_min = wq->saved_min_active;
- }
-
- if (wq->max_active == new_max && wq->min_active == new_min)
- return;
-
- /*
- * Update @wq->max/min_active and then kick inactive work items if more
- * active work items are allowed. This doesn't break work item ordering
- * because new work items are always queued behind existing inactive
- * work items if there are any.
- */
- WRITE_ONCE(wq->max_active, new_max);
- WRITE_ONCE(wq->min_active, new_min);
-
- if (wq->flags & WQ_UNBOUND)
- wq_update_node_max_active(wq, -1);
-
- if (new_max == 0)
- return;
-
- /*
- * Round-robin through pwq's activating the first inactive work item
- * until max_active is filled.
- */
- do {
- struct pool_workqueue *pwq;
-
- activated = false;
- for_each_pwq(pwq, wq) {
- unsigned long flags;
-
- /* can be called during early boot w/ irq disabled */
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq_activate_first_inactive(pwq, true)) {
- activated = true;
- kick_pool(pwq->pool);
- }
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
- }
- } while (activated);
-}
-
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
@@ -5148,8 +4665,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
{
va_list args;
struct workqueue_struct *wq;
- size_t wq_size;
- int name_len;
+ struct pool_workqueue *pwq;
/*
* Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -5165,12 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- if (flags & WQ_UNBOUND)
- wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
- else
- wq_size = sizeof(*wq);
-
- wq = kzalloc(wq_size, GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
@@ -5181,22 +4692,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
}
va_start(args, max_active);
- name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
- if (name_len >= WQ_NAME_LEN)
- pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
- wq->name);
-
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
/* init wq */
wq->flags = flags;
- wq->max_active = max_active;
- wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
- wq->saved_max_active = wq->max_active;
- wq->saved_min_active = wq->min_active;
+ wq->saved_max_active = max_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
@@ -5207,13 +4711,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
- if (flags & WQ_UNBOUND) {
- if (alloc_node_nr_active(wq->node_nr_active) < 0)
- goto err_unreg_lockdep;
- }
-
if (alloc_and_link_pwqs(wq) < 0)
- goto err_free_node_nr_active;
+ goto err_unreg_lockdep;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
@@ -5229,7 +4728,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
- wq_adjust_max_active(wq);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
list_add_tail_rcu(&wq->list, &workqueues);
@@ -5238,9 +4738,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
-err_free_node_nr_active:
- if (wq->flags & WQ_UNBOUND)
- free_node_nr_active(wq->node_nr_active);
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
@@ -5262,9 +4759,9 @@ static bool pwq_busy(struct pool_workqueue *pwq)
if (pwq->nr_in_flight[i])
return true;
- if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))
+ if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
return true;
- if (!pwq_is_empty(pwq))
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works))
return true;
return false;
@@ -5346,12 +4843,13 @@ void destroy_workqueue(struct workqueue_struct *wq)
rcu_read_lock();
for_each_possible_cpu(cpu) {
- put_pwq_unlocked(unbound_pwq(wq, cpu));
- RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
+ pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
+ put_pwq_unlocked(pwq);
}
- put_pwq_unlocked(unbound_pwq(wq, -1));
- RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);
+ put_pwq_unlocked(wq->dfl_pwq);
+ wq->dfl_pwq = NULL;
rcu_read_unlock();
}
@@ -5362,14 +4860,15 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
* @wq: target workqueue
* @max_active: new max_active value.
*
- * Set max_active of @wq to @max_active. See the alloc_workqueue() function
- * comment.
+ * Set max_active of @wq to @max_active.
*
* CONTEXT:
* Don't call from IRQ context.
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
+ struct pool_workqueue *pwq;
+
/* disallow meddling with max_active for ordered workqueues */
if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
@@ -5380,10 +4879,9 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
- if (wq->flags & WQ_UNBOUND)
- wq->saved_min_active = min(wq->saved_min_active, max_active);
- wq_adjust_max_active(wq);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
}
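
From driver code the knob above is reached through the regular workqueue API. A minimal usage sketch with hypothetical names (my_wq, my_work_fn); error handling and teardown are omitted, and with this patch the new value takes effect per pwq via pwq_adjust_max_active():

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
        /* deferred work goes here */
}

static DECLARE_WORK(my_work, my_work_fn);

static int my_init(void)
{
        /* start with at most 4 concurrently active work items */
        struct workqueue_struct *wq = alloc_workqueue("my_wq", WQ_UNBOUND, 4);

        if (!wq)
                return -ENOMEM;

        /* raise the limit later; each pwq picks up the new setting */
        workqueue_set_max_active(wq, 8);

        queue_work(wq, &my_work);
        return 0;
}
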
@@ -5630,8 +5128,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d refcnt=%d%s\n",
- pwq->nr_active, pwq->refcnt,
+ pr_cont(" active=%d/%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@ -5705,7 +5203,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
unsigned long flags;
for_each_pwq(pwq, wq) {
- if (!pwq_is_empty(pwq)) {
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
idle = false;
break;
}
@@ -5717,7 +5215,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (!pwq_is_empty(pwq)) {
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
@@ -6064,10 +5562,6 @@ int workqueue_online_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, true);
-
- mutex_lock(&wq->mutex);
- wq_update_node_max_active(wq, -1);
- mutex_unlock(&wq->mutex);
}
}
@@ -6096,10 +5590,6 @@ int workqueue_offline_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, false);
-
- mutex_lock(&wq->mutex);
- wq_update_node_max_active(wq, cpu);
- mutex_unlock(&wq->mutex);
}
}
mutex_unlock(&wq_pool_mutex);
@@ -6187,6 +5677,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
void freeze_workqueues_begin(void)
{
struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -6195,7 +5686,8 @@ void freeze_workqueues_begin(void)
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- wq_adjust_max_active(wq);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
}
@@ -6260,6 +5752,7 @@ out_unlock:
void thaw_workqueues(void)
{
struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -6271,7 +5764,8 @@ void thaw_workqueues(void)
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- wq_adjust_max_active(wq);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
}
@@ -6320,39 +5814,40 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
/**
- * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
- * @cpumask: the cpumask to set
- *
- * The low-level workqueues cpumask is a global cpumask that limits
- * the affinity of all unbound workqueues. This function checks the @cpumask
- * and applies it to all unbound workqueues, updating all of their pwqs.
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
- * Return: 0 - Success
- * -EINVAL - Invalid @cpumask
- * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
*/
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
- int ret = -EINVAL;
+ cpumask_var_t cpumask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ lockdep_assert_cpus_held();
+ mutex_lock(&wq_pool_mutex);
+
+ /* Save the current isolated cpumask & export it via sysfs */
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
/*
- * Not excluding isolated cpus on purpose.
- * If the user wishes to include them, we allow that.
+ * If the operation fails, it will fall back to
+ * wq_requested_unbound_cpumask, which is initially set to the
+ * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) housekeeping mask and rewritten
+ * by any subsequent write to the workqueue/cpumask sysfs file.
*/
- cpumask_and(cpumask, cpumask, cpu_possible_mask);
- if (!cpumask_empty(cpumask)) {
- apply_wqattrs_lock();
- if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
- ret = 0;
- goto out_unlock;
- }
-
+ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
-out_unlock:
- apply_wqattrs_unlock();
- }
-
+ mutex_unlock(&wq_pool_mutex);
+ free_cpumask_var(cpumask);
return ret;
}
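
workqueue_unbound_exclude_cpumask() above derives the effective unbound mask as the user-requested mask minus the isolated CPUs, and keeps the requested mask unchanged when that subtraction would leave nothing (cpumask_andnot() returning false). A small sketch of that fallback using plain 64-bit masks instead of the kernel cpumask API; effective_unbound_mask() is a hypothetical name:

#include <stdint.h>
#include <stdio.h>

/* Effective mask = requested & ~isolated, unless that would be empty,
 * in which case the requested mask is used unchanged. */
static uint64_t effective_unbound_mask(uint64_t requested, uint64_t isolated)
{
        uint64_t m = requested & ~isolated;

        return m ? m : requested;
}

int main(void)
{
        /* CPUs 0-3 requested, CPUs 2-3 isolated -> CPUs 0-1 remain (0x3). */
        printf("%#llx\n", (unsigned long long)effective_unbound_mask(0xf, 0xc));
        /* Everything isolated -> fall back to the requested mask (0xf). */
        printf("%#llx\n", (unsigned long long)effective_unbound_mask(0xf, 0xf));
        return 0;
}
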
@@ -6474,6 +5969,19 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
+static void apply_wqattrs_lock(void)
+{
+ /* CPUs should stay stable across pwq creations and installations */
+ cpus_read_lock();
+ mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+ cpus_read_unlock();
+}
+
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -6650,19 +6158,74 @@ static struct bus_type wq_subsys = {
.dev_groups = wq_sysfs_groups,
};
-static ssize_t wq_unbound_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+/**
+ * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ * @cpumask: the cpumask to set
+ *
+ * The low-level workqueues cpumask is a global cpumask that limits
+ * the affinity of all unbound workqueues. This function checks the @cpumask
+ * and applies it to all unbound workqueues, updating all of their pwqs.
+ *
+ * Return: 0 - Success
+ * -EINVAL - Invalid @cpumask
+ * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+ int ret = -EINVAL;
+
+ /*
+ * Not excluding isolated cpus on purpose.
+ * If the user wishes to include them, we allow that.
+ */
+ cpumask_and(cpumask, cpumask, cpu_possible_mask);
+ if (!cpumask_empty(cpumask)) {
+ apply_wqattrs_lock();
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
+ if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+
+out_unlock:
+ apply_wqattrs_unlock();
+ }
+
+ return ret;
+}
+
+static ssize_t __wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
int written;
mutex_lock(&wq_pool_mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq_unbound_cpumask));
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
mutex_unlock(&wq_pool_mutex);
return written;
}
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
static ssize_t wq_unbound_cpumask_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -6680,9 +6243,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
return ret ? ret : count;
}
-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
- wq_unbound_cpumask_store);
+ wq_unbound_cpumask_store),
+ __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+ __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+ __ATTR_NULL,
+};
static int __init wq_sysfs_init(void)
{
@@ -6695,7 +6262,13 @@ static int __init wq_sysfs_init(void)
dev_root = bus_get_dev_root(&wq_subsys);
if (dev_root) {
- err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+ err = device_create_file(dev_root, attr);
+ if (err)
+ break;
+ }
put_device(dev_root);
}
return err;
@@ -7037,12 +6610,17 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+
cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
if (!cpumask_empty(&wq_cmdline_cpumask))
restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
+ cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
+
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
wq_update_pod_attrs_buf = alloc_workqueue_attrs();
@@ -7109,7 +6687,7 @@ void __init workqueue_init_early(void)
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
WQ_POWER_EFFICIENT, 0);
- system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
@@ -7296,12 +6874,8 @@ void __init workqueue_init_topology(void)
* combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
wq_update_pod(wq, cpu, cpu, true);
- if (wq->flags & WQ_UNBOUND) {
- mutex_lock(&wq->mutex);
- wq_update_node_max_active(wq, -1);
- mutex_unlock(&wq->mutex);
}
}