summaryrefslogtreecommitdiffstats
path: root/kernel/workqueue.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:47:50 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:47:50 +0000
commit7c0639a3af697d4ae7a5db4d2ecc09eed43cad35 (patch)
treeb28a6eef28064256422bed5e477ee51f2cbb0c0b /kernel/workqueue.c
parentAdding debian version 6.7.9-2. (diff)
downloadlinux-7c0639a3af697d4ae7a5db4d2ecc09eed43cad35.tar.xz
linux-7c0639a3af697d4ae7a5db4d2ecc09eed43cad35.zip
Merging upstream version 6.7.12.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--kernel/workqueue.c757
1 files changed, 630 insertions, 127 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4f87b1851c..8f761417a9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -108,7 +108,7 @@ enum {
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
- WQ_NAME_LEN = 24,
+ WQ_NAME_LEN = 32,
};
/*
@@ -122,6 +122,9 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
+ * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
+ * reads.
+ *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -143,6 +146,9 @@ enum {
*
* WR: wq->mutex protected for writes. RCU protected for reads.
*
+ * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
+ * with READ_ONCE() without locking.
+ *
* MD: wq_mayday_lock protected.
*
* WD: Used internally by the watchdog.
@@ -240,18 +246,18 @@ struct pool_workqueue {
* pwq->inactive_works instead of pool->worklist and marked with
* WORK_STRUCT_INACTIVE.
*
- * All work items marked with WORK_STRUCT_INACTIVE do not participate
- * in pwq->nr_active and all work items in pwq->inactive_works are
- * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
- * work items are in pwq->inactive_works. Some of them are ready to
- * run in pool->worklist or worker->scheduled. Those work itmes are
- * only struct wq_barrier which is used for flush_work() and should
- * not participate in pwq->nr_active. For non-barrier work item, it
- * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate in
+ * nr_active and all work items in pwq->inactive_works are marked with
+ * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
+ * in pwq->inactive_works. Some of them are ready to run in
+ * pool->worklist or worker->scheduled. Those work itmes are only struct
+ * wq_barrier which is used for flush_work() and should not participate
+ * in nr_active. For non-barrier work item, it is marked with
+ * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/
int nr_active; /* L: nr of active works */
- int max_active; /* L: max active works */
struct list_head inactive_works; /* L: inactive works */
+ struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
@@ -279,6 +285,26 @@ struct wq_flusher {
struct wq_device;
/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ *
+ * The following struct is used to enforce per-node max_active. When a pwq wants
+ * to start executing a work item, it should increment ->nr using
+ * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
+ * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
+ * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
+ * round-robin order.
+ */
+struct wq_node_nr_active {
+ int max; /* per-node max_active */
+ atomic_t nr; /* per-node nr_active */
+ raw_spinlock_t lock; /* nests inside pool locks */
+ struct list_head pending_pwqs; /* LN: pwqs with inactive works */
+};
+
+/*
* The externally visible workqueue. It relays the issued work items to
* the appropriate worker_pool through its pool_workqueues.
*/
@@ -298,10 +324,15 @@ struct workqueue_struct {
struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
- int saved_max_active; /* WQ: saved pwq max_active */
+
+ /* See alloc_workqueue() function comment for info on min/max_active */
+ int max_active; /* WO: max active works */
+ int min_active; /* WO: min active works */
+ int saved_max_active; /* WQ: saved max_active */
+ int saved_min_active; /* WQ: saved min_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
- struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
+ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
@@ -323,6 +354,7 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
static struct kmem_cache *pwq_cache;
@@ -626,6 +658,36 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
+static struct pool_workqueue __rcu **
+unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
+{
+ if (cpu >= 0)
+ return per_cpu_ptr(wq->cpu_pwq, cpu);
+ else
+ return &wq->dfl_pwq;
+}
+
+/* @cpu < 0 for dfl_pwq */
+static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
+{
+ return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
+ lockdep_is_held(&wq_pool_mutex) ||
+ lockdep_is_held(&wq->mutex));
+}
+
+/**
+ * unbound_effective_cpumask - effective cpumask of an unbound workqueue
+ * @wq: workqueue of interest
+ *
+ * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
+ * is masked with wq_unbound_cpumask to determine the effective cpumask. The
+ * default pwq is always mapped to the pool with the current effective cpumask.
+ */
+static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
+{
+ return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
+}
+
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -1396,6 +1458,71 @@ work_func_t wq_worker_last_func(struct task_struct *task)
}
/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
+ *
+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
+ *
+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
+ *
+ * - Otherwise, node_nr_active[@node].
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+ int node)
+{
+ if (!(wq->flags & WQ_UNBOUND))
+ return NULL;
+
+ if (node == NUMA_NO_NODE)
+ node = nr_node_ids;
+
+ return wq->node_nr_active[node];
+}
+
+/**
+ * wq_update_node_max_active - Update per-node max_actives to use
+ * @wq: workqueue to update
+ * @off_cpu: CPU that's going down, -1 if a CPU is not going down
+ *
+ * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
+ * distributed among nodes according to the proportions of numbers of online
+ * cpus. The result is always between @wq->min_active and max_active.
+ */
+static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
+{
+ struct cpumask *effective = unbound_effective_cpumask(wq);
+ int min_active = READ_ONCE(wq->min_active);
+ int max_active = READ_ONCE(wq->max_active);
+ int total_cpus, node;
+
+ lockdep_assert_held(&wq->mutex);
+
+ if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
+ off_cpu = -1;
+
+ total_cpus = cpumask_weight_and(effective, cpu_online_mask);
+ if (off_cpu >= 0)
+ total_cpus--;
+
+ for_each_node(node) {
+ int node_cpus;
+
+ node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
+ if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
+ node_cpus--;
+
+ wq_node_nr_active(wq, node)->max =
+ clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
+ min_active, max_active);
+ }
+
+ wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+}
+
+/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1447,24 +1574,293 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
}
}
-static void pwq_activate_inactive_work(struct work_struct *work)
+static bool pwq_is_empty(struct pool_workqueue *pwq)
{
- struct pool_workqueue *pwq = get_work_pwq(work);
+ return !pwq->nr_active && list_empty(&pwq->inactive_works);
+}
+static void __pwq_activate_work(struct pool_workqueue *pwq,
+ struct work_struct *work)
+{
+ unsigned long *wdb = work_data_bits(work);
+
+ WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
+ __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
+}
+
+/**
+ * pwq_activate_work - Activate a work item if inactive
+ * @pwq: pool_workqueue @work belongs to
+ * @work: work item to activate
+ *
+ * Returns %true if activated. %false if already active.
+ */
+static bool pwq_activate_work(struct pool_workqueue *pwq,
+ struct work_struct *work)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
+ return false;
+
+ nna = wq_node_nr_active(pwq->wq, pool->node);
+ if (nna)
+ atomic_inc(&nna->nr);
+
pwq->nr_active++;
+ __pwq_activate_work(pwq, work);
+ return true;
+}
+
+static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
+{
+ int max = READ_ONCE(nna->max);
+
+ while (true) {
+ int old, tmp;
+
+ old = atomic_read(&nna->nr);
+ if (old >= max)
+ return false;
+ tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
+ if (tmp == old)
+ return true;
+ }
+}
+
+/**
+ * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
+ * @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
+ *
+ * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
+ * successfully obtained. %false otherwise.
+ */
+static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
+ bool obtained = false;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!nna) {
+ /* per-cpu workqueue, pwq->nr_active is sufficient */
+ obtained = pwq->nr_active < READ_ONCE(wq->max_active);
+ goto out;
+ }
+
+ /*
+ * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
+ * already waiting on $nna, pwq_dec_nr_active() will maintain the
+ * concurrency level. Don't jump the line.
+ *
+ * We need to ignore the pending test after max_active has increased as
+ * pwq_dec_nr_active() can only maintain the concurrency level but not
+ * increase it. This is indicated by @fill.
+ */
+ if (!list_empty(&pwq->pending_node) && likely(!fill))
+ goto out;
+
+ obtained = tryinc_node_nr_active(nna);
+ if (obtained)
+ goto out;
+
+ /*
+ * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
+ * and try again. The smp_mb() is paired with the implied memory barrier
+ * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
+ * we see the decremented $nna->nr or they see non-empty
+ * $nna->pending_pwqs.
+ */
+ raw_spin_lock(&nna->lock);
+
+ if (list_empty(&pwq->pending_node))
+ list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
+ else if (likely(!fill))
+ goto out_unlock;
+
+ smp_mb();
+
+ obtained = tryinc_node_nr_active(nna);
+
+ /*
+ * If @fill, @pwq might have already been pending. Being spuriously
+ * pending in cold paths doesn't affect anything. Let's leave it be.
+ */
+ if (obtained && likely(!fill))
+ list_del_init(&pwq->pending_node);
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+out:
+ if (obtained)
+ pwq->nr_active++;
+ return obtained;
+}
+
+/**
+ * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
+ * @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
+ *
+ * Activate the first inactive work item of @pwq if available and allowed by
+ * max_active limit.
+ *
+ * Returns %true if an inactive work item has been activated. %false if no
+ * inactive work item is found or max_active limit is reached.
+ */
+static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
+{
+ struct work_struct *work =
+ list_first_entry_or_null(&pwq->inactive_works,
+ struct work_struct, entry);
+
+ if (work && pwq_tryinc_nr_active(pwq, fill)) {
+ __pwq_activate_work(pwq, work);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/**
+ * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
+ * @nna: wq_node_nr_active to activate a pending pwq for
+ * @caller_pool: worker_pool the caller is locking
+ *
+ * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
+ * @caller_pool may be unlocked and relocked to lock other worker_pools.
+ */
+static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
+ struct worker_pool *caller_pool)
+{
+ struct worker_pool *locked_pool = caller_pool;
+ struct pool_workqueue *pwq;
+ struct work_struct *work;
+
+ lockdep_assert_held(&caller_pool->lock);
+
+ raw_spin_lock(&nna->lock);
+retry:
+ pwq = list_first_entry_or_null(&nna->pending_pwqs,
+ struct pool_workqueue, pending_node);
+ if (!pwq)
+ goto out_unlock;
+
+ /*
+ * If @pwq is for a different pool than @locked_pool, we need to lock
+ * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
+ * / lock dance. For that, we also need to release @nna->lock as it's
+ * nested inside pool locks.
+ */
+ if (pwq->pool != locked_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ locked_pool = pwq->pool;
+ if (!raw_spin_trylock(&locked_pool->lock)) {
+ raw_spin_unlock(&nna->lock);
+ raw_spin_lock(&locked_pool->lock);
+ raw_spin_lock(&nna->lock);
+ goto retry;
+ }
+ }
+
+ /*
+ * $pwq may not have any inactive work items due to e.g. cancellations.
+ * Drop it from pending_pwqs and see if there's another one.
+ */
+ work = list_first_entry_or_null(&pwq->inactive_works,
+ struct work_struct, entry);
+ if (!work) {
+ list_del_init(&pwq->pending_node);
+ goto retry;
+ }
+
+ /*
+ * Acquire an nr_active count and activate the inactive work item. If
+ * $pwq still has inactive work items, rotate it to the end of the
+ * pending_pwqs so that we round-robin through them. This means that
+ * inactive work items are not activated in queueing order which is fine
+ * given that there has never been any ordering across different pwqs.
+ */
+ if (likely(tryinc_node_nr_active(nna))) {
+ pwq->nr_active++;
+ __pwq_activate_work(pwq, work);
+
+ if (list_empty(&pwq->inactive_works))
+ list_del_init(&pwq->pending_node);
+ else
+ list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
+
+ /* if activating a foreign pool, make sure it's running */
+ if (pwq->pool != caller_pool)
+ kick_pool(pwq->pool);
+ }
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+ if (locked_pool != caller_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ raw_spin_lock(&caller_pool->lock);
+ }
}
-static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
+/**
+ * pwq_dec_nr_active - Retire an active count
+ * @pwq: pool_workqueue of interest
+ *
+ * Decrement @pwq's nr_active and try to activate the first inactive work item.
+ * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
+ */
+static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
- struct work_struct *work = list_first_entry(&pwq->inactive_works,
- struct work_struct, entry);
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
- pwq_activate_inactive_work(work);
+ lockdep_assert_held(&pool->lock);
+
+ /*
+ * @pwq->nr_active should be decremented for both percpu and unbound
+ * workqueues.
+ */
+ pwq->nr_active--;
+
+ /*
+ * For a percpu workqueue, it's simple. Just need to kick the first
+ * inactive work item on @pwq itself.
+ */
+ if (!nna) {
+ pwq_activate_first_inactive(pwq, false);
+ return;
+ }
+
+ /*
+ * If @pwq is for an unbound workqueue, it's more complicated because
+ * multiple pwqs and pools may be sharing the nr_active count. When a
+ * pwq needs to wait for an nr_active count, it puts itself on
+ * $nna->pending_pwqs. The following atomic_dec_return()'s implied
+ * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
+ * guarantee that either we see non-empty pending_pwqs or they see
+ * decremented $nna->nr.
+ *
+ * $nna->max may change as CPUs come online/offline and @pwq->wq's
+ * max_active gets updated. However, it is guaranteed to be equal to or
+ * larger than @pwq->wq->min_active which is above zero unless freezing.
+ * This maintains the forward progress guarantee.
+ */
+ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
+ return;
+
+ if (!list_empty(&nna->pending_pwqs))
+ node_activate_pending_pwq(nna, pool);
}
/**
@@ -1482,14 +1878,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_
{
int color = get_work_color(work_data);
- if (!(work_data & WORK_STRUCT_INACTIVE)) {
- pwq->nr_active--;
- if (!list_empty(&pwq->inactive_works)) {
- /* one down, submit an inactive one */
- if (pwq->nr_active < pwq->max_active)
- pwq_activate_first_inactive(pwq);
- }
- }
+ if (!(work_data & WORK_STRUCT_INACTIVE))
+ pwq_dec_nr_active(pwq);
pwq->nr_in_flight[color]--;
@@ -1602,8 +1992,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
- if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
- pwq_activate_inactive_work(work);
+ pwq_activate_work(pwq, work);
list_del_init(&work->entry);
pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
@@ -1787,12 +2176,16 @@ retry:
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
- if (likely(pwq->nr_active < pwq->max_active)) {
+ /*
+ * Limit the number of concurrently active work items to max_active.
+ * @work must also queue behind existing inactive work items to maintain
+ * ordering when max_active changes. See wq_adjust_max_active().
+ */
+ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
pool->watchdog_ts = jiffies;
trace_workqueue_activate_work(work);
- pwq->nr_active++;
insert_work(pwq, work, &pool->worklist, work_flags);
kick_pool(pool);
} else {
@@ -3021,7 +3414,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
barr->task = current;
- /* The barrier work item does not participate in pwq->nr_active. */
+ /* The barrier work item does not participate in nr_active. */
work_flags |= WORK_STRUCT_INACTIVE;
/*
@@ -3310,7 +3703,7 @@ reflush:
bool drained;
raw_spin_lock_irq(&pwq->pool->lock);
- drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
+ drained = pwq_is_empty(pwq);
raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
@@ -3921,11 +4314,65 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
}
#endif
+static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+ int node;
+
+ for_each_node(node) {
+ kfree(nna_ar[node]);
+ nna_ar[node] = NULL;
+ }
+
+ kfree(nna_ar[nr_node_ids]);
+ nna_ar[nr_node_ids] = NULL;
+}
+
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+ atomic_set(&nna->nr, 0);
+ raw_spin_lock_init(&nna->lock);
+ INIT_LIST_HEAD(&nna->pending_pwqs);
+}
+
+/*
+ * Each node's nr_active counter will be accessed mostly from its own node and
+ * should be allocated in the node.
+ */
+static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+ struct wq_node_nr_active *nna;
+ int node;
+
+ for_each_node(node) {
+ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
+ if (!nna)
+ goto err_free;
+ init_node_nr_active(nna);
+ nna_ar[node] = nna;
+ }
+
+ /* [nr_node_ids] is used as the fallback */
+ nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
+ if (!nna)
+ goto err_free;
+ init_node_nr_active(nna);
+ nna_ar[nr_node_ids] = nna;
+
+ return 0;
+
+err_free:
+ free_node_nr_active(nna_ar);
+ return -ENOMEM;
+}
+
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
+ if (wq->flags & WQ_UNBOUND)
+ free_node_nr_active(wq->node_nr_active);
+
wq_free_lockdep(wq);
free_percpu(wq->cpu_pwq);
free_workqueue_attrs(wq->unbound_attrs);
@@ -4124,6 +4571,15 @@ static void pwq_release_workfn(struct kthread_work *work)
mutex_unlock(&wq_pool_mutex);
}
+ if (!list_empty(&pwq->pending_node)) {
+ struct wq_node_nr_active *nna =
+ wq_node_nr_active(pwq->wq, pwq->pool->node);
+
+ raw_spin_lock_irq(&nna->lock);
+ list_del_init(&pwq->pending_node);
+ raw_spin_unlock_irq(&nna->lock);
+ }
+
call_rcu(&pwq->rcu, rcu_free_pwq);
/*
@@ -4136,50 +4592,6 @@ static void pwq_release_workfn(struct kthread_work *work)
}
}
-/**
- * pwq_adjust_max_active - update a pwq's max_active to the current setting
- * @pwq: target pool_workqueue
- *
- * If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate inactive work items
- * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
- */
-static void pwq_adjust_max_active(struct pool_workqueue *pwq)
-{
- struct workqueue_struct *wq = pwq->wq;
- bool freezable = wq->flags & WQ_FREEZABLE;
- unsigned long flags;
-
- /* for @wq->saved_max_active */
- lockdep_assert_held(&wq->mutex);
-
- /* fast exit for non-freezable wqs */
- if (!freezable && pwq->max_active == wq->saved_max_active)
- return;
-
- /* this function can be called during early boot w/ irq disabled */
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
-
- /*
- * During [un]freezing, the caller is responsible for ensuring that
- * this function is called at least once after @workqueue_freezing
- * is updated and visible.
- */
- if (!freezable || !workqueue_freezing) {
- pwq->max_active = wq->saved_max_active;
-
- while (!list_empty(&pwq->inactive_works) &&
- pwq->nr_active < pwq->max_active)
- pwq_activate_first_inactive(pwq);
-
- kick_pool(pwq->pool);
- } else {
- pwq->max_active = 0;
- }
-
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
-}
-
/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
@@ -4193,6 +4605,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->inactive_works);
+ INIT_LIST_HEAD(&pwq->pending_node);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
@@ -4212,9 +4625,6 @@ static void link_pwq(struct pool_workqueue *pwq)
/* set the matching work_color */
pwq->work_color = wq->work_color;
- /* sync max_active to the current setting */
- pwq_adjust_max_active(pwq);
-
/* link in @pwq */
list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}
@@ -4283,10 +4693,11 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
"possible intersect\n");
}
-/* install @pwq into @wq's cpu_pwq and return the old pwq */
+/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
int cpu, struct pool_workqueue *pwq)
{
+ struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
struct pool_workqueue *old_pwq;
lockdep_assert_held(&wq_pool_mutex);
@@ -4295,8 +4706,8 @@ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
- rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
+ old_pwq = rcu_access_pointer(*slot);
+ rcu_assign_pointer(*slot, pwq);
return old_pwq;
}
@@ -4396,14 +4807,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
- /* save the previous pwq and install the new one */
+ /* save the previous pwqs and install the new ones */
for_each_possible_cpu(cpu)
ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
ctx->pwq_tbl[cpu]);
+ ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
- /* @dfl_pwq might not have been used, ensure it's linked */
- link_pwq(ctx->dfl_pwq);
- swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
+ /* update node_nr_active->max */
+ wq_update_node_max_active(ctx->wq, -1);
mutex_unlock(&ctx->wq->mutex);
}
@@ -4526,9 +4937,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
/* nothing to do if the target cpumask matches the current pwq */
wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
- pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
- lockdep_is_held(&wq_pool_mutex));
- if (wqattrs_equal(target_attrs, pwq->pool->attrs))
+ if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
return;
/* create a new pwq */
@@ -4546,10 +4955,11 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
use_dfl_pwq:
mutex_lock(&wq->mutex);
- raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
- get_pwq(wq->dfl_pwq);
- raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
- old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
+ pwq = unbound_pwq(wq, -1);
+ raw_spin_lock_irq(&pwq->pool->lock);
+ get_pwq(pwq);
+ raw_spin_unlock_irq(&pwq->pool->lock);
+ old_pwq = install_unbound_pwq(wq, cpu, pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4587,10 +4997,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
+ struct pool_workqueue *dfl_pwq;
+
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
- WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
- wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
+ WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
@@ -4665,6 +5078,69 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
}
+/**
+ * wq_adjust_max_active - update a wq's max_active to the current setting
+ * @wq: target workqueue
+ *
+ * If @wq isn't freezing, set @wq->max_active to the saved_max_active and
+ * activate inactive work items accordingly. If @wq is freezing, clear
+ * @wq->max_active to zero.
+ */
+static void wq_adjust_max_active(struct workqueue_struct *wq)
+{
+ bool activated;
+ int new_max, new_min;
+
+ lockdep_assert_held(&wq->mutex);
+
+ if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
+ new_max = 0;
+ new_min = 0;
+ } else {
+ new_max = wq->saved_max_active;
+ new_min = wq->saved_min_active;
+ }
+
+ if (wq->max_active == new_max && wq->min_active == new_min)
+ return;
+
+ /*
+ * Update @wq->max/min_active and then kick inactive work items if more
+ * active work items are allowed. This doesn't break work item ordering
+ * because new work items are always queued behind existing inactive
+ * work items if there are any.
+ */
+ WRITE_ONCE(wq->max_active, new_max);
+ WRITE_ONCE(wq->min_active, new_min);
+
+ if (wq->flags & WQ_UNBOUND)
+ wq_update_node_max_active(wq, -1);
+
+ if (new_max == 0)
+ return;
+
+ /*
+ * Round-robin through pwq's activating the first inactive work item
+ * until max_active is filled.
+ */
+ do {
+ struct pool_workqueue *pwq;
+
+ activated = false;
+ for_each_pwq(pwq, wq) {
+ unsigned long flags;
+
+ /* can be called during early boot w/ irq disabled */
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq_activate_first_inactive(pwq, true)) {
+ activated = true;
+ kick_pool(pwq->pool);
+ }
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ } while (activated);
+}
+
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
@@ -4672,7 +5148,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
{
va_list args;
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
+ size_t wq_size;
+ int name_len;
/*
* Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -4688,7 +5165,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (flags & WQ_UNBOUND)
+ wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
+ else
+ wq_size = sizeof(*wq);
+
+ wq = kzalloc(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
@@ -4699,15 +5181,22 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
}
va_start(args, max_active);
- vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
+ if (name_len >= WQ_NAME_LEN)
+ pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
+ wq->name);
+
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
/* init wq */
wq->flags = flags;
- wq->saved_max_active = max_active;
+ wq->max_active = max_active;
+ wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
+ wq->saved_max_active = wq->max_active;
+ wq->saved_min_active = wq->min_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
@@ -4718,8 +5207,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
+ if (flags & WQ_UNBOUND) {
+ if (alloc_node_nr_active(wq->node_nr_active) < 0)
+ goto err_unreg_lockdep;
+ }
+
if (alloc_and_link_pwqs(wq) < 0)
- goto err_unreg_lockdep;
+ goto err_free_node_nr_active;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
@@ -4735,8 +5229,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
list_add_tail_rcu(&wq->list, &workqueues);
@@ -4745,6 +5238,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
+err_free_node_nr_active:
+ if (wq->flags & WQ_UNBOUND)
+ free_node_nr_active(wq->node_nr_active);
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
@@ -4766,9 +5262,9 @@ static bool pwq_busy(struct pool_workqueue *pwq)
if (pwq->nr_in_flight[i])
return true;
- if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
+ if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))
return true;
- if (pwq->nr_active || !list_empty(&pwq->inactive_works))
+ if (!pwq_is_empty(pwq))
return true;
return false;
@@ -4850,13 +5346,12 @@ void destroy_workqueue(struct workqueue_struct *wq)
rcu_read_lock();
for_each_possible_cpu(cpu) {
- pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
- RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
- put_pwq_unlocked(pwq);
+ put_pwq_unlocked(unbound_pwq(wq, cpu));
+ RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
}
- put_pwq_unlocked(wq->dfl_pwq);
- wq->dfl_pwq = NULL;
+ put_pwq_unlocked(unbound_pwq(wq, -1));
+ RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);
rcu_read_unlock();
}
@@ -4867,15 +5362,14 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
* @wq: target workqueue
* @max_active: new max_active value.
*
- * Set max_active of @wq to @max_active.
+ * Set max_active of @wq to @max_active. See the alloc_workqueue() function
+ * comment.
*
* CONTEXT:
* Don't call from IRQ context.
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
- struct pool_workqueue *pwq;
-
/* disallow meddling with max_active for ordered workqueues */
if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
@@ -4886,9 +5380,10 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
+ if (wq->flags & WQ_UNBOUND)
+ wq->saved_min_active = min(wq->saved_min_active, max_active);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -5135,8 +5630,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d refcnt=%d%s\n",
- pwq->nr_active, pwq->max_active, pwq->refcnt,
+ pr_cont(" active=%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@ -5210,7 +5705,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
unsigned long flags;
for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ if (!pwq_is_empty(pwq)) {
idle = false;
break;
}
@@ -5222,7 +5717,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ if (!pwq_is_empty(pwq)) {
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
@@ -5569,6 +6064,10 @@ int workqueue_online_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, true);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}
@@ -5597,6 +6096,10 @@ int workqueue_offline_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, false);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, cpu);
+ mutex_unlock(&wq->mutex);
}
}
mutex_unlock(&wq_pool_mutex);
@@ -5684,7 +6187,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
void freeze_workqueues_begin(void)
{
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -5693,8 +6195,7 @@ void freeze_workqueues_begin(void)
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -5759,7 +6260,6 @@ out_unlock:
void thaw_workqueues(void)
{
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -5771,8 +6271,7 @@ void thaw_workqueues(void)
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -6610,7 +7109,7 @@ void __init workqueue_init_early(void)
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
WQ_POWER_EFFICIENT, 0);
- system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
@@ -6797,8 +7296,12 @@ void __init workqueue_init_topology(void)
* combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
wq_update_pod(wq, cpu, cpu, true);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}