summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:47:48 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:47:48 +0000
commita1865fbd182b17f2d2f465f557af5b45501c5f1c (patch)
tree59da519ef2e59c763bb8efdbe67bc348cf833767 /kernel
parentAdding upstream version 6.7.9. (diff)
downloadlinux-a1865fbd182b17f2d2f465f557af5b45501c5f1c.tar.xz
linux-a1865fbd182b17f2d2f465f557af5b45501c5f1c.zip
Adding upstream version 6.7.12. (ref: upstream/6.7.12)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/bpf/core.c7
-rw-r--r--kernel/bpf/cpumap.c5
-rw-r--r--kernel/bpf/devmap.c11
-rw-r--r--kernel/bpf/hashtab.c14
-rw-r--r--kernel/bpf/helpers.c4
-rw-r--r--kernel/bpf/stackmap.c9
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/cgroup/cpuset.c8
-rw-r--r--kernel/crash_core.c8
-rw-r--r--kernel/dma/swiotlb.c35
-rw-r--r--kernel/entry/common.c8
-rw-r--r--kernel/module/Kconfig5
-rw-r--r--kernel/module/main.c9
-rw-r--r--kernel/power/suspend.c1
-rw-r--r--kernel/printk/internal.h1
-rw-r--r--kernel/printk/nbcon.c41
-rw-r--r--kernel/printk/printk.c101
-rw-r--r--kernel/printk/printk_ringbuffer.c315
-rw-r--r--kernel/printk/printk_ringbuffer.h38
-rw-r--r--kernel/rcu/tree.c3
-rw-r--r--kernel/rcu/tree_exp.h25
-rw-r--r--kernel/sched/fair.c16
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/time/posix-clock.c16
-rw-r--r--kernel/time/time_test.c2
-rw-r--r--kernel/time/timekeeping.c24
-rw-r--r--kernel/trace/ring_buffer.c233
-rw-r--r--kernel/trace/trace.c21
-rw-r--r--kernel/workqueue.c757
30 files changed, 1323 insertions, 408 deletions
diff --git a/kernel/bounds.c b/kernel/bounds.c
index b529182e8..c5a9fcd2d 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -19,7 +19,7 @@ int main(void)
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
#ifdef CONFIG_LRU_GEN
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fe254ae03..27fd41777 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -863,7 +863,12 @@ static LIST_HEAD(pack_list);
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
*/
#ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
+/* PMD_SIZE is really big for some archs. It doesn't make sense to
+ * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
+ * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
+ * greater than or equal to 2MB.
+ */
+#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
#else
#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8a0bb80fe..8f1d390bc 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -178,7 +178,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
{
- struct xdp_rxq_info rxq;
+ struct xdp_rxq_info rxq = {};
struct xdp_buff xdp;
int i, nframes = 0;
@@ -262,6 +262,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -287,10 +288,12 @@ static int cpu_map_kthread_run(void *data)
if (__ptr_ring_empty(rcpu->queue)) {
schedule();
sched = 1;
+ last_qs = jiffies;
} else {
__set_current_state(TASK_RUNNING);
}
} else {
+ rcu_softirq_qs_periodic(last_qs);
sched = cond_resched();
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a936c704d..4e2cdbb56 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -130,13 +130,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
- if (!dtab->n_buckets) /* Overflow check */
+ /* hash table size must be power of 2; roundup_pow_of_two() can
+ * overflow into UB on 32-bit arches, so check that first
+ */
+ if (dtab->map.max_entries > 1UL << 31)
return -EINVAL;
- }
- if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5b9146fa8..85cd17ca3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -498,7 +498,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
num_possible_cpus());
}
- /* hash table size must be power of 2 */
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ err = -E2BIG;
+ if (htab->map.max_entries > 1UL << 31)
+ goto free_htab;
+
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
htab->elem_size = sizeof(struct htab_elem) +
@@ -508,10 +514,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
htab->elem_size += round_up(htab->map.value_size, 8);
- err = -E2BIG;
- /* prevent zero size kmalloc and check for u32 overflow */
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ /* check for u32 overflow */
+ if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
err = bpf_map_init_elem_count(&htab->map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ce4729ef1..b912d055a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -334,7 +334,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
__this_cpu_write(irqsave_flags, flags);
}
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
__bpf_spin_lock_irqsave(lock);
return 0;
@@ -357,7 +357,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
local_irq_restore(flags);
}
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
__bpf_spin_unlock_irqrestore(lock);
return 0;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index dff7ba539..c99f8e523 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -91,11 +91,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
} else if (value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
- /* hash table size must be power of 2 */
- n_buckets = roundup_pow_of_two(attr->max_entries);
- if (!n_buckets)
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ if (attr->max_entries > 1UL << 31)
return ERR_PTR(-E2BIG);
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
if (!smap)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e215413c7..890d4c4bf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5445,7 +5445,9 @@ BTF_ID(struct, prog_test_ref_kfunc)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
+#ifdef CONFIG_BPF_JIT
BTF_ID(struct, bpf_cpumask)
+#endif
BTF_ID(struct, task_struct)
BTF_SET_END(rcu_protected_types)
@@ -16686,6 +16688,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
+ if (old->callback_depth > cur->callback_depth)
+ return false;
+
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
&env->idmap_scratch, exact))
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 615daaf87..ffe0e0029 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2466,7 +2466,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
update_partition_sd_lb(cs, old_prs);
out_free:
free_cpumasks(NULL, &tmp);
- return 0;
+ return retval;
}
/**
@@ -2502,9 +2502,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (alloc_cpumasks(NULL, &tmp))
- return -ENOMEM;
-
if (*buf)
compute_effective_exclusive_cpumask(trialcs, NULL);
@@ -2519,6 +2516,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval)
return retval;
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
if (old_prs) {
if (cpumask_empty(trialcs->effective_xcpus)) {
invalidate = true;
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 755d8d4ef..9e337493d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -377,6 +377,9 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
crashk_low_res.start = low_base;
crashk_low_res.end = low_base + low_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
#endif
return 0;
}
@@ -458,8 +461,12 @@ retry:
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_res);
+#endif
}
+#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
if (crashk_res.start < crashk_res.end)
@@ -472,6 +479,7 @@ static __init int insert_crashkernel_resources(void)
}
early_initcall(insert_crashkernel_resources);
#endif
+#endif
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 33d942615..9edfb3b77 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -981,8 +981,7 @@ static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
- unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) | alloc_align_mask;
+ unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
unsigned int nslots = nr_slots(alloc_size), stride;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int index, slots_checked, count = 0, i;
@@ -994,18 +993,25 @@ static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
BUG_ON(area_index >= pool->nareas);
/*
- * For allocations of PAGE_SIZE or larger only look for page aligned
- * allocations.
+ * Ensure that the allocation is at least slot-aligned and update
+ * 'iotlb_align_mask' to ignore bits that will be preserved when
+ * offsetting into the allocation.
*/
- if (alloc_size >= PAGE_SIZE)
- iotlb_align_mask |= ~PAGE_MASK;
- iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
+ alloc_align_mask |= (IO_TLB_SIZE - 1);
+ iotlb_align_mask &= ~alloc_align_mask;
/*
* For mappings with an alignment requirement don't bother looping to
* unaligned slots once we found an aligned one.
*/
- stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
+ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask));
+
+ /*
+ * For allocations of PAGE_SIZE or larger only look for page aligned
+ * allocations.
+ */
+ if (alloc_size >= PAGE_SIZE)
+ stride = umax(stride, PAGE_SHIFT - IO_TLB_SHIFT + 1);
spin_lock_irqsave(&area->lock, flags);
if (unlikely(nslots > pool->area_nslabs - area->used))
@@ -1015,11 +1021,14 @@ static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
index = area->index;
for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
+ phys_addr_t tlb_addr;
+
slot_index = slot_base + index;
+ tlb_addr = slot_addr(tbl_dma_addr, slot_index);
- if (orig_addr &&
- (slot_addr(tbl_dma_addr, slot_index) &
- iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+ if ((tlb_addr & alloc_align_mask) ||
+ (orig_addr && (tlb_addr & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask))) {
index = wrap_area_index(pool, index + 1);
slots_checked++;
continue;
@@ -1608,12 +1617,14 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
struct io_tlb_pool *pool;
phys_addr_t tlb_addr;
+ unsigned int align;
int index;
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size, 0, &pool);
+ align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
+ index = swiotlb_find_slots(dev, 0, size, align, &pool);
if (index == -1)
return NULL;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d7ee4bc3f..5ff4f1cd3 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -77,8 +77,14 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
syscall_enter_audit(regs, syscall);
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 0ea1b2970..28db5b758 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,6 +236,10 @@ choice
possible to load a signed module containing the algorithm to check
the signature on that module.
+config MODULE_SIG_SHA1
+ bool "Sign modules with SHA-1"
+ select CRYPTO_SHA1
+
config MODULE_SIG_SHA256
bool "Sign modules with SHA-256"
select CRYPTO_SHA256
@@ -265,6 +269,7 @@ endchoice
config MODULE_SIG_HASH
string
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha1" if MODULE_SIG_SHA1
default "sha256" if MODULE_SIG_SHA256
default "sha384" if MODULE_SIG_SHA384
default "sha512" if MODULE_SIG_SHA512
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 98fedfdb8..34d9e718c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2486,6 +2486,11 @@ static void do_free_init(struct work_struct *w)
}
}
+void flush_module_init_free_work(void)
+{
+ flush_work(&init_free_wq);
+}
+
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "module."
/* Default value for module->async_probe_requested */
@@ -2590,8 +2595,8 @@ static noinline int do_init_module(struct module *mod)
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
- * be cleaned up needs to sync with the queued work - ie
- * rcu_barrier()
+ * be cleaned up needs to sync with the queued work by invoking
+ * flush_module_init_free_work().
*/
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index fa3bf161d..a718067de 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -192,6 +192,7 @@ static int __init mem_sleep_default_setup(char *str)
if (mem_sleep_labels[state] &&
!strcmp(str, mem_sleep_labels[state])) {
mem_sleep_default = state;
+ mem_sleep_current = state;
break;
}
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 6c2afee5e..ac2d9750e 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -130,6 +130,7 @@ struct printk_message {
};
bool other_cpu_in_panic(void);
+bool this_cpu_in_panic(void);
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
bool is_extended, bool may_supress);
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index b96077152..c8093bcc0 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -140,39 +140,6 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta
return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
}
-#ifdef CONFIG_64BIT
-
-#define __seq_to_nbcon_seq(seq) (seq)
-#define __nbcon_seq_to_seq(seq) (seq)
-
-#else /* CONFIG_64BIT */
-
-#define __seq_to_nbcon_seq(seq) ((u32)seq)
-
-static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
-{
- u64 seq;
- u64 rb_next_seq;
-
- /*
- * The provided sequence is only the lower 32 bits of the ringbuffer
- * sequence. It needs to be expanded to 64bit. Get the next sequence
- * number from the ringbuffer and fold it.
- *
- * Having a 32bit representation in the console is sufficient.
- * If a console ever gets more than 2^31 records behind
- * the ringbuffer then this is the least of the problems.
- *
- * Also the access to the ring buffer is always safe.
- */
- rb_next_seq = prb_next_seq(prb);
- seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
-
- return seq;
-}
-
-#endif /* CONFIG_64BIT */
-
/**
* nbcon_seq_read - Read the current console sequence
* @con: Console to read the sequence of
@@ -183,7 +150,7 @@ u64 nbcon_seq_read(struct console *con)
{
unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
- return __nbcon_seq_to_seq(nbcon_seq);
+ return __ulseq_to_u64seq(prb, nbcon_seq);
}
/**
@@ -204,7 +171,7 @@ void nbcon_seq_force(struct console *con, u64 seq)
*/
u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
- atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
/* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
con->seq = 0;
@@ -223,11 +190,11 @@ void nbcon_seq_force(struct console *con, u64 seq)
*/
static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
{
- unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+ unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq);
struct console *con = ctxt->console;
if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
- __seq_to_nbcon_seq(new_seq))) {
+ __u64seq_to_ulseq(new_seq))) {
ctxt->seq = new_seq;
} else {
ctxt->seq = nbcon_seq_read(con);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f2444b581..7a835b277 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -347,6 +347,29 @@ static bool panic_in_progress(void)
return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
}
+/* Return true if a panic is in progress on the current CPU. */
+bool this_cpu_in_panic(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+ return (panic_in_progress() && !this_cpu_in_panic());
+}
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -1846,10 +1869,23 @@ static bool console_waiter;
*/
static void console_lock_spinning_enable(void)
{
+ /*
+ * Do not use spinning in panic(). The panic CPU wants to keep the lock.
+ * Non-panic CPUs abandon the flush anyway.
+ *
+ * Just keep the lockdep annotation. The panic-CPU should avoid
+ * taking console_owner_lock because it might cause a deadlock.
+ * This looks like the easiest way to prevent false lockdep
+ * reports without handling races in a lockless way.
+ */
+ if (panic_in_progress())
+ goto lockdep;
+
raw_spin_lock(&console_owner_lock);
console_owner = current;
raw_spin_unlock(&console_owner_lock);
+lockdep:
/* The waiter may spin on us after setting console_owner */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}
@@ -1874,6 +1910,22 @@ static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
+ /*
+ * Ignore spinning waiters during panic() because they might get stopped
+ * or blocked at any time,
+ *
+ * It is safe because nobody is allowed to start spinning during panic
+ * in the first place. If there has been a waiter then non panic CPUs
+ * might stay spinning. They would get stopped anyway. The panic context
+ * will never start spinning and an interrupted spin on panic CPU will
+ * never continue.
+ */
+ if (panic_in_progress()) {
+ /* Keep lockdep happy. */
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+ return 0;
+ }
+
raw_spin_lock(&console_owner_lock);
waiter = READ_ONCE(console_waiter);
console_owner = NULL;
@@ -1974,6 +2026,12 @@ static int console_trylock_spinning(void)
*/
mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+ /*
+ * Update @console_may_schedule for trylock because the previous
+ * owner may have been schedulable.
+ */
+ console_may_schedule = 0;
+
return 1;
}
@@ -2601,26 +2659,6 @@ static int console_cpu_notify(unsigned int cpu)
return 0;
}
-/*
- * Return true if a panic is in progress on a remote CPU.
- *
- * On true, the local CPU should immediately release any printing resources
- * that may be needed by the panic CPU.
- */
-bool other_cpu_in_panic(void)
-{
- if (!panic_in_progress())
- return false;
-
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return atomic_read(&panic_cpu) != raw_smp_processor_id();
-}
-
/**
* console_lock - block the console subsystem from printing
*
@@ -3263,6 +3301,21 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -3298,8 +3351,8 @@ static int try_enable_preferred_console(struct console *newcon,
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- (err = newcon->setup(newcon, c->options)) != 0)
+ err = console_call_setup(newcon, c->options);
+ if (err)
return err;
}
newcon->flags |= CON_ENABLED;
@@ -3325,7 +3378,7 @@ static void try_enable_default_console(struct console *newcon)
if (newcon->index < 0)
newcon->index = 0;
- if (newcon->setup && newcon->setup(newcon, NULL) != 0)
+ if (console_call_setup(newcon, NULL) != 0)
return;
newcon->flags |= CON_ENABLED;
@@ -3761,7 +3814,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
might_sleep();
- seq = prb_next_seq(prb);
+ seq = prb_next_reserve_seq(prb);
/* Flush the consoles so that records up to @seq are printed. */
console_lock();
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index fde338606..f5a8bb606 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -6,6 +6,7 @@
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
+#include "internal.h"
/**
* DOC: printk_ringbuffer overview
@@ -303,6 +304,9 @@
*
* desc_push_tail:B / desc_reserve:D
* set descriptor reusable (state), then push descriptor tail (id)
+ *
+ * desc_update_last_finalized:A / desc_last_finalized_seq:A
+ * store finalized record, then set new highest finalized sequence number
*/
#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
@@ -1442,19 +1446,117 @@ fail_reopen:
}
/*
+ * @last_finalized_seq value guarantees that all records up to and including
+ * this sequence number are finalized and can be read. The only exceptions
+ * are records that are too old and have already been overwritten.
+ *
+ * It is also guaranteed that @last_finalized_seq only increases.
+ *
+ * Be aware that finalized records following non-finalized records are not
+ * reported because they are not yet available to the reader. For example,
+ * a new record stored via printk() will not be available to a printer if
+ * it follows a record that has not been finalized yet. However, once that
+ * non-finalized record becomes finalized, @last_finalized_seq will be
+ * appropriately updated and the full set of finalized records will be
+ * available to the printer. And since each printk() caller will either
+ * directly print or trigger deferred printing of all available unprinted
+ * records, all printk() messages will get printed.
+ */
+static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long ulseq;
+
+ /*
+ * Guarantee the sequence number is loaded before loading the
+ * associated record in order to guarantee that the record can be
+ * seen by this CPU. This pairs with desc_update_last_finalized:A.
+ */
+ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq
+ ); /* LMM(desc_last_finalized_seq:A) */
+
+ return __ulseq_to_u64seq(rb, ulseq);
+}
+
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count);
+
+/*
+ * Check if there are records directly following @last_finalized_seq that are
+ * finalized. If so, update @last_finalized_seq to the latest of these
+ * records. It is not allowed to skip over records that are not yet finalized.
+ */
+static void desc_update_last_finalized(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ u64 old_seq = desc_last_finalized_seq(rb);
+ unsigned long oldval;
+ unsigned long newval;
+ u64 finalized_seq;
+ u64 try_seq;
+
+try_again:
+ finalized_seq = old_seq;
+ try_seq = finalized_seq + 1;
+
+ /* Try to find later finalized records. */
+ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) {
+ finalized_seq = try_seq;
+ try_seq++;
+ }
+
+ /* No update needed if no later finalized record was found. */
+ if (finalized_seq == old_seq)
+ return;
+
+ oldval = __u64seq_to_ulseq(old_seq);
+ newval = __u64seq_to_ulseq(finalized_seq);
+
+ /*
+ * Set the sequence number of a later finalized record that has been
+ * seen.
+ *
+ * Guarantee the record data is visible to other CPUs before storing
+ * its sequence number. This pairs with desc_last_finalized_seq:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then desc_read:A reads from
+ * _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from _prb_commit:B to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
+ *
+ * Note: _prb_commit:B and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A
+ * CPU (which performs the release) must have previously seen
+ * _prb_commit:B.
+ */
+ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
+ &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */
+ old_seq = __ulseq_to_u64seq(rb, oldval);
+ goto try_again;
+ }
+}
+
+/*
* Attempt to finalize a specified descriptor. If this fails, the descriptor
* is either already final or it will finalize itself when the writer commits.
*/
-static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
+static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
unsigned long prev_state_val = DESC_SV(id, desc_committed);
struct prb_desc *d = to_desc(desc_ring, id);
- atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
- DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
-
- /* Best effort to remember the last finalized @id. */
- atomic_long_set(&desc_ring->last_finalized_id, id);
+ if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */
+ desc_update_last_finalized(rb);
+ }
}
/**
@@ -1550,7 +1652,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
* readers. (For seq==0 there is no previous descriptor.)
*/
if (info->seq > 0)
- desc_make_final(desc_ring, DESC_ID(id - 1));
+ desc_make_final(rb, DESC_ID(id - 1));
r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
/* If text data allocation fails, a data-less record is committed. */
@@ -1643,7 +1745,7 @@ void prb_commit(struct prb_reserved_entry *e)
*/
head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
if (head_id != e->id)
- desc_make_final(desc_ring, e->id);
+ desc_make_final(e->rb, e->id);
}
/**
@@ -1663,12 +1765,9 @@ void prb_commit(struct prb_reserved_entry *e)
*/
void prb_final_commit(struct prb_reserved_entry *e)
{
- struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
-
_prb_commit(e, desc_finalized);
- /* Best effort to remember the last finalized @id. */
- atomic_long_set(&desc_ring->last_finalized_id, e->id);
+ desc_update_last_finalized(e->rb);
}
/*
@@ -1832,7 +1931,7 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq,
}
/* Get the sequence number of the tail descriptor. */
-static u64 prb_first_seq(struct printk_ringbuffer *rb)
+u64 prb_first_seq(struct printk_ringbuffer *rb)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
enum desc_state d_state;
@@ -1875,12 +1974,123 @@ static u64 prb_first_seq(struct printk_ringbuffer *rb)
return seq;
}
+/**
+ * prb_next_reserve_seq() - Get the sequence number after the most recently
+ * reserved record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what sequence
+ * number will be assigned to the next reserved record.
+ *
+ * Note that depending on the situation, this value can be equal to or
+ * higher than the sequence number returned by prb_next_seq().
+ *
+ * Context: Any context.
+ * Return: The sequence number that will be assigned to the next record
+ * reserved.
+ */
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long last_finalized_id;
+ atomic_long_t *state_var;
+ u64 last_finalized_seq;
+ unsigned long head_id;
+ struct prb_desc desc;
+ unsigned long diff;
+ struct prb_desc *d;
+ int err;
+
+ /*
+ * It may not be possible to read a sequence number for @head_id.
+ * So the ID of @last_finalized_seq is used to calculate what the
+ * sequence number of @head_id will be.
+ */
+
+try_again:
+ last_finalized_seq = desc_last_finalized_seq(rb);
+
+ /*
+ * @head_id is loaded after @last_finalized_seq to ensure that
+ * it points to the record with @last_finalized_seq or newer.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then
+ * prb_next_reserve_seq:A reads from desc_reserve:D.
+ *
+ * Relies on:
+ *
+ * RELEASE from desc_reserve:D to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
+ *
+ * Note: desc_reserve:D and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A CPU
+ * (which performs the release) must have previously seen
+ * desc_read:C, which implies desc_reserve:D can be seen.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */
+
+ d = to_desc(desc_ring, last_finalized_seq);
+ state_var = &d->state_var;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ last_finalized_id = DESC_ID(atomic_long_read(state_var));
+
+ /* Ensure @last_finalized_id is correct. */
+ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);
+
+ if (err == -EINVAL) {
+ if (last_finalized_seq == 0) {
+ /*
+ * No record has been finalized or even reserved yet.
+ *
+ * The @head_id is initialized such that the first
+ * increment will yield the first record (seq=0).
+ * Handle it separately to avoid a negative @diff
+ * below.
+ */
+ if (head_id == DESC0_ID(desc_ring->count_bits))
+ return 0;
+
+ /*
+ * One or more descriptors are already reserved. Use
+ * the descriptor ID of the first one (@seq=0) for
+ * the @diff below.
+ */
+ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
+ } else {
+ /* Record must have been overwritten. Try again. */
+ goto try_again;
+ }
+ }
+
+ /* Diff of known descriptor IDs to compute related sequence numbers. */
+ diff = head_id - last_finalized_id;
+
+ /*
+ * @head_id points to the most recently reserved record, but this
+ * function returns the sequence number that will be assigned to the
+ * next (not yet reserved) record. Thus +1 is needed.
+ */
+ return (last_finalized_seq + diff + 1);
+}
+
/*
- * Non-blocking read of a record. Updates @seq to the last finalized record
- * (which may have no data available).
+ * Non-blocking read of a record.
+ *
+ * On success @seq is updated to the record that was read and (if provided)
+ * @r and @line_count will contain the read/calculated data.
+ *
+ * On failure @seq is updated to a record that is not yet available to the
+ * reader, but it will be the next record available to the reader.
*
- * See the description of prb_read_valid() and prb_read_valid_info()
- * for details.
+ * Note: When the current CPU is in panic, this function will skip over any
+ * non-existent/non-finalized records in order to allow the panic CPU
+ * to print any and all records that have been finalized.
*/
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
struct printk_record *r, unsigned int *line_count)
@@ -1899,12 +2109,32 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
*seq = tail_seq;
} else if (err == -ENOENT) {
- /* Record exists, but no data available. Skip. */
+ /* Record exists, but the data was lost. Skip. */
(*seq)++;
} else {
- /* Non-existent/non-finalized record. Must stop. */
- return false;
+ /*
+ * Non-existent/non-finalized record. Must stop.
+ *
+ * For panic situations it cannot be expected that
+ * non-finalized records will become finalized. But
+ * there may be other finalized records beyond that
+ * need to be printed for a panic situation. If this
+ * is the panic CPU, skip this
+ * non-existent/non-finalized record unless it is
+ * at or beyond the head, in which case it is not
+ * possible to continue.
+ *
+ * Note that new messages printed on panic CPU are
+ * finalized when we are here. The only exception
+ * might be the last message without trailing newline.
+ * But it would have the sequence number returned
+ * by "prb_next_reserve_seq() - 1".
+ */
+ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb)))
+ (*seq)++;
+ else
+ return false;
}
}
@@ -1932,7 +2162,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* On success, the reader must check r->info.seq to see which record was
* actually read. This allows the reader to detect dropped records.
*
- * Failure means @seq refers to a not yet written record.
+ * Failure means @seq refers to a record not yet available to the reader.
*/
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
struct printk_record *r)
@@ -1962,7 +2192,7 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
* On success, the reader must check info->seq to see which record meta data
* was actually read. This allows the reader to detect dropped records.
*
- * Failure means @seq refers to a not yet written record.
+ * Failure means @seq refers to a record not yet available to the reader.
*/
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
struct printk_info *info, unsigned int *line_count)
@@ -2008,7 +2238,9 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
* newest sequence number available to readers will be.
*
* This provides readers a sequence number to jump to if all currently
- * available records should be skipped.
+ * available records should be skipped. It is guaranteed that all records
+ * previous to the returned value have been finalized and are (or were)
+ * available to the reader.
*
* Context: Any context.
* Return: The sequence number of the next newest (not yet available) record
@@ -2016,34 +2248,19 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
*/
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
- struct prb_desc_ring *desc_ring = &rb->desc_ring;
- enum desc_state d_state;
- unsigned long id;
u64 seq;
- /* Check if the cached @id still points to a valid @seq. */
- id = atomic_long_read(&desc_ring->last_finalized_id);
- d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
+ seq = desc_last_finalized_seq(rb);
- if (d_state == desc_finalized || d_state == desc_reusable) {
- /*
- * Begin searching after the last finalized record.
- *
- * On 0, the search must begin at 0 because of hack#2
- * of the bootstrapping phase it is not known if a
- * record at index 0 exists.
- */
- if (seq != 0)
- seq++;
- } else {
- /*
- * The information about the last finalized sequence number
- * has gone. It should happen only when there is a flood of
- * new messages and the ringbuffer is rapidly recycled.
- * Give up and start from the beginning.
- */
- seq = 0;
- }
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
/*
* The information about the last finalized @seq might be inaccurate.
@@ -2085,7 +2302,7 @@ void prb_init(struct printk_ringbuffer *rb,
rb->desc_ring.infos = infos;
atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
- atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);
rb->text_data_ring.size_bits = textbits;
rb->text_data_ring.data = text_buf;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 18cd25e48..cb887489d 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -75,7 +75,7 @@ struct prb_desc_ring {
struct printk_info *infos;
atomic_long_t head_id;
atomic_long_t tail_id;
- atomic_long_t last_finalized_id;
+ atomic_long_t last_finalized_seq;
};
/*
@@ -259,7 +259,7 @@ static struct printk_ringbuffer name = { \
.infos = &_##name##_infos[0], \
.head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
.tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
- .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_seq = ATOMIC_INIT(0), \
}, \
.text_data_ring = { \
.size_bits = (avgtextbits) + (descbits), \
@@ -378,7 +378,41 @@ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
struct printk_info *info, unsigned int *line_count);
+u64 prb_first_seq(struct printk_ringbuffer *rb);
u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
u64 prb_next_seq(struct printk_ringbuffer *rb);
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb);
+
+#ifdef CONFIG_64BIT
+
+#define __u64seq_to_ulseq(u64seq) (u64seq)
+#define __ulseq_to_u64seq(rb, ulseq) (ulseq)
+
+#else /* CONFIG_64BIT */
+
+/* Parenthesize the argument so an expression is truncated as a whole. */
+#define __u64seq_to_ulseq(u64seq) ((u32)(u64seq))
+
+static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq)
+{
+ u64 rb_first_seq = prb_first_seq(rb);
+ u64 seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the first sequence
+ * number from the ringbuffer and fold it.
+ *
+ * The 32bit difference between the low bits of the first sequence
+ * and @ulseq is interpreted as signed and subtracted from the first
+ * sequence. The result has @ulseq as its low 32 bits and lies within
+ * 2^31 of the first sequence.
+ *
+ * Having a 32bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also the access to the ring buffer is always safe.
+ */
+ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq);
+
+ return seq;
+}
+
+#endif /* CONFIG_64BIT */
#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 157f3ca2a..f544f24df 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4741,13 +4741,16 @@ static void __init rcu_start_exp_gp_kworkers(void)
rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
pr_err("Failed to create %s!\n", gp_kworker_name);
+ rcu_exp_gp_kworker = NULL;
return;
}
rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
pr_err("Failed to create %s!\n", par_gp_kworker_name);
+ rcu_exp_par_gp_kworker = NULL;
kthread_destroy_worker(rcu_exp_gp_kworker);
+ rcu_exp_gp_kworker = NULL;
return;
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 2ac440bc7..8107f8184 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -428,7 +428,12 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
__sync_rcu_exp_select_node_cpus(rewp);
}
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_gp_kworker);
+}
+
+static inline bool rcu_exp_par_worker_started(void)
{
return !!READ_ONCE(rcu_exp_par_gp_kworker);
}
@@ -478,7 +483,12 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
__sync_rcu_exp_select_node_cpus(rewp);
}
-static inline bool rcu_gp_par_worker_started(void)
+static inline bool rcu_exp_worker_started(void)
+{
+ return !!READ_ONCE(rcu_gp_wq);
+}
+
+static inline bool rcu_exp_par_worker_started(void)
{
return !!READ_ONCE(rcu_par_gp_wq);
}
@@ -541,7 +551,7 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!rcu_gp_par_worker_started() ||
+ if (!rcu_exp_par_worker_started() ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
/* No worker started yet or last leaf, do direct call. */
@@ -956,7 +966,7 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ bool use_worker;
unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
@@ -967,6 +977,9 @@ void synchronize_rcu_expedited(void)
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
+ use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) &&
+ rcu_exp_worker_started();
+
/* Is the state is such that the call is a grace period? */
if (rcu_blocking_is_gp()) {
// Note well that this code runs with !PREEMPT && !SMP.
@@ -996,7 +1009,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(boottime)) {
+ if (unlikely(!use_worker)) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -1014,7 +1027,7 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
- if (likely(!boottime))
+ if (likely(use_worker))
synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ac9f4b1d..c43b71792 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7296,7 +7296,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7304,7 +7304,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
}
break;
}
- if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
*idle_cpu = cpu;
}
@@ -7318,13 +7318,19 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
if (cpu == target)
continue;
+ /*
+ * Check if the CPU is in the LLC scheduling domain of @target.
+ * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
+ */
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
}
@@ -7348,7 +7354,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core, p);
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -7598,7 +7604,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
has_idle_core = test_idle_cores(target);
if (!has_idle_core && cpus_share_cache(prev, target)) {
- i = select_idle_smt(p, prev);
+ i = select_idle_smt(p, sd, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index f8e543f1e..8bb106a56 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2408,8 +2408,11 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
- /* PARISC cannot allow mdwe as it needs writable stacks */
- if (IS_ENABLED(CONFIG_PARISC))
+ /*
+ * EOPNOTSUPP might be more appropriate here in principle, but
+ * existing userspace depends on EINVAL specifically.
+ */
+ if (!arch_memory_deny_write_exec_supported())
return -EINVAL;
current_bits = get_current_mdwe();
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9de66bbbb..4782edcbe 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -129,15 +129,17 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
goto out;
}
pccontext->clk = clk;
- fp->private_data = pccontext;
- if (clk->ops.open)
+ if (clk->ops.open) {
err = clk->ops.open(pccontext, fp->f_mode);
- else
- err = 0;
-
- if (!err) {
- get_device(clk->dev);
+ if (err) {
+ kfree(pccontext);
+ goto out;
+ }
}
+
+ fp->private_data = pccontext;
+ get_device(clk->dev);
+ err = 0;
out:
up_read(&clk->rwsem);
return err;
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index ca058c8af..3e5d422dd 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test)
days = div_s64(secs, 86400);
- #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \
+ #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \
year, month, mdday, yday, days
KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 266d02809..8aab7ed41 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1180,13 +1180,15 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
}
/*
- * cycle_between - true if test occurs chronologically between before and after
+ * timestamp_in_interval - true if ts is chronologically in [start, end]
+ *
+ * True if ts occurs chronologically at or after start, and before or at end.
*/
-static bool cycle_between(u64 before, u64 test, u64 after)
+static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
- if (test > before && test < after)
+ if (ts >= start && ts <= end)
return true;
- if (test < before && before > after)
+ if (start > end && (ts >= start || ts <= end))
return true;
return false;
}
@@ -1246,7 +1248,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
*/
now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
- if (!cycle_between(interval_start, cycles, now)) {
+ if (!timestamp_in_interval(interval_start, now, cycles)) {
clock_was_set_seq = tk->clock_was_set_seq;
cs_was_changed_seq = tk->cs_was_changed_seq;
cycles = interval_start;
@@ -1259,10 +1261,8 @@ int get_device_system_crosststamp(int (*get_time_fn)
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
- nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
- system_counterval.cycles);
- nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
- system_counterval.cycles);
+ nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
+ nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
} while (read_seqcount_retry(&tk_core.seq, seq));
xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
@@ -1277,13 +1277,13 @@ int get_device_system_crosststamp(int (*get_time_fn)
bool discontinuity;
/*
- * Check that the counter value occurs after the provided
+ * Check that the counter value is not before the provided
* history reference and that the history doesn't cross a
* clocksource change
*/
if (!history_begin ||
- !cycle_between(history_begin->cycles,
- system_counterval.cycles, cycles) ||
+ !timestamp_in_interval(history_begin->cycles,
+ cycles, system_counterval.cycles) ||
history_begin->cs_was_changed_seq != cs_was_changed_seq)
return -EINVAL;
partial_history_cycles = cycles - system_counterval.cycles;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6fa67c297..140f8eed8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -412,7 +412,6 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
- long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -903,8 +902,19 @@ static void rb_wake_up_waiters(struct irq_work *work)
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
+ /* Only cpu_buffer sets the above flags */
+ struct ring_buffer_per_cpu *cpu_buffer =
+ container_of(rbwork, struct ring_buffer_per_cpu, irq_work);
+
+ /* Called from interrupt context */
+ raw_spin_lock(&cpu_buffer->reader_lock);
rbwork->wakeup_full = false;
rbwork->full_waiters_pending = false;
+
+ /* Waking up all waiters, they will reset the shortest full */
+ cpu_buffer->shortest_full = 0;
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+
wake_up_all(&rbwork->full_waiters);
}
}
@@ -945,14 +955,95 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
rbwork = &cpu_buffer->irq_work;
}
- rbwork->wait_index++;
- /* make sure the waiters see the new index */
- smp_wmb();
-
/* This can be called in any context */
irq_work_queue(&rbwork->work);
}
+static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ bool ret = false;
+
+ /* Reads of all CPUs always wait for any data */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return !ring_buffer_empty(buffer);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ return true;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ ret = !pagebusy && full_hit(buffer, cpu, full);
+
+ if (!ret && (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)) {
+ cpu_buffer->shortest_full = full;
+ }
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+ return ret;
+}
+
+static inline bool
+rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer,
+ int cpu, int full, ring_buffer_cond_fn cond, void *data)
+{
+ if (rb_watermark_hit(buffer, cpu, full))
+ return true;
+
+ if (cond(data))
+ return true;
+
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worst
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ rbwork->full_waiters_pending = true;
+ else
+ rbwork->waiters_pending = true;
+
+ return false;
+}
+
+/*
+ * The default wait condition for ring_buffer_wait() is to just exit the
+ * wait loop the first time it is woken up.
+ */
+static bool rb_wait_once(void *data)
+{
+ long *once = data;
+
+ /* wait_event() actually calls this twice before scheduling */
+ if (*once > 1)
+ return true;
+
+ (*once)++;
+ return false;
+}
+
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -966,101 +1057,39 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
- DEFINE_WAIT(wait);
- struct rb_irq_work *work;
- long wait_index;
+ struct wait_queue_head *waitq;
+ ring_buffer_cond_fn cond;
+ struct rb_irq_work *rbwork;
+ void *data;
+ long once = 0;
int ret = 0;
+ cond = rb_wait_once;
+ data = &once;
+
/*
* Depending on what the caller is waiting for, either any
* data in any cpu buffer, or a specific buffer, put the
* caller on the appropriate wait queue.
*/
if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
+ rbwork = &buffer->irq_work;
/* Full only makes sense on per cpu reads */
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
- }
-
- wait_index = READ_ONCE(work->wait_index);
-
- while (true) {
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
-
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- if (full)
- work->full_waiters_pending = true;
- else
- work->waiters_pending = true;
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
- break;
-
- if (cpu != RING_BUFFER_ALL_CPUS &&
- !ring_buffer_empty_cpu(buffer, cpu)) {
- unsigned long flags;
- bool pagebusy;
- bool done;
-
- if (!full)
- break;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- done = !pagebusy && full_hit(buffer, cpu, full);
-
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (done)
- break;
- }
-
- schedule();
-
- /* Make sure to see the new wait index */
- smp_rmb();
- if (wait_index != work->wait_index)
- break;
+ rbwork = &cpu_buffer->irq_work;
}
if (full)
- finish_wait(&work->full_waiters, &wait);
+ waitq = &rbwork->full_waiters;
else
- finish_wait(&work->waiters, &wait);
+ waitq = &rbwork->waiters;
+
+ ret = wait_event_interruptible((*waitq),
+ rb_wait_cond(rbwork, buffer, cpu, full, cond, data));
return ret;
}
@@ -1084,30 +1113,51 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct rb_irq_work *work;
+ struct rb_irq_work *rbwork;
if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
+ rbwork = &buffer->irq_work;
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
+ rbwork = &cpu_buffer->irq_work;
}
if (full) {
- poll_wait(filp, &work->full_waiters, poll_table);
- work->full_waiters_pending = true;
+ unsigned long flags;
+
+ poll_wait(filp, &rbwork->full_waiters, poll_table);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
- } else {
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ if (full_hit(buffer, cpu, full))
+ return EPOLLIN | EPOLLRDNORM;
+ /*
+ * Only allow full_waiters_pending update to be seen after
+ * the shortest_full is set. If the writer sees the
+ * full_waiters_pending flag set, it will compare the
+ * amount in the ring buffer to shortest_full. If the amount
+ * in the ring buffer is greater than the shortest_full
+ * percent, it will call the irq_work handler to wake up
+ * this list. The irq_handler will reset shortest_full
+ * back to zero. That's done under the reader_lock, but
+ * the below smp_mb() makes sure that the update to
+ * full_waiters_pending doesn't leak up into the above.
+ */
+ smp_mb();
+ rbwork->full_waiters_pending = true;
+ return 0;
}
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1123,9 +1173,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
- if (full)
- return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
-
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3fdc57450..e03960f9f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8352,6 +8352,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return size;
}
+static int tracing_buffers_flush(struct file *file, fl_owner_t id)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ iter->wait_index++;
+ /* Make sure the waiters see the new wait_index */
+ smp_wmb();
+
+ ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+ return 0;
+}
+
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
@@ -8363,12 +8377,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
__trace_array_put(iter->tr);
- iter->wait_index++;
- /* Make sure the waiters see the new wait_index */
- smp_wmb();
-
- ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
-
if (info->spare)
ring_buffer_free_read_page(iter->array_buffer->buffer,
info->spare_cpu, info->spare);
@@ -8582,6 +8590,7 @@ static const struct file_operations tracing_buffers_fops = {
.read = tracing_buffers_read,
.poll = tracing_buffers_poll,
.release = tracing_buffers_release,
+ .flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4f87b1851..8f761417a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -108,7 +108,7 @@ enum {
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
- WQ_NAME_LEN = 24,
+ WQ_NAME_LEN = 32,
};
/*
@@ -122,6 +122,9 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
+ * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
+ * reads.
+ *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -143,6 +146,9 @@ enum {
*
* WR: wq->mutex protected for writes. RCU protected for reads.
*
+ * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
+ * with READ_ONCE() without locking.
+ *
* MD: wq_mayday_lock protected.
*
* WD: Used internally by the watchdog.
@@ -240,18 +246,18 @@ struct pool_workqueue {
* pwq->inactive_works instead of pool->worklist and marked with
* WORK_STRUCT_INACTIVE.
*
- * All work items marked with WORK_STRUCT_INACTIVE do not participate
- * in pwq->nr_active and all work items in pwq->inactive_works are
- * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
- * work items are in pwq->inactive_works. Some of them are ready to
- * run in pool->worklist or worker->scheduled. Those work itmes are
- * only struct wq_barrier which is used for flush_work() and should
- * not participate in pwq->nr_active. For non-barrier work item, it
- * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
+ * All work items marked with WORK_STRUCT_INACTIVE do not participate in
+ * nr_active and all work items in pwq->inactive_works are marked with
+ * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
+ * in pwq->inactive_works. Some of them are ready to run in
+ * pool->worklist or worker->scheduled. Those work items are only struct
+ * wq_barrier which is used for flush_work() and should not participate
+ * in nr_active. For non-barrier work item, it is marked with
+ * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/
int nr_active; /* L: nr of active works */
- int max_active; /* L: max active works */
struct list_head inactive_works; /* L: inactive works */
+ struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
@@ -279,6 +285,26 @@ struct wq_flusher {
struct wq_device;
/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ *
+ * The following struct is used to enforce per-node max_active. When a pwq wants
+ * to start executing a work item, it should increment ->nr using
+ * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
+ * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
+ * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
+ * round-robin order.
+ */
+struct wq_node_nr_active {
+ int max; /* per-node max_active */
+ atomic_t nr; /* per-node nr_active */
+ raw_spinlock_t lock; /* nests inside pool locks */
+ struct list_head pending_pwqs; /* LN: pwqs with inactive works */
+};
+
+/*
* The externally visible workqueue. It relays the issued work items to
* the appropriate worker_pool through its pool_workqueues.
*/
@@ -298,10 +324,15 @@ struct workqueue_struct {
struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
- int saved_max_active; /* WQ: saved pwq max_active */
+
+ /* See alloc_workqueue() function comment for info on min/max_active */
+ int max_active; /* WO: max active works */
+ int min_active; /* WO: min active works */
+ int saved_max_active; /* WQ: saved max_active */
+ int saved_min_active; /* WQ: saved min_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
- struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
+ struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
@@ -323,6 +354,7 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+ struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};
static struct kmem_cache *pwq_cache;
@@ -626,6 +658,36 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
+static struct pool_workqueue __rcu **
+unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
+{
+ if (cpu >= 0)
+ return per_cpu_ptr(wq->cpu_pwq, cpu);
+ else
+ return &wq->dfl_pwq;
+}
+
+/* @cpu < 0 for dfl_pwq */
+static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
+{
+ return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
+ lockdep_is_held(&wq_pool_mutex) ||
+ lockdep_is_held(&wq->mutex));
+}
+
+/**
+ * unbound_effective_cpumask - effective cpumask of an unbound workqueue
+ * @wq: workqueue of interest
+ *
+ * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
+ * is masked with wq_unbound_cpumask to determine the effective cpumask. The
+ * default pwq is always mapped to the pool with the current effective cpumask.
+ */
+static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
+{
+ return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
+}
+
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -1396,6 +1458,71 @@ work_func_t wq_worker_last_func(struct task_struct *task)
}
/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Look up the shared nr_active counter that @wq uses on @node.
+ *
+ * Return:
+ * - %NULL if @wq is not %WQ_UNBOUND; per-cpu workqueues don't need to use a
+ *   shared nr_active.
+ * - node_nr_active[nr_node_ids], the fallback entry, if @node is
+ *   %NUMA_NO_NODE.
+ * - node_nr_active[@node] otherwise.
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+						   int node)
+{
+	int idx = (node == NUMA_NO_NODE) ? nr_node_ids : node;
+
+	if (wq->flags & WQ_UNBOUND)
+		return wq->node_nr_active[idx];
+
+	return NULL;
+}
+
+/**
+ * wq_update_node_max_active - Update per-node max_actives to use
+ * @wq: workqueue to update
+ * @off_cpu: CPU that's going down, -1 if a CPU is not going down
+ *
+ * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
+ * distributed among nodes according to the proportions of numbers of online
+ * cpus. The result is always between @wq->min_active and max_active.
+ *
+ * If no CPU of the effective cpumask remains online once @off_cpu has been
+ * discounted, there is no proportion to compute. Every node then falls back
+ * to @wq->min_active instead of dividing by zero.
+ *
+ * Context: the caller must hold @wq->mutex.
+ */
+static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
+{
+	struct cpumask *effective = unbound_effective_cpumask(wq);
+	int min_active = READ_ONCE(wq->min_active);
+	int max_active = READ_ONCE(wq->max_active);
+	int total_cpus, node;
+
+	lockdep_assert_held(&wq->mutex);
+
+	/* @off_cpu only matters if it is part of the effective cpumask */
+	if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
+		off_cpu = -1;
+
+	total_cpus = cpumask_weight_and(effective, cpu_online_mask);
+	if (off_cpu >= 0)
+		total_cpus--;
+
+	/*
+	 * All CPUs of the effective cpumask are (going) offline. The
+	 * DIV_ROUND_UP() below would divide by zero, so hand every node the
+	 * lower bound it would have been clamped to anyway.
+	 */
+	if (unlikely(total_cpus <= 0)) {
+		for_each_node(node)
+			wq_node_nr_active(wq, node)->max = min_active;
+
+		wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+		return;
+	}
+
+	for_each_node(node) {
+		int node_cpus;
+
+		node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
+		if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
+			node_cpus--;
+
+		/* this node's proportional share, bounded by min/max_active */
+		wq_node_nr_active(wq, node)->max =
+			clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
+			      min_active, max_active);
+	}
+
+	wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
+}
+
+/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
@@ -1447,24 +1574,293 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
}
}
-static void pwq_activate_inactive_work(struct work_struct *work)
+/* Return %true if @pwq has neither active nor inactive work items. */
+static bool pwq_is_empty(struct pool_workqueue *pwq)
{
- struct pool_workqueue *pwq = get_work_pwq(work);
+ return !pwq->nr_active && list_empty(&pwq->inactive_works);
+}
+/*
+ * Move the inactive @work onto the worklist of @pwq's pool with
+ * move_linked_works() and clear its INACTIVE bit. Warns once if @work was not
+ * marked inactive. No nr_active counter is touched here; that accounting is
+ * left to the callers.
+ */
+static void __pwq_activate_work(struct pool_workqueue *pwq,
+ struct work_struct *work)
+{
+ unsigned long *wdb = work_data_bits(work);
+
+ WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
- __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
+ __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
+}
+
+/**
+ * pwq_activate_work - Activate a work item if inactive
+ * @pwq: pool_workqueue @work belongs to
+ * @work: work item to activate
+ *
+ * The active counts (@pwq->nr_active and, for unbound workqueues, the
+ * per-node count) are incremented without being checked against any limit.
+ *
+ * Context: must be called with @pwq->pool->lock held.
+ *
+ * Return: %true if @work was inactive and has been activated, %false if it
+ * was already active.
+ */
+static bool pwq_activate_work(struct pool_workqueue *pwq,
+			      struct work_struct *work)
+{
+	struct worker_pool *pool = pwq->pool;
+	struct wq_node_nr_active *nna;
+	bool inactive;
+
+	lockdep_assert_held(&pool->lock);
+
+	inactive = *work_data_bits(work) & WORK_STRUCT_INACTIVE;
+	if (!inactive)
+		return false;
+
+	/* account the newly active item on the node (unbound only) and pwq */
+	nna = wq_node_nr_active(pwq->wq, pool->node);
+	if (nna != NULL)
+		atomic_inc(&nna->nr);
+	pwq->nr_active++;
+
+	__pwq_activate_work(pwq, work);
+	return true;
+}
+
+/*
+ * Locklessly try to take one count from @nna. @nna->max is sampled once up
+ * front; the cmpxchg is retried until it either succeeds or the counter is
+ * seen at or above that limit.
+ *
+ * Returns %true if @nna->nr was incremented, %false if the limit is reached.
+ */
+static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
+{
+	const int limit = READ_ONCE(nna->max);
+
+	for (;;) {
+		int cur = atomic_read(&nna->nr);
+
+		if (cur >= limit)
+			return false;
+
+		if (atomic_cmpxchg_relaxed(&nna->nr, cur, cur + 1) == cur)
+			return true;
+	}
+}
+
+/**
+ * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
+ * @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
+ *
+ * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
+ * successfully obtained. %false otherwise.
+ *
+ * On success @pwq->nr_active is incremented as well. When an unbound @pwq
+ * fails to obtain a count, it is on the node's pending_pwqs list on return.
+ *
+ * Context: must be called with @pwq->pool->lock held.
+ */
+static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
+ bool obtained = false;
+
+ lockdep_assert_held(&pool->lock);
+
+ if (!nna) {
+ /* per-cpu workqueue, pwq->nr_active is sufficient */
+ obtained = pwq->nr_active < READ_ONCE(wq->max_active);
+ goto out;
+ }
+
+ /*
+ * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
+ * already waiting on $nna, pwq_dec_nr_active() will maintain the
+ * concurrency level. Don't jump the line.
+ *
+ * We need to ignore the pending test after max_active has increased as
+ * pwq_dec_nr_active() can only maintain the concurrency level but not
+ * increase it. This is indicated by @fill.
+ */
+ if (!list_empty(&pwq->pending_node) && likely(!fill))
+ goto out;
+
+ obtained = tryinc_node_nr_active(nna);
+ if (obtained)
+ goto out;
+
+ /*
+ * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
+ * and try again. The smp_mb() is paired with the implied memory barrier
+ * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
+ * we see the decremented $nna->nr or they see non-empty
+ * $nna->pending_pwqs.
+ */
+ raw_spin_lock(&nna->lock);
+
+ if (list_empty(&pwq->pending_node))
+ list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
+ else if (likely(!fill))
+ goto out_unlock;
+
+ smp_mb();
+
+ obtained = tryinc_node_nr_active(nna);
+
+ /*
+ * If @fill, @pwq might have already been pending. Being spuriously
+ * pending in cold paths doesn't affect anything. Let's leave it be.
+ */
+ if (obtained && likely(!fill))
+ list_del_init(&pwq->pending_node);
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+out:
+ if (obtained)
+ pwq->nr_active++;
+ return obtained;
+}
+
+/**
+ * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
+ * @pwq: pool_workqueue of interest
+ * @fill: max_active may have increased, try to increase concurrency level
+ *
+ * Pick the work item at the head of @pwq->inactive_works and activate it,
+ * provided an nr_active count can be obtained for it. No count is requested
+ * when the list is empty.
+ *
+ * Returns %true if an inactive work item has been activated. %false if no
+ * inactive work item is found or max_active limit is reached.
+ */
+static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
+{
+	struct work_struct *first;
+
+	first = list_first_entry_or_null(&pwq->inactive_works,
+					 struct work_struct, entry);
+	if (!first)
+		return false;
+
+	if (!pwq_tryinc_nr_active(pwq, fill))
+		return false;
+
+	__pwq_activate_work(pwq, first);
+	return true;
+}
+
+/**
+ * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
+ * @nna: wq_node_nr_active to activate a pending pwq for
+ * @caller_pool: worker_pool the caller is locking
+ *
+ * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
+ * @caller_pool may be unlocked and relocked to lock other worker_pools.
+ *
+ * At most one work item is activated per call. Pending pwqs found without
+ * inactive work items are dropped from the list along the way. On return,
+ * @nna->lock is released and @caller_pool->lock is held again.
+ */
+static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
+ struct worker_pool *caller_pool)
+{
+ struct worker_pool *locked_pool = caller_pool;
+ struct pool_workqueue *pwq;
+ struct work_struct *work;
+
+ lockdep_assert_held(&caller_pool->lock);
+
+ raw_spin_lock(&nna->lock);
+retry:
+ pwq = list_first_entry_or_null(&nna->pending_pwqs,
+ struct pool_workqueue, pending_node);
+ if (!pwq)
+ goto out_unlock;
+
+ /*
+ * If @pwq is for a different pool than @locked_pool, we need to lock
+ * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
+ * / lock dance. For that, we also need to release @nna->lock as it's
+ * nested inside pool locks.
+ */
+ if (pwq->pool != locked_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ locked_pool = pwq->pool;
+ if (!raw_spin_trylock(&locked_pool->lock)) {
+ raw_spin_unlock(&nna->lock);
+ raw_spin_lock(&locked_pool->lock);
+ raw_spin_lock(&nna->lock);
+ goto retry;
+ }
+ }
+
+ /*
+ * $pwq may not have any inactive work items due to e.g. cancellations.
+ * Drop it from pending_pwqs and see if there's another one.
+ */
+ work = list_first_entry_or_null(&pwq->inactive_works,
+ struct work_struct, entry);
+ if (!work) {
+ list_del_init(&pwq->pending_node);
+ goto retry;
+ }
+
+ /*
+ * Acquire an nr_active count and activate the inactive work item. If
+ * $pwq still has inactive work items, rotate it to the end of the
+ * pending_pwqs so that we round-robin through them. This means that
+ * inactive work items are not activated in queueing order which is fine
+ * given that there has never been any ordering across different pwqs.
+ */
+ if (likely(tryinc_node_nr_active(nna))) {
+ pwq->nr_active++;
+ __pwq_activate_work(pwq, work);
+
+ if (list_empty(&pwq->inactive_works))
+ list_del_init(&pwq->pending_node);
+ else
+ list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
+
+ /* if activating a foreign pool, make sure it's running */
+ if (pwq->pool != caller_pool)
+ kick_pool(pwq->pool);
+ }
+
+out_unlock:
+ raw_spin_unlock(&nna->lock);
+ if (locked_pool != caller_pool) {
+ raw_spin_unlock(&locked_pool->lock);
+ raw_spin_lock(&caller_pool->lock);
+ }
}
-static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
+/**
+ * pwq_dec_nr_active - Retire an active count
+ * @pwq: pool_workqueue of interest
+ *
+ * Decrement @pwq's nr_active and try to activate the first inactive work item.
+ * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
+ *
+ * For unbound workqueues the per-node count is decremented as well, and the
+ * work item activated afterwards may belong to a different pwq that is
+ * pending on the same node.
+ *
+ * Context: must be called with @pwq->pool->lock held.
+ */
+static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
- struct work_struct *work = list_first_entry(&pwq->inactive_works,
- struct work_struct, entry);
+ struct worker_pool *pool = pwq->pool;
+ struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
- pwq_activate_inactive_work(work);
+ lockdep_assert_held(&pool->lock);
+
+ /*
+ * @pwq->nr_active should be decremented for both percpu and unbound
+ * workqueues.
+ */
+ pwq->nr_active--;
+
+ /*
+ * For a percpu workqueue, it's simple. Just need to kick the first
+ * inactive work item on @pwq itself.
+ */
+ if (!nna) {
+ pwq_activate_first_inactive(pwq, false);
+ return;
+ }
+
+ /*
+ * If @pwq is for an unbound workqueue, it's more complicated because
+ * multiple pwqs and pools may be sharing the nr_active count. When a
+ * pwq needs to wait for an nr_active count, it puts itself on
+ * $nna->pending_pwqs. The following atomic_dec_return()'s implied
+ * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
+ * guarantee that either we see non-empty pending_pwqs or they see
+ * decremented $nna->nr.
+ *
+ * $nna->max may change as CPUs come online/offline and @pwq->wq's
+ * max_active gets updated. However, it is guaranteed to be equal to or
+ * larger than @pwq->wq->min_active which is above zero unless freezing.
+ * This maintains the forward progress guarantee.
+ */
+ if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
+ return;
+
+ if (!list_empty(&nna->pending_pwqs))
+ node_activate_pending_pwq(nna, pool);
}
/**
@@ -1482,14 +1878,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_
{
int color = get_work_color(work_data);
- if (!(work_data & WORK_STRUCT_INACTIVE)) {
- pwq->nr_active--;
- if (!list_empty(&pwq->inactive_works)) {
- /* one down, submit an inactive one */
- if (pwq->nr_active < pwq->max_active)
- pwq_activate_first_inactive(pwq);
- }
- }
+ if (!(work_data & WORK_STRUCT_INACTIVE))
+ pwq_dec_nr_active(pwq);
pwq->nr_in_flight[color]--;
@@ -1602,8 +1992,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
- if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
- pwq_activate_inactive_work(work);
+ pwq_activate_work(pwq, work);
list_del_init(&work->entry);
pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
@@ -1787,12 +2176,16 @@ retry:
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
- if (likely(pwq->nr_active < pwq->max_active)) {
+ /*
+ * Limit the number of concurrently active work items to max_active.
+ * @work must also queue behind existing inactive work items to maintain
+ * ordering when max_active changes. See wq_adjust_max_active().
+ */
+ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
if (list_empty(&pool->worklist))
pool->watchdog_ts = jiffies;
trace_workqueue_activate_work(work);
- pwq->nr_active++;
insert_work(pwq, work, &pool->worklist, work_flags);
kick_pool(pool);
} else {
@@ -3021,7 +3414,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
barr->task = current;
- /* The barrier work item does not participate in pwq->nr_active. */
+ /* The barrier work item does not participate in nr_active. */
work_flags |= WORK_STRUCT_INACTIVE;
/*
@@ -3310,7 +3703,7 @@ reflush:
bool drained;
raw_spin_lock_irq(&pwq->pool->lock);
- drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
+ drained = pwq_is_empty(pwq);
raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
@@ -3921,11 +4314,65 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
}
#endif
+/*
+ * Free every entry of @nna_ar - one per node plus the fallback entry at
+ * [nr_node_ids] - and reset the slots to NULL. The array itself is not freed.
+ */
+static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct wq_node_nr_active *nna = nna_ar[nid];
+
+		nna_ar[nid] = NULL;
+		kfree(nna);
+	}
+
+	/* the fallback entry sits right after the per-node entries */
+	kfree(nna_ar[nr_node_ids]);
+	nna_ar[nr_node_ids] = NULL;
+}
+
+/*
+ * Put @nna into its idle state: zero count, initialized lock and an empty
+ * pending list. @nna->max is not set here.
+ */
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+	INIT_LIST_HEAD(&nna->pending_pwqs);
+	raw_spin_lock_init(&nna->lock);
+	atomic_set(&nna->nr, 0);
+}
+
+/*
+ * Allocate and initialize one wq_node_nr_active per node plus the fallback
+ * entry stored at [nr_node_ids]. Each node's counter will be accessed mostly
+ * from its own node, so it is allocated on that node.
+ *
+ * Returns 0 on success. On allocation failure everything allocated so far is
+ * released through free_node_nr_active() and -ENOMEM is returned.
+ */
+static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+	struct wq_node_nr_active *entry;
+	int nid;
+
+	for_each_node(nid) {
+		entry = kzalloc_node(sizeof(*entry), GFP_KERNEL, nid);
+		if (entry == NULL)
+			goto fail;
+
+		init_node_nr_active(entry);
+		nna_ar[nid] = entry;
+	}
+
+	/* [nr_node_ids] is used as the fallback */
+	entry = kzalloc_node(sizeof(*entry), GFP_KERNEL, NUMA_NO_NODE);
+	if (entry == NULL)
+		goto fail;
+
+	init_node_nr_active(entry);
+	nna_ar[nr_node_ids] = entry;
+
+	return 0;
+
+fail:
+	free_node_nr_active(nna_ar);
+	return -ENOMEM;
+}
+
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
+ if (wq->flags & WQ_UNBOUND)
+ free_node_nr_active(wq->node_nr_active);
+
wq_free_lockdep(wq);
free_percpu(wq->cpu_pwq);
free_workqueue_attrs(wq->unbound_attrs);
@@ -4124,6 +4571,15 @@ static void pwq_release_workfn(struct kthread_work *work)
mutex_unlock(&wq_pool_mutex);
}
+ if (!list_empty(&pwq->pending_node)) {
+ struct wq_node_nr_active *nna =
+ wq_node_nr_active(pwq->wq, pwq->pool->node);
+
+ raw_spin_lock_irq(&nna->lock);
+ list_del_init(&pwq->pending_node);
+ raw_spin_unlock_irq(&nna->lock);
+ }
+
call_rcu(&pwq->rcu, rcu_free_pwq);
/*
@@ -4136,50 +4592,6 @@ static void pwq_release_workfn(struct kthread_work *work)
}
}
-/**
- * pwq_adjust_max_active - update a pwq's max_active to the current setting
- * @pwq: target pool_workqueue
- *
- * If @pwq isn't freezing, set @pwq->max_active to the associated
- * workqueue's saved_max_active and activate inactive work items
- * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
- */
-static void pwq_adjust_max_active(struct pool_workqueue *pwq)
-{
- struct workqueue_struct *wq = pwq->wq;
- bool freezable = wq->flags & WQ_FREEZABLE;
- unsigned long flags;
-
- /* for @wq->saved_max_active */
- lockdep_assert_held(&wq->mutex);
-
- /* fast exit for non-freezable wqs */
- if (!freezable && pwq->max_active == wq->saved_max_active)
- return;
-
- /* this function can be called during early boot w/ irq disabled */
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
-
- /*
- * During [un]freezing, the caller is responsible for ensuring that
- * this function is called at least once after @workqueue_freezing
- * is updated and visible.
- */
- if (!freezable || !workqueue_freezing) {
- pwq->max_active = wq->saved_max_active;
-
- while (!list_empty(&pwq->inactive_works) &&
- pwq->nr_active < pwq->max_active)
- pwq_activate_first_inactive(pwq);
-
- kick_pool(pwq->pool);
- } else {
- pwq->max_active = 0;
- }
-
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
-}
-
/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
@@ -4193,6 +4605,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->inactive_works);
+ INIT_LIST_HEAD(&pwq->pending_node);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
kthread_init_work(&pwq->release_work, pwq_release_workfn);
@@ -4212,9 +4625,6 @@ static void link_pwq(struct pool_workqueue *pwq)
/* set the matching work_color */
pwq->work_color = wq->work_color;
- /* sync max_active to the current setting */
- pwq_adjust_max_active(pwq);
-
/* link in @pwq */
list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}
@@ -4283,10 +4693,11 @@ static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
"possible intersect\n");
}
-/* install @pwq into @wq's cpu_pwq and return the old pwq */
+/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
int cpu, struct pool_workqueue *pwq)
{
+ struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
struct pool_workqueue *old_pwq;
lockdep_assert_held(&wq_pool_mutex);
@@ -4295,8 +4706,8 @@ static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
- old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
- rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq);
+ old_pwq = rcu_access_pointer(*slot);
+ rcu_assign_pointer(*slot, pwq);
return old_pwq;
}
@@ -4396,14 +4807,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
- /* save the previous pwq and install the new one */
+ /* save the previous pwqs and install the new ones */
for_each_possible_cpu(cpu)
ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
ctx->pwq_tbl[cpu]);
+ ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
- /* @dfl_pwq might not have been used, ensure it's linked */
- link_pwq(ctx->dfl_pwq);
- swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
+ /* update node_nr_active->max */
+ wq_update_node_max_active(ctx->wq, -1);
mutex_unlock(&ctx->wq->mutex);
}
@@ -4526,9 +4937,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
/* nothing to do if the target cpumask matches the current pwq */
wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
- pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
- lockdep_is_held(&wq_pool_mutex));
- if (wqattrs_equal(target_attrs, pwq->pool->attrs))
+ if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
return;
/* create a new pwq */
@@ -4546,10 +4955,11 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
use_dfl_pwq:
mutex_lock(&wq->mutex);
- raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
- get_pwq(wq->dfl_pwq);
- raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
- old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq);
+ pwq = unbound_pwq(wq, -1);
+ raw_spin_lock_irq(&pwq->pool->lock);
+ get_pwq(pwq);
+ raw_spin_unlock_irq(&pwq->pool->lock);
+ old_pwq = install_unbound_pwq(wq, cpu, pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
@@ -4587,10 +4997,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
+ struct pool_workqueue *dfl_pwq;
+
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
- WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
- wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
+ WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
@@ -4665,6 +5078,69 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
}
+/**
+ * wq_adjust_max_active - update a wq's max_active to the current setting
+ * @wq: target workqueue
+ *
+ * If @wq isn't freezing, set @wq->max_active to the saved_max_active and
+ * activate inactive work items accordingly. If @wq is freezing, clear
+ * @wq->max_active to zero.
+ *
+ * @wq->min_active is handled the same way using saved_min_active. Nothing is
+ * done if both values already match their targets. For unbound workqueues the
+ * per-node limits are recomputed after the update.
+ *
+ * Context: the caller must hold @wq->mutex.
+ */
+static void wq_adjust_max_active(struct workqueue_struct *wq)
+{
+ bool activated;
+ int new_max, new_min;
+
+ lockdep_assert_held(&wq->mutex);
+
+ if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
+ new_max = 0;
+ new_min = 0;
+ } else {
+ new_max = wq->saved_max_active;
+ new_min = wq->saved_min_active;
+ }
+
+ if (wq->max_active == new_max && wq->min_active == new_min)
+ return;
+
+ /*
+ * Update @wq->max/min_active and then kick inactive work items if more
+ * active work items are allowed. This doesn't break work item ordering
+ * because new work items are always queued behind existing inactive
+ * work items if there are any.
+ */
+ WRITE_ONCE(wq->max_active, new_max);
+ WRITE_ONCE(wq->min_active, new_min);
+
+ if (wq->flags & WQ_UNBOUND)
+ wq_update_node_max_active(wq, -1);
+
+ if (new_max == 0)
+ return;
+
+ /*
+ * Round-robin through pwq's activating the first inactive work item
+ * until max_active is filled.
+ */
+ do {
+ struct pool_workqueue *pwq;
+
+ activated = false;
+ for_each_pwq(pwq, wq) {
+ unsigned long flags;
+
+ /* can be called during early boot w/ irq disabled */
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq_activate_first_inactive(pwq, true)) {
+ activated = true;
+ kick_pool(pwq->pool);
+ }
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ } while (activated);
+}
+
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
@@ -4672,7 +5148,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
{
va_list args;
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
+ size_t wq_size;
+ int name_len;
/*
* Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -4688,7 +5165,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
flags |= WQ_UNBOUND;
/* allocate wq and format name */
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (flags & WQ_UNBOUND)
+ wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
+ else
+ wq_size = sizeof(*wq);
+
+ wq = kzalloc(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
@@ -4699,15 +5181,22 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
}
va_start(args, max_active);
- vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
+ if (name_len >= WQ_NAME_LEN)
+ pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
+ wq->name);
+
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
/* init wq */
wq->flags = flags;
- wq->saved_max_active = max_active;
+ wq->max_active = max_active;
+ wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
+ wq->saved_max_active = wq->max_active;
+ wq->saved_min_active = wq->min_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
@@ -4718,8 +5207,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
+ if (flags & WQ_UNBOUND) {
+ if (alloc_node_nr_active(wq->node_nr_active) < 0)
+ goto err_unreg_lockdep;
+ }
+
if (alloc_and_link_pwqs(wq) < 0)
- goto err_unreg_lockdep;
+ goto err_free_node_nr_active;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
@@ -4735,8 +5229,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
list_add_tail_rcu(&wq->list, &workqueues);
@@ -4745,6 +5238,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
+err_free_node_nr_active:
+ if (wq->flags & WQ_UNBOUND)
+ free_node_nr_active(wq->node_nr_active);
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
@@ -4766,9 +5262,9 @@ static bool pwq_busy(struct pool_workqueue *pwq)
if (pwq->nr_in_flight[i])
return true;
- if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
+ if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))
return true;
- if (pwq->nr_active || !list_empty(&pwq->inactive_works))
+ if (!pwq_is_empty(pwq))
return true;
return false;
@@ -4850,13 +5346,12 @@ void destroy_workqueue(struct workqueue_struct *wq)
rcu_read_lock();
for_each_possible_cpu(cpu) {
- pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu));
- RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL);
- put_pwq_unlocked(pwq);
+ put_pwq_unlocked(unbound_pwq(wq, cpu));
+ RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
}
- put_pwq_unlocked(wq->dfl_pwq);
- wq->dfl_pwq = NULL;
+ put_pwq_unlocked(unbound_pwq(wq, -1));
+ RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);
rcu_read_unlock();
}
@@ -4867,15 +5362,14 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
* @wq: target workqueue
* @max_active: new max_active value.
*
- * Set max_active of @wq to @max_active.
+ * Set max_active of @wq to @max_active. See the alloc_workqueue() function
+ * comment.
*
* CONTEXT:
* Don't call from IRQ context.
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
- struct pool_workqueue *pwq;
-
/* disallow meddling with max_active for ordered workqueues */
if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
@@ -4886,9 +5380,10 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
+ if (wq->flags & WQ_UNBOUND)
+ wq->saved_min_active = min(wq->saved_min_active, max_active);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -5135,8 +5630,8 @@ static void show_pwq(struct pool_workqueue *pwq)
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d refcnt=%d%s\n",
- pwq->nr_active, pwq->max_active, pwq->refcnt,
+ pr_cont(" active=%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@ -5210,7 +5705,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
unsigned long flags;
for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ if (!pwq_is_empty(pwq)) {
idle = false;
break;
}
@@ -5222,7 +5717,7 @@ void show_one_workqueue(struct workqueue_struct *wq)
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ if (!pwq_is_empty(pwq)) {
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
@@ -5569,6 +6064,10 @@ int workqueue_online_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, true);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}
@@ -5597,6 +6096,10 @@ int workqueue_offline_cpu(unsigned int cpu)
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
wq_update_pod(wq, tcpu, cpu, false);
+
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, cpu);
+ mutex_unlock(&wq->mutex);
}
}
mutex_unlock(&wq_pool_mutex);
@@ -5684,7 +6187,6 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
void freeze_workqueues_begin(void)
{
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -5693,8 +6195,7 @@ void freeze_workqueues_begin(void)
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -5759,7 +6260,6 @@ out_unlock:
void thaw_workqueues(void)
{
struct workqueue_struct *wq;
- struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
@@ -5771,8 +6271,7 @@ void thaw_workqueues(void)
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
- for_each_pwq(pwq, wq)
- pwq_adjust_max_active(pwq);
+ wq_adjust_max_active(wq);
mutex_unlock(&wq->mutex);
}
@@ -6610,7 +7109,7 @@ void __init workqueue_init_early(void)
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
WQ_POWER_EFFICIENT, 0);
- system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
@@ -6797,8 +7296,12 @@ void __init workqueue_init_topology(void)
* combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
wq_update_pod(wq, cpu, cpu, true);
+ if (wq->flags & WQ_UNBOUND) {
+ mutex_lock(&wq->mutex);
+ wq_update_node_max_active(wq, -1);
+ mutex_unlock(&wq->mutex);
}
}