diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bounds.c | 2 | ||||
-rw-r--r-- | kernel/bpf/core.c | 7 | ||||
-rw-r--r-- | kernel/bpf/cpumap.c | 3 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 11 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 14 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 4 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 9 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 5 | ||||
-rw-r--r-- | kernel/dma/swiotlb.c | 11 | ||||
-rw-r--r-- | kernel/entry/common.c | 8 | ||||
-rw-r--r-- | kernel/module/main.c | 9 | ||||
-rw-r--r-- | kernel/power/suspend.c | 1 | ||||
-rw-r--r-- | kernel/printk/printk.c | 56 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 3 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 25 | ||||
-rw-r--r-- | kernel/sched/fair.c | 16 | ||||
-rw-r--r-- | kernel/time/time_test.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 24 | ||||
-rw-r--r-- | kernel/time/timer.c | 164 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 233 | ||||
-rw-r--r-- | kernel/trace/trace.c | 21 |
21 files changed, 413 insertions, 215 deletions
diff --git a/kernel/bounds.c b/kernel/bounds.c index b529182e8..c5a9fcd2d 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 76bf1de26..44abf88e1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -857,7 +857,12 @@ static LIST_HEAD(pack_list); * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE -#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes()) +/* PMD_SIZE is really big for some archs. It doesn't make sense to + * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to + * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be + * greater than or equal to 2MB. + */ +#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 050893704..806a7c1b3 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -306,6 +306,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + unsigned long last_qs = jiffies; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); @@ -331,10 +332,12 @@ static int cpu_map_kthread_run(void *data) if (__ptr_ring_empty(rcpu->queue)) { schedule(); sched = 1; + last_qs = jiffies; } else { __set_current_state(TASK_RUNNING); } } else { + rcu_softirq_qs_periodic(last_qs); sched = cond_resched(); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f9a87dcc5..e051cbb07 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -131,13 +131,14 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); - - if (!dtab->n_buckets) /* Overflow check */ + /* hash table size must be power of 2; roundup_pow_of_two() can + * overflow into UB on 32-bit arches, so check that first + */ + if (dtab->map.max_entries > 1UL << 31) return -EINVAL; - } - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 88c71de0a..0c74cc901 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -495,7 +495,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) num_possible_cpus()); } - /* hash table size must be power of 2 */ + /* hash table size must be power of 2; roundup_pow_of_two() can overflow + * into UB on 32-bit arches, so check that first + */ + err = -E2BIG; + if (htab->map.max_entries > 1UL << 31) + goto free_htab; + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct htab_elem) + @@ -505,10 +511,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); - err = -E2BIG; - /* prevent zero size kmalloc and check for u32 overflow */ - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) + /* check for u32 overflow */ + if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; err = -ENOMEM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 83f8f67e9..758510b46 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -328,7 +328,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) __this_cpu_write(irqsave_flags, flags); } -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; @@ -350,7 +350,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) local_irq_restore(flags); } -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f86db3cf7..f0fd936ce 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -94,11 +94,14 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); - /* hash table size must be power of 2 */ - n_buckets = roundup_pow_of_two(attr->max_entries); - if (!n_buckets) + /* hash table size must be power of 2; roundup_pow_of_two() can overflow + * into UB on 32-bit arches, so check that first + */ + if (attr->max_entries > 1UL << 31) return ERR_PTR(-E2BIG); + n_buckets = roundup_pow_of_two(attr->max_entries); + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1a29ac4db..27cc6e3db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4965,6 +4965,11 @@ static int check_stack_access_within_bounds( err = check_stack_slot_within_bounds(min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ + if (!err && access_size < 0) + /* access_size should not be negative (or overflow an int); others checks + * along the way should have prevented such an access. + */ + err = -EFAULT; /* invalid negative access size; integer overflow? */ if (err) { if (tnum_is_const(reg->var_off)) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ad6333c3f..db89ac94e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -654,8 +654,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index, dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int iotlb_align_mask = - dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; unsigned int offset = swiotlb_align_offset(dev, orig_addr); @@ -667,6 +666,14 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index, BUG_ON(area_index >= mem->nareas); /* + * Ensure that the allocation is at least slot-aligned and update + * 'iotlb_align_mask' to ignore bits that will be preserved when + * offsetting into the allocation. + */ + alloc_align_mask |= (IO_TLB_SIZE - 1); + iotlb_align_mask &= ~alloc_align_mask; + + /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. For allocations of * PAGE_SIZE or larger only look for page aligned allocations. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index be61332c6..ccf2b1e1b 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -77,8 +77,14 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } syscall_enter_audit(regs, syscall); diff --git a/kernel/module/main.c b/kernel/module/main.c index 7a376e26d..554aba47a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2434,6 +2434,11 @@ static void do_free_init(struct work_struct *w) } } +void flush_module_init_free_work(void) +{ + flush_work(&init_free_wq); +} + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "module." /* Default value for module->async_probe_requested */ @@ -2524,8 +2529,8 @@ static noinline int do_init_module(struct module *mod) * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to - * be cleaned up needs to sync with the queued work - ie - * rcu_barrier() + * be cleaned up needs to sync with the queued work by invoking + * flush_module_init_free_work(). */ if (llist_add(&freeinit->node, &init_free_list)) schedule_work(&init_free_wq); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index fa3bf161d..a718067de 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -192,6 +192,7 @@ static int __init mem_sleep_default_setup(char *str) if (mem_sleep_labels[state] && !strcmp(str, mem_sleep_labels[state])) { mem_sleep_default = state; + mem_sleep_current = state; break; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index cc53fb77f..0ae06d504 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1797,10 +1797,23 @@ static bool console_waiter; */ static void console_lock_spinning_enable(void) { + /* + * Do not use spinning in panic(). The panic CPU wants to keep the lock. + * Non-panic CPUs abandon the flush anyway. + * + * Just keep the lockdep annotation. The panic-CPU should avoid + * taking console_owner_lock because it might cause a deadlock. + * This looks like the easiest way how to prevent false lockdep + * reports without handling races a lockless way. + */ + if (panic_in_progress()) + goto lockdep; + raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); +lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } @@ -1824,6 +1837,22 @@ static int console_lock_spinning_disable_and_check(void) { int waiter; + /* + * Ignore spinning waiters during panic() because they might get stopped + * or blocked at any time, + * + * It is safe because nobody is allowed to start spinning during panic + * in the first place. If there has been a waiter then non panic CPUs + * might stay spinning. They would get stopped anyway. The panic context + * will never start spinning and an interrupted spin on panic CPU will + * never continue. + */ + if (panic_in_progress()) { + /* Keep lockdep happy. */ + spin_release(&console_owner_dep_map, _THIS_IP_); + return 0; + } + raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; @@ -1918,6 +1947,12 @@ static int console_trylock_spinning(void) */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + /* + * Update @console_may_schedule for trylock because the previous + * owner may have been schedulable. + */ + console_may_schedule = 0; + return 1; } @@ -3016,6 +3051,21 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); +static int console_call_setup(struct console *newcon, char *options) +{ + int err; + + if (!newcon->setup) + return 0; + + /* Synchronize with possible boot console. */ + console_lock(); + err = newcon->setup(newcon, options); + console_unlock(); + + return err; +} + /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected @@ -3051,8 +3101,8 @@ static int try_enable_preferred_console(struct console *newcon, if (_braille_register_console(newcon, c)) return 0; - if (newcon->setup && - (err = newcon->setup(newcon, c->options)) != 0) + err = console_call_setup(newcon, c->options); + if (err) return err; } newcon->flags |= CON_ENABLED; @@ -3078,7 +3128,7 @@ static void try_enable_default_console(struct console *newcon) if (newcon->index < 0) newcon->index = 0; - if (newcon->setup && newcon->setup(newcon, NULL) != 0) + if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9d7464a90..61f9503a5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4465,13 +4465,16 @@ static void __init rcu_start_exp_gp_kworkers(void) rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", gp_kworker_name); + rcu_exp_gp_kworker = NULL; return; } rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { pr_err("Failed to create %s!\n", par_gp_kworker_name); + rcu_exp_par_gp_kworker = NULL; kthread_destroy_worker(rcu_exp_gp_kworker); + rcu_exp_gp_kworker = NULL; return; } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d2cbed96..75e8d9652 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -427,7 +427,12 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_gp_kworker); +} + +static inline bool rcu_exp_par_worker_started(void) { return !!READ_ONCE(rcu_exp_par_gp_kworker); } @@ -477,7 +482,12 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_gp_wq); +} + +static inline bool rcu_exp_par_worker_started(void) { return !!READ_ONCE(rcu_par_gp_wq); } @@ -540,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_gp_par_worker_started() || + if (!rcu_exp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. */ @@ -910,7 +920,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); + bool use_worker; unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -921,6 +931,9 @@ void synchronize_rcu_expedited(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) && + rcu_exp_worker_started(); + /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. @@ -950,7 +963,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(boottime)) { + if (unlikely(!use_worker)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -968,7 +981,7 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); - if (likely(!boottime)) + if (likely(use_worker)) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2558ab903..91c101ecf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6656,7 +6656,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { *idle_cpu = cpu; break; } @@ -6664,7 +6664,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu } break; } - if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus)) *idle_cpu = cpu; } @@ -6678,13 +6678,19 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { if (cpu == target) continue; + /* + * Check if the CPU is in the LLC scheduling domain of @target. + * Due to isolcpus, there is no guarantee that all the siblings are in the domain. + */ + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -6708,7 +6714,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6970,7 +6976,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) has_idle_core = test_idle_cores(target); if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, prev); + i = select_idle_smt(p, sd, prev); if ((unsigned int)i < nr_cpumask_bits) return i; } diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 831e8e779..f7c3de011 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -73,7 +73,7 @@ static void time64_to_tm_test_date_range(struct kunit *test) days = div_s64(secs, 86400); - #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \ + #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %lld", \ year, month, mdday, yday, days KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 221c8c404..b158cbef4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1180,13 +1180,15 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, } /* - * cycle_between - true if test occurs chronologically between before and after + * timestamp_in_interval - true if ts is chronologically in [start, end] + * + * True if ts occurs chronologically at or after start, and before or at end. */ -static bool cycle_between(u64 before, u64 test, u64 after) +static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { - if (test > before && test < after) + if (ts >= start && ts <= end) return true; - if (test < before && before > after) + if (start > end && (ts >= start || ts <= end)) return true; return false; } @@ -1246,7 +1248,7 @@ int get_device_system_crosststamp(int (*get_time_fn) */ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; - if (!cycle_between(interval_start, cycles, now)) { + if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; @@ -1259,10 +1261,8 @@ int get_device_system_crosststamp(int (*get_time_fn) tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; - nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, - system_counterval.cycles); - nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, - system_counterval.cycles); + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); @@ -1277,13 +1277,13 @@ int get_device_system_crosststamp(int (*get_time_fn) bool discontinuity; /* - * Check that the counter value occurs after the provided + * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || - !cycle_between(history_begin->cycles, - system_counterval.cycles, cycles) || + !timestamp_in_interval(history_begin->cycles, + cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 717fcb9fb..594698974 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1083,7 +1083,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, - * otherwise del_timer_sync() can't detect that the timer's + * otherwise timer_delete_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ @@ -1121,14 +1121,16 @@ out_unlock: } /** - * mod_timer_pending - modify a pending timer's timeout - * @timer: the pending timer to be modified - * @expires: new timeout in jiffies + * mod_timer_pending - Modify a pending timer's timeout + * @timer: The pending timer to be modified + * @expires: New absolute timeout in jiffies * - * mod_timer_pending() is the same for pending timers as mod_timer(), - * but will not re-activate and modify already deleted timers. + * mod_timer_pending() is the same for pending timers as mod_timer(), but + * will not activate inactive timers. * - * It is useful for unserialized use of timers. + * Return: + * * %0 - The timer was inactive and not modified + * * %1 - The timer was active and requeued to expire at @expires */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { @@ -1137,24 +1139,27 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(mod_timer_pending); /** - * mod_timer - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) + * mod_timer - Modify a timer's timeout + * @timer: The timer to be modified + * @expires: New absolute timeout in jiffies * * mod_timer(timer, expires) is equivalent to: * * del_timer(timer); timer->expires = expires; add_timer(timer); * + * mod_timer() is more efficient than the above open coded sequence. In + * case that the timer is inactive, the del_timer() part is a NOP. The + * timer is in any case activated with the new expiry time @expires. + * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) + * Return: + * * %0 - The timer was inactive and started + * * %1 - The timer was active and requeued to expire at @expires or + * the timer was active and not modified because @expires did + * not change the effective expiry time */ int mod_timer(struct timer_list *timer, unsigned long expires) { @@ -1165,11 +1170,18 @@ EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified - * @expires: New timeout in jiffies + * @expires: New absolute timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only - * modify a running timer if that would reduce the expiration time (it will - * start a timer that isn't running). + * modify an enqueued timer if that would reduce the expiration time. If + * @timer is not enqueued it starts the timer. + * + * Return: + * * %0 - The timer was inactive and started + * * %1 - The timer was active and requeued to expire at @expires or + * the timer was active and not modified because @expires + * did not change the effective expiry time such that the + * timer would expire earlier than already scheduled */ int timer_reduce(struct timer_list *timer, unsigned long expires) { @@ -1178,18 +1190,21 @@ int timer_reduce(struct timer_list *timer, unsigned long expires) EXPORT_SYMBOL(timer_reduce); /** - * add_timer - start a timer - * @timer: the timer to be added + * add_timer - Start a timer + * @timer: The timer to be started * - * The kernel will do a ->function(@timer) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. + * Start @timer to expire at @timer->expires in the future. @timer->expires + * is the absolute expiry time measured in 'jiffies'. When the timer expires + * timer->function(timer) will be invoked from soft interrupt context. * - * The timer's ->expires, ->function fields must be set prior calling this - * function. + * The @timer->expires and @timer->function fields must be set prior + * to calling this function. * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. + * If @timer->expires is already in the past @timer will be queued to + * expire at the next timer tick. + * + * This can only operate on an inactive timer. Attempts to invoke this on + * an active timer are rejected with a warning. */ void add_timer(struct timer_list *timer) { @@ -1199,11 +1214,13 @@ void add_timer(struct timer_list *timer) EXPORT_SYMBOL(add_timer); /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * add_timer_on - Start a timer on a particular CPU + * @timer: The timer to be started + * @cpu: The CPU to start it on + * + * Same as add_timer() except that it starts the timer on the given CPU. * - * This is not very scalable on SMP. Double adds are not possible. + * See add_timer() for further details. */ void add_timer_on(struct timer_list *timer, int cpu) { @@ -1238,15 +1255,18 @@ void add_timer_on(struct timer_list *timer, int cpu) EXPORT_SYMBOL_GPL(add_timer_on); /** - * del_timer - deactivate a timer. - * @timer: the timer to be deactivated - * - * del_timer() deactivates a timer - this works on both active and inactive - * timers. - * - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) + * del_timer - Deactivate a timer. + * @timer: The timer to be deactivated + * + * The function only deactivates a pending timer, but contrary to + * timer_delete_sync() it does not take into account whether the timer's + * callback function is concurrently executed on a different CPU or not. + * It neither prevents rearming of the timer. If @timer can be rearmed + * concurrently then the return value of this function is meaningless. + * + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ int del_timer(struct timer_list *timer) { @@ -1268,10 +1288,19 @@ EXPORT_SYMBOL(del_timer); /** * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer to delete + * @timer: Timer to deactivate + * + * This function tries to deactivate a timer. On success the timer is not + * queued and the timer callback function is not running on any CPU. + * + * This function does not guarantee that the timer cannot be rearmed right + * after dropping the base lock. That needs to be prevented by the calling + * code if necessary. * - * This function tries to deactivate a timer. Upon successful (ret >= 0) - * exit the timer is not queued and the handler is not running on any CPU. + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated + * * %-1 - The timer callback function is running on a different CPU */ int try_to_del_timer_sync(struct timer_list *timer) { @@ -1365,25 +1394,20 @@ static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /** - * del_timer_sync - deactivate a timer and wait for the handler to finish. - * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. + * timer_delete_sync - Deactivate a timer and wait for the handler to finish. + * @timer: The timer to be deactivated * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must - * not hold locks which would prevent completion of the timer's - * handler. The timer's handler must not call add_timer_on(). Upon exit the - * timer is not queued and the handler is not running on any CPU. + * not hold locks which would prevent completion of the timer's callback + * function. The timer's handler must not call add_timer_on(). Upon exit + * the timer is not queued and the handler is not running on any CPU. * - * Note: For !irqsafe timers, you must not hold locks that are held in - * interrupt context while calling this function. Even if the lock has - * nothing to do with the timer in question. Here's why:: + * For !irqsafe timers, the caller must not hold locks that are held in + * interrupt context. Even if the lock has nothing to do with the timer in + * question. Here's why:: * * CPU0 CPU1 * ---- ---- @@ -1393,16 +1417,23 @@ static inline void del_timer_wait_running(struct timer_list *timer) { } * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); - * del_timer_sync(mytimer); + * timer_delete_sync(mytimer); * while (base->running_timer == mytimer); * - * Now del_timer_sync() will never return and never release somelock. - * The interrupt on the other CPU is waiting to grab somelock but - * it has interrupted the softirq that CPU0 is waiting to finish. + * Now timer_delete_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but it has + * interrupted the softirq that CPU0 is waiting to finish. + * + * This function cannot guarantee that the timer is not rearmed again by + * some concurrent or preempting code, right after it dropped the base + * lock. If there is the possibility of a concurrent rearm then the return + * value of the function is meaningless. * - * The function returns whether it has deactivated a pending timer or not. + * Return: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ -int del_timer_sync(struct timer_list *timer) +int timer_delete_sync(struct timer_list *timer) { int ret; @@ -1442,8 +1473,7 @@ int del_timer_sync(struct timer_list *timer) return ret; } -EXPORT_SYMBOL(del_timer_sync); -#endif +EXPORT_SYMBOL(timer_delete_sync); static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), @@ -1465,8 +1495,8 @@ static void call_timer_fn(struct timer_list *timer, #endif /* * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map around the fn() - * call here and in del_timer_sync(). + * timer_delete_sync() by acquiring the lock_map around the fn() + * call here and in timer_delete_sync(). */ lock_map_acquire(&lockdep_map); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e019a9278..431a922e5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -414,7 +414,6 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; - long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -908,8 +907,19 @@ static void rb_wake_up_waiters(struct irq_work *work) wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { + /* Only cpu_buffer sets the above flags */ + struct ring_buffer_per_cpu *cpu_buffer = + container_of(rbwork, struct ring_buffer_per_cpu, irq_work); + + /* Called from interrupt context */ + raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; + + /* Waking up all waiters, they will reset the shortest full */ + cpu_buffer->shortest_full = 0; + raw_spin_unlock(&cpu_buffer->reader_lock); + wake_up_all(&rbwork->full_waiters); } } @@ -949,14 +959,95 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) rbwork = &cpu_buffer->irq_work; } - rbwork->wait_index++; - /* make sure the waiters see the new index */ - smp_wmb(); - /* This can be called in any context */ irq_work_queue(&rbwork->work); } +static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer; + bool ret = false; + + /* Reads of all CPUs always waits for any data */ + if (cpu == RING_BUFFER_ALL_CPUS) + return !ring_buffer_empty(buffer); + + cpu_buffer = buffer->buffers[cpu]; + + if (!ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + return true; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + ret = !pagebusy && full_hit(buffer, cpu, full); + + if (!ret && (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full)) { + cpu_buffer->shortest_full = full; + } + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } + return ret; +} + +static inline bool +rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, + int cpu, int full, ring_buffer_cond_fn cond, void *data) +{ + if (rb_watermark_hit(buffer, cpu, full)) + return true; + + if (cond(data)) + return true; + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + if (full) + rbwork->full_waiters_pending = true; + else + rbwork->waiters_pending = true; + + return false; +} + +/* + * The default wait condition for ring_buffer_wait() is to just to exit the + * wait loop the first time it is woken up. + */ +static bool rb_wait_once(void *data) +{ + long *once = data; + + /* wait_event() actually calls this twice before scheduling*/ + if (*once > 1) + return true; + + (*once)++; + return false; +} + /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on @@ -970,101 +1061,39 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer; - DEFINE_WAIT(wait); - struct rb_irq_work *work; - long wait_index; + struct wait_queue_head *waitq; + ring_buffer_cond_fn cond; + struct rb_irq_work *rbwork; + void *data; + long once = 0; int ret = 0; + cond = rb_wait_once; + data = &once; + /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. */ if (cpu == RING_BUFFER_ALL_CPUS) { - work = &buffer->irq_work; + rbwork = &buffer->irq_work; /* Full only makes sense on per cpu reads */ full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; - work = &cpu_buffer->irq_work; - } - - wait_index = READ_ONCE(work->wait_index); - - while (true) { - if (full) - prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); - else - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - if (full) - work->full_waiters_pending = true; - else - work->waiters_pending = true; - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) - break; - - if (cpu != RING_BUFFER_ALL_CPUS && - !ring_buffer_empty_cpu(buffer, cpu)) { - unsigned long flags; - bool pagebusy; - bool done; - - if (!full) - break; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - done = !pagebusy && full_hit(buffer, cpu, full); - - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (done) - break; - } - - schedule(); - - /* Make sure to see the new wait index */ - smp_rmb(); - if (wait_index != work->wait_index) - break; + rbwork = &cpu_buffer->irq_work; } if (full) - finish_wait(&work->full_waiters, &wait); + waitq = &rbwork->full_waiters; else - finish_wait(&work->waiters, &wait); + waitq = &rbwork->waiters; + + ret = wait_event_interruptible((*waitq), + rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } @@ -1088,30 +1117,51 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; - struct rb_irq_work *work; + struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { - work = &buffer->irq_work; + rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; - work = &cpu_buffer->irq_work; + rbwork = &cpu_buffer->irq_work; } if (full) { - poll_wait(filp, &work->full_waiters, poll_table); - work->full_waiters_pending = true; + unsigned long flags; + + poll_wait(filp, &rbwork->full_waiters, poll_table); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; - } else { - poll_wait(filp, &work->waiters, poll_table); - work->waiters_pending = true; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (full_hit(buffer, cpu, full)) + return EPOLLIN | EPOLLRDNORM; + /* + * Only allow full_waiters_pending update to be seen after + * the shortest_full is set. If the writer sees the + * full_waiters_pending flag set, it will compare the + * amount in the ring buffer to shortest_full. If the amount + * in the ring buffer is greater than the shortest_full + * percent, it will call the irq_work handler to wake up + * this list. The irq_handler will reset shortest_full + * back to zero. That's done under the reader_lock, but + * the below smp_mb() makes sure that the update to + * full_waiters_pending doesn't leak up into the above. + */ + smp_mb(); + rbwork->full_waiters_pending = true; + return 0; } + poll_wait(filp, &rbwork->waiters, poll_table); + rbwork->waiters_pending = true; + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit @@ -1127,9 +1177,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); - if (full) - return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; - if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f667d6bdd..f2b00ea38 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8278,6 +8278,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return size; } +static int tracing_buffers_flush(struct file *file, fl_owner_t id) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + return 0; +} + static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; @@ -8289,12 +8303,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); - iter->wait_index++; - /* Make sure the waiters see the new wait_index */ - smp_wmb(); - - ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); @@ -8508,6 +8516,7 @@ static const struct file_operations tracing_buffers_fops = { .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, + .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, |