From 68c1b0995e963349e50f8a8b00ebc140908f7882 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 8 May 2024 06:15:15 +0200 Subject: Merging upstream version 4.19.282. Signed-off-by: Daniel Baumann --- kernel/acct.c | 2 ++ kernel/bpf/core.c | 5 ++- kernel/bpf/verifier.c | 4 ++- kernel/cgroup/cgroup-v1.c | 3 ++ kernel/cgroup/cgroup.c | 50 ++++++++++++++++++++++++--- kernel/cgroup/cpuset.c | 29 ++++++++++------ kernel/compat.c | 2 +- kernel/events/core.c | 10 +++--- kernel/exit.c | 72 ++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcc_4_7.c | 5 +++ kernel/irq/irqdomain.c | 31 +++++++++++------ kernel/kprobes.c | 6 ++-- kernel/module.c | 26 +++++++++++--- kernel/panic.c | 75 ++++++++++++++++++++++++++++++++++------ kernel/rcu/tree_exp.h | 2 ++ kernel/relay.c | 4 +-- kernel/sched/core.c | 10 +++--- kernel/sched/fair.c | 54 ++++++++++++++++++++++++++--- kernel/sys.c | 2 ++ kernel/time/alarmtimer.c | 33 +++++++++++++++--- kernel/time/hrtimer.c | 2 ++ kernel/time/posix-stubs.c | 2 ++ kernel/time/posix-timers.c | 2 ++ kernel/trace/blktrace.c | 3 +- kernel/trace/ftrace.c | 3 +- kernel/trace/ring_buffer.c | 20 +++++++++-- kernel/trace/trace.c | 17 ++++++++- kernel/trace/trace.h | 1 + kernel/trace/trace_events_hist.c | 7 +++- kernel/trace/trace_output.c | 3 +- 30 files changed, 408 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 81f9831a7..6d98aed40 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -331,6 +331,8 @@ static comp_t encode_comp_t(unsigned long value) exp++; } + if (exp > (((comp_t) ~0U) >> MANTSIZE)) + return (comp_t) ~0U; /* * Clean it up and polish it off. */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cbbd0168f..285101772 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -602,7 +603,7 @@ static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); - bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2, + bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1, PAGE_SIZE), LONG_MAX); return 0; } @@ -1373,9 +1374,7 @@ out: * reuse preexisting logic from Spectre v1 mitigation that * happens to produce the required code on x86 for v4 as well. */ -#ifdef CONFIG_X86 barrier_nospec(); -#endif CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 694ee0b1f..61f3a31ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1012,7 +1012,9 @@ static int check_stack_write(struct bpf_verifier_env *env, bool sanitize = reg && is_spillable_regtype(reg->type); for (i = 0; i < size; i++) { - if (state->stack[spi].slot_type[i] == STACK_INVALID) { + u8 type = state->stack[spi].slot_type[i]; + + if (type != STACK_MISC && type != STACK_ZERO) { sanitize = true; break; } diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 616449762..c0ebb7080 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -55,6 +56,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + get_online_cpus(); percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -71,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } percpu_up_write(&cgroup_threadgroup_rwsem); + put_online_cpus(); mutex_unlock(&cgroup_mutex); return retval; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a892a99eb..a8185cdb8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -2209,6 +2210,45 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); +/** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. + */ +static void cgroup_attach_lock(void) +{ + get_online_cpus(); + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(void) +{ + percpu_up_write(&cgroup_threadgroup_rwsem); + put_online_cpus(); +} + /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task @@ -2694,7 +2734,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(); rcu_read_lock(); if (pid) { @@ -2725,7 +2765,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); out_unlock_rcu: rcu_read_unlock(); return tsk; @@ -2740,7 +2780,7 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -2799,7 +2839,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); @@ -2830,7 +2870,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(); return ret; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index dcd5755b1..3067d3e5a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -830,8 +830,8 @@ static void rebuild_sched_domains_locked(void) cpumask_var_t *doms; int ndoms; + lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -839,15 +839,13 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -857,9 +855,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { + get_online_cpus(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } /** @@ -1504,7 +1504,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs = css_cs(css); mutex_lock(&cpuset_mutex); - css_cs(css)->attach_in_progress--; + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } @@ -1528,13 +1530,9 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex); - /* - * It should hold cpus lock because a cpu offline event can - * cause set_cpus_allowed_ptr() failed. - */ - get_online_cpus(); /* prepare for attach */ if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1553,7 +1551,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); } - put_online_cpus(); /* * Change mm for all threadgroup leaders. This is expensive and may @@ -1617,6 +1614,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1654,6 +1652,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1664,6 +1663,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1678,6 +1678,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1716,6 +1717,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1741,6 +1743,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -1985,6 +1988,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); @@ -2035,6 +2039,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return 0; } @@ -2048,6 +2053,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) @@ -2057,6 +2063,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) diff --git a/kernel/compat.c b/kernel/compat.c index e4548a9e9..5f320b0db 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -307,7 +307,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); diff --git a/kernel/events/core.c b/kernel/events/core.c index ba66ea3ca..72ed3f3d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9682,13 +9682,15 @@ static int pmu_dev_alloc(struct pmu *pmu) pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->release = pmu_dev_release; + + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + ret = device_add(pmu->dev); if (ret) goto free_dev; @@ -10429,7 +10431,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) /* * If its not a per-cpu rb, it must be the same task. */ - if (output_event->cpu == -1 && output_event->ctx != event->ctx) + if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* diff --git a/kernel/exit.c b/kernel/exit.c index 908e7a33e..02360ec3b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,12 +62,59 @@ #include #include #include +#include #include #include #include #include +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. + */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +static __init int kernel_exit_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -922,6 +969,31 @@ void __noreturn do_exit(long code) } EXPORT_SYMBOL_GPL(do_exit); +void __noreturn make_task_dead(int signr) +{ + /* + * Take the task off the cpu after something catastrophic has + * happened. + */ + unsigned int limit; + + /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). + */ + limit = READ_ONCE(oops_limit); + if (atomic_inc_return(&oops_count) >= limit && limit) + panic("Oopsed too often (kernel.oops_limit is %d)", limit); + + do_exit(signr); +} + void complete_and_exit(struct completion *comp, long code) { if (comp) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index bb6ccf456..1c2bd1f23 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -85,6 +85,7 @@ struct gcov_fn_info { * @version: gcov version magic indicating the gcc version used for compilation * @next: list head for a singly-linked list * @stamp: uniquifying time stamp + * @checksum: unique object checksum * @filename: name of the associated gcov data file * @merge: merge functions (null for unused counter type) * @n_functions: number of instrumented functions @@ -97,6 +98,10 @@ struct gcov_info { unsigned int version; struct gcov_info *next; unsigned int stamp; + /* Since GCC 12.1 a checksum field is added. */ +#if (__GNUC__ >= 12) + unsigned int checksum; +#endif const char *filename; void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); unsigned int n_functions; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1e42fc2ad..3ebe231a5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -494,6 +494,9 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) return; hwirq = irq_data->hwirq; + + mutex_lock(&irq_domain_mutex); + irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ @@ -513,10 +516,12 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); + + mutex_unlock(&irq_domain_mutex); } -int irq_domain_associate(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); int ret; @@ -529,7 +534,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) return -EINVAL; - mutex_lock(&irq_domain_mutex); irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map) { @@ -546,7 +550,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, } irq_data->domain = NULL; irq_data->hwirq = 0; - mutex_unlock(&irq_domain_mutex); return ret; } @@ -557,12 +560,23 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, domain->mapcount++; irq_domain_set_mapping(domain, hwirq, irq_data); - mutex_unlock(&irq_domain_mutex); irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } + +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + int ret; + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_associate_locked(domain, virq, hwirq); + mutex_unlock(&irq_domain_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, @@ -818,13 +832,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) } irq_data = irq_get_irq_data(virq); - if (!irq_data) { - if (irq_domain_is_hierarchy(domain)) - irq_domain_free_irqs(virq, 1); - else - irq_dispose_mapping(virq); + if (WARN_ON(!irq_data)) return 0; - } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 33aba4e2e..e1fb6453e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -418,8 +418,8 @@ static inline int kprobe_optready(struct kprobe *p) return 0; } -/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ -static inline int kprobe_disarmed(struct kprobe *p) +/* Return true if the kprobe is disarmed. Note: p must be on hash list */ +bool kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; @@ -626,7 +626,7 @@ void wait_for_kprobe_optimizer(void) mutex_unlock(&kprobe_mutex); } -static bool optprobe_queued_unopt(struct optimized_kprobe *op) +bool optprobe_queued_unopt(struct optimized_kprobe *op) { struct optimized_kprobe *_op; diff --git a/kernel/module.c b/kernel/module.c index 42a604401..6ec0b2e0f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3478,7 +3478,8 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE; + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; @@ -3632,20 +3633,35 @@ static int add_unformed_module(struct module *mod) mod->state = MODULE_STATE_UNFORMED; -again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state != MODULE_STATE_LIVE) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, finished_loading(mod->name)); if (err) goto out_unlocked; - goto again; + + /* The module might have gone in the meantime. */ + mutex_lock(&module_mutex); + old = find_module_all(mod->name, strlen(mod->name), + true); } - err = -EEXIST; + + /* + * We are here only when the same module was being loaded. Do + * not try to load it again right now. It prevents long delays + * caused by serialized module load failures. It might happen + * when more devices of the same type trigger load of + * a particular module. + */ + if (old && old->state == MODULE_STATE_LIVE) + err = -EEXIST; + else + err = -EBUSY; goto out; } mod_update_bounds(mod); diff --git a/kernel/panic.c b/kernel/panic.c index 8138a676f..982ecba28 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #define PANIC_TIMER_STEP 100 @@ -42,6 +43,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +static unsigned int warn_limit __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -50,6 +52,45 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_panic_table[] = { + { + .procname = "warn_limit", + .data = &warn_limit, + .maxlen = sizeof(warn_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +static __init int kernel_panic_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_panic_table); + return 0; +} +late_initcall(kernel_panic_sysctls_init); +#endif + +static atomic_t warn_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); +} + +static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); + +static __init int kernel_panic_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_panic_sysfs_init); +#endif + static long no_blink(int state) { return 0; @@ -125,6 +166,19 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); +void check_panic_on_warn(const char *origin) +{ + unsigned int limit; + + if (panic_on_warn) + panic("%s: panic_on_warn set ...\n", origin); + + limit = READ_ONCE(warn_limit); + if (atomic_inc_return(&warn_count) >= limit && limit) + panic("%s: system warned too often (kernel.warn_limit is %d)", + origin, limit); +} + /** * panic - halt the system * @fmt: The text string to print @@ -142,6 +196,16 @@ void panic(const char *fmt, ...) int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + } + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -530,16 +594,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; - panic("panic_on_warn set ...\n"); - } + check_panic_on_warn("kernel"); print_modules(); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 72770a551..fa6ae9ed2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -577,7 +577,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) continue; + preempt_disable(); // For smp_processor_id() in dump_cpu_task(). dump_cpu_task(cpu); + preempt_enable(); } } jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; diff --git a/kernel/relay.c b/kernel/relay.c index 735cb208f..b7aa7df43 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -163,13 +163,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf; - if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *)) + if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t)) return NULL; buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) return NULL; - buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *), + buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t), GFP_KERNEL); if (!buf->padding) goto free_buf; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a03464249..8d5a9fa8a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -741,6 +741,9 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { + if (task_on_rq_migrating(p)) + flags |= ENQUEUE_MIGRATED; + if (task_contributes_to_load(p)) rq->nr_uninterruptible--; @@ -3316,8 +3319,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_ip_sym(preempt_disable_ip); pr_cont("\n"); } - if (panic_on_warn) - panic("scheduling while atomic\n"); + check_panic_on_warn("scheduling while atomic"); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -4951,14 +4953,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, if (len & (sizeof(unsigned long)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { unsigned int retlen = min(len, cpumask_size()); - if (copy_to_user(user_mask_ptr, mask, retlen)) + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) ret = -EFAULT; else ret = retlen; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84e7efda9..eb67f42fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3854,6 +3854,29 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +static inline bool entity_is_long_sleeper(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + u64 sleep_time; + + if (se->exec_start == 0) + return false; + + cfs_rq = cfs_rq_of(se); + + sleep_time = rq_clock_task(rq_of(cfs_rq)); + + /* Happen while migrating because of clock task divergence */ + if (sleep_time <= se->exec_start) + return false; + + sleep_time -= se->exec_start; + if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) + return true; + + return false; +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { @@ -3882,8 +3905,29 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. + * However, min_vruntime can advance much faster than real time, with + * the extreme being when an entity with the minimal weight always runs + * on the cfs_rq. If the waking entity slept for a long time, its + * vruntime difference from min_vruntime may overflow s64 and their + * comparison may get inversed, so ignore the entity's original + * vruntime in that case. + * The maximal vruntime speedup is given by the ratio of normal to + * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. + * When placing a migrated waking entity, its exec_start has been set + * from a different rq. In order to take into account a possible + * divergence between new and prev rq's clocks task because of irq and + * stolen time, we take an additional margin. + * So, cutting off on the sleep time of + * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days + * should be safe. + */ + if (entity_is_long_sleeper(se)) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -3978,6 +4022,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; check_schedstat_required(); update_stats_enqueue(cfs_rq, se, flags); @@ -6544,9 +6591,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; - update_scan_period(p, new_cpu); } diff --git a/kernel/sys.c b/kernel/sys.c index d0663f8e6..3548467f6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1530,6 +1530,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); + if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6a2ba3988..56af8a97c 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -476,11 +476,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward); -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) { struct alarm_base *base = &alarm_bases[alarm->type]; + ktime_t now = base->gettime(); + + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { + /* + * Same issue as with posix_timer_fn(). Timers which are + * periodic but the signal is ignored can starve the system + * with a very small interval. The real fix which was + * promised in the context of posix_timer_fn() never + * materialized, but someone should really work on it. + * + * To prevent DOS fake @now to be 1 jiffie out which keeps + * the overrun accounting correct but creates an + * inconsistency vs. timer_gettime(2). + */ + ktime_t kj = NSEC_PER_SEC / HZ; + + if (interval < kj) + now = ktime_add(now, kj); + } + + return alarm_forward(alarm, now, interval); +} - return alarm_forward(alarm, base->gettime(), interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + return __alarm_forward_now(alarm, interval, false); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -554,9 +578,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (posix_timer_event(ptr, si_private) && ptr->it_interval) { /* * Handle ignored signals and rearm the timer. This will go - * away once we handle ignored signals proper. + * away once we handle ignored signals proper. Ensure that + * small intervals cannot starve the system. */ - ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); ++ptr->it_requeue_pending; ptr->it_active = 1; result = ALARMTIMER_RESTART; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 32ee24f51..8512f06f0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1838,6 +1838,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); @@ -1858,6 +1859,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2c6847d5d..362c159fb 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -144,6 +144,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? @@ -230,6 +231,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 48758108e..1234868b3 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1225,6 +1225,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; @@ -1252,6 +1253,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 75ea1a5be..113cbe1a2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1594,7 +1594,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags, static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + if ((iter->ent->type != TRACE_BLK) || + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; return print_one_line(iter, true); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9c7795566..5c0463dbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1581,7 +1581,8 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) key.flags = end; /* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || + if (pg->index == 0 || + end < pg->records[0].ip || start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) continue; rec = bsearch(&key, pg->records, pg->index, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5e5b0c067..37fade510 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2463,6 +2463,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_is_reader_page(cpu_buffer->tail_page))) return; + /* + * No need for a memory barrier here, as the update + * of the tail_page did it for this page. + */ local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); @@ -2476,6 +2480,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { + /* Make sure the readers see the content of what is committed. */ + smp_wmb(); local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, @@ -3841,7 +3847,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * Make sure we see any padding after the write update - * (see rb_reset_tail()) + * (see rb_reset_tail()). + * + * In addition, a writer may be writing on the reader page + * if the page has not been fully filled, so the read barrier + * is also needed to make sure we see the content of what is + * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); @@ -4685,11 +4696,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); */ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data; struct page *page = virt_to_page(bpage); unsigned long flags; + if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* If the page is still in use someplace else, we can't reuse it */ if (page_ref_count(page) > 1) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b7a6e9b0..98abff046 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5803,7 +5803,20 @@ waitagain: ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. + */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ iter->seq.seq.len = save_len; break; } @@ -8661,6 +8674,8 @@ void __init early_trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); + + init_events(); } void __init trace_init(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 74185fb04..0923d1b18 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1530,6 +1530,7 @@ extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); +extern int init_events(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 48f85dab9..455cf41ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1764,6 +1764,9 @@ static const char *hist_field_name(struct hist_field *field, { const char *field_name = ""; + if (WARN_ON_ONCE(!field)) + return field_name; + if (level > 1) return field_name; @@ -2314,6 +2317,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); + if (!hist_field->operands[0]) + goto free; hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); if (!hist_field->type) @@ -5807,7 +5812,7 @@ enable: /* Just return zero, not the number of registered triggers */ ret = 0; out: - if (ret == 0) + if (ret == 0 && glob[0]) hist_err_clear(); return ret; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6e6cc64fa..62015d62d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1395,7 +1395,7 @@ static struct trace_event *events[] __initdata = { NULL }; -__init static int init_events(void) +__init int init_events(void) { struct trace_event *event; int i, ret; @@ -1413,4 +1413,3 @@ __init static int init_events(void) return 0; } -early_initcall(init_events); -- cgit v1.2.3