From: Sebastian Andrzej Siewior Date: Tue, 12 Mar 2024 19:01:52 +0100 Subject: [PATCH 4/4] perf: Split __perf_pending_irq() out of perf_pending_irq() Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/6.8/older/patches-6.8.2-rt11.tar.xz perf_pending_irq() invokes perf_event_wakeup() and __perf_pending_irq(). The former is in charge of waking any tasks which wait to be woken up while the latter disables perf-events. The irq_work perf_pending_irq(), while this an irq_work, the callback is invoked in thread context on PREEMPT_RT. This is needed because all the waking functions (wake_up_all(), kill_fasync()) acquire sleep locks which must not be used with disabled interrupts. Disabling events, as done by __perf_pending_irq(), expects a hardirq context and disabled interrupts. This requirement is not fulfilled on PREEMPT_RT. Split functionality based on perf_event::pending_disable into irq_work named `pending_disable_irq' and invoke it in hardirq context on PREEMPT_RT. Rename the split out callback to perf_pending_disable(). Tested-by: Marco Elver Tested-by: Arnaldo Carvalho de Melo Reported-by: Arnaldo Carvalho de Melo Link: https://lore.kernel.org/r/20240312180814.3373778-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior --- include/linux/perf_event.h | 1 + kernel/events/core.c | 31 +++++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -783,6 +783,7 @@ struct perf_event { unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; + struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2449,7 +2449,7 @@ static void __perf_event_disable(struct * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_irq it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2489,7 +2489,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -5175,6 +5175,7 @@ static void perf_addr_filters_splice(str static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); unaccount_event(event); @@ -6711,7 +6712,7 @@ static void perf_sigtrap(struct perf_eve /* * Deliver the pending work in-event-context or follow the context. */ -static void __perf_pending_irq(struct perf_event *event) +static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); @@ -6749,11 +6750,26 @@ static void __perf_pending_irq(struct pe * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_irq() + * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending_irq, cpu); + irq_work_queue_on(&event->pending_disable_irq, cpu); +} + +static void perf_pending_disable(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) @@ -6776,8 +6792,6 @@ static void perf_pending_irq(struct irq_ perf_event_wakeup(event); } - __perf_pending_irq(event); - if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } @@ -9572,7 +9586,7 @@ static int __perf_event_overflow(struct * is processed. */ if (in_nmi()) - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without @@ -11912,6 +11926,7 @@ perf_event_alloc(struct perf_event_attr init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex);