Diffstat (limited to 'debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch')
-rw-r--r-- | debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch | 330
1 file changed, 330 insertions, 0 deletions
diff --git a/debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch b/debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch
new file mode 100644
index 0000000000..acdfbddf5a
--- /dev/null
+++ b/debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch
@@ -0,0 +1,330 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Sat, 9 Mar 2024 10:05:10 +0100
+Subject: [PATCH 2/4] net: Allow to use SMP threads for backlog NAPI.
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/6.8/older/patches-6.8.2-rt11.tar.xz
+
+Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
+used by drivers which don't do NAPI themselves, by RPS, and by parts of
+the stack which need to avoid recursive deadlocks while processing a
+packet.
+
+Non-NAPI drivers use the CPU-local backlog NAPI. If RPS is enabled, a
+flow is computed for the skb and, based on the flow, the skb can be
+enqueued on a remote CPU. Scheduling/raising the softirq (for backlog's
+NAPI) on the remote CPU isn't trivial because the softirq is only
+scheduled on the local CPU and performed after the hardirq is done.
+In order to schedule a softirq on the remote CPU, an IPI is sent to the
+remote CPU which schedules the backlog NAPI on the then-local CPU.
+
+On PREEMPT_RT, interrupts are force-threaded. The soft interrupts are
+raised within the interrupt thread and processed after the interrupt
+handler has completed, still within the context of the interrupt
+thread. The softirq is thus handled in the context where it originated.
+
+With force-threaded interrupts enabled, ksoftirqd is woken up if a
+softirq is raised from hardirq context. This is the case if it is
+raised from an IPI. Additionally there is a warning on PREEMPT_RT if
+the softirq is raised from the idle thread.
+This was done for two reasons:
+- With threaded interrupts the processing should happen in thread
+  context (where it originated) and ksoftirqd is the only thread for
+  this context if raised from hardirq. Using the currently running task
+  instead would "punish" a random task.
+- Once ksoftirqd is active it consumes all further softirqs until it
+  stops running. This changed recently and is no longer the case.
+
+Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
+PREEMPT_RT setups) I am proposing NAPI threads for backlog.
+The "proper" setup with threaded NAPI is not doable because the threads
+are not pinned to an individual CPU and can be modified by the user.
+Additionally a dummy network device would have to be assigned. Also
+CPU-hotplug has to be considered if additional CPUs show up.
+All this can probably be done/solved, but the smpboot threads already
+provide this infrastructure.
+
+Sending UDP packets over loopback expects that the packet is processed
+within the call. Delaying it by handing it over to the thread hurts
+performance. It makes no difference to the outcome whether the context
+switch happens immediately after enqueue or after a while, to process a
+few packets in a batch.
+There is no need to always use the thread if the backlog NAPI is
+requested on the local CPU. This restores the loopback throughput. The
+performance drops to mostly the same value after enabling RPS on the
+loopback, comparing the IPI and the thread results.
+
+Create NAPI threads for backlog if requested during boot. The thread
+runs the inner loop from napi_threaded_poll(); the wait part is
+different. It checks for NAPI_STATE_SCHED (the backlog NAPI can not be
+disabled).
+
+The NAPI threads for backlog are optional; they have to be enabled via
+the boot argument "thread_backlog_napi". It is mandatory for PREEMPT_RT
+to avoid the wakeup of ksoftirqd from the IPI.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Link: https://lore.kernel.org/r/20240309090824.2956805-3-bigeasy@linutronix.de
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ net/core/dev.c | 152 +++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 115 insertions(+), 37 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -78,6 +78,7 @@
+ #include <linux/slab.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
++#include <linux/smpboot.h>
+ #include <linux/mutex.h>
+ #include <linux/rwsem.h>
+ #include <linux/string.h>
+@@ -216,6 +217,31 @@ static inline struct hlist_head *dev_ind
+         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+ }
+
++#ifndef CONFIG_PREEMPT_RT
++
++static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
++
++static int __init setup_backlog_napi_threads(char *arg)
++{
++        static_branch_enable(&use_backlog_threads_key);
++        return 0;
++}
++early_param("thread_backlog_napi", setup_backlog_napi_threads);
++
++static bool use_backlog_threads(void)
++{
++        return static_branch_unlikely(&use_backlog_threads_key);
++}
++
++#else
++
++static bool use_backlog_threads(void)
++{
++        return true;
++}
++
++#endif
++
+ static inline void rps_lock_irqsave(struct softnet_data *sd,
+                                     unsigned long *flags)
+ {
+@@ -4420,6 +4446,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
+ /*************************************************************************
+  *                      Receiver routines
+  *************************************************************************/
++static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+
+ int netdev_max_backlog __read_mostly = 1000;
+ EXPORT_SYMBOL(netdev_max_backlog);
+@@ -4452,12 +4479,16 @@ static inline void ____napi_schedule(str
+          */
+         thread = READ_ONCE(napi->thread);
+         if (thread) {
++                if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
++                        goto use_local_napi;
++
+                 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                 wake_up_process(thread);
+                 return;
+         }
+         }
+
++use_local_napi:
+         list_add_tail(&napi->poll_list, &sd->poll_list);
+         WRITE_ONCE(napi->list_owner, smp_processor_id());
+         /* If not called from net_rx_action()
+@@ -4703,6 +4734,11 @@ static void napi_schedule_rps(struct sof
+
+ #ifdef CONFIG_RPS
+         if (sd != mysd) {
++                if (use_backlog_threads()) {
++                        __napi_schedule_irqoff(&sd->backlog);
++                        return;
++                }
++
+                 sd->rps_ipi_next = mysd->rps_ipi_list;
+                 mysd->rps_ipi_list = sd;
+
+@@ -5926,7 +5962,7 @@ static void net_rps_action_and_irq_enabl
+ #ifdef CONFIG_RPS
+         struct softnet_data *remsd = sd->rps_ipi_list;
+
+-        if (remsd) {
++        if (!use_backlog_threads() && remsd) {
+                 sd->rps_ipi_list = NULL;
+
+                 local_irq_enable();
+@@ -5941,7 +5977,7 @@ static void net_rps_action_and_irq_enabl
+ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+ {
+ #ifdef CONFIG_RPS
+-        return sd->rps_ipi_list != NULL;
++        return !use_backlog_threads() && sd->rps_ipi_list;
+ #else
+         return false;
+ #endif
+@@ -5985,7 +6021,7 @@ static int process_backlog(struct napi_s
+                          * We can use a plain write instead of clear_bit(),
+                          * and we dont need an smp_mb() memory barrier.
+                          */
+-                        napi->state = 0;
++                        napi->state &= NAPIF_STATE_THREADED;
+                         again = false;
+                 } else {
+                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6691,43 +6727,48 @@ static void skb_defer_free_flush(struct
+         }
+ }
+
+-static int napi_threaded_poll(void *data)
++static void napi_threaded_poll_loop(struct napi_struct *napi)
+ {
+-        struct napi_struct *napi = data;
+         struct softnet_data *sd;
+-        void *have;
++        unsigned long last_qs = jiffies;
+
+-        while (!napi_thread_wait(napi)) {
+-                unsigned long last_qs = jiffies;
++        for (;;) {
++                bool repoll = false;
++                void *have;
+
+-                for (;;) {
+-                        bool repoll = false;
++                local_bh_disable();
++                sd = this_cpu_ptr(&softnet_data);
++                sd->in_napi_threaded_poll = true;
+
+-                        local_bh_disable();
+-                        sd = this_cpu_ptr(&softnet_data);
+-                        sd->in_napi_threaded_poll = true;
+-
+-                        have = netpoll_poll_lock(napi);
+-                        __napi_poll(napi, &repoll);
+-                        netpoll_poll_unlock(have);
+-
+-                        sd->in_napi_threaded_poll = false;
+-                        barrier();
+-
+-                        if (sd_has_rps_ipi_waiting(sd)) {
+-                                local_irq_disable();
+-                                net_rps_action_and_irq_enable(sd);
+-                        }
+-                        skb_defer_free_flush(sd);
+-                        local_bh_enable();
++                have = netpoll_poll_lock(napi);
++                __napi_poll(napi, &repoll);
++                netpoll_poll_unlock(have);
++
++                sd->in_napi_threaded_poll = false;
++                barrier();
++
++                if (sd_has_rps_ipi_waiting(sd)) {
++                        local_irq_disable();
++                        net_rps_action_and_irq_enable(sd);
++                }
++                skb_defer_free_flush(sd);
++                local_bh_enable();
+
+-                        if (!repoll)
+-                                break;
++                if (!repoll)
++                        break;
+
+-                        rcu_softirq_qs_periodic(last_qs);
+-                        cond_resched();
+-                }
++                rcu_softirq_qs_periodic(last_qs);
++                cond_resched();
+         }
++}
++
++static int napi_threaded_poll(void *data)
++{
++        struct napi_struct *napi = data;
++
++        while (!napi_thread_wait(napi))
++                napi_threaded_poll_loop(napi);
++
+         return 0;
+ }
+
+@@ -11326,7 +11367,7 @@ static int dev_cpu_dead(unsigned int old
+
+                 list_del_init(&napi->poll_list);
+                 if (napi->poll == process_backlog)
+-                        napi->state = 0;
++                        napi->state &= NAPIF_STATE_THREADED;
+                 else
+                         ____napi_schedule(sd, napi);
+         }
+@@ -11334,12 +11375,14 @@ static int dev_cpu_dead(unsigned int old
+         raise_softirq_irqoff(NET_TX_SOFTIRQ);
+         local_irq_enable();
+
++        if (!use_backlog_threads()) {
+ #ifdef CONFIG_RPS
+-        remsd = oldsd->rps_ipi_list;
+-        oldsd->rps_ipi_list = NULL;
++                remsd = oldsd->rps_ipi_list;
++                oldsd->rps_ipi_list = NULL;
+ #endif
+-        /* send out pending IPI's on offline CPU */
+-        net_rps_send_ipi(remsd);
++                /* send out pending IPI's on offline CPU */
++                net_rps_send_ipi(remsd);
++        }
+
+         /* Process offline CPU's input_pkt_queue */
+         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+@@ -11659,6 +11702,38 @@ static void __init net_dev_struct_check(
+  *
+  */
+
++static int backlog_napi_should_run(unsigned int cpu)
++{
++        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++        struct napi_struct *napi = &sd->backlog;
++
++        return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++}
++
++static void run_backlog_napi(unsigned int cpu)
++{
++        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++
++        napi_threaded_poll_loop(&sd->backlog);
++}
++
++static void backlog_napi_setup(unsigned int cpu)
++{
++        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++        struct napi_struct *napi = &sd->backlog;
++
++        napi->thread = this_cpu_read(backlog_napi);
++        set_bit(NAPI_STATE_THREADED, &napi->state);
++}
++
++static struct smp_hotplug_thread backlog_threads = {
++        .store                  = &backlog_napi,
++        .thread_should_run      = backlog_napi_should_run,
++        .thread_fn              = run_backlog_napi,
++        .thread_comm            = "backlog_napi/%u",
"backlog_napi/%u", ++ .setup = backlog_napi_setup, ++}; ++ + /* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. +@@ -11711,7 +11786,10 @@ static int __init net_dev_init(void) + init_gro_hash(&sd->backlog); + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; ++ INIT_LIST_HEAD(&sd->backlog.poll_list); + } ++ if (use_backlog_threads()) ++ smpboot_register_percpu_thread(&backlog_threads); + + dev_boot_phase = 0; + |